Merge branch 'x86-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...

author Linus Torvalds <torvalds@linux-foundation.org>

Tue, 23 Jun 2015 00:59:09 +0000 (17:59 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Tue, 23 Jun 2015 00:59:09 +0000 (17:59 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 23 Jun 2015 00:59:09 +0000 (17:59 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 23 Jun 2015 00:59:09 +0000 (17:59 -0700)
diff --git a/Documentation/ABI/testing/sysfs-firmware-efi b/Documentation/ABI/testing/sysfs-firmware-efi

index 05874da7ce80302d92e15781deb90c2f1522c9e4..e794eac32a90a51c4d01577affe71466cde99a1f 100644 (file)
--- a/Documentation/ABI/testing/sysfs-firmware-efi
+++ b/Documentation/ABI/testing/sysfs-firmware-efi
@@ -18,3 +18,13 @@ Contact:     Dave Young <dyoung@redhat.com>
  Description:   It shows the physical address of config table entry in the EFI
                 system table.
  Users:         Kexec
+
+What:          /sys/firmware/efi/systab
+Date:          April 2005
+Contact:       linux-efi@vger.kernel.org
+Description:   Displays the physical addresses of all EFI Configuration
+               Tables found via the EFI System Table. The order in
+               which the tables are printed forms an ABI and newer
+               versions are always printed first, i.e. ACPI20 comes
+               before ACPI.
+Users:         dmidecode
diff --git a/Documentation/ABI/testing/sysfs-firmware-efi-esrt b/Documentation/ABI/testing/sysfs-firmware-efi-esrt

new file mode 100644 (file)

index 0000000..6e431d1
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-firmware-efi-esrt
@@ -0,0 +1,81 @@
+What:          /sys/firmware/efi/esrt/
+Date:          February 2015
+Contact:       Peter Jones <pjones@redhat.com>
+Description:   Provides userland access to read the EFI System Resource Table
+               (ESRT), a catalog of firmware which can be updated with
+               the UEFI UpdateCapsule mechanism described in section 7.5 of
+               the UEFI Standard.
+Users:         fwupdate - https://github.com/rhinstaller/fwupdate
+
+What:          /sys/firmware/efi/esrt/fw_resource_count
+Date:          February 2015
+Contact:       Peter Jones <pjones@redhat.com>
+Description:   The number of entries in the ESRT
+
+What:          /sys/firmware/efi/esrt/fw_resource_count_max
+Date:          February 2015
+Contact:       Peter Jones <pjones@redhat.com>
+Description:   The maximum number of entries that /could/ be registered
+               in the allocation the table is currently in.  This is
+               really only useful to the system firmware itself.
+
+What:          /sys/firmware/efi/esrt/fw_resource_version
+Date:          February 2015
+Contact:       Peter Jones <pjones@redhat.com>
+Description:   The version of the ESRT structure provided by the firmware.
+
+What:          /sys/firmware/efi/esrt/entries/entry$N/
+Date:          February 2015
+Contact:       Peter Jones <pjones@redhat.com>
+Description:   Each ESRT entry is identified by a GUID, and each gets a
+               subdirectory under entries/ .
+               example: /sys/firmware/efi/esrt/entries/entry0/
+
+What:          /sys/firmware/efi/esrt/entries/entry$N/fw_type
+Date:          February 2015
+Contact:       Peter Jones <pjones@redhat.com>
+Description:   What kind of firmware entry this is:
+               0 - Unknown
+               1 - System Firmware
+               2 - Device Firmware
+               3 - UEFI Driver
+
+What:          /sys/firmware/efi/esrt/entries/entry$N/fw_class
+Date:          February 2015
+Contact:       Peter Jones <pjones@redhat.com>
+Description:   This is the entry's guid, and will match the directory name.
+
+What:          /sys/firmware/efi/esrt/entries/entry$N/fw_version
+Date:          February 2015
+Contact:       Peter Jones <pjones@redhat.com>
+Description:   The version of the firmware currently installed.  This is a
+               32-bit unsigned integer.
+
+What:          /sys/firmware/efi/esrt/entries/entry$N/lowest_supported_fw_version
+Date:          February 2015
+Contact:       Peter Jones <pjones@redhat.com>
+Description:   The lowest version of the firmware that can be installed.
+
+What:          /sys/firmware/efi/esrt/entries/entry$N/capsule_flags
+Date:          February 2015
+Contact:       Peter Jones <pjones@redhat.com>
+Description:   Flags that must be passed to UpdateCapsule()
+
+What:          /sys/firmware/efi/esrt/entries/entry$N/last_attempt_version
+Date:          February 2015
+Contact:       Peter Jones <pjones@redhat.com>
+Description:   The last firmware version for which an update was attempted.
+
+What:          /sys/firmware/efi/esrt/entries/entry$N/last_attempt_status
+Date:          February 2015
+Contact:       Peter Jones <pjones@redhat.com>
+Description:   The result of the last firmware update attempt for the
+               firmware resource entry.
+               0 - Success
+               1 - Insufficient resources
+               2 - Incorrect version
+               3 - Invalid format
+               4 - Authentication error
+               5 - AC power event
+               6 - Battery power event
+
diff --git a/Documentation/RCU/arrayRCU.txt b/Documentation/RCU/arrayRCU.txt

index 453ebe6953eefe0b305a82da60259849756bfea9..f05a9afb2c39b61efb841886e6f272adfa9da797 100644 (file)
--- a/Documentation/RCU/arrayRCU.txt
+++ b/Documentation/RCU/arrayRCU.txt
@@ -10,7 +10,19 @@ also be used to protect arrays.  Three situations are as follows:
  
  3.  Resizeable Arrays
  
-Each of these situations are discussed below.
+Each of these three situations involves an RCU-protected pointer to an
+array that is separately indexed.  It might be tempting to consider use
+of RCU to instead protect the index into an array; however, this use
+case is -not- supported.  The problem with RCU-protected indexes into
+arrays is that compilers can play way too many optimization games with
+integers, which means that the rules governing handling of these indexes
+are far more trouble than they are worth.  If RCU-protected indexes into
+arrays prove to be particularly valuable (which they have not thus far),
+explicit cooperation from the compiler will be required to permit them
+to be safely used.
+
+That aside, each of the three RCU-protected pointer situations is
+described in the following sections.
  
  
  Situation 1: Hash Tables
@@ -36,9 +48,9 @@ Quick Quiz:  Why is it so important that updates be rare when
  Situation 3: Resizeable Arrays
  
  Use of RCU for resizeable arrays is demonstrated by the grow_ary()
-function used by the System V IPC code.  The array is used to map from
-semaphore, message-queue, and shared-memory IDs to the data structure
-that represents the corresponding IPC construct.  The grow_ary()
+function formerly used by the System V IPC code.  The array is used
+to map from semaphore, message-queue, and shared-memory IDs to the data
+structure that represents the corresponding IPC construct.  The grow_ary()
  function does not acquire any locks; instead its caller must hold the
  ids->sem semaphore.
  
diff --git a/Documentation/RCU/lockdep.txt b/Documentation/RCU/lockdep.txt

index cd83d2348fef8c4a3ff3bee57fc9904976b46390..da51d306885077b050d6c3624b96cb08c75758bb 100644 (file)
--- a/Documentation/RCU/lockdep.txt
+++ b/Documentation/RCU/lockdep.txt
@@ -47,11 +47,6 @@ checking of rcu_dereference() primitives:
                 Use explicit check expression "c" along with
                 srcu_read_lock_held().  This is useful in code that
                 is invoked by both SRCU readers and updaters.
-       rcu_dereference_index_check(p, c):
-               Use explicit check expression "c", but the caller
-               must supply one of the rcu_read_lock_held() functions.
-               This is useful in code that uses RCU-protected arrays
-               that is invoked by both RCU readers and updaters.
         rcu_dereference_raw(p):
                 Don't check.  (Use sparingly, if at all.)
         rcu_dereference_protected(p, c):
@@ -64,11 +59,6 @@ checking of rcu_dereference() primitives:
                 but retain the compiler constraints that prevent duplicating
                 or coalescing.  This is useful when testing the
                 value of the pointer itself, for example, against NULL.
-       rcu_access_index(idx):
-               Return the value of the index and omit all barriers, but
-               retain the compiler constraints that prevent duplicating
-               or coalescsing.  This is useful when when testing the
-               value of the index itself, for example, against -1.
  
  The rcu_dereference_check() check expression can be any boolean
  expression, but would normally include a lockdep expression.  However,
diff --git a/Documentation/RCU/rcu_dereference.txt b/Documentation/RCU/rcu_dereference.txt

index ceb05da5a5acd3e8a6735543284c7a64c1c91228..1e6c0da994f544b6fc3acaa786d4f4f932551db6 100644 (file)
--- a/Documentation/RCU/rcu_dereference.txt
+++ b/Documentation/RCU/rcu_dereference.txt
@@ -25,17 +25,6 @@ o    You must use one of the rcu_dereference() family of primitives
         for an example where the compiler can in fact deduce the exact
         value of the pointer, and thus cause misordering.
  
-o      Do not use single-element RCU-protected arrays.  The compiler
-       is within its right to assume that the value of an index into
-       such an array must necessarily evaluate to zero.  The compiler
-       could then substitute the constant zero for the computation, so
-       that the array index no longer depended on the value returned
-       by rcu_dereference().  If the array index no longer depends
-       on rcu_dereference(), then both the compiler and the CPU
-       are within their rights to order the array access before the
-       rcu_dereference(), which can cause the array access to return
-       garbage.
-
  o      Avoid cancellation when using the "+" and "-" infix arithmetic
         operators.  For example, for a given variable "x", avoid
         "(x-x)".  There are similar arithmetic pitfalls from other
@@ -76,14 +65,15 @@ o   Do not use the results from the boolean "&&" and "||" when
         dereferencing.  For example, the following (rather improbable)
         code is buggy:
  
-               int a[2];
-               int index;
-               int force_zero_index = 1;
+               int *p;
+               int *q;
  
                 ...
  
-               r1 = rcu_dereference(i1)
-               r2 = a[r1 && force_zero_index];  /* BUGGY!!! */
+               p = rcu_dereference(gp);
+               q = &global_q;
+               q += p != &oom_p1 && p != &oom_p2;
+               r1 = *q;  /* BUGGY!!! */
  
         The reason this is buggy is that "&&" and "||" are often compiled
         using branches.  While weak-memory machines such as ARM or PowerPC
@@ -94,14 +84,15 @@ o   Do not use the results from relational operators ("==", "!=",
         ">", ">=", "<", or "<=") when dereferencing.  For example,
         the following (quite strange) code is buggy:
  
-               int a[2];
-               int index;
-               int flip_index = 0;
+               int *p;
+               int *q;
  
                 ...
  
-               r1 = rcu_dereference(i1)
-               r2 = a[r1 != flip_index];  /* BUGGY!!! */
+               p = rcu_dereference(gp);
+               q = &global_q;
+               q += p > &oom_p;
+               r1 = *q;  /* BUGGY!!! */
  
         As before, the reason this is buggy is that relational operators
         are often compiled using branches.  And as before, although
@@ -193,6 +184,11 @@ o  Be very careful about comparing pointers obtained from
                 pointer.  Note that the volatile cast in rcu_dereference()
                 will normally prevent the compiler from knowing too much.
  
+               However, please note that if the compiler knows that the
+               pointer takes on only one of two values, a not-equal
+               comparison will provide exactly the information that the
+               compiler needs to deduce the value of the pointer.
+
  o      Disable any value-speculation optimizations that your compiler
         might provide, especially if you are making use of feedback-based
         optimizations that take data collected from prior runs.  Such
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt

index 88dfce182f660904a609aa97e670a4ce87659985..5746b0c77f3e4c53da9d68b1213bae991f062092 100644 (file)
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -256,7 +256,9 @@ rcu_dereference()
         If you are going to be fetching multiple fields from the
         RCU-protected structure, using the local variable is of
         course preferred.  Repeated rcu_dereference() calls look
-       ugly and incur unnecessary overhead on Alpha CPUs.
+       ugly, do not guarantee that the same pointer will be returned
+       if an update happened while in the critical section, and incur
+       unnecessary overhead on Alpha CPUs.
  
         Note that the value returned by rcu_dereference() is valid
         only within the enclosing RCU read-side critical section.
@@ -879,9 +881,7 @@ SRCU:       Initialization/cleanup
  
  All:  lockdep-checked RCU-protected pointer access
  
-       rcu_access_index
         rcu_access_pointer
-       rcu_dereference_index_check
         rcu_dereference_raw
         rcu_lockdep_assert
         rcu_sleep_check
diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt

index 0aad6deb2d9638e3b0d7bf16db928e1bd0b80d8b..12b1b25b4da9711c95ab013adf1bec4214964d2c 100644 (file)
--- a/Documentation/cputopology.txt
+++ b/Documentation/cputopology.txt
@@ -1,6 +1,6 @@
  
  Export CPU topology info via sysfs. Items (attributes) are similar
-to /proc/cpuinfo.
+to /proc/cpuinfo output of some architectures:
  
  1) /sys/devices/system/cpu/cpuX/topology/physical_package_id:
  
@@ -23,20 +23,35 @@ to /proc/cpuinfo.
  4) /sys/devices/system/cpu/cpuX/topology/thread_siblings:
  
         internal kernel map of cpuX's hardware threads within the same
-       core as cpuX
+       core as cpuX.
  
-5) /sys/devices/system/cpu/cpuX/topology/core_siblings:
+5) /sys/devices/system/cpu/cpuX/topology/thread_siblings_list:
+
+       human-readable list of cpuX's hardware threads within the same
+       core as cpuX.
+
+6) /sys/devices/system/cpu/cpuX/topology/core_siblings:
  
         internal kernel map of cpuX's hardware threads within the same
         physical_package_id.
  
-6) /sys/devices/system/cpu/cpuX/topology/book_siblings:
+7) /sys/devices/system/cpu/cpuX/topology/core_siblings_list:
+
+       human-readable list of cpuX's hardware threads within the same
+       physical_package_id.
+
+8) /sys/devices/system/cpu/cpuX/topology/book_siblings:
  
         internal kernel map of cpuX's hardware threads within the same
         book_id.
  
+9) /sys/devices/system/cpu/cpuX/topology/book_siblings_list:
+
+       human-readable list of cpuX's hardware threads within the same
+       book_id.
+
  To implement it in an architecture-neutral way, a new source file,
-drivers/base/topology.c, is to export the 4 or 6 attributes. The two book
+drivers/base/topology.c, is to export the 6 or 9 attributes. The three book
  related sysfs files will only be created if CONFIG_SCHED_BOOK is selected.
  
  For an architecture to support this feature, it must define some of
@@ -44,20 +59,22 @@ these macros in include/asm-XXX/topology.h:
  #define topology_physical_package_id(cpu)
  #define topology_core_id(cpu)
  #define topology_book_id(cpu)
-#define topology_thread_cpumask(cpu)
+#define topology_sibling_cpumask(cpu)
  #define topology_core_cpumask(cpu)
  #define topology_book_cpumask(cpu)
  
-The type of **_id is int.
-The type of siblings is (const) struct cpumask *.
+The type of **_id macros is int.
+The type of **_cpumask macros is (const) struct cpumask *. The latter
+correspond with appropriate **_siblings sysfs attributes (except for
+topology_sibling_cpumask() which corresponds with thread_siblings).
  
  To be consistent on all architectures, include/linux/topology.h
  provides default definitions for any of the above macros that are
  not defined by include/asm-XXX/topology.h:
  1) physical_package_id: -1
  2) core_id: 0
-3) thread_siblings: just the given CPU
-4) core_siblings: just the given CPU
+3) sibling_cpumask: just the given CPU
+4) core_cpumask: just the given CPU
  
  For architectures that don't support books (CONFIG_SCHED_BOOK) there are no
  default definitions for topology_book_id() and topology_book_cpumask().
diff --git a/Documentation/devicetree/bindings/clock/at91-clock.txt b/Documentation/devicetree/bindings/clock/at91-clock.txt

index 7a4d4926f44e47b9a80077192ae9dacbd1089e7e..5ba6450693b9816dfc1dacef96570e14c22a111b 100644 (file)
--- a/Documentation/devicetree/bindings/clock/at91-clock.txt
+++ b/Documentation/devicetree/bindings/clock/at91-clock.txt
@@ -248,7 +248,7 @@ Required properties for peripheral clocks:
  - #address-cells : shall be 1 (reg is used to encode clk id).
  - clocks : shall be the master clock phandle.
         e.g. clocks = <&mck>;
-- name: device tree node describing a specific system clock.
+- name: device tree node describing a specific peripheral clock.
         * #clock-cells : from common clock binding; shall be set to 0.
         * reg: peripheral id. See Atmel's datasheets to get a full
           list of peripheral ids.
diff --git a/Documentation/devicetree/bindings/input/touchscreen/tsc2005.txt b/Documentation/devicetree/bindings/input/touchscreen/tsc2005.txt

index 4b641c7bf1c252a3465aa7e028e18042cb7ad61b..09089a6d69ed8d1c9b29115e6abce75a6d1a2fcd 100644 (file)
--- a/Documentation/devicetree/bindings/input/touchscreen/tsc2005.txt
+++ b/Documentation/devicetree/bindings/input/touchscreen/tsc2005.txt
@@ -32,8 +32,8 @@ Example:
                 touchscreen-fuzz-x = <4>;
                 touchscreen-fuzz-y = <7>;
                 touchscreen-fuzz-pressure = <2>;
-               touchscreen-max-x = <4096>;
-               touchscreen-max-y = <4096>;
+               touchscreen-size-x = <4096>;
+               touchscreen-size-y = <4096>;
                 touchscreen-max-pressure = <2048>;
  
                 ti,x-plate-ohms = <280>;
diff --git a/Documentation/devicetree/bindings/usb/renesas_usbhs.txt b/Documentation/devicetree/bindings/usb/renesas_usbhs.txt

index dc2a18f0b3a10a9e1bd5814fc429fe9246b82ec7..ddbe304beb212238e859640905b83886e5164ac7 100644 (file)
--- a/Documentation/devicetree/bindings/usb/renesas_usbhs.txt
+++ b/Documentation/devicetree/bindings/usb/renesas_usbhs.txt
@@ -15,10 +15,8 @@ Optional properties:
    - phys: phandle + phy specifier pair
    - phy-names: must be "usb"
    - dmas: Must contain a list of references to DMA specifiers.
-  - dma-names : Must contain a list of DMA names:
-   - tx0 ... tx<n>
-   - rx0 ... rx<n>
-    - This <n> means DnFIFO in USBHS module.
+  - dma-names : named "ch%d", where %d is the channel number ranging from zero
+                to the number of channels (DnFIFOs) minus one.
  
  Example:
         usbhs: usb@e6590000 {
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking

index 0a926e2ba3ab68ffd541f5a619cd673e59826821..6a34a0f4d37ccf33248cfd81d2c917d45d8401bc 100644 (file)
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -50,8 +50,8 @@ prototypes:
         int (*rename2) (struct inode *, struct dentry *,
                         struct inode *, struct dentry *, unsigned int);
         int (*readlink) (struct dentry *, char __user *,int);
-       void * (*follow_link) (struct dentry *, struct nameidata *);
-       void (*put_link) (struct dentry *, struct nameidata *, void *);
+       const char *(*follow_link) (struct dentry *, void **);
+       void (*put_link) (struct inode *, void *);
         void (*truncate) (struct inode *);
         int (*permission) (struct inode *, int, unsigned int);
         int (*get_acl)(struct inode *, int);
diff --git a/Documentation/filesystems/automount-support.txt b/Documentation/filesystems/automount-support.txt

index 7cac200e2a85dd1efebc9f05e84cb8a2b9532b37..7eb762eb31361739bac381a7453ecb449facf161 100644 (file)
--- a/Documentation/filesystems/automount-support.txt
+++ b/Documentation/filesystems/automount-support.txt
@@ -1,41 +1,15 @@
-Support is available for filesystems that wish to do automounting support (such
-as kAFS which can be found in fs/afs/). This facility includes allowing
-in-kernel mounts to be performed and mountpoint degradation to be
-requested. The latter can also be requested by userspace.
+Support is available for filesystems that wish to do automounting
+support (such as kAFS which can be found in fs/afs/ and NFS in
+fs/nfs/). This facility includes allowing in-kernel mounts to be
+performed and mountpoint degradation to be requested. The latter can
+also be requested by userspace.
  
  
  ======================
  IN-KERNEL AUTOMOUNTING
  ======================
  
-A filesystem can now mount another filesystem on one of its directories by the
-following procedure:
-
- (1) Give the directory a follow_link() operation.
-
-     When the directory is accessed, the follow_link op will be called, and
-     it will be provided with the location of the mountpoint in the nameidata
-     structure (vfsmount and dentry).
-
- (2) Have the follow_link() op do the following steps:
-
-     (a) Call vfs_kern_mount() to call the appropriate filesystem to set up a
-         superblock and gain a vfsmount structure representing it.
-
-     (b) Copy the nameidata provided as an argument and substitute the dentry
-        argument into it the copy.
-
-     (c) Call do_add_mount() to install the new vfsmount into the namespace's
-        mountpoint tree, thus making it accessible to userspace. Use the
-        nameidata set up in (b) as the destination.
-
-        If the mountpoint will be automatically expired, then do_add_mount()
-        should also be given the location of an expiration list (see further
-        down).
-
-     (d) Release the path in the nameidata argument and substitute in the new
-        vfsmount and its root dentry. The ref counts on these will need
-        incrementing.
+See section "Mount Traps" of Documentation/filesystems/autofs4.txt.
  
  Then from userspace, you can just do something like:
  
@@ -61,17 +35,18 @@ AUTOMATIC MOUNTPOINT EXPIRY
  ===========================
  
  Automatic expiration of mountpoints is easy, provided you've mounted the
-mountpoint to be expired in the automounting procedure outlined above.
+mountpoint to be expired in the automounting procedure outlined separately.
  
  To do expiration, you need to follow these steps:
  
- (3) Create at least one list off which the vfsmounts to be expired can be
-     hung. Access to this list will be governed by the vfsmount_lock.
+ (1) Create at least one list off which the vfsmounts to be expired can be
+     hung.
  
- (4) In step (2c) above, the call to do_add_mount() should be provided with a
-     pointer to this list. It will hang the vfsmount off of it if it succeeds.
+ (2) When a new mountpoint is created in the ->d_automount method, add
+     the mnt to the list using mnt_set_expiry()
+             mnt_set_expiry(newmnt, &afs_vfsmounts);
  
- (5) When you want mountpoints to be expired, call mark_mounts_for_expiry()
+ (3) When you want mountpoints to be expired, call mark_mounts_for_expiry()
       with a pointer to this list. This will process the list, marking every
       vfsmount thereon for potential expiry on the next call.
  
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting

index e69274de8d0c9c1754cc40b8d99d78a724e63e50..3eae250254d581d253dd035109d568f8ef0c7873 100644 (file)
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -483,3 +483,20 @@ in your dentry operations instead.
  --
  [mandatory]
         ->aio_read/->aio_write are gone.  Use ->read_iter/->write_iter.
+--
+[recommended]
+       for embedded ("fast") symlinks just set inode->i_link to wherever the
+       symlink body is and use simple_follow_link() as ->follow_link().
+--
+[mandatory]
+       calling conventions for ->follow_link() have changed.  Instead of returning
+       cookie and using nd_set_link() to store the body to traverse, we return
+       the body to traverse and store the cookie using explicit void ** argument.
+       nameidata isn't passed at all - nd_jump_link() doesn't need it and
+       nd_[gs]et_link() is gone.
+--
+[mandatory]
+       calling conventions for ->put_link() have changed.  It gets inode instead of
+       dentry, it does not get nameidata at all and it gets called only when cookie
+       is non-NULL.  Note that link body isn't available anymore, so if you need it,
+       store it as cookie.
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt

index 5d833b32bbcd1046de40a15fee169ed462d274fc..b403b29ef7107cd9bfad4a4d0d509cbeb22f145e 100644 (file)
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -350,8 +350,8 @@ struct inode_operations {
         int (*rename2) (struct inode *, struct dentry *,
                         struct inode *, struct dentry *, unsigned int);
         int (*readlink) (struct dentry *, char __user *,int);
-        void * (*follow_link) (struct dentry *, struct nameidata *);
-        void (*put_link) (struct dentry *, struct nameidata *, void *);
+       const char *(*follow_link) (struct dentry *, void **);
+       void (*put_link) (struct inode *, void *);
         int (*permission) (struct inode *, int);
         int (*get_acl)(struct inode *, int);
         int (*setattr) (struct dentry *, struct iattr *);
@@ -436,16 +436,18 @@ otherwise noted.
  
    follow_link: called by the VFS to follow a symbolic link to the
         inode it points to.  Only required if you want to support
-       symbolic links.  This method returns a void pointer cookie
-       that is passed to put_link().
+       symbolic links.  This method returns the symlink body
+       to traverse (and possibly resets the current position with
+       nd_jump_link()).  If the body won't go away until the inode
+       is gone, nothing else is needed; if it needs to be otherwise
+       pinned, the data needed to release whatever we'd grabbed
+       is to be stored in void * variable passed by address to
+       follow_link() instance.
  
    put_link: called by the VFS to release resources allocated by
-       follow_link().  The cookie returned by follow_link() is passed
-       to this method as the last parameter.  It is used by
-       filesystems such as NFS where page cache is not stable
-       (i.e. page that was installed when the symbolic link walk
-       started might not be in the page cache at the end of the
-       walk).
+       follow_link().  The cookie stored by follow_link() is passed
+       to this method as the last parameter; only called when
+       cookie isn't NULL.
  
    permission: called by the VFS to check for access rights on a POSIX-like
         filesystem.
diff --git a/Documentation/i2c/slave-interface b/Documentation/i2c/slave-interface

index 389bb5d618549e5db99ed5ea8cb1b23f38ca48f4..b228ca54bcf4863cdad2a12e4d2533e4fe689a71 100644 (file)
--- a/Documentation/i2c/slave-interface
+++ b/Documentation/i2c/slave-interface
@@ -31,10 +31,10 @@ User manual
  ===========
  
  I2C slave backends behave like standard I2C clients. So, you can instantiate
-them like described in the document 'instantiating-devices'. A quick example
-for instantiating the slave-eeprom driver from userspace:
+them as described in the document 'instantiating-devices'. A quick example for
+instantiating the slave-eeprom driver from userspace at address 0x64 on bus 1:
  
-  # echo 0-0064 > /sys/bus/i2c/drivers/i2c-slave-eeprom/bind
+  # echo slave-24c02 0x64 > /sys/bus/i2c/devices/i2c-1/new_device
  
  Each backend should come with separate documentation to describe its specific
  behaviour and setup.
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt

index a320a41e7412726fdbd6f1a943b5d4dbf24e4059..7bd4501f0cf93c5da1310ae95fded41f785f0cb3 100644 (file)
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -943,6 +943,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                         Enable debug messages at boot time.  See
                         Documentation/dynamic-debug-howto.txt for details.
  
+       nompx           [X86] Disables Intel Memory Protection Extensions.
+                       See Documentation/x86/intel_mpx.txt for more
+                       information about the feature.
+
         eagerfpu=       [X86]
                         on      enable eager fpu restore
                         off     disable eager fpu restore
@@ -1487,6 +1491,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                         By default, super page will be supported if Intel IOMMU
                         has the capability. With this option, super page will
                         not be supported.
+               ecs_off [Default Off]
+                       By default, extended context tables will be supported if
+                       the hardware advertises that it has support both for the
+                       extended tables themselves, and also PASID support. With
+                       this option set, extended tables will not be used even
+                       on hardware which claims to support them.
  
         intel_idle.max_cstate=  [KNL,HW,ACPI,X86]
                         0       disables intel_idle and fall back on acpi_idle.
@@ -2998,11 +3008,34 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                         Set maximum number of finished RCU callbacks to
                         process in one batch.
  
+       rcutree.dump_tree=      [KNL]
+                       Dump the structure of the rcu_node combining tree
+                       out at early boot.  This is used for diagnostic
+                       purposes, to verify correct tree setup.
+
+       rcutree.gp_cleanup_delay=       [KNL]
+                       Set the number of jiffies to delay each step of
+                       RCU grace-period cleanup.  This only has effect
+                       when CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP is set.
+
         rcutree.gp_init_delay=  [KNL]
                         Set the number of jiffies to delay each step of
                         RCU grace-period initialization.  This only has
-                       effect when CONFIG_RCU_TORTURE_TEST_SLOW_INIT is
-                       set.
+                       effect when CONFIG_RCU_TORTURE_TEST_SLOW_INIT
+                       is set.
+
+       rcutree.gp_preinit_delay=       [KNL]
+                       Set the number of jiffies to delay each step of
+                       RCU grace-period pre-initialization, that is,
+                       the propagation of recent CPU-hotplug changes up
+                       the rcu_node combining tree.  This only has effect
+                       when CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT is set.
+
+       rcutree.rcu_fanout_exact= [KNL]
+                       Disable autobalancing of the rcu_node combining
+                       tree.  This is used by rcutorture, and might
+                       possibly be useful for architectures having high
+                       cache-to-cache transfer latencies.
  
         rcutree.rcu_fanout_leaf= [KNL]
                         Increase the number of CPUs assigned to each
@@ -3107,7 +3140,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                         test, hence the "fake".
  
         rcutorture.nreaders= [KNL]
-                       Set number of RCU readers.
+                       Set number of RCU readers.  The value -1 selects
+                       N-1, where N is the number of CPUs.  A value
+                       "n" less than -1 selects N-n-2, where N is again
+                       the number of CPUs.  For example, -2 selects N
+                       (the number of CPUs), -3 selects N+1, and so on.
  
         rcutorture.object_debug= [KNL]
                         Enable debug-object double-call_rcu() testing.
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt

index fe4020e4b4688da0f653b7876b01ad7957746382..13feb697271f0a270334dd3807255ed14af7d7ed 100644 (file)
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -617,16 +617,16 @@ case what's actually required is:
  However, stores are not speculated.  This means that ordering -is- provided
  for load-store control dependencies, as in the following example:
  
-       q = ACCESS_ONCE(a);
+       q = READ_ONCE_CTRL(a);
         if (q) {
                 ACCESS_ONCE(b) = p;
         }
  
-Control dependencies pair normally with other types of barriers.
-That said, please note that ACCESS_ONCE() is not optional!  Without the
-ACCESS_ONCE(), might combine the load from 'a' with other loads from
-'a', and the store to 'b' with other stores to 'b', with possible highly
-counterintuitive effects on ordering.
+Control dependencies pair normally with other types of barriers.  That
+said, please note that READ_ONCE_CTRL() is not optional!  Without the
+READ_ONCE_CTRL(), the compiler might combine the load from 'a' with
+other loads from 'a', and the store to 'b' with other stores to 'b',
+with possible highly counterintuitive effects on ordering.
  
  Worse yet, if the compiler is able to prove (say) that the value of
  variable 'a' is always non-zero, it would be well within its rights
@@ -636,12 +636,15 @@ as follows:
         q = a;
         b = p;  /* BUG: Compiler and CPU can both reorder!!! */
  
-So don't leave out the ACCESS_ONCE().
+Finally, the READ_ONCE_CTRL() includes an smp_read_barrier_depends()
+that DEC Alpha needs in order to respect control dependencies.
+
+So don't leave out the READ_ONCE_CTRL().
  
  It is tempting to try to enforce ordering on identical stores on both
  branches of the "if" statement as follows:
  
-       q = ACCESS_ONCE(a);
+       q = READ_ONCE_CTRL(a);
         if (q) {
                 barrier();
                 ACCESS_ONCE(b) = p;
@@ -655,7 +658,7 @@ branches of the "if" statement as follows:
  Unfortunately, current compilers will transform this as follows at high
  optimization levels:
  
-       q = ACCESS_ONCE(a);
+       q = READ_ONCE_CTRL(a);
         barrier();
         ACCESS_ONCE(b) = p;  /* BUG: No ordering vs. load from a!!! */
         if (q) {
@@ -685,7 +688,7 @@ memory barriers, for example, smp_store_release():
  In contrast, without explicit memory barriers, two-legged-if control
  ordering is guaranteed only when the stores differ, for example:
  
-       q = ACCESS_ONCE(a);
+       q = READ_ONCE_CTRL(a);
         if (q) {
                 ACCESS_ONCE(b) = p;
                 do_something();
@@ -694,14 +697,14 @@ ordering is guaranteed only when the stores differ, for example:
                 do_something_else();
         }
  
-The initial ACCESS_ONCE() is still required to prevent the compiler from
-proving the value of 'a'.
+The initial READ_ONCE_CTRL() is still required to prevent the compiler
+from proving the value of 'a'.
  
  In addition, you need to be careful what you do with the local variable 'q',
  otherwise the compiler might be able to guess the value and again remove
  the needed conditional.  For example:
  
-       q = ACCESS_ONCE(a);
+       q = READ_ONCE_CTRL(a);
         if (q % MAX) {
                 ACCESS_ONCE(b) = p;
                 do_something();
@@ -714,7 +717,7 @@ If MAX is defined to be 1, then the compiler knows that (q % MAX) is
  equal to zero, in which case the compiler is within its rights to
  transform the above code into the following:
  
-       q = ACCESS_ONCE(a);
+       q = READ_ONCE_CTRL(a);
         ACCESS_ONCE(b) = p;
         do_something_else();
  
@@ -725,7 +728,7 @@ is gone, and the barrier won't bring it back.  Therefore, if you are
  relying on this ordering, you should make sure that MAX is greater than
  one, perhaps as follows:
  
-       q = ACCESS_ONCE(a);
+       q = READ_ONCE_CTRL(a);
         BUILD_BUG_ON(MAX <= 1); /* Order load from a with store to b. */
         if (q % MAX) {
                 ACCESS_ONCE(b) = p;
@@ -742,14 +745,15 @@ of the 'if' statement.
  You must also be careful not to rely too much on boolean short-circuit
  evaluation.  Consider this example:
  
-       q = ACCESS_ONCE(a);
+       q = READ_ONCE_CTRL(a);
         if (a || 1 > 0)
                 ACCESS_ONCE(b) = 1;
  
-Because the second condition is always true, the compiler can transform
-this example as following, defeating control dependency:
+Because the first condition cannot fault and the second condition is
+always true, the compiler can transform this example as follows,
+defeating control dependency:
  
-       q = ACCESS_ONCE(a);
+       q = READ_ONCE_CTRL(a);
         ACCESS_ONCE(b) = 1;
  
  This example underscores the need to ensure that the compiler cannot
@@ -762,8 +766,8 @@ demonstrated by two related examples, with the initial values of
  x and y both being zero:
  
         CPU 0                     CPU 1
-       =====================     =====================
-       r1 = ACCESS_ONCE(x);      r2 = ACCESS_ONCE(y);
+       =======================   =======================
+       r1 = READ_ONCE_CTRL(x);   r2 = READ_ONCE_CTRL(y);
         if (r1 > 0)               if (r2 > 0)
           ACCESS_ONCE(y) = 1;       ACCESS_ONCE(x) = 1;
  
@@ -783,7 +787,8 @@ But because control dependencies do -not- provide transitivity, the above
  assertion can fail after the combined three-CPU example completes.  If you
  need the three-CPU example to provide ordering, you will need smp_mb()
  between the loads and stores in the CPU 0 and CPU 1 code fragments,
-that is, just before or just after the "if" statements.
+that is, just before or just after the "if" statements.  Furthermore,
+the original two-CPU example is very fragile and should be avoided.
  
  These two examples are the LB and WWC litmus tests from this paper:
  http://www.cl.cam.ac.uk/users/pes20/ppc-supplemental/test6.pdf and this
@@ -791,6 +796,12 @@ site: https://www.cl.cam.ac.uk/~pes20/ppcmem/index.html.
  
  In summary:
  
+  (*) Control dependencies must be headed by READ_ONCE_CTRL().
+      Or, as a much less preferable alternative, they may
+      be headed by READ_ONCE() or an ACCESS_ONCE() read and must
+      have smp_read_barrier_depends() between this read and the
+      control-dependent write.
+
    (*) Control dependencies can order prior loads against later stores.
        However, they do -not- guarantee any other sort of ordering:
        Not prior loads against later loads, nor prior stores against
@@ -1784,10 +1795,9 @@ for each construct.  These operations all imply certain barriers:
  
       Memory operations issued before the ACQUIRE may be completed after
       the ACQUIRE operation has completed.  An smp_mb__before_spinlock(),
-     combined with a following ACQUIRE, orders prior loads against
-     subsequent loads and stores and also orders prior stores against
-     subsequent stores.  Note that this is weaker than smp_mb()!  The
-     smp_mb__before_spinlock() primitive is free on many architectures.
+     combined with a following ACQUIRE, orders prior stores against
+     subsequent loads and stores. Note that this is weaker than smp_mb()!
+     The smp_mb__before_spinlock() primitive is free on many architectures.
  
   (2) RELEASE operation implication:
  
diff --git a/Documentation/networking/udplite.txt b/Documentation/networking/udplite.txt

index d727a38291005f962848ed40a1ab11db4c167899..53a726855e49bfa4c313e46e15df1eec7cb610ae 100644 (file)
--- a/Documentation/networking/udplite.txt
+++ b/Documentation/networking/udplite.txt
@@ -20,7 +20,7 @@
         files/UDP-Lite-HOWTO.txt
  
     o The Wireshark UDP-Lite WiKi (with capture files):
-       http://wiki.wireshark.org/Lightweight_User_Datagram_Protocol
+       https://wiki.wireshark.org/Lightweight_User_Datagram_Protocol
  
     o The Protocol Spec, RFC 3828, http://www.ietf.org/rfc/rfc3828.txt
  
diff --git a/Documentation/preempt-locking.txt b/Documentation/preempt-locking.txt

index 57883ca2498bb5ce818139a73755518e78150df9..e89ce6624af2fab481a708ad1a0e4e20d1bc0c1c 100644 (file)
--- a/Documentation/preempt-locking.txt
+++ b/Documentation/preempt-locking.txt
@@ -48,7 +48,7 @@ preemption must be disabled around such regions.
  
  Note, some FPU functions are already explicitly preempt safe.  For example,
  kernel_fpu_begin and kernel_fpu_end will disable and enable preemption.
-However, math_state_restore must be called with preemption disabled.
+However, fpu__restore() must be called with preemption disabled.
  
  
  RULE #3: Lock acquire and release must be performed by same task
diff --git a/Documentation/scheduler/sched-deadline.txt b/Documentation/scheduler/sched-deadline.txt

index 21461a0441c12990d1564caffa48f9a7482aaf58..e114513a2731dc667071cee0e860d489c80049c0 100644 (file)
--- a/Documentation/scheduler/sched-deadline.txt
+++ b/Documentation/scheduler/sched-deadline.txt
@@ -8,6 +8,10 @@ CONTENTS
   1. Overview
   2. Scheduling algorithm
   3. Scheduling Real-Time Tasks
+   3.1 Definitions
+   3.2 Schedulability Analysis for Uniprocessor Systems
+   3.3 Schedulability Analysis for Multiprocessor Systems
+   3.4 Relationship with SCHED_DEADLINE Parameters
   4. Bandwidth management
     4.1 System-wide settings
     4.2 Task interface
@@ -43,7 +47,7 @@ CONTENTS
   "deadline", to schedule tasks. A SCHED_DEADLINE task should receive
   "runtime" microseconds of execution time every "period" microseconds, and
   these "runtime" microseconds are available within "deadline" microseconds
- from the beginning of the period.  In order to implement this behaviour,
+ from the beginning of the period.  In order to implement this behavior,
   every time the task wakes up, the scheduler computes a "scheduling deadline"
   consistent with the guarantee (using the CBS[2,3] algorithm). Tasks are then
   scheduled using EDF[1] on these scheduling deadlines (the task with the
@@ -52,7 +56,7 @@ CONTENTS
   "admission control" strategy (see Section "4. Bandwidth management") is used
   (clearly, if the system is overloaded this guarantee cannot be respected).
  
- Summing up, the CBS[2,3] algorithms assigns scheduling deadlines to tasks so
+ Summing up, the CBS[2,3] algorithm assigns scheduling deadlines to tasks so
   that each task runs for at most its runtime every period, avoiding any
   interference between different tasks (bandwidth isolation), while the EDF[1]
   algorithm selects the task with the earliest scheduling deadline as the one
@@ -63,7 +67,7 @@ CONTENTS
   In more details, the CBS algorithm assigns scheduling deadlines to
   tasks in the following way:
  
-  - Each SCHED_DEADLINE task is characterised by the "runtime",
+  - Each SCHED_DEADLINE task is characterized by the "runtime",
      "deadline", and "period" parameters;
  
    - The state of the task is described by a "scheduling deadline", and
@@ -78,7 +82,7 @@ CONTENTS
  
      then, if the scheduling deadline is smaller than the current time, or
      this condition is verified, the scheduling deadline and the
-    remaining runtime are re-initialised as
+    remaining runtime are re-initialized as
  
           scheduling deadline = current time + deadline
           remaining runtime = runtime
@@ -126,31 +130,37 @@ CONTENTS
   suited for periodic or sporadic real-time tasks that need guarantees on their
   timing behavior, e.g., multimedia, streaming, control applications, etc.
  
+3.1 Definitions
+------------------------
+
   A typical real-time task is composed of a repetition of computation phases
   (task instances, or jobs) which are activated on a periodic or sporadic
   fashion.
- Each job J_j (where J_j is the j^th job of the task) is characterised by an
+ Each job J_j (where J_j is the j^th job of the task) is characterized by an
   arrival time r_j (the time when the job starts), an amount of computation
   time c_j needed to finish the job, and a job absolute deadline d_j, which
   is the time within which the job should be finished. The maximum execution
- time max_j{c_j} is called "Worst Case Execution Time" (WCET) for the task.
+ time max{c_j} is called "Worst Case Execution Time" (WCET) for the task.
   A real-time task can be periodic with period P if r_{j+1} = r_j + P, or
   sporadic with minimum inter-arrival time P is r_{j+1} >= r_j + P. Finally,
   d_j = r_j + D, where D is the task's relative deadline.
- The utilisation of a real-time task is defined as the ratio between its
+ Summing up, a real-time task can be described as
+       Task = (WCET, D, P)
+
+ The utilization of a real-time task is defined as the ratio between its
   WCET and its period (or minimum inter-arrival time), and represents
   the fraction of CPU time needed to execute the task.
  
- If the total utilisation sum_i(WCET_i/P_i) is larger than M (with M equal
+ If the total utilization U=sum(WCET_i/P_i) is larger than M (with M equal
   to the number of CPUs), then the scheduler is unable to respect all the
   deadlines.
- Note that total utilisation is defined as the sum of the utilisations
+ Note that total utilization is defined as the sum of the utilizations
   WCET_i/P_i over all the real-time tasks in the system. When considering
   multiple real-time tasks, the parameters of the i-th task are indicated
   with the "_i" suffix.
- Moreover, if the total utilisation is larger than M, then we risk starving
+ Moreover, if the total utilization is larger than M, then we risk starving
   non- real-time tasks by real-time tasks.
- If, instead, the total utilisation is smaller than M, then non real-time
+ If, instead, the total utilization is smaller than M, then non real-time
   tasks will not be starved and the system might be able to respect all the
   deadlines.
   As a matter of fact, in this case it is possible to provide an upper bound
@@ -159,38 +169,119 @@ CONTENTS
   More precisely, it can be proven that using a global EDF scheduler the
   maximum tardiness of each task is smaller or equal than
         ((M − 1) · WCET_max − WCET_min)/(M − (M − 2) · U_max) + WCET_max
- where WCET_max = max_i{WCET_i} is the maximum WCET, WCET_min=min_i{WCET_i}
- is the minimum WCET, and U_max = max_i{WCET_i/P_i} is the maximum utilisation.
+ where WCET_max = max{WCET_i} is the maximum WCET, WCET_min=min{WCET_i}
+ is the minimum WCET, and U_max = max{WCET_i/P_i} is the maximum
+ utilization[12].
+
+3.2 Schedulability Analysis for Uniprocessor Systems
+------------------------
  
   If M=1 (uniprocessor system), or in case of partitioned scheduling (each
   real-time task is statically assigned to one and only one CPU), it is
   possible to formally check if all the deadlines are respected.
   If D_i = P_i for all tasks, then EDF is able to respect all the deadlines
- of all the tasks executing on a CPU if and only if the total utilisation
+ of all the tasks executing on a CPU if and only if the total utilization
   of the tasks running on such a CPU is smaller or equal than 1.
   If D_i != P_i for some task, then it is possible to define the density of
- a task as C_i/min{D_i,T_i}, and EDF is able to respect all the deadlines
- of all the tasks running on a CPU if the sum sum_i C_i/min{D_i,T_i} of the
- densities of the tasks running on such a CPU is smaller or equal than 1
- (notice that this condition is only sufficient, and not necessary).
+ a task as WCET_i/min{D_i,P_i}, and EDF is able to respect all the deadlines
+ of all the tasks running on a CPU if the sum of the densities of the tasks
+ running on such a CPU is smaller or equal than 1:
+       sum(WCET_i / min{D_i, P_i}) <= 1
+ It is important to notice that this condition is only sufficient, and not
+ necessary: there are task sets that are schedulable, but do not respect the
+ condition. For example, consider the task set {Task_1,Task_2} composed of
+ Task_1=(50ms,50ms,100ms) and Task_2=(10ms,100ms,100ms).
+ EDF is clearly able to schedule the two tasks without missing any deadline
+ (Task_1 is scheduled as soon as it is released, and finishes just in time
+ to respect its deadline; Task_2 is scheduled immediately after Task_1, hence
+ its response time cannot be larger than 50ms + 10ms = 60ms) even if
+       50 / min{50,100} + 10 / min{100, 100} = 50 / 50 + 10 / 100 = 1.1
+ Of course it is possible to test the exact schedulability of tasks with
+ D_i != P_i (checking a condition that is both sufficient and necessary),
+ but this cannot be done by comparing the total utilization or density with
+ a constant. Instead, the so called "processor demand" approach can be used,
+ computing the total amount of CPU time h(t) needed by all the tasks to
+ respect all of their deadlines in a time interval of size t, and comparing
+ such a time with the interval size t. If h(t) is smaller than t (that is,
+ the amount of time needed by the tasks in a time interval of size t is
+ smaller than the size of the interval) for all the possible values of t, then
+ EDF is able to schedule the tasks respecting all of their deadlines. Since
+ performing this check for all possible values of t is impossible, it has been
+ proven[4,5,6] that it is sufficient to perform the test for values of t
+ between 0 and a maximum value L. The cited papers contain all of the
+ mathematical details and explain how to compute h(t) and L.
+ In any case, this kind of analysis is too complex as well as too
+ time-consuming to be performed on-line. Hence, as explained in Section
+ 4, Linux uses an admission test based on the tasks' utilizations.
+
+3.3 Schedulability Analysis for Multiprocessor Systems
+------------------------
  
   On multiprocessor systems with global EDF scheduling (non partitioned
   systems), a sufficient test for schedulability can not be based on the
- utilisations (it can be shown that task sets with utilisations slightly
- larger than 1 can miss deadlines regardless of the number of CPUs M).
- However, as previously stated, enforcing that the total utilisation is smaller
- than M is enough to guarantee that non real-time tasks are not starved and
- that the tardiness of real-time tasks has an upper bound.
+ utilizations or densities: it can be shown that even if D_i = P_i task
+ sets with utilizations slightly larger than 1 can miss deadlines regardless
+ of the number of CPUs.
+
+ Consider a set {Task_1,...Task_{M+1}} of M+1 tasks on a system with M
+ CPUs, with the first task Task_1=(P,P,P) having period, relative deadline
+ and WCET equal to P. The remaining M tasks Task_i=(e,P-1,P-1) have an
+ arbitrarily small worst case execution time (indicated as "e" here) and a
+ period smaller than the one of the first task. Hence, if all the tasks
+ activate at the same time t, global EDF schedules these M tasks first
+ (because their absolute deadlines are equal to t + P - 1, hence they are
+ smaller than the absolute deadline of Task_1, which is t + P). As a
+ result, Task_1 can be scheduled only at time t + e, and will finish at
+ time t + e + P, after its absolute deadline. The total utilization of the
+ task set is U = M · e / (P - 1) + P / P = M · e / (P - 1) + 1, and for small
+ values of e this can become very close to 1. This is known as "Dhall's
+ effect"[7]. Note: the example in the original paper by Dhall has been
+ slightly simplified here (for example, Dhall more correctly computed
+ lim_{e->0}U).
+
+ More complex schedulability tests for global EDF have been developed in
+ real-time literature[8,9], but they are not based on a simple comparison
+ between total utilization (or density) and a fixed constant. If all tasks
+ have D_i = P_i, a sufficient schedulability condition can be expressed in
+ a simple way:
+       sum(WCET_i / P_i) <= M - (M - 1) · U_max
+ where U_max = max{WCET_i / P_i}[10]. Notice that for U_max = 1,
+ M - (M - 1) · U_max becomes M - M + 1 = 1 and this schedulability condition
+ just confirms the Dhall's effect. A more complete survey of the literature
+ about schedulability tests for multi-processor real-time scheduling can be
+ found in [11].
+
+ As seen, enforcing that the total utilization is smaller than M does not
+ guarantee that global EDF schedules the tasks without missing any deadline
+ (in other words, global EDF is not an optimal scheduling algorithm). However,
+ a total utilization smaller than M is enough to guarantee that non real-time
+ tasks are not starved and that the tardiness of real-time tasks has an upper
+ bound[12] (as previously noted). Different bounds on the maximum tardiness
+ experienced by real-time tasks have been developed in various papers[13,14],
+ but the theoretical result that is important for SCHED_DEADLINE is that if
+ the total utilization is smaller or equal than M then the response times of
+ the tasks are limited.
+
+3.4 Relationship with SCHED_DEADLINE Parameters
+------------------------
  
- SCHED_DEADLINE can be used to schedule real-time tasks guaranteeing that
- the jobs' deadlines of a task are respected. In order to do this, a task
- must be scheduled by setting:
+ Finally, it is important to understand the relationship between the
+ SCHED_DEADLINE scheduling parameters described in Section 2 (runtime,
+ deadline and period) and the real-time task parameters (WCET, D, P)
+ described in this section. Note that a task's temporal constraints are
+ represented by its absolute deadlines d_j = r_j + D described above, while
+ SCHED_DEADLINE schedules the tasks according to scheduling deadlines (see
+ Section 2).
+ If an admission test is used to guarantee that the scheduling deadlines
+ are respected, then SCHED_DEADLINE can be used to schedule real-time tasks
+ guaranteeing that all the jobs' deadlines of a task are respected.
+ In order to do this, a task must be scheduled by setting:
  
    - runtime >= WCET
    - deadline = D
    - period <= P
  
- IOW, if runtime >= WCET and if period is >= P, then the scheduling deadlines
+ IOW, if runtime >= WCET and if period is <= P, then the scheduling deadlines
   and the absolute deadlines (d_j) coincide, so a proper admission control
   allows to respect the jobs' absolute deadlines for this task (this is what is
   called "hard schedulability property" and is an extension of Lemma 1 of [2]).
@@ -206,6 +297,39 @@ CONTENTS
        Symposium, 1998. http://retis.sssup.it/~giorgio/paps/1998/rtss98-cbs.pdf
    3 - L. Abeni. Server Mechanisms for Multimedia Applications. ReTiS Lab
        Technical Report. http://disi.unitn.it/~abeni/tr-98-01.pdf
+  4 - J. Y. Leung and M.L. Merril. A Note on Preemptive Scheduling of
+      Periodic, Real-Time Tasks. Information Processing Letters, vol. 11,
+      no. 3, pp. 115-118, 1980.
+  5 - S. K. Baruah, A. K. Mok and L. E. Rosier. Preemptively Scheduling
+      Hard-Real-Time Sporadic Tasks on One Processor. Proceedings of the
+      11th IEEE Real-time Systems Symposium, 1990.
+  6 - S. K. Baruah, L. E. Rosier and R. R. Howell. Algorithms and Complexity
+      Concerning the Preemptive Scheduling of Periodic Real-Time tasks on
+      One Processor. Real-Time Systems Journal, vol. 4, no. 2, pp 301-324,
+      1990.
+  7 - S. J. Dhall and C. L. Liu. On a real-time scheduling problem. Operations
+      research, vol. 26, no. 1, pp 127-140, 1978.
+  8 - T. Baker. Multiprocessor EDF and Deadline Monotonic Schedulability
+      Analysis. Proceedings of the 24th IEEE Real-Time Systems Symposium, 2003.
+  9 - T. Baker. An Analysis of EDF Schedulability on a Multiprocessor.
+      IEEE Transactions on Parallel and Distributed Systems, vol. 16, no. 8,
+      pp 760-768, 2005.
+  10 - J. Goossens, S. Funk and S. Baruah, Priority-Driven Scheduling of
+       Periodic Task Systems on Multiprocessors. Real-Time Systems Journal,
+       vol. 25, no. 2–3, pp. 187–205, 2003.
+  11 - R. Davis and A. Burns. A Survey of Hard Real-Time Scheduling for
+       Multiprocessor Systems. ACM Computing Surveys, vol. 43, no. 4, 2011.
+       http://www-users.cs.york.ac.uk/~robdavis/papers/MPSurveyv5.0.pdf
+  12 - U. C. Devi and J. H. Anderson. Tardiness Bounds under Global EDF
+       Scheduling on a Multiprocessor. Real-Time Systems Journal, vol. 32,
+       no. 2, pp 133-189, 2008.
+  13 - P. Valente and G. Lipari. An Upper Bound to the Lateness of Soft
+       Real-Time Tasks Scheduled by EDF on Multiprocessors. Proceedings of
+       the 26th IEEE Real-Time Systems Symposium, 2005.
+  14 - J. Erickson, U. Devi and S. Baruah. Improved tardiness bounds for
+       Global EDF. Proceedings of the 22nd Euromicro Conference on
+       Real-Time Systems, 2010.
+
  
  4. Bandwidth management
  =======================
@@ -218,10 +342,10 @@ CONTENTS
   no guarantee can be given on the actual scheduling of the -deadline tasks.
  
   As already stated in Section 3, a necessary condition to be respected to
- correctly schedule a set of real-time tasks is that the total utilisation
+ correctly schedule a set of real-time tasks is that the total utilization
   is smaller than M. When talking about -deadline tasks, this requires that
   the sum of the ratio between runtime and period for all tasks is smaller
- than M. Notice that the ratio runtime/period is equivalent to the utilisation
+ than M. Notice that the ratio runtime/period is equivalent to the utilization
   of a "traditional" real-time task, and is also often referred to as
   "bandwidth".
   The interface used to control the CPU bandwidth that can be allocated
@@ -251,7 +375,7 @@ CONTENTS
   The system wide settings are configured under the /proc virtual file system.
  
   For now the -rt knobs are used for -deadline admission control and the
- -deadline runtime is accounted against the -rt runtime. We realise that this
+ -deadline runtime is accounted against the -rt runtime. We realize that this
   isn't entirely desirable; however, it is better to have a small interface for
   now, and be able to change it easily later. The ideal situation (see 5.) is to
   run -rt tasks from a -deadline server; in which case the -rt bandwidth is a
diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt

index 88b85899d30953a6be096a9e045fcf54be3676b4..7c1f9fad667460ff143b867ca5ce5d629560e876 100644 (file)
--- a/Documentation/x86/boot.txt
+++ b/Documentation/x86/boot.txt
@@ -1124,7 +1124,6 @@ The boot loader *must* fill out the following fields in bp,
  
      o hdr.code32_start
      o hdr.cmd_line_ptr
-    o hdr.cmdline_size
      o hdr.ramdisk_image (if applicable)
      o hdr.ramdisk_size  (if applicable)
  
diff --git a/Documentation/x86/kernel-stacks b/Documentation/x86/kernel-stacks

new file mode 100644 (file)

index 0000000..0f3a6c2
--- /dev/null
+++ b/Documentation/x86/kernel-stacks
@@ -0,0 +1,141 @@
+Kernel stacks on x86-64 bit
+---------------------------
+
+Most of the text from Keith Owens, hacked by AK
+
+x86_64 page size (PAGE_SIZE) is 4K.
+
+Like all other architectures, x86_64 has a kernel stack for every
+active thread.  These thread stacks are THREAD_SIZE (2*PAGE_SIZE) big.
+These stacks contain useful data as long as a thread is alive or a
+zombie. While the thread is in user space the kernel stack is empty
+except for the thread_info structure at the bottom.
+
+In addition to the per thread stacks, there are specialized stacks
+associated with each CPU.  These stacks are only used while the kernel
+is in control on that CPU; when a CPU returns to user space the
+specialized stacks contain no useful data.  The main CPU stacks are:
+
+* Interrupt stack.  IRQSTACKSIZE
+
+  Used for external hardware interrupts.  If this is the first external
+  hardware interrupt (i.e. not a nested hardware interrupt) then the
+  kernel switches from the current task to the interrupt stack.  Like
+  the split thread and interrupt stacks on i386, this gives more room
+  for kernel interrupt processing without having to increase the size
+  of every per thread stack.
+
+  The interrupt stack is also used when processing a softirq.
+
+Switching to the kernel interrupt stack is done by software based on a
+per CPU interrupt nest counter. This is needed because x86-64 "IST"
+hardware stacks cannot nest without races.
+
+x86_64 also has a feature which is not available on i386, the ability
+to automatically switch to a new stack for designated events such as
+double fault or NMI, which makes it easier to handle these unusual
+events on x86_64.  This feature is called the Interrupt Stack Table
+(IST).  There can be up to 7 IST entries per CPU. The IST code is an
+index into the Task State Segment (TSS). The IST entries in the TSS
+point to dedicated stacks; each stack can be a different size.
+
+An IST is selected by a non-zero value in the IST field of an
+interrupt-gate descriptor.  When an interrupt occurs and the hardware
+loads such a descriptor, the hardware automatically sets the new stack
+pointer based on the IST value, then invokes the interrupt handler.  If
+the interrupt came from user mode, then the interrupt handler prologue
+will switch back to the per-thread stack.  If software wants to allow
+nested IST interrupts then the handler must adjust the IST values on
+entry to and exit from the interrupt handler.  (This is occasionally
+done, e.g. for debug exceptions.)
+
+Events with different IST codes (i.e. with different stacks) can be
+nested.  For example, a debug interrupt can safely be interrupted by an
+NMI.  arch/x86_64/kernel/entry.S::paranoidentry adjusts the stack
+pointers on entry to and exit from all IST events, in theory allowing
+IST events with the same code to be nested.  However in most cases, the
+stack size allocated to an IST assumes no nesting for the same code.
+If that assumption is ever broken then the stacks will become corrupt.
+
+The currently assigned IST stacks are :-
+
+* DOUBLEFAULT_STACK.  EXCEPTION_STKSZ (PAGE_SIZE).
+
+  Used for interrupt 8 - Double Fault Exception (#DF).
+
+  Invoked when handling one exception causes another exception. Happens
+  when the kernel is very confused (e.g. kernel stack pointer corrupt).
+  Using a separate stack allows the kernel to recover from it well enough
+  in many cases to still output an oops.
+
+* NMI_STACK.  EXCEPTION_STKSZ (PAGE_SIZE).
+
+  Used for non-maskable interrupts (NMI).
+
+  NMI can be delivered at any time, including when the kernel is in the
+  middle of switching stacks.  Using IST for NMI events avoids making
+  assumptions about the previous state of the kernel stack.
+
+* DEBUG_STACK.  DEBUG_STKSZ
+
+  Used for hardware debug interrupts (interrupt 1) and for software
+  debug interrupts (INT3).
+
+  When debugging a kernel, debug interrupts (both hardware and
+  software) can occur at any time.  Using IST for these interrupts
+  avoids making assumptions about the previous state of the kernel
+  stack.
+
+* MCE_STACK.  EXCEPTION_STKSZ (PAGE_SIZE).
+
+  Used for interrupt 18 - Machine Check Exception (#MC).
+
+  MCE can be delivered at any time, including when the kernel is in the
+  middle of switching stacks.  Using IST for MCE events avoids making
+  assumptions about the previous state of the kernel stack.
+
+For more details see the Intel IA32 or AMD AMD64 architecture manuals.
+
+
+Printing backtraces on x86
+--------------------------
+
+The question about the '?' preceding function names in an x86 stacktrace
+keeps popping up; here's an in-depth explanation. It helps if the reader
+stares at print_context_stack() and the whole machinery in and around
+arch/x86/kernel/dumpstack.c.
+
+Adapted from Ingo's mail, Message-ID: <20150521101614.GA10889@gmail.com>:
+
+We always scan the full kernel stack for return addresses stored on
+the kernel stack(s) [*], from stack top to stack bottom, and print out
+anything that 'looks like' a kernel text address.
+
+If it fits into the frame pointer chain, we print it without a question
+mark, knowing that it's part of the real backtrace.
+
+If the address does not fit into our expected frame pointer chain we
+still print it, but we print a '?'. It can mean two things:
+
+ - either the address is not part of the call chain: it's just stale
+   values on the kernel stack, from earlier function calls. This is
+   the common case.
+
+ - or it is part of the call chain, but the frame pointer was not set
+   up properly within the function, so we don't recognize it.
+
+This way we will always print out the real call chain (plus a few more
+entries), regardless of whether the frame pointer was set up correctly
+or not - but in most cases we'll get the call chain right as well. The
+entries printed are strictly in stack order, so you can deduce more
+information from that as well.
+
+The most important property of this method is that we _never_ lose
+information: we always strive to print _all_ addresses on the stack(s)
+that look like kernel text addresses, so if debug information is wrong,
+we still print out the real call chain as well - just with more question
+marks than ideal.
+
+[*] For things like IRQ and IST stacks, we also scan those stacks, in
+    the right order, and try to cross from one stack into another
+    reconstructing the call chain. This works most of the time.
diff --git a/Documentation/x86/x86_64/kernel-stacks b/Documentation/x86/x86_64/kernel-stacks

deleted file mode 100644 (file)

index e3c8a49..0000000
--- a/Documentation/x86/x86_64/kernel-stacks
+++ /dev/null
@@ -1,101 +0,0 @@
-Most of the text from Keith Owens, hacked by AK
-
-x86_64 page size (PAGE_SIZE) is 4K.
-
-Like all other architectures, x86_64 has a kernel stack for every
-active thread.  These thread stacks are THREAD_SIZE (2*PAGE_SIZE) big.
-These stacks contain useful data as long as a thread is alive or a
-zombie. While the thread is in user space the kernel stack is empty
-except for the thread_info structure at the bottom.
-
-In addition to the per thread stacks, there are specialized stacks
-associated with each CPU.  These stacks are only used while the kernel
-is in control on that CPU; when a CPU returns to user space the
-specialized stacks contain no useful data.  The main CPU stacks are:
-
-* Interrupt stack.  IRQSTACKSIZE
-
-  Used for external hardware interrupts.  If this is the first external
-  hardware interrupt (i.e. not a nested hardware interrupt) then the
-  kernel switches from the current task to the interrupt stack.  Like
-  the split thread and interrupt stacks on i386, this gives more room
-  for kernel interrupt processing without having to increase the size
-  of every per thread stack.
-
-  The interrupt stack is also used when processing a softirq.
-
-Switching to the kernel interrupt stack is done by software based on a
-per CPU interrupt nest counter. This is needed because x86-64 "IST"
-hardware stacks cannot nest without races.
-
-x86_64 also has a feature which is not available on i386, the ability
-to automatically switch to a new stack for designated events such as
-double fault or NMI, which makes it easier to handle these unusual
-events on x86_64.  This feature is called the Interrupt Stack Table
-(IST).  There can be up to 7 IST entries per CPU. The IST code is an
-index into the Task State Segment (TSS). The IST entries in the TSS
-point to dedicated stacks; each stack can be a different size.
-
-An IST is selected by a non-zero value in the IST field of an
-interrupt-gate descriptor.  When an interrupt occurs and the hardware
-loads such a descriptor, the hardware automatically sets the new stack
-pointer based on the IST value, then invokes the interrupt handler.  If
-the interrupt came from user mode, then the interrupt handler prologue
-will switch back to the per-thread stack.  If software wants to allow
-nested IST interrupts then the handler must adjust the IST values on
-entry to and exit from the interrupt handler.  (This is occasionally
-done, e.g. for debug exceptions.)
-
-Events with different IST codes (i.e. with different stacks) can be
-nested.  For example, a debug interrupt can safely be interrupted by an
-NMI.  arch/x86_64/kernel/entry.S::paranoidentry adjusts the stack
-pointers on entry to and exit from all IST events, in theory allowing
-IST events with the same code to be nested.  However in most cases, the
-stack size allocated to an IST assumes no nesting for the same code.
-If that assumption is ever broken then the stacks will become corrupt.
-
-The currently assigned IST stacks are :-
-
-* STACKFAULT_STACK.  EXCEPTION_STKSZ (PAGE_SIZE).
-
-  Used for interrupt 12 - Stack Fault Exception (#SS).
-
-  This allows the CPU to recover from invalid stack segments. Rarely
-  happens.
-
-* DOUBLEFAULT_STACK.  EXCEPTION_STKSZ (PAGE_SIZE).
-
-  Used for interrupt 8 - Double Fault Exception (#DF).
-
-  Invoked when handling one exception causes another exception. Happens
-  when the kernel is very confused (e.g. kernel stack pointer corrupt).
-  Using a separate stack allows the kernel to recover from it well enough
-  in many cases to still output an oops.
-
-* NMI_STACK.  EXCEPTION_STKSZ (PAGE_SIZE).
-
-  Used for non-maskable interrupts (NMI).
-
-  NMI can be delivered at any time, including when the kernel is in the
-  middle of switching stacks.  Using IST for NMI events avoids making
-  assumptions about the previous state of the kernel stack.
-
-* DEBUG_STACK.  DEBUG_STKSZ
-
-  Used for hardware debug interrupts (interrupt 1) and for software
-  debug interrupts (INT3).
-
-  When debugging a kernel, debug interrupts (both hardware and
-  software) can occur at any time.  Using IST for these interrupts
-  avoids making assumptions about the previous state of the kernel
-  stack.
-
-* MCE_STACK.  EXCEPTION_STKSZ (PAGE_SIZE).
-
-  Used for interrupt 18 - Machine Check Exception (#MC).
-
-  MCE can be delivered at any time, including when the kernel is in the
-  middle of switching stacks.  Using IST for MCE events avoids making
-  assumptions about the previous state of the kernel stack.
-
-For more details see the Intel IA32 or AMD AMD64 architecture manuals.
diff --git a/MAINTAINERS b/MAINTAINERS

index 2987968e235c5d4feb58b02616cd3864f2d8e92f..a655435705aa47563a630a3736d486033eae4e7b 100644 (file)
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -51,9 +51,9 @@ trivial patch so apply some common sense.
         or does something very odd once a month document it.
  
         PLEASE remember that submissions must be made under the terms
-       of the OSDL certificate of contribution and should include a
-       Signed-off-by: line.  The current version of this "Developer's
-       Certificate of Origin" (DCO) is listed in the file
+       of the Linux Foundation certificate of contribution and should
+       include a Signed-off-by: line.  The current version of this
+       "Developer's Certificate of Origin" (DCO) is listed in the file
         Documentation/SubmittingPatches.
  
  6.     Make sure you have the right to send any changes you make. If you
@@ -7575,6 +7575,7 @@ F:        drivers/pci/host/pci-exynos.c
  
  PCI DRIVER FOR SYNOPSIS DESIGNWARE
  M:     Jingoo Han <jingoohan1@gmail.com>
+M:     Pratyush Anand <pratyush.anand@gmail.com>
  L:     linux-pci@vger.kernel.org
  S:     Maintained
  F:     drivers/pci/host/*designware*
@@ -7588,8 +7589,9 @@ F:        Documentation/devicetree/bindings/pci/host-generic-pci.txt
  F:     drivers/pci/host/pci-host-generic.c
  
  PCIE DRIVER FOR ST SPEAR13XX
+M:     Pratyush Anand <pratyush.anand@gmail.com>
  L:     linux-pci@vger.kernel.org
-S:     Orphan
+S:     Maintained
  F:     drivers/pci/host/*spear*
  
  PCMCIA SUBSYSTEM
@@ -7632,7 +7634,6 @@ F:        kernel/delayacct.c
  
  PERFORMANCE EVENTS SUBSYSTEM
  M:     Peter Zijlstra <a.p.zijlstra@chello.nl>
-M:     Paul Mackerras <paulus@samba.org>
  M:     Ingo Molnar <mingo@redhat.com>
  M:     Arnaldo Carvalho de Melo <acme@kernel.org>
  L:     linux-kernel@vger.kernel.org
diff --git a/Makefile b/Makefile

index aee7e5cb4c151653fd00397eb90bba764e2b7c38..6c6f14628f329d0ba10f5632fb362c818c437ff5 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
  VERSION = 4
  PATCHLEVEL = 1
  SUBLEVEL = 0
-EXTRAVERSION = -rc6
+EXTRAVERSION =
  NAME = Hurr durr I'ma sheep
  
  # *DOCUMENTATION*
@@ -215,7 +215,6 @@ VPATH               := $(srctree)$(if $(KBUILD_EXTMOD),:$(KBUILD_EXTMOD))
  
  export srctree objtree VPATH
  
-
  # SUBARCH tells the usermode build what the underlying arch is.  That is set
  # first, and if a usermode build is happening, the "ARCH=um" on the command
  # line overrides the setting of ARCH below.  If a native build is happening,
@@ -1497,11 +1496,11 @@ image_name:
  # Clear a bunch of variables before executing the submake
  tools/: FORCE
         $(Q)mkdir -p $(objtree)/tools
-       $(Q)$(MAKE) LDFLAGS= MAKEFLAGS="$(filter --j% -j,$(MAKEFLAGS))" O=$(objtree) subdir=tools -C $(src)/tools/
+       $(Q)$(MAKE) LDFLAGS= MAKEFLAGS="$(filter --j% -j,$(MAKEFLAGS))" O=$(O) subdir=tools -C $(src)/tools/
  
  tools/%: FORCE
         $(Q)mkdir -p $(objtree)/tools
-       $(Q)$(MAKE) LDFLAGS= MAKEFLAGS="$(filter --j% -j,$(MAKEFLAGS))" O=$(objtree) subdir=tools -C $(src)/tools/ $*
+       $(Q)$(MAKE) LDFLAGS= MAKEFLAGS="$(filter --j% -j,$(MAKEFLAGS))" O=$(O) subdir=tools -C $(src)/tools/ $*
  
  # Single targets
  # ---------------------------------------------------------------------------
diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c

index 9d0ac091a52a7d16cf1f78f402ab48c511924a24..4a905bd667e2ef71542e2585469404478860bce5 100644 (file)
--- a/arch/alpha/mm/fault.c
+++ b/arch/alpha/mm/fault.c
@@ -23,8 +23,7 @@
  #include <linux/smp.h>
  #include <linux/interrupt.h>
  #include <linux/module.h>
-
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
  
  extern void die_if_kernel(char *,struct pt_regs *,long, unsigned long *);
  
@@ -107,7 +106,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr,
  
         /* If we're in an interrupt context, or have no user context,
            we must not take the fault.  */
-       if (!mm || in_atomic())
+       if (!mm || faulthandler_disabled())
                 goto no_context;
  
  #ifdef CONFIG_ALPHA_LARGE_VMALLOC
diff --git a/arch/arc/include/asm/futex.h b/arch/arc/include/asm/futex.h

index 4dc64ddebecebe354ef90b14fe28bc8804569a52..05b5aaf5b0f91e5580395e08ae778f5ddace5b3c 100644 (file)
--- a/arch/arc/include/asm/futex.h
+++ b/arch/arc/include/asm/futex.h
@@ -53,7 +53,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
         if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
                 return -EFAULT;
  
-       pagefault_disable();    /* implies preempt_disable() */
+       pagefault_disable();
  
         switch (op) {
         case FUTEX_OP_SET:
@@ -75,7 +75,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
                 ret = -ENOSYS;
         }
  
-       pagefault_enable();     /* subsumes preempt_enable() */
+       pagefault_enable();
  
         if (!ret) {
                 switch (cmp) {
@@ -104,7 +104,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
         return ret;
  }
  
-/* Compare-xchg with preemption disabled.
+/* Compare-xchg with pagefaults disabled.
   *  Notes:
   *      -Best-Effort: Exchg happens only if compare succeeds.
   *          If compare fails, returns; leaving retry/looping to upper layers
@@ -121,7 +121,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval,
         if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
                 return -EFAULT;
  
-       pagefault_disable();    /* implies preempt_disable() */
+       pagefault_disable();
  
         /* TBD : can use llock/scond */
         __asm__ __volatile__(
@@ -142,7 +142,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval,
         : "r"(oldval), "r"(newval), "r"(uaddr), "ir"(-EFAULT)
         : "cc", "memory");
  
-       pagefault_enable();     /* subsumes preempt_enable() */
+       pagefault_enable();
  
         *uval = val;
         return val;
diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c

index 6a2e006cbcce1f1cd69866e0f0f9f94463d73dcb..d948e4e9d89c4ebe7e5676f449c9377b4fbe3535 100644 (file)
--- a/arch/arc/mm/fault.c
+++ b/arch/arc/mm/fault.c
@@ -86,7 +86,7 @@ void do_page_fault(unsigned long address, struct pt_regs *regs)
          * If we're in an interrupt or have no user
          * context, we must not take the fault..
          */
-       if (in_atomic() || !mm)
+       if (faulthandler_disabled() || !mm)
                 goto no_context;
  
         if (user_mode(regs))
diff --git a/arch/arm/boot/dts/am335x-bone-common.dtsi b/arch/arm/boot/dts/am335x-bone-common.dtsi

index c3255e0c90aa829fc792f02d1265d413f3c6e624..dbb3f4d2bf84ebf4565555949053c94619ea161d 100644 (file)
--- a/arch/arm/boot/dts/am335x-bone-common.dtsi
+++ b/arch/arm/boot/dts/am335x-bone-common.dtsi
@@ -223,6 +223,25 @@
  /include/ "tps65217.dtsi"
  
  &tps {
+       /*
+        * Configure pmic to enter OFF-state instead of SLEEP-state ("RTC-only
+        * mode") at poweroff.  Most BeagleBone versions do not support RTC-only
+        * mode and risk hardware damage if this mode is entered.
+        *
+        * For details, see linux-omap mailing list May 2015 thread
+        *      [PATCH] ARM: dts: am335x-bone* enable pmic-shutdown-controller
+        * In particular, messages:
+        *      http://www.spinics.net/lists/linux-omap/msg118585.html
+        *      http://www.spinics.net/lists/linux-omap/msg118615.html
+        *
+        * You can override this later with
+        *      &tps {  /delete-property/ ti,pmic-shutdown-controller;  }
+        * if you want to use RTC-only mode and have made sure you are not
+        * affected by the hardware problems. (Tip: double-check by performing
+        * a current measurement after shutdown: it should be less than 1 mA.)
+        */
+       ti,pmic-shutdown-controller;
+
         regulators {
                 dcdc1_reg: regulator@0 {
                         regulator-name = "vdds_dpr";
diff --git a/arch/arm/boot/dts/am35xx-clocks.dtsi b/arch/arm/boot/dts/am35xx-clocks.dtsi

index 518b8fde88b0c87005fe68e413cfee769befac07..18cc826e9db534714a1b4d8a3cfc497e43ffcc85 100644 (file)
--- a/arch/arm/boot/dts/am35xx-clocks.dtsi
+++ b/arch/arm/boot/dts/am35xx-clocks.dtsi
@@ -12,7 +12,7 @@
                 #clock-cells = <0>;
                 compatible = "ti,am35xx-gate-clock";
                 clocks = <&ipss_ick>;
-               reg = <0x059c>;
+               reg = <0x032c>;
                 ti,bit-shift = <1>;
         };
  
@@ -20,7 +20,7 @@
                 #clock-cells = <0>;
                 compatible = "ti,gate-clock";
                 clocks = <&rmii_ck>;
-               reg = <0x059c>;
+               reg = <0x032c>;
                 ti,bit-shift = <9>;
         };
  
@@ -28,7 +28,7 @@
                 #clock-cells = <0>;
                 compatible = "ti,am35xx-gate-clock";
                 clocks = <&ipss_ick>;
-               reg = <0x059c>;
+               reg = <0x032c>;
                 ti,bit-shift = <2>;
         };
  
@@ -36,7 +36,7 @@
                 #clock-cells = <0>;
                 compatible = "ti,gate-clock";
                 clocks = <&pclk_ck>;
-               reg = <0x059c>;
+               reg = <0x032c>;
                 ti,bit-shift = <10>;
         };
  
@@ -44,7 +44,7 @@
                 #clock-cells = <0>;
                 compatible = "ti,am35xx-gate-clock";
                 clocks = <&ipss_ick>;
-               reg = <0x059c>;
+               reg = <0x032c>;
                 ti,bit-shift = <0>;
         };
  
@@ -52,7 +52,7 @@
                 #clock-cells = <0>;
                 compatible = "ti,gate-clock";
                 clocks = <&sys_ck>;
-               reg = <0x059c>;
+               reg = <0x032c>;
                 ti,bit-shift = <8>;
         };
  
@@ -60,7 +60,7 @@
                 #clock-cells = <0>;
                 compatible = "ti,am35xx-gate-clock";
                 clocks = <&sys_ck>;
-               reg = <0x059c>;
+               reg = <0x032c>;
                 ti,bit-shift = <3>;
         };
  };
diff --git a/arch/arm/boot/dts/armada-xp-linksys-mamba.dts b/arch/arm/boot/dts/armada-xp-linksys-mamba.dts

index a2cf2154dcdb68d8374c2bea4b136fccaccb7aa2..fdd187c55aa5f78b5ab61d15dc12c1ad001990d2 100644 (file)
--- a/arch/arm/boot/dts/armada-xp-linksys-mamba.dts
+++ b/arch/arm/boot/dts/armada-xp-linksys-mamba.dts
@@ -95,6 +95,11 @@
  
                 internal-regs {
  
+                       rtc@10300 {
+                               /* No crystal connected to the internal RTC */
+                               status = "disabled";
+                       };
+
                         /* J10: VCC, NC, RX, NC, TX, GND  */
                         serial@12000 {
                                 status = "okay";
diff --git a/arch/arm/boot/dts/dm816x.dtsi b/arch/arm/boot/dts/dm816x.dtsi

index de8427be830a32e24a01ace97f11303435528b7b..289806adb343806aefce22e63b6caa1d558741fb 100644 (file)
--- a/arch/arm/boot/dts/dm816x.dtsi
+++ b/arch/arm/boot/dts/dm816x.dtsi
@@ -382,7 +382,7 @@
                         ti,hwmods = "usb_otg_hs";
  
                         usb0: usb@47401000 {
-                               compatible = "ti,musb-am33xx";
+                               compatible = "ti,musb-dm816";
                                 reg = <0x47401400 0x400
                                        0x47401000 0x200>;
                                 reg-names = "mc", "control";
@@ -422,7 +422,7 @@
                         };
  
                         usb1: usb@47401800 {
-                               compatible = "ti,musb-am33xx";
+                               compatible = "ti,musb-dm816";
                                 reg = <0x47401c00 0x400
                                        0x47401800 0x200>;
                                 reg-names = "mc", "control";
diff --git a/arch/arm/boot/dts/omap3-n900.dts b/arch/arm/boot/dts/omap3-n900.dts

index 5c16145920eafd9604f0571dfe60a23986313871..5f5e0f3d5b64fcb2283f72b9df923c793be7f75c 100644 (file)
--- a/arch/arm/boot/dts/omap3-n900.dts
+++ b/arch/arm/boot/dts/omap3-n900.dts
@@ -832,8 +832,8 @@
                 touchscreen-fuzz-x = <4>;
                 touchscreen-fuzz-y = <7>;
                 touchscreen-fuzz-pressure = <2>;
-               touchscreen-max-x = <4096>;
-               touchscreen-max-y = <4096>;
+               touchscreen-size-x = <4096>;
+               touchscreen-size-y = <4096>;
                 touchscreen-max-pressure = <2048>;
  
                 ti,x-plate-ohms = <280>;
diff --git a/arch/arm/include/asm/futex.h b/arch/arm/include/asm/futex.h

index 4e78065a16aa3c6a3dae5db147f36fda00dcf18a..5eed82809d82b7aa9c74670fd9c9624bbd930803 100644 (file)
--- a/arch/arm/include/asm/futex.h
+++ b/arch/arm/include/asm/futex.h
@@ -93,6 +93,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
         if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
                 return -EFAULT;
  
+       preempt_disable();
         __asm__ __volatile__("@futex_atomic_cmpxchg_inatomic\n"
         "1:     " TUSER(ldr) "  %1, [%4]\n"
         "       teq     %1, %2\n"
@@ -104,6 +105,8 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
         : "cc", "memory");
  
         *uval = val;
+       preempt_enable();
+
         return ret;
  }
  
@@ -124,7 +127,10 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
         if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
                 return -EFAULT;
  
-       pagefault_disable();    /* implies preempt_disable() */
+#ifndef CONFIG_SMP
+       preempt_disable();
+#endif
+       pagefault_disable();
  
         switch (op) {
         case FUTEX_OP_SET:
@@ -146,7 +152,10 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
                 ret = -ENOSYS;
         }
  
-       pagefault_enable();     /* subsumes preempt_enable() */
+       pagefault_enable();
+#ifndef CONFIG_SMP
+       preempt_enable();
+#endif
  
         if (!ret) {
                 switch (cmp) {
diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h

index 2fe85fff5ccacd3dfe67d72c4b6fd5d65342e5e9..370f7a732900ae12e8831e6f3ce7390d16455fc8 100644 (file)
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -18,7 +18,7 @@ extern struct cputopo_arm cpu_topology[NR_CPUS];
  #define topology_physical_package_id(cpu)      (cpu_topology[cpu].socket_id)
  #define topology_core_id(cpu)          (cpu_topology[cpu].core_id)
  #define topology_core_cpumask(cpu)     (&cpu_topology[cpu].core_sibling)
-#define topology_thread_cpumask(cpu)   (&cpu_topology[cpu].thread_sibling)
+#define topology_sibling_cpumask(cpu)  (&cpu_topology[cpu].thread_sibling)
  
  void init_cpu_topology(void);
  void store_cpu_topology(unsigned int cpuid);
diff --git a/arch/arm/mach-exynos/suspend.c b/arch/arm/mach-exynos/suspend.c

index c0b6dccbf7bd5d8d14c05172d11d6ee690bf8fef..7d23ce04cad5201919a58aefccafc88e882dd844 100644 (file)
--- a/arch/arm/mach-exynos/suspend.c
+++ b/arch/arm/mach-exynos/suspend.c
@@ -87,8 +87,8 @@ static unsigned int exynos_pmu_spare3;
  static u32 exynos_irqwake_intmask = 0xffffffff;
  
  static const struct exynos_wkup_irq exynos3250_wkup_irq[] = {
-       { 105, BIT(1) }, /* RTC alarm */
-       { 106, BIT(2) }, /* RTC tick */
+       { 73, BIT(1) }, /* RTC alarm */
+       { 74, BIT(2) }, /* RTC tick */
         { /* sentinel */ },
  };
  
diff --git a/arch/arm/mach-omap2/sleep34xx.S b/arch/arm/mach-omap2/sleep34xx.S

index d1dedc8195ed2569508e0d522301bdf535aefda8..eafd120b53f1bc15c82f2cc47dc8033e31ca566e 100644 (file)
--- a/arch/arm/mach-omap2/sleep34xx.S
+++ b/arch/arm/mach-omap2/sleep34xx.S
@@ -203,23 +203,8 @@ save_context_wfi:
          */
         ldr     r1, kernel_flush
         blx     r1
-       /*
-        * The kernel doesn't interwork: v7_flush_dcache_all in particluar will
-        * always return in Thumb state when CONFIG_THUMB2_KERNEL is enabled.
-        * This sequence switches back to ARM.  Note that .align may insert a
-        * nop: bx pc needs to be word-aligned in order to work.
-        */
- THUMB(        .thumb          )
- THUMB(        .align          )
- THUMB(        bx      pc      )
- THUMB(        nop             )
-       .arm
-
         b       omap3_do_wfi
-
-/*
- * Local variables
- */
+ENDPROC(omap34xx_cpu_suspend)
  omap3_do_wfi_sram_addr:
         .word omap3_do_wfi_sram
  kernel_flush:
@@ -364,10 +349,7 @@ exit_nonoff_modes:
   * ===================================
   */
         ldmfd   sp!, {r4 - r11, pc}     @ restore regs and return
-
-/*
- * Local variables
- */
+ENDPROC(omap3_do_wfi)
  sdrc_power:
         .word   SDRC_POWER_V
  cm_idlest1_core:
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c

index 6333d9c178757fe4f365b8e765b2a9ea75e2b80b..0d629b8f973fc2ca63aacb59e5baaf718b194543 100644 (file)
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -276,7 +276,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
          * If we're in an interrupt or have no user
          * context, we must not take the fault..
          */
-       if (in_atomic() || !mm)
+       if (faulthandler_disabled() || !mm)
                 goto no_context;
  
         if (user_mode(regs))
diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c

index b98895d9fe57cc4bde62392a3f7144b9ab23c70f..ee8dfa793989785488a306a9edd8b7899f3f1f3b 100644 (file)
--- a/arch/arm/mm/highmem.c
+++ b/arch/arm/mm/highmem.c
@@ -59,6 +59,7 @@ void *kmap_atomic(struct page *page)
         void *kmap;
         int type;
  
+       preempt_disable();
         pagefault_disable();
         if (!PageHighMem(page))
                 return page_address(page);
@@ -121,6 +122,7 @@ void __kunmap_atomic(void *kvaddr)
                 kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)]));
         }
         pagefault_enable();
+       preempt_enable();
  }
  EXPORT_SYMBOL(__kunmap_atomic);
  
@@ -130,6 +132,7 @@ void *kmap_atomic_pfn(unsigned long pfn)
         int idx, type;
         struct page *page = pfn_to_page(pfn);
  
+       preempt_disable();
         pagefault_disable();
         if (!PageHighMem(page))
                 return page_address(page);
diff --git a/arch/arm64/boot/dts/mediatek/mt8173-evb.dts b/arch/arm64/boot/dts/mediatek/mt8173-evb.dts

index 43d54017b779d4e211462b8bebe2604025bb08ea..d0ab012fa379eb97c6e43ebad83ee18185d2b598 100644 (file)
--- a/arch/arm64/boot/dts/mediatek/mt8173-evb.dts
+++ b/arch/arm64/boot/dts/mediatek/mt8173-evb.dts
@@ -16,7 +16,8 @@
  #include "mt8173.dtsi"
  
  / {
-       model = "mediatek,mt8173-evb";
+       model = "MediaTek MT8173 evaluation board";
+       compatible = "mediatek,mt8173-evb", "mediatek,mt8173";
  
         aliases {
                 serial0 = &uart0;
diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h

index 5f750dc96e0fd64123851ac787659f5953bc71e5..74069b3bd919c7ff3c5722f9f2c1dce212abd3fd 100644 (file)
--- a/arch/arm64/include/asm/futex.h
+++ b/arch/arm64/include/asm/futex.h
@@ -58,7 +58,7 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
         if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
                 return -EFAULT;
  
-       pagefault_disable();    /* implies preempt_disable() */
+       pagefault_disable();
  
         switch (op) {
         case FUTEX_OP_SET:
@@ -85,7 +85,7 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
                 ret = -ENOSYS;
         }
  
-       pagefault_enable();     /* subsumes preempt_enable() */
+       pagefault_enable();
  
         if (!ret) {
                 switch (cmp) {
diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h

index 7ebcd31ce51cae9be1030f706e5ecb03a7af7e2c..225ec3524fbfc6ed29ae1d8d1d5041569a886089 100644 (file)
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
@@ -18,7 +18,7 @@ extern struct cpu_topology cpu_topology[NR_CPUS];
  #define topology_physical_package_id(cpu)      (cpu_topology[cpu].cluster_id)
  #define topology_core_id(cpu)          (cpu_topology[cpu].core_id)
  #define topology_core_cpumask(cpu)     (&cpu_topology[cpu].core_sibling)
-#define topology_thread_cpumask(cpu)   (&cpu_topology[cpu].thread_sibling)
+#define topology_sibling_cpumask(cpu)  (&cpu_topology[cpu].thread_sibling)
  
  void init_cpu_topology(void);
  void store_cpu_topology(unsigned int cpuid);
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c

index 96da13167d4a5c77564952a6d3a0fdce35d6580f..0948d327d013651c7b9978023139ea9cd89ecaeb 100644 (file)
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -211,7 +211,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
          * If we're in an interrupt or have no user context, we must not take
          * the fault.
          */
-       if (in_atomic() || !mm)
+       if (faulthandler_disabled() || !mm)
                 goto no_context;
  
         if (user_mode(regs))
diff --git a/arch/avr32/include/asm/uaccess.h b/arch/avr32/include/asm/uaccess.h

index a46f7cf3e1eab23d4cdfc224d21fe571917ef413..68cf638faf4867aef2d92b91f9ab48770100f132 100644 (file)
--- a/arch/avr32/include/asm/uaccess.h
+++ b/arch/avr32/include/asm/uaccess.h
@@ -97,7 +97,8 @@ static inline __kernel_size_t __copy_from_user(void *to,
   * @x:   Value to copy to user space.
   * @ptr: Destination address, in user space.
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * This macro copies a single simple value from kernel space to user
   * space.  It supports simple types like char and int, but not larger
@@ -116,7 +117,8 @@ static inline __kernel_size_t __copy_from_user(void *to,
   * @x:   Variable to store result.
   * @ptr: Source address, in user space.
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * This macro copies a single simple variable from user space to kernel
   * space.  It supports simple types like char and int, but not larger
@@ -136,7 +138,8 @@ static inline __kernel_size_t __copy_from_user(void *to,
   * @x:   Value to copy to user space.
   * @ptr: Destination address, in user space.
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * This macro copies a single simple value from kernel space to user
   * space.  It supports simple types like char and int, but not larger
@@ -158,7 +161,8 @@ static inline __kernel_size_t __copy_from_user(void *to,
   * @x:   Variable to store result.
   * @ptr: Source address, in user space.
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * This macro copies a single simple variable from user space to kernel
   * space.  It supports simple types like char and int, but not larger
diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c

index d223a8b57c1eaad282289e75089654153ab598d6..c03533937a9f0aa273a75c76ecb66f2731b2d39c 100644 (file)
--- a/arch/avr32/mm/fault.c
+++ b/arch/avr32/mm/fault.c
@@ -14,11 +14,11 @@
  #include <linux/pagemap.h>
  #include <linux/kdebug.h>
  #include <linux/kprobes.h>
+#include <linux/uaccess.h>
  
  #include <asm/mmu_context.h>
  #include <asm/sysreg.h>
  #include <asm/tlb.h>
-#include <asm/uaccess.h>
  
  #ifdef CONFIG_KPROBES
  static inline int notify_page_fault(struct pt_regs *regs, int trap)
@@ -81,7 +81,7 @@ asmlinkage void do_page_fault(unsigned long ecr, struct pt_regs *regs)
          * If we're in an interrupt or have no user context, we must
          * not take the fault...
          */
-       if (in_atomic() || !mm || regs->sr & SYSREG_BIT(GM))
+       if (faulthandler_disabled() || !mm || regs->sr & SYSREG_BIT(GM))
                 goto no_context;
  
         local_irq_enable();
diff --git a/arch/blackfin/include/asm/io.h b/arch/blackfin/include/asm/io.h

index 4e8ad0523118d631ea24f6b9f8fd1c3ffb123194..6abebe82d4e93ed0329f271cd54e2af5c1bc38d2 100644 (file)
--- a/arch/blackfin/include/asm/io.h
+++ b/arch/blackfin/include/asm/io.h
@@ -10,6 +10,7 @@
  #include <linux/compiler.h>
  #include <linux/types.h>
  #include <asm/byteorder.h>
+#include <asm/def_LPBlackfin.h>
  
  #define __raw_readb bfin_read8
  #define __raw_readw bfin_read16
diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c

index 83f12f2ed9e31b8705ed4f8f5ec1c4cbd8822fb4..3066d40a6db14425c162d399d89e5c6db66786fe 100644 (file)
--- a/arch/cris/mm/fault.c
+++ b/arch/cris/mm/fault.c
@@ -8,7 +8,7 @@
  #include <linux/interrupt.h>
  #include <linux/module.h>
  #include <linux/wait.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
  #include <arch/system.h>
  
  extern int find_fixup_code(struct pt_regs *);
@@ -109,11 +109,11 @@ do_page_fault(unsigned long address, struct pt_regs *regs,
         info.si_code = SEGV_MAPERR;
  
         /*
-        * If we're in an interrupt or "atomic" operation or have no
+        * If we're in an interrupt, have pagefaults disabled or have no
          * user context, we must not take the fault.
          */
  
-       if (in_atomic() || !mm)
+       if (faulthandler_disabled() || !mm)
                 goto no_context;
  
         if (user_mode(regs))
diff --git a/arch/frv/mm/fault.c b/arch/frv/mm/fault.c

index ec4917ddf67872aa46b60c6b067b0a67ec5417a4..61d99767fe1691e70286bf8d52a05aee506bbccc 100644 (file)
--- a/arch/frv/mm/fault.c
+++ b/arch/frv/mm/fault.c
@@ -19,9 +19,9 @@
  #include <linux/kernel.h>
  #include <linux/ptrace.h>
  #include <linux/hardirq.h>
+#include <linux/uaccess.h>
  
  #include <asm/pgtable.h>
-#include <asm/uaccess.h>
  #include <asm/gdb-stub.h>
  
  /*****************************************************************************/
@@ -78,7 +78,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
          * If we're in an interrupt or have no user
          * context, we must not take the fault..
          */
-       if (in_atomic() || !mm)
+       if (faulthandler_disabled() || !mm)
                 goto no_context;
  
         if (user_mode(__frame))
diff --git a/arch/frv/mm/highmem.c b/arch/frv/mm/highmem.c

index bed9a9bd3c10c84e004c845839f0ad53c0565e45..785344bbdc07c360e81768c8472336bebd0baa3e 100644 (file)
--- a/arch/frv/mm/highmem.c
+++ b/arch/frv/mm/highmem.c
@@ -42,6 +42,7 @@ void *kmap_atomic(struct page *page)
         unsigned long paddr;
         int type;
  
+       preempt_disable();
         pagefault_disable();
         type = kmap_atomic_idx_push();
         paddr = page_to_phys(page);
@@ -85,5 +86,6 @@ void __kunmap_atomic(void *kvaddr)
         }
         kmap_atomic_idx_pop();
         pagefault_enable();
+       preempt_enable();
  }
  EXPORT_SYMBOL(__kunmap_atomic);
diff --git a/arch/hexagon/include/asm/uaccess.h b/arch/hexagon/include/asm/uaccess.h

index e4127e4d6a5bbde7f1d0ec42017df98353177c93..f000a382bc7f62f28dfe980ab184904c70681914 100644 (file)
--- a/arch/hexagon/include/asm/uaccess.h
+++ b/arch/hexagon/include/asm/uaccess.h
@@ -36,7 +36,8 @@
   * @addr: User space pointer to start of block to check
   * @size: Size of block to check
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * Checks if a pointer to a block of memory in user space is valid.
   *
diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h

index 6437ca21f61b49d0b05fe9d124ed0e5aa8391d92..3ad8f698836346793816a79ada16035f40b8197e 100644 (file)
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -53,7 +53,7 @@ void build_cpu_to_node_map(void);
  #define topology_physical_package_id(cpu)      (cpu_data(cpu)->socket_id)
  #define topology_core_id(cpu)                  (cpu_data(cpu)->core_id)
  #define topology_core_cpumask(cpu)             (&cpu_core_map[cpu])
-#define topology_thread_cpumask(cpu)           (&per_cpu(cpu_sibling_map, cpu))
+#define topology_sibling_cpumask(cpu)          (&per_cpu(cpu_sibling_map, cpu))
  #endif
  
  extern void arch_fix_phys_package_id(int num, u32 slot);
diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c

index 15051e9c2c6f98f3f2e8743739f10b63f795be3a..b054c5c6e7137cf85ba00f8c60fa33b8719b0c07 100644 (file)
--- a/arch/ia64/kernel/smpboot.c
+++ b/arch/ia64/kernel/smpboot.c
@@ -127,7 +127,7 @@ int smp_num_siblings = 1;
  volatile int ia64_cpu_to_sapicid[NR_CPUS];
  EXPORT_SYMBOL(ia64_cpu_to_sapicid);
  
-static volatile cpumask_t cpu_callin_map;
+static cpumask_t cpu_callin_map;
  
  struct smp_boot_data smp_boot_data __initdata;
  
@@ -477,6 +477,7 @@ do_boot_cpu (int sapicid, int cpu, struct task_struct *idle)
         for (timeout = 0; timeout < 100000; timeout++) {
                 if (cpumask_test_cpu(cpu, &cpu_callin_map))
                         break;  /* It has booted */
+               barrier(); /* Make sure we re-read cpu_callin_map */
                 udelay(100);
         }
         Dprintk("\n");
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c

index ba5ba7accd0d6bb4dbab34f7fc307c4306347f4a..70b40d1205a6b9b3ec7efcbc9e60ec64c2eff712 100644 (file)
--- a/arch/ia64/mm/fault.c
+++ b/arch/ia64/mm/fault.c
@@ -11,10 +11,10 @@
  #include <linux/kprobes.h>
  #include <linux/kdebug.h>
  #include <linux/prefetch.h>
+#include <linux/uaccess.h>
  
  #include <asm/pgtable.h>
  #include <asm/processor.h>
-#include <asm/uaccess.h>
  
  extern int die(char *, struct pt_regs *, long);
  
@@ -96,7 +96,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
         /*
          * If we're in an interrupt or have no user context, we must not take the fault..
          */
-       if (in_atomic() || !mm)
+       if (faulthandler_disabled() || !mm)
                 goto no_context;
  
  #ifdef CONFIG_VIRTUAL_MEM_MAP
diff --git a/arch/m32r/include/asm/uaccess.h b/arch/m32r/include/asm/uaccess.h

index 71adff209405e15b052e96d832ba5a26295bb98e..cac7014daef3aa6949d17a72824a757dfdbee5b4 100644 (file)
--- a/arch/m32r/include/asm/uaccess.h
+++ b/arch/m32r/include/asm/uaccess.h
@@ -91,7 +91,8 @@ static inline void set_fs(mm_segment_t s)
   * @addr: User space pointer to start of block to check
   * @size: Size of block to check
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * Checks if a pointer to a block of memory in user space is valid.
   *
@@ -155,7 +156,8 @@ extern int fixup_exception(struct pt_regs *regs);
   * @x:   Variable to store result.
   * @ptr: Source address, in user space.
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * This macro copies a single simple variable from user space to kernel
   * space.  It supports simple types like char and int, but not larger
@@ -175,7 +177,8 @@ extern int fixup_exception(struct pt_regs *regs);
   * @x:   Value to copy to user space.
   * @ptr: Destination address, in user space.
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * This macro copies a single simple value from kernel space to user
   * space.  It supports simple types like char and int, but not larger
@@ -194,7 +197,8 @@ extern int fixup_exception(struct pt_regs *regs);
   * @x:   Variable to store result.
   * @ptr: Source address, in user space.
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * This macro copies a single simple variable from user space to kernel
   * space.  It supports simple types like char and int, but not larger
@@ -274,7 +278,8 @@ do {                                                                        \
   * @x:   Value to copy to user space.
   * @ptr: Destination address, in user space.
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * This macro copies a single simple value from kernel space to user
   * space.  It supports simple types like char and int, but not larger
@@ -568,7 +573,8 @@ unsigned long __generic_copy_from_user(void *, const void __user *, unsigned lon
   * @from: Source address, in kernel space.
   * @n:    Number of bytes to copy.
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * Copy data from kernel space to user space.  Caller must check
   * the specified block with access_ok() before calling this function.
@@ -588,7 +594,8 @@ unsigned long __generic_copy_from_user(void *, const void __user *, unsigned lon
   * @from: Source address, in kernel space.
   * @n:    Number of bytes to copy.
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * Copy data from kernel space to user space.
   *
@@ -606,7 +613,8 @@ unsigned long __generic_copy_from_user(void *, const void __user *, unsigned lon
   * @from: Source address, in user space.
   * @n:    Number of bytes to copy.
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * Copy data from user space to kernel space.  Caller must check
   * the specified block with access_ok() before calling this function.
@@ -626,7 +634,8 @@ unsigned long __generic_copy_from_user(void *, const void __user *, unsigned lon
   * @from: Source address, in user space.
   * @n:    Number of bytes to copy.
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * Copy data from user space to kernel space.
   *
@@ -677,7 +686,8 @@ unsigned long clear_user(void __user *mem, unsigned long len);
   * strlen_user: - Get the size of a string in user space.
   * @str: The string to measure.
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * Get the size of a NUL-terminated string in user space.
   *
diff --git a/arch/m32r/mm/fault.c b/arch/m32r/mm/fault.c

index e3d4d4890104cc27e2eb9de2f22cb6f53f939c90..8f9875b7933d5582277a777693a2e44c1483d362 100644 (file)
--- a/arch/m32r/mm/fault.c
+++ b/arch/m32r/mm/fault.c
@@ -24,9 +24,9 @@
  #include <linux/vt_kern.h>             /* For unblank_screen() */
  #include <linux/highmem.h>
  #include <linux/module.h>
+#include <linux/uaccess.h>
  
  #include <asm/m32r.h>
-#include <asm/uaccess.h>
  #include <asm/hardirq.h>
  #include <asm/mmu_context.h>
  #include <asm/tlbflush.h>
@@ -111,10 +111,10 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code,
         mm = tsk->mm;
  
         /*
-        * If we're in an interrupt or have no user context or are running in an
-        * atomic region then we must not take the fault..
+        * If we're in an interrupt or have no user context or have pagefaults
+        * disabled then we must not take the fault.
          */
-       if (in_atomic() || !mm)
+       if (faulthandler_disabled() || !mm)
                 goto bad_area_nosemaphore;
  
         if (error_code & ACE_USERMODE)
diff --git a/arch/m68k/include/asm/irqflags.h b/arch/m68k/include/asm/irqflags.h

index a823cd73dc09e35bdfe683ab0637dd3c19ffdcb1..b5941818346f65dfbaad65823d2e61a59673f44a 100644 (file)
--- a/arch/m68k/include/asm/irqflags.h
+++ b/arch/m68k/include/asm/irqflags.h
@@ -2,9 +2,6 @@
  #define _M68K_IRQFLAGS_H
  
  #include <linux/types.h>
-#ifdef CONFIG_MMU
-#include <linux/preempt_mask.h>
-#endif
  #include <linux/preempt.h>
  #include <asm/thread_info.h>
  #include <asm/entry.h>
diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c

index b2f04aee46ecc2f7a5fb1db26d8e4279f6b6ea2e..6a94cdd0c8308cb70151d477872ccbeda2022ea8 100644 (file)
--- a/arch/m68k/mm/fault.c
+++ b/arch/m68k/mm/fault.c
@@ -10,10 +10,10 @@
  #include <linux/ptrace.h>
  #include <linux/interrupt.h>
  #include <linux/module.h>
+#include <linux/uaccess.h>
  
  #include <asm/setup.h>
  #include <asm/traps.h>
-#include <asm/uaccess.h>
  #include <asm/pgalloc.h>
  
  extern void die_if_kernel(char *, struct pt_regs *, long);
@@ -81,7 +81,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
          * If we're in an interrupt or have no user
          * context, we must not take the fault..
          */
-       if (in_atomic() || !mm)
+       if (faulthandler_disabled() || !mm)
                 goto no_context;
  
         if (user_mode(regs))
diff --git a/arch/metag/mm/fault.c b/arch/metag/mm/fault.c

index 2de5dc695a87fa96d41a83e127166a7126d10df0..f57edca63609bf15f1a71bf5a90c8837d6a9fd13 100644 (file)
--- a/arch/metag/mm/fault.c
+++ b/arch/metag/mm/fault.c
@@ -105,7 +105,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
  
         mm = tsk->mm;
  
-       if (in_atomic() || !mm)
+       if (faulthandler_disabled() || !mm)
                 goto no_context;
  
         if (user_mode(regs))
diff --git a/arch/metag/mm/highmem.c b/arch/metag/mm/highmem.c

index d71f621a2c0b92adb5679162d1cd59568440986b..807f1b1c4e6567738f676c935ad21483e4972a32 100644 (file)
--- a/arch/metag/mm/highmem.c
+++ b/arch/metag/mm/highmem.c
@@ -43,7 +43,7 @@ void *kmap_atomic(struct page *page)
         unsigned long vaddr;
         int type;
  
-       /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+       preempt_disable();
         pagefault_disable();
         if (!PageHighMem(page))
                 return page_address(page);
@@ -82,6 +82,7 @@ void __kunmap_atomic(void *kvaddr)
         }
  
         pagefault_enable();
+       preempt_enable();
  }
  EXPORT_SYMBOL(__kunmap_atomic);
  
@@ -95,6 +96,7 @@ void *kmap_atomic_pfn(unsigned long pfn)
         unsigned long vaddr;
         int type;
  
+       preempt_disable();
         pagefault_disable();
  
         type = kmap_atomic_idx_push();
diff --git a/arch/microblaze/include/asm/uaccess.h b/arch/microblaze/include/asm/uaccess.h

index 62942fd126728688cb8358685802eead5c8b5fcf..331b0d35f89ce301ad9ba876be7322f417d7c909 100644 (file)
--- a/arch/microblaze/include/asm/uaccess.h
+++ b/arch/microblaze/include/asm/uaccess.h
@@ -178,7 +178,8 @@ extern long __user_bad(void);
   * @x:   Variable to store result.
   * @ptr: Source address, in user space.
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * This macro copies a single simple variable from user space to kernel
   * space.  It supports simple types like char and int, but not larger
@@ -290,7 +291,8 @@ extern long __user_bad(void);
   * @x:   Value to copy to user space.
   * @ptr: Destination address, in user space.
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * This macro copies a single simple value from kernel space to user
   * space.  It supports simple types like char and int, but not larger
diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c

index d46a5ebb7570e07869ea03b9b995374aa3bff82e..177dfc0036436284d4e016b0987e4516faa7f445 100644 (file)
--- a/arch/microblaze/mm/fault.c
+++ b/arch/microblaze/mm/fault.c
@@ -107,14 +107,14 @@ void do_page_fault(struct pt_regs *regs, unsigned long address,
         if ((error_code & 0x13) == 0x13 || (error_code & 0x11) == 0x11)
                 is_write = 0;
  
-       if (unlikely(in_atomic() || !mm)) {
+       if (unlikely(faulthandler_disabled() || !mm)) {
                 if (kernel_mode(regs))
                         goto bad_area_nosemaphore;
  
-               /* in_atomic() in user mode is really bad,
+               /* faulthandler_disabled() in user mode is really bad,
                    as is current->mm == NULL. */
-               pr_emerg("Page fault in user mode with in_atomic(), mm = %p\n",
-                                                                       mm);
+               pr_emerg("Page fault in user mode with faulthandler_disabled(), mm = %p\n",
+                        mm);
                 pr_emerg("r15 = %lx  MSR = %lx\n",
                        regs->r15, regs->msr);
                 die("Weird page fault", regs, SIGSEGV);
diff --git a/arch/microblaze/mm/highmem.c b/arch/microblaze/mm/highmem.c

index 5a92576fad927127eb05fbe24833b57e18ba7155..2fcc5a52d84d1c2cf25d0cc45c356ce14549d74e 100644 (file)
--- a/arch/microblaze/mm/highmem.c
+++ b/arch/microblaze/mm/highmem.c
@@ -37,7 +37,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
         unsigned long vaddr;
         int idx, type;
  
-       /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+       preempt_disable();
         pagefault_disable();
         if (!PageHighMem(page))
                 return page_address(page);
@@ -63,6 +63,7 @@ void __kunmap_atomic(void *kvaddr)
  
         if (vaddr < __fix_to_virt(FIX_KMAP_END)) {
                 pagefault_enable();
+               preempt_enable();
                 return;
         }
  
@@ -84,5 +85,6 @@ void __kunmap_atomic(void *kvaddr)
  #endif
         kmap_atomic_idx_pop();
         pagefault_enable();
+       preempt_enable();
  }
  EXPORT_SYMBOL(__kunmap_atomic);
diff --git a/arch/mips/ath79/setup.c b/arch/mips/ath79/setup.c

index a73c93c3d44a1069149945cf2732aeed26c918bd..7fc8397d16f21d713ad3e073308f69c75dd88692 100644 (file)
--- a/arch/mips/ath79/setup.c
+++ b/arch/mips/ath79/setup.c
@@ -225,7 +225,7 @@ void __init plat_time_init(void)
         ddr_clk_rate = ath79_get_sys_clk_rate("ddr");
         ref_clk_rate = ath79_get_sys_clk_rate("ref");
  
-       pr_info("Clocks: CPU:%lu.%03luMHz, DDR:%lu.%03luMHz, AHB:%lu.%03luMHz, Ref:%lu.%03luMHz",
+       pr_info("Clocks: CPU:%lu.%03luMHz, DDR:%lu.%03luMHz, AHB:%lu.%03luMHz, Ref:%lu.%03luMHz\n",
                 cpu_clk_rate / 1000000, (cpu_clk_rate / 1000) % 1000,
                 ddr_clk_rate / 1000000, (ddr_clk_rate / 1000) % 1000,
                 ahb_clk_rate / 1000000, (ahb_clk_rate / 1000) % 1000,
diff --git a/arch/mips/cobalt/Makefile b/arch/mips/cobalt/Makefile

index 558e94977942033dc8247bcc510ebb705aa9698a..68f0c5871adcdf51f40380ffbba09b1e5e52202c 100644 (file)
--- a/arch/mips/cobalt/Makefile
+++ b/arch/mips/cobalt/Makefile
@@ -2,7 +2,6 @@
  # Makefile for the Cobalt micro systems family specific parts of the kernel
  #
  
-obj-y := buttons.o irq.o lcd.o led.o reset.o rtc.o serial.o setup.o time.o
+obj-y := buttons.o irq.o lcd.o led.o mtd.o reset.o rtc.o serial.o setup.o time.o
  
  obj-$(CONFIG_PCI)              += pci.o
-obj-$(CONFIG_MTD_PHYSMAP)      += mtd.o
diff --git a/arch/mips/include/asm/pgtable-bits.h b/arch/mips/include/asm/pgtable-bits.h

index 18ae5ddef118c071e1240486e90f08be3e4b0871..c28a8499aec7f4fa18c5bd4c71922812d2ebf143 100644 (file)
--- a/arch/mips/include/asm/pgtable-bits.h
+++ b/arch/mips/include/asm/pgtable-bits.h
@@ -113,7 +113,7 @@
  #define _PAGE_PRESENT_SHIFT    0
  #define _PAGE_PRESENT          (1 << _PAGE_PRESENT_SHIFT)
  /* R2 or later cores check for RI/XI support to determine _PAGE_READ */
-#ifdef CONFIG_CPU_MIPSR2
+#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR6)
  #define _PAGE_WRITE_SHIFT      (_PAGE_PRESENT_SHIFT + 1)
  #define _PAGE_WRITE            (1 << _PAGE_WRITE_SHIFT)
  #else
@@ -135,16 +135,16 @@
  #define _PAGE_SPLITTING                (1 << _PAGE_SPLITTING_SHIFT)
  
  /* Only R2 or newer cores have the XI bit */
-#ifdef CONFIG_CPU_MIPSR2
+#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR6)
  #define _PAGE_NO_EXEC_SHIFT    (_PAGE_SPLITTING_SHIFT + 1)
  #else
  #define _PAGE_GLOBAL_SHIFT     (_PAGE_SPLITTING_SHIFT + 1)
  #define _PAGE_GLOBAL           (1 << _PAGE_GLOBAL_SHIFT)
-#endif /* CONFIG_CPU_MIPSR2 */
+#endif /* CONFIG_CPU_MIPSR2 || CONFIG_CPU_MIPSR6 */
  
  #endif /* CONFIG_64BIT && CONFIG_MIPS_HUGE_TLB_SUPPORT */
  
-#ifdef CONFIG_CPU_MIPSR2
+#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR6)
  /* XI - page cannot be executed */
  #ifndef _PAGE_NO_EXEC_SHIFT
  #define _PAGE_NO_EXEC_SHIFT    (_PAGE_MODIFIED_SHIFT + 1)
@@ -160,10 +160,10 @@
  #define _PAGE_GLOBAL_SHIFT     (_PAGE_NO_READ_SHIFT + 1)
  #define _PAGE_GLOBAL           (1 << _PAGE_GLOBAL_SHIFT)
  
-#else  /* !CONFIG_CPU_MIPSR2 */
+#else  /* !CONFIG_CPU_MIPSR2 && !CONFIG_CPU_MIPSR6 */
  #define _PAGE_GLOBAL_SHIFT     (_PAGE_MODIFIED_SHIFT + 1)
  #define _PAGE_GLOBAL           (1 << _PAGE_GLOBAL_SHIFT)
-#endif /* CONFIG_CPU_MIPSR2 */
+#endif /* CONFIG_CPU_MIPSR2 || CONFIG_CPU_MIPSR6 */
  
  #define _PAGE_VALID_SHIFT      (_PAGE_GLOBAL_SHIFT + 1)
  #define _PAGE_VALID            (1 << _PAGE_VALID_SHIFT)
@@ -205,7 +205,7 @@
   */
  static inline uint64_t pte_to_entrylo(unsigned long pte_val)
  {
-#ifdef CONFIG_CPU_MIPSR2
+#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR6)
         if (cpu_has_rixi) {
                 int sa;
  #ifdef CONFIG_32BIT
diff --git a/arch/mips/include/asm/switch_to.h b/arch/mips/include/asm/switch_to.h

index e92d6c4b5ed192305b0b1f1605481f745cfadb10..7163cd7fdd69a622892e4be83acbe0450e8f2af0 100644 (file)
--- a/arch/mips/include/asm/switch_to.h
+++ b/arch/mips/include/asm/switch_to.h
@@ -104,7 +104,6 @@ do {                                                                        \
         if (test_and_clear_tsk_thread_flag(prev, TIF_USEDMSA))          \
                 __fpsave = FP_SAVE_VECTOR;                              \
         (last) = resume(prev, next, task_thread_info(next), __fpsave);  \
-       disable_msa();                                                  \
  } while (0)
  
  #define finish_arch_switch(prev)                                       \
@@ -122,6 +121,7 @@ do {                                                                        \
         if (cpu_has_userlocal)                                          \
                 write_c0_userlocal(current_thread_info()->tp_value);    \
         __restore_watch();                                              \
+       disable_msa();                                                  \
  } while (0)
  
  #endif /* _ASM_SWITCH_TO_H */
diff --git a/arch/mips/include/asm/topology.h b/arch/mips/include/asm/topology.h

index 3e307ec2afbae2f7308df902f450006bafd006f9..7afda4150a59d928537cddd33e8ed0105d90629b 100644 (file)
--- a/arch/mips/include/asm/topology.h
+++ b/arch/mips/include/asm/topology.h
@@ -15,7 +15,7 @@
  #define topology_physical_package_id(cpu)      (cpu_data[cpu].package)
  #define topology_core_id(cpu)                  (cpu_data[cpu].core)
  #define topology_core_cpumask(cpu)             (&cpu_core_map[cpu])
-#define topology_thread_cpumask(cpu)           (&cpu_sibling_map[cpu])
+#define topology_sibling_cpumask(cpu)          (&cpu_sibling_map[cpu])
  #endif
  
  #endif /* __ASM_TOPOLOGY_H */
diff --git a/arch/mips/include/asm/uaccess.h b/arch/mips/include/asm/uaccess.h

index bf8b32450ef6b4b5c91993f52fa205e79d2e9226..9722357d285471bc1d27e944ba334027b5b35469 100644 (file)
--- a/arch/mips/include/asm/uaccess.h
+++ b/arch/mips/include/asm/uaccess.h
@@ -103,7 +103,8 @@ extern u64 __ua_limit;
   * @addr: User space pointer to start of block to check
   * @size: Size of block to check
   *
- * Context: User context only. This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * Checks if a pointer to a block of memory in user space is valid.
   *
@@ -138,7 +139,8 @@ extern u64 __ua_limit;
   * @x:  Value to copy to user space.
   * @ptr: Destination address, in user space.
   *
- * Context: User context only. This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * This macro copies a single simple value from kernel space to user
   * space.  It supports simple types like char and int, but not larger
@@ -157,7 +159,8 @@ extern u64 __ua_limit;
   * @x:  Variable to store result.
   * @ptr: Source address, in user space.
   *
- * Context: User context only. This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * This macro copies a single simple variable from user space to kernel
   * space.  It supports simple types like char and int, but not larger
@@ -177,7 +180,8 @@ extern u64 __ua_limit;
   * @x:  Value to copy to user space.
   * @ptr: Destination address, in user space.
   *
- * Context: User context only. This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * This macro copies a single simple value from kernel space to user
   * space.  It supports simple types like char and int, but not larger
@@ -199,7 +203,8 @@ extern u64 __ua_limit;
   * @x:  Variable to store result.
   * @ptr: Source address, in user space.
   *
- * Context: User context only. This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * This macro copies a single simple variable from user space to kernel
   * space.  It supports simple types like char and int, but not larger
@@ -498,7 +503,8 @@ extern void __put_user_unknown(void);
   * @x:  Value to copy to user space.
   * @ptr: Destination address, in user space.
   *
- * Context: User context only. This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * This macro copies a single simple value from kernel space to user
   * space.  It supports simple types like char and int, but not larger
@@ -517,7 +523,8 @@ extern void __put_user_unknown(void);
   * @x:  Variable to store result.
   * @ptr: Source address, in user space.
   *
- * Context: User context only. This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * This macro copies a single simple variable from user space to kernel
   * space.  It supports simple types like char and int, but not larger
@@ -537,7 +544,8 @@ extern void __put_user_unknown(void);
   * @x:  Value to copy to user space.
   * @ptr: Destination address, in user space.
   *
- * Context: User context only. This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * This macro copies a single simple value from kernel space to user
   * space.  It supports simple types like char and int, but not larger
@@ -559,7 +567,8 @@ extern void __put_user_unknown(void);
   * @x:  Variable to store result.
   * @ptr: Source address, in user space.
   *
- * Context: User context only. This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * This macro copies a single simple variable from user space to kernel
   * space.  It supports simple types like char and int, but not larger
@@ -815,7 +824,8 @@ extern size_t __copy_user(void *__to, const void *__from, size_t __n);
   * @from: Source address, in kernel space.
   * @n:   Number of bytes to copy.
   *
- * Context: User context only. This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * Copy data from kernel space to user space.  Caller must check
   * the specified block with access_ok() before calling this function.
@@ -888,7 +898,8 @@ extern size_t __copy_user_inatomic(void *__to, const void *__from, size_t __n);
   * @from: Source address, in kernel space.
   * @n:   Number of bytes to copy.
   *
- * Context: User context only. This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * Copy data from kernel space to user space.
   *
@@ -1075,7 +1086,8 @@ extern size_t __copy_in_user_eva(void *__to, const void *__from, size_t __n);
   * @from: Source address, in user space.
   * @n:   Number of bytes to copy.
   *
- * Context: User context only. This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * Copy data from user space to kernel space.  Caller must check
   * the specified block with access_ok() before calling this function.
@@ -1107,7 +1119,8 @@ extern size_t __copy_in_user_eva(void *__to, const void *__from, size_t __n);
   * @from: Source address, in user space.
   * @n:   Number of bytes to copy.
   *
- * Context: User context only. This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * Copy data from user space to kernel space.
   *
@@ -1329,7 +1342,8 @@ strncpy_from_user(char *__to, const char __user *__from, long __len)
   * strlen_user: - Get the size of a string in user space.
   * @str: The string to measure.
   *
- * Context: User context only. This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * Get the size of a NUL-terminated string in user space.
   *
@@ -1398,7 +1412,8 @@ static inline long __strnlen_user(const char __user *s, long n)
   * strnlen_user: - Get the size of a string in user space.
   * @str: The string to measure.
   *
- * Context: User context only. This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * Get the size of a NUL-terminated string in user space.
   *
diff --git a/arch/mips/kernel/cpu-probe.c b/arch/mips/kernel/cpu-probe.c

index e36515dcd3b29efcdb0014c7dfd4541805eb4e4c..209e5b76c1bce56f02ceeb1fdeffeccc6fe46bd8 100644 (file)
--- a/arch/mips/kernel/cpu-probe.c
+++ b/arch/mips/kernel/cpu-probe.c
@@ -74,13 +74,12 @@ static inline void cpu_set_fpu_fcsr_mask(struct cpuinfo_mips *c)
  {
         unsigned long sr, mask, fcsr, fcsr0, fcsr1;
  
+       fcsr = c->fpu_csr31;
         mask = FPU_CSR_ALL_X | FPU_CSR_ALL_E | FPU_CSR_ALL_S | FPU_CSR_RM;
  
         sr = read_c0_status();
         __enable_fpu(FPU_AS_IS);
  
-       fcsr = read_32bit_cp1_register(CP1_STATUS);
-
         fcsr0 = fcsr & mask;
         write_32bit_cp1_register(CP1_STATUS, fcsr0);
         fcsr0 = read_32bit_cp1_register(CP1_STATUS);
diff --git a/arch/mips/kernel/irq.c b/arch/mips/kernel/irq.c

index 51f57d8416625a26cc73d38a615c750fd4dd2f51..3c8a18a00a65fee62e7cc11068d3866b18b04fd1 100644 (file)
--- a/arch/mips/kernel/irq.c
+++ b/arch/mips/kernel/irq.c
@@ -109,7 +109,7 @@ void __init init_IRQ(void)
  #endif
  }
  
-#ifdef DEBUG_STACKOVERFLOW
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
  static inline void check_stack_overflow(void)
  {
         unsigned long sp;
diff --git a/arch/mips/kernel/signal-common.h b/arch/mips/kernel/signal-common.h

index 06805e09bcd35751857a6f4a1d367d6796163c1b..0b85f827cd1836165fe4d146ec1d543953edeb7b 100644 (file)
--- a/arch/mips/kernel/signal-common.h
+++ b/arch/mips/kernel/signal-common.h
@@ -28,12 +28,7 @@ extern void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs,
  extern int fpcsr_pending(unsigned int __user *fpcsr);
  
  /* Make sure we will not lose FPU ownership */
-#ifdef CONFIG_PREEMPT
-#define lock_fpu_owner()       preempt_disable()
-#define unlock_fpu_owner()     preempt_enable()
-#else
-#define lock_fpu_owner()       pagefault_disable()
-#define unlock_fpu_owner()     pagefault_enable()
-#endif
+#define lock_fpu_owner()       ({ preempt_disable(); pagefault_disable(); })
+#define unlock_fpu_owner()     ({ pagefault_enable(); preempt_enable(); })
  
  #endif /* __SIGNAL_COMMON_H */
diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c

index 4b50c5787e25bdb4bdb28eb7736296e1d5d3812c..d5fa3eaf39a106546f52d82ec3e5391302ef8dec 100644 (file)
--- a/arch/mips/kvm/emulate.c
+++ b/arch/mips/kvm/emulate.c
@@ -2409,7 +2409,7 @@ enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu,
                 if (vcpu->mmio_needed == 2)
                         *gpr = *(int16_t *) run->mmio.data;
                 else
-                       *gpr = *(int16_t *) run->mmio.data;
+                       *gpr = *(uint16_t *)run->mmio.data;
  
                 break;
         case 1:
diff --git a/arch/mips/loongson/common/Makefile b/arch/mips/loongson/common/Makefile

index e70c33fdb88153ac6bfdf12a4f632d3b3a26ccb9..f2e8153e44f536213e196002f005bb86da9ef72f 100644 (file)
--- a/arch/mips/loongson/common/Makefile
+++ b/arch/mips/loongson/common/Makefile
@@ -3,15 +3,13 @@
  #
  
  obj-y += setup.o init.o cmdline.o env.o time.o reset.o irq.o \
-    bonito-irq.o mem.o machtype.o platform.o
+    bonito-irq.o mem.o machtype.o platform.o serial.o
  obj-$(CONFIG_PCI) += pci.o
  
  #
  # Serial port support
  #
  obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
-loongson-serial-$(CONFIG_SERIAL_8250) := serial.o
-obj-y += $(loongson-serial-m) $(loongson-serial-y)
  obj-$(CONFIG_LOONGSON_UART_BASE) += uart_base.o
  obj-$(CONFIG_LOONGSON_MC146818) += rtc.o
  
diff --git a/arch/mips/loongson/loongson-3/smp.c b/arch/mips/loongson/loongson-3/smp.c

index e3c68b5da18da4012de0aaed6363d5a5484d5e41..509877c6e9d908d7bac6110982c7208ab69204af 100644 (file)
--- a/arch/mips/loongson/loongson-3/smp.c
+++ b/arch/mips/loongson/loongson-3/smp.c
@@ -272,7 +272,7 @@ void loongson3_ipi_interrupt(struct pt_regs *regs)
         if (action & SMP_ASK_C0COUNT) {
                 BUG_ON(cpu != 0);
                 c0count = read_c0_count();
-               for (i = 1; i < loongson_sysconf.nr_cpus; i++)
+               for (i = 1; i < num_possible_cpus(); i++)
                         per_cpu(core0_c0count, i) = c0count;
         }
  }
diff --git a/arch/mips/mm/c-r4k.c b/arch/mips/mm/c-r4k.c

index 0dbb65a51ce5b1c2913cfec00571710e3a0ecb10..2e03ab1735911d202ce82c97b4911b5c1002ed70 100644 (file)
--- a/arch/mips/mm/c-r4k.c
+++ b/arch/mips/mm/c-r4k.c
@@ -1372,7 +1372,7 @@ static int probe_scache(void)
         scache_size = addr;
         c->scache.linesz = 16 << ((config & R4K_CONF_SB) >> 22);
         c->scache.ways = 1;
-       c->dcache.waybit = 0;           /* does not matter */
+       c->scache.waybit = 0;           /* does not matter */
  
         return 1;
  }
diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c

index 7ff8637e530d7974d002594797d42044505a0467..36c0f26fac6b0780318958a59fc2665a444a10ea 100644 (file)
--- a/arch/mips/mm/fault.c
+++ b/arch/mips/mm/fault.c
@@ -21,10 +21,10 @@
  #include <linux/module.h>
  #include <linux/kprobes.h>
  #include <linux/perf_event.h>
+#include <linux/uaccess.h>
  
  #include <asm/branch.h>
  #include <asm/mmu_context.h>
-#include <asm/uaccess.h>
  #include <asm/ptrace.h>
  #include <asm/highmem.h>               /* For VMALLOC_END */
  #include <linux/kdebug.h>
@@ -94,7 +94,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write,
          * If we're in an interrupt or have no user
          * context, we must not take the fault..
          */
-       if (in_atomic() || !mm)
+       if (faulthandler_disabled() || !mm)
                 goto bad_area_nosemaphore;
  
         if (user_mode(regs))
diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c

index da815d295239baaaf6e1d1069e2255baa8d75358..11661cbc11a8193f7a9817fb0101fd9483db9f23 100644 (file)
--- a/arch/mips/mm/highmem.c
+++ b/arch/mips/mm/highmem.c
@@ -47,7 +47,7 @@ void *kmap_atomic(struct page *page)
         unsigned long vaddr;
         int idx, type;
  
-       /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+       preempt_disable();
         pagefault_disable();
         if (!PageHighMem(page))
                 return page_address(page);
@@ -72,6 +72,7 @@ void __kunmap_atomic(void *kvaddr)
  
         if (vaddr < FIXADDR_START) { // FIXME
                 pagefault_enable();
+               preempt_enable();
                 return;
         }
  
@@ -92,6 +93,7 @@ void __kunmap_atomic(void *kvaddr)
  #endif
         kmap_atomic_idx_pop();
         pagefault_enable();
+       preempt_enable();
  }
  EXPORT_SYMBOL(__kunmap_atomic);
  
@@ -104,6 +106,7 @@ void *kmap_atomic_pfn(unsigned long pfn)
         unsigned long vaddr;
         int idx, type;
  
+       preempt_disable();
         pagefault_disable();
  
         type = kmap_atomic_idx_push();
diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c

index faa5c9822eccf48bc433f886a9939d93a1112bf1..198a3147dd7d08b78746790c628d2012a65c61ec 100644 (file)
--- a/arch/mips/mm/init.c
+++ b/arch/mips/mm/init.c
@@ -90,6 +90,7 @@ static void *__kmap_pgprot(struct page *page, unsigned long addr, pgprot_t prot)
  
         BUG_ON(Page_dcache_dirty(page));
  
+       preempt_disable();
         pagefault_disable();
         idx = (addr >> PAGE_SHIFT) & (FIX_N_COLOURS - 1);
         idx += in_interrupt() ? FIX_N_COLOURS : 0;
@@ -152,6 +153,7 @@ void kunmap_coherent(void)
         write_c0_entryhi(old_ctx);
         local_irq_restore(flags);
         pagefault_enable();
+       preempt_enable();
  }
  
  void copy_user_highpage(struct page *to, struct page *from,
diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c

index 5d6139390bf830adf503d67d004a5322d8eb7ad4..e23fdf2a9c80d2f0dbbb498343efb859c08f3b4e 100644 (file)
--- a/arch/mips/net/bpf_jit.c
+++ b/arch/mips/net/bpf_jit.c
@@ -681,11 +681,7 @@ static unsigned int get_stack_depth(struct jit_ctx *ctx)
                 sp_off += config_enabled(CONFIG_64BIT) ?
                         (ARGS_USED_BY_JIT + 1) * RSIZE : RSIZE;
  
-       /*
-        * Subtract the bytes for the last registers since we only care about
-        * the location on the stack pointer.
-        */
-       return sp_off - RSIZE;
+       return sp_off;
  }
  
  static void build_prologue(struct jit_ctx *ctx)
diff --git a/arch/mips/ralink/ill_acc.c b/arch/mips/ralink/ill_acc.c

index e20b02e3ae28be201789dd260ab79a382be944f8..e10d10b9e82a98bf5e53382d88cbc98b769ef53c 100644 (file)
--- a/arch/mips/ralink/ill_acc.c
+++ b/arch/mips/ralink/ill_acc.c
@@ -41,7 +41,7 @@ static irqreturn_t ill_acc_irq_handler(int irq, void *_priv)
                 addr, (type >> ILL_ACC_OFF_S) & ILL_ACC_OFF_M,
                 type & ILL_ACC_LEN_M);
  
-       rt_memc_w32(REG_ILL_ACC_TYPE, REG_ILL_ACC_TYPE);
+       rt_memc_w32(ILL_INT_STATUS, REG_ILL_ACC_TYPE);
  
         return IRQ_HANDLED;
  }
diff --git a/arch/mn10300/include/asm/highmem.h b/arch/mn10300/include/asm/highmem.h

index 2fbbe4d920aa2efb353ed5fd52babaf309a386db..1ddea5afba09344ba8e807e6ce8f5edc5c26241f 100644 (file)
--- a/arch/mn10300/include/asm/highmem.h
+++ b/arch/mn10300/include/asm/highmem.h
@@ -75,6 +75,7 @@ static inline void *kmap_atomic(struct page *page)
         unsigned long vaddr;
         int idx, type;
  
+       preempt_disable();
         pagefault_disable();
         if (page < highmem_start_page)
                 return page_address(page);
@@ -98,6 +99,7 @@ static inline void __kunmap_atomic(unsigned long vaddr)
  
         if (vaddr < FIXADDR_START) { /* FIXME */
                 pagefault_enable();
+               preempt_enable();
                 return;
         }
  
@@ -122,6 +124,7 @@ static inline void __kunmap_atomic(unsigned long vaddr)
  
         kmap_atomic_idx_pop();
         pagefault_enable();
+       preempt_enable();
  }
  #endif /* __KERNEL__ */
  
diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c

index 0c2cc5d39c8e37ce1cfe5be191902bc435c41090..4a1d181ed32f7690a82cfcba3eb264f9311b0a0f 100644 (file)
--- a/arch/mn10300/mm/fault.c
+++ b/arch/mn10300/mm/fault.c
@@ -23,8 +23,8 @@
  #include <linux/interrupt.h>
  #include <linux/init.h>
  #include <linux/vt_kern.h>             /* For unblank_screen() */
+#include <linux/uaccess.h>
  
-#include <asm/uaccess.h>
  #include <asm/pgalloc.h>
  #include <asm/hardirq.h>
  #include <asm/cpu-regs.h>
@@ -168,7 +168,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long fault_code,
          * If we're in an interrupt or have no user
          * context, we must not take the fault..
          */
-       if (in_atomic() || !mm)
+       if (faulthandler_disabled() || !mm)
                 goto no_context;
  
         if ((fault_code & MMUFCR_xFC_ACCESS) == MMUFCR_xFC_ACCESS_USR)
diff --git a/arch/nios2/mm/fault.c b/arch/nios2/mm/fault.c

index 0c9b6afe69e9094815cc1e73084422368b3a2e52..b51878b0c6b87362074c68832c1b4355f7c127cc 100644 (file)
--- a/arch/nios2/mm/fault.c
+++ b/arch/nios2/mm/fault.c
@@ -77,7 +77,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long cause,
          * If we're in an interrupt or have no user
          * context, we must not take the fault..
          */
-       if (in_atomic() || !mm)
+       if (faulthandler_disabled() || !mm)
                 goto bad_area_nosemaphore;
  
         if (user_mode(regs))
diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h

index de65f66ea64e7538f4f7c431ca3800e86c152e5d..ec2df4bab3022dfc35a83539afcd24207e199f9f 100644 (file)
--- a/arch/parisc/include/asm/cacheflush.h
+++ b/arch/parisc/include/asm/cacheflush.h
@@ -142,6 +142,7 @@ static inline void kunmap(struct page *page)
  
  static inline void *kmap_atomic(struct page *page)
  {
+       preempt_disable();
         pagefault_disable();
         return page_address(page);
  }
@@ -150,6 +151,7 @@ static inline void __kunmap_atomic(void *addr)
  {
         flush_kernel_dcache_page_addr(addr);
         pagefault_enable();
+       preempt_enable();
  }
  
  #define kmap_atomic_prot(page, prot)   kmap_atomic(page)
diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c

index 47ee620d15d27850ab8ebac1f739dfd3215dae9b..6548fd1d2e62defc2dfc6e8dfc125a42aa8e986e 100644 (file)
--- a/arch/parisc/kernel/traps.c
+++ b/arch/parisc/kernel/traps.c
@@ -26,9 +26,9 @@
  #include <linux/console.h>
  #include <linux/bug.h>
  #include <linux/ratelimit.h>
+#include <linux/uaccess.h>
  
  #include <asm/assembly.h>
-#include <asm/uaccess.h>
  #include <asm/io.h>
  #include <asm/irq.h>
  #include <asm/traps.h>
@@ -800,7 +800,7 @@ void notrace handle_interruption(int code, struct pt_regs *regs)
              * unless pagefault_disable() was called before.
              */
  
-           if (fault_space == 0 && !in_atomic())
+           if (fault_space == 0 && !faulthandler_disabled())
             {
                 pdc_chassis_send_status(PDC_CHASSIS_DIRECT_PANIC);
                 parisc_terminate("Kernel Fault", regs, code, fault_address);
diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c

index e5120e653240c4fa52d4895c7d1d206d3d12e68c..15503adddf4f59695d34f3b5adb428250594bf66 100644 (file)
--- a/arch/parisc/mm/fault.c
+++ b/arch/parisc/mm/fault.c
@@ -15,8 +15,8 @@
  #include <linux/sched.h>
  #include <linux/interrupt.h>
  #include <linux/module.h>
+#include <linux/uaccess.h>
  
-#include <asm/uaccess.h>
  #include <asm/traps.h>
  
  /* Various important other fields */
@@ -207,7 +207,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long code,
         int fault;
         unsigned int flags;
  
-       if (in_atomic())
+       if (pagefault_disabled())
                 goto no_context;
  
         tsk = current;
diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h

index 39505d660a704080e6a1e4fe66507715f23a1477..51ccc7232042e9f415c26580cca3a83d02175625 100644 (file)
--- a/arch/powerpc/include/asm/barrier.h
+++ b/arch/powerpc/include/asm/barrier.h
@@ -89,5 +89,6 @@ do {                                                                  \
  
  #define smp_mb__before_atomic()     smp_mb()
  #define smp_mb__after_atomic()      smp_mb()
+#define smp_mb__before_spinlock()   smp_mb()
  
  #endif /* _ASM_POWERPC_BARRIER_H */
diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h

index 5f1048eaa5b6041d1194457bec16509dcabe7907..8b3b46b7b0f2795b6195eb95ee649d3dece6dc9a 100644 (file)
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -87,7 +87,7 @@ static inline int prrn_is_enabled(void)
  #include <asm/smp.h>
  
  #define topology_physical_package_id(cpu)      (cpu_to_chip_id(cpu))
-#define topology_thread_cpumask(cpu)   (per_cpu(cpu_sibling_map, cpu))
+#define topology_sibling_cpumask(cpu)  (per_cpu(cpu_sibling_map, cpu))
  #define topology_core_cpumask(cpu)     (per_cpu(cpu_core_map, cpu))
  #define topology_core_id(cpu)          (cpu_to_core_id(cpu))
  #endif
diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c

index 3cf529ceec5bd6a85ad28b5dcf8e73dcd5d04c3c..ac93a3bd27300f9d058f45a2df4838379ecde52e 100644 (file)
--- a/arch/powerpc/lib/vmx-helper.c
+++ b/arch/powerpc/lib/vmx-helper.c
@@ -27,11 +27,11 @@ int enter_vmx_usercopy(void)
         if (in_interrupt())
                 return 0;
  
-       /* This acts as preempt_disable() as well and will make
-        * enable_kernel_altivec(). We need to disable page faults
-        * as they can call schedule and thus make us lose the VMX
-        * context. So on page faults, we just fail which will cause
-        * a fallback to the normal non-vmx copy.
+       preempt_disable();
+       /*
+        * We need to disable page faults as they can call schedule and
+        * thus make us lose the VMX context. So on page faults, we just
+        * fail which will cause a fallback to the normal non-vmx copy.
          */
         pagefault_disable();
  
@@ -47,6 +47,7 @@ int enter_vmx_usercopy(void)
  int exit_vmx_usercopy(void)
  {
         pagefault_enable();
+       preempt_enable();
         return 0;
  }
  
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c

index b396868d2aa7c48438f1577df35759991b165f6a..6d535973b200dde0f64ae19a3d47f1eaca6a4c99 100644 (file)
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -33,13 +33,13 @@
  #include <linux/ratelimit.h>
  #include <linux/context_tracking.h>
  #include <linux/hugetlb.h>
+#include <linux/uaccess.h>
  
  #include <asm/firmware.h>
  #include <asm/page.h>
  #include <asm/pgtable.h>
  #include <asm/mmu.h>
  #include <asm/mmu_context.h>
-#include <asm/uaccess.h>
  #include <asm/tlbflush.h>
  #include <asm/siginfo.h>
  #include <asm/debug.h>
@@ -272,15 +272,16 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
         if (!arch_irq_disabled_regs(regs))
                 local_irq_enable();
  
-       if (in_atomic() || mm == NULL) {
+       if (faulthandler_disabled() || mm == NULL) {
                 if (!user_mode(regs)) {
                         rc = SIGSEGV;
                         goto bail;
                 }
-               /* in_atomic() in user mode is really bad,
+               /* faulthandler_disabled() in user mode is really bad,
                    as is current->mm == NULL. */
                 printk(KERN_EMERG "Page fault in user mode with "
-                      "in_atomic() = %d mm = %p\n", in_atomic(), mm);
+                      "faulthandler_disabled() = %d mm = %p\n",
+                      faulthandler_disabled(), mm);
                 printk(KERN_EMERG "NIP = %lx  MSR = %lx\n",
                        regs->nip, regs->msr);
                 die("Weird page fault", regs, SIGSEGV);
diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c

index e7450bdbe83a9380264fc149c4831b587226cd36..e292c8a609523bd30e860f5e1a469872389e7421 100644 (file)
--- a/arch/powerpc/mm/highmem.c
+++ b/arch/powerpc/mm/highmem.c
@@ -34,7 +34,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
         unsigned long vaddr;
         int idx, type;
  
-       /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+       preempt_disable();
         pagefault_disable();
         if (!PageHighMem(page))
                 return page_address(page);
@@ -59,6 +59,7 @@ void __kunmap_atomic(void *kvaddr)
  
         if (vaddr < __fix_to_virt(FIX_KMAP_END)) {
                 pagefault_enable();
+               preempt_enable();
                 return;
         }
  
@@ -82,5 +83,6 @@ void __kunmap_atomic(void *kvaddr)
  
         kmap_atomic_idx_pop();
         pagefault_enable();
+       preempt_enable();
  }
  EXPORT_SYMBOL(__kunmap_atomic);
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c

index cbd3d069897f61d11c8c4497700f1d746178a0f7..723a099f6be31ac425873a6363be7c19f916dea6 100644 (file)
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -217,7 +217,7 @@ static DEFINE_RAW_SPINLOCK(tlbivax_lock);
  static int mm_is_core_local(struct mm_struct *mm)
  {
         return cpumask_subset(mm_cpumask(mm),
-                             topology_thread_cpumask(smp_processor_id()));
+                             topology_sibling_cpumask(smp_processor_id()));
  }
  
  struct tlb_flush_param {
diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h

index b1453a2ae1ca583b2d4a0dc99bc325fd30ddf10d..4990f6c66288582b21ad01295b032cb13da9ee31 100644 (file)
--- a/arch/s390/include/asm/topology.h
+++ b/arch/s390/include/asm/topology.h
@@ -22,7 +22,8 @@ DECLARE_PER_CPU(struct cpu_topology_s390, cpu_topology);
  
  #define topology_physical_package_id(cpu) (per_cpu(cpu_topology, cpu).socket_id)
  #define topology_thread_id(cpu)                  (per_cpu(cpu_topology, cpu).thread_id)
-#define topology_thread_cpumask(cpu)     (&per_cpu(cpu_topology, cpu).thread_mask)
+#define topology_sibling_cpumask(cpu) \
+               (&per_cpu(cpu_topology, cpu).thread_mask)
  #define topology_core_id(cpu)            (per_cpu(cpu_topology, cpu).core_id)
  #define topology_core_cpumask(cpu)       (&per_cpu(cpu_topology, cpu).core_mask)
  #define topology_book_id(cpu)            (per_cpu(cpu_topology, cpu).book_id)
diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h

index d64a7a62164f854e3fd627694189c37e2860c92b..9dd4cc47ddc79298886fe40a5e229628b2ceff3d 100644 (file)
--- a/arch/s390/include/asm/uaccess.h
+++ b/arch/s390/include/asm/uaccess.h
@@ -98,7 +98,8 @@ static inline unsigned long extable_fixup(const struct exception_table_entry *x)
   * @from: Source address, in user space.
   * @n:   Number of bytes to copy.
   *
- * Context: User context only. This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * Copy data from user space to kernel space.  Caller must check
   * the specified block with access_ok() before calling this function.
@@ -118,7 +119,8 @@ unsigned long __must_check __copy_from_user(void *to, const void __user *from,
   * @from: Source address, in kernel space.
   * @n:   Number of bytes to copy.
   *
- * Context: User context only. This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * Copy data from kernel space to user space.  Caller must check
   * the specified block with access_ok() before calling this function.
@@ -264,7 +266,8 @@ int __get_user_bad(void) __attribute__((noreturn));
   * @from: Source address, in kernel space.
   * @n:    Number of bytes to copy.
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * Copy data from kernel space to user space.
   *
@@ -290,7 +293,8 @@ __compiletime_warning("copy_from_user() buffer size is not provably correct")
   * @from: Source address, in user space.
   * @n:    Number of bytes to copy.
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * Copy data from user space to kernel space.
   *
@@ -348,7 +352,8 @@ static inline unsigned long strnlen_user(const char __user *src, unsigned long n
   * strlen_user: - Get the size of a string in user space.
   * @str: The string to measure.
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * Get the size of a NUL-terminated string in user space.
   *
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c

index 76515bcea2f18f5f78e59b86c0bd331fed0380f2..4c8f5d7f9c23d74c960cd1172f266391ba8ec1a3 100644 (file)
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -399,7 +399,7 @@ static inline int do_exception(struct pt_regs *regs, int access)
          * user context.
          */
         fault = VM_FAULT_BADCONTEXT;
-       if (unlikely(!user_space_fault(regs) || in_atomic() || !mm))
+       if (unlikely(!user_space_fault(regs) || faulthandler_disabled() || !mm))
                 goto out;
  
         address = trans_exc_code & __FAIL_ADDR_MASK;
diff --git a/arch/s390/net/bpf_jit.h b/arch/s390/net/bpf_jit.h

index ba8593a515baaa274d968aa64e6f54125238c032..de156ba3bd71c0d4db274a619c7c9fd6038c119c 100644 (file)
--- a/arch/s390/net/bpf_jit.h
+++ b/arch/s390/net/bpf_jit.h
@@ -48,7 +48,9 @@ extern u8 sk_load_word[], sk_load_half[], sk_load_byte[];
   * We get 160 bytes stack space from calling function, but only use
   * 11 * 8 byte (old backchain + r15 - r6) for storing registers.
   */
-#define STK_OFF (MAX_BPF_STACK + 8 + 4 + 4 + (160 - 11 * 8))
+#define STK_SPACE      (MAX_BPF_STACK + 8 + 4 + 4 + 160)
+#define STK_160_UNUSED (160 - 11 * 8)
+#define STK_OFF                (STK_SPACE - STK_160_UNUSED)
  #define STK_OFF_TMP    160     /* Offset of tmp buffer on stack */
  #define STK_OFF_HLEN   168     /* Offset of SKB header length on stack */
  
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c

index 20c146d1251ae2cd6c07279bf371adae6b2e3a1e..55423d8be580113d045d30edbf86d26fb74340ff 100644 (file)
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -384,13 +384,16 @@ static void bpf_jit_prologue(struct bpf_jit *jit)
         }
         /* Setup stack and backchain */
         if (jit->seen & SEEN_STACK) {
-               /* lgr %bfp,%r15 (BPF frame pointer) */
-               EMIT4(0xb9040000, BPF_REG_FP, REG_15);
+               if (jit->seen & SEEN_FUNC)
+                       /* lgr %w1,%r15 (backchain) */
+                       EMIT4(0xb9040000, REG_W1, REG_15);
+               /* la %bfp,STK_160_UNUSED(%r15) (BPF frame pointer) */
+               EMIT4_DISP(0x41000000, BPF_REG_FP, REG_15, STK_160_UNUSED);
                 /* aghi %r15,-STK_OFF */
                 EMIT4_IMM(0xa70b0000, REG_15, -STK_OFF);
                 if (jit->seen & SEEN_FUNC)
-                       /* stg %bfp,152(%r15) (backchain) */
-                       EMIT6_DISP_LH(0xe3000000, 0x0024, BPF_REG_FP, REG_0,
+                       /* stg %w1,152(%r15) (backchain) */
+                       EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0,
                                       REG_15, 152);
         }
         /*
diff --git a/arch/score/include/asm/uaccess.h b/arch/score/include/asm/uaccess.h

index ab66ddde777b5a3413cb2e334fa4ca6be6614c97..20a3591225ccea9aa3d93acc11a97a34bf254501 100644 (file)
--- a/arch/score/include/asm/uaccess.h
+++ b/arch/score/include/asm/uaccess.h
@@ -36,7 +36,8 @@
   * @addr: User space pointer to start of block to check
   * @size: Size of block to check
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * Checks if a pointer to a block of memory in user space is valid.
   *
@@ -61,7 +62,8 @@
   * @x:   Value to copy to user space.
   * @ptr: Destination address, in user space.
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * This macro copies a single simple value from kernel space to user
   * space.  It supports simple types like char and int, but not larger
@@ -79,7 +81,8 @@
   * @x:   Variable to store result.
   * @ptr: Source address, in user space.
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * This macro copies a single simple variable from user space to kernel
   * space.  It supports simple types like char and int, but not larger
@@ -98,7 +101,8 @@
   * @x:   Value to copy to user space.
   * @ptr: Destination address, in user space.
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * This macro copies a single simple value from kernel space to user
   * space.  It supports simple types like char and int, but not larger
@@ -119,7 +123,8 @@
   * @x:   Variable to store result.
   * @ptr: Source address, in user space.
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * This macro copies a single simple variable from user space to kernel
   * space.  It supports simple types like char and int, but not larger
diff --git a/arch/score/lib/string.S b/arch/score/lib/string.S

index 00b7d3a2fc60681253eb2e1c1b874e48bbd02a4a..16efa3ad037f7cffbdbb4a5ffcf57a5d25325648 100644 (file)
--- a/arch/score/lib/string.S
+++ b/arch/score/lib/string.S
@@ -175,10 +175,10 @@ ENTRY(__clear_user)
         br      r3
  
         .section .fixup, "ax"
+99:
         br      r3
         .previous
         .section __ex_table, "a"
         .align  2
-99:
         .word   0b, 99b
         .previous
diff --git a/arch/score/mm/fault.c b/arch/score/mm/fault.c

index 6860beb2a280d0a4a65a67c89ad2201b33513068..37a6c2e0e96926f26902484969befaf860fc52fc 100644 (file)
--- a/arch/score/mm/fault.c
+++ b/arch/score/mm/fault.c
@@ -34,6 +34,7 @@
  #include <linux/string.h>
  #include <linux/types.h>
  #include <linux/ptrace.h>
+#include <linux/uaccess.h>
  
  /*
   * This routine handles page faults.  It determines the address,
@@ -73,7 +74,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write,
         * If we're in an interrupt or have no user
         * context, we must not take the fault..
         */
-       if (in_atomic() || !mm)
+       if (pagefault_disabled() || !mm)
                 goto bad_area_nosemaphore;
  
         if (user_mode(regs))
diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c

index a58fec9b55e016df85cdfb7c214cc385e300479c..79d8276377d1e2f62e6c9231f735ef0f1c5ca22e 100644 (file)
--- a/arch/sh/mm/fault.c
+++ b/arch/sh/mm/fault.c
@@ -17,6 +17,7 @@
  #include <linux/kprobes.h>
  #include <linux/perf_event.h>
  #include <linux/kdebug.h>
+#include <linux/uaccess.h>
  #include <asm/io_trapped.h>
  #include <asm/mmu_context.h>
  #include <asm/tlbflush.h>
@@ -438,9 +439,9 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
  
         /*
          * If we're in an interrupt, have no user context or are running
-        * in an atomic region then we must not take the fault:
+        * with pagefaults disabled then we must not take the fault:
          */
-       if (unlikely(in_atomic() || !mm)) {
+       if (unlikely(faulthandler_disabled() || !mm)) {
                 bad_area_nosemaphore(regs, error_code, address);
                 return;
         }
diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h

index d1761df5cca6fe2814c19a343274d2884a3cf0c4..01d17046225a8aa084b2a8f9d70a235bcab681f8 100644 (file)
--- a/arch/sparc/include/asm/topology_64.h
+++ b/arch/sparc/include/asm/topology_64.h
@@ -41,7 +41,7 @@ static inline int pcibus_to_node(struct pci_bus *pbus)
  #define topology_physical_package_id(cpu)      (cpu_data(cpu).proc_id)
  #define topology_core_id(cpu)                  (cpu_data(cpu).core_id)
  #define topology_core_cpumask(cpu)             (&cpu_core_sib_map[cpu])
-#define topology_thread_cpumask(cpu)           (&per_cpu(cpu_sibling_map, cpu))
+#define topology_sibling_cpumask(cpu)          (&per_cpu(cpu_sibling_map, cpu))
  #endif /* CONFIG_SMP */
  
  extern cpumask_t cpu_core_map[NR_CPUS];
diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c

index 70d817154fe8bfd04aeaa71f45f15667f4962c23..c399e7b3b035250d66ed4522d2da190dc6169aa3 100644 (file)
--- a/arch/sparc/mm/fault_32.c
+++ b/arch/sparc/mm/fault_32.c
@@ -21,6 +21,7 @@
  #include <linux/perf_event.h>
  #include <linux/interrupt.h>
  #include <linux/kdebug.h>
+#include <linux/uaccess.h>
  
  #include <asm/page.h>
  #include <asm/pgtable.h>
@@ -29,7 +30,6 @@
  #include <asm/setup.h>
  #include <asm/smp.h>
  #include <asm/traps.h>
-#include <asm/uaccess.h>
  
  #include "mm_32.h"
  
@@ -196,7 +196,7 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write,
          * If we're in an interrupt or have no user
          * context, we must not take the fault..
          */
-       if (in_atomic() || !mm)
+       if (pagefault_disabled() || !mm)
                 goto no_context;
  
         perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c

index 4798232494294a7ece0bef232216dd4a26408d88..e9268ea1a68de5364d468973e61b3cbd232c19c9 100644 (file)
--- a/arch/sparc/mm/fault_64.c
+++ b/arch/sparc/mm/fault_64.c
@@ -22,12 +22,12 @@
  #include <linux/kdebug.h>
  #include <linux/percpu.h>
  #include <linux/context_tracking.h>
+#include <linux/uaccess.h>
  
  #include <asm/page.h>
  #include <asm/pgtable.h>
  #include <asm/openprom.h>
  #include <asm/oplib.h>
-#include <asm/uaccess.h>
  #include <asm/asi.h>
  #include <asm/lsu.h>
  #include <asm/sections.h>
@@ -330,7 +330,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)
          * If we're in an interrupt or have no user
          * context, we must not take the fault..
          */
-       if (in_atomic() || !mm)
+       if (faulthandler_disabled() || !mm)
                 goto intr_or_no_mm;
  
         perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c

index 449f864f0cefdb8918bf0002d40dc87ba2cb8db3..a454ec5ff07af7f33d8ec548051101c3895e9b82 100644 (file)
--- a/arch/sparc/mm/highmem.c
+++ b/arch/sparc/mm/highmem.c
@@ -53,7 +53,7 @@ void *kmap_atomic(struct page *page)
         unsigned long vaddr;
         long idx, type;
  
-       /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+       preempt_disable();
         pagefault_disable();
         if (!PageHighMem(page))
                 return page_address(page);
@@ -91,6 +91,7 @@ void __kunmap_atomic(void *kvaddr)
  
         if (vaddr < FIXADDR_START) { // FIXME
                 pagefault_enable();
+               preempt_enable();
                 return;
         }
  
@@ -126,5 +127,6 @@ void __kunmap_atomic(void *kvaddr)
  
         kmap_atomic_idx_pop();
         pagefault_enable();
+       preempt_enable();
  }
  EXPORT_SYMBOL(__kunmap_atomic);
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c

index 559cb744112ccd608bf4288470398fb21350b0ce..c5d08b89a96c811ce3d77d81e0ec7b14570d0ac8 100644 (file)
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2738,7 +2738,7 @@ void hugetlb_setup(struct pt_regs *regs)
         struct mm_struct *mm = current->mm;
         struct tsb_config *tp;
  
-       if (in_atomic() || !mm) {
+       if (faulthandler_disabled() || !mm) {
                 const struct exception_table_entry *entry;
  
                 entry = search_exception_tables(regs->tpc);
diff --git a/arch/tile/include/asm/topology.h b/arch/tile/include/asm/topology.h

index 938311844233b8c7e2753b5982bbdd0a302d5304..76b0d0ebb24433e1b064cda5dbb23eef5e9127dc 100644 (file)
--- a/arch/tile/include/asm/topology.h
+++ b/arch/tile/include/asm/topology.h
@@ -55,7 +55,7 @@ static inline const struct cpumask *cpumask_of_node(int node)
  #define topology_physical_package_id(cpu)       ((void)(cpu), 0)
  #define topology_core_id(cpu)                   (cpu)
  #define topology_core_cpumask(cpu)              ((void)(cpu), cpu_online_mask)
-#define topology_thread_cpumask(cpu)            cpumask_of(cpu)
+#define topology_sibling_cpumask(cpu)           cpumask_of(cpu)
  #endif
  
  #endif /* _ASM_TILE_TOPOLOGY_H */
diff --git a/arch/tile/include/asm/uaccess.h b/arch/tile/include/asm/uaccess.h

index f41cb53cf645dcb59287550aff82833e526c2e2c..a33276bf5ca1dfd284bc50545e107800c93cf651 100644 (file)
--- a/arch/tile/include/asm/uaccess.h
+++ b/arch/tile/include/asm/uaccess.h
@@ -78,7 +78,8 @@ int __range_ok(unsigned long addr, unsigned long size);
   * @addr: User space pointer to start of block to check
   * @size: Size of block to check
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * Checks if a pointer to a block of memory in user space is valid.
   *
@@ -192,7 +193,8 @@ extern int __get_user_bad(void)
   * @x:   Variable to store result.
   * @ptr: Source address, in user space.
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * This macro copies a single simple variable from user space to kernel
   * space.  It supports simple types like char and int, but not larger
@@ -274,7 +276,8 @@ extern int __put_user_bad(void)
   * @x:   Value to copy to user space.
   * @ptr: Destination address, in user space.
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * This macro copies a single simple value from kernel space to user
   * space.  It supports simple types like char and int, but not larger
@@ -330,7 +333,8 @@ extern int __put_user_bad(void)
   * @from: Source address, in kernel space.
   * @n:    Number of bytes to copy.
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * Copy data from kernel space to user space.  Caller must check
   * the specified block with access_ok() before calling this function.
@@ -366,7 +370,8 @@ copy_to_user(void __user *to, const void *from, unsigned long n)
   * @from: Source address, in user space.
   * @n:    Number of bytes to copy.
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * Copy data from user space to kernel space.  Caller must check
   * the specified block with access_ok() before calling this function.
@@ -437,7 +442,8 @@ static inline unsigned long __must_check copy_from_user(void *to,
   * @from: Source address, in user space.
   * @n:    Number of bytes to copy.
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * Copy data from user space to user space.  Caller must check
   * the specified blocks with access_ok() before calling this function.
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c

index e83cc999da029b469fde1f2e7f2b4d0f5a96dbb6..3f4f58d34a92b6029615756892c1fe207d6f6701 100644 (file)
--- a/arch/tile/mm/fault.c
+++ b/arch/tile/mm/fault.c
@@ -354,9 +354,9 @@ static int handle_page_fault(struct pt_regs *regs,
  
         /*
          * If we're in an interrupt, have no user context or are running in an
-        * atomic region then we must not take the fault.
+        * region with pagefaults disabled then we must not take the fault.
          */
-       if (in_atomic() || !mm) {
+       if (pagefault_disabled() || !mm) {
                 vma = NULL;  /* happy compiler */
                 goto bad_area_nosemaphore;
         }
diff --git a/arch/tile/mm/highmem.c b/arch/tile/mm/highmem.c

index 6aa2f26254471e730e29b0c1f1382c3f7a4d0f64..fcd545014e79dcc83a662f4db5a37e7511e695a0 100644 (file)
--- a/arch/tile/mm/highmem.c
+++ b/arch/tile/mm/highmem.c
@@ -201,7 +201,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
         int idx, type;
         pte_t *pte;
  
-       /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+       preempt_disable();
         pagefault_disable();
  
         /* Avoid icache flushes by disallowing atomic executable mappings. */
@@ -259,6 +259,7 @@ void __kunmap_atomic(void *kvaddr)
         }
  
         pagefault_enable();
+       preempt_enable();
  }
  EXPORT_SYMBOL(__kunmap_atomic);
  
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c

index 8e4daf44e9805ed2380b72826a699ed139819ee3..47ff9b7f3e5d39d8a05e088b1308568836d4e332 100644 (file)
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -7,6 +7,7 @@
  #include <linux/sched.h>
  #include <linux/hardirq.h>
  #include <linux/module.h>
+#include <linux/uaccess.h>
  #include <asm/current.h>
  #include <asm/pgtable.h>
  #include <asm/tlbflush.h>
@@ -35,10 +36,10 @@ int handle_page_fault(unsigned long address, unsigned long ip,
         *code_out = SEGV_MAPERR;
  
         /*
-        * If the fault was during atomic operation, don't take the fault, just
+        * If the fault was with pagefaults disabled, don't take the fault, just
          * fail.
          */
-       if (in_atomic())
+       if (faulthandler_disabled())
                 goto out_nosemaphore;
  
         if (is_user)
diff --git a/arch/unicore32/mm/fault.c b/arch/unicore32/mm/fault.c

index 0dc922dba9154d7cfcfe5352ec8ec77169e8082f..afccef5529ccb8d44409519aa075476af747722e 100644 (file)
--- a/arch/unicore32/mm/fault.c
+++ b/arch/unicore32/mm/fault.c
@@ -218,7 +218,7 @@ static int do_pf(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
          * If we're in an interrupt or have no user
          * context, we must not take the fault..
          */
-       if (in_atomic() || !mm)
+       if (faulthandler_disabled() || !mm)
                 goto no_context;
  
         if (user_mode(regs))
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug

index a5973f8517503759475fba8481512d693dcedbfc..a15893d17c55988b542ad362f8c0074958130587 100644 (file)
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -332,6 +332,18 @@ config X86_DEBUG_STATIC_CPU_HAS
  
           If unsure, say N.
  
+config X86_DEBUG_FPU
+       bool "Debug the x86 FPU code"
+       depends on DEBUG_KERNEL
+       default y
+       ---help---
+         If this option is enabled then there will be extra sanity
+         checks and (boot time) debug printouts added to the kernel.
+         This debugging adds some small amount of runtime overhead
+         to the kernel.
+
+         If unsure, say N.
+
  config PUNIT_ATOM_DEBUG
         tristate "ATOM Punit debug driver"
         select DEBUG_FS
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h

index 89dd0d78013aaff6c889340e0e3caceb4c8f8c88..805d25ca5f1db1602498c7047025b973ac788b3c 100644 (file)
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -2,15 +2,14 @@
  #define BOOT_COMPRESSED_MISC_H
  
  /*
- * we have to be careful, because no indirections are allowed here, and
- * paravirt_ops is a kind of one. As it will only run in baremetal anyway,
- * we just keep it from happening
+ * Special hack: we have to be careful, because no indirections are allowed here,
+ * and paravirt_ops is a kind of one. As it will only run in baremetal anyway,
+ * we just keep it from happening. (This list needs to be extended when new
+ * paravirt and debugging variants are added.)
   */
  #undef CONFIG_PARAVIRT
+#undef CONFIG_PARAVIRT_SPINLOCKS
  #undef CONFIG_KASAN
-#ifdef CONFIG_X86_32
-#define _ASM_X86_DESC_H 1
-#endif
  
  #include <linux/linkage.h>
  #include <linux/screen_info.h>
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c

index 112cefacf2af07b216cb598232bcc586a76fa919..b419f43ce0c589ce934195b77296760dc300830c 100644 (file)
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -32,7 +32,7 @@
  #include <crypto/lrw.h>
  #include <crypto/xts.h>
  #include <asm/cpu_device_id.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
  #include <asm/crypto/aes.h>
  #include <crypto/ablk_helper.h>
  #include <crypto/scatterwalk.h>
diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c

index baf0ac21ace5664835ed9435207ddabc3dc0cc7a..4c65c70e628bb2776c17c106973b2f2b668890fb 100644 (file)
--- a/arch/x86/crypto/camellia_aesni_avx2_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c
@@ -19,8 +19,7 @@
  #include <crypto/ctr.h>
  #include <crypto/lrw.h>
  #include <crypto/xts.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
+#include <asm/fpu/api.h>
  #include <asm/crypto/camellia.h>
  #include <asm/crypto/glue_helper.h>
  
@@ -561,16 +560,15 @@ static struct crypto_alg cmll_algs[10] = { {
  
  static int __init camellia_aesni_init(void)
  {
-       u64 xcr0;
+       const char *feature_name;
  
         if (!cpu_has_avx2 || !cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) {
                 pr_info("AVX2 or AES-NI instructions are not detected.\n");
                 return -ENODEV;
         }
  
-       xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-       if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
-               pr_info("AVX2 detected but unusable.\n");
+       if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+               pr_info("CPU feature '%s' is not supported.\n", feature_name);
                 return -ENODEV;
         }
  
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c

index 78818a1e73e3f62b0e7be68123fcef148e978f77..80a0e4389c9ad3f5e6e1f6d8bc5292e391801ff2 100644 (file)
--- a/arch/x86/crypto/camellia_aesni_avx_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -19,8 +19,7 @@
  #include <crypto/ctr.h>
  #include <crypto/lrw.h>
  #include <crypto/xts.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
+#include <asm/fpu/api.h>
  #include <asm/crypto/camellia.h>
  #include <asm/crypto/glue_helper.h>
  
@@ -553,16 +552,10 @@ static struct crypto_alg cmll_algs[10] = { {
  
  static int __init camellia_aesni_init(void)
  {
-       u64 xcr0;
+       const char *feature_name;
  
-       if (!cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) {
-               pr_info("AVX or AES-NI instructions are not detected.\n");
-               return -ENODEV;
-       }
-
-       xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-       if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
-               pr_info("AVX detected but unusable.\n");
+       if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+               pr_info("CPU feature '%s' is not supported.\n", feature_name);
                 return -ENODEV;
         }
  
diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c

index 236c80974457b97575a585911aabbdf72b23c4f8..be00aa48b2b5e3044ea8a397b0df37428e409447 100644 (file)
--- a/arch/x86/crypto/cast5_avx_glue.c
+++ b/arch/x86/crypto/cast5_avx_glue.c
@@ -31,8 +31,7 @@
  #include <crypto/cast5.h>
  #include <crypto/cryptd.h>
  #include <crypto/ctr.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
+#include <asm/fpu/api.h>
  #include <asm/crypto/glue_helper.h>
  
  #define CAST5_PARALLEL_BLOCKS 16
@@ -468,16 +467,10 @@ static struct crypto_alg cast5_algs[6] = { {
  
  static int __init cast5_init(void)
  {
-       u64 xcr0;
+       const char *feature_name;
  
-       if (!cpu_has_avx || !cpu_has_osxsave) {
-               pr_info("AVX instructions are not detected.\n");
-               return -ENODEV;
-       }
-
-       xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-       if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
-               pr_info("AVX detected but unusable.\n");
+       if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+               pr_info("CPU feature '%s' is not supported.\n", feature_name);
                 return -ENODEV;
         }
  
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c

index f448810ca4ac1dbac428d9eadee47ecaf546c6a3..5dbba72242217541a92056068378abd86e67c378 100644 (file)
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -36,8 +36,7 @@
  #include <crypto/ctr.h>
  #include <crypto/lrw.h>
  #include <crypto/xts.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
+#include <asm/fpu/api.h>
  #include <asm/crypto/glue_helper.h>
  
  #define CAST6_PARALLEL_BLOCKS 8
@@ -590,16 +589,10 @@ static struct crypto_alg cast6_algs[10] = { {
  
  static int __init cast6_init(void)
  {
-       u64 xcr0;
+       const char *feature_name;
  
-       if (!cpu_has_avx || !cpu_has_osxsave) {
-               pr_info("AVX instructions are not detected.\n");
-               return -ENODEV;
-       }
-
-       xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-       if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
-               pr_info("AVX detected but unusable.\n");
+       if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+               pr_info("CPU feature '%s' is not supported.\n", feature_name);
                 return -ENODEV;
         }
  
diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c

index 1937fc1d876338aa0aa9bb5fddea9e0aa3541707..07d2c6c86a5483216684970489fcba3b4478007d 100644 (file)
--- a/arch/x86/crypto/crc32-pclmul_glue.c
+++ b/arch/x86/crypto/crc32-pclmul_glue.c
@@ -35,7 +35,7 @@
  
  #include <asm/cpufeature.h>
  #include <asm/cpu_device_id.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
  
  #define CHKSUM_BLOCK_SIZE      1
  #define CHKSUM_DIGEST_SIZE     4
diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c

index 28640c3d6af7f6172a8fe39d4553c98019614e24..81a595d75cf5959bbcae8c2096ebdf0f538bf7f9 100644 (file)
--- a/arch/x86/crypto/crc32c-intel_glue.c
+++ b/arch/x86/crypto/crc32c-intel_glue.c
@@ -32,8 +32,7 @@
  
  #include <asm/cpufeature.h>
  #include <asm/cpu_device_id.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
  
  #define CHKSUM_BLOCK_SIZE      1
  #define CHKSUM_DIGEST_SIZE     4
diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c

index b6c67bf30fdf6704f6d83b093ee73ae7d9b77fcf..a3fcfc97a311d5b660e0dcda3be489038f231dce 100644 (file)
--- a/arch/x86/crypto/crct10dif-pclmul_glue.c
+++ b/arch/x86/crypto/crct10dif-pclmul_glue.c
@@ -29,7 +29,7 @@
  #include <linux/init.h>
  #include <linux/string.h>
  #include <linux/kernel.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
  #include <asm/cpufeature.h>
  #include <asm/cpu_device_id.h>
  
diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c

index f368ba261739fa09be28bc02fe34cf3112099fa8..5a2f30f9f52dca78820b67cef7c025c53aa0b735 100644 (file)
--- a/arch/x86/crypto/fpu.c
+++ b/arch/x86/crypto/fpu.c
@@ -18,7 +18,7 @@
  #include <linux/module.h>
  #include <linux/slab.h>
  #include <linux/crypto.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
  
  struct crypto_fpu_ctx {
         struct crypto_blkcipher *child;
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c

index 2079baf06bdd3b64e7f61ec3fecf088c4bd8bc24..64d7cf1b50e112ab370ac63e59e9ff89d4d677e0 100644 (file)
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -19,7 +19,7 @@
  #include <crypto/cryptd.h>
  #include <crypto/gf128mul.h>
  #include <crypto/internal/hash.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
  #include <asm/cpu_device_id.h>
  
  #define GHASH_BLOCK_SIZE       16
diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c

index 2f63dc89e7a9ed0fa55b1ede20e7bd0ba2c060ca..7d838dc4d888f30010dfba7c21561a16bfe4e3d0 100644 (file)
--- a/arch/x86/crypto/serpent_avx2_glue.c
+++ b/arch/x86/crypto/serpent_avx2_glue.c
@@ -20,8 +20,7 @@
  #include <crypto/lrw.h>
  #include <crypto/xts.h>
  #include <crypto/serpent.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
+#include <asm/fpu/api.h>
  #include <asm/crypto/serpent-avx.h>
  #include <asm/crypto/glue_helper.h>
  
@@ -537,16 +536,14 @@ static struct crypto_alg srp_algs[10] = { {
  
  static int __init init(void)
  {
-       u64 xcr0;
+       const char *feature_name;
  
         if (!cpu_has_avx2 || !cpu_has_osxsave) {
                 pr_info("AVX2 instructions are not detected.\n");
                 return -ENODEV;
         }
-
-       xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-       if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
-               pr_info("AVX detected but unusable.\n");
+       if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+               pr_info("CPU feature '%s' is not supported.\n", feature_name);
                 return -ENODEV;
         }
  
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c

index c8d478af84563e72a4ca21b9b5fd725b4c8aa248..da7dafc9b16d5cca124e4f6d01d4462f0cb92b15 100644 (file)
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -36,8 +36,7 @@
  #include <crypto/ctr.h>
  #include <crypto/lrw.h>
  #include <crypto/xts.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
+#include <asm/fpu/api.h>
  #include <asm/crypto/serpent-avx.h>
  #include <asm/crypto/glue_helper.h>
  
@@ -596,16 +595,10 @@ static struct crypto_alg serpent_algs[10] = { {
  
  static int __init serpent_init(void)
  {
-       u64 xcr0;
+       const char *feature_name;
  
-       if (!cpu_has_avx || !cpu_has_osxsave) {
-               printk(KERN_INFO "AVX instructions are not detected.\n");
-               return -ENODEV;
-       }
-
-       xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-       if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
-               printk(KERN_INFO "AVX detected but unusable.\n");
+       if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+               pr_info("CPU feature '%s' is not supported.\n", feature_name);
                 return -ENODEV;
         }
  
diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c

index e510b1c5d690a5115231c608a51b6d4a37932d76..f53ed1dc88eafdbfe95489e49cae16efe1eb8086 100644 (file)
--- a/arch/x86/crypto/sha-mb/sha1_mb.c
+++ b/arch/x86/crypto/sha-mb/sha1_mb.c
@@ -65,11 +65,8 @@
  #include <crypto/mcryptd.h>
  #include <crypto/crypto_wq.h>
  #include <asm/byteorder.h>
-#include <asm/i387.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
  #include <linux/hardirq.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/api.h>
  #include "sha_mb_ctx.h"
  
  #define FLUSH_INTERVAL 1000 /* in usec */
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c

index 33d1b9dc14cc751203ba72af1585f6685558b7fc..7c48e8b20848e5c890825f25db91c63616dce65d 100644 (file)
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -29,9 +29,7 @@
  #include <linux/types.h>
  #include <crypto/sha.h>
  #include <crypto/sha1_base.h>
-#include <asm/i387.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
+#include <asm/fpu/api.h>
  
  
  asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data,
@@ -123,15 +121,9 @@ static struct shash_alg alg = {
  #ifdef CONFIG_AS_AVX
  static bool __init avx_usable(void)
  {
-       u64 xcr0;
-
-       if (!cpu_has_avx || !cpu_has_osxsave)
-               return false;
-
-       xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-       if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
-               pr_info("AVX detected but unusable.\n");
-
+       if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL)) {
+               if (cpu_has_avx)
+                       pr_info("AVX detected but unusable.\n");
                 return false;
         }
  
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c

index ccc338881ee81886b6de43963df23d179d510b22..f8097fc0d1d1b56eb6d48389ac2778145897cfb2 100644 (file)
--- a/arch/x86/crypto/sha256_ssse3_glue.c
+++ b/arch/x86/crypto/sha256_ssse3_glue.c
@@ -37,9 +37,7 @@
  #include <linux/types.h>
  #include <crypto/sha.h>
  #include <crypto/sha256_base.h>
-#include <asm/i387.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
+#include <asm/fpu/api.h>
  #include <linux/string.h>
  
  asmlinkage void sha256_transform_ssse3(u32 *digest, const char *data,
@@ -132,15 +130,9 @@ static struct shash_alg algs[] = { {
  #ifdef CONFIG_AS_AVX
  static bool __init avx_usable(void)
  {
-       u64 xcr0;
-
-       if (!cpu_has_avx || !cpu_has_osxsave)
-               return false;
-
-       xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-       if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
-               pr_info("AVX detected but unusable.\n");
-
+       if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL)) {
+               if (cpu_has_avx)
+                       pr_info("AVX detected but unusable.\n");
                 return false;
         }
  
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c

index d9fa4c1e063ff631cc09e168c6c6747527edf334..2edad7b81870154055e86694142a66de7fbc48ff 100644 (file)
--- a/arch/x86/crypto/sha512_ssse3_glue.c
+++ b/arch/x86/crypto/sha512_ssse3_glue.c
@@ -35,9 +35,7 @@
  #include <linux/types.h>
  #include <crypto/sha.h>
  #include <crypto/sha512_base.h>
-#include <asm/i387.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
+#include <asm/fpu/api.h>
  
  #include <linux/string.h>
  
@@ -131,15 +129,9 @@ static struct shash_alg algs[] = { {
  #ifdef CONFIG_AS_AVX
  static bool __init avx_usable(void)
  {
-       u64 xcr0;
-
-       if (!cpu_has_avx || !cpu_has_osxsave)
-               return false;
-
-       xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-       if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
-               pr_info("AVX detected but unusable.\n");
-
+       if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL)) {
+               if (cpu_has_avx)
+                       pr_info("AVX detected but unusable.\n");
                 return false;
         }
  
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c

index b5e2d56518517010e6869d592aa505450965e36f..c2bd0ce718eee505272249386dffef3cc417dad8 100644 (file)
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -36,9 +36,7 @@
  #include <crypto/ctr.h>
  #include <crypto/lrw.h>
  #include <crypto/xts.h>
-#include <asm/i387.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
+#include <asm/fpu/api.h>
  #include <asm/crypto/twofish.h>
  #include <asm/crypto/glue_helper.h>
  #include <crypto/scatterwalk.h>
@@ -558,16 +556,10 @@ static struct crypto_alg twofish_algs[10] = { {
  
  static int __init twofish_init(void)
  {
-       u64 xcr0;
+       const char *feature_name;
  
-       if (!cpu_has_avx || !cpu_has_osxsave) {
-               printk(KERN_INFO "AVX instructions are not detected.\n");
-               return -ENODEV;
-       }
-
-       xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-       if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
-               printk(KERN_INFO "AVX detected but unusable.\n");
+       if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+               pr_info("CPU feature '%s' is not supported.\n", feature_name);
                 return -ENODEV;
         }
  
diff --git a/arch/x86/entry/thunk_32.S b/arch/x86/entry/thunk_32.S

index e9acf5f4fc923caa2bd71cfa175462617130ed89..e5a17114a8c4f917b56819bc6f7ff80ff0ac4798 100644 (file)
--- a/arch/x86/entry/thunk_32.S
+++ b/arch/x86/entry/thunk_32.S
@@ -35,8 +35,6 @@
  
  #ifdef CONFIG_PREEMPT
         THUNK ___preempt_schedule, preempt_schedule
-#ifdef CONFIG_CONTEXT_TRACKING
-       THUNK ___preempt_schedule_context, preempt_schedule_context
-#endif
+       THUNK ___preempt_schedule_notrace, preempt_schedule_notrace
  #endif
  
diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S

index 3e95681b4e2dde8cabe3e5f8a0d2d4ceca4e23fa..efb2b932b7483419ec6921ac800506012e9030ae 100644 (file)
--- a/arch/x86/entry/thunk_64.S
+++ b/arch/x86/entry/thunk_64.S
@@ -46,9 +46,7 @@
  
  #ifdef CONFIG_PREEMPT
         THUNK ___preempt_schedule, preempt_schedule
-#ifdef CONFIG_CONTEXT_TRACKING
-       THUNK ___preempt_schedule_context, preempt_schedule_context
-#endif
+       THUNK ___preempt_schedule_notrace, preempt_schedule_notrace
  #endif
  
  #if defined(CONFIG_TRACE_IRQFLAGS) \
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c

index c81d35e6c7f1d91c22734793c006c0f5f33c0c10..ae3a29ae875b5508d62b4d91c79db3f1dc26581d 100644 (file)
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -21,8 +21,8 @@
  #include <linux/binfmts.h>
  #include <asm/ucontext.h>
  #include <asm/uaccess.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
+#include <asm/fpu/signal.h>
  #include <asm/ptrace.h>
  #include <asm/ia32_unistd.h>
  #include <asm/user32.h>
@@ -198,7 +198,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
                 buf = compat_ptr(tmp);
         } get_user_catch(err);
  
-       err |= restore_xstate_sig(buf, 1);
+       err |= fpu__restore_sig(buf, 1);
  
         force_iret();
  
@@ -308,6 +308,7 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs,
                                  size_t frame_size,
                                  void __user **fpstate)
  {
+       struct fpu *fpu = &current->thread.fpu;
         unsigned long sp;
  
         /* Default to using normal stack */
@@ -322,12 +323,12 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs,
                  ksig->ka.sa.sa_restorer)
                 sp = (unsigned long) ksig->ka.sa.sa_restorer;
  
-       if (used_math()) {
+       if (fpu->fpstate_active) {
                 unsigned long fx_aligned, math_size;
  
-               sp = alloc_mathframe(sp, 1, &fx_aligned, &math_size);
+               sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size);
                 *fpstate = (struct _fpstate_ia32 __user *) sp;
-               if (save_xstate_sig(*fpstate, (void __user *)fx_aligned,
+               if (copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned,
                                     math_size) < 0)
                         return (void __user *) -1L;
         }
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h

index ba32af062f61d69164a792630e3257c8cdc6deb5..7bfc85bbb8ffc0578011ceac2c08548bd140ade3 100644 (file)
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -52,6 +52,12 @@ struct alt_instr {
         u8  padlen;             /* length of build-time padding */
  } __packed;
  
+/*
+ * Debug flag that can be tested to see whether alternative
+ * instructions were patched in already:
+ */
+extern int alternatives_patched;
+
  extern void alternative_instructions(void);
  extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
  
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h

index aaac3b2fb746d3e61019f9e0804d7bf2913f8de2..1a5da2e63aeeebc062bd0f5c08b36e4bda32707d 100644 (file)
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -98,11 +98,22 @@ static inline u16 amd_get_node_id(struct pci_dev *pdev)
         return 0;
  }
  
+static inline bool amd_gart_present(void)
+{
+       /* GART present on Fam 0fh and 10h, and on Fam 15h up to model 0fh */
+       if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
+           (boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model < 0x10))
+               return true;
+
+       return false;
+}
+
  #else
  
  #define amd_nb_num(x)          0
  #define amd_nb_has_feature(x)  false
  #define node_to_amd_nb(x)      NULL
+#define amd_gart_present(x)    false
  
  #endif
  
diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h

index 1eef55596e82cade6ecc0e910488a6fcc9e0b74a..03bb1065c3352826843a305b15399751a5c860ae 100644 (file)
--- a/arch/x86/include/asm/crypto/glue_helper.h
+++ b/arch/x86/include/asm/crypto/glue_helper.h
@@ -7,7 +7,7 @@
  
  #include <linux/kernel.h>
  #include <linux/crypto.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
  #include <crypto/b128ops.h>
  
  typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src);
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h

index 808dae63eeea6f73eb312d4f5b7f77cf4e869e6c..1f5b7287d1ad8df92f789003018fec3913b03e1c 100644 (file)
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -127,50 +127,14 @@ static inline gfp_t dma_alloc_coherent_gfp_flags(struct device *dev, gfp_t gfp)
  
  #define dma_alloc_coherent(d,s,h,f)    dma_alloc_attrs(d,s,h,f,NULL)
  
-static inline void *
+void *
  dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
-               gfp_t gfp, struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       void *memory;
-
-       gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
-
-       if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
-               return memory;
-
-       if (!dev)
-               dev = &x86_dma_fallback_dev;
-
-       if (!is_device_dma_capable(dev))
-               return NULL;
-
-       if (!ops->alloc)
-               return NULL;
-
-       memory = ops->alloc(dev, size, dma_handle,
-                           dma_alloc_coherent_gfp_flags(dev, gfp), attrs);
-       debug_dma_alloc_coherent(dev, size, *dma_handle, memory);
-
-       return memory;
-}
+               gfp_t gfp, struct dma_attrs *attrs);
  
  #define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
  
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *vaddr, dma_addr_t bus,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       WARN_ON(irqs_disabled());       /* for portability */
-
-       if (dma_release_from_coherent(dev, get_order(size), vaddr))
-               return;
-
-       debug_dma_free_coherent(dev, size, vaddr, bus);
-       if (ops->free)
-               ops->free(dev, size, vaddr, bus, attrs);
-}
+void dma_free_attrs(struct device *dev, size_t size,
+                   void *vaddr, dma_addr_t bus,
+                   struct dma_attrs *attrs);
  
  #endif
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h

index 3738b138b843d46467c75a910d916cc79ebad25f..155162ea0e00292b619cc2a02ff8b2f31fafd02b 100644 (file)
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -1,7 +1,7 @@
  #ifndef _ASM_X86_EFI_H
  #define _ASM_X86_EFI_H
  
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
  #include <asm/pgtable.h>
  
  /*
diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h

deleted file mode 100644 (file)

index da5e967..0000000
--- a/arch/x86/include/asm/fpu-internal.h
+++ /dev/null
@@ -1,626 +0,0 @@
-/*
- * Copyright (C) 1994 Linus Torvalds
- *
- * Pentium III FXSR, SSE support
- * General FPU state handling cleanups
- *     Gareth Hughes <gareth@valinux.com>, May 2000
- * x86-64 work by Andi Kleen 2002
- */
-
-#ifndef _FPU_INTERNAL_H
-#define _FPU_INTERNAL_H
-
-#include <linux/kernel_stat.h>
-#include <linux/regset.h>
-#include <linux/compat.h>
-#include <linux/slab.h>
-#include <asm/asm.h>
-#include <asm/cpufeature.h>
-#include <asm/processor.h>
-#include <asm/sigcontext.h>
-#include <asm/user.h>
-#include <asm/uaccess.h>
-#include <asm/xsave.h>
-#include <asm/smap.h>
-
-#ifdef CONFIG_X86_64
-# include <asm/sigcontext32.h>
-# include <asm/user32.h>
-struct ksignal;
-int ia32_setup_rt_frame(int sig, struct ksignal *ksig,
-                       compat_sigset_t *set, struct pt_regs *regs);
-int ia32_setup_frame(int sig, struct ksignal *ksig,
-                    compat_sigset_t *set, struct pt_regs *regs);
-#else
-# define user_i387_ia32_struct user_i387_struct
-# define user32_fxsr_struct    user_fxsr_struct
-# define ia32_setup_frame      __setup_frame
-# define ia32_setup_rt_frame   __setup_rt_frame
-#endif
-
-extern unsigned int mxcsr_feature_mask;
-extern void fpu_init(void);
-extern void eager_fpu_init(void);
-
-DECLARE_PER_CPU(struct task_struct *, fpu_owner_task);
-
-extern void convert_from_fxsr(struct user_i387_ia32_struct *env,
-                             struct task_struct *tsk);
-extern void convert_to_fxsr(struct task_struct *tsk,
-                           const struct user_i387_ia32_struct *env);
-
-extern user_regset_active_fn fpregs_active, xfpregs_active;
-extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get,
-                               xstateregs_get;
-extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set,
-                                xstateregs_set;
-
-/*
- * xstateregs_active == fpregs_active. Please refer to the comment
- * at the definition of fpregs_active.
- */
-#define xstateregs_active      fpregs_active
-
-#ifdef CONFIG_MATH_EMULATION
-extern void finit_soft_fpu(struct i387_soft_struct *soft);
-#else
-static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}
-#endif
-
-/*
- * Must be run with preemption disabled: this clears the fpu_owner_task,
- * on this CPU.
- *
- * This will disable any lazy FPU state restore of the current FPU state,
- * but if the current thread owns the FPU, it will still be saved by.
- */
-static inline void __cpu_disable_lazy_restore(unsigned int cpu)
-{
-       per_cpu(fpu_owner_task, cpu) = NULL;
-}
-
-/*
- * Used to indicate that the FPU state in memory is newer than the FPU
- * state in registers, and the FPU state should be reloaded next time the
- * task is run. Only safe on the current task, or non-running tasks.
- */
-static inline void task_disable_lazy_fpu_restore(struct task_struct *tsk)
-{
-       tsk->thread.fpu.last_cpu = ~0;
-}
-
-static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu)
-{
-       return new == this_cpu_read_stable(fpu_owner_task) &&
-               cpu == new->thread.fpu.last_cpu;
-}
-
-static inline int is_ia32_compat_frame(void)
-{
-       return config_enabled(CONFIG_IA32_EMULATION) &&
-              test_thread_flag(TIF_IA32);
-}
-
-static inline int is_ia32_frame(void)
-{
-       return config_enabled(CONFIG_X86_32) || is_ia32_compat_frame();
-}
-
-static inline int is_x32_frame(void)
-{
-       return config_enabled(CONFIG_X86_X32_ABI) && test_thread_flag(TIF_X32);
-}
-
-#define X87_FSW_ES (1 << 7)    /* Exception Summary */
-
-static __always_inline __pure bool use_eager_fpu(void)
-{
-       return static_cpu_has_safe(X86_FEATURE_EAGER_FPU);
-}
-
-static __always_inline __pure bool use_xsaveopt(void)
-{
-       return static_cpu_has_safe(X86_FEATURE_XSAVEOPT);
-}
-
-static __always_inline __pure bool use_xsave(void)
-{
-       return static_cpu_has_safe(X86_FEATURE_XSAVE);
-}
-
-static __always_inline __pure bool use_fxsr(void)
-{
-       return static_cpu_has_safe(X86_FEATURE_FXSR);
-}
-
-static inline void fx_finit(struct i387_fxsave_struct *fx)
-{
-       fx->cwd = 0x37f;
-       fx->mxcsr = MXCSR_DEFAULT;
-}
-
-extern void __sanitize_i387_state(struct task_struct *);
-
-static inline void sanitize_i387_state(struct task_struct *tsk)
-{
-       if (!use_xsaveopt())
-               return;
-       __sanitize_i387_state(tsk);
-}
-
-#define user_insn(insn, output, input...)                              \
-({                                                                     \
-       int err;                                                        \
-       asm volatile(ASM_STAC "\n"                                      \
-                    "1:" #insn "\n\t"                                  \
-                    "2: " ASM_CLAC "\n"                                \
-                    ".section .fixup,\"ax\"\n"                         \
-                    "3:  movl $-1,%[err]\n"                            \
-                    "    jmp  2b\n"                                    \
-                    ".previous\n"                                      \
-                    _ASM_EXTABLE(1b, 3b)                               \
-                    : [err] "=r" (err), output                         \
-                    : "0"(0), input);                                  \
-       err;                                                            \
-})
-
-#define check_insn(insn, output, input...)                             \
-({                                                                     \
-       int err;                                                        \
-       asm volatile("1:" #insn "\n\t"                                  \
-                    "2:\n"                                             \
-                    ".section .fixup,\"ax\"\n"                         \
-                    "3:  movl $-1,%[err]\n"                            \
-                    "    jmp  2b\n"                                    \
-                    ".previous\n"                                      \
-                    _ASM_EXTABLE(1b, 3b)                               \
-                    : [err] "=r" (err), output                         \
-                    : "0"(0), input);                                  \
-       err;                                                            \
-})
-
-static inline int fsave_user(struct i387_fsave_struct __user *fx)
-{
-       return user_insn(fnsave %[fx]; fwait,  [fx] "=m" (*fx), "m" (*fx));
-}
-
-static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
-{
-       if (config_enabled(CONFIG_X86_32))
-               return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx));
-       else if (config_enabled(CONFIG_AS_FXSAVEQ))
-               return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx));
-
-       /* See comment in fpu_fxsave() below. */
-       return user_insn(rex64/fxsave (%[fx]), "=m" (*fx), [fx] "R" (fx));
-}
-
-static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
-{
-       if (config_enabled(CONFIG_X86_32))
-               return check_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
-       else if (config_enabled(CONFIG_AS_FXSAVEQ))
-               return check_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
-
-       /* See comment in fpu_fxsave() below. */
-       return check_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx),
-                         "m" (*fx));
-}
-
-static inline int fxrstor_user(struct i387_fxsave_struct __user *fx)
-{
-       if (config_enabled(CONFIG_X86_32))
-               return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
-       else if (config_enabled(CONFIG_AS_FXSAVEQ))
-               return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
-
-       /* See comment in fpu_fxsave() below. */
-       return user_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx),
-                         "m" (*fx));
-}
-
-static inline int frstor_checking(struct i387_fsave_struct *fx)
-{
-       return check_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
-}
-
-static inline int frstor_user(struct i387_fsave_struct __user *fx)
-{
-       return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
-}
-
-static inline void fpu_fxsave(struct fpu *fpu)
-{
-       if (config_enabled(CONFIG_X86_32))
-               asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state->fxsave));
-       else if (config_enabled(CONFIG_AS_FXSAVEQ))
-               asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state->fxsave));
-       else {
-               /* Using "rex64; fxsave %0" is broken because, if the memory
-                * operand uses any extended registers for addressing, a second
-                * REX prefix will be generated (to the assembler, rex64
-                * followed by semicolon is a separate instruction), and hence
-                * the 64-bitness is lost.
-                *
-                * Using "fxsaveq %0" would be the ideal choice, but is only
-                * supported starting with gas 2.16.
-                *
-                * Using, as a workaround, the properly prefixed form below
-                * isn't accepted by any binutils version so far released,
-                * complaining that the same type of prefix is used twice if
-                * an extended register is needed for addressing (fix submitted
-                * to mainline 2005-11-21).
-                *
-                *  asm volatile("rex64/fxsave %0" : "=m" (fpu->state->fxsave));
-                *
-                * This, however, we can work around by forcing the compiler to
-                * select an addressing mode that doesn't require extended
-                * registers.
-                */
-               asm volatile( "rex64/fxsave (%[fx])"
-                            : "=m" (fpu->state->fxsave)
-                            : [fx] "R" (&fpu->state->fxsave));
-       }
-}
-
-/*
- * These must be called with preempt disabled. Returns
- * 'true' if the FPU state is still intact.
- */
-static inline int fpu_save_init(struct fpu *fpu)
-{
-       if (use_xsave()) {
-               fpu_xsave(fpu);
-
-               /*
-                * xsave header may indicate the init state of the FP.
-                */
-               if (!(fpu->state->xsave.xsave_hdr.xstate_bv & XSTATE_FP))
-                       return 1;
-       } else if (use_fxsr()) {
-               fpu_fxsave(fpu);
-       } else {
-               asm volatile("fnsave %[fx]; fwait"
-                            : [fx] "=m" (fpu->state->fsave));
-               return 0;
-       }
-
-       /*
-        * If exceptions are pending, we need to clear them so
-        * that we don't randomly get exceptions later.
-        *
-        * FIXME! Is this perhaps only true for the old-style
-        * irq13 case? Maybe we could leave the x87 state
-        * intact otherwise?
-        */
-       if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES)) {
-               asm volatile("fnclex");
-               return 0;
-       }
-       return 1;
-}
-
-static inline int __save_init_fpu(struct task_struct *tsk)
-{
-       return fpu_save_init(&tsk->thread.fpu);
-}
-
-static inline int fpu_restore_checking(struct fpu *fpu)
-{
-       if (use_xsave())
-               return fpu_xrstor_checking(&fpu->state->xsave);
-       else if (use_fxsr())
-               return fxrstor_checking(&fpu->state->fxsave);
-       else
-               return frstor_checking(&fpu->state->fsave);
-}
-
-static inline int restore_fpu_checking(struct task_struct *tsk)
-{
-       /*
-        * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is
-        * pending. Clear the x87 state here by setting it to fixed values.
-        * "m" is a random variable that should be in L1.
-        */
-       if (unlikely(static_cpu_has_bug_safe(X86_BUG_FXSAVE_LEAK))) {
-               asm volatile(
-                       "fnclex\n\t"
-                       "emms\n\t"
-                       "fildl %P[addr]"        /* set F?P to defined value */
-                       : : [addr] "m" (tsk->thread.fpu.has_fpu));
-       }
-
-       return fpu_restore_checking(&tsk->thread.fpu);
-}
-
-/*
- * Software FPU state helpers. Careful: these need to
- * be preemption protection *and* they need to be
- * properly paired with the CR0.TS changes!
- */
-static inline int __thread_has_fpu(struct task_struct *tsk)
-{
-       return tsk->thread.fpu.has_fpu;
-}
-
-/* Must be paired with an 'stts' after! */
-static inline void __thread_clear_has_fpu(struct task_struct *tsk)
-{
-       tsk->thread.fpu.has_fpu = 0;
-       this_cpu_write(fpu_owner_task, NULL);
-}
-
-/* Must be paired with a 'clts' before! */
-static inline void __thread_set_has_fpu(struct task_struct *tsk)
-{
-       tsk->thread.fpu.has_fpu = 1;
-       this_cpu_write(fpu_owner_task, tsk);
-}
-
-/*
- * Encapsulate the CR0.TS handling together with the
- * software flag.
- *
- * These generally need preemption protection to work,
- * do try to avoid using these on their own.
- */
-static inline void __thread_fpu_end(struct task_struct *tsk)
-{
-       __thread_clear_has_fpu(tsk);
-       if (!use_eager_fpu())
-               stts();
-}
-
-static inline void __thread_fpu_begin(struct task_struct *tsk)
-{
-       if (!use_eager_fpu())
-               clts();
-       __thread_set_has_fpu(tsk);
-}
-
-static inline void drop_fpu(struct task_struct *tsk)
-{
-       /*
-        * Forget coprocessor state..
-        */
-       preempt_disable();
-       tsk->thread.fpu_counter = 0;
-
-       if (__thread_has_fpu(tsk)) {
-               /* Ignore delayed exceptions from user space */
-               asm volatile("1: fwait\n"
-                            "2:\n"
-                            _ASM_EXTABLE(1b, 2b));
-               __thread_fpu_end(tsk);
-       }
-
-       clear_stopped_child_used_math(tsk);
-       preempt_enable();
-}
-
-static inline void restore_init_xstate(void)
-{
-       if (use_xsave())
-               xrstor_state(init_xstate_buf, -1);
-       else
-               fxrstor_checking(&init_xstate_buf->i387);
-}
-
-/*
- * Reset the FPU state in the eager case and drop it in the lazy case (later use
- * will reinit it).
- */
-static inline void fpu_reset_state(struct task_struct *tsk)
-{
-       if (!use_eager_fpu())
-               drop_fpu(tsk);
-       else
-               restore_init_xstate();
-}
-
-/*
- * FPU state switching for scheduling.
- *
- * This is a two-stage process:
- *
- *  - switch_fpu_prepare() saves the old state and
- *    sets the new state of the CR0.TS bit. This is
- *    done within the context of the old process.
- *
- *  - switch_fpu_finish() restores the new state as
- *    necessary.
- */
-typedef struct { int preload; } fpu_switch_t;
-
-static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu)
-{
-       fpu_switch_t fpu;
-
-       /*
-        * If the task has used the math, pre-load the FPU on xsave processors
-        * or if the past 5 consecutive context-switches used math.
-        */
-       fpu.preload = tsk_used_math(new) &&
-                     (use_eager_fpu() || new->thread.fpu_counter > 5);
-
-       if (__thread_has_fpu(old)) {
-               if (!__save_init_fpu(old))
-                       task_disable_lazy_fpu_restore(old);
-               else
-                       old->thread.fpu.last_cpu = cpu;
-
-               /* But leave fpu_owner_task! */
-               old->thread.fpu.has_fpu = 0;
-
-               /* Don't change CR0.TS if we just switch! */
-               if (fpu.preload) {
-                       new->thread.fpu_counter++;
-                       __thread_set_has_fpu(new);
-                       prefetch(new->thread.fpu.state);
-               } else if (!use_eager_fpu())
-                       stts();
-       } else {
-               old->thread.fpu_counter = 0;
-               task_disable_lazy_fpu_restore(old);
-               if (fpu.preload) {
-                       new->thread.fpu_counter++;
-                       if (fpu_lazy_restore(new, cpu))
-                               fpu.preload = 0;
-                       else
-                               prefetch(new->thread.fpu.state);
-                       __thread_fpu_begin(new);
-               }
-       }
-       return fpu;
-}
-
-/*
- * By the time this gets called, we've already cleared CR0.TS and
- * given the process the FPU if we are going to preload the FPU
- * state - all we need to do is to conditionally restore the register
- * state itself.
- */
-static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu)
-{
-       if (fpu.preload) {
-               if (unlikely(restore_fpu_checking(new)))
-                       fpu_reset_state(new);
-       }
-}
-
-/*
- * Signal frame handlers...
- */
-extern int save_xstate_sig(void __user *buf, void __user *fx, int size);
-extern int __restore_xstate_sig(void __user *buf, void __user *fx, int size);
-
-static inline int xstate_sigframe_size(void)
-{
-       return use_xsave() ? xstate_size + FP_XSTATE_MAGIC2_SIZE : xstate_size;
-}
-
-static inline int restore_xstate_sig(void __user *buf, int ia32_frame)
-{
-       void __user *buf_fx = buf;
-       int size = xstate_sigframe_size();
-
-       if (ia32_frame && use_fxsr()) {
-               buf_fx = buf + sizeof(struct i387_fsave_struct);
-               size += sizeof(struct i387_fsave_struct);
-       }
-
-       return __restore_xstate_sig(buf, buf_fx, size);
-}
-
-/*
- * Needs to be preemption-safe.
- *
- * NOTE! user_fpu_begin() must be used only immediately before restoring
- * the save state. It does not do any saving/restoring on its own. In
- * lazy FPU mode, it is just an optimization to avoid a #NM exception,
- * the task can lose the FPU right after preempt_enable().
- */
-static inline void user_fpu_begin(void)
-{
-       preempt_disable();
-       if (!user_has_fpu())
-               __thread_fpu_begin(current);
-       preempt_enable();
-}
-
-static inline void __save_fpu(struct task_struct *tsk)
-{
-       if (use_xsave()) {
-               if (unlikely(system_state == SYSTEM_BOOTING))
-                       xsave_state_booting(&tsk->thread.fpu.state->xsave, -1);
-               else
-                       xsave_state(&tsk->thread.fpu.state->xsave, -1);
-       } else
-               fpu_fxsave(&tsk->thread.fpu);
-}
-
-/*
- * i387 state interaction
- */
-static inline unsigned short get_fpu_cwd(struct task_struct *tsk)
-{
-       if (cpu_has_fxsr) {
-               return tsk->thread.fpu.state->fxsave.cwd;
-       } else {
-               return (unsigned short)tsk->thread.fpu.state->fsave.cwd;
-       }
-}
-
-static inline unsigned short get_fpu_swd(struct task_struct *tsk)
-{
-       if (cpu_has_fxsr) {
-               return tsk->thread.fpu.state->fxsave.swd;
-       } else {
-               return (unsigned short)tsk->thread.fpu.state->fsave.swd;
-       }
-}
-
-static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk)
-{
-       if (cpu_has_xmm) {
-               return tsk->thread.fpu.state->fxsave.mxcsr;
-       } else {
-               return MXCSR_DEFAULT;
-       }
-}
-
-static bool fpu_allocated(struct fpu *fpu)
-{
-       return fpu->state != NULL;
-}
-
-static inline int fpu_alloc(struct fpu *fpu)
-{
-       if (fpu_allocated(fpu))
-               return 0;
-       fpu->state = kmem_cache_alloc(task_xstate_cachep, GFP_KERNEL);
-       if (!fpu->state)
-               return -ENOMEM;
-       WARN_ON((unsigned long)fpu->state & 15);
-       return 0;
-}
-
-static inline void fpu_free(struct fpu *fpu)
-{
-       if (fpu->state) {
-               kmem_cache_free(task_xstate_cachep, fpu->state);
-               fpu->state = NULL;
-       }
-}
-
-static inline void fpu_copy(struct task_struct *dst, struct task_struct *src)
-{
-       if (use_eager_fpu()) {
-               memset(&dst->thread.fpu.state->xsave, 0, xstate_size);
-               __save_fpu(dst);
-       } else {
-               struct fpu *dfpu = &dst->thread.fpu;
-               struct fpu *sfpu = &src->thread.fpu;
-
-               unlazy_fpu(src);
-               memcpy(dfpu->state, sfpu->state, xstate_size);
-       }
-}
-
-static inline unsigned long
-alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long *buf_fx,
-               unsigned long *size)
-{
-       unsigned long frame_size = xstate_sigframe_size();
-
-       *buf_fx = sp = round_down(sp - frame_size, 64);
-       if (ia32_frame && use_fxsr()) {
-               frame_size += sizeof(struct i387_fsave_struct);
-               sp -= sizeof(struct i387_fsave_struct);
-       }
-
-       *size = frame_size;
-       return sp;
-}
-
-#endif
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h

new file mode 100644 (file)

index 0000000..1429a7c
--- /dev/null
+++ b/arch/x86/include/asm/fpu/api.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (C) 1994 Linus Torvalds
+ *
+ * Pentium III FXSR, SSE support
+ * General FPU state handling cleanups
+ *     Gareth Hughes <gareth@valinux.com>, May 2000
+ * x86-64 work by Andi Kleen 2002
+ */
+
+#ifndef _ASM_X86_FPU_API_H
+#define _ASM_X86_FPU_API_H
+
+/*
+ * Careful: __kernel_fpu_begin/end() must be called with preempt disabled
+ * and they don't touch the preempt state on their own.
+ * If you enable preemption after __kernel_fpu_begin(), a preempt notifier
+ * should call __kernel_fpu_end() to prevent the kernel/user FPU
+ * state from getting corrupted. KVM for example uses this model.
+ *
+ * All other cases use kernel_fpu_begin/end() which disable preemption
+ * during kernel FPU usage.
+ */
+extern void __kernel_fpu_begin(void);
+extern void __kernel_fpu_end(void);
+extern void kernel_fpu_begin(void);
+extern void kernel_fpu_end(void);
+extern bool irq_fpu_usable(void);
+
+/*
+ * Some instructions like VIA's padlock instructions generate a spurious
+ * DNA fault but don't modify SSE registers. These instructions also get
+ * used from interrupt context. To prevent them from interacting wrongly
+ * with other user/kernel FPU usage when run in interrupt context, we
+ * should use them only in the context of irq_ts_save/restore().
+ */
+extern int  irq_ts_save(void);
+extern void irq_ts_restore(int TS_state);
+
+/*
+ * Query the presence of one or more xfeatures. Works on any legacy CPU as well.
+ *
+ * If 'feature_name' is set then put a human-readable description of
+ * the feature there as well - this can be used to print error (or success)
+ * messages.
+ */
+extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name);
+
+#endif /* _ASM_X86_FPU_API_H */
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h

new file mode 100644 (file)

index 0000000..3c3550c
--- /dev/null
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -0,0 +1,694 @@
+/*
+ * Copyright (C) 1994 Linus Torvalds
+ *
+ * Pentium III FXSR, SSE support
+ * General FPU state handling cleanups
+ *     Gareth Hughes <gareth@valinux.com>, May 2000
+ * x86-64 work by Andi Kleen 2002
+ */
+
+#ifndef _ASM_X86_FPU_INTERNAL_H
+#define _ASM_X86_FPU_INTERNAL_H
+
+#include <linux/compat.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include <asm/user.h>
+#include <asm/fpu/api.h>
+#include <asm/fpu/xstate.h>
+
+/*
+ * High level FPU state handling functions:
+ */
+extern void fpu__activate_curr(struct fpu *fpu);
+extern void fpu__activate_fpstate_read(struct fpu *fpu);
+extern void fpu__activate_fpstate_write(struct fpu *fpu);
+extern void fpu__save(struct fpu *fpu);
+extern void fpu__restore(struct fpu *fpu);
+extern int  fpu__restore_sig(void __user *buf, int ia32_frame);
+extern void fpu__drop(struct fpu *fpu);
+extern int  fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu);
+extern void fpu__clear(struct fpu *fpu);
+extern int  fpu__exception_code(struct fpu *fpu, int trap_nr);
+extern int  dump_fpu(struct pt_regs *ptregs, struct user_i387_struct *fpstate);
+
+/*
+ * Boot time FPU initialization functions:
+ */
+extern void fpu__init_cpu(void);
+extern void fpu__init_system_xstate(void);
+extern void fpu__init_cpu_xstate(void);
+extern void fpu__init_system(struct cpuinfo_x86 *c);
+extern void fpu__init_check_bugs(void);
+extern void fpu__resume_cpu(void);
+
+/*
+ * Debugging facility:
+ */
+#ifdef CONFIG_X86_DEBUG_FPU
+# define WARN_ON_FPU(x) WARN_ON_ONCE(x)
+#else
+# define WARN_ON_FPU(x) ({ (void)(x); 0; })
+#endif
+
+/*
+ * FPU related CPU feature flag helper routines:
+ */
+static __always_inline __pure bool use_eager_fpu(void)
+{
+       return static_cpu_has_safe(X86_FEATURE_EAGER_FPU);
+}
+
+static __always_inline __pure bool use_xsaveopt(void)
+{
+       return static_cpu_has_safe(X86_FEATURE_XSAVEOPT);
+}
+
+static __always_inline __pure bool use_xsave(void)
+{
+       return static_cpu_has_safe(X86_FEATURE_XSAVE);
+}
+
+static __always_inline __pure bool use_fxsr(void)
+{
+       return static_cpu_has_safe(X86_FEATURE_FXSR);
+}
+
+/*
+ * fpstate handling functions:
+ */
+
+extern union fpregs_state init_fpstate;
+
+extern void fpstate_init(union fpregs_state *state);
+#ifdef CONFIG_MATH_EMULATION
+extern void fpstate_init_soft(struct swregs_state *soft);
+#else
+static inline void fpstate_init_soft(struct swregs_state *soft) {}
+#endif
+static inline void fpstate_init_fxstate(struct fxregs_state *fx)
+{
+       fx->cwd = 0x37f;
+       fx->mxcsr = MXCSR_DEFAULT;
+}
+extern void fpstate_sanitize_xstate(struct fpu *fpu);
+
+#define user_insn(insn, output, input...)                              \
+({                                                                     \
+       int err;                                                        \
+       asm volatile(ASM_STAC "\n"                                      \
+                    "1:" #insn "\n\t"                                  \
+                    "2: " ASM_CLAC "\n"                                \
+                    ".section .fixup,\"ax\"\n"                         \
+                    "3:  movl $-1,%[err]\n"                            \
+                    "    jmp  2b\n"                                    \
+                    ".previous\n"                                      \
+                    _ASM_EXTABLE(1b, 3b)                               \
+                    : [err] "=r" (err), output                         \
+                    : "0"(0), input);                                  \
+       err;                                                            \
+})
+
+#define check_insn(insn, output, input...)                             \
+({                                                                     \
+       int err;                                                        \
+       asm volatile("1:" #insn "\n\t"                                  \
+                    "2:\n"                                             \
+                    ".section .fixup,\"ax\"\n"                         \
+                    "3:  movl $-1,%[err]\n"                            \
+                    "    jmp  2b\n"                                    \
+                    ".previous\n"                                      \
+                    _ASM_EXTABLE(1b, 3b)                               \
+                    : [err] "=r" (err), output                         \
+                    : "0"(0), input);                                  \
+       err;                                                            \
+})
+
+static inline int copy_fregs_to_user(struct fregs_state __user *fx)
+{
+       return user_insn(fnsave %[fx]; fwait,  [fx] "=m" (*fx), "m" (*fx));
+}
+
+static inline int copy_fxregs_to_user(struct fxregs_state __user *fx)
+{
+       if (config_enabled(CONFIG_X86_32))
+               return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx));
+       else if (config_enabled(CONFIG_AS_FXSAVEQ))
+               return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx));
+
+       /* See comment in copy_fxregs_to_kernel() below. */
+       return user_insn(rex64/fxsave (%[fx]), "=m" (*fx), [fx] "R" (fx));
+}
+
+static inline void copy_kernel_to_fxregs(struct fxregs_state *fx)
+{
+       int err;
+
+       if (config_enabled(CONFIG_X86_32)) {
+               err = check_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+       } else {
+               if (config_enabled(CONFIG_AS_FXSAVEQ)) {
+                       err = check_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
+               } else {
+                       /* See comment in copy_fxregs_to_kernel() below. */
+                       err = check_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), "m" (*fx));
+               }
+       }
+       /* Copying from a kernel buffer to FPU registers should never fail: */
+       WARN_ON_FPU(err);
+}
+
+static inline int copy_user_to_fxregs(struct fxregs_state __user *fx)
+{
+       if (config_enabled(CONFIG_X86_32))
+               return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+       else if (config_enabled(CONFIG_AS_FXSAVEQ))
+               return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
+
+       /* See comment in copy_fxregs_to_kernel() below. */
+       return user_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx),
+                         "m" (*fx));
+}
+
+static inline void copy_kernel_to_fregs(struct fregs_state *fx)
+{
+       int err = check_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+
+       WARN_ON_FPU(err);
+}
+
+static inline int copy_user_to_fregs(struct fregs_state __user *fx)
+{
+       return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+}
+
+static inline void copy_fxregs_to_kernel(struct fpu *fpu)
+{
+       if (config_enabled(CONFIG_X86_32))
+               asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state.fxsave));
+       else if (config_enabled(CONFIG_AS_FXSAVEQ))
+               asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave));
+       else {
+               /* Using "rex64; fxsave %0" is broken because, if the memory
+                * operand uses any extended registers for addressing, a second
+                * REX prefix will be generated (to the assembler, rex64
+                * followed by semicolon is a separate instruction), and hence
+                * the 64-bitness is lost.
+                *
+                * Using "fxsaveq %0" would be the ideal choice, but is only
+                * supported starting with gas 2.16.
+                *
+                * Using, as a workaround, the properly prefixed form below
+                * isn't accepted by any binutils version so far released,
+                * complaining that the same type of prefix is used twice if
+                * an extended register is needed for addressing (fix submitted
+                * to mainline 2005-11-21).
+                *
+                *  asm volatile("rex64/fxsave %0" : "=m" (fpu->state.fxsave));
+                *
+                * This, however, we can work around by forcing the compiler to
+                * select an addressing mode that doesn't require extended
+                * registers.
+                */
+               asm volatile( "rex64/fxsave (%[fx])"
+                            : "=m" (fpu->state.fxsave)
+                            : [fx] "R" (&fpu->state.fxsave));
+       }
+}
+
+/* These macros all use (%edi)/(%rdi) as the single memory argument. */
+#define XSAVE          ".byte " REX_PREFIX "0x0f,0xae,0x27"
+#define XSAVEOPT       ".byte " REX_PREFIX "0x0f,0xae,0x37"
+#define XSAVES         ".byte " REX_PREFIX "0x0f,0xc7,0x2f"
+#define XRSTOR         ".byte " REX_PREFIX "0x0f,0xae,0x2f"
+#define XRSTORS                ".byte " REX_PREFIX "0x0f,0xc7,0x1f"
+
+/* xstate instruction fault handler: */
+#define xstate_fault(__err)            \
+                                       \
+       ".section .fixup,\"ax\"\n"      \
+                                       \
+       "3:  movl $-2,%[_err]\n"        \
+       "    jmp  2b\n"                 \
+                                       \
+       ".previous\n"                   \
+                                       \
+       _ASM_EXTABLE(1b, 3b)            \
+       : [_err] "=r" (__err)
+
+/*
+ * This function is called only during boot time when x86 caps are not set
+ * up and alternatives cannot be used yet.
+ */
+static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate)
+{
+       u64 mask = -1;
+       u32 lmask = mask;
+       u32 hmask = mask >> 32;
+       int err = 0;
+
+       WARN_ON(system_state != SYSTEM_BOOTING);
+
+       if (boot_cpu_has(X86_FEATURE_XSAVES))
+               asm volatile("1:"XSAVES"\n\t"
+                       "2:\n\t"
+                            xstate_fault(err)
+                       : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err)
+                       : "memory");
+       else
+               asm volatile("1:"XSAVE"\n\t"
+                       "2:\n\t"
+                            xstate_fault(err)
+                       : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err)
+                       : "memory");
+
+       /* We should never fault when copying to a kernel buffer: */
+       WARN_ON_FPU(err);
+}
+
+/*
+ * This function is called only during boot time when x86 caps are not set
+ * up and alternatives cannot be used yet.
+ */
+static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate)
+{
+       u64 mask = -1;
+       u32 lmask = mask;
+       u32 hmask = mask >> 32;
+       int err = 0;
+
+       WARN_ON(system_state != SYSTEM_BOOTING);
+
+       if (boot_cpu_has(X86_FEATURE_XSAVES))
+               asm volatile("1:"XRSTORS"\n\t"
+                       "2:\n\t"
+                            xstate_fault(err)
+                       : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err)
+                       : "memory");
+       else
+               asm volatile("1:"XRSTOR"\n\t"
+                       "2:\n\t"
+                            xstate_fault(err)
+                       : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err)
+                       : "memory");
+
+       /* We should never fault when copying from a kernel buffer: */
+       WARN_ON_FPU(err);
+}
+
+/*
+ * Save processor xstate to xsave area.
+ */
+static inline void copy_xregs_to_kernel(struct xregs_state *xstate)
+{
+       u64 mask = -1;
+       u32 lmask = mask;
+       u32 hmask = mask >> 32;
+       int err = 0;
+
+       WARN_ON(!alternatives_patched);
+
+       /*
+        * If xsaves is enabled, xsaves replaces xsaveopt because
+        * it supports compact format and supervisor states in addition to
+        * modified optimization in xsaveopt.
+        *
+        * Otherwise, if xsaveopt is enabled, xsaveopt replaces xsave
+        * because xsaveopt supports modified optimization which is not
+        * supported by xsave.
+        *
+        * If neither xsaves nor xsaveopt is enabled, use xsave.
+        */
+       alternative_input_2(
+               "1:"XSAVE,
+               XSAVEOPT,
+               X86_FEATURE_XSAVEOPT,
+               XSAVES,
+               X86_FEATURE_XSAVES,
+               [xstate] "D" (xstate), "a" (lmask), "d" (hmask) :
+               "memory");
+       asm volatile("2:\n\t"
+                    xstate_fault(err)
+                    : "0" (err)
+                    : "memory");
+
+       /* We should never fault when copying to a kernel buffer: */
+       WARN_ON_FPU(err);
+}
+
+/*
+ * Restore processor xstate from xsave area.
+ */
+static inline void copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask)
+{
+       u32 lmask = mask;
+       u32 hmask = mask >> 32;
+       int err = 0;
+
+       /*
+        * Use xrstors to restore context if it is enabled. xrstors supports
+        * compacted format of xsave area which is not supported by xrstor.
+        */
+       alternative_input(
+               "1: " XRSTOR,
+               XRSTORS,
+               X86_FEATURE_XSAVES,
+               "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask)
+               : "memory");
+
+       asm volatile("2:\n"
+                    xstate_fault(err)
+                    : "0" (err)
+                    : "memory");
+
+       /* We should never fault when copying from a kernel buffer: */
+       WARN_ON_FPU(err);
+}
+
+/*
+ * Save xstate to user space xsave area.
+ *
+ * We don't use modified optimization because xrstor/xrstors might track
+ * a different application.
+ *
+ * We don't use compacted format xsave area for
+ * backward compatibility for old applications which don't understand
+ * compacted format of xsave area.
+ */
+static inline int copy_xregs_to_user(struct xregs_state __user *buf)
+{
+       int err;
+
+       /*
+        * Clear the xsave header first, so that reserved fields are
+        * initialized to zero.
+        */
+       err = __clear_user(&buf->header, sizeof(buf->header));
+       if (unlikely(err))
+               return -EFAULT;
+
+       __asm__ __volatile__(ASM_STAC "\n"
+                            "1:"XSAVE"\n"
+                            "2: " ASM_CLAC "\n"
+                            xstate_fault(err)
+                            : "D" (buf), "a" (-1), "d" (-1), "0" (err)
+                            : "memory");
+       return err;
+}
+
+/*
+ * Restore xstate from user space xsave area.
+ */
+static inline int copy_user_to_xregs(struct xregs_state __user *buf, u64 mask)
+{
+       struct xregs_state *xstate = ((__force struct xregs_state *)buf);
+       u32 lmask = mask;
+       u32 hmask = mask >> 32;
+       int err = 0;
+
+       __asm__ __volatile__(ASM_STAC "\n"
+                            "1:"XRSTOR"\n"
+                            "2: " ASM_CLAC "\n"
+                            xstate_fault(err)
+                            : "D" (xstate), "a" (lmask), "d" (hmask), "0" (err)
+                            : "memory");       /* memory required? */
+       return err;
+}
+
+/*
+ * These must be called with preempt disabled. Returns
+ * 'true' if the FPU state is still intact and we can
+ * keep registers active.
+ *
+ * The legacy FNSAVE instruction cleared all FPU state
+ * unconditionally, so registers are essentially destroyed.
+ * Modern FPU state can be kept in registers, if there are
+ * no pending FP exceptions.
+ */
+static inline int copy_fpregs_to_fpstate(struct fpu *fpu)
+{
+       if (likely(use_xsave())) {
+               copy_xregs_to_kernel(&fpu->state.xsave);
+               return 1;
+       }
+
+       if (likely(use_fxsr())) {
+               copy_fxregs_to_kernel(fpu);
+               return 1;
+       }
+
+       /*
+        * Legacy FPU register saving, FNSAVE always clears FPU registers,
+        * so we have to mark them inactive:
+        */
+       asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave));
+
+       return 0;
+}
+
+static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate)
+{
+       if (use_xsave()) {
+               copy_kernel_to_xregs(&fpstate->xsave, -1);
+       } else {
+               if (use_fxsr())
+                       copy_kernel_to_fxregs(&fpstate->fxsave);
+               else
+                       copy_kernel_to_fregs(&fpstate->fsave);
+       }
+}
+
+static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate)
+{
+       /*
+        * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is
+        * pending. Clear the x87 state here by setting it to fixed values.
+        * "m" is a random variable that should be in L1.
+        */
+       if (unlikely(static_cpu_has_bug_safe(X86_BUG_FXSAVE_LEAK))) {
+               asm volatile(
+                       "fnclex\n\t"
+                       "emms\n\t"
+                       "fildl %P[addr]"        /* set F?P to defined value */
+                       : : [addr] "m" (fpstate));
+       }
+
+       __copy_kernel_to_fpregs(fpstate);
+}
+
+extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size);
+
+/*
+ * FPU context switch related helper methods:
+ */
+
+DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);
+
+/*
+ * Must be run with preemption disabled: this clears fpu_fpregs_owner_ctx
+ * for the CPU given by 'cpu'.
+ *
+ * This will disable any lazy FPU state restore of the current FPU state,
+ * but if the current thread owns the FPU, it will still be saved.
+ */
+static inline void __cpu_disable_lazy_restore(unsigned int cpu)
+{
+       per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL;
+}
+
+static inline int fpu_want_lazy_restore(struct fpu *fpu, unsigned int cpu)
+{
+       return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu;
+}
+
+
+/*
+ * Wrap lazy FPU TS handling in a 'hw fpregs activation/deactivation'
+ * idiom, which is then paired with the sw-flag (fpregs_active) later on:
+ */
+
+static inline void __fpregs_activate_hw(void)
+{
+       if (!use_eager_fpu())
+               clts();
+}
+
+static inline void __fpregs_deactivate_hw(void)
+{
+       if (!use_eager_fpu())
+               stts();
+}
+
+/* Must be paired with an 'stts' (__fpregs_deactivate_hw()) after! */
+static inline void __fpregs_deactivate(struct fpu *fpu)
+{
+       WARN_ON_FPU(!fpu->fpregs_active);
+
+       fpu->fpregs_active = 0;
+       this_cpu_write(fpu_fpregs_owner_ctx, NULL);
+}
+
+/* Must be paired with a 'clts' (__fpregs_activate_hw()) before! */
+static inline void __fpregs_activate(struct fpu *fpu)
+{
+       WARN_ON_FPU(fpu->fpregs_active);
+
+       fpu->fpregs_active = 1;
+       this_cpu_write(fpu_fpregs_owner_ctx, fpu);
+}
+
+/*
+ * The question "does this thread have fpu access?"
+ * is slightly racy, since preemption could come in
+ * and revoke it immediately after the test.
+ *
+ * However, even in that very unlikely scenario,
+ * we can just assume we have FPU access - typically
+ * to save the FP state - we'll just take a #NM
+ * fault and get the FPU access back.
+ */
+static inline int fpregs_active(void)
+{
+       return current->thread.fpu.fpregs_active;
+}
+
+/*
+ * Encapsulate the CR0.TS handling together with the
+ * software flag.
+ *
+ * These generally need preemption protection to work,
+ * do try to avoid using these on their own.
+ */
+static inline void fpregs_activate(struct fpu *fpu)
+{
+       __fpregs_activate_hw();
+       __fpregs_activate(fpu);
+}
+
+static inline void fpregs_deactivate(struct fpu *fpu)
+{
+       __fpregs_deactivate(fpu);
+       __fpregs_deactivate_hw();
+}
+
+/*
+ * FPU state switching for scheduling.
+ *
+ * This is a two-stage process:
+ *
+ *  - switch_fpu_prepare() saves the old state and
+ *    sets the new state of the CR0.TS bit. This is
+ *    done within the context of the old process.
+ *
+ *  - switch_fpu_finish() restores the new state as
+ *    necessary.
+ */
+typedef struct { int preload; } fpu_switch_t;
+
+static inline fpu_switch_t
+switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu)
+{
+       fpu_switch_t fpu;
+
+       /*
+        * If the task has used the math, pre-load the FPU in eager-FPU mode
+        * or if more than 5 consecutive context-switches used math.
+        */
+       fpu.preload = new_fpu->fpstate_active &&
+                     (use_eager_fpu() || new_fpu->counter > 5);
+
+       if (old_fpu->fpregs_active) {
+               if (!copy_fpregs_to_fpstate(old_fpu))
+                       old_fpu->last_cpu = -1;
+               else
+                       old_fpu->last_cpu = cpu;
+
+               /* But leave fpu_fpregs_owner_ctx! */
+               old_fpu->fpregs_active = 0;
+
+               /* Don't change CR0.TS if we just switch! */
+               if (fpu.preload) {
+                       new_fpu->counter++;
+                       __fpregs_activate(new_fpu);
+                       prefetch(&new_fpu->state);
+               } else {
+                       __fpregs_deactivate_hw();
+               }
+       } else {
+               old_fpu->counter = 0;
+               old_fpu->last_cpu = -1;
+               if (fpu.preload) {
+                       new_fpu->counter++;
+                       if (fpu_want_lazy_restore(new_fpu, cpu))
+                               fpu.preload = 0;
+                       else
+                               prefetch(&new_fpu->state);
+                       fpregs_activate(new_fpu);
+               }
+       }
+       return fpu;
+}
+
+/*
+ * Misc helper functions:
+ */
+
+/*
+ * By the time this gets called, we've already cleared CR0.TS and
+ * given the process the FPU if we are going to preload the FPU
+ * state - all we need to do is to conditionally restore the register
+ * state itself.
+ */
+static inline void switch_fpu_finish(struct fpu *new_fpu, fpu_switch_t fpu_switch)
+{
+       if (fpu_switch.preload)
+               copy_kernel_to_fpregs(&new_fpu->state);
+}
+
+/*
+ * Needs to be preemption-safe.
+ *
+ * NOTE! user_fpu_begin() must be used only immediately before restoring
+ * the save state. It does not do any saving/restoring on its own. In
+ * lazy FPU mode, it is just an optimization to avoid a #NM exception,
+ * the task can lose the FPU right after preempt_enable().
+ */
+static inline void user_fpu_begin(void)
+{
+       struct fpu *fpu = &current->thread.fpu;
+
+       preempt_disable();
+       if (!fpregs_active())
+               fpregs_activate(fpu);
+       preempt_enable();
+}
+
+/*
+ * MXCSR and XCR definitions:
+ */
+
+extern unsigned int mxcsr_feature_mask;
+
+#define XCR_XFEATURE_ENABLED_MASK      0x00000000
+
+static inline u64 xgetbv(u32 index)
+{
+       u32 eax, edx;
+
+       asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */
+                    : "=a" (eax), "=d" (edx) /* result halves come back in EAX and EDX */
+                    : "c" (index)); /* 'index' is passed in ECX */
+       return eax + ((u64)edx << 32); /* edx supplies the upper 32 bits */
+}
+
+static inline void xsetbv(u32 index, u64 value)
+{
+       u32 eax = value; /* low 32 bits of 'value' */
+       u32 edx = value >> 32; /* high 32 bits of 'value' */
+
+       asm volatile(".byte 0x0f,0x01,0xd1" /* xsetbv */
+                    : : "a" (eax), "d" (edx), "c" (index)); /* value in EDX:EAX, 'index' in ECX */
+}
+
+#endif /* _ASM_X86_FPU_INTERNAL_H */
diff --git a/arch/x86/include/asm/fpu/regset.h b/arch/x86/include/asm/fpu/regset.h

new file mode 100644 (file)

index 0000000..39d3107
--- /dev/null
+++ b/arch/x86/include/asm/fpu/regset.h
@@ -0,0 +1,21 @@
+/*
+ * FPU regset handling methods:
+ */
+#ifndef _ASM_X86_FPU_REGSET_H
+#define _ASM_X86_FPU_REGSET_H
+
+#include <linux/regset.h>
+
+extern user_regset_active_fn regset_fpregs_active, regset_xregset_fpregs_active;
+extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get,
+                               xstateregs_get;
+extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set,
+                                xstateregs_set;
+
+/*
+ * xstateregs_active == regset_fpregs_active. Please refer to the comment
+ * at the definition of regset_fpregs_active.
+ */
+#define xstateregs_active      regset_fpregs_active
+
+#endif /* _ASM_X86_FPU_REGSET_H */
diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h

new file mode 100644 (file)

index 0000000..7358e9d
--- /dev/null
+++ b/arch/x86/include/asm/fpu/signal.h
@@ -0,0 +1,33 @@
+/*
+ * x86 FPU signal frame handling methods:
+ */
+#ifndef _ASM_X86_FPU_SIGNAL_H
+#define _ASM_X86_FPU_SIGNAL_H
+
+#ifdef CONFIG_X86_64
+# include <asm/sigcontext32.h>
+# include <asm/user32.h>
+struct ksignal;
+int ia32_setup_rt_frame(int sig, struct ksignal *ksig,
+                       compat_sigset_t *set, struct pt_regs *regs);
+int ia32_setup_frame(int sig, struct ksignal *ksig,
+                    compat_sigset_t *set, struct pt_regs *regs);
+#else
+# define user_i387_ia32_struct user_i387_struct
+# define user32_fxsr_struct    user_fxsr_struct
+# define ia32_setup_frame      __setup_frame
+# define ia32_setup_rt_frame   __setup_rt_frame
+#endif
+
+extern void convert_from_fxsr(struct user_i387_ia32_struct *env,
+                             struct task_struct *tsk);
+extern void convert_to_fxsr(struct task_struct *tsk,
+                           const struct user_i387_ia32_struct *env);
+
+unsigned long
+fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
+                    unsigned long *buf_fx, unsigned long *size);
+
+extern void fpu__init_prepare_fx_sw_frame(void);
+
+#endif /* _ASM_X86_FPU_SIGNAL_H */
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h

new file mode 100644 (file)

index 0000000..0637826
--- /dev/null
+++ b/arch/x86/include/asm/fpu/types.h
@@ -0,0 +1,293 @@
+/*
+ * FPU data structures:
+ */
+#ifndef _ASM_X86_FPU_H
+#define _ASM_X86_FPU_H
+
+/*
+ * The legacy x87 FPU state format, as saved by FSAVE and
+ * restored by the FRSTOR instructions:
+ */
+struct fregs_state {
+       u32                     cwd;    /* FPU Control Word             */
+       u32                     swd;    /* FPU Status Word              */
+       u32                     twd;    /* FPU Tag Word                 */
+       u32                     fip;    /* FPU IP Offset                */
+       u32                     fcs;    /* FPU IP Selector              */
+       u32                     foo;    /* FPU Operand Pointer Offset   */
+       u32                     fos;    /* FPU Operand Pointer Selector */
+
+       /* 8*10 bytes for each FP-reg = 80 bytes:                       */
+       u32                     st_space[20];
+
+       /* Software status information [not touched by FSAVE]:          */
+       u32                     status;
+};
+
+/*
+ * The legacy fx SSE/MMX FPU state format, as saved by FXSAVE and
+ * restored by the FXRSTOR instructions. It's similar to the FSAVE
+ * format, but differs in some areas, plus has extensions at
+ * the end for the XMM registers.
+ */
+struct fxregs_state {
+       u16                     cwd; /* Control Word                    */
+       u16                     swd; /* Status Word                     */
+       u16                     twd; /* Tag Word                        */
+       u16                     fop; /* Last Instruction Opcode         */
+       union {
+               struct {
+                       u64     rip; /* Instruction Pointer             */
+                       u64     rdp; /* Data Pointer                    */
+               };
+               struct {
+                       u32     fip; /* FPU IP Offset                   */
+                       u32     fcs; /* FPU IP Selector                 */
+                       u32     foo; /* FPU Operand Offset              */
+                       u32     fos; /* FPU Operand Selector            */
+               };
+       };
+       u32                     mxcsr;          /* MXCSR Register State */
+       u32                     mxcsr_mask;     /* MXCSR Mask           */
+
+       /* 8*16 bytes for each FP-reg = 128 bytes:                      */
+       u32                     st_space[32];
+
+       /* 16*16 bytes for each XMM-reg = 256 bytes:                    */
+       u32                     xmm_space[64];
+
+       u32                     padding[12];
+
+       union {
+               u32             padding1[12];
+               u32             sw_reserved[12];
+       };
+
+} __attribute__((aligned(16)));
+
+/* Default value for fxregs_state.mxcsr: */
+#define MXCSR_DEFAULT          0x1f80
+
+/*
+ * Software based FPU emulation state. This is arbitrary really,
+ * it matches the x87 format to make it easier to understand:
+ */
+struct swregs_state {
+       u32                     cwd;
+       u32                     swd;
+       u32                     twd;
+       u32                     fip;
+       u32                     fcs;
+       u32                     foo;
+       u32                     fos;
+       /* 8*10 bytes for each FP-reg = 80 bytes: */
+       u32                     st_space[20];
+       u8                      ftop;
+       u8                      changed;
+       u8                      lookahead;
+       u8                      no_update;
+       u8                      rm;
+       u8                      alimit;
+       struct math_emu_info    *info;
+       u32                     entry_eip;
+};
+
+/*
+ * List of XSAVE features Linux knows about:
+ */
+enum xfeature_bit {
+       XSTATE_BIT_FP,
+       XSTATE_BIT_SSE,
+       XSTATE_BIT_YMM,
+       XSTATE_BIT_BNDREGS,
+       XSTATE_BIT_BNDCSR,
+       XSTATE_BIT_OPMASK,
+       XSTATE_BIT_ZMM_Hi256,
+       XSTATE_BIT_Hi16_ZMM,
+
+       XFEATURES_NR_MAX,
+};
+
+#define XSTATE_FP              (1 << XSTATE_BIT_FP)
+#define XSTATE_SSE             (1 << XSTATE_BIT_SSE)
+#define XSTATE_YMM             (1 << XSTATE_BIT_YMM)
+#define XSTATE_BNDREGS         (1 << XSTATE_BIT_BNDREGS)
+#define XSTATE_BNDCSR          (1 << XSTATE_BIT_BNDCSR)
+#define XSTATE_OPMASK          (1 << XSTATE_BIT_OPMASK)
+#define XSTATE_ZMM_Hi256       (1 << XSTATE_BIT_ZMM_Hi256)
+#define XSTATE_Hi16_ZMM                (1 << XSTATE_BIT_Hi16_ZMM)
+
+#define XSTATE_FPSSE           (XSTATE_FP | XSTATE_SSE)
+#define XSTATE_AVX512          (XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM)
+
+/*
+ * There are 16x 256-bit AVX registers named YMM0-YMM15.
+ * The low 128 bits are aliased to the 16 SSE registers (XMM0-XMM15)
+ * and are stored in 'struct fxregs_state::xmm_space[]'.
+ *
+ * The high 128 bits are stored here:
+ *    16x 128 bits == 256 bytes.
+ */
+struct ymmh_struct {
+       u8                              ymmh_space[256];
+};
+
+/* We don't support LWP yet: */
+struct lwp_struct {
+       u8                              reserved[128];
+};
+
+/* Intel MPX support: */
+struct bndreg {
+       u64                             lower_bound;
+       u64                             upper_bound;
+} __packed;
+
+struct bndcsr {
+       u64                             bndcfgu;
+       u64                             bndstatus;
+} __packed;
+
+struct mpx_struct {
+       struct bndreg                   bndreg[4];
+       struct bndcsr                   bndcsr;
+};
+
+struct xstate_header {
+       u64                             xfeatures;
+       u64                             xcomp_bv;
+       u64                             reserved[6];
+} __attribute__((packed));
+
+/* New processor state extensions should be added here: */
+#define XSTATE_RESERVE                 (sizeof(struct ymmh_struct) + \
+                                        sizeof(struct lwp_struct)  + \
+                                        sizeof(struct mpx_struct)  )
+/*
+ * This is our most modern FPU state format, as saved by the XSAVE
+ * and restored by the XRSTOR instructions.
+ *
+ * It consists of a legacy fxregs portion, an xstate header and
+ * subsequent fixed size areas as defined by the xstate header.
+ * Not all CPUs support all the extensions.
+ */
+struct xregs_state {
+       struct fxregs_state             i387;
+       struct xstate_header            header;
+       u8                              __reserved[XSTATE_RESERVE];
+} __attribute__ ((packed, aligned (64)));
+
+/*
+ * This is a union of all the possible FPU state formats
+ * put together, so that we can pick the right one runtime.
+ *
+ * The size of the structure is determined by the largest
+ * member - which is the xsave area:
+ */
+union fpregs_state {
+       struct fregs_state              fsave;
+       struct fxregs_state             fxsave;
+       struct swregs_state             soft;
+       struct xregs_state              xsave;
+};
+
+/*
+ * Highest level per task FPU state data structure that
+ * contains the FPU register state plus various FPU
+ * state fields:
+ */
+struct fpu {
+       /*
+        * @state:
+        *
+        * In-memory copy of all FPU registers that we save/restore
+        * over context switches. If the task is using the FPU then
+        * the registers in the FPU are more recent than this state
+        * copy. If the task context-switches away then they get
+        * saved here and represent the FPU state.
+        *
+        * After context switches there may be a (short) time period
+        * during which the in-FPU hardware registers are unchanged
+        * and still perfectly match this state, if the tasks
+        * scheduled afterwards are not using the FPU.
+        *
+        * This is the 'lazy restore' window of optimization, which
+        * we track through 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'.
+        *
+        * We detect whether a subsequent task uses the FPU via setting
+        * CR0::TS to 1, which causes any FPU use to raise a #NM fault.
+        *
+        * During this window, if the task gets scheduled again, we
+        * might be able to skip having to do a restore from this
+        * memory buffer to the hardware registers - at the cost of
+        * incurring the overhead of #NM fault traps.
+        *
+        * Note that on modern CPUs that support the XSAVEOPT (or other
+        * optimized XSAVE instructions), we don't use #NM traps anymore,
+        * as the hardware can track whether FPU registers need saving
+        * or not. On such CPUs we activate the non-lazy ('eagerfpu')
+        * logic, which unconditionally saves/restores all FPU state
+        * across context switches. (if FPU state exists.)
+        */
+       union fpregs_state              state;
+
+       /*
+        * @last_cpu:
+        *
+        * Records the last CPU on which this context was loaded into
+        * FPU registers. (In the lazy-restore case we might be
+        * able to reuse FPU registers across multiple context switches
+        * this way, if no intermediate task used the FPU.)
+        *
+        * A value of -1 is used to indicate that the FPU state in context
+        * memory is newer than the FPU state in registers, and that the
+        * FPU state should be reloaded next time the task is run.
+        */
+       unsigned int                    last_cpu;
+
+       /*
+        * @fpstate_active:
+        *
+        * This flag indicates whether this context is active: if the task
+        * is not running then we can restore from this context, if the task
+        * is running then we should save into this context.
+        */
+       unsigned char                   fpstate_active;
+
+       /*
+        * @fpregs_active:
+        *
+        * This flag determines whether a given context is actively
+        * loaded into the FPU's registers and that those registers
+        * represent the task's current FPU state.
+        *
+        * Note the interaction with fpstate_active:
+        *
+        *   # task does not use the FPU:
+        *   fpstate_active == 0
+        *
+        *   # task uses the FPU and regs are active:
+        *   fpstate_active == 1 && fpregs_active == 1
+        *
+        *   # the regs are inactive but still match fpstate:
+        *   fpstate_active == 1 && fpregs_active == 0 && fpregs_owner == fpu
+        *
+        * The third state is what we use for the lazy restore optimization
+        * on lazy-switching CPUs.
+        */
+       unsigned char                   fpregs_active;
+
+       /*
+        * @counter:
+        *
+        * This counter contains the number of consecutive context switches
+        * during which the FPU stays used. If this is over a threshold, the
+        * lazy FPU restore logic becomes eager, to save the trap overhead.
+        * This is an unsigned char so that after 256 iterations the counter
+        * wraps and the context switch behavior turns lazy again; this is to
+        * deal with bursty apps that only use the FPU for a short time:
+        */
+       unsigned char                   counter;
+};
+
+#endif /* _ASM_X86_FPU_H */
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h

new file mode 100644 (file)

index 0000000..4656b25
--- /dev/null
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -0,0 +1,46 @@
+#ifndef __ASM_X86_XSAVE_H
+#define __ASM_X86_XSAVE_H
+
+#include <linux/types.h>
+#include <asm/processor.h>
+#include <linux/uaccess.h>
+
+/* Bit 63 of XCR0 is reserved for future expansion */
+#define XSTATE_EXTEND_MASK     (~(XSTATE_FPSSE | (1ULL << 63)))
+
+#define XSTATE_CPUID           0x0000000d
+
+#define FXSAVE_SIZE    512
+
+#define XSAVE_HDR_SIZE     64
+#define XSAVE_HDR_OFFSET    FXSAVE_SIZE
+
+#define XSAVE_YMM_SIZE     256
+#define XSAVE_YMM_OFFSET    (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)
+
+/* Supported features which support lazy state saving */
+#define XSTATE_LAZY    (XSTATE_FP | XSTATE_SSE | XSTATE_YMM                  \
+                       | XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM)
+
+/* Supported features which require eager state saving */
+#define XSTATE_EAGER   (XSTATE_BNDREGS | XSTATE_BNDCSR)
+
+/* All currently supported features */
+#define XCNTXT_MASK    (XSTATE_LAZY | XSTATE_EAGER)
+
+#ifdef CONFIG_X86_64
+#define REX_PREFIX     "0x48, "
+#else
+#define REX_PREFIX
+#endif
+
+extern unsigned int xstate_size;
+extern u64 xfeatures_mask;
+extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
+
+extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask);
+
+void *get_xsave_addr(struct xregs_state *xsave, int xstate);
+const void *get_xsave_field_ptr(int xstate_field);
+
+#endif
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h

deleted file mode 100644 (file)

index 6eb6fcb..0000000
--- a/arch/x86/include/asm/i387.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (C) 1994 Linus Torvalds
- *
- * Pentium III FXSR, SSE support
- * General FPU state handling cleanups
- *     Gareth Hughes <gareth@valinux.com>, May 2000
- * x86-64 work by Andi Kleen 2002
- */
-
-#ifndef _ASM_X86_I387_H
-#define _ASM_X86_I387_H
-
-#ifndef __ASSEMBLY__
-
-#include <linux/sched.h>
-#include <linux/hardirq.h>
-
-struct pt_regs;
-struct user_i387_struct;
-
-extern int init_fpu(struct task_struct *child);
-extern void fpu_finit(struct fpu *fpu);
-extern int dump_fpu(struct pt_regs *, struct user_i387_struct *);
-extern void math_state_restore(void);
-
-extern bool irq_fpu_usable(void);
-
-/*
- * Careful: __kernel_fpu_begin/end() must be called with preempt disabled
- * and they don't touch the preempt state on their own.
- * If you enable preemption after __kernel_fpu_begin(), preempt notifier
- * should call the __kernel_fpu_end() to prevent the kernel/user FPU
- * state from getting corrupted. KVM for example uses this model.
- *
- * All other cases use kernel_fpu_begin/end() which disable preemption
- * during kernel FPU usage.
- */
-extern void __kernel_fpu_begin(void);
-extern void __kernel_fpu_end(void);
-
-static inline void kernel_fpu_begin(void)
-{
-       preempt_disable();
-       WARN_ON_ONCE(!irq_fpu_usable());
-       __kernel_fpu_begin();
-}
-
-static inline void kernel_fpu_end(void)
-{
-       __kernel_fpu_end();
-       preempt_enable();
-}
-
-/* Must be called with preempt disabled */
-extern void kernel_fpu_disable(void);
-extern void kernel_fpu_enable(void);
-
-/*
- * Some instructions like VIA's padlock instructions generate a spurious
- * DNA fault but don't modify SSE registers. And these instructions
- * get used from interrupt context as well. To prevent these kernel instructions
- * in interrupt context interacting wrongly with other user/kernel fpu usage, we
- * should use them only in the context of irq_ts_save/restore()
- */
-static inline int irq_ts_save(void)
-{
-       /*
-        * If in process context and not atomic, we can take a spurious DNA fault.
-        * Otherwise, doing clts() in process context requires disabling preemption
-        * or some heavy lifting like kernel_fpu_begin()
-        */
-       if (!in_atomic())
-               return 0;
-
-       if (read_cr0() & X86_CR0_TS) {
-               clts();
-               return 1;
-       }
-
-       return 0;
-}
-
-static inline void irq_ts_restore(int TS_state)
-{
-       if (TS_state)
-               stts();
-}
-
-/*
- * The question "does this thread have fpu access?"
- * is slightly racy, since preemption could come in
- * and revoke it immediately after the test.
- *
- * However, even in that very unlikely scenario,
- * we can just assume we have FPU access - typically
- * to save the FP state - we'll just take a #NM
- * fault and get the FPU access back.
- */
-static inline int user_has_fpu(void)
-{
-       return current->thread.fpu.has_fpu;
-}
-
-extern void unlazy_fpu(struct task_struct *tsk);
-
-#endif /* __ASSEMBLY__ */
-
-#endif /* _ASM_X86_I387_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h

index f4a555beef1908b78c9ad6992d4727d59d9c81ee..f8c0ec3a4a979f75cdc2cf321739ff8c655450dd 100644 (file)
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1002,8 +1002,6 @@ void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
  
  void kvm_inject_nmi(struct kvm_vcpu *vcpu);
  
-int fx_init(struct kvm_vcpu *vcpu);
-
  void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
                        const u8 *new, int bytes);
  int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h

index 2fb20d6f7e23b0ccace549901dacf89b51e9c381..9e6278c7140eac3cac2e5841002300c3b903e457 100644 (file)
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -1,6 +1,8 @@
  #ifndef _ASM_X86_MICROCODE_H
  #define _ASM_X86_MICROCODE_H
  
+#include <linux/earlycpio.h>
+
  #define native_rdmsr(msr, val1, val2)                  \
  do {                                                   \
         u64 __val = native_read_msr((msr));             \
@@ -152,6 +154,7 @@ extern void __init load_ucode_bsp(void);
  extern void load_ucode_ap(void);
  extern int __init save_microcode_in_initrd(void);
  void reload_early_microcode(void);
+extern bool get_builtin_firmware(struct cpio_data *cd, const char *name);
  #else
  static inline void __init load_ucode_bsp(void) {}
  static inline void load_ucode_ap(void) {}
@@ -160,6 +163,9 @@ static inline int __init save_microcode_in_initrd(void)
         return 0;
  }
  static inline void reload_early_microcode(void) {}
+static inline bool get_builtin_firmware(struct cpio_data *cd, const char *name)
+{
+       return false;
+}
  #endif
-
  #endif /* _ASM_X86_MICROCODE_H */
diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h

index af935397e053e4a504daed8c14cfc06686a08794..ac6d328977a67e4fd0ae5674e2d7e8a30539ff92 100644 (file)
--- a/arch/x86/include/asm/microcode_amd.h
+++ b/arch/x86/include/asm/microcode_amd.h
@@ -65,12 +65,12 @@ extern enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, s
  extern u8 amd_ucode_patch[PATCH_MAX_SIZE];
  
  #ifdef CONFIG_MICROCODE_AMD_EARLY
-extern void __init load_ucode_amd_bsp(void);
+extern void __init load_ucode_amd_bsp(unsigned int family);
  extern void load_ucode_amd_ap(void);
  extern int __init save_microcode_in_initrd_amd(void);
  void reload_ucode_amd(void);
  #else
-static inline void __init load_ucode_amd_bsp(void) {}
+static inline void __init load_ucode_amd_bsp(unsigned int family) {}
  static inline void load_ucode_amd_ap(void) {}
  static inline int __init save_microcode_in_initrd_amd(void) { return -EINVAL; }
  void reload_ucode_amd(void) {}
diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h

index 2b9209c46ca939991abed04a1c5d4ef786b0c698..7991c606125d01b137a6f0e1ed16f1e8c1d86015 100644 (file)
--- a/arch/x86/include/asm/microcode_intel.h
+++ b/arch/x86/include/asm/microcode_intel.h
@@ -51,20 +51,11 @@ struct extended_sigtable {
         (((struct microcode_intel *)mc)->hdr.datasize ? \
          ((struct microcode_intel *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
  
-#define sigmatch(s1, s2, p1, p2) \
-       (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
-
  #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
  
-extern int get_matching_microcode(unsigned int csig, int cpf, int rev, void *mc);
+extern int has_newer_microcode(void *mc, unsigned int csig, int cpf, int rev);
  extern int microcode_sanity_check(void *mc, int print_err);
-extern int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc);
-
-static inline int
-revision_is_newer(struct microcode_header_intel *mc_header, int rev)
-{
-       return (mc_header->rev <= rev) ? 0 : 1;
-}
+extern int find_matching_signature(void *mc, unsigned int csig, int cpf);
  
  #ifdef CONFIG_MICROCODE_INTEL_EARLY
  extern void __init load_ucode_intel_bsp(void);
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h

index 883f6b933fa4b6501af7a050fc161eafdb8f8d91..5e8daee7c5c94be6fc48bc7f32e593fb46948557 100644 (file)
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -142,6 +142,19 @@ static inline void arch_exit_mmap(struct mm_struct *mm)
         paravirt_arch_exit_mmap(mm);
  }
  
+#ifdef CONFIG_X86_64
+static inline bool is_64bit_mm(struct mm_struct *mm)
+{
+       return  !config_enabled(CONFIG_IA32_EMULATION) ||
+               !(mm->context.ia32_compat == TIF_IA32);
+}
+#else
+static inline bool is_64bit_mm(struct mm_struct *mm)
+{
+       return false;
+}
+#endif
+
  static inline void arch_bprm_mm_init(struct mm_struct *mm,
                 struct vm_area_struct *vma)
  {
diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h

index a952a13d59a71bb354699cacd918d5583984693c..7a35495275a9b7b1150bde2935757a212defde59 100644 (file)
--- a/arch/x86/include/asm/mpx.h
+++ b/arch/x86/include/asm/mpx.h
@@ -13,55 +13,50 @@
  #define MPX_BNDCFG_ENABLE_FLAG 0x1
  #define MPX_BD_ENTRY_VALID_FLAG        0x1
  
-#ifdef CONFIG_X86_64
-
-/* upper 28 bits [47:20] of the virtual address in 64-bit used to
- * index into bounds directory (BD).
- */
-#define MPX_BD_ENTRY_OFFSET    28
-#define MPX_BD_ENTRY_SHIFT     3
-/* bits [19:3] of the virtual address in 64-bit used to index into
- * bounds table (BT).
+/*
+ * The upper 28 bits [47:20] of the virtual address in 64-bit
+ * are used to index into bounds directory (BD).
+ *
+ * The directory is 2G (2^31) in size, and with 8-byte entries
+ * it has 2^28 entries.
   */
-#define MPX_BT_ENTRY_OFFSET    17
-#define MPX_BT_ENTRY_SHIFT     5
-#define MPX_IGN_BITS           3
-#define MPX_BD_ENTRY_TAIL      3
+#define MPX_BD_SIZE_BYTES_64   (1UL<<31)
+#define MPX_BD_ENTRY_BYTES_64  8
+#define MPX_BD_NR_ENTRIES_64   (MPX_BD_SIZE_BYTES_64/MPX_BD_ENTRY_BYTES_64)
  
-#else
-
-#define MPX_BD_ENTRY_OFFSET    20
-#define MPX_BD_ENTRY_SHIFT     2
-#define MPX_BT_ENTRY_OFFSET    10
-#define MPX_BT_ENTRY_SHIFT     4
-#define MPX_IGN_BITS           2
-#define MPX_BD_ENTRY_TAIL      2
+/*
+ * The 32-bit directory is 4MB (2^22) in size, and with 4-byte
+ * entries it has 2^20 entries.
+ */
+#define MPX_BD_SIZE_BYTES_32   (1UL<<22)
+#define MPX_BD_ENTRY_BYTES_32  4
+#define MPX_BD_NR_ENTRIES_32   (MPX_BD_SIZE_BYTES_32/MPX_BD_ENTRY_BYTES_32)
  
-#endif
+/*
+ * A 64-bit table is 4MB total in size, and an entry is
+ * 4 64-bit pointers in size.
+ */
+#define MPX_BT_SIZE_BYTES_64   (1UL<<22)
+#define MPX_BT_ENTRY_BYTES_64  32
+#define MPX_BT_NR_ENTRIES_64   (MPX_BT_SIZE_BYTES_64/MPX_BT_ENTRY_BYTES_64)
  
-#define MPX_BD_SIZE_BYTES (1UL<<(MPX_BD_ENTRY_OFFSET+MPX_BD_ENTRY_SHIFT))
-#define MPX_BT_SIZE_BYTES (1UL<<(MPX_BT_ENTRY_OFFSET+MPX_BT_ENTRY_SHIFT))
+/*
+ * A 32-bit table is 16kB total in size, and an entry is
+ * 4 32-bit pointers in size.
+ */
+#define MPX_BT_SIZE_BYTES_32   (1UL<<14)
+#define MPX_BT_ENTRY_BYTES_32  16
+#define MPX_BT_NR_ENTRIES_32   (MPX_BT_SIZE_BYTES_32/MPX_BT_ENTRY_BYTES_32)
  
  #define MPX_BNDSTA_TAIL                2
  #define MPX_BNDCFG_TAIL                12
  #define MPX_BNDSTA_ADDR_MASK   (~((1UL<<MPX_BNDSTA_TAIL)-1))
-#define MPX_BNDCFG_ADDR_MASK   (~((1UL<<MPX_BNDCFG_TAIL)-1))
-#define MPX_BT_ADDR_MASK       (~((1UL<<MPX_BD_ENTRY_TAIL)-1))
-
  #define MPX_BNDCFG_ADDR_MASK   (~((1UL<<MPX_BNDCFG_TAIL)-1))
  #define MPX_BNDSTA_ERROR_CODE  0x3
  
-#define MPX_BD_ENTRY_MASK      ((1<<MPX_BD_ENTRY_OFFSET)-1)
-#define MPX_BT_ENTRY_MASK      ((1<<MPX_BT_ENTRY_OFFSET)-1)
-#define MPX_GET_BD_ENTRY_OFFSET(addr)  ((((addr)>>(MPX_BT_ENTRY_OFFSET+ \
-               MPX_IGN_BITS)) & MPX_BD_ENTRY_MASK) << MPX_BD_ENTRY_SHIFT)
-#define MPX_GET_BT_ENTRY_OFFSET(addr)  ((((addr)>>MPX_IGN_BITS) & \
-               MPX_BT_ENTRY_MASK) << MPX_BT_ENTRY_SHIFT)
-
  #ifdef CONFIG_X86_INTEL_MPX
-siginfo_t *mpx_generate_siginfo(struct pt_regs *regs,
-                               struct xsave_struct *xsave_buf);
-int mpx_handle_bd_fault(struct xsave_struct *xsave_buf);
+siginfo_t *mpx_generate_siginfo(struct pt_regs *regs);
+int mpx_handle_bd_fault(void);
  static inline int kernel_managing_mpx_tables(struct mm_struct *mm)
  {
         return (mm->bd_addr != MPX_INVALID_BOUNDS_DIR);
@@ -77,12 +72,11 @@ static inline void mpx_mm_init(struct mm_struct *mm)
  void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
                       unsigned long start, unsigned long end);
  #else
-static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs,
-                                             struct xsave_struct *xsave_buf)
+static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs)
  {
         return NULL;
  }
-static inline int mpx_handle_bd_fault(struct xsave_struct *xsave_buf)
+static inline int mpx_handle_bd_fault(void)
  {
         return -EINVAL;
  }
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h

index 8f32718425339f426778bbe09e2c468e5ff63814..dca71714f86076f5fae6508fe6bbcb2c573723ae 100644 (file)
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -99,11 +99,9 @@ static __always_inline bool should_resched(void)
    extern asmlinkage void ___preempt_schedule(void);
  # define __preempt_schedule() asm ("call ___preempt_schedule")
    extern asmlinkage void preempt_schedule(void);
-# ifdef CONFIG_CONTEXT_TRACKING
-    extern asmlinkage void ___preempt_schedule_context(void);
-#   define __preempt_schedule_context() asm ("call ___preempt_schedule_context")
-    extern asmlinkage void preempt_schedule_context(void);
-# endif
+  extern asmlinkage void ___preempt_schedule_notrace(void);
+# define __preempt_schedule_notrace() asm ("call ___preempt_schedule_notrace")
+  extern asmlinkage void preempt_schedule_notrace(void);
  #endif
  
  #endif /* __ASM_PREEMPT_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h

index 23ba6765b718c790dbf698edbb2758a2ca6f9102..43e6519df0d507429a9533b51c7a28f2c0f3b90b 100644 (file)
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -21,6 +21,7 @@ struct mm_struct;
  #include <asm/desc_defs.h>
  #include <asm/nops.h>
  #include <asm/special_insns.h>
+#include <asm/fpu/types.h>
  
  #include <linux/personality.h>
  #include <linux/cpumask.h>
@@ -52,11 +53,16 @@ static inline void *current_text_addr(void)
         return pc;
  }
  
+/*
+ * These alignment constraints are for performance in the vSMP case,
+ * but in the task_struct case we must also meet hardware imposed
+ * alignment requirements of the FPU state:
+ */
  #ifdef CONFIG_X86_VSMP
  # define ARCH_MIN_TASKALIGN            (1 << INTERNODE_CACHE_SHIFT)
  # define ARCH_MIN_MMSTRUCT_ALIGN       (1 << INTERNODE_CACHE_SHIFT)
  #else
-# define ARCH_MIN_TASKALIGN            16
+# define ARCH_MIN_TASKALIGN            __alignof__(union fpregs_state)
  # define ARCH_MIN_MMSTRUCT_ALIGN       0
  #endif
  
@@ -166,7 +172,6 @@ extern const struct seq_operations cpuinfo_op;
  #define cache_line_size()      (boot_cpu_data.x86_cache_alignment)
  
  extern void cpu_detect(struct cpuinfo_x86 *c);
-extern void fpu_detect(struct cpuinfo_x86 *c);
  
  extern void early_cpu_init(void);
  extern void identify_boot_cpu(void);
@@ -313,128 +318,6 @@ struct orig_ist {
         unsigned long           ist[7];
  };
  
-#define        MXCSR_DEFAULT           0x1f80
-
-struct i387_fsave_struct {
-       u32                     cwd;    /* FPU Control Word             */
-       u32                     swd;    /* FPU Status Word              */
-       u32                     twd;    /* FPU Tag Word                 */
-       u32                     fip;    /* FPU IP Offset                */
-       u32                     fcs;    /* FPU IP Selector              */
-       u32                     foo;    /* FPU Operand Pointer Offset   */
-       u32                     fos;    /* FPU Operand Pointer Selector */
-
-       /* 8*10 bytes for each FP-reg = 80 bytes:                       */
-       u32                     st_space[20];
-
-       /* Software status information [not touched by FSAVE ]:         */
-       u32                     status;
-};
-
-struct i387_fxsave_struct {
-       u16                     cwd; /* Control Word                    */
-       u16                     swd; /* Status Word                     */
-       u16                     twd; /* Tag Word                        */
-       u16                     fop; /* Last Instruction Opcode         */
-       union {
-               struct {
-                       u64     rip; /* Instruction Pointer             */
-                       u64     rdp; /* Data Pointer                    */
-               };
-               struct {
-                       u32     fip; /* FPU IP Offset                   */
-                       u32     fcs; /* FPU IP Selector                 */
-                       u32     foo; /* FPU Operand Offset              */
-                       u32     fos; /* FPU Operand Selector            */
-               };
-       };
-       u32                     mxcsr;          /* MXCSR Register State */
-       u32                     mxcsr_mask;     /* MXCSR Mask           */
-
-       /* 8*16 bytes for each FP-reg = 128 bytes:                      */
-       u32                     st_space[32];
-
-       /* 16*16 bytes for each XMM-reg = 256 bytes:                    */
-       u32                     xmm_space[64];
-
-       u32                     padding[12];
-
-       union {
-               u32             padding1[12];
-               u32             sw_reserved[12];
-       };
-
-} __attribute__((aligned(16)));
-
-struct i387_soft_struct {
-       u32                     cwd;
-       u32                     swd;
-       u32                     twd;
-       u32                     fip;
-       u32                     fcs;
-       u32                     foo;
-       u32                     fos;
-       /* 8*10 bytes for each FP-reg = 80 bytes: */
-       u32                     st_space[20];
-       u8                      ftop;
-       u8                      changed;
-       u8                      lookahead;
-       u8                      no_update;
-       u8                      rm;
-       u8                      alimit;
-       struct math_emu_info    *info;
-       u32                     entry_eip;
-};
-
-struct ymmh_struct {
-       /* 16 * 16 bytes for each YMMH-reg = 256 bytes */
-       u32 ymmh_space[64];
-};
-
-/* We don't support LWP yet: */
-struct lwp_struct {
-       u8 reserved[128];
-};
-
-struct bndreg {
-       u64 lower_bound;
-       u64 upper_bound;
-} __packed;
-
-struct bndcsr {
-       u64 bndcfgu;
-       u64 bndstatus;
-} __packed;
-
-struct xsave_hdr_struct {
-       u64 xstate_bv;
-       u64 xcomp_bv;
-       u64 reserved[6];
-} __attribute__((packed));
-
-struct xsave_struct {
-       struct i387_fxsave_struct i387;
-       struct xsave_hdr_struct xsave_hdr;
-       struct ymmh_struct ymmh;
-       struct lwp_struct lwp;
-       struct bndreg bndreg[4];
-       struct bndcsr bndcsr;
-       /* new processor state extensions will go here */
-} __attribute__ ((packed, aligned (64)));
-
-union thread_xstate {
-       struct i387_fsave_struct        fsave;
-       struct i387_fxsave_struct       fxsave;
-       struct i387_soft_struct         soft;
-       struct xsave_struct             xsave;
-};
-
-struct fpu {
-       unsigned int last_cpu;
-       unsigned int has_fpu;
-       union thread_xstate *state;
-};
-
  #ifdef CONFIG_X86_64
  DECLARE_PER_CPU(struct orig_ist, orig_ist);
  
@@ -483,8 +366,6 @@ DECLARE_PER_CPU(struct irq_stack *, softirq_stack);
  #endif /* X86_64 */
  
  extern unsigned int xstate_size;
-extern void free_thread_xstate(struct task_struct *);
-extern struct kmem_cache *task_xstate_cachep;
  
  struct perf_event;
  
@@ -508,6 +389,10 @@ struct thread_struct {
         unsigned long           fs;
  #endif
         unsigned long           gs;
+
+       /* Floating point and extended processor state */
+       struct fpu              fpu;
+
         /* Save middle states of ptrace breakpoints */
         struct perf_event       *ptrace_bps[HBP_NUM];
         /* Debug status used for traps, single steps, etc... */
@@ -518,8 +403,6 @@ struct thread_struct {
         unsigned long           cr2;
         unsigned long           trap_nr;
         unsigned long           error_code;
-       /* floating point and extended processor state */
-       struct fpu              fpu;
  #ifdef CONFIG_X86_32
         /* Virtual 86 mode info */
         struct vm86_struct __user *vm86_info;
@@ -535,15 +418,6 @@ struct thread_struct {
         unsigned long           iopl;
         /* Max allowed port in the bitmap, in bytes: */
         unsigned                io_bitmap_max;
-       /*
-        * fpu_counter contains the number of consecutive context switches
-        * that the FPU is used. If this is over a threshold, the lazy fpu
-        * saving becomes unlazy to save the trap. This is an unsigned char
-        * so that after 256 times the counter wraps and the behavior turns
-        * lazy again; this to deal with bursty apps that only use FPU for
-        * a short time
-        */
-       unsigned char fpu_counter;
  };
  
  /*
@@ -928,24 +802,25 @@ extern int get_tsc_mode(unsigned long adr);
  extern int set_tsc_mode(unsigned int val);
  
  /* Register/unregister a process' MPX related resource */
-#define MPX_ENABLE_MANAGEMENT(tsk)     mpx_enable_management((tsk))
-#define MPX_DISABLE_MANAGEMENT(tsk)    mpx_disable_management((tsk))
+#define MPX_ENABLE_MANAGEMENT()        mpx_enable_management()
+#define MPX_DISABLE_MANAGEMENT()       mpx_disable_management()
  
  #ifdef CONFIG_X86_INTEL_MPX
-extern int mpx_enable_management(struct task_struct *tsk);
-extern int mpx_disable_management(struct task_struct *tsk);
+extern int mpx_enable_management(void);
+extern int mpx_disable_management(void);
  #else
-static inline int mpx_enable_management(struct task_struct *tsk)
+static inline int mpx_enable_management(void)
  {
         return -EINVAL;
  }
-static inline int mpx_disable_management(struct task_struct *tsk)
+static inline int mpx_disable_management(void)
  {
         return -EINVAL;
  }
  #endif /* CONFIG_X86_INTEL_MPX */
  
  extern u16 amd_get_nb_id(int cpu);
+extern u32 amd_get_nodes_per_socket(void);
  
  static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves)
  {
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h

index 19507ffa5d28e9ce3ddece3856dd9cde4446f7f8..5fabf1362942c65e5fc4327511e51487a14bd5d7 100644 (file)
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -107,7 +107,7 @@ static inline unsigned long regs_return_value(struct pt_regs *regs)
  static inline int user_mode(struct pt_regs *regs)
  {
  #ifdef CONFIG_X86_32
-       return (regs->cs & SEGMENT_RPL_MASK) == USER_RPL;
+       return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >= USER_RPL;
  #else
         return !!(regs->cs & 3);
  #endif
diff --git a/arch/x86/include/asm/simd.h b/arch/x86/include/asm/simd.h

index ee80b92f00962a392a824221dbdbeb781b6312bf..6c8a7ed13365ae5dd675fd4b158c8034ba213a29 100644 (file)
--- a/arch/x86/include/asm/simd.h
+++ b/arch/x86/include/asm/simd.h
@@ -1,5 +1,5 @@
  
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
  
  /*
   * may_use_simd - whether it is allowable at this time to issue SIMD
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h

index 17a8dced12daef5c9fcc5351c165994225295372..222a6a3ca2b5ebeeff21037b8a3b517b8fd3353a 100644 (file)
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -37,16 +37,6 @@ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
  DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id);
  DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);
  
-static inline struct cpumask *cpu_sibling_mask(int cpu)
-{
-       return per_cpu(cpu_sibling_map, cpu);
-}
-
-static inline struct cpumask *cpu_core_mask(int cpu)
-{
-       return per_cpu(cpu_core_map, cpu);
-}
-
  static inline struct cpumask *cpu_llc_shared_mask(int cpu)
  {
         return per_cpu(cpu_llc_shared_map, cpu);
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h

index 6a998598f172424f198bf7e034676a8c47b2eb9b..c2e00bb2a1365cef17911e262d28d0fad75e6828 100644 (file)
--- a/arch/x86/include/asm/stackprotector.h
+++ b/arch/x86/include/asm/stackprotector.h
@@ -39,7 +39,9 @@
  #include <asm/processor.h>
  #include <asm/percpu.h>
  #include <asm/desc.h>
+
  #include <linux/random.h>
+#include <linux/sched.h>
  
  /*
   * 24 byte read-only segment initializer for stack canary.  Linker
diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h

index 552d6c90a6d43dbebbae0756ba10061bc0969e8d..d1793f06854d28f22481c40f03cd2171d7b13495 100644 (file)
--- a/arch/x86/include/asm/suspend_32.h
+++ b/arch/x86/include/asm/suspend_32.h
@@ -7,7 +7,7 @@
  #define _ASM_X86_SUSPEND_32_H
  
  #include <asm/desc.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
  
  /* image of the saved processor state */
  struct saved_context {
diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h

index bc6232834babf33973c8a4e6da1ce63cd159dce4..7ebf0ebe4e687f3704cac47b3de3dd1b906d27e8 100644 (file)
--- a/arch/x86/include/asm/suspend_64.h
+++ b/arch/x86/include/asm/suspend_64.h
@@ -7,7 +7,7 @@
  #define _ASM_X86_SUSPEND_64_H
  
  #include <asm/desc.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
  
  /*
   * Image of the saved processor state, used by the low level ACPI suspend to
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h

index 8d717faeed225cf53d9cf09bb57228e74845940a..0fb46482dfde160b9dcfad6ef57841e07c3830e2 100644 (file)
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -124,7 +124,7 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu);
  
  #ifdef ENABLE_TOPO_DEFINES
  #define topology_core_cpumask(cpu)             (per_cpu(cpu_core_map, cpu))
-#define topology_thread_cpumask(cpu)           (per_cpu(cpu_sibling_map, cpu))
+#define topology_sibling_cpumask(cpu)          (per_cpu(cpu_sibling_map, cpu))
  #endif
  
  static inline void arch_fix_phys_package_id(int num, u32 slot)
diff --git a/arch/x86/include/asm/trace/mpx.h b/arch/x86/include/asm/trace/mpx.h

new file mode 100644 (file)

index 0000000..173dd3b
--- /dev/null
+++ b/arch/x86/include/asm/trace/mpx.h
@@ -0,0 +1,132 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM mpx
+
+#if !defined(_TRACE_MPX_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_MPX_H
+
+#include <linux/tracepoint.h>
+
+#ifdef CONFIG_X86_INTEL_MPX
+
+TRACE_EVENT(mpx_bounds_register_exception,
+
+       TP_PROTO(void *addr_referenced,
+                const struct bndreg *bndreg),
+       TP_ARGS(addr_referenced, bndreg),
+
+       TP_STRUCT__entry(
+               __field(void *, addr_referenced)
+               __field(u64, lower_bound)
+               __field(u64, upper_bound)
+       ),
+
+       TP_fast_assign(
+               __entry->addr_referenced = addr_referenced;
+               __entry->lower_bound = bndreg->lower_bound;
+               __entry->upper_bound = bndreg->upper_bound;
+       ),
+       /*
+        * Note that we are printing out the '~' of the upper
+        * bounds register here.  It is actually stored in its
+        * one's complement form so that its 'init' state
+        * corresponds to all 0's.  But, that looks like
+        * gibberish when printed out, so print out the 1's
+        * complement instead of the actual value here.  Note
+        * though that you still need to specify filters for the
+        * actual value, not the displayed one.
+        */
+       TP_printk("address referenced: 0x%p bounds: lower: 0x%llx ~upper: 0x%llx",
+               __entry->addr_referenced,
+               __entry->lower_bound,
+               ~__entry->upper_bound
+       )
+);
+
+TRACE_EVENT(bounds_exception_mpx,
+
+       TP_PROTO(const struct bndcsr *bndcsr),
+       TP_ARGS(bndcsr),
+
+       TP_STRUCT__entry(
+               __field(u64, bndcfgu)
+               __field(u64, bndstatus)
+       ),
+
+       TP_fast_assign(
+               /* need to get rid of the 'const' on bndcsr */
+               __entry->bndcfgu   = (u64)bndcsr->bndcfgu;
+               __entry->bndstatus = (u64)bndcsr->bndstatus;
+       ),
+
+       TP_printk("bndcfgu:0x%llx bndstatus:0x%llx",
+               __entry->bndcfgu,
+               __entry->bndstatus)
+);
+
+DECLARE_EVENT_CLASS(mpx_range_trace,
+
+       TP_PROTO(unsigned long start,
+                unsigned long end),
+       TP_ARGS(start, end),
+
+       TP_STRUCT__entry(
+               __field(unsigned long, start)
+               __field(unsigned long, end)
+       ),
+
+       TP_fast_assign(
+               __entry->start = start;
+               __entry->end   = end;
+       ),
+
+       TP_printk("[0x%p:0x%p]",
+               (void *)__entry->start,
+               (void *)__entry->end
+       )
+);
+
+DEFINE_EVENT(mpx_range_trace, mpx_unmap_zap,
+       TP_PROTO(unsigned long start, unsigned long end),
+       TP_ARGS(start, end)
+);
+
+DEFINE_EVENT(mpx_range_trace, mpx_unmap_search,
+       TP_PROTO(unsigned long start, unsigned long end),
+       TP_ARGS(start, end)
+);
+
+TRACE_EVENT(mpx_new_bounds_table,
+
+       TP_PROTO(unsigned long table_vaddr),
+       TP_ARGS(table_vaddr),
+
+       TP_STRUCT__entry(
+               __field(unsigned long, table_vaddr)
+       ),
+
+       TP_fast_assign(
+               __entry->table_vaddr = table_vaddr;
+       ),
+
+       TP_printk("table vaddr:%p", (void *)__entry->table_vaddr)
+);
+
+#else
+
+/*
+ * This gets used outside of MPX-specific code, so we need a stub.
+ */
+static inline void trace_bounds_exception_mpx(const struct bndcsr *bndcsr)
+{
+}
+
+#endif /* CONFIG_X86_INTEL_MPX */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH asm/trace/
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE mpx
+#endif /* _TRACE_MPX_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h

index ace9dec050b17b1a766899a946ab83d6bbe31641..a8df874f3e8825b0ea6bde91909b8b3d7b380ba7 100644 (file)
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -74,7 +74,8 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
   * @addr: User space pointer to start of block to check
   * @size: Size of block to check
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * Checks if a pointer to a block of memory in user space is valid.
   *
@@ -145,7 +146,8 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
   * @x:   Variable to store result.
   * @ptr: Source address, in user space.
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * This macro copies a single simple variable from user space to kernel
   * space.  It supports simple types like char and int, but not larger
@@ -240,7 +242,8 @@ extern void __put_user_8(void);
   * @x:   Value to copy to user space.
   * @ptr: Destination address, in user space.
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * This macro copies a single simple value from kernel space to user
   * space.  It supports simple types like char and int, but not larger
@@ -455,7 +458,8 @@ struct __large_struct { unsigned long buf[100]; };
   * @x:   Variable to store result.
   * @ptr: Source address, in user space.
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * This macro copies a single simple variable from user space to kernel
   * space.  It supports simple types like char and int, but not larger
@@ -479,7 +483,8 @@ struct __large_struct { unsigned long buf[100]; };
   * @x:   Value to copy to user space.
   * @ptr: Destination address, in user space.
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * This macro copies a single simple value from kernel space to user
   * space.  It supports simple types like char and int, but not larger
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h

index 0ed5504c606081642912ae8ca31d3dc438f45200..f5dcb5204dcd5b27e8b8e9a1b87612a28cda10c6 100644 (file)
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -74,7 +74,8 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
   * @from: Source address, in kernel space.
   * @n:    Number of bytes to copy.
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * Copy data from kernel space to user space.  Caller must check
   * the specified block with access_ok() before calling this function.
@@ -121,7 +122,8 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
   * @from: Source address, in user space.
   * @n:    Number of bytes to copy.
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * Copy data from user space to kernel space.  Caller must check
   * the specified block with access_ok() before calling this function.
diff --git a/arch/x86/include/asm/user.h b/arch/x86/include/asm/user.h

index ccab4af1646d440584bf981c3a08784590b961dc..59a54e869f1598f05a38310e955d360f59bd87ad 100644 (file)
--- a/arch/x86/include/asm/user.h
+++ b/arch/x86/include/asm/user.h
@@ -14,8 +14,8 @@ struct user_ymmh_regs {
         __u32 ymmh_space[64];
  };
  
-struct user_xsave_hdr {
-       __u64 xstate_bv;
+struct user_xstate_header {
+       __u64 xfeatures;
         __u64 reserved1[2];
         __u64 reserved2[5];
  };
@@ -41,11 +41,11 @@ struct user_xsave_hdr {
   * particular process/thread.
   *
   * Also when the user modifies certain state FP/SSE/etc through the
- * ptrace interface, they must ensure that the xsave_hdr.xstate_bv
+ * ptrace interface, they must ensure that the header.xfeatures
   * bytes[512..519] of the memory layout are updated correspondingly.
   * i.e., for example when FP state is modified to a non-init state,
- * xsave_hdr.xstate_bv's bit 0 must be set to '1', when SSE is modified to
- * non-init state, xsave_hdr.xstate_bv's bit 1 must to be set to '1', etc.
+ * header.xfeatures's bit 0 must be set to '1', when SSE is modified to
+ * non-init state, header.xfeatures's bit 1 must to be set to '1', etc.
   */
  #define USER_XSTATE_FX_SW_WORDS 6
  #define USER_XSTATE_XCR0_WORD  0
@@ -55,7 +55,7 @@ struct user_xstateregs {
                 __u64 fpx_space[58];
                 __u64 xstate_fx_sw[USER_XSTATE_FX_SW_WORDS];
         } i387;
-       struct user_xsave_hdr xsave_hdr;
+       struct user_xstate_header header;
         struct user_ymmh_regs ymmh;
         /* further processor state extensions go here */
  };
diff --git a/arch/x86/include/asm/xcr.h b/arch/x86/include/asm/xcr.h

deleted file mode 100644 (file)

index f2cba4e..0000000
--- a/arch/x86/include/asm/xcr.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* -*- linux-c -*- ------------------------------------------------------- *
- *
- *   Copyright 2008 rPath, Inc. - All Rights Reserved
- *
- *   This file is part of the Linux kernel, and is made available under
- *   the terms of the GNU General Public License version 2 or (at your
- *   option) any later version; incorporated herein by reference.
- *
- * ----------------------------------------------------------------------- */
-
-/*
- * asm-x86/xcr.h
- *
- * Definitions for the eXtended Control Register instructions
- */
-
-#ifndef _ASM_X86_XCR_H
-#define _ASM_X86_XCR_H
-
-#define XCR_XFEATURE_ENABLED_MASK      0x00000000
-
-#ifdef __KERNEL__
-# ifndef __ASSEMBLY__
-
-#include <linux/types.h>
-
-static inline u64 xgetbv(u32 index)
-{
-       u32 eax, edx;
-
-       asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */
-                    : "=a" (eax), "=d" (edx)
-                    : "c" (index));
-       return eax + ((u64)edx << 32);
-}
-
-static inline void xsetbv(u32 index, u64 value)
-{
-       u32 eax = value;
-       u32 edx = value >> 32;
-
-       asm volatile(".byte 0x0f,0x01,0xd1" /* xsetbv */
-                    : : "a" (eax), "d" (edx), "c" (index));
-}
-
-# endif /* __ASSEMBLY__ */
-#endif /* __KERNEL__ */
-
-#endif /* _ASM_X86_XCR_H */
diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h

index d8829751b3f895e9fd19fa6bd597758650f417b8..1f5c5161ead682664dc30fc5dda802de2de0bc4b 100644 (file)
--- a/arch/x86/include/asm/xor.h
+++ b/arch/x86/include/asm/xor.h
@@ -36,7 +36,7 @@
   * no advantages to be gotten from x86-64 here anyways.
   */
  
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
  
  #ifdef CONFIG_X86_32
  /* reduce register pressure */
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h

index ce05722e3c68bce4d72a1bcdc9e798b5014581cf..5a08bc8bff33934e10b4b9afe8e3236ac8c5ce93 100644 (file)
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -26,7 +26,7 @@
  #define XO3(x, y)      "       pxor   8*("#x")(%4), %%mm"#y"   ;\n"
  #define XO4(x, y)      "       pxor   8*("#x")(%5), %%mm"#y"   ;\n"
  
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
  
  static void
  xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h

index 492b29802f571b0363a22fe686f708c8235c317a..7c0a517ec7511a667166c216df8357087ff3e7b0 100644 (file)
--- a/arch/x86/include/asm/xor_avx.h
+++ b/arch/x86/include/asm/xor_avx.h
@@ -18,7 +18,7 @@
  #ifdef CONFIG_AS_AVX
  
  #include <linux/compiler.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
  
  #define BLOCK4(i) \
                 BLOCK(32 * i, 0) \
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h

deleted file mode 100644 (file)

index c9a6d68..0000000
--- a/arch/x86/include/asm/xsave.h
+++ /dev/null
@@ -1,257 +0,0 @@
-#ifndef __ASM_X86_XSAVE_H
-#define __ASM_X86_XSAVE_H
-
-#include <linux/types.h>
-#include <asm/processor.h>
-
-#define XSTATE_CPUID           0x0000000d
-
-#define XSTATE_FP              0x1
-#define XSTATE_SSE             0x2
-#define XSTATE_YMM             0x4
-#define XSTATE_BNDREGS         0x8
-#define XSTATE_BNDCSR          0x10
-#define XSTATE_OPMASK          0x20
-#define XSTATE_ZMM_Hi256       0x40
-#define XSTATE_Hi16_ZMM                0x80
-
-#define XSTATE_FPSSE   (XSTATE_FP | XSTATE_SSE)
-#define XSTATE_AVX512  (XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM)
-/* Bit 63 of XCR0 is reserved for future expansion */
-#define XSTATE_EXTEND_MASK     (~(XSTATE_FPSSE | (1ULL << 63)))
-
-#define FXSAVE_SIZE    512
-
-#define XSAVE_HDR_SIZE     64
-#define XSAVE_HDR_OFFSET    FXSAVE_SIZE
-
-#define XSAVE_YMM_SIZE     256
-#define XSAVE_YMM_OFFSET    (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)
-
-/* Supported features which support lazy state saving */
-#define XSTATE_LAZY    (XSTATE_FP | XSTATE_SSE | XSTATE_YMM                  \
-                       | XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM)
-
-/* Supported features which require eager state saving */
-#define XSTATE_EAGER   (XSTATE_BNDREGS | XSTATE_BNDCSR)
-
-/* All currently supported features */
-#define XCNTXT_MASK    (XSTATE_LAZY | XSTATE_EAGER)
-
-#ifdef CONFIG_X86_64
-#define REX_PREFIX     "0x48, "
-#else
-#define REX_PREFIX
-#endif
-
-extern unsigned int xstate_size;
-extern u64 pcntxt_mask;
-extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
-extern struct xsave_struct *init_xstate_buf;
-
-extern void xsave_init(void);
-extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask);
-extern int init_fpu(struct task_struct *child);
-
-/* These macros all use (%edi)/(%rdi) as the single memory argument. */
-#define XSAVE          ".byte " REX_PREFIX "0x0f,0xae,0x27"
-#define XSAVEOPT       ".byte " REX_PREFIX "0x0f,0xae,0x37"
-#define XSAVES         ".byte " REX_PREFIX "0x0f,0xc7,0x2f"
-#define XRSTOR         ".byte " REX_PREFIX "0x0f,0xae,0x2f"
-#define XRSTORS                ".byte " REX_PREFIX "0x0f,0xc7,0x1f"
-
-#define xstate_fault   ".section .fixup,\"ax\"\n"      \
-                       "3:  movl $-1,%[err]\n"         \
-                       "    jmp  2b\n"                 \
-                       ".previous\n"                   \
-                       _ASM_EXTABLE(1b, 3b)            \
-                       : [err] "=r" (err)
-
-/*
- * This function is called only during boot time when x86 caps are not set
- * up and alternative can not be used yet.
- */
-static inline int xsave_state_booting(struct xsave_struct *fx, u64 mask)
-{
-       u32 lmask = mask;
-       u32 hmask = mask >> 32;
-       int err = 0;
-
-       WARN_ON(system_state != SYSTEM_BOOTING);
-
-       if (boot_cpu_has(X86_FEATURE_XSAVES))
-               asm volatile("1:"XSAVES"\n\t"
-                       "2:\n\t"
-                            xstate_fault
-                       : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
-                       :   "memory");
-       else
-               asm volatile("1:"XSAVE"\n\t"
-                       "2:\n\t"
-                            xstate_fault
-                       : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
-                       :   "memory");
-       return err;
-}
-
-/*
- * This function is called only during boot time when x86 caps are not set
- * up and alternative can not be used yet.
- */
-static inline int xrstor_state_booting(struct xsave_struct *fx, u64 mask)
-{
-       u32 lmask = mask;
-       u32 hmask = mask >> 32;
-       int err = 0;
-
-       WARN_ON(system_state != SYSTEM_BOOTING);
-
-       if (boot_cpu_has(X86_FEATURE_XSAVES))
-               asm volatile("1:"XRSTORS"\n\t"
-                       "2:\n\t"
-                            xstate_fault
-                       : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
-                       :   "memory");
-       else
-               asm volatile("1:"XRSTOR"\n\t"
-                       "2:\n\t"
-                            xstate_fault
-                       : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
-                       :   "memory");
-       return err;
-}
-
-/*
- * Save processor xstate to xsave area.
- */
-static inline int xsave_state(struct xsave_struct *fx, u64 mask)
-{
-       u32 lmask = mask;
-       u32 hmask = mask >> 32;
-       int err = 0;
-
-       /*
-        * If xsaves is enabled, xsaves replaces xsaveopt because
-        * it supports compact format and supervisor states in addition to
-        * modified optimization in xsaveopt.
-        *
-        * Otherwise, if xsaveopt is enabled, xsaveopt replaces xsave
-        * because xsaveopt supports modified optimization which is not
-        * supported by xsave.
-        *
-        * If none of xsaves and xsaveopt is enabled, use xsave.
-        */
-       alternative_input_2(
-               "1:"XSAVE,
-               XSAVEOPT,
-               X86_FEATURE_XSAVEOPT,
-               XSAVES,
-               X86_FEATURE_XSAVES,
-               [fx] "D" (fx), "a" (lmask), "d" (hmask) :
-               "memory");
-       asm volatile("2:\n\t"
-                    xstate_fault
-                    : "0" (0)
-                    : "memory");
-
-       return err;
-}
-
-/*
- * Restore processor xstate from xsave area.
- */
-static inline int xrstor_state(struct xsave_struct *fx, u64 mask)
-{
-       int err = 0;
-       u32 lmask = mask;
-       u32 hmask = mask >> 32;
-
-       /*
-        * Use xrstors to restore context if it is enabled. xrstors supports
-        * compacted format of xsave area which is not supported by xrstor.
-        */
-       alternative_input(
-               "1: " XRSTOR,
-               XRSTORS,
-               X86_FEATURE_XSAVES,
-               "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
-               : "memory");
-
-       asm volatile("2:\n"
-                    xstate_fault
-                    : "0" (0)
-                    : "memory");
-
-       return err;
-}
-
-/*
- * Save xstate context for old process during context switch.
- */
-static inline void fpu_xsave(struct fpu *fpu)
-{
-       xsave_state(&fpu->state->xsave, -1);
-}
-
-/*
- * Restore xstate context for new process during context switch.
- */
-static inline int fpu_xrstor_checking(struct xsave_struct *fx)
-{
-       return xrstor_state(fx, -1);
-}
-
-/*
- * Save xstate to user space xsave area.
- *
- * We don't use modified optimization because xrstor/xrstors might track
- * a different application.
- *
- * We don't use compacted format xsave area for
- * backward compatibility for old applications which don't understand
- * compacted format of xsave area.
- */
-static inline int xsave_user(struct xsave_struct __user *buf)
-{
-       int err;
-
-       /*
-        * Clear the xsave header first, so that reserved fields are
-        * initialized to zero.
-        */
-       err = __clear_user(&buf->xsave_hdr, sizeof(buf->xsave_hdr));
-       if (unlikely(err))
-               return -EFAULT;
-
-       __asm__ __volatile__(ASM_STAC "\n"
-                            "1:"XSAVE"\n"
-                            "2: " ASM_CLAC "\n"
-                            xstate_fault
-                            : "D" (buf), "a" (-1), "d" (-1), "0" (0)
-                            : "memory");
-       return err;
-}
-
-/*
- * Restore xstate from user space xsave area.
- */
-static inline int xrestore_user(struct xsave_struct __user *buf, u64 mask)
-{
-       int err = 0;
-       struct xsave_struct *xstate = ((__force struct xsave_struct *)buf);
-       u32 lmask = mask;
-       u32 hmask = mask >> 32;
-
-       __asm__ __volatile__(ASM_STAC "\n"
-                            "1:"XRSTOR"\n"
-                            "2: " ASM_CLAC "\n"
-                            xstate_fault
-                            : "D" (xstate), "a" (lmask), "d" (hmask), "0" (0)
-                            : "memory");       /* memory required? */
-       return err;
-}
-
-void *get_xsave_addr(struct xsave_struct *xsave, int xstate);
-void setup_xstate_comp(void);
-
-#endif
diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h

index 16dc4e8a2cd34845042445915f9e9d74d90546c6..0e8a973de9ee8aec0c555a5e9e8b23348e2cc10b 100644 (file)
--- a/arch/x86/include/uapi/asm/sigcontext.h
+++ b/arch/x86/include/uapi/asm/sigcontext.h
@@ -25,7 +25,7 @@ struct _fpx_sw_bytes {
         __u32 extended_size;    /* total size of the layout referred by
                                  * fpstate pointer in the sigcontext.
                                  */
-       __u64 xstate_bv;
+       __u64 xfeatures;
                                 /* feature bit mask (including fp/sse/extended
                                  * state) that is present in the memory
                                  * layout.
@@ -209,8 +209,8 @@ struct sigcontext {
  
  #endif /* !__i386__ */
  
-struct _xsave_hdr {
-       __u64 xstate_bv;
+struct _header {
+       __u64 xfeatures;
         __u64 reserved1[2];
         __u64 reserved2[5];
  };
@@ -228,7 +228,7 @@ struct _ymmh_state {
   */
  struct _xstate {
         struct _fpstate fpstate;
-       struct _xsave_hdr xstate_hdr;
+       struct _header xstate_hdr;
         struct _ymmh_state ymmh;
         /* new processor state extensions go here */
  };
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile

index 01663ee5f1b76bb8efd3237602cce59365ed8a8c..0f15af41bd80b764c80f90f0153a96e136a2106e 100644 (file)
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -41,7 +41,7 @@ obj-y                 += pci-iommu_table.o
  obj-y                  += resource.o
  
  obj-y                          += process.o
-obj-y                          += i387.o xsave.o
+obj-y                          += fpu/
  obj-y                          += ptrace.o
  obj-$(CONFIG_X86_32)           += tls.o
  obj-$(CONFIG_IA32_EMULATION)   += tls.o
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c

index b0932c4341b3fd467ec10764d47e88b50d3d1d35..c42827eb86cf0c52c36389d0c26dc937776b03bf 100644 (file)
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -21,6 +21,10 @@
  #include <asm/io.h>
  #include <asm/fixmap.h>
  
+int __read_mostly alternatives_patched;
+
+EXPORT_SYMBOL_GPL(alternatives_patched);
+
  #define MAX_PATCH_LEN (255-1)
  
  static int __initdata_or_module debug_alternative;
@@ -636,6 +640,7 @@ void __init alternative_instructions(void)
         apply_paravirt(__parainstructions, __parainstructions_end);
  
         restart_nmi();
+       alternatives_patched = 1;
  }
  
  /**
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c

index 5caed1dd7ccf89e6595fc4a7db7d1d5b7f92f0a6..29fa475ec51823e61a9e94f34142e37341a1208b 100644 (file)
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -89,9 +89,7 @@ int amd_cache_northbridges(void)
                         next_northbridge(link, amd_nb_link_ids);
         }
  
-       /* GART present only on Fam15h upto model 0fh */
-       if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
-           (boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model < 0x10))
+       if (amd_gart_present())
                 amd_northbridges.flags |= AMD_NB_GART;
  
         /*
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c

index 76164e173a24f7251e3e684e66aa8afd7792f1e1..6e85f713641dda77bd2d9dfeb99fa10cdbe0cb0d 100644 (file)
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -262,6 +262,9 @@ void __init early_gart_iommu_check(void)
         u64 aper_base = 0, last_aper_base = 0;
         int aper_enabled = 0, last_aper_enabled = 0, last_valid = 0;
  
+       if (!amd_gart_present())
+               return;
+
         if (!early_pci_allowed())
                 return;
  
@@ -355,6 +358,9 @@ int __init gart_iommu_hole_init(void)
         int fix, slot, valid_agp = 0;
         int i, node;
  
+       if (!amd_gart_present())
+               return -ENODEV;
+
         if (gart_iommu_aperture_disabled || !fix_aperture ||
             !early_pci_allowed())
                 return -ENODEV;
@@ -452,7 +458,7 @@ out:
                    force_iommu ||
                    valid_agp ||
                    fallback_aper_force) {
-               pr_info("Your BIOS doesn't leave a aperture memory hole\n");
+               pr_info("Your BIOS doesn't leave an aperture memory hole\n");
                 pr_info("Please enable the IOMMU option in the BIOS setup\n");
                 pr_info("This costs you %dMB of RAM\n",
                         32 << fallback_aper_order);
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c

index eb4f01269b5d831e0323ec629d43065e146c7fbc..dd3a4baffe50cca6595a57755e17c7d284ee999c 100644 (file)
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -19,6 +19,13 @@
  
  #include "cpu.h"
  
+/*
+ * nodes_per_socket: Stores the number of nodes per socket.
+ * Refer to Fam15h Models 00-0fh BKDG - CPUID Fn8000_001E_ECX
+ * Node Identifiers[10:8]
+ */
+static u32 nodes_per_socket = 1;
+
  static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
  {
         u32 gprs[8] = { 0 };
@@ -291,7 +298,7 @@ static int nearby_node(int apicid)
  #ifdef CONFIG_SMP
  static void amd_get_topology(struct cpuinfo_x86 *c)
  {
-       u32 nodes, cores_per_cu = 1;
+       u32 cores_per_cu = 1;
         u8 node_id;
         int cpu = smp_processor_id();
  
@@ -300,7 +307,7 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
                 u32 eax, ebx, ecx, edx;
  
                 cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
-               nodes = ((ecx >> 8) & 7) + 1;
+               nodes_per_socket = ((ecx >> 8) & 7) + 1;
                 node_id = ecx & 7;
  
                 /* get compute unit information */
@@ -311,18 +318,18 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
                 u64 value;
  
                 rdmsrl(MSR_FAM10H_NODE_ID, value);
-               nodes = ((value >> 3) & 7) + 1;
+               nodes_per_socket = ((value >> 3) & 7) + 1;
                 node_id = value & 7;
         } else
                 return;
  
         /* fixup multi-node processor information */
-       if (nodes > 1) {
+       if (nodes_per_socket > 1) {
                 u32 cores_per_node;
                 u32 cus_per_node;
  
                 set_cpu_cap(c, X86_FEATURE_AMD_DCM);
-               cores_per_node = c->x86_max_cores / nodes;
+               cores_per_node = c->x86_max_cores / nodes_per_socket;
                 cus_per_node = cores_per_node / cores_per_cu;
  
                 /* store NodeID, use llc_shared_map to store sibling info */
@@ -366,6 +373,12 @@ u16 amd_get_nb_id(int cpu)
  }
  EXPORT_SYMBOL_GPL(amd_get_nb_id);
  
+u32 amd_get_nodes_per_socket(void)
+{
+       return nodes_per_socket;
+}
+EXPORT_SYMBOL_GPL(amd_get_nodes_per_socket);
+
  static void srat_detect_node(struct cpuinfo_x86 *c)
  {
  #ifdef CONFIG_NUMA
@@ -520,8 +533,16 @@ static void early_init_amd(struct cpuinfo_x86 *c)
                         set_cpu_cap(c, X86_FEATURE_K6_MTRR);
  #endif
  #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI)
-       /* check CPU config space for extended APIC ID */
-       if (cpu_has_apic && c->x86 >= 0xf) {
+       /*
+        * ApicID can always be treated as an 8-bit value for AMD APIC versions
+        * >= 0x10, but even old K8s came out of reset with version 0x10. So, we
+        * can safely set X86_FEATURE_EXTD_APICID unconditionally for families
+        * after 16h.
+        */
+       if (cpu_has_apic && c->x86 > 0x16) {
+               set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
+       } else if (cpu_has_apic && c->x86 >= 0xf) {
+               /* check CPU config space for extended APIC ID */
                 unsigned int val;
                 val = read_pci_config(0, 24, 0, 0x68);
                 if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18)))
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c

index 03445346ee0aae247f31ebf2aef6a48be0dfce8a..bd17db15a2c1ef07412c73f064f282a3f55c9313 100644 (file)
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -12,57 +12,11 @@
  #include <asm/bugs.h>
  #include <asm/processor.h>
  #include <asm/processor-flags.h>
-#include <asm/i387.h>
+#include <asm/fpu/internal.h>
  #include <asm/msr.h>
  #include <asm/paravirt.h>
  #include <asm/alternative.h>
  
-static double __initdata x = 4195835.0;
-static double __initdata y = 3145727.0;
-
-/*
- * This used to check for exceptions..
- * However, it turns out that to support that,
- * the XMM trap handlers basically had to
- * be buggy. So let's have a correct XMM trap
- * handler, and forget about printing out
- * some status at boot.
- *
- * We should really only care about bugs here
- * anyway. Not features.
- */
-static void __init check_fpu(void)
-{
-       s32 fdiv_bug;
-
-       kernel_fpu_begin();
-
-       /*
-        * trap_init() enabled FXSR and company _before_ testing for FP
-        * problems here.
-        *
-        * Test for the divl bug: http://en.wikipedia.org/wiki/Fdiv_bug
-        */
-       __asm__("fninit\n\t"
-               "fldl %1\n\t"
-               "fdivl %2\n\t"
-               "fmull %2\n\t"
-               "fldl %1\n\t"
-               "fsubp %%st,%%st(1)\n\t"
-               "fistpl %0\n\t"
-               "fwait\n\t"
-               "fninit"
-               : "=m" (*&fdiv_bug)
-               : "m" (*&x), "m" (*&y));
-
-       kernel_fpu_end();
-
-       if (fdiv_bug) {
-               set_cpu_bug(&boot_cpu_data, X86_BUG_FDIV);
-               pr_warn("Hmm, FPU with FDIV bug\n");
-       }
-}
-
  void __init check_bugs(void)
  {
         identify_boot_cpu();
@@ -85,10 +39,5 @@ void __init check_bugs(void)
                 '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
         alternative_instructions();
  
-       /*
-        * kernel_fpu_begin/end() in check_fpu() relies on the patched
-        * alternative instructions.
-        */
-       if (cpu_has_fpu)
-               check_fpu();
+       fpu__init_check_bugs();
  }
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c

index cc7f753e571db91de6efbdd873348db0a64fd5e5..9fc5e3d9d9c8390f4c9bb177449979061f20bb14 100644 (file)
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -5,6 +5,7 @@
  #include <linux/module.h>
  #include <linux/percpu.h>
  #include <linux/string.h>
+#include <linux/ctype.h>
  #include <linux/delay.h>
  #include <linux/sched.h>
  #include <linux/init.h>
@@ -31,8 +32,7 @@
  #include <asm/setup.h>
  #include <asm/apic.h>
  #include <asm/desc.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
  #include <asm/mtrr.h>
  #include <linux/numa.h>
  #include <asm/asm.h>
@@ -145,32 +145,21 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
  } };
  EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
  
-static int __init x86_xsave_setup(char *s)
+static int __init x86_mpx_setup(char *s)
  {
+       /* require an exact match without trailing characters */
         if (strlen(s))
                 return 0;
-       setup_clear_cpu_cap(X86_FEATURE_XSAVE);
-       setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
-       setup_clear_cpu_cap(X86_FEATURE_XSAVES);
-       setup_clear_cpu_cap(X86_FEATURE_AVX);
-       setup_clear_cpu_cap(X86_FEATURE_AVX2);
-       return 1;
-}
-__setup("noxsave", x86_xsave_setup);
  
-static int __init x86_xsaveopt_setup(char *s)
-{
-       setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
-       return 1;
-}
-__setup("noxsaveopt", x86_xsaveopt_setup);
+       /* do not emit a message if the feature is not present */
+       if (!boot_cpu_has(X86_FEATURE_MPX))
+               return 1;
  
-static int __init x86_xsaves_setup(char *s)
-{
-       setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+       setup_clear_cpu_cap(X86_FEATURE_MPX);
+       pr_info("nompx: Intel Memory Protection Extensions (MPX) disabled\n");
         return 1;
  }
-__setup("noxsaves", x86_xsaves_setup);
+__setup("nompx", x86_mpx_setup);
  
  #ifdef CONFIG_X86_32
  static int cachesize_override = -1;
@@ -183,14 +172,6 @@ static int __init cachesize_setup(char *str)
  }
  __setup("cachesize=", cachesize_setup);
  
-static int __init x86_fxsr_setup(char *s)
-{
-       setup_clear_cpu_cap(X86_FEATURE_FXSR);
-       setup_clear_cpu_cap(X86_FEATURE_XMM);
-       return 1;
-}
-__setup("nofxsr", x86_fxsr_setup);
-
  static int __init x86_sep_setup(char *s)
  {
         setup_clear_cpu_cap(X86_FEATURE_SEP);
@@ -419,7 +400,7 @@ static const struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
  static void get_model_name(struct cpuinfo_x86 *c)
  {
         unsigned int *v;
-       char *p, *q;
+       char *p, *q, *s;
  
         if (c->extended_cpuid_level < 0x80000004)
                 return;
@@ -430,19 +411,21 @@ static void get_model_name(struct cpuinfo_x86 *c)
         cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
         c->x86_model_id[48] = 0;
  
-       /*
-        * Intel chips right-justify this string for some dumb reason;
-        * undo that brain damage:
-        */
-       p = q = &c->x86_model_id[0];
+       /* Trim whitespace */
+       p = q = s = &c->x86_model_id[0];
+
         while (*p == ' ')
                 p++;
-       if (p != q) {
-               while (*p)
-                       *q++ = *p++;
-               while (q <= &c->x86_model_id[48])
-                       *q++ = '\0';    /* Zero-pad the rest */
+
+       while (*p) {
+               /* Note the last non-whitespace index */
+               if (!isspace(*p))
+                       s = q;
+
+               *q++ = *p++;
         }
+
+       *(s + 1) = '\0';
  }
  
  void cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
@@ -759,7 +742,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
         cpu_detect(c);
         get_cpu_vendor(c);
         get_cpu_cap(c);
-       fpu_detect(c);
+       fpu__init_system(c);
  
         if (this_cpu->c_early_init)
                 this_cpu->c_early_init(c);
@@ -1122,7 +1105,7 @@ void print_cpu_info(struct cpuinfo_x86 *c)
                 printk(KERN_CONT "%s ", vendor);
  
         if (c->x86_model_id[0])
-               printk(KERN_CONT "%s", strim(c->x86_model_id));
+               printk(KERN_CONT "%s", c->x86_model_id);
         else
                 printk(KERN_CONT "%d86", c->x86);
  
@@ -1179,8 +1162,6 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
  DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
  EXPORT_PER_CPU_SYMBOL(__preempt_count);
  
-DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
-
  /*
   * Special IST stacks which the CPU switches to when it calls
   * an IST-marked descriptor entry. Up to 7 stacks (hardware
@@ -1271,7 +1252,6 @@ DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
  EXPORT_PER_CPU_SYMBOL(current_task);
  DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
  EXPORT_PER_CPU_SYMBOL(__preempt_count);
-DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
  
  /*
   * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find
@@ -1435,7 +1415,7 @@ void cpu_init(void)
         clear_all_debug_regs();
         dbg_restore_debug_regs();
  
-       fpu_init();
+       fpu__init_cpu();
  
         if (is_uv_system())
                 uv_cpu_init();
@@ -1491,7 +1471,7 @@ void cpu_init(void)
         clear_all_debug_regs();
         dbg_restore_debug_regs();
  
-       fpu_init();
+       fpu__init_cpu();
  }
  #endif
  
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c

index 5b974c97e31e77766ee7fbf7bbf524c5e82d915d..df919ff103c3ae845e727388765ebc1842df0cbc 100644 (file)
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -53,9 +53,12 @@
  static DEFINE_MUTEX(mce_chrdev_read_mutex);
  
  #define rcu_dereference_check_mce(p) \
-       rcu_dereference_index_check((p), \
-                             rcu_read_lock_sched_held() || \
-                             lockdep_is_held(&mce_chrdev_read_mutex))
+({ \
+       rcu_lockdep_assert(rcu_read_lock_sched_held() || \
+                          lockdep_is_held(&mce_chrdev_read_mutex), \
+                          "suspicious rcu_dereference_check_mce() usage"); \
+       smp_load_acquire(&(p)); \
+})
  
  #define CREATE_TRACE_POINTS
  #include <trace/events/mce.h>
@@ -1913,7 +1916,7 @@ out:
  static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
  {
         poll_wait(file, &mce_chrdev_wait, wait);
-       if (rcu_access_index(mcelog.next))
+       if (READ_ONCE(mcelog.next))
                 return POLLIN | POLLRDNORM;
         if (!mce_apei_read_done && apei_check_mce())
                 return POLLIN | POLLRDNORM;
@@ -1958,8 +1961,8 @@ void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
  }
  EXPORT_SYMBOL_GPL(register_mce_write_callback);
  
-ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
-                        size_t usize, loff_t *off)
+static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
+                               size_t usize, loff_t *off)
  {
         if (mce_write)
                 return mce_write(filp, ubuf, usize, off);
diff --git a/arch/x86/kernel/cpu/microcode/amd_early.c b/arch/x86/kernel/cpu/microcode/amd_early.c

index 737737edbd1ef5bbc478cf2b6f898f25c231ea81..e8a215a9a34557542380e4a9e7fcf64a1ed63d91 100644 (file)
--- a/arch/x86/kernel/cpu/microcode/amd_early.c
+++ b/arch/x86/kernel/cpu/microcode/amd_early.c
@@ -228,7 +228,23 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch)
         }
  }
  
-void __init load_ucode_amd_bsp(void)
+static bool __init load_builtin_amd_microcode(struct cpio_data *cp,
+                                             unsigned int family)
+{
+#ifdef CONFIG_X86_64
+       char fw_name[36] = "amd-ucode/microcode_amd.bin";
+
+       if (family >= 0x15)
+               snprintf(fw_name, sizeof(fw_name),
+                        "amd-ucode/microcode_amd_fam%.2xh.bin", family);
+
+       return get_builtin_firmware(cp, fw_name);
+#else
+       return false;
+#endif
+}
+
+void __init load_ucode_amd_bsp(unsigned int family)
  {
         struct cpio_data cp;
         void **data;
@@ -243,8 +259,10 @@ void __init load_ucode_amd_bsp(void)
  #endif
  
         cp = find_ucode_in_initrd();
-       if (!cp.data)
-               return;
+       if (!cp.data) {
+               if (!load_builtin_amd_microcode(&cp, family))
+                       return;
+       }
  
         *data = cp.data;
         *size = cp.size;
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c

index 36a83617eb21cc19245794a89c986eba45179d3a..6236a54a63f449ce2ea824be13a3bcca2f57e4b9 100644 (file)
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -1,74 +1,16 @@
  /*
- *     Intel CPU Microcode Update Driver for Linux
+ * CPU Microcode Update Driver for Linux
   *
- *     Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
- *                   2006      Shaohua Li <shaohua.li@intel.com>
+ * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ *           2006      Shaohua Li <shaohua.li@intel.com>
+ *           2013-2015 Borislav Petkov <bp@alien8.de>
   *
- *     This driver allows to upgrade microcode on Intel processors
- *     belonging to IA-32 family - PentiumPro, Pentium II,
- *     Pentium III, Xeon, Pentium 4, etc.
+ * This driver allows to upgrade microcode on x86 processors.
   *
- *     Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
- *     Software Developer's Manual
- *     Order Number 253668 or free download from:
- *
- *     http://developer.intel.com/Assets/PDF/manual/253668.pdf 
- *
- *     For more information, go to http://www.urbanmyth.org/microcode
- *
- *     This program is free software; you can redistribute it and/or
- *     modify it under the terms of the GNU General Public License
- *     as published by the Free Software Foundation; either version
- *     2 of the License, or (at your option) any later version.
- *
- *     1.0     16 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *             Initial release.
- *     1.01    18 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *             Added read() support + cleanups.
- *     1.02    21 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *             Added 'device trimming' support. open(O_WRONLY) zeroes
- *             and frees the saved copy of applied microcode.
- *     1.03    29 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *             Made to use devfs (/dev/cpu/microcode) + cleanups.
- *     1.04    06 Jun 2000, Simon Trimmer <simon@veritas.com>
- *             Added misc device support (now uses both devfs and misc).
- *             Added MICROCODE_IOCFREE ioctl to clear memory.
- *     1.05    09 Jun 2000, Simon Trimmer <simon@veritas.com>
- *             Messages for error cases (non Intel & no suitable microcode).
- *     1.06    03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
- *             Removed ->release(). Removed exclusive open and status bitmap.
- *             Added microcode_rwsem to serialize read()/write()/ioctl().
- *             Removed global kernel lock usage.
- *     1.07    07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
- *             Write 0 to 0x8B msr and then cpuid before reading revision,
- *             so that it works even if there were no update done by the
- *             BIOS. Otherwise, reading from 0x8B gives junk (which happened
- *             to be 0 on my machine which is why it worked even when I
- *             disabled update by the BIOS)
- *             Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
- *     1.08    11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
- *                          Tigran Aivazian <tigran@veritas.com>
- *             Intel Pentium 4 processor support and bugfixes.
- *     1.09    30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
- *             Bugfix for HT (Hyper-Threading) enabled processors
- *             whereby processor resources are shared by all logical processors
- *             in a single CPU package.
- *     1.10    28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
- *             Tigran Aivazian <tigran@veritas.com>,
- *             Serialize updates as required on HT processors due to
- *             speculative nature of implementation.
- *     1.11    22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
- *             Fix the panic when writing zero-length microcode chunk.
- *     1.12    29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
- *             Jun Nakajima <jun.nakajima@intel.com>
- *             Support for the microcode updates in the new format.
- *     1.13    10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
- *             Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
- *             because we no longer hold a copy of applied microcode
- *             in kernel memory.
- *     1.14    25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
- *             Fix sigmatch() macro to handle old CPUs with pf == 0.
- *             Thanks to Stuart Swales for pointing out this bug.
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
   */
  
  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c

index a413a69cbd744f2e2873434ee20b66e86fb466dd..8ebc421d62996ae8d90b4828fd52a4fae198cae1 100644 (file)
--- a/arch/x86/kernel/cpu/microcode/core_early.c
+++ b/arch/x86/kernel/cpu/microcode/core_early.c
@@ -3,6 +3,7 @@
   *
   *     Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
   *                        H Peter Anvin" <hpa@zytor.com>
+ *               (C) 2015 Borislav Petkov <bp@alien8.de>
   *
   *     This driver allows to early upgrade microcode on Intel processors
   *     belonging to IA-32 family - PentiumPro, Pentium II,
@@ -17,6 +18,7 @@
   *     2 of the License, or (at your option) any later version.
   */
  #include <linux/module.h>
+#include <linux/firmware.h>
  #include <asm/microcode.h>
  #include <asm/microcode_intel.h>
  #include <asm/microcode_amd.h>
@@ -43,9 +45,29 @@ static bool __init check_loader_disabled_bsp(void)
         return *res;
  }
  
+extern struct builtin_fw __start_builtin_fw[];
+extern struct builtin_fw __end_builtin_fw[];
+
+bool get_builtin_firmware(struct cpio_data *cd, const char *name)
+{
+#ifdef CONFIG_FW_LOADER
+       struct builtin_fw *b_fw;
+
+       for (b_fw = __start_builtin_fw; b_fw != __end_builtin_fw; b_fw++) {
+               if (!strcmp(name, b_fw->name)) {
+                       cd->size = b_fw->size;
+                       cd->data = b_fw->data;
+                       return true;
+               }
+       }
+#endif
+       return false;
+}
+
  void __init load_ucode_bsp(void)
  {
-       int vendor, family;
+       int vendor;
+       unsigned int family;
  
         if (check_loader_disabled_bsp())
                 return;
@@ -63,7 +85,7 @@ void __init load_ucode_bsp(void)
                 break;
         case X86_VENDOR_AMD:
                 if (family >= 0x10)
-                       load_ucode_amd_bsp();
+                       load_ucode_amd_bsp(family);
                 break;
         default:
                 break;
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c

index a41beadb3db9a396e5b74795e62a49648b367870..969dc17eb1b4b86775d5496bb6ebe9ba67110b4c 100644 (file)
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -1,74 +1,13 @@
  /*
- *     Intel CPU Microcode Update Driver for Linux
+ * Intel CPU Microcode Update Driver for Linux
   *
- *     Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
- *                   2006      Shaohua Li <shaohua.li@intel.com>
+ * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ *              2006 Shaohua Li <shaohua.li@intel.com>
   *
- *     This driver allows to upgrade microcode on Intel processors
- *     belonging to IA-32 family - PentiumPro, Pentium II,
- *     Pentium III, Xeon, Pentium 4, etc.
- *
- *     Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
- *     Software Developer's Manual
- *     Order Number 253668 or free download from:
- *
- *     http://developer.intel.com/Assets/PDF/manual/253668.pdf 
- *
- *     For more information, go to http://www.urbanmyth.org/microcode
- *
- *     This program is free software; you can redistribute it and/or
- *     modify it under the terms of the GNU General Public License
- *     as published by the Free Software Foundation; either version
- *     2 of the License, or (at your option) any later version.
- *
- *     1.0     16 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *             Initial release.
- *     1.01    18 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *             Added read() support + cleanups.
- *     1.02    21 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *             Added 'device trimming' support. open(O_WRONLY) zeroes
- *             and frees the saved copy of applied microcode.
- *     1.03    29 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *             Made to use devfs (/dev/cpu/microcode) + cleanups.
- *     1.04    06 Jun 2000, Simon Trimmer <simon@veritas.com>
- *             Added misc device support (now uses both devfs and misc).
- *             Added MICROCODE_IOCFREE ioctl to clear memory.
- *     1.05    09 Jun 2000, Simon Trimmer <simon@veritas.com>
- *             Messages for error cases (non Intel & no suitable microcode).
- *     1.06    03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
- *             Removed ->release(). Removed exclusive open and status bitmap.
- *             Added microcode_rwsem to serialize read()/write()/ioctl().
- *             Removed global kernel lock usage.
- *     1.07    07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
- *             Write 0 to 0x8B msr and then cpuid before reading revision,
- *             so that it works even if there were no update done by the
- *             BIOS. Otherwise, reading from 0x8B gives junk (which happened
- *             to be 0 on my machine which is why it worked even when I
- *             disabled update by the BIOS)
- *             Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
- *     1.08    11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
- *                          Tigran Aivazian <tigran@veritas.com>
- *             Intel Pentium 4 processor support and bugfixes.
- *     1.09    30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
- *             Bugfix for HT (Hyper-Threading) enabled processors
- *             whereby processor resources are shared by all logical processors
- *             in a single CPU package.
- *     1.10    28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
- *             Tigran Aivazian <tigran@veritas.com>,
- *             Serialize updates as required on HT processors due to
- *             speculative nature of implementation.
- *     1.11    22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
- *             Fix the panic when writing zero-length microcode chunk.
- *     1.12    29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
- *             Jun Nakajima <jun.nakajima@intel.com>
- *             Support for the microcode updates in the new format.
- *     1.13    10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
- *             Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
- *             because we no longer hold a copy of applied microcode
- *             in kernel memory.
- *     1.14    25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
- *             Fix sigmatch() macro to handle old CPUs with pf == 0.
- *             Thanks to Stuart Swales for pointing out this bug.
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
   */
  
  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -124,7 +63,7 @@ static int get_matching_mc(struct microcode_intel *mc_intel, int cpu)
         cpf = cpu_sig.pf;
         crev = cpu_sig.rev;
  
-       return get_matching_microcode(csig, cpf, crev, mc_intel);
+       return has_newer_microcode(mc_intel, csig, cpf, crev);
  }
  
  static int apply_microcode_intel(int cpu)
@@ -226,7 +165,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
  
                 csig = uci->cpu_sig.sig;
                 cpf = uci->cpu_sig.pf;
-               if (get_matching_microcode(csig, cpf, new_rev, mc)) {
+               if (has_newer_microcode(mc, csig, cpf, new_rev)) {
                         vfree(new_mc);
                         new_rev = mc_header.rev;
                         new_mc  = mc;
diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c

index 2f49ab4ac0ae137d7ab0b851cf4b9e751d58922c..8187b7247d1c3e97b9a7321a1d0e12ccda1e0962 100644 (file)
--- a/arch/x86/kernel/cpu/microcode/intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -59,10 +59,10 @@ load_microcode_early(struct microcode_intel **saved,
                 ucode_ptr = saved[i];
                 mc_hdr    = (struct microcode_header_intel *)ucode_ptr;
  
-               ret = get_matching_microcode(uci->cpu_sig.sig,
-                                            uci->cpu_sig.pf,
-                                            new_rev,
-                                            ucode_ptr);
+               ret = has_newer_microcode(ucode_ptr,
+                                         uci->cpu_sig.sig,
+                                         uci->cpu_sig.pf,
+                                         new_rev);
                 if (!ret)
                         continue;
  
@@ -246,7 +246,7 @@ static unsigned int _save_mc(struct microcode_intel **mc_saved,
                              u8 *ucode_ptr, unsigned int num_saved)
  {
         struct microcode_header_intel *mc_hdr, *mc_saved_hdr;
-       unsigned int sig, pf, new_rev;
+       unsigned int sig, pf;
         int found = 0, i;
  
         mc_hdr = (struct microcode_header_intel *)ucode_ptr;
@@ -255,14 +255,13 @@ static unsigned int _save_mc(struct microcode_intel **mc_saved,
                 mc_saved_hdr = (struct microcode_header_intel *)mc_saved[i];
                 sig          = mc_saved_hdr->sig;
                 pf           = mc_saved_hdr->pf;
-               new_rev      = mc_hdr->rev;
  
-               if (!get_matching_sig(sig, pf, new_rev, ucode_ptr))
+               if (!find_matching_signature(ucode_ptr, sig, pf))
                         continue;
  
                 found = 1;
  
-               if (!revision_is_newer(mc_hdr, new_rev))
+               if (mc_hdr->rev <= mc_saved_hdr->rev)
                         continue;
  
                 /*
@@ -522,6 +521,27 @@ out:
  EXPORT_SYMBOL_GPL(save_mc_for_early);
  #endif
  
+static bool __init load_builtin_intel_microcode(struct cpio_data *cp)
+{
+#ifdef CONFIG_X86_64
+       unsigned int eax = 0x00000001, ebx, ecx = 0, edx;
+       unsigned int family, model, stepping;
+       char name[30];
+
+       native_cpuid(&eax, &ebx, &ecx, &edx);
+
+       family   = __x86_family(eax);
+       model    = x86_model(eax);
+       stepping = eax & 0xf;
+
+       sprintf(name, "intel-ucode/%02x-%02x-%02x", family, model, stepping);
+
+       return get_builtin_firmware(cp, name);
+#else
+       return false;
+#endif
+}
+
  static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin";
  static __init enum ucode_state
  scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
@@ -540,8 +560,10 @@ scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
         cd.size = 0;
  
         cd = find_cpio_data(p, (void *)start, size, &offset);
-       if (!cd.data)
-               return UCODE_ERROR;
+       if (!cd.data) {
+               if (!load_builtin_intel_microcode(&cd))
+                       return UCODE_ERROR;
+       }
  
         return get_matching_model_microcode(0, start, cd.data, cd.size,
                                             mc_saved_data, initrd, uci);
diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c

index cd47a510a3f174233300d8763705b6f200faf9f4..1883d252ff7d60ce7707a8108283144de64ce14d 100644 (file)
--- a/arch/x86/kernel/cpu/microcode/intel_lib.c
+++ b/arch/x86/kernel/cpu/microcode/intel_lib.c
@@ -31,11 +31,18 @@
  #include <asm/processor.h>
  #include <asm/msr.h>
  
-static inline int
-update_match_cpu(unsigned int csig, unsigned int cpf,
-                unsigned int sig, unsigned int pf)
+static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1,
+                                       unsigned int s2, unsigned int p2)
  {
-       return (!sigmatch(sig, csig, pf, cpf)) ? 0 : 1;
+       if (s1 != s2)
+               return false;
+
+       /* Processor flags are either both 0 ... */
+       if (!p1 && !p2)
+               return true;
+
+       /* ... or they intersect. */
+       return p1 & p2;
  }
  
  int microcode_sanity_check(void *mc, int print_err)
@@ -124,27 +131,25 @@ EXPORT_SYMBOL_GPL(microcode_sanity_check);
  /*
   * Returns 1 if update has been found, 0 otherwise.
   */
-int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc)
+int find_matching_signature(void *mc, unsigned int csig, int cpf)
  {
-       struct microcode_header_intel *mc_header = mc;
-       struct extended_sigtable *ext_header;
-       unsigned long total_size = get_totalsize(mc_header);
-       int ext_sigcount, i;
+       struct microcode_header_intel *mc_hdr = mc;
+       struct extended_sigtable *ext_hdr;
         struct extended_signature *ext_sig;
+       int i;
  
-       if (update_match_cpu(csig, cpf, mc_header->sig, mc_header->pf))
+       if (cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf))
                 return 1;
  
         /* Look for ext. headers: */
-       if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
+       if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE)
                 return 0;
  
-       ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
-       ext_sigcount = ext_header->count;
-       ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
+       ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE;
+       ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE;
  
-       for (i = 0; i < ext_sigcount; i++) {
-               if (update_match_cpu(csig, cpf, ext_sig->sig, ext_sig->pf))
+       for (i = 0; i < ext_hdr->count; i++) {
+               if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf))
                         return 1;
                 ext_sig++;
         }
@@ -154,13 +159,13 @@ int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc)
  /*
   * Returns 1 if update has been found, 0 otherwise.
   */
-int get_matching_microcode(unsigned int csig, int cpf, int rev, void *mc)
+int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev)
  {
         struct microcode_header_intel *mc_hdr = mc;
  
-       if (!revision_is_newer(mc_hdr, rev))
+       if (mc_hdr->rev <= new_rev)
                 return 0;
  
-       return get_matching_sig(csig, cpf, rev, mc);
+       return find_matching_signature(mc, csig, cpf);
  }
-EXPORT_SYMBOL_GPL(get_matching_microcode);
+EXPORT_SYMBOL_GPL(has_newer_microcode);
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c

index 87848ebe2bb79a56625908c5a6af1b78055d70c9..5801a14f7524315a7318fe5a0f60509704fdb756 100644 (file)
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -135,6 +135,7 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
  }
  
  static atomic_t active_events;
+static atomic_t pmc_refcount;
  static DEFINE_MUTEX(pmc_reserve_mutex);
  
  #ifdef CONFIG_X86_LOCAL_APIC
@@ -190,6 +191,7 @@ static bool check_hw_exists(void)
         u64 val, val_fail, val_new= ~0;
         int i, reg, reg_fail, ret = 0;
         int bios_fail = 0;
+       int reg_safe = -1;
  
         /*
          * Check to see if the BIOS enabled any of the counters, if so
@@ -204,6 +206,8 @@ static bool check_hw_exists(void)
                         bios_fail = 1;
                         val_fail = val;
                         reg_fail = reg;
+               } else {
+                       reg_safe = i;
                 }
         }
  
@@ -221,12 +225,23 @@ static bool check_hw_exists(void)
                 }
         }
  
+       /*
+        * If all the counters are enabled, the below test will always
+        * fail.  The tools will also become useless in this scenario.
+        * Just fail and disable the hardware counters.
+        */
+
+       if (reg_safe == -1) {
+               reg = reg_safe;
+               goto msr_fail;
+       }
+
         /*
          * Read the current value, change it and read it back to see if it
          * matches, this is needed to detect certain hardware emulators
          * (qemu/kvm) that don't trap on the MSR access and always return 0s.
          */
-       reg = x86_pmu_event_addr(0);
+       reg = x86_pmu_event_addr(reg_safe);
         if (rdmsrl_safe(reg, &val))
                 goto msr_fail;
         val ^= 0xffffUL;
@@ -256,11 +271,8 @@ msr_fail:
  
  static void hw_perf_event_destroy(struct perf_event *event)
  {
-       if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
-               release_pmc_hardware();
-               release_ds_buffers();
-               mutex_unlock(&pmc_reserve_mutex);
-       }
+       x86_release_hardware();
+       atomic_dec(&active_events);
  }
  
  void hw_perf_lbr_event_destroy(struct perf_event *event)
@@ -310,6 +322,35 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
         return x86_pmu_extra_regs(val, event);
  }
  
+int x86_reserve_hardware(void)
+{
+       int err = 0;
+
+       if (!atomic_inc_not_zero(&pmc_refcount)) {
+               mutex_lock(&pmc_reserve_mutex);
+               if (atomic_read(&pmc_refcount) == 0) {
+                       if (!reserve_pmc_hardware())
+                               err = -EBUSY;
+                       else
+                               reserve_ds_buffers();
+               }
+               if (!err)
+                       atomic_inc(&pmc_refcount);
+               mutex_unlock(&pmc_reserve_mutex);
+       }
+
+       return err;
+}
+
+void x86_release_hardware(void)
+{
+       if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) {
+               release_pmc_hardware();
+               release_ds_buffers();
+               mutex_unlock(&pmc_reserve_mutex);
+       }
+}
+
  /*
   * Check if we can create event of a certain type (that no conflicting events
   * are present).
@@ -322,21 +363,34 @@ int x86_add_exclusive(unsigned int what)
                 return 0;
  
         mutex_lock(&pmc_reserve_mutex);
-       for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++)
+       for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) {
                 if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
                         goto out;
+       }
  
         atomic_inc(&x86_pmu.lbr_exclusive[what]);
         ret = 0;
  
  out:
         mutex_unlock(&pmc_reserve_mutex);
+
+       /*
+        * Assuming that all exclusive events will share the PMI handler
+        * (which checks active_events for whether there is work to do),
+        * we can bump active_events counter right here, except for
+        * x86_lbr_exclusive_lbr events that go through x86_pmu_event_init()
+        * path, which already bumps active_events for them.
+        */
+       if (!ret && what != x86_lbr_exclusive_lbr)
+               atomic_inc(&active_events);
+
         return ret;
  }
  
  void x86_del_exclusive(unsigned int what)
  {
         atomic_dec(&x86_pmu.lbr_exclusive[what]);
+       atomic_dec(&active_events);
  }
  
  int x86_setup_perfctr(struct perf_event *event)
@@ -513,22 +567,11 @@ static int __x86_pmu_event_init(struct perf_event *event)
         if (!x86_pmu_initialized())
                 return -ENODEV;
  
-       err = 0;
-       if (!atomic_inc_not_zero(&active_events)) {
-               mutex_lock(&pmc_reserve_mutex);
-               if (atomic_read(&active_events) == 0) {
-                       if (!reserve_pmc_hardware())
-                               err = -EBUSY;
-                       else
-                               reserve_ds_buffers();
-               }
-               if (!err)
-                       atomic_inc(&active_events);
-               mutex_unlock(&pmc_reserve_mutex);
-       }
+       err = x86_reserve_hardware();
         if (err)
                 return err;
  
+       atomic_inc(&active_events);
         event->destroy = hw_perf_event_destroy;
  
         event->hw.idx = -1;
@@ -611,6 +654,7 @@ struct sched_state {
         int     event;          /* event index */
         int     counter;        /* counter index */
         int     unassigned;     /* number of events to be assigned left */
+       int     nr_gp;          /* number of GP counters used */
         unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
  };
  
@@ -620,27 +664,29 @@ struct sched_state {
  struct perf_sched {
         int                     max_weight;
         int                     max_events;
-       struct perf_event       **events;
-       struct sched_state      state;
+       int                     max_gp;
         int                     saved_states;
+       struct event_constraint **constraints;
+       struct sched_state      state;
         struct sched_state      saved[SCHED_STATES_MAX];
  };
  
  /*
   * Initialize interator that runs through all events and counters.
   */
-static void perf_sched_init(struct perf_sched *sched, struct perf_event **events,
-                           int num, int wmin, int wmax)
+static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
+                           int num, int wmin, int wmax, int gpmax)
  {
         int idx;
  
         memset(sched, 0, sizeof(*sched));
         sched->max_events       = num;
         sched->max_weight       = wmax;
-       sched->events           = events;
+       sched->max_gp           = gpmax;
+       sched->constraints      = constraints;
  
         for (idx = 0; idx < num; idx++) {
-               if (events[idx]->hw.constraint->weight == wmin)
+               if (constraints[idx]->weight == wmin)
                         break;
         }
  
@@ -687,7 +733,7 @@ static bool __perf_sched_find_counter(struct perf_sched *sched)
         if (sched->state.event >= sched->max_events)
                 return false;
  
-       c = sched->events[sched->state.event]->hw.constraint;
+       c = sched->constraints[sched->state.event];
         /* Prefer fixed purpose counters */
         if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
                 idx = INTEL_PMC_IDX_FIXED;
@@ -696,11 +742,16 @@ static bool __perf_sched_find_counter(struct perf_sched *sched)
                                 goto done;
                 }
         }
+
         /* Grab the first unused counter starting with idx */
         idx = sched->state.counter;
         for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
-               if (!__test_and_set_bit(idx, sched->state.used))
+               if (!__test_and_set_bit(idx, sched->state.used)) {
+                       if (sched->state.nr_gp++ >= sched->max_gp)
+                               return false;
+
                         goto done;
+               }
         }
  
         return false;
@@ -745,7 +796,7 @@ static bool perf_sched_next_event(struct perf_sched *sched)
                         if (sched->state.weight > sched->max_weight)
                                 return false;
                 }
-               c = sched->events[sched->state.event]->hw.constraint;
+               c = sched->constraints[sched->state.event];
         } while (c->weight != sched->state.weight);
  
         sched->state.counter = 0;       /* start with first counter */
@@ -756,12 +807,12 @@ static bool perf_sched_next_event(struct perf_sched *sched)
  /*
   * Assign a counter for each event.
   */
-int perf_assign_events(struct perf_event **events, int n,
-                       int wmin, int wmax, int *assign)
+int perf_assign_events(struct event_constraint **constraints, int n,
+                       int wmin, int wmax, int gpmax, int *assign)
  {
         struct perf_sched sched;
  
-       perf_sched_init(&sched, events, n, wmin, wmax);
+       perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);
  
         do {
                 if (!perf_sched_find_counter(&sched))
@@ -788,9 +839,9 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
                 x86_pmu.start_scheduling(cpuc);
  
         for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
-               hwc = &cpuc->event_list[i]->hw;
+               cpuc->event_constraint[i] = NULL;
                 c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]);
-               hwc->constraint = c;
+               cpuc->event_constraint[i] = c;
  
                 wmin = min(wmin, c->weight);
                 wmax = max(wmax, c->weight);
@@ -801,7 +852,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
          */
         for (i = 0; i < n; i++) {
                 hwc = &cpuc->event_list[i]->hw;
-               c = hwc->constraint;
+               c = cpuc->event_constraint[i];
  
                 /* never assigned */
                 if (hwc->idx == -1)
@@ -821,9 +872,26 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
         }
  
         /* slow path */
-       if (i != n)
-               unsched = perf_assign_events(cpuc->event_list, n, wmin,
-                                            wmax, assign);
+       if (i != n) {
+               int gpmax = x86_pmu.num_counters;
+
+               /*
+                * Do not allow scheduling of more than half the available
+                * generic counters.
+                *
+                * This helps avoid counter starvation of sibling thread by
+                * ensuring at most half the counters cannot be in exclusive
+                * mode. There is no designated counters for the limits. Any
+                * N/2 counters can be used. This helps with events with
+                * specific counter constraints.
+                */
+               if (is_ht_workaround_enabled() && !cpuc->is_fake &&
+                   READ_ONCE(cpuc->excl_cntrs->exclusive_present))
+                       gpmax /= 2;
+
+               unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
+                                            wmax, gpmax, assign);
+       }
  
         /*
          * In case of success (unsched = 0), mark events as committed,
@@ -840,12 +908,9 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
                         e = cpuc->event_list[i];
                         e->hw.flags |= PERF_X86_EVENT_COMMITTED;
                         if (x86_pmu.commit_scheduling)
-                               x86_pmu.commit_scheduling(cpuc, e, assign[i]);
+                               x86_pmu.commit_scheduling(cpuc, i, assign[i]);
                 }
-       }
-
-       if (!assign || unsched) {
-
+       } else {
                 for (i = 0; i < n; i++) {
                         e = cpuc->event_list[i];
                         /*
@@ -1058,13 +1123,16 @@ int x86_perf_event_set_period(struct perf_event *event)
  
         per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
  
-       /*
-        * The hw event starts counting from this event offset,
-        * mark it to be able to extra future deltas:
-        */
-       local64_set(&hwc->prev_count, (u64)-left);
+       if (!(hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) ||
+           local64_read(&hwc->prev_count) != (u64)-left) {
+               /*
+                * The hw event starts counting from this event offset,
+                * mark it to be able to extra future deltas:
+                */
+               local64_set(&hwc->prev_count, (u64)-left);
  
-       wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
+               wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
+       }
  
         /*
          * Due to erratum on certan cpu we need
@@ -1292,8 +1360,10 @@ static void x86_pmu_del(struct perf_event *event, int flags)
                 x86_pmu.put_event_constraints(cpuc, event);
  
         /* Delete the array entry. */
-       while (++i < cpuc->n_events)
+       while (++i < cpuc->n_events) {
                 cpuc->event_list[i-1] = cpuc->event_list[i];
+               cpuc->event_constraint[i-1] = cpuc->event_constraint[i];
+       }
         --cpuc->n_events;
  
         perf_event_update_userpage(event);
@@ -1374,6 +1444,10 @@ perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
         u64 finish_clock;
         int ret;
  
+       /*
+        * All PMUs/events that share this PMI handler should make sure to
+        * increment active_events for their events.
+        */
         if (!atomic_read(&active_events))
                 return NMI_DONE;
  
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h

index 6ac5cb7a9e14839dcd0b622a91f0f0939133c81e..3e7fd27dfe201718860185be3fe5eeb3c028ed5c 100644 (file)
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -74,6 +74,9 @@ struct event_constraint {
  #define PERF_X86_EVENT_EXCL            0x0040 /* HT exclusivity on counter */
  #define PERF_X86_EVENT_DYNAMIC         0x0080 /* dynamic alloc'd constraint */
  #define PERF_X86_EVENT_RDPMC_ALLOWED   0x0100 /* grant rdpmc permission */
+#define PERF_X86_EVENT_EXCL_ACCT       0x0200 /* accounted EXCL event */
+#define PERF_X86_EVENT_AUTO_RELOAD     0x0400 /* use PEBS auto-reload */
+#define PERF_X86_EVENT_FREERUNNING     0x0800 /* use freerunning PEBS */
  
  
  struct amd_nb {
@@ -86,6 +89,18 @@ struct amd_nb {
  /* The maximal number of PEBS events: */
  #define MAX_PEBS_EVENTS                8
  
+/*
+ * Flags PEBS can handle without a PMI.
+ *
+ * TID can only be handled by flushing at context switch.
+ *
+ */
+#define PEBS_FREERUNNING_FLAGS \
+       (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
+       PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \
+       PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
+       PERF_SAMPLE_TRANSACTION)
+
  /*
   * A debug store configuration.
   *
@@ -132,10 +147,7 @@ enum intel_excl_state_type {
  };
  
  struct intel_excl_states {
-       enum intel_excl_state_type init_state[X86_PMC_IDX_MAX];
         enum intel_excl_state_type state[X86_PMC_IDX_MAX];
-       int  num_alloc_cntrs;/* #counters allocated */
-       int  max_alloc_cntrs;/* max #counters allowed */
         bool sched_started; /* true if scheduling has started */
  };
  
@@ -144,6 +156,11 @@ struct intel_excl_cntrs {
  
         struct intel_excl_states states[2];
  
+       union {
+               u16     has_exclusive[2];
+               u32     exclusive_present;
+       };
+
         int             refcnt;         /* per-core: #HT threads */
         unsigned        core_id;        /* per-core: core id */
  };
@@ -172,7 +189,11 @@ struct cpu_hw_events {
                                              added in the current transaction */
         int                     assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
         u64                     tags[X86_PMC_IDX_MAX];
+
         struct perf_event       *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
+       struct event_constraint *event_constraint[X86_PMC_IDX_MAX];
+
+       int                     n_excl; /* the number of exclusive events */
  
         unsigned int            group_flag;
         int                     is_fake;
@@ -519,12 +540,10 @@ struct x86_pmu {
         void            (*put_event_constraints)(struct cpu_hw_events *cpuc,
                                                  struct perf_event *event);
  
-       void            (*commit_scheduling)(struct cpu_hw_events *cpuc,
-                                            struct perf_event *event,
-                                            int cntr);
-
         void            (*start_scheduling)(struct cpu_hw_events *cpuc);
  
+       void            (*commit_scheduling)(struct cpu_hw_events *cpuc, int idx, int cntr);
+
         void            (*stop_scheduling)(struct cpu_hw_events *cpuc);
  
         struct event_constraint *event_constraints;
@@ -697,6 +716,10 @@ int x86_add_exclusive(unsigned int what);
  
  void x86_del_exclusive(unsigned int what);
  
+int x86_reserve_hardware(void);
+
+void x86_release_hardware(void);
+
  void hw_perf_lbr_event_destroy(struct perf_event *event);
  
  int x86_setup_perfctr(struct perf_event *event);
@@ -717,8 +740,8 @@ static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
  
  void x86_pmu_enable_all(int added);
  
-int perf_assign_events(struct perf_event **events, int n,
-                       int wmin, int wmax, int *assign);
+int perf_assign_events(struct event_constraint **constraints, int n,
+                       int wmin, int wmax, int gpmax, int *assign);
  int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign);
  
  void x86_pmu_stop(struct perf_event *event, int flags);
@@ -860,6 +883,8 @@ void intel_pmu_pebs_enable_all(void);
  
  void intel_pmu_pebs_disable_all(void);
  
+void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in);
+
  void intel_ds_init(void);
  
  void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
@@ -929,4 +954,8 @@ static inline struct intel_shared_regs *allocate_shared_regs(int cpu)
         return NULL;
  }
  
+/*
+ * Stub variant: always reports the HT workaround as disabled.
+ * NOTE(review): appears to belong to the !CONFIG_CPU_SUP_INTEL branch
+ * closed by the #endif right below - confirm.
+ */
+static inline int is_ht_workaround_enabled(void)
+{
+       return 0;
+}
  #endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c

index 3998131d1a683058d6382b527c187028a7fede38..b9826a981fb20fa45a7c1255e277e9ad1cd5d150 100644 (file)
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1903,9 +1903,8 @@ static void
  intel_start_scheduling(struct cpu_hw_events *cpuc)
  {
         struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
-       struct intel_excl_states *xl, *xlo;
+       struct intel_excl_states *xl;
         int tid = cpuc->excl_thread_id;
-       int o_tid = 1 - tid; /* sibling thread */
  
         /*
          * nothing needed if in group validation mode
@@ -1916,35 +1915,52 @@ intel_start_scheduling(struct cpu_hw_events *cpuc)
         /*
          * no exclusion needed
          */
-       if (!excl_cntrs)
+       if (WARN_ON_ONCE(!excl_cntrs))
                 return;
  
-       xlo = &excl_cntrs->states[o_tid];
         xl = &excl_cntrs->states[tid];
  
         xl->sched_started = true;
-       xl->num_alloc_cntrs = 0;
         /*
          * lock shared state until we are done scheduling
          * in stop_event_scheduling()
          * makes scheduling appear as a transaction
          */
-       WARN_ON_ONCE(!irqs_disabled());
         raw_spin_lock(&excl_cntrs->lock);
+}
  
-       /*
-        * save initial state of sibling thread
-        */
-       memcpy(xlo->init_state, xlo->state, sizeof(xlo->init_state));
+/*
+ * Record, in this HT thread's exclusion state, how counter @cntr is used
+ * by the event at index @idx of cpuc->event_constraint[]: EXCLUSIVE when
+ * the constraint carries PERF_X86_EVENT_EXCL, SHARED otherwise.
+ *
+ * Does nothing for a fake (group validation) cpuc, when the HT workaround
+ * is not enabled, or when the constraint is not marked dynamic. Expects
+ * excl_cntrs->lock to be held (asserted through lockdep).
+ */
+static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr)
+{
+       struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+       struct event_constraint *c = cpuc->event_constraint[idx];
+       struct intel_excl_states *xl;
+       int tid = cpuc->excl_thread_id;
+
+       if (cpuc->is_fake || !is_ht_workaround_enabled())
+               return;
+
+       if (WARN_ON_ONCE(!excl_cntrs))
+               return;
+
+       /* only constraints flagged PERF_X86_EVENT_DYNAMIC update the state */
+       if (!(c->flags & PERF_X86_EVENT_DYNAMIC))
+               return;
+
+       /* state of the current HT thread (indexed by our own thread id) */
+       xl = &excl_cntrs->states[tid];
+
+       lockdep_assert_held(&excl_cntrs->lock);
+
+       if (c->flags & PERF_X86_EVENT_EXCL)
+               xl->state[cntr] = INTEL_EXCL_EXCLUSIVE;
+       else
+               xl->state[cntr] = INTEL_EXCL_SHARED;
  }
  
  static void
  intel_stop_scheduling(struct cpu_hw_events *cpuc)
  {
         struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
-       struct intel_excl_states *xl, *xlo;
+       struct intel_excl_states *xl;
         int tid = cpuc->excl_thread_id;
-       int o_tid = 1 - tid; /* sibling thread */
  
         /*
          * nothing needed if in group validation mode
@@ -1954,17 +1970,11 @@ intel_stop_scheduling(struct cpu_hw_events *cpuc)
         /*
          * no exclusion needed
          */
-       if (!excl_cntrs)
+       if (WARN_ON_ONCE(!excl_cntrs))
                 return;
  
-       xlo = &excl_cntrs->states[o_tid];
         xl = &excl_cntrs->states[tid];
  
-       /*
-        * make new sibling thread state visible
-        */
-       memcpy(xlo->state, xlo->init_state, sizeof(xlo->state));
-
         xl->sched_started = false;
         /*
          * release shared state lock (acquired in intel_start_scheduling())
@@ -1976,12 +1986,10 @@ static struct event_constraint *
  intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
                            int idx, struct event_constraint *c)
  {
-       struct event_constraint *cx;
         struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
-       struct intel_excl_states *xl, *xlo;
-       int is_excl, i;
+       struct intel_excl_states *xlo;
         int tid = cpuc->excl_thread_id;
-       int o_tid = 1 - tid; /* alternate */
+       int is_excl, i;
  
         /*
          * validating a group does not require
@@ -1993,34 +2001,8 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
         /*
          * no exclusion needed
          */
-       if (!excl_cntrs)
+       if (WARN_ON_ONCE(!excl_cntrs))
                 return c;
-       /*
-        * event requires exclusive counter access
-        * across HT threads
-        */
-       is_excl = c->flags & PERF_X86_EVENT_EXCL;
-
-       /*
-        * xl = state of current HT
-        * xlo = state of sibling HT
-        */
-       xl = &excl_cntrs->states[tid];
-       xlo = &excl_cntrs->states[o_tid];
-
-       /*
-        * do not allow scheduling of more than max_alloc_cntrs
-        * which is set to half the available generic counters.
-        * this helps avoid counter starvation of sibling thread
-        * by ensuring at most half the counters cannot be in
-        * exclusive mode. There is not designated counters for the
-        * limits. Any N/2 counters can be used. This helps with
-        * events with specifix counter constraints
-        */
-       if (xl->num_alloc_cntrs++ == xl->max_alloc_cntrs)
-               return &emptyconstraint;
-
-       cx = c;
  
         /*
          * because we modify the constraint, we need
@@ -2031,10 +2013,7 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
          * been cloned (marked dynamic)
          */
         if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) {
-
-               /* sanity check */
-               if (idx < 0)
-                       return &emptyconstraint;
+               struct event_constraint *cx;
  
                 /*
                  * grab pre-allocated constraint entry
@@ -2045,13 +2024,14 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
                  * initialize dynamic constraint
                  * with static constraint
                  */
-               memcpy(cx, c, sizeof(*cx));
+               *cx = *c;
  
                 /*
                  * mark constraint as dynamic, so we
                  * can free it later on
                  */
                 cx->flags |= PERF_X86_EVENT_DYNAMIC;
+               c = cx;
         }
  
         /*
@@ -2061,6 +2041,22 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
          * of this function
          */
  
+       /*
+        * state of sibling HT
+        */
+       xlo = &excl_cntrs->states[tid ^ 1];
+
+       /*
+        * event requires exclusive counter access
+        * across HT threads
+        */
+       is_excl = c->flags & PERF_X86_EVENT_EXCL;
+       if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) {
+               event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT;
+               if (!cpuc->n_excl++)
+                       WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1);
+       }
+
         /*
          * Modify static constraint with current dynamic
          * state of thread
@@ -2069,44 +2065,44 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
          * SHARED   : sibling counter measuring non-exclusive event
          * UNUSED   : sibling counter unused
          */
-       for_each_set_bit(i, cx->idxmsk, X86_PMC_IDX_MAX) {
+       for_each_set_bit(i, c->idxmsk, X86_PMC_IDX_MAX) {
                 /*
                  * exclusive event in sibling counter
                  * our corresponding counter cannot be used
                  * regardless of our event
                  */
-               if (xl->state[i] == INTEL_EXCL_EXCLUSIVE)
-                       __clear_bit(i, cx->idxmsk);
+               if (xlo->state[i] == INTEL_EXCL_EXCLUSIVE)
+                       __clear_bit(i, c->idxmsk);
                 /*
                  * if measuring an exclusive event, sibling
                  * measuring non-exclusive, then counter cannot
                  * be used
                  */
-               if (is_excl && xl->state[i] == INTEL_EXCL_SHARED)
-                       __clear_bit(i, cx->idxmsk);
+               if (is_excl && xlo->state[i] == INTEL_EXCL_SHARED)
+                       __clear_bit(i, c->idxmsk);
         }
  
         /*
          * recompute actual bit weight for scheduling algorithm
          */
-       cx->weight = hweight64(cx->idxmsk64);
+       c->weight = hweight64(c->idxmsk64);
  
         /*
          * if we return an empty mask, then switch
          * back to static empty constraint to avoid
          * the cost of freeing later on
          */
-       if (cx->weight == 0)
-               cx = &emptyconstraint;
+       if (c->weight == 0)
+               c = &emptyconstraint;
  
-       return cx;
+       return c;
  }
  
  static struct event_constraint *
  intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
                             struct perf_event *event)
  {
-       struct event_constraint *c1 = event->hw.constraint;
+       struct event_constraint *c1 = cpuc->event_constraint[idx];
         struct event_constraint *c2;
  
         /*
@@ -2132,10 +2128,8 @@ static void intel_put_excl_constraints(struct cpu_hw_events *cpuc,
  {
         struct hw_perf_event *hwc = &event->hw;
         struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
-       struct intel_excl_states *xlo, *xl;
-       unsigned long flags = 0; /* keep compiler happy */
         int tid = cpuc->excl_thread_id;
-       int o_tid = 1 - tid;
+       struct intel_excl_states *xl;
  
         /*
          * nothing needed if in group validation mode
@@ -2143,31 +2137,35 @@ static void intel_put_excl_constraints(struct cpu_hw_events *cpuc,
         if (cpuc->is_fake)
                 return;
  
-       WARN_ON_ONCE(!excl_cntrs);
-
-       if (!excl_cntrs)
+       if (WARN_ON_ONCE(!excl_cntrs))
                 return;
  
-       xl = &excl_cntrs->states[tid];
-       xlo = &excl_cntrs->states[o_tid];
+       if (hwc->flags & PERF_X86_EVENT_EXCL_ACCT) {
+               hwc->flags &= ~PERF_X86_EVENT_EXCL_ACCT;
+               if (!--cpuc->n_excl)
+                       WRITE_ONCE(excl_cntrs->has_exclusive[tid], 0);
+       }
  
         /*
-        * put_constraint may be called from x86_schedule_events()
-        * which already has the lock held so here make locking
-        * conditional
+        * If event was actually assigned, then mark the counter state as
+        * unused now.
          */
-       if (!xl->sched_started)
-               raw_spin_lock_irqsave(&excl_cntrs->lock, flags);
+       if (hwc->idx >= 0) {
+               xl = &excl_cntrs->states[tid];
  
-       /*
-        * if event was actually assigned, then mark the
-        * counter state as unused now
-        */
-       if (hwc->idx >= 0)
-               xlo->state[hwc->idx] = INTEL_EXCL_UNUSED;
+               /*
+                * put_constraint may be called from x86_schedule_events()
+                * which already has the lock held so here make locking
+                * conditional.
+                */
+               if (!xl->sched_started)
+                       raw_spin_lock(&excl_cntrs->lock);
  
-       if (!xl->sched_started)
-               raw_spin_unlock_irqrestore(&excl_cntrs->lock, flags);
+               xl->state[hwc->idx] = INTEL_EXCL_UNUSED;
+
+               if (!xl->sched_started)
+                       raw_spin_unlock(&excl_cntrs->lock);
+       }
  }
  
  static void
@@ -2188,8 +2186,6 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
  static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
                                         struct perf_event *event)
  {
-       struct event_constraint *c = event->hw.constraint;
-
         intel_put_shared_regs_event_constraints(cpuc, event);
  
         /*
@@ -2197,48 +2193,8 @@ static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
          * all events are subject to and must call the
          * put_excl_constraints() routine
          */
-       if (c && cpuc->excl_cntrs)
+       if (cpuc->excl_cntrs)
                 intel_put_excl_constraints(cpuc, event);
-
-       /* cleanup dynamic constraint */
-       if (c && (c->flags & PERF_X86_EVENT_DYNAMIC))
-               event->hw.constraint = NULL;
-}
-
-static void intel_commit_scheduling(struct cpu_hw_events *cpuc,
-                                   struct perf_event *event, int cntr)
-{
-       struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
-       struct event_constraint *c = event->hw.constraint;
-       struct intel_excl_states *xlo, *xl;
-       int tid = cpuc->excl_thread_id;
-       int o_tid = 1 - tid;
-       int is_excl;
-
-       if (cpuc->is_fake || !c)
-               return;
-
-       is_excl = c->flags & PERF_X86_EVENT_EXCL;
-
-       if (!(c->flags & PERF_X86_EVENT_DYNAMIC))
-               return;
-
-       WARN_ON_ONCE(!excl_cntrs);
-
-       if (!excl_cntrs)
-               return;
-
-       xl = &excl_cntrs->states[tid];
-       xlo = &excl_cntrs->states[o_tid];
-
-       WARN_ON_ONCE(!raw_spin_is_locked(&excl_cntrs->lock));
-
-       if (cntr >= 0) {
-               if (is_excl)
-                       xlo->init_state[cntr] = INTEL_EXCL_EXCLUSIVE;
-               else
-                       xlo->init_state[cntr] = INTEL_EXCL_SHARED;
-       }
  }
  
  static void intel_pebs_aliases_core2(struct perf_event *event)
@@ -2304,8 +2260,15 @@ static int intel_pmu_hw_config(struct perf_event *event)
         if (ret)
                 return ret;
  
-       if (event->attr.precise_ip && x86_pmu.pebs_aliases)
-               x86_pmu.pebs_aliases(event);
+       if (event->attr.precise_ip) {
+               if (!event->attr.freq) {
+                       event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
+                       if (!(event->attr.sample_type & ~PEBS_FREERUNNING_FLAGS))
+                               event->hw.flags |= PERF_X86_EVENT_FREERUNNING;
+               }
+               if (x86_pmu.pebs_aliases)
+                       x86_pmu.pebs_aliases(event);
+       }
  
         if (needs_branch_stack(event)) {
                 ret = intel_pmu_setup_lbr_filter(event);
@@ -2554,19 +2517,11 @@ struct intel_shared_regs *allocate_shared_regs(int cpu)
  static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu)
  {
         struct intel_excl_cntrs *c;
-       int i;
  
         c = kzalloc_node(sizeof(struct intel_excl_cntrs),
                          GFP_KERNEL, cpu_to_node(cpu));
         if (c) {
                 raw_spin_lock_init(&c->lock);
-               for (i = 0; i < X86_PMC_IDX_MAX; i++) {
-                       c->states[0].state[i] = INTEL_EXCL_UNUSED;
-                       c->states[0].init_state[i] = INTEL_EXCL_UNUSED;
-
-                       c->states[1].state[i] = INTEL_EXCL_UNUSED;
-                       c->states[1].init_state[i] = INTEL_EXCL_UNUSED;
-               }
                 c->core_id = -1;
         }
         return c;
@@ -2621,7 +2576,7 @@ static void intel_pmu_cpu_starting(int cpu)
         if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) {
                 void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED];
  
-               for_each_cpu(i, topology_thread_cpumask(cpu)) {
+               for_each_cpu(i, topology_sibling_cpumask(cpu)) {
                         struct intel_shared_regs *pc;
  
                         pc = per_cpu(cpu_hw_events, i).shared_regs;
@@ -2639,9 +2594,7 @@ static void intel_pmu_cpu_starting(int cpu)
                 cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
  
         if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
-               int h = x86_pmu.num_counters >> 1;
-
-               for_each_cpu(i, topology_thread_cpumask(cpu)) {
+               for_each_cpu(i, topology_sibling_cpumask(cpu)) {
                         struct intel_excl_cntrs *c;
  
                         c = per_cpu(cpu_hw_events, i).excl_cntrs;
@@ -2654,11 +2607,6 @@ static void intel_pmu_cpu_starting(int cpu)
                 }
                 cpuc->excl_cntrs->core_id = core_id;
                 cpuc->excl_cntrs->refcnt++;
-               /*
-                * set hard limit to half the number of generic counters
-                */
-               cpuc->excl_cntrs->states[0].max_alloc_cntrs = h;
-               cpuc->excl_cntrs->states[1].max_alloc_cntrs = h;
         }
  }
  
@@ -2694,6 +2642,15 @@ static void intel_pmu_cpu_dying(int cpu)
         fini_debug_store_on_cpu(cpu);
  }
  
+/*
+ * sched_task callback of the Intel PMU: forwards the context-switch
+ * notification (@ctx, @sched_in) to the PEBS code when
+ * x86_pmu.pebs_active is set and to the LBR code when x86_pmu.lbr_nr
+ * is non-zero.
+ */
+static void intel_pmu_sched_task(struct perf_event_context *ctx,
+                                bool sched_in)
+{
+       if (x86_pmu.pebs_active)
+               intel_pmu_pebs_sched_task(ctx, sched_in);
+       if (x86_pmu.lbr_nr)
+               intel_pmu_lbr_sched_task(ctx, sched_in);
+}
+
  PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
  
  PMU_FORMAT_ATTR(ldlat, "config1:0-15");
@@ -2783,7 +2740,7 @@ static __initconst const struct x86_pmu intel_pmu = {
         .cpu_starting           = intel_pmu_cpu_starting,
         .cpu_dying              = intel_pmu_cpu_dying,
         .guest_get_msrs         = intel_guest_get_msrs,
-       .sched_task             = intel_pmu_lbr_sched_task,
+       .sched_task             = intel_pmu_sched_task,
  };
  
  static __init void intel_clovertown_quirk(void)
@@ -2956,8 +2913,8 @@ static __init void intel_ht_bug(void)
  {
         x86_pmu.flags |= PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED;
  
-       x86_pmu.commit_scheduling = intel_commit_scheduling;
         x86_pmu.start_scheduling = intel_start_scheduling;
+       x86_pmu.commit_scheduling = intel_commit_scheduling;
         x86_pmu.stop_scheduling = intel_stop_scheduling;
  }
  
@@ -3270,6 +3227,8 @@ __init int intel_pmu_init(void)
  
         case 61: /* 14nm Broadwell Core-M */
         case 86: /* 14nm Broadwell Xeon D */
+       case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
+       case 79: /* 14nm Broadwell Server */
                 x86_pmu.late_ack = true;
                 memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
                 memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
@@ -3339,13 +3298,13 @@ __init int intel_pmu_init(void)
                  * counter, so do not extend mask to generic counters
                  */
                 for_each_event_constraint(c, x86_pmu.event_constraints) {
-                       if (c->cmask != FIXED_EVENT_FLAGS
-                           || c->idxmsk64 == INTEL_PMC_MSK_FIXED_REF_CYCLES) {
-                               continue;
+                       if (c->cmask == FIXED_EVENT_FLAGS
+                           && c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) {
+                               c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
                         }
-
-                       c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
-                       c->weight += x86_pmu.num_counters;
+                       /*
+                        * Use a 64-bit constant: shifting ~0UL by a count
+                        * >= BITS_PER_LONG is undefined on 32-bit kernels.
+                        */
+                       c->idxmsk64 &=
+                               ~(~0ULL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed));
+                       c->weight = hweight64(c->idxmsk64);
                 }
         }
  
@@ -3403,7 +3362,7 @@ static __init int fixup_ht_bug(void)
         if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED))
                 return 0;
  
-       w = cpumask_weight(topology_thread_cpumask(cpu));
+       w = cpumask_weight(topology_sibling_cpumask(cpu));
         if (w > 1) {
                 pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n");
                 return 0;
@@ -3413,8 +3372,8 @@ static __init int fixup_ht_bug(void)
  
         x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
  
-       x86_pmu.commit_scheduling = NULL;
         x86_pmu.start_scheduling = NULL;
+       x86_pmu.commit_scheduling = NULL;
         x86_pmu.stop_scheduling = NULL;
  
         watchdog_nmi_enable_all();
diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c

index ac1f0c55f3796e17bdaddc5f946a41509890e446..7795f3f8b1d57198469ded20ac9a1244035428e8 100644 (file)
--- a/arch/x86/kernel/cpu/perf_event_intel_bts.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_bts.c
@@ -483,17 +483,26 @@ static int bts_event_add(struct perf_event *event, int mode)
  
+/*
+ * event->destroy callback installed by bts_event_init(): drops the
+ * hardware reservation and the BTS exclusivity taken there, in the
+ * reverse order of their acquisition.
+ */
  static void bts_event_destroy(struct perf_event *event)
  {
+       x86_release_hardware();
         x86_del_exclusive(x86_lbr_exclusive_bts);
  }
  
  static int bts_event_init(struct perf_event *event)
  {
+       int ret;
+
         if (event->attr.type != bts_pmu.type)
                 return -ENOENT;
  
         if (x86_add_exclusive(x86_lbr_exclusive_bts))
                 return -EBUSY;
  
+       ret = x86_reserve_hardware();
+       if (ret) {
+               x86_del_exclusive(x86_lbr_exclusive_bts);
+               return ret;
+       }
+
         event->destroy = bts_event_destroy;
  
         return 0;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c

index e4d1b8b738fa8a9fc350030c0b41dd75f5309530..188076161c1be51afe135d6289b8ce0e96952bf3 100644 (file)
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -13,16 +13,35 @@
  #define MSR_IA32_QM_CTR                0x0c8e
  #define MSR_IA32_QM_EVTSEL     0x0c8d
  
-static unsigned int cqm_max_rmid = -1;
+static u32 cqm_max_rmid = -1;
  static unsigned int cqm_l3_scale; /* supposedly cacheline size */
  
-struct intel_cqm_state {
-       raw_spinlock_t          lock;
-       int                     rmid;
-       int                     cnt;
+/**
+ * struct intel_pqr_state - State cache for the PQR MSR
+ * @rmid:              The cached Resource Monitoring ID
+ * @closid:            The cached Class Of Service ID
+ * @rmid_usecnt:       The usage counter for rmid
+ *
+ * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the
+ * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always
+ * contains both parts, so we need to cache them.
+ *
+ * The cache also helps to avoid pointless updates if the value does
+ * not change.
+ */
+struct intel_pqr_state {
+       u32                     rmid;
+       u32                     closid;
+       int                     rmid_usecnt;
  };
  
-static DEFINE_PER_CPU(struct intel_cqm_state, cqm_state);
+/*
+ * The cached intel_pqr_state is strictly per CPU and can never be
+ * updated from a remote CPU. Both functions which modify the state
+ * (intel_cqm_event_start and intel_cqm_event_stop) are called with
+ * interrupts disabled, which is sufficient for the protection.
+ */
+static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
  
  /*
   * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
@@ -57,7 +76,7 @@ static cpumask_t cqm_cpumask;
   * near-zero occupancy value, i.e. no cachelines are tagged with this
   * RMID, once __intel_cqm_rmid_rotate() returns.
   */
-static unsigned int intel_cqm_rotation_rmid;
+static u32 intel_cqm_rotation_rmid;
  
  #define INVALID_RMID           (-1)
  
@@ -69,7 +88,7 @@ static unsigned int intel_cqm_rotation_rmid;
   * Likewise, an rmid value of -1 is used to indicate "no rmid currently
   * assigned" and is used as part of the rotation code.
   */
-static inline bool __rmid_valid(unsigned int rmid)
+static inline bool __rmid_valid(u32 rmid)
  {
         if (!rmid || rmid == INVALID_RMID)
                 return false;
@@ -77,7 +96,7 @@ static inline bool __rmid_valid(unsigned int rmid)
         return true;
  }
  
-static u64 __rmid_read(unsigned int rmid)
+static u64 __rmid_read(u32 rmid)
  {
         u64 val;
  
@@ -102,7 +121,7 @@ enum rmid_recycle_state {
  };
  
  struct cqm_rmid_entry {
-       unsigned int rmid;
+       u32 rmid;
         enum rmid_recycle_state state;
         struct list_head list;
         unsigned long queue_time;
@@ -147,7 +166,7 @@ static LIST_HEAD(cqm_rmid_limbo_lru);
   */
  static struct cqm_rmid_entry **cqm_rmid_ptrs;
  
-static inline struct cqm_rmid_entry *__rmid_entry(int rmid)
+static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid)
  {
         struct cqm_rmid_entry *entry;
  
@@ -162,7 +181,7 @@ static inline struct cqm_rmid_entry *__rmid_entry(int rmid)
   *
   * We expect to be called with cache_mutex held.
   */
-static int __get_rmid(void)
+static u32 __get_rmid(void)
  {
         struct cqm_rmid_entry *entry;
  
@@ -177,7 +196,7 @@ static int __get_rmid(void)
         return entry->rmid;
  }
  
-static void __put_rmid(unsigned int rmid)
+static void __put_rmid(u32 rmid)
  {
         struct cqm_rmid_entry *entry;
  
@@ -372,7 +391,7 @@ static bool __conflict_event(struct perf_event *a, struct perf_event *b)
  }
  
  struct rmid_read {
-       unsigned int rmid;
+       u32 rmid;
         atomic64_t value;
  };
  
@@ -381,12 +400,11 @@ static void __intel_cqm_event_count(void *info);
  /*
   * Exchange the RMID of a group of events.
   */
-static unsigned int
-intel_cqm_xchg_rmid(struct perf_event *group, unsigned int rmid)
+static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid)
  {
         struct perf_event *event;
-       unsigned int old_rmid = group->hw.cqm_rmid;
         struct list_head *head = &group->hw.cqm_group_entry;
+       u32 old_rmid = group->hw.cqm_rmid;
  
         lockdep_assert_held(&cache_mutex);
  
@@ -451,7 +469,7 @@ static void intel_cqm_stable(void *arg)
   * If we have group events waiting for an RMID that don't conflict with
   * events already running, assign @rmid.
   */
-static bool intel_cqm_sched_in_event(unsigned int rmid)
+static bool intel_cqm_sched_in_event(u32 rmid)
  {
         struct perf_event *leader, *event;
  
@@ -598,7 +616,7 @@ static bool intel_cqm_rmid_stabilize(unsigned int *available)
  static void __intel_cqm_pick_and_rotate(struct perf_event *next)
  {
         struct perf_event *rotor;
-       unsigned int rmid;
+       u32 rmid;
  
         lockdep_assert_held(&cache_mutex);
  
@@ -626,7 +644,7 @@ static void __intel_cqm_pick_and_rotate(struct perf_event *next)
  static void intel_cqm_sched_out_conflicting_events(struct perf_event *event)
  {
         struct perf_event *group, *g;
-       unsigned int rmid;
+       u32 rmid;
  
         lockdep_assert_held(&cache_mutex);
  
@@ -828,8 +846,8 @@ static void intel_cqm_setup_event(struct perf_event *event,
                                   struct perf_event **group)
  {
         struct perf_event *iter;
-       unsigned int rmid;
         bool conflict = false;
+       u32 rmid;
  
         list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
                 rmid = iter->hw.cqm_rmid;
@@ -860,7 +878,7 @@ static void intel_cqm_setup_event(struct perf_event *event,
  static void intel_cqm_event_read(struct perf_event *event)
  {
         unsigned long flags;
-       unsigned int rmid;
+       u32 rmid;
         u64 val;
  
         /*
@@ -961,55 +979,48 @@ out:
  
  static void intel_cqm_event_start(struct perf_event *event, int mode)
  {
-       struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
-       unsigned int rmid = event->hw.cqm_rmid;
-       unsigned long flags;
+       struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
+       u32 rmid = event->hw.cqm_rmid;
  
         if (!(event->hw.cqm_state & PERF_HES_STOPPED))
                 return;
  
         event->hw.cqm_state &= ~PERF_HES_STOPPED;
  
-       raw_spin_lock_irqsave(&state->lock, flags);
-
-       if (state->cnt++)
-               WARN_ON_ONCE(state->rmid != rmid);
-       else
+       if (state->rmid_usecnt++) {
+               if (!WARN_ON_ONCE(state->rmid != rmid))
+                       return;
+       } else {
                 WARN_ON_ONCE(state->rmid);
+       }
  
         state->rmid = rmid;
-       wrmsrl(MSR_IA32_PQR_ASSOC, state->rmid);
-
-       raw_spin_unlock_irqrestore(&state->lock, flags);
+       wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid);
  }
  
  static void intel_cqm_event_stop(struct perf_event *event, int mode)
  {
-       struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
-       unsigned long flags;
+       struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
  
         if (event->hw.cqm_state & PERF_HES_STOPPED)
                 return;
  
         event->hw.cqm_state |= PERF_HES_STOPPED;
  
-       raw_spin_lock_irqsave(&state->lock, flags);
         intel_cqm_event_read(event);
  
-       if (!--state->cnt) {
+       if (!--state->rmid_usecnt) {
                 state->rmid = 0;
-               wrmsrl(MSR_IA32_PQR_ASSOC, 0);
+               wrmsr(MSR_IA32_PQR_ASSOC, 0, state->closid);
         } else {
                 WARN_ON_ONCE(!state->rmid);
         }
-
-       raw_spin_unlock_irqrestore(&state->lock, flags);
  }
  
  static int intel_cqm_event_add(struct perf_event *event, int mode)
  {
         unsigned long flags;
-       unsigned int rmid;
+       u32 rmid;
  
         raw_spin_lock_irqsave(&cache_lock, flags);
  
@@ -1024,11 +1035,6 @@ static int intel_cqm_event_add(struct perf_event *event, int mode)
         return 0;
  }
  
-static void intel_cqm_event_del(struct perf_event *event, int mode)
-{
-       intel_cqm_event_stop(event, mode);
-}
-
  static void intel_cqm_event_destroy(struct perf_event *event)
  {
         struct perf_event *group_other = NULL;
@@ -1057,7 +1063,7 @@ static void intel_cqm_event_destroy(struct perf_event *event)
                         list_replace(&event->hw.cqm_groups_entry,
                                      &group_other->hw.cqm_groups_entry);
                 } else {
-                       unsigned int rmid = event->hw.cqm_rmid;
+                       u32 rmid = event->hw.cqm_rmid;
  
                         if (__rmid_valid(rmid))
                                 __put_rmid(rmid);
@@ -1221,7 +1227,7 @@ static struct pmu intel_cqm_pmu = {
         .task_ctx_nr         = perf_sw_context,
         .event_init          = intel_cqm_event_init,
         .add                 = intel_cqm_event_add,
-       .del                 = intel_cqm_event_del,
+       .del                 = intel_cqm_event_stop,
         .start               = intel_cqm_event_start,
         .stop                = intel_cqm_event_stop,
         .read                = intel_cqm_event_read,
@@ -1243,12 +1249,12 @@ static inline void cqm_pick_event_reader(int cpu)
  
  static void intel_cqm_cpu_prepare(unsigned int cpu)
  {
-       struct intel_cqm_state *state = &per_cpu(cqm_state, cpu);
+       struct intel_pqr_state *state = &per_cpu(pqr_state, cpu);
         struct cpuinfo_x86 *c = &cpu_data(cpu);
  
-       raw_spin_lock_init(&state->lock);
         state->rmid = 0;
-       state->cnt  = 0;
+       state->closid = 0;
+       state->rmid_usecnt = 0;
  
         WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
         WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c

index 813f75d71175e3a117f13ec53efe6856a0508bec..71fc40238843bb0e80ad1be14e38e7f7b3787eb5 100644 (file)
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -11,7 +11,7 @@
  #define BTS_RECORD_SIZE                24
  
  #define BTS_BUFFER_SIZE                (PAGE_SIZE << 4)
-#define PEBS_BUFFER_SIZE       PAGE_SIZE
+#define PEBS_BUFFER_SIZE       (PAGE_SIZE << 4)
  #define PEBS_FIXUP_SIZE                PAGE_SIZE
  
  /*
@@ -250,7 +250,7 @@ static int alloc_pebs_buffer(int cpu)
  {
         struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
         int node = cpu_to_node(cpu);
-       int max, thresh = 1; /* always use a single PEBS record */
+       int max;
         void *buffer, *ibuffer;
  
         if (!x86_pmu.pebs)
@@ -280,9 +280,6 @@ static int alloc_pebs_buffer(int cpu)
         ds->pebs_absolute_maximum = ds->pebs_buffer_base +
                 max * x86_pmu.pebs_record_size;
  
-       ds->pebs_interrupt_threshold = ds->pebs_buffer_base +
-               thresh * x86_pmu.pebs_record_size;
-
         return 0;
  }
  
@@ -549,6 +546,19 @@ int intel_pmu_drain_bts_buffer(void)
         return 1;
  }
  
+static inline void intel_pmu_drain_pebs_buffer(void)
+{
+       struct pt_regs regs;
+
+       x86_pmu.drain_pebs(&regs);
+}
+
+void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in)
+{
+       if (!sched_in)
+               intel_pmu_drain_pebs_buffer();
+}
+
  /*
   * PEBS
   */
@@ -684,33 +694,81 @@ struct event_constraint *intel_pebs_constraints(struct perf_event *event)
         return &emptyconstraint;
  }
  
+static inline bool pebs_is_enabled(struct cpu_hw_events *cpuc)
+{
+       return (cpuc->pebs_enabled & ((1ULL << MAX_PEBS_EVENTS) - 1));
+}
+
  void intel_pmu_pebs_enable(struct perf_event *event)
  {
         struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
         struct hw_perf_event *hwc = &event->hw;
+       struct debug_store *ds = cpuc->ds;
+       bool first_pebs;
+       u64 threshold;
  
         hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
  
+       first_pebs = !pebs_is_enabled(cpuc);
         cpuc->pebs_enabled |= 1ULL << hwc->idx;
  
         if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
                 cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32);
         else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
                 cpuc->pebs_enabled |= 1ULL << 63;
+
+       /*
+        * When the event is constrained enough we can use a larger
+        * threshold and run the event with less frequent PMI.
+        */
+       if (hwc->flags & PERF_X86_EVENT_FREERUNNING) {
+               threshold = ds->pebs_absolute_maximum -
+                       x86_pmu.max_pebs_events * x86_pmu.pebs_record_size;
+
+               if (first_pebs)
+                       perf_sched_cb_inc(event->ctx->pmu);
+       } else {
+               threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size;
+
+               /*
+                * If not all events can use the larger buffer,
+                * fall back to a threshold of one record.
+                */
+               if (!first_pebs &&
+                   (ds->pebs_interrupt_threshold > threshold))
+                       perf_sched_cb_dec(event->ctx->pmu);
+       }
+
+       /* Use auto-reload if possible to save an MSR write in the PMI */
+       if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
+               ds->pebs_event_reset[hwc->idx] =
+                       (u64)(-hwc->sample_period) & x86_pmu.cntval_mask;
+       }
+
+       if (first_pebs || ds->pebs_interrupt_threshold > threshold)
+               ds->pebs_interrupt_threshold = threshold;
  }
  
  void intel_pmu_pebs_disable(struct perf_event *event)
  {
         struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
         struct hw_perf_event *hwc = &event->hw;
+       struct debug_store *ds = cpuc->ds;
  
         cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
  
-       if (event->hw.constraint->flags & PERF_X86_EVENT_PEBS_LDLAT)
+       if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
                 cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32));
-       else if (event->hw.constraint->flags & PERF_X86_EVENT_PEBS_ST)
+       else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
                 cpuc->pebs_enabled &= ~(1ULL << 63);
  
+       if (ds->pebs_interrupt_threshold >
+           ds->pebs_buffer_base + x86_pmu.pebs_record_size) {
+               intel_pmu_drain_pebs_buffer();
+               if (!pebs_is_enabled(cpuc))
+                       perf_sched_cb_dec(event->ctx->pmu);
+       }
+
         if (cpuc->enabled)
                 wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
  
@@ -846,8 +904,10 @@ static inline u64 intel_hsw_transaction(struct pebs_record_hsw *pebs)
         return txn;
  }
  
-static void __intel_pmu_pebs_event(struct perf_event *event,
-                                  struct pt_regs *iregs, void *__pebs)
+static void setup_pebs_sample_data(struct perf_event *event,
+                                  struct pt_regs *iregs, void *__pebs,
+                                  struct perf_sample_data *data,
+                                  struct pt_regs *regs)
  {
  #define PERF_X86_EVENT_PEBS_HSW_PREC \
                 (PERF_X86_EVENT_PEBS_ST_HSW | \
@@ -859,13 +919,11 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
          */
         struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
         struct pebs_record_hsw *pebs = __pebs;
-       struct perf_sample_data data;
-       struct pt_regs regs;
         u64 sample_type;
         int fll, fst, dsrc;
         int fl = event->hw.flags;
  
-       if (!intel_pmu_save_and_restart(event))
+       if (pebs == NULL)
                 return;
  
         sample_type = event->attr.sample_type;
@@ -874,15 +932,15 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
         fll = fl & PERF_X86_EVENT_PEBS_LDLAT;
         fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC);
  
-       perf_sample_data_init(&data, 0, event->hw.last_period);
+       perf_sample_data_init(data, 0, event->hw.last_period);
  
-       data.period = event->hw.last_period;
+       data->period = event->hw.last_period;
  
         /*
          * Use latency for weight (only avail with PEBS-LL)
          */
         if (fll && (sample_type & PERF_SAMPLE_WEIGHT))
-               data.weight = pebs->lat;
+               data->weight = pebs->lat;
  
         /*
          * data.data_src encodes the data source
@@ -895,7 +953,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
                         val = precise_datala_hsw(event, pebs->dse);
                 else if (fst)
                         val = precise_store_data(pebs->dse);
-               data.data_src.val = val;
+               data->data_src.val = val;
         }
  
         /*
@@ -908,61 +966,123 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
          * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly.
          * A possible PERF_SAMPLE_REGS will have to transfer all regs.
          */
-       regs = *iregs;
-       regs.flags = pebs->flags;
-       set_linear_ip(&regs, pebs->ip);
-       regs.bp = pebs->bp;
-       regs.sp = pebs->sp;
+       *regs = *iregs;
+       regs->flags = pebs->flags;
+       set_linear_ip(regs, pebs->ip);
+       regs->bp = pebs->bp;
+       regs->sp = pebs->sp;
  
         if (sample_type & PERF_SAMPLE_REGS_INTR) {
-               regs.ax = pebs->ax;
-               regs.bx = pebs->bx;
-               regs.cx = pebs->cx;
-               regs.dx = pebs->dx;
-               regs.si = pebs->si;
-               regs.di = pebs->di;
-               regs.bp = pebs->bp;
-               regs.sp = pebs->sp;
-
-               regs.flags = pebs->flags;
+               regs->ax = pebs->ax;
+               regs->bx = pebs->bx;
+               regs->cx = pebs->cx;
+               regs->dx = pebs->dx;
+               regs->si = pebs->si;
+               regs->di = pebs->di;
+               regs->bp = pebs->bp;
+               regs->sp = pebs->sp;
+
+               regs->flags = pebs->flags;
  #ifndef CONFIG_X86_32
-               regs.r8 = pebs->r8;
-               regs.r9 = pebs->r9;
-               regs.r10 = pebs->r10;
-               regs.r11 = pebs->r11;
-               regs.r12 = pebs->r12;
-               regs.r13 = pebs->r13;
-               regs.r14 = pebs->r14;
-               regs.r15 = pebs->r15;
+               regs->r8 = pebs->r8;
+               regs->r9 = pebs->r9;
+               regs->r10 = pebs->r10;
+               regs->r11 = pebs->r11;
+               regs->r12 = pebs->r12;
+               regs->r13 = pebs->r13;
+               regs->r14 = pebs->r14;
+               regs->r15 = pebs->r15;
  #endif
         }
  
         if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) {
-               regs.ip = pebs->real_ip;
-               regs.flags |= PERF_EFLAGS_EXACT;
-       } else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(&regs))
-               regs.flags |= PERF_EFLAGS_EXACT;
+               regs->ip = pebs->real_ip;
+               regs->flags |= PERF_EFLAGS_EXACT;
+       } else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(regs))
+               regs->flags |= PERF_EFLAGS_EXACT;
         else
-               regs.flags &= ~PERF_EFLAGS_EXACT;
+               regs->flags &= ~PERF_EFLAGS_EXACT;
  
         if ((sample_type & PERF_SAMPLE_ADDR) &&
             x86_pmu.intel_cap.pebs_format >= 1)
-               data.addr = pebs->dla;
+               data->addr = pebs->dla;
  
         if (x86_pmu.intel_cap.pebs_format >= 2) {
                 /* Only set the TSX weight when no memory weight. */
                 if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll)
-                       data.weight = intel_hsw_weight(pebs);
+                       data->weight = intel_hsw_weight(pebs);
  
                 if (sample_type & PERF_SAMPLE_TRANSACTION)
-                       data.txn = intel_hsw_transaction(pebs);
+                       data->txn = intel_hsw_transaction(pebs);
         }
  
         if (has_branch_stack(event))
-               data.br_stack = &cpuc->lbr_stack;
+               data->br_stack = &cpuc->lbr_stack;
+}
+
+static inline void *
+get_next_pebs_record_by_bit(void *base, void *top, int bit)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       void *at;
+       u64 pebs_status;
+
+       if (base == NULL)
+               return NULL;
+
+       for (at = base; at < top; at += x86_pmu.pebs_record_size) {
+               struct pebs_record_nhm *p = at;
  
-       if (perf_event_overflow(event, &data, &regs))
+               if (test_bit(bit, (unsigned long *)&p->status)) {
+                       /* PEBS v3 has accurate status bits */
+                       if (x86_pmu.intel_cap.pebs_format >= 3)
+                               return at;
+
+                       if (p->status == (1 << bit))
+                               return at;
+
+                       /* clear non-PEBS bit and re-check */
+                       pebs_status = p->status & cpuc->pebs_enabled;
+                       pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1;
+                       if (pebs_status == (1 << bit))
+                               return at;
+               }
+       }
+       return NULL;
+}
+
+static void __intel_pmu_pebs_event(struct perf_event *event,
+                                  struct pt_regs *iregs,
+                                  void *base, void *top,
+                                  int bit, int count)
+{
+       struct perf_sample_data data;
+       struct pt_regs regs;
+       void *at = get_next_pebs_record_by_bit(base, top, bit);
+
+       if (!intel_pmu_save_and_restart(event) &&
+           !(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD))
+               return;
+
+       while (count > 1) {
+               setup_pebs_sample_data(event, iregs, at, &data, &regs);
+               perf_event_output(event, &data, &regs);
+               at += x86_pmu.pebs_record_size;
+               at = get_next_pebs_record_by_bit(at, top, bit);
+               count--;
+       }
+
+       setup_pebs_sample_data(event, iregs, at, &data, &regs);
+
+       /*
+        * All records but the last one have been processed.
+        * The last one is left so that the overflow handler can be called.
+        */
+       if (perf_event_overflow(event, &data, &regs)) {
                 x86_pmu_stop(event, 0);
+               return;
+       }
+
  }
  
  static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
@@ -992,72 +1112,99 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
         if (!event->attr.precise_ip)
                 return;
  
-       n = top - at;
+       n = (top - at) / x86_pmu.pebs_record_size;
         if (n <= 0)
                 return;
  
-       /*
-        * Should not happen, we program the threshold at 1 and do not
-        * set a reset value.
-        */
-       WARN_ONCE(n > 1, "bad leftover pebs %d\n", n);
-       at += n - 1;
-
-       __intel_pmu_pebs_event(event, iregs, at);
+       __intel_pmu_pebs_event(event, iregs, at, top, 0, n);
  }
  
  static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
  {
         struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
         struct debug_store *ds = cpuc->ds;
-       struct perf_event *event = NULL;
-       void *at, *top;
-       u64 status = 0;
-       int bit;
+       struct perf_event *event;
+       void *base, *at, *top;
+       short counts[MAX_PEBS_EVENTS] = {};
+       short error[MAX_PEBS_EVENTS] = {};
+       int bit, i;
  
         if (!x86_pmu.pebs_active)
                 return;
  
-       at  = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
+       base = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
         top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
  
         ds->pebs_index = ds->pebs_buffer_base;
  
-       if (unlikely(at > top))
+       if (unlikely(base >= top))
                 return;
  
-       /*
-        * Should not happen, we program the threshold at 1 and do not
-        * set a reset value.
-        */
-       WARN_ONCE(top - at > x86_pmu.max_pebs_events * x86_pmu.pebs_record_size,
-                 "Unexpected number of pebs records %ld\n",
-                 (long)(top - at) / x86_pmu.pebs_record_size);
-
-       for (; at < top; at += x86_pmu.pebs_record_size) {
+       for (at = base; at < top; at += x86_pmu.pebs_record_size) {
                 struct pebs_record_nhm *p = at;
  
-               for_each_set_bit(bit, (unsigned long *)&p->status,
-                                x86_pmu.max_pebs_events) {
-                       event = cpuc->events[bit];
-                       if (!test_bit(bit, cpuc->active_mask))
-                               continue;
-
-                       WARN_ON_ONCE(!event);
+               /* PEBS v3 has accurate status bits */
+               if (x86_pmu.intel_cap.pebs_format >= 3) {
+                       for_each_set_bit(bit, (unsigned long *)&p->status,
+                                        MAX_PEBS_EVENTS)
+                               counts[bit]++;
  
-                       if (!event->attr.precise_ip)
-                               continue;
+                       continue;
+               }
  
-                       if (__test_and_set_bit(bit, (unsigned long *)&status))
+               bit = find_first_bit((unsigned long *)&p->status,
+                                       x86_pmu.max_pebs_events);
+               if (bit >= x86_pmu.max_pebs_events)
+                       continue;
+               if (!test_bit(bit, cpuc->active_mask))
+                       continue;
+               /*
+                * The PEBS hardware does not deal well with the situation
+                * when events happen close to each other and multiple bits
+                * are set. But this should happen rarely.
+                *
+                * If these events include one PEBS and multiple non-PEBS
+                * events, the PEBS record is not affected. The record will
+                * be handled normally. (slow path)
+                *
+                * If these events include two or more PEBS events, the
+                * records for the events can be collapsed into a single
+                * one, and it's not possible to reconstruct all events
+                * that caused the PEBS record. This is called a collision.
+                * If a collision happens, the record is dropped.
+                *
+                */
+               if (p->status != (1 << bit)) {
+                       u64 pebs_status;
+
+                       /* slow path */
+                       pebs_status = p->status & cpuc->pebs_enabled;
+                       pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1;
+                       if (pebs_status != (1 << bit)) {
+                               for_each_set_bit(i, (unsigned long *)&pebs_status,
+                                                MAX_PEBS_EVENTS)
+                                       error[i]++;
                                 continue;
-
-                       break;
+                       }
                 }
+               counts[bit]++;
+       }
  
-               if (!event || bit >= x86_pmu.max_pebs_events)
+       for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) {
+               if ((counts[bit] == 0) && (error[bit] == 0))
                         continue;
+               event = cpuc->events[bit];
+               WARN_ON_ONCE(!event);
+               WARN_ON_ONCE(!event->attr.precise_ip);
  
-               __intel_pmu_pebs_event(event, iregs, at);
+               /* log the number of dropped samples */
+               if (error[bit])
+                       perf_log_lost_samples(event, error[bit]);
+
+               if (counts[bit]) {
+                       __intel_pmu_pebs_event(event, iregs, base,
+                                              top, bit, counts[bit]);
+               }
         }
  }
  
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c

index 94e5b506caa6d13206956095e646bfacbf558fb1..452a7bd2dedb6b72e98fad8da0cb8f2e0d2f32c3 100644 (file)
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -96,6 +96,7 @@ enum {
         X86_BR_NO_TX            = 1 << 14,/* not in transaction */
         X86_BR_ZERO_CALL        = 1 << 15,/* zero length call */
         X86_BR_CALL_STACK       = 1 << 16,/* call stack */
+       X86_BR_IND_JMP          = 1 << 17,/* indirect jump */
  };
  
  #define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
@@ -113,6 +114,7 @@ enum {
          X86_BR_IRQ      |\
          X86_BR_ABORT    |\
          X86_BR_IND_CALL |\
+        X86_BR_IND_JMP  |\
          X86_BR_ZERO_CALL)
  
  #define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)
@@ -262,9 +264,6 @@ void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
         struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
         struct x86_perf_task_context *task_ctx;
  
-       if (!x86_pmu.lbr_nr)
-               return;
-
         /*
          * If LBR callstack feature is enabled and the stack was saved when
          * the task was scheduled out, restore the stack. Otherwise flush
@@ -523,6 +522,9 @@ static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
                         X86_BR_CALL_STACK;
         }
  
+       if (br_type & PERF_SAMPLE_BRANCH_IND_JUMP)
+               mask |= X86_BR_IND_JMP;
+
         /*
          * stash actual user request into reg, it may
          * be used by fixup code for some CPU
@@ -736,7 +738,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
                         break;
                 case 4:
                 case 5:
-                       ret = X86_BR_JMP;
+                       ret = X86_BR_IND_JMP;
                         break;
                 }
                 break;
@@ -844,6 +846,7 @@ static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
          */
         [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP,
         [PERF_SAMPLE_BRANCH_COND_SHIFT]     = LBR_JCC,
+       [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP,
  };
  
  static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
@@ -856,6 +859,7 @@ static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
                                                 | LBR_FAR,
         [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT]     = LBR_IND_CALL,
         [PERF_SAMPLE_BRANCH_COND_SHIFT]         = LBR_JCC,
+       [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT]     = LBR_IND_JMP,
  };
  
  static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
@@ -870,6 +874,7 @@ static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
         [PERF_SAMPLE_BRANCH_COND_SHIFT]         = LBR_JCC,
         [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT]   = LBR_REL_CALL | LBR_IND_CALL
                                                 | LBR_RETURN | LBR_CALL_STACK,
+       [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT]     = LBR_IND_JMP,
  };
  
  /* core */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c

index ffe666c2c6b58657b5895948a2e7d69f95223521..159887c3a89d66a4aaad415ddc069b25904b5676 100644 (file)
--- a/arch/x86/kernel/cpu/perf_event_intel_pt.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c
@@ -151,7 +151,7 @@ static int __init pt_pmu_hw_init(void)
  
                 de_attr->attr.attr.name = pt_caps[i].name;
  
-               sysfs_attr_init(&de_attrs->attr.attr);
+               sysfs_attr_init(&de_attr->attr.attr);
  
                 de_attr->attr.attr.mode         = S_IRUGO;
                 de_attr->attr.show              = pt_cap_show;
@@ -187,15 +187,6 @@ static bool pt_event_valid(struct perf_event *event)
   * These all are cpu affine and operate on a local PT
   */
  
-static bool pt_is_running(void)
-{
-       u64 ctl;
-
-       rdmsrl(MSR_IA32_RTIT_CTL, ctl);
-
-       return !!(ctl & RTIT_CTL_TRACEEN);
-}
-
  static void pt_config(struct perf_event *event)
  {
         u64 reg;
@@ -609,16 +600,19 @@ static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg)
   * @handle:    Current output handle.
   *
   * Place INT and STOP marks to prevent overwriting old data that the consumer
- * hasn't yet collected.
+ * hasn't yet collected and to wake up the consumer after a certain fraction of
+ * the buffer has filled up. Only needed and sensible for non-snapshot counters.
+ *
+ * This obviously relies on buf::head to figure out buffer markers, so it has
+ * to be called after pt_buffer_reset_offsets() and before the hardware tracing
+ * is enabled.
   */
  static int pt_buffer_reset_markers(struct pt_buffer *buf,
                                    struct perf_output_handle *handle)
  
  {
-       unsigned long idx, npages, end;
-
-       if (buf->snapshot)
-               return 0;
+       unsigned long head = local64_read(&buf->head);
+       unsigned long idx, npages, wakeup;
  
         /* can't stop in the middle of an output region */
         if (buf->output_off + handle->size + 1 <
@@ -634,17 +628,26 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf,
         buf->topa_index[buf->stop_pos]->stop = 0;
         buf->topa_index[buf->intr_pos]->intr = 0;
  
-       if (pt_cap_get(PT_CAP_topa_multiple_entries)) {
-               npages = (handle->size + 1) >> PAGE_SHIFT;
-               end = (local64_read(&buf->head) >> PAGE_SHIFT) + npages;
-               /*if (end > handle->wakeup >> PAGE_SHIFT)
-                 end = handle->wakeup >> PAGE_SHIFT;*/
-               idx = end & (buf->nr_pages - 1);
-               buf->stop_pos = idx;
-               idx = (local64_read(&buf->head) >> PAGE_SHIFT) + npages - 1;
-               idx &= buf->nr_pages - 1;
-               buf->intr_pos = idx;
-       }
+       /* how many pages till the STOP marker */
+       npages = handle->size >> PAGE_SHIFT;
+
+       /* if it's on a page boundary, fill up one more page */
+       if (!offset_in_page(head + handle->size + 1))
+               npages++;
+
+       idx = (head >> PAGE_SHIFT) + npages;
+       idx &= buf->nr_pages - 1;
+       buf->stop_pos = idx;
+
+       wakeup = handle->wakeup >> PAGE_SHIFT;
+
+       /* in the worst case, wake up the consumer one page before hard stop */
+       idx = (head >> PAGE_SHIFT) + npages - 1;
+       if (idx > wakeup)
+               idx = wakeup;
+
+       idx &= buf->nr_pages - 1;
+       buf->intr_pos = idx;
  
         buf->topa_index[buf->stop_pos]->stop = 1;
         buf->topa_index[buf->intr_pos]->intr = 1;
@@ -664,7 +667,7 @@ static void pt_buffer_setup_topa_index(struct pt_buffer *buf)
         struct topa *cur = buf->first, *prev = buf->last;
         struct topa_entry *te_cur = TOPA_ENTRY(cur, 0),
                 *te_prev = TOPA_ENTRY(prev, prev->last - 1);
-       int pg = 0, idx = 0, ntopa = 0;
+       int pg = 0, idx = 0;
  
         while (pg < buf->nr_pages) {
                 int tidx;
@@ -679,9 +682,9 @@ static void pt_buffer_setup_topa_index(struct pt_buffer *buf)
                         /* advance to next topa table */
                         idx = 0;
                         cur = list_entry(cur->list.next, struct topa, list);
-                       ntopa++;
-               } else
+               } else {
                         idx++;
+               }
                 te_cur = TOPA_ENTRY(cur, idx);
         }
  
@@ -693,7 +696,14 @@ static void pt_buffer_setup_topa_index(struct pt_buffer *buf)
   * @head:      Write pointer (aux_head) from AUX buffer.
   *
   * Find the ToPA table and entry corresponding to given @head and set buffer's
- * "current" pointers accordingly.
+ * "current" pointers accordingly. This is done after we have obtained the
+ * current aux_head position from a successful call to perf_aux_output_begin()
+ * to make sure the hardware is writing to the right place.
+ *
+ * This function modifies buf::{cur,cur_idx,output_off} that will be programmed
+ * into PT msrs when the tracing is enabled and buf::head and buf::data_size,
+ * which are used to determine INT and STOP markers' locations by a subsequent
+ * call to pt_buffer_reset_markers().
   */
  static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head)
  {
@@ -891,6 +901,7 @@ void intel_pt_interrupt(void)
                 }
  
                 pt_buffer_reset_offsets(buf, pt->handle.head);
+               /* snapshot counters don't use PMI, so it's safe */
                 ret = pt_buffer_reset_markers(buf, &pt->handle);
                 if (ret) {
                         perf_aux_output_end(&pt->handle, 0, true);
@@ -913,7 +924,7 @@ static void pt_event_start(struct perf_event *event, int mode)
         struct pt *pt = this_cpu_ptr(&pt_ctx);
         struct pt_buffer *buf = perf_get_aux(&pt->handle);
  
-       if (pt_is_running() || !buf || pt_buffer_is_full(buf, pt)) {
+       if (!buf || pt_buffer_is_full(buf, pt)) {
                 event->hw.state = PERF_HES_STOPPED;
                 return;
         }
@@ -944,7 +955,6 @@ static void pt_event_stop(struct perf_event *event, int mode)
         event->hw.state = PERF_HES_STOPPED;
  
         if (mode & PERF_EF_UPDATE) {
-               struct pt *pt = this_cpu_ptr(&pt_ctx);
                 struct pt_buffer *buf = perf_get_aux(&pt->handle);
  
                 if (!buf)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c

index c635b8b49e931e7926efc3dc96475a8c577958e0..7c1de16101782b37e2ace385c55d0f7941a0d90d 100644 (file)
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -365,9 +365,8 @@ static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int
         bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX);
  
         for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) {
-               hwc = &box->event_list[i]->hw;
                 c = uncore_get_event_constraint(box, box->event_list[i]);
-               hwc->constraint = c;
+               box->event_constraint[i] = c;
                 wmin = min(wmin, c->weight);
                 wmax = max(wmax, c->weight);
         }
@@ -375,7 +374,7 @@ static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int
         /* fastpath, try to reuse previous register */
         for (i = 0; i < n; i++) {
                 hwc = &box->event_list[i]->hw;
-               c = hwc->constraint;
+               c = box->event_constraint[i];
  
                 /* never assigned */
                 if (hwc->idx == -1)
@@ -395,8 +394,8 @@ static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int
         }
         /* slow path */
         if (i != n)
-               ret = perf_assign_events(box->event_list, n,
-                                        wmin, wmax, assign);
+               ret = perf_assign_events(box->event_constraint, n,
+                                        wmin, wmax, n, assign);
  
         if (!assign || ret) {
                 for (i = 0; i < n; i++)
@@ -840,6 +839,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
         box->phys_id = phys_id;
         box->pci_dev = pdev;
         box->pmu = pmu;
+       uncore_box_init(box);
         pci_set_drvdata(pdev, box);
  
         raw_spin_lock(&uncore_box_lock);
@@ -922,6 +922,9 @@ static int __init uncore_pci_init(void)
         case 69: /* Haswell Celeron */
                 ret = hsw_uncore_pci_init();
                 break;
+       case 61: /* Broadwell */
+               ret = bdw_uncore_pci_init();
+               break;
         default:
                 return 0;
         }
@@ -1003,8 +1006,10 @@ static int uncore_cpu_starting(int cpu)
                         pmu = &type->pmus[j];
                         box = *per_cpu_ptr(pmu->box, cpu);
                         /* called by uncore_cpu_init? */
-                       if (box && box->phys_id >= 0)
+                       if (box && box->phys_id >= 0) {
+                               uncore_box_init(box);
                                 continue;
+                       }
  
                         for_each_online_cpu(k) {
                                 exist = *per_cpu_ptr(pmu->box, k);
@@ -1020,8 +1025,10 @@ static int uncore_cpu_starting(int cpu)
                                 }
                         }
  
-                       if (box)
+                       if (box) {
                                 box->phys_id = phys_id;
+                               uncore_box_init(box);
+                       }
                 }
         }
         return 0;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h

index 6c8c1e7e69d85d3ad217eada0f0e55573c3daaf0..0f77f0a196e488b7e617ed2cabd0246cdab2611d 100644 (file)
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -97,6 +97,7 @@ struct intel_uncore_box {
         atomic_t refcnt;
         struct perf_event *events[UNCORE_PMC_IDX_MAX];
         struct perf_event *event_list[UNCORE_PMC_IDX_MAX];
+       struct event_constraint *event_constraint[UNCORE_PMC_IDX_MAX];
         unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
         u64 tags[UNCORE_PMC_IDX_MAX];
         struct pci_dev *pci_dev;
@@ -257,14 +258,6 @@ static inline int uncore_num_counters(struct intel_uncore_box *box)
         return box->pmu->type->num_counters;
  }
  
-static inline void uncore_box_init(struct intel_uncore_box *box)
-{
-       if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) {
-               if (box->pmu->type->ops->init_box)
-                       box->pmu->type->ops->init_box(box);
-       }
-}
-
  static inline void uncore_disable_box(struct intel_uncore_box *box)
  {
         if (box->pmu->type->ops->disable_box)
@@ -273,8 +266,6 @@ static inline void uncore_disable_box(struct intel_uncore_box *box)
  
  static inline void uncore_enable_box(struct intel_uncore_box *box)
  {
-       uncore_box_init(box);
-
         if (box->pmu->type->ops->enable_box)
                 box->pmu->type->ops->enable_box(box);
  }
@@ -297,6 +288,14 @@ static inline u64 uncore_read_counter(struct intel_uncore_box *box,
         return box->pmu->type->ops->read_counter(box, event);
  }
  
+static inline void uncore_box_init(struct intel_uncore_box *box)
+{
+       if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) {
+               if (box->pmu->type->ops->init_box)
+                       box->pmu->type->ops->init_box(box);
+       }
+}
+
  static inline bool uncore_box_is_fake(struct intel_uncore_box *box)
  {
         return (box->phys_id < 0);
@@ -326,6 +325,7 @@ extern struct event_constraint uncore_constraint_empty;
  int snb_uncore_pci_init(void);
  int ivb_uncore_pci_init(void);
  int hsw_uncore_pci_init(void);
+int bdw_uncore_pci_init(void);
  void snb_uncore_cpu_init(void);
  void nhm_uncore_cpu_init(void);
  
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c

index 4562e9e22c60600a89f706c3b8c3cfb269636060..b005a78c701286e3c460b1f70fde54d8116fd91e 100644 (file)
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
@@ -7,6 +7,7 @@
  #define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150
  #define PCI_DEVICE_ID_INTEL_HSW_IMC    0x0c00
  #define PCI_DEVICE_ID_INTEL_HSW_U_IMC  0x0a04
+#define PCI_DEVICE_ID_INTEL_BDW_IMC    0x1604
  
  /* SNB event control */
  #define SNB_UNC_CTL_EV_SEL_MASK                        0x000000ff
@@ -486,6 +487,14 @@ static const struct pci_device_id hsw_uncore_pci_ids[] = {
         { /* end: all zeroes */ },
  };
  
+static const struct pci_device_id bdw_uncore_pci_ids[] = {
+       { /* IMC */
+               PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BDW_IMC),
+               .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+       },
+       { /* end: all zeroes */ },
+};
+
  static struct pci_driver snb_uncore_pci_driver = {
         .name           = "snb_uncore",
         .id_table       = snb_uncore_pci_ids,
@@ -501,6 +510,11 @@ static struct pci_driver hsw_uncore_pci_driver = {
         .id_table       = hsw_uncore_pci_ids,
  };
  
+static struct pci_driver bdw_uncore_pci_driver = {
+       .name           = "bdw_uncore",
+       .id_table       = bdw_uncore_pci_ids,
+};
+
  struct imc_uncore_pci_dev {
         __u32 pci_id;
         struct pci_driver *driver;
@@ -514,6 +528,7 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = {
         IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen Core processor */
         IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver),    /* 4th Gen Core Processor */
         IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver),  /* 4th Gen Core ULT Mobile Processor */
+       IMC_DEV(BDW_IMC, &bdw_uncore_pci_driver),    /* 5th Gen Core U */
         {  /* end marker */ }
  };
  
@@ -561,6 +576,11 @@ int hsw_uncore_pci_init(void)
         return imc_uncore_pci_init();
  }
  
+int bdw_uncore_pci_init(void)
+{
+       return imc_uncore_pci_init();
+}
+
  /* end of Sandy Bridge uncore support */
  
  /* Nehalem uncore support */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c

index 12d9548457e7195a8a36b458e374cab9cabe5e07..6d6e85dd5849878e9caa379ef20eaab10b97559f 100644 (file)
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
@@ -164,8 +164,8 @@
                                 ((1ULL << (n)) - 1)))
  
  /* Haswell-EP Ubox */
-#define HSWEP_U_MSR_PMON_CTR0                  0x705
-#define HSWEP_U_MSR_PMON_CTL0                  0x709
+#define HSWEP_U_MSR_PMON_CTR0                  0x709
+#define HSWEP_U_MSR_PMON_CTL0                  0x705
  #define HSWEP_U_MSR_PMON_FILTER                        0x707
  
  #define HSWEP_U_MSR_PMON_UCLK_FIXED_CTL                0x703
@@ -1914,7 +1914,7 @@ static struct intel_uncore_type hswep_uncore_cbox = {
         .name                   = "cbox",
         .num_counters           = 4,
         .num_boxes              = 18,
-       .perf_ctr_bits          = 44,
+       .perf_ctr_bits          = 48,
         .event_ctl              = HSWEP_C0_MSR_PMON_CTL0,
         .perf_ctr               = HSWEP_C0_MSR_PMON_CTR0,
         .event_mask             = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c

index e7d8c7608471e960d0b64daeeb5e6bc0c468d07a..18ca99f2798b16443291827269389bf0ad52bf94 100644 (file)
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -12,7 +12,8 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
  {
  #ifdef CONFIG_SMP
         seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
-       seq_printf(m, "siblings\t: %d\n", cpumask_weight(cpu_core_mask(cpu)));
+       seq_printf(m, "siblings\t: %d\n",
+                  cpumask_weight(topology_core_cpumask(cpu)));
         seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
         seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
         seq_printf(m, "apicid\t\t: %d\n", c->apicid);
diff --git a/arch/x86/kernel/fpu/Makefile b/arch/x86/kernel/fpu/Makefile

new file mode 100644 (file)

index 0000000..68279ef
--- /dev/null
+++ b/arch/x86/kernel/fpu/Makefile
@@ -0,0 +1,5 @@
+#
+# Build rules for the FPU support code:
+#
+
+obj-y                          += init.o bugs.o core.o regset.o signal.o xstate.o
diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c

new file mode 100644 (file)

index 0000000..dd9ca9b
--- /dev/null
+++ b/arch/x86/kernel/fpu/bugs.c
@@ -0,0 +1,71 @@
+/*
+ * x86 FPU bug checks:
+ */
+#include <asm/fpu/internal.h>
+
+/*
+ * Boot time CPU/FPU FDIV bug detection code:
+ */
+
+static double __initdata x = 4195835.0;
+static double __initdata y = 3145727.0;
+
+/*
+ * This used to check for exceptions..
+ * However, it turns out that to support that,
+ * the XMM trap handlers basically had to
+ * be buggy. So let's have a correct XMM trap
+ * handler, and forget about printing out
+ * some status at boot.
+ *
+ * We should really only care about bugs here
+ * anyway. Not features.
+ */
+static void __init check_fpu(void)
+{
+       u32 cr0_saved;
+       s32 fdiv_bug;
+
+       /* We might have CR0::TS set already, clear it: */
+       cr0_saved = read_cr0();
+       write_cr0(cr0_saved & ~X86_CR0_TS);
+
+       kernel_fpu_begin();
+
+       /*
+        * trap_init() enabled FXSR and company _before_ testing for FP
+        * problems here.
+        *
+        * Test for the FDIV bug: http://en.wikipedia.org/wiki/Fdiv_bug
+        */
+       __asm__("fninit\n\t"
+               "fldl %1\n\t"
+               "fdivl %2\n\t"
+               "fmull %2\n\t"
+               "fldl %1\n\t"
+               "fsubp %%st,%%st(1)\n\t"
+               "fistpl %0\n\t"
+               "fwait\n\t"
+               "fninit"
+               : "=m" (*&fdiv_bug)
+               : "m" (*&x), "m" (*&y));
+
+       kernel_fpu_end();
+
+       write_cr0(cr0_saved);
+
+       if (fdiv_bug) {
+               set_cpu_bug(&boot_cpu_data, X86_BUG_FDIV);
+               pr_warn("Hmm, FPU with FDIV bug\n");
+       }
+}
+
+void __init fpu__init_check_bugs(void)
+{
+       /*
+        * kernel_fpu_begin/end() in check_fpu() relies on the patched
+        * alternative instructions.
+        */
+       if (cpu_has_fpu)
+               check_fpu();
+}
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c

new file mode 100644 (file)

index 0000000..79de954
--- /dev/null
+++ b/arch/x86/kernel/fpu/core.c
@@ -0,0 +1,523 @@
+/*
+ *  Copyright (C) 1994 Linus Torvalds
+ *
+ *  Pentium III FXSR, SSE support
+ *  General FPU state handling cleanups
+ *     Gareth Hughes <gareth@valinux.com>, May 2000
+ */
+#include <asm/fpu/internal.h>
+#include <asm/fpu/regset.h>
+#include <asm/fpu/signal.h>
+#include <asm/traps.h>
+
+#include <linux/hardirq.h>
+
+/*
+ * Represents the initial FPU state. It's mostly (but not completely) zeroes,
+ * depending on the FPU hardware format:
+ */
+union fpregs_state init_fpstate __read_mostly;
+
+/*
+ * Track whether the kernel is using the FPU state
+ * currently.
+ *
+ * This flag is used:
+ *
+ *   - by IRQ context code to potentially use the FPU
+ *     if it's unused.
+ *
+ *   - to debug kernel_fpu_begin()/end() correctness
+ */
+static DEFINE_PER_CPU(bool, in_kernel_fpu);
+
+/*
+ * Track which context is using the FPU on the CPU:
+ */
+DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);
+
+static void kernel_fpu_disable(void)
+{
+       WARN_ON_FPU(this_cpu_read(in_kernel_fpu));
+       this_cpu_write(in_kernel_fpu, true);
+}
+
+static void kernel_fpu_enable(void)
+{
+       WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));
+       this_cpu_write(in_kernel_fpu, false);
+}
+
+static bool kernel_fpu_disabled(void)
+{
+       return this_cpu_read(in_kernel_fpu);
+}
+
+/*
+ * Were we in an interrupt that interrupted kernel mode?
+ *
+ * Without eager FPU, we can do a kernel_fpu_begin/end() pair *ONLY* if
+ * that pair does nothing at all: the thread must not have fpu (so
+ * that we don't try to save the FPU state), and TS must
+ * be set (so that the clts/stts pair does nothing that is
+ * visible in the interrupted kernel thread).
+ *
+ * Except for the eagerfpu case when we return true; in the likely case
+ * the thread has FPU but we are not going to set/clear TS.
+ */
+static bool interrupted_kernel_fpu_idle(void)
+{
+       if (kernel_fpu_disabled())
+               return false;
+
+       if (use_eager_fpu())
+               return true;
+
+       return !current->thread.fpu.fpregs_active && (read_cr0() & X86_CR0_TS);
+}
+
+/*
+ * Were we in user mode (or vm86 mode) when we were
+ * interrupted?
+ *
+ * Doing kernel_fpu_begin/end() is ok if we are running
+ * in an interrupt context from user mode - we'll just
+ * save the FPU state as required.
+ */
+static bool interrupted_user_mode(void)
+{
+       struct pt_regs *regs = get_irq_regs();
+       return regs && user_mode(regs);
+}
+
+/*
+ * Can we use the FPU in kernel mode with the
+ * whole "kernel_fpu_begin/end()" sequence?
+ *
+ * It's always ok in process context (ie "not interrupt")
+ * but it is sometimes ok even from an irq.
+ */
+bool irq_fpu_usable(void)
+{
+       return !in_interrupt() ||
+               interrupted_user_mode() ||
+               interrupted_kernel_fpu_idle();
+}
+EXPORT_SYMBOL(irq_fpu_usable);
+
+void __kernel_fpu_begin(void)
+{
+       struct fpu *fpu = &current->thread.fpu;
+
+       WARN_ON_FPU(!irq_fpu_usable());
+
+       kernel_fpu_disable();
+
+       if (fpu->fpregs_active) {
+               copy_fpregs_to_fpstate(fpu);
+       } else {
+               this_cpu_write(fpu_fpregs_owner_ctx, NULL);
+               __fpregs_activate_hw();
+       }
+}
+EXPORT_SYMBOL(__kernel_fpu_begin);
+
+void __kernel_fpu_end(void)
+{
+       struct fpu *fpu = &current->thread.fpu;
+
+       if (fpu->fpregs_active)
+               copy_kernel_to_fpregs(&fpu->state);
+       else
+               __fpregs_deactivate_hw();
+
+       kernel_fpu_enable();
+}
+EXPORT_SYMBOL(__kernel_fpu_end);
+
+void kernel_fpu_begin(void)
+{
+       preempt_disable();
+       __kernel_fpu_begin();
+}
+EXPORT_SYMBOL_GPL(kernel_fpu_begin);
+
+void kernel_fpu_end(void)
+{
+       __kernel_fpu_end();
+       preempt_enable();
+}
+EXPORT_SYMBOL_GPL(kernel_fpu_end);
+
+/*
+ * CR0::TS save/restore functions:
+ */
+int irq_ts_save(void)
+{
+       /*
+        * If in process context and not atomic, we can take a spurious DNA fault.
+        * Otherwise, doing clts() in process context requires disabling preemption
+        * or some heavy lifting like kernel_fpu_begin()
+        */
+       if (!in_atomic())
+               return 0;
+
+       if (read_cr0() & X86_CR0_TS) {
+               clts();
+               return 1;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(irq_ts_save);
+
+void irq_ts_restore(int TS_state)
+{
+       if (TS_state)
+               stts();
+}
+EXPORT_SYMBOL_GPL(irq_ts_restore);
+
+/*
+ * Save the FPU state (mark it for reload if necessary):
+ *
+ * This only ever gets called for the current task.
+ */
+void fpu__save(struct fpu *fpu)
+{
+       WARN_ON_FPU(fpu != &current->thread.fpu);
+
+       preempt_disable();
+       if (fpu->fpregs_active) {
+               if (!copy_fpregs_to_fpstate(fpu))
+                       fpregs_deactivate(fpu);
+       }
+       preempt_enable();
+}
+EXPORT_SYMBOL_GPL(fpu__save);
+
+/*
+ * Legacy x87 fpstate state init:
+ */
+static inline void fpstate_init_fstate(struct fregs_state *fp)
+{
+       fp->cwd = 0xffff037fu;
+       fp->swd = 0xffff0000u;
+       fp->twd = 0xffffffffu;
+       fp->fos = 0xffff0000u;
+}
+
+void fpstate_init(union fpregs_state *state)
+{
+       if (!cpu_has_fpu) {
+               fpstate_init_soft(&state->soft);
+               return;
+       }
+
+       memset(state, 0, xstate_size);
+
+       if (cpu_has_fxsr)
+               fpstate_init_fxstate(&state->fxsave);
+       else
+               fpstate_init_fstate(&state->fsave);
+}
+EXPORT_SYMBOL_GPL(fpstate_init);
+
+/*
+ * Copy the current task's FPU state to a new task's FPU context.
+ *
+ * In both the 'eager' and the 'lazy' case we save hardware registers
+ * directly to the destination buffer.
+ */
+static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu)
+{
+       WARN_ON_FPU(src_fpu != &current->thread.fpu);
+
+       /*
+        * Don't let 'init optimized' areas of the XSAVE area
+        * leak into the child task:
+        */
+       if (use_eager_fpu())
+               memset(&dst_fpu->state.xsave, 0, xstate_size);
+
+       /*
+        * Save current FPU registers directly into the child
+        * FPU context, without any memory-to-memory copying.
+        *
+        * If the FPU context got destroyed in the process (FNSAVE
+        * done on old CPUs) then copy it back into the source
+        * context and mark the current task for lazy restore.
+        *
+        * We have to do all this with preemption disabled,
+        * mostly because of the FNSAVE case, because in that
+        * case we must not allow preemption in the window
+        * between the FNSAVE and us marking the context lazy.
+        *
+        * It shouldn't be an issue as even FNSAVE is plenty
+        * fast in terms of critical section length.
+        */
+       preempt_disable();
+       if (!copy_fpregs_to_fpstate(dst_fpu)) {
+               memcpy(&src_fpu->state, &dst_fpu->state, xstate_size);
+               fpregs_deactivate(src_fpu);
+       }
+       preempt_enable();
+}
+
+int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
+{
+       dst_fpu->counter = 0;
+       dst_fpu->fpregs_active = 0;
+       dst_fpu->last_cpu = -1;
+
+       if (src_fpu->fpstate_active)
+               fpu_copy(dst_fpu, src_fpu);
+
+       return 0;
+}
+
+/*
+ * Activate the current task's in-memory FPU context,
+ * if it has not been used before:
+ */
+void fpu__activate_curr(struct fpu *fpu)
+{
+       WARN_ON_FPU(fpu != &current->thread.fpu);
+
+       if (!fpu->fpstate_active) {
+               fpstate_init(&fpu->state);
+
+               /* Safe to do for the current task: */
+               fpu->fpstate_active = 1;
+       }
+}
+EXPORT_SYMBOL_GPL(fpu__activate_curr);
+
+/*
+ * This function must be called before we read a task's fpstate.
+ *
+ * If the task has not used the FPU before then initialize its
+ * fpstate.
+ *
+ * If the task has used the FPU before then save it.
+ */
+void fpu__activate_fpstate_read(struct fpu *fpu)
+{
+       /*
+        * If fpregs are active (in the current CPU), then
+        * copy them to the fpstate:
+        */
+       if (fpu->fpregs_active) {
+               fpu__save(fpu);
+       } else {
+               if (!fpu->fpstate_active) {
+                       fpstate_init(&fpu->state);
+
+                       /* Safe to do for current and for stopped child tasks: */
+                       fpu->fpstate_active = 1;
+               }
+       }
+}
+
+/*
+ * This function must be called before we write a task's fpstate.
+ *
+ * If the task has used the FPU before then unlazy it.
+ * If the task has not used the FPU before then initialize its fpstate.
+ *
+ * After this function call, after registers in the fpstate are
+ * modified and the child task has woken up, the child task will
+ * restore the modified FPU state from the modified context. If we
+ * didn't clear its lazy status here then the lazy in-registers
+ * state pending on its former CPU could be restored, corrupting
+ * the modifications.
+ */
+void fpu__activate_fpstate_write(struct fpu *fpu)
+{
+       /*
+        * Only stopped child tasks can be used to modify the FPU
+        * state in the fpstate buffer:
+        */
+       WARN_ON_FPU(fpu == &current->thread.fpu);
+
+       if (fpu->fpstate_active) {
+               /* Invalidate any lazy state: */
+               fpu->last_cpu = -1;
+       } else {
+               fpstate_init(&fpu->state);
+
+               /* Safe to do for stopped child tasks: */
+               fpu->fpstate_active = 1;
+       }
+}
+
+/*
+ * 'fpu__restore()' is called to copy FPU registers from
+ * the FPU fpstate to the live hw registers and to activate
+ * access to the hardware registers, so that FPU instructions
+ * can be used afterwards.
+ *
+ * Must be called with kernel preemption disabled (for example
+ * with local interrupts disabled, as it is in the case of
+ * do_device_not_available()).
+ */
+void fpu__restore(struct fpu *fpu)
+{
+       fpu__activate_curr(fpu);
+
+       /* Avoid __kernel_fpu_begin() right after fpregs_activate() */
+       kernel_fpu_disable();
+       fpregs_activate(fpu);
+       copy_kernel_to_fpregs(&fpu->state);
+       fpu->counter++;
+       kernel_fpu_enable();
+}
+EXPORT_SYMBOL_GPL(fpu__restore);
+
+/*
+ * Drops current FPU state: deactivates the fpregs and
+ * the fpstate. NOTE: it still leaves previous contents
+ * in the fpregs in the eager-FPU case.
+ *
+ * This function can be used in cases where we know that
+ * a state-restore is coming: either an explicit one,
+ * or a reschedule.
+ */
+void fpu__drop(struct fpu *fpu)
+{
+       preempt_disable();
+       fpu->counter = 0;
+
+       if (fpu->fpregs_active) {
+               /* Ignore delayed exceptions from user space */
+               asm volatile("1: fwait\n"
+                            "2:\n"
+                            _ASM_EXTABLE(1b, 2b));
+               fpregs_deactivate(fpu);
+       }
+
+       fpu->fpstate_active = 0;
+
+       preempt_enable();
+}
+
+/*
+ * Clear FPU registers by setting them up from
+ * the init fpstate:
+ */
+static inline void copy_init_fpstate_to_fpregs(void)
+{
+       if (use_xsave())
+               copy_kernel_to_xregs(&init_fpstate.xsave, -1);
+       else
+               copy_kernel_to_fxregs(&init_fpstate.fxsave);
+}
+
+/*
+ * Clear the FPU state back to init state.
+ *
+ * Called by sys_execve(), by the signal handler code and by various
+ * error paths.
+ */
+void fpu__clear(struct fpu *fpu)
+{
+       WARN_ON_FPU(fpu != &current->thread.fpu); /* Almost certainly an anomaly */
+
+       if (!use_eager_fpu()) {
+               /* FPU state will be reallocated lazily at the first use. */
+               fpu__drop(fpu);
+       } else {
+               if (!fpu->fpstate_active) {
+                       fpu__activate_curr(fpu);
+                       user_fpu_begin();
+               }
+               copy_init_fpstate_to_fpregs();
+       }
+}
+
+/*
+ * x87 math exception handling:
+ */
+
+static inline unsigned short get_fpu_cwd(struct fpu *fpu)
+{
+       if (cpu_has_fxsr) {
+               return fpu->state.fxsave.cwd;
+       } else {
+               return (unsigned short)fpu->state.fsave.cwd;
+       }
+}
+
+static inline unsigned short get_fpu_swd(struct fpu *fpu)
+{
+       if (cpu_has_fxsr) {
+               return fpu->state.fxsave.swd;
+       } else {
+               return (unsigned short)fpu->state.fsave.swd;
+       }
+}
+
+static inline unsigned short get_fpu_mxcsr(struct fpu *fpu)
+{
+       if (cpu_has_xmm) {
+               return fpu->state.fxsave.mxcsr;
+       } else {
+               return MXCSR_DEFAULT;
+       }
+}
+
+int fpu__exception_code(struct fpu *fpu, int trap_nr)
+{
+       int err;
+
+       if (trap_nr == X86_TRAP_MF) {
+               unsigned short cwd, swd;
+               /*
+                * (~cwd & swd) will mask out exceptions that are not set to unmasked
+                * status.  0x3f is the exception bits in these regs, 0x200 is the
+                * C1 reg you need in case of a stack fault, 0x040 is the stack
+                * fault bit.  We should only be taking one exception at a time,
+                * so if this combination doesn't produce any single exception,
+                * then we have a bad program that isn't synchronizing its FPU usage
+                * and it will suffer the consequences since we won't be able to
+                * fully reproduce the context of the exception
+                */
+               cwd = get_fpu_cwd(fpu);
+               swd = get_fpu_swd(fpu);
+
+               err = swd & ~cwd;
+       } else {
+               /*
+                * The SIMD FPU exceptions are handled a little differently, as there
+                * is only a single status/control register.  Thus, to determine which
+                * unmasked exception was caught we must mask the exception mask bits
+                * at 0x1f80, and then use these to mask the exception bits at 0x3f.
+                */
+               unsigned short mxcsr = get_fpu_mxcsr(fpu);
+               err = ~(mxcsr >> 7) & mxcsr;
+       }
+
+       if (err & 0x001) {      /* Invalid op */
+               /*
+                * swd & 0x240 == 0x040: Stack Underflow
+                * swd & 0x240 == 0x240: Stack Overflow
+                * User must clear the SF bit (0x40) if set
+                */
+               return FPE_FLTINV;
+       } else if (err & 0x004) { /* Divide by Zero */
+               return FPE_FLTDIV;
+       } else if (err & 0x008) { /* Overflow */
+               return FPE_FLTOVF;
+       } else if (err & 0x012) { /* Denormal, Underflow */
+               return FPE_FLTUND;
+       } else if (err & 0x020) { /* Precision */
+               return FPE_FLTRES;
+       }
+
+       /*
+        * If we're using IRQ 13, or supposedly even some trap
+        * X86_TRAP_MF implementations, it's possible
+        * we get a spurious trap, which is not an error.
+        */
+       return 0;
+}
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c

new file mode 100644 (file)

index 0000000..fc878fe
--- /dev/null
+++ b/arch/x86/kernel/fpu/init.c
@@ -0,0 +1,354 @@
+/*
+ * x86 FPU boot time init code:
+ */
+#include <asm/fpu/internal.h>
+#include <asm/tlbflush.h>
+
+/*
+ * Initialize the TS bit in CR0 according to the style of context-switches
+ * we are using:
+ */
+static void fpu__init_cpu_ctx_switch(void)
+{
+       if (!cpu_has_eager_fpu)
+               stts();
+       else
+               clts();
+}
+
+/*
+ * Initialize the registers found in all CPUs, CR0 and CR4:
+ */
+static void fpu__init_cpu_generic(void)
+{
+       unsigned long cr0;
+       unsigned long cr4_mask = 0;
+
+       if (cpu_has_fxsr)
+               cr4_mask |= X86_CR4_OSFXSR;
+       if (cpu_has_xmm)
+               cr4_mask |= X86_CR4_OSXMMEXCPT;
+       if (cr4_mask)
+               cr4_set_bits(cr4_mask);
+
+       cr0 = read_cr0();
+       cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */
+       if (!cpu_has_fpu)
+               cr0 |= X86_CR0_EM;
+       write_cr0(cr0);
+
+       /* Flush out any pending x87 state: */
+       asm volatile ("fninit");
+}
+
+/*
+ * Enable all supported FPU features. Called when a CPU is brought online:
+ */
+void fpu__init_cpu(void)
+{
+       fpu__init_cpu_generic();
+       fpu__init_cpu_xstate();
+       fpu__init_cpu_ctx_switch();
+}
+
+/*
+ * The earliest FPU detection code.
+ *
+ * Set the X86_FEATURE_FPU CPU-capability bit based on
+ * trying to execute an actual sequence of FPU instructions:
+ */
+static void fpu__init_system_early_generic(struct cpuinfo_x86 *c)
+{
+       unsigned long cr0;
+       u16 fsw, fcw;
+
+       fsw = fcw = 0xffff;
+
+       cr0 = read_cr0();
+       cr0 &= ~(X86_CR0_TS | X86_CR0_EM);
+       write_cr0(cr0);
+
+       asm volatile("fninit ; fnstsw %0 ; fnstcw %1"
+                    : "+m" (fsw), "+m" (fcw));
+
+       if (fsw == 0 && (fcw & 0x103f) == 0x003f)
+               set_cpu_cap(c, X86_FEATURE_FPU);
+       else
+               clear_cpu_cap(c, X86_FEATURE_FPU);
+
+#ifndef CONFIG_MATH_EMULATION
+       if (!cpu_has_fpu) {
+               pr_emerg("x86/fpu: Giving up, no FPU found and no math emulation present\n");
+               for (;;)
+                       asm volatile("hlt");
+       }
+#endif
+}
+
+/*
+ * Boot time FPU feature detection code:
+ */
+unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
+
+static void __init fpu__init_system_mxcsr(void)
+{
+       unsigned int mask = 0;
+
+       if (cpu_has_fxsr) {
+               struct fxregs_state fx_tmp __aligned(32) = { };
+
+               asm volatile("fxsave %0" : "+m" (fx_tmp));
+
+               mask = fx_tmp.mxcsr_mask;
+
+               /*
+                * If zero then use the default features mask,
+                * which has all features set, except the
+                * denormals-are-zero feature bit:
+                */
+               if (mask == 0)
+                       mask = 0x0000ffbf;
+       }
+       mxcsr_feature_mask &= mask;
+}
+
+/*
+ * Once per bootup FPU initialization sequences that will run on most x86 CPUs:
+ */
+static void __init fpu__init_system_generic(void)
+{
+       /*
+        * Set up the legacy init FPU context. (xstate init might overwrite this
+        * with a more modern format, if the CPU supports it.)
+        */
+       fpstate_init_fxstate(&init_fpstate.fxsave);
+
+       fpu__init_system_mxcsr();
+}
+
+/*
+ * Size of the FPU context state. All tasks in the system use the
+ * same context size, regardless of what portion they use.
+ * This is inherent to the XSAVE architecture which puts all state
+ * components into a single, contiguous memory block:
+ */
+unsigned int xstate_size;
+EXPORT_SYMBOL_GPL(xstate_size);
+
+/*
+ * Set up the xstate_size based on the legacy FPU context size.
+ *
+ * We set this up first, and later it will be overwritten by
+ * fpu__init_system_xstate() if the CPU knows about xstates.
+ */
+static void __init fpu__init_system_xstate_size_legacy(void)
+{
+       static int on_boot_cpu = 1;
+
+       WARN_ON_FPU(!on_boot_cpu);
+       on_boot_cpu = 0;
+
+       /*
+        * Note that xstate_size might be overwritten later during
+        * fpu__init_system_xstate().
+        */
+
+       if (!cpu_has_fpu) {
+               /*
+                * Disable xsave as we do not support it if i387
+                * emulation is enabled.
+                */
+               setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+               setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+               xstate_size = sizeof(struct swregs_state);
+       } else {
+               if (cpu_has_fxsr)
+                       xstate_size = sizeof(struct fxregs_state);
+               else
+                       xstate_size = sizeof(struct fregs_state);
+       }
+       /*
+        * Quirk: we don't yet handle the XSAVES* instructions
+        * correctly, as we don't correctly convert between
+        * standard and compacted format when interfacing
+        * with user-space - so disable it for now.
+        *
+        * The difference is small: with recent CPUs the
+        * compacted format is only marginally smaller than
+        * the standard FPU state format.
+        *
+        * ( This is easy to backport while we are fixing
+        *   XSAVES* support. )
+        */
+       setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+}
+
+/*
+ * FPU context switching strategies:
+ *
+ * Contrary to popular belief, we don't do lazy FPU saves, due to the
+ * task migration complications it brings on SMP - we only do
+ * lazy FPU restores.
+ *
+ * 'lazy' is the traditional strategy, which is based on setting
+ * CR0::TS to 1 during context-switch (instead of doing a full
+ * restore of the FPU state), which causes the first FPU instruction
+ * after the context switch (whenever it is executed) to fault - at
+ * which point we lazily restore the FPU state into FPU registers.
+ *
+ * Tasks are of course under no obligation to execute FPU instructions,
+ * so it can easily happen that another context-switch occurs without
+ * a single FPU instruction being executed. If we eventually switch
+ * back to the original task (that still owns the FPU) then we have
+ * not only saved the restores along the way, but we also have the
+ * FPU ready to be used for the original task.
+ *
+ * 'eager' switching is used on modern CPUs, there we switch the FPU
+ * state during every context switch, regardless of whether the task
+ * has used FPU instructions in that time slice or not. This is done
+ * because modern FPU context saving instructions are able to optimize
+ * state saving and restoration in hardware: they can detect both
+ * unused and untouched FPU state and optimize accordingly.
+ *
+ * [ Note that even in 'lazy' mode we might optimize context switches
+ *   to use 'eager' restores, if we detect that a task is using the FPU
+ *   frequently. See the fpu->counter logic in fpu/internal.h for that. ]
+ */
+static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO;
+
+/*
+ * Parse the "eagerfpu=" boot parameter: "on", "off" or "auto".
+ * Any other value leaves the current setting untouched. Always returns 1.
+ */
+static int __init eager_fpu_setup(char *s)
+{
+       if (strcmp(s, "on") == 0) {
+               eagerfpu = ENABLE;
+       } else if (strcmp(s, "off") == 0) {
+               eagerfpu = DISABLE;
+       } else if (strcmp(s, "auto") == 0) {
+               eagerfpu = AUTO;
+       }
+
+       return 1;
+}
+__setup("eagerfpu=", eager_fpu_setup);
+
+/*
+ * Pick the FPU context switching strategy:
+ */
+static void __init fpu__init_system_ctx_switch(void)
+{
+       static bool on_boot_cpu = true;
+
+       /* This must run exactly once: */
+       WARN_ON_FPU(!on_boot_cpu);
+       on_boot_cpu = false;
+
+       /* The task running this must not have an active FPU context: */
+       WARN_ON_FPU(current->thread.fpu.fpstate_active);
+       current_thread_info()->status = 0;
+
+       /* Auto enable eagerfpu for xsaveopt */
+       if (cpu_has_xsaveopt && eagerfpu != DISABLE)
+               eagerfpu = ENABLE;
+
+       /*
+        * Features in XSTATE_EAGER go together with eager switching:
+        * either eager mode gets enabled, or - if eagerfpu=off was
+        * given - those features are removed from xfeatures_mask.
+        */
+       if (xfeatures_mask & XSTATE_EAGER) {
+               if (eagerfpu == DISABLE) {
+                       pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n",
+                              xfeatures_mask & XSTATE_EAGER);
+                       xfeatures_mask &= ~XSTATE_EAGER;
+               } else {
+                       eagerfpu = ENABLE;
+               }
+       }
+
+       if (eagerfpu == ENABLE)
+               setup_force_cpu_cap(X86_FEATURE_EAGER_FPU);
+
+       /* Same pr_*() family as the error message above: */
+       pr_info("x86/fpu: Using '%s' FPU context switches.\n",
+               eagerfpu == ENABLE ? "eager" : "lazy");
+}
+
+/*
+ * Called on the boot CPU once per system bootup, to set up the initial
+ * FPU state that is later cloned into all processes:
+ */
+void __init fpu__init_system(struct cpuinfo_x86 *c)
+{
+       fpu__init_system_early_generic(c);
+
+       /*
+        * The FPU has to be operational for some of the
+        * later FPU init activities:
+        */
+       fpu__init_cpu();
+
+       /*
+        * But don't leave CR0::TS set yet, as some of the FPU setup
+        * methods depend on being able to execute FPU instructions
+        * that will fault on a set TS, such as the FXSAVE in
+        * fpu__init_system_mxcsr().
+        */
+       clts();
+
+       /*
+        * Order matters here: the legacy xstate_size has to be set up
+        * before fpu__init_system_xstate(), which may overwrite it.
+        */
+       fpu__init_system_generic();
+       fpu__init_system_xstate_size_legacy();
+       fpu__init_system_xstate();
+
+       /* Finally pick the context switching strategy (this consults xfeatures_mask): */
+       fpu__init_system_ctx_switch();
+}
+
+/*
+ * Boot parameter to turn off FPU support and fall back to math-emu:
+ *
+ * "no387" clears X86_FEATURE_FPU. The argument string is ignored and
+ * the handler always returns 1 (presumably "parameter handled", per
+ * the __setup() convention).
+ */
+static int __init no_387(char *s)
+{
+       setup_clear_cpu_cap(X86_FEATURE_FPU);
+       return 1;
+}
+__setup("no387", no_387);
+
+/*
+ * Disable all xstate CPU features:
+ *
+ * "noxsave" clears the XSAVE, XSAVEOPT and XSAVES feature bits, plus
+ * the AVX and AVX2 feature bits.
+ */
+static int __init x86_noxsave_setup(char *s)
+{
+       /*
+        * Only accept a bare "noxsave": if anything follows the keyword,
+        * return 0 and change nothing.
+        *
+        * NOTE(review): presumably this is needed because "noxsaveopt"
+        * and "noxsaves" share the "noxsave" prefix and have their own
+        * handlers - confirm against the __setup() matching rules.
+        */
+       if (strlen(s))
+               return 0;
+
+       setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+       setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+       setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+       setup_clear_cpu_cap(X86_FEATURE_AVX);
+       setup_clear_cpu_cap(X86_FEATURE_AVX2);
+
+       return 1;
+}
+__setup("noxsave", x86_noxsave_setup);
+
+/*
+ * Disable the XSAVEOPT instruction specifically:
+ *
+ * "noxsaveopt" clears only X86_FEATURE_XSAVEOPT; the argument string is
+ * ignored and the handler always returns 1.
+ */
+static int __init x86_noxsaveopt_setup(char *s)
+{
+       setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+
+       return 1;
+}
+__setup("noxsaveopt", x86_noxsaveopt_setup);
+
+/*
+ * Disable the XSAVES instruction:
+ *
+ * "noxsaves" clears only X86_FEATURE_XSAVES; the argument string is
+ * ignored and the handler always returns 1.
+ */
+static int __init x86_noxsaves_setup(char *s)
+{
+       setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+
+       return 1;
+}
+__setup("noxsaves", x86_noxsaves_setup);
+
+/*
+ * Disable FX save/restore and SSE support:
+ *
+ * "nofxsr" clears the FXSR, FXSR_OPT and XMM feature bits; the argument
+ * string is ignored and the handler always returns 1.
+ */
+static int __init x86_nofxsr_setup(char *s)
+{
+       setup_clear_cpu_cap(X86_FEATURE_FXSR);
+       setup_clear_cpu_cap(X86_FEATURE_FXSR_OPT);
+       setup_clear_cpu_cap(X86_FEATURE_XMM);
+
+       return 1;
+}
+__setup("nofxsr", x86_nofxsr_setup);
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c

new file mode 100644 (file)

index 0000000..dc60810
--- /dev/null
+++ b/arch/x86/kernel/fpu/regset.c
@@ -0,0 +1,356 @@
+/*
+ * FPU register's regset abstraction, for ptrace, core dumps, etc.
+ */
+#include <asm/fpu/internal.h>
+#include <asm/fpu/signal.h>
+#include <asm/fpu/regset.h>
+
+/*
+ * The xstateregs_active() routine is the same as the regset_fpregs_active() routine,
+ * as the "regset->n" for the xstate regset will be updated based on the feature
+ * capabilities supported by the xsave.
+ */
+int regset_fpregs_active(struct task_struct *target, const struct user_regset *regset)
+{
+       /* A task without an active FPU context has nothing to report: */
+       if (!target->thread.fpu.fpstate_active)
+               return 0;
+
+       return regset->n;
+}
+
+int regset_xregset_fpregs_active(struct task_struct *target, const struct user_regset *regset)
+{
+       /* Report nothing without FXSR support or without an active FPU context: */
+       if (!cpu_has_fxsr || !target->thread.fpu.fpstate_active)
+               return 0;
+
+       return regset->n;
+}
+
+/*
+ * Copy the target's FXSAVE-format state (fpu->state.fxsave) out to the
+ * buffer described by 'kbuf'/'ubuf'.
+ *
+ * Returns -ENODEV if the CPU has no FXSR support, otherwise the result
+ * of user_regset_copyout().
+ */
+int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
+               unsigned int pos, unsigned int count,
+               void *kbuf, void __user *ubuf)
+{
+       struct fpu *fpu = &target->thread.fpu;
+
+       if (!cpu_has_fxsr)
+               return -ENODEV;
+
+       /* Prepare the in-memory state for reading and sanitize it before the copy: */
+       fpu__activate_fpstate_read(fpu);
+       fpstate_sanitize_xstate(fpu);
+
+       return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+                                  &fpu->state.fxsave, 0, -1);
+}
+
+/*
+ * Overwrite the target's FXSAVE-format state (fpu->state.fxsave) from
+ * the buffer described by 'kbuf'/'ubuf'.
+ *
+ * Returns -ENODEV if the CPU has no FXSR support, otherwise the result
+ * of user_regset_copyin(). Note that the MXCSR masking and the xsave
+ * header update below are done regardless of that result.
+ */
+int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
+               unsigned int pos, unsigned int count,
+               const void *kbuf, const void __user *ubuf)
+{
+       struct fpu *fpu = &target->thread.fpu;
+       int ret;
+
+       if (!cpu_has_fxsr)
+               return -ENODEV;
+
+       fpu__activate_fpstate_write(fpu);
+       fpstate_sanitize_xstate(fpu);
+
+       ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+                                &fpu->state.fxsave, 0, -1);
+
+       /*
+        * mxcsr reserved bits must be masked to zero for security reasons.
+        */
+       fpu->state.fxsave.mxcsr &= mxcsr_feature_mask;
+
+       /*
+        * update the header bits in the xsave header, indicating the
+        * presence of FP and SSE state.
+        */
+       if (cpu_has_xsave)
+               fpu->state.xsave.header.xfeatures |= XSTATE_FPSSE;
+
+       return ret;
+}
+
+/*
+ * Copy the target's XSAVE-format state out to the buffer described by
+ * 'kbuf'/'ubuf'.
+ *
+ * Returns -ENODEV if the CPU has no XSAVE support, otherwise the result
+ * of user_regset_copyout().
+ */
+int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
+               unsigned int pos, unsigned int count,
+               void *kbuf, void __user *ubuf)
+{
+       struct fpu *fpu = &target->thread.fpu;
+       struct xregs_state *xstate = &fpu->state.xsave;
+
+       if (!cpu_has_xsave)
+               return -ENODEV;
+
+       fpu__activate_fpstate_read(fpu);
+
+       /*
+        * Put the 48 software-defined bytes into the xstate memory
+        * layout in the thread struct first: that way the whole
+        * xstateregs can go out with a single user_regset_copyout().
+        */
+       memcpy(&xstate->i387.sw_reserved, xstate_fx_sw_bytes,
+              sizeof(xstate_fx_sw_bytes));
+
+       /* Now copy out the xstate memory layout: */
+       return user_regset_copyout(&pos, &count, &kbuf, &ubuf, xstate, 0, -1);
+}
+
+/*
+ * Overwrite the target's XSAVE-format state from the buffer described
+ * by 'kbuf'/'ubuf'.
+ *
+ * Returns -ENODEV if the CPU has no XSAVE support, otherwise the result
+ * of user_regset_copyin(). The sanitizing steps after the copy are done
+ * regardless of that result.
+ */
+int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
+                 unsigned int pos, unsigned int count,
+                 const void *kbuf, const void __user *ubuf)
+{
+       struct fpu *fpu = &target->thread.fpu;
+       struct xregs_state *xsave;
+       int ret;
+
+       if (!cpu_has_xsave)
+               return -ENODEV;
+
+       fpu__activate_fpstate_write(fpu);
+
+       xsave = &fpu->state.xsave;
+
+       ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
+       /*
+        * mxcsr reserved bits must be masked to zero for security reasons.
+        */
+       xsave->i387.mxcsr &= mxcsr_feature_mask;
+       /* Drop any feature bits that are not in xfeatures_mask: */
+       xsave->header.xfeatures &= xfeatures_mask;
+       /*
+        * These bits must be zero. (Size taken from the field itself
+        * instead of a hard-coded byte count.)
+        */
+       memset(&xsave->header.reserved, 0, sizeof(xsave->header.reserved));
+
+       return ret;
+}
+
+#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+
+/*
+ * FPU tag word conversions.
+ */
+
+/*
+ * Convert a tag word with two bits per register into a tag byte with
+ * one bit per register: 1 for any value other than binary 11 (empty),
+ * 0 for empty.
+ */
+static inline unsigned short twd_i387_to_fxsr(unsigned short twd)
+{
+       /* 32-bit working copy, to avoid 16 bit prefixes in the code */
+       unsigned int bits = ~twd;
+
+       /* Turn each pair of bits into 01 (valid) or 00 (empty): */
+       bits |= bits >> 1;
+       bits &= 0x5555;                 /* 0V0V0V0V0V0V0V0V */
+
+       /* Then squeeze the valid bits together into the lower byte: */
+       bits |= bits >> 1;
+       bits &= 0x3333;                 /* 00VV00VV00VV00VV */
+       bits |= bits >> 2;
+       bits &= 0x0f0f;                 /* 0000VVVV0000VVVV */
+       bits |= bits >> 4;
+       bits &= 0x00ff;                 /* 00000000VVVVVVVV */
+
+       return bits;
+}
+
+/* Address of the n-th register slot in (f)->st_space; slots are 16 bytes apart. */
+#define FPREG_ADDR(f, n)       ((void *)&(f)->st_space + (n) * 16)
+/* Two-bit per-register tag values, as produced by twd_fxsr_to_i387(): */
+#define FP_EXP_TAG_VALID       0
+#define FP_EXP_TAG_ZERO                1
+#define FP_EXP_TAG_SPECIAL     2
+#define FP_EXP_TAG_EMPTY       3
+
+/*
+ * Rebuild a tag word with two bits per register from the one-bit-per-register
+ * tag in 'fxsave->twd', by inspecting the contents of every register
+ * that is marked as in use.
+ *
+ * The upper 16 bits of the returned value are all ones.
+ */
+static inline u32 twd_fxsr_to_i387(struct fxregs_state *fxsave)
+{
+       struct _fpxreg *st;
+       /* Bits 11-13 of the status word: */
+       u32 tos = (fxsave->swd >> 11) & 7;
+       u32 twd = (unsigned long) fxsave->twd;
+       u32 tag;
+       u32 ret = 0xffff0000u;
+       int i;
+
+       /* One iteration per register, consuming one bit of 'twd' each time: */
+       for (i = 0; i < 8; i++, twd >>= 1) {
+               if (twd & 0x1) {
+                       /* The tag bit index is rotated by 'tos' to find the st_space slot: */
+                       st = FPREG_ADDR(fxsave, (i - tos) & 7);
+
+                       /* Classify the value by its exponent field (sign bit masked off): */
+                       switch (st->exponent & 0x7fff) {
+                       case 0x7fff:
+                               tag = FP_EXP_TAG_SPECIAL;
+                               break;
+                       case 0x0000:
+                               /* Zero exponent: a true zero only if the significand is all zeroes. */
+                               if (!st->significand[0] &&
+                                   !st->significand[1] &&
+                                   !st->significand[2] &&
+                                   !st->significand[3])
+                                       tag = FP_EXP_TAG_ZERO;
+                               else
+                                       tag = FP_EXP_TAG_SPECIAL;
+                               break;
+                       default:
+                               /* Valid only if the top bit of the significand is set: */
+                               if (st->significand[3] & 0x8000)
+                                       tag = FP_EXP_TAG_VALID;
+                               else
+                                       tag = FP_EXP_TAG_SPECIAL;
+                               break;
+                       }
+               } else {
+                       tag = FP_EXP_TAG_EMPTY;
+               }
+               /* Two tag bits per register: */
+               ret |= tag << (2 * i);
+       }
+       return ret;
+}
+
+/*
+ * FXSR floating point environment conversions.
+ */
+
+/*
+ * Fill the i387-style environment 'env' from the FXSAVE-format state
+ * saved in 'tsk'. Only tsk->thread.fpu.state.fxsave is read (plus the
+ * task's cs/ds on 64-bit), nothing in the task is modified.
+ */
+void
+convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
+{
+       struct fxregs_state *fxsave = &tsk->thread.fpu.state.fxsave;
+       struct _fpreg *to = (struct _fpreg *) &env->st_space[0];
+       struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0];
+       int i;
+
+       /* Control and status words get their upper 16 bits set to ones: */
+       env->cwd = fxsave->cwd | 0xffff0000u;
+       env->swd = fxsave->swd | 0xffff0000u;
+       env->twd = twd_fxsr_to_i387(fxsave);
+
+#ifdef CONFIG_X86_64
+       env->fip = fxsave->rip;
+       env->foo = fxsave->rdp;
+       /*
+        * should be actually ds/cs at fpu exception time, but
+        * that information is not available in 64bit mode.
+        */
+       env->fcs = task_pt_regs(tsk)->cs;
+       /* Live DS for the current task, the saved thread.ds otherwise: */
+       if (tsk == current) {
+               savesegment(ds, env->fos);
+       } else {
+               env->fos = tsk->thread.ds;
+       }
+       env->fos |= 0xffff0000;
+#else
+       env->fip = fxsave->fip;
+       /* fcs carries the code selector in its low half and fop in its high half: */
+       env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16);
+       env->foo = fxsave->foo;
+       env->fos = fxsave->fos;
+#endif
+
+       /* Copy the 8 registers; only sizeof(struct _fpreg) bytes of each source slot: */
+       for (i = 0; i < 8; ++i)
+               memcpy(&to[i], &from[i], sizeof(to[0]));
+}
+
+/*
+ * The reverse of convert_from_fxsr(): update the FXSAVE-format state
+ * saved in 'tsk' from the i387-style environment 'env'.
+ */
+void convert_to_fxsr(struct task_struct *tsk,
+                    const struct user_i387_ia32_struct *env)
+
+{
+       struct fxregs_state *fxsave = &tsk->thread.fpu.state.fxsave;
+       struct _fpreg *from = (struct _fpreg *) &env->st_space[0];
+       struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0];
+       int i;
+
+       fxsave->cwd = env->cwd;
+       fxsave->swd = env->swd;
+       fxsave->twd = twd_i387_to_fxsr(env->twd);
+       /* fop is taken from the high half of env->fcs: */
+       fxsave->fop = (u16) ((u32) env->fcs >> 16);
+#ifdef CONFIG_X86_64
+       fxsave->rip = env->fip;
+       fxsave->rdp = env->foo;
+       /* cs and ds ignored */
+#else
+       fxsave->fip = env->fip;
+       fxsave->fcs = (env->fcs & 0xffff);
+       fxsave->foo = env->foo;
+       fxsave->fos = env->fos;
+#endif
+
+       /* Copy the 8 registers; only sizeof(struct _fpreg) bytes of each slot are written: */
+       for (i = 0; i < 8; ++i)
+               memcpy(&to[i], &from[i], sizeof(from[0]));
+}
+
+/*
+ * Copy the target's FPU state out in i387 (struct user_i387_ia32_struct)
+ * layout, to the buffer described by 'kbuf'/'ubuf'.
+ *
+ * Three cases:
+ *  - no FPU feature: handed over to fpregs_soft_get()
+ *  - no FXSR:        the saved fsave image is copied out as-is
+ *  - FXSR:           the fxsave image is converted with convert_from_fxsr()
+ */
+int fpregs_get(struct task_struct *target, const struct user_regset *regset,
+              unsigned int pos, unsigned int count,
+              void *kbuf, void __user *ubuf)
+{
+       struct fpu *fpu = &target->thread.fpu;
+       struct user_i387_ia32_struct env;
+
+       fpu__activate_fpstate_read(fpu);
+
+       if (!static_cpu_has(X86_FEATURE_FPU))
+               return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf);
+
+       if (!cpu_has_fxsr)
+               return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+                                          &fpu->state.fsave, 0,
+                                          -1);
+
+       fpstate_sanitize_xstate(fpu);
+
+       /*
+        * Fast path: a request for the whole structure into a kernel
+        * buffer is converted in place, without the temporary 'env'.
+        */
+       if (kbuf && pos == 0 && count == sizeof(env)) {
+               convert_from_fxsr(kbuf, target);
+               return 0;
+       }
+
+       /* Otherwise convert into the on-stack copy and copy the requested part out: */
+       convert_from_fxsr(&env, target);
+
+       return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &env, 0, -1);
+}
+
+/*
+ * Overwrite the target's FPU state from a buffer in i387
+ * (struct user_i387_ia32_struct) layout, described by 'kbuf'/'ubuf'.
+ *
+ * Three cases:
+ *  - no FPU feature: handed over to fpregs_soft_set()
+ *  - no FXSR:        copied straight into the saved fsave image
+ *  - FXSR:           copied into a temporary and converted with
+ *                    convert_to_fxsr() if the copy succeeded
+ *
+ * Returns the result of the respective copy/soft-set helper.
+ */
+int fpregs_set(struct task_struct *target, const struct user_regset *regset,
+              unsigned int pos, unsigned int count,
+              const void *kbuf, const void __user *ubuf)
+{
+       struct fpu *fpu = &target->thread.fpu;
+       struct user_i387_ia32_struct env;
+       int ret;
+
+       fpu__activate_fpstate_write(fpu);
+       fpstate_sanitize_xstate(fpu);
+
+       if (!static_cpu_has(X86_FEATURE_FPU))
+               return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
+
+       if (!cpu_has_fxsr)
+               return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+                                         &fpu->state.fsave, 0,
+                                         -1);
+
+       /*
+        * A partial write only replaces part of 'env', so pre-fill it
+        * with the current state in that case:
+        */
+       if (pos > 0 || count < sizeof(env))
+               convert_from_fxsr(&env, target);
+
+       ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1);
+       if (!ret)
+               convert_to_fxsr(target, &env);
+
+       /*
+        * update the header bit in the xsave header, indicating the
+        * presence of FP.
+        *
+        * (Note that this is done even if the copy above failed.)
+        */
+       if (cpu_has_xsave)
+               fpu->state.xsave.header.xfeatures |= XSTATE_FP;
+       return ret;
+}
+
+/*
+ * FPU state for core dumps.
+ * This is only used for a.out dumps now.
+ * It is declared generically using elf_fpregset_t (which is
+ * struct user_i387_struct) but is in fact only used for 32-bit
+ * dumps, so on 64-bit it is really struct user_i387_ia32_struct.
+ */
+int dump_fpu(struct pt_regs *regs, struct user_i387_struct *ufpu)
+{
+       struct task_struct *tsk = current;
+
+       /* No active FPU context: nothing was dumped. */
+       if (!tsk->thread.fpu.fpstate_active)
+               return 0;
+
+       /* Otherwise the dump is valid if fpregs_get() reported no error: */
+       return !fpregs_get(tsk, NULL,
+                          0, sizeof(struct user_i387_ia32_struct),
+                          ufpu, NULL);
+}
+EXPORT_SYMBOL(dump_fpu);
+
+#endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c

new file mode 100644 (file)

index 0000000..50ec9af
--- /dev/null
+++ b/arch/x86/kernel/fpu/signal.c
@@ -0,0 +1,404 @@
+/*
+ * FPU signal frame handling routines.
+ */
+
+#include <linux/compat.h>
+#include <linux/cpu.h>
+
+#include <asm/fpu/internal.h>
+#include <asm/fpu/signal.h>
+#include <asm/fpu/regset.h>
+
+#include <asm/sigframe.h>
+
+static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32;
+
+/*
+ * Check for the presence of extended state information in the
+ * user fpstate pointer in the sigcontext.
+ *
+ * The software-reserved bytes of the user's fxsave image are copied
+ * into '*fx_sw' and validated. Returns 0 if they describe a plausible
+ * extended state area that is terminated by FP_XSTATE_MAGIC2, and -1 if
+ * not (or if reading user memory failed).
+ */
+static inline int check_for_xstate(struct fxregs_state __user *buf,
+                                  void __user *fpstate,
+                                  struct _fpx_sw_bytes *fx_sw)
+{
+       /* Smallest possible xstate: the legacy area plus the xstate header. */
+       int min_xstate_size = sizeof(struct fxregs_state) +
+                             sizeof(struct xstate_header);
+       unsigned int magic2;
+
+       if (__copy_from_user(fx_sw, &buf->sw_reserved[0], sizeof(*fx_sw)))
+               return -1;
+
+       /* Check for the first magic field and other error scenarios. */
+       if (fx_sw->magic1 != FP_XSTATE_MAGIC1 ||
+           fx_sw->xstate_size < min_xstate_size ||
+           fx_sw->xstate_size > xstate_size ||
+           fx_sw->xstate_size > fx_sw->extended_size)
+               return -1;
+
+       /*
+        * Check for the presence of second magic word at the end of memory
+        * layout. This detects the case where the user just copied the legacy
+        * fpstate layout without copying the extended state information
+        * in the memory layout.
+        */
+       if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size))
+           || magic2 != FP_XSTATE_MAGIC2)
+               return -1;
+
+       return 0;
+}
+
+/*
+ * Signal frame handlers.
+ */
+/*
+ * Write the fsave-format header of a signal frame at 'buf'.
+ *
+ * Returns 0 on success and -1 if accessing the user frame failed.
+ */
+static inline int save_fsave_header(struct task_struct *tsk, void __user *buf)
+{
+       if (use_fxsr()) {
+               struct xregs_state *xsave = &tsk->thread.fpu.state.xsave;
+               struct user_i387_ia32_struct env;
+               struct _fpstate_ia32 __user *fp = buf;
+
+               /* Build the i387 environment from the task's saved fxsave state: */
+               convert_from_fxsr(&env, tsk);
+
+               if (__copy_to_user(buf, &env, sizeof(env)) ||
+                   __put_user(xsave->i387.swd, &fp->status) ||
+                   __put_user(X86_FXSR_MAGIC, &fp->magic))
+                       return -1;
+       } else {
+               struct fregs_state __user *fp = buf;
+               u32 swd;
+               /* The frame already holds the fsave image: duplicate its swd into ->status. */
+               if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status))
+                       return -1;
+       }
+
+       return 0;
+}
+
+/*
+ * Finish an [f]xsave image that was written to the user frame at 'buf':
+ * fill in the software-reserved bytes and, if xsave is in use, append
+ * FP_XSTATE_MAGIC2 and force the FP/SSE bits in the header's xfeatures.
+ *
+ * 'ia32_frame' selects the ia32 variant of the software-reserved bytes.
+ *
+ * Returns 0 on success, non-zero if any user access failed.
+ */
+static inline int save_xstate_epilog(void __user *buf, int ia32_frame)
+{
+       struct xregs_state __user *x = buf;
+       struct _fpx_sw_bytes *sw_bytes;
+       u32 xfeatures;
+       int err;
+
+       /* Setup the bytes not touched by the [f]xsave and reserved for SW. */
+       sw_bytes = ia32_frame ? &fx_sw_reserved_ia32 : &fx_sw_reserved;
+       err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes));
+
+       if (!use_xsave())
+               return err;
+
+       /*
+        * The casts below keep the __user address space of the
+        * pointers, like the one in check_for_xstate() does.
+        */
+       err |= __put_user(FP_XSTATE_MAGIC2, (__u32 __user *)(buf + xstate_size));
+
+       /*
+        * Read the xfeatures which we copied (directly from the cpu or
+        * from the state in task struct) to the user buffers.
+        *
+        * Only 32 bits are read and written back here; the FP/SSE
+        * bits that get set below are among them.
+        */
+       err |= __get_user(xfeatures, (__u32 __user *)&x->header.xfeatures);
+
+       /*
+        * For legacy compatibility, we always set the FP/SSE bits in the
+        * bit vector while saving the state to the user context. This
+        * enables us to capture any changes (during sigreturn) to
+        * the FP/SSE bits by the legacy applications which don't touch
+        * xfeatures in the xsave header.
+        *
+        * xsave aware apps can change the xfeatures in the xsave
+        * header as well as change any contents in the memory layout.
+        * xrestore as part of sigreturn will capture all the changes.
+        */
+       xfeatures |= XSTATE_FPSSE;
+
+       err |= __put_user(xfeatures, (__u32 __user *)&x->header.xfeatures);
+
+       return err;
+}
+
+/*
+ * Save the FPU registers to the user buffer 'buf', using the most
+ * capable format available: xsave, then fxsave, then fsave.
+ *
+ * Returns 0 on success. On failure the buffer is cleared
+ * (xstate_size bytes) and a non-zero error is returned.
+ */
+static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
+{
+       int err;
+
+       if (use_xsave())
+               err = copy_xregs_to_user(buf);
+       else if (use_fxsr())
+               err = copy_fxregs_to_user((struct fxregs_state __user *) buf);
+       else
+               err = copy_fregs_to_user((struct fregs_state __user *) buf);
+
+       /* Do not leave a partial image behind; report -EFAULT if even the clearing fails. */
+       if (unlikely(err) && __clear_user(buf, xstate_size))
+               err = -EFAULT;
+       return err;
+}
+
+/*
+ * Save the fpu, extended register state to the user signal frame.
+ *
+ * 'buf_fx' is the 64-byte aligned pointer at which the [f|fx|x]save
+ *  state is copied.
+ *  'buf' points to the 'buf_fx' or to the fsave header followed by 'buf_fx'.
+ *
+ *     buf == buf_fx for 64-bit frames and 32-bit fsave frame.
+ *     buf != buf_fx for 32-bit frames with fxstate.
+ *
+ * If the fpu, extended register state is live, save the state directly
+ * to the user frame pointed to by the aligned pointer 'buf_fx'. Otherwise,
+ * copy the thread's fpu state to the user frame starting at 'buf_fx'.
+ *
+ * If this is a 32-bit frame with fxstate, put a fsave header before
+ * the aligned state at 'buf_fx'.
+ *
+ * For [f]xsave state, update the SW reserved fields in the [f]xsave frame
+ * indicating the absence/presence of the extended state to the user.
+ */
+int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
+{
+       /*
+        * Return values: -EACCES if the frame is not writable, 0 on
+        * success and -1 on failure. Without the FPU feature the result
+        * of fpregs_soft_get() is mapped to 1 (success) or -1 (failure).
+        */
+       struct xregs_state *xsave = &current->thread.fpu.state.xsave;
+       struct task_struct *tsk = current;
+       int ia32_fxstate = (buf != buf_fx);
+
+       /* A separate fsave header can only exist if 32-bit frames are supported at all: */
+       ia32_fxstate &= (config_enabled(CONFIG_X86_32) ||
+                        config_enabled(CONFIG_IA32_EMULATION));
+
+       if (!access_ok(VERIFY_WRITE, buf, size))
+               return -EACCES;
+
+       if (!static_cpu_has(X86_FEATURE_FPU))
+               return fpregs_soft_get(current, NULL, 0,
+                       sizeof(struct user_i387_ia32_struct), NULL,
+                       (struct _fpstate_ia32 __user *) buf) ? -1 : 1;
+
+       if (fpregs_active()) {
+               /* Save the live register state to the user directly. */
+               if (copy_fpregs_to_sigframe(buf_fx))
+                       return -1;
+               /* Update the thread's fxstate to save the fsave header. */
+               if (ia32_fxstate)
+                       copy_fxregs_to_kernel(&tsk->thread.fpu);
+       } else {
+               /* Registers are not live: copy the (sanitized) in-memory state instead. */
+               fpstate_sanitize_xstate(&tsk->thread.fpu);
+               if (__copy_to_user(buf_fx, xsave, xstate_size))
+                       return -1;
+       }
+
+       /* Save the fsave header for the 32-bit frames. */
+       if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf))
+               return -1;
+
+       /* [f]xsave frames also get their software-reserved bytes filled in: */
+       if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate))
+               return -1;
+
+       return 0;
+}
+
+/*
+ * Sanitize FPU state that was just copied from a user signal frame
+ * into tsk's in-memory fpstate:
+ *
+ *  - with xsave: zero the reserved header bytes and restrict the
+ *    header's xfeatures (to FP/SSE only if 'fx_only' is set, otherwise
+ *    to the bits present in both xfeatures_mask and 'xfeatures')
+ *  - with fxsr: mask the reserved MXCSR bits and rebuild the fxsave
+ *    fields from the i387 environment 'ia32_env'
+ */
+static inline void
+sanitize_restored_xstate(struct task_struct *tsk,
+                        struct user_i387_ia32_struct *ia32_env,
+                        u64 xfeatures, int fx_only)
+{
+       struct xregs_state *xsave = &tsk->thread.fpu.state.xsave;
+       struct xstate_header *header = &xsave->header;
+
+       if (use_xsave()) {
+               /*
+                * These bits must be zero. (Size taken from the field
+                * itself instead of a hard-coded byte count.)
+                */
+               memset(header->reserved, 0, sizeof(header->reserved));
+
+               /*
+                * Init the state that is not present in the memory
+                * layout and not enabled by the OS.
+                */
+               if (fx_only)
+                       header->xfeatures = XSTATE_FPSSE;
+               else
+                       header->xfeatures &= (xfeatures_mask & xfeatures);
+       }
+
+       if (use_fxsr()) {
+               /*
+                * mxcsr reserved bits must be masked to zero for security
+                * reasons.
+                */
+               xsave->i387.mxcsr &= mxcsr_feature_mask;
+
+               convert_to_fxsr(tsk, ia32_env);
+       }
+}
+
+/*
+ * Restore the extended state if present. Otherwise, restore the FP/SSE state.
+ */
+static inline int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only)
+{
+       u64 init_bv;
+
+       /* Without xsave there is nothing to init: plain fxrstor or frstor. */
+       if (!use_xsave()) {
+               if (use_fxsr())
+                       return copy_user_to_fxregs(buf);
+
+               return copy_user_to_fregs(buf);
+       }
+
+       /*
+        * Only FP/SSE state is taken from the user buffer (it is either
+        * not 64-byte aligned, or the caller asked for fx_only): init
+        * every other enabled feature, then restore FP/SSE from user.
+        */
+       if ((unsigned long)buf % 64 || fx_only) {
+               init_bv = xfeatures_mask & ~XSTATE_FPSSE;
+               copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
+
+               return copy_user_to_fxregs(buf);
+       }
+
+       /* Full xsave restore: init the enabled features that 'xbv' leaves out. */
+       init_bv = xfeatures_mask & ~xbv;
+       if (unlikely(init_bv))
+               copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
+
+       return copy_user_to_xregs(buf, xbv);
+}
+
+/*
+ * Restore the current task's FPU state from a user signal frame.
+ *
+ * 'buf' is the start of the frame, 'buf_fx' the start of the [f|fx|x]save
+ * image inside it (buf != buf_fx means a 32-bit frame with fxstate) and
+ * 'size' the number of bytes that must be readable at 'buf'.
+ *
+ * Returns 0 on success (a NULL 'buf' just clears the FPU state),
+ * -EACCES if the frame is not readable and a non-zero value if the
+ * restore failed.
+ */
+static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
+{
+       int ia32_fxstate = (buf != buf_fx);
+       struct task_struct *tsk = current;
+       struct fpu *fpu = &tsk->thread.fpu;
+       int state_size = xstate_size;
+       u64 xfeatures = 0;
+       int fx_only = 0;
+
+       ia32_fxstate &= (config_enabled(CONFIG_X86_32) ||
+                        config_enabled(CONFIG_IA32_EMULATION));
+
+       if (!buf) {
+               fpu__clear(fpu);
+               return 0;
+       }
+
+       if (!access_ok(VERIFY_READ, buf, size))
+               return -EACCES;
+
+       fpu__activate_curr(fpu);
+
+       if (!static_cpu_has(X86_FEATURE_FPU))
+               return fpregs_soft_set(current, NULL,
+                                      0, sizeof(struct user_i387_ia32_struct),
+                                      NULL, buf) != 0;
+
+       if (use_xsave()) {
+               struct _fpx_sw_bytes fx_sw_user;
+               if (unlikely(check_for_xstate(buf_fx, buf_fx, &fx_sw_user))) {
+                       /*
+                        * Couldn't find the extended state information in the
+                        * memory layout. Restore just the FP/SSE and init all
+                        * the other extended state.
+                        */
+                       state_size = sizeof(struct fxregs_state);
+                       fx_only = 1;
+               } else {
+                       state_size = fx_sw_user.xstate_size;
+                       xfeatures = fx_sw_user.xfeatures;
+               }
+       }
+
+       if (ia32_fxstate) {
+               /*
+                * For 32-bit frames with fxstate, copy the user state to the
+                * thread's fpu state, reconstruct fxstate from the fsave
+                * header. Sanitize the copied state etc.
+                *
+                * (This uses the function-level 'fpu'; a second, identical
+                * declaration that shadowed it here has been dropped.)
+                */
+               struct user_i387_ia32_struct env;
+               int err = 0;
+
+               /*
+                * Drop the current fpu which clears fpu->fpstate_active. This ensures
+                * that any context-switch during the copy of the new state,
+                * avoids the intermediate state from getting restored/saved.
+                * Thus avoiding the new restored state from getting corrupted.
+                * We will be ready to restore/save the state only after
+                * fpu->fpstate_active is again set.
+                */
+               fpu__drop(fpu);
+
+               if (__copy_from_user(&fpu->state.xsave, buf_fx, state_size) ||
+                   __copy_from_user(&env, buf, sizeof(env))) {
+                       /* Do not keep a partially copied state around: */
+                       fpstate_init(&fpu->state);
+                       err = -1;
+               } else {
+                       sanitize_restored_xstate(tsk, &env, xfeatures, fx_only);
+               }
+
+               fpu->fpstate_active = 1;
+               if (use_eager_fpu()) {
+                       preempt_disable();
+                       fpu__restore(fpu);
+                       preempt_enable();
+               }
+
+               return err;
+       } else {
+               /*
+                * For 64-bit frames and 32-bit fsave frames, restore the user
+                * state to the registers directly (with exceptions handled).
+                */
+               user_fpu_begin();
+               if (copy_user_to_fpregs_zeroing(buf_fx, xfeatures, fx_only)) {
+                       fpu__clear(fpu);
+                       return -1;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Size of the FPU state image in a signal frame: xstate_size, plus room
+ * for the trailing FP_XSTATE_MAGIC2 word if xsave is in use.
+ */
+static inline int xstate_sigframe_size(void)
+{
+       int size = xstate_size;
+
+       if (use_xsave())
+               size += FP_XSTATE_MAGIC2_SIZE;
+
+       return size;
+}
+
+/*
+ * Restore FPU state from a sigframe:
+ *
+ * Works out where the [f|fx|x]save image starts inside the frame at
+ * 'buf' and how large the frame is, then hands over to
+ * __fpu__restore_sig() and returns its result.
+ */
+int fpu__restore_sig(void __user *buf, int ia32_frame)
+{
+       void __user *buf_fx = buf;
+       int size = xstate_sigframe_size();
+
+       /* 32-bit frames with fxstate start with an fsave header: skip over it. */
+       if (ia32_frame && use_fxsr()) {
+               buf_fx = buf + sizeof(struct fregs_state);
+               size += sizeof(struct fregs_state);
+       }
+
+       return __fpu__restore_sig(buf, buf_fx, size);
+}
+
+/*
+ * Reserve room for the FPU state below the stack pointer 'sp'.
+ *
+ * '*buf_fx' is set to the 64-byte aligned start of the [f|fx|x]save
+ * image and '*size' to the total number of bytes reserved. The return
+ * value is the start of the whole frame: equal to '*buf_fx', or
+ * sizeof(struct fregs_state) below it for ia32 frames with fxsr.
+ */
+unsigned long
+fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
+                    unsigned long *buf_fx, unsigned long *size)
+{
+       unsigned long frame_size = xstate_sigframe_size();
+
+       *buf_fx = sp = round_down(sp - frame_size, 64);
+       /* The fsave header goes directly in front of the aligned image: */
+       if (ia32_frame && use_fxsr()) {
+               frame_size += sizeof(struct fregs_state);
+               sp -= sizeof(struct fregs_state);
+       }
+
+       *size = frame_size;
+
+       return sp;
+}
+/*
+ * Prepare the SW reserved portion of the fxsave memory layout, indicating
+ * the presence of the extended state information in the memory layout
+ * pointed to by the fpstate pointer in the sigcontext.
+ * This will be saved whenever the FP and extended state context is
+ * saved on the user stack during the signal handler delivery to the user.
+ *
+ * Fills in fx_sw_reserved and, with CONFIG_IA32_EMULATION, the
+ * fx_sw_reserved_ia32 variant.
+ */
+void fpu__init_prepare_fx_sw_frame(void)
+{
+       int fsave_header_size = sizeof(struct fregs_state);
+       int size = xstate_size + FP_XSTATE_MAGIC2_SIZE;
+
+       /* On 32-bit kernels the extended size also covers the fsave header: */
+       if (config_enabled(CONFIG_X86_32))
+               size += fsave_header_size;
+
+       fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1;
+       fx_sw_reserved.extended_size = size;
+       fx_sw_reserved.xfeatures = xfeatures_mask;
+       fx_sw_reserved.xstate_size = xstate_size;
+
+       /* The ia32 variant is identical, except that its extended size includes the fsave header: */
+       if (config_enabled(CONFIG_IA32_EMULATION)) {
+               fx_sw_reserved_ia32 = fx_sw_reserved;
+               fx_sw_reserved_ia32.extended_size += fsave_header_size;
+       }
+}
+
+
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c

new file mode 100644 (file)

index 0000000..62fc001
--- /dev/null
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -0,0 +1,461 @@
+/*
+ * xsave/xrstor support.
+ *
+ * Author: Suresh Siddha <suresh.b.siddha@intel.com>
+ */
+#include <linux/compat.h>
+#include <linux/cpu.h>
+
+#include <asm/fpu/api.h>
+#include <asm/fpu/internal.h>
+#include <asm/fpu/signal.h>
+#include <asm/fpu/regset.h>
+
+#include <asm/tlbflush.h>
+
+/* Printable xfeature names; cpu_has_xfeatures() indexes this by bit number: */
+static const char *xfeature_names[] = {
+       "x87 floating point registers",
+       "SSE registers",
+       "AVX registers",
+       "MPX bounds registers",
+       "MPX CSR",
+       "AVX-512 opmask",
+       "AVX-512 Hi256",
+       "AVX-512 ZMM_Hi256",
+       "unknown xstate feature",
+};
+
+/*
+ * Mask of xstate features supported by the CPU (CPUID) and the kernel (XCNTXT_MASK):
+ */
+u64 xfeatures_mask __read_mostly;
+
+static unsigned int xstate_offsets[XFEATURES_NR_MAX] = { [ 0 ... XFEATURES_NR_MAX - 1] = -1};
+static unsigned int xstate_sizes[XFEATURES_NR_MAX]   = { [ 0 ... XFEATURES_NR_MAX - 1] = -1};
+static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8];
+
+/* fls64(xfeatures_mask): one more than the highest xfeature bit number in use: */
+static unsigned int xfeatures_nr;
+
+/*
+ * Return 1 if every xfeature in 'xfeatures_needed' is supported, 0 if not.
+ *
+ * Also return the name of the (most advanced) feature that the caller requested:
+ */
+int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
+{
+       u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask;
+
+       if (unlikely(feature_name)) {
+               long xfeature_idx, max_idx;
+               u64 xfeatures_print;
+               /*
+                * So we use FLS here to be able to print the most advanced
+                * feature that was requested but is missing. So if a driver
+                * asks about "XSTATE_SSE | XSTATE_YMM" we'll print the
+                * missing AVX feature - this is the most informative message
+                * to users:
+                */
+               if (xfeatures_missing)
+                       xfeatures_print = xfeatures_missing;
+               else
+                       xfeatures_print = xfeatures_needed;
+
+               xfeature_idx = fls64(xfeatures_print)-1;
+               max_idx = ARRAY_SIZE(xfeature_names)-1;
+
+               /* Empty mask (-1) or a bit without a name: use the last entry: */
+               if (xfeature_idx < 0 || xfeature_idx > max_idx)
+                       xfeature_idx = max_idx;
+
+               *feature_name = xfeature_names[xfeature_idx];
+       }
+
+       return !xfeatures_missing;
+}
+EXPORT_SYMBOL_GPL(cpu_has_xfeatures);
+
+/*
+ * XSAVEOPT (and the other optimized XSAVE instructions) may detect that an
+ * FPU state component is still (or is again) in its initialized state. The
+ * processor may then clear the corresponding bit in the header.xfeatures
+ * field and skip the writeout of registers to the corresponding memory
+ * layout.
+ *
+ * So a zero bit does not guarantee init values in memory: the component
+ * might still contain some previous - non-initialized - register state.
+ *
+ * Before xstate information is written to user-space, sanitize those
+ * components: the memory layout of every feature whose header bit is zero
+ * is put into the init state, so that user-space doesn't see stale state
+ * in the memory layout during signal handling, debugging etc.
+ */
+void fpstate_sanitize_xstate(struct fpu *fpu)
+{
+       struct fxregs_state *fx = &fpu->state.fxsave;
+       u64 xfeatures;
+       u64 init_bits;
+       int feature_bit;
+
+       if (!use_xsaveopt())
+               return;
+
+       xfeatures = fpu->state.xsave.header.xfeatures;
+
+       /*
+        * Every supported feature has its header bit set, i.e. none of
+        * them is in init state and the memory layout is up to date:
+        */
+       if ((xfeatures & xfeatures_mask) == xfeatures_mask)
+               return;
+
+       /*
+        * FP is in init state: set cwd to 0x37f and zero the fields
+        * listed below together with the st_space area.
+        */
+       if (!(xfeatures & XSTATE_FP)) {
+               fx->cwd = 0x37f;
+               fx->swd = 0;
+               fx->twd = 0;
+               fx->fop = 0;
+               fx->rip = 0;
+               fx->rdp = 0;
+               memset(&fx->st_space[0], 0, 128);
+       }
+
+       /*
+        * SSE is in init state: zero the xmm_space area.
+        */
+       if (!(xfeatures & XSTATE_SSE))
+               memset(&fx->xmm_space[0], 0, 256);
+
+       /*
+        * FP and SSE (bits 0 and 1) were special-cased above. What remains
+        * are the supported features with a clear header bit, from bit 2 on:
+        */
+       init_bits = (xfeatures_mask & ~xfeatures) >> 2;
+
+       /*
+        * Copy the init_fpstate contents of each of them into place, using
+        * the feature's offset and size in the standard xstate layout:
+        */
+       for (feature_bit = 2; init_bits; feature_bit++, init_bits >>= 1) {
+               int offset, size;
+
+               if (!(init_bits & 0x1))
+                       continue;
+
+               offset = xstate_offsets[feature_bit];
+               size = xstate_sizes[feature_bit];
+
+               memcpy((void *)fx + offset,
+                      (void *)&init_fpstate.xsave + offset,
+                      size);
+       }
+}
+
+/*
+ * Enable the extended processor state save/restore feature on this CPU.
+ * Called once per CPU onlining. Does nothing if the CPU lacks XSAVE or
+ * if xfeatures_mask is empty.
+ */
+void fpu__init_cpu_xstate(void)
+{
+       if (cpu_has_xsave && xfeatures_mask) {
+               cr4_set_bits(X86_CR4_OSXSAVE);
+               xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask);
+       }
+}
+
+/*
+ * Record the offset and size, as reported by CPUID, of each xstate
+ * from bit 2 up to the highest bit set in xfeatures_mask.
+ *
+ * ( Note that certain features might be non-present, for them
+ *   we'll have 0 offset and 0 size. )
+ */
+static void __init setup_xstate_features(void)
+{
+       u32 size, offset, ecx, edx, leaf;
+
+       xfeatures_nr = fls64(xfeatures_mask);
+
+       for (leaf = 2; leaf < xfeatures_nr; leaf++) {
+               cpuid_count(XSTATE_CPUID, leaf, &size, &offset, &ecx, &edx);
+
+               xstate_sizes[leaf] = size;
+               xstate_offsets[leaf] = offset;
+
+               printk(KERN_INFO "x86/fpu: xstate_offset[%d]: %04x, xstate_sizes[%d]: %04x\n", leaf, offset, leaf, size);
+       }
+}
+
+/* Log one xstate feature if cpu_has_xfeatures() reports it as supported: */
+static void __init print_xstate_feature(u64 xstate_mask)
+{
+       const char *name;
+
+       if (cpu_has_xfeatures(xstate_mask, &name))
+               pr_info("x86/fpu: Supporting XSAVE feature 0x%02Lx: '%s'\n", xstate_mask, name);
+}
+
+/* Print out all the supported xstate features: */
+static void __init print_xstate_features(void)
+{
+       const u64 masks[] = {
+               XSTATE_FP, XSTATE_SSE, XSTATE_YMM,
+               XSTATE_BNDREGS, XSTATE_BNDCSR, XSTATE_OPMASK,
+               XSTATE_ZMM_Hi256, XSTATE_Hi16_ZMM,
+       };
+       unsigned int i;
+
+       for (i = 0; i < ARRAY_SIZE(masks); i++)
+               print_xstate_feature(masks[i]);
+}
+
+/*
+ * Set up xstate_comp_offsets[], the offsets of all extended states in the
+ * xsave area. This supports both the standard format and the compacted
+ * format of the xsave area.
+ */
+static void __init setup_xstate_comp(void)
+{
+       unsigned int next_offset;
+       int i;
+
+       /*
+        * The FP xstates and SSE xstates are legacy states. They are always
+        * in the fixed offsets in the xsave area in either compacted form
+        * or standard form.
+        */
+       xstate_comp_offsets[0] = 0;
+       xstate_comp_offsets[1] = offsetof(struct fxregs_state, xmm_space);
+
+       if (!cpu_has_xsaves) {
+               /* Standard format: reuse the offsets held in xstate_offsets[] */
+               for (i = 2; i < xfeatures_nr; i++) {
+                       if (test_bit(i, (unsigned long *)&xfeatures_mask))
+                               xstate_comp_offsets[i] = xstate_offsets[i];
+               }
+               return;
+       }
+
+       /*
+        * Compacted format: the states are packed back to back behind the
+        * legacy area and the header. A state that is not in xfeatures_mask
+        * takes up no room, so its slot gets the offset of the next state.
+        */
+       next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+       xstate_comp_offsets[2] = next_offset;
+
+       for (i = 2; i < xfeatures_nr; i++) {
+               xstate_comp_offsets[i] = next_offset;
+
+               if (test_bit(i, (unsigned long *)&xfeatures_mask))
+                       next_offset += xstate_sizes[i];
+       }
+}
+
+/*
+ * Set up init_fpstate, the xstate image representing the init state (once only):
+ */
+static void __init setup_init_fpu_buf(void)
+{
+       static int on_boot_cpu = 1;
+
+       WARN_ON_FPU(!on_boot_cpu);
+       on_boot_cpu = 0;
+
+       if (!cpu_has_xsave)
+               return;
+
+       setup_xstate_features();
+       print_xstate_features();
+
+       if (cpu_has_xsaves) {
+               init_fpstate.xsave.header.xcomp_bv = (u64)1 << 63 | xfeatures_mask;
+               init_fpstate.xsave.header.xfeatures = xfeatures_mask;
+       }
+
+       /*
+        * Init all the features state with header_bv being 0x0
+        */
+       copy_kernel_to_xregs_booting(&init_fpstate.xsave);
+
+       /*
+        * Dump the init state back into the same buffer. This is to identify
+        * the init state of any feature which is not represented by all zeros.
+        */
+       copy_xregs_to_kernel_booting(&init_fpstate.xsave);
+}
+
+/* Calculate total size of enabled xstates in XCR0/xfeatures_mask: */
+static void __init init_xstate_size(void)
+{
+       unsigned int eax, ebx, ecx, edx;
+       int bit;
+
+       if (!cpu_has_xsaves) {
+               /* Standard format: CPUID sub-leaf 0 supplies the size in EBX */
+               cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
+               xstate_size = ebx;
+               return;
+       }
+
+       /* Compacted format: add up the EAX size of each enabled state */
+       xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+       for (bit = 2; bit < 64; bit++) {
+               if (!test_bit(bit, (unsigned long *)&xfeatures_mask))
+                       continue;
+               cpuid_count(XSTATE_CPUID, bit, &eax, &ebx, &ecx, &edx);
+               xstate_size += eax;
+       }
+}
+
+/*
+ * Enable and initialize the xsave feature, based on what CPUID reports.
+ * Called once per system bootup (WARN_ON_FPU() fires on a repeat call).
+ */
+void __init fpu__init_system_xstate(void)
+{
+       unsigned int eax, ebx, ecx, edx;
+       static int on_boot_cpu = 1;
+
+       WARN_ON_FPU(!on_boot_cpu);
+       on_boot_cpu = 0;
+
+       if (!cpu_has_xsave) {
+               pr_info("x86/fpu: Legacy x87 FPU detected.\n");
+               return;
+       }
+
+       if (boot_cpu_data.cpuid_level < XSTATE_CPUID) {
+               WARN_ON_FPU(1);
+               return;
+       }
+
+       cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
+       xfeatures_mask = eax + ((u64)edx << 32);
+
+       if ((xfeatures_mask & XSTATE_FPSSE) != XSTATE_FPSSE) {
+               pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask);
+               BUG();
+       }
+
+       /* Support only the state known to the OS (XCNTXT_MASK): */
+       xfeatures_mask = xfeatures_mask & XCNTXT_MASK;
+
+       /* Enable xstate instructions to be able to continue with initialization: */
+       fpu__init_cpu_xstate();
+
+       /* Recompute the context size (xstate_size) for the enabled features: */
+       init_xstate_size();
+
+       update_regset_xstate_info(xstate_size, xfeatures_mask);
+       fpu__init_prepare_fx_sw_frame();
+       setup_init_fpu_buf();
+       setup_xstate_comp();
+
+       pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is 0x%x bytes, using '%s' format.\n",
+               xfeatures_mask,
+               xstate_size,
+               cpu_has_xsaves ? "compacted" : "standard");
+}
+
+/*
+ * Restore minimal FPU state after suspend:
+ */
+void fpu__resume_cpu(void)
+{
+       /* Only xsave capable CPUs have an XCR0 to restore: */
+       if (!cpu_has_xsave)
+               return;
+
+       xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask);
+}
+
+/*
+ * Given the xsave area and a state inside, this function returns the
+ * address of the state.
+ *
+ * This is the API that is called to get xstate address in either
+ * standard format or compacted format of xsave area.
+ *
+ * Note that if there is no data for the field in the xsave buffer
+ * this will return NULL.
+ *
+ * Inputs:
+ *     xsave: the xsave area to search; it is used exactly as passed in
+ *     xstate_feature: state which is defined in xsave.h (e.g.
+ *     XSTATE_FP, XSTATE_SSE, etc...)
+ * Output:
+ *     address of the state in the xsave area, or NULL if the
+ *     field is not present in the xsave buffer.
+ */
+void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature)
+{
+       int feature_nr = fls64(xstate_feature) - 1;
+
+       /*
+        * Do we even *have* xsave state?
+        */
+       if (!boot_cpu_has(X86_FEATURE_XSAVE))
+               return NULL;
+
+       /*
+        * We should not ever be requesting features that we
+        * have not enabled.  Remember that xfeatures_mask is
+        * what we write to the XCR0 register.
+        */
+       WARN_ONCE(!(xfeatures_mask & xstate_feature),
+                 "get of unsupported state");
+       /*
+        * This assumes the last 'xsave*' instruction to
+        * have requested that 'xstate_feature' be saved.
+        * If it did not, we might be seeing an old value
+        * of the field in the buffer.
+        *
+        * This can happen because the last 'xsave' did not
+        * request that this feature be saved (unlikely)
+        * or because the "init optimization" caused it
+        * to not be saved.
+        */
+       if (!(xsave->header.xfeatures & xstate_feature))
+               return NULL;
+
+       return (void *)xsave + xstate_comp_offsets[feature_nr];
+}
+EXPORT_SYMBOL_GPL(get_xsave_addr);
+
+/*
+ * Common helper for retrieving data from the xsave state of the current
+ * task. If the task has an active fpstate, the CPU's xstate registers
+ * are saved to the task's fpu memory buffer with fpu__save() and the
+ * requested state is then looked up in that buffer.
+ *
+ * This function is safe to call whether the FPU is in use or not.
+ *
+ * Note that this only works on the current task.
+ *
+ * Inputs:
+ *     @xsave_state: state which is defined in xsave.h (e.g. XSTATE_FP,
+ *     XSTATE_SSE, etc...)
+ * Output:
+ *     address of the state in the xsave area or NULL if the state
+ *     is not present or is in its 'init state'.
+ */
+const void *get_xsave_field_ptr(int xsave_state)
+{
+       struct fpu *cur_fpu = &current->thread.fpu;
+       struct xregs_state *xsave = &cur_fpu->state.xsave;
+
+       /* Without an active fpstate there is nothing to look up: */
+       if (!cur_fpu->fpstate_active)
+               return NULL;
+
+       /* Take the CPU's xstate registers and save them off to the buffer: */
+       fpu__save(cur_fpu);
+
+       return get_xsave_addr(xsave, xsave_state);
+}
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S

index 544dec4cc605dde41cb4ebd8e9c8516839c24d13..0e2d96ffd158d0e5f4c1d355040cd9b285ef84d6 100644 (file)
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -62,9 +62,16 @@
  #define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
  #endif
  
-/* Number of possible pages in the lowmem region */
-LOWMEM_PAGES = (((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT)
-       
+/*
+ * Number of possible pages in the lowmem region.
+ *
+ * We shift 2 by 31 instead of 1 by 32 to the left in order to avoid a
+ * gas warning about overflowing shift count when gas has been compiled
+ * with only a host target support using a 32-bit type for internal
+ * representation.
+ */
+LOWMEM_PAGES = (((2<<31) - __PAGE_OFFSET) >> PAGE_SHIFT)
+
  /* Enough space to fit pagetables for the low memory linear map */
  MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT
  
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c

index 05fd74f537d62122ade73f53dad17c97346c7a80..64341aa485ae1ad6ab62c07984c9a70dadd44c64 100644 (file)
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -40,7 +40,5 @@ EXPORT_SYMBOL(empty_zero_page);
  
  #ifdef CONFIG_PREEMPT
  EXPORT_SYMBOL(___preempt_schedule);
-#ifdef CONFIG_CONTEXT_TRACKING
-EXPORT_SYMBOL(___preempt_schedule_context);
-#endif
+EXPORT_SYMBOL(___preempt_schedule_notrace);
  #endif
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c

deleted file mode 100644 (file)

index 6185d31..0000000
--- a/arch/x86/kernel/i387.c
+++ /dev/null
@@ -1,671 +0,0 @@
-/*
- *  Copyright (C) 1994 Linus Torvalds
- *
- *  Pentium III FXSR, SSE support
- *  General FPU state handling cleanups
- *     Gareth Hughes <gareth@valinux.com>, May 2000
- */
-#include <linux/module.h>
-#include <linux/regset.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-
-#include <asm/sigcontext.h>
-#include <asm/processor.h>
-#include <asm/math_emu.h>
-#include <asm/tlbflush.h>
-#include <asm/uaccess.h>
-#include <asm/ptrace.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
-#include <asm/user.h>
-
-static DEFINE_PER_CPU(bool, in_kernel_fpu);
-
-void kernel_fpu_disable(void)
-{
-       WARN_ON(this_cpu_read(in_kernel_fpu));
-       this_cpu_write(in_kernel_fpu, true);
-}
-
-void kernel_fpu_enable(void)
-{
-       this_cpu_write(in_kernel_fpu, false);
-}
-
-/*
- * Were we in an interrupt that interrupted kernel mode?
- *
- * On others, we can do a kernel_fpu_begin/end() pair *ONLY* if that
- * pair does nothing at all: the thread must not have fpu (so
- * that we don't try to save the FPU state), and TS must
- * be set (so that the clts/stts pair does nothing that is
- * visible in the interrupted kernel thread).
- *
- * Except for the eagerfpu case when we return true; in the likely case
- * the thread has FPU but we are not going to set/clear TS.
- */
-static inline bool interrupted_kernel_fpu_idle(void)
-{
-       if (this_cpu_read(in_kernel_fpu))
-               return false;
-
-       if (use_eager_fpu())
-               return true;
-
-       return !__thread_has_fpu(current) &&
-               (read_cr0() & X86_CR0_TS);
-}
-
-/*
- * Were we in user mode (or vm86 mode) when we were
- * interrupted?
- *
- * Doing kernel_fpu_begin/end() is ok if we are running
- * in an interrupt context from user mode - we'll just
- * save the FPU state as required.
- */
-static inline bool interrupted_user_mode(void)
-{
-       struct pt_regs *regs = get_irq_regs();
-       return regs && user_mode(regs);
-}
-
-/*
- * Can we use the FPU in kernel mode with the
- * whole "kernel_fpu_begin/end()" sequence?
- *
- * It's always ok in process context (ie "not interrupt")
- * but it is sometimes ok even from an irq.
- */
-bool irq_fpu_usable(void)
-{
-       return !in_interrupt() ||
-               interrupted_user_mode() ||
-               interrupted_kernel_fpu_idle();
-}
-EXPORT_SYMBOL(irq_fpu_usable);
-
-void __kernel_fpu_begin(void)
-{
-       struct task_struct *me = current;
-
-       this_cpu_write(in_kernel_fpu, true);
-
-       if (__thread_has_fpu(me)) {
-               __save_init_fpu(me);
-       } else {
-               this_cpu_write(fpu_owner_task, NULL);
-               if (!use_eager_fpu())
-                       clts();
-       }
-}
-EXPORT_SYMBOL(__kernel_fpu_begin);
-
-void __kernel_fpu_end(void)
-{
-       struct task_struct *me = current;
-
-       if (__thread_has_fpu(me)) {
-               if (WARN_ON(restore_fpu_checking(me)))
-                       fpu_reset_state(me);
-       } else if (!use_eager_fpu()) {
-               stts();
-       }
-
-       this_cpu_write(in_kernel_fpu, false);
-}
-EXPORT_SYMBOL(__kernel_fpu_end);
-
-void unlazy_fpu(struct task_struct *tsk)
-{
-       preempt_disable();
-       if (__thread_has_fpu(tsk)) {
-               if (use_eager_fpu()) {
-                       __save_fpu(tsk);
-               } else {
-                       __save_init_fpu(tsk);
-                       __thread_fpu_end(tsk);
-               }
-       }
-       preempt_enable();
-}
-EXPORT_SYMBOL(unlazy_fpu);
-
-unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
-unsigned int xstate_size;
-EXPORT_SYMBOL_GPL(xstate_size);
-static struct i387_fxsave_struct fx_scratch;
-
-static void mxcsr_feature_mask_init(void)
-{
-       unsigned long mask = 0;
-
-       if (cpu_has_fxsr) {
-               memset(&fx_scratch, 0, sizeof(struct i387_fxsave_struct));
-               asm volatile("fxsave %0" : "+m" (fx_scratch));
-               mask = fx_scratch.mxcsr_mask;
-               if (mask == 0)
-                       mask = 0x0000ffbf;
-       }
-       mxcsr_feature_mask &= mask;
-}
-
-static void init_thread_xstate(void)
-{
-       /*
-        * Note that xstate_size might be overwriten later during
-        * xsave_init().
-        */
-
-       if (!cpu_has_fpu) {
-               /*
-                * Disable xsave as we do not support it if i387
-                * emulation is enabled.
-                */
-               setup_clear_cpu_cap(X86_FEATURE_XSAVE);
-               setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
-               xstate_size = sizeof(struct i387_soft_struct);
-               return;
-       }
-
-       if (cpu_has_fxsr)
-               xstate_size = sizeof(struct i387_fxsave_struct);
-       else
-               xstate_size = sizeof(struct i387_fsave_struct);
-
-       /*
-        * Quirk: we don't yet handle the XSAVES* instructions
-        * correctly, as we don't correctly convert between
-        * standard and compacted format when interfacing
-        * with user-space - so disable it for now.
-        *
-        * The difference is small: with recent CPUs the
-        * compacted format is only marginally smaller than
-        * the standard FPU state format.
-        *
-        * ( This is easy to backport while we are fixing
-        *   XSAVES* support. )
-        */
-       setup_clear_cpu_cap(X86_FEATURE_XSAVES);
-}
-
-/*
- * Called at bootup to set up the initial FPU state that is later cloned
- * into all processes.
- */
-
-void fpu_init(void)
-{
-       unsigned long cr0;
-       unsigned long cr4_mask = 0;
-
-#ifndef CONFIG_MATH_EMULATION
-       if (!cpu_has_fpu) {
-               pr_emerg("No FPU found and no math emulation present\n");
-               pr_emerg("Giving up\n");
-               for (;;)
-                       asm volatile("hlt");
-       }
-#endif
-       if (cpu_has_fxsr)
-               cr4_mask |= X86_CR4_OSFXSR;
-       if (cpu_has_xmm)
-               cr4_mask |= X86_CR4_OSXMMEXCPT;
-       if (cr4_mask)
-               cr4_set_bits(cr4_mask);
-
-       cr0 = read_cr0();
-       cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */
-       if (!cpu_has_fpu)
-               cr0 |= X86_CR0_EM;
-       write_cr0(cr0);
-
-       /*
-        * init_thread_xstate is only called once to avoid overriding
-        * xstate_size during boot time or during CPU hotplug.
-        */
-       if (xstate_size == 0)
-               init_thread_xstate();
-
-       mxcsr_feature_mask_init();
-       xsave_init();
-       eager_fpu_init();
-}
-
-void fpu_finit(struct fpu *fpu)
-{
-       if (!cpu_has_fpu) {
-               finit_soft_fpu(&fpu->state->soft);
-               return;
-       }
-
-       memset(fpu->state, 0, xstate_size);
-
-       if (cpu_has_fxsr) {
-               fx_finit(&fpu->state->fxsave);
-       } else {
-               struct i387_fsave_struct *fp = &fpu->state->fsave;
-               fp->cwd = 0xffff037fu;
-               fp->swd = 0xffff0000u;
-               fp->twd = 0xffffffffu;
-               fp->fos = 0xffff0000u;
-       }
-}
-EXPORT_SYMBOL_GPL(fpu_finit);
-
-/*
- * The _current_ task is using the FPU for the first time
- * so initialize it and set the mxcsr to its default
- * value at reset if we support XMM instructions and then
- * remember the current task has used the FPU.
- */
-int init_fpu(struct task_struct *tsk)
-{
-       int ret;
-
-       if (tsk_used_math(tsk)) {
-               if (cpu_has_fpu && tsk == current)
-                       unlazy_fpu(tsk);
-               task_disable_lazy_fpu_restore(tsk);
-               return 0;
-       }
-
-       /*
-        * Memory allocation at the first usage of the FPU and other state.
-        */
-       ret = fpu_alloc(&tsk->thread.fpu);
-       if (ret)
-               return ret;
-
-       fpu_finit(&tsk->thread.fpu);
-
-       set_stopped_child_used_math(tsk);
-       return 0;
-}
-EXPORT_SYMBOL_GPL(init_fpu);
-
-/*
- * The xstateregs_active() routine is the same as the fpregs_active() routine,
- * as the "regset->n" for the xstate regset will be updated based on the feature
- * capabilites supported by the xsave.
- */
-int fpregs_active(struct task_struct *target, const struct user_regset *regset)
-{
-       return tsk_used_math(target) ? regset->n : 0;
-}
-
-int xfpregs_active(struct task_struct *target, const struct user_regset *regset)
-{
-       return (cpu_has_fxsr && tsk_used_math(target)) ? regset->n : 0;
-}
-
-int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
-               unsigned int pos, unsigned int count,
-               void *kbuf, void __user *ubuf)
-{
-       int ret;
-
-       if (!cpu_has_fxsr)
-               return -ENODEV;
-
-       ret = init_fpu(target);
-       if (ret)
-               return ret;
-
-       sanitize_i387_state(target);
-
-       return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
-                                  &target->thread.fpu.state->fxsave, 0, -1);
-}
-
-int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
-               unsigned int pos, unsigned int count,
-               const void *kbuf, const void __user *ubuf)
-{
-       int ret;
-
-       if (!cpu_has_fxsr)
-               return -ENODEV;
-
-       ret = init_fpu(target);
-       if (ret)
-               return ret;
-
-       sanitize_i387_state(target);
-
-       ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
-                                &target->thread.fpu.state->fxsave, 0, -1);
-
-       /*
-        * mxcsr reserved bits must be masked to zero for security reasons.
-        */
-       target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
-
-       /*
-        * update the header bits in the xsave header, indicating the
-        * presence of FP and SSE state.
-        */
-       if (cpu_has_xsave)
-               target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
-
-       return ret;
-}
-
-int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
-               unsigned int pos, unsigned int count,
-               void *kbuf, void __user *ubuf)
-{
-       struct xsave_struct *xsave;
-       int ret;
-
-       if (!cpu_has_xsave)
-               return -ENODEV;
-
-       ret = init_fpu(target);
-       if (ret)
-               return ret;
-
-       xsave = &target->thread.fpu.state->xsave;
-
-       /*
-        * Copy the 48bytes defined by the software first into the xstate
-        * memory layout in the thread struct, so that we can copy the entire
-        * xstateregs to the user using one user_regset_copyout().
-        */
-       memcpy(&xsave->i387.sw_reserved,
-               xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
-       /*
-        * Copy the xstate memory layout.
-        */
-       ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
-       return ret;
-}
-
-int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
-                 unsigned int pos, unsigned int count,
-                 const void *kbuf, const void __user *ubuf)
-{
-       struct xsave_struct *xsave;
-       int ret;
-
-       if (!cpu_has_xsave)
-               return -ENODEV;
-
-       ret = init_fpu(target);
-       if (ret)
-               return ret;
-
-       xsave = &target->thread.fpu.state->xsave;
-
-       ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
-       /*
-        * mxcsr reserved bits must be masked to zero for security reasons.
-        */
-       xsave->i387.mxcsr &= mxcsr_feature_mask;
-       xsave->xsave_hdr.xstate_bv &= pcntxt_mask;
-       /*
-        * These bits must be zero.
-        */
-       memset(&xsave->xsave_hdr.reserved, 0, 48);
-       return ret;
-}
-
-#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
-
-/*
- * FPU tag word conversions.
- */
-
-static inline unsigned short twd_i387_to_fxsr(unsigned short twd)
-{
-       unsigned int tmp; /* to avoid 16 bit prefixes in the code */
-
-       /* Transform each pair of bits into 01 (valid) or 00 (empty) */
-       tmp = ~twd;
-       tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */
-       /* and move the valid bits to the lower byte. */
-       tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */
-       tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */
-       tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */
-
-       return tmp;
-}
-
-#define FPREG_ADDR(f, n)       ((void *)&(f)->st_space + (n) * 16)
-#define FP_EXP_TAG_VALID       0
-#define FP_EXP_TAG_ZERO                1
-#define FP_EXP_TAG_SPECIAL     2
-#define FP_EXP_TAG_EMPTY       3
-
-static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave)
-{
-       struct _fpxreg *st;
-       u32 tos = (fxsave->swd >> 11) & 7;
-       u32 twd = (unsigned long) fxsave->twd;
-       u32 tag;
-       u32 ret = 0xffff0000u;
-       int i;
-
-       for (i = 0; i < 8; i++, twd >>= 1) {
-               if (twd & 0x1) {
-                       st = FPREG_ADDR(fxsave, (i - tos) & 7);
-
-                       switch (st->exponent & 0x7fff) {
-                       case 0x7fff:
-                               tag = FP_EXP_TAG_SPECIAL;
-                               break;
-                       case 0x0000:
-                               if (!st->significand[0] &&
-                                   !st->significand[1] &&
-                                   !st->significand[2] &&
-                                   !st->significand[3])
-                                       tag = FP_EXP_TAG_ZERO;
-                               else
-                                       tag = FP_EXP_TAG_SPECIAL;
-                               break;
-                       default:
-                               if (st->significand[3] & 0x8000)
-                                       tag = FP_EXP_TAG_VALID;
-                               else
-                                       tag = FP_EXP_TAG_SPECIAL;
-                               break;
-                       }
-               } else {
-                       tag = FP_EXP_TAG_EMPTY;
-               }
-               ret |= tag << (2 * i);
-       }
-       return ret;
-}
-
-/*
- * FXSR floating point environment conversions.
- */
-
-void
-convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
-{
-       struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave;
-       struct _fpreg *to = (struct _fpreg *) &env->st_space[0];
-       struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0];
-       int i;
-
-       env->cwd = fxsave->cwd | 0xffff0000u;
-       env->swd = fxsave->swd | 0xffff0000u;
-       env->twd = twd_fxsr_to_i387(fxsave);
-
-#ifdef CONFIG_X86_64
-       env->fip = fxsave->rip;
-       env->foo = fxsave->rdp;
-       /*
-        * should be actually ds/cs at fpu exception time, but
-        * that information is not available in 64bit mode.
-        */
-       env->fcs = task_pt_regs(tsk)->cs;
-       if (tsk == current) {
-               savesegment(ds, env->fos);
-       } else {
-               env->fos = tsk->thread.ds;
-       }
-       env->fos |= 0xffff0000;
-#else
-       env->fip = fxsave->fip;
-       env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16);
-       env->foo = fxsave->foo;
-       env->fos = fxsave->fos;
-#endif
-
-       for (i = 0; i < 8; ++i)
-               memcpy(&to[i], &from[i], sizeof(to[0]));
-}
-
-void convert_to_fxsr(struct task_struct *tsk,
-                    const struct user_i387_ia32_struct *env)
-
-{
-       struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave;
-       struct _fpreg *from = (struct _fpreg *) &env->st_space[0];
-       struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0];
-       int i;
-
-       fxsave->cwd = env->cwd;
-       fxsave->swd = env->swd;
-       fxsave->twd = twd_i387_to_fxsr(env->twd);
-       fxsave->fop = (u16) ((u32) env->fcs >> 16);
-#ifdef CONFIG_X86_64
-       fxsave->rip = env->fip;
-       fxsave->rdp = env->foo;
-       /* cs and ds ignored */
-#else
-       fxsave->fip = env->fip;
-       fxsave->fcs = (env->fcs & 0xffff);
-       fxsave->foo = env->foo;
-       fxsave->fos = env->fos;
-#endif
-
-       for (i = 0; i < 8; ++i)
-               memcpy(&to[i], &from[i], sizeof(from[0]));
-}
-
-int fpregs_get(struct task_struct *target, const struct user_regset *regset,
-              unsigned int pos, unsigned int count,
-              void *kbuf, void __user *ubuf)
-{
-       struct user_i387_ia32_struct env;
-       int ret;
-
-       ret = init_fpu(target);
-       if (ret)
-               return ret;
-
-       if (!static_cpu_has(X86_FEATURE_FPU))
-               return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf);
-
-       if (!cpu_has_fxsr)
-               return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
-                                          &target->thread.fpu.state->fsave, 0,
-                                          -1);
-
-       sanitize_i387_state(target);
-
-       if (kbuf && pos == 0 && count == sizeof(env)) {
-               convert_from_fxsr(kbuf, target);
-               return 0;
-       }
-
-       convert_from_fxsr(&env, target);
-
-       return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &env, 0, -1);
-}
-
-int fpregs_set(struct task_struct *target, const struct user_regset *regset,
-              unsigned int pos, unsigned int count,
-              const void *kbuf, const void __user *ubuf)
-{
-       struct user_i387_ia32_struct env;
-       int ret;
-
-       ret = init_fpu(target);
-       if (ret)
-               return ret;
-
-       sanitize_i387_state(target);
-
-       if (!static_cpu_has(X86_FEATURE_FPU))
-               return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
-
-       if (!cpu_has_fxsr)
-               return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
-                                         &target->thread.fpu.state->fsave, 0,
-                                         -1);
-
-       if (pos > 0 || count < sizeof(env))
-               convert_from_fxsr(&env, target);
-
-       ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1);
-       if (!ret)
-               convert_to_fxsr(target, &env);
-
-       /*
-        * update the header bit in the xsave header, indicating the
-        * presence of FP.
-        */
-       if (cpu_has_xsave)
-               target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FP;
-       return ret;
-}
-
-/*
- * FPU state for core dumps.
- * This is only used for a.out dumps now.
- * It is declared generically using elf_fpregset_t (which is
- * struct user_i387_struct) but is in fact only used for 32-bit
- * dumps, so on 64-bit it is really struct user_i387_ia32_struct.
- */
-int dump_fpu(struct pt_regs *regs, struct user_i387_struct *fpu)
-{
-       struct task_struct *tsk = current;
-       int fpvalid;
-
-       fpvalid = !!used_math();
-       if (fpvalid)
-               fpvalid = !fpregs_get(tsk, NULL,
-                                     0, sizeof(struct user_i387_ia32_struct),
-                                     fpu, NULL);
-
-       return fpvalid;
-}
-EXPORT_SYMBOL(dump_fpu);
-
-#endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */
-
-static int __init no_387(char *s)
-{
-       setup_clear_cpu_cap(X86_FEATURE_FPU);
-       return 1;
-}
-
-__setup("no387", no_387);
-
-void fpu_detect(struct cpuinfo_x86 *c)
-{
-       unsigned long cr0;
-       u16 fsw, fcw;
-
-       fsw = fcw = 0xffff;
-
-       cr0 = read_cr0();
-       cr0 &= ~(X86_CR0_TS | X86_CR0_EM);
-       write_cr0(cr0);
-
-       asm volatile("fninit ; fnstsw %0 ; fnstcw %1"
-                    : "+m" (fsw), "+m" (fcw));
-
-       if (fsw == 0 && (fcw & 0x103f) == 0x003f)
-               set_cpu_cap(c, X86_FEATURE_FPU);
-       else
-               clear_cpu_cap(c, X86_FEATURE_FPU);
-
-       /* The final cr0 value is set in fpu_init() */
-}
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c

index a25e202bb319caf87ce147831f6a1a47ed03e254..353972c1946cd35f378054439a05bed8200f92c9 100644 (file)
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -140,6 +140,51 @@ void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr,
                 free_pages((unsigned long)vaddr, get_order(size));
  }
  
+void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
+                     gfp_t gfp, struct dma_attrs *attrs)
+{
+       struct dma_map_ops *ops = get_dma_ops(dev);
+       void *memory;
+
+       gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
+
+       if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
+               return memory;
+
+       if (!dev)
+               dev = &x86_dma_fallback_dev;
+
+       if (!is_device_dma_capable(dev))
+               return NULL;
+
+       if (!ops->alloc)
+               return NULL;
+
+       memory = ops->alloc(dev, size, dma_handle,
+                           dma_alloc_coherent_gfp_flags(dev, gfp), attrs);
+       debug_dma_alloc_coherent(dev, size, *dma_handle, memory);
+
+       return memory;
+}
+EXPORT_SYMBOL(dma_alloc_attrs);
+
+void dma_free_attrs(struct device *dev, size_t size,
+                   void *vaddr, dma_addr_t bus,
+                   struct dma_attrs *attrs)
+{
+       struct dma_map_ops *ops = get_dma_ops(dev);
+
+       WARN_ON(irqs_disabled());       /* for portability */
+
+       if (dma_release_from_coherent(dev, get_order(size), vaddr))
+               return;
+
+       debug_dma_free_coherent(dev, size, vaddr, bus);
+       if (ops->free)
+               ops->free(dev, size, vaddr, bus, attrs);
+}
+EXPORT_SYMBOL(dma_free_attrs);
+
  /*
   * See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel
   * parameter documentation.
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c

index 77dd0ad58be4a6c9c8af113189c9805c1a535633..adf0392d549aa465e8f3e0ac336ea6f0e5967f1c 100644 (file)
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -20,6 +20,13 @@ void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
  {
         void *vaddr;
  
+       /*
+        * Don't print a warning when the first allocation attempt fails.
+        * swiotlb_alloc_coherent() will print a warning when the DMA
+        * memory allocation ultimately fails.
+        */
+       flags |= __GFP_NOWARN;
+
         vaddr = dma_generic_alloc_coherent(hwdev, size, dma_handle, flags,
                                            attrs);
         if (vaddr)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c

index 6e338e3b1dc04cc69ab41c012fe5671cc25321cd..9cad694ed7c4d6a755b34af705e0a055cb0c04aa 100644 (file)
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -25,8 +25,7 @@
  #include <asm/idle.h>
  #include <asm/uaccess.h>
  #include <asm/mwait.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
  #include <asm/debugreg.h>
  #include <asm/nmi.h>
  #include <asm/tlbflush.h>
@@ -76,9 +75,6 @@ void idle_notifier_unregister(struct notifier_block *n)
  EXPORT_SYMBOL_GPL(idle_notifier_unregister);
  #endif
  
-struct kmem_cache *task_xstate_cachep;
-EXPORT_SYMBOL_GPL(task_xstate_cachep);
-
  /*
   * this gets called so that we can store lazy state into memory and copy the
   * current task into the new thread.
@@ -87,36 +83,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
  {
         *dst = *src;
  
-       dst->thread.fpu_counter = 0;
-       dst->thread.fpu.has_fpu = 0;
-       dst->thread.fpu.state = NULL;
-       task_disable_lazy_fpu_restore(dst);
-       if (tsk_used_math(src)) {
-               int err = fpu_alloc(&dst->thread.fpu);
-               if (err)
-                       return err;
-               fpu_copy(dst, src);
-       }
-       return 0;
-}
-
-void free_thread_xstate(struct task_struct *tsk)
-{
-       fpu_free(&tsk->thread.fpu);
-}
-
-void arch_release_task_struct(struct task_struct *tsk)
-{
-       free_thread_xstate(tsk);
-}
-
-void arch_task_cache_init(void)
-{
-        task_xstate_cachep =
-               kmem_cache_create("task_xstate", xstate_size,
-                                 __alignof__(union thread_xstate),
-                                 SLAB_PANIC | SLAB_NOTRACK, NULL);
-       setup_xstate_comp();
+       return fpu__copy(&dst->thread.fpu, &src->thread.fpu);
  }
  
  /*
@@ -127,6 +94,7 @@ void exit_thread(void)
         struct task_struct *me = current;
         struct thread_struct *t = &me->thread;
         unsigned long *bp = t->io_bitmap_ptr;
+       struct fpu *fpu = &t->fpu;
  
         if (bp) {
                 struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());
@@ -142,7 +110,7 @@ void exit_thread(void)
                 kfree(bp);
         }
  
-       drop_fpu(me);
+       fpu__drop(fpu);
  }
  
  void flush_thread(void)
@@ -152,19 +120,7 @@ void flush_thread(void)
         flush_ptrace_hw_breakpoint(tsk);
         memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
  
-       if (!use_eager_fpu()) {
-               /* FPU state will be reallocated lazily at the first use. */
-               drop_fpu(tsk);
-               free_thread_xstate(tsk);
-       } else {
-               if (!tsk_used_math(tsk)) {
-                       /* kthread execs. TODO: cleanup this horror. */
-                       if (WARN_ON(init_fpu(tsk)))
-                               force_sig(SIGKILL, tsk);
-                       user_fpu_begin();
-               }
-               restore_init_xstate();
-       }
+       fpu__clear(&tsk->thread.fpu);
  }
  
  static void hard_disable_TSC(void)
@@ -445,11 +401,10 @@ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
  }
  
  /*
- * MONITOR/MWAIT with no hints, used for default default C1 state.
- * This invokes MWAIT with interrutps enabled and no flags,
- * which is backwards compatible with the original MWAIT implementation.
+ * MONITOR/MWAIT with no hints, used for default C1 state. This invokes MWAIT
+ * with interrupts enabled and no flags, which is backwards compatible with the
+ * original MWAIT implementation.
   */
-
  static void mwait_idle(void)
  {
         if (!current_set_polling_and_test()) {
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c

index a99900cedc22c4fc9836a0c956ef3723c4863215..c09c99ccf3e33fc5afff500b6310e2e6f9150526 100644 (file)
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -39,8 +39,7 @@
  #include <asm/pgtable.h>
  #include <asm/ldt.h>
  #include <asm/processor.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
  #include <asm/desc.h>
  #ifdef CONFIG_MATH_EMULATION
  #include <asm/math_emu.h>
@@ -242,14 +241,16 @@ __visible __notrace_funcgraph struct task_struct *
  __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
  {
         struct thread_struct *prev = &prev_p->thread,
-                                *next = &next_p->thread;
+                            *next = &next_p->thread;
+       struct fpu *prev_fpu = &prev->fpu;
+       struct fpu *next_fpu = &next->fpu;
         int cpu = smp_processor_id();
         struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
-       fpu_switch_t fpu;
+       fpu_switch_t fpu_switch;
  
         /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
  
-       fpu = switch_fpu_prepare(prev_p, next_p, cpu);
+       fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu);
  
         /*
          * Save away %gs. No need to save %fs, as it was saved on the
@@ -296,7 +297,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
          * Leave lazy mode, flushing any hypercalls made here.
          * This must be done before restoring TLS segments so
          * the GDT and LDT are properly updated, and must be
-        * done before math_state_restore, so the TS bit is up
+        * done before fpu__restore(), so the TS bit is up
          * to date.
          */
         arch_end_context_switch(next_p);
@@ -316,7 +317,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
         if (prev->gs | next->gs)
                 lazy_load_gs(next->gs);
  
-       switch_fpu_finish(next_p, fpu);
+       switch_fpu_finish(next_fpu, fpu_switch);
  
         this_cpu_write(current_task, next_p);
  
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c

index 82134506faa8ae939c9b1e216898ef66b1392e4b..843f92e4c7110cd621fb94dfca8cb980044fa32d 100644 (file)
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -38,8 +38,7 @@
  
  #include <asm/pgtable.h>
  #include <asm/processor.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
  #include <asm/mmu_context.h>
  #include <asm/prctl.h>
  #include <asm/desc.h>
@@ -274,12 +273,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
  {
         struct thread_struct *prev = &prev_p->thread;
         struct thread_struct *next = &next_p->thread;
+       struct fpu *prev_fpu = &prev->fpu;
+       struct fpu *next_fpu = &next->fpu;
         int cpu = smp_processor_id();
         struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
         unsigned fsindex, gsindex;
-       fpu_switch_t fpu;
+       fpu_switch_t fpu_switch;
  
-       fpu = switch_fpu_prepare(prev_p, next_p, cpu);
+       fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu);
  
         /* We must save %fs and %gs before load_TLS() because
          * %fs and %gs may be cleared by load_TLS().
@@ -299,7 +300,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
          * Leave lazy mode, flushing any hypercalls made here.  This
          * must be done after loading TLS entries in the GDT but before
          * loading segments that might reference them, and and it must
-        * be done before math_state_restore, so the TS bit is up to
+        * be done before fpu__restore(), so the TS bit is up to
          * date.
          */
         arch_end_context_switch(next_p);
@@ -391,7 +392,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
                 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
         prev->gsindex = gsindex;
  
-       switch_fpu_finish(next_p, fpu);
+       switch_fpu_finish(next_fpu, fpu_switch);
  
         /*
          * Switch the PDA and FPU contexts.
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c

index a7bc794807195af79b6c15054b1941867d373198..9be72bc3613f80a9b5a5ae1bbf01adefdcfc29ce 100644 (file)
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -11,7 +11,6 @@
  #include <linux/errno.h>
  #include <linux/slab.h>
  #include <linux/ptrace.h>
-#include <linux/regset.h>
  #include <linux/tracehook.h>
  #include <linux/user.h>
  #include <linux/elf.h>
@@ -28,8 +27,9 @@
  #include <asm/uaccess.h>
  #include <asm/pgtable.h>
  #include <asm/processor.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
+#include <asm/fpu/signal.h>
+#include <asm/fpu/regset.h>
  #include <asm/debugreg.h>
  #include <asm/ldt.h>
  #include <asm/desc.h>
@@ -1297,7 +1297,7 @@ static struct user_regset x86_64_regsets[] __read_mostly = {
                 .core_note_type = NT_PRFPREG,
                 .n = sizeof(struct user_i387_struct) / sizeof(long),
                 .size = sizeof(long), .align = sizeof(long),
-               .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
+               .active = regset_xregset_fpregs_active, .get = xfpregs_get, .set = xfpregs_set
         },
         [REGSET_XSTATE] = {
                 .core_note_type = NT_X86_XSTATE,
@@ -1338,13 +1338,13 @@ static struct user_regset x86_32_regsets[] __read_mostly = {
                 .core_note_type = NT_PRFPREG,
                 .n = sizeof(struct user_i387_ia32_struct) / sizeof(u32),
                 .size = sizeof(u32), .align = sizeof(u32),
-               .active = fpregs_active, .get = fpregs_get, .set = fpregs_set
+               .active = regset_fpregs_active, .get = fpregs_get, .set = fpregs_set
         },
         [REGSET_XFP] = {
                 .core_note_type = NT_PRXFPREG,
                 .n = sizeof(struct user32_fxsr_struct) / sizeof(u32),
                 .size = sizeof(u32), .align = sizeof(u32),
-               .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
+               .active = regset_xregset_fpregs_active, .get = xfpregs_get, .set = xfpregs_set
         },
         [REGSET_XSTATE] = {
                 .core_note_type = NT_X86_XSTATE,
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c

index 8d04a7594a036387fa483c8a108523e4849033ae..265a6fdea8b73c053b1080455994fe67c9449143 100644 (file)
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -531,12 +531,14 @@ static void __init reserve_crashkernel_low(void)
         if (ret != 0) {
                 /*
                  * two parts from lib/swiotlb.c:
-                *      swiotlb size: user specified with swiotlb= or default.
-                *      swiotlb overflow buffer: now is hardcoded to 32k.
-                *              We round it to 8M for other buffers that
-                *              may need to stay low too.
+                * -swiotlb size: user-specified with swiotlb= or default.
+                *
+                * -swiotlb overflow buffer: now hardcoded to 32k. We round it
+                * to 8M for other buffers that may need to stay low too. Also
+                * make sure we allocate enough extra low memory so that we
+                * don't run out of DMA buffers for 32-bit devices.
                  */
-               low_size = swiotlb_size_or_default() + (8UL<<20);
+               low_size = max(swiotlb_size_or_default() + (8UL<<20), 256UL<<20);
                 auto_set = true;
         } else {
                 /* passed with crashkernel=0,low ? */
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c

index 1ea14fd53933bae96dc5f8e3fec99575a3c3c454..206996c1669db344aba7ff072f734552723e7938 100644 (file)
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -26,8 +26,8 @@
  
  #include <asm/processor.h>
  #include <asm/ucontext.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
+#include <asm/fpu/signal.h>
  #include <asm/vdso.h>
  #include <asm/mce.h>
  #include <asm/sighandling.h>
@@ -103,7 +103,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc)
                 get_user_ex(buf, &sc->fpstate);
         } get_user_catch(err);
  
-       err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32));
+       err |= fpu__restore_sig(buf, config_enabled(CONFIG_X86_32));
  
         force_iret();
  
@@ -199,6 +199,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
         unsigned long sp = regs->sp;
         unsigned long buf_fx = 0;
         int onsigstack = on_sig_stack(sp);
+       struct fpu *fpu = &current->thread.fpu;
  
         /* redzone */
         if (config_enabled(CONFIG_X86_64))
@@ -218,9 +219,9 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
                 }
         }
  
-       if (used_math()) {
-               sp = alloc_mathframe(sp, config_enabled(CONFIG_X86_32),
-                                    &buf_fx, &math_size);
+       if (fpu->fpstate_active) {
+               sp = fpu__alloc_mathframe(sp, config_enabled(CONFIG_X86_32),
+                                         &buf_fx, &math_size);
                 *fpstate = (void __user *)sp;
         }
  
@@ -234,8 +235,8 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
                 return (void __user *)-1L;
  
         /* save i387 and extended state */
-       if (used_math() &&
-           save_xstate_sig(*fpstate, (void __user *)buf_fx, math_size) < 0)
+       if (fpu->fpstate_active &&
+           copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size) < 0)
                 return (void __user *)-1L;
  
         return (void __user *)sp;
@@ -593,6 +594,22 @@ badframe:
         return 0;
  }
  
+static inline int is_ia32_compat_frame(void)
+{
+       return config_enabled(CONFIG_IA32_EMULATION) &&
+              test_thread_flag(TIF_IA32);
+}
+
+static inline int is_ia32_frame(void)
+{
+       return config_enabled(CONFIG_X86_32) || is_ia32_compat_frame();
+}
+
+static inline int is_x32_frame(void)
+{
+       return config_enabled(CONFIG_X86_X32_ABI) && test_thread_flag(TIF_X32);
+}
+
  static int
  setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
  {
@@ -617,6 +634,7 @@ static void
  handle_signal(struct ksignal *ksig, struct pt_regs *regs)
  {
         bool stepping, failed;
+       struct fpu *fpu = &current->thread.fpu;
  
         /* Are we from a system call? */
         if (syscall_get_nr(current, regs) >= 0) {
@@ -665,8 +683,8 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
                 /*
                  * Ensure the signal handler starts with the new fpu state.
                  */
-               if (used_math())
-                       fpu_reset_state(current);
+               if (fpu->fpstate_active)
+                       fpu__clear(fpu);
         }
         signal_setup_done(failed, ksig, stepping);
  }
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c

index fd6291c921b6076a1297e4b563993367378fd782..8add66b22f333cbc5e8936b41e64525b7f49e1bc 100644 (file)
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -68,8 +68,7 @@
  #include <asm/mwait.h>
  #include <asm/apic.h>
  #include <asm/io_apic.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
  #include <asm/setup.h>
  #include <asm/uv/uv.h>
  #include <linux/mc146818rtc.h>
@@ -314,10 +313,10 @@ topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name)
                 cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2));
  }
  
-#define link_mask(_m, c1, c2)                                          \
+#define link_mask(mfunc, c1, c2)                                       \
  do {                                                                   \
-       cpumask_set_cpu((c1), cpu_##_m##_mask(c2));                     \
-       cpumask_set_cpu((c2), cpu_##_m##_mask(c1));                     \
+       cpumask_set_cpu((c1), mfunc(c2));                               \
+       cpumask_set_cpu((c2), mfunc(c1));                               \
  } while (0)
  
  static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
@@ -398,9 +397,9 @@ void set_cpu_sibling_map(int cpu)
         cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
  
         if (!has_mp) {
-               cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
+               cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu));
                 cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
-               cpumask_set_cpu(cpu, cpu_core_mask(cpu));
+               cpumask_set_cpu(cpu, topology_core_cpumask(cpu));
                 c->booted_cores = 1;
                 return;
         }
@@ -409,32 +408,34 @@ void set_cpu_sibling_map(int cpu)
                 o = &cpu_data(i);
  
                 if ((i == cpu) || (has_smt && match_smt(c, o)))
-                       link_mask(sibling, cpu, i);
+                       link_mask(topology_sibling_cpumask, cpu, i);
  
                 if ((i == cpu) || (has_mp && match_llc(c, o)))
-                       link_mask(llc_shared, cpu, i);
+                       link_mask(cpu_llc_shared_mask, cpu, i);
  
         }
  
         /*
          * This needs a separate iteration over the cpus because we rely on all
-        * cpu_sibling_mask links to be set-up.
+        * topology_sibling_cpumask links to be set-up.
          */
         for_each_cpu(i, cpu_sibling_setup_mask) {
                 o = &cpu_data(i);
  
                 if ((i == cpu) || (has_mp && match_die(c, o))) {
-                       link_mask(core, cpu, i);
+                       link_mask(topology_core_cpumask, cpu, i);
  
                         /*
                          *  Does this new cpu bringup a new core?
                          */
-                       if (cpumask_weight(cpu_sibling_mask(cpu)) == 1) {
+                       if (cpumask_weight(
+                           topology_sibling_cpumask(cpu)) == 1) {
                                 /*
                                  * for each core in package, increment
                                  * the booted_cores for this new cpu
                                  */
-                               if (cpumask_first(cpu_sibling_mask(i)) == i)
+                               if (cpumask_first(
+                                   topology_sibling_cpumask(i)) == i)
                                         c->booted_cores++;
                                 /*
                                  * increment the core count for all
@@ -1042,8 +1043,8 @@ static __init void disable_smp(void)
                 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
         else
                 physid_set_mask_of_physid(0, &phys_cpu_present_map);
-       cpumask_set_cpu(0, cpu_sibling_mask(0));
-       cpumask_set_cpu(0, cpu_core_mask(0));
+       cpumask_set_cpu(0, topology_sibling_cpumask(0));
+       cpumask_set_cpu(0, topology_core_cpumask(0));
  }
  
  enum {
@@ -1328,22 +1329,22 @@ static void remove_siblinginfo(int cpu)
         int sibling;
         struct cpuinfo_x86 *c = &cpu_data(cpu);
  
-       for_each_cpu(sibling, cpu_core_mask(cpu)) {
-               cpumask_clear_cpu(cpu, cpu_core_mask(sibling));
+       for_each_cpu(sibling, topology_core_cpumask(cpu)) {
+               cpumask_clear_cpu(cpu, topology_core_cpumask(sibling));
                 /*/
                  * last thread sibling in this cpu core going down
                  */
-               if (cpumask_weight(cpu_sibling_mask(cpu)) == 1)
+               if (cpumask_weight(topology_sibling_cpumask(cpu)) == 1)
                         cpu_data(sibling).booted_cores--;
         }
  
-       for_each_cpu(sibling, cpu_sibling_mask(cpu))
-               cpumask_clear_cpu(cpu, cpu_sibling_mask(sibling));
+       for_each_cpu(sibling, topology_sibling_cpumask(cpu))
+               cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling));
         for_each_cpu(sibling, cpu_llc_shared_mask(cpu))
                 cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling));
         cpumask_clear(cpu_llc_shared_mask(cpu));
-       cpumask_clear(cpu_sibling_mask(cpu));
-       cpumask_clear(cpu_core_mask(cpu));
+       cpumask_clear(topology_sibling_cpumask(cpu));
+       cpumask_clear(topology_core_cpumask(cpu));
         c->phys_proc_id = 0;
         c->cpu_core_id = 0;
         cpumask_clear_cpu(cpu, cpu_sibling_setup_mask);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c

index de379366f6d12250eb714de82dc68c48e9637e32..f5791927aa644493354dd487a35a1111fdf676d6 100644 (file)
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -54,12 +54,13 @@
  #include <asm/ftrace.h>
  #include <asm/traps.h>
  #include <asm/desc.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
  #include <asm/mce.h>
  #include <asm/fixmap.h>
  #include <asm/mach_traps.h>
  #include <asm/alternative.h>
+#include <asm/fpu/xstate.h>
+#include <asm/trace/mpx.h>
  #include <asm/mpx.h>
  
  #ifdef CONFIG_X86_64
@@ -370,10 +371,8 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
  
  dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
  {
-       struct task_struct *tsk = current;
-       struct xsave_struct *xsave_buf;
         enum ctx_state prev_state;
-       struct bndcsr *bndcsr;
+       const struct bndcsr *bndcsr;
         siginfo_t *info;
  
         prev_state = exception_enter();
@@ -392,15 +391,15 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
  
         /*
          * We need to look at BNDSTATUS to resolve this exception.
-        * It is not directly accessible, though, so we need to
-        * do an xsave and then pull it out of the xsave buffer.
+        * A NULL here might mean that BNDSTATUS is in its 'init state'
+        * (all zeros), which indicates that MPX was not responsible
+        * for the exception.
          */
-       fpu_save_init(&tsk->thread.fpu);
-       xsave_buf = &(tsk->thread.fpu.state->xsave);
-       bndcsr = get_xsave_addr(xsave_buf, XSTATE_BNDCSR);
+       bndcsr = get_xsave_field_ptr(XSTATE_BNDCSR);
         if (!bndcsr)
                 goto exit_trap;
  
+       trace_bounds_exception_mpx(bndcsr);
         /*
          * The error code field of the BNDSTATUS register communicates status
          * information of a bound range exception #BR or operation involving
@@ -408,11 +407,11 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
          */
         switch (bndcsr->bndstatus & MPX_BNDSTA_ERROR_CODE) {
         case 2: /* Bound directory has invalid entry. */
-               if (mpx_handle_bd_fault(xsave_buf))
+               if (mpx_handle_bd_fault())
                         goto exit_trap;
                 break; /* Success, it was handled */
         case 1: /* Bound violation. */
-               info = mpx_generate_siginfo(regs, xsave_buf);
+               info = mpx_generate_siginfo(regs);
                 if (IS_ERR(info)) {
                         /*
                          * We failed to decode the MPX instruction.  Act as if
@@ -708,8 +707,8 @@ NOKPROBE_SYMBOL(do_debug);
  static void math_error(struct pt_regs *regs, int error_code, int trapnr)
  {
         struct task_struct *task = current;
+       struct fpu *fpu = &task->thread.fpu;
         siginfo_t info;
-       unsigned short err;
         char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" :
                                                 "simd exception";
  
@@ -717,8 +716,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
                 return;
         conditional_sti(regs);
  
-       if (!user_mode(regs))
-       {
+       if (!user_mode(regs)) {
                 if (!fixup_exception(regs)) {
                         task->thread.error_code = error_code;
                         task->thread.trap_nr = trapnr;
@@ -730,62 +728,20 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
         /*
          * Save the info for the exception handler and clear the error.
          */
-       unlazy_fpu(task);
-       task->thread.trap_nr = trapnr;
+       fpu__save(fpu);
+
+       task->thread.trap_nr    = trapnr;
         task->thread.error_code = error_code;
-       info.si_signo = SIGFPE;
-       info.si_errno = 0;
-       info.si_addr = (void __user *)uprobe_get_trap_addr(regs);
-       if (trapnr == X86_TRAP_MF) {
-               unsigned short cwd, swd;
-               /*
-                * (~cwd & swd) will mask out exceptions that are not set to unmasked
-                * status.  0x3f is the exception bits in these regs, 0x200 is the
-                * C1 reg you need in case of a stack fault, 0x040 is the stack
-                * fault bit.  We should only be taking one exception at a time,
-                * so if this combination doesn't produce any single exception,
-                * then we have a bad program that isn't synchronizing its FPU usage
-                * and it will suffer the consequences since we won't be able to
-                * fully reproduce the context of the exception
-                */
-               cwd = get_fpu_cwd(task);
-               swd = get_fpu_swd(task);
+       info.si_signo           = SIGFPE;
+       info.si_errno           = 0;
+       info.si_addr            = (void __user *)uprobe_get_trap_addr(regs);
  
-               err = swd & ~cwd;
-       } else {
-               /*
-                * The SIMD FPU exceptions are handled a little differently, as there
-                * is only a single status/control register.  Thus, to determine which
-                * unmasked exception was caught we must mask the exception mask bits
-                * at 0x1f80, and then use these to mask the exception bits at 0x3f.
-                */
-               unsigned short mxcsr = get_fpu_mxcsr(task);
-               err = ~(mxcsr >> 7) & mxcsr;
-       }
+       info.si_code = fpu__exception_code(fpu, trapnr);
  
-       if (err & 0x001) {      /* Invalid op */
-               /*
-                * swd & 0x240 == 0x040: Stack Underflow
-                * swd & 0x240 == 0x240: Stack Overflow
-                * User must clear the SF bit (0x40) if set
-                */
-               info.si_code = FPE_FLTINV;
-       } else if (err & 0x004) { /* Divide by Zero */
-               info.si_code = FPE_FLTDIV;
-       } else if (err & 0x008) { /* Overflow */
-               info.si_code = FPE_FLTOVF;
-       } else if (err & 0x012) { /* Denormal, Underflow */
-               info.si_code = FPE_FLTUND;
-       } else if (err & 0x020) { /* Precision */
-               info.si_code = FPE_FLTRES;
-       } else {
-               /*
-                * If we're using IRQ 13, or supposedly even some trap
-                * X86_TRAP_MF implementations, it's possible
-                * we get a spurious trap, which is not an error.
-                */
+       /* Retry when we get spurious exceptions: */
+       if (!info.si_code)
                 return;
-       }
+
         force_sig_info(SIGFPE, &info, task);
  }
  
@@ -814,48 +770,6 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
         conditional_sti(regs);
  }
  
-/*
- * 'math_state_restore()' saves the current math information in the
- * old math state array, and gets the new ones from the current task
- *
- * Careful.. There are problems with IBM-designed IRQ13 behaviour.
- * Don't touch unless you *really* know how it works.
- *
- * Must be called with kernel preemption disabled (eg with local
- * local interrupts as in the case of do_device_not_available).
- */
-void math_state_restore(void)
-{
-       struct task_struct *tsk = current;
-
-       if (!tsk_used_math(tsk)) {
-               local_irq_enable();
-               /*
-                * does a slab alloc which can sleep
-                */
-               if (init_fpu(tsk)) {
-                       /*
-                        * ran out of memory!
-                        */
-                       do_group_exit(SIGKILL);
-                       return;
-               }
-               local_irq_disable();
-       }
-
-       /* Avoid __kernel_fpu_begin() right after __thread_fpu_begin() */
-       kernel_fpu_disable();
-       __thread_fpu_begin(tsk);
-       if (unlikely(restore_fpu_checking(tsk))) {
-               fpu_reset_state(tsk);
-               force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
-       } else {
-               tsk->thread.fpu_counter++;
-       }
-       kernel_fpu_enable();
-}
-EXPORT_SYMBOL_GPL(math_state_restore);
-
  dotraplinkage void
  do_device_not_available(struct pt_regs *regs, long error_code)
  {
@@ -876,7 +790,7 @@ do_device_not_available(struct pt_regs *regs, long error_code)
                 return;
         }
  #endif
-       math_state_restore(); /* interrupts still off */
+       fpu__restore(&current->thread.fpu); /* interrupts still off */
  #ifdef CONFIG_X86_32
         conditional_sti(regs);
  #endif
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c

index 26488487bc61e8fb5fcc76cf4ec245cb8ff1ebd7..dd8d0791dfb5021930793689376f0d4f064e5b38 100644 (file)
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -113,7 +113,7 @@ static void check_tsc_warp(unsigned int timeout)
   */
  static inline unsigned int loop_timeout(int cpu)
  {
-       return (cpumask_weight(cpu_core_mask(cpu)) > 1) ? 2 : 20;
+       return (cpumask_weight(topology_core_cpumask(cpu)) > 1) ? 2 : 20;
  }
  
  /*
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c

index 0b81ad67da07fa36e57577de5f7165ab320a55a1..66476244731ef8fba8fafbe1bb6cbd17f1a18b9c 100644 (file)
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -29,6 +29,7 @@
  #include <linux/kdebug.h>
  #include <asm/processor.h>
  #include <asm/insn.h>
+#include <asm/mmu_context.h>
  
  /* Post-execution fixups. */
  
@@ -312,11 +313,6 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool
  }
  
  #ifdef CONFIG_X86_64
-static inline bool is_64bit_mm(struct mm_struct *mm)
-{
-       return  !config_enabled(CONFIG_IA32_EMULATION) ||
-               !(mm->context.ia32_compat == TIF_IA32);
-}
  /*
   * If arch_uprobe->insn doesn't use rip-relative addressing, return
   * immediately.  Otherwise, rewrite the instruction so that it accesses
@@ -497,10 +493,6 @@ static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
         }
  }
  #else /* 32-bit: */
-static inline bool is_64bit_mm(struct mm_struct *mm)
-{
-       return false;
-}
  /*
   * No RIP-relative addressing on 32-bit
   */
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c

index 37d8fa4438f056b7611077a578a288d4250972b6..a0695be19864eda009742b142ba2046e13ad05f9 100644 (file)
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -75,7 +75,5 @@ EXPORT_SYMBOL(native_load_gs_index);
  
  #ifdef CONFIG_PREEMPT
  EXPORT_SYMBOL(___preempt_schedule);
-#ifdef CONFIG_CONTEXT_TRACKING
-EXPORT_SYMBOL(___preempt_schedule_context);
-#endif
+EXPORT_SYMBOL(___preempt_schedule_notrace);
  #endif
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c

deleted file mode 100644 (file)

index 87a815b..0000000
--- a/arch/x86/kernel/xsave.c
+++ /dev/null
@@ -1,724 +0,0 @@
-/*
- * xsave/xrstor support.
- *
- * Author: Suresh Siddha <suresh.b.siddha@intel.com>
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/bootmem.h>
-#include <linux/compat.h>
-#include <linux/cpu.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
-#include <asm/sigframe.h>
-#include <asm/tlbflush.h>
-#include <asm/xcr.h>
-
-/*
- * Supported feature mask by the CPU and the kernel.
- */
-u64 pcntxt_mask;
-
-/*
- * Represents init state for the supported extended state.
- */
-struct xsave_struct *init_xstate_buf;
-
-static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32;
-static unsigned int *xstate_offsets, *xstate_sizes;
-static unsigned int xstate_comp_offsets[sizeof(pcntxt_mask)*8];
-static unsigned int xstate_features;
-
-/*
- * If a processor implementation discern that a processor state component is
- * in its initialized state it may modify the corresponding bit in the
- * xsave_hdr.xstate_bv as '0', with out modifying the corresponding memory
- * layout in the case of xsaveopt. While presenting the xstate information to
- * the user, we always ensure that the memory layout of a feature will be in
- * the init state if the corresponding header bit is zero. This is to ensure
- * that the user doesn't see some stale state in the memory layout during
- * signal handling, debugging etc.
- */
-void __sanitize_i387_state(struct task_struct *tsk)
-{
-       struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave;
-       int feature_bit = 0x2;
-       u64 xstate_bv;
-
-       if (!fx)
-               return;
-
-       xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv;
-
-       /*
-        * None of the feature bits are in init state. So nothing else
-        * to do for us, as the memory layout is up to date.
-        */
-       if ((xstate_bv & pcntxt_mask) == pcntxt_mask)
-               return;
-
-       /*
-        * FP is in init state
-        */
-       if (!(xstate_bv & XSTATE_FP)) {
-               fx->cwd = 0x37f;
-               fx->swd = 0;
-               fx->twd = 0;
-               fx->fop = 0;
-               fx->rip = 0;
-               fx->rdp = 0;
-               memset(&fx->st_space[0], 0, 128);
-       }
-
-       /*
-        * SSE is in init state
-        */
-       if (!(xstate_bv & XSTATE_SSE))
-               memset(&fx->xmm_space[0], 0, 256);
-
-       xstate_bv = (pcntxt_mask & ~xstate_bv) >> 2;
-
-       /*
-        * Update all the other memory layouts for which the corresponding
-        * header bit is in the init state.
-        */
-       while (xstate_bv) {
-               if (xstate_bv & 0x1) {
-                       int offset = xstate_offsets[feature_bit];
-                       int size = xstate_sizes[feature_bit];
-
-                       memcpy(((void *) fx) + offset,
-                              ((void *) init_xstate_buf) + offset,
-                              size);
-               }
-
-               xstate_bv >>= 1;
-               feature_bit++;
-       }
-}
-
-/*
- * Check for the presence of extended state information in the
- * user fpstate pointer in the sigcontext.
- */
-static inline int check_for_xstate(struct i387_fxsave_struct __user *buf,
-                                  void __user *fpstate,
-                                  struct _fpx_sw_bytes *fx_sw)
-{
-       int min_xstate_size = sizeof(struct i387_fxsave_struct) +
-                             sizeof(struct xsave_hdr_struct);
-       unsigned int magic2;
-
-       if (__copy_from_user(fx_sw, &buf->sw_reserved[0], sizeof(*fx_sw)))
-               return -1;
-
-       /* Check for the first magic field and other error scenarios. */
-       if (fx_sw->magic1 != FP_XSTATE_MAGIC1 ||
-           fx_sw->xstate_size < min_xstate_size ||
-           fx_sw->xstate_size > xstate_size ||
-           fx_sw->xstate_size > fx_sw->extended_size)
-               return -1;
-
-       /*
-        * Check for the presence of second magic word at the end of memory
-        * layout. This detects the case where the user just copied the legacy
-        * fpstate layout with out copying the extended state information
-        * in the memory layout.
-        */
-       if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size))
-           || magic2 != FP_XSTATE_MAGIC2)
-               return -1;
-
-       return 0;
-}
-
-/*
- * Signal frame handlers.
- */
-static inline int save_fsave_header(struct task_struct *tsk, void __user *buf)
-{
-       if (use_fxsr()) {
-               struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave;
-               struct user_i387_ia32_struct env;
-               struct _fpstate_ia32 __user *fp = buf;
-
-               convert_from_fxsr(&env, tsk);
-
-               if (__copy_to_user(buf, &env, sizeof(env)) ||
-                   __put_user(xsave->i387.swd, &fp->status) ||
-                   __put_user(X86_FXSR_MAGIC, &fp->magic))
-                       return -1;
-       } else {
-               struct i387_fsave_struct __user *fp = buf;
-               u32 swd;
-               if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status))
-                       return -1;
-       }
-
-       return 0;
-}
-
-static inline int save_xstate_epilog(void __user *buf, int ia32_frame)
-{
-       struct xsave_struct __user *x = buf;
-       struct _fpx_sw_bytes *sw_bytes;
-       u32 xstate_bv;
-       int err;
-
-       /* Setup the bytes not touched by the [f]xsave and reserved for SW. */
-       sw_bytes = ia32_frame ? &fx_sw_reserved_ia32 : &fx_sw_reserved;
-       err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes));
-
-       if (!use_xsave())
-               return err;
-
-       err |= __put_user(FP_XSTATE_MAGIC2, (__u32 *)(buf + xstate_size));
-
-       /*
-        * Read the xstate_bv which we copied (directly from the cpu or
-        * from the state in task struct) to the user buffers.
-        */
-       err |= __get_user(xstate_bv, (__u32 *)&x->xsave_hdr.xstate_bv);
-
-       /*
-        * For legacy compatible, we always set FP/SSE bits in the bit
-        * vector while saving the state to the user context. This will
-        * enable us capturing any changes(during sigreturn) to
-        * the FP/SSE bits by the legacy applications which don't touch
-        * xstate_bv in the xsave header.
-        *
-        * xsave aware apps can change the xstate_bv in the xsave
-        * header as well as change any contents in the memory layout.
-        * xrestore as part of sigreturn will capture all the changes.
-        */
-       xstate_bv |= XSTATE_FPSSE;
-
-       err |= __put_user(xstate_bv, (__u32 *)&x->xsave_hdr.xstate_bv);
-
-       return err;
-}
-
-static inline int save_user_xstate(struct xsave_struct __user *buf)
-{
-       int err;
-
-       if (use_xsave())
-               err = xsave_user(buf);
-       else if (use_fxsr())
-               err = fxsave_user((struct i387_fxsave_struct __user *) buf);
-       else
-               err = fsave_user((struct i387_fsave_struct __user *) buf);
-
-       if (unlikely(err) && __clear_user(buf, xstate_size))
-               err = -EFAULT;
-       return err;
-}
-
-/*
- * Save the fpu, extended register state to the user signal frame.
- *
- * 'buf_fx' is the 64-byte aligned pointer at which the [f|fx|x]save
- *  state is copied.
- *  'buf' points to the 'buf_fx' or to the fsave header followed by 'buf_fx'.
- *
- *     buf == buf_fx for 64-bit frames and 32-bit fsave frame.
- *     buf != buf_fx for 32-bit frames with fxstate.
- *
- * If the fpu, extended register state is live, save the state directly
- * to the user frame pointed by the aligned pointer 'buf_fx'. Otherwise,
- * copy the thread's fpu state to the user frame starting at 'buf_fx'.
- *
- * If this is a 32-bit frame with fxstate, put a fsave header before
- * the aligned state at 'buf_fx'.
- *
- * For [f]xsave state, update the SW reserved fields in the [f]xsave frame
- * indicating the absence/presence of the extended state to the user.
- */
-int save_xstate_sig(void __user *buf, void __user *buf_fx, int size)
-{
-       struct xsave_struct *xsave = &current->thread.fpu.state->xsave;
-       struct task_struct *tsk = current;
-       int ia32_fxstate = (buf != buf_fx);
-
-       ia32_fxstate &= (config_enabled(CONFIG_X86_32) ||
-                        config_enabled(CONFIG_IA32_EMULATION));
-
-       if (!access_ok(VERIFY_WRITE, buf, size))
-               return -EACCES;
-
-       if (!static_cpu_has(X86_FEATURE_FPU))
-               return fpregs_soft_get(current, NULL, 0,
-                       sizeof(struct user_i387_ia32_struct), NULL,
-                       (struct _fpstate_ia32 __user *) buf) ? -1 : 1;
-
-       if (user_has_fpu()) {
-               /* Save the live register state to the user directly. */
-               if (save_user_xstate(buf_fx))
-                       return -1;
-               /* Update the thread's fxstate to save the fsave header. */
-               if (ia32_fxstate)
-                       fpu_fxsave(&tsk->thread.fpu);
-       } else {
-               sanitize_i387_state(tsk);
-               if (__copy_to_user(buf_fx, xsave, xstate_size))
-                       return -1;
-       }
-
-       /* Save the fsave header for the 32-bit frames. */
-       if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf))
-               return -1;
-
-       if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate))
-               return -1;
-
-       return 0;
-}
-
-static inline void
-sanitize_restored_xstate(struct task_struct *tsk,
-                        struct user_i387_ia32_struct *ia32_env,
-                        u64 xstate_bv, int fx_only)
-{
-       struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave;
-       struct xsave_hdr_struct *xsave_hdr = &xsave->xsave_hdr;
-
-       if (use_xsave()) {
-               /* These bits must be zero. */
-               memset(xsave_hdr->reserved, 0, 48);
-
-               /*
-                * Init the state that is not present in the memory
-                * layout and not enabled by the OS.
-                */
-               if (fx_only)
-                       xsave_hdr->xstate_bv = XSTATE_FPSSE;
-               else
-                       xsave_hdr->xstate_bv &= (pcntxt_mask & xstate_bv);
-       }
-
-       if (use_fxsr()) {
-               /*
-                * mscsr reserved bits must be masked to zero for security
-                * reasons.
-                */
-               xsave->i387.mxcsr &= mxcsr_feature_mask;
-
-               convert_to_fxsr(tsk, ia32_env);
-       }
-}
-
-/*
- * Restore the extended state if present. Otherwise, restore the FP/SSE state.
- */
-static inline int restore_user_xstate(void __user *buf, u64 xbv, int fx_only)
-{
-       if (use_xsave()) {
-               if ((unsigned long)buf % 64 || fx_only) {
-                       u64 init_bv = pcntxt_mask & ~XSTATE_FPSSE;
-                       xrstor_state(init_xstate_buf, init_bv);
-                       return fxrstor_user(buf);
-               } else {
-                       u64 init_bv = pcntxt_mask & ~xbv;
-                       if (unlikely(init_bv))
-                               xrstor_state(init_xstate_buf, init_bv);
-                       return xrestore_user(buf, xbv);
-               }
-       } else if (use_fxsr()) {
-               return fxrstor_user(buf);
-       } else
-               return frstor_user(buf);
-}
-
-int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size)
-{
-       int ia32_fxstate = (buf != buf_fx);
-       struct task_struct *tsk = current;
-       int state_size = xstate_size;
-       u64 xstate_bv = 0;
-       int fx_only = 0;
-
-       ia32_fxstate &= (config_enabled(CONFIG_X86_32) ||
-                        config_enabled(CONFIG_IA32_EMULATION));
-
-       if (!buf) {
-               fpu_reset_state(tsk);
-               return 0;
-       }
-
-       if (!access_ok(VERIFY_READ, buf, size))
-               return -EACCES;
-
-       if (!used_math() && init_fpu(tsk))
-               return -1;
-
-       if (!static_cpu_has(X86_FEATURE_FPU))
-               return fpregs_soft_set(current, NULL,
-                                      0, sizeof(struct user_i387_ia32_struct),
-                                      NULL, buf) != 0;
-
-       if (use_xsave()) {
-               struct _fpx_sw_bytes fx_sw_user;
-               if (unlikely(check_for_xstate(buf_fx, buf_fx, &fx_sw_user))) {
-                       /*
-                        * Couldn't find the extended state information in the
-                        * memory layout. Restore just the FP/SSE and init all
-                        * the other extended state.
-                        */
-                       state_size = sizeof(struct i387_fxsave_struct);
-                       fx_only = 1;
-               } else {
-                       state_size = fx_sw_user.xstate_size;
-                       xstate_bv = fx_sw_user.xstate_bv;
-               }
-       }
-
-       if (ia32_fxstate) {
-               /*
-                * For 32-bit frames with fxstate, copy the user state to the
-                * thread's fpu state, reconstruct fxstate from the fsave
-                * header. Sanitize the copied state etc.
-                */
-               struct fpu *fpu = &tsk->thread.fpu;
-               struct user_i387_ia32_struct env;
-               int err = 0;
-
-               /*
-                * Drop the current fpu which clears used_math(). This ensures
-                * that any context-switch during the copy of the new state,
-                * avoids the intermediate state from getting restored/saved.
-                * Thus avoiding the new restored state from getting corrupted.
-                * We will be ready to restore/save the state only after
-                * set_used_math() is again set.
-                */
-               drop_fpu(tsk);
-
-               if (__copy_from_user(&fpu->state->xsave, buf_fx, state_size) ||
-                   __copy_from_user(&env, buf, sizeof(env))) {
-                       fpu_finit(fpu);
-                       err = -1;
-               } else {
-                       sanitize_restored_xstate(tsk, &env, xstate_bv, fx_only);
-               }
-
-               set_used_math();
-               if (use_eager_fpu()) {
-                       preempt_disable();
-                       math_state_restore();
-                       preempt_enable();
-               }
-
-               return err;
-       } else {
-               /*
-                * For 64-bit frames and 32-bit fsave frames, restore the user
-                * state to the registers directly (with exceptions handled).
-                */
-               user_fpu_begin();
-               if (restore_user_xstate(buf_fx, xstate_bv, fx_only)) {
-                       fpu_reset_state(tsk);
-                       return -1;
-               }
-       }
-
-       return 0;
-}
-
-/*
- * Prepare the SW reserved portion of the fxsave memory layout, indicating
- * the presence of the extended state information in the memory layout
- * pointed by the fpstate pointer in the sigcontext.
- * This will be saved when ever the FP and extended state context is
- * saved on the user stack during the signal handler delivery to the user.
- */
-static void prepare_fx_sw_frame(void)
-{
-       int fsave_header_size = sizeof(struct i387_fsave_struct);
-       int size = xstate_size + FP_XSTATE_MAGIC2_SIZE;
-
-       if (config_enabled(CONFIG_X86_32))
-               size += fsave_header_size;
-
-       fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1;
-       fx_sw_reserved.extended_size = size;
-       fx_sw_reserved.xstate_bv = pcntxt_mask;
-       fx_sw_reserved.xstate_size = xstate_size;
-
-       if (config_enabled(CONFIG_IA32_EMULATION)) {
-               fx_sw_reserved_ia32 = fx_sw_reserved;
-               fx_sw_reserved_ia32.extended_size += fsave_header_size;
-       }
-}
-
-/*
- * Enable the extended processor state save/restore feature
- */
-static inline void xstate_enable(void)
-{
-       cr4_set_bits(X86_CR4_OSXSAVE);
-       xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
-}
-
-/*
- * Record the offsets and sizes of different state managed by the xsave
- * memory layout.
- */
-static void __init setup_xstate_features(void)
-{
-       int eax, ebx, ecx, edx, leaf = 0x2;
-
-       xstate_features = fls64(pcntxt_mask);
-       xstate_offsets = alloc_bootmem(xstate_features * sizeof(int));
-       xstate_sizes = alloc_bootmem(xstate_features * sizeof(int));
-
-       do {
-               cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx);
-
-               if (eax == 0)
-                       break;
-
-               xstate_offsets[leaf] = ebx;
-               xstate_sizes[leaf] = eax;
-
-               leaf++;
-       } while (1);
-}
-
-/*
- * This function sets up offsets and sizes of all extended states in
- * xsave area. This supports both standard format and compacted format
- * of the xsave aread.
- *
- * Input: void
- * Output: void
- */
-void setup_xstate_comp(void)
-{
-       unsigned int xstate_comp_sizes[sizeof(pcntxt_mask)*8];
-       int i;
-
-       /*
-        * The FP xstates and SSE xstates are legacy states. They are always
-        * in the fixed offsets in the xsave area in either compacted form
-        * or standard form.
-        */
-       xstate_comp_offsets[0] = 0;
-       xstate_comp_offsets[1] = offsetof(struct i387_fxsave_struct, xmm_space);
-
-       if (!cpu_has_xsaves) {
-               for (i = 2; i < xstate_features; i++) {
-                       if (test_bit(i, (unsigned long *)&pcntxt_mask)) {
-                               xstate_comp_offsets[i] = xstate_offsets[i];
-                               xstate_comp_sizes[i] = xstate_sizes[i];
-                       }
-               }
-               return;
-       }
-
-       xstate_comp_offsets[2] = FXSAVE_SIZE + XSAVE_HDR_SIZE;
-
-       for (i = 2; i < xstate_features; i++) {
-               if (test_bit(i, (unsigned long *)&pcntxt_mask))
-                       xstate_comp_sizes[i] = xstate_sizes[i];
-               else
-                       xstate_comp_sizes[i] = 0;
-
-               if (i > 2)
-                       xstate_comp_offsets[i] = xstate_comp_offsets[i-1]
-                                       + xstate_comp_sizes[i-1];
-
-       }
-}
-
-/*
- * setup the xstate image representing the init state
- */
-static void __init setup_init_fpu_buf(void)
-{
-       /*
-        * Setup init_xstate_buf to represent the init state of
-        * all the features managed by the xsave
-        */
-       init_xstate_buf = alloc_bootmem_align(xstate_size,
-                                             __alignof__(struct xsave_struct));
-       fx_finit(&init_xstate_buf->i387);
-
-       if (!cpu_has_xsave)
-               return;
-
-       setup_xstate_features();
-
-       if (cpu_has_xsaves) {
-               init_xstate_buf->xsave_hdr.xcomp_bv =
-                                               (u64)1 << 63 | pcntxt_mask;
-               init_xstate_buf->xsave_hdr.xstate_bv = pcntxt_mask;
-       }
-
-       /*
-        * Init all the features state with header_bv being 0x0
-        */
-       xrstor_state_booting(init_xstate_buf, -1);
-       /*
-        * Dump the init state again. This is to identify the init state
-        * of any feature which is not represented by all zero's.
-        */
-       xsave_state_booting(init_xstate_buf, -1);
-}
-
-static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO;
-static int __init eager_fpu_setup(char *s)
-{
-       if (!strcmp(s, "on"))
-               eagerfpu = ENABLE;
-       else if (!strcmp(s, "off"))
-               eagerfpu = DISABLE;
-       else if (!strcmp(s, "auto"))
-               eagerfpu = AUTO;
-       return 1;
-}
-__setup("eagerfpu=", eager_fpu_setup);
-
-
-/*
- * Calculate total size of enabled xstates in XCR0/pcntxt_mask.
- */
-static void __init init_xstate_size(void)
-{
-       unsigned int eax, ebx, ecx, edx;
-       int i;
-
-       if (!cpu_has_xsaves) {
-               cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
-               xstate_size = ebx;
-               return;
-       }
-
-       xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
-       for (i = 2; i < 64; i++) {
-               if (test_bit(i, (unsigned long *)&pcntxt_mask)) {
-                       cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
-                       xstate_size += eax;
-               }
-       }
-}
-
-/*
- * Enable and initialize the xsave feature.
- */
-static void __init xstate_enable_boot_cpu(void)
-{
-       unsigned int eax, ebx, ecx, edx;
-
-       if (boot_cpu_data.cpuid_level < XSTATE_CPUID) {
-               WARN(1, KERN_ERR "XSTATE_CPUID missing\n");
-               return;
-       }
-
-       cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
-       pcntxt_mask = eax + ((u64)edx << 32);
-
-       if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) {
-               pr_err("FP/SSE not shown under xsave features 0x%llx\n",
-                      pcntxt_mask);
-               BUG();
-       }
-
-       /*
-        * Support only the state known to OS.
-        */
-       pcntxt_mask = pcntxt_mask & XCNTXT_MASK;
-
-       xstate_enable();
-
-       /*
-        * Recompute the context size for enabled features
-        */
-       init_xstate_size();
-
-       update_regset_xstate_info(xstate_size, pcntxt_mask);
-       prepare_fx_sw_frame();
-       setup_init_fpu_buf();
-
-       /* Auto enable eagerfpu for xsaveopt */
-       if (cpu_has_xsaveopt && eagerfpu != DISABLE)
-               eagerfpu = ENABLE;
-
-       if (pcntxt_mask & XSTATE_EAGER) {
-               if (eagerfpu == DISABLE) {
-                       pr_err("eagerfpu not present, disabling some xstate features: 0x%llx\n",
-                                       pcntxt_mask & XSTATE_EAGER);
-                       pcntxt_mask &= ~XSTATE_EAGER;
-               } else {
-                       eagerfpu = ENABLE;
-               }
-       }
-
-       pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x using %s\n",
-               pcntxt_mask, xstate_size,
-               cpu_has_xsaves ? "compacted form" : "standard form");
-}
-
-/*
- * For the very first instance, this calls xstate_enable_boot_cpu();
- * for all subsequent instances, this calls xstate_enable().
- *
- * This is somewhat obfuscated due to the lack of powerful enough
- * overrides for the section checks.
- */
-void xsave_init(void)
-{
-       static __refdata void (*next_func)(void) = xstate_enable_boot_cpu;
-       void (*this_func)(void);
-
-       if (!cpu_has_xsave)
-               return;
-
-       this_func = next_func;
-       next_func = xstate_enable;
-       this_func();
-}
-
-/*
- * setup_init_fpu_buf() is __init and it is OK to call it here because
- * init_xstate_buf will be unset only once during boot.
- */
-void __init_refok eager_fpu_init(void)
-{
-       WARN_ON(used_math());
-       current_thread_info()->status = 0;
-
-       if (eagerfpu == ENABLE)
-               setup_force_cpu_cap(X86_FEATURE_EAGER_FPU);
-
-       if (!cpu_has_eager_fpu) {
-               stts();
-               return;
-       }
-
-       if (!init_xstate_buf)
-               setup_init_fpu_buf();
-}
-
-/*
- * Given the xsave area and a state inside, this function returns the
- * address of the state.
- *
- * This is the API that is called to get xstate address in either
- * standard format or compacted format of xsave area.
- *
- * Inputs:
- *     xsave: base address of the xsave area;
- *     xstate: state which is defined in xsave.h (e.g. XSTATE_FP, XSTATE_SSE,
- *     etc.)
- * Output:
- *     address of the state in the xsave area.
- */
-void *get_xsave_addr(struct xsave_struct *xsave, int xstate)
-{
-       int feature = fls64(xstate) - 1;
-       if (!test_bit(feature, (unsigned long *)&pcntxt_mask))
-               return NULL;
-
-       return (void *)xsave + xstate_comp_offsets[feature];
-}
-EXPORT_SYMBOL_GPL(get_xsave_addr);
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c

index 1d08ad3582d07fd61cd03302eba547e3a997058b..9f705e618af574d9a406f2d55f723bfa11129385 100644 (file)
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -16,10 +16,8 @@
  #include <linux/module.h>
  #include <linux/vmalloc.h>
  #include <linux/uaccess.h>
-#include <asm/i387.h> /* For use_eager_fpu.  Ugh! */
-#include <asm/fpu-internal.h> /* For use_eager_fpu.  Ugh! */
  #include <asm/user.h>
-#include <asm/xsave.h>
+#include <asm/fpu/xstate.h>
  #include "cpuid.h"
  #include "lapic.h"
  #include "mmu.h"
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c

index 629af0f1c5c4d0953010adc88233132bcdff4cb7..4c7deb4f78a147b1a4a8b451120d3b80fa6401dc 100644 (file)
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1090,6 +1090,17 @@ static void update_divide_count(struct kvm_lapic *apic)
                                    apic->divide_count);
  }
  
+static void apic_update_lvtt(struct kvm_lapic *apic)
+{
+       u32 timer_mode = kvm_apic_get_reg(apic, APIC_LVTT) &
+                       apic->lapic_timer.timer_mode_mask;
+
+       if (apic->lapic_timer.timer_mode != timer_mode) {
+               apic->lapic_timer.timer_mode = timer_mode;
+               hrtimer_cancel(&apic->lapic_timer.timer);
+       }
+}
+
  static void apic_timer_expired(struct kvm_lapic *apic)
  {
         struct kvm_vcpu *vcpu = apic->vcpu;
@@ -1298,6 +1309,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
                                 apic_set_reg(apic, APIC_LVTT + 0x10 * i,
                                              lvt_val | APIC_LVT_MASKED);
                         }
+                       apic_update_lvtt(apic);
                         atomic_set(&apic->lapic_timer.pending, 0);
  
                 }
@@ -1330,20 +1342,13 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
  
                 break;
  
-       case APIC_LVTT: {
-               u32 timer_mode = val & apic->lapic_timer.timer_mode_mask;
-
-               if (apic->lapic_timer.timer_mode != timer_mode) {
-                       apic->lapic_timer.timer_mode = timer_mode;
-                       hrtimer_cancel(&apic->lapic_timer.timer);
-               }
-
+       case APIC_LVTT:
                 if (!kvm_apic_sw_enabled(apic))
                         val |= APIC_LVT_MASKED;
                 val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask);
                 apic_set_reg(apic, APIC_LVTT, val);
+               apic_update_lvtt(apic);
                 break;
-       }
  
         case APIC_TMICT:
                 if (apic_lvtt_tscdeadline(apic))
@@ -1576,7 +1581,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
  
         for (i = 0; i < APIC_LVT_NUM; i++)
                 apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
-       apic->lapic_timer.timer_mode = 0;
+       apic_update_lvtt(apic);
         apic_set_reg(apic, APIC_LVT0,
                      SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
  
@@ -1802,6 +1807,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
  
         apic_update_ppr(apic);
         hrtimer_cancel(&apic->lapic_timer.timer);
+       apic_update_lvtt(apic);
         update_divide_count(apic);
         start_apic_timer(apic);
         apic->irr_pending = true;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c

index 44a7d25154973437e0ce4233e142d01c43a948b9..b73337634214c209e250051cd21e00bd2436fcd6 100644 (file)
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4215,13 +4215,13 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
         u64 entry, gentry, *spte;
         int npte;
         bool remote_flush, local_flush, zap_page;
-       union kvm_mmu_page_role mask = (union kvm_mmu_page_role) {
-               .cr0_wp = 1,
-               .cr4_pae = 1,
-               .nxe = 1,
-               .smep_andnot_wp = 1,
-               .smap_andnot_wp = 1,
-       };
+       union kvm_mmu_page_role mask = { };
+
+       mask.cr0_wp = 1;
+       mask.cr4_pae = 1;
+       mask.nxe = 1;
+       mask.smep_andnot_wp = 1;
+       mask.smap_andnot_wp = 1;
  
         /*
          * If we don't have indirect shadow pages, it means no page is
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c

index 2d73807f0d317f3c46a7aa42234a67d6894b20a2..e11dd59398f1576d593ca16fb68a16de7ec241f7 100644 (file)
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -40,8 +40,7 @@
  #include <asm/vmx.h>
  #include <asm/virtext.h>
  #include <asm/mce.h>
-#include <asm/i387.h>
-#include <asm/xcr.h>
+#include <asm/fpu/internal.h>
  #include <asm/perf_event.h>
  #include <asm/debugreg.h>
  #include <asm/kexec.h>
@@ -1883,7 +1882,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
          * If the FPU is not active (through the host task or
          * the guest vcpu), then restore the cr0.TS bit.
          */
-       if (!user_has_fpu() && !vmx->vcpu.guest_fpu_loaded)
+       if (!fpregs_active() && !vmx->vcpu.guest_fpu_loaded)
                 stts();
         load_gdt(this_cpu_ptr(&host_gdt));
  }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c

index ea306adbbc13603591d46d3f062cc20cf1bc37d1..26eaeb522cab214bed15cba35f5be945722d70ae 100644 (file)
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -59,9 +59,8 @@
  #include <asm/desc.h>
  #include <asm/mtrr.h>
  #include <asm/mce.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h> /* Ugh! */
-#include <asm/xcr.h>
+#include <linux/kernel_stat.h>
+#include <asm/fpu/internal.h> /* Ugh! */
  #include <asm/pvclock.h>
  #include <asm/div64.h>
  
@@ -3194,8 +3193,8 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
  
  static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
  {
-       struct xsave_struct *xsave = &vcpu->arch.guest_fpu.state->xsave;
-       u64 xstate_bv = xsave->xsave_hdr.xstate_bv;
+       struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave;
+       u64 xstate_bv = xsave->header.xfeatures;
         u64 valid;
  
         /*
@@ -3230,7 +3229,7 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
  
  static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
  {
-       struct xsave_struct *xsave = &vcpu->arch.guest_fpu.state->xsave;
+       struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave;
         u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET);
         u64 valid;
  
@@ -3241,9 +3240,9 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
         memcpy(xsave, src, XSAVE_HDR_OFFSET);
  
         /* Set XSTATE_BV and possibly XCOMP_BV.  */
-       xsave->xsave_hdr.xstate_bv = xstate_bv;
+       xsave->header.xfeatures = xstate_bv;
         if (cpu_has_xsaves)
-               xsave->xsave_hdr.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED;
+               xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED;
  
         /*
          * Copy each region from the non-compacted offset to the
@@ -3275,8 +3274,8 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
                 fill_xsave((u8 *) guest_xsave->region, vcpu);
         } else {
                 memcpy(guest_xsave->region,
-                       &vcpu->arch.guest_fpu.state->fxsave,
-                       sizeof(struct i387_fxsave_struct));
+                       &vcpu->arch.guest_fpu.state.fxsave,
+                       sizeof(struct fxregs_state));
                 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
                         XSTATE_FPSSE;
         }
@@ -3300,8 +3299,8 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
         } else {
                 if (xstate_bv & ~XSTATE_FPSSE)
                         return -EINVAL;
-               memcpy(&vcpu->arch.guest_fpu.state->fxsave,
-                       guest_xsave->region, sizeof(struct i387_fxsave_struct));
+               memcpy(&vcpu->arch.guest_fpu.state.fxsave,
+                       guest_xsave->region, sizeof(struct fxregs_state));
         }
         return 0;
  }
@@ -6597,11 +6596,11 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
  
  int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
  {
+       struct fpu *fpu = &current->thread.fpu;
         int r;
         sigset_t sigsaved;
  
-       if (!tsk_used_math(current) && init_fpu(current))
-               return -ENOMEM;
+       fpu__activate_curr(fpu);
  
         if (vcpu->sigset_active)
                 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
@@ -6971,8 +6970,8 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
  
  int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
  {
-       struct i387_fxsave_struct *fxsave =
-                       &vcpu->arch.guest_fpu.state->fxsave;
+       struct fxregs_state *fxsave =
+                       &vcpu->arch.guest_fpu.state.fxsave;
  
         memcpy(fpu->fpr, fxsave->st_space, 128);
         fpu->fcw = fxsave->cwd;
@@ -6988,8 +6987,8 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
  
  int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
  {
-       struct i387_fxsave_struct *fxsave =
-                       &vcpu->arch.guest_fpu.state->fxsave;
+       struct fxregs_state *fxsave =
+                       &vcpu->arch.guest_fpu.state.fxsave;
  
         memcpy(fxsave->st_space, fpu->fpr, 128);
         fxsave->cwd = fpu->fcw;
@@ -7003,17 +7002,11 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
         return 0;
  }
  
-int fx_init(struct kvm_vcpu *vcpu)
+static void fx_init(struct kvm_vcpu *vcpu)
  {
-       int err;
-
-       err = fpu_alloc(&vcpu->arch.guest_fpu);
-       if (err)
-               return err;
-
-       fpu_finit(&vcpu->arch.guest_fpu);
+       fpstate_init(&vcpu->arch.guest_fpu.state);
         if (cpu_has_xsaves)
-               vcpu->arch.guest_fpu.state->xsave.xsave_hdr.xcomp_bv =
+               vcpu->arch.guest_fpu.state.xsave.header.xcomp_bv =
                         host_xcr0 | XSTATE_COMPACTION_ENABLED;
  
         /*
@@ -7022,14 +7015,6 @@ int fx_init(struct kvm_vcpu *vcpu)
         vcpu->arch.xcr0 = XSTATE_FP;
  
         vcpu->arch.cr0 |= X86_CR0_ET;
-
-       return 0;
-}
-EXPORT_SYMBOL_GPL(fx_init);
-
-static void fx_free(struct kvm_vcpu *vcpu)
-{
-       fpu_free(&vcpu->arch.guest_fpu);
  }
  
  void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
@@ -7045,7 +7030,7 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
         kvm_put_guest_xcr0(vcpu);
         vcpu->guest_fpu_loaded = 1;
         __kernel_fpu_begin();
-       fpu_restore_checking(&vcpu->arch.guest_fpu);
+       __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state);
         trace_kvm_fpu(1);
  }
  
@@ -7057,7 +7042,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
                 return;
  
         vcpu->guest_fpu_loaded = 0;
-       fpu_save_init(&vcpu->arch.guest_fpu);
+       copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu);
         __kernel_fpu_end();
         ++vcpu->stat.fpu_reload;
         if (!vcpu->arch.eager_fpu)
@@ -7071,7 +7056,6 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
         kvmclock_reset(vcpu);
  
         free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
-       fx_free(vcpu);
         kvm_x86_ops->vcpu_free(vcpu);
  }
  
@@ -7137,7 +7121,6 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
         kvm_mmu_unload(vcpu);
         vcpu_put(vcpu);
  
-       fx_free(vcpu);
         kvm_x86_ops->vcpu_free(vcpu);
  }
  
@@ -7363,9 +7346,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
                 goto fail_free_mce_banks;
         }
  
-       r = fx_init(vcpu);
-       if (r)
-               goto fail_free_wbinvd_dirty_mask;
+       fx_init(vcpu);
  
         vcpu->arch.ia32_tsc_adjust_msr = 0x0;
         vcpu->arch.pv_time_enabled = false;
@@ -7379,8 +7360,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
         kvm_pmu_init(vcpu);
  
         return 0;
-fail_free_wbinvd_dirty_mask:
-       free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
+
  fail_free_mce_banks:
         kfree(vcpu->arch.mce_banks);
  fail_free_lapic:
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c

index cab9aaa7802c8e8ca729567f1a97cf9080458884..f2dc08c003eb0b4c6c9f691a1c4b9e8f556eaaac 100644 (file)
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -70,7 +70,7 @@
  #include <asm/e820.h>
  #include <asm/mce.h>
  #include <asm/io.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
  #include <asm/stackprotector.h>
  #include <asm/reboot.h>                /* for struct machine_ops */
  #include <asm/kvm_para.h>
diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c

index c9f2d9ba8dd8c2da54b0bbd07c37be1d38aa49c4..e5e3ed8dc0798bd007e8573ddbf57dc4e2312049 100644 (file)
--- a/arch/x86/lib/mmx_32.c
+++ b/arch/x86/lib/mmx_32.c
@@ -22,7 +22,7 @@
  #include <linux/sched.h>
  #include <linux/types.h>
  
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
  #include <asm/asm.h>
  
  void *_mmx_memcpy(void *to, const void *from, size_t len)
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c

index e2f5e21c03b3044a14ed12cb460ea2e3c0a0e13f..91d93b95bd8685228b395c10e77d30e3a4303355 100644 (file)
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -647,7 +647,8 @@ EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero);
   * @from: Source address, in kernel space.
   * @n:    Number of bytes to copy.
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * Copy data from kernel space to user space.
   *
@@ -668,7 +669,8 @@ EXPORT_SYMBOL(_copy_to_user);
   * @from: Source address, in user space.
   * @n:    Number of bytes to copy.
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * Copy data from user space to kernel space.
   *
diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c

index dc8adad10a2f3881cdbca82b9a624dc17b88c4f8..dd76a05729b0106c95ae2bd925ca21a647214fd6 100644 (file)
--- a/arch/x86/math-emu/fpu_aux.c
+++ b/arch/x86/math-emu/fpu_aux.c
@@ -30,7 +30,7 @@ static void fclex(void)
  }
  
  /* Needs to be externally visible */
-void finit_soft_fpu(struct i387_soft_struct *soft)
+void fpstate_init_soft(struct swregs_state *soft)
  {
         struct address *oaddr, *iaddr;
         memset(soft, 0, sizeof(*soft));
@@ -52,7 +52,7 @@ void finit_soft_fpu(struct i387_soft_struct *soft)
  
  void finit(void)
  {
-       finit_soft_fpu(&current->thread.fpu.state->soft);
+       fpstate_init_soft(&current->thread.fpu.state.soft);
  }
  
  /*
diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c

index 9b868124128d79699d6325dc579908bc575c3f4a..f37e84ab49f38e335bde57880a6cbe8640fb2c4b 100644 (file)
--- a/arch/x86/math-emu/fpu_entry.c
+++ b/arch/x86/math-emu/fpu_entry.c
@@ -31,7 +31,7 @@
  #include <asm/traps.h>
  #include <asm/desc.h>
  #include <asm/user.h>
-#include <asm/i387.h>
+#include <asm/fpu/internal.h>
  
  #include "fpu_system.h"
  #include "fpu_emu.h"
@@ -147,13 +147,9 @@ void math_emulate(struct math_emu_info *info)
         unsigned long code_base = 0;
         unsigned long code_limit = 0;   /* Initialized to stop compiler warnings */
         struct desc_struct code_descriptor;
+       struct fpu *fpu = &current->thread.fpu;
  
-       if (!used_math()) {
-               if (init_fpu(current)) {
-                       do_group_exit(SIGKILL);
-                       return;
-               }
-       }
+       fpu__activate_curr(fpu);
  
  #ifdef RE_ENTRANT_CHECKING
         if (emulating) {
@@ -673,7 +669,7 @@ void math_abort(struct math_emu_info *info, unsigned int signal)
  #endif /* PARANOID */
  }
  
-#define S387 ((struct i387_soft_struct *)s387)
+#define S387 ((struct swregs_state *)s387)
  #define sstatus_word() \
    ((S387->swd & ~SW_Top & 0xffff) | ((S387->ftop << SW_Top_Shift) & SW_Top))
  
@@ -682,14 +678,14 @@ int fpregs_soft_set(struct task_struct *target,
                     unsigned int pos, unsigned int count,
                     const void *kbuf, const void __user *ubuf)
  {
-       struct i387_soft_struct *s387 = &target->thread.fpu.state->soft;
+       struct swregs_state *s387 = &target->thread.fpu.state.soft;
         void *space = s387->st_space;
         int ret;
         int offset, other, i, tags, regnr, tag, newtop;
  
         RE_ENTRANT_CHECK_OFF;
         ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, s387, 0,
-                                offsetof(struct i387_soft_struct, st_space));
+                                offsetof(struct swregs_state, st_space));
         RE_ENTRANT_CHECK_ON;
  
         if (ret)
@@ -734,7 +730,7 @@ int fpregs_soft_get(struct task_struct *target,
                     unsigned int pos, unsigned int count,
                     void *kbuf, void __user *ubuf)
  {
-       struct i387_soft_struct *s387 = &target->thread.fpu.state->soft;
+       struct swregs_state *s387 = &target->thread.fpu.state.soft;
         const void *space = s387->st_space;
         int ret;
         int offset = (S387->ftop & 7) * 10, other = 80 - offset;
@@ -752,7 +748,7 @@ int fpregs_soft_get(struct task_struct *target,
  #endif /* PECULIAR_486 */
  
         ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, s387, 0,
-                                 offsetof(struct i387_soft_struct, st_space));
+                                 offsetof(struct swregs_state, st_space));
  
         /* Copy all registers in stack order. */
         if (!ret)
diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h

index 2c614410a5f3978d646f87d2814edaf2ec383396..9ccecb61a4fa129a82028b27edc18b91a2f99042 100644 (file)
--- a/arch/x86/math-emu/fpu_system.h
+++ b/arch/x86/math-emu/fpu_system.h
@@ -31,7 +31,7 @@
  #define SEG_EXPAND_DOWN(s)     (((s).b & ((1 << 11) | (1 << 10))) \
                                  == (1 << 10))
  
-#define I387                   (current->thread.fpu.state)
+#define I387                   (&current->thread.fpu.state)
  #define FPU_info               (I387->soft.info)
  
  #define FPU_CS                 (*(unsigned short *) &(FPU_info->regs->cs))
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c

index 181c53bac3a7ee8881b8844bae66b951d9beecde..9dc909841739bf24b01d7d2e574c4870cc409cfd 100644 (file)
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -13,6 +13,7 @@
  #include <linux/hugetlb.h>             /* hstate_index_to_shift        */
  #include <linux/prefetch.h>            /* prefetchw                    */
  #include <linux/context_tracking.h>    /* exception_enter(), ...       */
+#include <linux/uaccess.h>             /* faulthandler_disabled()      */
  
  #include <asm/traps.h>                 /* dotraplinkage, ...           */
  #include <asm/pgalloc.h>               /* pgd_*(), ...                 */
@@ -1126,9 +1127,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
  
         /*
          * If we're in an interrupt, have no user context or are running
-        * in an atomic region then we must not take the fault:
+        * in a region with pagefaults disabled then we must not take the fault
          */
-       if (unlikely(in_atomic() || !mm)) {
+       if (unlikely(faulthandler_disabled() || !mm)) {
                 bad_area_nosemaphore(regs, error_code, address);
                 return;
         }
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c

index 4500142bc4aa46429cb2be41a7ee3407426f6155..eecb207a2037080f9f5d74c36c300b217a4f7a82 100644 (file)
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -35,7 +35,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
         unsigned long vaddr;
         int idx, type;
  
-       /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+       preempt_disable();
         pagefault_disable();
  
         if (!PageHighMem(page))
@@ -100,6 +100,7 @@ void __kunmap_atomic(void *kvaddr)
  #endif
  
         pagefault_enable();
+       preempt_enable();
  }
  EXPORT_SYMBOL(__kunmap_atomic);
  
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c

index a9dc7a37e6a2e05423d2caf7f524b5d7f1c34865..9c0ff045fdd4dec98832a5c6de9353174c9695f0 100644 (file)
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -59,6 +59,7 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
         unsigned long vaddr;
         int idx, type;
  
+       preempt_disable();
         pagefault_disable();
  
         type = kmap_atomic_idx_push();
@@ -117,5 +118,6 @@ iounmap_atomic(void __iomem *kvaddr)
         }
  
         pagefault_enable();
+       preempt_enable();
  }
  EXPORT_SYMBOL_GPL(iounmap_atomic);
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c

index 8405c0c6a53594419b56bcab70effdfb1435e8df..cc5ccc415cc01ef8ea9e58b3f81a281c9ab412bf 100644 (file)
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -408,18 +408,18 @@ void *xlate_dev_mem_ptr(phys_addr_t phys)
  {
         unsigned long start  = phys &  PAGE_MASK;
         unsigned long offset = phys & ~PAGE_MASK;
-       unsigned long vaddr;
+       void *vaddr;
  
         /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */
         if (page_is_ram(start >> PAGE_SHIFT))
                 return __va(phys);
  
-       vaddr = (unsigned long)ioremap_cache(start, PAGE_SIZE);
+       vaddr = ioremap_cache(start, PAGE_SIZE);
         /* Only add the offset on success and return NULL if the ioremap() failed: */
         if (vaddr)
                 vaddr += offset;
  
-       return (void *)vaddr;
+       return vaddr;
  }
  
  void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
@@ -428,7 +428,6 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
                 return;
  
         iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK));
-       return;
  }
  
  static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c

index c439ec47821601c5b594bc1eec5abc529c5fd012..7a657f58bbea152057262a61e325c169f78bc516 100644 (file)
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -10,13 +10,15 @@
  #include <linux/syscalls.h>
  #include <linux/sched/sysctl.h>
  
-#include <asm/i387.h>
  #include <asm/insn.h>
  #include <asm/mman.h>
  #include <asm/mmu_context.h>
  #include <asm/mpx.h>
  #include <asm/processor.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
+
+#define CREATE_TRACE_POINTS
+#include <asm/trace/mpx.h>
  
  static const char *mpx_mapping_name(struct vm_area_struct *vma)
  {
@@ -32,6 +34,22 @@ static int is_mpx_vma(struct vm_area_struct *vma)
         return (vma->vm_ops == &mpx_vma_ops);
  }
  
+static inline unsigned long mpx_bd_size_bytes(struct mm_struct *mm)
+{
+       if (is_64bit_mm(mm))
+               return MPX_BD_SIZE_BYTES_64;
+       else
+               return MPX_BD_SIZE_BYTES_32;
+}
+
+static inline unsigned long mpx_bt_size_bytes(struct mm_struct *mm)
+{
+       if (is_64bit_mm(mm))
+               return MPX_BT_SIZE_BYTES_64;
+       else
+               return MPX_BT_SIZE_BYTES_32;
+}
+
  /*
   * This is really a simplified "vm_mmap". it only handles MPX
   * bounds tables (the bounds directory is user-allocated).
@@ -47,8 +65,8 @@ static unsigned long mpx_mmap(unsigned long len)
         vm_flags_t vm_flags;
         struct vm_area_struct *vma;
  
-       /* Only bounds table and bounds directory can be allocated here */
-       if (len != MPX_BD_SIZE_BYTES && len != MPX_BT_SIZE_BYTES)
+       /* Only a bounds table can be allocated here */
+       if (len != mpx_bt_size_bytes(mm))
                 return -EINVAL;
  
         down_write(&mm->mmap_sem);
@@ -272,10 +290,9 @@ bad_opcode:
   *
   * The caller is expected to kfree() the returned siginfo_t.
   */
-siginfo_t *mpx_generate_siginfo(struct pt_regs *regs,
-                               struct xsave_struct *xsave_buf)
+siginfo_t *mpx_generate_siginfo(struct pt_regs *regs)
  {
-       struct bndreg *bndregs, *bndreg;
+       const struct bndreg *bndregs, *bndreg;
         siginfo_t *info = NULL;
         struct insn insn;
         uint8_t bndregno;
@@ -295,8 +312,8 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs,
                 err = -EINVAL;
                 goto err_out;
         }
-       /* get the bndregs _area_ of the xsave structure */
-       bndregs = get_xsave_addr(xsave_buf, XSTATE_BNDREGS);
+       /* get bndregs field from current task's xsave area */
+       bndregs = get_xsave_field_ptr(XSTATE_BNDREGS);
         if (!bndregs) {
                 err = -EINVAL;
                 goto err_out;
@@ -334,6 +351,7 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs,
                 err = -EINVAL;
                 goto err_out;
         }
+       trace_mpx_bounds_register_exception(info->si_addr, bndreg);
         return info;
  err_out:
         /* info might be NULL, but kfree() handles that */
@@ -341,25 +359,18 @@ err_out:
         return ERR_PTR(err);
  }
  
-static __user void *task_get_bounds_dir(struct task_struct *tsk)
+static __user void *mpx_get_bounds_dir(void)
  {
-       struct bndcsr *bndcsr;
+       const struct bndcsr *bndcsr;
  
         if (!cpu_feature_enabled(X86_FEATURE_MPX))
                 return MPX_INVALID_BOUNDS_DIR;
  
-       /*
-        * 32-bit binaries on 64-bit kernels are currently
-        * unsupported.
-        */
-       if (IS_ENABLED(CONFIG_X86_64) && test_thread_flag(TIF_IA32))
-               return MPX_INVALID_BOUNDS_DIR;
         /*
          * The bounds directory pointer is stored in a register
          * only accessible if we first do an xsave.
          */
-       fpu_save_init(&tsk->thread.fpu);
-       bndcsr = get_xsave_addr(&tsk->thread.fpu.state->xsave, XSTATE_BNDCSR);
+       bndcsr = get_xsave_field_ptr(XSTATE_BNDCSR);
         if (!bndcsr)
                 return MPX_INVALID_BOUNDS_DIR;
  
@@ -378,10 +389,10 @@ static __user void *task_get_bounds_dir(struct task_struct *tsk)
                 (bndcsr->bndcfgu & MPX_BNDCFG_ADDR_MASK);
  }
  
-int mpx_enable_management(struct task_struct *tsk)
+int mpx_enable_management(void)
  {
         void __user *bd_base = MPX_INVALID_BOUNDS_DIR;
-       struct mm_struct *mm = tsk->mm;
+       struct mm_struct *mm = current->mm;
         int ret = 0;
  
         /*
@@ -390,11 +401,12 @@ int mpx_enable_management(struct task_struct *tsk)
          * directory into XSAVE/XRSTOR Save Area and enable MPX through
          * XRSTOR instruction.
          *
-        * fpu_xsave() is expected to be very expensive. Storing the bounds
-        * directory here means that we do not have to do xsave in the unmap
-        * path; we can just use mm->bd_addr instead.
+        * The copy_xregs_to_kernel() beneath get_xsave_field_ptr() is
+        * expected to be relatively expensive. Storing the bounds
+        * directory here means that we do not have to do xsave in the
+        * unmap path; we can just use mm->bd_addr instead.
          */
-       bd_base = task_get_bounds_dir(tsk);
+       bd_base = mpx_get_bounds_dir();
         down_write(&mm->mmap_sem);
         mm->bd_addr = bd_base;
         if (mm->bd_addr == MPX_INVALID_BOUNDS_DIR)
@@ -404,7 +416,7 @@ int mpx_enable_management(struct task_struct *tsk)
         return ret;
  }
  
-int mpx_disable_management(struct task_struct *tsk)
+int mpx_disable_management(void)
  {
         struct mm_struct *mm = current->mm;
  
@@ -417,29 +429,59 @@ int mpx_disable_management(struct task_struct *tsk)
         return 0;
  }
  
+static int mpx_cmpxchg_bd_entry(struct mm_struct *mm,
+               unsigned long *curval,
+               unsigned long __user *addr,
+               unsigned long old_val, unsigned long new_val)
+{
+       int ret;
+       /*
+        * user_atomic_cmpxchg_inatomic() actually uses sizeof() of
+        * the type that the pointer we pass points to in order to
+        * figure out how much data to cmpxchg.  We have to be careful
+        * here not to pass a pointer to a 64-bit data type when we
+        * only want a 32-bit copy.
+        */
+       if (is_64bit_mm(mm)) {
+               ret = user_atomic_cmpxchg_inatomic(curval,
+                               addr, old_val, new_val);
+       } else {
+               u32 uninitialized_var(curval_32);
+               u32 old_val_32 = old_val;
+               u32 new_val_32 = new_val;
+               u32 __user *addr_32 = (u32 __user *)addr;
+
+               ret = user_atomic_cmpxchg_inatomic(&curval_32,
+                               addr_32, old_val_32, new_val_32);
+               *curval = curval_32;
+       }
+       return ret;
+}
+
  /*
- * With 32-bit mode, MPX_BT_SIZE_BYTES is 4MB, and the size of each
- * bounds table is 16KB. With 64-bit mode, MPX_BT_SIZE_BYTES is 2GB,
+ * With 32-bit mode, a bounds directory is 4MB, and the size of each
+ * bounds table is 16KB. With 64-bit mode, a bounds directory is 2GB,
   * and the size of each bounds table is 4MB.
   */
-static int allocate_bt(long __user *bd_entry)
+static int allocate_bt(struct mm_struct *mm, long __user *bd_entry)
  {
         unsigned long expected_old_val = 0;
         unsigned long actual_old_val = 0;
         unsigned long bt_addr;
+       unsigned long bd_new_entry;
         int ret = 0;
  
         /*
          * Carve the virtual space out of userspace for the new
          * bounds table:
          */
-       bt_addr = mpx_mmap(MPX_BT_SIZE_BYTES);
+       bt_addr = mpx_mmap(mpx_bt_size_bytes(mm));
         if (IS_ERR((void *)bt_addr))
                 return PTR_ERR((void *)bt_addr);
         /*
          * Set the valid flag (kinda like _PAGE_PRESENT in a pte)
          */
-       bt_addr = bt_addr | MPX_BD_ENTRY_VALID_FLAG;
+       bd_new_entry = bt_addr | MPX_BD_ENTRY_VALID_FLAG;
  
         /*
          * Go poke the address of the new bounds table in to the
@@ -452,8 +494,8 @@ static int allocate_bt(long __user *bd_entry)
          * mmap_sem at this point, unlike some of the other part
          * of the MPX code that have to pagefault_disable().
          */
-       ret = user_atomic_cmpxchg_inatomic(&actual_old_val, bd_entry,
-                                          expected_old_val, bt_addr);
+       ret = mpx_cmpxchg_bd_entry(mm, &actual_old_val, bd_entry,
+                                  expected_old_val, bd_new_entry);
         if (ret)
                 goto out_unmap;
  
@@ -481,9 +523,10 @@ static int allocate_bt(long __user *bd_entry)
                 ret = -EINVAL;
                 goto out_unmap;
         }
+       trace_mpx_new_bounds_table(bt_addr);
         return 0;
  out_unmap:
-       vm_munmap(bt_addr & MPX_BT_ADDR_MASK, MPX_BT_SIZE_BYTES);
+       vm_munmap(bt_addr, mpx_bt_size_bytes(mm));
         return ret;
  }
  
@@ -498,12 +541,13 @@ out_unmap:
   * bound table is 16KB. With 64-bit mode, the size of BD is 2GB,
   * and the size of each bound table is 4MB.
   */
-static int do_mpx_bt_fault(struct xsave_struct *xsave_buf)
+static int do_mpx_bt_fault(void)
  {
         unsigned long bd_entry, bd_base;
-       struct bndcsr *bndcsr;
+       const struct bndcsr *bndcsr;
+       struct mm_struct *mm = current->mm;
  
-       bndcsr = get_xsave_addr(xsave_buf, XSTATE_BNDCSR);
+       bndcsr = get_xsave_field_ptr(XSTATE_BNDCSR);
         if (!bndcsr)
                 return -EINVAL;
         /*
@@ -520,13 +564,13 @@ static int do_mpx_bt_fault(struct xsave_struct *xsave_buf)
          * the directory is.
          */
         if ((bd_entry < bd_base) ||
-           (bd_entry >= bd_base + MPX_BD_SIZE_BYTES))
+           (bd_entry >= bd_base + mpx_bd_size_bytes(mm)))
                 return -EINVAL;
  
-       return allocate_bt((long __user *)bd_entry);
+       return allocate_bt(mm, (long __user *)bd_entry);
  }
  
-int mpx_handle_bd_fault(struct xsave_struct *xsave_buf)
+int mpx_handle_bd_fault(void)
  {
         /*
          * Userspace never asked us to manage the bounds tables,
@@ -535,7 +579,7 @@ int mpx_handle_bd_fault(struct xsave_struct *xsave_buf)
         if (!kernel_managing_mpx_tables(current->mm))
                 return -EINVAL;
  
-       if (do_mpx_bt_fault(xsave_buf)) {
+       if (do_mpx_bt_fault()) {
                 force_sig(SIGSEGV, current);
                 /*
                  * The force_sig() is essentially "handling" this
@@ -572,29 +616,55 @@ static int mpx_resolve_fault(long __user *addr, int write)
         return 0;
  }
  
+static unsigned long mpx_bd_entry_to_bt_addr(struct mm_struct *mm,
+                                            unsigned long bd_entry)
+{
+       unsigned long bt_addr = bd_entry;
+       int align_to_bytes;
+       /*
+        * Turn the raw value of a bounds directory entry in to
+        * the address of the bounds table it refers to.
+        *
+        * Bit 0 in a bd_entry is always the valid bit, so it is
+        * not part of the address and gets cleared first.
+        */
+       bt_addr &= ~MPX_BD_ENTRY_VALID_FLAG;
+       /*
+        * Tables are naturally aligned at 8-byte boundaries
+        * on 64-bit and 4-byte boundaries on 32-bit.  The
+        * documentation makes it appear that the low bits
+        * are ignored by the hardware, so we do the same
+        * and mask off everything below that alignment.
+        */
+       if (is_64bit_mm(mm))
+               align_to_bytes = 8;
+       else
+               align_to_bytes = 4;
+       bt_addr &= ~(align_to_bytes-1);
+       return bt_addr;
+}
+
  /*
   * Get the base of bounds tables pointed by specific bounds
   * directory entry.
   */
  static int get_bt_addr(struct mm_struct *mm,
-                       long __user *bd_entry, unsigned long *bt_addr)
+                       long __user *bd_entry_ptr,
+                       unsigned long *bt_addr_result)
  {
         int ret;
         int valid_bit;
+       unsigned long bd_entry;
+       unsigned long bt_addr;
  
-       if (!access_ok(VERIFY_READ, (bd_entry), sizeof(*bd_entry)))
+       if (!access_ok(VERIFY_READ, (bd_entry_ptr), sizeof(*bd_entry_ptr)))
                 return -EFAULT;
  
         while (1) {
                 int need_write = 0;
  
                 pagefault_disable();
-               ret = get_user(*bt_addr, bd_entry);
+               ret = get_user(bd_entry, bd_entry_ptr);
                 pagefault_enable();
                 if (!ret)
                         break;
                 if (ret == -EFAULT)
-                       ret = mpx_resolve_fault(bd_entry, need_write);
+                       ret = mpx_resolve_fault(bd_entry_ptr, need_write);
                 /*
                  * If we could not resolve the fault, consider it
                  * userspace's fault and error out.
@@ -603,8 +673,8 @@ static int get_bt_addr(struct mm_struct *mm,
                         return ret;
         }
  
-       valid_bit = *bt_addr & MPX_BD_ENTRY_VALID_FLAG;
-       *bt_addr &= MPX_BT_ADDR_MASK;
+       valid_bit = bd_entry & MPX_BD_ENTRY_VALID_FLAG;
+       bt_addr = mpx_bd_entry_to_bt_addr(mm, bd_entry);
  
         /*
          * When the kernel is managing bounds tables, a bounds directory
@@ -613,7 +683,7 @@ static int get_bt_addr(struct mm_struct *mm,
          * data in the address field, we know something is wrong. This
          * -EINVAL return will cause a SIGSEGV.
          */
-       if (!valid_bit && *bt_addr)
+       if (!valid_bit && bt_addr)
                 return -EINVAL;
         /*
          * Do we have an completely zeroed bt entry?  That is OK.  It
@@ -624,19 +694,100 @@ static int get_bt_addr(struct mm_struct *mm,
         if (!valid_bit)
                 return -ENOENT;
  
+       *bt_addr_result = bt_addr;
         return 0;
  }
  
+/*
+ * Size in bytes of a single bounds table entry.  The value
+ * depends on whether 'mm' is a 64-bit or a 32-bit mm.
+ */
+static inline int bt_entry_size_bytes(struct mm_struct *mm)
+{
+       if (is_64bit_mm(mm))
+               return MPX_BT_ENTRY_BYTES_64;
+       else
+               return MPX_BT_ENTRY_BYTES_32;
+}
+
+/*
+ * Takes a virtual address and turns it in to the offset in
+ * bytes inside of the bounds table where the bounds table
+ * entry controlling 'addr' can be found.
+ *
+ * The result is relative to the start of the table; callers
+ * add the table's base address themselves.
+ */
+static unsigned long mpx_get_bt_entry_offset_bytes(struct mm_struct *mm,
+               unsigned long addr)
+{
+       unsigned long bt_table_nr_entries;
+       unsigned long offset = addr;
+
+       if (is_64bit_mm(mm)) {
+               /* Bottom 3 bits are ignored on 64-bit */
+               offset >>= 3;
+               bt_table_nr_entries = MPX_BT_NR_ENTRIES_64;
+       } else {
+               /* Bottom 2 bits are ignored on 32-bit */
+               offset >>= 2;
+               bt_table_nr_entries = MPX_BT_NR_ENTRIES_32;
+       }
+       /*
+        * We know the size of the table in to which we are
+        * indexing, and we have eliminated all the low bits
+        * which are ignored for indexing.
+        *
+        * Mask out all the high bits which we do not need
+        * to index in to the table.  Note that the tables
+        * are always powers of two so this gives us a proper
+        * mask.
+        */
+       offset &= (bt_table_nr_entries-1);
+       /*
+        * We now have an entry offset in terms of *entries* in
+        * the table.  We need to scale it back up to bytes.
+        */
+       offset *= bt_entry_size_bytes(mm);
+       return offset;
+}
+
+/*
+ * How much virtual address space does a single bounds
+ * directory entry cover?
+ *
+ * A 32-bit mm always spans exactly 4GB, even when the CPU
+ * reports more virtual address bits, so 'x86_virt_bits' is
+ * only consulted for 64-bit mms.  The math is done in long
+ * long because 4GB doesn't fit in to a long on 32-bit.
+ */
+static inline unsigned long bd_entry_virt_space(struct mm_struct *mm)
+{
+       if (!is_64bit_mm(mm))
+               return (1ULL << 32) / MPX_BD_NR_ENTRIES_32;
+       return (1ULL << boot_cpu_data.x86_virt_bits) / MPX_BD_NR_ENTRIES_64;
+}
+
  /*
   * Free the backing physical pages of bounds table 'bt_addr'.
   * Assume start...end is within that bounds table.
   */
-static int zap_bt_entries(struct mm_struct *mm,
+static noinline int zap_bt_entries_mapping(struct mm_struct *mm,
                 unsigned long bt_addr,
-               unsigned long start, unsigned long end)
+               unsigned long start_mapping, unsigned long end_mapping)
  {
         struct vm_area_struct *vma;
         unsigned long addr, len;
+       unsigned long start;
+       unsigned long end;
+
+       /*
+        * if we 'end' on a boundary, the offset will be 0 which
+        * is not what we want.  Back it up a byte to get the
+        * last bt entry.  Then once we have the entry itself,
+        * move 'end' back up by the table entry size.
+        */
+       start = bt_addr + mpx_get_bt_entry_offset_bytes(mm, start_mapping);
+       end   = bt_addr + mpx_get_bt_entry_offset_bytes(mm, end_mapping - 1);
+       /*
+        * Move end back up by one entry.  Among other things
+        * this ensures that it remains page-aligned and does
+        * not screw up zap_page_range()
+        */
+       end += bt_entry_size_bytes(mm);
  
         /*
          * Find the first overlapping vma. If vma->vm_start > start, there
@@ -648,7 +799,7 @@ static int zap_bt_entries(struct mm_struct *mm,
                 return -EINVAL;
  
         /*
-        * A NUMA policy on a VM_MPX VMA could cause this bouds table to
+        * A NUMA policy on a VM_MPX VMA could cause this bounds table to
          * be split. So we need to look across the entire 'start -> end'
          * range of this bounds table, find all of the VM_MPX VMAs, and
          * zap only those.
@@ -666,27 +817,65 @@ static int zap_bt_entries(struct mm_struct *mm,
  
                 len = min(vma->vm_end, end) - addr;
                 zap_page_range(vma, addr, len, NULL);
+               trace_mpx_unmap_zap(addr, addr+len);
  
                 vma = vma->vm_next;
                 addr = vma->vm_start;
         }
-
         return 0;
  }
  
-static int unmap_single_bt(struct mm_struct *mm,
+static unsigned long mpx_get_bd_entry_offset(struct mm_struct *mm,
+               unsigned long addr)
+{
+       /*
+        * Returns the offset in bytes, from the start of the
+        * bounds directory, of the entry controlling 'addr'.
+        *
+        * There are several ways to derive the bd offsets.  We
+        * use the following approach here:
+        * 1. We know the size of the virtual address space
+        * 2. We know the number of entries in a bounds directory
+        * 3. We know that each entry covers a fixed amount of
+        *    virtual address space.
+        * So, we can just divide the virtual address by the
+        * virtual space used by one entry to determine which
+        * entry "controls" the given virtual address.
+        */
+       if (is_64bit_mm(mm)) {
+               int bd_entry_size = 8; /* 64-bit pointer */
+               /*
+                * Take the 64-bit addressing hole in to account.
+                */
+               addr &= ((1UL << boot_cpu_data.x86_virt_bits) - 1);
+               return (addr / bd_entry_virt_space(mm)) * bd_entry_size;
+       } else {
+               int bd_entry_size = 4; /* 32-bit pointer */
+               /*
+                * 32-bit has no hole so this case needs no mask
+                */
+               return (addr / bd_entry_virt_space(mm)) * bd_entry_size;
+       }
+       /*
+        * The two return statements above are exact copies.  If
+        * we pull out a single copy and put it in here, gcc won't
+        * realize that we're doing a power-of-2 divide and use
+        * shifts.  It uses a real divide.  If we put them up
+        * there, it manages to figure it out (gcc 4.8.3).
+        */
+}
+
+static int unmap_entire_bt(struct mm_struct *mm,
                 long __user *bd_entry, unsigned long bt_addr)
  {
         unsigned long expected_old_val = bt_addr | MPX_BD_ENTRY_VALID_FLAG;
-       unsigned long actual_old_val = 0;
+       unsigned long uninitialized_var(actual_old_val);
         int ret;
  
         while (1) {
                 int need_write = 1;
+               unsigned long cleared_bd_entry = 0;
  
                 pagefault_disable();
-               ret = user_atomic_cmpxchg_inatomic(&actual_old_val, bd_entry,
-                                                  expected_old_val, 0);
+               ret = mpx_cmpxchg_bd_entry(mm, &actual_old_val,
+                               bd_entry, expected_old_val, cleared_bd_entry);
                 pagefault_enable();
                 if (!ret)
                         break;
@@ -705,9 +894,8 @@ static int unmap_single_bt(struct mm_struct *mm,
         if (actual_old_val != expected_old_val) {
                 /*
                  * Someone else raced with us to unmap the table.
-                * There was no bounds table pointed to by the
-                * directory, so declare success.  Somebody freed
-                * it.
+                * That is OK, since we were both trying to do
+                * the same thing.  Declare success.
                  */
                 if (!actual_old_val)
                         return 0;
@@ -720,176 +908,113 @@ static int unmap_single_bt(struct mm_struct *mm,
                  */
                 return -EINVAL;
         }
-
         /*
          * Note, we are likely being called under do_munmap() already. To
          * avoid recursion, do_munmap() will check whether it comes
          * from one bounds table through VM_MPX flag.
          */
-       return do_munmap(mm, bt_addr, MPX_BT_SIZE_BYTES);
+       return do_munmap(mm, bt_addr, mpx_bt_size_bytes(mm));
  }
  
-/*
- * If the bounds table pointed by bounds directory 'bd_entry' is
- * not shared, unmap this whole bounds table. Otherwise, only free
- * those backing physical pages of bounds table entries covered
- * in this virtual address region start...end.
- */
-static int unmap_shared_bt(struct mm_struct *mm,
-               long __user *bd_entry, unsigned long start,
-               unsigned long end, bool prev_shared, bool next_shared)
+static int try_unmap_single_bt(struct mm_struct *mm,
+              unsigned long start, unsigned long end)
  {
-       unsigned long bt_addr;
-       int ret;
-
-       ret = get_bt_addr(mm, bd_entry, &bt_addr);
+       struct vm_area_struct *next;
+       struct vm_area_struct *prev;
         /*
-        * We could see an "error" ret for not-present bounds
-        * tables (not really an error), or actual errors, but
-        * stop unmapping either way.
+        * "bta" == Bounds Table Area: the area controlled by the
+        * bounds table that we are unmapping.
          */
-       if (ret)
-               return ret;
-
-       if (prev_shared && next_shared)
-               ret = zap_bt_entries(mm, bt_addr,
-                               bt_addr+MPX_GET_BT_ENTRY_OFFSET(start),
-                               bt_addr+MPX_GET_BT_ENTRY_OFFSET(end));
-       else if (prev_shared)
-               ret = zap_bt_entries(mm, bt_addr,
-                               bt_addr+MPX_GET_BT_ENTRY_OFFSET(start),
-                               bt_addr+MPX_BT_SIZE_BYTES);
-       else if (next_shared)
-               ret = zap_bt_entries(mm, bt_addr, bt_addr,
-                               bt_addr+MPX_GET_BT_ENTRY_OFFSET(end));
-       else
-               ret = unmap_single_bt(mm, bd_entry, bt_addr);
-
-       return ret;
-}
-
-/*
- * A virtual address region being munmap()ed might share bounds table
- * with adjacent VMAs. We only need to free the backing physical
- * memory of these shared bounds tables entries covered in this virtual
- * address region.
- */
-static int unmap_edge_bts(struct mm_struct *mm,
-               unsigned long start, unsigned long end)
-{
+       unsigned long bta_start_vaddr = start & ~(bd_entry_virt_space(mm)-1);
+       unsigned long bta_end_vaddr = bta_start_vaddr + bd_entry_virt_space(mm);
+       unsigned long uninitialized_var(bt_addr);
+       void __user *bde_vaddr;
         int ret;
-       long __user *bde_start, *bde_end;
-       struct vm_area_struct *prev, *next;
-       bool prev_shared = false, next_shared = false;
-
-       bde_start = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(start);
-       bde_end = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(end-1);
-
         /*
-        * Check whether bde_start and bde_end are shared with adjacent
-        * VMAs.
-        *
-        * We already unliked the VMAs from the mm's rbtree so 'start'
+        * We already unlinked the VMAs from the mm's rbtree so 'start'
          * is guaranteed to be in a hole. This gets us the first VMA
          * before the hole in to 'prev' and the next VMA after the hole
          * in to 'next'.
          */
         next = find_vma_prev(mm, start, &prev);
-       if (prev && (mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(prev->vm_end-1))
-                       == bde_start)
-               prev_shared = true;
-       if (next && (mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(next->vm_start))
-                       == bde_end)
-               next_shared = true;
-
         /*
-        * This virtual address region being munmap()ed is only
-        * covered by one bounds table.
-        *
-        * In this case, if this table is also shared with adjacent
-        * VMAs, only part of the backing physical memory of the bounds
-        * table need be freeed. Otherwise the whole bounds table need
-        * be unmapped.
-        */
-       if (bde_start == bde_end) {
-               return unmap_shared_bt(mm, bde_start, start, end,
-                               prev_shared, next_shared);
+        * Do not count other MPX bounds table VMAs as neighbors.
+        * Although theoretically possible, we do not allow bounds
+        * tables for bounds tables so our heads do not explode.
+        * If we count them as neighbors here, we may end up with
+        * lots of tables even though we have no actual table
+        * entries in use.
+        */
+       while (next && is_mpx_vma(next))
+               next = next->vm_next;
+       while (prev && is_mpx_vma(prev))
+               prev = prev->vm_prev;
+       /*
+        * We know 'start' and 'end' lie within an area controlled
+        * by a single bounds table.  See if there are any other
+        * VMAs controlled by that bounds table.  If there are not
+        * then we can "expand" the area we are unmapping to possibly
+        * cover the entire table.  Do not look 'next' and 'prev' up
+        * again here: that would undo the skipping of MPX VMAs above.
+        */
+       if ((!prev || prev->vm_end <= bta_start_vaddr) &&
+           (!next || next->vm_start >= bta_end_vaddr)) {
+               /*
+                * No neighbor VMAs controlled by same bounds
+                * table.  Try to unmap the whole thing
+                */
+               start = bta_start_vaddr;
+               end = bta_end_vaddr;
         }
  
+       bde_vaddr = mm->bd_addr + mpx_get_bd_entry_offset(mm, start);
+       ret = get_bt_addr(mm, bde_vaddr, &bt_addr);
         /*
-        * If more than one bounds tables are covered in this virtual
-        * address region being munmap()ed, we need to separately check
-        * whether bde_start and bde_end are shared with adjacent VMAs.
+        * No bounds table there, so nothing to unmap.
          */
-       ret = unmap_shared_bt(mm, bde_start, start, end, prev_shared, false);
-       if (ret)
-               return ret;
-       ret = unmap_shared_bt(mm, bde_end, start, end, false, next_shared);
+       if (ret == -ENOENT) {
+               ret = 0;
+               return 0;
+       }
         if (ret)
                 return ret;
-
-       return 0;
+       /*
+        * We are unmapping an entire table.  Either because the
+        * unmap that started this whole process was large enough
+        * to cover an entire table, or because the unmap was small
+        * but was the only area covered by a bounds table.
+        */
+       if ((start == bta_start_vaddr) &&
+           (end == bta_end_vaddr))
+               return unmap_entire_bt(mm, bde_vaddr, bt_addr);
+       return zap_bt_entries_mapping(mm, bt_addr, start, end);
  }
  
  static int mpx_unmap_tables(struct mm_struct *mm,
                 unsigned long start, unsigned long end)
  {
-       int ret;
-       long __user *bd_entry, *bde_start, *bde_end;
-       unsigned long bt_addr;
-
-       /*
-        * "Edge" bounds tables are those which are being used by the region
-        * (start -> end), but that may be shared with adjacent areas.  If they
-        * turn out to be completely unshared, they will be freed.  If they are
-        * shared, we will free the backing store (like an MADV_DONTNEED) for
-        * areas used by this region.
-        */
-       ret = unmap_edge_bts(mm, start, end);
-       switch (ret) {
-               /* non-present tables are OK */
-               case 0:
-               case -ENOENT:
-                       /* Success, or no tables to unmap */
-                       break;
-               case -EINVAL:
-               case -EFAULT:
-               default:
-                       return ret;
-       }
-
-       /*
-        * Only unmap the bounds table that are
-        *   1. fully covered
-        *   2. not at the edges of the mapping, even if full aligned
-        */
-       bde_start = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(start);
-       bde_end = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(end-1);
-       for (bd_entry = bde_start + 1; bd_entry < bde_end; bd_entry++) {
-               ret = get_bt_addr(mm, bd_entry, &bt_addr);
-               switch (ret) {
-                       case 0:
-                               break;
-                       case -ENOENT:
-                               /* No table here, try the next one */
-                               continue;
-                       case -EINVAL:
-                       case -EFAULT:
-                       default:
-                               /*
-                                * Note: we are being strict here.
-                                * Any time we run in to an issue
-                                * unmapping tables, we stop and
-                                * SIGSEGV.
-                                */
-                               return ret;
-               }
-
-               ret = unmap_single_bt(mm, bd_entry, bt_addr);
+       unsigned long one_unmap_start;
+       trace_mpx_unmap_search(start, end);
+
+       one_unmap_start = start;
+       while (one_unmap_start < end) {
+               int ret;
+               unsigned long next_unmap_start = ALIGN(one_unmap_start+1,
+                                                      bd_entry_virt_space(mm));
+               unsigned long one_unmap_end = end;
+               /*
+                * if the end is beyond the current bounds table,
+                * move it back so we only deal with a single one
+                * at a time
+                */
+               if (one_unmap_end > next_unmap_start)
+                       one_unmap_end = next_unmap_start;
+               ret = try_unmap_single_bt(mm, one_unmap_start, one_unmap_end);
                 if (ret)
                         return ret;
-       }
  
+               one_unmap_start = next_unmap_start;
+       }
         return 0;
  }
  
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c

index 02744df576d52588a35308998ecc1a138435012e..3b984c3aa1b0b5ba6e7b5e5321edb4fbc7013e5d 100644 (file)
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -501,6 +501,8 @@ void __init efi_init(void)
  
         if (efi_enabled(EFI_DBG))
                 print_efi_memmap();
+
+       efi_esrt_init();
  }
  
  void __init efi_late_init(void)
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c

index 757678fb26e1a06277687c1c90f86e75377de03a..0d7dd1f5ac36fa6814c18522dd28561566c570eb 100644 (file)
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -18,10 +18,9 @@
  #include <asm/mtrr.h>
  #include <asm/page.h>
  #include <asm/mce.h>
-#include <asm/xcr.h>
  #include <asm/suspend.h>
+#include <asm/fpu/internal.h>
  #include <asm/debugreg.h>
-#include <asm/fpu-internal.h> /* pcntxt_mask */
  #include <asm/cpu.h>
  
  #ifdef CONFIG_X86_32
@@ -155,6 +154,8 @@ static void fix_processor_context(void)
  #endif
         load_TR_desc();                         /* This does ltr */
         load_LDT(&current->active_mm->context); /* This does lldt */
+
+       fpu__resume_cpu();
  }
  
  /**
@@ -221,12 +222,6 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)
         wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
  #endif
  
-       /*
-        * restore XCR0 for xsave capable cpu's.
-        */
-       if (cpu_has_xsave)
-               xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
-
         fix_processor_context();
  
         do_fpu_end();
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c

index a8f57a94785ab714d63379c3f876b7d7d28f9130..0b95c9b8283fe2afe885d9a8ae98393c14ecc498 100644 (file)
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1424,7 +1424,7 @@ static void xen_pvh_set_cr_flags(int cpu)
                 return;
         /*
          * For BSP, PSE PGE are set in probe_page_size_mask(), for APs
-        * set them here. For all, OSFXSR OSXMMEXCPT are set in fpu_init.
+        * set them here. For all, OSFXSR OSXMMEXCPT are set in fpu__init_cpu().
         */
         if (cpu_has_pse)
                 cr4_set_bits_and_update_boot(X86_CR4_PSE);
diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c

index 9e3571a6535c3b1bbc8535195ee40405fe9c42c0..83a44a33cfa11221f99ba5f8d836e02a0c4c9f92 100644 (file)
--- a/arch/xtensa/mm/fault.c
+++ b/arch/xtensa/mm/fault.c
@@ -15,10 +15,10 @@
  #include <linux/mm.h>
  #include <linux/module.h>
  #include <linux/hardirq.h>
+#include <linux/uaccess.h>
  #include <asm/mmu_context.h>
  #include <asm/cacheflush.h>
  #include <asm/hardirq.h>
-#include <asm/uaccess.h>
  #include <asm/pgalloc.h>
  
  DEFINE_PER_CPU(unsigned long, asid_cache) = ASID_USER_FIRST;
@@ -57,7 +57,7 @@ void do_page_fault(struct pt_regs *regs)
         /* If we're in an interrupt or have no user
          * context, we must not take the fault..
          */
-       if (in_atomic() || !mm) {
+       if (faulthandler_disabled() || !mm) {
                 bad_page_fault(regs, address, SIGSEGV);
                 return;
         }
diff --git a/arch/xtensa/mm/highmem.c b/arch/xtensa/mm/highmem.c

index 8cfb71ec0937369a8adf79a3d6e754f589cbba5f..184ceadccc1a3aca946b3c9e609a2efb91046807 100644 (file)
--- a/arch/xtensa/mm/highmem.c
+++ b/arch/xtensa/mm/highmem.c
@@ -42,6 +42,7 @@ void *kmap_atomic(struct page *page)
         enum fixed_addresses idx;
         unsigned long vaddr;
  
+       preempt_disable();
         pagefault_disable();
         if (!PageHighMem(page))
                 return page_address(page);
@@ -79,6 +80,7 @@ void __kunmap_atomic(void *kvaddr)
         }
  
         pagefault_enable();
+       preempt_enable();
  }
  EXPORT_SYMBOL(__kunmap_atomic);
  
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c

index 5f13f4d0bcceda747589a300170537eefb4810eb..1e28ddb656b891b92d7c135fa65914939b1451aa 100644 (file)
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -24,7 +24,7 @@ static int get_first_sibling(unsigned int cpu)
  {
         unsigned int ret;
  
-       ret = cpumask_first(topology_thread_cpumask(cpu));
+       ret = cpumask_first(topology_sibling_cpumask(cpu));
         if (ret < nr_cpu_ids)
                 return ret;
  
diff --git a/block/blk-mq.c b/block/blk-mq.c

index e68b71b85a7eaf0e3097debe8bf4dc4078e7a038..594eea04266e6d05f7256255552a1c4c72c664f3 100644 (file)
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1600,6 +1600,7 @@ static int blk_mq_hctx_notify(void *data, unsigned long action,
         return NOTIFY_OK;
  }
  
+/* hctx->ctxs will be freed in queue's release handler */
  static void blk_mq_exit_hctx(struct request_queue *q,
                 struct blk_mq_tag_set *set,
                 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
@@ -1618,7 +1619,6 @@ static void blk_mq_exit_hctx(struct request_queue *q,
  
         blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
         blk_free_flush_queue(hctx->fq);
-       kfree(hctx->ctxs);
         blk_mq_free_bitmap(&hctx->ctx_map);
  }
  
@@ -1891,8 +1891,12 @@ void blk_mq_release(struct request_queue *q)
         unsigned int i;
  
         /* hctx kobj stays in hctx */
-       queue_for_each_hw_ctx(q, hctx, i)
+       queue_for_each_hw_ctx(q, hctx, i) {
+               if (!hctx)
+                       continue;
+               kfree(hctx->ctxs);
                 kfree(hctx);
+       }
  
         kfree(q->queue_hw_ctx);
  
diff --git a/block/genhd.c b/block/genhd.c

index 0a536dc05f3b559d6d04c1e819d65290f96f7c35..ea982eadaf6380b974d6b1d39a7197085217ac91 100644 (file)
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -422,9 +422,9 @@ int blk_alloc_devt(struct hd_struct *part, dev_t *devt)
         /* allocate ext devt */
         idr_preload(GFP_KERNEL);
  
-       spin_lock(&ext_devt_lock);
+       spin_lock_bh(&ext_devt_lock);
         idx = idr_alloc(&ext_devt_idr, part, 0, NR_EXT_DEVT, GFP_NOWAIT);
-       spin_unlock(&ext_devt_lock);
+       spin_unlock_bh(&ext_devt_lock);
  
         idr_preload_end();
         if (idx < 0)
@@ -449,9 +449,9 @@ void blk_free_devt(dev_t devt)
                 return;
  
         if (MAJOR(devt) == BLOCK_EXT_MAJOR) {
-               spin_lock(&ext_devt_lock);
+               spin_lock_bh(&ext_devt_lock);
                 idr_remove(&ext_devt_idr, blk_mangle_minor(MINOR(devt)));
-               spin_unlock(&ext_devt_lock);
+               spin_unlock_bh(&ext_devt_lock);
         }
  }
  
@@ -653,7 +653,6 @@ void del_gendisk(struct gendisk *disk)
         disk->flags &= ~GENHD_FL_UP;
  
         sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
-       bdi_unregister(&disk->queue->backing_dev_info);
         blk_unregister_queue(disk);
         blk_unregister_region(disk_devt(disk), disk->minors);
  
@@ -691,13 +690,13 @@ struct gendisk *get_gendisk(dev_t devt, int *partno)
         } else {
                 struct hd_struct *part;
  
-               spin_lock(&ext_devt_lock);
+               spin_lock_bh(&ext_devt_lock);
                 part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt)));
                 if (part && get_disk(part_to_disk(part))) {
                         *partno = part->partno;
                         disk = part_to_disk(part);
                 }
-               spin_unlock(&ext_devt_lock);
+               spin_unlock_bh(&ext_devt_lock);
         }
  
         return disk;
diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c

index 6bc9cbc01ad6a3f20c27740ebbf1a919e7c74d0d..00b39802d7ecf5826b8c0b5dc9036b9f2f452ba5 100644 (file)
--- a/drivers/acpi/acpi_pad.c
+++ b/drivers/acpi/acpi_pad.c
@@ -105,7 +105,7 @@ static void round_robin_cpu(unsigned int tsk_index)
         mutex_lock(&round_robin_lock);
         cpumask_clear(tmp);
         for_each_cpu(cpu, pad_busy_cpus)
-               cpumask_or(tmp, tmp, topology_thread_cpumask(cpu));
+               cpumask_or(tmp, tmp, topology_sibling_cpumask(cpu));
         cpumask_andnot(tmp, cpu_online_mask, tmp);
         /* avoid HT sibilings if possible */
         if (cpumask_empty(tmp))
diff --git a/drivers/ata/ahci_mvebu.c b/drivers/ata/ahci_mvebu.c

index 23716dd8a7ec3f569f82db531e1ed71bc330c7d6..5928d0746a270e7b6b2ee12a022b19ed731f03fe 100644 (file)
--- a/drivers/ata/ahci_mvebu.c
+++ b/drivers/ata/ahci_mvebu.c
@@ -45,7 +45,7 @@ static void ahci_mvebu_mbus_config(struct ahci_host_priv *hpriv,
                 writel((cs->mbus_attr << 8) |
                        (dram->mbus_dram_target_id << 4) | 1,
                        hpriv->mmio + AHCI_WINDOW_CTRL(i));
-               writel(cs->base, hpriv->mmio + AHCI_WINDOW_BASE(i));
+               writel(cs->base >> 16, hpriv->mmio + AHCI_WINDOW_BASE(i));
                 writel(((cs->size - 1) & 0xffff0000),
                        hpriv->mmio + AHCI_WINDOW_SIZE(i));
         }
diff --git a/drivers/ata/pata_octeon_cf.c b/drivers/ata/pata_octeon_cf.c

index 80a80548ad0a80acf28c3407e6a6048825f92af3..27245957eee3cd906f546d67853d2ebd6ce54d30 100644 (file)
--- a/drivers/ata/pata_octeon_cf.c
+++ b/drivers/ata/pata_octeon_cf.c
@@ -1053,7 +1053,7 @@ static struct of_device_id octeon_cf_match[] = {
         },
         {},
  };
-MODULE_DEVICE_TABLE(of, octeon_i2c_match);
+MODULE_DEVICE_TABLE(of, octeon_cf_match);
  
  static struct platform_driver octeon_cf_driver = {
         .probe          = octeon_cf_probe,
diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c

index 9c2ba1c97c4257016503a8ed4d2166ac19dea9c0..df0c66cb7ad3719016436dd7eb16ab1d3234568d 100644 (file)
--- a/drivers/base/cacheinfo.c
+++ b/drivers/base/cacheinfo.c
@@ -179,7 +179,7 @@ static int detect_cache_attributes(unsigned int cpu)
  {
         int ret;
  
-       if (init_cache_level(cpu))
+       if (init_cache_level(cpu) || !cache_leaves(cpu))
                 return -ENOENT;
  
         per_cpu_cacheinfo(cpu) = kcalloc(cache_leaves(cpu),
diff --git a/drivers/base/init.c b/drivers/base/init.c

index da033d3bab3c69d14e55d63c4286632905120ae2..48c0e220acc0a1b8192ca6b523ad35ab7073eba7 100644 (file)
--- a/drivers/base/init.c
+++ b/drivers/base/init.c
@@ -8,6 +8,7 @@
  #include <linux/device.h>
  #include <linux/init.h>
  #include <linux/memory.h>
+#include <linux/of.h>
  
  #include "base.h"
  
@@ -34,4 +35,5 @@ void __init driver_init(void)
         cpu_dev_init();
         memory_dev_init();
         container_dev_init();
+       of_core_init();
  }
diff --git a/drivers/base/topology.c b/drivers/base/topology.c

index 6491f45200a78681ce7ec95e65339292f8d194ab..8b7d7f8e58518448a53eeba640886d490fb75bee 100644 (file)
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -61,7 +61,7 @@ static DEVICE_ATTR_RO(physical_package_id);
  define_id_show_func(core_id);
  static DEVICE_ATTR_RO(core_id);
  
-define_siblings_show_func(thread_siblings, thread_cpumask);
+define_siblings_show_func(thread_siblings, sibling_cpumask);
  static DEVICE_ATTR_RO(thread_siblings);
  static DEVICE_ATTR_RO(thread_siblings_list);
  
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig

index eb1fed5bd516ffac33c850eed47fad402250c686..3ccef9eba6f9dc53cecb785c23582cbdeb3b8618 100644 (file)
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -406,6 +406,7 @@ config BLK_DEV_RAM_DAX
  
  config BLK_DEV_PMEM
         tristate "Persistent memory block device support"
+       depends on HAS_IOMEM
         help
           Saying Y here will allow you to use a contiguous range of reserved
           memory as one or more persistent block devices.
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c

index 85b8036deaa3b7daaba5317ed746936a1f5183db..683dff272562b16d325df65495ad6a868cf45b14 100644 (file)
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -1750,6 +1750,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
         struct nvme_iod *iod;
         dma_addr_t meta_dma = 0;
         void *meta = NULL;
+       void __user *metadata;
  
         if (copy_from_user(&io, uio, sizeof(io)))
                 return -EFAULT;
@@ -1763,6 +1764,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
                 meta_len = 0;
         }
  
+       metadata = (void __user *)(unsigned long)io.metadata;
+
         write = io.opcode & 1;
  
         switch (io.opcode) {
@@ -1786,13 +1789,13 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
         if (meta_len) {
                 meta = dma_alloc_coherent(&dev->pci_dev->dev, meta_len,
                                                 &meta_dma, GFP_KERNEL);
+
                 if (!meta) {
                         status = -ENOMEM;
                         goto unmap;
                 }
                 if (write) {
-                       if (copy_from_user(meta, (void __user *)io.metadata,
-                                                               meta_len)) {
+                       if (copy_from_user(meta, metadata, meta_len)) {
                                 status = -EFAULT;
                                 goto unmap;
                         }
@@ -1819,8 +1822,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
         nvme_free_iod(dev, iod);
         if (meta) {
                 if (status == NVME_SC_SUCCESS && !write) {
-                       if (copy_to_user((void __user *)io.metadata, meta,
-                                                               meta_len))
+                       if (copy_to_user(metadata, meta, meta_len))
                                 status = -EFAULT;
                 }
                 dma_free_coherent(&dev->pci_dev->dev, meta_len, meta, meta_dma);
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c

index 8dcbced0eafd5f8dc0a53dc8d8e9d4b37bad9bab..6e134f4759c0c9e98b93f221e7687004d4418342 100644 (file)
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -805,7 +805,9 @@ static void zram_reset_device(struct zram *zram)
         memset(&zram->stats, 0, sizeof(zram->stats));
         zram->disksize = 0;
         zram->max_comp_streams = 1;
+
         set_capacity(zram->disk, 0);
+       part_stat_set_all(&zram->disk->part0, 0);
  
         up_write(&zram->init_lock);
         /* I/O operation under all of CPU are done so let's free */
diff --git a/drivers/bus/mvebu-mbus.c b/drivers/bus/mvebu-mbus.c

index fb9ec6221730a2d594f66d15e54471aea75cc750..6f047dcb94c22b3ea67349bf1c4b95be6b91037a 100644 (file)
--- a/drivers/bus/mvebu-mbus.c
+++ b/drivers/bus/mvebu-mbus.c
@@ -58,7 +58,6 @@
  #include <linux/debugfs.h>
  #include <linux/log2.h>
  #include <linux/syscore_ops.h>
-#include <linux/memblock.h>
  
  /*
   * DDR target is the same on all platforms.
@@ -70,6 +69,7 @@
   */
  #define WIN_CTRL_OFF           0x0000
  #define   WIN_CTRL_ENABLE       BIT(0)
+/* Only on HW I/O coherency capable platforms */
  #define   WIN_CTRL_SYNCBARRIER  BIT(1)
  #define   WIN_CTRL_TGT_MASK     0xf0
  #define   WIN_CTRL_TGT_SHIFT    4
@@ -102,9 +102,7 @@
  
  /* Relative to mbusbridge_base */
  #define MBUS_BRIDGE_CTRL_OFF   0x0
-#define  MBUS_BRIDGE_SIZE_MASK  0xffff0000
  #define MBUS_BRIDGE_BASE_OFF   0x4
-#define  MBUS_BRIDGE_BASE_MASK  0xffff0000
  
  /* Maximum number of windows, for all known platforms */
  #define MBUS_WINS_MAX           20
@@ -323,8 +321,9 @@ static int mvebu_mbus_setup_window(struct mvebu_mbus_state *mbus,
         ctrl = ((size - 1) & WIN_CTRL_SIZE_MASK) |
                 (attr << WIN_CTRL_ATTR_SHIFT)    |
                 (target << WIN_CTRL_TGT_SHIFT)   |
-               WIN_CTRL_SYNCBARRIER             |
                 WIN_CTRL_ENABLE;
+       if (mbus->hw_io_coherency)
+               ctrl |= WIN_CTRL_SYNCBARRIER;
  
         writel(base & WIN_BASE_LOW, addr + WIN_BASE_OFF);
         writel(ctrl, addr + WIN_CTRL_OFF);
@@ -577,106 +576,36 @@ static unsigned int armada_xp_mbus_win_remap_offset(int win)
                 return MVEBU_MBUS_NO_REMAP;
  }
  
-/*
- * Use the memblock information to find the MBus bridge hole in the
- * physical address space.
- */
-static void __init
-mvebu_mbus_find_bridge_hole(uint64_t *start, uint64_t *end)
-{
-       struct memblock_region *r;
-       uint64_t s = 0;
-
-       for_each_memblock(memory, r) {
-               /*
-                * This part of the memory is above 4 GB, so we don't
-                * care for the MBus bridge hole.
-                */
-               if (r->base >= 0x100000000)
-                       continue;
-
-               /*
-                * The MBus bridge hole is at the end of the RAM under
-                * the 4 GB limit.
-                */
-               if (r->base + r->size > s)
-                       s = r->base + r->size;
-       }
-
-       *start = s;
-       *end = 0x100000000;
-}
-
  static void __init
  mvebu_mbus_default_setup_cpu_target(struct mvebu_mbus_state *mbus)
  {
         int i;
         int cs;
-       uint64_t mbus_bridge_base, mbus_bridge_end;
  
         mvebu_mbus_dram_info.mbus_dram_target_id = TARGET_DDR;
  
-       mvebu_mbus_find_bridge_hole(&mbus_bridge_base, &mbus_bridge_end);
-
         for (i = 0, cs = 0; i < 4; i++) {
-               u64 base = readl(mbus->sdramwins_base + DDR_BASE_CS_OFF(i));
-               u64 size = readl(mbus->sdramwins_base + DDR_SIZE_CS_OFF(i));
-               u64 end;
-               struct mbus_dram_window *w;
-
-               /* Ignore entries that are not enabled */
-               if (!(size & DDR_SIZE_ENABLED))
-                       continue;
-
-               /*
-                * Ignore entries whose base address is above 2^32,
-                * since devices cannot DMA to such high addresses
-                */
-               if (base & DDR_BASE_CS_HIGH_MASK)
-                       continue;
-
-               base = base & DDR_BASE_CS_LOW_MASK;
-               size = (size | ~DDR_SIZE_MASK) + 1;
-               end = base + size;
-
-               /*
-                * Adjust base/size of the current CS to make sure it
-                * doesn't overlap with the MBus bridge hole. This is
-                * particularly important for devices that do DMA from
-                * DRAM to a SRAM mapped in a MBus window, such as the
-                * CESA cryptographic engine.
-                */
+               u32 base = readl(mbus->sdramwins_base + DDR_BASE_CS_OFF(i));
+               u32 size = readl(mbus->sdramwins_base + DDR_SIZE_CS_OFF(i));
  
                 /*
-                * The CS is fully enclosed inside the MBus bridge
-                * area, so ignore it.
+                * We only take care of entries for which the chip
+                * select is enabled, and that don't have high base
+                * address bits set (devices can only access the first
+                * 32 bits of the memory).
                  */
-               if (base >= mbus_bridge_base && end <= mbus_bridge_end)
-                       continue;
+               if ((size & DDR_SIZE_ENABLED) &&
+                   !(base & DDR_BASE_CS_HIGH_MASK)) {
+                       struct mbus_dram_window *w;
  
-               /*
-                * Beginning of CS overlaps with end of MBus, raise CS
-                * base address, and shrink its size.
-                */
-               if (base >= mbus_bridge_base && end > mbus_bridge_end) {
-                       size -= mbus_bridge_end - base;
-                       base = mbus_bridge_end;
+                       w = &mvebu_mbus_dram_info.cs[cs++];
+                       w->cs_index = i;
+                       w->mbus_attr = 0xf & ~(1 << i);
+                       if (mbus->hw_io_coherency)
+                               w->mbus_attr |= ATTR_HW_COHERENCY;
+                       w->base = base & DDR_BASE_CS_LOW_MASK;
+                       w->size = (size | ~DDR_SIZE_MASK) + 1;
                 }
-
-               /*
-                * End of CS overlaps with beginning of MBus, shrink
-                * CS size.
-                */
-               if (base < mbus_bridge_base && end > mbus_bridge_base)
-                       size -= end - mbus_bridge_base;
-
-               w = &mvebu_mbus_dram_info.cs[cs++];
-               w->cs_index = i;
-               w->mbus_attr = 0xf & ~(1 << i);
-               if (mbus->hw_io_coherency)
-                       w->mbus_attr |= ATTR_HW_COHERENCY;
-               w->base = base;
-               w->size = size;
         }
         mvebu_mbus_dram_info.num_cs = cs;
  }
diff --git a/drivers/char/hw_random/via-rng.c b/drivers/char/hw_random/via-rng.c

index a3bebef255ad3af669c2134ce85805d0af4b0523..0c98a9d51a2494e6a49ef49e6bfb557cefca1974 100644 (file)
--- a/drivers/char/hw_random/via-rng.c
+++ b/drivers/char/hw_random/via-rng.c
@@ -33,7 +33,7 @@
  #include <asm/io.h>
  #include <asm/msr.h>
  #include <asm/cpufeature.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
  
  
  
diff --git a/drivers/clk/at91/clk-peripheral.c b/drivers/clk/at91/clk-peripheral.c

index 597fed423d7d31906b1ca37f3fa9931231a63460..df2c1afa52b4acaa6204d5595a76c65f7dc0cb70 100644 (file)
--- a/drivers/clk/at91/clk-peripheral.c
+++ b/drivers/clk/at91/clk-peripheral.c
@@ -29,7 +29,7 @@
  #define PERIPHERAL_RSHIFT_MASK 0x3
  #define PERIPHERAL_RSHIFT(val) (((val) >> 16) & PERIPHERAL_RSHIFT_MASK)
  
-#define PERIPHERAL_MAX_SHIFT   4
+#define PERIPHERAL_MAX_SHIFT   3
  
  struct clk_peripheral {
         struct clk_hw hw;
@@ -242,7 +242,7 @@ static long clk_sam9x5_peripheral_round_rate(struct clk_hw *hw,
                 return *parent_rate;
  
         if (periph->range.max) {
-               for (; shift < PERIPHERAL_MAX_SHIFT; shift++) {
+               for (; shift <= PERIPHERAL_MAX_SHIFT; shift++) {
                         cur_rate = *parent_rate >> shift;
                         if (cur_rate <= periph->range.max)
                                 break;
@@ -254,7 +254,7 @@ static long clk_sam9x5_peripheral_round_rate(struct clk_hw *hw,
  
         best_diff = cur_rate - rate;
         best_rate = cur_rate;
-       for (; shift < PERIPHERAL_MAX_SHIFT; shift++) {
+       for (; shift <= PERIPHERAL_MAX_SHIFT; shift++) {
                 cur_rate = *parent_rate >> shift;
                 if (cur_rate < rate)
                         cur_diff = rate - cur_rate;
@@ -289,7 +289,7 @@ static int clk_sam9x5_peripheral_set_rate(struct clk_hw *hw,
         if (periph->range.max && rate > periph->range.max)
                 return -EINVAL;
  
-       for (shift = 0; shift < PERIPHERAL_MAX_SHIFT; shift++) {
+       for (shift = 0; shift <= PERIPHERAL_MAX_SHIFT; shift++) {
                 if (parent_rate >> shift == rate) {
                         periph->auto_div = false;
                         periph->div = shift;
diff --git a/drivers/clk/at91/clk-pll.c b/drivers/clk/at91/clk-pll.c

index 6ec79dbc0840ad8940e9e9ab599a0f865f1cd881..cbbe40377ad622a7f9d38aca5651916dda549e54 100644 (file)
--- a/drivers/clk/at91/clk-pll.c
+++ b/drivers/clk/at91/clk-pll.c
@@ -173,8 +173,7 @@ static long clk_pll_get_best_div_mul(struct clk_pll *pll, unsigned long rate,
         int i = 0;
  
         /* Check if parent_rate is a valid input rate */
-       if (parent_rate < characteristics->input.min ||
-           parent_rate > characteristics->input.max)
+       if (parent_rate < characteristics->input.min)
                 return -ERANGE;
  
         /*
@@ -187,6 +186,15 @@ static long clk_pll_get_best_div_mul(struct clk_pll *pll, unsigned long rate,
         if (!mindiv)
                 mindiv = 1;
  
+       if (parent_rate > characteristics->input.max) {
+               tmpdiv = DIV_ROUND_UP(parent_rate, characteristics->input.max);
+               if (tmpdiv > PLL_DIV_MAX)
+                       return -ERANGE;
+
+               if (tmpdiv > mindiv)
+                       mindiv = tmpdiv;
+       }
+
         /*
          * Calculate the maximum divider which is limited by PLL register
          * layout (limited by the MUL or DIV field size).
diff --git a/drivers/clk/at91/pmc.h b/drivers/clk/at91/pmc.h

index 69abb08cf146513b0307a4a78449b2e5da971282..eb8e5dc9076d46f07901a98db214fbeec0b0a3dc 100644 (file)
--- a/drivers/clk/at91/pmc.h
+++ b/drivers/clk/at91/pmc.h
@@ -121,7 +121,7 @@ extern void __init of_at91sam9x5_clk_smd_setup(struct device_node *np,
                                                struct at91_pmc *pmc);
  #endif
  
-#if defined(CONFIG_HAVE_AT91_SMD)
+#if defined(CONFIG_HAVE_AT91_H32MX)
  extern void __init of_sama5d4_clk_h32mx_setup(struct device_node *np,
                                               struct at91_pmc *pmc);
  #endif
diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c

index b0c18ed8d83f707d000213e458dba613e4ffaf96..0136dfcdabf0bad0382639566ba8707a464ddcaf 100644 (file)
--- a/drivers/cpufreq/acpi-cpufreq.c
+++ b/drivers/cpufreq/acpi-cpufreq.c
@@ -699,13 +699,14 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
         dmi_check_system(sw_any_bug_dmi_table);
         if (bios_with_sw_any_bug && !policy_is_shared(policy)) {
                 policy->shared_type = CPUFREQ_SHARED_TYPE_ALL;
-               cpumask_copy(policy->cpus, cpu_core_mask(cpu));
+               cpumask_copy(policy->cpus, topology_core_cpumask(cpu));
         }
  
         if (check_amd_hwpstate_cpu(cpu) && !acpi_pstate_strict) {
                 cpumask_clear(policy->cpus);
                 cpumask_set_cpu(cpu, policy->cpus);
-               cpumask_copy(data->freqdomain_cpus, cpu_sibling_mask(cpu));
+               cpumask_copy(data->freqdomain_cpus,
+                            topology_sibling_cpumask(cpu));
                 policy->shared_type = CPUFREQ_SHARED_TYPE_HW;
                 pr_info_once(PFX "overriding BIOS provided _PSD data\n");
         }
diff --git a/drivers/cpufreq/p4-clockmod.c b/drivers/cpufreq/p4-clockmod.c

index 529cfd92158fa6f7e0e9cfb79d88307c83c57ac3..5dd95dab580d1cf30135ff39ce2cfd81d9be9b7f 100644 (file)
--- a/drivers/cpufreq/p4-clockmod.c
+++ b/drivers/cpufreq/p4-clockmod.c
@@ -172,7 +172,7 @@ static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy)
         unsigned int i;
  
  #ifdef CONFIG_SMP
-       cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu));
+       cpumask_copy(policy->cpus, topology_sibling_cpumask(policy->cpu));
  #endif
  
         /* Errata workaround */
diff --git a/drivers/cpufreq/powernow-k8.c b/drivers/cpufreq/powernow-k8.c

index f9ce7e4bf0feae587913bb46b94539e35a572fb9..5c035d04d827106505c26c08ac649d7ffc907401 100644 (file)
--- a/drivers/cpufreq/powernow-k8.c
+++ b/drivers/cpufreq/powernow-k8.c
@@ -57,13 +57,6 @@ static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data);
  
  static struct cpufreq_driver cpufreq_amd64_driver;
  
-#ifndef CONFIG_SMP
-static inline const struct cpumask *cpu_core_mask(int cpu)
-{
-       return cpumask_of(0);
-}
-#endif
-
  /* Return a frequency in MHz, given an input fid */
  static u32 find_freq_from_fid(u32 fid)
  {
@@ -620,7 +613,7 @@ static int fill_powernow_table(struct powernow_k8_data *data,
  
         pr_debug("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid);
         data->powernow_table = powernow_table;
-       if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)
+       if (cpumask_first(topology_core_cpumask(data->cpu)) == data->cpu)
                 print_basics(data);
  
         for (j = 0; j < data->numps; j++)
@@ -784,7 +777,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
                 CPUFREQ_TABLE_END;
         data->powernow_table = powernow_table;
  
-       if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)
+       if (cpumask_first(topology_core_cpumask(data->cpu)) == data->cpu)
                 print_basics(data);
  
         /* notify BIOS that we exist */
@@ -1090,7 +1083,7 @@ static int powernowk8_cpu_init(struct cpufreq_policy *pol)
         if (rc != 0)
                 goto err_out_exit_acpi;
  
-       cpumask_copy(pol->cpus, cpu_core_mask(pol->cpu));
+       cpumask_copy(pol->cpus, topology_core_cpumask(pol->cpu));
         data->available_cores = pol->cpus;
  
         /* min/max the cpu is capable of */
diff --git a/drivers/cpufreq/speedstep-ich.c b/drivers/cpufreq/speedstep-ich.c

index e56d632a8b2107be82c7fffe94c8cf57d02aaca1..37555c6b86a7cf843f04187a24c86dfe99aa8d42 100644 (file)
--- a/drivers/cpufreq/speedstep-ich.c
+++ b/drivers/cpufreq/speedstep-ich.c
@@ -292,7 +292,7 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy)
  
         /* only run on CPU to be set, or on its sibling */
  #ifdef CONFIG_SMP
-       cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu));
+       cpumask_copy(policy->cpus, topology_sibling_cpumask(policy->cpu));
  #endif
         policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask);
  
diff --git a/drivers/crypto/caam/caamhash.c b/drivers/crypto/caam/caamhash.c

index ba0532efd3ae68d0368a00a1018dd22ee76f99b0..332c8ef8dae2cc262540f0d168dffa2266d82c73 100644 (file)
--- a/drivers/crypto/caam/caamhash.c
+++ b/drivers/crypto/caam/caamhash.c
@@ -1544,6 +1544,8 @@ static int ahash_init(struct ahash_request *req)
  
         state->current_buf = 0;
         state->buf_dma = 0;
+       state->buflen_0 = 0;
+       state->buflen_1 = 0;
  
         return 0;
  }
diff --git a/drivers/crypto/caam/caamrng.c b/drivers/crypto/caam/caamrng.c

index 26a544b505f1e17166f95cf0e0dccdc5191015d3..5095337205b830c148696a37d53a8902643b317f 100644 (file)
--- a/drivers/crypto/caam/caamrng.c
+++ b/drivers/crypto/caam/caamrng.c
@@ -56,7 +56,7 @@
  
  /* Buffer, its dma address and lock */
  struct buf_data {
-       u8 buf[RN_BUF_SIZE];
+       u8 buf[RN_BUF_SIZE] ____cacheline_aligned;
         dma_addr_t addr;
         struct completion filled;
         u32 hw_desc[DESC_JOB_O_LEN];
diff --git a/drivers/crypto/padlock-aes.c b/drivers/crypto/padlock-aes.c

index c178ed8c3908d3a92e55432aecb86e0e6e1501ee..da2d6777bd092f0a373e14cc960514b8a4f5d148 100644 (file)
--- a/drivers/crypto/padlock-aes.c
+++ b/drivers/crypto/padlock-aes.c
@@ -22,7 +22,7 @@
  #include <asm/cpu_device_id.h>
  #include <asm/byteorder.h>
  #include <asm/processor.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
  
  /*
   * Number of data blocks actually fetched for each xcrypt insn.
diff --git a/drivers/crypto/padlock-sha.c b/drivers/crypto/padlock-sha.c

index 95f7d27ce491f000458a257e5dfa55a6105b433b..4e154c9b92064bb1fbafeb805bcb77f9cc2d1bdf 100644 (file)
--- a/drivers/crypto/padlock-sha.c
+++ b/drivers/crypto/padlock-sha.c
@@ -23,7 +23,7 @@
  #include <linux/kernel.h>
  #include <linux/scatterlist.h>
  #include <asm/cpu_device_id.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
  
  struct padlock_sha_desc {
         struct shash_desc fallback;
diff --git a/drivers/crypto/vmx/aes.c b/drivers/crypto/vmx/aes.c

index ab300ea19434e3f93d9b08475f27ee59cf445390..a9064e36e7b5488c4f47f628bb317c48b7efa485 100644 (file)
--- a/drivers/crypto/vmx/aes.c
+++ b/drivers/crypto/vmx/aes.c
@@ -78,12 +78,14 @@ static int p8_aes_setkey(struct crypto_tfm *tfm, const u8 *key,
      int ret;
      struct p8_aes_ctx *ctx = crypto_tfm_ctx(tfm);
  
+    preempt_disable();
      pagefault_disable();
      enable_kernel_altivec();
      ret = aes_p8_set_encrypt_key(key, keylen * 8, &ctx->enc_key);
      ret += aes_p8_set_decrypt_key(key, keylen * 8, &ctx->dec_key);
      pagefault_enable();
-    
+    preempt_enable();
+
      ret += crypto_cipher_setkey(ctx->fallback, key, keylen);
      return ret;
  }
@@ -95,10 +97,12 @@ static void p8_aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
      if (in_interrupt()) {
          crypto_cipher_encrypt_one(ctx->fallback, dst, src);
      } else {
+       preempt_disable();
          pagefault_disable();
          enable_kernel_altivec();
          aes_p8_encrypt(src, dst, &ctx->enc_key);
          pagefault_enable();
+       preempt_enable();
      }
  }
  
@@ -109,10 +113,12 @@ static void p8_aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
      if (in_interrupt()) {
          crypto_cipher_decrypt_one(ctx->fallback, dst, src);
      } else {
+       preempt_disable();
          pagefault_disable();
          enable_kernel_altivec();
          aes_p8_decrypt(src, dst, &ctx->dec_key);
          pagefault_enable();
+       preempt_enable();
      }
  }
  
diff --git a/drivers/crypto/vmx/aes_cbc.c b/drivers/crypto/vmx/aes_cbc.c

index 1a559b7dddb5f2f5ff3184e0d50c275bf498aa3c..477284abdd11dc9d738fe08858d7cbe48a65fe32 100644 (file)
--- a/drivers/crypto/vmx/aes_cbc.c
+++ b/drivers/crypto/vmx/aes_cbc.c
@@ -79,11 +79,13 @@ static int p8_aes_cbc_setkey(struct crypto_tfm *tfm, const u8 *key,
      int ret;
      struct p8_aes_cbc_ctx *ctx = crypto_tfm_ctx(tfm);
  
+    preempt_disable();
      pagefault_disable();
      enable_kernel_altivec();
      ret = aes_p8_set_encrypt_key(key, keylen * 8, &ctx->enc_key);
      ret += aes_p8_set_decrypt_key(key, keylen * 8, &ctx->dec_key);
      pagefault_enable();
+    preempt_enable();
  
      ret += crypto_blkcipher_setkey(ctx->fallback, key, keylen);
      return ret;
@@ -106,6 +108,7 @@ static int p8_aes_cbc_encrypt(struct blkcipher_desc *desc,
      if (in_interrupt()) {
          ret = crypto_blkcipher_encrypt(&fallback_desc, dst, src, nbytes);
      } else {
+       preempt_disable();
          pagefault_disable();
          enable_kernel_altivec();
  
@@ -119,6 +122,7 @@ static int p8_aes_cbc_encrypt(struct blkcipher_desc *desc,
         }
  
          pagefault_enable();
+       preempt_enable();
      }
  
      return ret;
@@ -141,6 +145,7 @@ static int p8_aes_cbc_decrypt(struct blkcipher_desc *desc,
      if (in_interrupt()) {
          ret = crypto_blkcipher_decrypt(&fallback_desc, dst, src, nbytes);
      } else {
+       preempt_disable();
          pagefault_disable();
          enable_kernel_altivec();
  
@@ -154,6 +159,7 @@ static int p8_aes_cbc_decrypt(struct blkcipher_desc *desc,
                 }
  
          pagefault_enable();
+       preempt_enable();
      }
  
      return ret;
diff --git a/drivers/crypto/vmx/ghash.c b/drivers/crypto/vmx/ghash.c

index d0ffe277af5ca583157afbf881d5df2fbdec5bf5..f255ec4a04d48d60a28e1025b3e2473cdcb7d8fc 100644 (file)
--- a/drivers/crypto/vmx/ghash.c
+++ b/drivers/crypto/vmx/ghash.c
@@ -114,11 +114,13 @@ static int p8_ghash_setkey(struct crypto_shash *tfm, const u8 *key,
      if (keylen != GHASH_KEY_LEN)
          return -EINVAL;
  
+    preempt_disable();
      pagefault_disable();
      enable_kernel_altivec();
      enable_kernel_fp();
      gcm_init_p8(ctx->htable, (const u64 *) key);
      pagefault_enable();
+    preempt_enable();
      return crypto_shash_setkey(ctx->fallback, key, keylen);
  }
  
@@ -140,23 +142,27 @@ static int p8_ghash_update(struct shash_desc *desc,
              }
              memcpy(dctx->buffer + dctx->bytes, src,
                      GHASH_DIGEST_SIZE - dctx->bytes);
+           preempt_disable();
              pagefault_disable();
              enable_kernel_altivec();
              enable_kernel_fp();
              gcm_ghash_p8(dctx->shash, ctx->htable, dctx->buffer,
                      GHASH_DIGEST_SIZE);
              pagefault_enable();
+           preempt_enable();
              src += GHASH_DIGEST_SIZE - dctx->bytes;
              srclen -= GHASH_DIGEST_SIZE - dctx->bytes;
              dctx->bytes = 0;
          }
          len = srclen & ~(GHASH_DIGEST_SIZE - 1);
          if (len) {
+           preempt_disable();
              pagefault_disable();
              enable_kernel_altivec();
              enable_kernel_fp();
              gcm_ghash_p8(dctx->shash, ctx->htable, src, len);
              pagefault_enable();
+           preempt_enable();
              src += len;
              srclen -= len;
          }
@@ -180,12 +186,14 @@ static int p8_ghash_final(struct shash_desc *desc, u8 *out)
          if (dctx->bytes) {
              for (i = dctx->bytes; i < GHASH_DIGEST_SIZE; i++)
                  dctx->buffer[i] = 0;
+           preempt_disable();
              pagefault_disable();
              enable_kernel_altivec();
              enable_kernel_fp();
              gcm_ghash_p8(dctx->shash, ctx->htable, dctx->buffer,
                      GHASH_DIGEST_SIZE);
              pagefault_enable();
+           preempt_enable();
              dctx->bytes = 0;
          }
          memcpy(out, dctx->shash, GHASH_DIGEST_SIZE);
diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c

index 933e4b338459284465d7970e3ff7dbf0f37314b8..7992164ea9ec2849f6ac3691629c47cda30aeb28 100644 (file)
--- a/drivers/dma/at_xdmac.c
+++ b/drivers/dma/at_xdmac.c
@@ -174,6 +174,8 @@
  #define AT_XDMAC_MBR_UBC_NDV3          (0x3 << 27)     /* Next Descriptor View 3 */
  
  #define AT_XDMAC_MAX_CHAN      0x20
+#define AT_XDMAC_MAX_CSIZE     16      /* 16 data */
+#define AT_XDMAC_MAX_DWIDTH    8       /* 64 bits */
  
  #define AT_XDMAC_DMA_BUSWIDTHS\
         (BIT(DMA_SLAVE_BUSWIDTH_UNDEFINED) |\
@@ -192,20 +194,17 @@ struct at_xdmac_chan {
         struct dma_chan                 chan;
         void __iomem                    *ch_regs;
         u32                             mask;           /* Channel Mask */
-       u32                             cfg[2];         /* Channel Configuration Register */
-       #define AT_XDMAC_DEV_TO_MEM_CFG 0               /* Predifined dev to mem channel conf */
-       #define AT_XDMAC_MEM_TO_DEV_CFG 1               /* Predifined mem to dev channel conf */
+       u32                             cfg;            /* Channel Configuration Register */
         u8                              perid;          /* Peripheral ID */
         u8                              perif;          /* Peripheral Interface */
         u8                              memif;          /* Memory Interface */
-       u32                             per_src_addr;
-       u32                             per_dst_addr;
         u32                             save_cc;
         u32                             save_cim;
         u32                             save_cnda;
         u32                             save_cndc;
         unsigned long                   status;
         struct tasklet_struct           tasklet;
+       struct dma_slave_config         sconfig;
  
         spinlock_t                      lock;
  
@@ -415,8 +414,9 @@ static dma_cookie_t at_xdmac_tx_submit(struct dma_async_tx_descriptor *tx)
         struct at_xdmac_desc    *desc = txd_to_at_desc(tx);
         struct at_xdmac_chan    *atchan = to_at_xdmac_chan(tx->chan);
         dma_cookie_t            cookie;
+       unsigned long           irqflags;
  
-       spin_lock_bh(&atchan->lock);
+       spin_lock_irqsave(&atchan->lock, irqflags);
         cookie = dma_cookie_assign(tx);
  
         dev_vdbg(chan2dev(tx->chan), "%s: atchan 0x%p, add desc 0x%p to xfers_list\n",
@@ -425,7 +425,7 @@ static dma_cookie_t at_xdmac_tx_submit(struct dma_async_tx_descriptor *tx)
         if (list_is_singular(&atchan->xfers_list))
                 at_xdmac_start_xfer(atchan, desc);
  
-       spin_unlock_bh(&atchan->lock);
+       spin_unlock_irqrestore(&atchan->lock, irqflags);
         return cookie;
  }
  
@@ -494,61 +494,94 @@ static struct dma_chan *at_xdmac_xlate(struct of_phandle_args *dma_spec,
         return chan;
  }
  
+/* Build atchan->cfg for @direction from atchan->sconfig; 0 or -EINVAL. */
+static int at_xdmac_compute_chan_conf(struct dma_chan *chan,
+                                     enum dma_transfer_direction direction)
+{
+       struct at_xdmac_chan    *atchan = to_at_xdmac_chan(chan);
+       int                     csize, dwidth;
+
+       if (direction == DMA_DEV_TO_MEM) {
+               atchan->cfg = AT91_XDMAC_DT_PERID(atchan->perid)
+                       | AT_XDMAC_CC_DAM_INCREMENTED_AM
+                       | AT_XDMAC_CC_SAM_FIXED_AM
+                       | AT_XDMAC_CC_DIF(atchan->memif)
+                       | AT_XDMAC_CC_SIF(atchan->perif)
+                       | AT_XDMAC_CC_SWREQ_HWR_CONNECTED
+                       | AT_XDMAC_CC_DSYNC_PER2MEM
+                       | AT_XDMAC_CC_MBSIZE_SIXTEEN
+                       | AT_XDMAC_CC_TYPE_PER_TRAN;
+               /* ffs() - 1 is the lowest set bit index; -1 means value 0. */
+               csize = ffs(atchan->sconfig.src_maxburst) - 1;
+               if (csize < 0) {
+                       dev_err(chan2dev(chan), "invalid src maxburst value\n");
+                       return -EINVAL;
+               }
+               atchan->cfg |= AT_XDMAC_CC_CSIZE(csize);
+               dwidth = ffs(atchan->sconfig.src_addr_width) - 1;
+               if (dwidth < 0) {
+                       dev_err(chan2dev(chan), "invalid src addr width value\n");
+                       return -EINVAL;
+               }
+               atchan->cfg |= AT_XDMAC_CC_DWIDTH(dwidth);
+       } else if (direction == DMA_MEM_TO_DEV) {
+               atchan->cfg = AT91_XDMAC_DT_PERID(atchan->perid)
+                       | AT_XDMAC_CC_DAM_FIXED_AM
+                       | AT_XDMAC_CC_SAM_INCREMENTED_AM
+                       | AT_XDMAC_CC_DIF(atchan->perif)
+                       | AT_XDMAC_CC_SIF(atchan->memif)
+                       | AT_XDMAC_CC_SWREQ_HWR_CONNECTED
+                       | AT_XDMAC_CC_DSYNC_MEM2PER
+                       | AT_XDMAC_CC_MBSIZE_SIXTEEN
+                       | AT_XDMAC_CC_TYPE_PER_TRAN;
+               csize = ffs(atchan->sconfig.dst_maxburst) - 1;
+               if (csize < 0) {
+                       dev_err(chan2dev(chan), "invalid dst maxburst value\n");
+                       return -EINVAL;
+               }
+               atchan->cfg |= AT_XDMAC_CC_CSIZE(csize);
+               dwidth = ffs(atchan->sconfig.dst_addr_width) - 1;
+               if (dwidth < 0) {
+                       dev_err(chan2dev(chan), "invalid dst addr width value\n");
+                       return -EINVAL;
+               }
+               atchan->cfg |= AT_XDMAC_CC_DWIDTH(dwidth);
+       }
+
+       dev_dbg(chan2dev(chan), "%s: cfg=0x%08x\n", __func__, atchan->cfg);
+
+       return 0;
+}
+
+/*
+ * Only check that maxburst and addr width values are supported by
+ * the controller but not that the configuration is good to perform the
+ * transfer since we don't know the direction at this stage.
+ */
+static int at_xdmac_check_slave_config(struct dma_slave_config *sconfig)
+{
+       /* Neither burst length may exceed AT_XDMAC_MAX_CSIZE. */
+       bool burst_ok = sconfig->src_maxburst <= AT_XDMAC_MAX_CSIZE &&
+                       sconfig->dst_maxburst <= AT_XDMAC_MAX_CSIZE;
+       /* Neither bus width may exceed AT_XDMAC_MAX_DWIDTH. */
+       bool width_ok = sconfig->src_addr_width <= AT_XDMAC_MAX_DWIDTH &&
+                       sconfig->dst_addr_width <= AT_XDMAC_MAX_DWIDTH;
+
+       /* Exceeding either limit makes the whole configuration invalid. */
+       return (burst_ok && width_ok) ? 0 : -EINVAL;
+}
+
  static int at_xdmac_set_slave_config(struct dma_chan *chan,
                                       struct dma_slave_config *sconfig)
  {
         struct at_xdmac_chan    *atchan = to_at_xdmac_chan(chan);
-       u8 dwidth;
-       int csize;
  
-       atchan->cfg[AT_XDMAC_DEV_TO_MEM_CFG] =
-               AT91_XDMAC_DT_PERID(atchan->perid)
-               | AT_XDMAC_CC_DAM_INCREMENTED_AM
-               | AT_XDMAC_CC_SAM_FIXED_AM
-               | AT_XDMAC_CC_DIF(atchan->memif)
-               | AT_XDMAC_CC_SIF(atchan->perif)
-               | AT_XDMAC_CC_SWREQ_HWR_CONNECTED
-               | AT_XDMAC_CC_DSYNC_PER2MEM
-               | AT_XDMAC_CC_MBSIZE_SIXTEEN
-               | AT_XDMAC_CC_TYPE_PER_TRAN;
-       csize = at_xdmac_csize(sconfig->src_maxburst);
-       if (csize < 0) {
-               dev_err(chan2dev(chan), "invalid src maxburst value\n");
+       if (at_xdmac_check_slave_config(sconfig)) {
+               dev_err(chan2dev(chan), "invalid slave configuration\n");
                 return -EINVAL;
         }
-       atchan->cfg[AT_XDMAC_DEV_TO_MEM_CFG] |= AT_XDMAC_CC_CSIZE(csize);
-       dwidth = ffs(sconfig->src_addr_width) - 1;
-       atchan->cfg[AT_XDMAC_DEV_TO_MEM_CFG] |= AT_XDMAC_CC_DWIDTH(dwidth);
-
-
-       atchan->cfg[AT_XDMAC_MEM_TO_DEV_CFG] =
-               AT91_XDMAC_DT_PERID(atchan->perid)
-               | AT_XDMAC_CC_DAM_FIXED_AM
-               | AT_XDMAC_CC_SAM_INCREMENTED_AM
-               | AT_XDMAC_CC_DIF(atchan->perif)
-               | AT_XDMAC_CC_SIF(atchan->memif)
-               | AT_XDMAC_CC_SWREQ_HWR_CONNECTED
-               | AT_XDMAC_CC_DSYNC_MEM2PER
-               | AT_XDMAC_CC_MBSIZE_SIXTEEN
-               | AT_XDMAC_CC_TYPE_PER_TRAN;
-       csize = at_xdmac_csize(sconfig->dst_maxburst);
-       if (csize < 0) {
-               dev_err(chan2dev(chan), "invalid src maxburst value\n");
-               return -EINVAL;
-       }
-       atchan->cfg[AT_XDMAC_MEM_TO_DEV_CFG] |= AT_XDMAC_CC_CSIZE(csize);
-       dwidth = ffs(sconfig->dst_addr_width) - 1;
-       atchan->cfg[AT_XDMAC_MEM_TO_DEV_CFG] |= AT_XDMAC_CC_DWIDTH(dwidth);
-
-       /* Src and dst addr are needed to configure the link list descriptor. */
-       atchan->per_src_addr = sconfig->src_addr;
-       atchan->per_dst_addr = sconfig->dst_addr;
  
-       dev_dbg(chan2dev(chan),
-               "%s: cfg[dev2mem]=0x%08x, cfg[mem2dev]=0x%08x, per_src_addr=0x%08x, per_dst_addr=0x%08x\n",
-               __func__, atchan->cfg[AT_XDMAC_DEV_TO_MEM_CFG],
-               atchan->cfg[AT_XDMAC_MEM_TO_DEV_CFG],
-               atchan->per_src_addr, atchan->per_dst_addr);
+       memcpy(&atchan->sconfig, sconfig, sizeof(atchan->sconfig));
  
         return 0;
  }
@@ -563,6 +596,8 @@ at_xdmac_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
         struct scatterlist      *sg;
         int                     i;
         unsigned int            xfer_size = 0;
+       unsigned long           irqflags;
+       struct dma_async_tx_descriptor  *ret = NULL;
  
         if (!sgl)
                 return NULL;
@@ -578,7 +613,10 @@ at_xdmac_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
                  flags);
  
         /* Protect dma_sconfig field that can be modified by set_slave_conf. */
-       spin_lock_bh(&atchan->lock);
+       spin_lock_irqsave(&atchan->lock, irqflags);
+
+       if (at_xdmac_compute_chan_conf(chan, direction))
+               goto spin_unlock;
  
         /* Prepare descriptors. */
         for_each_sg(sgl, sg, sg_len, i) {
@@ -589,8 +627,7 @@ at_xdmac_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
                 mem = sg_dma_address(sg);
                 if (unlikely(!len)) {
                         dev_err(chan2dev(chan), "sg data length is zero\n");
-                       spin_unlock_bh(&atchan->lock);
-                       return NULL;
+                       goto spin_unlock;
                 }
                 dev_dbg(chan2dev(chan), "%s: * sg%d len=%u, mem=0x%08x\n",
                          __func__, i, len, mem);
@@ -600,20 +637,18 @@ at_xdmac_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
                         dev_err(chan2dev(chan), "can't get descriptor\n");
                         if (first)
                                 list_splice_init(&first->descs_list, &atchan->free_descs_list);
-                       spin_unlock_bh(&atchan->lock);
-                       return NULL;
+                       goto spin_unlock;
                 }
  
                 /* Linked list descriptor setup. */
                 if (direction == DMA_DEV_TO_MEM) {
-                       desc->lld.mbr_sa = atchan->per_src_addr;
+                       desc->lld.mbr_sa = atchan->sconfig.src_addr;
                         desc->lld.mbr_da = mem;
-                       desc->lld.mbr_cfg = atchan->cfg[AT_XDMAC_DEV_TO_MEM_CFG];
                 } else {
                         desc->lld.mbr_sa = mem;
-                       desc->lld.mbr_da = atchan->per_dst_addr;
-                       desc->lld.mbr_cfg = atchan->cfg[AT_XDMAC_MEM_TO_DEV_CFG];
+                       desc->lld.mbr_da = atchan->sconfig.dst_addr;
                 }
+               desc->lld.mbr_cfg = atchan->cfg;
                 dwidth = at_xdmac_get_dwidth(desc->lld.mbr_cfg);
                 fixed_dwidth = IS_ALIGNED(len, 1 << dwidth)
                                ? at_xdmac_get_dwidth(desc->lld.mbr_cfg)
@@ -645,13 +680,15 @@ at_xdmac_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
                 xfer_size += len;
         }
  
-       spin_unlock_bh(&atchan->lock);
  
         first->tx_dma_desc.flags = flags;
         first->xfer_size = xfer_size;
         first->direction = direction;
+       ret = &first->tx_dma_desc;
  
-       return &first->tx_dma_desc;
+spin_unlock:
+       spin_unlock_irqrestore(&atchan->lock, irqflags);
+       return ret;
  }
  
  static struct dma_async_tx_descriptor *
@@ -664,6 +701,7 @@ at_xdmac_prep_dma_cyclic(struct dma_chan *chan, dma_addr_t buf_addr,
         struct at_xdmac_desc    *first = NULL, *prev = NULL;
         unsigned int            periods = buf_len / period_len;
         int                     i;
+       unsigned long           irqflags;
  
         dev_dbg(chan2dev(chan), "%s: buf_addr=%pad, buf_len=%zd, period_len=%zd, dir=%s, flags=0x%lx\n",
                 __func__, &buf_addr, buf_len, period_len,
@@ -679,32 +717,34 @@ at_xdmac_prep_dma_cyclic(struct dma_chan *chan, dma_addr_t buf_addr,
                 return NULL;
         }
  
+       if (at_xdmac_compute_chan_conf(chan, direction))
+               return NULL;
+
         for (i = 0; i < periods; i++) {
                 struct at_xdmac_desc    *desc = NULL;
  
-               spin_lock_bh(&atchan->lock);
+               spin_lock_irqsave(&atchan->lock, irqflags);
                 desc = at_xdmac_get_desc(atchan);
                 if (!desc) {
                         dev_err(chan2dev(chan), "can't get descriptor\n");
                         if (first)
                                 list_splice_init(&first->descs_list, &atchan->free_descs_list);
-                       spin_unlock_bh(&atchan->lock);
+                       spin_unlock_irqrestore(&atchan->lock, irqflags);
                         return NULL;
                 }
-               spin_unlock_bh(&atchan->lock);
+               spin_unlock_irqrestore(&atchan->lock, irqflags);
                 dev_dbg(chan2dev(chan),
                         "%s: desc=0x%p, tx_dma_desc.phys=%pad\n",
                         __func__, desc, &desc->tx_dma_desc.phys);
  
                 if (direction == DMA_DEV_TO_MEM) {
-                       desc->lld.mbr_sa = atchan->per_src_addr;
+                       desc->lld.mbr_sa = atchan->sconfig.src_addr;
                         desc->lld.mbr_da = buf_addr + i * period_len;
-                       desc->lld.mbr_cfg = atchan->cfg[AT_XDMAC_DEV_TO_MEM_CFG];
                 } else {
                         desc->lld.mbr_sa = buf_addr + i * period_len;
-                       desc->lld.mbr_da = atchan->per_dst_addr;
-                       desc->lld.mbr_cfg = atchan->cfg[AT_XDMAC_MEM_TO_DEV_CFG];
+                       desc->lld.mbr_da = atchan->sconfig.dst_addr;
                 }
+               desc->lld.mbr_cfg = atchan->cfg;
                 desc->lld.mbr_ubc = AT_XDMAC_MBR_UBC_NDV1
                         | AT_XDMAC_MBR_UBC_NDEN
                         | AT_XDMAC_MBR_UBC_NSEN
@@ -766,6 +806,7 @@ at_xdmac_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
                                         | AT_XDMAC_CC_SIF(0)
                                         | AT_XDMAC_CC_MBSIZE_SIXTEEN
                                         | AT_XDMAC_CC_TYPE_MEM_TRAN;
+       unsigned long           irqflags;
  
         dev_dbg(chan2dev(chan), "%s: src=%pad, dest=%pad, len=%zd, flags=0x%lx\n",
                 __func__, &src, &dest, len, flags);
@@ -798,9 +839,9 @@ at_xdmac_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
  
                 dev_dbg(chan2dev(chan), "%s: remaining_size=%zu\n", __func__, remaining_size);
  
-               spin_lock_bh(&atchan->lock);
+               spin_lock_irqsave(&atchan->lock, irqflags);
                 desc = at_xdmac_get_desc(atchan);
-               spin_unlock_bh(&atchan->lock);
+               spin_unlock_irqrestore(&atchan->lock, irqflags);
                 if (!desc) {
                         dev_err(chan2dev(chan), "can't get descriptor\n");
                         if (first)
@@ -886,6 +927,7 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
         int                     residue;
         u32                     cur_nda, mask, value;
         u8                      dwidth = 0;
+       unsigned long           flags;
  
         ret = dma_cookie_status(chan, cookie, txstate);
         if (ret == DMA_COMPLETE)
@@ -894,7 +936,7 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
         if (!txstate)
                 return ret;
  
-       spin_lock_bh(&atchan->lock);
+       spin_lock_irqsave(&atchan->lock, flags);
  
         desc = list_first_entry(&atchan->xfers_list, struct at_xdmac_desc, xfer_node);
  
@@ -904,8 +946,7 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
          */
         if (!desc->active_xfer) {
                 dma_set_residue(txstate, desc->xfer_size);
-               spin_unlock_bh(&atchan->lock);
-               return ret;
+               goto spin_unlock;
         }
  
         residue = desc->xfer_size;
@@ -936,14 +977,14 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
         }
         residue += at_xdmac_chan_read(atchan, AT_XDMAC_CUBC) << dwidth;
  
-       spin_unlock_bh(&atchan->lock);
-
         dma_set_residue(txstate, residue);
  
         dev_dbg(chan2dev(chan),
                  "%s: desc=0x%p, tx_dma_desc.phys=%pad, tx_status=%d, cookie=%d, residue=%d\n",
                  __func__, desc, &desc->tx_dma_desc.phys, ret, cookie, residue);
  
+spin_unlock:
+       spin_unlock_irqrestore(&atchan->lock, flags);
         return ret;
  }
  
@@ -964,8 +1005,9 @@ static void at_xdmac_remove_xfer(struct at_xdmac_chan *atchan,
  static void at_xdmac_advance_work(struct at_xdmac_chan *atchan)
  {
         struct at_xdmac_desc    *desc;
+       unsigned long           flags;
  
-       spin_lock_bh(&atchan->lock);
+       spin_lock_irqsave(&atchan->lock, flags);
  
         /*
          * If channel is enabled, do nothing, advance_work will be triggered
@@ -980,7 +1022,7 @@ static void at_xdmac_advance_work(struct at_xdmac_chan *atchan)
                         at_xdmac_start_xfer(atchan, desc);
         }
  
-       spin_unlock_bh(&atchan->lock);
+       spin_unlock_irqrestore(&atchan->lock, flags);
  }
  
  static void at_xdmac_handle_cyclic(struct at_xdmac_chan *atchan)
@@ -1116,12 +1158,13 @@ static int at_xdmac_device_config(struct dma_chan *chan,
  {
         struct at_xdmac_chan    *atchan = to_at_xdmac_chan(chan);
         int ret;
+       unsigned long           flags;
  
         dev_dbg(chan2dev(chan), "%s\n", __func__);
  
-       spin_lock_bh(&atchan->lock);
+       spin_lock_irqsave(&atchan->lock, flags);
         ret = at_xdmac_set_slave_config(chan, config);
-       spin_unlock_bh(&atchan->lock);
+       spin_unlock_irqrestore(&atchan->lock, flags);
  
         return ret;
  }
@@ -1130,18 +1173,19 @@ static int at_xdmac_device_pause(struct dma_chan *chan)
  {
         struct at_xdmac_chan    *atchan = to_at_xdmac_chan(chan);
         struct at_xdmac         *atxdmac = to_at_xdmac(atchan->chan.device);
+       unsigned long           flags;
  
         dev_dbg(chan2dev(chan), "%s\n", __func__);
  
         if (test_and_set_bit(AT_XDMAC_CHAN_IS_PAUSED, &atchan->status))
                 return 0;
  
-       spin_lock_bh(&atchan->lock);
+       spin_lock_irqsave(&atchan->lock, flags);
         at_xdmac_write(atxdmac, AT_XDMAC_GRWS, atchan->mask);
         while (at_xdmac_chan_read(atchan, AT_XDMAC_CC)
                & (AT_XDMAC_CC_WRIP | AT_XDMAC_CC_RDIP))
                 cpu_relax();
-       spin_unlock_bh(&atchan->lock);
+       spin_unlock_irqrestore(&atchan->lock, flags);
  
         return 0;
  }
@@ -1150,18 +1194,19 @@ static int at_xdmac_device_resume(struct dma_chan *chan)
  {
         struct at_xdmac_chan    *atchan = to_at_xdmac_chan(chan);
         struct at_xdmac         *atxdmac = to_at_xdmac(atchan->chan.device);
+       unsigned long           flags;
  
         dev_dbg(chan2dev(chan), "%s\n", __func__);
  
-       spin_lock_bh(&atchan->lock);
+       spin_lock_irqsave(&atchan->lock, flags);
         if (!at_xdmac_chan_is_paused(atchan)) {
-               spin_unlock_bh(&atchan->lock);
+               spin_unlock_irqrestore(&atchan->lock, flags);
                 return 0;
         }
  
         at_xdmac_write(atxdmac, AT_XDMAC_GRWR, atchan->mask);
         clear_bit(AT_XDMAC_CHAN_IS_PAUSED, &atchan->status);
-       spin_unlock_bh(&atchan->lock);
+       spin_unlock_irqrestore(&atchan->lock, flags);
  
         return 0;
  }
@@ -1171,10 +1216,11 @@ static int at_xdmac_device_terminate_all(struct dma_chan *chan)
         struct at_xdmac_desc    *desc, *_desc;
         struct at_xdmac_chan    *atchan = to_at_xdmac_chan(chan);
         struct at_xdmac         *atxdmac = to_at_xdmac(atchan->chan.device);
+       unsigned long           flags;
  
         dev_dbg(chan2dev(chan), "%s\n", __func__);
  
-       spin_lock_bh(&atchan->lock);
+       spin_lock_irqsave(&atchan->lock, flags);
         at_xdmac_write(atxdmac, AT_XDMAC_GD, atchan->mask);
         while (at_xdmac_read(atxdmac, AT_XDMAC_GS) & atchan->mask)
                 cpu_relax();
@@ -1184,7 +1230,7 @@ static int at_xdmac_device_terminate_all(struct dma_chan *chan)
                 at_xdmac_remove_xfer(atchan, desc);
  
         clear_bit(AT_XDMAC_CHAN_IS_CYCLIC, &atchan->status);
-       spin_unlock_bh(&atchan->lock);
+       spin_unlock_irqrestore(&atchan->lock, flags);
  
         return 0;
  }
@@ -1194,8 +1240,9 @@ static int at_xdmac_alloc_chan_resources(struct dma_chan *chan)
         struct at_xdmac_chan    *atchan = to_at_xdmac_chan(chan);
         struct at_xdmac_desc    *desc;
         int                     i;
+       unsigned long           flags;
  
-       spin_lock_bh(&atchan->lock);
+       spin_lock_irqsave(&atchan->lock, flags);
  
         if (at_xdmac_chan_is_enabled(atchan)) {
                 dev_err(chan2dev(chan),
@@ -1226,7 +1273,7 @@ static int at_xdmac_alloc_chan_resources(struct dma_chan *chan)
         dev_dbg(chan2dev(chan), "%s: allocated %d descriptors\n", __func__, i);
  
  spin_unlock:
-       spin_unlock_bh(&atchan->lock);
+       spin_unlock_irqrestore(&atchan->lock, flags);
         return i;
  }
  
diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c

index 2890d744bb1bb902cc095fb87841c492cef7542c..3ddfd1f6c23c0f0f891ed11d6f68cbcaaa3c6e03 100644 (file)
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -487,7 +487,11 @@ int dma_get_slave_caps(struct dma_chan *chan, struct dma_slave_caps *caps)
         caps->directions = device->directions;
         caps->residue_granularity = device->residue_granularity;
  
-       caps->cmd_pause = !!device->device_pause;
+       /*
+        * Some devices implement only pause (e.g. to get residuum) but no
+        * resume. However cmd_pause is advertised as pause AND resume.
+        */
+       caps->cmd_pause = !!(device->device_pause && device->device_resume);
         caps->cmd_terminate = !!device->device_terminate_all;
  
         return 0;
diff --git a/drivers/dma/hsu/hsu.c b/drivers/dma/hsu/hsu.c

index 9b84def7a35373a45cf18efc9d53939bcc86baf0..f42f71e37e73767a55078aef4c81bcd238b02db1 100644 (file)
--- a/drivers/dma/hsu/hsu.c
+++ b/drivers/dma/hsu/hsu.c
@@ -384,7 +384,10 @@ static int hsu_dma_terminate_all(struct dma_chan *chan)
         spin_lock_irqsave(&hsuc->vchan.lock, flags);
  
         hsu_dma_stop_channel(hsuc);
-       hsuc->desc = NULL;
+       if (hsuc->desc) {
+               hsu_dma_desc_free(&hsuc->desc->vdesc);
+               hsuc->desc = NULL;
+       }
  
         vchan_get_all_descriptors(&hsuc->vchan, &head);
         spin_unlock_irqrestore(&hsuc->vchan.lock, flags);
diff --git a/drivers/dma/pl330.c b/drivers/dma/pl330.c

index a7d9d3029b145dfa29babeee33022bc1f7354d52..340f9e607cd8b90dfe75add027c18bd26d67f1e0 100644 (file)
--- a/drivers/dma/pl330.c
+++ b/drivers/dma/pl330.c
@@ -2127,6 +2127,7 @@ static int pl330_terminate_all(struct dma_chan *chan)
         struct pl330_dmac *pl330 = pch->dmac;
         LIST_HEAD(list);
  
+       pm_runtime_get_sync(pl330->ddma.dev);
         spin_lock_irqsave(&pch->lock, flags);
         spin_lock(&pl330->lock);
         _stop(pch->thread);
@@ -2151,6 +2152,8 @@ static int pl330_terminate_all(struct dma_chan *chan)
         list_splice_tail_init(&pch->work_list, &pl330->desc_pool);
         list_splice_tail_init(&pch->completed_list, &pl330->desc_pool);
         spin_unlock_irqrestore(&pch->lock, flags);
+       pm_runtime_mark_last_busy(pl330->ddma.dev);
+       pm_runtime_put_autosuspend(pl330->ddma.dev);
  
         return 0;
  }
diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig

index 8de4da5c9ab69c919389057078d352fd492fbcd3..54071c1483400d41e214c0f83512ca1f4600814a 100644 (file)
--- a/drivers/firmware/efi/Kconfig
+++ b/drivers/firmware/efi/Kconfig
@@ -18,6 +18,11 @@ config EFI_VARS
           Subsequent efibootmgr releases may be found at:
           <http://github.com/vathpela/efibootmgr>
  
+config EFI_ESRT
+       bool
+       depends on EFI && !IA64
+       default y
+
  config EFI_VARS_PSTORE
         tristate "Register efivars backend for pstore"
         depends on EFI_VARS && PSTORE
diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile

index d8be608a9f3be733bf4d56a1e360ecc3c5471e7b..6fd3da938717c27c233af9c6b66b186f69fc93fa 100644 (file)
--- a/drivers/firmware/efi/Makefile
+++ b/drivers/firmware/efi/Makefile
@@ -3,6 +3,7 @@
  #
  obj-$(CONFIG_EFI)                      += efi.o vars.o reboot.o
  obj-$(CONFIG_EFI_VARS)                 += efivars.o
+obj-$(CONFIG_EFI_ESRT)                 += esrt.o
  obj-$(CONFIG_EFI_VARS_PSTORE)          += efi-pstore.o
  obj-$(CONFIG_UEFI_CPER)                        += cper.o
  obj-$(CONFIG_EFI_RUNTIME_MAP)          += runtime-map.o
diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c

index 3061bb8629dc3fbdf19e19d373d75b2286cf8376..ca617f40574ac2bb8e5697b858a1c52f93c8fad9 100644 (file)
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -39,6 +39,7 @@ struct efi __read_mostly efi = {
         .fw_vendor  = EFI_INVALID_TABLE_ADDR,
         .runtime    = EFI_INVALID_TABLE_ADDR,
         .config_table  = EFI_INVALID_TABLE_ADDR,
+       .esrt       = EFI_INVALID_TABLE_ADDR,
  };
  EXPORT_SYMBOL(efi);
  
@@ -64,7 +65,7 @@ static int __init parse_efi_cmdline(char *str)
  }
  early_param("efi", parse_efi_cmdline);
  
-static struct kobject *efi_kobj;
+struct kobject *efi_kobj;
  static struct kobject *efivars_kobj;
  
  /*
@@ -85,10 +86,15 @@ static ssize_t systab_show(struct kobject *kobj,
                 str += sprintf(str, "ACPI20=0x%lx\n", efi.acpi20);
         if (efi.acpi != EFI_INVALID_TABLE_ADDR)
                 str += sprintf(str, "ACPI=0x%lx\n", efi.acpi);
-       if (efi.smbios != EFI_INVALID_TABLE_ADDR)
-               str += sprintf(str, "SMBIOS=0x%lx\n", efi.smbios);
+       /*
+        * If both SMBIOS and SMBIOS3 entry points are implemented, the
+        * SMBIOS3 entry point shall be preferred, so we list it first to
+        * let applications stop parsing after the first match.
+        */
         if (efi.smbios3 != EFI_INVALID_TABLE_ADDR)
                 str += sprintf(str, "SMBIOS3=0x%lx\n", efi.smbios3);
+       if (efi.smbios != EFI_INVALID_TABLE_ADDR)
+               str += sprintf(str, "SMBIOS=0x%lx\n", efi.smbios);
         if (efi.hcdp != EFI_INVALID_TABLE_ADDR)
                 str += sprintf(str, "HCDP=0x%lx\n", efi.hcdp);
         if (efi.boot_info != EFI_INVALID_TABLE_ADDR)
@@ -232,6 +238,84 @@ err_put:
  
  subsys_initcall(efisubsys_init);
  
+/*
+ * Find the efi memory descriptor for a given physical address.  Given a
+ * physical address, determine if it exists within an EFI Memory Map entry,
+ * and if so, populate the supplied memory descriptor with the appropriate
+ * data.
+ */
+int __init efi_mem_desc_lookup(u64 phys_addr, efi_memory_desc_t *out_md)
+{
+       struct efi_memory_map *map = efi.memmap;
+       void *p, *e;
+
+       if (!efi_enabled(EFI_MEMMAP)) {
+               pr_err_once("EFI_MEMMAP is not enabled.\n");
+               return -EINVAL;
+       }
+
+       if (!map) {
+               pr_err_once("efi.memmap is not set.\n");
+               return -EINVAL;
+       }
+       if (!out_md) {
+               pr_err_once("out_md is null.\n");
+               return -EINVAL;
+        }
+       if (WARN_ON_ONCE(!map->phys_map))
+               return -EINVAL;
+       if (WARN_ON_ONCE(map->nr_map == 0) || WARN_ON_ONCE(map->desc_size == 0))
+               return -EINVAL;
+
+       e = map->phys_map + map->nr_map * map->desc_size;
+       for (p = map->phys_map; p < e; p += map->desc_size) {
+               efi_memory_desc_t *md;
+               u64 size;
+               u64 end;
+
+               /*
+                * If a driver calls this after efi_free_boot_services,
+                * ->map will be NULL, and the target may also not be mapped.
+                * So just always get our own virtual map on the CPU.
+                *
+                */
+               md = early_memremap((phys_addr_t)p, sizeof (*md));
+               if (!md) {
+                       pr_err_once("early_memremap(%p, %zu) failed.\n",
+                                   p, sizeof (*md));
+                       return -ENOMEM;
+               }
+
+               if (!(md->attribute & EFI_MEMORY_RUNTIME) &&
+                   md->type != EFI_BOOT_SERVICES_DATA &&
+                   md->type != EFI_RUNTIME_SERVICES_DATA) {
+                       early_memunmap(md, sizeof (*md));
+                       continue;
+               }
+
+               size = md->num_pages << EFI_PAGE_SHIFT;
+               end = md->phys_addr + size;
+               if (phys_addr >= md->phys_addr && phys_addr < end) {
+                       memcpy(out_md, md, sizeof(*out_md));
+                       early_memunmap(md, sizeof (*md));
+                       return 0;
+               }
+
+               early_memunmap(md, sizeof (*md));
+       }
+       pr_err_once("requested map not found.\n");
+       return -ENOENT;
+}
+
+/*
+ * Calculate the highest address of an efi memory descriptor.
+ */
+u64 __init efi_mem_desc_end(efi_memory_desc_t *md)
+{
+       u64 size = md->num_pages << EFI_PAGE_SHIFT;
+       u64 end = md->phys_addr + size;
+       return end;
+}
  
  /*
   * We can't ioremap data in EFI boot services RAM, because we've already mapped
@@ -274,6 +358,7 @@ static __initdata efi_config_table_type_t common_tables[] = {
         {SMBIOS_TABLE_GUID, "SMBIOS", &efi.smbios},
         {SMBIOS3_TABLE_GUID, "SMBIOS 3.0", &efi.smbios3},
         {UGA_IO_PROTOCOL_GUID, "UGA", &efi.uga},
+       {EFI_SYSTEM_RESOURCE_TABLE_GUID, "ESRT", &efi.esrt},
         {NULL_GUID, NULL, NULL},
  };
  
diff --git a/drivers/firmware/efi/efivars.c b/drivers/firmware/efi/efivars.c

index 7b2e0496e0c084c4e9e319c04d61245abdde5edd..756eca8c4cf8f291025a3ad44f7cbb9981aeb5fd 100644 (file)
--- a/drivers/firmware/efi/efivars.c
+++ b/drivers/firmware/efi/efivars.c
@@ -535,7 +535,7 @@ static ssize_t efivar_delete(struct file *filp, struct kobject *kobj,
   * efivar_create_sysfs_entry - create a new entry in sysfs
   * @new_var: efivar entry to create
   *
- * Returns 1 on failure, 0 on success
+ * Returns 0 on success, negative error code on failure
   */
  static int
  efivar_create_sysfs_entry(struct efivar_entry *new_var)
@@ -544,6 +544,7 @@ efivar_create_sysfs_entry(struct efivar_entry *new_var)
         char *short_name;
         unsigned long variable_name_size;
         efi_char16_t *variable_name;
+       int ret;
  
         variable_name = new_var->var.VariableName;
         variable_name_size = ucs2_strlen(variable_name) * sizeof(efi_char16_t);
@@ -558,7 +559,7 @@ efivar_create_sysfs_entry(struct efivar_entry *new_var)
         short_name = kzalloc(short_name_size, GFP_KERNEL);
  
         if (!short_name)
-               return 1;
+               return -ENOMEM;
  
         /* Convert Unicode to normal chars (assume top bits are 0),
            ala UTF-8 */
@@ -574,11 +575,11 @@ efivar_create_sysfs_entry(struct efivar_entry *new_var)
  
         new_var->kobj.kset = efivars_kset;
  
-       i = kobject_init_and_add(&new_var->kobj, &efivar_ktype,
+       ret = kobject_init_and_add(&new_var->kobj, &efivar_ktype,
                                    NULL, "%s", short_name);
         kfree(short_name);
-       if (i)
-               return 1;
+       if (ret)
+               return ret;
  
         kobject_uevent(&new_var->kobj, KOBJ_ADD);
         efivar_entry_add(new_var, &efivar_sysfs_list);
diff --git a/drivers/firmware/efi/esrt.c b/drivers/firmware/efi/esrt.c

new file mode 100644 (file)

index 0000000..a5b95d6
--- /dev/null
+++ b/drivers/firmware/efi/esrt.c
@@ -0,0 +1,471 @@
+/*
+ * esrt.c
+ *
+ * This module exports EFI System Resource Table (ESRT) entries into userspace
+ * through the sysfs file system. The ESRT provides a read-only catalog of
+ * system components for which the system accepts firmware upgrades via UEFI's
+ * "Capsule Update" feature. This module allows userland utilities to evaluate
+ * what firmware updates can be applied to this system, and potentially arrange
+ * for those updates to occur.
+ *
+ * Data is currently found below /sys/firmware/efi/esrt/...
+ */
+#define pr_fmt(fmt) "esrt: " fmt
+
+#include <linux/capability.h>
+#include <linux/device.h>
+#include <linux/efi.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/kobject.h>
+#include <linux/list.h>
+#include <linux/memblock.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+#include <asm/io.h>
+#include <asm/early_ioremap.h>
+
+struct efi_system_resource_entry_v1 {
+       efi_guid_t      fw_class;
+       u32             fw_type;
+       u32             fw_version;
+       u32             lowest_supported_fw_version;
+       u32             capsule_flags;
+       u32             last_attempt_version;
+       u32             last_attempt_status;
+};
+
+/*
+ * _count and _version are what they seem like.  _max is actually just
+ * accounting info for the firmware when creating the table; it should never
+ * have been exposed to us.  To wit, the spec says:
+ *   "The maximum number of resource array entries that can be within the
+ *   table without reallocating the table, must not be zero."
+ * Since there's no guidance about what that means in terms of memory layout,
+ * it means nothing to us.
+ */
+struct efi_system_resource_table {
+       u32     fw_resource_count;
+       u32     fw_resource_count_max;
+       u64     fw_resource_version;
+       u8      entries[];
+};
+
+static phys_addr_t esrt_data;
+static size_t esrt_data_size;
+
+static struct efi_system_resource_table *esrt;
+
+struct esre_entry {
+       union {
+               struct efi_system_resource_entry_v1 *esre1;
+       } esre;
+
+       struct kobject kobj;
+       struct list_head list;
+};
+
+/* global list of esre_entry. */
+static LIST_HEAD(entry_list);
+
+/* entry attribute */
+struct esre_attribute {
+       struct attribute attr;
+       ssize_t (*show)(struct esre_entry *entry, char *buf);
+       ssize_t (*store)(struct esre_entry *entry,
+                        const char *buf, size_t count);
+};
+
+static struct esre_entry *to_entry(struct kobject *kobj)
+{
+       return container_of(kobj, struct esre_entry, kobj);
+}
+
+static struct esre_attribute *to_attr(struct attribute *attr)
+{
+       return container_of(attr, struct esre_attribute, attr);
+}
+
+static ssize_t esre_attr_show(struct kobject *kobj,
+                             struct attribute *_attr, char *buf)
+{
+       struct esre_entry *entry = to_entry(kobj);
+       struct esre_attribute *attr = to_attr(_attr);
+
+       /* Don't tell normal users what firmware versions we've got... */
+       if (!capable(CAP_SYS_ADMIN))
+               return -EACCES;
+
+       return attr->show(entry, buf);
+}
+
+static const struct sysfs_ops esre_attr_ops = {
+       .show = esre_attr_show,
+};
+
+/* Generic ESRT Entry ("ESRE") support. */
+static ssize_t esre_fw_class_show(struct esre_entry *entry, char *buf)
+{
+       char *str = buf;
+
+       efi_guid_to_str(&entry->esre.esre1->fw_class, str);
+       str += strlen(str);
+       str += sprintf(str, "\n");
+
+       return str - buf;
+}
+
+static struct esre_attribute esre_fw_class = __ATTR(fw_class, 0400,
+       esre_fw_class_show, NULL);
+
+#define esre_attr_decl(name, size, fmt) \
+static ssize_t esre_##name##_show(struct esre_entry *entry, char *buf) \
+{ \
+       return sprintf(buf, fmt "\n", \
+                      le##size##_to_cpu(entry->esre.esre1->name)); \
+} \
+\
+static struct esre_attribute esre_##name = __ATTR(name, 0400, \
+       esre_##name##_show, NULL)
+
+esre_attr_decl(fw_type, 32, "%u");
+esre_attr_decl(fw_version, 32, "%u");
+esre_attr_decl(lowest_supported_fw_version, 32, "%u");
+esre_attr_decl(capsule_flags, 32, "0x%x");
+esre_attr_decl(last_attempt_version, 32, "%u");
+esre_attr_decl(last_attempt_status, 32, "%u");
+
+static struct attribute *esre1_attrs[] = {
+       &esre_fw_class.attr,
+       &esre_fw_type.attr,
+       &esre_fw_version.attr,
+       &esre_lowest_supported_fw_version.attr,
+       &esre_capsule_flags.attr,
+       &esre_last_attempt_version.attr,
+       &esre_last_attempt_status.attr,
+       NULL
+};
+static void esre_release(struct kobject *kobj)
+{
+       struct esre_entry *entry = to_entry(kobj);
+
+       list_del(&entry->list);
+       kfree(entry);
+}
+
+static struct kobj_type esre1_ktype = {
+       .release = esre_release,
+       .sysfs_ops = &esre_attr_ops,
+       .default_attrs = esre1_attrs,
+};
+
+
+static struct kobject *esrt_kobj;
+static struct kset *esrt_kset;
+
+static int esre_create_sysfs_entry(void *esre, int entry_num)
+{
+       struct esre_entry *entry;
+       char name[20];
+
+       entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+       if (!entry)
+               return -ENOMEM;
+
+       sprintf(name, "entry%d", entry_num);
+
+       entry->kobj.kset = esrt_kset;
+
+       if (esrt->fw_resource_version == 1) {
+               int rc = 0;
+
+               entry->esre.esre1 = esre;
+               rc = kobject_init_and_add(&entry->kobj, &esre1_ktype, NULL,
+                                         "%s", name);
+               if (rc) {
+                       kfree(entry);
+                       return rc;
+               }
+       }
+
+       list_add_tail(&entry->list, &entry_list);
+       return 0;
+}
+
+/* support for displaying ESRT fields at the top level */
+#define esrt_attr_decl(name, size, fmt) \
+static ssize_t esrt_##name##_show(struct kobject *kobj, \
+                                 struct kobj_attribute *attr, char *buf)\
+{ \
+       return sprintf(buf, fmt "\n", le##size##_to_cpu(esrt->name)); \
+} \
+\
+static struct kobj_attribute esrt_##name = __ATTR(name, 0400, \
+       esrt_##name##_show, NULL)
+
+esrt_attr_decl(fw_resource_count, 32, "%u");
+esrt_attr_decl(fw_resource_count_max, 32, "%u");
+esrt_attr_decl(fw_resource_version, 64, "%llu");
+
+static struct attribute *esrt_attrs[] = {
+       &esrt_fw_resource_count.attr,
+       &esrt_fw_resource_count_max.attr,
+       &esrt_fw_resource_version.attr,
+       NULL,
+};
+
+static inline int esrt_table_exists(void)
+{
+       if (!efi_enabled(EFI_CONFIG_TABLES))
+               return 0;
+       if (efi.esrt == EFI_INVALID_TABLE_ADDR)
+               return 0;
+       return 1;
+}
+
+static umode_t esrt_attr_is_visible(struct kobject *kobj,
+                                   struct attribute *attr, int n)
+{
+       if (!esrt_table_exists())
+               return 0;
+       return attr->mode;
+}
+
+static struct attribute_group esrt_attr_group = {
+       .attrs = esrt_attrs,
+       .is_visible = esrt_attr_is_visible,
+};
+
+/*
+ * Validate the ESRT referenced by efi.esrt and reserve the memory it
+ * occupies.
+ *
+ * The table header is mapped with early_memremap() and copied to the stack
+ * so that its version and entry count can be checked against the EFI memory
+ * descriptor containing it.  When header and entries fit, the physical range
+ * is recorded in esrt_data/esrt_data_size and reserved via
+ * memblock_reserve().  Every early mapping created here is released before
+ * returning, on error paths as well as on success.
+ */
+void __init efi_esrt_init(void)
+{
+       void *va;
+       struct efi_system_resource_table tmpesrt;
+       struct efi_system_resource_entry_v1 *v1_entries;
+       size_t size, max, entry_size, entries_size;
+       efi_memory_desc_t md;
+       int rc;
+       phys_addr_t end;
+
+       pr_debug("esrt-init: loading.\n");
+       if (!esrt_table_exists())
+               return;
+
+       rc = efi_mem_desc_lookup(efi.esrt, &md);
+       if (rc < 0) {
+               pr_err("ESRT header is not in the memory map.\n");
+               return;
+       }
+
+       max = efi_mem_desc_end(&md);
+       if (max < efi.esrt) {
+               pr_err("EFI memory descriptor is invalid. (esrt: %p max: %p)\n",
+                      (void *)efi.esrt, (void *)max);
+               return;
+       }
+
+       /* From here on, max is the space left in the descriptor after efi.esrt. */
+       size = sizeof(*esrt);
+       max -= efi.esrt;
+
+       if (max < size) {
+               pr_err("ESRT header doen't fit on single memory map entry. (size: %zu max: %zu)\n",
+                      size, max);
+               return;
+       }
+
+       /* First mapping: header only, to learn version and entry count. */
+       va = early_memremap(efi.esrt, size);
+       if (!va) {
+               pr_err("early_memremap(%p, %zu) failed.\n", (void *)efi.esrt,
+                      size);
+               return;
+       }
+
+       memcpy(&tmpesrt, va, sizeof(tmpesrt));
+
+       if (tmpesrt.fw_resource_version == 1) {
+               entry_size = sizeof (*v1_entries);
+       } else {
+               pr_err("Unsupported ESRT version %lld.\n",
+                      tmpesrt.fw_resource_version);
+               /* The header is still mapped; release it instead of leaking. */
+               goto err_memunmap;
+       }
+
+       if (tmpesrt.fw_resource_count > 0 && max - size < entry_size) {
+               pr_err("ESRT memory map entry can only hold the header. (max: %zu size: %zu)\n",
+                      max - size, entry_size);
+               goto err_memunmap;
+       }
+
+       /*
+        * The format doesn't really give us any boundary to test here,
+        * so I'm making up 128 as the max number of individually updatable
+        * components we support.
+        * 128 should be pretty excessive, but there's still some chance
+        * somebody will do that someday and we'll need to raise this.
+        */
+       if (tmpesrt.fw_resource_count > 128) {
+               pr_err("ESRT says fw_resource_count has very large value %d.\n",
+                      tmpesrt.fw_resource_count);
+               goto err_memunmap;
+       }
+
+       /*
+        * We know it can't be larger than N * sizeof() here, and N is limited
+        * by the previous test to a small number, so there's no overflow.
+        */
+       entries_size = tmpesrt.fw_resource_count * entry_size;
+       if (max < size + entries_size) {
+               pr_err("ESRT does not fit on single memory map entry (size: %zu max: %zu)\n",
+                      size, max);
+               goto err_memunmap;
+       }
+
+       /* remap it with our (plausible) new pages */
+       early_memunmap(va, size);
+       size += entries_size;
+       va = early_memremap(efi.esrt, size);
+       if (!va) {
+               pr_err("early_memremap(%p, %zu) failed.\n", (void *)efi.esrt,
+                      size);
+               /* Nothing is mapped at this point, so a plain return is fine. */
+               return;
+       }
+
+       esrt_data = (phys_addr_t)efi.esrt;
+       esrt_data_size = size;
+
+       end = esrt_data + size;
+       pr_info("Reserving ESRT space from %pa to %pa.\n", &esrt_data, &end);
+       memblock_reserve(esrt_data, esrt_data_size);
+
+       pr_debug("esrt-init: loaded.\n");
+       /* Success falls through: size matches whichever mapping va holds. */
+err_memunmap:
+       early_memunmap(va, size);
+}
+
+/*
+ * Create one sysfs entry per firmware resource entry of the table that
+ * esrt points to.
+ *
+ * Returns 0 on success or when no ESRT exists, -EINVAL if the table version
+ * is not 1, or the negative value returned by esre_create_sysfs_entry().
+ */
+static int __init register_entries(void)
+{
+       struct efi_system_resource_entry_v1 *v1_entries = (void *)esrt->entries;
+       int i, rc;
+
+       if (!esrt_table_exists())
+               return 0;
+
+       for (i = 0; i < le32_to_cpu(esrt->fw_resource_count); i++) {
+               /* Only the version 1 entry layout is handled. */
+               if (esrt->fw_resource_version != 1) {
+                       pr_err("Unsupported ESRT version %lld.\n",
+                              esrt->fw_resource_version);
+                       return -EINVAL;
+               }
+
+               rc = esre_create_sysfs_entry(&v1_entries[i], i);
+               if (rc < 0) {
+                       pr_err("ESRT entry creation failed with error %d.\n",
+                              rc);
+                       return rc;
+               }
+       }
+       return 0;
+}
+
+/* Drop a reference on the kobject of every entry linked on entry_list. */
+static void cleanup_entry_list(void)
+{
+       struct esre_entry *cur, *tmp;
+
+       list_for_each_entry_safe(cur, tmp, &entry_list, list)
+               kobject_put(&cur->kobj);
+}
+
+/*
+ * Copy the ESRT recorded in esrt_data/esrt_data_size into kmalloc()ed
+ * memory and publish it under the "esrt" kobject below efi_kobj.
+ *
+ * Returns 0 on success, -ENOSYS when no ESRT range was recorded, -ENOMEM on
+ * mapping/allocation/kobject/kset failure, or the error returned by
+ * sysfs_create_group() or register_entries().
+ */
+static int __init esrt_sysfs_init(void)
+{
+       int error;
+       struct efi_system_resource_table __iomem *ioesrt;
+
+       pr_debug("esrt-sysfs: loading.\n");
+       if (!esrt_data || !esrt_data_size)
+               return -ENOSYS;
+
+       ioesrt = ioremap(esrt_data, esrt_data_size);
+       if (!ioesrt) {
+               pr_err("ioremap(%pa, %zu) failed.\n", &esrt_data,
+                      esrt_data_size);
+               return -ENOMEM;
+       }
+
+       esrt = kmalloc(esrt_data_size, GFP_KERNEL);
+       if (!esrt) {
+               pr_err("kmalloc failed. (wanted %zu bytes)\n", esrt_data_size);
+               iounmap(ioesrt);
+               return -ENOMEM;
+       }
+
+       memcpy_fromio(esrt, ioesrt, esrt_data_size);
+       /*
+        * Everything below works on the kmalloc()ed copy, so release the
+        * mapping now rather than leaking it on success and on later errors.
+        */
+       iounmap(ioesrt);
+
+       esrt_kobj = kobject_create_and_add("esrt", efi_kobj);
+       if (!esrt_kobj) {
+               pr_err("Firmware table registration failed.\n");
+               error = -ENOMEM;
+               goto err;
+       }
+
+       error = sysfs_create_group(esrt_kobj, &esrt_attr_group);
+       if (error) {
+               pr_err("Sysfs attribute export failed with error %d.\n",
+                      error);
+               goto err_remove_esrt;
+       }
+
+       esrt_kset = kset_create_and_add("entries", NULL, esrt_kobj);
+       if (!esrt_kset) {
+               pr_err("kset creation failed.\n");
+               error = -ENOMEM;
+               goto err_remove_group;
+       }
+
+       error = register_entries();
+       if (error)
+               goto err_cleanup_list;
+
+       /* The table now lives in the copy; drop the range from memblock. */
+       memblock_remove(esrt_data, esrt_data_size);
+
+       pr_debug("esrt-sysfs: loaded.\n");
+
+       return 0;
+       /* Unwind in reverse order of construction. */
+err_cleanup_list:
+       cleanup_entry_list();
+       kset_unregister(esrt_kset);
+err_remove_group:
+       sysfs_remove_group(esrt_kobj, &esrt_attr_group);
+err_remove_esrt:
+       kobject_put(esrt_kobj);
+err:
+       kfree(esrt);
+       esrt = NULL;
+       return error;
+}
+
+/*
+ * Module exit: tear down what esrt_sysfs_init() created -- the per-entry
+ * kobjects, the "entries" kset, the attribute group, the copied table and
+ * finally the "esrt" kobject itself.
+ */
+static void __exit esrt_sysfs_exit(void)
+{
+       pr_debug("esrt-sysfs: unloading.\n");
+       cleanup_entry_list();
+       kset_unregister(esrt_kset);
+       sysfs_remove_group(esrt_kobj, &esrt_attr_group);
+       /* Free the copy of the table and clear the stale pointer. */
+       kfree(esrt);
+       esrt = NULL;
+       kobject_del(esrt_kobj);
+       kobject_put(esrt_kobj);
+}
+
+/* Register esrt_sysfs_init()/esrt_sysfs_exit() as module entry and exit. */
+module_init(esrt_sysfs_init);
+module_exit(esrt_sysfs_exit);
+
+MODULE_AUTHOR("Peter Jones <pjones@redhat.com>");
+MODULE_DESCRIPTION("EFI System Resource Table support");
+MODULE_LICENSE("GPL");
diff --git a/drivers/firmware/iscsi_ibft.c b/drivers/firmware/iscsi_ibft.c

index 071c2c969eec06ad929ecfb871c614297a615e9e..72791232e46ba44ff474cf7dadff5ccb433d7348 100644 (file)
--- a/drivers/firmware/iscsi_ibft.c
+++ b/drivers/firmware/iscsi_ibft.c
@@ -186,8 +186,20 @@ struct ibft_kobject {
  
  static struct iscsi_boot_kset *boot_kset;
  
+/* fully null address */
  static const char nulls[16];
  
+/*
+ * IPv4-mapped IPv6 ::ffff:0.0.0.0 -- treated as a null address by
+ * address_not_null(), alongside the all-zero nulls[] pattern.
+ */
+static const char mapped_nulls[16] = { 0x00, 0x00, 0x00, 0x00,
+                                       0x00, 0x00, 0x00, 0x00,
+                                       0x00, 0x00, 0xff, 0xff,
+                                       0x00, 0x00, 0x00, 0x00 };
+
+/*
+ * Return 1 if the 16-byte address at @ip matches neither nulls[] nor
+ * mapped_nulls[], 0 if it matches either of them.
+ */
+static int address_not_null(u8 *ip)
+{
+       if (!memcmp(ip, nulls, 16))
+               return 0;
+
+       return memcmp(ip, mapped_nulls, 16) != 0;
+}
+
  /*
   * Helper functions to parse data properly.
   */
@@ -445,7 +457,7 @@ static umode_t ibft_check_nic_for(void *data, int type)
                 rc = S_IRUGO;
                 break;
         case ISCSI_BOOT_ETH_IP_ADDR:
-               if (memcmp(nic->ip_addr, nulls, sizeof(nic->ip_addr)))
+               if (address_not_null(nic->ip_addr))
                         rc = S_IRUGO;
                 break;
         case ISCSI_BOOT_ETH_SUBNET_MASK:
@@ -456,21 +468,19 @@ static umode_t ibft_check_nic_for(void *data, int type)
                 rc = S_IRUGO;
                 break;
         case ISCSI_BOOT_ETH_GATEWAY:
-               if (memcmp(nic->gateway, nulls, sizeof(nic->gateway)))
+               if (address_not_null(nic->gateway))
                         rc = S_IRUGO;
                 break;
         case ISCSI_BOOT_ETH_PRIMARY_DNS:
-               if (memcmp(nic->primary_dns, nulls,
-                          sizeof(nic->primary_dns)))
+               if (address_not_null(nic->primary_dns))
                         rc = S_IRUGO;
                 break;
         case ISCSI_BOOT_ETH_SECONDARY_DNS:
-               if (memcmp(nic->secondary_dns, nulls,
-                          sizeof(nic->secondary_dns)))
+               if (address_not_null(nic->secondary_dns))
                         rc = S_IRUGO;
                 break;
         case ISCSI_BOOT_ETH_DHCP:
-               if (memcmp(nic->dhcp, nulls, sizeof(nic->dhcp)))
+               if (address_not_null(nic->dhcp))
                         rc = S_IRUGO;
                 break;
         case ISCSI_BOOT_ETH_VLAN:
@@ -536,23 +546,19 @@ static umode_t __init ibft_check_initiator_for(void *data, int type)
                 rc = S_IRUGO;
                 break;
         case ISCSI_BOOT_INI_ISNS_SERVER:
-               if (memcmp(init->isns_server, nulls,
-                          sizeof(init->isns_server)))
+               if (address_not_null(init->isns_server))
                         rc = S_IRUGO;
                 break;
         case ISCSI_BOOT_INI_SLP_SERVER:
-               if (memcmp(init->slp_server, nulls,
-                          sizeof(init->slp_server)))
+               if (address_not_null(init->slp_server))
                         rc = S_IRUGO;
                 break;
         case ISCSI_BOOT_INI_PRI_RADIUS_SERVER:
-               if (memcmp(init->pri_radius_server, nulls,
-                          sizeof(init->pri_radius_server)))
+               if (address_not_null(init->pri_radius_server))
                         rc = S_IRUGO;
                 break;
         case ISCSI_BOOT_INI_SEC_RADIUS_SERVER:
-               if (memcmp(init->sec_radius_server, nulls,
-                          sizeof(init->sec_radius_server)))
+               if (address_not_null(init->sec_radius_server))
                         rc = S_IRUGO;
                 break;
         case ISCSI_BOOT_INI_INITIATOR_NAME:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c

index e469c4b2e8cc85981e3ba99cae0b28c1b7a9b2ec..c25728bc388a2be7134cb3e6b895a7a39d4189a2 100644 (file)
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -684,8 +684,6 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
                         dev->node_props.cpu_core_id_base);
         sysfs_show_32bit_prop(buffer, "simd_id_base",
                         dev->node_props.simd_id_base);
-       sysfs_show_32bit_prop(buffer, "capability",
-                       dev->node_props.capability);
         sysfs_show_32bit_prop(buffer, "max_waves_per_simd",
                         dev->node_props.max_waves_per_simd);
         sysfs_show_32bit_prop(buffer, "lds_size_in_kb",
@@ -736,6 +734,8 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
                         dev->gpu->kfd2kgd->get_fw_version(
                                                 dev->gpu->kgd,
                                                 KGD_ENGINE_MEC1));
+               sysfs_show_32bit_prop(buffer, "capability",
+                               dev->node_props.capability);
         }
  
         return sysfs_show_32bit_prop(buffer, "max_engine_clk_ccompute",
diff --git a/drivers/gpu/drm/drm_sysfs.c b/drivers/gpu/drm/drm_sysfs.c

index ffc305fc20768c29af6883eeb2d70553839cfa6e..eb7e61078a5b6f1088489b49b42b32abe8ffca42 100644 (file)
--- a/drivers/gpu/drm/drm_sysfs.c
+++ b/drivers/gpu/drm/drm_sysfs.c
@@ -217,7 +217,7 @@ static ssize_t status_store(struct device *device,
  
         mutex_unlock(&dev->mode_config.mutex);
  
-       return ret;
+       return ret ? ret : count;
  }
  
  static ssize_t status_show(struct device *device,
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c

index 007c7d7d82950f597bb05ef8388fb1696ef72b38..dc55c51964ab501720f02ae682118ce12a51f0ff 100644 (file)
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1667,12 +1667,15 @@ static int i915_sr_status(struct seq_file *m, void *unused)
  
         if (HAS_PCH_SPLIT(dev))
                 sr_enabled = I915_READ(WM1_LP_ILK) & WM1_LP_SR_EN;
-       else if (IS_CRESTLINE(dev) || IS_I945G(dev) || IS_I945GM(dev))
+       else if (IS_CRESTLINE(dev) || IS_G4X(dev) ||
+                IS_I945G(dev) || IS_I945GM(dev))
                 sr_enabled = I915_READ(FW_BLC_SELF) & FW_BLC_SELF_EN;
         else if (IS_I915GM(dev))
                 sr_enabled = I915_READ(INSTPM) & INSTPM_SELF_EN;
         else if (IS_PINEVIEW(dev))
                 sr_enabled = I915_READ(DSPFW3) & PINEVIEW_SELF_REFRESH_EN;
+       else if (IS_VALLEYVIEW(dev))
+               sr_enabled = I915_READ(FW_BLC_SELF_VLV) & FW_CSPWRDWNEN;
  
         intel_runtime_pm_put(dev_priv);
  
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c

index 53394f998a1f9429f87b78598a69e232a48d5b38..2d0995e7afc37482a594be7e25b5baaadc6a6798 100644 (file)
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3003,8 +3003,8 @@ int i915_vma_unbind(struct i915_vma *vma)
                 } else if (vma->ggtt_view.pages) {
                         sg_free_table(vma->ggtt_view.pages);
                         kfree(vma->ggtt_view.pages);
-                       vma->ggtt_view.pages = NULL;
                 }
+               vma->ggtt_view.pages = NULL;
         }
  
         drm_mm_remove_node(&vma->node);
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c

index a3190e793ed43744980bedba4ed42e1d0e38d597..cc552a4c1f3b20a714e2ba73bbb59d3559451d0e 100644 (file)
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -32,6 +32,7 @@
  #include "i915_trace.h"
  #include "intel_drv.h"
  #include <linux/dma_remapping.h>
+#include <linux/uaccess.h>
  
  #define  __EXEC_OBJECT_HAS_PIN (1<<31)
  #define  __EXEC_OBJECT_HAS_FENCE (1<<30)
@@ -465,7 +466,7 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
         }
  
         /* We can't wait for rendering with pagefaults disabled */
-       if (obj->active && in_atomic())
+       if (obj->active && pagefault_disabled())
                 return -EFAULT;
  
         if (use_cpu_reloc(obj))
diff --git a/drivers/gpu/drm/i915/intel_dp.c b/drivers/gpu/drm/i915/intel_dp.c

index f27346e907b1e9e4cb1d4d3eda9b33cca2f63033..d714a4b5711e4e7fa390ec6b659d2683ef41f585 100644 (file)
--- a/drivers/gpu/drm/i915/intel_dp.c
+++ b/drivers/gpu/drm/i915/intel_dp.c
@@ -880,10 +880,8 @@ intel_dp_aux_ch(struct intel_dp *intel_dp,
                                       DP_AUX_CH_CTL_RECEIVE_ERROR))
                                 continue;
                         if (status & DP_AUX_CH_CTL_DONE)
-                               break;
+                               goto done;
                 }
-               if (status & DP_AUX_CH_CTL_DONE)
-                       break;
         }
  
         if ((status & DP_AUX_CH_CTL_DONE) == 0) {
@@ -892,6 +890,7 @@ intel_dp_aux_ch(struct intel_dp *intel_dp,
                 goto out;
         }
  
+done:
         /* Check for timeout or receive error.
          * Timeouts occur when the sink is not connected
          */
diff --git a/drivers/gpu/drm/i915/intel_i2c.c b/drivers/gpu/drm/i915/intel_i2c.c

index 56e437e3158021a09641d188affc6129f0b1eda8..ae628001fd97873b67f99fb0128167858948afe6 100644 (file)
--- a/drivers/gpu/drm/i915/intel_i2c.c
+++ b/drivers/gpu/drm/i915/intel_i2c.c
@@ -435,7 +435,7 @@ gmbus_xfer(struct i2c_adapter *adapter,
                                                struct intel_gmbus,
                                                adapter);
         struct drm_i915_private *dev_priv = bus->dev_priv;
-       int i, reg_offset;
+       int i = 0, inc, try = 0, reg_offset;
         int ret = 0;
  
         intel_aux_display_runtime_get(dev_priv);
@@ -448,12 +448,14 @@ gmbus_xfer(struct i2c_adapter *adapter,
  
         reg_offset = dev_priv->gpio_mmio_base;
  
+retry:
         I915_WRITE(GMBUS0 + reg_offset, bus->reg0);
  
-       for (i = 0; i < num; i++) {
+       for (; i < num; i += inc) {
+               inc = 1;
                 if (gmbus_is_index_read(msgs, i, num)) {
                         ret = gmbus_xfer_index_read(dev_priv, &msgs[i]);
-                       i += 1;  /* set i to the index of the read xfer */
+                       inc = 2; /* an index read is two msgs */
                 } else if (msgs[i].flags & I2C_M_RD) {
                         ret = gmbus_xfer_read(dev_priv, &msgs[i], 0);
                 } else {
@@ -525,6 +527,18 @@ clear_err:
                          adapter->name, msgs[i].addr,
                          (msgs[i].flags & I2C_M_RD) ? 'r' : 'w', msgs[i].len);
  
+       /*
+        * Passive adapters sometimes NAK the first probe. Retry the first
+        * message once on -ENXIO for GMBUS transfers; the bit banging algorithm
+        * has retries internally. See also the retry loop in
+        * drm_do_probe_ddc_edid, which bails out on the first -ENXIO.
+        */
+       if (ret == -ENXIO && i == 0 && try++ == 0) {
+               DRM_DEBUG_KMS("GMBUS [%s] NAK on first message, retry\n",
+                             adapter->name);
+               goto retry;
+       }
+
         goto out;
  
  timeout:
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c

index 09df74b8e917b1dac90d460be50d1c4c5152881c..424e6219778712dcaf0e7c5c1ae7c51709fce6ba 100644 (file)
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1134,6 +1134,12 @@ static int gen8_init_common_ring(struct intel_engine_cs *ring)
         I915_WRITE_IMR(ring, ~(ring->irq_enable_mask | ring->irq_keep_mask));
         I915_WRITE(RING_HWSTAM(ring->mmio_base), 0xffffffff);
  
+       if (ring->status_page.obj) {
+               I915_WRITE(RING_HWS_PGA(ring->mmio_base),
+                          (u32)ring->status_page.gfx_addr);
+               POSTING_READ(RING_HWS_PGA(ring->mmio_base));
+       }
+
         I915_WRITE(RING_MODE_GEN7(ring),
                    _MASKED_BIT_DISABLE(GFX_REPLAY_MODE) |
                    _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE));
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c

index 441e2502b88946ff2d7455a9f26cc32faa87d8fc..005b5e04de4d74d13eee87af223c9e22687f6d35 100644 (file)
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -901,13 +901,6 @@ static int chv_init_workarounds(struct intel_engine_cs *ring)
                             GEN6_WIZ_HASHING_MASK,
                             GEN6_WIZ_HASHING_16x4);
  
-       if (INTEL_REVID(dev) == SKL_REVID_C0 ||
-           INTEL_REVID(dev) == SKL_REVID_D0)
-               /* WaBarrierPerformanceFixDisable:skl */
-               WA_SET_BIT_MASKED(HDC_CHICKEN0,
-                                 HDC_FENCE_DEST_SLM_DISABLE |
-                                 HDC_BARRIER_PERFORMANCE_DISABLE);
-
         return 0;
  }
  
@@ -1024,6 +1017,13 @@ static int skl_init_workarounds(struct intel_engine_cs *ring)
                 WA_SET_BIT_MASKED(HIZ_CHICKEN,
                                   BDW_HIZ_POWER_COMPILER_CLOCK_GATING_DISABLE);
  
+       if (INTEL_REVID(dev) == SKL_REVID_C0 ||
+           INTEL_REVID(dev) == SKL_REVID_D0)
+               /* WaBarrierPerformanceFixDisable:skl */
+               WA_SET_BIT_MASKED(HDC_CHICKEN0,
+                                 HDC_FENCE_DEST_SLM_DISABLE |
+                                 HDC_BARRIER_PERFORMANCE_DISABLE);
+
         return skl_tune_iz_hashing(ring);
  }
  
diff --git a/drivers/gpu/drm/i915/intel_sdvo.c b/drivers/gpu/drm/i915/intel_sdvo.c

index e87d2f418de4f381d50471494e5fe8050de4bdb2..987b81f31b0e693cfe7d505b2f66eecc7eac6539 100644 (file)
--- a/drivers/gpu/drm/i915/intel_sdvo.c
+++ b/drivers/gpu/drm/i915/intel_sdvo.c
@@ -2550,7 +2550,7 @@ intel_sdvo_analog_init(struct intel_sdvo *intel_sdvo, int device)
  
         DRM_DEBUG_KMS("initialising analog device %d\n", device);
  
-       intel_sdvo_connector = kzalloc(sizeof(*intel_sdvo_connector), GFP_KERNEL);
+       intel_sdvo_connector = intel_sdvo_connector_alloc();
         if (!intel_sdvo_connector)
                 return false;
  
diff --git a/drivers/gpu/drm/mgag200/mgag200_mode.c b/drivers/gpu/drm/mgag200/mgag200_mode.c

index 6e84df9369a657223d17387ad14929cdf435e238..ad4b9010dfb0bbed135185e9f64aed98c3239a24 100644 (file)
--- a/drivers/gpu/drm/mgag200/mgag200_mode.c
+++ b/drivers/gpu/drm/mgag200/mgag200_mode.c
@@ -1526,6 +1526,11 @@ static int mga_vga_mode_valid(struct drm_connector *connector,
                 return MODE_BANDWIDTH;
         }
  
+       if ((mode->hdisplay % 8) != 0 || (mode->hsync_start % 8) != 0 ||
+           (mode->hsync_end % 8) != 0 || (mode->htotal % 8) != 0) {
+               return MODE_H_ILLEGAL;
+       }
+
         if (mode->crtc_hdisplay > 2048 || mode->crtc_hsync_start > 4096 ||
             mode->crtc_hsync_end > 4096 || mode->crtc_htotal > 4096 ||
             mode->crtc_vdisplay > 2048 || mode->crtc_vsync_start > 4096 ||
diff --git a/drivers/gpu/drm/radeon/atombios_crtc.c b/drivers/gpu/drm/radeon/atombios_crtc.c

index e597ffc265633ef7439b2247301333a3affaeaea..dac78ad24b31558aa53d917fb802865b6a122b61 100644 (file)
--- a/drivers/gpu/drm/radeon/atombios_crtc.c
+++ b/drivers/gpu/drm/radeon/atombios_crtc.c
@@ -580,9 +580,6 @@ static u32 atombios_adjust_pll(struct drm_crtc *crtc,
                 else
                         radeon_crtc->pll_flags |= RADEON_PLL_PREFER_LOW_REF_DIV;
  
-               /* if there is no audio, set MINM_OVER_MAXP  */
-               if (!drm_detect_monitor_audio(radeon_connector_edid(connector)))
-                       radeon_crtc->pll_flags |= RADEON_PLL_PREFER_MINM_OVER_MAXP;
                 if (rdev->family < CHIP_RV770)
                         radeon_crtc->pll_flags |= RADEON_PLL_PREFER_MINM_OVER_MAXP;
                 /* use frac fb div on APUs */
@@ -1798,9 +1795,7 @@ static int radeon_get_shared_nondp_ppll(struct drm_crtc *crtc)
                         if ((crtc->mode.clock == test_crtc->mode.clock) &&
                             (adjusted_clock == test_adjusted_clock) &&
                             (radeon_crtc->ss_enabled == test_radeon_crtc->ss_enabled) &&
-                           (test_radeon_crtc->pll_id != ATOM_PPLL_INVALID) &&
-                           (drm_detect_monitor_audio(radeon_connector_edid(test_radeon_crtc->connector)) ==
-                            drm_detect_monitor_audio(radeon_connector_edid(radeon_crtc->connector))))
+                           (test_radeon_crtc->pll_id != ATOM_PPLL_INVALID))
                                 return test_radeon_crtc->pll_id;
                 }
         }
diff --git a/drivers/gpu/drm/radeon/dce3_1_afmt.c b/drivers/gpu/drm/radeon/dce3_1_afmt.c

index f04205170b8a5942d73437ada72437bc18d028a8..cfa3a84a2af03c100741cb7e5b352781adf60b00 100644 (file)
--- a/drivers/gpu/drm/radeon/dce3_1_afmt.c
+++ b/drivers/gpu/drm/radeon/dce3_1_afmt.c
@@ -173,7 +173,7 @@ void dce3_2_hdmi_update_acr(struct drm_encoder *encoder, long offset,
         struct drm_device *dev = encoder->dev;
         struct radeon_device *rdev = dev->dev_private;
  
-       WREG32(HDMI0_ACR_PACKET_CONTROL + offset,
+       WREG32(DCE3_HDMI0_ACR_PACKET_CONTROL + offset,
                 HDMI0_ACR_SOURCE |              /* select SW CTS value */
                 HDMI0_ACR_AUTO_SEND);   /* allow hw to sent ACR packets when required */
  
diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c

index b7ca4c51462120fab3ab146dd74f653e8bcb91cb..a7fdfa4f0857b3a416e67d79007a1da731455b80 100644 (file)
--- a/drivers/gpu/drm/radeon/radeon_device.c
+++ b/drivers/gpu/drm/radeon/radeon_device.c
@@ -1463,6 +1463,21 @@ int radeon_device_init(struct radeon_device *rdev,
         if (r)
                 DRM_ERROR("ib ring test failed (%d).\n", r);
  
+       /*
+        * Turks/Thames GPUs will freeze the whole laptop if DPM is not
+        * restarted after the CP ring has chewed through at least one packet.
+        * Hence we stop and restart DPM here, after radeon_ib_ring_tests().
+        */
+       if (rdev->pm.dpm_enabled &&
+           (rdev->pm.pm_method == PM_METHOD_DPM) &&
+           (rdev->family == CHIP_TURKS) &&
+           (rdev->flags & RADEON_IS_MOBILITY)) {
+               mutex_lock(&rdev->pm.mutex);
+               radeon_dpm_disable(rdev);
+               radeon_dpm_enable(rdev);
+               mutex_unlock(&rdev->pm.mutex);
+       }
+
         if ((radeon_testing & 1)) {
                 if (rdev->accel_working)
                         radeon_test_moves(rdev);
diff --git a/drivers/gpu/drm/radeon/radeon_dp_mst.c b/drivers/gpu/drm/radeon/radeon_dp_mst.c

index 2b98ed3e684d706a07e3c43b6da9f2232143e580..257b10be5cda902861339d9fde17c38e4f238d06 100644 (file)
--- a/drivers/gpu/drm/radeon/radeon_dp_mst.c
+++ b/drivers/gpu/drm/radeon/radeon_dp_mst.c
@@ -663,12 +663,17 @@ int
  radeon_dp_mst_probe(struct radeon_connector *radeon_connector)
  {
         struct radeon_connector_atom_dig *dig_connector = radeon_connector->con_priv;
+       struct drm_device *dev = radeon_connector->base.dev;
+       struct radeon_device *rdev = dev->dev_private;
         int ret;
         u8 msg[1];
  
         if (!radeon_mst)
                 return 0;
  
+       if (!ASIC_IS_DCE5(rdev))
+               return 0;
+
         if (dig_connector->dpcd[DP_DPCD_REV] < 0x12)
                 return 0;
  
diff --git a/drivers/gpu/drm/radeon/radeon_kms.c b/drivers/gpu/drm/radeon/radeon_kms.c

index 7b2a7335cc5d557eafa6864d50cb6ebc9cdfb5ff..b0acf50d95581d9970cef89690be25b32324c7b3 100644 (file)
--- a/drivers/gpu/drm/radeon/radeon_kms.c
+++ b/drivers/gpu/drm/radeon/radeon_kms.c
@@ -576,6 +576,9 @@ static int radeon_info_ioctl(struct drm_device *dev, void *data, struct drm_file
                 if (radeon_get_allowed_info_register(rdev, *value, value))
                         return -EINVAL;
                 break;
+       case RADEON_INFO_VA_UNMAP_WORKING:
+               *value = true;
+               break;
         default:
                 DRM_DEBUG_KMS("Invalid request %d\n", info->request);
                 return -EINVAL;
diff --git a/drivers/gpu/drm/radeon/radeon_vm.c b/drivers/gpu/drm/radeon/radeon_vm.c

index de42fc4a22b869296ff44c85c859678c6155ddd7..9c3377ca17b75ecd2092e4fd78a2238c126d88f1 100644 (file)
--- a/drivers/gpu/drm/radeon/radeon_vm.c
+++ b/drivers/gpu/drm/radeon/radeon_vm.c
@@ -458,14 +458,16 @@ int radeon_vm_bo_set_addr(struct radeon_device *rdev,
                 /* make sure object fit at this offset */
                 eoffset = soffset + size;
                 if (soffset >= eoffset) {
-                       return -EINVAL;
+                       r = -EINVAL;
+                       goto error_unreserve;
                 }
  
                 last_pfn = eoffset / RADEON_GPU_PAGE_SIZE;
                 if (last_pfn > rdev->vm_manager.max_pfn) {
                         dev_err(rdev->dev, "va above limit (0x%08X > 0x%08X)\n",
                                 last_pfn, rdev->vm_manager.max_pfn);
-                       return -EINVAL;
+                       r = -EINVAL;
+                       goto error_unreserve;
                 }
  
         } else {
@@ -486,7 +488,8 @@ int radeon_vm_bo_set_addr(struct radeon_device *rdev,
                                 "(bo %p 0x%010lx 0x%010lx)\n", bo_va->bo,
                                 soffset, tmp->bo, tmp->it.start, tmp->it.last);
                         mutex_unlock(&vm->mutex);
-                       return -EINVAL;
+                       r = -EINVAL;
+                       goto error_unreserve;
                 }
         }
  
@@ -497,7 +500,8 @@ int radeon_vm_bo_set_addr(struct radeon_device *rdev,
                         tmp = kzalloc(sizeof(struct radeon_bo_va), GFP_KERNEL);
                         if (!tmp) {
                                 mutex_unlock(&vm->mutex);
-                               return -ENOMEM;
+                               r = -ENOMEM;
+                               goto error_unreserve;
                         }
                         tmp->it.start = bo_va->it.start;
                         tmp->it.last = bo_va->it.last;
@@ -555,7 +559,6 @@ int radeon_vm_bo_set_addr(struct radeon_device *rdev,
                 r = radeon_vm_clear_bo(rdev, pt);
                 if (r) {
                         radeon_bo_unref(&pt);
-                       radeon_bo_reserve(bo_va->bo, false);
                         return r;
                 }
  
@@ -575,6 +578,10 @@ int radeon_vm_bo_set_addr(struct radeon_device *rdev,
  
         mutex_unlock(&vm->mutex);
         return 0;
+
+error_unreserve:
+       radeon_bo_unreserve(bo_va->bo);
+       return r;
  }
  
  /**
diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c

index ed303ba3a59393533a1f0d06b95abc53a5febcdd..3e03379e7c5d92c0191af13883c4e440dbc2cc9c 100644 (file)
--- a/drivers/hwmon/coretemp.c
+++ b/drivers/hwmon/coretemp.c
@@ -63,7 +63,8 @@ MODULE_PARM_DESC(tjmax, "TjMax value in degrees Celsius");
  #define TO_ATTR_NO(cpu)                (TO_CORE_ID(cpu) + BASE_SYSFS_ATTR_NO)
  
  #ifdef CONFIG_SMP
-#define for_each_sibling(i, cpu)       for_each_cpu(i, cpu_sibling_mask(cpu))
+#define for_each_sibling(i, cpu) \
+       for_each_cpu(i, topology_sibling_cpumask(cpu))
  #else
  #define for_each_sibling(i, cpu)       for (i = 0; false; )
  #endif
diff --git a/drivers/i2c/busses/i2c-hix5hd2.c b/drivers/i2c/busses/i2c-hix5hd2.c

index 8fe78d08e01cf1551ea0eaf53f50d2185dfff809..7c6966434ee7b9a2707da849e56128af41c0baf5 100644 (file)
--- a/drivers/i2c/busses/i2c-hix5hd2.c
+++ b/drivers/i2c/busses/i2c-hix5hd2.c
@@ -554,4 +554,4 @@ module_platform_driver(hix5hd2_i2c_driver);
  MODULE_DESCRIPTION("Hix5hd2 I2C Bus driver");
  MODULE_AUTHOR("Wei Yan <sledge.yanwei@huawei.com>");
  MODULE_LICENSE("GPL");
-MODULE_ALIAS("platform:i2c-hix5hd2");
+MODULE_ALIAS("platform:hix5hd2-i2c");
diff --git a/drivers/i2c/busses/i2c-s3c2410.c b/drivers/i2c/busses/i2c-s3c2410.c

index 958c8db4ec30740e2d9aae00a7835256700d3424..297e9c9ac9432f5e645e06cf932710cd93c7f924 100644 (file)
--- a/drivers/i2c/busses/i2c-s3c2410.c
+++ b/drivers/i2c/busses/i2c-s3c2410.c
@@ -1143,6 +1143,7 @@ static int s3c24xx_i2c_probe(struct platform_device *pdev)
                 return -ENOMEM;
  
         i2c->quirks = s3c24xx_get_device_quirks(pdev);
+       i2c->sysreg = ERR_PTR(-ENOENT);
         if (pdata)
                 memcpy(i2c->pdata, pdata, sizeof(*pdata));
         else
diff --git a/drivers/iio/adc/twl6030-gpadc.c b/drivers/iio/adc/twl6030-gpadc.c

index 89d8aa1d2818502f974c92f7925ea4440df3d97d..df12c57e6ce07a700d211b81c9b5d3c9c15ff2d3 100644 (file)
--- a/drivers/iio/adc/twl6030-gpadc.c
+++ b/drivers/iio/adc/twl6030-gpadc.c
@@ -1001,7 +1001,7 @@ static struct platform_driver twl6030_gpadc_driver = {
  
  module_platform_driver(twl6030_gpadc_driver);
  
-MODULE_ALIAS("platform: " DRIVER_NAME);
+MODULE_ALIAS("platform:" DRIVER_NAME);
  MODULE_AUTHOR("Balaji T K <balajitk@ti.com>");
  MODULE_AUTHOR("Graeme Gregory <gg@slimlogic.co.uk>");
  MODULE_AUTHOR("Oleksandr Kozaruk <oleksandr.kozaruk@ti.com");
diff --git a/drivers/iio/imu/adis16400.h b/drivers/iio/imu/adis16400.h

index 0916bf6b6c311c503931f26387712c6677b17645..73b189c1c0fb0fdcc73d64b7118ab4f79a12d41b 100644 (file)
--- a/drivers/iio/imu/adis16400.h
+++ b/drivers/iio/imu/adis16400.h
@@ -139,6 +139,7 @@
  #define ADIS16400_NO_BURST             BIT(1)
  #define ADIS16400_HAS_SLOW_MODE                BIT(2)
  #define ADIS16400_HAS_SERIAL_NUMBER    BIT(3)
+#define ADIS16400_BURST_DIAG_STAT      BIT(4)
  
  struct adis16400_state;
  
@@ -165,6 +166,7 @@ struct adis16400_state {
         int                             filt_int;
  
         struct adis adis;
+       unsigned long avail_scan_mask[2];
  };
  
  /* At the moment triggers are only used for ring buffer
diff --git a/drivers/iio/imu/adis16400_buffer.c b/drivers/iio/imu/adis16400_buffer.c

index 6e727ffe52621f43bb40f31466730705477961ef..90c24a23c679b8001e31cdff48b098cacb872682 100644 (file)
--- a/drivers/iio/imu/adis16400_buffer.c
+++ b/drivers/iio/imu/adis16400_buffer.c
@@ -18,7 +18,8 @@ int adis16400_update_scan_mode(struct iio_dev *indio_dev,
  {
         struct adis16400_state *st = iio_priv(indio_dev);
         struct adis *adis = &st->adis;
-       uint16_t *tx;
+       unsigned int burst_length;
+       u8 *tx;
  
         if (st->variant->flags & ADIS16400_NO_BURST)
                 return adis_update_scan_mode(indio_dev, scan_mask);
@@ -26,26 +27,29 @@ int adis16400_update_scan_mode(struct iio_dev *indio_dev,
         kfree(adis->xfer);
         kfree(adis->buffer);
  
+       /* All but the timestamp channel */
+       burst_length = (indio_dev->num_channels - 1) * sizeof(u16);
+       if (st->variant->flags & ADIS16400_BURST_DIAG_STAT)
+               burst_length += sizeof(u16);
+
         adis->xfer = kcalloc(2, sizeof(*adis->xfer), GFP_KERNEL);
         if (!adis->xfer)
                 return -ENOMEM;
  
-       adis->buffer = kzalloc(indio_dev->scan_bytes + sizeof(u16),
-               GFP_KERNEL);
+       adis->buffer = kzalloc(burst_length + sizeof(u16), GFP_KERNEL);
         if (!adis->buffer)
                 return -ENOMEM;
  
-       tx = adis->buffer + indio_dev->scan_bytes;
-
+       tx = adis->buffer + burst_length;
         tx[0] = ADIS_READ_REG(ADIS16400_GLOB_CMD);
         tx[1] = 0;
  
         adis->xfer[0].tx_buf = tx;
         adis->xfer[0].bits_per_word = 8;
         adis->xfer[0].len = 2;
-       adis->xfer[1].tx_buf = tx;
+       adis->xfer[1].rx_buf = adis->buffer;
         adis->xfer[1].bits_per_word = 8;
-       adis->xfer[1].len = indio_dev->scan_bytes;
+       adis->xfer[1].len = burst_length;
  
         spi_message_init(&adis->msg);
         spi_message_add_tail(&adis->xfer[0], &adis->msg);
@@ -61,6 +65,7 @@ irqreturn_t adis16400_trigger_handler(int irq, void *p)
         struct adis16400_state *st = iio_priv(indio_dev);
         struct adis *adis = &st->adis;
         u32 old_speed_hz = st->adis.spi->max_speed_hz;
+       void *buffer;
         int ret;
  
         if (!adis->buffer)
@@ -81,7 +86,12 @@ irqreturn_t adis16400_trigger_handler(int irq, void *p)
                 spi_setup(st->adis.spi);
         }
  
-       iio_push_to_buffers_with_timestamp(indio_dev, adis->buffer,
+       if (st->variant->flags & ADIS16400_BURST_DIAG_STAT)
+               buffer = adis->buffer + sizeof(u16);
+       else
+               buffer = adis->buffer;
+
+       iio_push_to_buffers_with_timestamp(indio_dev, buffer,
                 pf->timestamp);
  
         iio_trigger_notify_done(indio_dev->trig);
diff --git a/drivers/iio/imu/adis16400_core.c b/drivers/iio/imu/adis16400_core.c

index fa795dcd5f75ec0a1e8de143bc0122ef36bf9409..2fd68f2219a7d422a604b91ce90138f1050528cd 100644 (file)
--- a/drivers/iio/imu/adis16400_core.c
+++ b/drivers/iio/imu/adis16400_core.c
@@ -405,6 +405,11 @@ static int adis16400_read_raw(struct iio_dev *indio_dev,
                         *val = st->variant->temp_scale_nano / 1000000;
                         *val2 = (st->variant->temp_scale_nano % 1000000);
                         return IIO_VAL_INT_PLUS_MICRO;
+               case IIO_PRESSURE:
+                       /* 20 uBar = 0.002kPascal */
+                       *val = 0;
+                       *val2 = 2000;
+                       return IIO_VAL_INT_PLUS_MICRO;
                 default:
                         return -EINVAL;
                 }
@@ -454,10 +459,10 @@ static int adis16400_read_raw(struct iio_dev *indio_dev,
         }
  }
  
-#define ADIS16400_VOLTAGE_CHAN(addr, bits, name, si) { \
+#define ADIS16400_VOLTAGE_CHAN(addr, bits, name, si, chn) { \
         .type = IIO_VOLTAGE, \
         .indexed = 1, \
-       .channel = 0, \
+       .channel = chn, \
         .extend_name = name, \
         .info_mask_separate = BIT(IIO_CHAN_INFO_RAW) | \
                 BIT(IIO_CHAN_INFO_SCALE), \
@@ -474,10 +479,10 @@ static int adis16400_read_raw(struct iio_dev *indio_dev,
  }
  
  #define ADIS16400_SUPPLY_CHAN(addr, bits) \
-       ADIS16400_VOLTAGE_CHAN(addr, bits, "supply", ADIS16400_SCAN_SUPPLY)
+       ADIS16400_VOLTAGE_CHAN(addr, bits, "supply", ADIS16400_SCAN_SUPPLY, 0)
  
  #define ADIS16400_AUX_ADC_CHAN(addr, bits) \
-       ADIS16400_VOLTAGE_CHAN(addr, bits, NULL, ADIS16400_SCAN_ADC)
+       ADIS16400_VOLTAGE_CHAN(addr, bits, NULL, ADIS16400_SCAN_ADC, 1)
  
  #define ADIS16400_GYRO_CHAN(mod, addr, bits) { \
         .type = IIO_ANGL_VEL, \
@@ -773,7 +778,8 @@ static struct adis16400_chip_info adis16400_chips[] = {
                 .channels = adis16448_channels,
                 .num_channels = ARRAY_SIZE(adis16448_channels),
                 .flags = ADIS16400_HAS_PROD_ID |
-                               ADIS16400_HAS_SERIAL_NUMBER,
+                               ADIS16400_HAS_SERIAL_NUMBER |
+                               ADIS16400_BURST_DIAG_STAT,
                 .gyro_scale_micro = IIO_DEGREE_TO_RAD(10000), /* 0.01 deg/s */
                 .accel_scale_micro = IIO_G_TO_M_S_2(833), /* 1/1200 g */
                 .temp_scale_nano = 73860000, /* 0.07386 C */
@@ -791,11 +797,6 @@ static const struct iio_info adis16400_info = {
         .debugfs_reg_access = adis_debugfs_reg_access,
  };
  
-static const unsigned long adis16400_burst_scan_mask[] = {
-       ~0UL,
-       0,
-};
-
  static const char * const adis16400_status_error_msgs[] = {
         [ADIS16400_DIAG_STAT_ZACCL_FAIL] = "Z-axis accelerometer self-test failure",
         [ADIS16400_DIAG_STAT_YACCL_FAIL] = "Y-axis accelerometer self-test failure",
@@ -843,6 +844,20 @@ static const struct adis_data adis16400_data = {
                 BIT(ADIS16400_DIAG_STAT_POWER_LOW),
  };
  
+static void adis16400_setup_chan_mask(struct adis16400_state *st)
+{
+       const struct adis16400_chip_info *chip_info = st->variant;
+       unsigned i;
+
+       for (i = 0; i < chip_info->num_channels; i++) {
+               const struct iio_chan_spec *ch = &chip_info->channels[i];
+
+               if (ch->scan_index >= 0 &&
+                   ch->scan_index != ADIS16400_SCAN_TIMESTAMP)
+                       st->avail_scan_mask[0] |= BIT(ch->scan_index);
+       }
+}
+
  static int adis16400_probe(struct spi_device *spi)
  {
         struct adis16400_state *st;
@@ -866,8 +881,10 @@ static int adis16400_probe(struct spi_device *spi)
         indio_dev->info = &adis16400_info;
         indio_dev->modes = INDIO_DIRECT_MODE;
  
-       if (!(st->variant->flags & ADIS16400_NO_BURST))
-               indio_dev->available_scan_masks = adis16400_burst_scan_mask;
+       if (!(st->variant->flags & ADIS16400_NO_BURST)) {
+               adis16400_setup_chan_mask(st);
+               indio_dev->available_scan_masks = st->avail_scan_mask;
+       }
  
         ret = adis_init(&st->adis, indio_dev, spi, &adis16400_data);
         if (ret)
diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c

index 3f40319a55da364f2e757acb7bc0e83d86c78c38..575a072d765f65cc49190a3066218759bd3569cd 100644 (file)
--- a/drivers/infiniband/ulp/isert/ib_isert.c
+++ b/drivers/infiniband/ulp/isert/ib_isert.c
@@ -65,6 +65,8 @@ static int
  isert_rdma_accept(struct isert_conn *isert_conn);
  struct rdma_cm_id *isert_setup_id(struct isert_np *isert_np);
  
+static void isert_release_work(struct work_struct *work);
+
  static inline bool
  isert_prot_cmd(struct isert_conn *conn, struct se_cmd *cmd)
  {
@@ -648,6 +650,7 @@ isert_init_conn(struct isert_conn *isert_conn)
         mutex_init(&isert_conn->mutex);
         spin_lock_init(&isert_conn->pool_lock);
         INIT_LIST_HEAD(&isert_conn->fr_pool);
+       INIT_WORK(&isert_conn->release_work, isert_release_work);
  }
  
  static void
@@ -925,6 +928,7 @@ isert_disconnected_handler(struct rdma_cm_id *cma_id,
  {
         struct isert_np *isert_np = cma_id->context;
         struct isert_conn *isert_conn;
+       bool terminating = false;
  
         if (isert_np->np_cm_id == cma_id)
                 return isert_np_cma_handler(cma_id->context, event);
@@ -932,12 +936,25 @@ isert_disconnected_handler(struct rdma_cm_id *cma_id,
         isert_conn = cma_id->qp->qp_context;
  
         mutex_lock(&isert_conn->mutex);
+       terminating = (isert_conn->state == ISER_CONN_TERMINATING);
         isert_conn_terminate(isert_conn);
         mutex_unlock(&isert_conn->mutex);
  
         isert_info("conn %p completing wait\n", isert_conn);
         complete(&isert_conn->wait);
  
+       if (terminating)
+               goto out;
+
+       mutex_lock(&isert_np->np_accept_mutex);
+       if (!list_empty(&isert_conn->accept_node)) {
+               list_del_init(&isert_conn->accept_node);
+               isert_put_conn(isert_conn);
+               queue_work(isert_release_wq, &isert_conn->release_work);
+       }
+       mutex_unlock(&isert_np->np_accept_mutex);
+
+out:
         return 0;
  }
  
@@ -2380,7 +2397,6 @@ isert_build_rdma_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd,
         page_off = offset % PAGE_SIZE;
  
         send_wr->sg_list = ib_sge;
-       send_wr->num_sge = sg_nents;
         send_wr->wr_id = (uintptr_t)&isert_cmd->tx_desc;
         /*
          * Perform mapping of TCM scatterlist memory ib_sge dma_addr.
@@ -2400,14 +2416,17 @@ isert_build_rdma_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd,
                           ib_sge->addr, ib_sge->length, ib_sge->lkey);
                 page_off = 0;
                 data_left -= ib_sge->length;
+               if (!data_left)
+                       break;
                 ib_sge++;
                 isert_dbg("Incrementing ib_sge pointer to %p\n", ib_sge);
         }
  
+       send_wr->num_sge = ++i;
         isert_dbg("Set outgoing sg_list: %p num_sg: %u from TCM SGLs\n",
                   send_wr->sg_list, send_wr->num_sge);
  
-       return sg_nents;
+       return send_wr->num_sge;
  }
  
  static int
@@ -3366,7 +3385,6 @@ static void isert_wait_conn(struct iscsi_conn *conn)
         isert_wait4flush(isert_conn);
         isert_wait4logout(isert_conn);
  
-       INIT_WORK(&isert_conn->release_work, isert_release_work);
         queue_work(isert_release_wq, &isert_conn->release_work);
  }
  
@@ -3374,6 +3392,7 @@ static void isert_free_conn(struct iscsi_conn *conn)
  {
         struct isert_conn *isert_conn = conn->context;
  
+       isert_wait4flush(isert_conn);
         isert_put_conn(isert_conn);
  }
  
diff --git a/drivers/input/mouse/alps.c b/drivers/input/mouse/alps.c

index 7752bd59d4b7d529218dad186c155a91a9922ad1..a353b7de6d22e91a52378cd4c106b17cafc26a07 100644 (file)
--- a/drivers/input/mouse/alps.c
+++ b/drivers/input/mouse/alps.c
@@ -1063,9 +1063,8 @@ static void alps_process_trackstick_packet_v7(struct psmouse *psmouse)
         right = (packet[1] & 0x02) >> 1;
         middle = (packet[1] & 0x04) >> 2;
  
-       /* Divide 2 since trackpoint's speed is too fast */
-       input_report_rel(dev2, REL_X, (char)x / 2);
-       input_report_rel(dev2, REL_Y, -((char)y / 2));
+       input_report_rel(dev2, REL_X, (char)x);
+       input_report_rel(dev2, REL_Y, -((char)y));
  
         input_report_key(dev2, BTN_LEFT, left);
         input_report_key(dev2, BTN_RIGHT, right);
diff --git a/drivers/input/mouse/elantech.c b/drivers/input/mouse/elantech.c

index 79363b6871959ec2b74c24f4b9e9b89b172d950d..ce3d40004458c87392339472f654462fae7cf0bc 100644 (file)
--- a/drivers/input/mouse/elantech.c
+++ b/drivers/input/mouse/elantech.c
@@ -1376,10 +1376,11 @@ static bool elantech_is_signature_valid(const unsigned char *param)
                 return true;
  
         /*
-        * Some models have a revision higher then 20. Meaning param[2] may
-        * be 10 or 20, skip the rates check for these.
+        * Some hw_version >= 4 models have a revision higher than 20, meaning
+        * that param[2] may be 10 or 20; skip the rates check for these.
          */
-       if (param[0] == 0x46 && (param[1] & 0xef) == 0x0f && param[2] < 40)
+       if ((param[0] & 0x0f) >= 0x06 && (param[1] & 0xaf) == 0x0f &&
+           param[2] < 40)
                 return true;
  
         for (i = 0; i < ARRAY_SIZE(rates); i++)
@@ -1555,6 +1556,7 @@ static int elantech_set_properties(struct elantech_data *etd)
                 case 9:
                 case 10:
                 case 13:
+               case 14:
                         etd->hw_version = 4;
                         break;
                 default:
diff --git a/drivers/input/mouse/synaptics.c b/drivers/input/mouse/synaptics.c

index 630af73e98c488a5e266e4ccb6eed5dba622f3d3..35c8d0ceabeebf989b8eeff5cd54ee8f3ac2e247 100644 (file)
--- a/drivers/input/mouse/synaptics.c
+++ b/drivers/input/mouse/synaptics.c
@@ -150,6 +150,11 @@ static const struct min_max_quirk min_max_pnpid_table[] = {
                 {ANY_BOARD_ID, 2961},
                 1024, 5112, 2024, 4832
         },
+       {
+               (const char * const []){"LEN2000", NULL},
+               {ANY_BOARD_ID, ANY_BOARD_ID},
+               1024, 5113, 2021, 4832
+       },
         {
                 (const char * const []){"LEN2001", NULL},
                 {ANY_BOARD_ID, ANY_BOARD_ID},
@@ -191,7 +196,7 @@ static const char * const topbuttonpad_pnp_ids[] = {
         "LEN0045",
         "LEN0047",
         "LEN0049",
-       "LEN2000",
+       "LEN2000", /* S540 */
         "LEN2001", /* Edge E431 */
         "LEN2002", /* Edge E531 */
         "LEN2003",
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c

index cbe8c1f28a95d710143e35f74f6e902eb4037ea9..fffea87a014f9ecf58ce9f2d55901b7d83a8bab9 100644 (file)
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -2931,6 +2931,7 @@ static void *alloc_coherent(struct device *dev, size_t size,
         size      = PAGE_ALIGN(size);
         dma_mask  = dev->coherent_dma_mask;
         flag     &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
+       flag     |= __GFP_ZERO;
  
         page = alloc_pages(flag | __GFP_NOWARN,  get_order(size));
         if (!page) {
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c

index 68d43beccb7e560f845ad49b8ae7d9e38872fcf7..5ecfaf29933ad4634e2124544e3c800b9b309d44 100644 (file)
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -422,6 +422,14 @@ static int dmar_map_gfx = 1;
  static int dmar_forcedac;
  static int intel_iommu_strict;
  static int intel_iommu_superpage = 1;
+static int intel_iommu_ecs = 1;
+
+/* We only actually use ECS when PASID support (on the new bit 40)
+ * is also advertised. Some early implementations — the ones with
+ * PASID support on bit 28 — have issues even when we *only* use
+ * extended root/context tables. */
+#define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
+                           ecap_pasid(iommu->ecap))
  
  int intel_iommu_gfx_mapped;
  EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
@@ -465,6 +473,10 @@ static int __init intel_iommu_setup(char *str)
                         printk(KERN_INFO
                                 "Intel-IOMMU: disable supported super page\n");
                         intel_iommu_superpage = 0;
+               } else if (!strncmp(str, "ecs_off", 7)) {
+                       printk(KERN_INFO
+                               "Intel-IOMMU: disable extended context table support\n");
+                       intel_iommu_ecs = 0;
                 }
  
                 str += strcspn(str, ",");
@@ -669,7 +681,7 @@ static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu
         struct context_entry *context;
         u64 *entry;
  
-       if (ecap_ecs(iommu->ecap)) {
+       if (ecs_enabled(iommu)) {
                 if (devfn >= 0x80) {
                         devfn -= 0x80;
                         entry = &root->hi;
@@ -696,6 +708,11 @@ static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu
         return &context[devfn];
  }
  
+static int iommu_dummy(struct device *dev)
+{
+       return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
+}
+
  static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
  {
         struct dmar_drhd_unit *drhd = NULL;
@@ -705,6 +722,9 @@ static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devf
         u16 segment = 0;
         int i;
  
+       if (iommu_dummy(dev))
+               return NULL;
+
         if (dev_is_pci(dev)) {
                 pdev = to_pci_dev(dev);
                 segment = pci_domain_nr(pdev->bus);
@@ -798,7 +818,7 @@ static void free_context_table(struct intel_iommu *iommu)
                 if (context)
                         free_pgtable_page(context);
  
-               if (!ecap_ecs(iommu->ecap))
+               if (!ecs_enabled(iommu))
                         continue;
  
                 context = iommu_context_addr(iommu, i, 0x80, 0);
@@ -1133,7 +1153,7 @@ static void iommu_set_root_entry(struct intel_iommu *iommu)
         unsigned long flag;
  
         addr = virt_to_phys(iommu->root_entry);
-       if (ecap_ecs(iommu->ecap))
+       if (ecs_enabled(iommu))
                 addr |= DMA_RTADDR_RTT;
  
         raw_spin_lock_irqsave(&iommu->register_lock, flag);
@@ -2969,11 +2989,6 @@ static inline struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
         return __get_valid_domain_for_dev(dev);
  }
  
-static int iommu_dummy(struct device *dev)
-{
-       return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
-}
-
  /* Check if the dev needs to go through non-identity map and unmap process.*/
  static int iommu_no_mapping(struct device *dev)
  {
diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c

index 57f09cb544644bcd97aa81bfc044c2686b350bbb..269c2354c43169307aa02438dbf38aa4b54f0dad 100644 (file)
--- a/drivers/irqchip/irq-mips-gic.c
+++ b/drivers/irqchip/irq-mips-gic.c
@@ -271,7 +271,7 @@ int gic_get_c0_fdc_int(void)
                                   GIC_LOCAL_TO_HWIRQ(GIC_LOCAL_INT_FDC));
  }
  
-static void gic_handle_shared_int(void)
+static void gic_handle_shared_int(bool chained)
  {
         unsigned int i, intr, virq;
         unsigned long *pcpu_mask;
@@ -299,7 +299,10 @@ static void gic_handle_shared_int(void)
         while (intr != gic_shared_intrs) {
                 virq = irq_linear_revmap(gic_irq_domain,
                                          GIC_SHARED_TO_HWIRQ(intr));
-               do_IRQ(virq);
+               if (chained)
+                       generic_handle_irq(virq);
+               else
+                       do_IRQ(virq);
  
                 /* go to next pending bit */
                 bitmap_clear(pending, intr, 1);
@@ -431,7 +434,7 @@ static struct irq_chip gic_edge_irq_controller = {
  #endif
  };
  
-static void gic_handle_local_int(void)
+static void gic_handle_local_int(bool chained)
  {
         unsigned long pending, masked;
         unsigned int intr, virq;
@@ -445,7 +448,10 @@ static void gic_handle_local_int(void)
         while (intr != GIC_NUM_LOCAL_INTRS) {
                 virq = irq_linear_revmap(gic_irq_domain,
                                          GIC_LOCAL_TO_HWIRQ(intr));
-               do_IRQ(virq);
+               if (chained)
+                       generic_handle_irq(virq);
+               else
+                       do_IRQ(virq);
  
                 /* go to next pending bit */
                 bitmap_clear(&pending, intr, 1);
@@ -509,13 +515,14 @@ static struct irq_chip gic_all_vpes_local_irq_controller = {
  
  static void __gic_irq_dispatch(void)
  {
-       gic_handle_local_int();
-       gic_handle_shared_int();
+       gic_handle_local_int(false);
+       gic_handle_shared_int(false);
  }
  
  static void gic_irq_dispatch(unsigned int irq, struct irq_desc *desc)
  {
-       __gic_irq_dispatch();
+       gic_handle_local_int(true);
+       gic_handle_shared_int(true);
  }
  
  #ifdef CONFIG_MIPS_GIC_IPI
diff --git a/drivers/irqchip/irq-sunxi-nmi.c b/drivers/irqchip/irq-sunxi-nmi.c

index 4a9ce5b50c5bba33b7428a0b67b88d26e31c4067..6b2b582433bde95062e85d17403e4a505c5a4ef9 100644 (file)
--- a/drivers/irqchip/irq-sunxi-nmi.c
+++ b/drivers/irqchip/irq-sunxi-nmi.c
@@ -104,7 +104,7 @@ static int sunxi_sc_nmi_set_type(struct irq_data *data, unsigned int flow_type)
         irqd_set_trigger_type(data, flow_type);
         irq_setup_alt_chip(data, flow_type);
  
-       for (i = 0; i <= gc->num_ct; i++, ct++)
+       for (i = 0; i < gc->num_ct; i++, ct++)
                 if (ct->type & flow_type)
                         ctrl_off = ct->regs.type;
  
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c

index 30f2aef69d787d7245b3e91c53b98a0a0216cdb9..6a4cd771a2be62b4172cc26a178ca85fbf7e6d27 100644 (file)
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -46,7 +46,7 @@
  #include <asm/setup.h>
  #include <asm/lguest.h>
  #include <asm/uaccess.h>
-#include <asm/i387.h>
+#include <asm/fpu/internal.h>
  #include <asm/tlbflush.h>
  #include "../lg.h"
  
@@ -251,7 +251,7 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)
          * we set it now, so we can trap and pass that trap to the Guest if it
          * uses the FPU.
          */
-       if (cpu->ts && user_has_fpu())
+       if (cpu->ts && fpregs_active())
                 stts();
  
         /*
@@ -283,7 +283,7 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)
                 wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
  
         /* Clear the host TS bit if it was set above. */
-       if (cpu->ts && user_has_fpu())
+       if (cpu->ts && fpregs_active())
                 clts();
  
         /*
@@ -297,12 +297,12 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)
         /*
          * Similarly, if we took a trap because the Guest used the FPU,
          * we have to restore the FPU it expects to see.
-        * math_state_restore() may sleep and we may even move off to
+        * fpu__restore() may sleep and we may even move off to
          * a different CPU. So all the critical stuff should be done
          * before this.
          */
-       else if (cpu->regs->trapnum == 7 && !user_has_fpu())
-               math_state_restore();
+       else if (cpu->regs->trapnum == 7 && !fpregs_active())
+               fpu__restore(&current->thread.fpu);
  }
  
  /*H:130
diff --git a/drivers/md/md.c b/drivers/md/md.c

index 27506302eb7aa42557bfc01547274957ccbace50..4dbed4a67aaf40e3c04bde925870c24d13cd1b4e 100644 (file)
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3834,7 +3834,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
                                 err = -EBUSY;
                 }
                 spin_unlock(&mddev->lock);
-               return err;
+               return err ?: len;
         }
         err = mddev_lock(mddev);
         if (err)
@@ -4217,13 +4217,14 @@ action_store(struct mddev *mddev, const char *page, size_t len)
                         set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
                 else
                         clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
-               flush_workqueue(md_misc_wq);
-               if (mddev->sync_thread) {
-                       set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-                       if (mddev_lock(mddev) == 0) {
+               if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
+                   mddev_lock(mddev) == 0) {
+                       flush_workqueue(md_misc_wq);
+                       if (mddev->sync_thread) {
+                               set_bit(MD_RECOVERY_INTR, &mddev->recovery);
                                 md_reap_sync_thread(mddev);
-                               mddev_unlock(mddev);
                         }
+                       mddev_unlock(mddev);
                 }
         } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
                    test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
@@ -8261,6 +8262,7 @@ void md_reap_sync_thread(struct mddev *mddev)
         if (mddev_is_clustered(mddev))
                 md_cluster_ops->metadata_update_finish(mddev);
         clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+       clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
         clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
         clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
         clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c

index e793ab6b35705e0ed1ad6904ebe9353b6dbf6fd6..f55c3f35b7463141086afb727785c775c5185d76 100644 (file)
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -4156,6 +4156,7 @@ static int raid10_start_reshape(struct mddev *mddev)
  
         clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
         clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+       clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
         set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
         set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
  
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c

index 553d54b870528f0917e7518a9a783a28636d884a..b6793d2e051f3b278405f236e6623980bcdf1d04 100644 (file)
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7354,6 +7354,7 @@ static int raid5_start_reshape(struct mddev *mddev)
  
         clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
         clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+       clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
         set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
         set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
         mddev->sync_thread = md_register_thread(md_do_sync, mddev,
diff --git a/drivers/media/Kconfig b/drivers/media/Kconfig

index 3ef0f90b128fc5bdf6d5e5dff0d5bbfbd5190d7b..157099243d6152190211b8625ba656d45feae003 100644 (file)
--- a/drivers/media/Kconfig
+++ b/drivers/media/Kconfig
@@ -97,6 +97,7 @@ config MEDIA_CONTROLLER
  config MEDIA_CONTROLLER_DVB
         bool "Enable Media controller for DVB"
         depends on MEDIA_CONTROLLER
+       depends on BROKEN
         ---help---
           Enable the media controller API support for DVB.
  
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c

index db84ddcfec8464191a3edcccfd87c869ac1c5a7c..9fd6c69a8bac3c77d1c0c6e99eb4f3644561f78a 100644 (file)
--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
@@ -423,7 +423,7 @@ static void xgbe_tx_timer(unsigned long data)
         if (napi_schedule_prep(napi)) {
                 /* Disable Tx and Rx interrupts */
                 if (pdata->per_channel_irq)
-                       disable_irq(channel->dma_irq);
+                       disable_irq_nosync(channel->dma_irq);
                 else
                         xgbe_disable_rx_tx_ints(pdata);
  
diff --git a/drivers/net/ethernet/broadcom/b44.c b/drivers/net/ethernet/broadcom/b44.c

index 77363d6805321534a582e579552f46e254737e25..a3b1c07ae0af0935f3026ba8a56e21512e238e36 100644 (file)
--- a/drivers/net/ethernet/broadcom/b44.c
+++ b/drivers/net/ethernet/broadcom/b44.c
@@ -2464,6 +2464,7 @@ err_out_powerdown:
         ssb_bus_may_powerdown(sdev->bus);
  
  err_out_free_dev:
+       netif_napi_del(&bp->napi);
         free_netdev(dev);
  
  out:
@@ -2480,6 +2481,7 @@ static void b44_remove_one(struct ssb_device *sdev)
                 b44_unregister_phy_one(bp);
         ssb_device_disable(sdev, 0);
         ssb_bus_may_powerdown(sdev->bus);
+       netif_napi_del(&bp->napi);
         free_netdev(dev);
         ssb_pcihost_set_power_state(sdev, PCI_D3hot);
         ssb_set_drvdata(sdev, NULL);
diff --git a/drivers/net/ethernet/broadcom/genet/bcmmii.c b/drivers/net/ethernet/broadcom/genet/bcmmii.c

index e7651b3c6c5767f7609115ef0430c13aac8d17a9..420949cc55aab6349b75c33f0c4f061aa384d537 100644 (file)
--- a/drivers/net/ethernet/broadcom/genet/bcmmii.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmmii.c
@@ -299,9 +299,6 @@ int bcmgenet_mii_config(struct net_device *dev, bool init)
                         phy_name = "external RGMII (no delay)";
                 else
                         phy_name = "external RGMII (TX delay)";
-               reg = bcmgenet_ext_readl(priv, EXT_RGMII_OOB_CTRL);
-               reg |= RGMII_MODE_EN | id_mode_dis;
-               bcmgenet_ext_writel(priv, reg, EXT_RGMII_OOB_CTRL);
                 bcmgenet_sys_writel(priv,
                                     PORT_MODE_EXT_GPHY, SYS_PORT_CTRL);
                 break;
@@ -310,6 +307,15 @@ int bcmgenet_mii_config(struct net_device *dev, bool init)
                 return -EINVAL;
         }
  
+       /* This is an external PHY (xMII), so we need to enable the RGMII
+        * block for the interface to work
+        */
+       if (priv->ext_phy) {
+               reg = bcmgenet_ext_readl(priv, EXT_RGMII_OOB_CTRL);
+               reg |= RGMII_MODE_EN | id_mode_dis;
+               bcmgenet_ext_writel(priv, reg, EXT_RGMII_OOB_CTRL);
+       }
+
         if (init)
                 dev_info(kdev, "configuring instance for %s\n", phy_name);
  
diff --git a/drivers/net/ethernet/cisco/enic/enic_ethtool.c b/drivers/net/ethernet/cisco/enic/enic_ethtool.c

index 28d9ca675a274f9876473bcce7e6995a14e1289e..68d47b196daec3d3c5d0b8af19f8d167735e1e79 100644 (file)
--- a/drivers/net/ethernet/cisco/enic/enic_ethtool.c
+++ b/drivers/net/ethernet/cisco/enic/enic_ethtool.c
@@ -131,8 +131,15 @@ static void enic_get_drvinfo(struct net_device *netdev,
  {
         struct enic *enic = netdev_priv(netdev);
         struct vnic_devcmd_fw_info *fw_info;
+       int err;
  
-       enic_dev_fw_info(enic, &fw_info);
+       err = enic_dev_fw_info(enic, &fw_info);
+       /* Bail out only when pci_zalloc_consistent fails in
+        * vnic_dev_fw_info. For other failures, like devcmd failure,
+        * we still report the previously recorded info.
+        */
+       if (err == -ENOMEM)
+               return;
  
         strlcpy(drvinfo->driver, DRV_NAME, sizeof(drvinfo->driver));
         strlcpy(drvinfo->version, DRV_VERSION, sizeof(drvinfo->version));
@@ -181,8 +188,15 @@ static void enic_get_ethtool_stats(struct net_device *netdev,
         struct enic *enic = netdev_priv(netdev);
         struct vnic_stats *vstats;
         unsigned int i;
-
-       enic_dev_stats_dump(enic, &vstats);
+       int err;
+
+       err = enic_dev_stats_dump(enic, &vstats);
+       /* Bail out only when pci_zalloc_consistent fails in
+        * vnic_dev_stats_dump. For other failures, like devcmd failure,
+        * we still report the previously recorded stats.
+        */
+       if (err == -ENOMEM)
+               return;
  
         for (i = 0; i < enic_n_tx_stats; i++)
                 *(data++) = ((u64 *)&vstats->tx)[enic_tx_stats[i].index];
diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c

index 204bd182473bceaaabaa5b1eba5ed618de751808..eadae1b412c652974dde24a9a76c5d74a8c3fa29 100644 (file)
--- a/drivers/net/ethernet/cisco/enic/enic_main.c
+++ b/drivers/net/ethernet/cisco/enic/enic_main.c
@@ -615,8 +615,15 @@ static struct rtnl_link_stats64 *enic_get_stats(struct net_device *netdev,
  {
         struct enic *enic = netdev_priv(netdev);
         struct vnic_stats *stats;
+       int err;
  
-       enic_dev_stats_dump(enic, &stats);
+       err = enic_dev_stats_dump(enic, &stats);
+       /* Bail out only when pci_zalloc_consistent fails in
+        * vnic_dev_stats_dump. For other failures, like devcmd failure,
+        * we still report the previously recorded stats.
+        */
+       if (err == -ENOMEM)
+               return net_stats;
  
         net_stats->tx_packets = stats->tx.tx_frames_ok;
         net_stats->tx_bytes = stats->tx.tx_bytes_ok;
@@ -1407,6 +1414,7 @@ static int enic_poll_msix_rq(struct napi_struct *napi, int budget)
                  */
                 enic_calc_int_moderation(enic, &enic->rq[rq]);
  
+       enic_poll_unlock_napi(&enic->rq[rq]);
         if (work_done < work_to_do) {
  
                 /* Some work done, but not enough to stay in polling,
@@ -1418,7 +1426,6 @@ static int enic_poll_msix_rq(struct napi_struct *napi, int budget)
                         enic_set_int_moderation(enic, &enic->rq[rq]);
                 vnic_intr_unmask(&enic->intr[intr]);
         }
-       enic_poll_unlock_napi(&enic->rq[rq]);
  
         return work_done;
  }
diff --git a/drivers/net/ethernet/cisco/enic/vnic_rq.c b/drivers/net/ethernet/cisco/enic/vnic_rq.c

index 36a2ed606c911f21355360fad81eb39b18162c59..c4b2183bf352fb2a1881001777df91857c2d1f79 100644 (file)
--- a/drivers/net/ethernet/cisco/enic/vnic_rq.c
+++ b/drivers/net/ethernet/cisco/enic/vnic_rq.c
@@ -188,16 +188,15 @@ void vnic_rq_clean(struct vnic_rq *rq,
         struct vnic_rq_buf *buf;
         u32 fetch_index;
         unsigned int count = rq->ring.desc_count;
+       int i;
  
         buf = rq->to_clean;
  
-       while (vnic_rq_desc_used(rq) > 0) {
-
+       for (i = 0; i < rq->ring.desc_count; i++) {
                 (*buf_clean)(rq, buf);
-
-               buf = rq->to_clean = buf->next;
-               rq->ring.desc_avail++;
+               buf = buf->next;
         }
+       rq->ring.desc_avail = rq->ring.desc_count - 1;
  
         /* Use current fetch_index as the ring starting point */
         fetch_index = ioread32(&rq->ctrl->fetch_index);
diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.c b/drivers/net/ethernet/emulex/benet/be_cmds.c

index fb140faeafb1cbda612cd11a9a1aac04e936c4a3..c5e1d0ac75f909f843dd0397ad41b85eeb26a164 100644 (file)
--- a/drivers/net/ethernet/emulex/benet/be_cmds.c
+++ b/drivers/net/ethernet/emulex/benet/be_cmds.c
@@ -1720,9 +1720,9 @@ int be_cmd_get_regs(struct be_adapter *adapter, u32 buf_len, void *buf)
         total_size = buf_len;
  
         get_fat_cmd.size = sizeof(struct be_cmd_req_get_fat) + 60*1024;
-       get_fat_cmd.va = pci_alloc_consistent(adapter->pdev,
-                                             get_fat_cmd.size,
-                                             &get_fat_cmd.dma);
+       get_fat_cmd.va = dma_zalloc_coherent(&adapter->pdev->dev,
+                                            get_fat_cmd.size,
+                                            &get_fat_cmd.dma, GFP_ATOMIC);
         if (!get_fat_cmd.va) {
                 dev_err(&adapter->pdev->dev,
                         "Memory allocation failure while reading FAT data\n");
@@ -1767,8 +1767,8 @@ int be_cmd_get_regs(struct be_adapter *adapter, u32 buf_len, void *buf)
                 log_offset += buf_size;
         }
  err:
-       pci_free_consistent(adapter->pdev, get_fat_cmd.size,
-                           get_fat_cmd.va, get_fat_cmd.dma);
+       dma_free_coherent(&adapter->pdev->dev, get_fat_cmd.size,
+                         get_fat_cmd.va, get_fat_cmd.dma);
         spin_unlock_bh(&adapter->mcc_lock);
         return status;
  }
@@ -2215,12 +2215,12 @@ int be_cmd_read_port_transceiver_data(struct be_adapter *adapter,
                 return -EINVAL;
  
         cmd.size = sizeof(struct be_cmd_resp_port_type);
-       cmd.va = pci_alloc_consistent(adapter->pdev, cmd.size, &cmd.dma);
+       cmd.va = dma_zalloc_coherent(&adapter->pdev->dev, cmd.size, &cmd.dma,
+                                    GFP_ATOMIC);
         if (!cmd.va) {
                 dev_err(&adapter->pdev->dev, "Memory allocation failed\n");
                 return -ENOMEM;
         }
-       memset(cmd.va, 0, cmd.size);
  
         spin_lock_bh(&adapter->mcc_lock);
  
@@ -2245,7 +2245,7 @@ int be_cmd_read_port_transceiver_data(struct be_adapter *adapter,
         }
  err:
         spin_unlock_bh(&adapter->mcc_lock);
-       pci_free_consistent(adapter->pdev, cmd.size, cmd.va, cmd.dma);
+       dma_free_coherent(&adapter->pdev->dev, cmd.size, cmd.va, cmd.dma);
         return status;
  }
  
@@ -2720,7 +2720,8 @@ int be_cmd_get_phy_info(struct be_adapter *adapter)
                 goto err;
         }
         cmd.size = sizeof(struct be_cmd_req_get_phy_info);
-       cmd.va = pci_alloc_consistent(adapter->pdev, cmd.size, &cmd.dma);
+       cmd.va = dma_zalloc_coherent(&adapter->pdev->dev, cmd.size, &cmd.dma,
+                                    GFP_ATOMIC);
         if (!cmd.va) {
                 dev_err(&adapter->pdev->dev, "Memory alloc failure\n");
                 status = -ENOMEM;
@@ -2754,7 +2755,7 @@ int be_cmd_get_phy_info(struct be_adapter *adapter)
                                 BE_SUPPORTED_SPEED_1GBPS;
                 }
         }
-       pci_free_consistent(adapter->pdev, cmd.size, cmd.va, cmd.dma);
+       dma_free_coherent(&adapter->pdev->dev, cmd.size, cmd.va, cmd.dma);
  err:
         spin_unlock_bh(&adapter->mcc_lock);
         return status;
@@ -2805,8 +2806,9 @@ int be_cmd_get_cntl_attributes(struct be_adapter *adapter)
  
         memset(&attribs_cmd, 0, sizeof(struct be_dma_mem));
         attribs_cmd.size = sizeof(struct be_cmd_resp_cntl_attribs);
-       attribs_cmd.va = pci_alloc_consistent(adapter->pdev, attribs_cmd.size,
-                                             &attribs_cmd.dma);
+       attribs_cmd.va = dma_zalloc_coherent(&adapter->pdev->dev,
+                                            attribs_cmd.size,
+                                            &attribs_cmd.dma, GFP_ATOMIC);
         if (!attribs_cmd.va) {
                 dev_err(&adapter->pdev->dev, "Memory allocation failure\n");
                 status = -ENOMEM;
@@ -2833,8 +2835,8 @@ int be_cmd_get_cntl_attributes(struct be_adapter *adapter)
  err:
         mutex_unlock(&adapter->mbox_lock);
         if (attribs_cmd.va)
-               pci_free_consistent(adapter->pdev, attribs_cmd.size,
-                                   attribs_cmd.va, attribs_cmd.dma);
+               dma_free_coherent(&adapter->pdev->dev, attribs_cmd.size,
+                                 attribs_cmd.va, attribs_cmd.dma);
         return status;
  }
  
@@ -2972,9 +2974,10 @@ int be_cmd_get_mac_from_list(struct be_adapter *adapter, u8 *mac,
  
         memset(&get_mac_list_cmd, 0, sizeof(struct be_dma_mem));
         get_mac_list_cmd.size = sizeof(struct be_cmd_resp_get_mac_list);
-       get_mac_list_cmd.va = pci_alloc_consistent(adapter->pdev,
-                                                  get_mac_list_cmd.size,
-                                                  &get_mac_list_cmd.dma);
+       get_mac_list_cmd.va = dma_zalloc_coherent(&adapter->pdev->dev,
+                                                 get_mac_list_cmd.size,
+                                                 &get_mac_list_cmd.dma,
+                                                 GFP_ATOMIC);
  
         if (!get_mac_list_cmd.va) {
                 dev_err(&adapter->pdev->dev,
@@ -3047,8 +3050,8 @@ int be_cmd_get_mac_from_list(struct be_adapter *adapter, u8 *mac,
  
  out:
         spin_unlock_bh(&adapter->mcc_lock);
-       pci_free_consistent(adapter->pdev, get_mac_list_cmd.size,
-                           get_mac_list_cmd.va, get_mac_list_cmd.dma);
+       dma_free_coherent(&adapter->pdev->dev, get_mac_list_cmd.size,
+                         get_mac_list_cmd.va, get_mac_list_cmd.dma);
         return status;
  }
  
@@ -3101,8 +3104,8 @@ int be_cmd_set_mac_list(struct be_adapter *adapter, u8 *mac_array,
  
         memset(&cmd, 0, sizeof(struct be_dma_mem));
         cmd.size = sizeof(struct be_cmd_req_set_mac_list);
-       cmd.va = dma_alloc_coherent(&adapter->pdev->dev, cmd.size,
-                                   &cmd.dma, GFP_KERNEL);
+       cmd.va = dma_zalloc_coherent(&adapter->pdev->dev, cmd.size, &cmd.dma,
+                                    GFP_KERNEL);
         if (!cmd.va)
                 return -ENOMEM;
  
@@ -3291,7 +3294,8 @@ int be_cmd_get_acpi_wol_cap(struct be_adapter *adapter)
  
         memset(&cmd, 0, sizeof(struct be_dma_mem));
         cmd.size = sizeof(struct be_cmd_resp_acpi_wol_magic_config_v1);
-       cmd.va = pci_alloc_consistent(adapter->pdev, cmd.size, &cmd.dma);
+       cmd.va = dma_zalloc_coherent(&adapter->pdev->dev, cmd.size, &cmd.dma,
+                                    GFP_ATOMIC);
         if (!cmd.va) {
                 dev_err(&adapter->pdev->dev, "Memory allocation failure\n");
                 status = -ENOMEM;
@@ -3326,7 +3330,8 @@ int be_cmd_get_acpi_wol_cap(struct be_adapter *adapter)
  err:
         mutex_unlock(&adapter->mbox_lock);
         if (cmd.va)
-               pci_free_consistent(adapter->pdev, cmd.size, cmd.va, cmd.dma);
+               dma_free_coherent(&adapter->pdev->dev, cmd.size, cmd.va,
+                                 cmd.dma);
         return status;
  
  }
@@ -3340,8 +3345,9 @@ int be_cmd_set_fw_log_level(struct be_adapter *adapter, u32 level)
  
         memset(&extfat_cmd, 0, sizeof(struct be_dma_mem));
         extfat_cmd.size = sizeof(struct be_cmd_resp_get_ext_fat_caps);
-       extfat_cmd.va = pci_alloc_consistent(adapter->pdev, extfat_cmd.size,
-                                            &extfat_cmd.dma);
+       extfat_cmd.va = dma_zalloc_coherent(&adapter->pdev->dev,
+                                           extfat_cmd.size, &extfat_cmd.dma,
+                                           GFP_ATOMIC);
         if (!extfat_cmd.va)
                 return -ENOMEM;
  
@@ -3363,8 +3369,8 @@ int be_cmd_set_fw_log_level(struct be_adapter *adapter, u32 level)
  
         status = be_cmd_set_ext_fat_capabilites(adapter, &extfat_cmd, cfgs);
  err:
-       pci_free_consistent(adapter->pdev, extfat_cmd.size, extfat_cmd.va,
-                           extfat_cmd.dma);
+       dma_free_coherent(&adapter->pdev->dev, extfat_cmd.size, extfat_cmd.va,
+                         extfat_cmd.dma);
         return status;
  }
  
@@ -3377,8 +3383,9 @@ int be_cmd_get_fw_log_level(struct be_adapter *adapter)
  
         memset(&extfat_cmd, 0, sizeof(struct be_dma_mem));
         extfat_cmd.size = sizeof(struct be_cmd_resp_get_ext_fat_caps);
-       extfat_cmd.va = pci_alloc_consistent(adapter->pdev, extfat_cmd.size,
-                                            &extfat_cmd.dma);
+       extfat_cmd.va = dma_zalloc_coherent(&adapter->pdev->dev,
+                                           extfat_cmd.size, &extfat_cmd.dma,
+                                           GFP_ATOMIC);
  
         if (!extfat_cmd.va) {
                 dev_err(&adapter->pdev->dev, "%s: Memory allocation failure\n",
@@ -3396,8 +3403,8 @@ int be_cmd_get_fw_log_level(struct be_adapter *adapter)
                                 level = cfgs->module[0].trace_lvl[j].dbg_lvl;
                 }
         }
-       pci_free_consistent(adapter->pdev, extfat_cmd.size, extfat_cmd.va,
-                           extfat_cmd.dma);
+       dma_free_coherent(&adapter->pdev->dev, extfat_cmd.size, extfat_cmd.va,
+                         extfat_cmd.dma);
  err:
         return level;
  }
@@ -3595,7 +3602,8 @@ int be_cmd_get_func_config(struct be_adapter *adapter, struct be_resources *res)
  
         memset(&cmd, 0, sizeof(struct be_dma_mem));
         cmd.size = sizeof(struct be_cmd_resp_get_func_config);
-       cmd.va = pci_alloc_consistent(adapter->pdev, cmd.size, &cmd.dma);
+       cmd.va = dma_zalloc_coherent(&adapter->pdev->dev, cmd.size, &cmd.dma,
+                                    GFP_ATOMIC);
         if (!cmd.va) {
                 dev_err(&adapter->pdev->dev, "Memory alloc failure\n");
                 status = -ENOMEM;
@@ -3635,7 +3643,8 @@ int be_cmd_get_func_config(struct be_adapter *adapter, struct be_resources *res)
  err:
         mutex_unlock(&adapter->mbox_lock);
         if (cmd.va)
-               pci_free_consistent(adapter->pdev, cmd.size, cmd.va, cmd.dma);
+               dma_free_coherent(&adapter->pdev->dev, cmd.size, cmd.va,
+                                 cmd.dma);
         return status;
  }
  
@@ -3656,7 +3665,8 @@ int be_cmd_get_profile_config(struct be_adapter *adapter,
  
         memset(&cmd, 0, sizeof(struct be_dma_mem));
         cmd.size = sizeof(struct be_cmd_resp_get_profile_config);
-       cmd.va = pci_alloc_consistent(adapter->pdev, cmd.size, &cmd.dma);
+       cmd.va = dma_zalloc_coherent(&adapter->pdev->dev, cmd.size, &cmd.dma,
+                                    GFP_ATOMIC);
         if (!cmd.va)
                 return -ENOMEM;
  
@@ -3702,7 +3712,8 @@ int be_cmd_get_profile_config(struct be_adapter *adapter,
                 res->vf_if_cap_flags = vf_res->cap_flags;
  err:
         if (cmd.va)
-               pci_free_consistent(adapter->pdev, cmd.size, cmd.va, cmd.dma);
+               dma_free_coherent(&adapter->pdev->dev, cmd.size, cmd.va,
+                                 cmd.dma);
         return status;
  }
  
@@ -3717,7 +3728,8 @@ static int be_cmd_set_profile_config(struct be_adapter *adapter, void *desc,
  
         memset(&cmd, 0, sizeof(struct be_dma_mem));
         cmd.size = sizeof(struct be_cmd_req_set_profile_config);
-       cmd.va = pci_alloc_consistent(adapter->pdev, cmd.size, &cmd.dma);
+       cmd.va = dma_zalloc_coherent(&adapter->pdev->dev, cmd.size, &cmd.dma,
+                                    GFP_ATOMIC);
         if (!cmd.va)
                 return -ENOMEM;
  
@@ -3733,7 +3745,8 @@ static int be_cmd_set_profile_config(struct be_adapter *adapter, void *desc,
         status = be_cmd_notify_wait(adapter, &wrb);
  
         if (cmd.va)
-               pci_free_consistent(adapter->pdev, cmd.size, cmd.va, cmd.dma);
+               dma_free_coherent(&adapter->pdev->dev, cmd.size, cmd.va,
+                                 cmd.dma);
         return status;
  }
  
diff --git a/drivers/net/ethernet/emulex/benet/be_ethtool.c b/drivers/net/ethernet/emulex/benet/be_ethtool.c

index b765c24625bf523fd7932be17f6dfa22840a8e46..2835dee5dc3930cc5d1d09ec958bd2557228a2cd 100644 (file)
--- a/drivers/net/ethernet/emulex/benet/be_ethtool.c
+++ b/drivers/net/ethernet/emulex/benet/be_ethtool.c
@@ -264,8 +264,8 @@ static int lancer_cmd_read_file(struct be_adapter *adapter, u8 *file_name,
         int status = 0;
  
         read_cmd.size = LANCER_READ_FILE_CHUNK;
-       read_cmd.va = pci_alloc_consistent(adapter->pdev, read_cmd.size,
-                                          &read_cmd.dma);
+       read_cmd.va = dma_zalloc_coherent(&adapter->pdev->dev, read_cmd.size,
+                                         &read_cmd.dma, GFP_ATOMIC);
  
         if (!read_cmd.va) {
                 dev_err(&adapter->pdev->dev,
@@ -289,8 +289,8 @@ static int lancer_cmd_read_file(struct be_adapter *adapter, u8 *file_name,
                         break;
                 }
         }
-       pci_free_consistent(adapter->pdev, read_cmd.size, read_cmd.va,
-                           read_cmd.dma);
+       dma_free_coherent(&adapter->pdev->dev, read_cmd.size, read_cmd.va,
+                         read_cmd.dma);
  
         return status;
  }
@@ -818,8 +818,9 @@ static int be_test_ddr_dma(struct be_adapter *adapter)
         };
  
         ddrdma_cmd.size = sizeof(struct be_cmd_req_ddrdma_test);
-       ddrdma_cmd.va = dma_alloc_coherent(&adapter->pdev->dev, ddrdma_cmd.size,
-                                          &ddrdma_cmd.dma, GFP_KERNEL);
+       ddrdma_cmd.va = dma_zalloc_coherent(&adapter->pdev->dev,
+                                           ddrdma_cmd.size, &ddrdma_cmd.dma,
+                                           GFP_KERNEL);
         if (!ddrdma_cmd.va)
                 return -ENOMEM;
  
@@ -941,8 +942,9 @@ static int be_read_eeprom(struct net_device *netdev,
  
         memset(&eeprom_cmd, 0, sizeof(struct be_dma_mem));
         eeprom_cmd.size = sizeof(struct be_cmd_req_seeprom_read);
-       eeprom_cmd.va = dma_alloc_coherent(&adapter->pdev->dev, eeprom_cmd.size,
-                                          &eeprom_cmd.dma, GFP_KERNEL);
+       eeprom_cmd.va = dma_zalloc_coherent(&adapter->pdev->dev,
+                                           eeprom_cmd.size, &eeprom_cmd.dma,
+                                           GFP_KERNEL);
  
         if (!eeprom_cmd.va)
                 return -ENOMEM;
diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c

index 6f9ffb9026cd56825f90e0c300aab1c8cd2a7739..e43cc8a73ea7e85a927443c077c18ce6c673751a 100644 (file)
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -4605,8 +4605,8 @@ static int lancer_fw_download(struct be_adapter *adapter,
  
         flash_cmd.size = sizeof(struct lancer_cmd_req_write_object)
                                 + LANCER_FW_DOWNLOAD_CHUNK;
-       flash_cmd.va = dma_alloc_coherent(dev, flash_cmd.size,
-                                         &flash_cmd.dma, GFP_KERNEL);
+       flash_cmd.va = dma_zalloc_coherent(dev, flash_cmd.size,
+                                          &flash_cmd.dma, GFP_KERNEL);
         if (!flash_cmd.va)
                 return -ENOMEM;
  
@@ -4739,8 +4739,8 @@ static int be_fw_download(struct be_adapter *adapter, const struct firmware* fw)
         }
  
         flash_cmd.size = sizeof(struct be_cmd_write_flashrom);
-       flash_cmd.va = dma_alloc_coherent(dev, flash_cmd.size, &flash_cmd.dma,
-                                         GFP_KERNEL);
+       flash_cmd.va = dma_zalloc_coherent(dev, flash_cmd.size, &flash_cmd.dma,
+                                          GFP_KERNEL);
         if (!flash_cmd.va)
                 return -ENOMEM;
  
@@ -5291,16 +5291,15 @@ static int be_drv_init(struct be_adapter *adapter)
         int status = 0;
  
         mbox_mem_alloc->size = sizeof(struct be_mcc_mailbox) + 16;
-       mbox_mem_alloc->va = dma_alloc_coherent(dev, mbox_mem_alloc->size,
-                                               &mbox_mem_alloc->dma,
-                                               GFP_KERNEL);
+       mbox_mem_alloc->va = dma_zalloc_coherent(dev, mbox_mem_alloc->size,
+                                                &mbox_mem_alloc->dma,
+                                                GFP_KERNEL);
         if (!mbox_mem_alloc->va)
                 return -ENOMEM;
  
         mbox_mem_align->size = sizeof(struct be_mcc_mailbox);
         mbox_mem_align->va = PTR_ALIGN(mbox_mem_alloc->va, 16);
         mbox_mem_align->dma = PTR_ALIGN(mbox_mem_alloc->dma, 16);
-       memset(mbox_mem_align->va, 0, sizeof(struct be_mcc_mailbox));
  
         rx_filter->size = sizeof(struct be_cmd_req_rx_filter);
         rx_filter->va = dma_zalloc_coherent(dev, rx_filter->size,
diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h

index 33c35d3b7420fa9ae545aea4ebd5160036914718..5d47307121abbe413cd259ff74f9aa2ee68e6c45 100644 (file)
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -317,6 +317,7 @@ struct i40e_pf {
  #endif
  #define I40E_FLAG_PORT_ID_VALID                (u64)(1 << 28)
  #define I40E_FLAG_DCB_CAPABLE                  (u64)(1 << 29)
+#define I40E_FLAG_VEB_MODE_ENABLED             BIT_ULL(40)
  
         /* tracks features that get auto disabled by errors */
         u64 auto_disable_flags;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c

index 34170eabca7da939ba1c8b9b5fad14dc2f54370d..da0faf478af076199e4281b0f3da57ad92c5e62b 100644 (file)
--- a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
@@ -1021,6 +1021,15 @@ static ssize_t i40e_dbg_command_write(struct file *filp,
                         goto command_write_done;
                 }
  
+               /* By default we are in VEPA mode, if this is the first VF/VMDq
+                * VSI to be added switch to VEB mode.
+                */
+               if (!(pf->flags & I40E_FLAG_VEB_MODE_ENABLED)) {
+                       pf->flags |= I40E_FLAG_VEB_MODE_ENABLED;
+                       i40e_do_reset_safe(pf,
+                                          BIT_ULL(__I40E_PF_RESET_REQUESTED));
+               }
+
                 vsi = i40e_vsi_setup(pf, I40E_VSI_VMDQ2, vsi_seid, 0);
                 if (vsi)
                         dev_info(&pf->pdev->dev, "added VSI %d to relay %d\n",
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c

index a54c14491e3b6a4dbc168980dd44d399b6766487..5b5bea159bd53c8684d0a69b310e492bc797c8b6 100644 (file)
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -6097,6 +6097,10 @@ static int i40e_reconstitute_veb(struct i40e_veb *veb)
         if (ret)
                 goto end_reconstitute;
  
+       if (pf->flags & I40E_FLAG_VEB_MODE_ENABLED)
+               veb->bridge_mode = BRIDGE_MODE_VEB;
+       else
+               veb->bridge_mode = BRIDGE_MODE_VEPA;
         i40e_config_bridge_mode(veb);
  
         /* create the remaining VSIs attached to this VEB */
@@ -8031,7 +8035,12 @@ static int i40e_ndo_bridge_setlink(struct net_device *dev,
                 } else if (mode != veb->bridge_mode) {
                         /* Existing HW bridge but different mode needs reset */
                         veb->bridge_mode = mode;
-                       i40e_do_reset(pf, (1 << __I40E_PF_RESET_REQUESTED));
+                       /* TODO: If no VFs or VMDq VSIs, disallow VEB mode */
+                       if (mode == BRIDGE_MODE_VEB)
+                               pf->flags |= I40E_FLAG_VEB_MODE_ENABLED;
+                       else
+                               pf->flags &= ~I40E_FLAG_VEB_MODE_ENABLED;
+                       i40e_do_reset(pf, BIT_ULL(__I40E_PF_RESET_REQUESTED));
                         break;
                 }
         }
@@ -8343,11 +8352,12 @@ static int i40e_add_vsi(struct i40e_vsi *vsi)
                 ctxt.uplink_seid = vsi->uplink_seid;
                 ctxt.connection_type = I40E_AQ_VSI_CONN_TYPE_NORMAL;
                 ctxt.flags = I40E_AQ_VSI_TYPE_PF;
-               if (i40e_is_vsi_uplink_mode_veb(vsi)) {
+               if ((pf->flags & I40E_FLAG_VEB_MODE_ENABLED) &&
+                   (i40e_is_vsi_uplink_mode_veb(vsi))) {
                         ctxt.info.valid_sections |=
-                               cpu_to_le16(I40E_AQ_VSI_PROP_SWITCH_VALID);
+                            cpu_to_le16(I40E_AQ_VSI_PROP_SWITCH_VALID);
                         ctxt.info.switch_id =
-                               cpu_to_le16(I40E_AQ_VSI_SW_ID_FLAG_ALLOW_LB);
+                          cpu_to_le16(I40E_AQ_VSI_SW_ID_FLAG_ALLOW_LB);
                 }
                 i40e_vsi_setup_queue_map(vsi, &ctxt, enabled_tc, true);
                 break;
@@ -8746,6 +8756,14 @@ struct i40e_vsi *i40e_vsi_setup(struct i40e_pf *pf, u8 type,
                                          __func__);
                                 return NULL;
                         }
+                       /* We come up by default in VEPA mode if SRIOV is not
+                        * already enabled, in which case we can't force VEPA
+                        * mode.
+                        */
+                       if (!(pf->flags & I40E_FLAG_VEB_MODE_ENABLED)) {
+                               veb->bridge_mode = BRIDGE_MODE_VEPA;
+                               pf->flags &= ~I40E_FLAG_VEB_MODE_ENABLED;
+                       }
                         i40e_config_bridge_mode(veb);
                 }
                 for (i = 0; i < I40E_MAX_VEB && !veb; i++) {
@@ -9856,6 +9874,15 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
                 goto err_switch_setup;
         }
  
+#ifdef CONFIG_PCI_IOV
+       /* prep for VF support */
+       if ((pf->flags & I40E_FLAG_SRIOV_ENABLED) &&
+           (pf->flags & I40E_FLAG_MSIX_ENABLED) &&
+           !test_bit(__I40E_BAD_EEPROM, &pf->state)) {
+               if (pci_num_vf(pdev))
+                       pf->flags |= I40E_FLAG_VEB_MODE_ENABLED;
+       }
+#endif
         err = i40e_setup_pf_switch(pf, false);
         if (err) {
                 dev_info(&pdev->dev, "setup_pf_switch failed: %d\n", err);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c

index 4bd3a80aba82998bba343a1870b2d21f59bca4e0..9d95042d5a0f5805824d53ecc847ff76a9909444 100644 (file)
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -2410,14 +2410,12 @@ static int i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size)
   * i40e_chk_linearize - Check if there are more than 8 fragments per packet
   * @skb:      send buffer
   * @tx_flags: collected send information
- * @hdr_len:  size of the packet header
   *
   * Note: Our HW can't scatter-gather more than 8 fragments to build
   * a packet on the wire and so we need to figure out the cases where we
   * need to linearize the skb.
   **/
-static bool i40e_chk_linearize(struct sk_buff *skb, u32 tx_flags,
-                              const u8 hdr_len)
+static bool i40e_chk_linearize(struct sk_buff *skb, u32 tx_flags)
  {
         struct skb_frag_struct *frag;
         bool linearize = false;
@@ -2429,7 +2427,7 @@ static bool i40e_chk_linearize(struct sk_buff *skb, u32 tx_flags,
         gso_segs = skb_shinfo(skb)->gso_segs;
  
         if (tx_flags & (I40E_TX_FLAGS_TSO | I40E_TX_FLAGS_FSO)) {
-               u16 j = 1;
+               u16 j = 0;
  
                 if (num_frags < (I40E_MAX_BUFFER_TXD))
                         goto linearize_chk_done;
@@ -2440,21 +2438,18 @@ static bool i40e_chk_linearize(struct sk_buff *skb, u32 tx_flags,
                         goto linearize_chk_done;
                 }
                 frag = &skb_shinfo(skb)->frags[0];
-               size = hdr_len;
                 /* we might still have more fragments per segment */
                 do {
                         size += skb_frag_size(frag);
                         frag++; j++;
+                       if ((size >= skb_shinfo(skb)->gso_size) &&
+                           (j < I40E_MAX_BUFFER_TXD)) {
+                               size = (size % skb_shinfo(skb)->gso_size);
+                               j = (size) ? 1 : 0;
+                       }
                         if (j == I40E_MAX_BUFFER_TXD) {
-                               if (size < skb_shinfo(skb)->gso_size) {
-                                       linearize = true;
-                                       break;
-                               }
-                               j = 1;
-                               size -= skb_shinfo(skb)->gso_size;
-                               if (size)
-                                       j++;
-                               size += hdr_len;
+                               linearize = true;
+                               break;
                         }
                         num_frags--;
                 } while (num_frags);
@@ -2724,7 +2719,7 @@ static netdev_tx_t i40e_xmit_frame_ring(struct sk_buff *skb,
         if (tsyn)
                 tx_flags |= I40E_TX_FLAGS_TSYN;
  
-       if (i40e_chk_linearize(skb, tx_flags, hdr_len))
+       if (i40e_chk_linearize(skb, tx_flags))
                 if (skb_linearize(skb))
                         goto out_drop;
  
diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c

index 78d1c4ff565e8853473b70c3827e6a727ff3ce1c..4e9376da051829969de7750c2dc7a66acc5e5f40 100644 (file)
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
@@ -1018,11 +1018,19 @@ int i40e_pci_sriov_configure(struct pci_dev *pdev, int num_vfs)
  {
         struct i40e_pf *pf = pci_get_drvdata(pdev);
  
-       if (num_vfs)
+       if (num_vfs) {
+               if (!(pf->flags & I40E_FLAG_VEB_MODE_ENABLED)) {
+                       pf->flags |= I40E_FLAG_VEB_MODE_ENABLED;
+                       i40e_do_reset_safe(pf,
+                                          BIT_ULL(__I40E_PF_RESET_REQUESTED));
+               }
                 return i40e_pci_sriov_enable(pdev, num_vfs);
+       }
  
         if (!pci_vfs_assigned(pf->pdev)) {
                 i40e_free_vfs(pf);
+               pf->flags &= ~I40E_FLAG_VEB_MODE_ENABLED;
+               i40e_do_reset_safe(pf, BIT_ULL(__I40E_PF_RESET_REQUESTED));
         } else {
                 dev_warn(&pdev->dev, "Unable to free VFs because some are assigned to VMs.\n");
                 return -EINVAL;
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c

index b077e02a0cc7ac8f67ad90560cf990f8f7a66277..458fbb421090772d0bbc1620277624339e0cd757 100644 (file)
--- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
@@ -1619,14 +1619,12 @@ static void i40e_create_tx_ctx(struct i40e_ring *tx_ring,
   * i40e_chk_linearize - Check if there are more than 8 fragments per packet
   * @skb:      send buffer
   * @tx_flags: collected send information
- * @hdr_len:  size of the packet header
   *
   * Note: Our HW can't scatter-gather more than 8 fragments to build
   * a packet on the wire and so we need to figure out the cases where we
   * need to linearize the skb.
   **/
-static bool i40e_chk_linearize(struct sk_buff *skb, u32 tx_flags,
-                              const u8 hdr_len)
+static bool i40e_chk_linearize(struct sk_buff *skb, u32 tx_flags)
  {
         struct skb_frag_struct *frag;
         bool linearize = false;
@@ -1638,7 +1636,7 @@ static bool i40e_chk_linearize(struct sk_buff *skb, u32 tx_flags,
         gso_segs = skb_shinfo(skb)->gso_segs;
  
         if (tx_flags & (I40E_TX_FLAGS_TSO | I40E_TX_FLAGS_FSO)) {
-               u16 j = 1;
+               u16 j = 0;
  
                 if (num_frags < (I40E_MAX_BUFFER_TXD))
                         goto linearize_chk_done;
@@ -1649,21 +1647,18 @@ static bool i40e_chk_linearize(struct sk_buff *skb, u32 tx_flags,
                         goto linearize_chk_done;
                 }
                 frag = &skb_shinfo(skb)->frags[0];
-               size = hdr_len;
                 /* we might still have more fragments per segment */
                 do {
                         size += skb_frag_size(frag);
                         frag++; j++;
+                       if ((size >= skb_shinfo(skb)->gso_size) &&
+                           (j < I40E_MAX_BUFFER_TXD)) {
+                               size = (size % skb_shinfo(skb)->gso_size);
+                               j = (size) ? 1 : 0;
+                       }
                         if (j == I40E_MAX_BUFFER_TXD) {
-                               if (size < skb_shinfo(skb)->gso_size) {
-                                       linearize = true;
-                                       break;
-                               }
-                               j = 1;
-                               size -= skb_shinfo(skb)->gso_size;
-                               if (size)
-                                       j++;
-                               size += hdr_len;
+                               linearize = true;
+                               break;
                         }
                         num_frags--;
                 } while (num_frags);
@@ -1950,7 +1945,7 @@ static netdev_tx_t i40e_xmit_frame_ring(struct sk_buff *skb,
         else if (tso)
                 tx_flags |= I40E_TX_FLAGS_TSO;
  
-       if (i40e_chk_linearize(skb, tx_flags, hdr_len))
+       if (i40e_chk_linearize(skb, tx_flags))
                 if (skb_linearize(skb))
                         goto out_drop;
  
diff --git a/drivers/net/ethernet/intel/igb/igb_ptp.c b/drivers/net/ethernet/intel/igb/igb_ptp.c

index e3b9b63ad01083cb987429f57c9ebef84d86f4db..c3a9392cbc192229f4178c913fad8ab64d8c44c3 100644 (file)
--- a/drivers/net/ethernet/intel/igb/igb_ptp.c
+++ b/drivers/net/ethernet/intel/igb/igb_ptp.c
@@ -538,8 +538,8 @@ static int igb_ptp_feature_enable_i210(struct ptp_clock_info *ptp,
                         igb->perout[i].start.tv_nsec = rq->perout.start.nsec;
                         igb->perout[i].period.tv_sec = ts.tv_sec;
                         igb->perout[i].period.tv_nsec = ts.tv_nsec;
-                       wr32(trgttiml, rq->perout.start.sec);
-                       wr32(trgttimh, rq->perout.start.nsec);
+                       wr32(trgttimh, rq->perout.start.sec);
+                       wr32(trgttiml, rq->perout.start.nsec);
                         tsauxc |= tsauxc_mask;
                         tsim |= tsim_mask;
                 } else {
diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c

index 4b00545a3ace7784b3b3e668ccd68b4334c1500b..65944dd8bf6b11239e4945b6be7e91fdc547254a 100644 (file)
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -1304,7 +1304,7 @@ static unsigned int efx_wanted_parallelism(struct efx_nic *efx)
                         if (!cpumask_test_cpu(cpu, thread_mask)) {
                                 ++count;
                                 cpumask_or(thread_mask, thread_mask,
-                                          topology_thread_cpumask(cpu));
+                                          topology_sibling_cpumask(cpu));
                         }
                 }
  
diff --git a/drivers/ntb/ntb_hw.c b/drivers/ntb/ntb_hw.c

index cd29b1038c5e3bf6f4a21659343c65584c44b969..15f9b7c9e4d38e93a52864a953e12d4172602797 100644 (file)
--- a/drivers/ntb/ntb_hw.c
+++ b/drivers/ntb/ntb_hw.c
@@ -1660,6 +1660,7 @@ static int ntb_atom_detect(struct ntb_device *ndev)
         u32 ppd;
  
         ndev->hw_type = BWD_HW;
+       ndev->limits.max_mw = BWD_MAX_MW;
  
         rc = pci_read_config_dword(ndev->pdev, NTB_PPD_OFFSET, &ppd);
         if (rc)
@@ -1778,7 +1779,7 @@ static int ntb_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
                         dev_warn(&pdev->dev, "Cannot remap BAR %d\n",
                                  MW_TO_BAR(i));
                         rc = -EIO;
-                       goto err3;
+                       goto err4;
                 }
         }
  
diff --git a/drivers/of/base.c b/drivers/of/base.c

index 99764db0875aa0e1b34ca348ca1606c2a8990258..f0650265febf95cc6a37d03dd3d5b38b0d7370af 100644 (file)
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -189,7 +189,7 @@ int __of_attach_node_sysfs(struct device_node *np)
         return 0;
  }
  
-static int __init of_init(void)
+void __init of_core_init(void)
  {
         struct device_node *np;
  
@@ -198,7 +198,8 @@ static int __init of_init(void)
         of_kset = kset_create_and_add("devicetree", NULL, firmware_kobj);
         if (!of_kset) {
                 mutex_unlock(&of_mutex);
-               return -ENOMEM;
+               pr_err("devicetree: failed to register existing nodes\n");
+               return;
         }
         for_each_of_allnodes(np)
                 __of_attach_node_sysfs(np);
@@ -207,10 +208,7 @@ static int __init of_init(void)
         /* Symlink in /proc as required by userspace ABI */
         if (of_root)
                 proc_symlink("device-tree", NULL, "/sys/firmware/devicetree/base");
-
-       return 0;
  }
-core_initcall(of_init);
  
  static struct property *__of_find_property(const struct device_node *np,
                                            const char *name, int *lenp)
diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c

index 3351ef408125d757f52ac772700687ef7f735c06..53826b84e0ec6d46d3699705f46216070a471867 100644 (file)
--- a/drivers/of/dynamic.c
+++ b/drivers/of/dynamic.c
@@ -225,7 +225,7 @@ void __of_attach_node(struct device_node *np)
         phandle = __of_get_property(np, "phandle", &sz);
         if (!phandle)
                 phandle = __of_get_property(np, "linux,phandle", &sz);
-       if (IS_ENABLED(PPC_PSERIES) && !phandle)
+       if (IS_ENABLED(CONFIG_PPC_PSERIES) && !phandle)
                 phandle = __of_get_property(np, "ibm,phandle", &sz);
         np->phandle = (phandle && (sz >= 4)) ? be32_to_cpup(phandle) : 0;
  
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c

index 4fd0cacf7ca0ae0dfaebf5c612f457cdf6fa43f9..508cc56130e3f88d1b01716a7a00fead250fdf1c 100644 (file)
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -428,16 +428,19 @@ static void __assign_resources_sorted(struct list_head *head,
                  * consistent.
                  */
                 if (add_align > dev_res->res->start) {
+                       resource_size_t r_size = resource_size(dev_res->res);
+
                         dev_res->res->start = add_align;
-                       dev_res->res->end = add_align +
-                                           resource_size(dev_res->res);
+                       dev_res->res->end = add_align + r_size - 1;
  
                         list_for_each_entry(dev_res2, head, list) {
                                 align = pci_resource_alignment(dev_res2->dev,
                                                                dev_res2->res);
-                               if (add_align > align)
+                               if (add_align > align) {
                                         list_move_tail(&dev_res->list,
                                                        &dev_res2->list);
+                                       break;
+                               }
                         }
                 }
  
diff --git a/drivers/phy/Kconfig b/drivers/phy/Kconfig

index a53bd5b52df97ff48fa921a5009f2fa6937aa377..fc9b9f0ea91e8132b08c85478a592e3f820fc2cc 100644 (file)
--- a/drivers/phy/Kconfig
+++ b/drivers/phy/Kconfig
@@ -38,7 +38,9 @@ config ARMADA375_USBCLUSTER_PHY
  config PHY_DM816X_USB
         tristate "TI dm816x USB PHY driver"
         depends on ARCH_OMAP2PLUS
+       depends on USB_SUPPORT
         select GENERIC_PHY
+       select USB_PHY
         help
           Enable this for dm816x USB to work.
  
@@ -97,8 +99,9 @@ config OMAP_CONTROL_PHY
  config OMAP_USB2
         tristate "OMAP USB2 PHY Driver"
         depends on ARCH_OMAP2PLUS
-       depends on USB_PHY
+       depends on USB_SUPPORT
         select GENERIC_PHY
+       select USB_PHY
         select OMAP_CONTROL_PHY
         depends on OMAP_OCP2SCP
         help
@@ -122,8 +125,9 @@ config TI_PIPE3
  config TWL4030_USB
         tristate "TWL4030 USB Transceiver Driver"
         depends on TWL4030_CORE && REGULATOR_TWL4030 && USB_MUSB_OMAP2PLUS
-       depends on USB_PHY
+       depends on USB_SUPPORT
         select GENERIC_PHY
+       select USB_PHY
         help
           Enable this to support the USB OTG transceiver on TWL4030
           family chips (including the TWL5030 and TPS659x0 devices).
@@ -304,7 +308,7 @@ config PHY_STIH41X_USB
  
  config PHY_QCOM_UFS
         tristate "Qualcomm UFS PHY driver"
-       depends on OF && ARCH_MSM
+       depends on OF && ARCH_QCOM
         select GENERIC_PHY
         help
           Support for UFS PHY on QCOM chipsets.
diff --git a/drivers/phy/phy-core.c b/drivers/phy/phy-core.c

index 3791838f4bd4b14e145dd5718a3030c4b89d9f3b..63bc12d7a73e561a8e967ac4fb7f453c9a0d23ab 100644 (file)
--- a/drivers/phy/phy-core.c
+++ b/drivers/phy/phy-core.c
@@ -530,7 +530,7 @@ struct phy *phy_optional_get(struct device *dev, const char *string)
  {
         struct phy *phy = phy_get(dev, string);
  
-       if (PTR_ERR(phy) == -ENODEV)
+       if (IS_ERR(phy) && (PTR_ERR(phy) == -ENODEV))
                 phy = NULL;
  
         return phy;
@@ -584,7 +584,7 @@ struct phy *devm_phy_optional_get(struct device *dev, const char *string)
  {
         struct phy *phy = devm_phy_get(dev, string);
  
-       if (PTR_ERR(phy) == -ENODEV)
+       if (IS_ERR(phy) && (PTR_ERR(phy) == -ENODEV))
                 phy = NULL;
  
         return phy;
diff --git a/drivers/phy/phy-omap-usb2.c b/drivers/phy/phy-omap-usb2.c

index 183ef43681016ba0f238edfa98bbbef3684ab543..c1a468686bdc72433b7596512cb70852f3ef2420 100644 (file)
--- a/drivers/phy/phy-omap-usb2.c
+++ b/drivers/phy/phy-omap-usb2.c
@@ -275,6 +275,7 @@ static int omap_usb2_probe(struct platform_device *pdev)
                 phy->wkupclk = devm_clk_get(phy->dev, "usb_phy_cm_clk32k");
                 if (IS_ERR(phy->wkupclk)) {
                         dev_err(&pdev->dev, "unable to get usb_phy_cm_clk32k\n");
+                       pm_runtime_disable(phy->dev);
                         return PTR_ERR(phy->wkupclk);
                 } else {
                         dev_warn(&pdev->dev,
diff --git a/drivers/phy/phy-rcar-gen2.c b/drivers/phy/phy-rcar-gen2.c

index 778276aba3aa0092d8e8e7bc2de15eae4f5a5a15..97d45f47d1ade847f9f0d7462d0ae91e91505974 100644 (file)
--- a/drivers/phy/phy-rcar-gen2.c
+++ b/drivers/phy/phy-rcar-gen2.c
@@ -23,7 +23,7 @@
  #define USBHS_LPSTS                    0x02
  #define USBHS_UGCTRL                   0x80
  #define USBHS_UGCTRL2                  0x84
-#define USBHS_UGSTS                    0x88    /* The manuals have 0x90 */
+#define USBHS_UGSTS                    0x88    /* From technical update */
  
  /* Low Power Status register (LPSTS) */
  #define USBHS_LPSTS_SUSPM              0x4000
@@ -41,7 +41,7 @@
  #define USBHS_UGCTRL2_USB0SEL_HS_USB   0x00000030
  
  /* USB General status register (UGSTS) */
-#define USBHS_UGSTS_LOCK               0x00000300 /* The manuals have 0x3 */
+#define USBHS_UGSTS_LOCK               0x00000100 /* From technical update */
  
  #define PHYS_PER_CHANNEL       2
  
diff --git a/drivers/soc/mediatek/Kconfig b/drivers/soc/mediatek/Kconfig

index bcdb22d5e215c9a393ccabe58f4f94ef132e3516..3c1850332a90212798ab5030554bc8fad39d9796 100644 (file)
--- a/drivers/soc/mediatek/Kconfig
+++ b/drivers/soc/mediatek/Kconfig
@@ -4,6 +4,7 @@
  config MTK_PMIC_WRAP
         tristate "MediaTek PMIC Wrapper Support"
         depends on ARCH_MEDIATEK
+       depends on RESET_CONTROLLER
         select REGMAP
         help
           Say yes here to add support for MediaTek PMIC Wrapper found
diff --git a/drivers/soc/mediatek/mtk-pmic-wrap.c b/drivers/soc/mediatek/mtk-pmic-wrap.c

index db5be1eec54c8db3977ea810e13c5470f416aaa7..f432291feee91e4b7c7b5ce3cc84f3b130933309 100644 (file)
--- a/drivers/soc/mediatek/mtk-pmic-wrap.c
+++ b/drivers/soc/mediatek/mtk-pmic-wrap.c
@@ -443,11 +443,6 @@ static int pwrap_wait_for_state(struct pmic_wrapper *wrp,
  static int pwrap_write(struct pmic_wrapper *wrp, u32 adr, u32 wdata)
  {
         int ret;
-       u32 val;
-
-       val = pwrap_readl(wrp, PWRAP_WACS2_RDATA);
-       if (PWRAP_GET_WACS_FSM(val) == PWRAP_WACS_FSM_WFVLDCLR)
-               pwrap_writel(wrp, 1, PWRAP_WACS2_VLDCLR);
  
         ret = pwrap_wait_for_state(wrp, pwrap_is_fsm_idle);
         if (ret)
@@ -462,11 +457,6 @@ static int pwrap_write(struct pmic_wrapper *wrp, u32 adr, u32 wdata)
  static int pwrap_read(struct pmic_wrapper *wrp, u32 adr, u32 *rdata)
  {
         int ret;
-       u32 val;
-
-       val = pwrap_readl(wrp, PWRAP_WACS2_RDATA);
-       if (PWRAP_GET_WACS_FSM(val) == PWRAP_WACS_FSM_WFVLDCLR)
-               pwrap_writel(wrp, 1, PWRAP_WACS2_VLDCLR);
  
         ret = pwrap_wait_for_state(wrp, pwrap_is_fsm_idle);
         if (ret)
@@ -480,6 +470,8 @@ static int pwrap_read(struct pmic_wrapper *wrp, u32 adr, u32 *rdata)
  
         *rdata = PWRAP_GET_WACS_RDATA(pwrap_readl(wrp, PWRAP_WACS2_RDATA));
  
+       pwrap_writel(wrp, 1, PWRAP_WACS2_VLDCLR);
+
         return 0;
  }
  
@@ -563,45 +555,17 @@ static int pwrap_init_sidly(struct pmic_wrapper *wrp)
  
  static int pwrap_init_reg_clock(struct pmic_wrapper *wrp)
  {
-       unsigned long rate_spi;
-       int ck_mhz;
-
-       rate_spi = clk_get_rate(wrp->clk_spi);
-
-       if (rate_spi > 26000000)
-               ck_mhz = 26;
-       else if (rate_spi > 18000000)
-               ck_mhz = 18;
-       else
-               ck_mhz = 0;
-
-       switch (ck_mhz) {
-       case 18:
-               if (pwrap_is_mt8135(wrp))
-                       pwrap_writel(wrp, 0xc, PWRAP_CSHEXT);
-               pwrap_writel(wrp, 0x4, PWRAP_CSHEXT_WRITE);
-               pwrap_writel(wrp, 0xc, PWRAP_CSHEXT_READ);
-               pwrap_writel(wrp, 0x0, PWRAP_CSLEXT_START);
-               pwrap_writel(wrp, 0x0, PWRAP_CSLEXT_END);
-               break;
-       case 26:
-               if (pwrap_is_mt8135(wrp))
-                       pwrap_writel(wrp, 0x4, PWRAP_CSHEXT);
+       if (pwrap_is_mt8135(wrp)) {
+               pwrap_writel(wrp, 0x4, PWRAP_CSHEXT);
                 pwrap_writel(wrp, 0x0, PWRAP_CSHEXT_WRITE);
                 pwrap_writel(wrp, 0x4, PWRAP_CSHEXT_READ);
                 pwrap_writel(wrp, 0x0, PWRAP_CSLEXT_START);
                 pwrap_writel(wrp, 0x0, PWRAP_CSLEXT_END);
-               break;
-       case 0:
-               if (pwrap_is_mt8135(wrp))
-                       pwrap_writel(wrp, 0xf, PWRAP_CSHEXT);
-               pwrap_writel(wrp, 0xf, PWRAP_CSHEXT_WRITE);
-               pwrap_writel(wrp, 0xf, PWRAP_CSHEXT_READ);
-               pwrap_writel(wrp, 0xf, PWRAP_CSLEXT_START);
-               pwrap_writel(wrp, 0xf, PWRAP_CSLEXT_END);
-               break;
-       default:
-               return -EINVAL;
+       } else {
+               pwrap_writel(wrp, 0x0, PWRAP_CSHEXT_WRITE);
+               pwrap_writel(wrp, 0x4, PWRAP_CSHEXT_READ);
+               pwrap_writel(wrp, 0x2, PWRAP_CSLEXT_START);
+               pwrap_writel(wrp, 0x2, PWRAP_CSLEXT_END);
         }
  
         return 0;
diff --git a/drivers/ssb/driver_chipcommon_pmu.c b/drivers/ssb/driver_chipcommon_pmu.c

index 09428412139e399979537da2e6272eda827a8757..c5352ea4821ea0df593c7043ac911ee891f103b0 100644 (file)
--- a/drivers/ssb/driver_chipcommon_pmu.c
+++ b/drivers/ssb/driver_chipcommon_pmu.c
@@ -621,8 +621,8 @@ static u32 ssb_pmu_get_alp_clock_clk0(struct ssb_chipcommon *cc)
         u32 crystalfreq;
         const struct pmu0_plltab_entry *e = NULL;
  
-       crystalfreq = chipco_read32(cc, SSB_CHIPCO_PMU_CTL) &
-                     SSB_CHIPCO_PMU_CTL_XTALFREQ >> SSB_CHIPCO_PMU_CTL_XTALFREQ_SHIFT;
+       crystalfreq = (chipco_read32(cc, SSB_CHIPCO_PMU_CTL) &
+                      SSB_CHIPCO_PMU_CTL_XTALFREQ)  >> SSB_CHIPCO_PMU_CTL_XTALFREQ_SHIFT;
         e = pmu0_plltab_find_entry(crystalfreq);
         BUG_ON(!e);
         return e->freq * 1000;
@@ -634,7 +634,7 @@ u32 ssb_pmu_get_alp_clock(struct ssb_chipcommon *cc)
  
         switch (bus->chip_id) {
         case 0x5354:
-               ssb_pmu_get_alp_clock_clk0(cc);
+               return ssb_pmu_get_alp_clock_clk0(cc);
         default:
                 ssb_err("ERROR: PMU alp clock unknown for device %04X\n",
                         bus->chip_id);
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h b/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h

index 3925db160650ca5d96df0880331717188a2ce23f..513c81f43d6e87926a2d368fc023525a6a37b338 100644 (file)
--- a/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h
@@ -189,22 +189,7 @@ static inline int ll_quota_off(struct super_block *sb, int off, int remount)
  #endif
  
  
-
-/*
- * After 3.1, kernel's nameidata.intent.open.flags is different
- * with lustre's lookup_intent.it_flags, as lustre's it_flags'
- * lower bits equal to FMODE_xxx while kernel doesn't transliterate
- * lower bits of nameidata.intent.open.flags to FMODE_xxx.
- * */
  #include <linux/version.h>
-static inline int ll_namei_to_lookup_intent_flag(int flag)
-{
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 1, 0)
-       flag = (flag & ~O_ACCMODE) | OPEN_FMODE(flag);
-#endif
-       return flag;
-}
-
  #include <linux/fs.h>
  
  # define ll_umode_t    umode_t
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c

index cc3ab351943e195a454d33d3f8d8529694f5ac0d..f9262243f9359aa2b9ae0ddd00a9f5c0937cbe6f 100644 (file)
--- a/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c
@@ -87,7 +87,7 @@ static void cfs_cpu_core_siblings(int cpu, cpumask_t *mask)
  /* return cpumask of HTs in the same core */
  static void cfs_cpu_ht_siblings(int cpu, cpumask_t *mask)
  {
-       cpumask_copy(mask, topology_thread_cpumask(cpu));
+       cpumask_copy(mask, topology_sibling_cpumask(cpu));
  }
  
  static void cfs_node_to_cpumask(int node, cpumask_t *mask)
diff --git a/drivers/staging/lustre/lustre/llite/llite_internal.h b/drivers/staging/lustre/lustre/llite/llite_internal.h

index 5f918e3c4683ddac82c864d1d45f7c5700d20282..528af9011653d199f77dddb9926168acbfa96fef 100644 (file)
--- a/drivers/staging/lustre/lustre/llite/llite_internal.h
+++ b/drivers/staging/lustre/lustre/llite/llite_internal.h
@@ -57,12 +57,6 @@
  #define VM_FAULT_RETRY 0
  #endif
  
-/* Kernel 3.1 kills LOOKUP_CONTINUE, LOOKUP_PARENT is equivalent to it.
- * seem kernel commit 49084c3bb2055c401f3493c13edae14d49128ca0 */
-#ifndef LOOKUP_CONTINUE
-#define LOOKUP_CONTINUE LOOKUP_PARENT
-#endif
-
  /** Only used on client-side for indicating the tail of dir hash/offset. */
  #define LL_DIR_END_OFF   0x7fffffffffffffffULL
  #define LL_DIR_END_OFF_32BIT    0x7fffffffUL
diff --git a/drivers/staging/lustre/lustre/llite/symlink.c b/drivers/staging/lustre/lustre/llite/symlink.c

index 3711e671a4dfaa21af87d19857416f8f0f631734..69b203651905e93f77149754a5b9d6a021b6bf32 100644 (file)
--- a/drivers/staging/lustre/lustre/llite/symlink.c
+++ b/drivers/staging/lustre/lustre/llite/symlink.c
@@ -118,7 +118,7 @@ failed:
         return rc;
  }
  
-static void *ll_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *ll_follow_link(struct dentry *dentry, void **cookie)
  {
         struct inode *inode = d_inode(dentry);
         struct ptlrpc_request *request = NULL;
@@ -126,32 +126,22 @@ static void *ll_follow_link(struct dentry *dentry, struct nameidata *nd)
         char *symname = NULL;
  
         CDEBUG(D_VFSTRACE, "VFS Op\n");
-       /* Limit the recursive symlink depth to 5 instead of default
-        * 8 links when kernel has 4k stack to prevent stack overflow.
-        * For 8k stacks we need to limit it to 7 for local servers. */
-       if (THREAD_SIZE < 8192 && current->link_count >= 6) {
-               rc = -ELOOP;
-       } else if (THREAD_SIZE == 8192 && current->link_count >= 8) {
-               rc = -ELOOP;
-       } else {
-               ll_inode_size_lock(inode);
-               rc = ll_readlink_internal(inode, &request, &symname);
-               ll_inode_size_unlock(inode);
-       }
+       ll_inode_size_lock(inode);
+       rc = ll_readlink_internal(inode, &request, &symname);
+       ll_inode_size_unlock(inode);
         if (rc) {
                 ptlrpc_req_finished(request);
-               request = NULL;
-               symname = ERR_PTR(rc);
+               return ERR_PTR(rc);
         }
  
-       nd_set_link(nd, symname);
         /* symname may contain a pointer to the request message buffer,
          * we delay request releasing until ll_put_link then.
          */
-       return request;
+       *cookie = request;
+       return symname;
  }
  
-static void ll_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
+static void ll_put_link(struct inode *unused, void *cookie)
  {
         ptlrpc_req_finished(cookie);
  }
diff --git a/drivers/staging/lustre/lustre/ptlrpc/service.c b/drivers/staging/lustre/lustre/ptlrpc/service.c

index 8e61421515cb689e205681257fe634f9baeb72bc..344189ac5698a575accbaf7afc5f1ec940651240 100644 (file)
--- a/drivers/staging/lustre/lustre/ptlrpc/service.c
+++ b/drivers/staging/lustre/lustre/ptlrpc/service.c
@@ -557,7 +557,7 @@ ptlrpc_server_nthreads_check(struct ptlrpc_service *svc,
                  * there are.
                  */
                 /* weight is # of HTs */
-               if (cpumask_weight(topology_thread_cpumask(0)) > 1) {
+               if (cpumask_weight(topology_sibling_cpumask(0)) > 1) {
                         /* depress thread factor for hyper-thread */
                         factor = factor - (factor >> 1) + (factor >> 3);
                 }
@@ -2768,7 +2768,7 @@ int ptlrpc_hr_init(void)
  
         init_waitqueue_head(&ptlrpc_hr.hr_waitq);
  
-       weight = cpumask_weight(topology_thread_cpumask(0));
+       weight = cpumask_weight(topology_sibling_cpumask(0));
  
         cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) {
                 hrp->hrp_cpt = i;
diff --git a/drivers/staging/ozwpan/ozhcd.c b/drivers/staging/ozwpan/ozhcd.c

index 5ff4716b72c311485084005b9e09a36021157530..784b5ecfa8493ba07d8ba90cde1b11b2b6a4b6b7 100644 (file)
--- a/drivers/staging/ozwpan/ozhcd.c
+++ b/drivers/staging/ozwpan/ozhcd.c
@@ -746,8 +746,8 @@ void oz_hcd_pd_reset(void *hpd, void *hport)
  /*
   * Context: softirq
   */
-void oz_hcd_get_desc_cnf(void *hport, u8 req_id, int status, const u8 *desc,
-                       int length, int offset, int total_size)
+void oz_hcd_get_desc_cnf(void *hport, u8 req_id, u8 status, const u8 *desc,
+                       u8 length, u16 offset, u16 total_size)
  {
         struct oz_port *port = hport;
         struct urb *urb;
@@ -759,8 +759,8 @@ void oz_hcd_get_desc_cnf(void *hport, u8 req_id, int status, const u8 *desc,
         if (!urb)
                 return;
         if (status == 0) {
-               int copy_len;
-               int required_size = urb->transfer_buffer_length;
+               unsigned int copy_len;
+               unsigned int required_size = urb->transfer_buffer_length;
  
                 if (required_size > total_size)
                         required_size = total_size;
diff --git a/drivers/staging/ozwpan/ozusbif.h b/drivers/staging/ozwpan/ozusbif.h

index 4249fa37401289c4caf1f4cae4d46dba321f276b..d2a6085345bec8c2e927115389efc46bfbad3019 100644 (file)
--- a/drivers/staging/ozwpan/ozusbif.h
+++ b/drivers/staging/ozwpan/ozusbif.h
@@ -29,8 +29,8 @@ void oz_usb_request_heartbeat(void *hpd);
  
  /* Confirmation functions.
   */
-void oz_hcd_get_desc_cnf(void *hport, u8 req_id, int status,
-       const u8 *desc, int length, int offset, int total_size);
+void oz_hcd_get_desc_cnf(void *hport, u8 req_id, u8 status,
+       const u8 *desc, u8 length, u16 offset, u16 total_size);
  void oz_hcd_control_cnf(void *hport, u8 req_id, u8 rcode,
         const u8 *data, int data_len);
  
diff --git a/drivers/staging/ozwpan/ozusbsvc1.c b/drivers/staging/ozwpan/ozusbsvc1.c

index d434d8c6fff67c04b58d6cac5c76a6832bae5bc3..f660bb198c65534a6cbe8183d3f5d0a30a532eb1 100644 (file)
--- a/drivers/staging/ozwpan/ozusbsvc1.c
+++ b/drivers/staging/ozwpan/ozusbsvc1.c
@@ -326,7 +326,11 @@ static void oz_usb_handle_ep_data(struct oz_usb_ctx *usb_ctx,
                         struct oz_multiple_fixed *body =
                                 (struct oz_multiple_fixed *)data_hdr;
                         u8 *data = body->data;
-                       int n = (len - sizeof(struct oz_multiple_fixed)+1)
+                       unsigned int n;
+                       if (!body->unit_size ||
+                               len < sizeof(struct oz_multiple_fixed) - 1)
+                               break;
+                       n = (len - (sizeof(struct oz_multiple_fixed) - 1))
                                 / body->unit_size;
                         while (n--) {
                                 oz_hcd_data_ind(usb_ctx->hport, body->endpoint,
@@ -390,10 +394,15 @@ void oz_usb_rx(struct oz_pd *pd, struct oz_elt *elt)
         case OZ_GET_DESC_RSP: {
                         struct oz_get_desc_rsp *body =
                                 (struct oz_get_desc_rsp *)usb_hdr;
-                       int data_len = elt->length -
-                                       sizeof(struct oz_get_desc_rsp) + 1;
-                       u16 offs = le16_to_cpu(get_unaligned(&body->offset));
-                       u16 total_size =
+                       u16 offs, total_size;
+                       u8 data_len;
+
+                       if (elt->length < sizeof(struct oz_get_desc_rsp) - 1)
+                               break;
+                       data_len = elt->length -
+                                       (sizeof(struct oz_get_desc_rsp) - 1);
+                       offs = le16_to_cpu(get_unaligned(&body->offset));
+                       total_size =
                                 le16_to_cpu(get_unaligned(&body->total_size));
                         oz_dbg(ON, "USB_REQ_GET_DESCRIPTOR - cnf\n");
                         oz_hcd_get_desc_cnf(usb_ctx->hport, body->req_id,
diff --git a/drivers/staging/rtl8712/rtl8712_led.c b/drivers/staging/rtl8712/rtl8712_led.c

index f1d47a0676c3e3ba29ea974754c77e8a32a3f950..ada8d5dafd492e97a1b4d9457d25e4a485e67556 100644 (file)
--- a/drivers/staging/rtl8712/rtl8712_led.c
+++ b/drivers/staging/rtl8712/rtl8712_led.c
@@ -898,11 +898,11 @@ static void SwLedControlMode1(struct _adapter *padapter,
                           IS_LED_WPS_BLINKING(pLed))
                                 return;
                         if (pLed->bLedLinkBlinkInProgress == true) {
-                               del_timer_sync(&pLed->BlinkTimer);
+                               del_timer(&pLed->BlinkTimer);
                                 pLed->bLedLinkBlinkInProgress = false;
                         }
                         if (pLed->bLedBlinkInProgress == true) {
-                               del_timer_sync(&pLed->BlinkTimer);
+                               del_timer(&pLed->BlinkTimer);
                                 pLed->bLedBlinkInProgress = false;
                         }
                         pLed->bLedNoLinkBlinkInProgress = true;
@@ -921,11 +921,11 @@ static void SwLedControlMode1(struct _adapter *padapter,
                             IS_LED_WPS_BLINKING(pLed))
                                 return;
                         if (pLed->bLedNoLinkBlinkInProgress == true) {
-                               del_timer_sync(&pLed->BlinkTimer);
+                               del_timer(&pLed->BlinkTimer);
                                 pLed->bLedNoLinkBlinkInProgress = false;
                         }
                         if (pLed->bLedBlinkInProgress == true) {
-                               del_timer_sync(&pLed->BlinkTimer);
+                               del_timer(&pLed->BlinkTimer);
                                 pLed->bLedBlinkInProgress = false;
                         }
                         pLed->bLedLinkBlinkInProgress = true;
@@ -946,15 +946,15 @@ static void SwLedControlMode1(struct _adapter *padapter,
                         if (IS_LED_WPS_BLINKING(pLed))
                                 return;
                         if (pLed->bLedNoLinkBlinkInProgress == true) {
-                               del_timer_sync(&pLed->BlinkTimer);
+                               del_timer(&pLed->BlinkTimer);
                                 pLed->bLedNoLinkBlinkInProgress = false;
                         }
                         if (pLed->bLedLinkBlinkInProgress == true) {
-                               del_timer_sync(&pLed->BlinkTimer);
+                               del_timer(&pLed->BlinkTimer);
                                  pLed->bLedLinkBlinkInProgress = false;
                         }
                         if (pLed->bLedBlinkInProgress == true) {
-                               del_timer_sync(&pLed->BlinkTimer);
+                               del_timer(&pLed->BlinkTimer);
                                 pLed->bLedBlinkInProgress = false;
                         }
                         pLed->bLedScanBlinkInProgress = true;
@@ -975,11 +975,11 @@ static void SwLedControlMode1(struct _adapter *padapter,
                             IS_LED_WPS_BLINKING(pLed))
                                 return;
                         if (pLed->bLedNoLinkBlinkInProgress == true) {
-                               del_timer_sync(&pLed->BlinkTimer);
+                               del_timer(&pLed->BlinkTimer);
                                 pLed->bLedNoLinkBlinkInProgress = false;
                         }
                         if (pLed->bLedLinkBlinkInProgress == true) {
-                               del_timer_sync(&pLed->BlinkTimer);
+                               del_timer(&pLed->BlinkTimer);
                                 pLed->bLedLinkBlinkInProgress = false;
                         }
                         pLed->bLedBlinkInProgress = true;
@@ -998,19 +998,19 @@ static void SwLedControlMode1(struct _adapter *padapter,
         case LED_CTL_START_WPS_BOTTON:
                  if (pLed->bLedWPSBlinkInProgress == false) {
                         if (pLed->bLedNoLinkBlinkInProgress == true) {
-                               del_timer_sync(&pLed->BlinkTimer);
+                               del_timer(&pLed->BlinkTimer);
                                 pLed->bLedNoLinkBlinkInProgress = false;
                         }
                         if (pLed->bLedLinkBlinkInProgress == true) {
-                               del_timer_sync(&pLed->BlinkTimer);
+                               del_timer(&pLed->BlinkTimer);
                                  pLed->bLedLinkBlinkInProgress = false;
                         }
                         if (pLed->bLedBlinkInProgress == true) {
-                               del_timer_sync(&pLed->BlinkTimer);
+                               del_timer(&pLed->BlinkTimer);
                                 pLed->bLedBlinkInProgress = false;
                         }
                         if (pLed->bLedScanBlinkInProgress == true) {
-                               del_timer_sync(&pLed->BlinkTimer);
+                               del_timer(&pLed->BlinkTimer);
                                 pLed->bLedScanBlinkInProgress = false;
                         }
                         pLed->bLedWPSBlinkInProgress = true;
@@ -1025,23 +1025,23 @@ static void SwLedControlMode1(struct _adapter *padapter,
                 break;
         case LED_CTL_STOP_WPS:
                 if (pLed->bLedNoLinkBlinkInProgress == true) {
-                       del_timer_sync(&pLed->BlinkTimer);
+                       del_timer(&pLed->BlinkTimer);
                         pLed->bLedNoLinkBlinkInProgress = false;
                 }
                 if (pLed->bLedLinkBlinkInProgress == true) {
-                       del_timer_sync(&pLed->BlinkTimer);
+                       del_timer(&pLed->BlinkTimer);
                          pLed->bLedLinkBlinkInProgress = false;
                 }
                 if (pLed->bLedBlinkInProgress == true) {
-                       del_timer_sync(&pLed->BlinkTimer);
+                       del_timer(&pLed->BlinkTimer);
                         pLed->bLedBlinkInProgress = false;
                 }
                 if (pLed->bLedScanBlinkInProgress == true) {
-                       del_timer_sync(&pLed->BlinkTimer);
+                       del_timer(&pLed->BlinkTimer);
                         pLed->bLedScanBlinkInProgress = false;
                 }
                 if (pLed->bLedWPSBlinkInProgress)
-                       del_timer_sync(&pLed->BlinkTimer);
+                       del_timer(&pLed->BlinkTimer);
                 else
                         pLed->bLedWPSBlinkInProgress = true;
                 pLed->CurrLedState = LED_BLINK_WPS_STOP;
@@ -1057,7 +1057,7 @@ static void SwLedControlMode1(struct _adapter *padapter,
                 break;
         case LED_CTL_STOP_WPS_FAIL:
                 if (pLed->bLedWPSBlinkInProgress) {
-                       del_timer_sync(&pLed->BlinkTimer);
+                       del_timer(&pLed->BlinkTimer);
                         pLed->bLedWPSBlinkInProgress = false;
                 }
                 pLed->bLedNoLinkBlinkInProgress = true;
@@ -1073,23 +1073,23 @@ static void SwLedControlMode1(struct _adapter *padapter,
                 pLed->CurrLedState = LED_OFF;
                 pLed->BlinkingLedState = LED_OFF;
                 if (pLed->bLedNoLinkBlinkInProgress) {
-                       del_timer_sync(&pLed->BlinkTimer);
+                       del_timer(&pLed->BlinkTimer);
                         pLed->bLedNoLinkBlinkInProgress = false;
                 }
                 if (pLed->bLedLinkBlinkInProgress) {
-                       del_timer_sync(&pLed->BlinkTimer);
+                       del_timer(&pLed->BlinkTimer);
                         pLed->bLedLinkBlinkInProgress = false;
                 }
                 if (pLed->bLedBlinkInProgress) {
-                       del_timer_sync(&pLed->BlinkTimer);
+                       del_timer(&pLed->BlinkTimer);
                         pLed->bLedBlinkInProgress = false;
                 }
                 if (pLed->bLedWPSBlinkInProgress) {
-                       del_timer_sync(&pLed->BlinkTimer);
+                       del_timer(&pLed->BlinkTimer);
                         pLed->bLedWPSBlinkInProgress = false;
                 }
                 if (pLed->bLedScanBlinkInProgress) {
-                       del_timer_sync(&pLed->BlinkTimer);
+                       del_timer(&pLed->BlinkTimer);
                         pLed->bLedScanBlinkInProgress = false;
                 }
                 mod_timer(&pLed->BlinkTimer,
@@ -1116,7 +1116,7 @@ static void SwLedControlMode2(struct _adapter *padapter,
                                 return;
  
                         if (pLed->bLedBlinkInProgress == true) {
-                               del_timer_sync(&pLed->BlinkTimer);
+                               del_timer(&pLed->BlinkTimer);
                                 pLed->bLedBlinkInProgress = false;
                         }
                         pLed->bLedScanBlinkInProgress = true;
@@ -1154,11 +1154,11 @@ static void SwLedControlMode2(struct _adapter *padapter,
                 pLed->CurrLedState = LED_ON;
                 pLed->BlinkingLedState = LED_ON;
                 if (pLed->bLedBlinkInProgress) {
-                       del_timer_sync(&pLed->BlinkTimer);
+                       del_timer(&pLed->BlinkTimer);
                         pLed->bLedBlinkInProgress = false;
                 }
                 if (pLed->bLedScanBlinkInProgress) {
-                       del_timer_sync(&pLed->BlinkTimer);
+                       del_timer(&pLed->BlinkTimer);
                         pLed->bLedScanBlinkInProgress = false;
                 }
  
@@ -1170,11 +1170,11 @@ static void SwLedControlMode2(struct _adapter *padapter,
         case LED_CTL_START_WPS_BOTTON:
                 if (pLed->bLedWPSBlinkInProgress == false) {
                         if (pLed->bLedBlinkInProgress == true) {
-                               del_timer_sync(&pLed->BlinkTimer);
+                               del_timer(&pLed->BlinkTimer);
                                 pLed->bLedBlinkInProgress = false;
                         }
                         if (pLed->bLedScanBlinkInProgress == true) {
-                               del_timer_sync(&pLed->BlinkTimer);
+                               del_timer(&pLed->BlinkTimer);
                                 pLed->bLedScanBlinkInProgress = false;
                         }
                         pLed->bLedWPSBlinkInProgress = true;
@@ -1214,15 +1214,15 @@ static void SwLedControlMode2(struct _adapter *padapter,
                 pLed->CurrLedState = LED_OFF;
                 pLed->BlinkingLedState = LED_OFF;
                 if (pLed->bLedBlinkInProgress) {
-                       del_timer_sync(&pLed->BlinkTimer);
+                       del_timer(&pLed->BlinkTimer);
                         pLed->bLedBlinkInProgress = false;
                 }
                 if (pLed->bLedScanBlinkInProgress) {
-                       del_timer_sync(&pLed->BlinkTimer);
+                       del_timer(&pLed->BlinkTimer);
                         pLed->bLedScanBlinkInProgress = false;
                 }
                 if (pLed->bLedWPSBlinkInProgress) {
-                       del_timer_sync(&pLed->BlinkTimer);
+                       del_timer(&pLed->BlinkTimer);
                         pLed->bLedWPSBlinkInProgress = false;
                 }
                 mod_timer(&pLed->BlinkTimer,
@@ -1248,7 +1248,7 @@ static void SwLedControlMode3(struct _adapter *padapter,
                         if (IS_LED_WPS_BLINKING(pLed))
                                 return;
                         if (pLed->bLedBlinkInProgress == true) {
-                               del_timer_sync(&pLed->BlinkTimer);
+                               del_timer(&pLed->BlinkTimer);
                                 pLed->bLedBlinkInProgress = false;
                         }
                         pLed->bLedScanBlinkInProgress = true;
@@ -1286,11 +1286,11 @@ static void SwLedControlMode3(struct _adapter *padapter,
                 pLed->CurrLedState = LED_ON;
                 pLed->BlinkingLedState = LED_ON;
                 if (pLed->bLedBlinkInProgress) {
-                       del_timer_sync(&pLed->BlinkTimer);
+                       del_timer(&pLed->BlinkTimer);
                         pLed->bLedBlinkInProgress = false;
                 }
                 if (pLed->bLedScanBlinkInProgress) {
-                       del_timer_sync(&pLed->BlinkTimer);
+                       del_timer(&pLed->BlinkTimer);
                         pLed->bLedScanBlinkInProgress = false;
                 }
                 mod_timer(&pLed->BlinkTimer,
@@ -1300,11 +1300,11 @@ static void SwLedControlMode3(struct _adapter *padapter,
         case LED_CTL_START_WPS_BOTTON:
                 if (pLed->bLedWPSBlinkInProgress == false) {
                         if (pLed->bLedBlinkInProgress == true) {
-                               del_timer_sync(&pLed->BlinkTimer);
+                               del_timer(&pLed->BlinkTimer);
                                 pLed->bLedBlinkInProgress = false;
                         }
                         if (pLed->bLedScanBlinkInProgress == true) {
-                               del_timer_sync(&pLed->BlinkTimer);
+                               del_timer(&pLed->BlinkTimer);
                                 pLed->bLedScanBlinkInProgress = false;
                         }
                         pLed->bLedWPSBlinkInProgress = true;
@@ -1319,7 +1319,7 @@ static void SwLedControlMode3(struct _adapter *padapter,
                 break;
         case LED_CTL_STOP_WPS:
                 if (pLed->bLedWPSBlinkInProgress) {
-                       del_timer_sync(&(pLed->BlinkTimer));
+                       del_timer(&pLed->BlinkTimer);
                         pLed->bLedWPSBlinkInProgress = false;
                 } else
                         pLed->bLedWPSBlinkInProgress = true;
@@ -1336,7 +1336,7 @@ static void SwLedControlMode3(struct _adapter *padapter,
                 break;
         case LED_CTL_STOP_WPS_FAIL:
                 if (pLed->bLedWPSBlinkInProgress) {
-                       del_timer_sync(&pLed->BlinkTimer);
+                       del_timer(&pLed->BlinkTimer);
                         pLed->bLedWPSBlinkInProgress = false;
                 }
                 pLed->CurrLedState = LED_OFF;
@@ -1357,15 +1357,15 @@ static void SwLedControlMode3(struct _adapter *padapter,
                 pLed->CurrLedState = LED_OFF;
                 pLed->BlinkingLedState = LED_OFF;
                 if (pLed->bLedBlinkInProgress) {
-                       del_timer_sync(&pLed->BlinkTimer);
+                       del_timer(&pLed->BlinkTimer);
                         pLed->bLedBlinkInProgress = false;
                 }
                 if (pLed->bLedScanBlinkInProgress) {
-                       del_timer_sync(&pLed->BlinkTimer);
+                       del_timer(&pLed->BlinkTimer);
                         pLed->bLedScanBlinkInProgress = false;
                 }
                 if (pLed->bLedWPSBlinkInProgress) {
-                       del_timer_sync(&pLed->BlinkTimer);
+                       del_timer(&pLed->BlinkTimer);
                         pLed->bLedWPSBlinkInProgress = false;
                 }
                 mod_timer(&pLed->BlinkTimer,
@@ -1388,7 +1388,7 @@ static void SwLedControlMode4(struct _adapter *padapter,
         case LED_CTL_START_TO_LINK:
                 if (pLed1->bLedWPSBlinkInProgress) {
                         pLed1->bLedWPSBlinkInProgress = false;
-                       del_timer_sync(&pLed1->BlinkTimer);
+                       del_timer(&pLed1->BlinkTimer);
                         pLed1->BlinkingLedState = LED_OFF;
                         pLed1->CurrLedState = LED_OFF;
                         if (pLed1->bLedOn)
@@ -1400,11 +1400,11 @@ static void SwLedControlMode4(struct _adapter *padapter,
                             IS_LED_WPS_BLINKING(pLed))
                                 return;
                         if (pLed->bLedBlinkInProgress == true) {
-                               del_timer_sync(&pLed->BlinkTimer);
+                               del_timer(&pLed->BlinkTimer);
                                 pLed->bLedBlinkInProgress = false;
                         }
                         if (pLed->bLedNoLinkBlinkInProgress == true) {
-                               del_timer_sync(&pLed->BlinkTimer);
+                               del_timer(&pLed->BlinkTimer);
                                 pLed->bLedNoLinkBlinkInProgress = false;
                         }
                         pLed->bLedStartToLinkBlinkInProgress = true;
@@ -1426,7 +1426,7 @@ static void SwLedControlMode4(struct _adapter *padapter,
                 if (LedAction == LED_CTL_LINK) {
                         if (pLed1->bLedWPSBlinkInProgress) {
                                 pLed1->bLedWPSBlinkInProgress = false;
-                               del_timer_sync(&pLed1->BlinkTimer);
+                               del_timer(&pLed1->BlinkTimer);
                                 pLed1->BlinkingLedState = LED_OFF;
                                 pLed1->CurrLedState = LED_OFF;
                                 if (pLed1->bLedOn)
@@ -1439,7 +1439,7 @@ static void SwLedControlMode4(struct _adapter *padapter,
                             IS_LED_WPS_BLINKING(pLed))
                                 return;
                         if (pLed->bLedBlinkInProgress == true) {
-                               del_timer_sync(&pLed->BlinkTimer);
+                               del_timer(&pLed->BlinkTimer);
                                 pLed->bLedBlinkInProgress = false;
                         }
                         pLed->bLedNoLinkBlinkInProgress = true;
@@ -1460,11 +1460,11 @@ static void SwLedControlMode4(struct _adapter *padapter,
                         if (IS_LED_WPS_BLINKING(pLed))
                                 return;
                         if (pLed->bLedNoLinkBlinkInProgress == true) {
-                               del_timer_sync(&pLed->BlinkTimer);
+                               del_timer(&pLed->BlinkTimer);
                                 pLed->bLedNoLinkBlinkInProgress = false;
                         }
                         if (pLed->bLedBlinkInProgress == true) {
-                               del_timer_sync(&pLed->BlinkTimer);
+                               del_timer(&pLed->BlinkTimer);
                                 pLed->bLedBlinkInProgress = false;
                         }
                         pLed->bLedScanBlinkInProgress = true;
@@ -1485,7 +1485,7 @@ static void SwLedControlMode4(struct _adapter *padapter,
                             IS_LED_WPS_BLINKING(pLed))
                                 return;
                         if (pLed->bLedNoLinkBlinkInProgress == true) {
-                               del_timer_sync(&pLed->BlinkTimer);
+                               del_timer(&pLed->BlinkTimer);
                                 pLed->bLedNoLinkBlinkInProgress = false;
                         }
                         pLed->bLedBlinkInProgress = true;
@@ -1503,7 +1503,7 @@ static void SwLedControlMode4(struct _adapter *padapter,
         case LED_CTL_START_WPS_BOTTON:
                 if (pLed1->bLedWPSBlinkInProgress) {
                         pLed1->bLedWPSBlinkInProgress = false;
-                       del_timer_sync(&(pLed1->BlinkTimer));
+                       del_timer(&pLed1->BlinkTimer);
                         pLed1->BlinkingLedState = LED_OFF;
                         pLed1->CurrLedState = LED_OFF;
                         if (pLed1->bLedOn)
@@ -1512,15 +1512,15 @@ static void SwLedControlMode4(struct _adapter *padapter,
                 }
                 if (pLed->bLedWPSBlinkInProgress == false) {
                         if (pLed->bLedNoLinkBlinkInProgress == true) {
-                               del_timer_sync(&pLed->BlinkTimer);
+                               del_timer(&pLed->BlinkTimer);
                                 pLed->bLedNoLinkBlinkInProgress = false;
                         }
                         if (pLed->bLedBlinkInProgress == true) {
-                               del_timer_sync(&pLed->BlinkTimer);
+                               del_timer(&pLed->BlinkTimer);
                                 pLed->bLedBlinkInProgress = false;
                         }
                         if (pLed->bLedScanBlinkInProgress == true) {
-                               del_timer_sync(&pLed->BlinkTimer);
+                               del_timer(&pLed->BlinkTimer);
                                 pLed->bLedScanBlinkInProgress = false;
                         }
                         pLed->bLedWPSBlinkInProgress = true;
@@ -1538,7 +1538,7 @@ static void SwLedControlMode4(struct _adapter *padapter,
                 break;
         case LED_CTL_STOP_WPS:  /*WPS connect success*/
                 if (pLed->bLedWPSBlinkInProgress) {
-                       del_timer_sync(&pLed->BlinkTimer);
+                       del_timer(&pLed->BlinkTimer);
                         pLed->bLedWPSBlinkInProgress = false;
                 }
                 pLed->bLedNoLinkBlinkInProgress = true;
@@ -1552,7 +1552,7 @@ static void SwLedControlMode4(struct _adapter *padapter,
                 break;
         case LED_CTL_STOP_WPS_FAIL:     /*WPS authentication fail*/
                 if (pLed->bLedWPSBlinkInProgress) {
-                       del_timer_sync(&pLed->BlinkTimer);
+                       del_timer(&pLed->BlinkTimer);
                         pLed->bLedWPSBlinkInProgress = false;
                 }
                 pLed->bLedNoLinkBlinkInProgress = true;
@@ -1565,7 +1565,7 @@ static void SwLedControlMode4(struct _adapter *padapter,
                           msecs_to_jiffies(LED_BLINK_NO_LINK_INTERVAL_ALPHA));
                 /*LED1 settings*/
                 if (pLed1->bLedWPSBlinkInProgress)
-                       del_timer_sync(&pLed1->BlinkTimer);
+                       del_timer(&pLed1->BlinkTimer);
                 else
                         pLed1->bLedWPSBlinkInProgress = true;
                 pLed1->CurrLedState = LED_BLINK_WPS_STOP;
@@ -1578,7 +1578,7 @@ static void SwLedControlMode4(struct _adapter *padapter,
                 break;
         case LED_CTL_STOP_WPS_FAIL_OVERLAP:     /*WPS session overlap*/
                 if (pLed->bLedWPSBlinkInProgress) {
-                       del_timer_sync(&pLed->BlinkTimer);
+                       del_timer(&pLed->BlinkTimer);
                         pLed->bLedWPSBlinkInProgress = false;
                 }
                 pLed->bLedNoLinkBlinkInProgress = true;
@@ -1591,7 +1591,7 @@ static void SwLedControlMode4(struct _adapter *padapter,
                           msecs_to_jiffies(LED_BLINK_NO_LINK_INTERVAL_ALPHA));
                 /*LED1 settings*/
                 if (pLed1->bLedWPSBlinkInProgress)
-                       del_timer_sync(&pLed1->BlinkTimer);
+                       del_timer(&pLed1->BlinkTimer);
                 else
                         pLed1->bLedWPSBlinkInProgress = true;
                 pLed1->CurrLedState = LED_BLINK_WPS_STOP_OVERLAP;
@@ -1607,31 +1607,31 @@ static void SwLedControlMode4(struct _adapter *padapter,
                 pLed->CurrLedState = LED_OFF;
                 pLed->BlinkingLedState = LED_OFF;
                 if (pLed->bLedNoLinkBlinkInProgress) {
-                       del_timer_sync(&pLed->BlinkTimer);
+                       del_timer(&pLed->BlinkTimer);
                         pLed->bLedNoLinkBlinkInProgress = false;
                 }
                 if (pLed->bLedLinkBlinkInProgress) {
-                       del_timer_sync(&pLed->BlinkTimer);
+                       del_timer(&pLed->BlinkTimer);
                         pLed->bLedLinkBlinkInProgress = false;
                 }
                 if (pLed->bLedBlinkInProgress) {
-                       del_timer_sync(&pLed->BlinkTimer);
+                       del_timer(&pLed->BlinkTimer);
                         pLed->bLedBlinkInProgress = false;
                 }
                 if (pLed->bLedWPSBlinkInProgress) {
-                       del_timer_sync(&pLed->BlinkTimer);
+                       del_timer(&pLed->BlinkTimer);
                         pLed->bLedWPSBlinkInProgress = false;
                 }
                 if (pLed->bLedScanBlinkInProgress) {
-                       del_timer_sync(&pLed->BlinkTimer);
+                       del_timer(&pLed->BlinkTimer);
                         pLed->bLedScanBlinkInProgress = false;
                 }
                 if (pLed->bLedStartToLinkBlinkInProgress) {
-                       del_timer_sync(&pLed->BlinkTimer);
+                       del_timer(&pLed->BlinkTimer);
                         pLed->bLedStartToLinkBlinkInProgress = false;
                 }
                 if (pLed1->bLedWPSBlinkInProgress) {
-                       del_timer_sync(&pLed1->BlinkTimer);
+                       del_timer(&pLed1->BlinkTimer);
                         pLed1->bLedWPSBlinkInProgress = false;
                 }
                 pLed1->BlinkingLedState = LED_UNKNOWN;
@@ -1671,7 +1671,7 @@ static void SwLedControlMode5(struct _adapter *padapter,
                         ; /* dummy branch */
                 else if (pLed->bLedScanBlinkInProgress == false) {
                         if (pLed->bLedBlinkInProgress == true) {
-                               del_timer_sync(&pLed->BlinkTimer);
+                               del_timer(&pLed->BlinkTimer);
                                 pLed->bLedBlinkInProgress = false;
                         }
                         pLed->bLedScanBlinkInProgress = true;
@@ -1705,7 +1705,7 @@ static void SwLedControlMode5(struct _adapter *padapter,
                 pLed->CurrLedState = LED_OFF;
                 pLed->BlinkingLedState = LED_OFF;
                 if (pLed->bLedBlinkInProgress) {
-                       del_timer_sync(&pLed->BlinkTimer);
+                       del_timer(&pLed->BlinkTimer);
                         pLed->bLedBlinkInProgress = false;
                 }
                 SwLedOff(padapter, pLed);
@@ -1756,7 +1756,7 @@ static void SwLedControlMode6(struct _adapter *padapter,
         case LED_CTL_START_WPS_BOTTON:
                 if (pLed->bLedWPSBlinkInProgress == false) {
                         if (pLed->bLedBlinkInProgress == true) {
-                               del_timer_sync(&pLed->BlinkTimer);
+                               del_timer(&pLed->BlinkTimer);
                                 pLed->bLedBlinkInProgress = false;
                         }
                         pLed->bLedWPSBlinkInProgress = true;
@@ -1772,7 +1772,7 @@ static void SwLedControlMode6(struct _adapter *padapter,
         case LED_CTL_STOP_WPS_FAIL:
         case LED_CTL_STOP_WPS:
                 if (pLed->bLedWPSBlinkInProgress) {
-                       del_timer_sync(&pLed->BlinkTimer);
+                       del_timer(&pLed->BlinkTimer);
                         pLed->bLedWPSBlinkInProgress = false;
                 }
                 pLed->CurrLedState = LED_ON;
@@ -1784,11 +1784,11 @@ static void SwLedControlMode6(struct _adapter *padapter,
                 pLed->CurrLedState = LED_OFF;
                 pLed->BlinkingLedState = LED_OFF;
                 if (pLed->bLedBlinkInProgress) {
-                       del_timer_sync(&pLed->BlinkTimer);
+                       del_timer(&pLed->BlinkTimer);
                         pLed->bLedBlinkInProgress = false;
                 }
                 if (pLed->bLedWPSBlinkInProgress) {
-                       del_timer_sync(&pLed->BlinkTimer);
+                       del_timer(&pLed->BlinkTimer);
                         pLed->bLedWPSBlinkInProgress = false;
                 }
                 SwLedOff(padapter, pLed);
diff --git a/drivers/staging/rtl8712/rtl871x_cmd.c b/drivers/staging/rtl8712/rtl871x_cmd.c

index 1a1c38f885d6b191d5a62b5fb1aae26713dd6cb3..e35854d28f90ed96aa3ff149f39175c9e46b1373 100644 (file)
--- a/drivers/staging/rtl8712/rtl871x_cmd.c
+++ b/drivers/staging/rtl8712/rtl871x_cmd.c
@@ -910,7 +910,7 @@ void r8712_createbss_cmd_callback(struct _adapter *padapter,
         if (pcmd->res != H2C_SUCCESS)
                 mod_timer(&pmlmepriv->assoc_timer,
                           jiffies + msecs_to_jiffies(1));
-       del_timer_sync(&pmlmepriv->assoc_timer);
+       del_timer(&pmlmepriv->assoc_timer);
  #ifdef __BIG_ENDIAN
         /* endian_convert */
         pnetwork->Length = le32_to_cpu(pnetwork->Length);
diff --git a/drivers/staging/rtl8712/rtl871x_mlme.c b/drivers/staging/rtl8712/rtl871x_mlme.c

index fb2b195b90af0d1690552dfccb6ec93b13960fdf..c044b0e55ba93d0c989031d52ce99f4008ae0630 100644 (file)
--- a/drivers/staging/rtl8712/rtl871x_mlme.c
+++ b/drivers/staging/rtl8712/rtl871x_mlme.c
@@ -582,7 +582,7 @@ void r8712_surveydone_event_callback(struct _adapter *adapter, u8 *pbuf)
         spin_lock_irqsave(&pmlmepriv->lock, irqL);
  
         if (check_fwstate(pmlmepriv, _FW_UNDER_SURVEY) == true) {
-               del_timer_sync(&pmlmepriv->scan_to_timer);
+               del_timer(&pmlmepriv->scan_to_timer);
  
                 _clr_fwstate_(pmlmepriv, _FW_UNDER_SURVEY);
         }
@@ -696,7 +696,7 @@ void r8712_ind_disconnect(struct _adapter *padapter)
         }
         if (padapter->pwrctrlpriv.pwr_mode !=
             padapter->registrypriv.power_mgnt) {
-               del_timer_sync(&pmlmepriv->dhcp_timer);
+               del_timer(&pmlmepriv->dhcp_timer);
                 r8712_set_ps_mode(padapter, padapter->registrypriv.power_mgnt,
                                   padapter->registrypriv.smart_ps);
         }
@@ -910,7 +910,7 @@ void r8712_joinbss_event_callback(struct _adapter *adapter, u8 *pbuf)
                         if (check_fwstate(pmlmepriv, WIFI_STATION_STATE)
                                 == true)
                                 r8712_indicate_connect(adapter);
-                       del_timer_sync(&pmlmepriv->assoc_timer);
+                       del_timer(&pmlmepriv->assoc_timer);
                 } else
                         goto ignore_joinbss_callback;
         } else {
diff --git a/drivers/staging/rtl8712/rtl871x_pwrctrl.c b/drivers/staging/rtl8712/rtl871x_pwrctrl.c

index aaa584435c87d25d3efb3bbbe794da6cf2096c24..9bc04f474d18d7c79311c8bd6fc80b48015a6550 100644 (file)
--- a/drivers/staging/rtl8712/rtl871x_pwrctrl.c
+++ b/drivers/staging/rtl8712/rtl871x_pwrctrl.c
@@ -103,7 +103,7 @@ void r8712_cpwm_int_hdl(struct _adapter *padapter,
  
         if (pwrpriv->cpwm_tog == ((preportpwrstate->state) & 0x80))
                 return;
-       del_timer_sync(&padapter->pwrctrlpriv.rpwm_check_timer);
+       del_timer(&padapter->pwrctrlpriv.rpwm_check_timer);
         _enter_pwrlock(&pwrpriv->lock);
         pwrpriv->cpwm = (preportpwrstate->state) & 0xf;
         if (pwrpriv->cpwm >= PS_STATE_S2) {
diff --git a/drivers/staging/rtl8712/rtl871x_sta_mgt.c b/drivers/staging/rtl8712/rtl871x_sta_mgt.c

index 7bb96c47f1883dad0c62e8618b2e98ac773fca27..a9b93d0f6f566b83bb00271de37f68dc1716586c 100644 (file)
--- a/drivers/staging/rtl8712/rtl871x_sta_mgt.c
+++ b/drivers/staging/rtl8712/rtl871x_sta_mgt.c
@@ -198,7 +198,7 @@ void r8712_free_stainfo(struct _adapter *padapter, struct sta_info *psta)
          * cancel reordering_ctrl_timer */
         for (i = 0; i < 16; i++) {
                 preorder_ctrl = &psta->recvreorder_ctrl[i];
-               del_timer_sync(&preorder_ctrl->reordering_ctrl_timer);
+               del_timer(&preorder_ctrl->reordering_ctrl_timer);
         }
         spin_lock(&(pfree_sta_queue->lock));
         /* insert into free_sta_queue; 20061114 */
diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c

index cc57a3a6b02b348df95c827fd2c770e59ffca155..396344cb011fd1fafab05c3ddeeff1841e13e055 100644 (file)
--- a/drivers/tty/n_tty.c
+++ b/drivers/tty/n_tty.c
@@ -162,6 +162,17 @@ static inline int tty_put_user(struct tty_struct *tty, unsigned char x,
         return put_user(x, ptr);
  }
  
+/* copy_to_user() wrapper that also feeds the data to tty_audit_add_data(). */
+static inline int tty_copy_to_user(struct tty_struct *tty, void __user *to,
+                                   const void *from, unsigned long n)
+{
+       struct n_tty_data *ldata = tty->disc_data;
+
+       /* Audit the kernel-side bytes; 'to' is a __user pointer. */
+       tty_audit_add_data(tty, (unsigned char *)from, n, ldata->icanon);
+       return copy_to_user(to, from, n);
+}
+
  /**
   *     n_tty_kick_worker - start input worker (if required)
   *     @tty: terminal
@@ -2070,8 +2081,8 @@ static int canon_copy_from_read_buf(struct tty_struct *tty,
  
         size = N_TTY_BUF_SIZE - tail;
         n = eol - tail;
-       if (n > 4096)
-               n += 4096;
+       if (n > N_TTY_BUF_SIZE)
+               n += N_TTY_BUF_SIZE;
         n += found;
         c = n;
  
@@ -2084,12 +2095,12 @@ static int canon_copy_from_read_buf(struct tty_struct *tty,
                     __func__, eol, found, n, c, size, more);
  
         if (n > size) {
-               ret = copy_to_user(*b, read_buf_addr(ldata, tail), size);
+               ret = tty_copy_to_user(tty, *b, read_buf_addr(ldata, tail), size);
                 if (ret)
                         return -EFAULT;
-               ret = copy_to_user(*b + size, ldata->read_buf, n - size);
+               ret = tty_copy_to_user(tty, *b + size, ldata->read_buf, n - size);
         } else
-               ret = copy_to_user(*b, read_buf_addr(ldata, tail), n);
+               ret = tty_copy_to_user(tty, *b, read_buf_addr(ldata, tail), n);
  
         if (ret)
                 return -EFAULT;
diff --git a/drivers/tty/serial/8250/8250_omap.c b/drivers/tty/serial/8250/8250_omap.c

index 9289999cb7c62bb05b2a4b758fa76d5ce9413316..dce1a23706e86531d3caa86ba4b03c36b03bf3cf 100644 (file)
--- a/drivers/tty/serial/8250/8250_omap.c
+++ b/drivers/tty/serial/8250/8250_omap.c
@@ -562,12 +562,36 @@ static irqreturn_t omap_wake_irq(int irq, void *dev_id)
         return IRQ_NONE;
  }
  
+#ifdef CONFIG_SERIAL_8250_DMA
+static int omap_8250_dma_handle_irq(struct uart_port *port);
+#endif
+
+/*
+ * Port IRQ entry point; hands off to the DMA handler when up->dma is set.
+ */
+static irqreturn_t omap8250_irq(int irq, void *dev_id)
+{
+       struct uart_port *uport = dev_id;
+       struct uart_8250_port *up = up_to_u8250p(uport);
+       int rc;
+
+#ifdef CONFIG_SERIAL_8250_DMA
+       if (up->dma)
+               return IRQ_RETVAL(omap_8250_dma_handle_irq(uport));
+#endif
+
+       /* Read IIR and dispatch it between the rpm get/put pair. */
+       serial8250_rpm_get(up);
+       rc = serial8250_handle_irq(uport, serial_port_in(uport, UART_IIR));
+       serial8250_rpm_put(up);
+
+       return IRQ_RETVAL(rc);
+}
+
  static int omap_8250_startup(struct uart_port *port)
  {
-       struct uart_8250_port *up =
-               container_of(port, struct uart_8250_port, port);
+       struct uart_8250_port *up = up_to_u8250p(port);
         struct omap8250_priv *priv = port->private_data;
-
         int ret;
  
         if (priv->wakeirq) {
@@ -580,10 +604,31 @@ static int omap_8250_startup(struct uart_port *port)
  
         pm_runtime_get_sync(port->dev);
  
-       ret = serial8250_do_startup(port);
-       if (ret)
+       up->mcr = 0;
+       serial_out(up, UART_FCR, UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT);
+
+       serial_out(up, UART_LCR, UART_LCR_WLEN8);
+
+       up->lsr_saved_flags = 0;
+       up->msr_saved_flags = 0;
+
+       if (up->dma) {
+               ret = serial8250_request_dma(up);
+               if (ret) {
+                       dev_warn_ratelimited(port->dev,
+                                            "failed to request DMA\n");
+                       up->dma = NULL;
+               }
+       }
+
+       ret = request_irq(port->irq, omap8250_irq, IRQF_SHARED,
+                         dev_name(port->dev), port);
+       if (ret < 0)
                 goto err;
  
+       up->ier = UART_IER_RLSI | UART_IER_RDI;
+       serial_out(up, UART_IER, up->ier);
+
  #ifdef CONFIG_PM
         up->capabilities |= UART_CAP_RPM;
  #endif
@@ -610,8 +655,7 @@ err:
  
  static void omap_8250_shutdown(struct uart_port *port)
  {
-       struct uart_8250_port *up =
-               container_of(port, struct uart_8250_port, port);
+       struct uart_8250_port *up = up_to_u8250p(port);
         struct omap8250_priv *priv = port->private_data;
  
         flush_work(&priv->qos_work);
@@ -621,11 +665,24 @@ static void omap_8250_shutdown(struct uart_port *port)
         pm_runtime_get_sync(port->dev);
  
         serial_out(up, UART_OMAP_WER, 0);
-       serial8250_do_shutdown(port);
+
+       up->ier = 0;
+       serial_out(up, UART_IER, 0);
+
+       if (up->dma)
+               serial8250_release_dma(up);
+
+       /*
+        * Disable break condition and FIFOs
+        */
+       if (up->lcr & UART_LCR_SBC)
+               serial_out(up, UART_LCR, up->lcr & ~UART_LCR_SBC);
+       serial_out(up, UART_FCR, UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT);
  
         pm_runtime_mark_last_busy(port->dev);
         pm_runtime_put_autosuspend(port->dev);
  
+       free_irq(port->irq, port);
         if (priv->wakeirq)
                 free_irq(priv->wakeirq, port);
  }
@@ -974,6 +1031,13 @@ static inline int omap_8250_rx_dma(struct uart_8250_port *p, unsigned int iir)
  }
  #endif
  
+static int omap8250_no_handle_irq(struct uart_port *port)
+{
+       /* Probe-time handle_irq stub: startup requests the real IRQ handler. */
+       WARN_ONCE(1, "Unexpected irq handling before port startup\n");
+       return 0;
+}
+
  static int omap8250_probe(struct platform_device *pdev)
  {
         struct resource *regs = platform_get_resource(pdev, IORESOURCE_MEM, 0);
@@ -1075,6 +1139,7 @@ static int omap8250_probe(struct platform_device *pdev)
         pm_runtime_get_sync(&pdev->dev);
  
         omap_serial_fill_features_erratas(&up, priv);
+       up.port.handle_irq = omap8250_no_handle_irq;
  #ifdef CONFIG_SERIAL_8250_DMA
         if (pdev->dev.of_node) {
                 /*
@@ -1088,7 +1153,6 @@ static int omap8250_probe(struct platform_device *pdev)
                 ret = of_property_count_strings(pdev->dev.of_node, "dma-names");
                 if (ret == 2) {
                         up.dma = &priv->omap8250_dma;
-                       up.port.handle_irq = omap_8250_dma_handle_irq;
                         priv->omap8250_dma.fn = the_no_dma_filter_fn;
                         priv->omap8250_dma.tx_dma = omap_8250_tx_dma;
                         priv->omap8250_dma.rx_dma = omap_8250_rx_dma;
diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c

index 6f5a0720a8c8eead6c23f37c359c516730013cef..763eb20fe3213b6cfda04dc2624bcd1b8638f324 100644 (file)
--- a/drivers/tty/serial/amba-pl011.c
+++ b/drivers/tty/serial/amba-pl011.c
@@ -1249,20 +1249,19 @@ __acquires(&uap->port.lock)
  
  /*
   * Transmit a character
- * There must be at least one free entry in the TX FIFO to accept the char.
   *
- * Returns true if the FIFO might have space in it afterwards;
- * returns false if the FIFO definitely became full.
+ * Returns true if the character was successfully queued to the FIFO.
+ * Returns false otherwise.
   */
  static bool pl011_tx_char(struct uart_amba_port *uap, unsigned char c)
  {
+       if (readw(uap->port.membase + UART01x_FR) & UART01x_FR_TXFF)
+               return false; /* unable to transmit character */
+
         writew(c, uap->port.membase + UART01x_DR);
         uap->port.icount.tx++;
  
-       if (likely(uap->tx_irq_seen > 1))
-               return true;
-
-       return !(readw(uap->port.membase + UART01x_FR) & UART01x_FR_TXFF);
+       return true;
  }
  
  static bool pl011_tx_chars(struct uart_amba_port *uap)
@@ -1296,7 +1295,8 @@ static bool pl011_tx_chars(struct uart_amba_port *uap)
                 return false;
  
         if (uap->port.x_char) {
-               pl011_tx_char(uap, uap->port.x_char);
+               if (!pl011_tx_char(uap, uap->port.x_char))
+                       goto done;
                 uap->port.x_char = 0;
                 --count;
         }
diff --git a/drivers/tty/serial/imx.c b/drivers/tty/serial/imx.c

index c8cfa06371280af6abfd63bd379ee5c121523ad7..88250395b0ce96486a2dac5e2e9162fb7f4eae43 100644 (file)
--- a/drivers/tty/serial/imx.c
+++ b/drivers/tty/serial/imx.c
@@ -911,6 +911,14 @@ static void dma_rx_callback(void *data)
  
         status = dmaengine_tx_status(chan, (dma_cookie_t)0, &state);
         count = RX_BUF_SIZE - state.residue;
+
+       if (readl(sport->port.membase + USR2) & USR2_IDLE) {
+               /* In condition [3] the SDMA counted up too early */
+               count--;
+
+               writel(USR2_IDLE, sport->port.membase + USR2);
+       }
+
         dev_dbg(sport->port.dev, "We get %d bytes.\n", count);
  
         if (count) {
diff --git a/drivers/usb/dwc3/core.h b/drivers/usb/dwc3/core.h

index fdab715a063119d6e696a8f66ea26d4a1613e983..c0eafa6fd40314086474f5b7cab8f63361c73d64 100644 (file)
--- a/drivers/usb/dwc3/core.h
+++ b/drivers/usb/dwc3/core.h
@@ -339,7 +339,7 @@
  #define DWC3_DGCMD_SET_ENDPOINT_NRDY   0x0c
  #define DWC3_DGCMD_RUN_SOC_BUS_LOOPBACK        0x10
  
-#define DWC3_DGCMD_STATUS(n)           (((n) >> 15) & 1)
+#define DWC3_DGCMD_STATUS(n)           (((n) >> 12) & 0x0F)
  #define DWC3_DGCMD_CMDACT              (1 << 10)
  #define DWC3_DGCMD_CMDIOC              (1 << 8)
  
@@ -355,7 +355,7 @@
  #define DWC3_DEPCMD_PARAM_SHIFT                16
  #define DWC3_DEPCMD_PARAM(x)           ((x) << DWC3_DEPCMD_PARAM_SHIFT)
  #define DWC3_DEPCMD_GET_RSC_IDX(x)     (((x) >> DWC3_DEPCMD_PARAM_SHIFT) & 0x7f)
-#define DWC3_DEPCMD_STATUS(x)          (((x) >> 15) & 1)
+#define DWC3_DEPCMD_STATUS(x)          (((x) >> 12) & 0x0F)
  #define DWC3_DEPCMD_HIPRI_FORCERM      (1 << 11)
  #define DWC3_DEPCMD_CMDACT             (1 << 10)
  #define DWC3_DEPCMD_CMDIOC             (1 << 8)
diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c

index 6bdb5706904497ca9eccb7fd5d979c67824d8600..3507f880eb74294c76ddbc43c3aa153528478f53 100644 (file)
--- a/drivers/usb/gadget/function/f_fs.c
+++ b/drivers/usb/gadget/function/f_fs.c
@@ -315,7 +315,6 @@ static ssize_t ffs_ep0_write(struct file *file, const char __user *buf,
                                 return ret;
                         }
  
-                       set_bit(FFS_FL_CALL_CLOSED_CALLBACK, &ffs->flags);
                         return len;
                 }
                 break;
@@ -847,7 +846,7 @@ static ssize_t ffs_epfile_io(struct file *file, struct ffs_io_data *io_data)
                                 ret = ep->status;
                                 if (io_data->read && ret > 0) {
                                         ret = copy_to_iter(data, ret, &io_data->data);
-                                       if (unlikely(iov_iter_count(&io_data->data)))
+                                       if (!ret)
                                                 ret = -EFAULT;
                                 }
                         }
@@ -1463,8 +1462,7 @@ static void ffs_data_clear(struct ffs_data *ffs)
  {
         ENTER();
  
-       if (test_and_clear_bit(FFS_FL_CALL_CLOSED_CALLBACK, &ffs->flags))
-               ffs_closed(ffs);
+       ffs_closed(ffs);
  
         BUG_ON(ffs->gadget);
  
@@ -3422,9 +3420,13 @@ static int ffs_ready(struct ffs_data *ffs)
         ffs_obj->desc_ready = true;
         ffs_obj->ffs_data = ffs;
  
-       if (ffs_obj->ffs_ready_callback)
+       if (ffs_obj->ffs_ready_callback) {
                 ret = ffs_obj->ffs_ready_callback(ffs);
+               if (ret)
+                       goto done;
+       }
  
+       set_bit(FFS_FL_CALL_CLOSED_CALLBACK, &ffs->flags);
  done:
         ffs_dev_unlock();
         return ret;
@@ -3443,7 +3445,8 @@ static void ffs_closed(struct ffs_data *ffs)
  
         ffs_obj->desc_ready = false;
  
-       if (ffs_obj->ffs_closed_callback)
+       if (test_and_clear_bit(FFS_FL_CALL_CLOSED_CALLBACK, &ffs->flags) &&
+           ffs_obj->ffs_closed_callback)
                 ffs_obj->ffs_closed_callback(ffs);
  
         if (!ffs_obj->opts || ffs_obj->opts->no_configfs
diff --git a/drivers/usb/gadget/function/f_midi.c b/drivers/usb/gadget/function/f_midi.c

index 259b656c0b3ec7bde9e119488f46ded351bb7300..6316aa5b1c4947a6df2e08b4c45856dc77b94374 100644 (file)
--- a/drivers/usb/gadget/function/f_midi.c
+++ b/drivers/usb/gadget/function/f_midi.c
@@ -973,7 +973,13 @@ static ssize_t f_midi_opts_id_show(struct f_midi_opts *opts, char *page)
         int result;
  
         mutex_lock(&opts->lock);
-       result = strlcpy(page, opts->id, PAGE_SIZE);
+       if (opts->id) {
+               result = strlcpy(page, opts->id, PAGE_SIZE);
+       } else {
+               page[0] = 0;
+               result = 0;
+       }
+
         mutex_unlock(&opts->lock);
  
         return result;
diff --git a/drivers/usb/gadget/function/f_uac1.c b/drivers/usb/gadget/function/f_uac1.c

index 9719abfb61455ca91ec5d1721e53622d4b76f1ef..7856b3394494b7d4250637277dd1f42f45d7a1ea 100644 (file)
--- a/drivers/usb/gadget/function/f_uac1.c
+++ b/drivers/usb/gadget/function/f_uac1.c
@@ -588,7 +588,10 @@ static int f_audio_set_alt(struct usb_function *f, unsigned intf, unsigned alt)
  
         if (intf == 1) {
                 if (alt == 1) {
-                       config_ep_by_speed(cdev->gadget, f, out_ep);
+                       err = config_ep_by_speed(cdev->gadget, f, out_ep);
+                       if (err)
+                               return err;
+
                         usb_ep_enable(out_ep);
                         out_ep->driver_data = audio;
                         audio->copy_buf = f_audio_buffer_alloc(audio_buf_size);
diff --git a/drivers/usb/gadget/legacy/g_ffs.c b/drivers/usb/gadget/legacy/g_ffs.c

index 7b9ef7e257d236dd442226203301a59bbd59ef47..e821931c965cd9203a8011358ffeb16844dc7eed 100644 (file)
--- a/drivers/usb/gadget/legacy/g_ffs.c
+++ b/drivers/usb/gadget/legacy/g_ffs.c
@@ -304,8 +304,10 @@ static int functionfs_ready_callback(struct ffs_data *ffs)
         gfs_registered = true;
  
         ret = usb_composite_probe(&gfs_driver);
-       if (unlikely(ret < 0))
+       if (unlikely(ret < 0)) {
+               ++missing_funcs;
                 gfs_registered = false;
+       }
         
         return ret;
  }
diff --git a/drivers/usb/gadget/udc/s3c2410_udc.c b/drivers/usb/gadget/udc/s3c2410_udc.c

index b808951491ccbfcdd949d8e78f7c1cc2b4c55f47..99fd9a5667dfd4997092d982c0beae28b578a17c 100644 (file)
--- a/drivers/usb/gadget/udc/s3c2410_udc.c
+++ b/drivers/usb/gadget/udc/s3c2410_udc.c
@@ -1487,7 +1487,7 @@ static int s3c2410_udc_pullup(struct usb_gadget *gadget, int is_on)
  
         dprintk(DEBUG_NORMAL, "%s()\n", __func__);
  
-       s3c2410_udc_set_pullup(udc, is_on ? 0 : 1);
+       s3c2410_udc_set_pullup(udc, is_on);
         return 0;
  }
  
diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c

index ec8ac16748547a2ac87bf9aa225ed0a36c0bf7df..36bf089b708fe5219258d46305719b7a999a23f6 100644 (file)
--- a/drivers/usb/host/xhci.c
+++ b/drivers/usb/host/xhci.c
@@ -3682,18 +3682,21 @@ int xhci_alloc_dev(struct usb_hcd *hcd, struct usb_device *udev)
  {
         struct xhci_hcd *xhci = hcd_to_xhci(hcd);
         unsigned long flags;
-       int ret;
+       int ret, slot_id;
         struct xhci_command *command;
  
         command = xhci_alloc_command(xhci, false, false, GFP_KERNEL);
         if (!command)
                 return 0;
  
+       /* xhci->slot_id and xhci->addr_dev are not thread-safe */
+       mutex_lock(&xhci->mutex);
         spin_lock_irqsave(&xhci->lock, flags);
         command->completion = &xhci->addr_dev;
         ret = xhci_queue_slot_control(xhci, command, TRB_ENABLE_SLOT, 0);
         if (ret) {
                 spin_unlock_irqrestore(&xhci->lock, flags);
+               mutex_unlock(&xhci->mutex);
                 xhci_dbg(xhci, "FIXME: allocate a command ring segment\n");
                 kfree(command);
                 return 0;
@@ -3702,8 +3705,10 @@ int xhci_alloc_dev(struct usb_hcd *hcd, struct usb_device *udev)
         spin_unlock_irqrestore(&xhci->lock, flags);
  
         wait_for_completion(command->completion);
+       slot_id = xhci->slot_id;
+       mutex_unlock(&xhci->mutex);
  
-       if (!xhci->slot_id || command->status != COMP_SUCCESS) {
+       if (!slot_id || command->status != COMP_SUCCESS) {
                 xhci_err(xhci, "Error while assigning device slot ID\n");
                 xhci_err(xhci, "Max number of devices this xHCI host supports is %u.\n",
                                 HCS_MAX_SLOTS(
@@ -3728,11 +3733,11 @@ int xhci_alloc_dev(struct usb_hcd *hcd, struct usb_device *udev)
          * xhci_discover_or_reset_device(), which may be called as part of
          * mass storage driver error handling.
          */
-       if (!xhci_alloc_virt_device(xhci, xhci->slot_id, udev, GFP_NOIO)) {
+       if (!xhci_alloc_virt_device(xhci, slot_id, udev, GFP_NOIO)) {
                 xhci_warn(xhci, "Could not allocate xHCI USB device data structures\n");
                 goto disable_slot;
         }
-       udev->slot_id = xhci->slot_id;
+       udev->slot_id = slot_id;
  
  #ifndef CONFIG_USB_DEFAULT_PERSIST
         /*
@@ -3778,12 +3783,15 @@ static int xhci_setup_device(struct usb_hcd *hcd, struct usb_device *udev,
         struct xhci_slot_ctx *slot_ctx;
         struct xhci_input_control_ctx *ctrl_ctx;
         u64 temp_64;
-       struct xhci_command *command;
+       struct xhci_command *command = NULL;
+
+       mutex_lock(&xhci->mutex);
  
         if (!udev->slot_id) {
                 xhci_dbg_trace(xhci, trace_xhci_dbg_address,
                                 "Bad Slot ID %d", udev->slot_id);
-               return -EINVAL;
+               ret = -EINVAL;
+               goto out;
         }
  
         virt_dev = xhci->devs[udev->slot_id];
@@ -3796,7 +3804,8 @@ static int xhci_setup_device(struct usb_hcd *hcd, struct usb_device *udev,
                  */
                 xhci_warn(xhci, "Virt dev invalid for slot_id 0x%x!\n",
                         udev->slot_id);
-               return -EINVAL;
+               ret = -EINVAL;
+               goto out;
         }
  
         if (setup == SETUP_CONTEXT_ONLY) {
@@ -3804,13 +3813,15 @@ static int xhci_setup_device(struct usb_hcd *hcd, struct usb_device *udev,
                 if (GET_SLOT_STATE(le32_to_cpu(slot_ctx->dev_state)) ==
                     SLOT_STATE_DEFAULT) {
                         xhci_dbg(xhci, "Slot already in default state\n");
-                       return 0;
+                       goto out;
                 }
         }
  
         command = xhci_alloc_command(xhci, false, false, GFP_KERNEL);
-       if (!command)
-               return -ENOMEM;
+       if (!command) {
+               ret = -ENOMEM;
+               goto out;
+       }
  
         command->in_ctx = virt_dev->in_ctx;
         command->completion = &xhci->addr_dev;
@@ -3820,8 +3831,8 @@ static int xhci_setup_device(struct usb_hcd *hcd, struct usb_device *udev,
         if (!ctrl_ctx) {
                 xhci_warn(xhci, "%s: Could not get input context, bad type.\n",
                                 __func__);
-               kfree(command);
-               return -EINVAL;
+               ret = -EINVAL;
+               goto out;
         }
         /*
          * If this is the first Set Address since device plug-in or
@@ -3848,8 +3859,7 @@ static int xhci_setup_device(struct usb_hcd *hcd, struct usb_device *udev,
                 spin_unlock_irqrestore(&xhci->lock, flags);
                 xhci_dbg_trace(xhci, trace_xhci_dbg_address,
                                 "FIXME: allocate a command ring segment");
-               kfree(command);
-               return ret;
+               goto out;
         }
         xhci_ring_cmd_db(xhci);
         spin_unlock_irqrestore(&xhci->lock, flags);
@@ -3896,10 +3906,8 @@ static int xhci_setup_device(struct usb_hcd *hcd, struct usb_device *udev,
                 ret = -EINVAL;
                 break;
         }
-       if (ret) {
-               kfree(command);
-               return ret;
-       }
+       if (ret)
+               goto out;
         temp_64 = xhci_read_64(xhci, &xhci->op_regs->dcbaa_ptr);
         xhci_dbg_trace(xhci, trace_xhci_dbg_address,
                         "Op regs DCBAA ptr = %#016llx", temp_64);
@@ -3932,8 +3940,10 @@ static int xhci_setup_device(struct usb_hcd *hcd, struct usb_device *udev,
         xhci_dbg_trace(xhci, trace_xhci_dbg_address,
                        "Internal device address = %d",
                        le32_to_cpu(slot_ctx->dev_state) & DEV_ADDR_MASK);
+out:
+       mutex_unlock(&xhci->mutex);
         kfree(command);
-       return 0;
+       return ret;
  }
  
  int xhci_address_device(struct usb_hcd *hcd, struct usb_device *udev)
@@ -4855,6 +4865,7 @@ int xhci_gen_setup(struct usb_hcd *hcd, xhci_get_quirks_t get_quirks)
                 return 0;
         }
  
+       mutex_init(&xhci->mutex);
         xhci->cap_regs = hcd->regs;
         xhci->op_regs = hcd->regs +
                 HC_LENGTH(readl(&xhci->cap_regs->hc_capbase));
@@ -5011,4 +5022,12 @@ static int __init xhci_hcd_init(void)
         BUILD_BUG_ON(sizeof(struct xhci_run_regs) != (8+8*128)*32/8);
         return 0;
  }
+
+/*
+ * If an init function is provided, an exit function must also be provided
+ * to allow module unload.
+ */
+static void __exit xhci_hcd_fini(void) { }
+
  module_init(xhci_hcd_init);
+module_exit(xhci_hcd_fini);
diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h

index ea75e8ccd3c11d397dc7a6a2ff45e78ae829fd81..6977f8491fa7ced6ea317bf75354a0eb7703670e 100644 (file)
--- a/drivers/usb/host/xhci.h
+++ b/drivers/usb/host/xhci.h
@@ -1497,6 +1497,8 @@ struct xhci_hcd {
         struct list_head        lpm_failed_devs;
  
         /* slot enabling and address device helpers */
+       /* these are not thread safe so use mutex */
+       struct mutex mutex;
         struct completion       addr_dev;
         int slot_id;
         /* For USB 3.0 LPM enable/disable. */
diff --git a/drivers/usb/musb/musb_core.c b/drivers/usb/musb/musb_core.c

index 3789b08ef67b037781e278c41c0d4b2f2d33e5d9..6dca3d794ced6e1948dd5cbb180e708893f7ba83 100644 (file)
--- a/drivers/usb/musb/musb_core.c
+++ b/drivers/usb/musb/musb_core.c
@@ -2021,13 +2021,7 @@ musb_init_controller(struct device *dev, int nIrq, void __iomem *ctrl)
         if (musb->ops->quirks)
                 musb->io.quirks = musb->ops->quirks;
  
-       /* At least tusb6010 has it's own offsets.. */
-       if (musb->ops->ep_offset)
-               musb->io.ep_offset = musb->ops->ep_offset;
-       if (musb->ops->ep_select)
-               musb->io.ep_select = musb->ops->ep_select;
-
-       /* ..and some devices use indexed offset or flat offset */
+       /* Most devices use indexed offset or flat offset */
         if (musb->io.quirks & MUSB_INDEXED_EP) {
                 musb->io.ep_offset = musb_indexed_ep_offset;
                 musb->io.ep_select = musb_indexed_ep_select;
@@ -2036,6 +2030,12 @@ musb_init_controller(struct device *dev, int nIrq, void __iomem *ctrl)
                 musb->io.ep_select = musb_flat_ep_select;
         }
  
+       /* At least tusb6010 has its own offsets */
+       if (musb->ops->ep_offset)
+               musb->io.ep_offset = musb->ops->ep_offset;
+       if (musb->ops->ep_select)
+               musb->io.ep_select = musb->ops->ep_select;
+
         if (musb->ops->fifo_mode)
                 fifo_mode = musb->ops->fifo_mode;
         else
diff --git a/drivers/usb/phy/phy-ab8500-usb.c b/drivers/usb/phy/phy-ab8500-usb.c

index 7225d526df0446ff26fd69ef65268265737d8c66..03ab0c699f74dd1768f2b769ca823eb7904132ab 100644 (file)
--- a/drivers/usb/phy/phy-ab8500-usb.c
+++ b/drivers/usb/phy/phy-ab8500-usb.c
@@ -1179,7 +1179,7 @@ static int ab8500_usb_irq_setup(struct platform_device *pdev,
                 }
                 err = devm_request_threaded_irq(&pdev->dev, irq, NULL,
                                 ab8500_usb_link_status_irq,
-                               IRQF_NO_SUSPEND | IRQF_SHARED,
+                               IRQF_NO_SUSPEND | IRQF_SHARED | IRQF_ONESHOT,
                                 "usb-link-status", ab);
                 if (err < 0) {
                         dev_err(ab->dev, "request_irq failed for link status irq\n");
@@ -1195,7 +1195,7 @@ static int ab8500_usb_irq_setup(struct platform_device *pdev,
                 }
                 err = devm_request_threaded_irq(&pdev->dev, irq, NULL,
                                 ab8500_usb_disconnect_irq,
-                               IRQF_NO_SUSPEND | IRQF_SHARED,
+                               IRQF_NO_SUSPEND | IRQF_SHARED | IRQF_ONESHOT,
                                 "usb-id-fall", ab);
                 if (err < 0) {
                         dev_err(ab->dev, "request_irq failed for ID fall irq\n");
@@ -1211,7 +1211,7 @@ static int ab8500_usb_irq_setup(struct platform_device *pdev,
                 }
                 err = devm_request_threaded_irq(&pdev->dev, irq, NULL,
                                 ab8500_usb_disconnect_irq,
-                               IRQF_NO_SUSPEND | IRQF_SHARED,
+                               IRQF_NO_SUSPEND | IRQF_SHARED | IRQF_ONESHOT,
                                 "usb-vbus-fall", ab);
                 if (err < 0) {
                         dev_err(ab->dev, "request_irq failed for Vbus fall irq\n");
diff --git a/drivers/usb/phy/phy-tahvo.c b/drivers/usb/phy/phy-tahvo.c

index 845f658276b106342907c7606a078dbfa47d06d1..2b28443d07b92daed26660f1d80f0bd390937992 100644 (file)
--- a/drivers/usb/phy/phy-tahvo.c
+++ b/drivers/usb/phy/phy-tahvo.c
@@ -401,7 +401,8 @@ static int tahvo_usb_probe(struct platform_device *pdev)
         dev_set_drvdata(&pdev->dev, tu);
  
         tu->irq = platform_get_irq(pdev, 0);
-       ret = request_threaded_irq(tu->irq, NULL, tahvo_usb_vbus_interrupt, 0,
+       ret = request_threaded_irq(tu->irq, NULL, tahvo_usb_vbus_interrupt,
+                                  IRQF_ONESHOT,
                                    "tahvo-vbus", tu);
         if (ret) {
                 dev_err(&pdev->dev, "could not register tahvo-vbus irq: %d\n",
diff --git a/drivers/usb/renesas_usbhs/fifo.c b/drivers/usb/renesas_usbhs/fifo.c

index 8597cf9cfceb7715883738ac8cf1c0380e9a00b1..c0f5c652d272c8959f5b3d59461e1af139d6f7fd 100644 (file)
--- a/drivers/usb/renesas_usbhs/fifo.c
+++ b/drivers/usb/renesas_usbhs/fifo.c
@@ -611,6 +611,8 @@ struct usbhs_pkt_handle usbhs_fifo_pio_push_handler = {
  static int usbhsf_prepare_pop(struct usbhs_pkt *pkt, int *is_done)
  {
         struct usbhs_pipe *pipe = pkt->pipe;
+       struct usbhs_priv *priv = usbhs_pipe_to_priv(pipe);
+       struct usbhs_fifo *fifo = usbhsf_get_cfifo(priv);
  
         if (usbhs_pipe_is_busy(pipe))
                 return 0;
@@ -624,6 +626,9 @@ static int usbhsf_prepare_pop(struct usbhs_pkt *pkt, int *is_done)
         usbhs_pipe_data_sequence(pipe, pkt->sequence);
         pkt->sequence = -1; /* -1 sequence will be ignored */
  
+       if (usbhs_pipe_is_dcp(pipe))
+               usbhsf_fifo_clear(pipe, fifo);
+
         usbhs_pipe_set_trans_count_if_bulk(pipe, pkt->length);
         usbhs_pipe_enable(pipe);
         usbhs_pipe_running(pipe, 1);
@@ -673,7 +678,14 @@ static int usbhsf_pio_try_pop(struct usbhs_pkt *pkt, int *is_done)
                 *is_done = 1;
                 usbhsf_rx_irq_ctrl(pipe, 0);
                 usbhs_pipe_running(pipe, 0);
-               usbhs_pipe_disable(pipe);       /* disable pipe first */
+               /*
+                * In function mode the controller may enter the Control Write
+                * status stage at this point, so the driver must not disable
+                * the pipe. If the pipe were disabled here, the controller
+                * could not complete the status stage.
+                */
+               if (!usbhs_mod_is_host(priv) && !usbhs_pipe_is_dcp(pipe))
+                       usbhs_pipe_disable(pipe);       /* disable pipe first */
         }
  
         /*
@@ -1227,15 +1239,21 @@ static void usbhsf_dma_init_dt(struct device *dev, struct usbhs_fifo *fifo,
  {
         char name[16];
  
-       snprintf(name, sizeof(name), "tx%d", channel);
-       fifo->tx_chan = dma_request_slave_channel_reason(dev, name);
-       if (IS_ERR(fifo->tx_chan))
-               fifo->tx_chan = NULL;
-
-       snprintf(name, sizeof(name), "rx%d", channel);
-       fifo->rx_chan = dma_request_slave_channel_reason(dev, name);
-       if (IS_ERR(fifo->rx_chan))
-               fifo->rx_chan = NULL;
+       /*
+        * To avoid complex handling for DnFIFOs, the driver uses each
+        * DnFIFO as TX or RX direction (not bi-direction).
+        * So, the driver uses odd channels for TX, even channels for RX.
+        */
+       snprintf(name, sizeof(name), "ch%d", channel);
+       if (channel & 1) {
+               fifo->tx_chan = dma_request_slave_channel_reason(dev, name);
+               if (IS_ERR(fifo->tx_chan))
+                       fifo->tx_chan = NULL;
+       } else {
+               fifo->rx_chan = dma_request_slave_channel_reason(dev, name);
+               if (IS_ERR(fifo->rx_chan))
+                       fifo->rx_chan = NULL;
+       }
  }
  
  static void usbhsf_dma_init(struct usbhs_priv *priv, struct usbhs_fifo *fifo,
diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c

index 9031750e7404a566d3c08c30e0366c424dcd0b06..ffd739e31bfc193b058628560e86ea6f9b96f375 100644 (file)
--- a/drivers/usb/serial/cp210x.c
+++ b/drivers/usb/serial/cp210x.c
@@ -128,6 +128,7 @@ static const struct usb_device_id id_table[] = {
         { USB_DEVICE(0x10C4, 0x8946) }, /* Ketra N1 Wireless Interface */
         { USB_DEVICE(0x10C4, 0x8977) }, /* CEL MeshWorks DevKit Device */
         { USB_DEVICE(0x10C4, 0x8998) }, /* KCF Technologies PRN */
+       { USB_DEVICE(0x10C4, 0x8A2A) }, /* HubZ dual ZigBee and Z-Wave dongle */
         { USB_DEVICE(0x10C4, 0xEA60) }, /* Silicon Labs factory default */
         { USB_DEVICE(0x10C4, 0xEA61) }, /* Silicon Labs factory default */
         { USB_DEVICE(0x10C4, 0xEA70) }, /* Silicon Labs factory default */
diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c

index 8eb68a31cab6c4021617ca555cd58b086872c112..4c8b3b82103d6318ea1d46250ad708bb3f722260 100644 (file)
--- a/drivers/usb/serial/ftdi_sio.c
+++ b/drivers/usb/serial/ftdi_sio.c
@@ -699,6 +699,7 @@ static const struct usb_device_id id_table_combined[] = {
         { USB_DEVICE(XSENS_VID, XSENS_AWINDA_DONGLE_PID) },
         { USB_DEVICE(XSENS_VID, XSENS_AWINDA_STATION_PID) },
         { USB_DEVICE(XSENS_VID, XSENS_CONVERTER_PID) },
+       { USB_DEVICE(XSENS_VID, XSENS_MTDEVBOARD_PID) },
         { USB_DEVICE(XSENS_VID, XSENS_MTW_PID) },
         { USB_DEVICE(FTDI_VID, FTDI_OMNI1509) },
         { USB_DEVICE(MOBILITY_VID, MOBILITY_USB_SERIAL_PID) },
diff --git a/drivers/usb/serial/ftdi_sio_ids.h b/drivers/usb/serial/ftdi_sio_ids.h

index 4e4f46f3c89c025670d42860756f39b2bb62ae24..792e054126de51402711814f5962945f7742e188 100644 (file)
--- a/drivers/usb/serial/ftdi_sio_ids.h
+++ b/drivers/usb/serial/ftdi_sio_ids.h
@@ -155,6 +155,7 @@
  #define XSENS_AWINDA_STATION_PID 0x0101
  #define XSENS_AWINDA_DONGLE_PID 0x0102
  #define XSENS_MTW_PID          0x0200  /* Xsens MTw */
+#define XSENS_MTDEVBOARD_PID   0x0300  /* Motion Tracker Development Board */
  #define XSENS_CONVERTER_PID    0xD00D  /* Xsens USB-serial converter */
  
  /* Xsens devices using FTDI VID */
diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c

index e894eb278d8336d018d3e6e8c29556dc9b5f3cb5..eba1b7ac729454d30b1d611cd01d45b5ba23407e 100644 (file)
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -423,6 +423,7 @@ int vp_set_vq_affinity(struct virtqueue *vq, int cpu)
                 if (cpu == -1)
                         irq_set_affinity_hint(irq, NULL);
                 else {
+                       cpumask_clear(mask);
                         cpumask_set_cpu(cpu, mask);
                         irq_set_affinity_hint(irq, mask);
                 }
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h

index fb9ffcb432779b55620909090b825ead7ddbcede..0923f2cf3c80aa2fb95a7385276de6f497ea46fe 100644 (file)
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -149,8 +149,6 @@ extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d);
  extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d);
  extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                         struct inode *new_dir, struct dentry *new_dentry);
-extern void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd,
-                       void *p);
  extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses,
                                          struct p9_fid *fid,
                                          struct super_block *sb, int new);
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c

index 703342e309f57af329085db6d8f0b1b4814793dd..510040b04c964dbdeab4ab47c29b789fef5c4b5f 100644 (file)
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1224,100 +1224,43 @@ ino_t v9fs_qid2ino(struct p9_qid *qid)
  }
  
  /**
- * v9fs_readlink - read a symlink's location (internal version)
+ * v9fs_vfs_follow_link - follow a symlink path
   * @dentry: dentry for symlink
- * @buffer: buffer to load symlink location into
- * @buflen: length of buffer
- *
+ * @cookie: place to pass the data to put_link()
   */
  
-static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
+static const char *v9fs_vfs_follow_link(struct dentry *dentry, void **cookie)
  {
-       int retval;
-
-       struct v9fs_session_info *v9ses;
-       struct p9_fid *fid;
+       struct v9fs_session_info *v9ses = v9fs_dentry2v9ses(dentry);
+       struct p9_fid *fid = v9fs_fid_lookup(dentry);
         struct p9_wstat *st;
+       char *res;
+
+       p9_debug(P9_DEBUG_VFS, "%pd\n", dentry);
  
-       p9_debug(P9_DEBUG_VFS, " %pd\n", dentry);
-       retval = -EPERM;
-       v9ses = v9fs_dentry2v9ses(dentry);
-       fid = v9fs_fid_lookup(dentry);
         if (IS_ERR(fid))
-               return PTR_ERR(fid);
+               return ERR_CAST(fid);
  
         if (!v9fs_proto_dotu(v9ses))
-               return -EBADF;
+               return ERR_PTR(-EBADF);
  
         st = p9_client_stat(fid);
         if (IS_ERR(st))
-               return PTR_ERR(st);
+               return ERR_CAST(st);
  
         if (!(st->mode & P9_DMSYMLINK)) {
-               retval = -EINVAL;
-               goto done;
+               p9stat_free(st);
+               kfree(st);
+               return ERR_PTR(-EINVAL);
         }
+       res = st->extension;
+       st->extension = NULL;
+       if (strlen(res) >= PATH_MAX)
+               res[PATH_MAX - 1] = '\0';
  
-       /* copy extension buffer into buffer */
-       retval = min(strlen(st->extension)+1, (size_t)buflen);
-       memcpy(buffer, st->extension, retval);
-
-       p9_debug(P9_DEBUG_VFS, "%pd -> %s (%.*s)\n",
-                dentry, st->extension, buflen, buffer);
-
-done:
         p9stat_free(st);
         kfree(st);
-       return retval;
-}
-
-/**
- * v9fs_vfs_follow_link - follow a symlink path
- * @dentry: dentry for symlink
- * @nd: nameidata
- *
- */
-
-static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-       int len = 0;
-       char *link = __getname();
-
-       p9_debug(P9_DEBUG_VFS, "%pd\n", dentry);
-
-       if (!link)
-               link = ERR_PTR(-ENOMEM);
-       else {
-               len = v9fs_readlink(dentry, link, PATH_MAX);
-
-               if (len < 0) {
-                       __putname(link);
-                       link = ERR_PTR(len);
-               } else
-                       link[min(len, PATH_MAX-1)] = 0;
-       }
-       nd_set_link(nd, link);
-
-       return NULL;
-}
-
-/**
- * v9fs_vfs_put_link - release a symlink path
- * @dentry: dentry for symlink
- * @nd: nameidata
- * @p: unused
- *
- */
-
-void
-v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
-{
-       char *s = nd_get_link(nd);
-
-       p9_debug(P9_DEBUG_VFS, " %pd %s\n",
-                dentry, IS_ERR(s) ? "<error>" : s);
-       if (!IS_ERR(s))
-               __putname(s);
+       return *cookie = res;
  }
  
  /**
@@ -1370,6 +1313,8 @@ v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
         return v9fs_vfs_mkspecial(dir, dentry, P9_DMSYMLINK, symname);
  }
  
+#define U32_MAX_DIGITS 10
+
  /**
   * v9fs_vfs_link - create a hardlink
   * @old_dentry: dentry for file to link to
@@ -1383,7 +1328,7 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
               struct dentry *dentry)
  {
         int retval;
-       char *name;
+       char name[1 + U32_MAX_DIGITS + 2]; /* sign + number + \n + \0 */
         struct p9_fid *oldfid;
  
         p9_debug(P9_DEBUG_VFS, " %lu,%pd,%pd\n",
@@ -1393,20 +1338,12 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
         if (IS_ERR(oldfid))
                 return PTR_ERR(oldfid);
  
-       name = __getname();
-       if (unlikely(!name)) {
-               retval = -ENOMEM;
-               goto clunk_fid;
-       }
-
         sprintf(name, "%d\n", oldfid->fid);
         retval = v9fs_vfs_mkspecial(dir, dentry, P9_DMLINK, name);
-       __putname(name);
         if (!retval) {
                 v9fs_refresh_inode(oldfid, d_inode(old_dentry));
                 v9fs_invalidate_inode_attr(dir);
         }
-clunk_fid:
         p9_client_clunk(oldfid);
         return retval;
  }
@@ -1425,7 +1362,7 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rde
  {
         struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir);
         int retval;
-       char *name;
+       char name[2 + U32_MAX_DIGITS + 1 + U32_MAX_DIGITS + 1];
         u32 perm;
  
         p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %hx MAJOR: %u MINOR: %u\n",
@@ -1435,26 +1372,16 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rde
         if (!new_valid_dev(rdev))
                 return -EINVAL;
  
-       name = __getname();
-       if (!name)
-               return -ENOMEM;
         /* build extension */
         if (S_ISBLK(mode))
                 sprintf(name, "b %u %u", MAJOR(rdev), MINOR(rdev));
         else if (S_ISCHR(mode))
                 sprintf(name, "c %u %u", MAJOR(rdev), MINOR(rdev));
-       else if (S_ISFIFO(mode))
-               *name = 0;
-       else if (S_ISSOCK(mode))
+       else
                 *name = 0;
-       else {
-               __putname(name);
-               return -EINVAL;
-       }
  
         perm = unixmode2p9mode(v9ses, mode);
         retval = v9fs_vfs_mkspecial(dir, dentry, perm, name);
-       __putname(name);
  
         return retval;
  }
@@ -1530,7 +1457,7 @@ static const struct inode_operations v9fs_file_inode_operations = {
  static const struct inode_operations v9fs_symlink_inode_operations = {
         .readlink = generic_readlink,
         .follow_link = v9fs_vfs_follow_link,
-       .put_link = v9fs_vfs_put_link,
+       .put_link = kfree_put_link,
         .getattr = v9fs_vfs_getattr,
         .setattr = v9fs_vfs_setattr,
  };
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c

index 9861c7c951a6dbd293e78d84e190f0908b4a1921..09e4433717b8795c2c7c8c76452887ec27a91be4 100644 (file)
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -905,41 +905,24 @@ error:
  /**
   * v9fs_vfs_follow_link_dotl - follow a symlink path
   * @dentry: dentry for symlink
- * @nd: nameidata
- *
+ * @cookie: place to pass the data to put_link()
   */
  
-static void *
-v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd)
+static const char *
+v9fs_vfs_follow_link_dotl(struct dentry *dentry, void **cookie)
  {
-       int retval;
-       struct p9_fid *fid;
-       char *link = __getname();
+       struct p9_fid *fid = v9fs_fid_lookup(dentry);
         char *target;
+       int retval;
  
         p9_debug(P9_DEBUG_VFS, "%pd\n", dentry);
  
-       if (!link) {
-               link = ERR_PTR(-ENOMEM);
-               goto ndset;
-       }
-       fid = v9fs_fid_lookup(dentry);
-       if (IS_ERR(fid)) {
-               __putname(link);
-               link = ERR_CAST(fid);
-               goto ndset;
-       }
+       if (IS_ERR(fid))
+               return ERR_CAST(fid);
         retval = p9_client_readlink(fid, &target);
-       if (!retval) {
-               strcpy(link, target);
-               kfree(target);
-               goto ndset;
-       }
-       __putname(link);
-       link = ERR_PTR(retval);
-ndset:
-       nd_set_link(nd, link);
-       return NULL;
+       if (retval)
+               return ERR_PTR(retval);
+       return *cookie = target;
  }
  
  int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
@@ -1006,7 +989,7 @@ const struct inode_operations v9fs_file_inode_operations_dotl = {
  const struct inode_operations v9fs_symlink_inode_operations_dotl = {
         .readlink = generic_readlink,
         .follow_link = v9fs_vfs_follow_link_dotl,
-       .put_link = v9fs_vfs_put_link,
+       .put_link = kfree_put_link,
         .getattr = v9fs_vfs_getattr_dotl,
         .setattr = v9fs_vfs_setattr_dotl,
         .setxattr = generic_setxattr,
diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c

index de58cc7b8076178605cea8f776031945d7d6da62..da0c33481bc0387788bcf4ce1792b38e141804e4 100644 (file)
--- a/fs/autofs4/symlink.c
+++ b/fs/autofs4/symlink.c
@@ -12,14 +12,13 @@
  
  #include "autofs_i.h"
  
-static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *autofs4_follow_link(struct dentry *dentry, void **cookie)
  {
         struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
         struct autofs_info *ino = autofs4_dentry_ino(dentry);
         if (ino && !autofs4_oz_mode(sbi))
                 ino->last_used = jiffies;
-       nd_set_link(nd, d_inode(dentry)->i_private);
-       return NULL;
+       return d_inode(dentry)->i_private;
  }
  
  const struct inode_operations autofs4_symlink_inode_operations = {
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c

index 7943533c386802dc880051cc7ac2b078cf5d3ccb..46aedacfa6a8d4131563a83a8402daf89d590ddb 100644 (file)
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -42,8 +42,7 @@ static struct inode *befs_iget(struct super_block *, unsigned long);
  static struct inode *befs_alloc_inode(struct super_block *sb);
  static void befs_destroy_inode(struct inode *inode);
  static void befs_destroy_inodecache(void);
-static void *befs_follow_link(struct dentry *, struct nameidata *);
-static void *befs_fast_follow_link(struct dentry *, struct nameidata *);
+static const char *befs_follow_link(struct dentry *, void **);
  static int befs_utf2nls(struct super_block *sb, const char *in, int in_len,
                         char **out, int *out_len);
  static int befs_nls2utf(struct super_block *sb, const char *in, int in_len,
@@ -80,11 +79,6 @@ static const struct address_space_operations befs_aops = {
         .bmap           = befs_bmap,
  };
  
-static const struct inode_operations befs_fast_symlink_inode_operations = {
-       .readlink       = generic_readlink,
-       .follow_link    = befs_fast_follow_link,
-};
-
  static const struct inode_operations befs_symlink_inode_operations = {
         .readlink       = generic_readlink,
         .follow_link    = befs_follow_link,
@@ -403,10 +397,12 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
                 inode->i_op = &befs_dir_inode_operations;
                 inode->i_fop = &befs_dir_operations;
         } else if (S_ISLNK(inode->i_mode)) {
-               if (befs_ino->i_flags & BEFS_LONG_SYMLINK)
+               if (befs_ino->i_flags & BEFS_LONG_SYMLINK) {
                         inode->i_op = &befs_symlink_inode_operations;
-               else
-                       inode->i_op = &befs_fast_symlink_inode_operations;
+               } else {
+                       inode->i_link = befs_ino->i_data.symlink;
+                       inode->i_op = &simple_symlink_inode_operations;
+               }
         } else {
                 befs_error(sb, "Inode %lu is not a regular file, "
                            "directory or symlink. THAT IS WRONG! BeFS has no "
@@ -467,8 +463,8 @@ befs_destroy_inodecache(void)
   * The data stream become link name. Unless the LONG_SYMLINK
   * flag is set.
   */
-static void *
-befs_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *
+befs_follow_link(struct dentry *dentry, void **cookie)
  {
         struct super_block *sb = dentry->d_sb;
         struct befs_inode_info *befs_ino = BEFS_I(d_inode(dentry));
@@ -478,33 +474,20 @@ befs_follow_link(struct dentry *dentry, struct nameidata *nd)
  
         if (len == 0) {
                 befs_error(sb, "Long symlink with illegal length");
-               link = ERR_PTR(-EIO);
-       } else {
-               befs_debug(sb, "Follow long symlink");
-
-               link = kmalloc(len, GFP_NOFS);
-               if (!link) {
-                       link = ERR_PTR(-ENOMEM);
-               } else if (befs_read_lsymlink(sb, data, link, len) != len) {
-                       kfree(link);
-                       befs_error(sb, "Failed to read entire long symlink");
-                       link = ERR_PTR(-EIO);
-               } else {
-                       link[len - 1] = '\0';
-               }
+               return ERR_PTR(-EIO);
         }
-       nd_set_link(nd, link);
-       return NULL;
-}
-
-
-static void *
-befs_fast_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-       struct befs_inode_info *befs_ino = BEFS_I(d_inode(dentry));
+       befs_debug(sb, "Follow long symlink");
  
-       nd_set_link(nd, befs_ino->i_data.symlink);
-       return NULL;
+       link = kmalloc(len, GFP_NOFS);
+       if (!link)
+               return ERR_PTR(-ENOMEM);
+       if (befs_read_lsymlink(sb, data, link, len) != len) {
+               kfree(link);
+               befs_error(sb, "Failed to read entire long symlink");
+               return ERR_PTR(-EIO);
+       }
+       link[len - 1] = '\0';
+       return *cookie = link;
  }
  
  /*
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c

index e876e1944519a330a2cc1f44e33a031139a35438..571acd88606cfcec3d01fc4a6ef453f0b49e9713 100644 (file)
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -6,7 +6,6 @@
  #include <linux/string.h>
  #include <linux/uaccess.h>
  #include <linux/kernel.h>
-#include <linux/namei.h>
  #include <linux/writeback.h>
  #include <linux/vmalloc.h>
  #include <linux/posix_acl.h>
@@ -819,6 +818,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
                         else
                                 kfree(sym); /* lost a race */
                 }
+               inode->i_link = ci->i_symlink;
                 break;
         case S_IFDIR:
                 inode->i_op = &ceph_dir_iops;
@@ -1691,16 +1691,9 @@ retry:
  /*
   * symlinks
   */
-static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-       struct ceph_inode_info *ci = ceph_inode(d_inode(dentry));
-       nd_set_link(nd, ci->i_symlink);
-       return NULL;
-}
-
  static const struct inode_operations ceph_symlink_iops = {
         .readlink = generic_readlink,
-       .follow_link = ceph_sym_follow_link,
+       .follow_link = simple_follow_link,
         .setattr = ceph_setattr,
         .getattr = ceph_getattr,
         .setxattr = ceph_setxattr,
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h

index 252f5c15806bc2f18f5c1c10ff7c2bde0aedba6d..a782b22904e40b71387d844a6a7879bab8191a88 100644 (file)
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -120,7 +120,7 @@ extern struct vfsmount *cifs_dfs_d_automount(struct path *path);
  #endif
  
  /* Functions related to symlinks */
-extern void *cifs_follow_link(struct dentry *direntry, struct nameidata *nd);
+extern const char *cifs_follow_link(struct dentry *direntry, void **cookie);
  extern int cifs_readlink(struct dentry *direntry, char __user *buffer,
                          int buflen);
  extern int cifs_symlink(struct inode *inode, struct dentry *direntry,
diff --git a/fs/cifs/link.c b/fs/cifs/link.c

index e6c707cc62b39b445b4b374eeec51a5be4fe07f4..e3548f73bdeaa980ef1c282246688e1e3f5f21e8 100644 (file)
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -626,8 +626,8 @@ cifs_hl_exit:
         return rc;
  }
  
-void *
-cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
+const char *
+cifs_follow_link(struct dentry *direntry, void **cookie)
  {
         struct inode *inode = d_inode(direntry);
         int rc = -ENOMEM;
@@ -643,16 +643,18 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
  
         tlink = cifs_sb_tlink(cifs_sb);
         if (IS_ERR(tlink)) {
-               rc = PTR_ERR(tlink);
-               tlink = NULL;
-               goto out;
+               free_xid(xid);
+               return ERR_CAST(tlink);
         }
         tcon = tlink_tcon(tlink);
         server = tcon->ses->server;
  
         full_path = build_path_from_dentry(direntry);
-       if (!full_path)
-               goto out;
+       if (!full_path) {
+               free_xid(xid);
+               cifs_put_tlink(tlink);
+               return ERR_PTR(-ENOMEM);
+       }
  
         cifs_dbg(FYI, "Full path: %s inode = 0x%p\n", full_path, inode);
  
@@ -670,17 +672,13 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
                                                 &target_path, cifs_sb);
  
         kfree(full_path);
-out:
+       free_xid(xid);
+       cifs_put_tlink(tlink);
         if (rc != 0) {
                 kfree(target_path);
-               target_path = ERR_PTR(rc);
+               return ERR_PTR(rc);
         }
-
-       free_xid(xid);
-       if (tlink)
-               cifs_put_tlink(tlink);
-       nd_set_link(nd, target_path);
-       return NULL;
+       return *cookie = target_path;
  }
  
  int
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c

index cc9f2546ea4a041273654b6076d1fbc774d447e4..ec5c8325b503d1a1602863769ae43c067d13a047 100644 (file)
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -279,36 +279,27 @@ static int configfs_getlink(struct dentry *dentry, char * path)
  
  }
  
-static void *configfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *configfs_follow_link(struct dentry *dentry, void **cookie)
  {
-       int error = -ENOMEM;
         unsigned long page = get_zeroed_page(GFP_KERNEL);
+       int error;
  
-       if (page) {
-               error = configfs_getlink(dentry, (char *)page);
-               if (!error) {
-                       nd_set_link(nd, (char *)page);
-                       return (void *)page;
-               }
-       }
-
-       nd_set_link(nd, ERR_PTR(error));
-       return NULL;
-}
+       if (!page)
+               return ERR_PTR(-ENOMEM);
  
-static void configfs_put_link(struct dentry *dentry, struct nameidata *nd,
-                             void *cookie)
-{
-       if (cookie) {
-               unsigned long page = (unsigned long)cookie;
-               free_page(page);
+       error = configfs_getlink(dentry, (char *)page);
+       if (!error) {
+               return *cookie = (void *)page;
         }
+
+       free_page(page);
+       return ERR_PTR(error);
  }
  
  const struct inode_operations configfs_symlink_inode_operations = {
         .follow_link = configfs_follow_link,
         .readlink = generic_readlink,
-       .put_link = configfs_put_link,
+       .put_link = free_page_put_link,
         .setattr = configfs_setattr,
  };
  
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c

index 830a7e76f5c64067e46fad8fd368e9112ddae7a9..284f9aa0028b8dd46b9897ababc5e826c185c608 100644 (file)
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -17,7 +17,6 @@
  #include <linux/fs.h>
  #include <linux/seq_file.h>
  #include <linux/pagemap.h>
-#include <linux/namei.h>
  #include <linux/debugfs.h>
  #include <linux/io.h>
  #include <linux/slab.h>
@@ -43,17 +42,6 @@ const struct file_operations debugfs_file_operations = {
         .llseek =       noop_llseek,
  };
  
-static void *debugfs_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-       nd_set_link(nd, d_inode(dentry)->i_private);
-       return NULL;
-}
-
-const struct inode_operations debugfs_link_operations = {
-       .readlink       = generic_readlink,
-       .follow_link    = debugfs_follow_link,
-};
-
  static int debugfs_u8_set(void *data, u64 val)
  {
         *(u8 *)data = val;
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c

index c1e7ffb0dab658ecd21c449bf36467b14e0b75d6..7eaec88ea970d1a6ea8422465857deaefd2b7052 100644 (file)
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -174,7 +174,7 @@ static void debugfs_evict_inode(struct inode *inode)
         truncate_inode_pages_final(&inode->i_data);
         clear_inode(inode);
         if (S_ISLNK(inode->i_mode))
-               kfree(inode->i_private);
+               kfree(inode->i_link);
  }
  
  static const struct super_operations debugfs_super_operations = {
@@ -511,8 +511,8 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
                 return failed_creating(dentry);
         }
         inode->i_mode = S_IFLNK | S_IRWXUGO;
-       inode->i_op = &debugfs_link_operations;
-       inode->i_private = link;
+       inode->i_op = &simple_symlink_inode_operations;
+       inode->i_link = link;
         d_instantiate(dentry, inode);
         return end_creating(dentry);
  }
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c

index fc850b55db67a27a99663596e1e8c711c8d71237..3c4db1172d222840b8cf0fcd50a61437ebf5f4c7 100644 (file)
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -170,7 +170,6 @@ out_unlock:
   * @directory_inode: inode of the new file's dentry's parent in ecryptfs
   * @ecryptfs_dentry: New file's dentry in ecryptfs
   * @mode: The mode of the new file
- * @nd: nameidata of ecryptfs' parent's dentry & vfsmount
   *
   * Creates the underlying file and the eCryptfs inode which will link to
   * it. It will also update the eCryptfs directory inode to mimic the
@@ -384,7 +383,7 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry,
   * ecryptfs_lookup
   * @ecryptfs_dir_inode: The eCryptfs directory inode
   * @ecryptfs_dentry: The eCryptfs dentry that we are looking up
- * @ecryptfs_nd: nameidata; may be NULL
+ * @flags: lookup flags
   *
   * Find a file on disk. If the file does not exist, then we'll add it to the
   * dentry cache and continue on to read it from the disk.
@@ -675,18 +674,16 @@ out:
         return rc ? ERR_PTR(rc) : buf;
  }
  
-static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *ecryptfs_follow_link(struct dentry *dentry, void **cookie)
  {
         size_t len;
         char *buf = ecryptfs_readlink_lower(dentry, &len);
         if (IS_ERR(buf))
-               goto out;
+               return buf;
         fsstack_copy_attr_atime(d_inode(dentry),
                                 d_inode(ecryptfs_dentry_to_lower(dentry)));
         buf[len] = '\0';
-out:
-       nd_set_link(nd, buf);
-       return NULL;
+       return *cookie = buf;
  }
  
  /**
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild

index b47c7b8dc275429e87b1b35fe0a36af2e820cdf3..a364fd0965ec6a35e27c386b22916a20de9469b3 100644 (file)
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -16,5 +16,5 @@
  libore-y := ore.o ore_raid.o
  obj-$(CONFIG_ORE) += libore.o
  
-exofs-y := inode.o file.o symlink.o namei.o dir.o super.o sys.o
+exofs-y := inode.o file.o namei.o dir.o super.o sys.o
  obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h

index ad9cac670a470d163001c7aa2227db46afc607d3..2e86086bc9403efe99a25ab0df439db0c714eb4e 100644 (file)
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -207,10 +207,6 @@ extern const struct address_space_operations exofs_aops;
  extern const struct inode_operations exofs_dir_inode_operations;
  extern const struct inode_operations exofs_special_inode_operations;
  
-/* symlink.c         */
-extern const struct inode_operations exofs_symlink_inode_operations;
-extern const struct inode_operations exofs_fast_symlink_inode_operations;
-
  /* exofs_init_comps will initialize an ore_components device array
   * pointing to a single ore_comp struct, and a round-robin view
   * of the device table.
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c

index 786e4cc8c889cc8903f7d734ee3e255c0aeb24f8..73c64daa0f5517b4ff8271bd17e25740bb83d506 100644 (file)
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -1222,10 +1222,11 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
                 inode->i_fop = &exofs_dir_operations;
                 inode->i_mapping->a_ops = &exofs_aops;
         } else if (S_ISLNK(inode->i_mode)) {
-               if (exofs_inode_is_fast_symlink(inode))
-                       inode->i_op = &exofs_fast_symlink_inode_operations;
-               else {
-                       inode->i_op = &exofs_symlink_inode_operations;
+               if (exofs_inode_is_fast_symlink(inode)) {
+                       inode->i_op = &simple_symlink_inode_operations;
+                       inode->i_link = (char *)oi->i_data;
+               } else {
+                       inode->i_op = &page_symlink_inode_operations;
                         inode->i_mapping->a_ops = &exofs_aops;
                 }
         } else {
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c

index 5ae25e43119185e04d19a287e534921cc61bbc1c..09a6bb1ad63c840b91ef56114871753454046292 100644 (file)
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -113,7 +113,7 @@ static int exofs_symlink(struct inode *dir, struct dentry *dentry,
         oi = exofs_i(inode);
         if (l > sizeof(oi->i_data)) {
                 /* slow symlink */
-               inode->i_op = &exofs_symlink_inode_operations;
+               inode->i_op = &page_symlink_inode_operations;
                 inode->i_mapping->a_ops = &exofs_aops;
                 memset(oi->i_data, 0, sizeof(oi->i_data));
  
@@ -122,7 +122,8 @@ static int exofs_symlink(struct inode *dir, struct dentry *dentry,
                         goto out_fail;
         } else {
                 /* fast symlink */
-               inode->i_op = &exofs_fast_symlink_inode_operations;
+               inode->i_op = &simple_symlink_inode_operations;
+               inode->i_link = (char *)oi->i_data;
                 memcpy(oi->i_data, symname, l);
                 inode->i_size = l-1;
         }
diff --git a/fs/exofs/symlink.c b/fs/exofs/symlink.c

deleted file mode 100644 (file)

index 6f6f3a4..0000000
--- a/fs/exofs/symlink.c
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * Copyrights for code taken from ext2:
- *     Copyright (C) 1992, 1993, 1994, 1995
- *     Remy Card (card@masi.ibp.fr)
- *     Laboratoire MASI - Institut Blaise Pascal
- *     Universite Pierre et Marie Curie (Paris VI)
- *     from
- *     linux/fs/minix/inode.c
- *     Copyright (C) 1991, 1992  Linus Torvalds
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation.  Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-#include <linux/namei.h>
-
-#include "exofs.h"
-
-static void *exofs_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-       struct exofs_i_info *oi = exofs_i(d_inode(dentry));
-
-       nd_set_link(nd, (char *)oi->i_data);
-       return NULL;
-}
-
-const struct inode_operations exofs_symlink_inode_operations = {
-       .readlink       = generic_readlink,
-       .follow_link    = page_follow_link_light,
-       .put_link       = page_put_link,
-};
-
-const struct inode_operations exofs_fast_symlink_inode_operations = {
-       .readlink       = generic_readlink,
-       .follow_link    = exofs_follow_link,
-};
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c

index f460ae36d5b78addfd9cc1a6eb6c4287887a946e..5c09776d347fc363c4f456862eb2361d717e46bd 100644 (file)
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1403,6 +1403,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
                         inode->i_mapping->a_ops = &ext2_aops;
         } else if (S_ISLNK(inode->i_mode)) {
                 if (ext2_inode_is_fast_symlink(inode)) {
+                       inode->i_link = (char *)ei->i_data;
                         inode->i_op = &ext2_fast_symlink_inode_operations;
                         nd_terminate_link(ei->i_data, inode->i_size,
                                 sizeof(ei->i_data) - 1);
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c

index 3e074a9ccbe6dd048c288ae8162229b0af26d176..13ec54a99c962a85bc628732102fdb554f750d1c 100644 (file)
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -189,7 +189,8 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry,
         } else {
                 /* fast symlink */
                 inode->i_op = &ext2_fast_symlink_inode_operations;
-               memcpy((char*)(EXT2_I(inode)->i_data),symname,l);
+               inode->i_link = (char*)EXT2_I(inode)->i_data;
+               memcpy(inode->i_link, symname, l);
                 inode->i_size = l-1;
         }
         mark_inode_dirty(inode);
diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c

index 20608f17c2e5144ed6283e6afe7f23dae6f51031..ae17179f3810b2dd635c81203643a14f8f4c0c10 100644 (file)
--- a/fs/ext2/symlink.c
+++ b/fs/ext2/symlink.c
@@ -19,14 +19,6 @@
  
  #include "ext2.h"
  #include "xattr.h"
-#include <linux/namei.h>
-
-static void *ext2_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-       struct ext2_inode_info *ei = EXT2_I(d_inode(dentry));
-       nd_set_link(nd, (char *)ei->i_data);
-       return NULL;
-}
  
  const struct inode_operations ext2_symlink_inode_operations = {
         .readlink       = generic_readlink,
@@ -43,7 +35,7 @@ const struct inode_operations ext2_symlink_inode_operations = {
   
  const struct inode_operations ext2_fast_symlink_inode_operations = {
         .readlink       = generic_readlink,
-       .follow_link    = ext2_follow_link,
+       .follow_link    = simple_follow_link,
         .setattr        = ext2_setattr,
  #ifdef CONFIG_EXT2_FS_XATTR
         .setxattr       = generic_setxattr,
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c

index 2ee2dc4351d1630b375da3b9aaa062840ce1afc9..6c7e5468a2f807d68e48b7c43b13a0626a5f4aee 100644 (file)
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -2999,6 +2999,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
                         inode->i_op = &ext3_fast_symlink_inode_operations;
                         nd_terminate_link(ei->i_data, inode->i_size,
                                 sizeof(ei->i_data) - 1);
+                       inode->i_link = (char *)ei->i_data;
                 } else {
                         inode->i_op = &ext3_symlink_inode_operations;
                         ext3_set_aops(inode);
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c

index 4264b9bd0002f199593308feaf7906292311e42f..c9e767cd4b67991e3ecf4bebc77b8eaa6be2f752 100644 (file)
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -2308,7 +2308,8 @@ retry:
                 }
         } else {
                 inode->i_op = &ext3_fast_symlink_inode_operations;
-               memcpy((char*)&EXT3_I(inode)->i_data,symname,l);
+               inode->i_link = (char*)&EXT3_I(inode)->i_data;
+               memcpy(inode->i_link, symname, l);
                 inode->i_size = l-1;
         }
         EXT3_I(inode)->i_disksize = inode->i_size;
diff --git a/fs/ext3/symlink.c b/fs/ext3/symlink.c

index ea96df3c58db199915e6e9b60b1d7beb9b931150..c08c59094ae61f3172c58e237477a7cb7518e50f 100644 (file)
--- a/fs/ext3/symlink.c
+++ b/fs/ext3/symlink.c
@@ -17,17 +17,9 @@
   *  ext3 symlink handling code
   */
  
-#include <linux/namei.h>
  #include "ext3.h"
  #include "xattr.h"
  
-static void * ext3_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-       struct ext3_inode_info *ei = EXT3_I(d_inode(dentry));
-       nd_set_link(nd, (char*)ei->i_data);
-       return NULL;
-}
-
  const struct inode_operations ext3_symlink_inode_operations = {
         .readlink       = generic_readlink,
         .follow_link    = page_follow_link_light,
@@ -43,7 +35,7 @@ const struct inode_operations ext3_symlink_inode_operations = {
  
  const struct inode_operations ext3_fast_symlink_inode_operations = {
         .readlink       = generic_readlink,
-       .follow_link    = ext3_follow_link,
+       .follow_link    = simple_follow_link,
         .setattr        = ext3_setattr,
  #ifdef CONFIG_EXT3_FS_XATTR
         .setxattr       = generic_setxattr,
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h

index 9a83f149ac85525b821a4d24ba5387b93b3229d6..0a3b72d1d458bd68834465a70fff3d323959f997 100644 (file)
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2847,6 +2847,7 @@ extern int ext4_mpage_readpages(struct address_space *mapping,
                                 unsigned nr_pages);
  
  /* symlink.c */
+extern const struct inode_operations ext4_encrypted_symlink_inode_operations;
  extern const struct inode_operations ext4_symlink_inode_operations;
  extern const struct inode_operations ext4_fast_symlink_inode_operations;
  
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c

index 0554b0b5957bb5db223534f2116d5eea18eae2d4..5168c9b568809d81f66cd4813bc25cba5d669fc9 100644 (file)
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4213,8 +4213,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                 inode->i_op = &ext4_dir_inode_operations;
                 inode->i_fop = &ext4_dir_operations;
         } else if (S_ISLNK(inode->i_mode)) {
-               if (ext4_inode_is_fast_symlink(inode) &&
-                   !ext4_encrypted_inode(inode)) {
+               if (ext4_encrypted_inode(inode)) {
+                       inode->i_op = &ext4_encrypted_symlink_inode_operations;
+                       ext4_set_aops(inode);
+               } else if (ext4_inode_is_fast_symlink(inode)) {
+                       inode->i_link = (char *)ei->i_data;
                         inode->i_op = &ext4_fast_symlink_inode_operations;
                         nd_terminate_link(ei->i_data, inode->i_size,
                                 sizeof(ei->i_data) - 1);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c

index 814f3beb436965f116b7555ee8cf9ac30c3f0165..5fdb9f6aa869445ca9893751393e4822b2e0ed63 100644 (file)
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -3206,10 +3206,12 @@ static int ext4_symlink(struct inode *dir,
                         goto err_drop_inode;
                 sd->len = cpu_to_le16(ostr.len);
                 disk_link.name = (char *) sd;
+               inode->i_op = &ext4_encrypted_symlink_inode_operations;
         }
  
         if ((disk_link.len > EXT4_N_BLOCKS * 4)) {
-               inode->i_op = &ext4_symlink_inode_operations;
+               if (!encryption_required)
+                       inode->i_op = &ext4_symlink_inode_operations;
                 ext4_set_aops(inode);
                 /*
                  * We cannot call page_symlink() with transaction started
@@ -3249,9 +3251,10 @@ static int ext4_symlink(struct inode *dir,
         } else {
                 /* clear the extent format for fast symlink */
                 ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
-               inode->i_op = encryption_required ?
-                       &ext4_symlink_inode_operations :
-                       &ext4_fast_symlink_inode_operations;
+               if (!encryption_required) {
+                       inode->i_op = &ext4_fast_symlink_inode_operations;
+                       inode->i_link = (char *)&EXT4_I(inode)->i_data;
+               }
                 memcpy((char *)&EXT4_I(inode)->i_data, disk_link.name,
                        disk_link.len);
                 inode->i_size = disk_link.len - 1;
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c

index 187b789203142d6b444b264acd44798427626b41..ba5bd18a9825242fdfc9e8ee4cb05f1cc7cd05f7 100644 (file)
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -23,7 +23,7 @@
  #include "xattr.h"
  
  #ifdef CONFIG_EXT4_FS_ENCRYPTION
-static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *ext4_follow_link(struct dentry *dentry, void **cookie)
  {
         struct page *cpage = NULL;
         char *caddr, *paddr = NULL;
@@ -35,12 +35,9 @@ static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
         int res;
         u32 plen, max_size = inode->i_sb->s_blocksize;
  
-       if (!ext4_encrypted_inode(inode))
-               return page_follow_link_light(dentry, nd);
-
         ctx = ext4_get_fname_crypto_ctx(inode, inode->i_sb->s_blocksize);
         if (IS_ERR(ctx))
-               return ctx;
+               return ERR_CAST(ctx);
  
         if (ext4_inode_is_fast_symlink(inode)) {
                 caddr = (char *) EXT4_I(inode)->i_data;
@@ -49,7 +46,7 @@ static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
                 cpage = read_mapping_page(inode->i_mapping, 0, NULL);
                 if (IS_ERR(cpage)) {
                         ext4_put_fname_crypto_ctx(&ctx);
-                       return cpage;
+                       return ERR_CAST(cpage);
                 }
                 caddr = kmap(cpage);
                 caddr[size] = 0;
@@ -80,13 +77,12 @@ static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
         /* Null-terminate the name */
         if (res <= plen)
                 paddr[res] = '\0';
-       nd_set_link(nd, paddr);
         ext4_put_fname_crypto_ctx(&ctx);
         if (cpage) {
                 kunmap(cpage);
                 page_cache_release(cpage);
         }
-       return NULL;
+       return *cookie = paddr;
  errout:
         ext4_put_fname_crypto_ctx(&ctx);
         if (cpage) {
@@ -97,36 +93,22 @@ errout:
         return ERR_PTR(res);
  }
  
-static void ext4_put_link(struct dentry *dentry, struct nameidata *nd,
-                         void *cookie)
-{
-       struct page *page = cookie;
-
-       if (!page) {
-               kfree(nd_get_link(nd));
-       } else {
-               kunmap(page);
-               page_cache_release(page);
-       }
-}
+const struct inode_operations ext4_encrypted_symlink_inode_operations = {
+       .readlink       = generic_readlink,
+       .follow_link    = ext4_follow_link,
+       .put_link       = kfree_put_link,
+       .setattr        = ext4_setattr,
+       .setxattr       = generic_setxattr,
+       .getxattr       = generic_getxattr,
+       .listxattr      = ext4_listxattr,
+       .removexattr    = generic_removexattr,
+};
  #endif
  
-static void *ext4_follow_fast_link(struct dentry *dentry, struct nameidata *nd)
-{
-       struct ext4_inode_info *ei = EXT4_I(d_inode(dentry));
-       nd_set_link(nd, (char *) ei->i_data);
-       return NULL;
-}
-
  const struct inode_operations ext4_symlink_inode_operations = {
         .readlink       = generic_readlink,
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
-       .follow_link    = ext4_follow_link,
-       .put_link       = ext4_put_link,
-#else
         .follow_link    = page_follow_link_light,
         .put_link       = page_put_link,
-#endif
         .setattr        = ext4_setattr,
         .setxattr       = generic_setxattr,
         .getxattr       = generic_getxattr,
@@ -136,7 +118,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
  
  const struct inode_operations ext4_fast_symlink_inode_operations = {
         .readlink       = generic_readlink,
-       .follow_link    = ext4_follow_fast_link,
+       .follow_link    = simple_follow_link,
         .setattr        = ext4_setattr,
         .setxattr       = generic_setxattr,
         .getxattr       = generic_getxattr,
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c

index 658e8079aaf9b9020068bd30aad723d168899309..71765d062914a515fc7603843cabd2b456d5a189 100644 (file)
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -296,19 +296,15 @@ fail:
         return err;
  }
  
-static void *f2fs_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *f2fs_follow_link(struct dentry *dentry, void **cookie)
  {
-       struct page *page = page_follow_link_light(dentry, nd);
-
-       if (IS_ERR_OR_NULL(page))
-               return page;
-
-       /* this is broken symlink case */
-       if (*nd_get_link(nd) == 0) {
-               page_put_link(dentry, nd, page);
-               return ERR_PTR(-ENOENT);
+       const char *link = page_follow_link_light(dentry, cookie);
+       if (!IS_ERR(link) && !*link) {
+               /* this is broken symlink case */
+               page_put_link(NULL, *cookie);
+               link = ERR_PTR(-ENOENT);
         }
-       return page;
+       return link;
  }
  
  static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
diff --git a/fs/fhandle.c b/fs/fhandle.c

index 999ff5c3cab0edacd585447132180d5c35554e3c..d59712dfa3e701e86ff53609308e813cf8acf69e 100644 (file)
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -195,8 +195,9 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
                 goto out_err;
         }
         /* copy the full handle */
-       if (copy_from_user(handle, ufh,
-                          sizeof(struct file_handle) +
+       *handle = f_handle;
+       if (copy_from_user(&handle->f_handle,
+                          &ufh->f_handle,
                            f_handle.handle_bytes)) {
                 retval = -EFAULT;
                 goto out_handle;
diff --git a/fs/freevxfs/vxfs_extern.h b/fs/freevxfs/vxfs_extern.h

index 881aa3d217f007a76361ff1c23f77499bdab851e..e3dcb4467d92752af6980549fb740f97d11d1f47 100644 (file)
--- a/fs/freevxfs/vxfs_extern.h
+++ b/fs/freevxfs/vxfs_extern.h
@@ -50,9 +50,6 @@ extern daddr_t                        vxfs_bmap1(struct inode *, long);
  /* vxfs_fshead.c */
  extern int                     vxfs_read_fshead(struct super_block *);
  
-/* vxfs_immed.c */
-extern const struct inode_operations vxfs_immed_symlink_iops;
-
  /* vxfs_inode.c */
  extern const struct address_space_operations vxfs_immed_aops;
  extern struct kmem_cache       *vxfs_inode_cachep;
diff --git a/fs/freevxfs/vxfs_immed.c b/fs/freevxfs/vxfs_immed.c

index 8b9229e2ca5cb572976a839ca20f58e0514b6726..cb84f0fcc72a468c1366f498554ab4c79ac26830 100644 (file)
--- a/fs/freevxfs/vxfs_immed.c
+++ b/fs/freevxfs/vxfs_immed.c
@@ -32,28 +32,14 @@
   */
  #include <linux/fs.h>
  #include <linux/pagemap.h>
-#include <linux/namei.h>
  
  #include "vxfs.h"
  #include "vxfs_extern.h"
  #include "vxfs_inode.h"
  
  
-static void *  vxfs_immed_follow_link(struct dentry *, struct nameidata *);
-
  static int     vxfs_immed_readpage(struct file *, struct page *);
  
-/*
- * Inode operations for immed symlinks.
- *
- * Unliked all other operations we do not go through the pagecache,
- * but do all work directly on the inode.
- */
-const struct inode_operations vxfs_immed_symlink_iops = {
-       .readlink =             generic_readlink,
-       .follow_link =          vxfs_immed_follow_link,
-};
-
  /*
   * Address space operations for immed files and directories.
   */
@@ -61,26 +47,6 @@ const struct address_space_operations vxfs_immed_aops = {
         .readpage =             vxfs_immed_readpage,
  };
  
-/**
- * vxfs_immed_follow_link - follow immed symlink
- * @dp:                dentry for the link
- * @np:                pathname lookup data for the current path walk
- *
- * Description:
- *   vxfs_immed_follow_link restarts the pathname lookup with
- *   the data obtained from @dp.
- *
- * Returns:
- *   Zero on success, else a negative error code.
- */
-static void *
-vxfs_immed_follow_link(struct dentry *dp, struct nameidata *np)
-{
-       struct vxfs_inode_info          *vip = VXFS_INO(d_inode(dp));
-       nd_set_link(np, vip->vii_immed.vi_immed);
-       return NULL;
-}
-
  /**
   * vxfs_immed_readpage - read part of an immed inode into pagecache
   * @file:      file context (unused)
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c

index 363e3ae25f6b42c775f6c09f6251786555adeaa2..ef73ed674a27162917845b0507269bdf86b273da 100644 (file)
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -35,6 +35,7 @@
  #include <linux/pagemap.h>
  #include <linux/kernel.h>
  #include <linux/slab.h>
+#include <linux/namei.h>
  
  #include "vxfs.h"
  #include "vxfs_inode.h"
@@ -327,8 +328,10 @@ vxfs_iget(struct super_block *sbp, ino_t ino)
                         ip->i_op = &page_symlink_inode_operations;
                         ip->i_mapping->a_ops = &vxfs_aops;
                 } else {
-                       ip->i_op = &vxfs_immed_symlink_iops;
-                       vip->vii_immed.vi_immed[ip->i_size] = '\0';
+                       ip->i_op = &simple_symlink_inode_operations;
+                       ip->i_link = vip->vii_immed.vi_immed;
+                       nd_terminate_link(ip->i_link, ip->i_size,
+                                         sizeof(vip->vii_immed.vi_immed) - 1);
                 }
         } else
                 init_special_inode(ip, ip->i_mode, old_decode_dev(vip->vii_rdev));
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c

index 0572bca49f1546b3d9cd3b00fb7fc21f0369308b..5e2e08712d3ba614a46687d5688fc2f01cd835be 100644 (file)
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1365,7 +1365,7 @@ static int fuse_readdir(struct file *file, struct dir_context *ctx)
         return err;
  }
  
-static char *read_link(struct dentry *dentry)
+static const char *fuse_follow_link(struct dentry *dentry, void **cookie)
  {
         struct inode *inode = d_inode(dentry);
         struct fuse_conn *fc = get_fuse_conn(inode);
@@ -1389,28 +1389,12 @@ static char *read_link(struct dentry *dentry)
                 link = ERR_PTR(ret);
         } else {
                 link[ret] = '\0';
+               *cookie = link;
         }
         fuse_invalidate_atime(inode);
         return link;
  }
  
-static void free_link(char *link)
-{
-       if (!IS_ERR(link))
-               free_page((unsigned long) link);
-}
-
-static void *fuse_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-       nd_set_link(nd, read_link(dentry));
-       return NULL;
-}
-
-static void fuse_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
-{
-       free_link(nd_get_link(nd));
-}
-
  static int fuse_dir_open(struct inode *inode, struct file *file)
  {
         return fuse_open_common(inode, file, true);
@@ -1926,7 +1910,7 @@ static const struct inode_operations fuse_common_inode_operations = {
  static const struct inode_operations fuse_symlink_inode_operations = {
         .setattr        = fuse_setattr,
         .follow_link    = fuse_follow_link,
-       .put_link       = fuse_put_link,
+       .put_link       = free_page_put_link,
         .readlink       = generic_readlink,
         .getattr        = fuse_getattr,
         .setxattr       = fuse_setxattr,
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c

index 1b3ca7a2e3fcfb807d4505d81bace6b698272bff..3a1461de1551d5765b7334747e69d2909ffb8d00 100644 (file)
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1548,7 +1548,7 @@ out:
   * Returns: 0 on success or error code
   */
  
-static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *gfs2_follow_link(struct dentry *dentry, void **cookie)
  {
         struct gfs2_inode *ip = GFS2_I(d_inode(dentry));
         struct gfs2_holder i_gh;
@@ -1561,8 +1561,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
         error = gfs2_glock_nq(&i_gh);
         if (error) {
                 gfs2_holder_uninit(&i_gh);
-               nd_set_link(nd, ERR_PTR(error));
-               return NULL;
+               return ERR_PTR(error);
         }
  
         size = (unsigned int)i_size_read(&ip->i_inode);
@@ -1586,8 +1585,9 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
         brelse(dibh);
  out:
         gfs2_glock_dq_uninit(&i_gh);
-       nd_set_link(nd, buf);
-       return NULL;
+       if (!IS_ERR(buf))
+               *cookie = buf;
+       return buf;
  }
  
  /**
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c

index 07d8d8f52faf50d027f2f90a1699c7afa274aa34..059597b23f677b0959d8264b83cf4c4a2cec34b7 100644 (file)
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -892,7 +892,7 @@ static const struct inode_operations hostfs_dir_iops = {
         .setattr        = hostfs_setattr,
  };
  
-static void *hostfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *hostfs_follow_link(struct dentry *dentry, void **cookie)
  {
         char *link = __getname();
         if (link) {
@@ -906,21 +906,18 @@ static void *hostfs_follow_link(struct dentry *dentry, struct nameidata *nd)
                 }
                 if (err < 0) {
                         __putname(link);
-                       link = ERR_PTR(err);
+                       return ERR_PTR(err);
                 }
         } else {
-               link = ERR_PTR(-ENOMEM);
+               return ERR_PTR(-ENOMEM);
         }
  
-       nd_set_link(nd, link);
-       return NULL;
+       return *cookie = link;
  }
  
-static void hostfs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
+static void hostfs_put_link(struct inode *unused, void *cookie)
  {
-       char *s = nd_get_link(nd);
-       if (!IS_ERR(s))
-               __putname(s);
+       __putname(cookie);
  }
  
  static const struct inode_operations hostfs_link_iops = {
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c

index fa2bd5366ecf1f4c3d9b81c2d80336a621831dcf..2867837909a91ba005af78ea3ba4b5191e13c1d5 100644 (file)
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -642,20 +642,19 @@ static int hppfs_readlink(struct dentry *dentry, char __user *buffer,
                                                     buflen);
  }
  
-static void *hppfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *hppfs_follow_link(struct dentry *dentry, void **cookie)
  {
         struct dentry *proc_dentry = HPPFS_I(d_inode(dentry))->proc_dentry;
  
-       return d_inode(proc_dentry)->i_op->follow_link(proc_dentry, nd);
+       return d_inode(proc_dentry)->i_op->follow_link(proc_dentry, cookie);
  }
  
-static void hppfs_put_link(struct dentry *dentry, struct nameidata *nd,
-                          void *cookie)
+static void hppfs_put_link(struct inode *inode, void *cookie)
  {
-       struct dentry *proc_dentry = HPPFS_I(d_inode(dentry))->proc_dentry;
+       struct inode *proc_inode = d_inode(HPPFS_I(inode)->proc_dentry);
  
-       if (d_inode(proc_dentry)->i_op->put_link)
-               d_inode(proc_dentry)->i_op->put_link(proc_dentry, nd, cookie);
+       if (proc_inode->i_op->put_link)
+               proc_inode->i_op->put_link(proc_inode, cookie);
  }
  
  static const struct inode_operations hppfs_dir_iops = {
diff --git a/fs/inode.c b/fs/inode.c

index ea37cd17b53f0c98b47e2e626b9ff2a9b6e5699b..e8d62688ed9181e511e2a0e8c6a5f36840cdbe94 100644 (file)
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -152,6 +152,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
         inode->i_pipe = NULL;
         inode->i_bdev = NULL;
         inode->i_cdev = NULL;
+       inode->i_link = NULL;
         inode->i_rdev = 0;
         inode->dirtied_when = 0;
  
@@ -1584,36 +1585,47 @@ static int update_time(struct inode *inode, struct timespec *time, int flags)
   *     This function automatically handles read only file systems and media,
   *     as well as the "noatime" flag and inode specific "noatime" markers.
   */
-void touch_atime(const struct path *path)
+bool atime_needs_update(const struct path *path, struct inode *inode)
  {
         struct vfsmount *mnt = path->mnt;
-       struct inode *inode = d_inode(path->dentry);
         struct timespec now;
  
         if (inode->i_flags & S_NOATIME)
-               return;
+               return false;
         if (IS_NOATIME(inode))
-               return;
+               return false;
         if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
-               return;
+               return false;
  
         if (mnt->mnt_flags & MNT_NOATIME)
-               return;
+               return false;
         if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
-               return;
+               return false;
  
         now = current_fs_time(inode->i_sb);
  
         if (!relatime_need_update(mnt, inode, now))
-               return;
+               return false;
  
         if (timespec_equal(&inode->i_atime, &now))
+               return false;
+
+       return true;
+}
+
+void touch_atime(const struct path *path)
+{
+       struct vfsmount *mnt = path->mnt;
+       struct inode *inode = d_inode(path->dentry);
+       struct timespec now;
+
+       if (!atime_needs_update(path, inode))
                 return;
  
         if (!sb_start_write_trylock(inode->i_sb))
                 return;
  
-       if (__mnt_want_write(mnt))
+       if (__mnt_want_write(mnt) != 0)
                 goto skip_update;
         /*
          * File systems can error out when updating inodes if they need to
@@ -1624,6 +1636,7 @@ void touch_atime(const struct path *path)
          * We may also fail on filesystems that have the ability to make parts
          * of the fs read only, e.g. subvolumes in Btrfs.
          */
+       now = current_fs_time(inode->i_sb);
         update_time(inode, &now, S_ATIME);
         __mnt_drop_write(mnt);
  skip_update:
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c

index 1ba5c97943b8751f0210870a7bb51636dd4e5ecd..81180022923fbd8ccd499d0b74af05b59695c302 100644 (file)
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -354,6 +354,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
                 ret = -ENOMEM;
                 goto fail;
         }
+       inode->i_link = f->target;
  
         jffs2_dbg(1, "%s(): symlink's target '%s' cached\n",
                   __func__, (char *)f->target);
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c

index fe5ea080b4ec810f29589b257f605389067dadf4..60d86e8fba6e9561bb4204ef2c99fe7c744288da 100644 (file)
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -294,6 +294,7 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
  
         case S_IFLNK:
                 inode->i_op = &jffs2_symlink_inode_operations;
+               inode->i_link = f->target;
                 break;
  
         case S_IFDIR:
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c

index 1fefa25d0fa586a14caae06efea7363794be38fc..8ce2f240125b39803b4ebf2d681b6a95d40c33f5 100644 (file)
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -9,58 +9,15 @@
   *
   */
  
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/fs.h>
-#include <linux/namei.h>
  #include "nodelist.h"
  
-static void *jffs2_follow_link(struct dentry *dentry, struct nameidata *nd);
-
  const struct inode_operations jffs2_symlink_inode_operations =
  {
         .readlink =     generic_readlink,
-       .follow_link =  jffs2_follow_link,
+       .follow_link =  simple_follow_link,
         .setattr =      jffs2_setattr,
         .setxattr =     jffs2_setxattr,
         .getxattr =     jffs2_getxattr,
         .listxattr =    jffs2_listxattr,
         .removexattr =  jffs2_removexattr
  };
-
-static void *jffs2_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-       struct jffs2_inode_info *f = JFFS2_INODE_INFO(d_inode(dentry));
-       char *p = (char *)f->target;
-
-       /*
-        * We don't acquire the f->sem mutex here since the only data we
-        * use is f->target.
-        *
-        * 1. If we are here the inode has already built and f->target has
-        * to point to the target path.
-        * 2. Nobody uses f->target (if the inode is symlink's inode). The
-        * exception is inode freeing function which frees f->target. But
-        * it can't be called while we are here and before VFS has
-        * stopped using our f->target string which we provide by means of
-        * nd_set_link() call.
-        */
-
-       if (!p) {
-               pr_err("%s(): can't find symlink target\n", __func__);
-               p = ERR_PTR(-EIO);
-       }
-       jffs2_dbg(1, "%s(): target path is '%s'\n",
-                 __func__, (char *)f->target);
-
-       nd_set_link(nd, p);
-
-       /*
-        * We will unlock the f->sem mutex but VFS will use the f->target string. This is safe
-        * since the only way that may cause f->target to be changed is iput() operation.
-        * But VFS will not use f->target after iput() has been called.
-        */
-       return NULL;
-}
-
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c

index 070dc4b335449423091e67dd74c0f1c34617b041..6f1cb2b5ee285dd50622f719296fe71284d5f826 100644 (file)
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -63,11 +63,12 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino)
                         inode->i_mapping->a_ops = &jfs_aops;
                 } else {
                         inode->i_op = &jfs_fast_symlink_inode_operations;
+                       inode->i_link = JFS_IP(inode)->i_inline;
                         /*
                          * The inline data should be null-terminated, but
                          * don't let on-disk corruption crash the kernel
                          */
-                       JFS_IP(inode)->i_inline[inode->i_size] = '\0';
+                       inode->i_link[inode->i_size] = '\0';
                 }
         } else {
                 inode->i_op = &jfs_file_inode_operations;
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c

index 66db7bc0ed1096050c2b2f93c3aa9e16a0ba4726..e33be921aa41b5ae56a4054605aaeea5b8065987 100644 (file)
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -880,7 +880,6 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
         int ssize;              /* source pathname size */
         struct btstack btstack;
         struct inode *ip = d_inode(dentry);
-       unchar *i_fastsymlink;
         s64 xlen = 0;
         int bmask = 0, xsize;
         s64 xaddr;
@@ -946,8 +945,8 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
         if (ssize <= IDATASIZE) {
                 ip->i_op = &jfs_fast_symlink_inode_operations;
  
-               i_fastsymlink = JFS_IP(ip)->i_inline;
-               memcpy(i_fastsymlink, name, ssize);
+               ip->i_link = JFS_IP(ip)->i_inline;
+               memcpy(ip->i_link, name, ssize);
                 ip->i_size = ssize - 1;
  
                 /*
diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c

index 80f42bcc4ef1295669de10ea1087384f1c98c95b..5929e2363cb85eddc0d54bf3a04754383cb395db 100644 (file)
--- a/fs/jfs/symlink.c
+++ b/fs/jfs/symlink.c
@@ -17,21 +17,13 @@
   */
  
  #include <linux/fs.h>
-#include <linux/namei.h>
  #include "jfs_incore.h"
  #include "jfs_inode.h"
  #include "jfs_xattr.h"
  
-static void *jfs_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-       char *s = JFS_IP(d_inode(dentry))->i_inline;
-       nd_set_link(nd, s);
-       return NULL;
-}
-
  const struct inode_operations jfs_fast_symlink_inode_operations = {
         .readlink       = generic_readlink,
-       .follow_link    = jfs_follow_link,
+       .follow_link    = simple_follow_link,
         .setattr        = jfs_setattr,
         .setxattr       = jfs_setxattr,
         .getxattr       = jfs_getxattr,
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c

index 8a198898e39afd3ffde994cee7d732dcdfa8bdcd..db272528ab5bb01c192b5502650f29e0784663ce 100644 (file)
--- a/fs/kernfs/symlink.c
+++ b/fs/kernfs/symlink.c
@@ -112,25 +112,18 @@ static int kernfs_getlink(struct dentry *dentry, char *path)
         return error;
  }
  
-static void *kernfs_iop_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *kernfs_iop_follow_link(struct dentry *dentry, void **cookie)
  {
         int error = -ENOMEM;
         unsigned long page = get_zeroed_page(GFP_KERNEL);
-       if (page) {
-               error = kernfs_getlink(dentry, (char *) page);
-               if (error < 0)
-                       free_page((unsigned long)page);
-       }
-       nd_set_link(nd, error ? ERR_PTR(error) : (char *)page);
-       return NULL;
-}
-
-static void kernfs_iop_put_link(struct dentry *dentry, struct nameidata *nd,
-                               void *cookie)
-{
-       char *page = nd_get_link(nd);
-       if (!IS_ERR(page))
+       if (!page)
+               return ERR_PTR(-ENOMEM);
+       error = kernfs_getlink(dentry, (char *)page);
+       if (unlikely(error < 0)) {
                 free_page((unsigned long)page);
+               return ERR_PTR(error);
+       }
+       return *cookie = (char *)page;
  }
  
  const struct inode_operations kernfs_symlink_iops = {
@@ -140,7 +133,7 @@ const struct inode_operations kernfs_symlink_iops = {
         .listxattr      = kernfs_iop_listxattr,
         .readlink       = generic_readlink,
         .follow_link    = kernfs_iop_follow_link,
-       .put_link       = kernfs_iop_put_link,
+       .put_link       = free_page_put_link,
         .setattr        = kernfs_iop_setattr,
         .getattr        = kernfs_iop_getattr,
         .permission     = kernfs_iop_permission,
diff --git a/fs/libfs.c b/fs/libfs.c

index cb1fb4b9b6377b09b669b833cc3437b259b622f6..65e1feca8b982c55bff37e5a85529f8cb0d4121e 100644 (file)
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1024,15 +1024,18 @@ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync)
  }
  EXPORT_SYMBOL(noop_fsync);
  
-void kfree_put_link(struct dentry *dentry, struct nameidata *nd,
-                               void *cookie)
+void kfree_put_link(struct inode *unused, void *cookie)
  {
-       char *s = nd_get_link(nd);
-       if (!IS_ERR(s))
-               kfree(s);
+       kfree(cookie);
  }
  EXPORT_SYMBOL(kfree_put_link);
  
+void free_page_put_link(struct inode *unused, void *cookie)
+{
+       free_page((unsigned long) cookie);
+}
+EXPORT_SYMBOL(free_page_put_link);
+
  /*
   * nop .set_page_dirty method so that people can use .page_mkwrite on
   * anon inodes.
@@ -1093,3 +1096,15 @@ simple_nosetlease(struct file *filp, long arg, struct file_lock **flp,
         return -EINVAL;
  }
  EXPORT_SYMBOL(simple_nosetlease);
+
+const char *simple_follow_link(struct dentry *dentry, void **cookie)
+{
+       return d_inode(dentry)->i_link;
+}
+EXPORT_SYMBOL(simple_follow_link);
+
+const struct inode_operations simple_symlink_inode_operations = {
+       .follow_link = simple_follow_link,
+       .readlink = generic_readlink
+};
+EXPORT_SYMBOL(simple_symlink_inode_operations);
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c

index 4cf38f1185494115c0dcb8625f07b6593ae15fd5..f9b45d46d4c483ea0be1ceca4c35b9b3075b56b9 100644 (file)
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -779,6 +779,7 @@ fail:
  const struct inode_operations logfs_symlink_iops = {
         .readlink       = generic_readlink,
         .follow_link    = page_follow_link_light,
+       .put_link       = page_put_link,
  };
  
  const struct inode_operations logfs_dir_iops = {
diff --git a/fs/mount.h b/fs/mount.h

index 6a61c2b3e385cfabcf53cdf13ed8ae9afa99ce1d..b5b8082bfa4208086a7ee06741cb76670d51bab9 100644 (file)
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -88,6 +88,7 @@ static inline int is_mounted(struct vfsmount *mnt)
  extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);
  extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *);
  
+extern int __legitimize_mnt(struct vfsmount *, unsigned);
  extern bool legitimize_mnt(struct vfsmount *, unsigned);
  
  extern void __detach_mounts(struct dentry *dentry);
diff --git a/fs/namei.c b/fs/namei.c

index fe30d3be43a8b381d3b9ac3016b28531996f91b7..2dad0eaf91d34d8f47d3cc525eafd45107d429bd 100644 (file)
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -492,6 +492,7 @@ void path_put(const struct path *path)
  }
  EXPORT_SYMBOL(path_put);
  
+#define EMBEDDED_LEVELS 2
  struct nameidata {
         struct path     path;
         struct qstr     last;
@@ -501,10 +502,139 @@ struct nameidata {
         unsigned        seq, m_seq;
         int             last_type;
         unsigned        depth;
-       struct file     *base;
-       char *saved_names[MAX_NESTED_LINKS + 1];
+       int             total_link_count;
+       struct saved {
+               struct path link;
+               void *cookie;
+               const char *name;
+               struct inode *inode;
+               unsigned seq;
+       } *stack, internal[EMBEDDED_LEVELS];
+       struct filename *name;
+       struct nameidata *saved;
+       unsigned        root_seq;
+       int             dfd;
  };
  
+static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
+{
+       struct nameidata *old = current->nameidata;
+       p->stack = p->internal;
+       p->dfd = dfd;
+       p->name = name;
+       p->total_link_count = old ? old->total_link_count : 0;
+       p->saved = old;
+       current->nameidata = p;
+}
+
+static void restore_nameidata(void)
+{
+       struct nameidata *now = current->nameidata, *old = now->saved;
+
+       current->nameidata = old;
+       if (old)
+               old->total_link_count = now->total_link_count;
+       if (now->stack != now->internal) {
+               kfree(now->stack);
+               now->stack = now->internal;
+       }
+}
+
+static int __nd_alloc_stack(struct nameidata *nd)
+{
+       struct saved *p;
+
+       if (nd->flags & LOOKUP_RCU) {
+               p= kmalloc(MAXSYMLINKS * sizeof(struct saved),
+                                 GFP_ATOMIC);
+               if (unlikely(!p))
+                       return -ECHILD;
+       } else {
+               p= kmalloc(MAXSYMLINKS * sizeof(struct saved),
+                                 GFP_KERNEL);
+               if (unlikely(!p))
+                       return -ENOMEM;
+       }
+       memcpy(p, nd->internal, sizeof(nd->internal));
+       nd->stack = p;
+       return 0;
+}
+
+static inline int nd_alloc_stack(struct nameidata *nd)
+{
+       if (likely(nd->depth != EMBEDDED_LEVELS))
+               return 0;
+       if (likely(nd->stack != nd->internal))
+               return 0;
+       return __nd_alloc_stack(nd);
+}
+
+static void drop_links(struct nameidata *nd)
+{
+       int i = nd->depth;
+       while (i--) {
+               struct saved *last = nd->stack + i;
+               struct inode *inode = last->inode;
+               if (last->cookie && inode->i_op->put_link) {
+                       inode->i_op->put_link(inode, last->cookie);
+                       last->cookie = NULL;
+               }
+       }
+}
+
+static void terminate_walk(struct nameidata *nd)
+{
+       drop_links(nd);
+       if (!(nd->flags & LOOKUP_RCU)) {
+               int i;
+               path_put(&nd->path);
+               for (i = 0; i < nd->depth; i++)
+                       path_put(&nd->stack[i].link);
+               if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
+                       path_put(&nd->root);
+                       nd->root.mnt = NULL;
+               }
+       } else {
+               nd->flags &= ~LOOKUP_RCU;
+               if (!(nd->flags & LOOKUP_ROOT))
+                       nd->root.mnt = NULL;
+               rcu_read_unlock();
+       }
+       nd->depth = 0;
+}
+
+/* path_put is needed afterwards regardless of success or failure */
+static bool legitimize_path(struct nameidata *nd,
+                           struct path *path, unsigned seq)
+{
+       int res = __legitimize_mnt(path->mnt, nd->m_seq);
+       if (unlikely(res)) {
+               if (res > 0)
+                       path->mnt = NULL;
+               path->dentry = NULL;
+               return false;
+       }
+       if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
+               path->dentry = NULL;
+               return false;
+       }
+       return !read_seqcount_retry(&path->dentry->d_seq, seq);
+}
+
+static bool legitimize_links(struct nameidata *nd)
+{
+       int i;
+       for (i = 0; i < nd->depth; i++) {
+               struct saved *last = nd->stack + i;
+               if (unlikely(!legitimize_path(nd, &last->link, last->seq))) {
+                       drop_links(nd);
+                       nd->depth = i + 1;
+                       return false;
+               }
+       }
+       return true;
+}
+
  /*
   * Path walking has 2 modes, rcu-walk and ref-walk (see
   * Documentation/filesystems/path-lookup.txt).  In situations when we can't
@@ -520,35 +650,28 @@ struct nameidata {
   * unlazy_walk - try to switch to ref-walk mode.
   * @nd: nameidata pathwalk data
   * @dentry: child of nd->path.dentry or NULL
+ * @seq: seq number to check dentry against
   * Returns: 0 on success, -ECHILD on failure
   *
   * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry
   * for ref-walk mode.  @dentry must be a path found by a do_lookup call on
   * @nd or NULL.  Must be called from rcu-walk context.
+ * Nothing should touch nameidata between unlazy_walk() failure and
+ * terminate_walk().
   */
-static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
+static int unlazy_walk(struct nameidata *nd, struct dentry *dentry, unsigned seq)
  {
-       struct fs_struct *fs = current->fs;
         struct dentry *parent = nd->path.dentry;
  
         BUG_ON(!(nd->flags & LOOKUP_RCU));
  
-       /*
-        * After legitimizing the bastards, terminate_walk()
-        * will do the right thing for non-RCU mode, and all our
-        * subsequent exit cases should rcu_read_unlock()
-        * before returning.  Do vfsmount first; if dentry
-        * can't be legitimized, just set nd->path.dentry to NULL
-        * and rely on dput(NULL) being a no-op.
-        */
-       if (!legitimize_mnt(nd->path.mnt, nd->m_seq))
-               return -ECHILD;
         nd->flags &= ~LOOKUP_RCU;
-
-       if (!lockref_get_not_dead(&parent->d_lockref)) {
-               nd->path.dentry = NULL; 
-               goto out;
-       }
+       if (unlikely(!legitimize_links(nd)))
+               goto out2;
+       if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq)))
+               goto out2;
+       if (unlikely(!lockref_get_not_dead(&parent->d_lockref)))
+               goto out1;
  
         /*
          * For a negative lookup, the lookup sequence point is the parents
@@ -568,7 +691,7 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
         } else {
                 if (!lockref_get_not_dead(&dentry->d_lockref))
                         goto out;
-               if (read_seqcount_retry(&dentry->d_seq, nd->seq))
+               if (read_seqcount_retry(&dentry->d_seq, seq))
                         goto drop_dentry;
         }
  
@@ -577,22 +700,24 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
          * still valid and get it if required.
          */
         if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
-               spin_lock(&fs->lock);
-               if (nd->root.mnt != fs->root.mnt || nd->root.dentry != fs->root.dentry)
-                       goto unlock_and_drop_dentry;
-               path_get(&nd->root);
-               spin_unlock(&fs->lock);
+               if (unlikely(!legitimize_path(nd, &nd->root, nd->root_seq))) {
+                       rcu_read_unlock();
+                       dput(dentry);
+                       return -ECHILD;
+               }
         }
  
         rcu_read_unlock();
         return 0;
  
-unlock_and_drop_dentry:
-       spin_unlock(&fs->lock);
  drop_dentry:
         rcu_read_unlock();
         dput(dentry);
         goto drop_root_mnt;
+out2:
+       nd->path.mnt = NULL;
+out1:
+       nd->path.dentry = NULL;
  out:
         rcu_read_unlock();
  drop_root_mnt:
@@ -601,6 +726,24 @@ drop_root_mnt:
         return -ECHILD;
  }
  
+static int unlazy_link(struct nameidata *nd, struct path *link, unsigned seq)
+{
+       if (unlikely(!legitimize_path(nd, link, seq))) {
+               drop_links(nd);
+               nd->depth = 0;
+               nd->flags &= ~LOOKUP_RCU;
+               nd->path.mnt = NULL;
+               nd->path.dentry = NULL;
+               if (!(nd->flags & LOOKUP_ROOT))
+                       nd->root.mnt = NULL;
+               rcu_read_unlock();      /* RCU-walk is abandoned here */
+       } else if (likely(unlazy_walk(nd, NULL, 0) == 0)) {
+               return 0;       /* expected case: switched to ref-walk */
+       }
+       path_put(link);         /* both failure paths end up here */
+       return -ECHILD;
+}
+
  static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
  {
         return dentry->d_op->d_revalidate(dentry, flags);
@@ -622,26 +765,10 @@ static int complete_walk(struct nameidata *nd)
         int status;
  
         if (nd->flags & LOOKUP_RCU) {
-               nd->flags &= ~LOOKUP_RCU;
                 if (!(nd->flags & LOOKUP_ROOT))
                         nd->root.mnt = NULL;
-
-               if (!legitimize_mnt(nd->path.mnt, nd->m_seq)) {
-                       rcu_read_unlock();
-                       return -ECHILD;
-               }
-               if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) {
-                       rcu_read_unlock();
-                       mntput(nd->path.mnt);
+               if (unlikely(unlazy_walk(nd, NULL, 0)))
                         return -ECHILD;
-               }
-               if (read_seqcount_retry(&dentry->d_seq, nd->seq)) {
-                       rcu_read_unlock();
-                       dput(dentry);
-                       mntput(nd->path.mnt);
-                       return -ECHILD;
-               }
-               rcu_read_unlock();
         }
  
         if (likely(!(nd->flags & LOOKUP_JUMPED)))
@@ -657,28 +784,25 @@ static int complete_walk(struct nameidata *nd)
         if (!status)
                 status = -ESTALE;
  
-       path_put(&nd->path);
         return status;
  }
  
-static __always_inline void set_root(struct nameidata *nd)
+static void set_root(struct nameidata *nd)
  {
         get_fs_root(current->fs, &nd->root);
  }
  
-static int link_path_walk(const char *, struct nameidata *);
-
-static __always_inline unsigned set_root_rcu(struct nameidata *nd)
+static unsigned set_root_rcu(struct nameidata *nd)
  {
         struct fs_struct *fs = current->fs;
-       unsigned seq, res;
+       unsigned seq;
  
         do {
                 seq = read_seqcount_begin(&fs->seq);
                 nd->root = fs->root;
-               res = __read_seqcount_begin(&nd->root.dentry->d_seq);
+               nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
         } while (read_seqcount_retry(&fs->seq, seq));
-       return res;
+       return nd->root_seq;
  }
  
  static void path_put_conditional(struct path *path, struct nameidata *nd)
@@ -704,8 +828,9 @@ static inline void path_to_nameidata(const struct path *path,
   * Helper to directly jump to a known parsed path from ->follow_link,
   * caller must have taken a reference to path beforehand.
   */
-void nd_jump_link(struct nameidata *nd, struct path *path)
+void nd_jump_link(struct path *path)
  {
+       struct nameidata *nd = current->nameidata;
         path_put(&nd->path);
  
         nd->path = *path;
@@ -713,24 +838,14 @@ void nd_jump_link(struct nameidata *nd, struct path *path)
         nd->flags |= LOOKUP_JUMPED;
  }
  
-void nd_set_link(struct nameidata *nd, char *path)
-{
-       nd->saved_names[nd->depth] = path;
-}
-EXPORT_SYMBOL(nd_set_link);
-
-char *nd_get_link(struct nameidata *nd)
-{
-       return nd->saved_names[nd->depth];
-}
-EXPORT_SYMBOL(nd_get_link);
-
-static inline void put_link(struct nameidata *nd, struct path *link, void *cookie)
+static inline void put_link(struct nameidata *nd)
  {
-       struct inode *inode = link->dentry->d_inode;
-       if (inode->i_op->put_link)
-               inode->i_op->put_link(link->dentry, nd, cookie);
-       path_put(link);
+       struct saved *last = nd->stack + --nd->depth;
+       struct inode *inode = last->inode;
+       if (last->cookie && inode->i_op->put_link)
+               inode->i_op->put_link(inode, last->cookie);
+       if (!(nd->flags & LOOKUP_RCU))
+               path_put(&last->link);
  }
  
  int sysctl_protected_symlinks __read_mostly = 0;
@@ -738,7 +853,6 @@ int sysctl_protected_hardlinks __read_mostly = 0;
  
  /**
   * may_follow_link - Check symlink following for unsafe situations
- * @link: The path of the symlink
   * @nd: nameidata pathwalk data
   *
   * In the case of the sysctl_protected_symlinks sysctl being enabled,
@@ -752,7 +866,7 @@ int sysctl_protected_hardlinks __read_mostly = 0;
   *
   * Returns 0 if following the symlink is allowed, -ve on error.
   */
-static inline int may_follow_link(struct path *link, struct nameidata *nd)
+static inline int may_follow_link(struct nameidata *nd)
  {
         const struct inode *inode;
         const struct inode *parent;
@@ -761,7 +875,7 @@ static inline int may_follow_link(struct path *link, struct nameidata *nd)
                 return 0;
  
         /* Allowed if owner and follower match. */
-       inode = link->dentry->d_inode;
+       inode = nd->stack[0].inode;
         if (uid_eq(current_cred()->fsuid, inode->i_uid))
                 return 0;
  
@@ -774,9 +888,10 @@ static inline int may_follow_link(struct path *link, struct nameidata *nd)
         if (uid_eq(parent->i_uid, inode->i_uid))
                 return 0;
  
-       audit_log_link_denied("follow_link", link);
-       path_put_conditional(link, nd);
-       path_put(&nd->path);
+       if (nd->flags & LOOKUP_RCU)
+               return -ECHILD;
+
+       audit_log_link_denied("follow_link", &nd->stack[0].link);
         return -EACCES;
  }
  
@@ -849,82 +964,68 @@ static int may_linkat(struct path *link)
         return -EPERM;
  }
  
-static __always_inline int
-follow_link(struct path *link, struct nameidata *nd, void **p)
+static __always_inline
+const char *get_link(struct nameidata *nd)
  {
-       struct dentry *dentry = link->dentry;
+       struct saved *last = nd->stack + nd->depth - 1;
+       struct dentry *dentry = last->link.dentry;
+       struct inode *inode = last->inode;
         int error;
-       char *s;
+       const char *res;
  
-       BUG_ON(nd->flags & LOOKUP_RCU);
-
-       if (link->mnt == nd->path.mnt)
-               mntget(link->mnt);
-
-       error = -ELOOP;
-       if (unlikely(current->total_link_count >= 40))
-               goto out_put_nd_path;
-
-       cond_resched();
-       current->total_link_count++;
-
-       touch_atime(link);
-       nd_set_link(nd, NULL);
+       if (!(nd->flags & LOOKUP_RCU)) {
+               touch_atime(&last->link);
+               cond_resched();
+       } else if (atime_needs_update(&last->link, inode)) {
+               if (unlikely(unlazy_walk(nd, NULL, 0)))
+                       return ERR_PTR(-ECHILD);
+               touch_atime(&last->link);
+       }
  
-       error = security_inode_follow_link(link->dentry, nd);
-       if (error)
-               goto out_put_nd_path;
+       error = security_inode_follow_link(dentry, inode,
+                                          nd->flags & LOOKUP_RCU);
+       if (unlikely(error))
+               return ERR_PTR(error);
  
         nd->last_type = LAST_BIND;
-       *p = dentry->d_inode->i_op->follow_link(dentry, nd);
-       error = PTR_ERR(*p);
-       if (IS_ERR(*p))
-               goto out_put_nd_path;
-
-       error = 0;
-       s = nd_get_link(nd);
-       if (s) {
-               if (unlikely(IS_ERR(s))) {
-                       path_put(&nd->path);
-                       put_link(nd, link, *p);
-                       return PTR_ERR(s);
+       res = inode->i_link;
+       if (!res) {
+               if (nd->flags & LOOKUP_RCU) {
+                       if (unlikely(unlazy_walk(nd, NULL, 0)))
+                               return ERR_PTR(-ECHILD);
                 }
-               if (*s == '/') {
+               res = inode->i_op->follow_link(dentry, &last->cookie);
+               if (IS_ERR_OR_NULL(res)) {
+                       last->cookie = NULL;
+                       return res;
+               }
+       }
+       if (*res == '/') {
+               if (nd->flags & LOOKUP_RCU) {
+                       struct dentry *d;
+                       if (!nd->root.mnt)
+                               set_root_rcu(nd);
+                       nd->path = nd->root;
+                       d = nd->path.dentry;
+                       nd->inode = d->d_inode;
+                       nd->seq = nd->root_seq;
+                       if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq)))
+                               return ERR_PTR(-ECHILD);
+               } else {
                         if (!nd->root.mnt)
                                 set_root(nd);
                         path_put(&nd->path);
                         nd->path = nd->root;
                         path_get(&nd->root);
-                       nd->flags |= LOOKUP_JUMPED;
+                       nd->inode = nd->path.dentry->d_inode;
                 }
-               nd->inode = nd->path.dentry->d_inode;
-               error = link_path_walk(s, nd);
-               if (unlikely(error))
-                       put_link(nd, link, *p);
+               nd->flags |= LOOKUP_JUMPED;
+               while (unlikely(*++res == '/'))
+                       ;
         }
-
-       return error;
-
-out_put_nd_path:
-       *p = NULL;
-       path_put(&nd->path);
-       path_put(link);
-       return error;
-}
-
-static int follow_up_rcu(struct path *path)
-{
-       struct mount *mnt = real_mount(path->mnt);
-       struct mount *parent;
-       struct dentry *mountpoint;
-
-       parent = mnt->mnt_parent;
-       if (&parent->mnt == path->mnt)
-               return 0;
-       mountpoint = mnt->mnt_mountpoint;
-       path->dentry = mountpoint;
-       path->mnt = &parent->mnt;
-       return 1;
+       if (!*res)
+               res = NULL;
+       return res;
  }
  
  /*
@@ -965,7 +1066,7 @@ EXPORT_SYMBOL(follow_up);
   * - return -EISDIR to tell follow_managed() to stop and return the path we
   *   were called with.
   */
-static int follow_automount(struct path *path, unsigned flags,
+static int follow_automount(struct path *path, struct nameidata *nd,
                             bool *need_mntput)
  {
         struct vfsmount *mnt;
@@ -985,13 +1086,13 @@ static int follow_automount(struct path *path, unsigned flags,
          * as being automount points.  These will need the attentions
          * of the daemon to instantiate them before they can be used.
          */
-       if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
-                    LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
+       if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
+                          LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
             path->dentry->d_inode)
                 return -EISDIR;
  
-       current->total_link_count++;
-       if (current->total_link_count >= 40)
+       nd->total_link_count++;
+       if (nd->total_link_count >= 40)
                 return -ELOOP;
  
         mnt = path->dentry->d_op->d_automount(path);
@@ -1005,7 +1106,7 @@ static int follow_automount(struct path *path, unsigned flags,
                  * the path being looked up; if it wasn't then the remainder of
                  * the path is inaccessible and we should say so.
                  */
-               if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_PARENT))
+               if (PTR_ERR(mnt) == -EISDIR && (nd->flags & LOOKUP_PARENT))
                         return -EREMOTE;
                 return PTR_ERR(mnt);
         }
@@ -1045,7 +1146,7 @@ static int follow_automount(struct path *path, unsigned flags,
   *
   * Serialization is taken care of in namespace.c
   */
-static int follow_managed(struct path *path, unsigned flags)
+static int follow_managed(struct path *path, struct nameidata *nd)
  {
         struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
         unsigned managed;
@@ -1089,7 +1190,7 @@ static int follow_managed(struct path *path, unsigned flags)
  
                 /* Handle an automount point */
                 if (managed & DCACHE_NEED_AUTOMOUNT) {
-                       ret = follow_automount(path, flags, &need_mntput);
+                       ret = follow_automount(path, nd, &need_mntput);
                         if (ret < 0)
                                 break;
                         continue;
@@ -1103,7 +1204,11 @@ static int follow_managed(struct path *path, unsigned flags)
                 mntput(path->mnt);
         if (ret == -EISDIR)
                 ret = 0;
-       return ret < 0 ? ret : need_mntput;
+       if (need_mntput)
+               nd->flags |= LOOKUP_JUMPED;
+       if (unlikely(ret < 0))
+               path_put_conditional(path, nd);
+       return ret;
  }
  
  int follow_down_one(struct path *path)
@@ -1133,7 +1238,7 @@ static inline int managed_dentry_rcu(struct dentry *dentry)
   * we meet a managed dentry that would need blocking.
   */
  static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
-                              struct inode **inode)
+                              struct inode **inode, unsigned *seqp)
  {
         for (;;) {
                 struct mount *mounted;
@@ -1160,7 +1265,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
                 path->mnt = &mounted->mnt;
                 path->dentry = mounted->mnt.mnt_root;
                 nd->flags |= LOOKUP_JUMPED;
-               nd->seq = read_seqcount_begin(&path->dentry->d_seq);
+               *seqp = read_seqcount_begin(&path->dentry->d_seq);
                 /*
                  * Update the inode too. We don't need to re-check the
                  * dentry sequence number here after this d_inode read,
@@ -1179,10 +1284,8 @@ static int follow_dotdot_rcu(struct nameidata *nd)
                 set_root_rcu(nd);
  
         while (1) {
-               if (nd->path.dentry == nd->root.dentry &&
-                   nd->path.mnt == nd->root.mnt) {
+               if (path_equal(&nd->path, &nd->root))
                         break;
-               }
                 if (nd->path.dentry != nd->path.mnt->mnt_root) {
                         struct dentry *old = nd->path.dentry;
                         struct dentry *parent = old->d_parent;
@@ -1190,38 +1293,42 @@ static int follow_dotdot_rcu(struct nameidata *nd)
  
                         inode = parent->d_inode;
                         seq = read_seqcount_begin(&parent->d_seq);
-                       if (read_seqcount_retry(&old->d_seq, nd->seq))
-                               goto failed;
+                       if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq)))
+                               return -ECHILD;
                         nd->path.dentry = parent;
                         nd->seq = seq;
                         break;
+               } else {
+                       struct mount *mnt = real_mount(nd->path.mnt);
+                       struct mount *mparent = mnt->mnt_parent;
+                       struct dentry *mountpoint = mnt->mnt_mountpoint;
+                       struct inode *inode2 = mountpoint->d_inode;
+                       unsigned seq = read_seqcount_begin(&mountpoint->d_seq);
+                       if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
+                               return -ECHILD;
+                       if (&mparent->mnt == nd->path.mnt)
+                               break;
+                       /* we know that mountpoint was pinned */
+                       nd->path.dentry = mountpoint;
+                       nd->path.mnt = &mparent->mnt;
+                       inode = inode2;
+                       nd->seq = seq;
                 }
-               if (!follow_up_rcu(&nd->path))
-                       break;
-               inode = nd->path.dentry->d_inode;
-               nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
         }
-       while (d_mountpoint(nd->path.dentry)) {
+       while (unlikely(d_mountpoint(nd->path.dentry))) {
                 struct mount *mounted;
                 mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry);
+               if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
+                       return -ECHILD;
                 if (!mounted)
                         break;
                 nd->path.mnt = &mounted->mnt;
                 nd->path.dentry = mounted->mnt.mnt_root;
                 inode = nd->path.dentry->d_inode;
                 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
-               if (read_seqretry(&mount_lock, nd->m_seq))
-                       goto failed;
         }
         nd->inode = inode;
         return 0;
-
-failed:
-       nd->flags &= ~LOOKUP_RCU;
-       if (!(nd->flags & LOOKUP_ROOT))
-               nd->root.mnt = NULL;
-       rcu_read_unlock();
-       return -ECHILD;
  }
  
  /*
@@ -1400,7 +1507,8 @@ static struct dentry *__lookup_hash(struct qstr *name,
   *  It _is_ time-critical.
   */
  static int lookup_fast(struct nameidata *nd,
-                      struct path *path, struct inode **inode)
+                      struct path *path, struct inode **inode,
+                      unsigned *seqp)
  {
         struct vfsmount *mnt = nd->path.mnt;
         struct dentry *dentry, *parent = nd->path.dentry;
@@ -1424,7 +1532,7 @@ static int lookup_fast(struct nameidata *nd,
                  * This sequence count validates that the inode matches
                  * the dentry name information from lookup.
                  */
-               *inode = dentry->d_inode;
+               *inode = d_backing_inode(dentry);
                 negative = d_is_negative(dentry);
                 if (read_seqcount_retry(&dentry->d_seq, seq))
                         return -ECHILD;
@@ -1440,8 +1548,8 @@ static int lookup_fast(struct nameidata *nd,
                  */
                 if (__read_seqcount_retry(&parent->d_seq, nd->seq))
                         return -ECHILD;
-               nd->seq = seq;
  
+               *seqp = seq;
                 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
                         status = d_revalidate(dentry, nd->flags);
                         if (unlikely(status <= 0)) {
@@ -1452,10 +1560,10 @@ static int lookup_fast(struct nameidata *nd,
                 }
                 path->mnt = mnt;
                 path->dentry = dentry;
-               if (likely(__follow_mount_rcu(nd, path, inode)))
+               if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
                         return 0;
  unlazy:
-               if (unlazy_walk(nd, dentry))
+               if (unlazy_walk(nd, dentry, seq))
                         return -ECHILD;
         } else {
                 dentry = __d_lookup(parent, &nd->last);
@@ -1482,15 +1590,10 @@ unlazy:
         }
         path->mnt = mnt;
         path->dentry = dentry;
-       err = follow_managed(path, nd->flags);
-       if (unlikely(err < 0)) {
-               path_put_conditional(path, nd);
-               return err;
-       }
-       if (err)
-               nd->flags |= LOOKUP_JUMPED;
-       *inode = path->dentry->d_inode;
-       return 0;
+       err = follow_managed(path, nd);
+       if (likely(!err))
+               *inode = d_backing_inode(path->dentry);
+       return err;
  
  need_lookup:
         return 1;
@@ -1500,7 +1603,6 @@ need_lookup:
  static int lookup_slow(struct nameidata *nd, struct path *path)
  {
         struct dentry *dentry, *parent;
-       int err;
  
         parent = nd->path.dentry;
         BUG_ON(nd->inode != parent->d_inode);
@@ -1512,14 +1614,7 @@ static int lookup_slow(struct nameidata *nd, struct path *path)
                 return PTR_ERR(dentry);
         path->mnt = nd->path.mnt;
         path->dentry = dentry;
-       err = follow_managed(path, nd->flags);
-       if (unlikely(err < 0)) {
-               path_put_conditional(path, nd);
-               return err;
-       }
-       if (err)
-               nd->flags |= LOOKUP_JUMPED;
-       return 0;
+       return follow_managed(path, nd);
  }
  
  static inline int may_lookup(struct nameidata *nd)
@@ -1528,7 +1623,7 @@ static inline int may_lookup(struct nameidata *nd)
                 int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
                 if (err != -ECHILD)
                         return err;
-               if (unlazy_walk(nd, NULL))
+               if (unlazy_walk(nd, NULL, 0))
                         return -ECHILD;
         }
         return inode_permission(nd->inode, MAY_EXEC);
@@ -1538,24 +1633,45 @@ static inline int handle_dots(struct nameidata *nd, int type)
  {
         if (type == LAST_DOTDOT) {
                 if (nd->flags & LOOKUP_RCU) {
-                       if (follow_dotdot_rcu(nd))
-                               return -ECHILD;
+                       return follow_dotdot_rcu(nd);
                 } else
                         follow_dotdot(nd);
         }
         return 0;
  }
  
-static void terminate_walk(struct nameidata *nd)
+static int pick_link(struct nameidata *nd, struct path *link,
+                    struct inode *inode, unsigned seq)
  {
+       int error;
+       struct saved *last;
+       if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) {
+               path_to_nameidata(link, nd);
+               return -ELOOP;
+       }
         if (!(nd->flags & LOOKUP_RCU)) {
-               path_put(&nd->path);
-       } else {
-               nd->flags &= ~LOOKUP_RCU;
-               if (!(nd->flags & LOOKUP_ROOT))
-                       nd->root.mnt = NULL;
-               rcu_read_unlock();
+               if (link->mnt == nd->path.mnt)
+                       mntget(link->mnt);
+       }
+       error = nd_alloc_stack(nd);
+       if (unlikely(error)) {
+               if (error == -ECHILD) {
+                       if (unlikely(unlazy_link(nd, link, seq)))
+                               return -ECHILD;
+                       error = nd_alloc_stack(nd);
+               }
+               if (error) {
+                       path_put(link);
+                       return error;
+               }
         }
+
+       last = nd->stack + nd->depth++;
+       last->link = *link;
+       last->cookie = NULL;
+       last->inode = inode;
+       last->seq = seq;
+       return 1;
  }
  
  /*
@@ -1564,97 +1680,67 @@ static void terminate_walk(struct nameidata *nd)
   * so we keep a cache of "no, this doesn't need follow_link"
   * for the common case.
   */
-static inline int should_follow_link(struct dentry *dentry, int follow)
+static inline int should_follow_link(struct nameidata *nd, struct path *link,
+                                    int follow,
+                                    struct inode *inode, unsigned seq)
  {
-       return unlikely(d_is_symlink(dentry)) ? follow : 0;
+       if (likely(!d_is_symlink(link->dentry)))
+               return 0;
+       if (!follow)
+               return 0;
+       return pick_link(nd, link, inode, seq);
  }
  
-static inline int walk_component(struct nameidata *nd, struct path *path,
-               int follow)
+enum {WALK_GET = 1, WALK_PUT = 2};
+
+static int walk_component(struct nameidata *nd, int flags)
  {
+       struct path path;
         struct inode *inode;
+       unsigned seq;
         int err;
         /*
          * "." and ".." are special - ".." especially so because it has
          * to be able to know about the current root directory and
          * parent relationships.
          */
-       if (unlikely(nd->last_type != LAST_NORM))
-               return handle_dots(nd, nd->last_type);
-       err = lookup_fast(nd, path, &inode);
+       if (unlikely(nd->last_type != LAST_NORM)) {
+               err = handle_dots(nd, nd->last_type);
+               if (flags & WALK_PUT)
+                       put_link(nd);
+               return err;
+       }
+       err = lookup_fast(nd, &path, &inode, &seq);
         if (unlikely(err)) {
                 if (err < 0)
-                       goto out_err;
+                       return err;
  
-               err = lookup_slow(nd, path);
+               err = lookup_slow(nd, &path);
                 if (err < 0)
-                       goto out_err;
+                       return err;
  
-               inode = path->dentry->d_inode;
+               inode = d_backing_inode(path.dentry);
+               seq = 0;        /* we are already out of RCU mode */
                 err = -ENOENT;
-               if (d_is_negative(path->dentry))
+               if (d_is_negative(path.dentry))
                         goto out_path_put;
         }
  
-       if (should_follow_link(path->dentry, follow)) {
-               if (nd->flags & LOOKUP_RCU) {
-                       if (unlikely(nd->path.mnt != path->mnt ||
-                                    unlazy_walk(nd, path->dentry))) {
-                               err = -ECHILD;
-                               goto out_err;
-                       }
-               }
-               BUG_ON(inode != path->dentry->d_inode);
-               return 1;
-       }
-       path_to_nameidata(path, nd);
+       if (flags & WALK_PUT)
+               put_link(nd);
+       err = should_follow_link(nd, &path, flags & WALK_GET, inode, seq);
+       if (unlikely(err))
+               return err;
+       path_to_nameidata(&path, nd);
         nd->inode = inode;
+       nd->seq = seq;
         return 0;
  
  out_path_put:
-       path_to_nameidata(path, nd);
-out_err:
-       terminate_walk(nd);
+       path_to_nameidata(&path, nd);
         return err;
  }
  
-/*
- * This limits recursive symlink follows to 8, while
- * limiting consecutive symlinks to 40.
- *
- * Without that kind of total limit, nasty chains of consecutive
- * symlinks can cause almost arbitrarily long lookups.
- */
-static inline int nested_symlink(struct path *path, struct nameidata *nd)
-{
-       int res;
-
-       if (unlikely(current->link_count >= MAX_NESTED_LINKS)) {
-               path_put_conditional(path, nd);
-               path_put(&nd->path);
-               return -ELOOP;
-       }
-       BUG_ON(nd->depth >= MAX_NESTED_LINKS);
-
-       nd->depth++;
-       current->link_count++;
-
-       do {
-               struct path link = *path;
-               void *cookie;
-
-               res = follow_link(&link, nd, &cookie);
-               if (res)
-                       break;
-               res = walk_component(nd, path, LOOKUP_FOLLOW);
-               put_link(nd, &link, cookie);
-       } while (res > 0);
-
-       current->link_count--;
-       nd->depth--;
-       return res;
-}
-
  /*
   * We can do the critical dentry name comparison and hashing
   * operations one word at a time, but we are limited to:
@@ -1781,9 +1867,8 @@ static inline u64 hash_name(const char *name)
   */
  static int link_path_walk(const char *name, struct nameidata *nd)
  {
-       struct path next;
         int err;
-       
+
         while (*name=='/')
                 name++;
         if (!*name)
@@ -1796,7 +1881,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
  
                 err = may_lookup(nd);
                 if (err)
-                       break;
+                       return err;
  
                 hash_len = hash_name(name);
  
@@ -1818,7 +1903,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
                                 struct qstr this = { { .hash_len = hash_len }, .name = name };
                                 err = parent->d_op->d_hash(parent, &this);
                                 if (err < 0)
-                                       break;
+                                       return err;
                                 hash_len = this.hash_len;
                                 name = this.name;
                         }
@@ -1830,7 +1915,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
  
                 name += hashlen_len(hash_len);
                 if (!*name)
-                       return 0;
+                       goto OK;
                 /*
                  * If it wasn't NUL, we know it was '/'. Skip that
                  * slash, and continue until no more slashes.
@@ -1838,57 +1923,73 @@ static int link_path_walk(const char *name, struct nameidata *nd)
                 do {
                         name++;
                 } while (unlikely(*name == '/'));
-               if (!*name)
-                       return 0;
-
-               err = walk_component(nd, &next, LOOKUP_FOLLOW);
+               if (unlikely(!*name)) {
+OK:
+                       /* pathname body, done */
+                       if (!nd->depth)
+                               return 0;
+                       name = nd->stack[nd->depth - 1].name;
+                       /* trailing symlink, done */
+                       if (!name)
+                               return 0;
+                       /* last component of nested symlink */
+                       err = walk_component(nd, WALK_GET | WALK_PUT);
+               } else {
+                       err = walk_component(nd, WALK_GET);
+               }
                 if (err < 0)
                         return err;
  
                 if (err) {
-                       err = nested_symlink(&next, nd);
-                       if (err)
-                               return err;
-               }
-               if (!d_can_lookup(nd->path.dentry)) {
-                       err = -ENOTDIR; 
-                       break;
+                       const char *s = get_link(nd);
+
+                       if (unlikely(IS_ERR(s)))
+                               return PTR_ERR(s);
+                       err = 0;
+                       if (unlikely(!s)) {
+                               /* jumped */
+                               put_link(nd);
+                       } else {
+                               nd->stack[nd->depth - 1].name = name;
+                               name = s;
+                               continue;
+                       }
                 }
+               if (unlikely(!d_can_lookup(nd->path.dentry)))
+                       return -ENOTDIR;
         }
-       terminate_walk(nd);
-       return err;
  }
  
-static int path_init(int dfd, const struct filename *name, unsigned int flags,
-                    struct nameidata *nd)
+static const char *path_init(struct nameidata *nd, unsigned flags)
  {
         int retval = 0;
-       const char *s = name->name;
+       const char *s = nd->name->name;
  
         nd->last_type = LAST_ROOT; /* if there are only slashes... */
         nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT;
         nd->depth = 0;
-       nd->base = NULL;
+       nd->total_link_count = 0;
         if (flags & LOOKUP_ROOT) {
                 struct dentry *root = nd->root.dentry;
                 struct inode *inode = root->d_inode;
                 if (*s) {
                         if (!d_can_lookup(root))
-                               return -ENOTDIR;
+                               return ERR_PTR(-ENOTDIR);
                         retval = inode_permission(inode, MAY_EXEC);
                         if (retval)
-                               return retval;
+                               return ERR_PTR(retval);
                 }
                 nd->path = nd->root;
                 nd->inode = inode;
                 if (flags & LOOKUP_RCU) {
                         rcu_read_lock();
                         nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+                       nd->root_seq = nd->seq;
                         nd->m_seq = read_seqbegin(&mount_lock);
                 } else {
                         path_get(&nd->path);
                 }
-               goto done;
+               return s;
         }
  
         nd->root.mnt = NULL;
@@ -1903,7 +2004,7 @@ static int path_init(int dfd, const struct filename *name, unsigned int flags,
                         path_get(&nd->root);
                 }
                 nd->path = nd->root;
-       } else if (dfd == AT_FDCWD) {
+       } else if (nd->dfd == AT_FDCWD) {
                 if (flags & LOOKUP_RCU) {
                         struct fs_struct *fs = current->fs;
                         unsigned seq;
@@ -1920,180 +2021,205 @@ static int path_init(int dfd, const struct filename *name, unsigned int flags,
                 }
         } else {
                 /* Caller must check execute permissions on the starting path component */
-               struct fd f = fdget_raw(dfd);
+               struct fd f = fdget_raw(nd->dfd);
                 struct dentry *dentry;
  
                 if (!f.file)
-                       return -EBADF;
+                       return ERR_PTR(-EBADF);
  
                 dentry = f.file->f_path.dentry;
  
                 if (*s) {
                         if (!d_can_lookup(dentry)) {
                                 fdput(f);
-                               return -ENOTDIR;
+                               return ERR_PTR(-ENOTDIR);
                         }
                 }
  
                 nd->path = f.file->f_path;
                 if (flags & LOOKUP_RCU) {
-                       if (f.flags & FDPUT_FPUT)
-                               nd->base = f.file;
-                       nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
                         rcu_read_lock();
+                       nd->inode = nd->path.dentry->d_inode;
+                       nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
                 } else {
                         path_get(&nd->path);
-                       fdput(f);
+                       nd->inode = nd->path.dentry->d_inode;
                 }
+               fdput(f);
+               return s;
         }
  
         nd->inode = nd->path.dentry->d_inode;
         if (!(flags & LOOKUP_RCU))
-               goto done;
+               return s;
         if (likely(!read_seqcount_retry(&nd->path.dentry->d_seq, nd->seq)))
-               goto done;
+               return s;
         if (!(nd->flags & LOOKUP_ROOT))
                 nd->root.mnt = NULL;
         rcu_read_unlock();
-       return -ECHILD;
-done:
-       current->total_link_count = 0;
-       return link_path_walk(s, nd);
+       return ERR_PTR(-ECHILD);
  }
  
-static void path_cleanup(struct nameidata *nd)
+static const char *trailing_symlink(struct nameidata *nd)
  {
-       if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
-               path_put(&nd->root);
-               nd->root.mnt = NULL;
-       }
-       if (unlikely(nd->base))
-               fput(nd->base);
+       const char *s;
+       int error = may_follow_link(nd);
+       if (unlikely(error))
+               return ERR_PTR(error);
+       nd->flags |= LOOKUP_PARENT;
+       nd->stack[0].name = NULL;
+       s = get_link(nd);
+       return s ? s : "";
  }
  
-static inline int lookup_last(struct nameidata *nd, struct path *path)
+static inline int lookup_last(struct nameidata *nd)
  {
         if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
                 nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
  
         nd->flags &= ~LOOKUP_PARENT;
-       return walk_component(nd, path, nd->flags & LOOKUP_FOLLOW);
+       return walk_component(nd,
+                       nd->flags & LOOKUP_FOLLOW
+                               ? nd->depth
+                                       ? WALK_PUT | WALK_GET
+                                       : WALK_GET
+                               : 0);
  }
  
  /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
-static int path_lookupat(int dfd, const struct filename *name,
-                               unsigned int flags, struct nameidata *nd)
+static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path)
  {
-       struct path path;
+       const char *s = path_init(nd, flags);
         int err;
  
-       /*
-        * Path walking is largely split up into 2 different synchronisation
-        * schemes, rcu-walk and ref-walk (explained in
-        * Documentation/filesystems/path-lookup.txt). These share much of the
-        * path walk code, but some things particularly setup, cleanup, and
-        * following mounts are sufficiently divergent that functions are
-        * duplicated. Typically there is a function foo(), and its RCU
-        * analogue, foo_rcu().
-        *
-        * -ECHILD is the error number of choice (just to avoid clashes) that
-        * is returned if some aspect of an rcu-walk fails. Such an error must
-        * be handled by restarting a traditional ref-walk (which will always
-        * be able to complete).
-        */
-       err = path_init(dfd, name, flags, nd);
-       if (!err && !(flags & LOOKUP_PARENT)) {
-               err = lookup_last(nd, &path);
-               while (err > 0) {
-                       void *cookie;
-                       struct path link = path;
-                       err = may_follow_link(&link, nd);
-                       if (unlikely(err))
-                               break;
-                       nd->flags |= LOOKUP_PARENT;
-                       err = follow_link(&link, nd, &cookie);
-                       if (err)
-                               break;
-                       err = lookup_last(nd, &path);
-                       put_link(nd, &link, cookie);
+       if (IS_ERR(s))
+               return PTR_ERR(s);
+       while (!(err = link_path_walk(s, nd))
+               && ((err = lookup_last(nd)) > 0)) {
+               s = trailing_symlink(nd);
+               if (IS_ERR(s)) {
+                       err = PTR_ERR(s);
+                       break;
                 }
         }
-
         if (!err)
                 err = complete_walk(nd);
  
-       if (!err && nd->flags & LOOKUP_DIRECTORY) {
-               if (!d_can_lookup(nd->path.dentry)) {
-                       path_put(&nd->path);
+       if (!err && nd->flags & LOOKUP_DIRECTORY)
+               if (!d_can_lookup(nd->path.dentry))
                         err = -ENOTDIR;
-               }
+       if (!err) {
+               *path = nd->path;
+               nd->path.mnt = NULL;
+               nd->path.dentry = NULL;
         }
-
-       path_cleanup(nd);
+       terminate_walk(nd);
         return err;
  }
  
-static int filename_lookup(int dfd, struct filename *name,
-                               unsigned int flags, struct nameidata *nd)
+static int filename_lookup(int dfd, struct filename *name, unsigned flags,
+                          struct path *path, struct path *root)
  {
-       int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd);
+       int retval;
+       struct nameidata nd;
+       if (IS_ERR(name))
+               return PTR_ERR(name);
+       if (unlikely(root)) {
+               nd.root = *root;
+               flags |= LOOKUP_ROOT;
+       }
+       set_nameidata(&nd, dfd, name);
+       retval = path_lookupat(&nd, flags | LOOKUP_RCU, path);
         if (unlikely(retval == -ECHILD))
-               retval = path_lookupat(dfd, name, flags, nd);
+               retval = path_lookupat(&nd, flags, path);
         if (unlikely(retval == -ESTALE))
-               retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd);
+               retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path);
  
         if (likely(!retval))
-               audit_inode(name, nd->path.dentry, flags & LOOKUP_PARENT);
+               audit_inode(name, path->dentry, flags & LOOKUP_PARENT);
+       restore_nameidata();
+       putname(name);
         return retval;
  }
  
+/* Returns 0 and nd will be valid on success; Returns error, otherwise. */
+static int path_parentat(struct nameidata *nd, unsigned flags,
+                               struct path *parent)
+{
+       const char *s = path_init(nd, flags);
+       int err;
+       if (IS_ERR(s))
+               return PTR_ERR(s);
+       err = link_path_walk(s, nd);
+       if (!err)
+               err = complete_walk(nd);
+       if (!err) {
+               *parent = nd->path;
+               nd->path.mnt = NULL;
+               nd->path.dentry = NULL;
+       }
+       terminate_walk(nd);
+       return err;
+}
+
+static struct filename *filename_parentat(int dfd, struct filename *name,
+                               unsigned int flags, struct path *parent,
+                               struct qstr *last, int *type)
+{
+       int retval;
+       struct nameidata nd;
+
+       if (IS_ERR(name))
+               return name;
+       set_nameidata(&nd, dfd, name);
+       retval = path_parentat(&nd, flags | LOOKUP_RCU, parent);
+       if (unlikely(retval == -ECHILD))
+               retval = path_parentat(&nd, flags, parent);
+       if (unlikely(retval == -ESTALE))
+               retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent);
+       if (likely(!retval)) {
+               *last = nd.last;
+               *type = nd.last_type;
+               audit_inode(name, parent->dentry, LOOKUP_PARENT);
+       } else {
+               putname(name);
+               name = ERR_PTR(retval);
+       }
+       restore_nameidata();
+       return name;
+}
+
  /* does lookup, returns the object with parent locked */
  struct dentry *kern_path_locked(const char *name, struct path *path)
  {
-       struct filename *filename = getname_kernel(name);
-       struct nameidata nd;
+       struct filename *filename;
         struct dentry *d;
-       int err;
+       struct qstr last;
+       int type;
  
+       filename = filename_parentat(AT_FDCWD, getname_kernel(name), 0, path,
+                                   &last, &type);
         if (IS_ERR(filename))
                 return ERR_CAST(filename);
-
-       err = filename_lookup(AT_FDCWD, filename, LOOKUP_PARENT, &nd);
-       if (err) {
-               d = ERR_PTR(err);
-               goto out;
-       }
-       if (nd.last_type != LAST_NORM) {
-               path_put(&nd.path);
-               d = ERR_PTR(-EINVAL);
-               goto out;
+       if (unlikely(type != LAST_NORM)) {
+               path_put(path);
+               putname(filename);
+               return ERR_PTR(-EINVAL);
         }
-       mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
-       d = __lookup_hash(&nd.last, nd.path.dentry, 0);
+       mutex_lock_nested(&path->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+       d = __lookup_hash(&last, path->dentry, 0);
         if (IS_ERR(d)) {
-               mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
-               path_put(&nd.path);
-               goto out;
+               mutex_unlock(&path->dentry->d_inode->i_mutex);
+               path_put(path);
         }
-       *path = nd.path;
-out:
         putname(filename);
         return d;
  }
  
  int kern_path(const char *name, unsigned int flags, struct path *path)
  {
-       struct nameidata nd;
-       struct filename *filename = getname_kernel(name);
-       int res = PTR_ERR(filename);
-
-       if (!IS_ERR(filename)) {
-               res = filename_lookup(AT_FDCWD, filename, flags, &nd);
-               putname(filename);
-               if (!res)
-                       *path = nd.path;
-       }
-       return res;
+       return filename_lookup(AT_FDCWD, getname_kernel(name),
+                              flags, path, NULL);
  }
  EXPORT_SYMBOL(kern_path);
  
@@ -2109,36 +2235,13 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
                     const char *name, unsigned int flags,
                     struct path *path)
  {
-       struct filename *filename = getname_kernel(name);
-       int err = PTR_ERR(filename);
-
-       BUG_ON(flags & LOOKUP_PARENT);
-
-       /* the first argument of filename_lookup() is ignored with LOOKUP_ROOT */
-       if (!IS_ERR(filename)) {
-               struct nameidata nd;
-               nd.root.dentry = dentry;
-               nd.root.mnt = mnt;
-               err = filename_lookup(AT_FDCWD, filename,
-                                     flags | LOOKUP_ROOT, &nd);
-               if (!err)
-                       *path = nd.path;
-               putname(filename);
-       }
-       return err;
+       struct path root = {.mnt = mnt, .dentry = dentry};
+       /* the first argument of filename_lookup() is ignored with root */
+       return filename_lookup(AT_FDCWD, getname_kernel(name),
+                              flags , path, &root);
  }
  EXPORT_SYMBOL(vfs_path_lookup);
  
-/*
- * Restricted form of lookup. Doesn't follow links, single-component only,
- * needs parent already locked. Doesn't follow mounts.
- * SMP-safe.
- */
-static struct dentry *lookup_hash(struct nameidata *nd)
-{
-       return __lookup_hash(&nd->last, nd->path.dentry, nd->flags);
-}
-
  /**
   * lookup_one_len - filesystem helper to lookup single pathname component
   * @name:      pathname component to lookup
@@ -2193,27 +2296,10 @@ EXPORT_SYMBOL(lookup_one_len);
  int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
                  struct path *path, int *empty)
  {
-       struct nameidata nd;
-       struct filename *tmp = getname_flags(name, flags, empty);
-       int err = PTR_ERR(tmp);
-       if (!IS_ERR(tmp)) {
-
-               BUG_ON(flags & LOOKUP_PARENT);
-
-               err = filename_lookup(dfd, tmp, flags, &nd);
-               putname(tmp);
-               if (!err)
-                       *path = nd.path;
-       }
-       return err;
-}
-
-int user_path_at(int dfd, const char __user *name, unsigned flags,
-                struct path *path)
-{
-       return user_path_at_empty(dfd, name, flags, path, NULL);
+       return filename_lookup(dfd, getname_flags(name, flags, empty),
+                              flags, path, NULL);
  }
-EXPORT_SYMBOL(user_path_at);
+EXPORT_SYMBOL(user_path_at_empty);
  
  /*
   * NB: most callers don't do anything directly with the reference to the
@@ -2221,26 +2307,16 @@ EXPORT_SYMBOL(user_path_at);
   *     allocated by getname. So we must hold the reference to it until all
   *     path-walking is complete.
   */
-static struct filename *
-user_path_parent(int dfd, const char __user *path, struct nameidata *nd,
+static inline struct filename *
+user_path_parent(int dfd, const char __user *path,
+                struct path *parent,
+                struct qstr *last,
+                int *type,
                  unsigned int flags)
  {
-       struct filename *s = getname(path);
-       int error;
-
         /* only LOOKUP_REVAL is allowed in extra flags */
-       flags &= LOOKUP_REVAL;
-
-       if (IS_ERR(s))
-               return s;
-
-       error = filename_lookup(dfd, s, flags | LOOKUP_PARENT, nd);
-       if (error) {
-               putname(s);
-               return ERR_PTR(error);
-       }
-
-       return s;
+       return filename_parentat(dfd, getname(path), flags & LOOKUP_REVAL,
+                                parent, last, type);
  }
  
  /**
@@ -2279,10 +2355,8 @@ mountpoint_last(struct nameidata *nd, struct path *path)
  
         /* If we're in rcuwalk, drop out of it to handle last component */
         if (nd->flags & LOOKUP_RCU) {
-               if (unlazy_walk(nd, NULL)) {
-                       error = -ECHILD;
-                       goto out;
-               }
+               if (unlazy_walk(nd, NULL, 0))
+                       return -ECHILD;
         }
  
         nd->flags &= ~LOOKUP_PARENT;
@@ -2290,7 +2364,7 @@ mountpoint_last(struct nameidata *nd, struct path *path)
         if (unlikely(nd->last_type != LAST_NORM)) {
                 error = handle_dots(nd, nd->last_type);
                 if (error)
-                       goto out;
+                       return error;
                 dentry = dget(nd->path.dentry);
                 goto done;
         }
@@ -2305,74 +2379,60 @@ mountpoint_last(struct nameidata *nd, struct path *path)
                  */
                 dentry = d_alloc(dir, &nd->last);
                 if (!dentry) {
-                       error = -ENOMEM;
                         mutex_unlock(&dir->d_inode->i_mutex);
-                       goto out;
+                       return -ENOMEM;
                 }
                 dentry = lookup_real(dir->d_inode, dentry, nd->flags);
-               error = PTR_ERR(dentry);
                 if (IS_ERR(dentry)) {
                         mutex_unlock(&dir->d_inode->i_mutex);
-                       goto out;
+                       return PTR_ERR(dentry);
                 }
         }
         mutex_unlock(&dir->d_inode->i_mutex);
  
  done:
         if (d_is_negative(dentry)) {
-               error = -ENOENT;
                 dput(dentry);
-               goto out;
+               return -ENOENT;
         }
+       if (nd->depth)
+               put_link(nd);
         path->dentry = dentry;
         path->mnt = nd->path.mnt;
-       if (should_follow_link(dentry, nd->flags & LOOKUP_FOLLOW))
-               return 1;
+       error = should_follow_link(nd, path, nd->flags & LOOKUP_FOLLOW,
+                                  d_backing_inode(dentry), 0);
+       if (unlikely(error))
+               return error;
         mntget(path->mnt);
         follow_mount(path);
-       error = 0;
-out:
-       terminate_walk(nd);
-       return error;
+       return 0;
  }
  
  /**
   * path_mountpoint - look up a path to be umounted
- * @dfd:       directory file descriptor to start walk from
- * @name:      full pathname to walk
- * @path:      pointer to container for result
+ * @nd:        lookup context
   * @flags:     lookup flags
+ * @path:      pointer to container for result
   *
   * Look up the given name, but don't attempt to revalidate the last component.
   * Returns 0 and "path" will be valid on success; Returns error otherwise.
   */
  static int
-path_mountpoint(int dfd, const struct filename *name, struct path *path,
-               unsigned int flags)
+path_mountpoint(struct nameidata *nd, unsigned flags, struct path *path)
  {
-       struct nameidata nd;
+       const char *s = path_init(nd, flags);
         int err;
-
-       err = path_init(dfd, name, flags, &nd);
-       if (unlikely(err))
-               goto out;
-
-       err = mountpoint_last(&nd, path);
-       while (err > 0) {
-               void *cookie;
-               struct path link = *path;
-               err = may_follow_link(&link, &nd);
-               if (unlikely(err))
-                       break;
-               nd.flags |= LOOKUP_PARENT;
-               err = follow_link(&link, &nd, &cookie);
-               if (err)
+       if (IS_ERR(s))
+               return PTR_ERR(s);
+       while (!(err = link_path_walk(s, nd)) &&
+               (err = mountpoint_last(nd, path)) > 0) {
+               s = trailing_symlink(nd);
+               if (IS_ERR(s)) {
+                       err = PTR_ERR(s);
                         break;
-               err = mountpoint_last(&nd, path);
-               put_link(&nd, &link, cookie);
+               }
         }
-out:
-       path_cleanup(&nd);
+       terminate_walk(nd);
         return err;
  }
  
@@ -2380,16 +2440,19 @@ static int
  filename_mountpoint(int dfd, struct filename *name, struct path *path,
                         unsigned int flags)
  {
+       struct nameidata nd;
         int error;
         if (IS_ERR(name))
                 return PTR_ERR(name);
-       error = path_mountpoint(dfd, name, path, flags | LOOKUP_RCU);
+       set_nameidata(&nd, dfd, name);
+       error = path_mountpoint(&nd, flags | LOOKUP_RCU, path);
         if (unlikely(error == -ECHILD))
-               error = path_mountpoint(dfd, name, path, flags);
+               error = path_mountpoint(&nd, flags, path);
         if (unlikely(error == -ESTALE))
-               error = path_mountpoint(dfd, name, path, flags | LOOKUP_REVAL);
+               error = path_mountpoint(&nd, flags | LOOKUP_REVAL, path);
         if (likely(!error))
                 audit_inode(name, path->dentry, 0);
+       restore_nameidata();
         putname(name);
         return error;
  }
@@ -2456,7 +2519,7 @@ EXPORT_SYMBOL(__check_sticky);
   */
  static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
  {
-       struct inode *inode = victim->d_inode;
+       struct inode *inode = d_backing_inode(victim);
         int error;
  
         if (d_is_negative(victim))
@@ -2922,18 +2985,19 @@ out_dput:
  /*
   * Handle the last step of open()
   */
-static int do_last(struct nameidata *nd, struct path *path,
+static int do_last(struct nameidata *nd,
                    struct file *file, const struct open_flags *op,
-                  int *opened, struct filename *name)
+                  int *opened)
  {
         struct dentry *dir = nd->path.dentry;
         int open_flag = op->open_flag;
         bool will_truncate = (open_flag & O_TRUNC) != 0;
         bool got_write = false;
         int acc_mode = op->acc_mode;
+       unsigned seq;
         struct inode *inode;
-       bool symlink_ok = false;
         struct path save_parent = { .dentry = NULL, .mnt = NULL };
+       struct path path;
         bool retried = false;
         int error;
  
@@ -2942,7 +3006,7 @@ static int do_last(struct nameidata *nd, struct path *path,
  
         if (nd->last_type != LAST_NORM) {
                 error = handle_dots(nd, nd->last_type);
-               if (error)
+               if (unlikely(error))
                         return error;
                 goto finish_open;
         }
@@ -2950,15 +3014,13 @@ static int do_last(struct nameidata *nd, struct path *path,
         if (!(open_flag & O_CREAT)) {
                 if (nd->last.name[nd->last.len])
                         nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
-               if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW))
-                       symlink_ok = true;
                 /* we _can_ be in RCU mode here */
-               error = lookup_fast(nd, path, &inode);
+               error = lookup_fast(nd, &path, &inode, &seq);
                 if (likely(!error))
                         goto finish_lookup;
  
                 if (error < 0)
-                       goto out;
+                       return error;
  
                 BUG_ON(nd->inode != dir->d_inode);
         } else {
@@ -2972,11 +3034,10 @@ static int do_last(struct nameidata *nd, struct path *path,
                 if (error)
                         return error;
  
-               audit_inode(name, dir, LOOKUP_PARENT);
-               error = -EISDIR;
+               audit_inode(nd->name, dir, LOOKUP_PARENT);
                 /* trailing slashes? */
-               if (nd->last.name[nd->last.len])
-                       goto out;
+               if (unlikely(nd->last.name[nd->last.len]))
+                       return -EISDIR;
         }
  
  retry_lookup:
@@ -2991,7 +3052,7 @@ retry_lookup:
                  */
         }
         mutex_lock(&dir->d_inode->i_mutex);
-       error = lookup_open(nd, path, file, op, got_write, opened);
+       error = lookup_open(nd, &path, file, op, got_write, opened);
         mutex_unlock(&dir->d_inode->i_mutex);
  
         if (error <= 0) {
@@ -3002,7 +3063,7 @@ retry_lookup:
                     !S_ISREG(file_inode(file)->i_mode))
                         will_truncate = false;
  
-               audit_inode(name, file->f_path.dentry, 0);
+               audit_inode(nd->name, file->f_path.dentry, 0);
                 goto opened;
         }
  
@@ -3011,15 +3072,15 @@ retry_lookup:
                 open_flag &= ~O_TRUNC;
                 will_truncate = false;
                 acc_mode = MAY_OPEN;
-               path_to_nameidata(path, nd);
+               path_to_nameidata(&path, nd);
                 goto finish_open_created;
         }
  
         /*
          * create/update audit record if it already exists.
          */
-       if (d_is_positive(path->dentry))
-               audit_inode(name, path->dentry, 0);
+       if (d_is_positive(path.dentry))
+               audit_inode(nd->name, path.dentry, 0);
  
         /*
          * If atomic_open() acquired write access it is dropped now due to
@@ -3031,47 +3092,45 @@ retry_lookup:
                 got_write = false;
         }
  
-       error = -EEXIST;
-       if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))
-               goto exit_dput;
-
-       error = follow_managed(path, nd->flags);
-       if (error < 0)
-               goto exit_dput;
+       if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) {
+               path_to_nameidata(&path, nd);
+               return -EEXIST;
+       }
  
-       if (error)
-               nd->flags |= LOOKUP_JUMPED;
+       error = follow_managed(&path, nd);
+       if (unlikely(error < 0))
+               return error;
  
         BUG_ON(nd->flags & LOOKUP_RCU);
-       inode = path->dentry->d_inode;
-       error = -ENOENT;
-       if (d_is_negative(path->dentry)) {
-               path_to_nameidata(path, nd);
-               goto out;
+       inode = d_backing_inode(path.dentry);
+       seq = 0;        /* out of RCU mode, so the value doesn't matter */
+       if (unlikely(d_is_negative(path.dentry))) {
+               path_to_nameidata(&path, nd);
+               return -ENOENT;
         }
  finish_lookup:
-       /* we _can_ be in RCU mode here */
-       if (should_follow_link(path->dentry, !symlink_ok)) {
-               if (nd->flags & LOOKUP_RCU) {
-                       if (unlikely(nd->path.mnt != path->mnt ||
-                                    unlazy_walk(nd, path->dentry))) {
-                               error = -ECHILD;
-                               goto out;
-                       }
-               }
-               BUG_ON(inode != path->dentry->d_inode);
-               return 1;
+       if (nd->depth)
+               put_link(nd);
+       error = should_follow_link(nd, &path, nd->flags & LOOKUP_FOLLOW,
+                                  inode, seq);
+       if (unlikely(error))
+               return error;
+
+       if (unlikely(d_is_symlink(path.dentry)) && !(open_flag & O_PATH)) {
+               path_to_nameidata(&path, nd);
+               return -ELOOP;
         }
  
-       if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path->mnt) {
-               path_to_nameidata(path, nd);
+       if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path.mnt) {
+               path_to_nameidata(&path, nd);
         } else {
                 save_parent.dentry = nd->path.dentry;
-               save_parent.mnt = mntget(path->mnt);
-               nd->path.dentry = path->dentry;
+               save_parent.mnt = mntget(path.mnt);
+               nd->path.dentry = path.dentry;
  
         }
         nd->inode = inode;
+       nd->seq = seq;
         /* Why this, you ask?  _Now_ we might have grown LOOKUP_JUMPED... */
  finish_open:
         error = complete_walk(nd);
@@ -3079,7 +3138,7 @@ finish_open:
                 path_put(&save_parent);
                 return error;
         }
-       audit_inode(name, nd->path.dentry, 0);
+       audit_inode(nd->name, nd->path.dentry, 0);
         error = -EISDIR;
         if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry))
                 goto out;
@@ -3126,12 +3185,8 @@ out:
         if (got_write)
                 mnt_drop_write(nd->path.mnt);
         path_put(&save_parent);
-       terminate_walk(nd);
         return error;
  
-exit_dput:
-       path_put_conditional(path, nd);
-       goto out;
  exit_fput:
         fput(file);
         goto out;
@@ -3155,50 +3210,46 @@ stale_open:
         goto retry_lookup;
  }
  
-static int do_tmpfile(int dfd, struct filename *pathname,
-               struct nameidata *nd, int flags,
+static int do_tmpfile(struct nameidata *nd, unsigned flags,
                 const struct open_flags *op,
                 struct file *file, int *opened)
  {
         static const struct qstr name = QSTR_INIT("/", 1);
-       struct dentry *dentry, *child;
+       struct dentry *child;
         struct inode *dir;
-       int error = path_lookupat(dfd, pathname,
-                                 flags | LOOKUP_DIRECTORY, nd);
+       struct path path;
+       int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);
         if (unlikely(error))
                 return error;
-       error = mnt_want_write(nd->path.mnt);
+       error = mnt_want_write(path.mnt);
         if (unlikely(error))
                 goto out;
+       dir = path.dentry->d_inode;
         /* we want directory to be writable */
-       error = inode_permission(nd->inode, MAY_WRITE | MAY_EXEC);
+       error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
         if (error)
                 goto out2;
-       dentry = nd->path.dentry;
-       dir = dentry->d_inode;
         if (!dir->i_op->tmpfile) {
                 error = -EOPNOTSUPP;
                 goto out2;
         }
-       child = d_alloc(dentry, &name);
+       child = d_alloc(path.dentry, &name);
         if (unlikely(!child)) {
                 error = -ENOMEM;
                 goto out2;
         }
-       nd->flags &= ~LOOKUP_DIRECTORY;
-       nd->flags |= op->intent;
-       dput(nd->path.dentry);
-       nd->path.dentry = child;
-       error = dir->i_op->tmpfile(dir, nd->path.dentry, op->mode);
+       dput(path.dentry);
+       path.dentry = child;
+       error = dir->i_op->tmpfile(dir, child, op->mode);
         if (error)
                 goto out2;
-       audit_inode(pathname, nd->path.dentry, 0);
+       audit_inode(nd->name, child, 0);
         /* Don't check for other permissions, the inode was just created */
-       error = may_open(&nd->path, MAY_OPEN, op->open_flag);
+       error = may_open(&path, MAY_OPEN, op->open_flag);
         if (error)
                 goto out2;
-       file->f_path.mnt = nd->path.mnt;
-       error = finish_open(file, nd->path.dentry, NULL, opened);
+       file->f_path.mnt = path.mnt;
+       error = finish_open(file, child, NULL, opened);
         if (error)
                 goto out2;
         error = open_check_o_direct(file);
@@ -3211,17 +3262,17 @@ static int do_tmpfile(int dfd, struct filename *pathname,
                 spin_unlock(&inode->i_lock);
         }
  out2:
-       mnt_drop_write(nd->path.mnt);
+       mnt_drop_write(path.mnt);
  out:
-       path_put(&nd->path);
+       path_put(&path);
         return error;
  }
  
-static struct file *path_openat(int dfd, struct filename *pathname,
-               struct nameidata *nd, const struct open_flags *op, int flags)
+static struct file *path_openat(struct nameidata *nd,
+                       const struct open_flags *op, unsigned flags)
  {
+       const char *s;
         struct file *file;
-       struct path path;
         int opened = 0;
         int error;
  
@@ -3232,37 +3283,25 @@ static struct file *path_openat(int dfd, struct filename *pathname,
         file->f_flags = op->open_flag;
  
         if (unlikely(file->f_flags & __O_TMPFILE)) {
-               error = do_tmpfile(dfd, pathname, nd, flags, op, file, &opened);
+               error = do_tmpfile(nd, flags, op, file, &opened);
                 goto out2;
         }
  
-       error = path_init(dfd, pathname, flags, nd);
-       if (unlikely(error))
-               goto out;
-
-       error = do_last(nd, &path, file, op, &opened, pathname);
-       while (unlikely(error > 0)) { /* trailing symlink */
-               struct path link = path;
-               void *cookie;
-               if (!(nd->flags & LOOKUP_FOLLOW)) {
-                       path_put_conditional(&path, nd);
-                       path_put(&nd->path);
-                       error = -ELOOP;
-                       break;
-               }
-               error = may_follow_link(&link, nd);
-               if (unlikely(error))
-                       break;
-               nd->flags |= LOOKUP_PARENT;
+       s = path_init(nd, flags);
+       if (IS_ERR(s)) {
+               put_filp(file);
+               return ERR_CAST(s);
+       }
+       while (!(error = link_path_walk(s, nd)) &&
+               (error = do_last(nd, file, op, &opened)) > 0) {
                 nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
-               error = follow_link(&link, nd, &cookie);
-               if (unlikely(error))
+               s = trailing_symlink(nd);
+               if (IS_ERR(s)) {
+                       error = PTR_ERR(s);
                         break;
-               error = do_last(nd, &path, file, op, &opened, pathname);
-               put_link(nd, &link, cookie);
+               }
         }
-out:
-       path_cleanup(nd);
+       terminate_walk(nd);
  out2:
         if (!(opened & FILE_OPENED)) {
                 BUG_ON(!error);
@@ -3287,11 +3326,13 @@ struct file *do_filp_open(int dfd, struct filename *pathname,
         int flags = op->lookup_flags;
         struct file *filp;
  
-       filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU);
+       set_nameidata(&nd, dfd, pathname);
+       filp = path_openat(&nd, op, flags | LOOKUP_RCU);
         if (unlikely(filp == ERR_PTR(-ECHILD)))
-               filp = path_openat(dfd, pathname, &nd, op, flags);
+               filp = path_openat(&nd, op, flags);
         if (unlikely(filp == ERR_PTR(-ESTALE)))
-               filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL);
+               filp = path_openat(&nd, op, flags | LOOKUP_REVAL);
+       restore_nameidata();
         return filp;
  }
  
@@ -3313,11 +3354,13 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
         if (unlikely(IS_ERR(filename)))
                 return ERR_CAST(filename);
  
-       file = path_openat(-1, filename, &nd, op, flags | LOOKUP_RCU);
+       set_nameidata(&nd, -1, filename);
+       file = path_openat(&nd, op, flags | LOOKUP_RCU);
         if (unlikely(file == ERR_PTR(-ECHILD)))
-               file = path_openat(-1, filename, &nd, op, flags);
+               file = path_openat(&nd, op, flags);
         if (unlikely(file == ERR_PTR(-ESTALE)))
-               file = path_openat(-1, filename, &nd, op, flags | LOOKUP_REVAL);
+               file = path_openat(&nd, op, flags | LOOKUP_REVAL);
+       restore_nameidata();
         putname(filename);
         return file;
  }
@@ -3326,7 +3369,8 @@ static struct dentry *filename_create(int dfd, struct filename *name,
                                 struct path *path, unsigned int lookup_flags)
  {
         struct dentry *dentry = ERR_PTR(-EEXIST);
-       struct nameidata nd;
+       struct qstr last;
+       int type;
         int err2;
         int error;
         bool is_dir = (lookup_flags & LOOKUP_DIRECTORY);
@@ -3337,26 +3381,25 @@ static struct dentry *filename_create(int dfd, struct filename *name,
          */
         lookup_flags &= LOOKUP_REVAL;
  
-       error = filename_lookup(dfd, name, LOOKUP_PARENT|lookup_flags, &nd);
-       if (error)
-               return ERR_PTR(error);
+       name = filename_parentat(dfd, name, lookup_flags, path, &last, &type);
+       if (IS_ERR(name))
+               return ERR_CAST(name);
  
         /*
          * Yucky last component or no last component at all?
          * (foo/., foo/.., /////)
          */
-       if (nd.last_type != LAST_NORM)
+       if (unlikely(type != LAST_NORM))
                 goto out;
-       nd.flags &= ~LOOKUP_PARENT;
-       nd.flags |= LOOKUP_CREATE | LOOKUP_EXCL;
  
         /* don't fail immediately if it's r/o, at least try to report other errors */
-       err2 = mnt_want_write(nd.path.mnt);
+       err2 = mnt_want_write(path->mnt);
         /*
          * Do the final lookup.
          */
-       mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
-       dentry = lookup_hash(&nd);
+       lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL;
+       mutex_lock_nested(&path->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+       dentry = __lookup_hash(&last, path->dentry, lookup_flags);
         if (IS_ERR(dentry))
                 goto unlock;
  
@@ -3370,7 +3413,7 @@ static struct dentry *filename_create(int dfd, struct filename *name,
          * all is fine. Let's be bastards - you had / on the end, you've
          * been asking for (non-existent) directory. -ENOENT for you.
          */
-       if (unlikely(!is_dir && nd.last.name[nd.last.len])) {
+       if (unlikely(!is_dir && last.name[last.len])) {
                 error = -ENOENT;
                 goto fail;
         }
@@ -3378,31 +3421,26 @@ static struct dentry *filename_create(int dfd, struct filename *name,
                 error = err2;
                 goto fail;
         }
-       *path = nd.path;
+       putname(name);
         return dentry;
  fail:
         dput(dentry);
         dentry = ERR_PTR(error);
  unlock:
-       mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+       mutex_unlock(&path->dentry->d_inode->i_mutex);
         if (!err2)
-               mnt_drop_write(nd.path.mnt);
+               mnt_drop_write(path->mnt);
  out:
-       path_put(&nd.path);
+       path_put(path);
+       putname(name);
         return dentry;
  }
  
  struct dentry *kern_path_create(int dfd, const char *pathname,
                                 struct path *path, unsigned int lookup_flags)
  {
-       struct filename *filename = getname_kernel(pathname);
-       struct dentry *res;
-
-       if (IS_ERR(filename))
-               return ERR_CAST(filename);
-       res = filename_create(dfd, filename, path, lookup_flags);
-       putname(filename);
-       return res;
+       return filename_create(dfd, getname_kernel(pathname),
+                               path, lookup_flags);
  }
  EXPORT_SYMBOL(kern_path_create);
  
@@ -3415,16 +3453,10 @@ void done_path_create(struct path *path, struct dentry *dentry)
  }
  EXPORT_SYMBOL(done_path_create);
  
-struct dentry *user_path_create(int dfd, const char __user *pathname,
+inline struct dentry *user_path_create(int dfd, const char __user *pathname,
                                 struct path *path, unsigned int lookup_flags)
  {
-       struct filename *tmp = getname(pathname);
-       struct dentry *res;
-       if (IS_ERR(tmp))
-               return ERR_CAST(tmp);
-       res = filename_create(dfd, tmp, path, lookup_flags);
-       putname(tmp);
-       return res;
+       return filename_create(dfd, getname(pathname), path, lookup_flags);
  }
  EXPORT_SYMBOL(user_path_create);
  
@@ -3645,14 +3677,17 @@ static long do_rmdir(int dfd, const char __user *pathname)
         int error = 0;
         struct filename *name;
         struct dentry *dentry;
-       struct nameidata nd;
+       struct path path;
+       struct qstr last;
+       int type;
         unsigned int lookup_flags = 0;
  retry:
-       name = user_path_parent(dfd, pathname, &nd, lookup_flags);
+       name = user_path_parent(dfd, pathname,
+                               &path, &last, &type, lookup_flags);
         if (IS_ERR(name))
                 return PTR_ERR(name);
  
-       switch(nd.last_type) {
+       switch (type) {
         case LAST_DOTDOT:
                 error = -ENOTEMPTY;
                 goto exit1;
@@ -3664,13 +3699,12 @@ retry:
                 goto exit1;
         }
  
-       nd.flags &= ~LOOKUP_PARENT;
-       error = mnt_want_write(nd.path.mnt);
+       error = mnt_want_write(path.mnt);
         if (error)
                 goto exit1;
  
-       mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
-       dentry = lookup_hash(&nd);
+       mutex_lock_nested(&path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+       dentry = __lookup_hash(&last, path.dentry, lookup_flags);
         error = PTR_ERR(dentry);
         if (IS_ERR(dentry))
                 goto exit2;
@@ -3678,17 +3712,17 @@ retry:
                 error = -ENOENT;
                 goto exit3;
         }
-       error = security_path_rmdir(&nd.path, dentry);
+       error = security_path_rmdir(&path, dentry);
         if (error)
                 goto exit3;
-       error = vfs_rmdir(nd.path.dentry->d_inode, dentry);
+       error = vfs_rmdir(path.dentry->d_inode, dentry);
  exit3:
         dput(dentry);
  exit2:
-       mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
-       mnt_drop_write(nd.path.mnt);
+       mutex_unlock(&path.dentry->d_inode->i_mutex);
+       mnt_drop_write(path.mnt);
  exit1:
-       path_put(&nd.path);
+       path_put(&path);
         putname(name);
         if (retry_estale(error, lookup_flags)) {
                 lookup_flags |= LOOKUP_REVAL;
@@ -3771,43 +3805,45 @@ static long do_unlinkat(int dfd, const char __user *pathname)
         int error;
         struct filename *name;
         struct dentry *dentry;
-       struct nameidata nd;
+       struct path path;
+       struct qstr last;
+       int type;
         struct inode *inode = NULL;
         struct inode *delegated_inode = NULL;
         unsigned int lookup_flags = 0;
  retry:
-       name = user_path_parent(dfd, pathname, &nd, lookup_flags);
+       name = user_path_parent(dfd, pathname,
+                               &path, &last, &type, lookup_flags);
         if (IS_ERR(name))
                 return PTR_ERR(name);
  
         error = -EISDIR;
-       if (nd.last_type != LAST_NORM)
+       if (type != LAST_NORM)
                 goto exit1;
  
-       nd.flags &= ~LOOKUP_PARENT;
-       error = mnt_want_write(nd.path.mnt);
+       error = mnt_want_write(path.mnt);
         if (error)
                 goto exit1;
  retry_deleg:
-       mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
-       dentry = lookup_hash(&nd);
+       mutex_lock_nested(&path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+       dentry = __lookup_hash(&last, path.dentry, lookup_flags);
         error = PTR_ERR(dentry);
         if (!IS_ERR(dentry)) {
                 /* Why not before? Because we want correct error value */
-               if (nd.last.name[nd.last.len])
+               if (last.name[last.len])
                         goto slashes;
                 inode = dentry->d_inode;
                 if (d_is_negative(dentry))
                         goto slashes;
                 ihold(inode);
-               error = security_path_unlink(&nd.path, dentry);
+               error = security_path_unlink(&path, dentry);
                 if (error)
                         goto exit2;
-               error = vfs_unlink(nd.path.dentry->d_inode, dentry, &delegated_inode);
+               error = vfs_unlink(path.dentry->d_inode, dentry, &delegated_inode);
  exit2:
                 dput(dentry);
         }
-       mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+       mutex_unlock(&path.dentry->d_inode->i_mutex);
         if (inode)
                 iput(inode);    /* truncate the inode here */
         inode = NULL;
@@ -3816,9 +3852,9 @@ exit2:
                 if (!error)
                         goto retry_deleg;
         }
-       mnt_drop_write(nd.path.mnt);
+       mnt_drop_write(path.mnt);
  exit1:
-       path_put(&nd.path);
+       path_put(&path);
         putname(name);
         if (retry_estale(error, lookup_flags)) {
                 lookup_flags |= LOOKUP_REVAL;
@@ -4248,14 +4284,15 @@ EXPORT_SYMBOL(vfs_rename);
  SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
                 int, newdfd, const char __user *, newname, unsigned int, flags)
  {
-       struct dentry *old_dir, *new_dir;
         struct dentry *old_dentry, *new_dentry;
         struct dentry *trap;
-       struct nameidata oldnd, newnd;
+       struct path old_path, new_path;
+       struct qstr old_last, new_last;
+       int old_type, new_type;
         struct inode *delegated_inode = NULL;
         struct filename *from;
         struct filename *to;
-       unsigned int lookup_flags = 0;
+       unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET;
         bool should_retry = false;
         int error;
  
@@ -4269,47 +4306,45 @@ SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
         if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD))
                 return -EPERM;
  
+       if (flags & RENAME_EXCHANGE)
+               target_flags = 0;
+
  retry:
-       from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags);
+       from = user_path_parent(olddfd, oldname,
+                               &old_path, &old_last, &old_type, lookup_flags);
         if (IS_ERR(from)) {
                 error = PTR_ERR(from);
                 goto exit;
         }
  
-       to = user_path_parent(newdfd, newname, &newnd, lookup_flags);
+       to = user_path_parent(newdfd, newname,
+                               &new_path, &new_last, &new_type, lookup_flags);
         if (IS_ERR(to)) {
                 error = PTR_ERR(to);
                 goto exit1;
         }
  
         error = -EXDEV;
-       if (oldnd.path.mnt != newnd.path.mnt)
+       if (old_path.mnt != new_path.mnt)
                 goto exit2;
  
-       old_dir = oldnd.path.dentry;
         error = -EBUSY;
-       if (oldnd.last_type != LAST_NORM)
+       if (old_type != LAST_NORM)
                 goto exit2;
  
-       new_dir = newnd.path.dentry;
         if (flags & RENAME_NOREPLACE)
                 error = -EEXIST;
-       if (newnd.last_type != LAST_NORM)
+       if (new_type != LAST_NORM)
                 goto exit2;
  
-       error = mnt_want_write(oldnd.path.mnt);
+       error = mnt_want_write(old_path.mnt);
         if (error)
                 goto exit2;
  
-       oldnd.flags &= ~LOOKUP_PARENT;
-       newnd.flags &= ~LOOKUP_PARENT;
-       if (!(flags & RENAME_EXCHANGE))
-               newnd.flags |= LOOKUP_RENAME_TARGET;
-
  retry_deleg:
-       trap = lock_rename(new_dir, old_dir);
+       trap = lock_rename(new_path.dentry, old_path.dentry);
  
-       old_dentry = lookup_hash(&oldnd);
+       old_dentry = __lookup_hash(&old_last, old_path.dentry, lookup_flags);
         error = PTR_ERR(old_dentry);
         if (IS_ERR(old_dentry))
                 goto exit3;
@@ -4317,7 +4352,7 @@ retry_deleg:
         error = -ENOENT;
         if (d_is_negative(old_dentry))
                 goto exit4;
-       new_dentry = lookup_hash(&newnd);
+       new_dentry = __lookup_hash(&new_last, new_path.dentry, lookup_flags | target_flags);
         error = PTR_ERR(new_dentry);
         if (IS_ERR(new_dentry))
                 goto exit4;
@@ -4331,16 +4366,16 @@ retry_deleg:
  
                 if (!d_is_dir(new_dentry)) {
                         error = -ENOTDIR;
-                       if (newnd.last.name[newnd.last.len])
+                       if (new_last.name[new_last.len])
                                 goto exit5;
                 }
         }
         /* unless the source is a directory trailing slashes give -ENOTDIR */
         if (!d_is_dir(old_dentry)) {
                 error = -ENOTDIR;
-               if (oldnd.last.name[oldnd.last.len])
+               if (old_last.name[old_last.len])
                         goto exit5;
-               if (!(flags & RENAME_EXCHANGE) && newnd.last.name[newnd.last.len])
+               if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
                         goto exit5;
         }
         /* source should not be ancestor of target */
@@ -4353,32 +4388,32 @@ retry_deleg:
         if (new_dentry == trap)
                 goto exit5;
  
-       error = security_path_rename(&oldnd.path, old_dentry,
-                                    &newnd.path, new_dentry, flags);
+       error = security_path_rename(&old_path, old_dentry,
+                                    &new_path, new_dentry, flags);
         if (error)
                 goto exit5;
-       error = vfs_rename(old_dir->d_inode, old_dentry,
-                          new_dir->d_inode, new_dentry,
+       error = vfs_rename(old_path.dentry->d_inode, old_dentry,
+                          new_path.dentry->d_inode, new_dentry,
                            &delegated_inode, flags);
  exit5:
         dput(new_dentry);
  exit4:
         dput(old_dentry);
  exit3:
-       unlock_rename(new_dir, old_dir);
+       unlock_rename(new_path.dentry, old_path.dentry);
         if (delegated_inode) {
                 error = break_deleg_wait(&delegated_inode);
                 if (!error)
                         goto retry_deleg;
         }
-       mnt_drop_write(oldnd.path.mnt);
+       mnt_drop_write(old_path.mnt);
  exit2:
         if (retry_estale(error, lookup_flags))
                 should_retry = true;
-       path_put(&newnd.path);
+       path_put(&new_path);
         putname(to);
  exit1:
-       path_put(&oldnd.path);
+       path_put(&old_path);
         putname(from);
         if (should_retry) {
                 should_retry = false;
@@ -4437,18 +4472,19 @@ EXPORT_SYMBOL(readlink_copy);
   */
  int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
  {
-       struct nameidata nd;
         void *cookie;
+       struct inode *inode = d_inode(dentry);
+       const char *link = inode->i_link;
         int res;
  
-       nd.depth = 0;
-       cookie = dentry->d_inode->i_op->follow_link(dentry, &nd);
-       if (IS_ERR(cookie))
-               return PTR_ERR(cookie);
-
-       res = readlink_copy(buffer, buflen, nd_get_link(&nd));
-       if (dentry->d_inode->i_op->put_link)
-               dentry->d_inode->i_op->put_link(dentry, &nd, cookie);
+       if (!link) {
+               link = inode->i_op->follow_link(dentry, &cookie);
+               if (IS_ERR(link))
+                       return PTR_ERR(link);
+       }
+       res = readlink_copy(buffer, buflen, link);
+       if (inode->i_op->put_link)
+               inode->i_op->put_link(inode, cookie);
         return res;
  }
  EXPORT_SYMBOL(generic_readlink);
@@ -4480,22 +4516,21 @@ int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
  }
  EXPORT_SYMBOL(page_readlink);
  
-void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd)
+const char *page_follow_link_light(struct dentry *dentry, void **cookie)
  {
         struct page *page = NULL;
-       nd_set_link(nd, page_getlink(dentry, &page));
-       return page;
+       char *res = page_getlink(dentry, &page);
+       if (!IS_ERR(res))
+               *cookie = page;
+       return res;
  }
  EXPORT_SYMBOL(page_follow_link_light);
  
-void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
+void page_put_link(struct inode *unused, void *cookie)
  {
         struct page *page = cookie;
-
-       if (page) {
-               kunmap(page);
-               page_cache_release(page);
-       }
+       kunmap(page);
+       page_cache_release(page);
  }
  EXPORT_SYMBOL(page_put_link);
  
diff --git a/fs/namespace.c b/fs/namespace.c

index 1b9e11167baedc310b8f5b6585ac019abd78b915..9c1c43d0d4f10112bcf711068873a70e2a057200 100644 (file)
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -590,24 +590,35 @@ static void delayed_free_vfsmnt(struct rcu_head *head)
  }
  
  /* call under rcu_read_lock */
-bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
+int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
  {
         struct mount *mnt;
         if (read_seqretry(&mount_lock, seq))
-               return false;
+               return 1;
         if (bastard == NULL)
-               return true;
+               return 0;
         mnt = real_mount(bastard);
         mnt_add_count(mnt, 1);
         if (likely(!read_seqretry(&mount_lock, seq)))
-               return true;
+               return 0;
         if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
                 mnt_add_count(mnt, -1);
-               return false;
+               return 1;
+       }
+       return -1;
+}
+
+/* call under rcu_read_lock */
+bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
+{
+       int res = __legitimize_mnt(bastard, seq);
+       if (likely(!res))
+               return true;
+       if (unlikely(res < 0)) {
+               rcu_read_unlock();
+               mntput(bastard);
+               rcu_read_lock();
         }
-       rcu_read_unlock();
-       mntput(bastard);
-       rcu_read_lock();
         return false;
  }
  
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c

index 2d56200655fe600ae73d11c382af5815285fab48..b6de433da5db14ab788ba358ce94b5952d5c601f 100644 (file)
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -20,7 +20,6 @@
  #include <linux/stat.h>
  #include <linux/mm.h>
  #include <linux/string.h>
-#include <linux/namei.h>
  
  /* Symlink caching in the page cache is even more simplistic
   * and straight-forward than readdir caching.
@@ -43,7 +42,7 @@ error:
         return -EIO;
  }
  
-static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *nfs_follow_link(struct dentry *dentry, void **cookie)
  {
         struct inode *inode = d_inode(dentry);
         struct page *page;
@@ -51,19 +50,13 @@ static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd)
  
         err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping));
         if (err)
-               goto read_failed;
+               return err;
         page = read_cache_page(&inode->i_data, 0,
                                 (filler_t *)nfs_symlink_filler, inode);
-       if (IS_ERR(page)) {
-               err = page;
-               goto read_failed;
-       }
-       nd_set_link(nd, kmap(page));
-       return page;
-
-read_failed:
-       nd_set_link(nd, err);
-       return NULL;
+       if (IS_ERR(page))
+               return ERR_CAST(page);
+       *cookie = page;
+       return kmap(page);
  }
  
  /*
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c

index 0f35b80d17fe019cdae356ecaba8359ac9f9a9a2..443abecf01b7d45cfb19be0ee63032b156ae718b 100644 (file)
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -35,7 +35,7 @@
   * ntfs_lookup - find the inode represented by a dentry in a directory inode
   * @dir_ino:   directory inode in which to look for the inode
   * @dent:      dentry representing the inode to look for
- * @nd:                lookup nameidata
+ * @flags:     lookup flags
   *
   * In short, ntfs_lookup() looks for the inode represented by the dentry @dent
   * in the directory inode @dir_ino and if found attaches the inode to the
diff --git a/fs/open.c b/fs/open.c

index 98e5a52dc68c9503136b8ff6a89b44dacd5478bf..e0250bdcc44005db6510ac516a53923282db5d12 100644 (file)
--- a/fs/open.c
+++ b/fs/open.c
@@ -367,7 +367,7 @@ retry:
         if (res)
                 goto out;
  
-       inode = path.dentry->d_inode;
+       inode = d_backing_inode(path.dentry);
  
         if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) {
                 /*
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c

index 04f1248846877d019625c861b474702177b38ae5..308379b2d0b2cb82b6ad505755484a19b4042fa0 100644 (file)
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -140,11 +140,12 @@ struct ovl_link_data {
         void *cookie;
  };
  
-static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *ovl_follow_link(struct dentry *dentry, void **cookie)
  {
-       void *ret;
         struct dentry *realdentry;
         struct inode *realinode;
+       struct ovl_link_data *data = NULL;
+       const char *ret;
  
         realdentry = ovl_dentry_real(dentry);
         realinode = realdentry->d_inode;
@@ -152,28 +153,28 @@ static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
         if (WARN_ON(!realinode->i_op->follow_link))
                 return ERR_PTR(-EPERM);
  
-       ret = realinode->i_op->follow_link(realdentry, nd);
-       if (IS_ERR(ret))
-               return ret;
-
         if (realinode->i_op->put_link) {
-               struct ovl_link_data *data;
-
                 data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
-               if (!data) {
-                       realinode->i_op->put_link(realdentry, nd, ret);
+               if (!data)
                         return ERR_PTR(-ENOMEM);
-               }
                 data->realdentry = realdentry;
-               data->cookie = ret;
+       }
  
-               return data;
-       } else {
-               return NULL;
+       ret = realinode->i_op->follow_link(realdentry, cookie);
+       if (IS_ERR_OR_NULL(ret)) {
+               kfree(data);
+               return ret;
         }
+
+       if (data)
+               data->cookie = *cookie;
+
+       *cookie = data;
+
+       return ret;
  }
  
-static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
+static void ovl_put_link(struct inode *unused, void *c)
  {
         struct inode *realinode;
         struct ovl_link_data *data = c;
@@ -182,7 +183,7 @@ static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
                 return;
  
         realinode = data->realdentry->d_inode;
-       realinode->i_op->put_link(data->realdentry, nd, data->cookie);
+       realinode->i_op->put_link(realinode, data->cookie);
         kfree(data);
  }
  
diff --git a/fs/proc/base.c b/fs/proc/base.c

index 093ca14f570154f5de1cc6db10c7995e1b7cba02..286a422f440e9ed8a6c4b9c54ad7b7c85d28d6f2 100644 (file)
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1380,7 +1380,7 @@ static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
                 return -ENOENT;
  }
  
-static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *proc_pid_follow_link(struct dentry *dentry, void **cookie)
  {
         struct inode *inode = d_inode(dentry);
         struct path path;
@@ -1394,7 +1394,7 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
         if (error)
                 goto out;
  
-       nd_jump_link(nd, &path);
+       nd_jump_link(&path);
         return NULL;
  out:
         return ERR_PTR(error);
diff --git a/fs/proc/inode.c b/fs/proc/inode.c

index 8272aaba1bb06fd4b65416979155f18b590291a4..afe232b9df6e5b6c83779712c8cd068169992a55 100644 (file)
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -23,7 +23,6 @@
  #include <linux/slab.h>
  #include <linux/mount.h>
  #include <linux/magic.h>
-#include <linux/namei.h>
  
  #include <asm/uaccess.h>
  
@@ -394,16 +393,16 @@ static const struct file_operations proc_reg_file_ops_no_compat = {
  };
  #endif
  
-static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *proc_follow_link(struct dentry *dentry, void **cookie)
  {
         struct proc_dir_entry *pde = PDE(d_inode(dentry));
         if (unlikely(!use_pde(pde)))
                 return ERR_PTR(-EINVAL);
-       nd_set_link(nd, pde->data);
-       return pde;
+       *cookie = pde;
+       return pde->data;
  }
  
-static void proc_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
+static void proc_put_link(struct inode *unused, void *p)
  {
         unuse_pde(p);
  }
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c

index e512642dbbdcb3cfe37f97b586770d695e2a755f..f6e8354b8cea20a936f6a4f8ae0335fd7fa36bd4 100644 (file)
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -30,7 +30,7 @@ static const struct proc_ns_operations *ns_entries[] = {
         &mntns_operations,
  };
  
-static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *proc_ns_follow_link(struct dentry *dentry, void **cookie)
  {
         struct inode *inode = d_inode(dentry);
         const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops;
@@ -45,7 +45,7 @@ static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd)
         if (ptrace_may_access(task, PTRACE_MODE_READ)) {
                 error = ns_get_path(&ns_path, task, ns_ops);
                 if (!error)
-                       nd_jump_link(nd, &ns_path);
+                       nd_jump_link(&ns_path);
         }
         put_task_struct(task);
         return error;
diff --git a/fs/proc/self.c b/fs/proc/self.c

index 6195b4a7c3b17f8c7feb09542e7e435b2cd59356..113b8d061fc023858ab152a5033e029d085f27a6 100644 (file)
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -1,5 +1,4 @@
  #include <linux/sched.h>
-#include <linux/namei.h>
  #include <linux/slab.h>
  #include <linux/pid_namespace.h>
  #include "internal.h"
@@ -19,21 +18,20 @@ static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
         return readlink_copy(buffer, buflen, tmp);
  }
  
-static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *proc_self_follow_link(struct dentry *dentry, void **cookie)
  {
         struct pid_namespace *ns = dentry->d_sb->s_fs_info;
         pid_t tgid = task_tgid_nr_ns(current, ns);
-       char *name = ERR_PTR(-ENOENT);
-       if (tgid) {
-               /* 11 for max length of signed int in decimal + NULL term */
-               name = kmalloc(12, GFP_KERNEL);
-               if (!name)
-                       name = ERR_PTR(-ENOMEM);
-               else
-                       sprintf(name, "%d", tgid);
-       }
-       nd_set_link(nd, name);
-       return NULL;
+       char *name;
+
+       if (!tgid)
+               return ERR_PTR(-ENOENT);
+       /* 11 for max length of signed int in decimal + NULL term */
+       name = kmalloc(12, GFP_KERNEL);
+       if (!name)
+               return ERR_PTR(-ENOMEM);
+       sprintf(name, "%d", tgid);
+       return *cookie = name;
  }
  
  static const struct inode_operations proc_self_inode_operations = {
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c

index a8371993b4fb7822b865cd4b2a02bc63d9642fa7..947b0f4fd0a194057334762bafeff3548c276568 100644 (file)
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -1,5 +1,4 @@
  #include <linux/sched.h>
-#include <linux/namei.h>
  #include <linux/slab.h>
  #include <linux/pid_namespace.h>
  #include "internal.h"
@@ -20,21 +19,20 @@ static int proc_thread_self_readlink(struct dentry *dentry, char __user *buffer,
         return readlink_copy(buffer, buflen, tmp);
  }
  
-static void *proc_thread_self_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *proc_thread_self_follow_link(struct dentry *dentry, void **cookie)
  {
         struct pid_namespace *ns = dentry->d_sb->s_fs_info;
         pid_t tgid = task_tgid_nr_ns(current, ns);
         pid_t pid = task_pid_nr_ns(current, ns);
-       char *name = ERR_PTR(-ENOENT);
-       if (pid) {
-               name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, GFP_KERNEL);
-               if (!name)
-                       name = ERR_PTR(-ENOMEM);
-               else
-                       sprintf(name, "%d/task/%d", tgid, pid);
-       }
-       nd_set_link(nd, name);
-       return NULL;
+       char *name;
+
+       if (!pid)
+               return ERR_PTR(-ENOENT);
+       name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, GFP_KERNEL);
+       if (!name)
+               return ERR_PTR(-ENOMEM);
+       sprintf(name, "%d/task/%d", tgid, pid);
+       return *cookie = name;
  }
  
  static const struct inode_operations proc_thread_self_inode_operations = {
diff --git a/fs/sysv/Makefile b/fs/sysv/Makefile

index 3591f9d7a48a4a38948d30f7b523103471b85889..7a75e70a4b61b9d9fdc9fdd5d087be105304d62e 100644 (file)
--- a/fs/sysv/Makefile
+++ b/fs/sysv/Makefile
@@ -5,4 +5,4 @@
  obj-$(CONFIG_SYSV_FS) += sysv.o
  
  sysv-objs := ialloc.o balloc.o inode.o itree.o file.o dir.o \
-            namei.o super.o symlink.o
+            namei.o super.o
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c

index 88956309cc86ab6d6d614315651ced6778647347..590ad9206e3f4e761d2c2ad95cfaedfa76b47e16 100644 (file)
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -166,8 +166,9 @@ void sysv_set_inode(struct inode *inode, dev_t rdev)
                         inode->i_op = &sysv_symlink_inode_operations;
                         inode->i_mapping->a_ops = &sysv_aops;
                 } else {
-                       inode->i_op = &sysv_fast_symlink_inode_operations;
-                       nd_terminate_link(SYSV_I(inode)->i_data, inode->i_size,
+                       inode->i_op = &simple_symlink_inode_operations;
+                       inode->i_link = (char *)SYSV_I(inode)->i_data;
+                       nd_terminate_link(inode->i_link, inode->i_size,
                                 sizeof(SYSV_I(inode)->i_data) - 1);
                 }
         } else
diff --git a/fs/sysv/symlink.c b/fs/sysv/symlink.c

deleted file mode 100644 (file)

index d3fa0d7..0000000
--- a/fs/sysv/symlink.c
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- *  linux/fs/sysv/symlink.c
- *
- *  Handling of System V filesystem fast symlinks extensions.
- *  Aug 2001, Christoph Hellwig (hch@infradead.org)
- */
-
-#include "sysv.h"
-#include <linux/namei.h>
-
-static void *sysv_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-       nd_set_link(nd, (char *)SYSV_I(d_inode(dentry))->i_data);
-       return NULL;
-}
-
-const struct inode_operations sysv_fast_symlink_inode_operations = {
-       .readlink       = generic_readlink,
-       .follow_link    = sysv_follow_link,
-};
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h

index 69d488986cce4923860c6d1f4c7ab325370fc5ce..2c13525131cd8146dd19d6a07086a70afc97b25a 100644 (file)
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -161,7 +161,6 @@ extern ino_t sysv_inode_by_name(struct dentry *);
  
  extern const struct inode_operations sysv_file_inode_operations;
  extern const struct inode_operations sysv_dir_inode_operations;
-extern const struct inode_operations sysv_fast_symlink_inode_operations;
  extern const struct file_operations sysv_file_operations;
  extern const struct file_operations sysv_dir_operations;
  extern const struct address_space_operations sysv_aops;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c

index 27060fc855d42549b2bbe7d3cbe6329858e3fb2c..5c27c66c224af38618ec4f92b453bd8e65d24608 100644 (file)
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -889,6 +889,7 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
  
         memcpy(ui->data, symname, len);
         ((char *)ui->data)[len] = '\0';
+       inode->i_link = ui->data;
         /*
          * The terminating zero byte is not written to the flash media and it
          * is put just to make later in-memory string processing simpler. Thus,
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c

index 35efc103c39c102215cebb0792b6b4f24d49b90d..a3dfe2ae79f28592a0ba01feb6d0b1889345957c 100644 (file)
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -51,7 +51,6 @@
  
  #include "ubifs.h"
  #include <linux/mount.h>
-#include <linux/namei.h>
  #include <linux/slab.h>
  
  static int read_block(struct inode *inode, void *addr, unsigned int block,
@@ -1300,14 +1299,6 @@ static void ubifs_invalidatepage(struct page *page, unsigned int offset,
         ClearPageChecked(page);
  }
  
-static void *ubifs_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-       struct ubifs_inode *ui = ubifs_inode(d_inode(dentry));
-
-       nd_set_link(nd, ui->data);
-       return NULL;
-}
-
  int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
  {
         struct inode *inode = file->f_mapping->host;
@@ -1570,7 +1561,7 @@ const struct inode_operations ubifs_file_inode_operations = {
  
  const struct inode_operations ubifs_symlink_inode_operations = {
         .readlink    = generic_readlink,
-       .follow_link = ubifs_follow_link,
+       .follow_link = simple_follow_link,
         .setattr     = ubifs_setattr,
         .getattr     = ubifs_getattr,
         .setxattr    = ubifs_setxattr,
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c

index 75e6f04bb795a9605a5d8c7cf6e4607b89a5f7cf..20f5dbd7c6a8b6bee476dc6139b803c1e14d5a47 100644 (file)
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -195,6 +195,7 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
                 }
                 memcpy(ui->data, ino->data, ui->data_len);
                 ((char *)ui->data)[ui->data_len] = '\0';
+               inode->i_link = ui->data;
                 break;
         case S_IFBLK:
         case S_IFCHR:
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c

index be7d42c7d9382bf8072a7e6e9eff7e03b24bab25..99aaf5c9bf4d83f0f5ec6469d6d6cb20827750d0 100644 (file)
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -572,9 +572,10 @@ static void ufs_set_inode_ops(struct inode *inode)
                 inode->i_fop = &ufs_dir_operations;
                 inode->i_mapping->a_ops = &ufs_aops;
         } else if (S_ISLNK(inode->i_mode)) {
-               if (!inode->i_blocks)
+               if (!inode->i_blocks) {
                         inode->i_op = &ufs_fast_symlink_inode_operations;
-               else {
+                       inode->i_link = (char *)UFS_I(inode)->i_u1.i_symlink;
+               } else {
                         inode->i_op = &ufs_symlink_inode_operations;
                         inode->i_mapping->a_ops = &ufs_aops;
                 }
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c

index e491a93a7e9af14c4227ee5072fa4a9d0bc17709..f773deb1d2e3fd561b906a0dd2a050d16161a5be 100644 (file)
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -144,7 +144,8 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
         } else {
                 /* fast symlink */
                 inode->i_op = &ufs_fast_symlink_inode_operations;
-               memcpy(UFS_I(inode)->i_u1.i_symlink, symname, l);
+               inode->i_link = (char *)UFS_I(inode)->i_u1.i_symlink;
+               memcpy(inode->i_link, symname, l);
                 inode->i_size = l-1;
         }
         mark_inode_dirty(inode);
diff --git a/fs/ufs/symlink.c b/fs/ufs/symlink.c

index 5b537e2fdda385a0da9aa346e93e05e83084e1dd..874480bb43e9d08190707d7d2ed30aee02eec531 100644 (file)
--- a/fs/ufs/symlink.c
+++ b/fs/ufs/symlink.c
@@ -25,23 +25,12 @@
   *  ext2 symlink handling code
   */
  
-#include <linux/fs.h>
-#include <linux/namei.h>
-
  #include "ufs_fs.h"
  #include "ufs.h"
  
-
-static void *ufs_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-       struct ufs_inode_info *p = UFS_I(d_inode(dentry));
-       nd_set_link(nd, (char*)p->i_u1.i_symlink);
-       return NULL;
-}
-
  const struct inode_operations ufs_fast_symlink_inode_operations = {
         .readlink       = generic_readlink,
-       .follow_link    = ufs_follow_link,
+       .follow_link    = simple_follow_link,
         .setattr        = ufs_setattr,
  };
  
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c

index f4cd7204e23667724c01a4c4b8efe8c1d48b1cb3..7f51f39f8acc0a2fd407a7be57c1477439213a0f 100644 (file)
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -41,7 +41,6 @@
  
  #include <linux/capability.h>
  #include <linux/xattr.h>
-#include <linux/namei.h>
  #include <linux/posix_acl.h>
  #include <linux/security.h>
  #include <linux/fiemap.h>
@@ -414,10 +413,10 @@ xfs_vn_rename(
   * we need to be very careful about how much stack we use.
   * uio is kmalloced for this reason...
   */
-STATIC void *
+STATIC const char *
  xfs_vn_follow_link(
         struct dentry           *dentry,
-       struct nameidata        *nd)
+       void                    **cookie)
  {
         char                    *link;
         int                     error = -ENOMEM;
@@ -430,14 +429,12 @@ xfs_vn_follow_link(
         if (unlikely(error))
                 goto out_kfree;
  
-       nd_set_link(nd, link);
-       return NULL;
+       return *cookie = link;
  
   out_kfree:
         kfree(link);
   out_err:
-       nd_set_link(nd, ERR_PTR(error));
-       return NULL;
+       return ERR_PTR(error);
  }
  
  STATIC int
diff --git a/include/asm-generic/futex.h b/include/asm-generic/futex.h

index b59b5a52637ec78397262862b06796b5d2fa2329..e56272c919b5a688e1739cdefaa08c121e26f5a8 100644 (file)
--- a/include/asm-generic/futex.h
+++ b/include/asm-generic/futex.h
@@ -8,8 +8,7 @@
  #ifndef CONFIG_SMP
  /*
   * The following implementation only for uniprocessor machines.
- * For UP, it's relies on the fact that pagefault_disable() also disables
- * preemption to ensure mutual exclusion.
+ * It relies on preempt_disable() ensuring mutual exclusion.
   *
   */
  
@@ -38,6 +37,7 @@ futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
         if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
                 oparg = 1 << oparg;
  
+       preempt_disable();
         pagefault_disable();
  
         ret = -EFAULT;
@@ -72,6 +72,7 @@ futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
  
  out_pagefault_enable:
         pagefault_enable();
+       preempt_enable();
  
         if (ret == 0) {
                 switch (cmp) {
@@ -106,6 +107,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
  {
         u32 val;
  
+       preempt_disable();
         if (unlikely(get_user(val, uaddr) != 0))
                 return -EFAULT;
  
@@ -113,6 +115,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
                 return -EFAULT;
  
         *uval = val;
+       preempt_enable();
  
         return 0;
  }
diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h

index eb6f9e6c30756f5f39605582cf34c0e48c6a3df1..d0a7a4753db2b3cb516ceed3c331ad330b92e877 100644 (file)
--- a/include/asm-generic/preempt.h
+++ b/include/asm-generic/preempt.h
@@ -79,11 +79,8 @@ static __always_inline bool should_resched(void)
  #ifdef CONFIG_PREEMPT
  extern asmlinkage void preempt_schedule(void);
  #define __preempt_schedule() preempt_schedule()
-
-#ifdef CONFIG_CONTEXT_TRACKING
-extern asmlinkage void preempt_schedule_context(void);
-#define __preempt_schedule_context() preempt_schedule_context()
-#endif
+extern asmlinkage void preempt_schedule_notrace(void);
+#define __preempt_schedule_notrace() preempt_schedule_notrace()
  #endif /* CONFIG_PREEMPT */
  
  #endif /* __ASM_PREEMPT_H */
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h

index aff923ae8c4b963272563759b9ac52ad55778bd0..d87d8eced06407c59c6d231f9e707bdcc398ce52 100644 (file)
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -116,7 +116,6 @@ __printf(3, 4)
  int bdi_register(struct backing_dev_info *bdi, struct device *parent,
                 const char *fmt, ...);
  int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
-void bdi_unregister(struct backing_dev_info *bdi);
  int __must_check bdi_setup_and_register(struct backing_dev_info *, char *);
  void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
                         enum wb_reason reason);
diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h

index 86c12c93e3cf6ce9c1db4085b9ee986d2c1e5b9a..8fdcb783197d723a60dc548114af08cd6de9ca88 100644 (file)
--- a/include/linux/bottom_half.h
+++ b/include/linux/bottom_half.h
@@ -2,7 +2,6 @@
  #define _LINUX_BH_H
  
  #include <linux/preempt.h>
-#include <linux/preempt_mask.h>
  
  #ifdef CONFIG_TRACE_IRQFLAGS
  extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
diff --git a/include/linux/compiler.h b/include/linux/compiler.h

index 03e227ba481c419ab468d697469a4418eec9f48f..05be2352fef889663fad482f57c4d8b9d5e18df4 100644 (file)
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -252,6 +252,22 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
  #define WRITE_ONCE(x, val) \
         ({ union { typeof(x) __val; char __c[1]; } __u = { .__val = (val) }; __write_once_size(&(x), __u.__c, sizeof(x)); __u.__val; })
  
+/**
+ * READ_ONCE_CTRL - Load a value that heads a control dependency
+ * @x: The value to load, heading the control dependency
+ *
+ * Expands to READ_ONCE(@x) followed by smp_read_barrier_depends(), and
+ * evaluates to the loaded value.  Control dependencies are tricky: see
+ * Documentation/memory-barriers.txt before using one, prefer
+ * smp_load_acquire() where it is simpler, and keep this for the hottest paths.
+ */
+#define READ_ONCE_CTRL(x) \
+({ \
+       typeof(x) __ctrl_val = READ_ONCE(x); \
+       smp_read_barrier_depends(); /* Enforce control dependency. */ \
+       __ctrl_val; \
+})
+
  #endif /* __KERNEL__ */
  
  #endif /* __ASSEMBLY__ */
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h

index cb25af46105406908e5a825ca3812f0ff63d3194..420311bcee38c291cf75894ebfe4c2d1141da1a7 100644 (file)
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -45,7 +45,6 @@ extern struct dentry *arch_debugfs_dir;
  
  /* declared over in file.c */
  extern const struct file_operations debugfs_file_operations;
-extern const struct inode_operations debugfs_link_operations;
  
  struct dentry *debugfs_create_file(const char *name, umode_t mode,
                                    struct dentry *parent, void *data,
diff --git a/include/linux/efi.h b/include/linux/efi.h

index af5be0368dec26c934565e634c0dc803958ed2cc..2092965afca3994606ee8a255a97929a38df8095 100644 (file)
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -583,6 +583,9 @@ void efi_native_runtime_setup(void);
  #define EFI_FILE_INFO_ID \
      EFI_GUID(  0x9576e92, 0x6d3f, 0x11d2, 0x8e, 0x39, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b )
  
+#define EFI_SYSTEM_RESOURCE_TABLE_GUID \
+    EFI_GUID(  0xb122a263, 0x3661, 0x4f68, 0x99, 0x29, 0x78, 0xf8, 0xb0, 0xd6, 0x21, 0x80 )
+
  #define EFI_FILE_SYSTEM_GUID \
      EFI_GUID(  0x964e5b22, 0x6459, 0x11d2, 0x8e, 0x39, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b )
  
@@ -823,6 +826,7 @@ extern struct efi {
         unsigned long fw_vendor;        /* fw_vendor */
         unsigned long runtime;          /* runtime table */
         unsigned long config_table;     /* config tables */
+       unsigned long esrt;             /* ESRT table */
         efi_get_time_t *get_time;
         efi_set_time_t *set_time;
         efi_get_wakeup_time_t *get_wakeup_time;
@@ -875,6 +879,11 @@ static inline efi_status_t efi_query_variable_store(u32 attributes, unsigned lon
  #endif
  extern void __iomem *efi_lookup_mapped_addr(u64 phys_addr);
  extern int efi_config_init(efi_config_table_type_t *arch_tables);
+#ifdef CONFIG_EFI_ESRT
+extern void __init efi_esrt_init(void);
+#else
+static inline void efi_esrt_init(void) { }
+#endif
  extern int efi_config_parse_tables(void *config_tables, int count, int sz,
                                    efi_config_table_type_t *arch_tables);
  extern u64 efi_get_iobase (void);
@@ -882,12 +891,15 @@ extern u32 efi_mem_type (unsigned long phys_addr);
  extern u64 efi_mem_attributes (unsigned long phys_addr);
  extern u64 efi_mem_attribute (unsigned long phys_addr, unsigned long size);
  extern int __init efi_uart_console_only (void);
+extern u64 efi_mem_desc_end(efi_memory_desc_t *md);
+extern int efi_mem_desc_lookup(u64 phys_addr, efi_memory_desc_t *out_md);
  extern void efi_initialize_iomem_resources(struct resource *code_resource,
                 struct resource *data_resource, struct resource *bss_resource);
  extern void efi_get_time(struct timespec *now);
  extern void efi_reserve_boot_services(void);
  extern int efi_get_fdt_params(struct efi_fdt_params *params, int verbose);
  extern struct efi_memory_map memmap;
+extern struct kobject *efi_kobj;
  
  extern int efi_reboot_quirk_mode;
  extern bool efi_poweroff_required(void);
diff --git a/include/linux/fs.h b/include/linux/fs.h

index 35ec87e490b1a41ff0bc3ba20b06ac9d958f972a..b577e801b4af17ddd3288e28c209b644415cc63c 100644 (file)
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -38,7 +38,6 @@ struct backing_dev_info;
  struct export_operations;
  struct hd_geometry;
  struct iovec;
-struct nameidata;
  struct kiocb;
  struct kobject;
  struct pipe_inode_info;
@@ -656,6 +655,7 @@ struct inode {
                 struct pipe_inode_info  *i_pipe;
                 struct block_device     *i_bdev;
                 struct cdev             *i_cdev;
+               char                    *i_link;
         };
  
         __u32                   i_generation;
@@ -1607,12 +1607,12 @@ struct file_operations {
  
  struct inode_operations {
         struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
-       void * (*follow_link) (struct dentry *, struct nameidata *);
+       const char * (*follow_link) (struct dentry *, void **);
         int (*permission) (struct inode *, int);
         struct posix_acl * (*get_acl)(struct inode *, int);
  
         int (*readlink) (struct dentry *, char __user *,int);
-       void (*put_link) (struct dentry *, struct nameidata *, void *);
+       void (*put_link) (struct inode *, void *);
  
         int (*create) (struct inode *,struct dentry *, umode_t, bool);
         int (*link) (struct dentry *,struct inode *,struct dentry *);
@@ -1879,6 +1879,7 @@ enum file_time_flags {
         S_VERSION = 8,
  };
  
+extern bool atime_needs_update(const struct path *, struct inode *);
  extern void touch_atime(const struct path *);
  static inline void file_accessed(struct file *file)
  {
@@ -2704,13 +2705,14 @@ extern const struct file_operations generic_ro_fops;
  
  extern int readlink_copy(char __user *, int, const char *);
  extern int page_readlink(struct dentry *, char __user *, int);
-extern void *page_follow_link_light(struct dentry *, struct nameidata *);
-extern void page_put_link(struct dentry *, struct nameidata *, void *);
+extern const char *page_follow_link_light(struct dentry *, void **);
+extern void page_put_link(struct inode *, void *);
  extern int __page_symlink(struct inode *inode, const char *symname, int len,
                 int nofs);
  extern int page_symlink(struct inode *inode, const char *symname, int len);
  extern const struct inode_operations page_symlink_inode_operations;
-extern void kfree_put_link(struct dentry *, struct nameidata *, void *);
+extern void kfree_put_link(struct inode *, void *);
+extern void free_page_put_link(struct inode *, void *);
  extern int generic_readlink(struct dentry *, char __user *, int);
  extern void generic_fillattr(struct inode *, struct kstat *);
  int vfs_getattr_nosec(struct path *path, struct kstat *stat);
@@ -2721,6 +2723,8 @@ void __inode_sub_bytes(struct inode *inode, loff_t bytes);
  void inode_sub_bytes(struct inode *inode, loff_t bytes);
  loff_t inode_get_bytes(struct inode *inode);
  void inode_set_bytes(struct inode *inode, loff_t bytes);
+const char *simple_follow_link(struct dentry *, void **);
+extern const struct inode_operations simple_symlink_inode_operations;
  
  extern int iterate_dir(struct file *, struct dir_context *);
  
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h

index f4af03404b9789805e06638303a8873aa14fe5a8..dfd59d6bc6f0f74ef89b0512a25180fd0077e2d3 100644 (file)
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -1,7 +1,7 @@
  #ifndef LINUX_HARDIRQ_H
  #define LINUX_HARDIRQ_H
  
-#include <linux/preempt_mask.h>
+#include <linux/preempt.h>
  #include <linux/lockdep.h>
  #include <linux/ftrace_irq.h>
  #include <linux/vtime.h>
diff --git a/include/linux/highmem.h b/include/linux/highmem.h

index 9286a46b7d69b539f027bcc890b3be976d20f228..6aefcd0031a6bd013cf322d1021812fcf84250c2 100644 (file)
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -65,6 +65,7 @@ static inline void kunmap(struct page *page)
  
  static inline void *kmap_atomic(struct page *page)
  {
+       preempt_disable();
         pagefault_disable();
         return page_address(page);
  }
@@ -73,6 +74,7 @@ static inline void *kmap_atomic(struct page *page)
  static inline void __kunmap_atomic(void *addr)
  {
         pagefault_enable();
+       preempt_enable();
  }
  
  #define kmap_atomic_pfn(pfn)   kmap_atomic(pfn_to_page(pfn))
diff --git a/include/linux/init_task.h b/include/linux/init_task.h

index 696d22312b3199ed2f515240111a669a35d48934..bb9b075f0eb022e8b35fa64916af56245243cc5b 100644 (file)
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -50,9 +50,8 @@ extern struct fs_struct init_fs;
         .cpu_timers     = INIT_CPU_TIMERS(sig.cpu_timers),              \
         .rlim           = INIT_RLIMITS,                                 \
         .cputimer       = {                                             \
-               .cputime = INIT_CPUTIME,                                \
-               .running = 0,                                           \
-               .lock = __RAW_SPIN_LOCK_UNLOCKED(sig.cputimer.lock),    \
+               .cputime_atomic = INIT_CPUTIME_ATOMIC,                  \
+               .running        = 0,                                    \
         },                                                              \
         .cred_guard_mutex =                                             \
                  __MUTEX_INITIALIZER(sig.cred_guard_mutex),             \
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h

index 0c251be39836e32e6ddfb8250a79ce0683a4f95b..3665cb331ca1c6f58276cc1ea14cbe77ce19d3b1 100644 (file)
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -116,13 +116,14 @@ static inline void dmar_writeq(void __iomem *addr, u64 val)
   * Extended Capability Register
   */
  
+#define ecap_pasid(e)          ((e >> 40) & 0x1)
  #define ecap_pss(e)            ((e >> 35) & 0x1f)
  #define ecap_eafs(e)           ((e >> 34) & 0x1)
  #define ecap_nwfs(e)           ((e >> 33) & 0x1)
  #define ecap_srs(e)            ((e >> 31) & 0x1)
  #define ecap_ers(e)            ((e >> 30) & 0x1)
  #define ecap_prs(e)            ((e >> 29) & 0x1)
-#define ecap_pasid(e)          ((e >> 28) & 0x1)
+/* PASID support used to be on bit 28 */
  #define ecap_dis(e)            ((e >> 27) & 0x1)
  #define ecap_nest(e)           ((e >> 26) & 0x1)
  #define ecap_mts(e)            ((e >> 25) & 0x1)
diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h

index 657fab4efab351070a9aaf7a625759159a786580..c27dde7215b5b291394747d35e1f4a19d9f1ac8e 100644 (file)
--- a/include/linux/io-mapping.h
+++ b/include/linux/io-mapping.h
@@ -141,6 +141,7 @@ static inline void __iomem *
  io_mapping_map_atomic_wc(struct io_mapping *mapping,
                          unsigned long offset)
  {
+       preempt_disable();
         pagefault_disable();
         return ((char __force __iomem *) mapping) + offset;
  }
@@ -149,6 +150,7 @@ static inline void
  io_mapping_unmap_atomic(void __iomem *vaddr)
  {
         pagefault_enable();
+       preempt_enable();
  }
  
  /* Non-atomic map/unmap */
diff --git a/include/linux/kernel.h b/include/linux/kernel.h

index 3a5b48e52a9ee3ce035a7a36a9cf35a2cc807bcb..060dd7b61c6d411bd1a8b9c48fa9f4010450e81d 100644 (file)
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -244,7 +244,8 @@ static inline u32 reciprocal_scale(u32 val, u32 ep_ro)
  
  #if defined(CONFIG_MMU) && \
         (defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP))
-void might_fault(void);
+#define might_fault() __might_fault(__FILE__, __LINE__)
+void __might_fault(const char *file, int line);
  #else
  static inline void might_fault(void) { }
  #endif
diff --git a/include/linux/lglock.h b/include/linux/lglock.h

index 0081f000e34b30f0ba0f6562274bb33a9ae0e74c..c92ebd100d9b653e25ed9accc4d1ac4569bea784 100644 (file)
--- a/include/linux/lglock.h
+++ b/include/linux/lglock.h
@@ -52,10 +52,15 @@ struct lglock {
         static struct lglock name = { .lock = &name ## _lock }
  
  void lg_lock_init(struct lglock *lg, char *name);
+
  void lg_local_lock(struct lglock *lg);
  void lg_local_unlock(struct lglock *lg);
  void lg_local_lock_cpu(struct lglock *lg, int cpu);
  void lg_local_unlock_cpu(struct lglock *lg, int cpu);
+
+void lg_double_lock(struct lglock *lg, int cpu1, int cpu2);
+void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2);
+
  void lg_global_lock(struct lglock *lg);
  void lg_global_unlock(struct lglock *lg);
  
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h

index 066ba4157541a94b0bfadf4b9fb2e245fc75ab8e..2722111591a398a8ad37e9001e1453d43a15cb1d 100644 (file)
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -130,8 +130,8 @@ enum bounce_type {
  };
  
  struct lock_class_stats {
-       unsigned long                   contention_point[4];
-       unsigned long                   contending_point[4];
+       unsigned long                   contention_point[LOCKSTAT_POINTS];
+       unsigned long                   contending_point[LOCKSTAT_POINTS];
         struct lock_time                read_waittime;
         struct lock_time                write_waittime;
         struct lock_time                read_holdtime;
diff --git a/include/linux/namei.h b/include/linux/namei.h

index c8990779f0c33b99e552ca9406621cde03f49443..d8c6334cd15005c16162f57959e20b2e09f99d03 100644 (file)
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -1,16 +1,15 @@
  #ifndef _LINUX_NAMEI_H
  #define _LINUX_NAMEI_H
  
-#include <linux/dcache.h>
-#include <linux/errno.h>
-#include <linux/linkage.h>
+#include <linux/kernel.h>
  #include <linux/path.h>
-
-struct vfsmount;
-struct nameidata;
+#include <linux/fcntl.h>
+#include <linux/errno.h>
  
  enum { MAX_NESTED_LINKS = 8 };
  
+#define MAXSYMLINKS 40
+
  /*
   * Type of the last component on LOOKUP_PARENT
   */
@@ -45,13 +44,29 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
  #define LOOKUP_ROOT            0x2000
  #define LOOKUP_EMPTY           0x4000
  
-extern int user_path_at(int, const char __user *, unsigned, struct path *);
  extern int user_path_at_empty(int, const char __user *, unsigned, struct path *, int *empty);
  
-#define user_path(name, path) user_path_at(AT_FDCWD, name, LOOKUP_FOLLOW, path)
-#define user_lpath(name, path) user_path_at(AT_FDCWD, name, 0, path)
-#define user_path_dir(name, path) \
-       user_path_at(AT_FDCWD, name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, path)
+/* Userspace-name lookups; all end in user_path_at_empty() with a NULL @empty. */
+static inline int user_path_at(int dfd, const char __user *name, unsigned flags,
+                struct path *path)
+{
+       return user_path_at_empty(dfd, name, flags, path, NULL);
+}
+
+static inline int user_path(const char __user *name, struct path *path)
+{
+       return user_path_at(AT_FDCWD, name, LOOKUP_FOLLOW, path);
+}
+
+static inline int user_lpath(const char __user *name, struct path *path)
+{
+       return user_path_at(AT_FDCWD, name, 0, path);
+}
+
+static inline int user_path_dir(const char __user *name, struct path *path)
+{
+       return user_path_at(AT_FDCWD, name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, path);
+}
  
  extern int kern_path(const char *, unsigned, struct path *);
  
@@ -70,9 +85,7 @@ extern int follow_up(struct path *);
  extern struct dentry *lock_rename(struct dentry *, struct dentry *);
  extern void unlock_rename(struct dentry *, struct dentry *);
  
-extern void nd_jump_link(struct nameidata *nd, struct path *path);
-extern void nd_set_link(struct nameidata *nd, char *path);
-extern char *nd_get_link(struct nameidata *nd);
+extern void nd_jump_link(struct path *path);
  
  static inline void nd_terminate_link(void *name, size_t len, size_t maxlen)
  {
diff --git a/include/linux/of.h b/include/linux/of.h

index ddeaae6d2083b256b21b930f3eed8182510e26a8..b871ff9d81d7207333fa021e6a95cb6bdbcf34ac 100644 (file)
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -121,6 +121,8 @@ extern struct device_node *of_stdout;
  extern raw_spinlock_t devtree_lock;
  
  #ifdef CONFIG_OF
+void of_core_init(void);
+
  static inline bool is_of_node(struct fwnode_handle *fwnode)
  {
         return fwnode && fwnode->type == FWNODE_OF;
@@ -376,6 +378,10 @@ bool of_console_check(struct device_node *dn, char *name, int index);
  
  #else /* CONFIG_OF */
  
+static inline void of_core_init(void)
+{
+}
+
  static inline bool is_of_node(struct fwnode_handle *fwnode)
  {
         return false;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h

index 61992cf2e9771699ee06595c8fbb1bd39633018a..a204d5266f5f0fc6ba6c175e9580d9b00c9621dc 100644 (file)
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -92,8 +92,6 @@ struct hw_perf_event_extra {
         int             idx;    /* index in shared_regs->regs[] */
  };
  
-struct event_constraint;
-
  /**
   * struct hw_perf_event - performance event hardware details:
   */
@@ -112,8 +110,6 @@ struct hw_perf_event {
  
                         struct hw_perf_event_extra extra_reg;
                         struct hw_perf_event_extra branch_reg;
-
-                       struct event_constraint *constraint;
                 };
                 struct { /* software */
                         struct hrtimer  hrtimer;
@@ -124,7 +120,7 @@ struct hw_perf_event {
                 };
                 struct { /* intel_cqm */
                         int                     cqm_state;
-                       int                     cqm_rmid;
+                       u32                     cqm_rmid;
                         struct list_head        cqm_events_entry;
                         struct list_head        cqm_groups_entry;
                         struct list_head        cqm_group_entry;
@@ -734,6 +730,22 @@ extern int perf_event_overflow(struct perf_event *event,
                                  struct perf_sample_data *data,
                                  struct pt_regs *regs);
  
+extern void perf_event_output(struct perf_event *event,
+                               struct perf_sample_data *data,
+                               struct pt_regs *regs);
+
+extern void
+perf_event_header__init_id(struct perf_event_header *header,
+                          struct perf_sample_data *data,
+                          struct perf_event *event);
+extern void
+perf_event__output_id_sample(struct perf_event *event,
+                            struct perf_output_handle *handle,
+                            struct perf_sample_data *sample);
+
+extern void
+perf_log_lost_samples(struct perf_event *event, u64 lost);
+
  static inline bool is_sampling_event(struct perf_event *event)
  {
         return event->attr.sample_period != 0;
@@ -798,11 +810,33 @@ perf_sw_event_sched(u32 event_id, u64 nr, u64 addr)
  
  extern struct static_key_deferred perf_sched_events;
  
+/* Tests the static key of the PERF_COUNT_SW_CPU_MIGRATIONS software event. */
+static __always_inline bool perf_sw_migrate_enabled(void)
+{
+       return static_key_false(&perf_swevent_enabled[PERF_COUNT_SW_CPU_MIGRATIONS]);
+}
+
+/* Sets @task->sched_migrated, but only while perf_sw_migrate_enabled() is true. */
+static inline void perf_event_task_migrate(struct task_struct *task)
+{
+       if (!perf_sw_migrate_enabled())
+               return;
+       task->sched_migrated = 1;
+}
+
  static inline void perf_event_task_sched_in(struct task_struct *prev,
                                             struct task_struct *task)
  {
         if (static_key_false(&perf_sched_events.key))
                 __perf_event_task_sched_in(prev, task);
+
+       if (perf_sw_migrate_enabled() && task->sched_migrated) {
+               struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]);
+
+               perf_fetch_caller_regs(regs);
+               ___perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, regs, 0);
+               task->sched_migrated = 0;
+       }
  }
  
  static inline void perf_event_task_sched_out(struct task_struct *prev,
@@ -925,6 +959,8 @@ perf_aux_output_skip(struct perf_output_handle *handle,
  static inline void *
  perf_get_aux(struct perf_output_handle *handle)                                { return NULL; }
  static inline void
+perf_event_task_migrate(struct task_struct *task)                      { }
+static inline void
  perf_event_task_sched_in(struct task_struct *prev,
                          struct task_struct *task)                      { }
  static inline void
diff --git a/include/linux/preempt.h b/include/linux/preempt.h

index de83b4eb164287db363328f87c0f8af216497a91..0f1534acaf60983da1cc84548659ccc1398d79e9 100644 (file)
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -10,13 +10,117 @@
  #include <linux/list.h>
  
  /*
- * We use the MSB mostly because its available; see <linux/preempt_mask.h> for
- * the other bits -- can't include that header due to inclusion hell.
+ * We put the hardirq and softirq counter into the preemption
+ * counter. The bitmask has the following meaning:
+ *
+ * - bits 0-7 are the preemption count (max preemption depth: 256)
+ * - bits 8-15 are the softirq count (max # of softirqs: 256)
+ *
+ * The hardirq count could in theory be the same as the number of
+ * interrupts in the system, but we run all interrupt handlers with
+ * interrupts disabled, so we cannot have nesting interrupts. Though
+ * there are a few palaeontologic drivers which reenable interrupts in
+ * the handler, so we need more than one bit here.
+ *
+ *         PREEMPT_MASK:       0x000000ff
+ *         SOFTIRQ_MASK:       0x0000ff00
+ *         HARDIRQ_MASK:       0x000f0000
+ *             NMI_MASK:       0x00100000
+ *       PREEMPT_ACTIVE:       0x00200000
+ * PREEMPT_NEED_RESCHED:       0x80000000
   */
+#define PREEMPT_BITS   8
+#define SOFTIRQ_BITS   8
+#define HARDIRQ_BITS   4
+#define NMI_BITS       1
+
+#define PREEMPT_SHIFT  0
+#define SOFTIRQ_SHIFT  (PREEMPT_SHIFT + PREEMPT_BITS)
+#define HARDIRQ_SHIFT  (SOFTIRQ_SHIFT + SOFTIRQ_BITS)
+#define NMI_SHIFT      (HARDIRQ_SHIFT + HARDIRQ_BITS)
+
+#define __IRQ_MASK(x)  ((1UL << (x))-1)
+
+#define PREEMPT_MASK   (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
+#define SOFTIRQ_MASK   (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
+#define HARDIRQ_MASK   (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
+#define NMI_MASK       (__IRQ_MASK(NMI_BITS)     << NMI_SHIFT)
+
+#define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT)
+#define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT)
+#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
+#define NMI_OFFSET     (1UL << NMI_SHIFT)
+
+#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
+
+#define PREEMPT_ACTIVE_BITS    1
+#define PREEMPT_ACTIVE_SHIFT   (NMI_SHIFT + NMI_BITS)
+#define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT)
+
+/* We use the MSB mostly because it's available */
  #define PREEMPT_NEED_RESCHED   0x80000000
  
+/* preempt_count() and related functions depend on PREEMPT_NEED_RESCHED */
  #include <asm/preempt.h>
  
+#define hardirq_count()        (preempt_count() & HARDIRQ_MASK)
+#define softirq_count()        (preempt_count() & SOFTIRQ_MASK)
+#define irq_count()    (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
+                                | NMI_MASK))
+
+/*
+ * Are we doing bottom half or hardware interrupt processing?
+ * Are we in a softirq context? Interrupt context?
+ * in_softirq - Are we currently processing softirq or have bh disabled?
+ * in_serving_softirq - Are we currently processing softirq?
+ */
+#define in_irq()               (hardirq_count())
+#define in_softirq()           (softirq_count())
+#define in_interrupt()         (irq_count())
+#define in_serving_softirq()   (softirq_count() & SOFTIRQ_OFFSET)
+
+/*
+ * Are we in NMI context?
+ */
+#define in_nmi()       (preempt_count() & NMI_MASK)
+
+#if defined(CONFIG_PREEMPT_COUNT)
+# define PREEMPT_DISABLE_OFFSET 1
+#else
+# define PREEMPT_DISABLE_OFFSET 0
+#endif
+
+/*
+ * The preempt_count offset needed for things like:
+ *
+ *  spin_lock_bh()
+ *
+ * Which need to disable both preemption (CONFIG_PREEMPT_COUNT) and
+ * softirqs, such that unlock sequences of:
+ *
+ *  spin_unlock();
+ *  local_bh_enable();
+ *
+ * Work as expected.
+ */
+#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_DISABLE_OFFSET)
+
+/*
+ * Are we running in atomic context?  WARNING: this macro cannot
+ * always detect atomic context; in particular, it cannot know about
+ * held spinlocks in non-preemptible kernels.  Thus it should not be
+ * used in the general case to determine whether sleeping is possible.
+ * Do not use in_atomic() in driver code.
+ */
+#define in_atomic()    (preempt_count() != 0)
+
+/*
+ * Check whether we were atomic before we did preempt_disable():
+ * (used by the scheduler)
+ */
+#define in_atomic_preempt_off() \
+               ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_DISABLE_OFFSET)
+
  #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER)
  extern void preempt_count_add(int val);
  extern void preempt_count_sub(int val);
@@ -33,6 +137,18 @@ extern void preempt_count_sub(int val);
  #define preempt_count_inc() preempt_count_add(1)
  #define preempt_count_dec() preempt_count_sub(1)
  
+#define preempt_active_enter() \
+do { \
+       preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \
+       barrier(); \
+} while (0)
+
+#define preempt_active_exit() \
+do { \
+       barrier(); \
+       preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \
+} while (0)
+
  #ifdef CONFIG_PREEMPT_COUNT
  
  #define preempt_disable() \
@@ -49,6 +165,8 @@ do { \
  
  #define preempt_enable_no_resched() sched_preempt_enable_no_resched()
  
+#define preemptible()  (preempt_count() == 0 && !irqs_disabled())
+
  #ifdef CONFIG_PREEMPT
  #define preempt_enable() \
  do { \
@@ -57,52 +175,46 @@ do { \
                 __preempt_schedule(); \
  } while (0)
  
+#define preempt_enable_notrace() \
+do { \
+       barrier(); \
+       if (unlikely(__preempt_count_dec_and_test())) \
+               __preempt_schedule_notrace(); \
+} while (0)
+
  #define preempt_check_resched() \
  do { \
         if (should_resched()) \
                 __preempt_schedule(); \
  } while (0)
  
-#else
+#else /* !CONFIG_PREEMPT */
  #define preempt_enable() \
  do { \
         barrier(); \
         preempt_count_dec(); \
  } while (0)
-#define preempt_check_resched() do { } while (0)
-#endif
-
-#define preempt_disable_notrace() \
-do { \
-       __preempt_count_inc(); \
-       barrier(); \
-} while (0)
  
-#define preempt_enable_no_resched_notrace() \
+#define preempt_enable_notrace() \
  do { \
         barrier(); \
         __preempt_count_dec(); \
  } while (0)
  
-#ifdef CONFIG_PREEMPT
-
-#ifndef CONFIG_CONTEXT_TRACKING
-#define __preempt_schedule_context() __preempt_schedule()
-#endif
+#define preempt_check_resched() do { } while (0)
+#endif /* CONFIG_PREEMPT */
  
-#define preempt_enable_notrace() \
+#define preempt_disable_notrace() \
  do { \
+       __preempt_count_inc(); \
         barrier(); \
-       if (unlikely(__preempt_count_dec_and_test())) \
-               __preempt_schedule_context(); \
  } while (0)
-#else
-#define preempt_enable_notrace() \
+
+#define preempt_enable_no_resched_notrace() \
  do { \
         barrier(); \
         __preempt_count_dec(); \
  } while (0)
-#endif
  
  #else /* !CONFIG_PREEMPT_COUNT */
  
@@ -121,6 +233,7 @@ do { \
  #define preempt_disable_notrace()              barrier()
  #define preempt_enable_no_resched_notrace()    barrier()
  #define preempt_enable_notrace()               barrier()
+#define preemptible()                          0
  
  #endif /* CONFIG_PREEMPT_COUNT */
  
diff --git a/include/linux/preempt_mask.h b/include/linux/preempt_mask.h

deleted file mode 100644 (file)

index dbeec4d..0000000
--- a/include/linux/preempt_mask.h
+++ /dev/null
@@ -1,117 +0,0 @@
-#ifndef LINUX_PREEMPT_MASK_H
-#define LINUX_PREEMPT_MASK_H
-
-#include <linux/preempt.h>
-
-/*
- * We put the hardirq and softirq counter into the preemption
- * counter. The bitmask has the following meaning:
- *
- * - bits 0-7 are the preemption count (max preemption depth: 256)
- * - bits 8-15 are the softirq count (max # of softirqs: 256)
- *
- * The hardirq count could in theory be the same as the number of
- * interrupts in the system, but we run all interrupt handlers with
- * interrupts disabled, so we cannot have nesting interrupts. Though
- * there are a few palaeontologic drivers which reenable interrupts in
- * the handler, so we need more than one bit here.
- *
- * PREEMPT_MASK:       0x000000ff
- * SOFTIRQ_MASK:       0x0000ff00
- * HARDIRQ_MASK:       0x000f0000
- *     NMI_MASK:       0x00100000
- * PREEMPT_ACTIVE:     0x00200000
- */
-#define PREEMPT_BITS   8
-#define SOFTIRQ_BITS   8
-#define HARDIRQ_BITS   4
-#define NMI_BITS       1
-
-#define PREEMPT_SHIFT  0
-#define SOFTIRQ_SHIFT  (PREEMPT_SHIFT + PREEMPT_BITS)
-#define HARDIRQ_SHIFT  (SOFTIRQ_SHIFT + SOFTIRQ_BITS)
-#define NMI_SHIFT      (HARDIRQ_SHIFT + HARDIRQ_BITS)
-
-#define __IRQ_MASK(x)  ((1UL << (x))-1)
-
-#define PREEMPT_MASK   (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
-#define SOFTIRQ_MASK   (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
-#define HARDIRQ_MASK   (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
-#define NMI_MASK       (__IRQ_MASK(NMI_BITS)     << NMI_SHIFT)
-
-#define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT)
-#define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT)
-#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
-#define NMI_OFFSET     (1UL << NMI_SHIFT)
-
-#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
-
-#define PREEMPT_ACTIVE_BITS    1
-#define PREEMPT_ACTIVE_SHIFT   (NMI_SHIFT + NMI_BITS)
-#define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT)
-
-#define hardirq_count()        (preempt_count() & HARDIRQ_MASK)
-#define softirq_count()        (preempt_count() & SOFTIRQ_MASK)
-#define irq_count()    (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
-                                | NMI_MASK))
-
-/*
- * Are we doing bottom half or hardware interrupt processing?
- * Are we in a softirq context? Interrupt context?
- * in_softirq - Are we currently processing softirq or have bh disabled?
- * in_serving_softirq - Are we currently processing softirq?
- */
-#define in_irq()               (hardirq_count())
-#define in_softirq()           (softirq_count())
-#define in_interrupt()         (irq_count())
-#define in_serving_softirq()   (softirq_count() & SOFTIRQ_OFFSET)
-
-/*
- * Are we in NMI context?
- */
-#define in_nmi()       (preempt_count() & NMI_MASK)
-
-#if defined(CONFIG_PREEMPT_COUNT)
-# define PREEMPT_CHECK_OFFSET 1
-#else
-# define PREEMPT_CHECK_OFFSET 0
-#endif
-
-/*
- * The preempt_count offset needed for things like:
- *
- *  spin_lock_bh()
- *
- * Which need to disable both preemption (CONFIG_PREEMPT_COUNT) and
- * softirqs, such that unlock sequences of:
- *
- *  spin_unlock();
- *  local_bh_enable();
- *
- * Work as expected.
- */
-#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_CHECK_OFFSET)
-
-/*
- * Are we running in atomic context?  WARNING: this macro cannot
- * always detect atomic context; in particular, it cannot know about
- * held spinlocks in non-preemptible kernels.  Thus it should not be
- * used in the general case to determine whether sleeping is possible.
- * Do not use in_atomic() in driver code.
- */
-#define in_atomic()    ((preempt_count() & ~PREEMPT_ACTIVE) != 0)
-
-/*
- * Check whether we were atomic before we did preempt_disable():
- * (used by the scheduler, *after* releasing the kernel lock)
- */
-#define in_atomic_preempt_off() \
-               ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET)
-
-#ifdef CONFIG_PREEMPT_COUNT
-# define preemptible() (preempt_count() == 0 && !irqs_disabled())
-#else
-# define preemptible() 0
-#endif
-
-#endif /* LINUX_PREEMPT_MASK_H */
diff --git a/include/linux/rculist.h b/include/linux/rculist.h

index a18b16f1dc0e44f7f5a3b99ea4f43d64c67b8a0c..17c6b1f84a77d3b3073bc272fffc6a6138b322b5 100644 (file)
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -29,8 +29,8 @@
   */
  static inline void INIT_LIST_HEAD_RCU(struct list_head *list)
  {
-       ACCESS_ONCE(list->next) = list;
-       ACCESS_ONCE(list->prev) = list;
+       WRITE_ONCE(list->next, list);
+       WRITE_ONCE(list->prev, list);
  }
  
  /*
@@ -288,7 +288,7 @@ static inline void list_splice_init_rcu(struct list_head *list,
  #define list_first_or_null_rcu(ptr, type, member) \
  ({ \
         struct list_head *__ptr = (ptr); \
-       struct list_head *__next = ACCESS_ONCE(__ptr->next); \
+       struct list_head *__next = READ_ONCE(__ptr->next); \
         likely(__ptr != __next) ? list_entry_rcu(__next, type, member) : NULL; \
  })
  
@@ -549,8 +549,8 @@ static inline void hlist_add_behind_rcu(struct hlist_node *n,
   */
  #define hlist_for_each_entry_from_rcu(pos, member)                     \
         for (; pos;                                                     \
-            pos = hlist_entry_safe(rcu_dereference((pos)->member.next),\
-                       typeof(*(pos)), member))
+            pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \
+                       &(pos)->member)), typeof(*(pos)), member))
  
  #endif /* __KERNEL__ */
  #endif
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h

index 573a5afd5ed884d5bdcfc4af6cf88c3b9d25214d..03a899aabd1762c74bb86b4b807dbf29651fad42 100644 (file)
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -292,10 +292,6 @@ void rcu_sched_qs(void);
  void rcu_bh_qs(void);
  void rcu_check_callbacks(int user);
  struct notifier_block;
-void rcu_idle_enter(void);
-void rcu_idle_exit(void);
-void rcu_irq_enter(void);
-void rcu_irq_exit(void);
  int rcu_cpu_notify(struct notifier_block *self,
                    unsigned long action, void *hcpu);
  
@@ -364,8 +360,8 @@ extern struct srcu_struct tasks_rcu_exit_srcu;
  #define rcu_note_voluntary_context_switch(t) \
         do { \
                 rcu_all_qs(); \
-               if (ACCESS_ONCE((t)->rcu_tasks_holdout)) \
-                       ACCESS_ONCE((t)->rcu_tasks_holdout) = false; \
+               if (READ_ONCE((t)->rcu_tasks_holdout)) \
+                       WRITE_ONCE((t)->rcu_tasks_holdout, false); \
         } while (0)
  #else /* #ifdef CONFIG_TASKS_RCU */
  #define TASKS_RCU(x) do { } while (0)
@@ -609,7 +605,7 @@ static inline void rcu_preempt_sleep_check(void)
  
  #define __rcu_access_pointer(p, space) \
  ({ \
-       typeof(*p) *_________p1 = (typeof(*p) *__force)ACCESS_ONCE(p); \
+       typeof(*p) *_________p1 = (typeof(*p) *__force)READ_ONCE(p); \
         rcu_dereference_sparse(p, space); \
         ((typeof(*p) __force __kernel *)(_________p1)); \
  })
@@ -628,21 +624,6 @@ static inline void rcu_preempt_sleep_check(void)
         ((typeof(*p) __force __kernel *)(p)); \
  })
  
-#define __rcu_access_index(p, space) \
-({ \
-       typeof(p) _________p1 = ACCESS_ONCE(p); \
-       rcu_dereference_sparse(p, space); \
-       (_________p1); \
-})
-#define __rcu_dereference_index_check(p, c) \
-({ \
-       /* Dependency order vs. p above. */ \
-       typeof(p) _________p1 = lockless_dereference(p); \
-       rcu_lockdep_assert(c, \
-                          "suspicious rcu_dereference_index_check() usage"); \
-       (_________p1); \
-})
-
  /**
   * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
   * @v: The value to statically initialize with.
@@ -659,7 +640,7 @@ static inline void rcu_preempt_sleep_check(void)
   */
  #define lockless_dereference(p) \
  ({ \
-       typeof(p) _________p1 = ACCESS_ONCE(p); \
+       typeof(p) _________p1 = READ_ONCE(p); \
         smp_read_barrier_depends(); /* Dependency order vs. p above. */ \
         (_________p1); \
  })
@@ -702,7 +683,7 @@ static inline void rcu_preempt_sleep_check(void)
   * @p: The pointer to read
   *
   * Return the value of the specified RCU-protected pointer, but omit the
- * smp_read_barrier_depends() and keep the ACCESS_ONCE().  This is useful
+ * smp_read_barrier_depends() and keep the READ_ONCE().  This is useful
   * when the value of this pointer is accessed, but the pointer is not
   * dereferenced, for example, when testing an RCU-protected pointer against
   * NULL.  Although rcu_access_pointer() may also be used in cases where
@@ -786,48 +767,13 @@ static inline void rcu_preempt_sleep_check(void)
   */
  #define rcu_dereference_raw_notrace(p) __rcu_dereference_check((p), 1, __rcu)
  
-/**
- * rcu_access_index() - fetch RCU index with no dereferencing
- * @p: The index to read
- *
- * Return the value of the specified RCU-protected index, but omit the
- * smp_read_barrier_depends() and keep the ACCESS_ONCE().  This is useful
- * when the value of this index is accessed, but the index is not
- * dereferenced, for example, when testing an RCU-protected index against
- * -1.  Although rcu_access_index() may also be used in cases where
- * update-side locks prevent the value of the index from changing, you
- * should instead use rcu_dereference_index_protected() for this use case.
- */
-#define rcu_access_index(p) __rcu_access_index((p), __rcu)
-
-/**
- * rcu_dereference_index_check() - rcu_dereference for indices with debug checking
- * @p: The pointer to read, prior to dereferencing
- * @c: The conditions under which the dereference will take place
- *
- * Similar to rcu_dereference_check(), but omits the sparse checking.
- * This allows rcu_dereference_index_check() to be used on integers,
- * which can then be used as array indices.  Attempting to use
- * rcu_dereference_check() on an integer will give compiler warnings
- * because the sparse address-space mechanism relies on dereferencing
- * the RCU-protected pointer.  Dereferencing integers is not something
- * that even gcc will put up with.
- *
- * Note that this function does not implicitly check for RCU read-side
- * critical sections.  If this function gains lots of uses, it might
- * make sense to provide versions for each flavor of RCU, but it does
- * not make sense as of early 2010.
- */
-#define rcu_dereference_index_check(p, c) \
-       __rcu_dereference_index_check((p), (c))
-
  /**
   * rcu_dereference_protected() - fetch RCU pointer when updates prevented
   * @p: The pointer to read, prior to dereferencing
   * @c: The conditions under which the dereference will take place
   *
   * Return the value of the specified RCU-protected pointer, but omit
- * both the smp_read_barrier_depends() and the ACCESS_ONCE().  This
+ * both the smp_read_barrier_depends() and the READ_ONCE().  This
   * is useful in cases where update-side locks prevent the value of the
   * pointer from changing.  Please note that this primitive does -not-
   * prevent the compiler from repeating this reference or combining it
@@ -1153,13 +1099,13 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
  #define kfree_rcu(ptr, rcu_head)                                       \
         __kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head))
  
-#if defined(CONFIG_TINY_RCU) || defined(CONFIG_RCU_NOCB_CPU_ALL)
+#ifdef CONFIG_TINY_RCU
  static inline int rcu_needs_cpu(unsigned long *delta_jiffies)
  {
         *delta_jiffies = ULONG_MAX;
         return 0;
  }
-#endif /* #if defined(CONFIG_TINY_RCU) || defined(CONFIG_RCU_NOCB_CPU_ALL) */
+#endif /* #ifdef CONFIG_TINY_RCU */
  
  #if defined(CONFIG_RCU_NOCB_CPU_ALL)
  static inline bool rcu_is_nocb_cpu(int cpu) { return true; }
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h

index 937edaeb150deb17759a9c0c715630fd0cc9a729..3df6c1ec4e25503583cb14656474d6727495a530 100644 (file)
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -159,6 +159,22 @@ static inline void rcu_cpu_stall_reset(void)
  {
  }
  
+static inline void rcu_idle_enter(void)
+{
+}
+
+static inline void rcu_idle_exit(void)
+{
+}
+
+static inline void rcu_irq_enter(void)
+{
+}
+
+static inline void rcu_irq_exit(void)
+{
+}
+
  static inline void exit_rcu(void)
  {
  }
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h

index d2e583a6aacacf09ee9dc3bf3646b6a3cff3494e..3fa4a43ab4150b0b8c956ebacc5de47dace12462 100644 (file)
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -31,9 +31,7 @@
  #define __LINUX_RCUTREE_H
  
  void rcu_note_context_switch(void);
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
  int rcu_needs_cpu(unsigned long *delta_jiffies);
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
  void rcu_cpu_stall_reset(void);
  
  /*
@@ -93,6 +91,11 @@ void rcu_force_quiescent_state(void);
  void rcu_bh_force_quiescent_state(void);
  void rcu_sched_force_quiescent_state(void);
  
+void rcu_idle_enter(void);
+void rcu_idle_exit(void);
+void rcu_irq_enter(void);
+void rcu_irq_exit(void);
+
  void exit_rcu(void);
  
  void rcu_scheduler_starting(void);
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 18f197223ebda308f25243933f1a9f68b3484eac..d4193d5613cf594108390e5d953451fd73087de5 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -25,7 +25,7 @@ struct sched_param {
  #include <linux/errno.h>
  #include <linux/nodemask.h>
  #include <linux/mm_types.h>
-#include <linux/preempt_mask.h>
+#include <linux/preempt.h>
  
  #include <asm/page.h>
  #include <asm/ptrace.h>
@@ -132,6 +132,7 @@ struct fs_struct;
  struct perf_event_context;
  struct blk_plug;
  struct filename;
+struct nameidata;
  
  #define VMACACHE_BITS 2
  #define VMACACHE_SIZE (1U << VMACACHE_BITS)
@@ -173,7 +174,12 @@ extern unsigned long nr_iowait_cpu(int cpu);
  extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
  
  extern void calc_global_load(unsigned long ticks);
+
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
  extern void update_cpu_load_nohz(void);
+#else
+static inline void update_cpu_load_nohz(void) { }
+#endif
  
  extern unsigned long get_parent_ip(unsigned long addr);
  
@@ -213,9 +219,10 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
  #define TASK_WAKEKILL          128
  #define TASK_WAKING            256
  #define TASK_PARKED            512
-#define TASK_STATE_MAX         1024
+#define TASK_NOLOAD            1024
+#define TASK_STATE_MAX         2048
  
-#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWP"
+#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPN"
  
  extern char ___assert_task_state[1 - 2*!!(
                 sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
@@ -225,6 +232,8 @@ extern char ___assert_task_state[1 - 2*!!(
  #define TASK_STOPPED           (TASK_WAKEKILL | __TASK_STOPPED)
  #define TASK_TRACED            (TASK_WAKEKILL | __TASK_TRACED)
  
+#define TASK_IDLE              (TASK_UNINTERRUPTIBLE | TASK_NOLOAD)
+
  /* Convenience macros for the sake of wake_up */
  #define TASK_NORMAL            (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
  #define TASK_ALL               (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
@@ -240,7 +249,8 @@ extern char ___assert_task_state[1 - 2*!!(
                         ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
  #define task_contributes_to_load(task) \
                                 ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
-                                (task->flags & PF_FROZEN) == 0)
+                                (task->flags & PF_FROZEN) == 0 && \
+                                (task->state & TASK_NOLOAD) == 0)
  
  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
  
@@ -567,6 +577,23 @@ struct task_cputime {
                 .sum_exec_runtime = 0,                          \
         }
  
+/*
+ * This is the atomic variant of task_cputime, which can be used for
+ * storing and updating task_cputime statistics without locking.
+ */
+struct task_cputime_atomic {
+       atomic64_t utime;
+       atomic64_t stime;
+       atomic64_t sum_exec_runtime;
+};
+
+#define INIT_CPUTIME_ATOMIC \
+       (struct task_cputime_atomic) {                          \
+               .utime = ATOMIC64_INIT(0),                      \
+               .stime = ATOMIC64_INIT(0),                      \
+               .sum_exec_runtime = ATOMIC64_INIT(0),           \
+       }
+
  #ifdef CONFIG_PREEMPT_COUNT
  #define PREEMPT_DISABLED       (1 + PREEMPT_ENABLED)
  #else
@@ -584,18 +611,16 @@ struct task_cputime {
  
  /**
   * struct thread_group_cputimer - thread group interval timer counts
- * @cputime:           thread group interval timers.
+ * @cputime_atomic:    atomic thread group interval timers.
   * @running:           non-zero when there are timers running and
   *                     @cputime receives updates.
- * @lock:              lock for fields in this struct.
   *
   * This structure contains the version of task_cputime, above, that is
   * used for thread group CPU timer calculations.
   */
  struct thread_group_cputimer {
-       struct task_cputime cputime;
+       struct task_cputime_atomic cputime_atomic;
         int running;
-       raw_spinlock_t lock;
  };
  
  #include <linux/rwsem.h>
@@ -899,6 +924,50 @@ enum cpu_idle_type {
  #define SCHED_CAPACITY_SHIFT   10
  #define SCHED_CAPACITY_SCALE   (1L << SCHED_CAPACITY_SHIFT)
  
+/*
+ * Wake-queues are lists of tasks with a pending wakeup, whose
+ * callers have already marked the task as woken internally,
+ * and can thus carry on. A common use case is being able to
+ * do the wakeups once the corresponding user lock has been
+ * released.
+ *
+ * We hold reference to each task in the list across the wakeup,
+ * thus guaranteeing that the memory is still valid by the time
+ * the actual wakeups are performed in wake_up_q().
+ *
+ * One per task suffices, because there's never a need for a task to be
+ * in two wake queues simultaneously; it is forbidden to abandon a task
+ * in a wake queue (a call to wake_up_q() _must_ follow), so if a task is
+ * already in a wake queue, the wakeup will happen soon and the second
+ * waker can just skip it.
+ *
+ * The WAKE_Q macro declares and initializes the list head.
+ * wake_up_q() does NOT reinitialize the list; it's expected to be
+ * called near the end of a function, where the fact that the queue is
+ * not used again will be easy to see by inspection.
+ *
+ * Note that this can cause spurious wakeups. schedule() callers
+ * must ensure the call is done inside a loop, confirming that the
+ * wakeup condition has in fact occurred.
+ */
+struct wake_q_node {
+       struct wake_q_node *next;
+};
+
+struct wake_q_head {
+       struct wake_q_node *first;
+       struct wake_q_node **lastp;
+};
+
+#define WAKE_Q_TAIL ((struct wake_q_node *) 0x01)
+
+#define WAKE_Q(name)                                   \
+       struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
+
+extern void wake_q_add(struct wake_q_head *head,
+                      struct task_struct *task);
+extern void wake_up_q(struct wake_q_head *head);
+
  /*
   * sched-domains (multiprocessor balancing) declarations:
   */
@@ -1334,8 +1403,6 @@ struct task_struct {
         int rcu_read_lock_nesting;
         union rcu_special rcu_read_unlock_special;
         struct list_head rcu_node_entry;
-#endif /* #ifdef CONFIG_PREEMPT_RCU */
-#ifdef CONFIG_PREEMPT_RCU
         struct rcu_node *rcu_blocked_node;
  #endif /* #ifdef CONFIG_PREEMPT_RCU */
  #ifdef CONFIG_TASKS_RCU
@@ -1356,9 +1423,6 @@ struct task_struct {
  #endif
  
         struct mm_struct *mm, *active_mm;
-#ifdef CONFIG_COMPAT_BRK
-       unsigned brk_randomized:1;
-#endif
         /* per-thread vma caching */
         u32 vmacache_seqnum;
         struct vm_area_struct *vmacache[VMACACHE_SIZE];
@@ -1369,7 +1433,7 @@ struct task_struct {
         int exit_state;
         int exit_code, exit_signal;
         int pdeath_signal;  /*  The signal sent when the parent dies  */
-       unsigned int jobctl;    /* JOBCTL_*, siglock protected */
+       unsigned long jobctl;   /* JOBCTL_*, siglock protected */
  
         /* Used for emulating ABI behavior of previous Linux versions */
         unsigned int personality;
@@ -1381,10 +1445,14 @@ struct task_struct {
         /* Revert to default priority/policy when forking */
         unsigned sched_reset_on_fork:1;
         unsigned sched_contributes_to_load:1;
+       unsigned sched_migrated:1;
  
  #ifdef CONFIG_MEMCG_KMEM
         unsigned memcg_kmem_skip_account:1;
  #endif
+#ifdef CONFIG_COMPAT_BRK
+       unsigned brk_randomized:1;
+#endif
  
         unsigned long atomic_flags; /* Flags needing atomic access. */
  
@@ -1461,7 +1529,7 @@ struct task_struct {
                                        it with task_lock())
                                      - initialized normally by setup_new_exec */
  /* file system info */
-       int link_count, total_link_count;
+       struct nameidata *nameidata;
  #ifdef CONFIG_SYSVIPC
  /* ipc stuff */
         struct sysv_sem sysvsem;
@@ -1511,6 +1579,8 @@ struct task_struct {
         /* Protection of the PI data structures: */
         raw_spinlock_t pi_lock;
  
+       struct wake_q_node wake_q;
+
  #ifdef CONFIG_RT_MUTEXES
         /* PI waiters blocked on a rt_mutex held by this task */
         struct rb_root pi_waiters;
@@ -1724,6 +1794,7 @@ struct task_struct {
  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
         unsigned long   task_state_change;
  #endif
+       int pagefault_disabled;
  };
  
  /* Future-safe accessor for struct task_struct's cpus_allowed. */
@@ -2077,22 +2148,22 @@ TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab)
  #define JOBCTL_TRAPPING_BIT    21      /* switching to TRACED */
  #define JOBCTL_LISTENING_BIT   22      /* ptracer is listening for events */
  
-#define JOBCTL_STOP_DEQUEUED   (1 << JOBCTL_STOP_DEQUEUED_BIT)
-#define JOBCTL_STOP_PENDING    (1 << JOBCTL_STOP_PENDING_BIT)
-#define JOBCTL_STOP_CONSUME    (1 << JOBCTL_STOP_CONSUME_BIT)
-#define JOBCTL_TRAP_STOP       (1 << JOBCTL_TRAP_STOP_BIT)
-#define JOBCTL_TRAP_NOTIFY     (1 << JOBCTL_TRAP_NOTIFY_BIT)
-#define JOBCTL_TRAPPING                (1 << JOBCTL_TRAPPING_BIT)
-#define JOBCTL_LISTENING       (1 << JOBCTL_LISTENING_BIT)
+#define JOBCTL_STOP_DEQUEUED   (1UL << JOBCTL_STOP_DEQUEUED_BIT)
+#define JOBCTL_STOP_PENDING    (1UL << JOBCTL_STOP_PENDING_BIT)
+#define JOBCTL_STOP_CONSUME    (1UL << JOBCTL_STOP_CONSUME_BIT)
+#define JOBCTL_TRAP_STOP       (1UL << JOBCTL_TRAP_STOP_BIT)
+#define JOBCTL_TRAP_NOTIFY     (1UL << JOBCTL_TRAP_NOTIFY_BIT)
+#define JOBCTL_TRAPPING                (1UL << JOBCTL_TRAPPING_BIT)
+#define JOBCTL_LISTENING       (1UL << JOBCTL_LISTENING_BIT)
  
  #define JOBCTL_TRAP_MASK       (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY)
  #define JOBCTL_PENDING_MASK    (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)
  
  extern bool task_set_jobctl_pending(struct task_struct *task,
-                                   unsigned int mask);
+                                   unsigned long mask);
  extern void task_clear_jobctl_trapping(struct task_struct *task);
  extern void task_clear_jobctl_pending(struct task_struct *task,
-                                     unsigned int mask);
+                                     unsigned long mask);
  
  static inline void rcu_copy_process(struct task_struct *p)
  {
@@ -2962,11 +3033,6 @@ static __always_inline bool need_resched(void)
  void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
  void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
  
-static inline void thread_group_cputime_init(struct signal_struct *sig)
-{
-       raw_spin_lock_init(&sig->cputimer.lock);
-}
-
  /*
   * Reevaluate whether the task has signals pending delivery.
   * Wake the task if so.
@@ -3080,13 +3146,13 @@ static inline void mm_update_next_owner(struct mm_struct *mm)
  static inline unsigned long task_rlimit(const struct task_struct *tsk,
                 unsigned int limit)
  {
-       return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_cur);
+       return READ_ONCE(tsk->signal->rlim[limit].rlim_cur);
  }
  
  static inline unsigned long task_rlimit_max(const struct task_struct *tsk,
                 unsigned int limit)
  {
-       return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_max);
+       return READ_ONCE(tsk->signal->rlim[limit].rlim_max);
  }
  
  static inline unsigned long rlimit(unsigned int limit)
diff --git a/include/linux/security.h b/include/linux/security.h

index 18264ea9e314153488f9726b530993658c4cea25..52febde524794f5b0201ceba920c4c695edcf8e3 100644 (file)
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -43,7 +43,6 @@ struct file;
  struct vfsmount;
  struct path;
  struct qstr;
-struct nameidata;
  struct iattr;
  struct fown_struct;
  struct file_operations;
@@ -477,7 +476,8 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
   * @inode_follow_link:
   *     Check permission to follow a symbolic link when looking up a pathname.
   *     @dentry contains the dentry structure for the link.
- *     @nd contains the nameidata structure for the parent directory.
+ *     @inode contains the inode, which itself is not stable in RCU-walk.
+ *     @rcu indicates whether we are in RCU-walk mode.
   *     Return 0 if permission is granted.
   * @inode_permission:
   *     Check permission before accessing an inode.  This hook is called by the
@@ -1553,7 +1553,8 @@ struct security_operations {
         int (*inode_rename) (struct inode *old_dir, struct dentry *old_dentry,
                              struct inode *new_dir, struct dentry *new_dentry);
         int (*inode_readlink) (struct dentry *dentry);
-       int (*inode_follow_link) (struct dentry *dentry, struct nameidata *nd);
+       int (*inode_follow_link) (struct dentry *dentry, struct inode *inode,
+                                 bool rcu);
         int (*inode_permission) (struct inode *inode, int mask);
         int (*inode_setattr)    (struct dentry *dentry, struct iattr *attr);
         int (*inode_getattr) (const struct path *path);
@@ -1839,7 +1840,8 @@ int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry,
                           struct inode *new_dir, struct dentry *new_dentry,
                           unsigned int flags);
  int security_inode_readlink(struct dentry *dentry);
-int security_inode_follow_link(struct dentry *dentry, struct nameidata *nd);
+int security_inode_follow_link(struct dentry *dentry, struct inode *inode,
+                              bool rcu);
  int security_inode_permission(struct inode *inode, int mask);
  int security_inode_setattr(struct dentry *dentry, struct iattr *attr);
  int security_inode_getattr(const struct path *path);
@@ -2242,7 +2244,8 @@ static inline int security_inode_readlink(struct dentry *dentry)
  }
  
  static inline int security_inode_follow_link(struct dentry *dentry,
-                                             struct nameidata *nd)
+                                            struct inode *inode,
+                                            bool rcu)
  {
         return 0;
  }
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h

index 3e18379dfa6f349ba48edfc6615232af41ccfb99..0063b24b4f36df594b3587daadfdaf8849192c7d 100644 (file)
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -120,7 +120,7 @@ do {                                                                \
  /*
   * Despite its name it doesn't necessarily has to be a full barrier.
   * It should only guarantee that a STORE before the critical section
- * can not be reordered with a LOAD inside this section.
+ * can not be reordered with LOADs and STOREs inside this section.
   * spin_lock() is the one-way barrier, this LOAD can not escape out
   * of the region. So the default implementation simply ensures that
   * a STORE can not move into the critical section, smp_wmb() should
diff --git a/include/linux/topology.h b/include/linux/topology.h

index 909b6e43b6942c2a7372314627e19d1bd0f72c7c..73ddad1e0fa3435ffcd567dbbcfeb01fe296a014 100644 (file)
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -191,8 +191,8 @@ static inline int cpu_to_mem(int cpu)
  #ifndef topology_core_id
  #define topology_core_id(cpu)                  ((void)(cpu), 0)
  #endif
-#ifndef topology_thread_cpumask
-#define topology_thread_cpumask(cpu)           cpumask_of(cpu)
+#ifndef topology_sibling_cpumask
+#define topology_sibling_cpumask(cpu)          cpumask_of(cpu)
  #endif
  #ifndef topology_core_cpumask
  #define topology_core_cpumask(cpu)             cpumask_of(cpu)
@@ -201,7 +201,7 @@ static inline int cpu_to_mem(int cpu)
  #ifdef CONFIG_SCHED_SMT
  static inline const struct cpumask *cpu_smt_mask(int cpu)
  {
-       return topology_thread_cpumask(cpu);
+       return topology_sibling_cpumask(cpu);
  }
  #endif
  
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h

index ecd3319dac33140a21a7c8fd89a2c95c18ce9e42..ae572c1386073cce6c57807c47dc3d4694af0f03 100644 (file)
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -1,21 +1,30 @@
  #ifndef __LINUX_UACCESS_H__
  #define __LINUX_UACCESS_H__
  
-#include <linux/preempt.h>
+#include <linux/sched.h>
  #include <asm/uaccess.h>
  
+static __always_inline void pagefault_disabled_inc(void)
+{
+       current->pagefault_disabled++;
+}
+
+static __always_inline void pagefault_disabled_dec(void)
+{
+       current->pagefault_disabled--;
+       WARN_ON(current->pagefault_disabled < 0);
+}
+
  /*
- * These routines enable/disable the pagefault handler in that
- * it will not take any locks and go straight to the fixup table.
+ * These routines enable/disable the pagefault handler. If disabled, it will
+ * not take any locks and go straight to the fixup table.
   *
- * They have great resemblance to the preempt_disable/enable calls
- * and in fact they are identical; this is because currently there is
- * no other way to make the pagefault handlers do this. So we do
- * disable preemption but we don't necessarily care about that.
+ * User access methods will not sleep when called from a pagefault_disabled()
+ * environment.
   */
  static inline void pagefault_disable(void)
  {
-       preempt_count_inc();
+       pagefault_disabled_inc();
         /*
          * make sure to have issued the store before a pagefault
          * can hit.
@@ -25,18 +34,31 @@ static inline void pagefault_disable(void)
  
  static inline void pagefault_enable(void)
  {
-#ifndef CONFIG_PREEMPT
         /*
          * make sure to issue those last loads/stores before enabling
          * the pagefault handler again.
          */
         barrier();
-       preempt_count_dec();
-#else
-       preempt_enable();
-#endif
+       pagefault_disabled_dec();
  }
  
+/*
+ * Is the pagefault handler disabled? If so, user access methods will not sleep.
+ */
+#define pagefault_disabled() (current->pagefault_disabled != 0)
+
+/*
+ * The pagefault handler is in general disabled by pagefault_disable() or
+ * when in irq context (via in_atomic()).
+ *
+ * This function should only be used by the fault handlers. Other users should
+ * stick to pagefault_disabled().
+ * Please NEVER use preempt_disable() to disable the fault handler. With
+ * !CONFIG_PREEMPT_COUNT, this is like a NOP. So the handler won't be disabled.
+ * in_atomic() will report different values based on !CONFIG_PREEMPT_COUNT.
+ */
+#define faulthandler_disabled() (pagefault_disabled() || in_atomic())
+
  #ifndef ARCH_HAS_NOCACHE_UACCESS
  
  static inline unsigned long __copy_from_user_inatomic_nocache(void *to,
diff --git a/include/linux/wait.h b/include/linux/wait.h

index 2db83349865bb7a27eba8438a9e8a3cd8509abd0..d69ac4ecc88b9c0d6ff4d5f97cf0fa9d89b5fcdc 100644 (file)
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -969,7 +969,7 @@ extern int bit_wait_io_timeout(struct wait_bit_key *);
   * on that signal.
   */
  static inline int
-wait_on_bit(void *word, int bit, unsigned mode)
+wait_on_bit(unsigned long *word, int bit, unsigned mode)
  {
         might_sleep();
         if (!test_bit(bit, word))
@@ -994,7 +994,7 @@ wait_on_bit(void *word, int bit, unsigned mode)
   * on that signal.
   */
  static inline int
-wait_on_bit_io(void *word, int bit, unsigned mode)
+wait_on_bit_io(unsigned long *word, int bit, unsigned mode)
  {
         might_sleep();
         if (!test_bit(bit, word))
@@ -1020,7 +1020,8 @@ wait_on_bit_io(void *word, int bit, unsigned mode)
   * received a signal and the mode permitted wakeup on that signal.
   */
  static inline int
-wait_on_bit_timeout(void *word, int bit, unsigned mode, unsigned long timeout)
+wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode,
+                   unsigned long timeout)
  {
         might_sleep();
         if (!test_bit(bit, word))
@@ -1047,7 +1048,8 @@ wait_on_bit_timeout(void *word, int bit, unsigned mode, unsigned long timeout)
   * on that signal.
   */
  static inline int
-wait_on_bit_action(void *word, int bit, wait_bit_action_f *action, unsigned mode)
+wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action,
+                  unsigned mode)
  {
         might_sleep();
         if (!test_bit(bit, word))
@@ -1075,7 +1077,7 @@ wait_on_bit_action(void *word, int bit, wait_bit_action_f *action, unsigned mode
   * the @mode allows that signal to wake the process.
   */
  static inline int
-wait_on_bit_lock(void *word, int bit, unsigned mode)
+wait_on_bit_lock(unsigned long *word, int bit, unsigned mode)
  {
         might_sleep();
         if (!test_and_set_bit(bit, word))
@@ -1099,7 +1101,7 @@ wait_on_bit_lock(void *word, int bit, unsigned mode)
   * the @mode allows that signal to wake the process.
   */
  static inline int
-wait_on_bit_lock_io(void *word, int bit, unsigned mode)
+wait_on_bit_lock_io(unsigned long *word, int bit, unsigned mode)
  {
         might_sleep();
         if (!test_and_set_bit(bit, word))
@@ -1125,7 +1127,8 @@ wait_on_bit_lock_io(void *word, int bit, unsigned mode)
   * the @mode allows that signal to wake the process.
   */
  static inline int
-wait_on_bit_lock_action(void *word, int bit, wait_bit_action_f *action, unsigned mode)
+wait_on_bit_lock_action(unsigned long *word, int bit, wait_bit_action_f *action,
+                       unsigned mode)
  {
         might_sleep();
         if (!test_and_set_bit(bit, word))
diff --git a/include/sound/hda_regmap.h b/include/sound/hda_regmap.h

index 53a18b3635e24a458700a21566c56a10d0704c56..df705908480aebbf754900731834162fa8097f75 100644 (file)
--- a/include/sound/hda_regmap.h
+++ b/include/sound/hda_regmap.h
@@ -9,6 +9,8 @@
  #include <sound/core.h>
  #include <sound/hdaudio.h>
  
+#define AC_AMP_FAKE_MUTE       0x10    /* fake mute bit set to amp verbs */
+
  int snd_hdac_regmap_init(struct hdac_device *codec);
  void snd_hdac_regmap_exit(struct hdac_device *codec);
  int snd_hdac_regmap_add_vendor_verb(struct hdac_device *codec,
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h

index 30fedaf3e56a253175619fbef7a9f9a1f6dc15a3..d57a575fe31fc5796e9866470e3dd40a881ca12a 100644 (file)
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -147,7 +147,8 @@ TRACE_EVENT(sched_switch,
                   __print_flags(__entry->prev_state & (TASK_STATE_MAX-1), "|",
                                 { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" },
                                 { 16, "Z" }, { 32, "X" }, { 64, "x" },
-                               { 128, "K" }, { 256, "W" }, { 512, "P" }) : "R",
+                               { 128, "K" }, { 256, "W" }, { 512, "P" },
+                               { 1024, "N" }) : "R",
                 __entry->prev_state & TASK_STATE_MAX ? "+" : "",
                 __entry->next_comm, __entry->next_pid, __entry->next_prio)
  );
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h

index 880dd74371729939a0179ef3dfd487e1fa017838..c178d13d6f4c0cb51d441c59e7b4975a1913ed3e 100644 (file)
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -250,7 +250,6 @@ DEFINE_EVENT(writeback_class, name, \
  DEFINE_WRITEBACK_EVENT(writeback_nowork);
  DEFINE_WRITEBACK_EVENT(writeback_wake_background);
  DEFINE_WRITEBACK_EVENT(writeback_bdi_register);
-DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister);
  
  DECLARE_EVENT_CLASS(wbc_class,
         TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi),
diff --git a/include/uapi/drm/radeon_drm.h b/include/uapi/drm/radeon_drm.h

index 871e73f99a4d7aa13b4cd6f5bc8969421c2a9301..94d44ab2fda1821bcda7e1b541bcfeffb556e7c5 100644 (file)
--- a/include/uapi/drm/radeon_drm.h
+++ b/include/uapi/drm/radeon_drm.h
@@ -1038,6 +1038,7 @@ struct drm_radeon_cs {
  #define RADEON_INFO_CURRENT_GPU_SCLK   0x22
  #define RADEON_INFO_CURRENT_GPU_MCLK   0x23
  #define RADEON_INFO_READ_REG           0x24
+#define RADEON_INFO_VA_UNMAP_WORKING   0x25
  
  struct drm_radeon_info {
         uint32_t                request;
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h

index 309211b3eb672f9449a0fc266ed643ac881196d3..d97f84c080daefb3e8789a59b98cedbbe387cc56 100644 (file)
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -167,6 +167,7 @@ enum perf_branch_sample_type_shift {
         PERF_SAMPLE_BRANCH_COND_SHIFT           = 10, /* conditional branches */
  
         PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT     = 11, /* call/ret stack */
+       PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT       = 12, /* indirect jumps */
  
         PERF_SAMPLE_BRANCH_MAX_SHIFT            /* non-ABI */
  };
@@ -186,6 +187,7 @@ enum perf_branch_sample_type {
         PERF_SAMPLE_BRANCH_COND         = 1U << PERF_SAMPLE_BRANCH_COND_SHIFT,
  
         PERF_SAMPLE_BRANCH_CALL_STACK   = 1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT,
+       PERF_SAMPLE_BRANCH_IND_JUMP     = 1U << PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT,
  
         PERF_SAMPLE_BRANCH_MAX          = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT,
  };
@@ -563,6 +565,10 @@ struct perf_event_mmap_page {
  #define PERF_RECORD_MISC_GUEST_KERNEL          (4 << 0)
  #define PERF_RECORD_MISC_GUEST_USER            (5 << 0)
  
+/*
+ * Indicates that /proc/PID/maps parsing was truncated by a timeout.
+ */
+#define PERF_RECORD_MISC_PROC_MAP_PARSE_TIMEOUT        (1 << 12)
  /*
   * PERF_RECORD_MISC_MMAP_DATA and PERF_RECORD_MISC_COMM_EXEC are used on
   * different events so can reuse the same bit position.
@@ -800,6 +806,18 @@ enum perf_event_type {
          */
         PERF_RECORD_ITRACE_START                = 12,
  
+       /*
+        * Records the number of dropped/lost samples.
+        *
+        * struct {
+        *      struct perf_event_header        header;
+        *
+        *      u64                             lost;
+        *      struct sample_id                sample_id;
+        * };
+        */
+       PERF_RECORD_LOST_SAMPLES                = 13,
+
         PERF_RECORD_MAX,                        /* non-ABI */
  };
  
diff --git a/init/Kconfig b/init/Kconfig

index dc24dec6023292ac6f1bd484ce506074d3e53566..b999fa381bf9fe1f37757af5e0a454cc6adb2da9 100644 (file)
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -465,13 +465,9 @@ endmenu # "CPU/Task time and stats accounting"
  
  menu "RCU Subsystem"
  
-choice
-       prompt "RCU Implementation"
-       default TREE_RCU
-
  config TREE_RCU
-       bool "Tree-based hierarchical RCU"
-       depends on !PREEMPT && SMP
+       bool
+       default y if !PREEMPT && SMP
         help
           This option selects the RCU implementation that is
           designed for very large SMP system with hundreds or
@@ -479,8 +475,8 @@ config TREE_RCU
           smaller systems.
  
  config PREEMPT_RCU
-       bool "Preemptible tree-based hierarchical RCU"
-       depends on PREEMPT
+       bool
+       default y if PREEMPT
         help
           This option selects the RCU implementation that is
           designed for very large SMP systems with hundreds or
@@ -491,15 +487,28 @@ config PREEMPT_RCU
           Select this option if you are unsure.
  
  config TINY_RCU
-       bool "UP-only small-memory-footprint RCU"
-       depends on !PREEMPT && !SMP
+       bool
+       default y if !PREEMPT && !SMP
         help
           This option selects the RCU implementation that is
           designed for UP systems from which real-time response
           is not required.  This option greatly reduces the
           memory footprint of RCU.
  
-endchoice
+config RCU_EXPERT
+       bool "Make expert-level adjustments to RCU configuration"
+       default n
+       help
+         This option needs to be enabled if you wish to make
+         expert-level adjustments to RCU configuration.  By default,
+         no such adjustments can be made, which has the often-beneficial
+         side-effect of preventing "make oldconfig" from asking you all
+         sorts of detailed questions about how you would like numerous
+         obscure RCU options to be set up.
+
+         Say Y if you need to make expert-level adjustments to RCU.
+
+         Say N if you are unsure.
  
  config SRCU
         bool
@@ -509,7 +518,7 @@ config SRCU
           sections.
  
  config TASKS_RCU
-       bool "Task_based RCU implementation using voluntary context switch"
+       bool
         default n
         select SRCU
         help
@@ -517,8 +526,6 @@ config TASKS_RCU
           only voluntary context switch (not preemption!), idle, and
           user-mode execution as quiescent states.
  
-         If unsure, say N.
-
  config RCU_STALL_COMMON
         def_bool ( TREE_RCU || PREEMPT_RCU || RCU_TRACE )
         help
@@ -531,9 +538,7 @@ config CONTEXT_TRACKING
         bool
  
  config RCU_USER_QS
-       bool "Consider userspace as in RCU extended quiescent state"
-       depends on HAVE_CONTEXT_TRACKING && SMP
-       select CONTEXT_TRACKING
+       bool
         help
           This option sets hooks on kernel / userspace boundaries and
           puts RCU in extended quiescent state when the CPU runs in
@@ -541,12 +546,6 @@ config RCU_USER_QS
           excluded from the global RCU state machine and thus doesn't
           try to keep the timer tick on for RCU.
  
-         Unless you want to hack and help the development of the full
-         dynticks mode, you shouldn't enable this option.  It also
-         adds unnecessary overhead.
-
-         If unsure say N
-
  config CONTEXT_TRACKING_FORCE
         bool "Force context tracking"
         depends on CONTEXT_TRACKING
@@ -578,7 +577,7 @@ config RCU_FANOUT
         int "Tree-based hierarchical RCU fanout value"
         range 2 64 if 64BIT
         range 2 32 if !64BIT
-       depends on TREE_RCU || PREEMPT_RCU
+       depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT
         default 64 if 64BIT
         default 32 if !64BIT
         help
@@ -596,9 +595,9 @@ config RCU_FANOUT
  
  config RCU_FANOUT_LEAF
         int "Tree-based hierarchical RCU leaf-level fanout value"
-       range 2 RCU_FANOUT if 64BIT
-       range 2 RCU_FANOUT if !64BIT
-       depends on TREE_RCU || PREEMPT_RCU
+       range 2 64 if 64BIT
+       range 2 32 if !64BIT
+       depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT
         default 16
         help
           This option controls the leaf-level fanout of hierarchical
@@ -621,23 +620,9 @@ config RCU_FANOUT_LEAF
  
           Take the default if unsure.
  
-config RCU_FANOUT_EXACT
-       bool "Disable tree-based hierarchical RCU auto-balancing"
-       depends on TREE_RCU || PREEMPT_RCU
-       default n
-       help
-         This option forces use of the exact RCU_FANOUT value specified,
-         regardless of imbalances in the hierarchy.  This is useful for
-         testing RCU itself, and might one day be useful on systems with
-         strong NUMA behavior.
-
-         Without RCU_FANOUT_EXACT, the code will balance the hierarchy.
-
-         Say N if unsure.
-
  config RCU_FAST_NO_HZ
         bool "Accelerate last non-dyntick-idle CPU's grace periods"
-       depends on NO_HZ_COMMON && SMP
+       depends on NO_HZ_COMMON && SMP && RCU_EXPERT
         default n
         help
           This option permits CPUs to enter dynticks-idle state even if
@@ -663,7 +648,7 @@ config TREE_RCU_TRACE
  
  config RCU_BOOST
         bool "Enable RCU priority boosting"
-       depends on RT_MUTEXES && PREEMPT_RCU
+       depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
         default n
         help
           This option boosts the priority of preempted RCU readers that
@@ -680,6 +665,7 @@ config RCU_KTHREAD_PRIO
         range 0 99 if !RCU_BOOST
         default 1 if RCU_BOOST
         default 0 if !RCU_BOOST
+       depends on RCU_EXPERT
         help
           This option specifies the SCHED_FIFO priority value that will be
           assigned to the rcuc/n and rcub/n threads and is also the value
@@ -1637,7 +1623,7 @@ config PERF_EVENTS
  config DEBUG_PERF_USE_VMALLOC
         default n
         bool "Debug: use vmalloc to back perf mmap() buffers"
-       depends on PERF_EVENTS && DEBUG_KERNEL
+       depends on PERF_EVENTS && DEBUG_KERNEL && !PPC
         select PERF_USE_VMALLOC
         help
          Use vmalloc memory to back perf mmap() buffers.
diff --git a/ipc/mqueue.c b/ipc/mqueue.c

index 3aaea7ffd077c874cb1b420013458020057f45a6..a24ba9fe5bb8892dfaa7452fe78f9ef68d1d97fc 100644 (file)
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -47,8 +47,7 @@
  #define RECV           1
  
  #define STATE_NONE     0
-#define STATE_PENDING  1
-#define STATE_READY    2
+#define STATE_READY    1
  
  struct posix_msg_tree_node {
         struct rb_node          rb_node;
@@ -571,15 +570,12 @@ static int wq_sleep(struct mqueue_inode_info *info, int sr,
         wq_add(info, sr, ewp);
  
         for (;;) {
-               set_current_state(TASK_INTERRUPTIBLE);
+               __set_current_state(TASK_INTERRUPTIBLE);
  
                 spin_unlock(&info->lock);
                 time = schedule_hrtimeout_range_clock(timeout, 0,
                         HRTIMER_MODE_ABS, CLOCK_REALTIME);
  
-               while (ewp->state == STATE_PENDING)
-                       cpu_relax();
-
                 if (ewp->state == STATE_READY) {
                         retval = 0;
                         goto out;
@@ -907,11 +903,15 @@ out_name:
   * list of waiting receivers. A sender checks that list before adding the new
   * message into the message array. If there is a waiting receiver, then it
   * bypasses the message array and directly hands the message over to the
- * receiver.
- * The receiver accepts the message and returns without grabbing the queue
- * spinlock. Therefore an intermediate STATE_PENDING state and memory barriers
- * are necessary. The same algorithm is used for sysv semaphores, see
- * ipc/sem.c for more details.
+ * receiver. The receiver accepts the message and returns without grabbing the
+ * queue spinlock:
+ *
+ * - Set pointer to message.
+ * - Queue the receiver task for later wakeup (without the info->lock).
+ * - Update its state to STATE_READY. Now the receiver can continue.
+ * - Wake up the process after the lock is dropped. Should the process wake up
+ *   before this wakeup (due to a timeout or a signal) it will either see
+ *   STATE_READY and continue or acquire the lock to check the state again.
   *
   * The same algorithm is used for senders.
   */
@@ -919,21 +919,29 @@ out_name:
  /* pipelined_send() - send a message directly to the task waiting in
   * sys_mq_timedreceive() (without inserting message into a queue).
   */
-static inline void pipelined_send(struct mqueue_inode_info *info,
+static inline void pipelined_send(struct wake_q_head *wake_q,
+                                 struct mqueue_inode_info *info,
                                   struct msg_msg *message,
                                   struct ext_wait_queue *receiver)
  {
         receiver->msg = message;
         list_del(&receiver->list);
-       receiver->state = STATE_PENDING;
-       wake_up_process(receiver->task);
-       smp_wmb();
+       wake_q_add(wake_q, receiver->task);
+       /*
+        * Rely on the implicit cmpxchg barrier from wake_q_add() to
+        * ensure that the update of receiver->state is the last write
+        * operation. Once the state is set, the receiver can continue;
+        * if the wake_q did not yet hold its reference on the task at
+        * that point, we could later hit a use-after-free and a bogus
+        * wakeup.
+        */
         receiver->state = STATE_READY;
  }
  
  /* pipelined_receive() - if there is task waiting in sys_mq_timedsend()
   * gets its message and put to the queue (we have one free place for sure). */
-static inline void pipelined_receive(struct mqueue_inode_info *info)
+static inline void pipelined_receive(struct wake_q_head *wake_q,
+                                    struct mqueue_inode_info *info)
  {
         struct ext_wait_queue *sender = wq_get_first_waiter(info, SEND);
  
@@ -944,10 +952,9 @@ static inline void pipelined_receive(struct mqueue_inode_info *info)
         }
         if (msg_insert(sender->msg, info))
                 return;
+
         list_del(&sender->list);
-       sender->state = STATE_PENDING;
-       wake_up_process(sender->task);
-       smp_wmb();
+       wake_q_add(wake_q, sender->task);
         sender->state = STATE_READY;
  }
  
@@ -965,6 +972,7 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
         struct timespec ts;
         struct posix_msg_tree_node *new_leaf = NULL;
         int ret = 0;
+       WAKE_Q(wake_q);
  
         if (u_abs_timeout) {
                 int res = prepare_timeout(u_abs_timeout, &expires, &ts);
@@ -1049,7 +1057,7 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
         } else {
                 receiver = wq_get_first_waiter(info, RECV);
                 if (receiver) {
-                       pipelined_send(info, msg_ptr, receiver);
+                       pipelined_send(&wake_q, info, msg_ptr, receiver);
                 } else {
                         /* adds message to the queue */
                         ret = msg_insert(msg_ptr, info);
@@ -1062,6 +1070,7 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
         }
  out_unlock:
         spin_unlock(&info->lock);
+       wake_up_q(&wake_q);
  out_free:
         if (ret)
                 free_msg(msg_ptr);
@@ -1149,14 +1158,17 @@ SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr,
                         msg_ptr = wait.msg;
                 }
         } else {
+               WAKE_Q(wake_q);
+
                 msg_ptr = msg_get(info);
  
                 inode->i_atime = inode->i_mtime = inode->i_ctime =
                                 CURRENT_TIME;
  
                 /* There is now free space in queue. */
-               pipelined_receive(info);
+               pipelined_receive(&wake_q, info);
                 spin_unlock(&info->lock);
+               wake_up_q(&wake_q);
                 ret = 0;
         }
         if (ret == 0) {
diff --git a/kernel/compat.c b/kernel/compat.c

index 24f00610c575fd5d34c40dcf9fad35b358596c22..333d364be29d9e6c8b209d9eaded9d28552a36d7 100644 (file)
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -912,7 +912,8 @@ long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask,
                          * bitmap. We must however ensure the end of the
                          * kernel bitmap is zeroed.
                          */
-                       if (nr_compat_longs-- > 0) {
+                       if (nr_compat_longs) {
+                               nr_compat_longs--;
                                 if (__get_user(um, umask))
                                         return -EFAULT;
                         } else {
@@ -954,7 +955,8 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
                          * We dont want to write past the end of the userspace
                          * bitmap.
                          */
-                       if (nr_compat_longs-- > 0) {
+                       if (nr_compat_longs) {
+                               nr_compat_longs--;
                                 if (__put_user(um, umask))
                                         return -EFAULT;
                         }
diff --git a/kernel/cpu.c b/kernel/cpu.c

index 94bbe4695232cd2fa2e9c0def32de7fa27644971..9c9c9fab16cc3610afa76a6c467780482b35b0be 100644 (file)
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -398,7 +398,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
         err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
         if (err) {
                 /* CPU didn't die: tell everyone.  Can't complain. */
-               smpboot_unpark_threads(cpu);
                 cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
                 goto out_release;
         }
@@ -463,6 +462,7 @@ static int smpboot_thread_call(struct notifier_block *nfb,
  
         switch (action & ~CPU_TASKS_FROZEN) {
  
+       case CPU_DOWN_FAILED:
         case CPU_ONLINE:
                 smpboot_unpark_threads(cpu);
                 break;
@@ -479,7 +479,7 @@ static struct notifier_block smpboot_thread_notifier = {
         .priority = CPU_PRI_SMPBOOT,
  };
  
-void __cpuinit smpboot_thread_init(void)
+void smpboot_thread_init(void)
  {
         register_cpu_notifier(&smpboot_thread_notifier);
  }
diff --git a/kernel/events/core.c b/kernel/events/core.c

index 1a3bf48743ce1c62c26077d642084cbdc8b40d6b..f2003b97ddc99d726cf5cc145b134b128671f17c 100644 (file)
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3442,7 +3442,6 @@ static void free_event_rcu(struct rcu_head *head)
         if (event->ns)
                 put_pid_ns(event->ns);
         perf_event_free_filter(event);
-       perf_event_free_bpf_prog(event);
         kfree(event);
  }
  
@@ -3573,6 +3572,8 @@ static void __free_event(struct perf_event *event)
                         put_callchain_buffers();
         }
  
+       perf_event_free_bpf_prog(event);
+
         if (event->destroy)
                 event->destroy(event);
  
@@ -4330,20 +4331,20 @@ static void ring_buffer_attach(struct perf_event *event,
                 WARN_ON_ONCE(event->rcu_pending);
  
                 old_rb = event->rb;
-               event->rcu_batches = get_state_synchronize_rcu();
-               event->rcu_pending = 1;
-
                 spin_lock_irqsave(&old_rb->event_lock, flags);
                 list_del_rcu(&event->rb_entry);
                 spin_unlock_irqrestore(&old_rb->event_lock, flags);
-       }
  
-       if (event->rcu_pending && rb) {
-               cond_synchronize_rcu(event->rcu_batches);
-               event->rcu_pending = 0;
+               event->rcu_batches = get_state_synchronize_rcu();
+               event->rcu_pending = 1;
         }
  
         if (rb) {
+               if (event->rcu_pending) {
+                       cond_synchronize_rcu(event->rcu_batches);
+                       event->rcu_pending = 0;
+               }
+
                 spin_lock_irqsave(&rb->event_lock, flags);
                 list_add_rcu(&event->rb_entry, &rb->event_list);
                 spin_unlock_irqrestore(&rb->event_lock, flags);
@@ -5380,9 +5381,9 @@ void perf_prepare_sample(struct perf_event_header *header,
         }
  }
  
-static void perf_event_output(struct perf_event *event,
-                               struct perf_sample_data *data,
-                               struct pt_regs *regs)
+void perf_event_output(struct perf_event *event,
+                       struct perf_sample_data *data,
+                       struct pt_regs *regs)
  {
         struct perf_output_handle handle;
         struct perf_event_header header;
@@ -5973,6 +5974,39 @@ void perf_event_aux_event(struct perf_event *event, unsigned long head,
         perf_output_end(&handle);
  }
  
+/*
+ * Lost/dropped samples logging
+ */
+void perf_log_lost_samples(struct perf_event *event, u64 lost)
+{
+       struct perf_output_handle handle;
+       struct perf_sample_data sample;
+       int ret;
+
+       struct {
+               struct perf_event_header        header;
+               u64                             lost;
+       } lost_samples_event = {
+               .header = {
+                       .type = PERF_RECORD_LOST_SAMPLES,
+                       .misc = 0,
+                       .size = sizeof(lost_samples_event),
+               },
+               .lost           = lost,
+       };
+
+       perf_event_header__init_id(&lost_samples_event.header, &sample, event);
+
+       ret = perf_output_begin(&handle, event,
+                               lost_samples_event.header.size);
+       if (ret)
+               return;
+
+       perf_output_put(&handle, lost_samples_event);
+       perf_event__output_id_sample(event, &handle, &sample);
+       perf_output_end(&handle);
+}
+
  /*
   * IRQ throttle logging
   */
diff --git a/kernel/events/internal.h b/kernel/events/internal.h

index 9f6ce9ba4a04330d689345bc312fc765f0804a31..2deb24c7a40dd979313eb1fcff58044d2d948888 100644 (file)
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -72,15 +72,6 @@ static inline bool rb_has_aux(struct ring_buffer *rb)
  void perf_event_aux_event(struct perf_event *event, unsigned long head,
                           unsigned long size, u64 flags);
  
-extern void
-perf_event_header__init_id(struct perf_event_header *header,
-                          struct perf_sample_data *data,
-                          struct perf_event *event);
-extern void
-perf_event__output_id_sample(struct perf_event *event,
-                            struct perf_output_handle *handle,
-                            struct perf_sample_data *sample);
-
  extern struct page *
  perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff);
  
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c

index 232f00f273cbe419d2738d5f83465dd96529ee17..96472824a752f76fe651ec1bfb7ab7a52411a12c 100644 (file)
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -141,7 +141,7 @@ int perf_output_begin(struct perf_output_handle *handle,
         perf_output_get_handle(handle);
  
         do {
-               tail = ACCESS_ONCE(rb->user_page->data_tail);
+               tail = READ_ONCE_CTRL(rb->user_page->data_tail);
                 offset = head = local_read(&rb->head);
                 if (!rb->overwrite &&
                     unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
@@ -493,6 +493,20 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
                         rb->aux_pages[rb->aux_nr_pages] = page_address(page++);
         }
  
+       /*
+        * In overwrite mode, PMUs that don't support SG may not handle more
+        * than one contiguous allocation, since they rely on PMI to do double
+        * buffering. In this case, the entire buffer has to be one contiguous
+        * chunk.
+        */
+       if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) &&
+           overwrite) {
+               struct page *page = virt_to_page(rb->aux_pages[0]);
+
+               if (page_private(page) != max_order)
+                       goto out;
+       }
+
         rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages,
                                              overwrite);
         if (!rb->aux_priv)
diff --git a/kernel/fork.c b/kernel/fork.c

index 03c1eaaa6ef56f56a670488eaf572eb8c6f58d4e..0bb88b555550580dca507fa34fcc7c60a5a013ee 100644 (file)
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1091,10 +1091,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
  {
         unsigned long cpu_limit;
  
-       /* Thread group counters. */
-       thread_group_cputime_init(sig);
-
-       cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
+       cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
         if (cpu_limit != RLIM_INFINITY) {
                 sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
                 sig->cputimer.running = 1;
@@ -1396,6 +1393,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
         p->hardirq_context = 0;
         p->softirq_context = 0;
  #endif
+
+       p->pagefault_disabled = 0;
+
  #ifdef CONFIG_LOCKDEP
         p->lockdep_depth = 0; /* no locks held yet */
         p->curr_chain_key = 0;
diff --git a/kernel/futex.c b/kernel/futex.c

index 55ca63ad9622487c368e4ee471d54faaa806d7e4..aacc706f85fcc1d7a1e8dd6c4e753c5b3495e1b0 100644 (file)
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1090,9 +1090,11 @@ static void __unqueue_futex(struct futex_q *q)
  
  /*
   * The hash bucket lock must be held when this is called.
- * Afterwards, the futex_q must not be accessed.
+ * Afterwards, the futex_q must not be accessed. Callers
+ * must ensure to later call wake_up_q() for the actual
+ * wakeups to occur.
   */
-static void wake_futex(struct futex_q *q)
+static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
  {
         struct task_struct *p = q->task;
  
@@ -1100,14 +1102,10 @@ static void wake_futex(struct futex_q *q)
                 return;
  
         /*
-        * We set q->lock_ptr = NULL _before_ we wake up the task. If
-        * a non-futex wake up happens on another CPU then the task
-        * might exit and p would dereference a non-existing task
-        * struct. Prevent this by holding a reference on p across the
-        * wake up.
+        * Queue the task for later wakeup, after we've released
+        * the hb->lock. wake_q_add() grabs a reference to p.
          */
-       get_task_struct(p);
-
+       wake_q_add(wake_q, p);
         __unqueue_futex(q);
         /*
          * The waiting task can free the futex_q as soon as
@@ -1117,9 +1115,6 @@ static void wake_futex(struct futex_q *q)
          */
         smp_wmb();
         q->lock_ptr = NULL;
-
-       wake_up_state(p, TASK_NORMAL);
-       put_task_struct(p);
  }
  
  static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
@@ -1217,6 +1212,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
         struct futex_q *this, *next;
         union futex_key key = FUTEX_KEY_INIT;
         int ret;
+       WAKE_Q(wake_q);
  
         if (!bitset)
                 return -EINVAL;
@@ -1244,13 +1240,14 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
                         if (!(this->bitset & bitset))
                                 continue;
  
-                       wake_futex(this);
+                       mark_wake_futex(&wake_q, this);
                         if (++ret >= nr_wake)
                                 break;
                 }
         }
  
         spin_unlock(&hb->lock);
+       wake_up_q(&wake_q);
  out_put_key:
         put_futex_key(&key);
  out:
@@ -1269,6 +1266,7 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
         struct futex_hash_bucket *hb1, *hb2;
         struct futex_q *this, *next;
         int ret, op_ret;
+       WAKE_Q(wake_q);
  
  retry:
         ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
@@ -1320,7 +1318,7 @@ retry_private:
                                 ret = -EINVAL;
                                 goto out_unlock;
                         }
-                       wake_futex(this);
+                       mark_wake_futex(&wake_q, this);
                         if (++ret >= nr_wake)
                                 break;
                 }
@@ -1334,7 +1332,7 @@ retry_private:
                                         ret = -EINVAL;
                                         goto out_unlock;
                                 }
-                               wake_futex(this);
+                               mark_wake_futex(&wake_q, this);
                                 if (++op_ret >= nr_wake2)
                                         break;
                         }
@@ -1344,6 +1342,7 @@ retry_private:
  
  out_unlock:
         double_unlock_hb(hb1, hb2);
+       wake_up_q(&wake_q);
  out_put_keys:
         put_futex_key(&key2);
  out_put_key1:
@@ -1503,6 +1502,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
         struct futex_pi_state *pi_state = NULL;
         struct futex_hash_bucket *hb1, *hb2;
         struct futex_q *this, *next;
+       WAKE_Q(wake_q);
  
         if (requeue_pi) {
                 /*
@@ -1679,7 +1679,7 @@ retry_private:
                  * woken by futex_unlock_pi().
                  */
                 if (++task_count <= nr_wake && !requeue_pi) {
-                       wake_futex(this);
+                       mark_wake_futex(&wake_q, this);
                         continue;
                 }
  
@@ -1719,6 +1719,7 @@ retry_private:
  out_unlock:
         free_pi_state(pi_state);
         double_unlock_hb(hb1, hb2);
+       wake_up_q(&wake_q);
         hb_waiters_dec(hb2);
  
         /*
diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c

index 86ae2aebf00432f4d681a413febebff79406889d..951cfcd10b4a0dc98d81f59ec61667378922269e 100644 (file)
--- a/kernel/locking/lglock.c
+++ b/kernel/locking/lglock.c
@@ -60,6 +60,28 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu)
  }
  EXPORT_SYMBOL(lg_local_unlock_cpu);
  
+void lg_double_lock(struct lglock *lg, int cpu1, int cpu2)
+{
+       BUG_ON(cpu1 == cpu2);
+
+       /* lock in cpu order, just like lg_global_lock */
+       if (cpu2 < cpu1)
+               swap(cpu1, cpu2);
+
+       preempt_disable();
+       lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
+       arch_spin_lock(per_cpu_ptr(lg->lock, cpu1));
+       arch_spin_lock(per_cpu_ptr(lg->lock, cpu2));
+}
+
+void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2)
+{
+       lock_release(&lg->lock_dep_map, 1, _RET_IP_);
+       arch_spin_unlock(per_cpu_ptr(lg->lock, cpu1));
+       arch_spin_unlock(per_cpu_ptr(lg->lock, cpu2));
+       preempt_enable();
+}
+
  void lg_global_lock(struct lglock *lg)
  {
         int i;
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c

index a61bb1d37a52c3a2a41fb037e0548c449a27bd03..456614136f1a2caed847a9d9051754e11e1a4ed3 100644 (file)
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3900,7 +3900,8 @@ static void zap_class(struct lock_class *class)
         list_del_rcu(&class->hash_entry);
         list_del_rcu(&class->lock_entry);
  
-       class->key = NULL;
+       RCU_INIT_POINTER(class->key, NULL);
+       RCU_INIT_POINTER(class->name, NULL);
  }
  
  static inline int within(const void *addr, void *start, unsigned long size)
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c

index ef43ac4bafb59b83ab979a680d49d6077749f955..d83d798bef95a042e1060a35bf4b79e7c7a6c05c 100644 (file)
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -426,10 +426,12 @@ static void seq_lock_time(struct seq_file *m, struct lock_time *lt)
  
  static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
  {
-       char name[39];
-       struct lock_class *class;
+       struct lockdep_subclass_key *ckey;
         struct lock_class_stats *stats;
+       struct lock_class *class;
+       const char *cname;
         int i, namelen;
+       char name[39];
  
         class = data->class;
         stats = &data->stats;
@@ -440,15 +442,25 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
         if (class->subclass)
                 namelen -= 2;
  
-       if (!class->name) {
+       rcu_read_lock_sched();
+       cname = rcu_dereference_sched(class->name);
+       ckey  = rcu_dereference_sched(class->key);
+
+       if (!cname && !ckey) {
+               rcu_read_unlock_sched();
+               return;
+
+       } else if (!cname) {
                 char str[KSYM_NAME_LEN];
                 const char *key_name;
  
-               key_name = __get_key_name(class->key, str);
+               key_name = __get_key_name(ckey, str);
                 snprintf(name, namelen, "%s", key_name);
         } else {
-               snprintf(name, namelen, "%s", class->name);
+               snprintf(name, namelen, "%s", cname);
         }
+       rcu_read_unlock_sched();
+
         namelen = strlen(name);
         if (class->name_version > 1) {
                 snprintf(name+namelen, 3, "#%d", class->name_version);
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c

index ec8cce259779061dd863e6a48ff1dcfa1a7a131c..32244186f1f2ae0e7a6343ad084f416aa0cda055 100644 (file)
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -122,12 +122,12 @@ static int torture_lock_busted_write_lock(void)
  
  static void torture_lock_busted_write_delay(struct torture_random_state *trsp)
  {
-       const unsigned long longdelay_us = 100;
+       const unsigned long longdelay_ms = 100;
  
         /* We want a long delay occasionally to force massive contention.  */
         if (!(torture_random(trsp) %
-             (cxt.nrealwriters_stress * 2000 * longdelay_us)))
-               mdelay(longdelay_us);
+             (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
+               mdelay(longdelay_ms);
  #ifdef CONFIG_PREEMPT
         if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
                 preempt_schedule();  /* Allow test to be preempted. */
@@ -160,14 +160,14 @@ static int torture_spin_lock_write_lock(void) __acquires(torture_spinlock)
  static void torture_spin_lock_write_delay(struct torture_random_state *trsp)
  {
         const unsigned long shortdelay_us = 2;
-       const unsigned long longdelay_us = 100;
+       const unsigned long longdelay_ms = 100;
  
         /* We want a short delay mostly to emulate likely code, and
          * we want a long delay occasionally to force massive contention.
          */
         if (!(torture_random(trsp) %
-             (cxt.nrealwriters_stress * 2000 * longdelay_us)))
-               mdelay(longdelay_us);
+             (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
+               mdelay(longdelay_ms);
         if (!(torture_random(trsp) %
               (cxt.nrealwriters_stress * 2 * shortdelay_us)))
                 udelay(shortdelay_us);
@@ -309,7 +309,7 @@ static int torture_rwlock_read_lock_irq(void) __acquires(torture_rwlock)
  static void torture_rwlock_read_unlock_irq(void)
  __releases(torture_rwlock)
  {
-       write_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags);
+       read_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags);
  }
  
  static struct lock_torture_ops rw_lock_irq_ops = {
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c

index 00c12bb390b5729dcda12a9e8def3b032da73093..6c5da483966bde7aea3c7e7d43a42c0b55349f65 100644 (file)
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -22,6 +22,26 @@
  #include <linux/hardirq.h>
  #include <asm/qrwlock.h>
  
+/*
+ * This internal data structure is used for optimizing access to some of
+ * the subfields within the atomic_t cnts.
+ */
+struct __qrwlock {
+       union {
+               atomic_t cnts;
+               struct {
+#ifdef __LITTLE_ENDIAN
+                       u8 wmode;       /* Writer mode   */
+                       u8 rcnts[3];    /* Reader counts */
+#else
+                       u8 rcnts[3];    /* Reader counts */
+                       u8 wmode;       /* Writer mode   */
+#endif
+               };
+       };
+       arch_spinlock_t lock;
+};
+
  /**
   * rspin_until_writer_unlock - inc reader count & spin until writer is gone
   * @lock  : Pointer to queue rwlock structure
@@ -107,10 +127,10 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
          * or wait for a previous writer to go away.
          */
         for (;;) {
-               cnts = atomic_read(&lock->cnts);
-               if (!(cnts & _QW_WMASK) &&
-                   (atomic_cmpxchg(&lock->cnts, cnts,
-                                   cnts | _QW_WAITING) == cnts))
+               struct __qrwlock *l = (struct __qrwlock *)lock;
+
+               if (!READ_ONCE(l->wmode) &&
+                  (cmpxchg(&l->wmode, 0, _QW_WAITING) == 0))
                         break;
  
                 cpu_relax_lowlatency();
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c

index 8dbe27611ec399e42f8912d49708ff4e20bff73f..59e32684c23b58714ecb26215856f67866cfc58e 100644 (file)
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -241,6 +241,7 @@ rcu_torture_free(struct rcu_torture *p)
  struct rcu_torture_ops {
         int ttype;
         void (*init)(void);
+       void (*cleanup)(void);
         int (*readlock)(void);
         void (*read_delay)(struct torture_random_state *rrsp);
         void (*readunlock)(int idx);
@@ -477,10 +478,12 @@ static struct rcu_torture_ops rcu_busted_ops = {
   */
  
  DEFINE_STATIC_SRCU(srcu_ctl);
+static struct srcu_struct srcu_ctld;
+static struct srcu_struct *srcu_ctlp = &srcu_ctl;
  
-static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
+static int srcu_torture_read_lock(void) __acquires(srcu_ctlp)
  {
-       return srcu_read_lock(&srcu_ctl);
+       return srcu_read_lock(srcu_ctlp);
  }
  
  static void srcu_read_delay(struct torture_random_state *rrsp)
@@ -499,49 +502,49 @@ static void srcu_read_delay(struct torture_random_state *rrsp)
                 rcu_read_delay(rrsp);
  }
  
-static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
+static void srcu_torture_read_unlock(int idx) __releases(srcu_ctlp)
  {
-       srcu_read_unlock(&srcu_ctl, idx);
+       srcu_read_unlock(srcu_ctlp, idx);
  }
  
  static unsigned long srcu_torture_completed(void)
  {
-       return srcu_batches_completed(&srcu_ctl);
+       return srcu_batches_completed(srcu_ctlp);
  }
  
  static void srcu_torture_deferred_free(struct rcu_torture *rp)
  {
-       call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb);
+       call_srcu(srcu_ctlp, &rp->rtort_rcu, rcu_torture_cb);
  }
  
  static void srcu_torture_synchronize(void)
  {
-       synchronize_srcu(&srcu_ctl);
+       synchronize_srcu(srcu_ctlp);
  }
  
  static void srcu_torture_call(struct rcu_head *head,
                               void (*func)(struct rcu_head *head))
  {
-       call_srcu(&srcu_ctl, head, func);
+       call_srcu(srcu_ctlp, head, func);
  }
  
  static void srcu_torture_barrier(void)
  {
-       srcu_barrier(&srcu_ctl);
+       srcu_barrier(srcu_ctlp);
  }
  
  static void srcu_torture_stats(void)
  {
         int cpu;
-       int idx = srcu_ctl.completed & 0x1;
+       int idx = srcu_ctlp->completed & 0x1;
  
         pr_alert("%s%s per-CPU(idx=%d):",
                  torture_type, TORTURE_FLAG, idx);
         for_each_possible_cpu(cpu) {
                 long c0, c1;
  
-               c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx];
-               c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx];
+               c0 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[!idx];
+               c1 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[idx];
                 pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
         }
         pr_cont("\n");
@@ -549,7 +552,7 @@ static void srcu_torture_stats(void)
  
  static void srcu_torture_synchronize_expedited(void)
  {
-       synchronize_srcu_expedited(&srcu_ctl);
+       synchronize_srcu_expedited(srcu_ctlp);
  }
  
  static struct rcu_torture_ops srcu_ops = {
@@ -569,6 +572,38 @@ static struct rcu_torture_ops srcu_ops = {
         .name           = "srcu"
  };
  
+static void srcu_torture_init(void)
+{
+       rcu_sync_torture_init();
+       WARN_ON(init_srcu_struct(&srcu_ctld));
+       srcu_ctlp = &srcu_ctld;
+}
+
+static void srcu_torture_cleanup(void)
+{
+       cleanup_srcu_struct(&srcu_ctld);
+       srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */
+}
+
+/* As above, but dynamically allocated. */
+static struct rcu_torture_ops srcud_ops = {
+       .ttype          = SRCU_FLAVOR,
+       .init           = srcu_torture_init,
+       .cleanup        = srcu_torture_cleanup,
+       .readlock       = srcu_torture_read_lock,
+       .read_delay     = srcu_read_delay,
+       .readunlock     = srcu_torture_read_unlock,
+       .started        = NULL,
+       .completed      = srcu_torture_completed,
+       .deferred_free  = srcu_torture_deferred_free,
+       .sync           = srcu_torture_synchronize,
+       .exp_sync       = srcu_torture_synchronize_expedited,
+       .call           = srcu_torture_call,
+       .cb_barrier     = srcu_torture_barrier,
+       .stats          = srcu_torture_stats,
+       .name           = "srcud"
+};
+
  /*
   * Definitions for sched torture testing.
   */
@@ -672,8 +707,8 @@ static void rcu_torture_boost_cb(struct rcu_head *head)
         struct rcu_boost_inflight *rbip =
                 container_of(head, struct rcu_boost_inflight, rcu);
  
-       smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */
-       rbip->inflight = 0;
+       /* Ensure RCU-core accesses precede clearing ->inflight */
+       smp_store_release(&rbip->inflight, 0);
  }
  
  static int rcu_torture_boost(void *arg)
@@ -710,9 +745,9 @@ static int rcu_torture_boost(void *arg)
                 call_rcu_time = jiffies;
                 while (ULONG_CMP_LT(jiffies, endtime)) {
                         /* If we don't have a callback in flight, post one. */
-                       if (!rbi.inflight) {
-                               smp_mb(); /* RCU core before ->inflight = 1. */
-                               rbi.inflight = 1;
+                       if (!smp_load_acquire(&rbi.inflight)) {
+                               /* RCU core before ->inflight = 1. */
+                               smp_store_release(&rbi.inflight, 1);
                                 call_rcu(&rbi.rcu, rcu_torture_boost_cb);
                                 if (jiffies - call_rcu_time >
                                          test_boost_duration * HZ - HZ / 2) {
@@ -751,11 +786,10 @@ checkwait:        stutter_wait("rcu_torture_boost");
         } while (!torture_must_stop());
  
         /* Clean up and exit. */
-       while (!kthread_should_stop() || rbi.inflight) {
+       while (!kthread_should_stop() || smp_load_acquire(&rbi.inflight)) {
                 torture_shutdown_absorb("rcu_torture_boost");
                 schedule_timeout_uninterruptible(1);
         }
-       smp_mb(); /* order accesses to ->inflight before stack-frame death. */
         destroy_rcu_head_on_stack(&rbi.rcu);
         torture_kthread_stopping("rcu_torture_boost");
         return 0;
@@ -1054,7 +1088,7 @@ static void rcu_torture_timer(unsigned long unused)
         p = rcu_dereference_check(rcu_torture_current,
                                   rcu_read_lock_bh_held() ||
                                   rcu_read_lock_sched_held() ||
-                                 srcu_read_lock_held(&srcu_ctl));
+                                 srcu_read_lock_held(srcu_ctlp));
         if (p == NULL) {
                 /* Leave because rcu_torture_writer is not yet underway */
                 cur_ops->readunlock(idx);
@@ -1128,7 +1162,7 @@ rcu_torture_reader(void *arg)
                 p = rcu_dereference_check(rcu_torture_current,
                                           rcu_read_lock_bh_held() ||
                                           rcu_read_lock_sched_held() ||
-                                         srcu_read_lock_held(&srcu_ctl));
+                                         srcu_read_lock_held(srcu_ctlp));
                 if (p == NULL) {
                         /* Wait for rcu_torture_writer to get underway */
                         cur_ops->readunlock(idx);
@@ -1413,12 +1447,15 @@ static int rcu_torture_barrier_cbs(void *arg)
         do {
                 wait_event(barrier_cbs_wq[myid],
                            (newphase =
-                           ACCESS_ONCE(barrier_phase)) != lastphase ||
+                           smp_load_acquire(&barrier_phase)) != lastphase ||
                            torture_must_stop());
                 lastphase = newphase;
-               smp_mb(); /* ensure barrier_phase load before ->call(). */
                 if (torture_must_stop())
                         break;
+               /*
+                * The above smp_load_acquire() ensures barrier_phase load
+                * is ordered before the following ->call().
+                */
                 cur_ops->call(&rcu, rcu_torture_barrier_cbf);
                 if (atomic_dec_and_test(&barrier_cbs_count))
                         wake_up(&barrier_wq);
@@ -1439,8 +1476,8 @@ static int rcu_torture_barrier(void *arg)
         do {
                 atomic_set(&barrier_cbs_invoked, 0);
                 atomic_set(&barrier_cbs_count, n_barrier_cbs);
-               smp_mb(); /* Ensure barrier_phase after prior assignments. */
-               barrier_phase = !barrier_phase;
+               /* Ensure barrier_phase ordered after prior assignments. */
+               smp_store_release(&barrier_phase, !barrier_phase);
                 for (i = 0; i < n_barrier_cbs; i++)
                         wake_up(&barrier_cbs_wq[i]);
                 wait_event(barrier_wq,
@@ -1588,10 +1625,14 @@ rcu_torture_cleanup(void)
                         rcutorture_booster_cleanup(i);
         }
  
-       /* Wait for all RCU callbacks to fire.  */
-
+       /*
+        * Wait for all RCU callbacks to fire, then do flavor-specific
+        * cleanup operations.
+        */
         if (cur_ops->cb_barrier != NULL)
                 cur_ops->cb_barrier();
+       if (cur_ops->cleanup != NULL)
+               cur_ops->cleanup();
  
         rcu_torture_stats_print();  /* -After- the stats thread is stopped! */
  
@@ -1668,8 +1709,8 @@ rcu_torture_init(void)
         int cpu;
         int firsterr = 0;
         static struct rcu_torture_ops *torture_ops[] = {
-               &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops,
-               RCUTORTURE_TASKS_OPS
+               &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops,
+               &sched_ops, RCUTORTURE_TASKS_OPS
         };
  
         if (!torture_init_begin(torture_type, verbose, &torture_runnable))
@@ -1701,7 +1742,7 @@ rcu_torture_init(void)
         if (nreaders >= 0) {
                 nrealreaders = nreaders;
         } else {
-               nrealreaders = num_online_cpus() - 1;
+               nrealreaders = num_online_cpus() - 2 - nreaders;
                 if (nrealreaders <= 0)
                         nrealreaders = 1;
         }
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c

index cad76e76b4e7def42de7c379882c878084c5394c..fb33d35ee0b7c0ecdb6df0cb0562050abcc6707b 100644 (file)
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -151,7 +151,7 @@ static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
         unsigned long t;
  
         for_each_possible_cpu(cpu) {
-               t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
+               t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
                 sum += t;
         }
         return sum;
@@ -168,7 +168,7 @@ static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
         unsigned long t;
  
         for_each_possible_cpu(cpu) {
-               t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
+               t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
                 sum += t;
         }
         return sum;
@@ -265,8 +265,8 @@ static int srcu_readers_active(struct srcu_struct *sp)
         unsigned long sum = 0;
  
         for_each_possible_cpu(cpu) {
-               sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
-               sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
+               sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
+               sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
         }
         return sum;
  }
@@ -296,7 +296,7 @@ int __srcu_read_lock(struct srcu_struct *sp)
  {
         int idx;
  
-       idx = ACCESS_ONCE(sp->completed) & 0x1;
+       idx = READ_ONCE(sp->completed) & 0x1;
         preempt_disable();
         __this_cpu_inc(sp->per_cpu_ref->c[idx]);
         smp_mb(); /* B */  /* Avoid leaking the critical section. */
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c

index 069742d61c68873a3fce7bf5d7df2e77d5a59958..591af0cb7b9f4e7a25abbab58f3a7ab1fe3cb4fb 100644 (file)
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -49,39 +49,6 @@ static void __call_rcu(struct rcu_head *head,
  
  #include "tiny_plugin.h"
  
-/*
- * Enter idle, which is an extended quiescent state if we have fully
- * entered that mode.
- */
-void rcu_idle_enter(void)
-{
-}
-EXPORT_SYMBOL_GPL(rcu_idle_enter);
-
-/*
- * Exit an interrupt handler towards idle.
- */
-void rcu_irq_exit(void)
-{
-}
-EXPORT_SYMBOL_GPL(rcu_irq_exit);
-
-/*
- * Exit idle, so that we are no longer in an extended quiescent state.
- */
-void rcu_idle_exit(void)
-{
-}
-EXPORT_SYMBOL_GPL(rcu_idle_exit);
-
-/*
- * Enter an interrupt handler, moving away from idle.
- */
-void rcu_irq_enter(void)
-{
-}
-EXPORT_SYMBOL_GPL(rcu_irq_enter);
-
  #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE)
  
  /*
@@ -170,6 +137,11 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
  
         /* Move the ready-to-invoke callbacks to a local list. */
         local_irq_save(flags);
+       if (rcp->donetail == &rcp->rcucblist) {
+               /* No callbacks ready, so just leave. */
+               local_irq_restore(flags);
+               return;
+       }
         RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1));
         list = rcp->rcucblist;
         rcp->rcucblist = *rcp->donetail;
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h

index f94e209a10d615a5a4f7d379e623918f533ce814..e492a5253e0f10c94da7056efd8f42ba2c1a394c 100644 (file)
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -144,16 +144,17 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)
                 return;
         rcp->ticks_this_gp++;
         j = jiffies;
-       js = ACCESS_ONCE(rcp->jiffies_stall);
+       js = READ_ONCE(rcp->jiffies_stall);
         if (rcp->rcucblist && ULONG_CMP_GE(j, js)) {
                 pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
                        rcp->name, rcp->ticks_this_gp, DYNTICK_TASK_EXIT_IDLE,
                        jiffies - rcp->gp_start, rcp->qlen);
                 dump_stack();
-               ACCESS_ONCE(rcp->jiffies_stall) = jiffies +
-                       3 * rcu_jiffies_till_stall_check() + 3;
+               WRITE_ONCE(rcp->jiffies_stall,
+                          jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
         } else if (ULONG_CMP_GE(j, js)) {
-               ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
+               WRITE_ONCE(rcp->jiffies_stall,
+                          jiffies + rcu_jiffies_till_stall_check());
         }
  }
  
@@ -161,7 +162,8 @@ static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
  {
         rcp->ticks_this_gp = 0;
         rcp->gp_start = jiffies;
-       ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
+       WRITE_ONCE(rcp->jiffies_stall,
+                  jiffies + rcu_jiffies_till_stall_check());
  }
  
  static void check_cpu_stalls(void)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c

index 8cf7304b2867f5a113807afb0bd5dc0a3bd3cfc0..add042926a6608258564c72ea5a33dd204428df6 100644 (file)
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -91,7 +91,7 @@ static const char *tp_##sname##_varname __used __tracepoint_string = sname##_var
  
  #define RCU_STATE_INITIALIZER(sname, sabbr, cr) \
  DEFINE_RCU_TPS(sname) \
-DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data); \
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data); \
  struct rcu_state sname##_state = { \
         .level = { &sname##_state.node[0] }, \
         .rda = &sname##_data, \
@@ -110,11 +110,18 @@ struct rcu_state sname##_state = { \
  RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
  RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
  
-static struct rcu_state *rcu_state_p;
+static struct rcu_state *const rcu_state_p;
+static struct rcu_data __percpu *const rcu_data_p;
  LIST_HEAD(rcu_struct_flavors);
  
-/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */
-static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF;
+/* Dump rcu_node combining tree at boot to verify correct setup. */
+static bool dump_tree;
+module_param(dump_tree, bool, 0444);
+/* Control rcu_node-tree auto-balancing at boot time. */
+static bool rcu_fanout_exact;
+module_param(rcu_fanout_exact, bool, 0444);
+/* Increase (but not decrease) the RCU_FANOUT_LEAF at boot time. */
+static int rcu_fanout_leaf = RCU_FANOUT_LEAF;
  module_param(rcu_fanout_leaf, int, 0444);
  int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
  static int num_rcu_lvl[] = {  /* Number of rcu_nodes at specified level. */
@@ -159,17 +166,46 @@ static void invoke_rcu_core(void);
  static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
  
  /* rcuc/rcub kthread realtime priority */
+#ifdef CONFIG_RCU_KTHREAD_PRIO
  static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
+#else /* #ifdef CONFIG_RCU_KTHREAD_PRIO */
+static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
+#endif /* #else #ifdef CONFIG_RCU_KTHREAD_PRIO */
  module_param(kthread_prio, int, 0644);
  
  /* Delay in jiffies for grace-period initialization delays, debug only. */
+
+#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT
+static int gp_preinit_delay = CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT_DELAY;
+module_param(gp_preinit_delay, int, 0644);
+#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */
+static const int gp_preinit_delay;
+#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */
+
  #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT
  static int gp_init_delay = CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY;
  module_param(gp_init_delay, int, 0644);
  #else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */
  static const int gp_init_delay;
  #endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */
-#define PER_RCU_NODE_PERIOD 10 /* Number of grace periods between delays. */
+
+#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP
+static int gp_cleanup_delay = CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP_DELAY;
+module_param(gp_cleanup_delay, int, 0644);
+#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */
+static const int gp_cleanup_delay;
+#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */
+
+/*
+ * Number of grace periods between delays, normalized by the duration of
+ * the delay.  The longer the delay, the more the grace periods between
+ * each delay.  The reason for this normalization is that it means that,
+ * for non-zero delays, the overall slowdown of grace periods is constant
+ * regardless of the duration of the delay.  This arrangement balances
+ * the need for long delays to increase some race probabilities with the
+ * need for fast grace periods to increase other race probabilities.
+ */
+#define PER_RCU_NODE_PERIOD 3  /* Number of grace periods between delays. */
  
  /*
   * Track the rcutorture test sequence number and the update version
@@ -191,17 +227,17 @@ unsigned long rcutorture_vernum;
   */
  unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
  {
-       return ACCESS_ONCE(rnp->qsmaskinitnext);
+       return READ_ONCE(rnp->qsmaskinitnext);
  }
  
  /*
- * Return true if an RCU grace period is in progress.  The ACCESS_ONCE()s
+ * Return true if an RCU grace period is in progress.  The READ_ONCE()s
   * permit this function to be invoked without holding the root rcu_node
   * structure's ->lock, but of course results can be subject to change.
   */
  static int rcu_gp_in_progress(struct rcu_state *rsp)
  {
-       return ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum);
+       return READ_ONCE(rsp->completed) != READ_ONCE(rsp->gpnum);
  }
  
  /*
@@ -278,8 +314,8 @@ static void rcu_momentary_dyntick_idle(void)
                 if (!(resched_mask & rsp->flavor_mask))
                         continue;
                 smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */
-               if (ACCESS_ONCE(rdp->mynode->completed) !=
-                   ACCESS_ONCE(rdp->cond_resched_completed))
+               if (READ_ONCE(rdp->mynode->completed) !=
+                   READ_ONCE(rdp->cond_resched_completed))
                         continue;
  
                 /*
@@ -491,9 +527,9 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
                 break;
         }
         if (rsp != NULL) {
-               *flags = ACCESS_ONCE(rsp->gp_flags);
-               *gpnum = ACCESS_ONCE(rsp->gpnum);
-               *completed = ACCESS_ONCE(rsp->completed);
+               *flags = READ_ONCE(rsp->gp_flags);
+               *gpnum = READ_ONCE(rsp->gpnum);
+               *completed = READ_ONCE(rsp->completed);
                 return;
         }
         *flags = 0;
@@ -539,10 +575,10 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
  static int rcu_future_needs_gp(struct rcu_state *rsp)
  {
         struct rcu_node *rnp = rcu_get_root(rsp);
-       int idx = (ACCESS_ONCE(rnp->completed) + 1) & 0x1;
+       int idx = (READ_ONCE(rnp->completed) + 1) & 0x1;
         int *fp = &rnp->need_future_gp[idx];
  
-       return ACCESS_ONCE(*fp);
+       return READ_ONCE(*fp);
  }
  
  /*
@@ -565,7 +601,7 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
                 return 1;  /* Yes, this CPU has newly registered callbacks. */
         for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
                 if (rdp->nxttail[i - 1] != rdp->nxttail[i] &&
-                   ULONG_CMP_LT(ACCESS_ONCE(rsp->completed),
+                   ULONG_CMP_LT(READ_ONCE(rsp->completed),
                                  rdp->nxtcompleted[i]))
                         return 1;  /* Yes, CBs for future grace period. */
         return 0; /* No grace period needed. */
@@ -585,7 +621,8 @@ static void rcu_eqs_enter_common(long long oldval, bool user)
         struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
  
         trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
-       if (!user && !is_idle_task(current)) {
+       if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+           !user && !is_idle_task(current)) {
                 struct task_struct *idle __maybe_unused =
                         idle_task(smp_processor_id());
  
@@ -604,7 +641,8 @@ static void rcu_eqs_enter_common(long long oldval, bool user)
         smp_mb__before_atomic();  /* See above. */
         atomic_inc(&rdtp->dynticks);
         smp_mb__after_atomic();  /* Force ordering with next sojourn. */
-       WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
+       WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+                    atomic_read(&rdtp->dynticks) & 0x1);
         rcu_dynticks_task_enter();
  
         /*
@@ -630,7 +668,8 @@ static void rcu_eqs_enter(bool user)
  
         rdtp = this_cpu_ptr(&rcu_dynticks);
         oldval = rdtp->dynticks_nesting;
-       WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
+       WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+                    (oldval & DYNTICK_TASK_NEST_MASK) == 0);
         if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) {
                 rdtp->dynticks_nesting = 0;
                 rcu_eqs_enter_common(oldval, user);
@@ -703,7 +742,8 @@ void rcu_irq_exit(void)
         rdtp = this_cpu_ptr(&rcu_dynticks);
         oldval = rdtp->dynticks_nesting;
         rdtp->dynticks_nesting--;
-       WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
+       WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+                    rdtp->dynticks_nesting < 0);
         if (rdtp->dynticks_nesting)
                 trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting);
         else
@@ -728,10 +768,12 @@ static void rcu_eqs_exit_common(long long oldval, int user)
         atomic_inc(&rdtp->dynticks);
         /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
         smp_mb__after_atomic();  /* See above. */
-       WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+       WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+                    !(atomic_read(&rdtp->dynticks) & 0x1));
         rcu_cleanup_after_idle();
         trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
-       if (!user && !is_idle_task(current)) {
+       if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+           !user && !is_idle_task(current)) {
                 struct task_struct *idle __maybe_unused =
                         idle_task(smp_processor_id());
  
@@ -755,7 +797,7 @@ static void rcu_eqs_exit(bool user)
  
         rdtp = this_cpu_ptr(&rcu_dynticks);
         oldval = rdtp->dynticks_nesting;
-       WARN_ON_ONCE(oldval < 0);
+       WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
         if (oldval & DYNTICK_TASK_NEST_MASK) {
                 rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
         } else {
@@ -828,7 +870,8 @@ void rcu_irq_enter(void)
         rdtp = this_cpu_ptr(&rcu_dynticks);
         oldval = rdtp->dynticks_nesting;
         rdtp->dynticks_nesting++;
-       WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
+       WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+                    rdtp->dynticks_nesting == 0);
         if (oldval)
                 trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting);
         else
@@ -1011,9 +1054,9 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
                 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
                 return 1;
         } else {
-               if (ULONG_CMP_LT(ACCESS_ONCE(rdp->gpnum) + ULONG_MAX / 4,
+               if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4,
                                  rdp->mynode->gpnum))
-                       ACCESS_ONCE(rdp->gpwrap) = true;
+                       WRITE_ONCE(rdp->gpwrap, true);
                 return 0;
         }
  }
@@ -1093,12 +1136,12 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
         if (ULONG_CMP_GE(jiffies,
                          rdp->rsp->gp_start + jiffies_till_sched_qs) ||
             ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
-               if (!(ACCESS_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) {
-                       ACCESS_ONCE(rdp->cond_resched_completed) =
-                               ACCESS_ONCE(rdp->mynode->completed);
+               if (!(READ_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) {
+                       WRITE_ONCE(rdp->cond_resched_completed,
+                                  READ_ONCE(rdp->mynode->completed));
                         smp_mb(); /* ->cond_resched_completed before *rcrmp. */
-                       ACCESS_ONCE(*rcrmp) =
-                               ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask;
+                       WRITE_ONCE(*rcrmp,
+                                  READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask);
                         resched_cpu(rdp->cpu);  /* Force CPU into scheduler. */
                         rdp->rsp->jiffies_resched += 5; /* Enable beating. */
                 } else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
@@ -1119,9 +1162,9 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
         rsp->gp_start = j;
         smp_wmb(); /* Record start time before stall time. */
         j1 = rcu_jiffies_till_stall_check();
-       ACCESS_ONCE(rsp->jiffies_stall) = j + j1;
+       WRITE_ONCE(rsp->jiffies_stall, j + j1);
         rsp->jiffies_resched = j + j1 / 2;
-       rsp->n_force_qs_gpstart = ACCESS_ONCE(rsp->n_force_qs);
+       rsp->n_force_qs_gpstart = READ_ONCE(rsp->n_force_qs);
  }
  
  /*
@@ -1133,10 +1176,11 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
         unsigned long j;
  
         j = jiffies;
-       gpa = ACCESS_ONCE(rsp->gp_activity);
+       gpa = READ_ONCE(rsp->gp_activity);
         if (j - gpa > 2 * HZ)
-               pr_err("%s kthread starved for %ld jiffies!\n",
-                      rsp->name, j - gpa);
+               pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x\n",
+                      rsp->name, j - gpa,
+                      rsp->gpnum, rsp->completed, rsp->gp_flags);
  }
  
  /*
@@ -1173,12 +1217,13 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
         /* Only let one CPU complain about others per time interval. */
  
         raw_spin_lock_irqsave(&rnp->lock, flags);
-       delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall);
+       delta = jiffies - READ_ONCE(rsp->jiffies_stall);
         if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
                 raw_spin_unlock_irqrestore(&rnp->lock, flags);
                 return;
         }
-       ACCESS_ONCE(rsp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
+       WRITE_ONCE(rsp->jiffies_stall,
+                  jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
         raw_spin_unlock_irqrestore(&rnp->lock, flags);
  
         /*
@@ -1212,12 +1257,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
         if (ndetected) {
                 rcu_dump_cpu_stacks(rsp);
         } else {
-               if (ACCESS_ONCE(rsp->gpnum) != gpnum ||
-                   ACCESS_ONCE(rsp->completed) == gpnum) {
+               if (READ_ONCE(rsp->gpnum) != gpnum ||
+                   READ_ONCE(rsp->completed) == gpnum) {
                         pr_err("INFO: Stall ended before state dump start\n");
                 } else {
                         j = jiffies;
-                       gpa = ACCESS_ONCE(rsp->gp_activity);
+                       gpa = READ_ONCE(rsp->gp_activity);
                         pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n",
                                rsp->name, j - gpa, j, gpa,
                                jiffies_till_next_fqs,
@@ -1262,9 +1307,9 @@ static void print_cpu_stall(struct rcu_state *rsp)
         rcu_dump_cpu_stacks(rsp);
  
         raw_spin_lock_irqsave(&rnp->lock, flags);
-       if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall)))
-               ACCESS_ONCE(rsp->jiffies_stall) = jiffies +
-                                    3 * rcu_jiffies_till_stall_check() + 3;
+       if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall)))
+               WRITE_ONCE(rsp->jiffies_stall,
+                          jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
         raw_spin_unlock_irqrestore(&rnp->lock, flags);
  
         /*
@@ -1307,20 +1352,20 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
          * Given this check, comparisons of jiffies, rsp->jiffies_stall,
          * and rsp->gp_start suffice to forestall false positives.
          */
-       gpnum = ACCESS_ONCE(rsp->gpnum);
+       gpnum = READ_ONCE(rsp->gpnum);
         smp_rmb(); /* Pick up ->gpnum first... */
-       js = ACCESS_ONCE(rsp->jiffies_stall);
+       js = READ_ONCE(rsp->jiffies_stall);
         smp_rmb(); /* ...then ->jiffies_stall before the rest... */
-       gps = ACCESS_ONCE(rsp->gp_start);
+       gps = READ_ONCE(rsp->gp_start);
         smp_rmb(); /* ...and finally ->gp_start before ->completed. */
-       completed = ACCESS_ONCE(rsp->completed);
+       completed = READ_ONCE(rsp->completed);
         if (ULONG_CMP_GE(completed, gpnum) ||
             ULONG_CMP_LT(j, js) ||
             ULONG_CMP_GE(gps, js))
                 return; /* No stall or GP completed since entering function. */
         rnp = rdp->mynode;
         if (rcu_gp_in_progress(rsp) &&
-           (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask)) {
+           (READ_ONCE(rnp->qsmask) & rdp->grpmask)) {
  
                 /* We haven't checked in, so go dump stack. */
                 print_cpu_stall(rsp);
@@ -1347,7 +1392,7 @@ void rcu_cpu_stall_reset(void)
         struct rcu_state *rsp;
  
         for_each_rcu_flavor(rsp)
-               ACCESS_ONCE(rsp->jiffies_stall) = jiffies + ULONG_MAX / 2;
+               WRITE_ONCE(rsp->jiffies_stall, jiffies + ULONG_MAX / 2);
  }
  
  /*
@@ -1457,7 +1502,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
          * doing some extra useless work.
          */
         if (rnp->gpnum != rnp->completed ||
-           ACCESS_ONCE(rnp_root->gpnum) != ACCESS_ONCE(rnp_root->completed)) {
+           READ_ONCE(rnp_root->gpnum) != READ_ONCE(rnp_root->completed)) {
                 rnp->need_future_gp[c & 0x1]++;
                 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf"));
                 goto out;
@@ -1542,7 +1587,7 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
  static void rcu_gp_kthread_wake(struct rcu_state *rsp)
  {
         if (current == rsp->gp_kthread ||
-           !ACCESS_ONCE(rsp->gp_flags) ||
+           !READ_ONCE(rsp->gp_flags) ||
             !rsp->gp_kthread)
                 return;
         wake_up(&rsp->gp_wq);
@@ -1677,7 +1722,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
  
         /* Handle the ends of any preceding grace periods first. */
         if (rdp->completed == rnp->completed &&
-           !unlikely(ACCESS_ONCE(rdp->gpwrap))) {
+           !unlikely(READ_ONCE(rdp->gpwrap))) {
  
                 /* No grace period end, so just accelerate recent callbacks. */
                 ret = rcu_accelerate_cbs(rsp, rnp, rdp);
@@ -1692,7 +1737,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
                 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend"));
         }
  
-       if (rdp->gpnum != rnp->gpnum || unlikely(ACCESS_ONCE(rdp->gpwrap))) {
+       if (rdp->gpnum != rnp->gpnum || unlikely(READ_ONCE(rdp->gpwrap))) {
                 /*
                  * If the current grace period is waiting for this CPU,
                  * set up to detect a quiescent state, otherwise don't
@@ -1704,7 +1749,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
                 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
                 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
                 zero_cpu_stall_ticks(rdp);
-               ACCESS_ONCE(rdp->gpwrap) = false;
+               WRITE_ONCE(rdp->gpwrap, false);
         }
         return ret;
  }
@@ -1717,9 +1762,9 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
  
         local_irq_save(flags);
         rnp = rdp->mynode;
-       if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) &&
-            rdp->completed == ACCESS_ONCE(rnp->completed) &&
-            !unlikely(ACCESS_ONCE(rdp->gpwrap))) || /* w/out lock. */
+       if ((rdp->gpnum == READ_ONCE(rnp->gpnum) &&
+            rdp->completed == READ_ONCE(rnp->completed) &&
+            !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */
             !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
                 local_irq_restore(flags);
                 return;
@@ -1731,6 +1776,13 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
                 rcu_gp_kthread_wake(rsp);
  }
  
+static void rcu_gp_slow(struct rcu_state *rsp, int delay)
+{
+       if (delay > 0 &&
+           !(rsp->gpnum % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay)))
+               schedule_timeout_uninterruptible(delay);
+}
+
  /*
   * Initialize a new grace period.  Return 0 if no grace period required.
   */
@@ -1740,15 +1792,15 @@ static int rcu_gp_init(struct rcu_state *rsp)
         struct rcu_data *rdp;
         struct rcu_node *rnp = rcu_get_root(rsp);
  
-       ACCESS_ONCE(rsp->gp_activity) = jiffies;
+       WRITE_ONCE(rsp->gp_activity, jiffies);
         raw_spin_lock_irq(&rnp->lock);
         smp_mb__after_unlock_lock();
-       if (!ACCESS_ONCE(rsp->gp_flags)) {
+       if (!READ_ONCE(rsp->gp_flags)) {
                 /* Spurious wakeup, tell caller to go back to sleep.  */
                 raw_spin_unlock_irq(&rnp->lock);
                 return 0;
         }
-       ACCESS_ONCE(rsp->gp_flags) = 0; /* Clear all flags: New grace period. */
+       WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */
  
         if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) {
                 /*
@@ -1773,6 +1825,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
          * will handle subsequent offline CPUs.
          */
         rcu_for_each_leaf_node(rsp, rnp) {
+               rcu_gp_slow(rsp, gp_preinit_delay);
                 raw_spin_lock_irq(&rnp->lock);
                 smp_mb__after_unlock_lock();
                 if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
@@ -1829,14 +1882,15 @@ static int rcu_gp_init(struct rcu_state *rsp)
          * process finishes, because this kthread handles both.
          */
         rcu_for_each_node_breadth_first(rsp, rnp) {
+               rcu_gp_slow(rsp, gp_init_delay);
                 raw_spin_lock_irq(&rnp->lock);
                 smp_mb__after_unlock_lock();
                 rdp = this_cpu_ptr(rsp->rda);
                 rcu_preempt_check_blocked_tasks(rnp);
                 rnp->qsmask = rnp->qsmaskinit;
-               ACCESS_ONCE(rnp->gpnum) = rsp->gpnum;
+               WRITE_ONCE(rnp->gpnum, rsp->gpnum);
                 if (WARN_ON_ONCE(rnp->completed != rsp->completed))
-                       ACCESS_ONCE(rnp->completed) = rsp->completed;
+                       WRITE_ONCE(rnp->completed, rsp->completed);
                 if (rnp == rdp->mynode)
                         (void)__note_gp_changes(rsp, rnp, rdp);
                 rcu_preempt_boost_start_gp(rnp);
@@ -1845,10 +1899,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
                                             rnp->grphi, rnp->qsmask);
                 raw_spin_unlock_irq(&rnp->lock);
                 cond_resched_rcu_qs();
-               ACCESS_ONCE(rsp->gp_activity) = jiffies;
-               if (gp_init_delay > 0 &&
-                   !(rsp->gpnum % (rcu_num_nodes * PER_RCU_NODE_PERIOD)))
-                       schedule_timeout_uninterruptible(gp_init_delay);
+               WRITE_ONCE(rsp->gp_activity, jiffies);
         }
  
         return 1;
@@ -1864,7 +1915,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
         unsigned long maxj;
         struct rcu_node *rnp = rcu_get_root(rsp);
  
-       ACCESS_ONCE(rsp->gp_activity) = jiffies;
+       WRITE_ONCE(rsp->gp_activity, jiffies);
         rsp->n_force_qs++;
         if (fqs_state == RCU_SAVE_DYNTICK) {
                 /* Collect dyntick-idle snapshots. */
@@ -1882,11 +1933,11 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
                 force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
         }
         /* Clear flag to prevent immediate re-entry. */
-       if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
+       if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
                 raw_spin_lock_irq(&rnp->lock);
                 smp_mb__after_unlock_lock();
-               ACCESS_ONCE(rsp->gp_flags) =
-                       ACCESS_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS;
+               WRITE_ONCE(rsp->gp_flags,
+                          READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS);
                 raw_spin_unlock_irq(&rnp->lock);
         }
         return fqs_state;
@@ -1903,7 +1954,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
         struct rcu_data *rdp;
         struct rcu_node *rnp = rcu_get_root(rsp);
  
-       ACCESS_ONCE(rsp->gp_activity) = jiffies;
+       WRITE_ONCE(rsp->gp_activity, jiffies);
         raw_spin_lock_irq(&rnp->lock);
         smp_mb__after_unlock_lock();
         gp_duration = jiffies - rsp->gp_start;
@@ -1934,7 +1985,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
                 smp_mb__after_unlock_lock();
                 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
                 WARN_ON_ONCE(rnp->qsmask);
-               ACCESS_ONCE(rnp->completed) = rsp->gpnum;
+               WRITE_ONCE(rnp->completed, rsp->gpnum);
                 rdp = this_cpu_ptr(rsp->rda);
                 if (rnp == rdp->mynode)
                         needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;
@@ -1942,7 +1993,8 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
                 nocb += rcu_future_gp_cleanup(rsp, rnp);
                 raw_spin_unlock_irq(&rnp->lock);
                 cond_resched_rcu_qs();
-               ACCESS_ONCE(rsp->gp_activity) = jiffies;
+               WRITE_ONCE(rsp->gp_activity, jiffies);
+               rcu_gp_slow(rsp, gp_cleanup_delay);
         }
         rnp = rcu_get_root(rsp);
         raw_spin_lock_irq(&rnp->lock);
@@ -1950,16 +2002,16 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
         rcu_nocb_gp_set(rnp, nocb);
  
         /* Declare grace period done. */
-       ACCESS_ONCE(rsp->completed) = rsp->gpnum;
+       WRITE_ONCE(rsp->completed, rsp->gpnum);
         trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
         rsp->fqs_state = RCU_GP_IDLE;
         rdp = this_cpu_ptr(rsp->rda);
         /* Advance CBs to reduce false positives below. */
         needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp;
         if (needgp || cpu_needs_another_gp(rsp, rdp)) {
-               ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT;
+               WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT);
                 trace_rcu_grace_period(rsp->name,
-                                      ACCESS_ONCE(rsp->gpnum),
+                                      READ_ONCE(rsp->gpnum),
                                        TPS("newreq"));
         }
         raw_spin_unlock_irq(&rnp->lock);
@@ -1983,20 +2035,20 @@ static int __noreturn rcu_gp_kthread(void *arg)
                 /* Handle grace-period start. */
                 for (;;) {
                         trace_rcu_grace_period(rsp->name,
-                                              ACCESS_ONCE(rsp->gpnum),
+                                              READ_ONCE(rsp->gpnum),
                                                TPS("reqwait"));
                         rsp->gp_state = RCU_GP_WAIT_GPS;
                         wait_event_interruptible(rsp->gp_wq,
-                                                ACCESS_ONCE(rsp->gp_flags) &
+                                                READ_ONCE(rsp->gp_flags) &
                                                  RCU_GP_FLAG_INIT);
                         /* Locking provides needed memory barrier. */
                         if (rcu_gp_init(rsp))
                                 break;
                         cond_resched_rcu_qs();
-                       ACCESS_ONCE(rsp->gp_activity) = jiffies;
+                       WRITE_ONCE(rsp->gp_activity, jiffies);
                         WARN_ON(signal_pending(current));
                         trace_rcu_grace_period(rsp->name,
-                                              ACCESS_ONCE(rsp->gpnum),
+                                              READ_ONCE(rsp->gpnum),
                                                TPS("reqwaitsig"));
                 }
  
@@ -2012,39 +2064,39 @@ static int __noreturn rcu_gp_kthread(void *arg)
                         if (!ret)
                                 rsp->jiffies_force_qs = jiffies + j;
                         trace_rcu_grace_period(rsp->name,
-                                              ACCESS_ONCE(rsp->gpnum),
+                                              READ_ONCE(rsp->gpnum),
                                                TPS("fqswait"));
                         rsp->gp_state = RCU_GP_WAIT_FQS;
                         ret = wait_event_interruptible_timeout(rsp->gp_wq,
-                                       ((gf = ACCESS_ONCE(rsp->gp_flags)) &
+                                       ((gf = READ_ONCE(rsp->gp_flags)) &
                                          RCU_GP_FLAG_FQS) ||
-                                       (!ACCESS_ONCE(rnp->qsmask) &&
+                                       (!READ_ONCE(rnp->qsmask) &&
                                          !rcu_preempt_blocked_readers_cgp(rnp)),
                                         j);
                         /* Locking provides needed memory barriers. */
                         /* If grace period done, leave loop. */
-                       if (!ACCESS_ONCE(rnp->qsmask) &&
+                       if (!READ_ONCE(rnp->qsmask) &&
                             !rcu_preempt_blocked_readers_cgp(rnp))
                                 break;
                         /* If time for quiescent-state forcing, do it. */
                         if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) ||
                             (gf & RCU_GP_FLAG_FQS)) {
                                 trace_rcu_grace_period(rsp->name,
-                                                      ACCESS_ONCE(rsp->gpnum),
+                                                      READ_ONCE(rsp->gpnum),
                                                        TPS("fqsstart"));
                                 fqs_state = rcu_gp_fqs(rsp, fqs_state);
                                 trace_rcu_grace_period(rsp->name,
-                                                      ACCESS_ONCE(rsp->gpnum),
+                                                      READ_ONCE(rsp->gpnum),
                                                        TPS("fqsend"));
                                 cond_resched_rcu_qs();
-                               ACCESS_ONCE(rsp->gp_activity) = jiffies;
+                               WRITE_ONCE(rsp->gp_activity, jiffies);
                         } else {
                                 /* Deal with stray signal. */
                                 cond_resched_rcu_qs();
-                               ACCESS_ONCE(rsp->gp_activity) = jiffies;
+                               WRITE_ONCE(rsp->gp_activity, jiffies);
                                 WARN_ON(signal_pending(current));
                                 trace_rcu_grace_period(rsp->name,
-                                                      ACCESS_ONCE(rsp->gpnum),
+                                                      READ_ONCE(rsp->gpnum),
                                                        TPS("fqswaitsig"));
                         }
                         j = jiffies_till_next_fqs;
@@ -2086,8 +2138,8 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
                  */
                 return false;
         }
-       ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT;
-       trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum),
+       WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT);
+       trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum),
                                TPS("newreq"));
  
         /*
@@ -2137,6 +2189,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
         __releases(rcu_get_root(rsp)->lock)
  {
         WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
+       WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
         raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
         rcu_gp_kthread_wake(rsp);
  }
@@ -2334,8 +2387,6 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
         rcu_report_qs_rdp(rdp->cpu, rsp, rdp);
  }
  
-#ifdef CONFIG_HOTPLUG_CPU
-
  /*
   * Send the specified CPU's RCU callbacks to the orphanage.  The
   * specified CPU must be offline, and the caller must hold the
@@ -2346,7 +2397,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
                           struct rcu_node *rnp, struct rcu_data *rdp)
  {
         /* No-CBs CPUs do not have orphanable callbacks. */
-       if (rcu_is_nocb_cpu(rdp->cpu))
+       if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu))
                 return;
  
         /*
@@ -2359,7 +2410,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
                 rsp->qlen += rdp->qlen;
                 rdp->n_cbs_orphaned += rdp->qlen;
                 rdp->qlen_lazy = 0;
-               ACCESS_ONCE(rdp->qlen) = 0;
+               WRITE_ONCE(rdp->qlen, 0);
         }
  
         /*
@@ -2405,7 +2456,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
         struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
  
         /* No-CBs CPUs are handled specially. */
-       if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
+       if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
+           rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
                 return;
  
         /* Do the accounting first. */
@@ -2452,6 +2504,9 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
         RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda));
         RCU_TRACE(struct rcu_node *rnp = rdp->mynode);
  
+       if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+               return;
+
         RCU_TRACE(mask = rdp->grpmask);
         trace_rcu_grace_period(rsp->name,
                                rnp->gpnum + 1 - !!(rnp->qsmask & mask),
@@ -2480,7 +2535,8 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
         long mask;
         struct rcu_node *rnp = rnp_leaf;
  
-       if (rnp->qsmaskinit || rcu_preempt_has_tasks(rnp))
+       if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
+           rnp->qsmaskinit || rcu_preempt_has_tasks(rnp))
                 return;
         for (;;) {
                 mask = rnp->grpmask;
@@ -2511,6 +2567,9 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
         struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
         struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
  
+       if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+               return;
+
         /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
         mask = rdp->grpmask;
         raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -2532,6 +2591,9 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
         struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
         struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
  
+       if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+               return;
+
         /* Adjust any no-longer-needed kthreads. */
         rcu_boost_kthread_setaffinity(rnp, -1);
  
@@ -2546,26 +2608,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
                   cpu, rdp->qlen, rdp->nxtlist);
  }
  
-#else /* #ifdef CONFIG_HOTPLUG_CPU */
-
-static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
-{
-}
-
-static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
-{
-}
-
-static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
-{
-}
-
-static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
-{
-}
-
-#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
-
  /*
   * Invoke any RCU callbacks that have made it to the end of their grace
   * period.  Thottle as specified by rdp->blimit.
@@ -2580,7 +2622,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
         /* If no callbacks are ready, just return. */
         if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
                 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0);
-               trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
+               trace_rcu_batch_end(rsp->name, 0, !!READ_ONCE(rdp->nxtlist),
                                     need_resched(), is_idle_task(current),
                                     rcu_is_callbacks_kthread());
                 return;
@@ -2636,7 +2678,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
         }
         smp_mb(); /* List handling before counting for rcu_barrier(). */
         rdp->qlen_lazy -= count_lazy;
-       ACCESS_ONCE(rdp->qlen) = rdp->qlen - count;
+       WRITE_ONCE(rdp->qlen, rdp->qlen - count);
         rdp->n_cbs_invoked += count;
  
         /* Reinstate batch limit if we have worked down the excess. */
@@ -2730,10 +2772,6 @@ static void force_qs_rnp(struct rcu_state *rsp,
                 mask = 0;
                 raw_spin_lock_irqsave(&rnp->lock, flags);
                 smp_mb__after_unlock_lock();
-               if (!rcu_gp_in_progress(rsp)) {
-                       raw_spin_unlock_irqrestore(&rnp->lock, flags);
-                       return;
-               }
                 if (rnp->qsmask == 0) {
                         if (rcu_state_p == &rcu_sched_state ||
                             rsp != rcu_state_p ||
@@ -2763,8 +2801,6 @@ static void force_qs_rnp(struct rcu_state *rsp,
                 bit = 1;
                 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
                         if ((rnp->qsmask & bit) != 0) {
-                               if ((rnp->qsmaskinit & bit) == 0)
-                                       *isidle = false; /* Pending hotplug. */
                                 if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
                                         mask |= bit;
                         }
@@ -2793,7 +2829,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
         /* Funnel through hierarchy to reduce memory contention. */
         rnp = __this_cpu_read(rsp->rda->mynode);
         for (; rnp != NULL; rnp = rnp->parent) {
-               ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) ||
+               ret = (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) ||
                       !raw_spin_trylock(&rnp->fqslock);
                 if (rnp_old != NULL)
                         raw_spin_unlock(&rnp_old->fqslock);
@@ -2809,13 +2845,12 @@ static void force_quiescent_state(struct rcu_state *rsp)
         raw_spin_lock_irqsave(&rnp_old->lock, flags);
         smp_mb__after_unlock_lock();
         raw_spin_unlock(&rnp_old->fqslock);
-       if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
+       if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
                 rsp->n_force_qs_lh++;
                 raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
                 return;  /* Someone beat us to it. */
         }
-       ACCESS_ONCE(rsp->gp_flags) =
-               ACCESS_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS;
+       WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
         raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
         rcu_gp_kthread_wake(rsp);
  }
@@ -2881,7 +2916,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
   */
  static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
  {
-       if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active)))
+       if (unlikely(!READ_ONCE(rcu_scheduler_fully_active)))
                 return;
         if (likely(!rsp->boost)) {
                 rcu_do_batch(rsp, rdp);
@@ -2972,7 +3007,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
         WARN_ON_ONCE((unsigned long)head & 0x1); /* Misaligned rcu_head! */
         if (debug_rcu_head_queue(head)) {
                 /* Probable double call_rcu(), so leak the callback. */
-               ACCESS_ONCE(head->func) = rcu_leak_callback;
+               WRITE_ONCE(head->func, rcu_leak_callback);
                 WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n");
                 return;
         }
@@ -3011,7 +3046,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
                 if (!likely(rdp->nxtlist))
                         init_default_callback_list(rdp);
         }
-       ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1;
+       WRITE_ONCE(rdp->qlen, rdp->qlen + 1);
         if (lazy)
                 rdp->qlen_lazy++;
         else
@@ -3287,7 +3322,7 @@ void synchronize_sched_expedited(void)
         if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start),
                          (ulong)atomic_long_read(&rsp->expedited_done) +
                          ULONG_MAX / 8)) {
-               synchronize_sched();
+               wait_rcu_gp(call_rcu_sched);
                 atomic_long_inc(&rsp->expedited_wrap);
                 return;
         }
@@ -3450,14 +3485,14 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
         }
  
         /* Has another RCU grace period completed?  */
-       if (ACCESS_ONCE(rnp->completed) != rdp->completed) { /* outside lock */
+       if (READ_ONCE(rnp->completed) != rdp->completed) { /* outside lock */
                 rdp->n_rp_gp_completed++;
                 return 1;
         }
  
         /* Has a new RCU grace period started? */
-       if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum ||
-           unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* outside lock */
+       if (READ_ONCE(rnp->gpnum) != rdp->gpnum ||
+           unlikely(READ_ONCE(rdp->gpwrap))) { /* outside lock */
                 rdp->n_rp_gp_started++;
                 return 1;
         }
@@ -3493,7 +3528,7 @@ static int rcu_pending(void)
   * non-NULL, store an indication of whether all callbacks are lazy.
   * (If there are no callbacks, all of them are deemed to be lazy.)
   */
-static int __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
+static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
  {
         bool al = true;
         bool hc = false;
@@ -3564,7 +3599,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
  {
         int cpu;
         struct rcu_data *rdp;
-       unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done);
+       unsigned long snap = READ_ONCE(rsp->n_barrier_done);
         unsigned long snap_done;
  
         _rcu_barrier_trace(rsp, "Begin", -1, snap);
@@ -3606,10 +3641,10 @@ static void _rcu_barrier(struct rcu_state *rsp)
  
         /*
          * Increment ->n_barrier_done to avoid duplicate work.  Use
-        * ACCESS_ONCE() to prevent the compiler from speculating
+        * WRITE_ONCE() to prevent the compiler from speculating
          * the increment to precede the early-exit check.
          */
-       ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1;
+       WRITE_ONCE(rsp->n_barrier_done, rsp->n_barrier_done + 1);
         WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);
         _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);
         smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */
@@ -3645,7 +3680,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
                                 __call_rcu(&rdp->barrier_head,
                                            rcu_barrier_callback, rsp, cpu, 0);
                         }
-               } else if (ACCESS_ONCE(rdp->qlen)) {
+               } else if (READ_ONCE(rdp->qlen)) {
                         _rcu_barrier_trace(rsp, "OnlineQ", cpu,
                                            rsp->n_barrier_done);
                         smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
@@ -3665,7 +3700,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
  
         /* Increment ->n_barrier_done to prevent duplicate work. */
         smp_mb(); /* Keep increment after above mechanism. */
-       ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1;
+       WRITE_ONCE(rsp->n_barrier_done, rsp->n_barrier_done + 1);
         WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);
         _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);
         smp_mb(); /* Keep increment before caller's subsequent code. */
@@ -3780,7 +3815,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
         rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
         rdp->completed = rnp->completed;
         rdp->passed_quiesce = false;
-       rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
+       rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu);
         rdp->qs_pending = false;
         trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
         raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -3924,16 +3959,16 @@ void rcu_scheduler_starting(void)
  
  /*
   * Compute the per-level fanout, either using the exact fanout specified
- * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
+ * or balancing the tree, depending on the rcu_fanout_exact boot parameter.
   */
  static void __init rcu_init_levelspread(struct rcu_state *rsp)
  {
         int i;
  
-       if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT)) {
+       if (rcu_fanout_exact) {
                 rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
                 for (i = rcu_num_lvls - 2; i >= 0; i--)
-                       rsp->levelspread[i] = CONFIG_RCU_FANOUT;
+                       rsp->levelspread[i] = RCU_FANOUT;
         } else {
                 int ccur;
                 int cprv;
@@ -3971,9 +4006,9 @@ static void __init rcu_init_one(struct rcu_state *rsp,
  
         BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf));  /* Fix buf[] init! */
  
-       /* Silence gcc 4.8 warning about array index out of range. */
-       if (rcu_num_lvls > RCU_NUM_LVLS)
-               panic("rcu_init_one: rcu_num_lvls overflow");
+       /* Silence gcc 4.8 false positive about array index out of range. */
+       if (rcu_num_lvls <= 0 || rcu_num_lvls > RCU_NUM_LVLS)
+               panic("rcu_init_one: rcu_num_lvls out of range");
  
         /* Initialize the level-tracking arrays. */
  
@@ -4059,7 +4094,7 @@ static void __init rcu_init_geometry(void)
                 jiffies_till_next_fqs = d;
  
         /* If the compile-time values are accurate, just leave. */
-       if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF &&
+       if (rcu_fanout_leaf == RCU_FANOUT_LEAF &&
             nr_cpu_ids == NR_CPUS)
                 return;
         pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%d\n",
@@ -4073,7 +4108,7 @@ static void __init rcu_init_geometry(void)
         rcu_capacity[0] = 1;
         rcu_capacity[1] = rcu_fanout_leaf;
         for (i = 2; i <= MAX_RCU_LVLS; i++)
-               rcu_capacity[i] = rcu_capacity[i - 1] * CONFIG_RCU_FANOUT;
+               rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT;
  
         /*
          * The boot-time rcu_fanout_leaf parameter is only permitted
@@ -4083,7 +4118,7 @@ static void __init rcu_init_geometry(void)
          * the configured number of CPUs.  Complain and fall back to the
          * compile-time values if these limits are exceeded.
          */
-       if (rcu_fanout_leaf < CONFIG_RCU_FANOUT_LEAF ||
+       if (rcu_fanout_leaf < RCU_FANOUT_LEAF ||
             rcu_fanout_leaf > sizeof(unsigned long) * 8 ||
             n > rcu_capacity[MAX_RCU_LVLS]) {
                 WARN_ON(1);
@@ -4109,6 +4144,28 @@ static void __init rcu_init_geometry(void)
         rcu_num_nodes -= n;
  }
  
+/*
+ * Dump out the structure of the rcu_node combining tree associated
+ * with the rcu_state structure referenced by rsp.
+ */
+static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp)
+{
+       int level = 0;
+       struct rcu_node *rnp;
+
+       pr_info("rcu_node tree layout dump\n");
+       pr_info(" ");
+       rcu_for_each_node_breadth_first(rsp, rnp) {
+               if (rnp->level != level) {
+                       pr_cont("\n");
+                       pr_info(" ");
+                       level = rnp->level;
+               }
+               pr_cont("%d:%d ^%d  ", rnp->grplo, rnp->grphi, rnp->grpnum);
+       }
+       pr_cont("\n");
+}
+
  void __init rcu_init(void)
  {
         int cpu;
@@ -4119,6 +4176,8 @@ void __init rcu_init(void)
         rcu_init_geometry();
         rcu_init_one(&rcu_bh_state, &rcu_bh_data);
         rcu_init_one(&rcu_sched_state, &rcu_sched_data);
+       if (dump_tree)
+               rcu_dump_rcu_node_tree(&rcu_sched_state);
         __rcu_init_preempt();
         open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
  
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h

index a69d3dab2ec4dbf9dc8c412813c84c27363f9378..4adb7ca0bf47a209067c66205ace8b6f0dbebb61 100644 (file)
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -35,11 +35,33 @@
   * In practice, this did work well going from three levels to four.
   * Of course, your mileage may vary.
   */
+
  #define MAX_RCU_LVLS 4
-#define RCU_FANOUT_1         (CONFIG_RCU_FANOUT_LEAF)
-#define RCU_FANOUT_2         (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
-#define RCU_FANOUT_3         (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
-#define RCU_FANOUT_4         (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
+
+#ifdef CONFIG_RCU_FANOUT
+#define RCU_FANOUT CONFIG_RCU_FANOUT
+#else /* #ifdef CONFIG_RCU_FANOUT */
+# ifdef CONFIG_64BIT
+# define RCU_FANOUT 64
+# else
+# define RCU_FANOUT 32
+# endif
+#endif /* #else #ifdef CONFIG_RCU_FANOUT */
+
+#ifdef CONFIG_RCU_FANOUT_LEAF
+#define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF
+#else /* #ifdef CONFIG_RCU_FANOUT_LEAF */
+# ifdef CONFIG_64BIT
+# define RCU_FANOUT_LEAF 64
+# else
+# define RCU_FANOUT_LEAF 32
+# endif
+#endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */
+
+#define RCU_FANOUT_1         (RCU_FANOUT_LEAF)
+#define RCU_FANOUT_2         (RCU_FANOUT_1 * RCU_FANOUT)
+#define RCU_FANOUT_3         (RCU_FANOUT_2 * RCU_FANOUT)
+#define RCU_FANOUT_4         (RCU_FANOUT_3 * RCU_FANOUT)
  
  #if NR_CPUS <= RCU_FANOUT_1
  #  define RCU_NUM_LVLS       1
@@ -170,7 +192,6 @@ struct rcu_node {
                                 /*  if there is no such task.  If there */
                                 /*  is no current expedited grace period, */
                                 /*  then there can cannot be any such task. */
-#ifdef CONFIG_RCU_BOOST
         struct list_head *boost_tasks;
                                 /* Pointer to first task that needs to be */
                                 /*  priority boosted, or NULL if no priority */
@@ -208,7 +229,6 @@ struct rcu_node {
         unsigned long n_balk_nos;
                                 /* Refused to boost: not sure why, though. */
                                 /*  This can happen due to race conditions. */
-#endif /* #ifdef CONFIG_RCU_BOOST */
  #ifdef CONFIG_RCU_NOCB_CPU
         wait_queue_head_t nocb_gp_wq[2];
                                 /* Place for rcu_nocb_kthread() to wait GP. */
@@ -519,14 +539,11 @@ extern struct list_head rcu_struct_flavors;
   * RCU implementation internal declarations:
   */
  extern struct rcu_state rcu_sched_state;
-DECLARE_PER_CPU(struct rcu_data, rcu_sched_data);
  
  extern struct rcu_state rcu_bh_state;
-DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
  
  #ifdef CONFIG_PREEMPT_RCU
  extern struct rcu_state rcu_preempt_state;
-DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
  #endif /* #ifdef CONFIG_PREEMPT_RCU */
  
  #ifdef CONFIG_RCU_BOOST
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h

index 8c0ec0f5a02702f1a3c5ed5db0bdf346ac7ae140..32664347091a1a6b7e04e2bf6ae8128a3411fc42 100644 (file)
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -43,7 +43,17 @@ DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
  DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
  DEFINE_PER_CPU(char, rcu_cpu_has_work);
  
-#endif /* #ifdef CONFIG_RCU_BOOST */
+#else /* #ifdef CONFIG_RCU_BOOST */
+
+/*
+ * Some architectures do not define rt_mutexes, but if !CONFIG_RCU_BOOST,
+ * all uses are in dead code.  Provide a definition to keep the compiler
+ * happy, but add WARN_ON_ONCE() to complain if used in the wrong place.
+ * This probably needs to be excluded from -rt builds.
+ */
+#define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; })
+
+#endif /* #else #ifdef CONFIG_RCU_BOOST */
  
  #ifdef CONFIG_RCU_NOCB_CPU
  static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
@@ -60,11 +70,11 @@ static void __init rcu_bootup_announce_oddness(void)
  {
         if (IS_ENABLED(CONFIG_RCU_TRACE))
                 pr_info("\tRCU debugfs-based tracing is enabled.\n");
-       if ((IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) ||
-           (!IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32))
+       if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) ||
+           (!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32))
                 pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
-                      CONFIG_RCU_FANOUT);
-       if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT))
+                      RCU_FANOUT);
+       if (rcu_fanout_exact)
                 pr_info("\tHierarchical RCU autobalancing is disabled.\n");
         if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ))
                 pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
@@ -76,10 +86,10 @@ static void __init rcu_bootup_announce_oddness(void)
                 pr_info("\tAdditional per-CPU info printed with stalls.\n");
         if (NUM_RCU_LVL_4 != 0)
                 pr_info("\tFour-level hierarchy is enabled.\n");
-       if (CONFIG_RCU_FANOUT_LEAF != 16)
+       if (RCU_FANOUT_LEAF != 16)
                 pr_info("\tBuild-time adjustment of leaf fanout to %d.\n",
-                       CONFIG_RCU_FANOUT_LEAF);
-       if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF)
+                       RCU_FANOUT_LEAF);
+       if (rcu_fanout_leaf != RCU_FANOUT_LEAF)
                 pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
         if (nr_cpu_ids != NR_CPUS)
                 pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
@@ -90,7 +100,8 @@ static void __init rcu_bootup_announce_oddness(void)
  #ifdef CONFIG_PREEMPT_RCU
  
  RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
-static struct rcu_state *rcu_state_p = &rcu_preempt_state;
+static struct rcu_state *const rcu_state_p = &rcu_preempt_state;
+static struct rcu_data __percpu *const rcu_data_p = &rcu_preempt_data;
  
  static int rcu_preempted_readers_exp(struct rcu_node *rnp);
  static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
@@ -116,11 +127,11 @@ static void __init rcu_bootup_announce(void)
   */
  static void rcu_preempt_qs(void)
  {
-       if (!__this_cpu_read(rcu_preempt_data.passed_quiesce)) {
+       if (!__this_cpu_read(rcu_data_p->passed_quiesce)) {
                 trace_rcu_grace_period(TPS("rcu_preempt"),
-                                      __this_cpu_read(rcu_preempt_data.gpnum),
+                                      __this_cpu_read(rcu_data_p->gpnum),
                                        TPS("cpuqs"));
-               __this_cpu_write(rcu_preempt_data.passed_quiesce, 1);
+               __this_cpu_write(rcu_data_p->passed_quiesce, 1);
                 barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */
                 current->rcu_read_unlock_special.b.need_qs = false;
         }
@@ -150,7 +161,7 @@ static void rcu_preempt_note_context_switch(void)
             !t->rcu_read_unlock_special.b.blocked) {
  
                 /* Possibly blocking in an RCU read-side critical section. */
-               rdp = this_cpu_ptr(rcu_preempt_state.rda);
+               rdp = this_cpu_ptr(rcu_state_p->rda);
                 rnp = rdp->mynode;
                 raw_spin_lock_irqsave(&rnp->lock, flags);
                 smp_mb__after_unlock_lock();
@@ -180,10 +191,9 @@ static void rcu_preempt_note_context_switch(void)
                 if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
                         list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
                         rnp->gp_tasks = &t->rcu_node_entry;
-#ifdef CONFIG_RCU_BOOST
-                       if (rnp->boost_tasks != NULL)
+                       if (IS_ENABLED(CONFIG_RCU_BOOST) &&
+                           rnp->boost_tasks != NULL)
                                 rnp->boost_tasks = rnp->gp_tasks;
-#endif /* #ifdef CONFIG_RCU_BOOST */
                 } else {
                         list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
                         if (rnp->qsmask & rdp->grpmask)
@@ -263,9 +273,7 @@ void rcu_read_unlock_special(struct task_struct *t)
         bool empty_exp_now;
         unsigned long flags;
         struct list_head *np;
-#ifdef CONFIG_RCU_BOOST
         bool drop_boost_mutex = false;
-#endif /* #ifdef CONFIG_RCU_BOOST */
         struct rcu_node *rnp;
         union rcu_special special;
  
@@ -307,9 +315,11 @@ void rcu_read_unlock_special(struct task_struct *t)
                 t->rcu_read_unlock_special.b.blocked = false;
  
                 /*
-                * Remove this task from the list it blocked on.  The
-                * task can migrate while we acquire the lock, but at
-                * most one time.  So at most two passes through loop.
+                * Remove this task from the list it blocked on.  The task
+                * now remains queued on the rcu_node corresponding to
+                * the CPU it first blocked on, so the first attempt to
+                * acquire the task's rcu_node's ->lock will succeed.
+                * Keep the loop and add a WARN_ON() out of sheer paranoia.
                  */
                 for (;;) {
                         rnp = t->rcu_blocked_node;
@@ -317,6 +327,7 @@ void rcu_read_unlock_special(struct task_struct *t)
                         smp_mb__after_unlock_lock();
                         if (rnp == t->rcu_blocked_node)
                                 break;
+                       WARN_ON_ONCE(1);
                         raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
                 }
                 empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
@@ -331,12 +342,12 @@ void rcu_read_unlock_special(struct task_struct *t)
                         rnp->gp_tasks = np;
                 if (&t->rcu_node_entry == rnp->exp_tasks)
                         rnp->exp_tasks = np;
-#ifdef CONFIG_RCU_BOOST
-               if (&t->rcu_node_entry == rnp->boost_tasks)
-                       rnp->boost_tasks = np;
-               /* Snapshot ->boost_mtx ownership with rcu_node lock held. */
-               drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
-#endif /* #ifdef CONFIG_RCU_BOOST */
+               if (IS_ENABLED(CONFIG_RCU_BOOST)) {
+                       if (&t->rcu_node_entry == rnp->boost_tasks)
+                               rnp->boost_tasks = np;
+                       /* Snapshot ->boost_mtx ownership w/rnp->lock held. */
+                       drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
+               }
  
                 /*
                  * If this was the last task on the current list, and if
@@ -353,24 +364,21 @@ void rcu_read_unlock_special(struct task_struct *t)
                                                          rnp->grplo,
                                                          rnp->grphi,
                                                          !!rnp->gp_tasks);
-                       rcu_report_unblock_qs_rnp(&rcu_preempt_state,
-                                                 rnp, flags);
+                       rcu_report_unblock_qs_rnp(rcu_state_p, rnp, flags);
                 } else {
                         raw_spin_unlock_irqrestore(&rnp->lock, flags);
                 }
  
-#ifdef CONFIG_RCU_BOOST
                 /* Unboost if we were boosted. */
-               if (drop_boost_mutex)
+               if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
                         rt_mutex_unlock(&rnp->boost_mtx);
-#endif /* #ifdef CONFIG_RCU_BOOST */
  
                 /*
                  * If this was the last task on the expedited lists,
                  * then we need to report up the rcu_node hierarchy.
                  */
                 if (!empty_exp && empty_exp_now)
-                       rcu_report_exp_rnp(&rcu_preempt_state, rnp, true);
+                       rcu_report_exp_rnp(rcu_state_p, rnp, true);
         } else {
                 local_irq_restore(flags);
         }
@@ -390,7 +398,7 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
                 raw_spin_unlock_irqrestore(&rnp->lock, flags);
                 return;
         }
-       t = list_entry(rnp->gp_tasks,
+       t = list_entry(rnp->gp_tasks->prev,
                        struct task_struct, rcu_node_entry);
         list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
                 sched_show_task(t);
@@ -447,7 +455,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
         if (!rcu_preempt_blocked_readers_cgp(rnp))
                 return 0;
         rcu_print_task_stall_begin(rnp);
-       t = list_entry(rnp->gp_tasks,
+       t = list_entry(rnp->gp_tasks->prev,
                        struct task_struct, rcu_node_entry);
         list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
                 pr_cont(" P%d", t->pid);
@@ -491,8 +499,8 @@ static void rcu_preempt_check_callbacks(void)
                 return;
         }
         if (t->rcu_read_lock_nesting > 0 &&
-           __this_cpu_read(rcu_preempt_data.qs_pending) &&
-           !__this_cpu_read(rcu_preempt_data.passed_quiesce))
+           __this_cpu_read(rcu_data_p->qs_pending) &&
+           !__this_cpu_read(rcu_data_p->passed_quiesce))
                 t->rcu_read_unlock_special.b.need_qs = true;
  }
  
@@ -500,7 +508,7 @@ static void rcu_preempt_check_callbacks(void)
  
  static void rcu_preempt_do_callbacks(void)
  {
-       rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data));
+       rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p));
  }
  
  #endif /* #ifdef CONFIG_RCU_BOOST */
@@ -510,7 +518,7 @@ static void rcu_preempt_do_callbacks(void)
   */
  void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
  {
-       __call_rcu(head, func, &rcu_preempt_state, -1, 0);
+       __call_rcu(head, func, rcu_state_p, -1, 0);
  }
  EXPORT_SYMBOL_GPL(call_rcu);
  
@@ -570,7 +578,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp)
  static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
  {
         return !rcu_preempted_readers_exp(rnp) &&
-              ACCESS_ONCE(rnp->expmask) == 0;
+              READ_ONCE(rnp->expmask) == 0;
  }
  
  /*
@@ -711,12 +719,12 @@ sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp)
  void synchronize_rcu_expedited(void)
  {
         struct rcu_node *rnp;
-       struct rcu_state *rsp = &rcu_preempt_state;
+       struct rcu_state *rsp = rcu_state_p;
         unsigned long snap;
         int trycount = 0;
  
         smp_mb(); /* Caller's modifications seen first by other CPUs. */
-       snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1;
+       snap = READ_ONCE(sync_rcu_preempt_exp_count) + 1;
         smp_mb(); /* Above access cannot bleed into critical section. */
  
         /*
@@ -740,7 +748,7 @@ void synchronize_rcu_expedited(void)
          */
         while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
                 if (ULONG_CMP_LT(snap,
-                   ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
+                   READ_ONCE(sync_rcu_preempt_exp_count))) {
                         put_online_cpus();
                         goto mb_ret; /* Others did our work for us. */
                 }
@@ -752,7 +760,7 @@ void synchronize_rcu_expedited(void)
                         return;
                 }
         }
-       if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
+       if (ULONG_CMP_LT(snap, READ_ONCE(sync_rcu_preempt_exp_count))) {
                 put_online_cpus();
                 goto unlock_mb_ret; /* Others did our work for us. */
         }
@@ -780,8 +788,7 @@ void synchronize_rcu_expedited(void)
  
         /* Clean up and exit. */
         smp_mb(); /* ensure expedited GP seen before counter increment. */
-       ACCESS_ONCE(sync_rcu_preempt_exp_count) =
-                                       sync_rcu_preempt_exp_count + 1;
+       WRITE_ONCE(sync_rcu_preempt_exp_count, sync_rcu_preempt_exp_count + 1);
  unlock_mb_ret:
         mutex_unlock(&sync_rcu_preempt_exp_mutex);
  mb_ret:
@@ -799,7 +806,7 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
   */
  void rcu_barrier(void)
  {
-       _rcu_barrier(&rcu_preempt_state);
+       _rcu_barrier(rcu_state_p);
  }
  EXPORT_SYMBOL_GPL(rcu_barrier);
  
@@ -808,7 +815,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier);
   */
  static void __init __rcu_init_preempt(void)
  {
-       rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
+       rcu_init_one(rcu_state_p, rcu_data_p);
  }
  
  /*
@@ -831,7 +838,8 @@ void exit_rcu(void)
  
  #else /* #ifdef CONFIG_PREEMPT_RCU */
  
-static struct rcu_state *rcu_state_p = &rcu_sched_state;
+static struct rcu_state *const rcu_state_p = &rcu_sched_state;
+static struct rcu_data __percpu *const rcu_data_p = &rcu_sched_data;
  
  /*
   * Tell them what RCU they are running.
@@ -994,8 +1002,8 @@ static int rcu_boost(struct rcu_node *rnp)
         struct task_struct *t;
         struct list_head *tb;
  
-       if (ACCESS_ONCE(rnp->exp_tasks) == NULL &&
-           ACCESS_ONCE(rnp->boost_tasks) == NULL)
+       if (READ_ONCE(rnp->exp_tasks) == NULL &&
+           READ_ONCE(rnp->boost_tasks) == NULL)
                 return 0;  /* Nothing left to boost. */
  
         raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -1048,8 +1056,8 @@ static int rcu_boost(struct rcu_node *rnp)
         rt_mutex_lock(&rnp->boost_mtx);
         rt_mutex_unlock(&rnp->boost_mtx);  /* Then keep lockdep happy. */
  
-       return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
-              ACCESS_ONCE(rnp->boost_tasks) != NULL;
+       return READ_ONCE(rnp->exp_tasks) != NULL ||
+              READ_ONCE(rnp->boost_tasks) != NULL;
  }
  
  /*
@@ -1173,7 +1181,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
         struct sched_param sp;
         struct task_struct *t;
  
-       if (&rcu_preempt_state != rsp)
+       if (rcu_state_p != rsp)
                 return 0;
  
         if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0)
@@ -1367,13 +1375,12 @@ static void rcu_prepare_kthreads(int cpu)
   * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs
   * any flavor of RCU.
   */
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
  int rcu_needs_cpu(unsigned long *delta_jiffies)
  {
         *delta_jiffies = ULONG_MAX;
-       return rcu_cpu_has_callbacks(NULL);
+       return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)
+              ? 0 : rcu_cpu_has_callbacks(NULL);
  }
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
  
  /*
   * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
@@ -1462,7 +1469,7 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
                  * callbacks not yet ready to invoke.
                  */
                 if ((rdp->completed != rnp->completed ||
-                    unlikely(ACCESS_ONCE(rdp->gpwrap))) &&
+                    unlikely(READ_ONCE(rdp->gpwrap))) &&
                     rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
                         note_gp_changes(rsp, rdp);
  
@@ -1480,11 +1487,15 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
   *
   * The caller must have disabled interrupts.
   */
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
  int rcu_needs_cpu(unsigned long *dj)
  {
         struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
  
+       if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) {
+               *dj = ULONG_MAX;
+               return 0;
+       }
+
         /* Snapshot to detect later posting of non-lazy callback. */
         rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
  
@@ -1511,7 +1522,6 @@ int rcu_needs_cpu(unsigned long *dj)
         }
         return 0;
  }
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
  
  /*
   * Prepare a CPU for idle from an RCU perspective.  The first major task
@@ -1525,7 +1535,6 @@ int rcu_needs_cpu(unsigned long *dj)
   */
  static void rcu_prepare_for_idle(void)
  {
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
         bool needwake;
         struct rcu_data *rdp;
         struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
@@ -1533,8 +1542,11 @@ static void rcu_prepare_for_idle(void)
         struct rcu_state *rsp;
         int tne;
  
+       if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL))
+               return;
+
         /* Handle nohz enablement switches conservatively. */
-       tne = ACCESS_ONCE(tick_nohz_active);
+       tne = READ_ONCE(tick_nohz_active);
         if (tne != rdtp->tick_nohz_enabled_snap) {
                 if (rcu_cpu_has_callbacks(NULL))
                         invoke_rcu_core(); /* force nohz to see update. */
@@ -1580,7 +1592,6 @@ static void rcu_prepare_for_idle(void)
                 if (needwake)
                         rcu_gp_kthread_wake(rsp);
         }
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
  }
  
  /*
@@ -1590,12 +1601,11 @@ static void rcu_prepare_for_idle(void)
   */
  static void rcu_cleanup_after_idle(void)
  {
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
-       if (rcu_is_nocb_cpu(smp_processor_id()))
+       if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) ||
+           rcu_is_nocb_cpu(smp_processor_id()))
                 return;
         if (rcu_try_advance_all_cbs())
                 invoke_rcu_core();
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
  }
  
  /*
@@ -1760,7 +1770,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
                atomic_read(&rdtp->dynticks) & 0xfff,
                rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
                rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
-              ACCESS_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart,
+              READ_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart,
                fast_no_hz);
  }
  
@@ -1898,11 +1908,11 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
  {
         struct rcu_data *rdp_leader = rdp->nocb_leader;
  
-       if (!ACCESS_ONCE(rdp_leader->nocb_kthread))
+       if (!READ_ONCE(rdp_leader->nocb_kthread))
                 return;
-       if (ACCESS_ONCE(rdp_leader->nocb_leader_sleep) || force) {
+       if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
                 /* Prior smp_mb__after_atomic() orders against prior enqueue. */
-               ACCESS_ONCE(rdp_leader->nocb_leader_sleep) = false;
+               WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
                 wake_up(&rdp_leader->nocb_wq);
         }
  }
@@ -1934,14 +1944,14 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
         ret = atomic_long_read(&rdp->nocb_q_count);
  
  #ifdef CONFIG_PROVE_RCU
-       rhp = ACCESS_ONCE(rdp->nocb_head);
+       rhp = READ_ONCE(rdp->nocb_head);
         if (!rhp)
-               rhp = ACCESS_ONCE(rdp->nocb_gp_head);
+               rhp = READ_ONCE(rdp->nocb_gp_head);
         if (!rhp)
-               rhp = ACCESS_ONCE(rdp->nocb_follower_head);
+               rhp = READ_ONCE(rdp->nocb_follower_head);
  
         /* Having no rcuo kthread but CBs after scheduler starts is bad! */
-       if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp &&
+       if (!READ_ONCE(rdp->nocb_kthread) && rhp &&
             rcu_scheduler_fully_active) {
                 /* RCU callback enqueued before CPU first came online??? */
                 pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n",
@@ -1975,12 +1985,12 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
         atomic_long_add(rhcount, &rdp->nocb_q_count);
         /* rcu_barrier() relies on ->nocb_q_count add before xchg. */
         old_rhpp = xchg(&rdp->nocb_tail, rhtp);
-       ACCESS_ONCE(*old_rhpp) = rhp;
+       WRITE_ONCE(*old_rhpp, rhp);
         atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
         smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */
  
         /* If we are not being polled and there is a kthread, awaken it ... */
-       t = ACCESS_ONCE(rdp->nocb_kthread);
+       t = READ_ONCE(rdp->nocb_kthread);
         if (rcu_nocb_poll || !t) {
                 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
                                     TPS("WakeNotPoll"));
@@ -2118,7 +2128,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
         for (;;) {
                 wait_event_interruptible(
                         rnp->nocb_gp_wq[c & 0x1],
-                       (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c)));
+                       (d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c)));
                 if (likely(d))
                         break;
                 WARN_ON(signal_pending(current));
@@ -2145,7 +2155,7 @@ wait_again:
         if (!rcu_nocb_poll) {
                 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
                 wait_event_interruptible(my_rdp->nocb_wq,
-                               !ACCESS_ONCE(my_rdp->nocb_leader_sleep));
+                               !READ_ONCE(my_rdp->nocb_leader_sleep));
                 /* Memory barrier handled by smp_mb() calls below and repoll. */
         } else if (firsttime) {
                 firsttime = false; /* Don't drown trace log with "Poll"! */
@@ -2159,12 +2169,12 @@ wait_again:
          */
         gotcbs = false;
         for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
-               rdp->nocb_gp_head = ACCESS_ONCE(rdp->nocb_head);
+               rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head);
                 if (!rdp->nocb_gp_head)
                         continue;  /* No CBs here, try next follower. */
  
                 /* Move callbacks to wait-for-GP list, which is empty. */
-               ACCESS_ONCE(rdp->nocb_head) = NULL;
+               WRITE_ONCE(rdp->nocb_head, NULL);
                 rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
                 gotcbs = true;
         }
@@ -2184,7 +2194,7 @@ wait_again:
                 my_rdp->nocb_leader_sleep = true;
                 smp_mb();  /* Ensure _sleep true before scan. */
                 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower)
-                       if (ACCESS_ONCE(rdp->nocb_head)) {
+                       if (READ_ONCE(rdp->nocb_head)) {
                                 /* Found CB, so short-circuit next wait. */
                                 my_rdp->nocb_leader_sleep = false;
                                 break;
@@ -2205,7 +2215,7 @@ wait_again:
  
         /* Each pass through the following loop wakes a follower, if needed. */
         for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
-               if (ACCESS_ONCE(rdp->nocb_head))
+               if (READ_ONCE(rdp->nocb_head))
                         my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/
                 if (!rdp->nocb_gp_head)
                         continue; /* No CBs, so no need to wake follower. */
@@ -2241,7 +2251,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
                         trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
                                             "FollowerSleep");
                         wait_event_interruptible(rdp->nocb_wq,
-                                                ACCESS_ONCE(rdp->nocb_follower_head));
+                                                READ_ONCE(rdp->nocb_follower_head));
                 } else if (firsttime) {
                         /* Don't drown trace log with "Poll"! */
                         firsttime = false;
@@ -2282,10 +2292,10 @@ static int rcu_nocb_kthread(void *arg)
                         nocb_follower_wait(rdp);
  
                 /* Pull the ready-to-invoke callbacks onto local list. */
-               list = ACCESS_ONCE(rdp->nocb_follower_head);
+               list = READ_ONCE(rdp->nocb_follower_head);
                 BUG_ON(!list);
                 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty");
-               ACCESS_ONCE(rdp->nocb_follower_head) = NULL;
+               WRITE_ONCE(rdp->nocb_follower_head, NULL);
                 tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head);
  
                 /* Each pass through the following loop invokes a callback. */
@@ -2324,7 +2334,7 @@ static int rcu_nocb_kthread(void *arg)
  /* Is a deferred wakeup of rcu_nocb_kthread() required? */
  static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
  {
-       return ACCESS_ONCE(rdp->nocb_defer_wakeup);
+       return READ_ONCE(rdp->nocb_defer_wakeup);
  }
  
  /* Do a deferred wakeup of rcu_nocb_kthread(). */
@@ -2334,8 +2344,8 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
  
         if (!rcu_nocb_need_deferred_wakeup(rdp))
                 return;
-       ndw = ACCESS_ONCE(rdp->nocb_defer_wakeup);
-       ACCESS_ONCE(rdp->nocb_defer_wakeup) = RCU_NOGP_WAKE_NOT;
+       ndw = READ_ONCE(rdp->nocb_defer_wakeup);
+       WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_NOT);
         wake_nocb_leader(rdp, ndw == RCU_NOGP_WAKE_FORCE);
         trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake"));
  }
@@ -2448,7 +2458,7 @@ static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu)
         t = kthread_run(rcu_nocb_kthread, rdp_spawn,
                         "rcuo%c/%d", rsp->abbr, cpu);
         BUG_ON(IS_ERR(t));
-       ACCESS_ONCE(rdp_spawn->nocb_kthread) = t;
+       WRITE_ONCE(rdp_spawn->nocb_kthread, t);
  }
  
  /*
@@ -2663,7 +2673,7 @@ static void rcu_sysidle_enter(int irq)
  
         /* Record start of fully idle period. */
         j = jiffies;
-       ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j;
+       WRITE_ONCE(rdtp->dynticks_idle_jiffies, j);
         smp_mb__before_atomic();
         atomic_inc(&rdtp->dynticks_idle);
         smp_mb__after_atomic();
@@ -2681,7 +2691,7 @@ static void rcu_sysidle_enter(int irq)
   */
  void rcu_sysidle_force_exit(void)
  {
-       int oldstate = ACCESS_ONCE(full_sysidle_state);
+       int oldstate = READ_ONCE(full_sysidle_state);
         int newoldstate;
  
         /*
@@ -2794,7 +2804,7 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
         smp_mb(); /* Read counters before timestamps. */
  
         /* Pick up timestamps. */
-       j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies);
+       j = READ_ONCE(rdtp->dynticks_idle_jiffies);
         /* If this CPU entered idle more recently, update maxj timestamp. */
         if (ULONG_CMP_LT(*maxj, j))
                 *maxj = j;
@@ -2831,11 +2841,11 @@ static unsigned long rcu_sysidle_delay(void)
  static void rcu_sysidle(unsigned long j)
  {
         /* Check the current state. */
-       switch (ACCESS_ONCE(full_sysidle_state)) {
+       switch (READ_ONCE(full_sysidle_state)) {
         case RCU_SYSIDLE_NOT:
  
                 /* First time all are idle, so note a short idle period. */
-               ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT;
+               WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_SHORT);
                 break;
  
         case RCU_SYSIDLE_SHORT:
@@ -2873,7 +2883,7 @@ static void rcu_sysidle_cancel(void)
  {
         smp_mb();
         if (full_sysidle_state > RCU_SYSIDLE_SHORT)
-               ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT;
+               WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_NOT);
  }
  
  /*
@@ -2925,7 +2935,7 @@ static void rcu_sysidle_cb(struct rcu_head *rhp)
         smp_mb();  /* grace period precedes setting inuse. */
  
         rshp = container_of(rhp, struct rcu_sysidle_head, rh);
-       ACCESS_ONCE(rshp->inuse) = 0;
+       WRITE_ONCE(rshp->inuse, 0);
  }
  
  /*
@@ -2936,7 +2946,7 @@ static void rcu_sysidle_cb(struct rcu_head *rhp)
  bool rcu_sys_is_idle(void)
  {
         static struct rcu_sysidle_head rsh;
-       int rss = ACCESS_ONCE(full_sysidle_state);
+       int rss = READ_ONCE(full_sysidle_state);
  
         if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu))
                 return false;
@@ -2964,7 +2974,7 @@ bool rcu_sys_is_idle(void)
                         }
                         rcu_sysidle_report(rcu_state_p, isidle, maxj, false);
                         oldrss = rss;
-                       rss = ACCESS_ONCE(full_sysidle_state);
+                       rss = READ_ONCE(full_sysidle_state);
                 }
         }
  
@@ -3048,10 +3058,10 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
  #ifdef CONFIG_NO_HZ_FULL
         if (tick_nohz_full_cpu(smp_processor_id()) &&
             (!rcu_gp_in_progress(rsp) ||
-            ULONG_CMP_LT(jiffies, ACCESS_ONCE(rsp->gp_start) + HZ)))
-               return 1;
+            ULONG_CMP_LT(jiffies, READ_ONCE(rsp->gp_start) + HZ)))
+               return true;
  #endif /* #ifdef CONFIG_NO_HZ_FULL */
-       return 0;
+       return false;
  }
  
  /*
@@ -3077,7 +3087,7 @@ static void rcu_bind_gp_kthread(void)
  static void rcu_dynticks_task_enter(void)
  {
  #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
-       ACCESS_ONCE(current->rcu_tasks_idle_cpu) = smp_processor_id();
+       WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id());
  #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
  }
  
@@ -3085,6 +3095,6 @@ static void rcu_dynticks_task_enter(void)
  static void rcu_dynticks_task_exit(void)
  {
  #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
-       ACCESS_ONCE(current->rcu_tasks_idle_cpu) = -1;
+       WRITE_ONCE(current->rcu_tasks_idle_cpu, -1);
  #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
  }
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c

index f92361efd0f55d970d851604d34800e7d109ecc8..3ea7ffc7d5c4a75378899d87c805314864804b51 100644 (file)
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -277,7 +277,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
         seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
                    rsp->n_force_qs, rsp->n_force_qs_ngp,
                    rsp->n_force_qs - rsp->n_force_qs_ngp,
-                  ACCESS_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen);
+                  READ_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen);
         for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) {
                 if (rnp->level != level) {
                         seq_puts(m, "\n");
@@ -323,8 +323,8 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
         struct rcu_node *rnp = &rsp->node[0];
  
         raw_spin_lock_irqsave(&rnp->lock, flags);
-       completed = ACCESS_ONCE(rsp->completed);
-       gpnum = ACCESS_ONCE(rsp->gpnum);
+       completed = READ_ONCE(rsp->completed);
+       gpnum = READ_ONCE(rsp->gpnum);
         if (completed == gpnum)
                 gpage = 0;
         else
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c

index 1f133350da01e360bc6048b3a458e8b8cc0bdefc..afaecb7a799af235f63afb6877050cf348e4247c 100644 (file)
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -150,14 +150,14 @@ void __rcu_read_unlock(void)
                 barrier();  /* critical section before exit code. */
                 t->rcu_read_lock_nesting = INT_MIN;
                 barrier();  /* assign before ->rcu_read_unlock_special load */
-               if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special.s)))
+               if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s)))
                         rcu_read_unlock_special(t);
                 barrier();  /* ->rcu_read_unlock_special load before assign */
                 t->rcu_read_lock_nesting = 0;
         }
  #ifdef CONFIG_PROVE_LOCKING
         {
-               int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
+               int rrln = READ_ONCE(t->rcu_read_lock_nesting);
  
                 WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
         }
@@ -389,17 +389,17 @@ module_param(rcu_cpu_stall_timeout, int, 0644);
  
  int rcu_jiffies_till_stall_check(void)
  {
-       int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout);
+       int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout);
  
         /*
          * Limit check must be consistent with the Kconfig limits
          * for CONFIG_RCU_CPU_STALL_TIMEOUT.
          */
         if (till_stall_check < 3) {
-               ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
+               WRITE_ONCE(rcu_cpu_stall_timeout, 3);
                 till_stall_check = 3;
         } else if (till_stall_check > 300) {
-               ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
+               WRITE_ONCE(rcu_cpu_stall_timeout, 300);
                 till_stall_check = 300;
         }
         return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
@@ -550,12 +550,12 @@ static void check_holdout_task(struct task_struct *t,
  {
         int cpu;
  
-       if (!ACCESS_ONCE(t->rcu_tasks_holdout) ||
-           t->rcu_tasks_nvcsw != ACCESS_ONCE(t->nvcsw) ||
-           !ACCESS_ONCE(t->on_rq) ||
+       if (!READ_ONCE(t->rcu_tasks_holdout) ||
+           t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) ||
+           !READ_ONCE(t->on_rq) ||
             (IS_ENABLED(CONFIG_NO_HZ_FULL) &&
              !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) {
-               ACCESS_ONCE(t->rcu_tasks_holdout) = false;
+               WRITE_ONCE(t->rcu_tasks_holdout, false);
                 list_del_init(&t->rcu_tasks_holdout_list);
                 put_task_struct(t);
                 return;
@@ -639,11 +639,11 @@ static int __noreturn rcu_tasks_kthread(void *arg)
                  */
                 rcu_read_lock();
                 for_each_process_thread(g, t) {
-                       if (t != current && ACCESS_ONCE(t->on_rq) &&
+                       if (t != current && READ_ONCE(t->on_rq) &&
                             !is_idle_task(t)) {
                                 get_task_struct(t);
-                               t->rcu_tasks_nvcsw = ACCESS_ONCE(t->nvcsw);
-                               ACCESS_ONCE(t->rcu_tasks_holdout) = true;
+                               t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw);
+                               WRITE_ONCE(t->rcu_tasks_holdout, true);
                                 list_add(&t->rcu_tasks_holdout_list,
                                          &rcu_tasks_holdouts);
                         }
@@ -672,7 +672,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
                         struct task_struct *t1;
  
                         schedule_timeout_interruptible(HZ);
-                       rtst = ACCESS_ONCE(rcu_task_stall_timeout);
+                       rtst = READ_ONCE(rcu_task_stall_timeout);
                         needreport = rtst > 0 &&
                                      time_after(jiffies, lastreport + rtst);
                         if (needreport)
@@ -728,7 +728,7 @@ static void rcu_spawn_tasks_kthread(void)
         static struct task_struct *rcu_tasks_kthread_ptr;
         struct task_struct *t;
  
-       if (ACCESS_ONCE(rcu_tasks_kthread_ptr)) {
+       if (READ_ONCE(rcu_tasks_kthread_ptr)) {
                 smp_mb(); /* Ensure caller sees full kthread. */
                 return;
         }
@@ -740,7 +740,7 @@ static void rcu_spawn_tasks_kthread(void)
         t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread");
         BUG_ON(IS_ERR(t));
         smp_mb(); /* Ensure others see full kthread. */
-       ACCESS_ONCE(rcu_tasks_kthread_ptr) = t;
+       WRITE_ONCE(rcu_tasks_kthread_ptr, t);
         mutex_unlock(&rcu_tasks_kthread_mutex);
  }
  
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile

index 46be8702487561cd88a7895fea8c6401d72e9ce6..67687973ce80d63d3f52698fb4b738b76964b896 100644 (file)
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
  CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
  endif
  
-obj-y += core.o proc.o clock.o cputime.o
+obj-y += core.o loadavg.o clock.o cputime.o
  obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
  obj-y += wait.o completion.o idle.o
  obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c

index eae160dd669d9d8d58bb911595c6391b2732edb4..750ed601ddf78e6dcdc5f10818c98b34b5feea3a 100644 (file)
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -1,5 +1,3 @@
-#ifdef CONFIG_SCHED_AUTOGROUP
-
  #include "sched.h"
  
  #include <linux/proc_fs.h>
@@ -141,7 +139,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
  
         p->signal->autogroup = autogroup_kref_get(ag);
  
-       if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
+       if (!READ_ONCE(sysctl_sched_autogroup_enabled))
                 goto out;
  
         for_each_thread(p, t)
@@ -249,5 +247,3 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen)
         return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
  }
  #endif /* CONFIG_SCHED_DEBUG */
-
-#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h

index 8bd047142816dea81894bb27ccc3c78a38ac3d61..890c95f2587a4d8c530c1a5df69eef8a65e5eaf7 100644 (file)
--- a/kernel/sched/auto_group.h
+++ b/kernel/sched/auto_group.h
@@ -29,7 +29,7 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
  static inline struct task_group *
  autogroup_task_group(struct task_struct *p, struct task_group *tg)
  {
-       int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
+       int enabled = READ_ONCE(sysctl_sched_autogroup_enabled);
  
         if (enabled && task_wants_autogroup(p, tg))
                 return p->signal->autogroup->tg;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 123673291ffbb160734ed889b934d557611a1cf1..f89ca9bcf42a5f582e2c276dc2ef21338b8bd402 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -511,7 +511,7 @@ static bool set_nr_and_not_polling(struct task_struct *p)
  static bool set_nr_if_polling(struct task_struct *p)
  {
         struct thread_info *ti = task_thread_info(p);
-       typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
+       typeof(ti->flags) old, val = READ_ONCE(ti->flags);
  
         for (;;) {
                 if (!(val & _TIF_POLLING_NRFLAG))
@@ -541,6 +541,52 @@ static bool set_nr_if_polling(struct task_struct *p)
  #endif
  #endif
  
+void wake_q_add(struct wake_q_head *head, struct task_struct *task)
+{
+       struct wake_q_node *node = &task->wake_q;
+
+       /*
+        * Atomically grab the task. If ->wake_q.next is already non-NULL,
+        * the task is already queued (either by us or someone else) and
+        * will get the wakeup due to that.
+        *
+        * This cmpxchg() implies a full barrier, which pairs with the write
+        * barrier implied by the wakeup in wake_up_q().
+        */
+       if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
+               return;
+
+       get_task_struct(task);
+
+       /*
+        * The head is context local, there can be no concurrency.
+        */
+       *head->lastp = node;
+       head->lastp = &node->next;
+}
+
+void wake_up_q(struct wake_q_head *head)
+{
+       struct wake_q_node *node = head->first;
+
+       while (node != WAKE_Q_TAIL) {
+               struct task_struct *task;
+
+               task = container_of(node, struct task_struct, wake_q);
+               BUG_ON(!task);
+               /* task can safely be re-inserted now */
+               node = node->next;
+               task->wake_q.next = NULL;
+
+               /*
+                * wake_up_process() implies a wmb() to pair with the queueing
+                * in wake_q_add() so as not to miss wakeups.
+                */
+               wake_up_process(task);
+               put_task_struct(task);
+       }
+}
+
  /*
   * resched_curr - mark rq's current task 'to be rescheduled now'.
   *
@@ -1049,7 +1095,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
                 if (p->sched_class->migrate_task_rq)
                         p->sched_class->migrate_task_rq(p, new_cpu);
                 p->se.nr_migrations++;
-               perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
+               perf_event_task_migrate(p);
         }
  
         __set_task_cpu(p, new_cpu);
@@ -2105,12 +2151,15 @@ void wake_up_new_task(struct task_struct *p)
  
  #ifdef CONFIG_PREEMPT_NOTIFIERS
  
+static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE;
+
  /**
   * preempt_notifier_register - tell me when current is being preempted & rescheduled
   * @notifier: notifier struct to register
   */
  void preempt_notifier_register(struct preempt_notifier *notifier)
  {
+       static_key_slow_inc(&preempt_notifier_key);
         hlist_add_head(&notifier->link, &current->preempt_notifiers);
  }
  EXPORT_SYMBOL_GPL(preempt_notifier_register);
@@ -2119,15 +2168,16 @@ EXPORT_SYMBOL_GPL(preempt_notifier_register);
   * preempt_notifier_unregister - no longer interested in preemption notifications
   * @notifier: notifier struct to unregister
   *
- * This is safe to call from within a preemption notifier.
+ * This is *not* safe to call from within a preemption notifier.
   */
  void preempt_notifier_unregister(struct preempt_notifier *notifier)
  {
         hlist_del(&notifier->link);
+       static_key_slow_dec(&preempt_notifier_key);
  }
  EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
  
-static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
  {
         struct preempt_notifier *notifier;
  
@@ -2135,9 +2185,15 @@ static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
                 notifier->ops->sched_in(notifier, raw_smp_processor_id());
  }
  
+static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+       if (static_key_false(&preempt_notifier_key))
+               __fire_sched_in_preempt_notifiers(curr);
+}
+
  static void
-fire_sched_out_preempt_notifiers(struct task_struct *curr,
-                                struct task_struct *next)
+__fire_sched_out_preempt_notifiers(struct task_struct *curr,
+                                  struct task_struct *next)
  {
         struct preempt_notifier *notifier;
  
@@ -2145,13 +2201,21 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
                 notifier->ops->sched_out(notifier, next);
  }
  
+static __always_inline void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+                                struct task_struct *next)
+{
+       if (static_key_false(&preempt_notifier_key))
+               __fire_sched_out_preempt_notifiers(curr, next);
+}
+
  #else /* !CONFIG_PREEMPT_NOTIFIERS */
  
-static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
  {
  }
  
-static void
+static inline void
  fire_sched_out_preempt_notifiers(struct task_struct *curr,
                                  struct task_struct *next)
  {
@@ -2397,9 +2461,9 @@ unsigned long nr_iowait_cpu(int cpu)
  
  void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
  {
-       struct rq *this = this_rq();
-       *nr_waiters = atomic_read(&this->nr_iowait);
-       *load = this->cpu_load[0];
+       struct rq *rq = this_rq();
+       *nr_waiters = atomic_read(&rq->nr_iowait);
+       *load = rq->load.weight;
  }
  
  #ifdef CONFIG_SMP
@@ -2497,6 +2561,7 @@ void scheduler_tick(void)
         update_rq_clock(rq);
         curr->sched_class->task_tick(rq, curr, 0);
         update_cpu_load_active(rq);
+       calc_global_load_tick(rq);
         raw_spin_unlock(&rq->lock);
  
         perf_event_task_tick();
@@ -2525,7 +2590,7 @@ void scheduler_tick(void)
  u64 scheduler_tick_max_deferment(void)
  {
         struct rq *rq = this_rq();
-       unsigned long next, now = ACCESS_ONCE(jiffies);
+       unsigned long next, now = READ_ONCE(jiffies);
  
         next = rq->last_sched_tick + HZ;
  
@@ -2726,9 +2791,7 @@ again:
   *          - return from syscall or exception to user-space
   *          - return from interrupt-handler to user-space
   *
- * WARNING: all callers must re-check need_resched() afterward and reschedule
- * accordingly in case an event triggered the need for rescheduling (such as
- * an interrupt waking up a task) while preemption was disabled in __schedule().
+ * WARNING: must be called with preemption disabled!
   */
  static void __sched __schedule(void)
  {
@@ -2737,7 +2800,6 @@ static void __sched __schedule(void)
         struct rq *rq;
         int cpu;
  
-       preempt_disable();
         cpu = smp_processor_id();
         rq = cpu_rq(cpu);
         rcu_note_context_switch();
@@ -2801,8 +2863,6 @@ static void __sched __schedule(void)
                 raw_spin_unlock_irq(&rq->lock);
  
         post_schedule(rq);
-
-       sched_preempt_enable_no_resched();
  }
  
  static inline void sched_submit_work(struct task_struct *tsk)
@@ -2823,7 +2883,9 @@ asmlinkage __visible void __sched schedule(void)
  
         sched_submit_work(tsk);
         do {
+               preempt_disable();
                 __schedule();
+               sched_preempt_enable_no_resched();
         } while (need_resched());
  }
  EXPORT_SYMBOL(schedule);
@@ -2862,15 +2924,14 @@ void __sched schedule_preempt_disabled(void)
  static void __sched notrace preempt_schedule_common(void)
  {
         do {
-               __preempt_count_add(PREEMPT_ACTIVE);
+               preempt_active_enter();
                 __schedule();
-               __preempt_count_sub(PREEMPT_ACTIVE);
+               preempt_active_exit();
  
                 /*
                  * Check again in case we missed a preemption opportunity
                  * between schedule and now.
                  */
-               barrier();
         } while (need_resched());
  }
  
@@ -2894,9 +2955,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
  NOKPROBE_SYMBOL(preempt_schedule);
  EXPORT_SYMBOL(preempt_schedule);
  
-#ifdef CONFIG_CONTEXT_TRACKING
  /**
- * preempt_schedule_context - preempt_schedule called by tracing
+ * preempt_schedule_notrace - preempt_schedule called by tracing
   *
   * The tracing infrastructure uses preempt_enable_notrace to prevent
   * recursion and tracing preempt enabling caused by the tracing
@@ -2909,7 +2969,7 @@ EXPORT_SYMBOL(preempt_schedule);
   * instead of preempt_schedule() to exit user context if needed before
   * calling the scheduler.
   */
-asmlinkage __visible void __sched notrace preempt_schedule_context(void)
+asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
  {
         enum ctx_state prev_ctx;
  
@@ -2917,7 +2977,13 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void)
                 return;
  
         do {
-               __preempt_count_add(PREEMPT_ACTIVE);
+               /*
+                * Use raw __preempt_count() ops that don't call functions.
+                * We can't call functions before disabling preemption, which
+                * disarms preemption tracing recursions.
+                */
+               __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
+               barrier();
                 /*
                  * Needs preempt disabled in case user_exit() is traced
                  * and the tracer calls preempt_enable_notrace() causing
@@ -2927,12 +2993,11 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void)
                 __schedule();
                 exception_exit(prev_ctx);
  
-               __preempt_count_sub(PREEMPT_ACTIVE);
                 barrier();
+               __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
         } while (need_resched());
  }
-EXPORT_SYMBOL_GPL(preempt_schedule_context);
-#endif /* CONFIG_CONTEXT_TRACKING */
+EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
  
  #endif /* CONFIG_PREEMPT */
  
@@ -2952,17 +3017,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
         prev_state = exception_enter();
  
         do {
-               __preempt_count_add(PREEMPT_ACTIVE);
+               preempt_active_enter();
                 local_irq_enable();
                 __schedule();
                 local_irq_disable();
-               __preempt_count_sub(PREEMPT_ACTIVE);
-
-               /*
-                * Check again in case we missed a preemption opportunity
-                * between schedule and now.
-                */
-               barrier();
+               preempt_active_exit();
         } while (need_resched());
  
         exception_exit(prev_state);
@@ -3040,7 +3099,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
                 if (!dl_prio(p->normal_prio) ||
                     (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
                         p->dl.dl_boosted = 1;
-                       p->dl.dl_throttled = 0;
                         enqueue_flag = ENQUEUE_REPLENISH;
                 } else
                         p->dl.dl_boosted = 0;
@@ -5314,7 +5372,7 @@ static struct notifier_block migration_notifier = {
         .priority = CPU_PRI_MIGRATION,
  };
  
-static void __cpuinit set_cpu_rq_start_time(void)
+static void set_cpu_rq_start_time(void)
  {
         int cpu = smp_processor_id();
         struct rq *rq = cpu_rq(cpu);
@@ -7734,11 +7792,11 @@ static long sched_group_rt_runtime(struct task_group *tg)
         return rt_runtime_us;
  }
  
-static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
+static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
  {
         u64 rt_runtime, rt_period;
  
-       rt_period = (u64)rt_period_us * NSEC_PER_USEC;
+       rt_period = rt_period_us * NSEC_PER_USEC;
         rt_runtime = tg->rt_bandwidth.rt_runtime;
  
         return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c

index 8394b1ee600c38ba6e9144a6326369b6ef0cdacd..f5a64ffad176f12b01381cb1dc2e25a05f02508d 100644 (file)
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -567,7 +567,7 @@ static void cputime_advance(cputime_t *counter, cputime_t new)
  {
         cputime_t old;
  
-       while (new > (old = ACCESS_ONCE(*counter)))
+       while (new > (old = READ_ONCE(*counter)))
                 cmpxchg_cputime(counter, old, new);
  }
  
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c

index 5e95145088fd37b3d07ccac66c3cd58f7effe10a..392e8fb94db36ef32aad026510d3ebfe3d89f6ef 100644 (file)
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -640,7 +640,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
  }
  
  static
-int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
+int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
  {
         return (dl_se->runtime <= 0);
  }
@@ -684,7 +684,7 @@ static void update_curr_dl(struct rq *rq)
         sched_rt_avg_update(rq, delta_exec);
  
         dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
-       if (dl_runtime_exceeded(rq, dl_se)) {
+       if (dl_runtime_exceeded(dl_se)) {
                 dl_se->dl_throttled = 1;
                 __dequeue_task_dl(rq, curr, 0);
                 if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted)))
@@ -995,7 +995,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
         rq = cpu_rq(cpu);
  
         rcu_read_lock();
-       curr = ACCESS_ONCE(rq->curr); /* unlocked access */
+       curr = READ_ONCE(rq->curr); /* unlocked access */
  
         /*
          * If we are dealing with a -deadline task, we must
@@ -1012,7 +1012,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
             (p->nr_cpus_allowed > 1)) {
                 int target = find_later_rq(p);
  
-               if (target != -1)
+               if (target != -1 &&
+                               dl_time_before(p->dl.deadline,
+                                       cpu_rq(target)->dl.earliest_dl.curr))
                         cpu = target;
         }
         rcu_read_unlock();
@@ -1230,6 +1232,32 @@ next_node:
         return NULL;
  }
  
+/*
+ * Return the earliest pushable rq's task, which is suitable to be executed
+ * on the CPU, NULL otherwise:
+ */
+static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu)
+{
+       struct rb_node *next_node = rq->dl.pushable_dl_tasks_leftmost;
+       struct task_struct *p = NULL;
+
+       if (!has_pushable_dl_tasks(rq))
+               return NULL;
+
+next_node:
+       if (next_node) {
+               p = rb_entry(next_node, struct task_struct, pushable_dl_tasks);
+
+               if (pick_dl_task(rq, p, cpu))
+                       return p;
+
+               next_node = rb_next(next_node);
+               goto next_node;
+       }
+
+       return NULL;
+}
+
  static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
  
  static int find_later_rq(struct task_struct *task)
@@ -1333,6 +1361,17 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
  
                 later_rq = cpu_rq(cpu);
  
+               if (!dl_time_before(task->dl.deadline,
+                                       later_rq->dl.earliest_dl.curr)) {
+                       /*
+                        * Target rq has tasks of equal or earlier deadline,
+                        * retrying does not release any lock and is unlikely
+                        * to yield a different result.
+                        */
+                       later_rq = NULL;
+                       break;
+               }
+
                 /* Retry if something changed. */
                 if (double_lock_balance(rq, later_rq)) {
                         if (unlikely(task_rq(task) != rq ||
@@ -1514,7 +1553,7 @@ static int pull_dl_task(struct rq *this_rq)
                 if (src_rq->dl.dl_nr_running <= 1)
                         goto skip;
  
-               p = pick_next_earliest_dl_task(src_rq, this_cpu);
+               p = pick_earliest_pushable_dl_task(src_rq, this_cpu);
  
                 /*
                  * We found a task to be pulled if:
@@ -1659,7 +1698,7 @@ static void rq_offline_dl(struct rq *rq)
         cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
  }
  
-void init_sched_dl_class(void)
+void __init init_sched_dl_class(void)
  {
         unsigned int i;
  
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c

index a245c1fc6f0a610f17e2d13635306d681e2ef821..704683cc90422d096d8a591326db9a880574ed23 100644 (file)
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -132,12 +132,14 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
                 p->prio);
  #ifdef CONFIG_SCHEDSTATS
         SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
-               SPLIT_NS(p->se.vruntime),
+               SPLIT_NS(p->se.statistics.wait_sum),
                 SPLIT_NS(p->se.sum_exec_runtime),
                 SPLIT_NS(p->se.statistics.sum_sleep_runtime));
  #else
-       SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
-               0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
+       SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
+               0LL, 0L,
+               SPLIT_NS(p->se.sum_exec_runtime),
+               0LL, 0L);
  #endif
  #ifdef CONFIG_NUMA_BALANCING
         SEQ_printf(m, " %d", task_node(p));
@@ -156,7 +158,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
         SEQ_printf(m,
         "\nrunnable tasks:\n"
         "            task   PID         tree-key  switches  prio"
-       "     exec-runtime         sum-exec        sum-sleep\n"
+       "     wait-time             sum-exec        sum-sleep\n"
         "------------------------------------------------------"
         "----------------------------------------------------\n");
  
@@ -582,6 +584,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
         nr_switches = p->nvcsw + p->nivcsw;
  
  #ifdef CONFIG_SCHEDSTATS
+       PN(se.statistics.sum_sleep_runtime);
         PN(se.statistics.wait_start);
         PN(se.statistics.sleep_start);
         PN(se.statistics.block_start);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index ffeaa4105e48a36105ecaea8967082e1e7a7af98..433061d984eac6ce5322714ab29bd3beb148eaf4 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -141,9 +141,9 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w)
   *
   * This idea comes from the SD scheduler of Con Kolivas:
   */
-static int get_update_sysctl_factor(void)
+static unsigned int get_update_sysctl_factor(void)
  {
-       unsigned int cpus = min_t(int, num_online_cpus(), 8);
+       unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
         unsigned int factor;
  
         switch (sysctl_sched_tunable_scaling) {
@@ -576,7 +576,7 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
                 loff_t *ppos)
  {
         int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-       int factor = get_update_sysctl_factor();
+       unsigned int factor = get_update_sysctl_factor();
  
         if (ret || !write)
                 return ret;
@@ -834,7 +834,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
  
  static unsigned int task_scan_min(struct task_struct *p)
  {
-       unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);
+       unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
         unsigned int scan, floor;
         unsigned int windows = 1;
  
@@ -1198,11 +1198,9 @@ static void task_numa_assign(struct task_numa_env *env,
  static bool load_too_imbalanced(long src_load, long dst_load,
                                 struct task_numa_env *env)
  {
+       long imb, old_imb;
+       long orig_src_load, orig_dst_load;
         long src_capacity, dst_capacity;
-       long orig_src_load;
-       long load_a, load_b;
-       long moved_load;
-       long imb;
  
         /*
          * The load is corrected for the CPU capacity available on each node.
@@ -1215,39 +1213,30 @@ static bool load_too_imbalanced(long src_load, long dst_load,
         dst_capacity = env->dst_stats.compute_capacity;
  
         /* We care about the slope of the imbalance, not the direction. */
-       load_a = dst_load;
-       load_b = src_load;
-       if (load_a < load_b)
-               swap(load_a, load_b);
+       if (dst_load < src_load)
+               swap(dst_load, src_load);
  
         /* Is the difference below the threshold? */
-       imb = load_a * src_capacity * 100 -
-               load_b * dst_capacity * env->imbalance_pct;
+       imb = dst_load * src_capacity * 100 -
+             src_load * dst_capacity * env->imbalance_pct;
         if (imb <= 0)
                 return false;
  
         /*
          * The imbalance is above the allowed threshold.
-        * Allow a move that brings us closer to a balanced situation,
-        * without moving things past the point of balance.
+        * Compare it with the old imbalance.
          */
         orig_src_load = env->src_stats.load;
+       orig_dst_load = env->dst_stats.load;
  
-       /*
-        * In a task swap, there will be one load moving from src to dst,
-        * and another moving back. This is the net sum of both moves.
-        * A simple task move will always have a positive value.
-        * Allow the move if it brings the system closer to a balanced
-        * situation, without crossing over the balance point.
-        */
-       moved_load = orig_src_load - src_load;
+       if (orig_dst_load < orig_src_load)
+               swap(orig_dst_load, orig_src_load);
  
-       if (moved_load > 0)
-               /* Moving src -> dst. Did we overshoot balance? */
-               return src_load * dst_capacity < dst_load * src_capacity;
-       else
-               /* Moving dst -> src. Did we overshoot balance? */
-               return dst_load * src_capacity < src_load * dst_capacity;
+       old_imb = orig_dst_load * src_capacity * 100 -
+                 orig_src_load * dst_capacity * env->imbalance_pct;
+
+       /* Would this change make things worse? */
+       return (imb > old_imb);
  }
  
  /*
@@ -1409,6 +1398,30 @@ static void task_numa_find_cpu(struct task_numa_env *env,
         }
  }
  
+/* Only move tasks to a NUMA node less busy than the current node. */
+static bool numa_has_capacity(struct task_numa_env *env)
+{
+       struct numa_stats *src = &env->src_stats;
+       struct numa_stats *dst = &env->dst_stats;
+
+       if (src->has_free_capacity && !dst->has_free_capacity)
+               return false;
+
+       /*
+        * Only consider a task move if the source has a higher load
+        * than the destination, corrected for CPU capacity on each node.
+        *
+        *      src->load                dst->load
+        * --------------------- vs ---------------------
+        * src->compute_capacity    dst->compute_capacity
+        */
+       if (src->load * dst->compute_capacity >
+           dst->load * src->compute_capacity)
+               return true;
+
+       return false;
+}
+
  static int task_numa_migrate(struct task_struct *p)
  {
         struct task_numa_env env = {
@@ -1463,7 +1476,8 @@ static int task_numa_migrate(struct task_struct *p)
         update_numa_stats(&env.dst_stats, env.dst_nid);
  
         /* Try to find a spot on the preferred nid. */
-       task_numa_find_cpu(&env, taskimp, groupimp);
+       if (numa_has_capacity(&env))
+               task_numa_find_cpu(&env, taskimp, groupimp);
  
         /*
          * Look at other nodes in these cases:
@@ -1494,7 +1508,8 @@ static int task_numa_migrate(struct task_struct *p)
                         env.dist = dist;
                         env.dst_nid = nid;
                         update_numa_stats(&env.dst_stats, env.dst_nid);
-                       task_numa_find_cpu(&env, taskimp, groupimp);
+                       if (numa_has_capacity(&env))
+                               task_numa_find_cpu(&env, taskimp, groupimp);
                 }
         }
  
@@ -1794,7 +1809,12 @@ static void task_numa_placement(struct task_struct *p)
         u64 runtime, period;
         spinlock_t *group_lock = NULL;
  
-       seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+       /*
+        * The p->mm->numa_scan_seq field gets updated without
+        * exclusive access. Use READ_ONCE() here to ensure
+        * that the field is read in a single access:
+        */
+       seq = READ_ONCE(p->mm->numa_scan_seq);
         if (p->numa_scan_seq == seq)
                 return;
         p->numa_scan_seq = seq;
@@ -1938,7 +1958,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
         }
  
         rcu_read_lock();
-       tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
+       tsk = READ_ONCE(cpu_rq(cpu)->curr);
  
         if (!cpupid_match_pid(tsk, cpupid))
                 goto no_join;
@@ -2107,7 +2127,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
  
  static void reset_ptenuma_scan(struct task_struct *p)
  {
-       ACCESS_ONCE(p->mm->numa_scan_seq)++;
+       /*
+        * We only did a read acquisition of the mmap sem, so
+        * p->mm->numa_scan_seq is written to without exclusive access
+        * and the update is not guaranteed to be atomic. That's not
+        * much of an issue though, since this is just used for
+        * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
+        * expensive, to avoid any form of compiler optimizations:
+        */
+       WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
         p->mm->numa_scan_offset = 0;
  }
  
@@ -2181,7 +2209,7 @@ void task_numa_work(struct callback_head *work)
         }
         for (; vma; vma = vma->vm_next) {
                 if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
-                       is_vm_hugetlb_page(vma)) {
+                       is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
                         continue;
                 }
  
@@ -4323,6 +4351,189 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
  }
  
  #ifdef CONFIG_SMP
+
+/*
+ * per rq 'load' array crap; XXX kill this.
+ */
+
+/*
+ * The exact cpuload at various idx values, calculated at every tick would be
+ * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
+ *
+ * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
+ * on nth tick when cpu may be busy, then we have:
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
+ *
+ * decay_load_missed() below does efficient calculation of
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
+ *
+ * The calculation is approximated on a 128 point scale.
+ * degrade_zero_ticks is the number of ticks after which load at any
+ * particular idx is approximated to be zero.
+ * degrade_factor is a precomputed table, a row for each load idx.
+ * Each column corresponds to degradation factor for a power of two ticks,
+ * based on 128 point scale.
+ * Example:
+ * row 2, col 3 (=12) says that the degradation at load idx 2 after
+ * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
+ *
+ * With this power of 2 load factors, we can degrade the load n times
+ * by looking at 1 bits in n and doing as many mult/shift instead of
+ * n mult/shifts needed by the exact degradation.
+ */
+#define DEGRADE_SHIFT          7
+static const unsigned char
+               degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
+static const unsigned char
+               degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
+                                       {0, 0, 0, 0, 0, 0, 0, 0},
+                                       {64, 32, 8, 0, 0, 0, 0, 0},
+                                       {96, 72, 40, 12, 1, 0, 0},
+                                       {112, 98, 75, 43, 15, 1, 0},
+                                       {120, 112, 98, 76, 45, 16, 2} };
+
+/*
+ * Update cpu_load for any missed ticks, due to tickless idle. The backlog
+ * would be when CPU is idle and so we just decay the old load without
+ * adding any new load.
+ */
+static unsigned long
+decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
+{
+       int j = 0;
+
+       if (!missed_updates)
+               return load;
+
+       if (missed_updates >= degrade_zero_ticks[idx])
+               return 0;
+
+       if (idx == 1)
+               return load >> missed_updates;
+
+       while (missed_updates) {
+               if (missed_updates % 2)
+                       load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
+
+               missed_updates >>= 1;
+               j++;
+       }
+       return load;
+}
+
+/*
+ * Update rq->cpu_load[] statistics. This function is usually called every
+ * scheduler tick (TICK_NSEC). With tickless idle this will not be called
+ * every tick. We fix it up based on jiffies.
+ */
+static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
+                             unsigned long pending_updates)
+{
+       int i, scale;
+
+       this_rq->nr_load_updates++;
+
+       /* Update our load: */
+       this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
+       for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+               unsigned long old_load, new_load;
+
+               /* scale is effectively 1 << i now, and >> i divides by scale */
+
+               old_load = this_rq->cpu_load[i];
+               old_load = decay_load_missed(old_load, pending_updates - 1, i);
+               new_load = this_load;
+               /*
+                * Round up the averaging division if load is increasing. This
+                * prevents us from getting stuck on 9 if the load is 10, for
+                * example.
+                */
+               if (new_load > old_load)
+                       new_load += scale - 1;
+
+               this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
+       }
+
+       sched_avg_update(this_rq);
+}
+
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * There is no sane way to deal with nohz on smp when using jiffies because the
+ * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
+ * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
+ *
+ * Therefore we cannot use the delta approach from the regular tick since that
+ * would seriously skew the load calculation. However we'll make do for those
+ * updates happening while idle (nohz_idle_balance) or coming out of idle
+ * (tick_nohz_idle_exit).
+ *
+ * This means we might still be one tick off for nohz periods.
+ */
+
+/*
+ * Called from nohz_idle_balance() to update the load ratings before doing the
+ * idle balance.
+ */
+static void update_idle_cpu_load(struct rq *this_rq)
+{
+       unsigned long curr_jiffies = READ_ONCE(jiffies);
+       unsigned long load = this_rq->cfs.runnable_load_avg;
+       unsigned long pending_updates;
+
+       /*
+        * bail if there's load or we're actually up-to-date.
+        */
+       if (load || curr_jiffies == this_rq->last_load_update_tick)
+               return;
+
+       pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+       this_rq->last_load_update_tick = curr_jiffies;
+
+       __update_cpu_load(this_rq, load, pending_updates);
+}
+
+/*
+ * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
+ */
+void update_cpu_load_nohz(void)
+{
+       struct rq *this_rq = this_rq();
+       unsigned long curr_jiffies = READ_ONCE(jiffies);
+       unsigned long pending_updates;
+
+       if (curr_jiffies == this_rq->last_load_update_tick)
+               return;
+
+       raw_spin_lock(&this_rq->lock);
+       pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+       if (pending_updates) {
+               this_rq->last_load_update_tick = curr_jiffies;
+               /*
+                * We were idle, this means load 0, the current load might be
+                * !0 due to remote wakeups and the sort.
+                */
+               __update_cpu_load(this_rq, 0, pending_updates);
+       }
+       raw_spin_unlock(&this_rq->lock);
+}
+#endif /* CONFIG_NO_HZ_COMMON */
+
+/*
+ * Called from scheduler_tick()
+ */
+void update_cpu_load_active(struct rq *this_rq)
+{
+       unsigned long load = this_rq->cfs.runnable_load_avg;
+       /*
+        * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
+        */
+       this_rq->last_load_update_tick = jiffies;
+       __update_cpu_load(this_rq, load, 1);
+}
+
  /* Used instead of source_load when we know the type == 0 */
  static unsigned long weighted_cpuload(const int cpu)
  {
@@ -4375,7 +4586,7 @@ static unsigned long capacity_orig_of(int cpu)
  static unsigned long cpu_avg_load_per_task(int cpu)
  {
         struct rq *rq = cpu_rq(cpu);
-       unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running);
+       unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
         unsigned long load_avg = rq->cfs.runnable_load_avg;
  
         if (nr_running)
@@ -5126,18 +5337,21 @@ again:
                  * entity, update_curr() will update its vruntime, otherwise
                  * forget we've ever seen it.
                  */
-               if (curr && curr->on_rq)
-                       update_curr(cfs_rq);
-               else
-                       curr = NULL;
+               if (curr) {
+                       if (curr->on_rq)
+                               update_curr(cfs_rq);
+                       else
+                               curr = NULL;
  
-               /*
-                * This call to check_cfs_rq_runtime() will do the throttle and
-                * dequeue its entity in the parent(s). Therefore the 'simple'
-                * nr_running test will indeed be correct.
-                */
-               if (unlikely(check_cfs_rq_runtime(cfs_rq)))
-                       goto simple;
+                       /*
+                        * This call to check_cfs_rq_runtime() will do the
+                        * throttle and dequeue its entity in the parent(s).
+                        * Therefore the 'simple' nr_running test will indeed
+                        * be correct.
+                        */
+                       if (unlikely(check_cfs_rq_runtime(cfs_rq)))
+                               goto simple;
+               }
  
                 se = pick_next_entity(cfs_rq, curr);
                 cfs_rq = group_cfs_rq(se);
@@ -5467,10 +5681,15 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
  }
  
  #ifdef CONFIG_NUMA_BALANCING
-/* Returns true if the destination node has incurred more faults */
+/*
+ * Returns true if the destination node is the preferred node.
+ * Needs to match fbq_classify_rq(): if there is a runnable task
+ * that is not on its preferred node, we should identify it.
+ */
  static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
  {
         struct numa_group *numa_group = rcu_dereference(p->numa_group);
+       unsigned long src_faults, dst_faults;
         int src_nid, dst_nid;
  
         if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
@@ -5484,29 +5703,30 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
         if (src_nid == dst_nid)
                 return false;
  
-       if (numa_group) {
-               /* Task is already in the group's interleave set. */
-               if (node_isset(src_nid, numa_group->active_nodes))
-                       return false;
-
-               /* Task is moving into the group's interleave set. */
-               if (node_isset(dst_nid, numa_group->active_nodes))
-                       return true;
-
-               return group_faults(p, dst_nid) > group_faults(p, src_nid);
-       }
-
         /* Encourage migration to the preferred node. */
         if (dst_nid == p->numa_preferred_nid)
                 return true;
  
-       return task_faults(p, dst_nid) > task_faults(p, src_nid);
+       /* Migrating away from the preferred node is bad. */
+       if (src_nid == p->numa_preferred_nid)
+               return false;
+
+       if (numa_group) {
+               src_faults = group_faults(p, src_nid);
+               dst_faults = group_faults(p, dst_nid);
+       } else {
+               src_faults = task_faults(p, src_nid);
+               dst_faults = task_faults(p, dst_nid);
+       }
+
+       return dst_faults > src_faults;
  }
  
  
  static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
  {
         struct numa_group *numa_group = rcu_dereference(p->numa_group);
+       unsigned long src_faults, dst_faults;
         int src_nid, dst_nid;
  
         if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
@@ -5521,23 +5741,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
         if (src_nid == dst_nid)
                 return false;
  
-       if (numa_group) {
-               /* Task is moving within/into the group's interleave set. */
-               if (node_isset(dst_nid, numa_group->active_nodes))
-                       return false;
+       /* Migrating away from the preferred node is bad. */
+       if (src_nid == p->numa_preferred_nid)
+               return true;
  
-               /* Task is moving out of the group's interleave set. */
-               if (node_isset(src_nid, numa_group->active_nodes))
-                       return true;
+       /* Encourage migration to the preferred node. */
+       if (dst_nid == p->numa_preferred_nid)
+               return false;
  
-               return group_faults(p, dst_nid) < group_faults(p, src_nid);
+       if (numa_group) {
+               src_faults = group_faults(p, src_nid);
+               dst_faults = group_faults(p, dst_nid);
+       } else {
+               src_faults = task_faults(p, src_nid);
+               dst_faults = task_faults(p, dst_nid);
         }
  
-       /* Migrating away from the preferred node is always bad. */
-       if (src_nid == p->numa_preferred_nid)
-               return true;
-
-       return task_faults(p, dst_nid) < task_faults(p, src_nid);
+       return dst_faults < src_faults;
  }
  
  #else
@@ -6037,8 +6257,8 @@ static unsigned long scale_rt_capacity(int cpu)
          * Since we're reading these variables without serialization make sure
          * we read them once before doing sanity checks on them.
          */
-       age_stamp = ACCESS_ONCE(rq->age_stamp);
-       avg = ACCESS_ONCE(rq->rt_avg);
+       age_stamp = READ_ONCE(rq->age_stamp);
+       avg = READ_ONCE(rq->rt_avg);
         delta = __rq_clock_broken(rq) - age_stamp;
  
         if (unlikely(delta < 0))
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c

new file mode 100644 (file)

index 0000000..ef71590
--- /dev/null
+++ b/kernel/sched/loadavg.c
@@ -0,0 +1,394 @@
+/*
+ * kernel/sched/loadavg.c
+ *
+ * This file contains the magic bits required to compute the global loadavg
+ * figure. It's a silly number but people think it's important. We go through
+ * great pains to make it work on big machines and tickless kernels.
+ */
+
+#include <linux/export.h>
+
+#include "sched.h"
+
+/*
+ * Global load-average calculations
+ *
+ * We take a distributed and async approach to calculating the global load-avg
+ * in order to minimize overhead.
+ *
+ * The global load average is an exponentially decaying average of nr_running +
+ * nr_uninterruptible.
+ *
+ * Once every LOAD_FREQ:
+ *
+ *   nr_active = 0;
+ *   for_each_possible_cpu(cpu)
+ *     nr_active += cpu_rq(cpu)->nr_running + cpu_rq(cpu)->nr_uninterruptible;
+ *
+ *   avenrun[n] = avenrun[n] * exp_n + nr_active * (1 - exp_n)
+ *
+ * Due to a number of reasons the above turns into the mess below:
+ *
+ *  - for_each_possible_cpu() is prohibitively expensive on machines with
+ *    serious number of cpus, therefore we need to take a distributed approach
+ *    to calculating nr_active.
+ *
+ *        \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
+ *                      = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
+ *
+ *    So assuming nr_active := 0 when we start out -- true per definition, we
+ *    can simply take per-cpu deltas and fold those into a global accumulate
+ *    to obtain the same result. See calc_load_fold_active().
+ *
+ *    Furthermore, in order to avoid synchronizing all per-cpu delta folding
+ *    across the machine, we assume 10 ticks is sufficient time for every
+ *    cpu to have completed this task.
+ *
+ *    This places an upper-bound on the IRQ-off latency of the machine. Then
+ *    again, being late doesn't lose the delta, just wrecks the sample.
+ *
+ *  - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
+ *    this would add another cross-cpu cacheline miss and atomic operation
+ *    to the wakeup path. Instead we increment on whatever cpu the task ran
+ *    when it went into uninterruptible state and decrement on whatever cpu
+ *    did the wakeup. This means that only the sum of nr_uninterruptible over
+ *    all cpus yields the correct result.
+ *
+ *  This covers the NO_HZ=n code, for extra head-aches, see the comment below.
+ */
+
+/* Variables and functions for calc_load */
+atomic_long_t calc_load_tasks;
+unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun); /* should be removed */
+
+/**
+ * get_avenrun - get the load average array
+ * @loads:     pointer to dest load array
+ * @offset:    offset to add
+ * @shift:     shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+       loads[0] = (avenrun[0] + offset) << shift;
+       loads[1] = (avenrun[1] + offset) << shift;
+       loads[2] = (avenrun[2] + offset) << shift;
+}
+
+long calc_load_fold_active(struct rq *this_rq)
+{
+       long nr_active, delta = 0;
+
+       nr_active = this_rq->nr_running;
+       nr_active += (long)this_rq->nr_uninterruptible;
+
+       if (nr_active != this_rq->calc_load_active) {
+               delta = nr_active - this_rq->calc_load_active;
+               this_rq->calc_load_active = nr_active;
+       }
+
+       return delta;
+}
+
+/*
+ * a1 = a0 * e + a * (1 - e)
+ */
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+       load *= exp;
+       load += active * (FIXED_1 - exp);
+       load += 1UL << (FSHIFT - 1);
+       return load >> FSHIFT;
+}
+
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * Handle NO_HZ for the global load-average.
+ *
+ * Since the above described distributed algorithm to compute the global
+ * load-average relies on per-cpu sampling from the tick, it is affected by
+ * NO_HZ.
+ *
+ * The basic idea is to fold the nr_active delta into a global idle-delta upon
+ * entering NO_HZ state such that we can include this as an 'extra' cpu delta
+ * when we read the global state.
+ *
+ * Obviously reality has to ruin such a delightfully simple scheme:
+ *
+ *  - When we go NO_HZ idle during the window, we can negate our sample
+ *    contribution, causing under-accounting.
+ *
+ *    We avoid this by keeping two idle-delta counters and flipping them
+ *    when the window starts, thus separating old and new NO_HZ load.
+ *
+ *    The only trick is the slight shift in index flip for read vs write.
+ *
+ *        0s            5s            10s           15s
+ *          +10           +10           +10           +10
+ *        |-|-----------|-|-----------|-|-----------|-|
+ *    r:0 0 1           1 0           0 1           1 0
+ *    w:0 1 1           0 0           1 1           0 0
+ *
+ *    This ensures we'll fold the old idle contribution in this window while
+ *    accumulating the new one.
+ *
+ *  - When we wake up from NO_HZ idle during the window, we push up our
+ *    contribution, since we effectively move our sample point to a known
+ *    busy state.
+ *
+ *    This is solved by pushing the window forward, and thus skipping the
+ *    sample, for this cpu (effectively using the idle-delta for this cpu which
+ *    was in effect at the time the window opened). This also solves the issue
+ *    of having to deal with a cpu having been in NOHZ idle for multiple
+ *    LOAD_FREQ intervals.
+ *
+ * When making the ILB scale, we should try to pull this in as well.
+ */
+static atomic_long_t calc_load_idle[2];
+static int calc_load_idx;
+
+static inline int calc_load_write_idx(void)
+{
+       int idx = calc_load_idx;
+
+       /*
+        * See calc_global_nohz(), if we observe the new index, we also
+        * need to observe the new update time.
+        */
+       smp_rmb();
+
+       /*
+        * If the folding window started, make sure we start writing in the
+        * next idle-delta.
+        */
+       if (!time_before(jiffies, calc_load_update))
+               idx++;
+
+       return idx & 1;
+}
+
+static inline int calc_load_read_idx(void)
+{
+       return calc_load_idx & 1;
+}
+
+void calc_load_enter_idle(void)
+{
+       struct rq *this_rq = this_rq();
+       long delta;
+
+       /*
+        * We're going into NOHZ mode, if there's any pending delta, fold it
+        * into the pending idle delta.
+        */
+       delta = calc_load_fold_active(this_rq);
+       if (delta) {
+               int idx = calc_load_write_idx();
+
+               atomic_long_add(delta, &calc_load_idle[idx]);
+       }
+}
+
+void calc_load_exit_idle(void)
+{
+       struct rq *this_rq = this_rq();
+
+       /*
+        * If we're still before the sample window, we're done.
+        */
+       if (time_before(jiffies, this_rq->calc_load_update))
+               return;
+
+       /*
+        * We woke inside or after the sample window, this means we're already
+        * accounted through the nohz accounting, so skip the entire deal and
+        * sync up for the next window.
+        */
+       this_rq->calc_load_update = calc_load_update;
+       if (time_before(jiffies, this_rq->calc_load_update + 10))
+               this_rq->calc_load_update += LOAD_FREQ;
+}
+
+static long calc_load_fold_idle(void)
+{
+       int idx = calc_load_read_idx();
+       long delta = 0;
+
+       if (atomic_long_read(&calc_load_idle[idx]))
+               delta = atomic_long_xchg(&calc_load_idle[idx], 0);
+
+       return delta;
+}
+
+/**
+ * fixed_power_int - compute: x^n, in O(log n) time
+ *
+ * @x:         base of the power
+ * @frac_bits: fractional bits of @x
+ * @n:         power to raise @x to.
+ *
+ * By exploiting the relation between the definition of the natural power
+ * function: x^n := x*x*...*x (x multiplied by itself for n times), and
+ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
+ * (where: n_i \elem {0, 1}, the binary vector representing n),
+ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
+ * of course trivially computable in O(log_2 n), the length of our binary
+ * vector.
+ */
+static unsigned long
+fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
+{
+       unsigned long result = 1UL << frac_bits;
+
+       if (n) {
+               for (;;) {
+                       if (n & 1) {
+                               result *= x;
+                               result += 1UL << (frac_bits - 1);
+                               result >>= frac_bits;
+                       }
+                       n >>= 1;
+                       if (!n)
+                               break;
+                       x *= x;
+                       x += 1UL << (frac_bits - 1);
+                       x >>= frac_bits;
+               }
+       }
+
+       return result;
+}
+
+/*
+ * a1 = a0 * e + a * (1 - e)
+ *
+ * a2 = a1 * e + a * (1 - e)
+ *    = (a0 * e + a * (1 - e)) * e + a * (1 - e)
+ *    = a0 * e^2 + a * (1 - e) * (1 + e)
+ *
+ * a3 = a2 * e + a * (1 - e)
+ *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
+ *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
+ *
+ *  ...
+ *
+ * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^(n-1)) [1]
+ *    = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
+ *    = a0 * e^n + a * (1 - e^n)
+ *
+ * [1] application of the geometric series:
+ *
+ *              n         1 - x^(n+1)
+ *     S_n := \Sum x^i = -------------
+ *             i=0          1 - x
+ */
+static unsigned long
+calc_load_n(unsigned long load, unsigned long exp,
+           unsigned long active, unsigned int n)
+{
+       return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
+}
+
+/*
+ * NO_HZ can leave us missing all per-cpu ticks calling
+ * calc_global_load_tick(), but since an idle CPU folds its delta into
+ * calc_load_idle[] per calc_load_enter_idle(), all we need to do is fold
+ * in the pending idle delta if our idle period crossed a load cycle boundary.
+ *
+ * Once we've updated the global active value, we need to apply the exponential
+ * weights adjusted to the number of cycles missed.
+ */
+static void calc_global_nohz(void)
+{
+       long delta, active, n;
+
+       if (!time_before(jiffies, calc_load_update + 10)) {
+               /*
+                * Catch-up, fold however many we are behind still
+                */
+               delta = jiffies - calc_load_update - 10;
+               n = 1 + (delta / LOAD_FREQ);
+
+               active = atomic_long_read(&calc_load_tasks);
+               active = active > 0 ? active * FIXED_1 : 0;
+
+               avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+               avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+               avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+
+               calc_load_update += n * LOAD_FREQ;
+       }
+
+       /*
+        * Flip the idle index...
+        *
+        * Make sure we first write the new time then flip the index, so that
+        * calc_load_write_idx() will see the new time when it reads the new
+        * index, this avoids a double flip messing things up.
+        */
+       smp_wmb();
+       calc_load_idx++;
+}
+#else /* !CONFIG_NO_HZ_COMMON */
+
+static inline long calc_load_fold_idle(void) { return 0; }
+static inline void calc_global_nohz(void) { }
+
+#endif /* CONFIG_NO_HZ_COMMON */
+
+/*
+ * calc_global_load - update the avenrun load estimates 10 ticks after the
+ * CPUs have updated calc_load_tasks.
+ *
+ * Called from the global timer code.
+ */
+void calc_global_load(unsigned long ticks)
+{
+       long active, delta;
+
+       if (time_before(jiffies, calc_load_update + 10))
+               return;
+
+       /*
+        * Fold the 'old' idle-delta to include all NO_HZ cpus.
+        */
+       delta = calc_load_fold_idle();
+       if (delta)
+               atomic_long_add(delta, &calc_load_tasks);
+
+       active = atomic_long_read(&calc_load_tasks);
+       active = active > 0 ? active * FIXED_1 : 0;
+
+       avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+       avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+       avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+
+       calc_load_update += LOAD_FREQ;
+
+       /*
+        * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
+        */
+       calc_global_nohz();
+}
+
+/*
+ * Called from scheduler_tick() to periodically update this CPU's
+ * active count.
+ */
+void calc_global_load_tick(struct rq *this_rq)
+{
+       long delta;
+
+       if (time_before(jiffies, this_rq->calc_load_update))
+               return;
+
+       delta  = calc_load_fold_active(this_rq);
+       if (delta)
+               atomic_long_add(delta, &calc_load_tasks);
+
+       this_rq->calc_load_update += LOAD_FREQ;
+}
diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c

deleted file mode 100644 (file)

index 8ecd552..0000000
--- a/kernel/sched/proc.c
+++ /dev/null
@@ -1,584 +0,0 @@
-/*
- *  kernel/sched/proc.c
- *
- *  Kernel load calculations, forked from sched/core.c
- */
-
-#include <linux/export.h>
-
-#include "sched.h"
-
-/*
- * Global load-average calculations
- *
- * We take a distributed and async approach to calculating the global load-avg
- * in order to minimize overhead.
- *
- * The global load average is an exponentially decaying average of nr_running +
- * nr_uninterruptible.
- *
- * Once every LOAD_FREQ:
- *
- *   nr_active = 0;
- *   for_each_possible_cpu(cpu)
- *     nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
- *
- *   avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
- *
- * Due to a number of reasons the above turns in the mess below:
- *
- *  - for_each_possible_cpu() is prohibitively expensive on machines with
- *    serious number of cpus, therefore we need to take a distributed approach
- *    to calculating nr_active.
- *
- *        \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
- *                      = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
- *
- *    So assuming nr_active := 0 when we start out -- true per definition, we
- *    can simply take per-cpu deltas and fold those into a global accumulate
- *    to obtain the same result. See calc_load_fold_active().
- *
- *    Furthermore, in order to avoid synchronizing all per-cpu delta folding
- *    across the machine, we assume 10 ticks is sufficient time for every
- *    cpu to have completed this task.
- *
- *    This places an upper-bound on the IRQ-off latency of the machine. Then
- *    again, being late doesn't loose the delta, just wrecks the sample.
- *
- *  - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
- *    this would add another cross-cpu cacheline miss and atomic operation
- *    to the wakeup path. Instead we increment on whatever cpu the task ran
- *    when it went into uninterruptible state and decrement on whatever cpu
- *    did the wakeup. This means that only the sum of nr_uninterruptible over
- *    all cpus yields the correct result.
- *
- *  This covers the NO_HZ=n code, for extra head-aches, see the comment below.
- */
-
-/* Variables and functions for calc_load */
-atomic_long_t calc_load_tasks;
-unsigned long calc_load_update;
-unsigned long avenrun[3];
-EXPORT_SYMBOL(avenrun); /* should be removed */
-
-/**
- * get_avenrun - get the load average array
- * @loads:     pointer to dest load array
- * @offset:    offset to add
- * @shift:     shift count to shift the result left
- *
- * These values are estimates at best, so no need for locking.
- */
-void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
-{
-       loads[0] = (avenrun[0] + offset) << shift;
-       loads[1] = (avenrun[1] + offset) << shift;
-       loads[2] = (avenrun[2] + offset) << shift;
-}
-
-long calc_load_fold_active(struct rq *this_rq)
-{
-       long nr_active, delta = 0;
-
-       nr_active = this_rq->nr_running;
-       nr_active += (long) this_rq->nr_uninterruptible;
-
-       if (nr_active != this_rq->calc_load_active) {
-               delta = nr_active - this_rq->calc_load_active;
-               this_rq->calc_load_active = nr_active;
-       }
-
-       return delta;
-}
-
-/*
- * a1 = a0 * e + a * (1 - e)
- */
-static unsigned long
-calc_load(unsigned long load, unsigned long exp, unsigned long active)
-{
-       load *= exp;
-       load += active * (FIXED_1 - exp);
-       load += 1UL << (FSHIFT - 1);
-       return load >> FSHIFT;
-}
-
-#ifdef CONFIG_NO_HZ_COMMON
-/*
- * Handle NO_HZ for the global load-average.
- *
- * Since the above described distributed algorithm to compute the global
- * load-average relies on per-cpu sampling from the tick, it is affected by
- * NO_HZ.
- *
- * The basic idea is to fold the nr_active delta into a global idle-delta upon
- * entering NO_HZ state such that we can include this as an 'extra' cpu delta
- * when we read the global state.
- *
- * Obviously reality has to ruin such a delightfully simple scheme:
- *
- *  - When we go NO_HZ idle during the window, we can negate our sample
- *    contribution, causing under-accounting.
- *
- *    We avoid this by keeping two idle-delta counters and flipping them
- *    when the window starts, thus separating old and new NO_HZ load.
- *
- *    The only trick is the slight shift in index flip for read vs write.
- *
- *        0s            5s            10s           15s
- *          +10           +10           +10           +10
- *        |-|-----------|-|-----------|-|-----------|-|
- *    r:0 0 1           1 0           0 1           1 0
- *    w:0 1 1           0 0           1 1           0 0
- *
- *    This ensures we'll fold the old idle contribution in this window while
- *    accumlating the new one.
- *
- *  - When we wake up from NO_HZ idle during the window, we push up our
- *    contribution, since we effectively move our sample point to a known
- *    busy state.
- *
- *    This is solved by pushing the window forward, and thus skipping the
- *    sample, for this cpu (effectively using the idle-delta for this cpu which
- *    was in effect at the time the window opened). This also solves the issue
- *    of having to deal with a cpu having been in NOHZ idle for multiple
- *    LOAD_FREQ intervals.
- *
- * When making the ILB scale, we should try to pull this in as well.
- */
-static atomic_long_t calc_load_idle[2];
-static int calc_load_idx;
-
-static inline int calc_load_write_idx(void)
-{
-       int idx = calc_load_idx;
-
-       /*
-        * See calc_global_nohz(), if we observe the new index, we also
-        * need to observe the new update time.
-        */
-       smp_rmb();
-
-       /*
-        * If the folding window started, make sure we start writing in the
-        * next idle-delta.
-        */
-       if (!time_before(jiffies, calc_load_update))
-               idx++;
-
-       return idx & 1;
-}
-
-static inline int calc_load_read_idx(void)
-{
-       return calc_load_idx & 1;
-}
-
-void calc_load_enter_idle(void)
-{
-       struct rq *this_rq = this_rq();
-       long delta;
-
-       /*
-        * We're going into NOHZ mode, if there's any pending delta, fold it
-        * into the pending idle delta.
-        */
-       delta = calc_load_fold_active(this_rq);
-       if (delta) {
-               int idx = calc_load_write_idx();
-               atomic_long_add(delta, &calc_load_idle[idx]);
-       }
-}
-
-void calc_load_exit_idle(void)
-{
-       struct rq *this_rq = this_rq();
-
-       /*
-        * If we're still before the sample window, we're done.
-        */
-       if (time_before(jiffies, this_rq->calc_load_update))
-               return;
-
-       /*
-        * We woke inside or after the sample window, this means we're already
-        * accounted through the nohz accounting, so skip the entire deal and
-        * sync up for the next window.
-        */
-       this_rq->calc_load_update = calc_load_update;
-       if (time_before(jiffies, this_rq->calc_load_update + 10))
-               this_rq->calc_load_update += LOAD_FREQ;
-}
-
-static long calc_load_fold_idle(void)
-{
-       int idx = calc_load_read_idx();
-       long delta = 0;
-
-       if (atomic_long_read(&calc_load_idle[idx]))
-               delta = atomic_long_xchg(&calc_load_idle[idx], 0);
-
-       return delta;
-}
-
-/**
- * fixed_power_int - compute: x^n, in O(log n) time
- *
- * @x:         base of the power
- * @frac_bits: fractional bits of @x
- * @n:         power to raise @x to.
- *
- * By exploiting the relation between the definition of the natural power
- * function: x^n := x*x*...*x (x multiplied by itself for n times), and
- * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
- * (where: n_i \elem {0, 1}, the binary vector representing n),
- * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
- * of course trivially computable in O(log_2 n), the length of our binary
- * vector.
- */
-static unsigned long
-fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
-{
-       unsigned long result = 1UL << frac_bits;
-
-       if (n) for (;;) {
-               if (n & 1) {
-                       result *= x;
-                       result += 1UL << (frac_bits - 1);
-                       result >>= frac_bits;
-               }
-               n >>= 1;
-               if (!n)
-                       break;
-               x *= x;
-               x += 1UL << (frac_bits - 1);
-               x >>= frac_bits;
-       }
-
-       return result;
-}
-
-/*
- * a1 = a0 * e + a * (1 - e)
- *
- * a2 = a1 * e + a * (1 - e)
- *    = (a0 * e + a * (1 - e)) * e + a * (1 - e)
- *    = a0 * e^2 + a * (1 - e) * (1 + e)
- *
- * a3 = a2 * e + a * (1 - e)
- *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
- *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
- *
- *  ...
- *
- * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
- *    = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
- *    = a0 * e^n + a * (1 - e^n)
- *
- * [1] application of the geometric series:
- *
- *              n         1 - x^(n+1)
- *     S_n := \Sum x^i = -------------
- *             i=0          1 - x
- */
-static unsigned long
-calc_load_n(unsigned long load, unsigned long exp,
-           unsigned long active, unsigned int n)
-{
-
-       return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
-}
-
-/*
- * NO_HZ can leave us missing all per-cpu ticks calling
- * calc_load_account_active(), but since an idle CPU folds its delta into
- * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
- * in the pending idle delta if our idle period crossed a load cycle boundary.
- *
- * Once we've updated the global active value, we need to apply the exponential
- * weights adjusted to the number of cycles missed.
- */
-static void calc_global_nohz(void)
-{
-       long delta, active, n;
-
-       if (!time_before(jiffies, calc_load_update + 10)) {
-               /*
-                * Catch-up, fold however many we are behind still
-                */
-               delta = jiffies - calc_load_update - 10;
-               n = 1 + (delta / LOAD_FREQ);
-
-               active = atomic_long_read(&calc_load_tasks);
-               active = active > 0 ? active * FIXED_1 : 0;
-
-               avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
-               avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
-               avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
-
-               calc_load_update += n * LOAD_FREQ;
-       }
-
-       /*
-        * Flip the idle index...
-        *
-        * Make sure we first write the new time then flip the index, so that
-        * calc_load_write_idx() will see the new time when it reads the new
-        * index, this avoids a double flip messing things up.
-        */
-       smp_wmb();
-       calc_load_idx++;
-}
-#else /* !CONFIG_NO_HZ_COMMON */
-
-static inline long calc_load_fold_idle(void) { return 0; }
-static inline void calc_global_nohz(void) { }
-
-#endif /* CONFIG_NO_HZ_COMMON */
-
-/*
- * calc_load - update the avenrun load estimates 10 ticks after the
- * CPUs have updated calc_load_tasks.
- */
-void calc_global_load(unsigned long ticks)
-{
-       long active, delta;
-
-       if (time_before(jiffies, calc_load_update + 10))
-               return;
-
-       /*
-        * Fold the 'old' idle-delta to include all NO_HZ cpus.
-        */
-       delta = calc_load_fold_idle();
-       if (delta)
-               atomic_long_add(delta, &calc_load_tasks);
-
-       active = atomic_long_read(&calc_load_tasks);
-       active = active > 0 ? active * FIXED_1 : 0;
-
-       avenrun[0] = calc_load(avenrun[0], EXP_1, active);
-       avenrun[1] = calc_load(avenrun[1], EXP_5, active);
-       avenrun[2] = calc_load(avenrun[2], EXP_15, active);
-
-       calc_load_update += LOAD_FREQ;
-
-       /*
-        * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
-        */
-       calc_global_nohz();
-}
-
-/*
- * Called from update_cpu_load() to periodically update this CPU's
- * active count.
- */
-static void calc_load_account_active(struct rq *this_rq)
-{
-       long delta;
-
-       if (time_before(jiffies, this_rq->calc_load_update))
-               return;
-
-       delta  = calc_load_fold_active(this_rq);
-       if (delta)
-               atomic_long_add(delta, &calc_load_tasks);
-
-       this_rq->calc_load_update += LOAD_FREQ;
-}
-
-/*
- * End of global load-average stuff
- */
-
-/*
- * The exact cpuload at various idx values, calculated at every tick would be
- * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
- *
- * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
- * on nth tick when cpu may be busy, then we have:
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
- *
- * decay_load_missed() below does efficient calculation of
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
- *
- * The calculation is approximated on a 128 point scale.
- * degrade_zero_ticks is the number of ticks after which load at any
- * particular idx is approximated to be zero.
- * degrade_factor is a precomputed table, a row for each load idx.
- * Each column corresponds to degradation factor for a power of two ticks,
- * based on 128 point scale.
- * Example:
- * row 2, col 3 (=12) says that the degradation at load idx 2 after
- * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
- *
- * With this power of 2 load factors, we can degrade the load n times
- * by looking at 1 bits in n and doing as many mult/shift instead of
- * n mult/shifts needed by the exact degradation.
- */
-#define DEGRADE_SHIFT          7
-static const unsigned char
-               degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
-static const unsigned char
-               degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
-                                       {0, 0, 0, 0, 0, 0, 0, 0},
-                                       {64, 32, 8, 0, 0, 0, 0, 0},
-                                       {96, 72, 40, 12, 1, 0, 0},
-                                       {112, 98, 75, 43, 15, 1, 0},
-                                       {120, 112, 98, 76, 45, 16, 2} };
-
-/*
- * Update cpu_load for any missed ticks, due to tickless idle. The backlog
- * would be when CPU is idle and so we just decay the old load without
- * adding any new load.
- */
-static unsigned long
-decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
-{
-       int j = 0;
-
-       if (!missed_updates)
-               return load;
-
-       if (missed_updates >= degrade_zero_ticks[idx])
-               return 0;
-
-       if (idx == 1)
-               return load >> missed_updates;
-
-       while (missed_updates) {
-               if (missed_updates % 2)
-                       load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
-
-               missed_updates >>= 1;
-               j++;
-       }
-       return load;
-}
-
-/*
- * Update rq->cpu_load[] statistics. This function is usually called every
- * scheduler tick (TICK_NSEC). With tickless idle this will not be called
- * every tick. We fix it up based on jiffies.
- */
-static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
-                             unsigned long pending_updates)
-{
-       int i, scale;
-
-       this_rq->nr_load_updates++;
-
-       /* Update our load: */
-       this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
-       for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
-               unsigned long old_load, new_load;
-
-               /* scale is effectively 1 << i now, and >> i divides by scale */
-
-               old_load = this_rq->cpu_load[i];
-               old_load = decay_load_missed(old_load, pending_updates - 1, i);
-               new_load = this_load;
-               /*
-                * Round up the averaging division if load is increasing. This
-                * prevents us from getting stuck on 9 if the load is 10, for
-                * example.
-                */
-               if (new_load > old_load)
-                       new_load += scale - 1;
-
-               this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
-       }
-
-       sched_avg_update(this_rq);
-}
-
-#ifdef CONFIG_SMP
-static inline unsigned long get_rq_runnable_load(struct rq *rq)
-{
-       return rq->cfs.runnable_load_avg;
-}
-#else
-static inline unsigned long get_rq_runnable_load(struct rq *rq)
-{
-       return rq->load.weight;
-}
-#endif
-
-#ifdef CONFIG_NO_HZ_COMMON
-/*
- * There is no sane way to deal with nohz on smp when using jiffies because the
- * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
- * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
- *
- * Therefore we cannot use the delta approach from the regular tick since that
- * would seriously skew the load calculation. However we'll make do for those
- * updates happening while idle (nohz_idle_balance) or coming out of idle
- * (tick_nohz_idle_exit).
- *
- * This means we might still be one tick off for nohz periods.
- */
-
-/*
- * Called from nohz_idle_balance() to update the load ratings before doing the
- * idle balance.
- */
-void update_idle_cpu_load(struct rq *this_rq)
-{
-       unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
-       unsigned long load = get_rq_runnable_load(this_rq);
-       unsigned long pending_updates;
-
-       /*
-        * bail if there's load or we're actually up-to-date.
-        */
-       if (load || curr_jiffies == this_rq->last_load_update_tick)
-               return;
-
-       pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-       this_rq->last_load_update_tick = curr_jiffies;
-
-       __update_cpu_load(this_rq, load, pending_updates);
-}
-
-/*
- * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
- */
-void update_cpu_load_nohz(void)
-{
-       struct rq *this_rq = this_rq();
-       unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
-       unsigned long pending_updates;
-
-       if (curr_jiffies == this_rq->last_load_update_tick)
-               return;
-
-       raw_spin_lock(&this_rq->lock);
-       pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-       if (pending_updates) {
-               this_rq->last_load_update_tick = curr_jiffies;
-               /*
-                * We were idle, this means load 0, the current load might be
-                * !0 due to remote wakeups and the sort.
-                */
-               __update_cpu_load(this_rq, 0, pending_updates);
-       }
-       raw_spin_unlock(&this_rq->lock);
-}
-#endif /* CONFIG_NO_HZ */
-
-/*
- * Called from scheduler_tick()
- */
-void update_cpu_load_active(struct rq *this_rq)
-{
-       unsigned long load = get_rq_runnable_load(this_rq);
-       /*
-        * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
-        */
-       this_rq->last_load_update_tick = jiffies;
-       __update_cpu_load(this_rq, load, 1);
-
-       calc_load_account_active(this_rq);
-}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c

index 575da76a3874a8c1b2ddd0f518e5ecea7a805262..560d2fa623c311c9aa5ad51ead007e1b27c6fa6c 100644 (file)
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1323,7 +1323,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
         rq = cpu_rq(cpu);
  
         rcu_read_lock();
-       curr = ACCESS_ONCE(rq->curr); /* unlocked access */
+       curr = READ_ONCE(rq->curr); /* unlocked access */
  
         /*
          * If the current task on @p's runqueue is an RT task, then
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h

index e0e1299939588ac47f08b13b45f1a6e2e9cf4d7f..d62b2882232b7a3017eda9c873de01c7ffcfc4a1 100644 (file)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -26,8 +26,14 @@ extern __read_mostly int scheduler_running;
  extern unsigned long calc_load_update;
  extern atomic_long_t calc_load_tasks;
  
+extern void calc_global_load_tick(struct rq *this_rq);
  extern long calc_load_fold_active(struct rq *this_rq);
+
+#ifdef CONFIG_SMP
  extern void update_cpu_load_active(struct rq *this_rq);
+#else
+static inline void update_cpu_load_active(struct rq *this_rq) { }
+#endif
  
  /*
   * Helpers for converting nanosecond timing to jiffy resolution
@@ -707,7 +713,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
  
  static inline u64 __rq_clock_broken(struct rq *rq)
  {
-       return ACCESS_ONCE(rq->clock);
+       return READ_ONCE(rq->clock);
  }
  
  static inline u64 rq_clock(struct rq *rq)
@@ -1284,7 +1290,6 @@ extern void update_max_interval(void);
  extern void init_sched_dl_class(void);
  extern void init_sched_rt_class(void);
  extern void init_sched_fair_class(void);
-extern void init_sched_dl_class(void);
  
  extern void resched_curr(struct rq *rq);
  extern void resched_cpu(int cpu);
@@ -1298,8 +1303,6 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
  
  unsigned long to_ratio(u64 period, u64 runtime);
  
-extern void update_idle_cpu_load(struct rq *this_rq);
-
  extern void init_task_runnable_average(struct task_struct *p);
  
  static inline void add_nr_running(struct rq *rq, unsigned count)
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h

index 4ab7043396569f201cd6dde0a997733a71a4bb7a..077ebbd5e10f14dc646148aae9231acf007e8a4d 100644 (file)
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -174,7 +174,8 @@ static inline bool cputimer_running(struct task_struct *tsk)
  {
         struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
  
-       if (!cputimer->running)
+       /* Check if cputimer isn't running. This is accessed without locking. */
+       if (!READ_ONCE(cputimer->running))
                 return false;
  
         /*
@@ -215,9 +216,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
         if (!cputimer_running(tsk))
                 return;
  
-       raw_spin_lock(&cputimer->lock);
-       cputimer->cputime.utime += cputime;
-       raw_spin_unlock(&cputimer->lock);
+       atomic64_add(cputime, &cputimer->cputime_atomic.utime);
  }
  
  /**
@@ -238,9 +237,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
         if (!cputimer_running(tsk))
                 return;
  
-       raw_spin_lock(&cputimer->lock);
-       cputimer->cputime.stime += cputime;
-       raw_spin_unlock(&cputimer->lock);
+       atomic64_add(cputime, &cputimer->cputime_atomic.stime);
  }
  
  /**
@@ -261,7 +258,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
         if (!cputimer_running(tsk))
                 return;
  
-       raw_spin_lock(&cputimer->lock);
-       cputimer->cputime.sum_exec_runtime += ns;
-       raw_spin_unlock(&cputimer->lock);
+       atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime);
  }
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c

index 9bc82329eaad8b0eb964f701ca81f0607e35db21..052e02672d12428ce1e9e1f7266c7cd754ace5af 100644 (file)
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -601,7 +601,7 @@ EXPORT_SYMBOL(bit_wait_io);
  
  __sched int bit_wait_timeout(struct wait_bit_key *word)
  {
-       unsigned long now = ACCESS_ONCE(jiffies);
+       unsigned long now = READ_ONCE(jiffies);
         if (signal_pending_state(current->state, current))
                 return 1;
         if (time_after_eq(now, word->timeout))
@@ -613,7 +613,7 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout);
  
  __sched int bit_wait_io_timeout(struct wait_bit_key *word)
  {
-       unsigned long now = ACCESS_ONCE(jiffies);
+       unsigned long now = READ_ONCE(jiffies);
         if (signal_pending_state(current->state, current))
                 return 1;
         if (time_after_eq(now, word->timeout))
diff --git a/kernel/signal.c b/kernel/signal.c

index d51c5ddd855c84b9b65d4a7ef22eedcdff2eeafa..f19833b5db3c9121b540127004191a443c0a7fec 100644 (file)
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -245,7 +245,7 @@ static inline void print_dropped_signal(int sig)
   * RETURNS:
   * %true if @mask is set, %false if made noop because @task was dying.
   */
-bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask)
+bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask)
  {
         BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME |
                         JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING));
@@ -297,7 +297,7 @@ void task_clear_jobctl_trapping(struct task_struct *task)
   * CONTEXT:
   * Must be called with @task->sighand->siglock held.
   */
-void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask)
+void task_clear_jobctl_pending(struct task_struct *task, unsigned long mask)
  {
         BUG_ON(mask & ~JOBCTL_PENDING_MASK);
  
@@ -2000,7 +2000,7 @@ static bool do_signal_stop(int signr)
         struct signal_struct *sig = current->signal;
  
         if (!(current->jobctl & JOBCTL_STOP_PENDING)) {
-               unsigned int gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
+               unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
                 struct task_struct *t;
  
                 /* signr will be recorded in task->jobctl for retries */
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c

index 695f0c6cd169a307de1f216cc67b4330a83363a4..fd643d8c4b424f858e3c6d7e0996bbb919a555e7 100644 (file)
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -211,25 +211,6 @@ static int multi_cpu_stop(void *data)
         return err;
  }
  
-struct irq_cpu_stop_queue_work_info {
-       int cpu1;
-       int cpu2;
-       struct cpu_stop_work *work1;
-       struct cpu_stop_work *work2;
-};
-
-/*
- * This function is always run with irqs and preemption disabled.
- * This guarantees that both work1 and work2 get queued, before
- * our local migrate thread gets the chance to preempt us.
- */
-static void irq_cpu_stop_queue_work(void *arg)
-{
-       struct irq_cpu_stop_queue_work_info *info = arg;
-       cpu_stop_queue_work(info->cpu1, info->work1);
-       cpu_stop_queue_work(info->cpu2, info->work2);
-}
-
  /**
   * stop_two_cpus - stops two cpus
   * @cpu1: the cpu to stop
@@ -245,7 +226,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
  {
         struct cpu_stop_done done;
         struct cpu_stop_work work1, work2;
-       struct irq_cpu_stop_queue_work_info call_args;
         struct multi_stop_data msdata;
  
         preempt_disable();
@@ -262,13 +242,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
                 .done = &done
         };
  
-       call_args = (struct irq_cpu_stop_queue_work_info){
-               .cpu1 = cpu1,
-               .cpu2 = cpu2,
-               .work1 = &work1,
-               .work2 = &work2,
-       };
-
         cpu_stop_init_done(&done, 2);
         set_state(&msdata, MULTI_STOP_PREPARE);
  
@@ -285,16 +258,11 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
                 return -ENOENT;
         }
  
-       lg_local_lock(&stop_cpus_lock);
-       /*
-        * Queuing needs to be done by the lowest numbered CPU, to ensure
-        * that works are always queued in the same order on every CPU.
-        * This prevents deadlocks.
-        */
-       smp_call_function_single(min(cpu1, cpu2),
-                                &irq_cpu_stop_queue_work,
-                                &call_args, 1);
-       lg_local_unlock(&stop_cpus_lock);
+       lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
+       cpu_stop_queue_work(cpu1, &work1);
+       cpu_stop_queue_work(cpu2, &work2);
+       lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
+
         preempt_enable();
  
         wait_for_completion(&done.completion);
diff --git a/kernel/sys.c b/kernel/sys.c

index a4e372b798a5f29535f9120b32fea70b0489f603..8571296b7ddb9b5efffed3f6fc30c29b34c33099 100644 (file)
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -92,10 +92,10 @@
  # define SET_TSC_CTL(a)                (-EINVAL)
  #endif
  #ifndef MPX_ENABLE_MANAGEMENT
-# define MPX_ENABLE_MANAGEMENT(a)      (-EINVAL)
+# define MPX_ENABLE_MANAGEMENT()       (-EINVAL)
  #endif
  #ifndef MPX_DISABLE_MANAGEMENT
-# define MPX_DISABLE_MANAGEMENT(a)     (-EINVAL)
+# define MPX_DISABLE_MANAGEMENT()      (-EINVAL)
  #endif
  #ifndef GET_FP_MODE
  # define GET_FP_MODE(a)                (-EINVAL)
@@ -2230,12 +2230,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
         case PR_MPX_ENABLE_MANAGEMENT:
                 if (arg2 || arg3 || arg4 || arg5)
                         return -EINVAL;
-               error = MPX_ENABLE_MANAGEMENT(me);
+               error = MPX_ENABLE_MANAGEMENT();
                 break;
         case PR_MPX_DISABLE_MANAGEMENT:
                 if (arg2 || arg3 || arg4 || arg5)
                         return -EINVAL;
-               error = MPX_DISABLE_MANAGEMENT(me);
+               error = MPX_DISABLE_MANAGEMENT();
                 break;
         case PR_SET_FP_MODE:
                 error = SET_FP_MODE(me, arg2);
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c

index 0075da74abf0c5f55c823f393e96b99d79b05e13..892e3dae0aac41199e9ebbbdef8b73f6b2d57afd 100644 (file)
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -196,39 +196,62 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
         return 0;
  }
  
-static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
+/*
+ * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg
+ * to avoid race conditions with concurrent updates to cputime.
+ */
+static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
  {
-       if (b->utime > a->utime)
-               a->utime = b->utime;
+       u64 curr_cputime;
+retry:
+       curr_cputime = atomic64_read(cputime);
+       if (sum_cputime > curr_cputime) {
+               if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime)
+                       goto retry;
+       }
+}
  
-       if (b->stime > a->stime)
-               a->stime = b->stime;
+static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct task_cputime *sum)
+{
+       __update_gt_cputime(&cputime_atomic->utime, sum->utime);
+       __update_gt_cputime(&cputime_atomic->stime, sum->stime);
+       __update_gt_cputime(&cputime_atomic->sum_exec_runtime, sum->sum_exec_runtime);
+}
  
-       if (b->sum_exec_runtime > a->sum_exec_runtime)
-               a->sum_exec_runtime = b->sum_exec_runtime;
+/* Sample task_cputime_atomic values in "atomic_times", store results in "times". */
+static inline void sample_cputime_atomic(struct task_cputime *times,
+                                        struct task_cputime_atomic *atomic_times)
+{
+       times->utime = atomic64_read(&atomic_times->utime);
+       times->stime = atomic64_read(&atomic_times->stime);
+       times->sum_exec_runtime = atomic64_read(&atomic_times->sum_exec_runtime);
  }
  
  void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
  {
         struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
         struct task_cputime sum;
-       unsigned long flags;
  
-       if (!cputimer->running) {
+       /* Check if cputimer isn't running. This is accessed without locking. */
+       if (!READ_ONCE(cputimer->running)) {
                 /*
                  * The POSIX timer interface allows for absolute time expiry
                  * values through the TIMER_ABSTIME flag, therefore we have
-                * to synchronize the timer to the clock every time we start
-                * it.
+                * to synchronize the timer to the clock every time we start it.
                  */
                 thread_group_cputime(tsk, &sum);
-               raw_spin_lock_irqsave(&cputimer->lock, flags);
-               cputimer->running = 1;
-               update_gt_cputime(&cputimer->cputime, &sum);
-       } else
-               raw_spin_lock_irqsave(&cputimer->lock, flags);
-       *times = cputimer->cputime;
-       raw_spin_unlock_irqrestore(&cputimer->lock, flags);
+               update_gt_cputime(&cputimer->cputime_atomic, &sum);
+
+               /*
+                * We're setting cputimer->running without a lock. Ensure
+                * this only gets written to in one operation. We set
+                * running after update_gt_cputime() as a small optimization,
+                * but barriers are not required because update_gt_cputime()
+                * can handle concurrent updates.
+                */
+               WRITE_ONCE(cputimer->running, 1);
+       }
+       sample_cputime_atomic(times, &cputimer->cputime_atomic);
  }
  
  /*
@@ -582,7 +605,8 @@ bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
         if (!task_cputime_zero(&tsk->cputime_expires))
                 return false;
  
-       if (tsk->signal->cputimer.running)
+       /* Check if cputimer is running. This is accessed without locking. */
+       if (READ_ONCE(tsk->signal->cputimer.running))
                 return false;
  
         return true;
@@ -852,10 +876,10 @@ static void check_thread_timers(struct task_struct *tsk,
         /*
          * Check for the special case thread timers.
          */
-       soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
+       soft = READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
         if (soft != RLIM_INFINITY) {
                 unsigned long hard =
-                       ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
+                       READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
  
                 if (hard != RLIM_INFINITY &&
                     tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
@@ -882,14 +906,12 @@ static void check_thread_timers(struct task_struct *tsk,
         }
  }
  
-static void stop_process_timers(struct signal_struct *sig)
+static inline void stop_process_timers(struct signal_struct *sig)
  {
         struct thread_group_cputimer *cputimer = &sig->cputimer;
-       unsigned long flags;
  
-       raw_spin_lock_irqsave(&cputimer->lock, flags);
-       cputimer->running = 0;
-       raw_spin_unlock_irqrestore(&cputimer->lock, flags);
+       /* Turn off cputimer->running. This is done without locking. */
+       WRITE_ONCE(cputimer->running, 0);
  }
  
  static u32 onecputick;
@@ -958,11 +980,11 @@ static void check_process_timers(struct task_struct *tsk,
                          SIGPROF);
         check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
                          SIGVTALRM);
-       soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
+       soft = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
         if (soft != RLIM_INFINITY) {
                 unsigned long psecs = cputime_to_secs(ptime);
                 unsigned long hard =
-                       ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
+                       READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
                 cputime_t x;
                 if (psecs >= hard) {
                         /*
@@ -1111,12 +1133,11 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
         }
  
         sig = tsk->signal;
-       if (sig->cputimer.running) {
+       /* Check if cputimer is running. This is accessed without locking. */
+       if (READ_ONCE(sig->cputimer.running)) {
                 struct task_cputime group_sample;
  
-               raw_spin_lock(&sig->cputimer.lock);
-               group_sample = sig->cputimer.cputime;
-               raw_spin_unlock(&sig->cputimer.lock);
+               sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic);
  
                 if (task_cputime_expired(&group_sample, &sig->cputime_expires))
                         return 1;
@@ -1157,7 +1178,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
          * If there are any active process wide timers (POSIX 1.b, itimers,
          * RLIMIT_CPU) cputimer must be running.
          */
-       if (tsk->signal->cputimer.running)
+       if (READ_ONCE(tsk->signal->cputimer.running))
                 check_process_timers(tsk, &firing);
  
         /*
diff --git a/kernel/torture.c b/kernel/torture.c

index dd70993c266c38785510ab09f0315d1f1775d05b..3e4840633d3ee7bd926f1fe67f8b0a4b324514da 100644 (file)
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -409,7 +409,7 @@ static void (*torture_shutdown_hook)(void);
   */
  void torture_shutdown_absorb(const char *title)
  {
-       while (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
+       while (READ_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
                 pr_notice("torture thread %s parking due to system shutdown\n",
                           title);
                 schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT);
@@ -480,9 +480,9 @@ static int torture_shutdown_notify(struct notifier_block *unused1,
                                    unsigned long unused2, void *unused3)
  {
         mutex_lock(&fullstop_mutex);
-       if (ACCESS_ONCE(fullstop) == FULLSTOP_DONTSTOP) {
+       if (READ_ONCE(fullstop) == FULLSTOP_DONTSTOP) {
                 VERBOSE_TOROUT_STRING("Unscheduled system shutdown detected");
-               ACCESS_ONCE(fullstop) = FULLSTOP_SHUTDOWN;
+               WRITE_ONCE(fullstop, FULLSTOP_SHUTDOWN);
         } else {
                 pr_warn("Concurrent rmmod and shutdown illegal!\n");
         }
@@ -523,13 +523,13 @@ static int stutter;
   */
  void stutter_wait(const char *title)
  {
-       while (ACCESS_ONCE(stutter_pause_test) ||
-              (torture_runnable && !ACCESS_ONCE(*torture_runnable))) {
+       while (READ_ONCE(stutter_pause_test) ||
+              (torture_runnable && !READ_ONCE(*torture_runnable))) {
                 if (stutter_pause_test)
-                       if (ACCESS_ONCE(stutter_pause_test) == 1)
+                       if (READ_ONCE(stutter_pause_test) == 1)
                                 schedule_timeout_interruptible(1);
                         else
-                               while (ACCESS_ONCE(stutter_pause_test))
+                               while (READ_ONCE(stutter_pause_test))
                                         cond_resched();
                 else
                         schedule_timeout_interruptible(round_jiffies_relative(HZ));
@@ -549,14 +549,14 @@ static int torture_stutter(void *arg)
                 if (!torture_must_stop()) {
                         if (stutter > 1) {
                                 schedule_timeout_interruptible(stutter - 1);
-                               ACCESS_ONCE(stutter_pause_test) = 2;
+                               WRITE_ONCE(stutter_pause_test, 2);
                         }
                         schedule_timeout_interruptible(1);
-                       ACCESS_ONCE(stutter_pause_test) = 1;
+                       WRITE_ONCE(stutter_pause_test, 1);
                 }
                 if (!torture_must_stop())
                         schedule_timeout_interruptible(stutter);
-               ACCESS_ONCE(stutter_pause_test) = 0;
+               WRITE_ONCE(stutter_pause_test, 0);
                 torture_shutdown_absorb("torture_stutter");
         } while (!torture_must_stop());
         torture_kthread_stopping("torture_stutter");
@@ -642,13 +642,13 @@ EXPORT_SYMBOL_GPL(torture_init_end);
  bool torture_cleanup_begin(void)
  {
         mutex_lock(&fullstop_mutex);
-       if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
+       if (READ_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
                 pr_warn("Concurrent rmmod and shutdown illegal!\n");
                 mutex_unlock(&fullstop_mutex);
                 schedule_timeout_uninterruptible(10);
                 return true;
         }
-       ACCESS_ONCE(fullstop) = FULLSTOP_RMMOD;
+       WRITE_ONCE(fullstop, FULLSTOP_RMMOD);
         mutex_unlock(&fullstop_mutex);
         torture_shutdown_cleanup();
         torture_shuffle_cleanup();
@@ -681,7 +681,7 @@ EXPORT_SYMBOL_GPL(torture_must_stop);
   */
  bool torture_must_stop_irq(void)
  {
-       return ACCESS_ONCE(fullstop) != FULLSTOP_DONTSTOP;
+       return READ_ONCE(fullstop) != FULLSTOP_DONTSTOP;
  }
  EXPORT_SYMBOL_GPL(torture_must_stop_irq);
  
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c

index 13d945c0d03f2bda5802971484b21bbe9f65301f..1b28df2d91042de97566454a80dcb36d24674a49 100644 (file)
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -450,7 +450,7 @@ static int __init ring_buffer_benchmark_init(void)
  
         if (producer_fifo >= 0) {
                 struct sched_param param = {
-                       .sched_priority = consumer_fifo
+                       .sched_priority = producer_fifo
                 };
                 sched_setscheduler(producer, SCHED_FIFO, &param);
         } else
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c

index ced69da0ff55ba08a7358cae7ceaae31546f9332..7f2e97ce71a7d12a9b2ed5e703969e635f320a57 100644 (file)
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1369,19 +1369,26 @@ static int check_preds(struct filter_parse_state *ps)
  {
         int n_normal_preds = 0, n_logical_preds = 0;
         struct postfix_elt *elt;
+       int cnt = 0;
  
         list_for_each_entry(elt, &ps->postfix, list) {
-               if (elt->op == OP_NONE)
+               if (elt->op == OP_NONE) {
+                       cnt++;
                         continue;
+               }
  
                 if (elt->op == OP_AND || elt->op == OP_OR) {
                         n_logical_preds++;
+                       cnt--;
                         continue;
                 }
+               if (elt->op != OP_NOT)
+                       cnt--;
                 n_normal_preds++;
+               WARN_ON_ONCE(cnt < 0);
         }
  
-       if (!n_normal_preds || n_logical_preds >= n_normal_preds) {
+       if (cnt != 1 || !n_normal_preds || n_logical_preds >= n_normal_preds) {
                 parse_error(ps, FILT_ERR_INVALID_FILTER, 0);
                 return -EINVAL;
         }
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug

index ba2b0c87e65b196c7c1f016798e0b59ea8dba97c..b908048f8d6a8e2b33723b222e1a4a88b2841774 100644 (file)
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1233,6 +1233,7 @@ config RCU_TORTURE_TEST
         depends on DEBUG_KERNEL
         select TORTURE_TEST
         select SRCU
+       select TASKS_RCU
         default n
         help
           This option provides a kernel module that runs torture tests
@@ -1261,12 +1262,38 @@ config RCU_TORTURE_TEST_RUNNABLE
           Say N here if you want the RCU torture tests to start only
           after being manually enabled via /proc.
  
+config RCU_TORTURE_TEST_SLOW_PREINIT
+       bool "Slow down RCU grace-period pre-initialization to expose races"
+       depends on RCU_TORTURE_TEST
+       help
+         This option delays grace-period pre-initialization (the
+         propagation of CPU-hotplug changes up the rcu_node combining
+         tree) for a few jiffies between initializing each pair of
+         consecutive rcu_node structures.  This helps to expose races
+         involving grace-period pre-initialization, in other words, it
+         makes your kernel less stable.  It can also greatly increase
+         grace-period latency, especially on systems with large numbers
+         of CPUs.  This is useful when torture-testing RCU, but in
+         almost no other circumstance.
+
+         Say Y here if you want your system to crash and hang more often.
+         Say N if you want a sane system.
+
+config RCU_TORTURE_TEST_SLOW_PREINIT_DELAY
+       int "How much to slow down RCU grace-period pre-initialization"
+       range 0 5
+       default 3
+       depends on RCU_TORTURE_TEST_SLOW_PREINIT
+       help
+         This option specifies the number of jiffies to wait between
+         each rcu_node structure pre-initialization step.
+
  config RCU_TORTURE_TEST_SLOW_INIT
         bool "Slow down RCU grace-period initialization to expose races"
         depends on RCU_TORTURE_TEST
         help
-         This option makes grace-period initialization block for a
-         few jiffies between initializing each pair of consecutive
+         This option delays grace-period initialization for a few
+         jiffies between initializing each pair of consecutive
           rcu_node structures.  This helps to expose races involving
           grace-period initialization, in other words, it makes your
           kernel less stable.  It can also greatly increase grace-period
@@ -1286,6 +1313,30 @@ config RCU_TORTURE_TEST_SLOW_INIT_DELAY
           This option specifies the number of jiffies to wait between
           each rcu_node structure initialization.
  
+config RCU_TORTURE_TEST_SLOW_CLEANUP
+       bool "Slow down RCU grace-period cleanup to expose races"
+       depends on RCU_TORTURE_TEST
+       help
+         This option delays grace-period cleanup for a few jiffies
+         between cleaning up each pair of consecutive rcu_node
+         structures.  This helps to expose races involving grace-period
+         cleanup, in other words, it makes your kernel less stable.
+         It can also greatly increase grace-period latency, especially
+         on systems with large numbers of CPUs.  This is useful when
+         torture-testing RCU, but in almost no other circumstance.
+
+         Say Y here if you want your system to crash and hang more often.
+         Say N if you want a sane system.
+
+config RCU_TORTURE_TEST_SLOW_CLEANUP_DELAY
+       int "How much to slow down RCU grace-period cleanup"
+       range 0 5
+       default 3
+       depends on RCU_TORTURE_TEST_SLOW_CLEANUP
+       help
+         This option specifies the number of jiffies to wait between
+         each rcu_node structure cleanup operation.
+
  config RCU_CPU_STALL_TIMEOUT
         int "RCU CPU stall timeout in seconds"
         depends on RCU_STALL_COMMON
@@ -1322,6 +1373,17 @@ config RCU_TRACE
           Say Y here if you want to enable RCU tracing
           Say N if you are unsure.
  
+config RCU_EQS_DEBUG
+       bool "Use this when adding any sort of NO_HZ support to your arch"
+       depends on DEBUG_KERNEL
+       help
+         This option provides consistency checks in RCU's handling of
+         NO_HZ.  These checks have proven quite helpful in detecting
+         bugs in arch-specific NO_HZ code.
+
+         Say N here if you need ultimate kernel/user switch latencies.
+         Say Y if you are unsure.
+
  endmenu # "RCU Debugging"
  
  config DEBUG_BLOCK_EXT_DEVT
diff --git a/lib/cpu_rmap.c b/lib/cpu_rmap.c

index 4f134d8907a7d03760877bfa41a3a10e7747913f..f610b2a10b3eda40c8e04f30f3779556502e9fe6 100644 (file)
--- a/lib/cpu_rmap.c
+++ b/lib/cpu_rmap.c
@@ -191,7 +191,7 @@ int cpu_rmap_update(struct cpu_rmap *rmap, u16 index,
         /* Update distances based on topology */
         for_each_cpu(cpu, update_mask) {
                 if (cpu_rmap_copy_neigh(rmap, cpu,
-                                       topology_thread_cpumask(cpu), 1))
+                                       topology_sibling_cpumask(cpu), 1))
                         continue;
                 if (cpu_rmap_copy_neigh(rmap, cpu,
                                         topology_core_cpumask(cpu), 2))
diff --git a/lib/cpumask.c b/lib/cpumask.c

index 5f627084f2e998b2605016c311411d91f7016918..5a70f6196f577a071ae0a31e9da7fa0e1dd1bc68 100644 (file)
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -16,11 +16,10 @@
  int cpumask_next_and(int n, const struct cpumask *src1p,
                      const struct cpumask *src2p)
  {
-       struct cpumask tmp;
-
-       if (cpumask_and(&tmp, src1p, src2p))
-               return cpumask_next(n, &tmp);
-       return nr_cpu_ids;
+       while ((n = cpumask_next(n, src1p)) < nr_cpu_ids)
+               if (cpumask_test_cpu(n, src2p))
+                       break;
+       return n;
  }
  EXPORT_SYMBOL(cpumask_next_and);
  
diff --git a/lib/mpi/longlong.h b/lib/mpi/longlong.h

index aac511417ad19af5d9e3472747a983be5ed3ee4b..a89d041592c8bfa7b092c382962a4085560f5b1a 100644 (file)
--- a/lib/mpi/longlong.h
+++ b/lib/mpi/longlong.h
@@ -639,7 +639,7 @@ do { \
         **************  MIPS  *****************
         ***************************************/
  #if defined(__mips__) && W_TYPE_SIZE == 32
-#if __GNUC__ >= 4 && __GNUC_MINOR__ >= 4
+#if (__GNUC__ >= 5) || (__GNUC__ >= 4 && __GNUC_MINOR__ >= 4)
  #define umul_ppmm(w1, w0, u, v)                        \
  do {                                           \
         UDItype __ll = (UDItype)(u) * (v);      \
@@ -671,7 +671,7 @@ do {                                                \
         **************  MIPS/64  **************
         ***************************************/
  #if (defined(__mips) && __mips >= 3) && W_TYPE_SIZE == 64
-#if __GNUC__ >= 4 && __GNUC_MINOR__ >= 4
+#if (__GNUC__ >= 5) || (__GNUC__ >= 4 && __GNUC_MINOR__ >= 4)
  #define umul_ppmm(w1, w0, u, v) \
  do {                                                                   \
         typedef unsigned int __ll_UTItype __attribute__((mode(TI)));    \
diff --git a/lib/radix-tree.c b/lib/radix-tree.c

index 3d2aa27b845b53f9e281e7bf5b7a080fd24c5bbb..061550de77bc040878a1a62ef72816d85043b3a6 100644 (file)
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -33,7 +33,7 @@
  #include <linux/string.h>
  #include <linux/bitops.h>
  #include <linux/rcupdate.h>
-#include <linux/preempt_mask.h>                /* in_interrupt() */
+#include <linux/preempt.h>             /* in_interrupt() */
  
  
  /*
diff --git a/lib/raid6/x86.h b/lib/raid6/x86.h

index b7595484a8150c02e086f5d1a3ac41cd971bd91b..8fe9d9662abbcda7563000c00e5f516dc156f05f 100644 (file)
--- a/lib/raid6/x86.h
+++ b/lib/raid6/x86.h
@@ -23,7 +23,7 @@
  
  #ifdef __KERNEL__ /* Real code */
  
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
  
  #else /* Dummy code for user space testing */
  
diff --git a/lib/rhashtable.c b/lib/rhashtable.c

index 4396434e471536b4772ef06efcb983f87c580889..8609378e6505123a3688e0e95a18cdde013e278a 100644 (file)
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -26,6 +26,7 @@
  #include <linux/random.h>
  #include <linux/rhashtable.h>
  #include <linux/err.h>
+#include <linux/export.h>
  
  #define HASH_DEFAULT_SIZE      64UL
  #define HASH_MIN_SIZE          4U
diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c

index a28df5206d95c24d6f3b4116753747f1fb2a67e3..3a5f2b366d84ed209a012cf62491ca30f6a8bca8 100644 (file)
--- a/lib/strnlen_user.c
+++ b/lib/strnlen_user.c
@@ -57,7 +57,8 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count,
                         return res + find_zero(data) + 1 - align;
                 }
                 res += sizeof(unsigned long);
-               if (unlikely(max < sizeof(unsigned long)))
+               /* We already handled sizeof(unsigned long) bytes. Did we do it all? */
+               if (unlikely(max <= sizeof(unsigned long)))
                         break;
                 max -= sizeof(unsigned long);
                 if (unlikely(__get_user(c,(unsigned long __user *)(src+res))))
@@ -84,13 +85,21 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count,
   * @str: The string to measure.
   * @count: Maximum count (including NUL character)
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * Get the size of a NUL-terminated string in user space.
   *
   * Returns the size of the string INCLUDING the terminating NUL.
- * If the string is too long, returns 'count+1'.
+ * If the string is too long, returns a number larger than @count. User
+ * has to check the return value against "> count".
   * On exception (or invalid count), returns 0.
+ *
+ * NOTE! You should basically never use this function. There is
+ * almost never any valid case for using the length of a user space
+ * string, since the string can be changed at any time by other
+ * threads. Use "strncpy_from_user()" instead to get a stable copy
+ * of the string.
   */
  long strnlen_user(const char __user *str, long count)
  {
@@ -113,7 +122,8 @@ EXPORT_SYMBOL(strnlen_user);
   * strlen_user: - Get the size of a user string INCLUDING final NUL.
   * @str: The string to measure.
   *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
   *
   * Get the size of a NUL-terminated string in user space.
   *
diff --git a/lib/swiotlb.c b/lib/swiotlb.c

index 4abda074ea458947390b84c36f3eaad7095a2ceb..42e192decbfd605dc4175cfbe9c93b260bbad8ef 100644 (file)
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -537,8 +537,9 @@ EXPORT_SYMBOL_GPL(swiotlb_tbl_map_single);
   * Allocates bounce buffer and returns its kernel virtual address.
   */
  
-phys_addr_t map_single(struct device *hwdev, phys_addr_t phys, size_t size,
-                      enum dma_data_direction dir)
+static phys_addr_t
+map_single(struct device *hwdev, phys_addr_t phys, size_t size,
+          enum dma_data_direction dir)
  {
         dma_addr_t start_dma_addr = phys_to_dma(hwdev, io_tlb_start);
  
@@ -655,7 +656,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
                  */
                 phys_addr_t paddr = map_single(hwdev, 0, size, DMA_FROM_DEVICE);
                 if (paddr == SWIOTLB_MAP_ERROR)
-                       return NULL;
+                       goto err_warn;
  
                 ret = phys_to_virt(paddr);
                 dev_addr = phys_to_dma(hwdev, paddr);
@@ -669,7 +670,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
                         /* DMA_TO_DEVICE to avoid memcpy in unmap_single */
                         swiotlb_tbl_unmap_single(hwdev, paddr,
                                                  size, DMA_TO_DEVICE);
-                       return NULL;
+                       goto err_warn;
                 }
         }
  
@@ -677,6 +678,13 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
         memset(ret, 0, size);
  
         return ret;
+
+err_warn:
+       pr_warn("swiotlb: coherent allocation failed for device %s size=%zu\n",
+               dev_name(hwdev), size);
+       dump_stack();
+
+       return NULL;
  }
  EXPORT_SYMBOL(swiotlb_alloc_coherent);
  
diff --git a/mm/backing-dev.c b/mm/backing-dev.c

index 6dc4580df2af040b10bc10a5f9c423becc3ff47e..000e7b3b9896f2a9479687befd2442c43193614e 100644 (file)
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -359,23 +359,6 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
         flush_delayed_work(&bdi->wb.dwork);
  }
  
-/*
- * Called when the device behind @bdi has been removed or ejected.
- *
- * We can't really do much here except for reducing the dirty ratio at
- * the moment.  In the future we should be able to set a flag so that
- * the filesystem can handle errors at mark_inode_dirty time instead
- * of only at writeback time.
- */
-void bdi_unregister(struct backing_dev_info *bdi)
-{
-       if (WARN_ON_ONCE(!bdi->dev))
-               return;
-
-       bdi_set_min_ratio(bdi, 0);
-}
-EXPORT_SYMBOL(bdi_unregister);
-
  static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
  {
         memset(wb, 0, sizeof(*wb));
@@ -443,6 +426,7 @@ void bdi_destroy(struct backing_dev_info *bdi)
         int i;
  
         bdi_wb_shutdown(bdi);
+       bdi_set_min_ratio(bdi, 0);
  
         WARN_ON(!list_empty(&bdi->work_list));
         WARN_ON(delayed_work_pending(&bdi->wb.dwork));
diff --git a/mm/memcontrol.c b/mm/memcontrol.c

index 14c2f2017e37cc405e52cb12bc30b128997f1f8e..a04225d372ba3ab77516b970c10135b19def3ac4 100644 (file)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2323,6 +2323,8 @@ done_restock:
         css_get_many(&memcg->css, batch);
         if (batch > nr_pages)
                 refill_stock(memcg, batch - nr_pages);
+       if (!(gfp_mask & __GFP_WAIT))
+               goto done;
         /*
          * If the hierarchy is above the normal consumption range,
          * make the charging task trim their excess contribution.
@@ -5833,9 +5835,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
         if (!mem_cgroup_is_root(memcg))
                 page_counter_uncharge(&memcg->memory, 1);
  
-       /* XXX: caller holds IRQ-safe mapping->tree_lock */
-       VM_BUG_ON(!irqs_disabled());
-
+       /* Caller disabled preemption with mapping->tree_lock */
         mem_cgroup_charge_statistics(memcg, page, -1);
         memcg_check_events(memcg, page);
  }
diff --git a/mm/memory.c b/mm/memory.c

index 22e037e3364e0f49dcdc1de812181f54755bbbac..17734c3c1183ed799257d40eef18da55d809dd4a 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3737,7 +3737,7 @@ void print_vma_addr(char *prefix, unsigned long ip)
  }
  
  #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
-void might_fault(void)
+void __might_fault(const char *file, int line)
  {
         /*
          * Some code (nfs/sunrpc) uses socket ops on kernel memory while
@@ -3747,21 +3747,15 @@ void might_fault(void)
          */
         if (segment_eq(get_fs(), KERNEL_DS))
                 return;
-
-       /*
-        * it would be nicer only to annotate paths which are not under
-        * pagefault_disable, however that requires a larger audit and
-        * providing helpers like get_user_atomic.
-        */
-       if (in_atomic())
+       if (pagefault_disabled())
                 return;
-
-       __might_sleep(__FILE__, __LINE__, 0);
-
+       __might_sleep(file, line, 0);
+#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
         if (current->mm)
                 might_lock_read(&current->mm->mmap_sem);
+#endif
  }
-EXPORT_SYMBOL(might_fault);
+EXPORT_SYMBOL(__might_fault);
  #endif
  
  #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c

index 457bde530cbedcf0dea2f35e219466de0acf204d..9e88f749aa512395daea45f2727545fa0f281533 100644 (file)
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1969,8 +1969,10 @@ void try_offline_node(int nid)
                  * wait_table may be allocated from boot memory,
                  * here only free if it's allocated by vmalloc.
                  */
-               if (is_vmalloc_addr(zone->wait_table))
+               if (is_vmalloc_addr(zone->wait_table)) {
                         vfree(zone->wait_table);
+                       zone->wait_table = NULL;
+               }
         }
  }
  EXPORT_SYMBOL(try_offline_node);
diff --git a/mm/shmem.c b/mm/shmem.c

index de981370fbc5d596de3d419062c0977829602af7..3759099d8ce438f57398d84f50049b9e8821bd3a 100644 (file)
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2451,6 +2451,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
                         return -ENOMEM;
                 }
                 inode->i_op = &shmem_short_symlink_operations;
+               inode->i_link = info->symlink;
         } else {
                 error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
                 if (error) {
@@ -2474,30 +2475,23 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
         return 0;
  }
  
-static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd)
-{
-       nd_set_link(nd, SHMEM_I(d_inode(dentry))->symlink);
-       return NULL;
-}
-
-static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *shmem_follow_link(struct dentry *dentry, void **cookie)
  {
         struct page *page = NULL;
         int error = shmem_getpage(d_inode(dentry), 0, &page, SGP_READ, NULL);
-       nd_set_link(nd, error ? ERR_PTR(error) : kmap(page));
-       if (page)
-               unlock_page(page);
-       return page;
+       if (error)
+               return ERR_PTR(error);
+       unlock_page(page);
+       *cookie = page;
+       return kmap(page);
  }
  
-static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
+static void shmem_put_link(struct inode *unused, void *cookie)
  {
-       if (!IS_ERR(nd_get_link(nd))) {
-               struct page *page = cookie;
-               kunmap(page);
-               mark_page_accessed(page);
-               page_cache_release(page);
-       }
+       struct page *page = cookie;
+       kunmap(page);
+       mark_page_accessed(page);
+       page_cache_release(page);
  }
  
  #ifdef CONFIG_TMPFS_XATTR
@@ -2642,7 +2636,7 @@ static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
  
  static const struct inode_operations shmem_short_symlink_operations = {
         .readlink       = generic_readlink,
-       .follow_link    = shmem_follow_short_symlink,
+       .follow_link    = simple_follow_link,
  #ifdef CONFIG_TMPFS_XATTR
         .setxattr       = shmem_setxattr,
         .getxattr       = shmem_getxattr,
@@ -3401,7 +3395,13 @@ int shmem_zero_setup(struct vm_area_struct *vma)
         struct file *file;
         loff_t size = vma->vm_end - vma->vm_start;
  
-       file = shmem_file_setup("dev/zero", size, vma->vm_flags);
+       /*
+        * Cloning a new file under mmap_sem leads to a lock ordering conflict
+        * between XFS directory reading and selinux: since this file is only
+        * accessible to the user through its mapping, use S_PRIVATE flag to
+        * bypass file security, in the same way as shmem_kernel_file_setup().
+        */
+       file = __shmem_file_setup("dev/zero", size, vma->vm_flags, S_PRIVATE);
         if (IS_ERR(file))
                 return PTR_ERR(file);
  
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c

index 08bd7a3d464a9c6959a39e269d2284600e750a50..a8b5e749e84e7dbd50d325eecf84a47316145598 100644 (file)
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -289,7 +289,8 @@ static int create_handle_cache(struct zs_pool *pool)
  
  static void destroy_handle_cache(struct zs_pool *pool)
  {
-       kmem_cache_destroy(pool->handle_cachep);
+       if (pool->handle_cachep)
+               kmem_cache_destroy(pool->handle_cachep);
  }
  
  static unsigned long alloc_handle(struct zs_pool *pool)
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c

index e0670d7054f97c05d46b74952ee53d6fa6910776..659fb96672e41e2e6525323697ca23a41d271fbb 100644 (file)
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -796,9 +796,11 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge_port *p,
         int err = 0;
  
         if (ndm->ndm_flags & NTF_USE) {
+               local_bh_disable();
                 rcu_read_lock();
                 br_fdb_update(p->br, p, addr, vid, true);
                 rcu_read_unlock();
+               local_bh_enable();
         } else {
                 spin_lock_bh(&p->br->hash_lock);
                 err = fdb_add_entry(p, addr, ndm->ndm_state,
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c

index 22fd0419b31455965223566f4676b46efedd8722..ff667e18b2d6313f0a806752a4ef88435e939c4d 100644 (file)
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -1167,6 +1167,9 @@ static void br_multicast_add_router(struct net_bridge *br,
         struct net_bridge_port *p;
         struct hlist_node *slot = NULL;
  
+       if (!hlist_unhashed(&port->rlist))
+               return;
+
         hlist_for_each_entry(p, &br->router_list, rlist) {
                 if ((unsigned long) port >= (unsigned long) p)
                         break;
@@ -1194,12 +1197,8 @@ static void br_multicast_mark_router(struct net_bridge *br,
         if (port->multicast_router != 1)
                 return;
  
-       if (!hlist_unhashed(&port->rlist))
-               goto timer;
-
         br_multicast_add_router(br, port);
  
-timer:
         mod_timer(&port->multicast_router_timer,
                   now + br->multicast_querier_interval);
  }
diff --git a/net/core/dev.c b/net/core/dev.c

index 2c1c67fad64d57f3d744c89843816b2d64f5b834..aa82f9ab6a36d164769bf7c9633fcdfd5971466f 100644 (file)
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1718,15 +1718,8 @@ EXPORT_SYMBOL_GPL(is_skb_forwardable);
  
  int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
  {
-       if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
-               if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
-                       atomic_long_inc(&dev->rx_dropped);
-                       kfree_skb(skb);
-                       return NET_RX_DROP;
-               }
-       }
-
-       if (unlikely(!is_skb_forwardable(dev, skb))) {
+       if (skb_orphan_frags(skb, GFP_ATOMIC) ||
+           unlikely(!is_skb_forwardable(dev, skb))) {
                 atomic_long_inc(&dev->rx_dropped);
                 kfree_skb(skb);
                 return NET_RX_DROP;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c

index 3cfff2a3d651fb7d7cd2baaa3698c123eb7fc00f..41ec02242ea7c2ff57a6b506b685df22c62f3dcc 100644 (file)
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -4398,7 +4398,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
  
                 while (order) {
                         if (npages >= 1 << order) {
-                               page = alloc_pages(gfp_mask |
+                               page = alloc_pages((gfp_mask & ~__GFP_WAIT) |
                                                    __GFP_COMP |
                                                    __GFP_NOWARN |
                                                    __GFP_NORETRY,
diff --git a/net/core/sock.c b/net/core/sock.c

index 292f42228bfb361b5748998bbcc538b1e16a2f22..dc30dc5bb1b892923397fee073d42e9e5ef53a7e 100644 (file)
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -354,15 +354,12 @@ void sk_clear_memalloc(struct sock *sk)
  
         /*
          * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
-        * progress of swapping. However, if SOCK_MEMALLOC is cleared while
-        * it has rmem allocations there is a risk that the user of the
-        * socket cannot make forward progress due to exceeding the rmem
-        * limits. By rights, sk_clear_memalloc() should only be called
-        * on sockets being torn down but warn and reset the accounting if
-        * that assumption breaks.
+        * progress of swapping. SOCK_MEMALLOC may be cleared while
+        * it has rmem allocations due to the last swapfile being deactivated
+        * but there is a risk that the socket is unusable due to exceeding
+        * the rmem limits. Reclaim the reserves and obey rmem limits again.
          */
-       if (WARN_ON(sk->sk_forward_alloc))
-               sk_mem_reclaim(sk);
+       sk_mem_reclaim(sk);
  }
  EXPORT_SYMBOL_GPL(sk_clear_memalloc);
  
@@ -1883,7 +1880,7 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
  
         pfrag->offset = 0;
         if (SKB_FRAG_PAGE_ORDER) {
-               pfrag->page = alloc_pages(gfp | __GFP_COMP |
+               pfrag->page = alloc_pages((gfp & ~__GFP_WAIT) | __GFP_COMP |
                                           __GFP_NOWARN | __GFP_NORETRY,
                                           SKB_FRAG_PAGE_ORDER);
                 if (likely(pfrag->page)) {
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c

index 1c92ea67baefefb801d334fe60144cbcc3af63f2..83aa604f9273c332c5a0e5399253d961ef92eb9a 100644 (file)
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -90,6 +90,7 @@
  #include <linux/socket.h>
  #include <linux/sockios.h>
  #include <linux/igmp.h>
+#include <linux/inetdevice.h>
  #include <linux/in.h>
  #include <linux/errno.h>
  #include <linux/timer.h>
@@ -1960,6 +1961,7 @@ void udp_v4_early_demux(struct sk_buff *skb)
         struct sock *sk;
         struct dst_entry *dst;
         int dif = skb->dev->ifindex;
+       int ours;
  
         /* validate the packet */
         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr)))
@@ -1969,14 +1971,24 @@ void udp_v4_early_demux(struct sk_buff *skb)
         uh = udp_hdr(skb);
  
         if (skb->pkt_type == PACKET_BROADCAST ||
-           skb->pkt_type == PACKET_MULTICAST)
+           skb->pkt_type == PACKET_MULTICAST) {
+               struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
+
+               if (!in_dev)
+                       return;
+
+               ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr,
+                                      iph->protocol);
+               if (!ours)
+                       return;
                 sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr,
                                                    uh->source, iph->saddr, dif);
-       else if (skb->pkt_type == PACKET_HOST)
+       } else if (skb->pkt_type == PACKET_HOST) {
                 sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr,
                                              uh->source, iph->saddr, dif);
-       else
+       } else {
                 return;
+       }
  
         if (!sk)
                 return;
diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c

index d873ceea86e6c74c34e7fcd31bec41c78ce5720b..ca09bf49ac6806b399dba51399f84e47590cb9ed 100644 (file)
--- a/net/ipv6/addrconf_core.c
+++ b/net/ipv6/addrconf_core.c
@@ -133,6 +133,14 @@ static void snmp6_free_dev(struct inet6_dev *idev)
         free_percpu(idev->stats.ipv6);
  }
  
+static void in6_dev_finish_destroy_rcu(struct rcu_head *head)
+{
+       struct inet6_dev *idev = container_of(head, struct inet6_dev, rcu);
+
+       snmp6_free_dev(idev);
+       kfree(idev);
+}
+
  /* Nobody refers to this device, we may destroy it. */
  
  void in6_dev_finish_destroy(struct inet6_dev *idev)
@@ -151,7 +159,6 @@ void in6_dev_finish_destroy(struct inet6_dev *idev)
                 pr_warn("Freeing alive inet6 device %p\n", idev);
                 return;
         }
-       snmp6_free_dev(idev);
-       kfree_rcu(idev, rcu);
+       call_rcu(&idev->rcu, in6_dev_finish_destroy_rcu);
  }
  EXPORT_SYMBOL(in6_dev_finish_destroy);
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c

index 7b3f732269e43bb33dc1a6584eaa91b74eab9b64..1f93a5978f2ad43fc81a16427e34d07ca2c0f34e 100644 (file)
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -541,7 +541,7 @@ static void mpls_ifdown(struct net_device *dev)
  
         RCU_INIT_POINTER(dev->mpls_ptr, NULL);
  
-       kfree(mdev);
+       kfree_rcu(mdev, rcu);
  }
  
  static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
@@ -564,6 +564,17 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
         case NETDEV_UNREGISTER:
                 mpls_ifdown(dev);
                 break;
+       case NETDEV_CHANGENAME:
+               mdev = mpls_dev_get(dev);
+               if (mdev) {
+                       int err;
+
+                       mpls_dev_sysctl_unregister(mdev);
+                       err = mpls_dev_sysctl_register(dev, mdev);
+                       if (err)
+                               return notifier_from_errno(err);
+               }
+               break;
         }
         return NOTIFY_OK;
  }
diff --git a/net/mpls/internal.h b/net/mpls/internal.h

index b064c345042c17ccd9ec841535857fb29041a8a3..8cabeb5a1cb928c856c037c5994116df8547fb71 100644 (file)
--- a/net/mpls/internal.h
+++ b/net/mpls/internal.h
@@ -16,6 +16,7 @@ struct mpls_dev {
         int                     input_enabled;
  
         struct ctl_table_header *sysctl;
+       struct rcu_head         rcu;
  };
  
  struct sk_buff;
diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c

index 4776282c64175209924740fbd87a56de8e05b609..33e6d6e2908f553516c5ca97c4b93abee7b7057b 100644 (file)
--- a/net/openvswitch/vport-netdev.c
+++ b/net/openvswitch/vport-netdev.c
@@ -125,6 +125,7 @@ static struct vport *netdev_create(const struct vport_parms *parms)
         if (err)
                 goto error_master_upper_dev_unlink;
  
+       dev_disable_lro(netdev_vport->dev);
         dev_set_promiscuity(netdev_vport->dev, 1);
         netdev_vport->dev->priv_flags |= IFF_OVS_DATAPATH;
         rtnl_unlock();
diff --git a/net/sctp/auth.c b/net/sctp/auth.c

index fb7976aee61c84f38aecdc5c5f0d8be20e577fa9..4f15b7d730e13d6aaa58ba7a28262c9831afea95 100644 (file)
--- a/net/sctp/auth.c
+++ b/net/sctp/auth.c
@@ -381,13 +381,14 @@ nomem:
  }
  
  
-/* Public interface to creat the association shared key.
+/* Public interface to create the association shared key.
   * See code above for the algorithm.
   */
  int sctp_auth_asoc_init_active_key(struct sctp_association *asoc, gfp_t gfp)
  {
         struct sctp_auth_bytes  *secret;
         struct sctp_shared_key *ep_key;
+       struct sctp_chunk *chunk;
  
         /* If we don't support AUTH, or peer is not capable
          * we don't need to do anything.
@@ -410,6 +411,14 @@ int sctp_auth_asoc_init_active_key(struct sctp_association *asoc, gfp_t gfp)
         sctp_auth_key_put(asoc->asoc_shared_key);
         asoc->asoc_shared_key = secret;
  
+       /* Update send queue in case any chunk already in there now
+        * needs authenticating
+        */
+       list_for_each_entry(chunk, &asoc->outqueue.out_chunk_list, list) {
+               if (sctp_auth_send_cid(chunk->chunk_hdr->type, asoc))
+                       chunk->auth = 1;
+       }
+
         return 0;
  }
  
diff --git a/net/tipc/socket.c b/net/tipc/socket.c

index 9074b5cede38b8edd75890b684a706d96b9f71ba..f485600c4507bc152cef654ae5667a03a52d990c 100644 (file)
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -2142,11 +2142,17 @@ static void tipc_sk_timeout(unsigned long data)
         peer_node = tsk_peer_node(tsk);
  
         if (tsk->probing_state == TIPC_CONN_PROBING) {
-               /* Previous probe not answered -> self abort */
-               skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE,
-                                     TIPC_CONN_MSG, SHORT_H_SIZE, 0,
-                                     own_node, peer_node, tsk->portid,
-                                     peer_port, TIPC_ERR_NO_PORT);
+               if (!sock_owned_by_user(sk)) {
+                       sk->sk_socket->state = SS_DISCONNECTING;
+                       tsk->connected = 0;
+                       tipc_node_remove_conn(sock_net(sk), tsk_peer_node(tsk),
+                                             tsk_peer_port(tsk));
+                       sk->sk_state_change(sk);
+               } else {
+                       /* Try again later */
+                       sk_reset_timer(sk, &sk->sk_timer, (HZ / 20));
+               }
+
         } else {
                 skb = tipc_msg_create(CONN_MANAGER, CONN_PROBE,
                                       INT_H_SIZE, 0, peer_node, own_node,
diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c

index fff1bef6ed6d916f9019a63d708652f4ab07cddf..fd682832a0e3635d52c734871d5402d270336dc3 100644 (file)
--- a/net/wireless/wext-compat.c
+++ b/net/wireless/wext-compat.c
@@ -1333,6 +1333,8 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev)
         memcpy(bssid, wdev->current_bss->pub.bssid, ETH_ALEN);
         wdev_unlock(wdev);
  
+       memset(&sinfo, 0, sizeof(sinfo));
+
         if (rdev_get_station(rdev, dev, bssid, &sinfo))
                 return NULL;
  
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl

index 89b1df4e72ab3423bce45011fb03f86c193f5ad4..c5ec977b9c3786097b214e1c835efd8fa337c173 100755 (executable)
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -3169,12 +3169,12 @@ sub process {
                 }
  
  # check for global initialisers.
-               if ($line =~ /^\+(\s*$Type\s*$Ident\s*(?:\s+$Modifier))*\s*=\s*(0|NULL|false)\s*;/) {
+               if ($line =~ /^\+$Type\s*$Ident(?:\s+$Modifier)*\s*=\s*(?:0|NULL|false)\s*;/) {
                         if (ERROR("GLOBAL_INITIALISERS",
                                   "do not initialise globals to 0 or NULL\n" .
                                       $herecurr) &&
                             $fix) {
-                               $fixed[$fixlinenr] =~ s/($Type\s*$Ident\s*(?:\s+$Modifier))*\s*=\s*(0|NULL|false)\s*;/$1;/;
+                               $fixed[$fixlinenr] =~ s/(^.$Type\s*$Ident(?:\s+$Modifier)*)\s*=\s*(0|NULL|false)\s*;/$1;/;
                         }
                 }
  # check for static initialisers.
diff --git a/security/capability.c b/security/capability.c

index 0d03fcc489a49ee3221b1369ca2c1ff931c691cd..7d3f38fe02ba6ca7d75446c6d20e41c049b17b00 100644 (file)
--- a/security/capability.c
+++ b/security/capability.c
@@ -209,8 +209,8 @@ static int cap_inode_readlink(struct dentry *dentry)
         return 0;
  }
  
-static int cap_inode_follow_link(struct dentry *dentry,
-                                struct nameidata *nameidata)
+static int cap_inode_follow_link(struct dentry *dentry, struct inode *inode,
+                                bool rcu)
  {
         return 0;
  }
diff --git a/security/security.c b/security/security.c

index 8e9b1f4b9b45dfac98287fe969b869a3a29fb2bc..04c8feca081a3bafc8dde4f95ebc44badda4e5a5 100644 (file)
--- a/security/security.c
+++ b/security/security.c
@@ -581,11 +581,12 @@ int security_inode_readlink(struct dentry *dentry)
         return security_ops->inode_readlink(dentry);
  }
  
-int security_inode_follow_link(struct dentry *dentry, struct nameidata *nd)
+int security_inode_follow_link(struct dentry *dentry, struct inode *inode,
+                              bool rcu)
  {
-       if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
+       if (unlikely(IS_PRIVATE(inode)))
                 return 0;
-       return security_ops->inode_follow_link(dentry, nd);
+       return security_ops->inode_follow_link(dentry, inode, rcu);
  }
  
  int security_inode_permission(struct inode *inode, int mask)
diff --git a/security/selinux/avc.c b/security/selinux/avc.c

index 3c17dda9571d4e97f7f460e162a6195bf215758b..0b122b1421a9dcc7dfd26ac6f80d00d1c6a0d55e 100644 (file)
--- a/security/selinux/avc.c
+++ b/security/selinux/avc.c
@@ -761,7 +761,23 @@ int avc_has_perm(u32 ssid, u32 tsid, u16 tclass,
  
         rc = avc_has_perm_noaudit(ssid, tsid, tclass, requested, 0, &avd);
  
-       rc2 = avc_audit(ssid, tsid, tclass, requested, &avd, rc, auditdata);
+       rc2 = avc_audit(ssid, tsid, tclass, requested, &avd, rc, auditdata, 0);
+       if (rc2)
+               return rc2;
+       return rc;
+}
+
+int avc_has_perm_flags(u32 ssid, u32 tsid, u16 tclass,
+                      u32 requested, struct common_audit_data *auditdata,
+                      int flags)
+{
+       struct av_decision avd;
+       int rc, rc2;
+
+       rc = avc_has_perm_noaudit(ssid, tsid, tclass, requested, 0, &avd);
+
+       rc2 = avc_audit(ssid, tsid, tclass, requested, &avd, rc,
+                       auditdata, flags);
         if (rc2)
                 return rc2;
         return rc;
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c

index 7dade28affba5a0ebc0944be49dbd59dbf5c8761..ffa5a642629a1cbf16467f7bc0ebd3b20cdf02f0 100644 (file)
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1564,7 +1564,7 @@ static int cred_has_capability(const struct cred *cred,
  
         rc = avc_has_perm_noaudit(sid, sid, sclass, av, 0, &avd);
         if (audit == SECURITY_CAP_AUDIT) {
-               int rc2 = avc_audit(sid, sid, sclass, av, &avd, rc, &ad);
+               int rc2 = avc_audit(sid, sid, sclass, av, &avd, rc, &ad, 0);
                 if (rc2)
                         return rc2;
         }
@@ -2861,11 +2861,23 @@ static int selinux_inode_readlink(struct dentry *dentry)
         return dentry_has_perm(cred, dentry, FILE__READ);
  }
  
-static int selinux_inode_follow_link(struct dentry *dentry, struct nameidata *nameidata)
+static int selinux_inode_follow_link(struct dentry *dentry, struct inode *inode,
+                                    bool rcu)
  {
         const struct cred *cred = current_cred();
+       struct common_audit_data ad;
+       struct inode_security_struct *isec;
+       u32 sid;
  
-       return dentry_has_perm(cred, dentry, FILE__READ);
+       validate_creds(cred);
+
+       ad.type = LSM_AUDIT_DATA_DENTRY;
+       ad.u.dentry = dentry;
+       sid = cred_sid(cred);
+       isec = inode->i_security;
+
+       return avc_has_perm_flags(sid, isec->sid, isec->sclass, FILE__READ, &ad,
+                                 rcu ? MAY_NOT_BLOCK : 0);
  }
  
  static noinline int audit_inode_permission(struct inode *inode,
diff --git a/security/selinux/include/avc.h b/security/selinux/include/avc.h

index ddf8eec03f211757845de5378afd1c2d5ebfe774..5973c327c54e712edba1034808defd01afa8a8a0 100644 (file)
--- a/security/selinux/include/avc.h
+++ b/security/selinux/include/avc.h
@@ -130,7 +130,8 @@ static inline int avc_audit(u32 ssid, u32 tsid,
                             u16 tclass, u32 requested,
                             struct av_decision *avd,
                             int result,
-                           struct common_audit_data *a)
+                           struct common_audit_data *a,
+                           int flags)
  {
         u32 audited, denied;
         audited = avc_audit_required(requested, avd, result, 0, &denied);
@@ -138,7 +139,7 @@ static inline int avc_audit(u32 ssid, u32 tsid,
                 return 0;
         return slow_avc_audit(ssid, tsid, tclass,
                               requested, audited, denied, result,
-                             a, 0);
+                             a, flags);
  }
  
  #define AVC_STRICT 1 /* Ignore permissive mode. */
@@ -150,6 +151,10 @@ int avc_has_perm_noaudit(u32 ssid, u32 tsid,
  int avc_has_perm(u32 ssid, u32 tsid,
                  u16 tclass, u32 requested,
                  struct common_audit_data *auditdata);
+int avc_has_perm_flags(u32 ssid, u32 tsid,
+                      u16 tclass, u32 requested,
+                      struct common_audit_data *auditdata,
+                      int flags);
  
  u32 avc_policy_seqno(void);
  
diff --git a/sound/hda/hdac_regmap.c b/sound/hda/hdac_regmap.c

index 7371e0c3926f32a9104b521d0bf70f1c35f0740f..1eabcdf69457311129b766ec237d37e402f640bc 100644 (file)
--- a/sound/hda/hdac_regmap.c
+++ b/sound/hda/hdac_regmap.c
@@ -246,6 +246,9 @@ static int hda_reg_read(void *context, unsigned int reg, unsigned int *val)
                 return hda_reg_read_stereo_amp(codec, reg, val);
         if (verb == AC_VERB_GET_PROC_COEF)
                 return hda_reg_read_coef(codec, reg, val);
+       if ((verb & 0x700) == AC_VERB_SET_AMP_GAIN_MUTE)
+               reg &= ~AC_AMP_FAKE_MUTE;
+
         err = snd_hdac_exec_verb(codec, reg, 0, val);
         if (err < 0)
                 return err;
@@ -265,6 +268,9 @@ static int hda_reg_write(void *context, unsigned int reg, unsigned int val)
         unsigned int verb;
         int i, bytes, err;
  
+       if (codec->caps_overwriting)
+               return 0;
+
         reg &= ~0x00080000U; /* drop GET bit */
         reg |= (codec->addr << 28);
         verb = get_verb(reg);
@@ -280,6 +286,8 @@ static int hda_reg_write(void *context, unsigned int reg, unsigned int val)
  
         switch (verb & 0xf00) {
         case AC_VERB_SET_AMP_GAIN_MUTE:
+               if ((reg & AC_AMP_FAKE_MUTE) && (val & AC_AMP_MUTE))
+                       val = 0;
                 verb = AC_VERB_SET_AMP_GAIN_MUTE;
                 if (reg & AC_AMP_GET_LEFT)
                         verb |= AC_AMP_SET_LEFT >> 8;
diff --git a/sound/mips/Kconfig b/sound/mips/Kconfig

index d2f615ab177a7ca021d9f802d61ba2b57b10a7ca..2153d31fb66312025cb6221afd8476d313261785 100644 (file)
--- a/sound/mips/Kconfig
+++ b/sound/mips/Kconfig
@@ -12,12 +12,14 @@ if SND_MIPS
  config SND_SGI_O2
         tristate "SGI O2 Audio"
         depends on SGI_IP32
+       select SND_PCM
          help
                  Sound support for the SGI O2 Workstation. 
  
  config SND_SGI_HAL2
          tristate "SGI HAL2 Audio"
          depends on SGI_HAS_HAL2
+       select SND_PCM
          help
                  Sound support for the SGI Indy and Indigo2 Workstation.
  
diff --git a/sound/pci/hda/hda_codec.c b/sound/pci/hda/hda_codec.c

index b49feff0a31982e7c22071c08e8d088e91a97727..5645481af3d9571b8340c963a27c34e377c405c5 100644 (file)
--- a/sound/pci/hda/hda_codec.c
+++ b/sound/pci/hda/hda_codec.c
@@ -436,7 +436,7 @@ static unsigned int get_num_devices(struct hda_codec *codec, hda_nid_t nid)
             get_wcaps_type(wcaps) != AC_WID_PIN)
                 return 0;
  
-       parm = snd_hda_param_read(codec, nid, AC_PAR_DEVLIST_LEN);
+       parm = snd_hdac_read_parm_uncached(&codec->core, nid, AC_PAR_DEVLIST_LEN);
         if (parm == -1 && codec->bus->rirb_error)
                 parm = 0;
         return parm & AC_DEV_LIST_LEN_MASK;
@@ -1375,6 +1375,31 @@ int snd_hda_override_amp_caps(struct hda_codec *codec, hda_nid_t nid, int dir,
  }
  EXPORT_SYMBOL_GPL(snd_hda_override_amp_caps);
  
+/**
+ * snd_hda_codec_amp_update - update the AMP mono value
+ * @codec: HD-audio codec
+ * @nid: NID to update the AMP value
+ * @ch: channel to update (0 or 1)
+ * @dir: #HDA_INPUT or #HDA_OUTPUT
+ * @idx: the index value (only for input direction)
+ * @mask: bit mask to set
+ * @val: the bits value to set
+ *
+ * Update the AMP values for the given channel, direction and index.
+ */
+int snd_hda_codec_amp_update(struct hda_codec *codec, hda_nid_t nid,
+                            int ch, int dir, int idx, int mask, int val)
+{
+       unsigned int cmd = snd_hdac_regmap_encode_amp(nid, ch, dir, idx);
+
+       /* enable fake mute if no h/w mute but min=mute */
+       if ((query_amp_caps(codec, nid, dir) &
+            (AC_AMPCAP_MUTE | AC_AMPCAP_MIN_MUTE)) == AC_AMPCAP_MIN_MUTE)
+               cmd |= AC_AMP_FAKE_MUTE;
+       return snd_hdac_regmap_update_raw(&codec->core, cmd, mask, val);
+}
+EXPORT_SYMBOL_GPL(snd_hda_codec_amp_update);
+
  /**
   * snd_hda_codec_amp_stereo - update the AMP stereo values
   * @codec: HD-audio codec
diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c

index fea198c58196a41caf4b093da8d6b37355e78a32..b6db25b23dd316d0205d6a14fed365f5ae8cde4f 100644 (file)
--- a/sound/pci/hda/hda_intel.c
+++ b/sound/pci/hda/hda_intel.c
@@ -340,6 +340,11 @@ enum {
  #define use_vga_switcheroo(chip)       0
  #endif
  
+#define CONTROLLER_IN_GPU(pci) (((pci)->device == 0x0a0c) || \
+                                       ((pci)->device == 0x0c0c) || \
+                                       ((pci)->device == 0x0d0c) || \
+                                       ((pci)->device == 0x160c))
+
  static char *driver_short_names[] = {
         [AZX_DRIVER_ICH] = "HDA Intel",
         [AZX_DRIVER_PCH] = "HDA Intel PCH",
@@ -1854,8 +1859,17 @@ static int azx_probe_continue(struct azx *chip)
         if (chip->driver_caps & AZX_DCAPS_I915_POWERWELL) {
  #ifdef CONFIG_SND_HDA_I915
                 err = hda_i915_init(hda);
-               if (err < 0)
-                       goto out_free;
+               if (err < 0) {
+                       /* if the controller is bound only with HDMI/DP
+                        * (for HSW and BDW), we need to abort the probe;
+                        * for other chips, still continue probing as other
+                        * codecs can be on the same link.
+                        */
+                       if (CONTROLLER_IN_GPU(pci))
+                               goto out_free;
+                       else
+                               goto skip_i915;
+               }
                 err = hda_display_power(hda, true);
                 if (err < 0) {
                         dev_err(chip->card->dev,
@@ -1865,6 +1879,9 @@ static int azx_probe_continue(struct azx *chip)
  #endif
         }
  
+#ifdef CONFIG_SND_HDA_I915
+ skip_i915:
+#endif
         err = azx_first_init(chip);
         if (err < 0)
                 goto out_free;
diff --git a/sound/pci/hda/hda_local.h b/sound/pci/hda/hda_local.h

index 3b567f42296b9d6b2ca148c66c59c12b5628cb9e..bed66c3144318de3f82dcddeb2445a84ad9ce594 100644 (file)
--- a/sound/pci/hda/hda_local.h
+++ b/sound/pci/hda/hda_local.h
@@ -129,8 +129,8 @@ int snd_hda_mixer_amp_switch_put_beep(struct snd_kcontrol *kcontrol,
  /* lowlevel accessor with caching; use carefully */
  #define snd_hda_codec_amp_read(codec, nid, ch, dir, idx) \
         snd_hdac_regmap_get_amp(&(codec)->core, nid, ch, dir, idx)
-#define snd_hda_codec_amp_update(codec, nid, ch, dir, idx, mask, val) \
-       snd_hdac_regmap_update_amp(&(codec)->core, nid, ch, dir, idx, mask, val)
+int snd_hda_codec_amp_update(struct hda_codec *codec, hda_nid_t nid,
+                            int ch, int dir, int idx, int mask, int val);
  int snd_hda_codec_amp_stereo(struct hda_codec *codec, hda_nid_t nid,
                              int dir, int idx, int mask, int val);
  int snd_hda_codec_amp_init(struct hda_codec *codec, hda_nid_t nid, int ch,
diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c

index 4641684264653c5ef265e6f61f6ef8c26bfa5f86..6d010452c1f5c5d131c0ac0a4ae3b2c539e56ad9 100644 (file)
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -2168,6 +2168,7 @@ static const struct hda_fixup alc882_fixups[] = {
  static const struct snd_pci_quirk alc882_fixup_tbl[] = {
         SND_PCI_QUIRK(0x1025, 0x006c, "Acer Aspire 9810", ALC883_FIXUP_ACER_EAPD),
         SND_PCI_QUIRK(0x1025, 0x0090, "Acer Aspire", ALC883_FIXUP_ACER_EAPD),
+       SND_PCI_QUIRK(0x1025, 0x0107, "Acer Aspire", ALC883_FIXUP_ACER_EAPD),
         SND_PCI_QUIRK(0x1025, 0x010a, "Acer Ferrari 5000", ALC883_FIXUP_ACER_EAPD),
         SND_PCI_QUIRK(0x1025, 0x0110, "Acer Aspire", ALC883_FIXUP_ACER_EAPD),
         SND_PCI_QUIRK(0x1025, 0x0112, "Acer Aspire 9303", ALC883_FIXUP_ACER_EAPD),
@@ -4514,6 +4515,8 @@ enum {
         ALC288_FIXUP_DELL_HEADSET_MODE,
         ALC288_FIXUP_DELL1_MIC_NO_PRESENCE,
         ALC288_FIXUP_DELL_XPS_13_GPIO6,
+       ALC292_FIXUP_DELL_E7X,
+       ALC292_FIXUP_DISABLE_AAMIX,
  };
  
  static const struct hda_fixup alc269_fixups[] = {
@@ -5036,6 +5039,16 @@ static const struct hda_fixup alc269_fixups[] = {
                 .chained = true,
                 .chain_id = ALC288_FIXUP_DELL1_MIC_NO_PRESENCE
         },
+       [ALC292_FIXUP_DISABLE_AAMIX] = {
+               .type = HDA_FIXUP_FUNC,
+               .v.func = alc_fixup_disable_aamix,
+       },
+       [ALC292_FIXUP_DELL_E7X] = {
+               .type = HDA_FIXUP_FUNC,
+               .v.func = alc_fixup_dell_xps13,
+               .chained = true,
+               .chain_id = ALC292_FIXUP_DISABLE_AAMIX
+       },
  };
  
  static const struct snd_pci_quirk alc269_fixup_tbl[] = {
@@ -5048,6 +5061,8 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
         SND_PCI_QUIRK(0x1025, 0x0775, "Acer Aspire E1-572", ALC271_FIXUP_HP_GATE_MIC_JACK_E1_572),
         SND_PCI_QUIRK(0x1025, 0x079b, "Acer Aspire V5-573G", ALC282_FIXUP_ASPIRE_V5_PINS),
         SND_PCI_QUIRK(0x1028, 0x0470, "Dell M101z", ALC269_FIXUP_DELL_M101Z),
+       SND_PCI_QUIRK(0x1028, 0x05ca, "Dell Latitude E7240", ALC292_FIXUP_DELL_E7X),
+       SND_PCI_QUIRK(0x1028, 0x05cb, "Dell Latitude E7440", ALC292_FIXUP_DELL_E7X),
         SND_PCI_QUIRK(0x1028, 0x05da, "Dell Vostro 5460", ALC290_FIXUP_SUBWOOFER),
         SND_PCI_QUIRK(0x1028, 0x05f4, "Dell", ALC269_FIXUP_DELL1_MIC_NO_PRESENCE),
         SND_PCI_QUIRK(0x1028, 0x05f5, "Dell", ALC269_FIXUP_DELL1_MIC_NO_PRESENCE),
@@ -5057,6 +5072,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
         SND_PCI_QUIRK(0x1028, 0x0638, "Dell Inspiron 5439", ALC290_FIXUP_MONO_SPEAKERS_HSJACK),
         SND_PCI_QUIRK(0x1028, 0x064a, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE),
         SND_PCI_QUIRK(0x1028, 0x064b, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE),
+       SND_PCI_QUIRK(0x1028, 0x0665, "Dell XPS 13", ALC292_FIXUP_DELL_E7X),
         SND_PCI_QUIRK(0x1028, 0x06c7, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE),
         SND_PCI_QUIRK(0x1028, 0x06d9, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE),
         SND_PCI_QUIRK(0x1028, 0x06da, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE),
@@ -5376,6 +5392,13 @@ static const struct snd_hda_pin_quirk alc269_pin_fixup_tbl[] = {
                 {0x17, 0x40000000},
                 {0x1d, 0x40700001},
                 {0x21, 0x02211040}),
+       SND_HDA_PIN_QUIRK(0x10ec0256, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE,
+               ALC255_STANDARD_PINS,
+               {0x12, 0x90a60160},
+               {0x14, 0x90170120},
+               {0x17, 0x40000000},
+               {0x1d, 0x40700001},
+               {0x21, 0x02211030}),
         SND_HDA_PIN_QUIRK(0x10ec0256, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE,
                 ALC256_STANDARD_PINS,
                 {0x13, 0x40000000}),
@@ -5629,8 +5652,7 @@ static int patch_alc269(struct hda_codec *codec)
  
         spec = codec->spec;
         spec->gen.shared_mic_vref_pin = 0x18;
-       if (codec->core.vendor_id != 0x10ec0292)
-               codec->power_save_node = 1;
+       codec->power_save_node = 1;
  
         snd_hda_pick_fixup(codec, alc269_fixup_models,
                        alc269_fixup_tbl, alc269_fixups);
diff --git a/sound/pci/hda/patch_sigmatel.c b/sound/pci/hda/patch_sigmatel.c

index 6833c74ed6ff47f60598d6d250b8709d914590a5..6c66d7e164391b7e824e2c72b5fa2e5ae889f16d 100644 (file)
--- a/sound/pci/hda/patch_sigmatel.c
+++ b/sound/pci/hda/patch_sigmatel.c
@@ -100,6 +100,7 @@ enum {
         STAC_HP_ENVY_BASS,
         STAC_HP_BNB13_EQ,
         STAC_HP_ENVY_TS_BASS,
+       STAC_HP_ENVY_TS_DAC_BIND,
         STAC_92HD83XXX_GPIO10_EAPD,
         STAC_92HD83XXX_MODELS
  };
@@ -2171,6 +2172,22 @@ static void stac92hd83xxx_fixup_gpio10_eapd(struct hda_codec *codec,
         spec->eapd_switch = 0;
  }
  
+static void hp_envy_ts_fixup_dac_bind(struct hda_codec *codec,
+                                           const struct hda_fixup *fix,
+                                           int action)
+{
+       struct sigmatel_spec *spec = codec->spec;
+       static hda_nid_t preferred_pairs[] = {
+               0xd, 0x13,
+               0
+       };
+
+       if (action != HDA_FIXUP_ACT_PRE_PROBE)
+               return;
+
+       spec->gen.preferred_dacs = preferred_pairs;
+}
+
  static const struct hda_verb hp_bnb13_eq_verbs[] = {
         /* 44.1KHz base */
         { 0x22, 0x7A6, 0x3E },
@@ -2686,6 +2703,12 @@ static const struct hda_fixup stac92hd83xxx_fixups[] = {
                         {}
                 },
         },
+       [STAC_HP_ENVY_TS_DAC_BIND] = {
+               .type = HDA_FIXUP_FUNC,
+               .v.func = hp_envy_ts_fixup_dac_bind,
+               .chained = true,
+               .chain_id = STAC_HP_ENVY_TS_BASS,
+       },
         [STAC_92HD83XXX_GPIO10_EAPD] = {
                 .type = HDA_FIXUP_FUNC,
                 .v.func = stac92hd83xxx_fixup_gpio10_eapd,
@@ -2764,6 +2787,8 @@ static const struct snd_pci_quirk stac92hd83xxx_fixup_tbl[] = {
                           "HP bNB13", STAC_HP_BNB13_EQ),
         SND_PCI_QUIRK(PCI_VENDOR_ID_HP, 0x190e,
                           "HP ENVY TS", STAC_HP_ENVY_TS_BASS),
+       SND_PCI_QUIRK(PCI_VENDOR_ID_HP, 0x1967,
+                         "HP ENVY TS", STAC_HP_ENVY_TS_DAC_BIND),
         SND_PCI_QUIRK(PCI_VENDOR_ID_HP, 0x1940,
                           "HP bNB13", STAC_HP_BNB13_EQ),
         SND_PCI_QUIRK(PCI_VENDOR_ID_HP, 0x1941,
diff --git a/sound/pci/hda/patch_via.c b/sound/pci/hda/patch_via.c

index 31a95cca015d4d1c34a1facff2e226b6821a5203..bab6c04932aa050ff63f054bf172acf288f5ee5e 100644 (file)
--- a/sound/pci/hda/patch_via.c
+++ b/sound/pci/hda/patch_via.c
@@ -449,6 +449,15 @@ static int via_suspend(struct hda_codec *codec)
  
         return 0;
  }
+
+static int via_resume(struct hda_codec *codec)
+{
+       /* some delay here to make jack detection work (bko#98921) */
+       msleep(10);
+       codec->patch_ops.init(codec);
+       regcache_sync(codec->core.regmap);
+       return 0;
+}
  #endif
  
  #ifdef CONFIG_PM
@@ -475,6 +484,7 @@ static const struct hda_codec_ops via_patch_ops = {
         .stream_pm = snd_hda_gen_stream_pm,
  #ifdef CONFIG_PM
         .suspend = via_suspend,
+       .resume = via_resume,
         .check_power_status = via_check_power_status,
  #endif
  };
diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c

index 3e2ef61c627b831bfec65724cc7166db051f5099..8b7e391dd0b80193d49f8634bb69fa45814593f0 100644 (file)
--- a/sound/usb/mixer.c
+++ b/sound/usb/mixer.c
@@ -918,6 +918,7 @@ static void volume_control_quirks(struct usb_mixer_elem_info *cval,
         case USB_ID(0x046d, 0x081d): /* HD Webcam c510 */
         case USB_ID(0x046d, 0x0825): /* HD Webcam c270 */
         case USB_ID(0x046d, 0x0826): /* HD Webcam c525 */
+       case USB_ID(0x046d, 0x08ca): /* Logitech Quickcam Fusion */
         case USB_ID(0x046d, 0x0991):
         /* Most audio usb devices lie about volume resolution.
          * Most Logitech webcams have res = 384.
@@ -1582,12 +1583,6 @@ static int parse_audio_mixer_unit(struct mixer_build *state, int unitid,
                               unitid);
                 return -EINVAL;
         }
-       /* no bmControls field (e.g. Maya44) -> ignore */
-       if (desc->bLength <= 10 + input_pins) {
-               usb_audio_dbg(state->chip, "MU %d has no bmControls field\n",
-                             unitid);
-               return 0;
-       }
  
         num_ins = 0;
         ich = 0;
@@ -1595,6 +1590,9 @@ static int parse_audio_mixer_unit(struct mixer_build *state, int unitid,
                 err = parse_audio_unit(state, desc->baSourceID[pin]);
                 if (err < 0)
                         continue;
+               /* no bmControls field (e.g. Maya44) -> ignore */
+               if (desc->bLength <= 10 + input_pins)
+                       continue;
                 err = check_input_term(state, desc->baSourceID[pin], &iterm);
                 if (err < 0)
                         return err;
diff --git a/sound/usb/mixer_maps.c b/sound/usb/mixer_maps.c

index b703cb3cda1993402d60efc03e9e7d840cb68f72..e5000da9e9d7093f6e287194665de2d63f046e93 100644 (file)
--- a/sound/usb/mixer_maps.c
+++ b/sound/usb/mixer_maps.c
@@ -436,6 +436,11 @@ static struct usbmix_ctl_map usbmix_ctl_maps[] = {
                 .id = USB_ID(0x200c, 0x1018),
                 .map = ebox44_map,
         },
+       {
+               /* MAYA44 USB+ */
+               .id = USB_ID(0x2573, 0x0008),
+               .map = maya44_map,
+       },
         {
                 /* KEF X300A */
                 .id = USB_ID(0x27ac, 0x1000),
diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c

index 29175346cc4f4f9311726cb13bf1615e68aad7cf..754e689596a21b43f3b3a45b8f3062ec29b74099 100644 (file)
--- a/sound/usb/quirks.c
+++ b/sound/usb/quirks.c
@@ -1120,6 +1120,7 @@ bool snd_usb_get_sample_rate_quirk(struct snd_usb_audio *chip)
         case USB_ID(0x045E, 0x0772): /* MS Lifecam Studio */
         case USB_ID(0x045E, 0x0779): /* MS Lifecam HD-3000 */
         case USB_ID(0x04D8, 0xFEEA): /* Benchmark DAC1 Pre */
+       case USB_ID(0x074D, 0x3553): /* Outlaw RR2150 (Micronas UAC3553B) */
                 return true;
         }
         return false;
@@ -1266,8 +1267,9 @@ u64 snd_usb_interface_dsd_format_quirks(struct snd_usb_audio *chip,
                 if (fp->altsetting == 2)
                         return SNDRV_PCM_FMTBIT_DSD_U32_BE;
                 break;
-       /* DIYINHK DSD DXD 384kHz USB to I2S/DSD */
-       case USB_ID(0x20b1, 0x2009):
+
+       case USB_ID(0x20b1, 0x2009): /* DIYINHK DSD DXD 384kHz USB to I2S/DSD */
+       case USB_ID(0x20b1, 0x2023): /* JLsounds I2SoverUSB */
                 if (fp->altsetting == 3)
                         return SNDRV_PCM_FMTBIT_DSD_U32_BE;
                 break;
diff --git a/tools/Makefile b/tools/Makefile

index 9a617adc6675dc06552de428c93b3c611599900b..b35102721cbbc82d1560ae657d41cebebe2fa723 100644 (file)
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -1,3 +1,8 @@
+# Some of the tools (perf) use the same make variables
+# as the kernel build.
+export srctree=
+export objtree=
+
  include scripts/Makefile.include
  
  help:
@@ -47,11 +52,16 @@ cgroup firewire hv guest usb virtio vm net: FORCE
  liblockdep: FORCE
         $(call descend,lib/lockdep)
  
-libapikfs: FORCE
+libapi: FORCE
         $(call descend,lib/api)
  
-perf: libapikfs FORCE
-       $(call descend,$@)
+# The perf build does not follow the descend function setup,
+# invoking it via its own make rule.
+PERF_O   = $(if $(O),$(O)/tools/perf,)
+
+perf: FORCE
+       $(Q)mkdir -p $(PERF_O) .
+       $(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir=
  
  selftests: FORCE
         $(call descend,testing/$@)
@@ -97,10 +107,10 @@ cgroup_clean hv_clean firewire_clean lguest_clean usb_clean virtio_clean vm_clea
  liblockdep_clean:
         $(call descend,lib/lockdep,clean)
  
-libapikfs_clean:
+libapi_clean:
         $(call descend,lib/api,clean)
  
-perf_clean: libapikfs_clean
+perf_clean:
         $(call descend,$(@:_clean=),clean)
  
  selftests_clean:
diff --git a/tools/arch/alpha/include/asm/barrier.h b/tools/arch/alpha/include/asm/barrier.h

new file mode 100644 (file)

index 0000000..95df19c
--- /dev/null
+++ b/tools/arch/alpha/include/asm/barrier.h
@@ -0,0 +1,8 @@
+#ifndef __TOOLS_LINUX_ASM_ALPHA_BARRIER_H
+#define __TOOLS_LINUX_ASM_ALPHA_BARRIER_H
+
+#define mb()   __asm__ __volatile__("mb": : :"memory")
+#define rmb()  __asm__ __volatile__("mb": : :"memory")
+#define wmb()  __asm__ __volatile__("wmb": : :"memory")
+
+#endif         /* __TOOLS_LINUX_ASM_ALPHA_BARRIER_H */
diff --git a/tools/arch/arm/include/asm/barrier.h b/tools/arch/arm/include/asm/barrier.h

new file mode 100644 (file)

index 0000000..005c618
--- /dev/null
+++ b/tools/arch/arm/include/asm/barrier.h
@@ -0,0 +1,12 @@
+#ifndef _TOOLS_LINUX_ASM_ARM_BARRIER_H
+#define _TOOLS_LINUX_ASM_ARM_BARRIER_H
+
+/*
+ * Use the __kuser_memory_barrier helper in the CPU helper page. See
+ * arch/arm/kernel/entry-armv.S in the kernel source for details.
+ */
+#define mb()           ((void(*)(void))0xffff0fa0)()
+#define wmb()          ((void(*)(void))0xffff0fa0)()
+#define rmb()          ((void(*)(void))0xffff0fa0)()
+
+#endif /* _TOOLS_LINUX_ASM_ARM_BARRIER_H */
diff --git a/tools/arch/arm64/include/asm/barrier.h b/tools/arch/arm64/include/asm/barrier.h

new file mode 100644 (file)

index 0000000..a0483c8
--- /dev/null
+++ b/tools/arch/arm64/include/asm/barrier.h
@@ -0,0 +1,16 @@
+#ifndef _TOOLS_LINUX_ASM_AARCH64_BARRIER_H
+#define _TOOLS_LINUX_ASM_AARCH64_BARRIER_H
+
+/*
+ * From tools/perf/perf-sys.h, last modified in:
+ * f428ebd184c82a7914b2aa7e9f868918aaf7ea78 perf tools: Fix AAAAARGH64 memory barriers
+ *
+ * XXX: arch/arm64/include/asm/barrier.h in the kernel sources uses dsb; is this
+ * a case like for arm32 where we do things differently in userspace?
+ */
+
+#define mb()           asm volatile("dmb ish" ::: "memory")
+#define wmb()          asm volatile("dmb ishst" ::: "memory")
+#define rmb()          asm volatile("dmb ishld" ::: "memory")
+
+#endif /* _TOOLS_LINUX_ASM_AARCH64_BARRIER_H */
diff --git a/tools/arch/ia64/include/asm/barrier.h b/tools/arch/ia64/include/asm/barrier.h

new file mode 100644 (file)

index 0000000..e4422b4
--- /dev/null
+++ b/tools/arch/ia64/include/asm/barrier.h
@@ -0,0 +1,48 @@
+/*
+ * Copied from the kernel sources to tools/:
+ *
+ * Memory barrier definitions.  This is based on information published
+ * in the Processor Abstraction Layer and the System Abstraction Layer
+ * manual.
+ *
+ * Copyright (C) 1998-2003 Hewlett-Packard Co
+ *     David Mosberger-Tang <davidm@hpl.hp.com>
+ * Copyright (C) 1999 Asit Mallick <asit.k.mallick@intel.com>
+ * Copyright (C) 1999 Don Dugger <don.dugger@intel.com>
+ */
+#ifndef _TOOLS_LINUX_ASM_IA64_BARRIER_H
+#define _TOOLS_LINUX_ASM_IA64_BARRIER_H
+
+#include <linux/compiler.h>
+
+/*
+ * Macros to force memory ordering.  In these descriptions, "previous"
+ * and "subsequent" refer to program order; "visible" means that all
+ * architecturally visible effects of a memory access have occurred
+ * (at a minimum, this means the memory has been read or written).
+ *
+ *   wmb():    Guarantees that all preceding stores to memory-
+ *             like regions are visible before any subsequent
+ *             stores and that all following stores will be
+ *             visible only after all previous stores.
+ *   rmb():    Like wmb(), but for reads.
+ *   mb():     wmb()/rmb() combo, i.e., all previous memory
+ *             accesses are visible before all subsequent
+ *             accesses and vice versa.  This is also known as
+ *             a "fence."
+ *
+ * Note: "mb()" and its variants cannot be used as a fence to order
+ * accesses to memory mapped I/O registers.  For that, mf.a needs to
+ * be used.  However, we don't want to always use mf.a because (a)
+ * it's (presumably) much slower than mf and (b) mf.a is supported for
+ * sequential memory pages only.
+ */
+
+/* XXX From arch/ia64/include/uapi/asm/gcc_intrin.h */
+#define ia64_mf()       asm volatile ("mf" ::: "memory")
+
+#define mb()           ia64_mf()
+#define rmb()          mb()
+#define wmb()          mb()
+
+#endif /* _TOOLS_LINUX_ASM_IA64_BARRIER_H */
diff --git a/tools/arch/mips/include/asm/barrier.h b/tools/arch/mips/include/asm/barrier.h

new file mode 100644 (file)

index 0000000..80f96f7
--- /dev/null
+++ b/tools/arch/mips/include/asm/barrier.h
@@ -0,0 +1,20 @@
+#ifndef _TOOLS_LINUX_ASM_MIPS_BARRIER_H
+#define _TOOLS_LINUX_ASM_MIPS_BARRIER_H
+/*
+ * FIXME: This came from tools/perf/perf-sys.h, where it was first introduced
+ * in c1e028ef40b8d6943b767028ba17d4f2ba020edb, more work needed to make it
+ * more closely follow the Linux kernel arch/mips/include/asm/barrier.h file.
+ * Probably when we continue work on tools/ Kconfig support to have all the
+ * CONFIG_ needed for properly doing that.
+ */
+#define mb()           asm volatile(                                   \
+                               ".set   mips2\n\t"                      \
+                               "sync\n\t"                              \
+                               ".set   mips0"                          \
+                               : /* no output */                       \
+                               : /* no input */                        \
+                               : "memory")
+#define wmb()  mb()
+#define rmb()  mb()
+
+#endif /* _TOOLS_LINUX_ASM_MIPS_BARRIER_H */
diff --git a/tools/arch/powerpc/include/asm/barrier.h b/tools/arch/powerpc/include/asm/barrier.h

new file mode 100644 (file)

index 0000000..b23aee8
--- /dev/null
+++ b/tools/arch/powerpc/include/asm/barrier.h
@@ -0,0 +1,29 @@
+/*
+ * Copied from the kernel sources:
+ *
+ * Copyright (C) 1999 Cort Dougan <cort@cs.nmt.edu>
+ */
+#ifndef _TOOLS_LINUX_ASM_POWERPC_BARRIER_H
+#define _TOOLS_LINUX_ASM_POWERPC_BARRIER_H
+
+/*
+ * Memory barrier.
+ * The sync instruction guarantees that all memory accesses initiated
+ * by this processor have been performed (with respect to all other
+ * mechanisms that access memory).  The eieio instruction is a barrier
+ * providing an ordering (separately) for (a) cacheable stores and (b)
+ * loads and stores to non-cacheable memory (e.g. I/O devices).
+ *
+ * mb() prevents loads and stores being reordered across this point.
+ * rmb() prevents loads being reordered across this point.
+ * wmb() prevents stores being reordered across this point.
+ *
+ * *mb() variants without smp_ prefix must order all types of memory
+ * operations with one another. sync is the only instruction sufficient
+ * to do this.
+ */
+#define mb()   __asm__ __volatile__ ("sync" : : : "memory")
+#define rmb()  __asm__ __volatile__ ("sync" : : : "memory")
+#define wmb()  __asm__ __volatile__ ("sync" : : : "memory")
+
+#endif /* _TOOLS_LINUX_ASM_POWERPC_BARRIER_H */
diff --git a/tools/arch/s390/include/asm/barrier.h b/tools/arch/s390/include/asm/barrier.h

new file mode 100644 (file)

index 0000000..f851412
--- /dev/null
+++ b/tools/arch/s390/include/asm/barrier.h
@@ -0,0 +1,30 @@
+/*
+ * Copied from the kernel sources:
+ *
+ * Copyright IBM Corp. 1999, 2009
+ *
+ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
+ */
+
+#ifndef __TOOLS_LINUX_ASM_BARRIER_H
+#define __TOOLS_LINUX_ASM_BARRIER_H
+
+/*
+ * Force strict CPU ordering.
+ * And yes, this is required on UP too when we're talking
+ * to devices.
+ */
+
+#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
+/* Fast-BCR without checkpoint synchronization */
+#define __ASM_BARRIER "bcr 14,0\n"
+#else
+#define __ASM_BARRIER "bcr 15,0\n"
+#endif
+
+#define mb() do {  asm volatile(__ASM_BARRIER : : : "memory"); } while (0)
+
+#define rmb()                          mb()
+#define wmb()                          mb()
+
+#endif /* __TOOLS_LINUX_ASM_BARRIER_H */
diff --git a/tools/arch/sh/include/asm/barrier.h b/tools/arch/sh/include/asm/barrier.h

new file mode 100644 (file)

index 0000000..c18fd75
--- /dev/null
+++ b/tools/arch/sh/include/asm/barrier.h
@@ -0,0 +1,32 @@
+/*
+ * Copied from the kernel sources:
+ *
+ * Copyright (C) 1999, 2000  Niibe Yutaka  &  Kaz Kojima
+ * Copyright (C) 2002 Paul Mundt
+ */
+#ifndef __TOOLS_LINUX_ASM_SH_BARRIER_H
+#define __TOOLS_LINUX_ASM_SH_BARRIER_H
+
+/*
+ * A brief note on ctrl_barrier(), the control register write barrier.
+ *
+ * Legacy SH cores typically require a sequence of 8 nops after
+ * modification of a control register in order for the changes to take
+ * effect. On newer cores (like the sh4a and sh5) this is accomplished
+ * with icbi.
+ *
+ * Also note that on sh4a in the icbi case we can forego a synco for the
+ * write barrier, as it's not necessary for control registers.
+ *
+ * Historically we have only done this type of barrier for the MMUCR, but
+ * it's also necessary for the CCR, so we make it generic here instead.
+ */
+#if defined(__SH4A__) || defined(__SH5__)
+#define mb()           __asm__ __volatile__ ("synco": : :"memory")
+#define rmb()          mb()
+#define wmb()          mb()
+#endif
+
+#include <asm-generic/barrier.h>
+
+#endif /* __TOOLS_LINUX_ASM_SH_BARRIER_H */
diff --git a/tools/arch/sparc/include/asm/barrier.h b/tools/arch/sparc/include/asm/barrier.h

new file mode 100644 (file)

index 0000000..8c017b3
--- /dev/null
+++ b/tools/arch/sparc/include/asm/barrier.h
@@ -0,0 +1,8 @@
+#ifndef ___TOOLS_LINUX_ASM_SPARC_BARRIER_H
+#define ___TOOLS_LINUX_ASM_SPARC_BARRIER_H
+#if defined(__sparc__) && defined(__arch64__)
+#include "barrier_64.h"
+#else
+#include "barrier_32.h"
+#endif
+#endif
diff --git a/tools/arch/sparc/include/asm/barrier_32.h b/tools/arch/sparc/include/asm/barrier_32.h

new file mode 100644 (file)

index 0000000..c5eadd0
--- /dev/null
+++ b/tools/arch/sparc/include/asm/barrier_32.h
@@ -0,0 +1,6 @@
+#ifndef __TOOLS_PERF_SPARC_BARRIER_H
+#define __TOOLS_PERF_SPARC_BARRIER_H
+
+#include <asm-generic/barrier.h>
+
+#endif /* !(__TOOLS_PERF_SPARC_BARRIER_H) */
diff --git a/tools/arch/sparc/include/asm/barrier_64.h b/tools/arch/sparc/include/asm/barrier_64.h

new file mode 100644 (file)

index 0000000..9a7d732
--- /dev/null
+++ b/tools/arch/sparc/include/asm/barrier_64.h
@@ -0,0 +1,42 @@
+#ifndef __TOOLS_LINUX_SPARC64_BARRIER_H
+#define __TOOLS_LINUX_SPARC64_BARRIER_H
+
+/* Copied from the kernel sources to tools/:
+ *
+ * These are here in an effort to more fully work around Spitfire Errata
+ * #51.  Essentially, if a memory barrier occurs soon after a mispredicted
+ * branch, the chip can stop executing instructions until a trap occurs.
+ * Therefore, if interrupts are disabled, the chip can hang forever.
+ *
+ * It used to be believed that the memory barrier had to be right in the
+ * delay slot, but a case has been traced recently wherein the memory barrier
+ * was one instruction after the branch delay slot and the chip still hung.
+ * The offending sequence was the following in sym_wakeup_done() of the
+ * sym53c8xx_2 driver:
+ *
+ *     call    sym_ccb_from_dsa, 0
+ *      movge  %icc, 0, %l0
+ *     brz,pn  %o0, .LL1303
+ *      mov    %o0, %l2
+ *     membar  #LoadLoad
+ *
+ * The branch has to be mispredicted for the bug to occur.  Therefore, we put
+ * the memory barrier explicitly into a "branch always, predicted taken"
+ * delay slot to avoid the problem case.
+ */
+#define membar_safe(type) \
+do {   __asm__ __volatile__("ba,pt     %%xcc, 1f\n\t" \
+                            " membar   " type "\n" \
+                            "1:\n" \
+                            : : : "memory"); \
+} while (0)
+
+/* The kernel always executes in TSO memory model these days,
+ * and furthermore most sparc64 chips implement more stringent
+ * memory ordering than required by the specifications.
+ */
+#define mb()   membar_safe("#StoreLoad")
+#define rmb()  __asm__ __volatile__("":::"memory")
+#define wmb()  __asm__ __volatile__("":::"memory")
+
+#endif /* !(__TOOLS_LINUX_SPARC64_BARRIER_H) */
diff --git a/tools/arch/tile/include/asm/barrier.h b/tools/arch/tile/include/asm/barrier.h

new file mode 100644 (file)

index 0000000..7d3692c
--- /dev/null
+++ b/tools/arch/tile/include/asm/barrier.h
@@ -0,0 +1,15 @@
+#ifndef _TOOLS_LINUX_ASM_TILE_BARRIER_H
+#define _TOOLS_LINUX_ASM_TILE_BARRIER_H
+/*
+ * FIXME: This came from tools/perf/perf-sys.h, where it was first introduced
+ * in 620830b6954913647b7c7f68920cf48eddf6ad92, more work needed to make it
+ * more closely follow the Linux kernel arch/tile/include/asm/barrier.h file.
+ * Probably when we continue work on tools/ Kconfig support to have all the
+ * CONFIG_ needed for properly doing that.
+ */
+
+#define mb()           asm volatile ("mf" ::: "memory")
+#define wmb()          mb()
+#define rmb()          mb()
+
+#endif /* _TOOLS_LINUX_ASM_TILE_BARRIER_H */
diff --git a/tools/arch/x86/include/asm/atomic.h b/tools/arch/x86/include/asm/atomic.h

new file mode 100644 (file)

index 0000000..059e33e
--- /dev/null
+++ b/tools/arch/x86/include/asm/atomic.h
@@ -0,0 +1,65 @@
+#ifndef _TOOLS_LINUX_ASM_X86_ATOMIC_H
+#define _TOOLS_LINUX_ASM_X86_ATOMIC_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include "rmwcc.h"
+
+#define LOCK_PREFIX "\n\tlock; "
+
+/*
+ * Atomic operations that C can't guarantee us.  Useful for
+ * resource counting etc..
+ */
+
+#define ATOMIC_INIT(i) { (i) }
+
+/**
+ * atomic_read - read atomic variable
+ * @v: pointer of type atomic_t
+ *
+ * Atomically reads the value of @v.
+ */
+static inline int atomic_read(const atomic_t *v)
+{
+       return ACCESS_ONCE((v)->counter);
+}
+
+/**
+ * atomic_set - set atomic variable
+ * @v: pointer of type atomic_t
+ * @i: required value
+ *
+ * Atomically sets the value of @v to @i.
+ */
+static inline void atomic_set(atomic_t *v, int i)
+{
+       v->counter = i;
+}
+
+/**
+ * atomic_inc - increment atomic variable
+ * @v: pointer of type atomic_t
+ *
+ * Atomically increments @v by 1.
+ */
+static inline void atomic_inc(atomic_t *v)
+{
+       asm volatile(LOCK_PREFIX "incl %0"
+                    : "+m" (v->counter));
+}
+
+/**
+ * atomic_dec_and_test - decrement and test
+ * @v: pointer of type atomic_t
+ *
+ * Atomically decrements @v by 1 and
+ * returns true if the result is 0, or false for all other
+ * cases.
+ */
+static inline int atomic_dec_and_test(atomic_t *v)
+{
+       GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", "e");
+}
+
+#endif /* _TOOLS_LINUX_ASM_X86_ATOMIC_H */
diff --git a/tools/arch/x86/include/asm/barrier.h b/tools/arch/x86/include/asm/barrier.h

new file mode 100644 (file)

index 0000000..f366d8e
--- /dev/null
+++ b/tools/arch/x86/include/asm/barrier.h
@@ -0,0 +1,28 @@
+#ifndef _TOOLS_LINUX_ASM_X86_BARRIER_H
+#define _TOOLS_LINUX_ASM_X86_BARRIER_H
+
+/*
+ * Copied from the Linux kernel sources, and also moved here
+ * from tools/perf/perf-sys.h so that it lives in a place
+ * similar to where it is in the kernel sources.
+ *
+ * Force strict CPU ordering.
+ * And yes, this is required on UP too when we're talking
+ * to devices.
+ */
+
+#if defined(__i386__)
+/*
+ * Some non-Intel clones support out of order store. wmb() ceases to be a
+ * nop for these.
+ */
+#define mb()   asm volatile("lock; addl $0,0(%%esp)" ::: "memory")
+#define rmb()  asm volatile("lock; addl $0,0(%%esp)" ::: "memory")
+#define wmb()  asm volatile("lock; addl $0,0(%%esp)" ::: "memory")
+#elif defined(__x86_64__)
+#define mb()   asm volatile("mfence":::"memory")
+#define rmb()  asm volatile("lfence":::"memory")
+#define wmb()  asm volatile("sfence" ::: "memory")
+#endif
+
+#endif /* _TOOLS_LINUX_ASM_X86_BARRIER_H */
diff --git a/tools/arch/x86/include/asm/rmwcc.h b/tools/arch/x86/include/asm/rmwcc.h

new file mode 100644 (file)

index 0000000..a6669bc
--- /dev/null
+++ b/tools/arch/x86/include/asm/rmwcc.h
@@ -0,0 +1,41 @@
+#ifndef _TOOLS_LINUX_ASM_X86_RMWcc
+#define _TOOLS_LINUX_ASM_X86_RMWcc
+
+#ifdef CC_HAVE_ASM_GOTO
+
+#define __GEN_RMWcc(fullop, var, cc, ...)                              \
+do {                                                                   \
+       asm_volatile_goto (fullop "; j" cc " %l[cc_label]"              \
+                       : : "m" (var), ## __VA_ARGS__                   \
+                       : "memory" : cc_label);                         \
+       return 0;                                                       \
+cc_label:                                                              \
+       return 1;                                                       \
+} while (0)
+
+#define GEN_UNARY_RMWcc(op, var, arg0, cc)                             \
+       __GEN_RMWcc(op " " arg0, var, cc)
+
+#define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc)                 \
+       __GEN_RMWcc(op " %1, " arg0, var, cc, vcon (val))
+
+#else /* !CC_HAVE_ASM_GOTO */
+
+#define __GEN_RMWcc(fullop, var, cc, ...)                              \
+do {                                                                   \
+       char c;                                                         \
+       asm volatile (fullop "; set" cc " %1"                           \
+                       : "+m" (var), "=qm" (c)                         \
+                       : __VA_ARGS__ : "memory");                      \
+       return c != 0;                                                  \
+} while (0)
+
+#define GEN_UNARY_RMWcc(op, var, arg0, cc)                             \
+       __GEN_RMWcc(op " " arg0, var, cc)
+
+#define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc)                 \
+       __GEN_RMWcc(op " %2, " arg0, var, cc, vcon (val))
+
+#endif /* CC_HAVE_ASM_GOTO */
+
+#endif /* _TOOLS_LINUX_ASM_X86_RMWcc */
diff --git a/tools/arch/xtensa/include/asm/barrier.h b/tools/arch/xtensa/include/asm/barrier.h

new file mode 100644 (file)

index 0000000..583800b
--- /dev/null
+++ b/tools/arch/xtensa/include/asm/barrier.h
@@ -0,0 +1,18 @@
+/*
+ * Copied from the kernel sources to tools/:
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 2001 - 2012 Tensilica Inc.
+ */
+
+#ifndef _TOOLS_LINUX_XTENSA_SYSTEM_H
+#define _TOOLS_LINUX_XTENSA_SYSTEM_H
+
+#define mb()  ({ __asm__ __volatile__("memw" : : : "memory"); })
+#define rmb() barrier()
+#define wmb() mb()
+
+#endif /* _TOOLS_LINUX_XTENSA_SYSTEM_H */
diff --git a/tools/build/Makefile.build b/tools/build/Makefile.build

index 10df57237a66d26913a4baabd5949f3459f0a534..a51244a8022f91c26591b9553d16e0020c9f9d12 100644 (file)
--- a/tools/build/Makefile.build
+++ b/tools/build/Makefile.build
@@ -37,7 +37,7 @@ subdir-obj-y :=
  
  # Build definitions
  build-file := $(dir)/Build
-include $(build-file)
+-include $(build-file)
  
  quiet_cmd_flex  = FLEX     $@
  quiet_cmd_bison = BISON    $@
@@ -94,12 +94,12 @@ obj-y        := $(patsubst %/, %/$(obj)-in.o, $(obj-y))
  subdir-obj-y := $(filter %/$(obj)-in.o, $(obj-y))
  
  # '$(OUTPUT)/dir' prefix to all objects
-prefix       := $(subst ./,,$(OUTPUT)$(dir)/)
-obj-y        := $(addprefix $(prefix),$(obj-y))
-subdir-obj-y := $(addprefix $(prefix),$(subdir-obj-y))
+objprefix    := $(subst ./,,$(OUTPUT)$(dir)/)
+obj-y        := $(addprefix $(objprefix),$(obj-y))
+subdir-obj-y := $(addprefix $(objprefix),$(subdir-obj-y))
  
  # Final '$(obj)-in.o' object
-in-target := $(prefix)$(obj)-in.o
+in-target := $(objprefix)$(obj)-in.o
  
  PHONY += $(subdir-y)
  
diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature

index 3a0b0ca2a28c1b0bcc6a568d6c7d3ee6ced8c627..2975632d51e2341e7e1a60286e0fa822cbec0279 100644 (file)
--- a/tools/build/Makefile.feature
+++ b/tools/build/Makefile.feature
@@ -27,7 +27,7 @@ endef
  #   the rule that uses them - an example for that is the 'bionic'
  #   feature check. ]
  #
-FEATURE_TESTS =                        \
+FEATURE_TESTS ?=                       \
         backtrace                       \
         dwarf                           \
         fortify-source                  \
@@ -53,7 +53,7 @@ FEATURE_TESTS =                       \
         zlib                            \
         lzma
  
-FEATURE_DISPLAY =                      \
+FEATURE_DISPLAY ?=                     \
         dwarf                           \
         glibc                           \
         gtk2                            \
diff --git a/tools/build/tests/ex/Build b/tools/build/tests/ex/Build

index 0e6c3e6767e6c553c34bff9ad6db7b440429d515..70d876237c5709f36a985c4f8c52a56a063167f7 100644 (file)
--- a/tools/build/tests/ex/Build
+++ b/tools/build/tests/ex/Build
@@ -2,6 +2,7 @@ ex-y += ex.o
  ex-y += a.o
  ex-y += b.o
  ex-y += empty/
+ex-y += empty2/
  
  libex-y += c.o
  libex-y += d.o
diff --git a/tools/build/tests/ex/empty2/README b/tools/build/tests/ex/empty2/README

new file mode 100644 (file)

index 0000000..2107cc5
--- /dev/null
+++ b/tools/build/tests/ex/empty2/README
@@ -0,0 +1,2 @@
+This directory is intentionally left without a Build file
+to test proper nesting into Build-less directories.
diff --git a/tools/include/asm-generic/atomic-gcc.h b/tools/include/asm-generic/atomic-gcc.h

new file mode 100644 (file)

index 0000000..2ba78c9
--- /dev/null
+++ b/tools/include/asm-generic/atomic-gcc.h
@@ -0,0 +1,63 @@
+#ifndef __TOOLS_ASM_GENERIC_ATOMIC_H
+#define __TOOLS_ASM_GENERIC_ATOMIC_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+/*
+ * Atomic operations that C can't guarantee us.  Useful for
+ * resource counting etc.
+ *
+ * Excerpts obtained from the Linux kernel sources.
+ */
+
+#define ATOMIC_INIT(i) { (i) }
+
+/**
+ * atomic_read - read atomic variable
+ * @v: pointer of type atomic_t
+ *
+ * Atomically reads the value of @v.
+ */
+static inline int atomic_read(const atomic_t *v)
+{
+       return ACCESS_ONCE((v)->counter);
+}
+
+/**
+ * atomic_set - set atomic variable
+ * @v: pointer of type atomic_t
+ * @i: required value
+ *
+ * Atomically sets the value of @v to @i.
+ */
+static inline void atomic_set(atomic_t *v, int i)
+{
+        v->counter = i;
+}
+
+/**
+ * atomic_inc - increment atomic variable
+ * @v: pointer of type atomic_t
+ *
+ * Atomically increments @v by 1.
+ */
+static inline void atomic_inc(atomic_t *v)
+{
+       __sync_add_and_fetch(&v->counter, 1);
+}
+
+/**
+ * atomic_dec_and_test - decrement and test
+ * @v: pointer of type atomic_t
+ *
+ * Atomically decrements @v by 1 and
+ * returns true if the result is 0, or false for all other
+ * cases.
+ */
+static inline int atomic_dec_and_test(atomic_t *v)
+{
+       return __sync_sub_and_fetch(&v->counter, 1) == 0;
+}
+
+#endif /* __TOOLS_ASM_GENERIC_ATOMIC_H */
diff --git a/tools/include/asm-generic/barrier.h b/tools/include/asm-generic/barrier.h

new file mode 100644 (file)

index 0000000..47b9339
--- /dev/null
+++ b/tools/include/asm-generic/barrier.h
@@ -0,0 +1,44 @@
+/*
+ * Copied from the kernel sources to tools/:
+ *
+ * Generic barrier definitions, originally based on MN10300 definitions.
+ *
+ * It should be possible to use these on really simple architectures,
+ * but it serves more as a starting point for new ports.
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+#ifndef __TOOLS_LINUX_ASM_GENERIC_BARRIER_H
+#define __TOOLS_LINUX_ASM_GENERIC_BARRIER_H
+
+#ifndef __ASSEMBLY__
+
+#include <linux/compiler.h>
+
+/*
+ * Force strict CPU ordering. And yes, this is required on UP too when we're
+ * talking to devices.
+ *
+ * Fall back to compiler barriers if nothing better is provided.
+ */
+
+#ifndef mb
+#define mb()   barrier()
+#endif
+
+#ifndef rmb
+#define rmb()  mb()
+#endif
+
+#ifndef wmb
+#define wmb()  mb()
+#endif
+
+#endif /* !__ASSEMBLY__ */
+#endif /* __TOOLS_LINUX_ASM_GENERIC_BARRIER_H */
diff --git a/tools/include/asm/atomic.h b/tools/include/asm/atomic.h

new file mode 100644 (file)

index 0000000..70794f5
--- /dev/null
+++ b/tools/include/asm/atomic.h
@@ -0,0 +1,10 @@
+#ifndef __TOOLS_LINUX_ASM_ATOMIC_H
+#define __TOOLS_LINUX_ASM_ATOMIC_H
+
+#if defined(__i386__) || defined(__x86_64__)
+#include "../../arch/x86/include/asm/atomic.h"
+#else
+#include <asm-generic/atomic-gcc.h>
+#endif
+
+#endif /* __TOOLS_LINUX_ASM_ATOMIC_H */
diff --git a/tools/include/asm/barrier.h b/tools/include/asm/barrier.h

new file mode 100644 (file)

index 0000000..ac66ac5
--- /dev/null
+++ b/tools/include/asm/barrier.h
@@ -0,0 +1,27 @@
+#if defined(__i386__) || defined(__x86_64__)
+#include "../../arch/x86/include/asm/barrier.h"
+#elif defined(__arm__)
+#include "../../arch/arm/include/asm/barrier.h"
+#elif defined(__aarch64__)
+#include "../../arch/arm64/include/asm/barrier.h"
+#elif defined(__powerpc__)
+#include "../../arch/powerpc/include/asm/barrier.h"
+#elif defined(__s390__)
+#include "../../arch/s390/include/asm/barrier.h"
+#elif defined(__sh__)
+#include "../../arch/sh/include/asm/barrier.h"
+#elif defined(__sparc__)
+#include "../../arch/sparc/include/asm/barrier.h"
+#elif defined(__tile__)
+#include "../../arch/tile/include/asm/barrier.h"
+#elif defined(__alpha__)
+#include "../../arch/alpha/include/asm/barrier.h"
+#elif defined(__mips__)
+#include "../../arch/mips/include/asm/barrier.h"
+#elif defined(__ia64__)
+#include "../../arch/ia64/include/asm/barrier.h"
+#elif defined(__xtensa__)
+#include "../../arch/xtensa/include/asm/barrier.h"
+#else
+#include <asm-generic/barrier.h>
+#endif
diff --git a/tools/include/linux/atomic.h b/tools/include/linux/atomic.h

new file mode 100644 (file)

index 0000000..4e3d3d1
--- /dev/null
+++ b/tools/include/linux/atomic.h
@@ -0,0 +1,6 @@
+#ifndef __TOOLS_LINUX_ATOMIC_H
+#define __TOOLS_LINUX_ATOMIC_H
+
+#include <asm/atomic.h>
+
+#endif /* __TOOLS_LINUX_ATOMIC_H */
diff --git a/tools/include/linux/compiler.h b/tools/include/linux/compiler.h

index 88461f09cc860b0d17ab87c3d4a629ee44214af7..f0e72674c52d2c9b88b46cb281db39fe8fb68d8d 100644 (file)
--- a/tools/include/linux/compiler.h
+++ b/tools/include/linux/compiler.h
@@ -1,6 +1,10 @@
  #ifndef _TOOLS_LINUX_COMPILER_H_
  #define _TOOLS_LINUX_COMPILER_H_
  
+/* Optimization barrier */
+/* The "volatile" is due to gcc bugs */
+#define barrier() __asm__ __volatile__("": : :"memory")
+
  #ifndef __always_inline
  # define __always_inline       inline __attribute__((always_inline))
  #endif
diff --git a/tools/include/linux/kernel.h b/tools/include/linux/kernel.h

new file mode 100644 (file)

index 0000000..76df535
--- /dev/null
+++ b/tools/include/linux/kernel.h
@@ -0,0 +1,107 @@
+#ifndef __TOOLS_LINUX_KERNEL_H
+#define __TOOLS_LINUX_KERNEL_H
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
+
+#define PERF_ALIGN(x, a)       __PERF_ALIGN_MASK(x, (typeof(x))(a)-1)
+#define __PERF_ALIGN_MASK(x, mask)     (((x)+(mask))&~(mask))
+
+#ifndef offsetof
+#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
+#endif
+
+#ifndef container_of
+/**
+ * container_of - cast a member of a structure out to the containing structure
+ * @ptr:       the pointer to the member.
+ * @type:      the type of the container struct this is embedded in.
+ * @member:    the name of the member within the struct.
+ *
+ */
+#define container_of(ptr, type, member) ({                     \
+       const typeof(((type *)0)->member) * __mptr = (ptr);     \
+       (type *)((char *)__mptr - offsetof(type, member)); })
+#endif
+
+#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); }))
+
+#ifndef max
+#define max(x, y) ({                           \
+       typeof(x) _max1 = (x);                  \
+       typeof(y) _max2 = (y);                  \
+       (void) (&_max1 == &_max2);              \
+       _max1 > _max2 ? _max1 : _max2; })
+#endif
+
+#ifndef min
+#define min(x, y) ({                           \
+       typeof(x) _min1 = (x);                  \
+       typeof(y) _min2 = (y);                  \
+       (void) (&_min1 == &_min2);              \
+       _min1 < _min2 ? _min1 : _min2; })
+#endif
+
+#ifndef roundup
+#define roundup(x, y) (                                \
+{                                                      \
+       const typeof(y) __y = y;                       \
+       (((x) + (__y - 1)) / __y) * __y;               \
+}                                                      \
+)
+#endif
+
+#ifndef BUG_ON
+#ifdef NDEBUG
+#define BUG_ON(cond) do { if (cond) {} } while (0)
+#else
+#define BUG_ON(cond) assert(!(cond))
+#endif
+#endif
+
+/*
+ * Both need more care to handle endianness
+ * (Don't use bitmap_copy_le() for now)
+ */
+#define cpu_to_le64(x) (x)
+#define cpu_to_le32(x) (x)
+
+static inline int
+vscnprintf(char *buf, size_t size, const char *fmt, va_list args)
+{
+       int i;
+       ssize_t ssize = size;
+
+       i = vsnprintf(buf, size, fmt, args);
+
+       return (i >= ssize) ? (ssize - 1) : i;
+}
+
+static inline int scnprintf(char * buf, size_t size, const char * fmt, ...)
+{
+       va_list args;
+       ssize_t ssize = size;
+       int i;
+
+       va_start(args, fmt);
+       i = vsnprintf(buf, size, fmt, args);
+       va_end(args);
+
+       return (i >= ssize) ? (ssize - 1) : i;
+}
+
+/*
+ * This looks more complex than it should be. But we need to
+ * get the type for the ~ right in round_down (it needs to be
+ * as wide as the result!), and we want to evaluate the macro
+ * arguments just once each.
+ */
+#define __round_mask(x, y) ((__typeof__(x))((y)-1))
+#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
+#define round_down(x, y) ((x) & ~__round_mask(x, y))
+
+#endif
diff --git a/tools/include/linux/list.h b/tools/include/linux/list.h

new file mode 100644 (file)

index 0000000..76b014c
--- /dev/null
+++ b/tools/include/linux/list.h
@@ -0,0 +1,29 @@
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+#include "../../../include/linux/list.h"
+
+#ifndef TOOLS_LIST_H
+#define TOOLS_LIST_H
+/**
+ * list_del_range - deletes range of entries from list.
+ * @begin: first element in the range to delete from the list.
+ * @end: last element in the range to delete from the list.
+ * Note: list_empty on the range of entries does not return true after this,
+ * the entries are in an undefined state.
+ */
+static inline void list_del_range(struct list_head *begin,
+                                 struct list_head *end)
+{
+       begin->prev->next = end->next;
+       end->next->prev = begin->prev;
+}
+
+/**
+ * list_for_each_from  -       iterate over a list from one of its nodes
+ * @pos:  the &struct list_head to use as a loop cursor, from where to start
+ * @head: the head for your list.
+ */
+#define list_for_each_from(pos, head) \
+       for (; pos != (head); pos = pos->next)
+#endif
diff --git a/tools/include/linux/poison.h b/tools/include/linux/poison.h

new file mode 100644 (file)

index 0000000..0c27bdf
--- /dev/null
+++ b/tools/include/linux/poison.h
@@ -0,0 +1 @@
+#include "../../../include/linux/poison.h"
diff --git a/tools/include/linux/types.h b/tools/include/linux/types.h

index b5cf25e05df2ecf945097b09199fbd28148a1b08..8ebf6278b2ef23d7cd45daf7b5f6f14bde675aa2 100644 (file)
--- a/tools/include/linux/types.h
+++ b/tools/include/linux/types.h
@@ -60,6 +60,14 @@ typedef __u32 __bitwise __be32;
  typedef __u64 __bitwise __le64;
  typedef __u64 __bitwise __be64;
  
+typedef struct {
+       int counter;
+} atomic_t;
+
+#ifndef __aligned_u64
+# define __aligned_u64 __u64 __attribute__((aligned(8)))
+#endif
+
  struct list_head {
         struct list_head *next, *prev;
  };
diff --git a/tools/lib/traceevent/.gitignore b/tools/lib/traceevent/.gitignore

index 35f56be5a4cdb9805ecb8e0b8190346ff804baf5..3c60335fe7be7c5bb35a8d5a64f59795a888e768 100644 (file)
--- a/tools/lib/traceevent/.gitignore
+++ b/tools/lib/traceevent/.gitignore
@@ -1 +1,2 @@
  TRACEEVENT-CFLAGS
+libtraceevent-dynamic-list
diff --git a/tools/lib/traceevent/Makefile b/tools/lib/traceevent/Makefile

index d410da335e3daeaeac8da1dbdfdabdc7e23fc38f..6daaff652affdde16240c91330781864aaedfd35 100644 (file)
--- a/tools/lib/traceevent/Makefile
+++ b/tools/lib/traceevent/Makefile
@@ -23,6 +23,7 @@ endef
  # Allow setting CC and AR, or setting CROSS_COMPILE as a prefix.
  $(call allow-override,CC,$(CROSS_COMPILE)gcc)
  $(call allow-override,AR,$(CROSS_COMPILE)ar)
+$(call allow-override,NM,$(CROSS_COMPILE)nm)
  
  EXT = -std=gnu99
  INSTALL = install
@@ -34,9 +35,15 @@ INSTALL = install
  DESTDIR ?=
  DESTDIR_SQ = '$(subst ','\'',$(DESTDIR))'
  
+LP64 := $(shell echo __LP64__ | ${CC} ${CFLAGS} -E -x c - | tail -n 1)
+ifeq ($(LP64), 1)
+  libdir_relative = lib64
+else
+  libdir_relative = lib
+endif
+
  prefix ?= /usr/local
-bindir_relative = bin
-bindir = $(prefix)/$(bindir_relative)
+libdir = $(prefix)/$(libdir_relative)
  man_dir = $(prefix)/share/man
  man_dir_SQ = '$(subst ','\'',$(man_dir))'
  
@@ -58,7 +65,7 @@ ifeq ($(prefix),$(HOME))
  override plugin_dir = $(HOME)/.traceevent/plugins
  set_plugin_dir := 0
  else
-override plugin_dir = $(prefix)/lib/traceevent/plugins
+override plugin_dir = $(libdir)/traceevent/plugins
  endif
  endif
  
@@ -85,11 +92,11 @@ srctree := $(patsubst %/,%,$(dir $(srctree)))
  #$(info Determined 'srctree' to be $(srctree))
  endif
  
-export prefix bindir src obj
+export prefix libdir src obj
  
  # Shell quotes
-bindir_SQ = $(subst ','\'',$(bindir))
-bindir_relative_SQ = $(subst ','\'',$(bindir_relative))
+libdir_SQ = $(subst ','\'',$(libdir))
+libdir_relative_SQ = $(subst ','\'',$(libdir_relative))
  plugin_dir_SQ = $(subst ','\'',$(plugin_dir))
  
  LIB_FILE = libtraceevent.a libtraceevent.so
@@ -151,8 +158,9 @@ PLUGINS_IN := $(PLUGINS:.so=-in.o)
  
  TE_IN    := $(OUTPUT)libtraceevent-in.o
  LIB_FILE := $(addprefix $(OUTPUT),$(LIB_FILE))
+DYNAMIC_LIST_FILE := $(OUTPUT)libtraceevent-dynamic-list
  
-CMD_TARGETS = $(LIB_FILE) $(PLUGINS)
+CMD_TARGETS = $(LIB_FILE) $(PLUGINS) $(DYNAMIC_LIST_FILE)
  
  TARGETS = $(CMD_TARGETS)
  
@@ -169,6 +177,9 @@ $(OUTPUT)libtraceevent.so: $(TE_IN)
  $(OUTPUT)libtraceevent.a: $(TE_IN)
         $(QUIET_LINK)$(RM) $@; $(AR) rcs $@ $^
  
+$(OUTPUT)libtraceevent-dynamic-list: $(PLUGINS)
+       $(QUIET_GEN)$(call do_generate_dynamic_list_file, $(PLUGINS), $@)
+
  plugins: $(PLUGINS)
  
  __plugin_obj = $(notdir $@)
@@ -238,9 +249,16 @@ define do_install_plugins
         done
  endef
  
+define do_generate_dynamic_list_file
+       (echo '{';                                                      \
+       $(NM) -u -D $1 | awk 'NF>1 {print "\t"$$2";"}' | sort -u;       \
+       echo '};';                                                      \
+       ) > $2
+endef
+
  install_lib: all_cmd install_plugins
         $(call QUIET_INSTALL, $(LIB_FILE)) \
-               $(call do_install,$(LIB_FILE),$(bindir_SQ))
+               $(call do_install,$(LIB_FILE),$(libdir_SQ))
  
  install_plugins: $(PLUGINS)
         $(call QUIET_INSTALL, trace_plugins) \
diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c

index 29f94f6f0d9e9e2510d38471724e686a9a21167c..cc25f059ab3dfcc5368804597120f96d20dabfdf 100644 (file)
--- a/tools/lib/traceevent/event-parse.c
+++ b/tools/lib/traceevent/event-parse.c
@@ -1387,7 +1387,7 @@ static int event_read_fields(struct event_format *event, struct format_field **f
                         do_warning_event(event, "%s: no type found", __func__);
                         goto fail;
                 }
-               field->name = last_token;
+               field->name = field->alias = last_token;
  
                 if (test_type(type, EVENT_OP))
                         goto fail;
@@ -1469,7 +1469,7 @@ static int event_read_fields(struct event_format *event, struct format_field **f
                                 size_dynamic = type_size(field->name);
                                 free_token(field->name);
                                 strcat(field->type, brackets);
-                               field->name = token;
+                               field->name = field->alias = token;
                                 type = read_token(&token);
                         } else {
                                 char *new_type;
@@ -6444,6 +6444,8 @@ void pevent_ref(struct pevent *pevent)
  void pevent_free_format_field(struct format_field *field)
  {
         free(field->type);
+       if (field->alias != field->name)
+               free(field->alias);
         free(field->name);
         free(field);
  }
diff --git a/tools/lib/traceevent/event-parse.h b/tools/lib/traceevent/event-parse.h

index 86a5839fb048e87d2f76982ab51231ef4646dfc5..063b1971eb35288ae1a8d72cab18a5df815126a4 100644 (file)
--- a/tools/lib/traceevent/event-parse.h
+++ b/tools/lib/traceevent/event-parse.h
@@ -191,6 +191,7 @@ struct format_field {
         struct event_format     *event;
         char                    *type;
         char                    *name;
+       char                    *alias;
         int                     offset;
         int                     size;
         unsigned int            arraylen;
diff --git a/tools/lib/traceevent/plugin_cfg80211.c b/tools/lib/traceevent/plugin_cfg80211.c

index 4592d84383188e70d1c59619a7f3b642d49f861e..ec57d0c1fbc2b679a82baf35219ea6cfa523354d 100644 (file)
--- a/tools/lib/traceevent/plugin_cfg80211.c
+++ b/tools/lib/traceevent/plugin_cfg80211.c
@@ -4,6 +4,19 @@
  #include <endian.h>
  #include "event-parse.h"
  
+/*
+ * From glibc endian.h, for older systems where it is not present, e.g.: RHEL5,
+ * Fedora6.
+ */
+#ifndef le16toh
+# if __BYTE_ORDER == __LITTLE_ENDIAN
+#  define le16toh(x) (x)
+# else
+#  define le16toh(x) __bswap_16 (x)
+# endif
+#endif
+
+
  static unsigned long long
  process___le16_to_cpup(struct trace_seq *s, unsigned long long *args)
  {
diff --git a/tools/perf/.gitignore b/tools/perf/.gitignore

index 812f904193e8869bf0bd179d5dd36e42d51c35c8..09db62ba5786673d2029d8c6b6f96122daa878a1 100644 (file)
--- a/tools/perf/.gitignore
+++ b/tools/perf/.gitignore
@@ -28,3 +28,4 @@ config.mak.autogen
  *-flex.*
  *.pyc
  *.pyo
+.config-detected
diff --git a/tools/perf/Documentation/callchain-overhead-calculation.txt b/tools/perf/Documentation/callchain-overhead-calculation.txt

new file mode 100644 (file)

index 0000000..1a75792
--- /dev/null
+++ b/tools/perf/Documentation/callchain-overhead-calculation.txt
@@ -0,0 +1,108 @@
+Overhead calculation
+--------------------
+The overhead can be shown in two columns as 'Children' and 'Self' when
+perf collects callchains.  The 'self' overhead is simply calculated by
+adding all period values of the entry - usually a function (symbol).
+This is the value that perf shows traditionally and sum of all the
+'self' overhead values should be 100%.
+
+The 'children' overhead is calculated by adding all period values of
+the child functions so that it can show the total overhead of the
+higher level functions even if they don't directly execute much.
+'Children' here means functions that are called from another (parent)
+function.
+
+It might be confusing that the sum of all the 'children' overhead
+values exceeds 100% since each of them is already an accumulation of
+'self' overhead of its child functions.  But with this enabled, users
+can find which function has the most overhead even if samples are
+spread over the children.
+
+Consider the following example; there are three functions like below.
+
+-----------------------
+void foo(void) {
+    /* do something */
+}
+
+void bar(void) {
+    /* do something */
+    foo();
+}
+
+int main(void) {
+    bar();
+    return 0;
+}
+-----------------------
+
+In this case 'foo' is a child of 'bar', and 'bar' is an immediate
+child of 'main' so 'foo' also is a child of 'main'.  In other words,
+'main' is a parent of 'foo' and 'bar', and 'bar' is a parent of 'foo'.
+
+Suppose all samples are recorded in 'foo' and 'bar' only.  When it's
+recorded with callchains the output will show something like below
+in the usual (self-overhead-only) output of perf report:
+
+----------------------------------
+Overhead  Symbol
+........  .....................
+  60.00%  foo
+          |
+          --- foo
+              bar
+              main
+              __libc_start_main
+
+  40.00%  bar
+          |
+          --- bar
+              main
+              __libc_start_main
+----------------------------------
+
+When the --children option is enabled, the 'self' overhead values of
+child functions (i.e. 'foo' and 'bar') are added to the parents to
+calculate the 'children' overhead.  In this case the report could be
+displayed as:
+
+-------------------------------------------
+Children      Self  Symbol
+........  ........  ....................
+ 100.00%     0.00%  __libc_start_main
+          |
+          --- __libc_start_main
+
+ 100.00%     0.00%  main
+          |
+          --- main
+              __libc_start_main
+
+ 100.00%    40.00%  bar
+          |
+          --- bar
+              main
+              __libc_start_main
+
+  60.00%    60.00%  foo
+          |
+          --- foo
+              bar
+              main
+              __libc_start_main
+-------------------------------------------
+
+In the above output, the 'self' overhead of 'foo' (60%) was added to the
+'children' overhead of 'bar', 'main' and '\_\_libc_start_main'.
+Likewise, the 'self' overhead of 'bar' (40%) was added to the
+'children' overhead of 'main' and '\_\_libc_start_main'.
+
+So '\_\_libc_start_main' and 'main' are shown first since they have
+the same (100%) 'children' overhead (even though they have zero 'self'
+overhead) and they are the parents of 'foo' and 'bar'.
+
+Since v3.16 the 'children' overhead is shown by default and the output
+is sorted by its values. The 'children' overhead is disabled by
+specifying --no-children option on the command line or by adding
+'report.children = false' or 'top.children = false' in the perf config
+file.
diff --git a/tools/perf/Documentation/perf-bench.txt b/tools/perf/Documentation/perf-bench.txt

index f6480cbf309b40fee97e86112033c1a0cf04236a..bf3d0644bf1066677ae6ced215f82ecbc55f409d 100644 (file)
--- a/tools/perf/Documentation/perf-bench.txt
+++ b/tools/perf/Documentation/perf-bench.txt
@@ -210,6 +210,9 @@ Suite for evaluating hash tables.
  *wake*::
  Suite for evaluating wake calls.
  
+*wake-parallel*::
+Suite for evaluating parallel wake calls.
+
  *requeue*::
  Suite for evaluating requeue calls.
  
diff --git a/tools/perf/Documentation/perf-inject.txt b/tools/perf/Documentation/perf-inject.txt

index dc7442cf3d7f80920b8ba36f00fb8c0da972534b..b876ae312699b9388f6565eec18e239305309f31 100644 (file)
--- a/tools/perf/Documentation/perf-inject.txt
+++ b/tools/perf/Documentation/perf-inject.txt
@@ -44,6 +44,33 @@ OPTIONS
  --kallsyms=<file>::
         kallsyms pathname
  
+--itrace::
+       Decode Instruction Tracing data, replacing it with synthesized events.
+       Options are:
+
+               i       synthesize instructions events
+               b       synthesize branches events
+               c       synthesize branches events (calls only)
+               r       synthesize branches events (returns only)
+               x       synthesize transactions events
+               e       synthesize error events
+               d       create a debug log
+               g       synthesize a call chain (use with i or x)
+
+       The default is all events i.e. the same as --itrace=ibxe
+
+       In addition, the period (default 100000) for instructions events
+       can be specified in units of:
+
+               i       instructions
+               t       ticks
+               ms      milliseconds
+               us      microseconds
+               ns      nanoseconds (default)
+
+       Also the call chain size (default 16, max. 1024) for instructions or
+       transactions events can be specified.
+
  SEE ALSO
  --------
  linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-archive[1]
diff --git a/tools/perf/Documentation/perf-kmem.txt b/tools/perf/Documentation/perf-kmem.txt

index 23219c65c16f77892a30b9b85f38609f1b1dc1f5..ff0f433b3fce1ee402a375b62703d7034c468e34 100644 (file)
--- a/tools/perf/Documentation/perf-kmem.txt
+++ b/tools/perf/Documentation/perf-kmem.txt
@@ -37,7 +37,11 @@ OPTIONS
  
  -s <key[,key2...]>::
  --sort=<key[,key2...]>::
-       Sort the output (default: frag,hit,bytes)
+       Sort the output (default: 'frag,hit,bytes' for slab and 'bytes,hit'
+       for page).  Available sort keys are 'ptr, callsite, bytes, hit,
+       pingpong, frag' for slab and 'page, callsite, bytes, hit, order,
+       migtype, gfp' for page.  This option should be preceded by one of the
+       mode selection options - i.e. --slab, --page, --alloc and/or --caller.
  
  -l <num>::
  --line=<num>::
@@ -52,6 +56,11 @@ OPTIONS
  --page::
         Analyze page allocator events
  
+--live::
+       Show live page stat.  By default perf kmem shows the total
+       allocation stat, but this option shows live (currently allocated)
+       pages instead.  (This option works with the --page option only.)
+
  SEE ALSO
  --------
  linkperf:perf-record[1]
diff --git a/tools/perf/Documentation/perf-kvm.txt b/tools/perf/Documentation/perf-kvm.txt

index 6252e776009c022dda55970dad2aa72e13610cf4..6a5bb2b170391da59b572f8c4230209fb04199e2 100644 (file)
--- a/tools/perf/Documentation/perf-kvm.txt
+++ b/tools/perf/Documentation/perf-kvm.txt
@@ -151,6 +151,12 @@ STAT LIVE OPTIONS
         Show events other than HLT (x86 only) or Wait state (s390 only)
         that take longer than duration usecs.
  
+--proc-map-timeout::
+       When processing pre-existing threads /proc/XXX/maps, it may take
+       a long time, because the file may be huge. A time out is needed
+       in such cases.
+       This option sets the time out limit. The default value is 500 ms.
+
  SEE ALSO
  --------
  linkperf:perf-top[1], linkperf:perf-record[1], linkperf:perf-report[1],
diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt

index 239609c09f83a10ebde91c8254ab94c9b4d15354..3a8a9ba2b0412aba28f1796e283106f84ec63266 100644 (file)
--- a/tools/perf/Documentation/perf-probe.txt
+++ b/tools/perf/Documentation/perf-probe.txt
@@ -14,11 +14,13 @@ or
  or
  'perf probe' [options] --del='[GROUP:]EVENT' [...]
  or
-'perf probe' --list
+'perf probe' --list[=[GROUP:]EVENT]
  or
  'perf probe' [options] --line='LINE'
  or
  'perf probe' [options] --vars='PROBEPOINT'
+or
+'perf probe' [options] --funcs
  
  DESCRIPTION
  -----------
@@ -64,8 +66,8 @@ OPTIONS
         classes(e.g. [a-z], [!A-Z]).
  
  -l::
---list::
-       List up current probe events.
+--list[=[GROUP:]EVENT]::
+       List the current probe events. An optional pattern can be given to filter the list by event name.
  
  -L::
  --line=::
@@ -81,10 +83,15 @@ OPTIONS
         (Only for --vars) Show external defined variables in addition to local
         variables.
  
+--no-inlines::
+       (Only for --add) Search only for non-inlined functions. The functions
+       which do not have instances are ignored.
+
  -F::
---funcs::
+--funcs[=FILTER]::
         Show available functions in given module or kernel. With -x/--exec,
         can also list functions in a user space executable / shared library.
+       This also can accept a FILTER rule argument.
  
  --filter=FILTER::
         (Only for --vars and --funcs) Set filter. FILTER is a combination of glob
@@ -148,7 +155,7 @@ Each probe argument follows below syntax.
   [NAME=]LOCALVAR|$retval|%REG|@SYMBOL[:TYPE]
  
  'NAME' specifies the name of this argument (optional). You can use the name of local variable, local data structure member (e.g. var->field, var.field2), local array with fixed index (e.g. array[1], var->array[0], var->pointer[2]), or kprobe-tracer argument format (e.g. $retval, %ax, etc). Note that the name of this argument will be set as the last member name if you specify a local data structure member (e.g. field2 for 'var->field1.field2'.)
-'$vars' special argument is also available for NAME, it is expanded to the local variables which can access at given probe point.
+'$vars' and '$params' special arguments are also available for NAME. '$vars' is expanded to the local variables (including function parameters) that can be accessed at the given probe point; '$params' is expanded to only the function parameters.
  'TYPE' casts the type of this argument (optional). If omitted, perf probe automatically set the type based on debuginfo. You can specify 'string' type only for the local variable or structure member which is an array of or a pointer to 'char' or 'unsigned char' type.
  
  On x86 systems %REG is always the short form of the register: for example %AX. %RAX or %EAX is not valid.
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt

index 4847a793de6516df66dc609478c36724772bb941..9b9d9d086680ae82885d2603d2b90435132234bc 100644 (file)
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -108,6 +108,8 @@ OPTIONS
         Number of mmap data pages (must be a power of two) or size
         specification with appended unit character - B/K/M/G. The
         size is rounded up to have nearest pages power of two value.
+       Also, by adding a comma, the number of mmap pages for AUX
+       area tracing can be specified.
  
  --group::
         Put all events in a single event group.  This precedes the --event
@@ -145,16 +147,21 @@ OPTIONS
  
  -s::
  --stat::
-       Per thread counts.
+       Record per-thread event counts.  Use it with 'perf report -T' to see
+       the values.
  
  -d::
  --data::
-       Sample addresses.
+       Record the sample addresses.
  
  -T::
  --timestamp::
-       Sample timestamps. Use it with 'perf report -D' to see the timestamps,
-       for instance.
+       Record the sample timestamps. Use it with 'perf report -D' to see the
+       timestamps, for instance.
+
+-P::
+--period::
+       Record the sample period.
  
  -n::
  --no-samples::
@@ -257,6 +264,18 @@ records. See clock_gettime(). In particular CLOCK_MONOTONIC and
  CLOCK_MONOTONIC_RAW are supported, some events might also allow
  CLOCK_BOOTTIME, CLOCK_REALTIME and CLOCK_TAI.
  
+-S::
+--snapshot::
+Select AUX area tracing Snapshot Mode. This option is valid only with an
+AUX area tracing event. Optionally the number of bytes to capture per
+snapshot can be specified. In Snapshot Mode, trace data is captured only when
+signal SIGUSR2 is received.
+
+--proc-map-timeout::
+When processing the /proc/XXX/maps files of pre-existing threads, it may take a
+long time, because the files may be huge. A timeout is needed in such cases.
+This option sets the timeout limit. The default value is 500 ms.
+
  SEE ALSO
  --------
  linkperf:perf-stat[1], linkperf:perf-list[1]
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt

index 4879cf63882482155ffad05a3bc2abc8dff90f14..c33b69f3374fda01b6a9ee7dcca6b7bbb2f74e8c 100644 (file)
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -34,7 +34,8 @@ OPTIONS
  
  -T::
  --threads::
-       Show per-thread event counters
+       Show per-thread event counters.  The input data file should be recorded
+       with -s option.
  -c::
  --comms=::
         Only consider symbols in these comms. CSV that understands
@@ -193,6 +194,7 @@ OPTIONS
         Accumulate callchain of children to parent entry so that then can
         show up in the output.  The output will have a new "Children" column
         and will be sorted on the data.  It requires callchains are recorded.
+       See the `overhead calculation' section for more details.
  
  --max-stack::
         Set the stack depth limit when parsing the callchain, anything
@@ -323,6 +325,37 @@ OPTIONS
  --header-only::
         Show only perf.data header (forces --stdio).
  
+--itrace::
+       Options for decoding instruction tracing data. The options are:
+
+               i       synthesize instructions events
+               b       synthesize branches events
+               c       synthesize branches events (calls only)
+               r       synthesize branches events (returns only)
+               x       synthesize transactions events
+               e       synthesize error events
+               d       create a debug log
+               g       synthesize a call chain (use with i or x)
+
+       The default is all events i.e. the same as --itrace=ibxe
+
+       In addition, the period (default 100000) for instructions events
+       can be specified in units of:
+
+               i       instructions
+               t       ticks
+               ms      milliseconds
+               us      microseconds
+               ns      nanoseconds (default)
+
+       Also the call chain size (default 16, max. 1024) for instructions or
+       transactions events can be specified.
+
+       To disable decoding entirely, use --no-itrace.
+
+
+include::callchain-overhead-calculation.txt[]
+
  SEE ALSO
  --------
  linkperf:perf-stat[1], linkperf:perf-annotate[1]
diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt

index 79445750fcb322fb323c38b6e9a921500f5241c6..c82df572fac2ed4b4285487485d0ed33492e36c5 100644 (file)
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -115,7 +115,8 @@ OPTIONS
  -f::
  --fields::
          Comma separated list of fields to print. Options are:
-        comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff, srcline, period.
+        comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff,
+       srcline, period, flags.
          Field list can be prepended with the type, trace, sw or hw,
          to indicate to which event type the field list applies.
          e.g., -f sw:comm,tid,time,ip,sym  and -f trace:time,cpu,trace
@@ -165,6 +166,12 @@ OPTIONS
  
         At this point usage is displayed, and perf-script exits.
  
+       The flags field is synthesized and may have a value when decoding
+       Instruction Trace data. The flags are "bcrosyiABEx" which stand for branch,
+       call, return, conditional, system, asynchronous, interrupt,
+       transaction abort, trace begin, trace end, and in transaction,
+       respectively.
+
         Finally, a user may not set fields to none for all event types.
         i.e., -f "" is not allowed.
  
@@ -221,6 +228,34 @@ OPTIONS
  --header-only
         Show only perf.data header.
  
+--itrace::
+       Options for decoding instruction tracing data. The options are:
+
+               i       synthesize instructions events
+               b       synthesize branches events
+               c       synthesize branches events (calls only)
+               r       synthesize branches events (returns only)
+               x       synthesize transactions events
+               e       synthesize error events
+               d       create a debug log
+               g       synthesize a call chain (use with i or x)
+
+       The default is all events i.e. the same as --itrace=ibxe
+
+       In addition, the period (default 100000) for instructions events
+       can be specified in units of:
+
+               i       instructions
+               t       ticks
+               ms      milliseconds
+               us      microseconds
+               ns      nanoseconds (default)
+
+       Also the call chain size (default 16, max. 1024) for instructions or
+       transactions events can be specified.
+
+       To disable decoding entirely, use --no-itrace.
+
  SEE ALSO
  --------
  linkperf:perf-record[1], linkperf:perf-script-perl[1],
diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt

index 3265b10705188027ab30256f7475d218d78f6fbb..776aec4d092771ed8ea7c68c7ce205d0b7578aaa 100644 (file)
--- a/tools/perf/Documentation/perf-top.txt
+++ b/tools/perf/Documentation/perf-top.txt
@@ -168,7 +168,7 @@ Default is to monitor all CPUS.
         Accumulate callchain of children to parent entry so that then can
         show up in the output.  The output will have a new "Children" column
         and will be sorted on the data.  It requires -g/--call-graph option
-       enabled.
+       enabled.  See the `overhead calculation' section for more details.
  
  --max-stack::
         Set the stack depth limit when parsing the callchain, anything
@@ -201,6 +201,12 @@ Default is to monitor all CPUS.
         Force each column width to the provided list, for large terminal
         readability.  0 means no limit (default behavior).
  
+--proc-map-timeout::
+       When processing the /proc/XXX/maps files of pre-existing threads,
+       it may take a long time, because the files may be huge. A timeout
+       is needed in such cases.
+       This option sets the timeout limit. The default value is 500 ms.
+
  
  INTERACTIVE PROMPTING KEYS
  --------------------------
@@ -234,6 +240,7 @@ INTERACTIVE PROMPTING KEYS
  
  Pressing any unmapped key displays a menu, and prompts for input.
  
+include::callchain-overhead-calculation.txt[]
  
  SEE ALSO
  --------
diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt

index ba03fd5d1a5476ce218e6b5c7d36720a9ec26294..7ea078658a875029fb0ad03101bacdbe45e31564 100644 (file)
--- a/tools/perf/Documentation/perf-trace.txt
+++ b/tools/perf/Documentation/perf-trace.txt
@@ -35,7 +35,7 @@ OPTIONS
  
  -e::
  --expr::
-       List of events to show, currently only syscall names.
+       List of syscalls to show, currently only syscall names.
         Prefixing with ! shows all syscalls but the ones specified.  You may
         need to escape it.
  
@@ -121,6 +121,11 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
  --event::
         Trace other events, see 'perf list' for a complete list.
  
+--proc-map-timeout::
+       When processing the /proc/XXX/maps files of pre-existing threads, it may take
+       a long time, because the files may be huge. A timeout is needed in such cases.
+       This option sets the timeout limit. The default value is 500 ms.
+
  PAGEFAULTS
  ----------
  
diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST

index 11ccbb22ea2b8f1538f07450504c35b50c10399f..fe50a1b34aa0035dec38a07a752ae33df9ad0e5d 100644 (file)
--- a/tools/perf/MANIFEST
+++ b/tools/perf/MANIFEST
@@ -1,12 +1,30 @@
  tools/perf
+tools/arch/alpha/include/asm/barrier.h
+tools/arch/arm/include/asm/barrier.h
+tools/arch/ia64/include/asm/barrier.h
+tools/arch/mips/include/asm/barrier.h
+tools/arch/powerpc/include/asm/barrier.h
+tools/arch/s390/include/asm/barrier.h
+tools/arch/sh/include/asm/barrier.h
+tools/arch/sparc/include/asm/barrier.h
+tools/arch/sparc/include/asm/barrier_32.h
+tools/arch/sparc/include/asm/barrier_64.h
+tools/arch/tile/include/asm/barrier.h
+tools/arch/x86/include/asm/barrier.h
+tools/arch/xtensa/include/asm/barrier.h
  tools/scripts
  tools/build
+tools/arch/x86/include/asm/atomic.h
+tools/arch/x86/include/asm/rmwcc.h
  tools/lib/traceevent
  tools/lib/api
  tools/lib/symbol/kallsyms.c
  tools/lib/symbol/kallsyms.h
  tools/lib/util/find_next_bit.c
+tools/include/asm/atomic.h
+tools/include/asm/barrier.h
  tools/include/asm/bug.h
+tools/include/asm-generic/barrier.h
  tools/include/asm-generic/bitops/arch_hweight.h
  tools/include/asm-generic/bitops/atomic.h
  tools/include/asm-generic/bitops/const_hweight.h
@@ -17,35 +35,35 @@ tools/include/asm-generic/bitops/fls64.h
  tools/include/asm-generic/bitops/fls.h
  tools/include/asm-generic/bitops/hweight.h
  tools/include/asm-generic/bitops.h
+tools/include/linux/atomic.h
  tools/include/linux/bitops.h
  tools/include/linux/compiler.h
  tools/include/linux/export.h
  tools/include/linux/hash.h
+tools/include/linux/kernel.h
+tools/include/linux/list.h
  tools/include/linux/log2.h
+tools/include/linux/poison.h
  tools/include/linux/types.h
  include/asm-generic/bitops/arch_hweight.h
  include/asm-generic/bitops/const_hweight.h
  include/asm-generic/bitops/fls64.h
  include/asm-generic/bitops/__fls.h
  include/asm-generic/bitops/fls.h
-include/linux/const.h
  include/linux/perf_event.h
  include/linux/rbtree.h
  include/linux/list.h
  include/linux/hash.h
  include/linux/stringify.h
-lib/find_next_bit.c
  lib/hweight.c
  lib/rbtree.c
  include/linux/swab.h
  arch/*/include/asm/unistd*.h
-arch/*/include/asm/perf_regs.h
  arch/*/include/uapi/asm/unistd*.h
  arch/*/include/uapi/asm/perf_regs.h
  arch/*/lib/memcpy*.S
  arch/*/lib/memset*.S
  include/linux/poison.h
-include/linux/magic.h
  include/linux/hw_breakpoint.h
  include/linux/rbtree_augmented.h
  include/uapi/linux/perf_event.h
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf

index c43a2051759157dd6b65118b34b342b149f58861..1af0cfeb7a57824980ef64fdf4d26f643fa7ab6c 100644 (file)
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -73,6 +73,8 @@ include config/utilities.mak
  # for CTF data format.
  #
  # Define NO_LZMA if you do not want to support compressed (xz) kernel modules
+#
+# Define NO_AUXTRACE if you do not want AUX area tracing support
  
  ifeq ($(srctree),)
  srctree := $(patsubst %/,%,$(dir $(shell pwd)))
@@ -171,6 +173,9 @@ endif
  LIBTRACEEVENT = $(TE_PATH)libtraceevent.a
  export LIBTRACEEVENT
  
+LIBTRACEEVENT_DYNAMIC_LIST = $(TE_PATH)libtraceevent-dynamic-list
+LIBTRACEEVENT_DYNAMIC_LIST_LDFLAGS = -Xlinker --dynamic-list=$(LIBTRACEEVENT_DYNAMIC_LIST)
+
  LIBAPI = $(LIB_PATH)libapi.a
  export LIBAPI
  
@@ -185,8 +190,9 @@ python-clean := $(call QUIET_CLEAN, python) $(RM) -r $(PYTHON_EXTBUILD) $(OUTPUT
  PYTHON_EXT_SRCS := $(shell grep -v ^\# util/python-ext-sources)
  PYTHON_EXT_DEPS := util/python-ext-sources util/setup.py $(LIBTRACEEVENT) $(LIBAPI)
  
-$(OUTPUT)python/perf.so: $(PYTHON_EXT_SRCS) $(PYTHON_EXT_DEPS)
-       $(QUIET_GEN)CFLAGS='$(CFLAGS)' $(PYTHON_WORD) util/setup.py \
+$(OUTPUT)python/perf.so: $(PYTHON_EXT_SRCS) $(PYTHON_EXT_DEPS) $(LIBTRACEEVENT_DYNAMIC_LIST)
+       $(QUIET_GEN)CFLAGS='$(CFLAGS)' LDFLAGS='$(LDFLAGS) $(LIBTRACEEVENT_DYNAMIC_LIST_LDFLAGS)' \
+         $(PYTHON_WORD) util/setup.py \
           --quiet build_ext; \
         mkdir -p $(OUTPUT)python && \
         cp $(PYTHON_EXTBUILD_LIB)perf.so $(OUTPUT)python/
@@ -276,8 +282,9 @@ build := -f $(srctree)/tools/build/Makefile.build dir=. obj
  $(PERF_IN): $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)common-cmds.h FORCE
         $(Q)$(MAKE) $(build)=perf
  
-$(OUTPUT)perf: $(PERFLIBS) $(PERF_IN)
-       $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $(PERF_IN) $(LIBS) -o $@
+$(OUTPUT)perf: $(PERFLIBS) $(PERF_IN) $(LIBTRACEEVENT_DYNAMIC_LIST)
+       $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $(LIBTRACEEVENT_DYNAMIC_LIST_LDFLAGS) \
+               $(PERF_IN) $(LIBS) -o $@
  
  $(GTK_IN): FORCE
         $(Q)$(MAKE) $(build)=gtk
@@ -371,7 +378,13 @@ $(LIB_FILE): $(LIBPERF_IN)
  LIBTRACEEVENT_FLAGS += plugin_dir=$(plugindir_SQ)
  
  $(LIBTRACEEVENT): FORCE
-       $(Q)$(MAKE) -C $(TRACE_EVENT_DIR) $(LIBTRACEEVENT_FLAGS) O=$(OUTPUT) $(OUTPUT)libtraceevent.a plugins
+       $(Q)$(MAKE) -C $(TRACE_EVENT_DIR) $(LIBTRACEEVENT_FLAGS) O=$(OUTPUT) $(OUTPUT)libtraceevent.a
+
+libtraceevent_plugins: FORCE
+       $(Q)$(MAKE) -C $(TRACE_EVENT_DIR) $(LIBTRACEEVENT_FLAGS) O=$(OUTPUT) plugins
+
+$(LIBTRACEEVENT_DYNAMIC_LIST): libtraceevent_plugins
+       $(Q)$(MAKE) -C $(TRACE_EVENT_DIR) $(LIBTRACEEVENT_FLAGS) O=$(OUTPUT) $(OUTPUT)libtraceevent-dynamic-list
  
  $(LIBTRACEEVENT)-clean:
         $(call QUIET_CLEAN, libtraceevent)
@@ -462,7 +475,7 @@ check: $(OUTPUT)common-cmds.h
  
  install-gtk:
  
-install-bin: all install-gtk
+install-tools: all install-gtk
         $(call QUIET_INSTALL, binaries) \
                 $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)'; \
                 $(INSTALL) $(OUTPUT)perf '$(DESTDIR_SQ)$(bindir_SQ)'; \
@@ -500,12 +513,16 @@ endif
         $(call QUIET_INSTALL, perf_completion-script) \
                 $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(sysconfdir_SQ)/bash_completion.d'; \
                 $(INSTALL) perf-completion.sh '$(DESTDIR_SQ)$(sysconfdir_SQ)/bash_completion.d/perf'
+
+install-tests: all install-gtk
         $(call QUIET_INSTALL, tests) \
                 $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests'; \
                 $(INSTALL) tests/attr.py '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests'; \
                 $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/attr'; \
                 $(INSTALL) tests/attr/* '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/attr'
  
+install-bin: install-tools install-tests
+
  install: install-bin try-install-man install-traceevent-plugins
  
  install-python_ext:
@@ -549,4 +566,5 @@ FORCE:
  .PHONY: all install clean config-clean strip install-gtk
  .PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell
  .PHONY: $(GIT-HEAD-PHONY) TAGS tags cscope FORCE single_dep
+.PHONY: libtraceevent_plugins
  
diff --git a/tools/perf/arch/arm64/Build b/tools/perf/arch/arm64/Build

index 54afe4a467e7d9a3f505cad40650378bdcd27296..41bf61da476a4ce3150fc0184c84718d807be894 100644 (file)
--- a/tools/perf/arch/arm64/Build
+++ b/tools/perf/arch/arm64/Build
@@ -1 +1,2 @@
  libperf-y += util/
+libperf-$(CONFIG_DWARF_UNWIND) += tests/
diff --git a/tools/perf/arch/arm64/include/perf_regs.h b/tools/perf/arch/arm64/include/perf_regs.h

index 1d3f39c3aa564fd2e85a950fddcb4bba18f5e8db..4e5af27e3fbfa5f42d9d474d8ff7dc237722206b 100644 (file)
--- a/tools/perf/arch/arm64/include/perf_regs.h
+++ b/tools/perf/arch/arm64/include/perf_regs.h
@@ -5,8 +5,11 @@
  #include <linux/types.h>
  #include <asm/perf_regs.h>
  
+void perf_regs_load(u64 *regs);
+
  #define PERF_REGS_MASK ((1ULL << PERF_REG_ARM64_MAX) - 1)
  #define PERF_REGS_MAX  PERF_REG_ARM64_MAX
+#define PERF_SAMPLE_REGS_ABI   PERF_SAMPLE_REGS_ABI_64
  
  #define PERF_REG_IP    PERF_REG_ARM64_PC
  #define PERF_REG_SP    PERF_REG_ARM64_SP
diff --git a/tools/perf/arch/arm64/tests/Build b/tools/perf/arch/arm64/tests/Build

new file mode 100644 (file)

index 0000000..b30eff9
--- /dev/null
+++ b/tools/perf/arch/arm64/tests/Build
@@ -0,0 +1,2 @@
+libperf-y += regs_load.o
+libperf-y += dwarf-unwind.o
diff --git a/tools/perf/arch/arm64/tests/dwarf-unwind.c b/tools/perf/arch/arm64/tests/dwarf-unwind.c

new file mode 100644 (file)

index 0000000..cf04a4c
--- /dev/null
+++ b/tools/perf/arch/arm64/tests/dwarf-unwind.c
@@ -0,0 +1,61 @@
+#include <string.h>
+#include "perf_regs.h"
+#include "thread.h"
+#include "map.h"
+#include "event.h"
+#include "debug.h"
+#include "tests/tests.h"
+
+#define STACK_SIZE 8192
+
+static int sample_ustack(struct perf_sample *sample, /* fills sample->user_stack with a copy of the stack found at regs[PERF_REG_ARM64_SP]; returns 0 on success, -1 on failure */
+               struct thread *thread, u64 *regs)
+{
+       struct stack_dump *stack = &sample->user_stack;
+       struct map *map;
+       unsigned long sp;
+       u64 stack_size, *buf;
+
+       buf = malloc(STACK_SIZE); /* destination for the stack copy, at most STACK_SIZE bytes */
+       if (!buf) {
+               pr_debug("failed to allocate sample uregs data\n"); /* NOTE(review): message says "uregs" but this buffer holds stack data */
+               return -1;
+       }
+
+       sp = (unsigned long) regs[PERF_REG_ARM64_SP];
+
+       map = map_groups__find(thread->mg, MAP__VARIABLE, (u64) sp); /* presumably the map that contains sp; its end bounds the copy below */
+       if (!map) {
+               pr_debug("failed to get stack map\n");
+               free(buf);
+               return -1;
+       }
+
+       stack_size = map->end - sp; /* bytes from sp up to the end of that map */
+       stack_size = stack_size > STACK_SIZE ? STACK_SIZE : stack_size; /* clamp to the buffer size */
+
+       memcpy(buf, (void *) sp, stack_size); /* sp is dereferenced directly, so it must be an address in this process */
+       stack->data = (char *) buf; /* buf is not freed here; it is handed over to the sample */
+       stack->size = stack_size;
+       return 0;
+}
+
+int test__arch_unwind_sample(struct perf_sample *sample, /* fills sample->user_regs from perf_regs_load() and sample->user_stack via sample_ustack(); returns the latter's result, or -1 if allocation fails */
+               struct thread *thread)
+{
+       struct regs_dump *regs = &sample->user_regs;
+       u64 *buf;
+
+       buf = calloc(1, sizeof(u64) * PERF_REGS_MAX); /* zeroed array of PERF_REGS_MAX u64 slots */
+       if (!buf) {
+               pr_debug("failed to allocate sample uregs data\n");
+               return -1;
+       }
+
+       perf_regs_load(buf); /* declared in perf_regs.h as void perf_regs_load(u64 *regs) */
+       regs->abi  = PERF_SAMPLE_REGS_ABI;
+       regs->regs = buf; /* stays attached to the sample even if sample_ustack() fails; buf is not freed here */
+       regs->mask = PERF_REGS_MASK;
+
+       return sample_ustack(sample, thread, buf);
+}
diff --git a/tools/perf/arch/arm64/tests/regs_load.S b/tools/perf/arch/arm64/tests/regs_load.S

new file mode 100644 (file)

index 0000000..025b46e
--- /dev/null
+++ b/tools/perf/arch/arm64/tests/regs_load.S
@@ -0,0 +1,46 @@
+#include <linux/linkage.h>
+
+.text
+.type perf_regs_load,%function
+#define STR_REG(r)     str x##r, [x0, 8 * r]
+#define LDR_REG(r)     ldr x##r, [x0, 8 * r]
+#define SP     (8 * 31)
+#define PC     (8 * 32)
+ENTRY(perf_regs_load) /* x0 is used as the base of the output array; register r goes to byte offset 8 * r */
+       STR_REG(0) /* x0..x30 are stored to slots 0..30 by the run of STR_REG() lines */
+       STR_REG(1)
+       STR_REG(2)
+       STR_REG(3)
+       STR_REG(4)
+       STR_REG(5)
+       STR_REG(6)
+       STR_REG(7)
+       STR_REG(8)
+       STR_REG(9)
+       STR_REG(10)
+       STR_REG(11)
+       STR_REG(12)
+       STR_REG(13)
+       STR_REG(14)
+       STR_REG(15)
+       STR_REG(16)
+       STR_REG(17)
+       STR_REG(18)
+       STR_REG(19)
+       STR_REG(20)
+       STR_REG(21)
+       STR_REG(22)
+       STR_REG(23)
+       STR_REG(24)
+       STR_REG(25)
+       STR_REG(26)
+       STR_REG(27)
+       STR_REG(28)
+       STR_REG(29)
+       STR_REG(30)
+       mov x1, sp /* stage sp in x1 so it can be stored (clobbers x1) */
+       str x1, [x0, #SP] /* slot 31 receives the stack pointer */
+       str x30, [x0, #PC] /* slot 32 (PC) receives x30, the link register, i.e. the caller's return address */
+       LDR_REG(1) /* reload x1 from slot 1, undoing the clobber above */
+       ret
+ENDPROC(perf_regs_load)
diff --git a/tools/perf/arch/common.c b/tools/perf/arch/common.c

index 49776f190abfab295920534840aa1df479a4c094..b7bb42c4469401d76527264f5e538b363b1d4bf9 100644 (file)
--- a/tools/perf/arch/common.c
+++ b/tools/perf/arch/common.c
@@ -61,7 +61,7 @@ const char *const mips_triplets[] = {
  static bool lookup_path(char *name)
  {
         bool found = false;
-       char *path, *tmp;
+       char *path, *tmp = NULL;
         char buf[PATH_MAX];
         char *env = getenv("PATH");
  
diff --git a/tools/perf/arch/powerpc/util/Build b/tools/perf/arch/powerpc/util/Build

index 0af6e9b3f72857af68ddef33862f00d884a5da1e..7b8b0d1a1b626065e0b414f42a7a998723e0e57f 100644 (file)
--- a/tools/perf/arch/powerpc/util/Build
+++ b/tools/perf/arch/powerpc/util/Build
@@ -1,4 +1,5 @@
  libperf-y += header.o
+libperf-y += sym-handling.o
  
  libperf-$(CONFIG_DWARF) += dwarf-regs.o
  libperf-$(CONFIG_DWARF) += skip-callchain-idx.o
diff --git a/tools/perf/arch/powerpc/util/sym-handling.c b/tools/perf/arch/powerpc/util/sym-handling.c

new file mode 100644 (file)

index 0000000..bbc1a50
--- /dev/null
+++ b/tools/perf/arch/powerpc/util/sym-handling.c
@@ -0,0 +1,82 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * Copyright (C) 2015 Naveen N. Rao, IBM Corporation
+ */
+
+#include "debug.h"
+#include "symbol.h"
+#include "map.h"
+#include "probe-event.h"
+
+#ifdef HAVE_LIBELF_SUPPORT
+bool elf__needs_adjust_symbols(GElf_Ehdr ehdr) /* true when the ELF header type is ET_EXEC, ET_REL or ET_DYN */
+{
+       return ehdr.e_type == ET_EXEC ||
+              ehdr.e_type == ET_REL ||
+              ehdr.e_type == ET_DYN;
+}
+
+#if defined(_CALL_ELF) && _CALL_ELF == 2
+void arch__elf_sym_adjust(GElf_Sym *sym) /* adjusts sym->st_value in place; only built when _CALL_ELF == 2 */
+{
+       sym->st_value += PPC64_LOCAL_ENTRY_OFFSET(sym->st_other); /* add the offset that PPC64_LOCAL_ENTRY_OFFSET() derives from st_other */
+}
+#endif
+#endif
+
+#if !defined(_CALL_ELF) || _CALL_ELF != 2
+int arch__choose_best_symbol(struct symbol *syma, /* returns SYMBOL_B when syma looks like a "SyS"/"compat_SyS" alias, SYMBOL_A otherwise; symb is never examined */
+                            struct symbol *symb __maybe_unused)
+{
+       char *sym = syma->name;
+
+       /* Skip over any initial dot */
+       if (*sym == '.')
+               sym++;
+
+       /* Avoid "SyS" kernel syscall aliases */
+       if (strlen(sym) >= 3 && !strncmp(sym, "SyS", 3)) /* prefix match on the (dot-stripped) name */
+               return SYMBOL_B;
+       if (strlen(sym) >= 10 && !strncmp(sym, "compat_SyS", 10)) /* same check for the compat_ variant */
+               return SYMBOL_B;
+
+       return SYMBOL_A; /* no alias prefix found: keep syma */
+}
+
+/* Allow matching against dot variants: one leading '.' is ignored on each name, then the result of strcmp() is returned */
+int arch__compare_symbol_names(const char *namea, const char *nameb)
+{
+       /* Skip over initial dot */
+       if (*namea == '.')
+               namea++;
+       if (*nameb == '.')
+               nameb++;
+
+       return strcmp(namea, nameb); /* 0 when the names match apart from the leading dot */
+}
+#endif
+
+#if defined(_CALL_ELF) && _CALL_ELF == 2
+bool arch__prefers_symtab(void) /* unconditionally true; only built when _CALL_ELF == 2 */
+{
+       return true;
+}
+
+#define PPC64LE_LEP_OFFSET     8
+
+void arch__fix_tev_from_maps(struct perf_probe_event *pev, /* shifts tev's probe point by PPC64LE_LEP_OFFSET for non-uprobe events whose dso symbols come from kallsyms */
+                            struct probe_trace_event *tev, struct map *map)
+{
+       /*
+        * ppc64 ABIv2 local entry point is currently always 2 instructions
+        * (8 bytes) after the global entry point.
+        */
+       if (!pev->uprobes && map->dso->symtab_type == DSO_BINARY_TYPE__KALLSYMS) { /* not a uprobe, and the symbol table type is kallsyms */
+               tev->point.address += PPC64LE_LEP_OFFSET; /* address and offset are moved together by the same amount */
+               tev->point.offset += PPC64LE_LEP_OFFSET;
+       }
+}
+#endif
diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build

index 5ce98023d518fce2d51ee11e35b0a0f2df12d8de..c3ab760e06b4d7627896723dc9fe2b15c5327a0b 100644 (file)
--- a/tools/perf/bench/Build
+++ b/tools/perf/bench/Build
@@ -3,6 +3,7 @@ perf-y += sched-pipe.o
  perf-y += mem-memcpy.o
  perf-y += futex-hash.o
  perf-y += futex-wake.o
+perf-y += futex-wake-parallel.o
  perf-y += futex-requeue.o
  
  perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-asm.o
diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h

index 3c4dd44d45cb7b668bd22822b6f105c1de9eaf86..70b2f718cc217976ee73bd656d66b9aa884dfded 100644 (file)
--- a/tools/perf/bench/bench.h
+++ b/tools/perf/bench/bench.h
@@ -33,6 +33,8 @@ extern int bench_mem_memcpy(int argc, const char **argv,
  extern int bench_mem_memset(int argc, const char **argv, const char *prefix);
  extern int bench_futex_hash(int argc, const char **argv, const char *prefix);
  extern int bench_futex_wake(int argc, const char **argv, const char *prefix);
+extern int bench_futex_wake_parallel(int argc, const char **argv,
+                                    const char *prefix);
  extern int bench_futex_requeue(int argc, const char **argv, const char *prefix);
  
  #define BENCH_FORMAT_DEFAULT_STR       "default"
diff --git a/tools/perf/bench/futex-wake-parallel.c b/tools/perf/bench/futex-wake-parallel.c

new file mode 100644 (file)

index 0000000..6d8c9fa
--- /dev/null
+++ b/tools/perf/bench/futex-wake-parallel.c
@@ -0,0 +1,294 @@
+/*
+ * Copyright (C) 2015 Davidlohr Bueso.
+ *
+ * Block a bunch of threads and let parallel waker threads wake up an
+ * equal number of them. The program output reflects the avg latency
+ * for each individual thread to service its share of work. Ultimately
+ * it can be used to measure futex_wake() changes.
+ */
+
+#include "../perf.h"
+#include "../util/util.h"
+#include "../util/stat.h"
+#include "../util/parse-options.h"
+#include "../util/header.h"
+#include "bench.h"
+#include "futex.h"
+
+#include <err.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <pthread.h>
+
+struct thread_data {
+       pthread_t worker;
+       unsigned int nwoken;
+       struct timeval runtime;
+};
+
+static unsigned int nwakes = 1;
+
+/* all threads will block on the same futex -- hash bucket chaos ;) */
+static u_int32_t futex = 0;
+
+static pthread_t *blocked_worker;
+static bool done = false, silent = false, fshared = false;
+static unsigned int nblocked_threads = 0, nwaking_threads = 0;
+static pthread_mutex_t thread_lock;
+static pthread_cond_t thread_parent, thread_worker;
+static struct stats waketime_stats, wakeup_stats;
+static unsigned int ncpus, threads_starting;
+static int futex_flag = 0;
+
+static const struct option options[] = {
+       OPT_UINTEGER('t', "threads", &nblocked_threads, "Specify amount of threads"),
+       OPT_UINTEGER('w', "nwakers", &nwaking_threads, "Specify amount of waking threads"),
+       OPT_BOOLEAN( 's', "silent",  &silent,   "Silent mode: do not display data/details"),
+       OPT_BOOLEAN( 'S', "shared",  &fshared,  "Use shared futexes instead of private ones"),
+       OPT_END()
+};
+
+static const char * const bench_futex_wake_parallel_usage[] = {
+       "perf bench futex wake-parallel <options>",
+       NULL
+};
+
+static void *waking_workerfn(void *arg)
+{
+       struct thread_data *waker = (struct thread_data *) arg;
+       struct timeval start, end;
+
+       gettimeofday(&start, NULL);
+
+       waker->nwoken = futex_wake(&futex, nwakes, futex_flag);
+       if (waker->nwoken != nwakes)
+               warnx("couldn't wakeup all tasks (%d/%d)",
+                     waker->nwoken, nwakes);
+
+       gettimeofday(&end, NULL);
+       timersub(&end, &start, &waker->runtime);
+
+       pthread_exit(NULL);
+       return NULL;
+}
+
+static void wakeup_threads(struct thread_data *td, pthread_attr_t thread_attr)
+{
+       unsigned int i;
+
+       pthread_attr_setdetachstate(&thread_attr, PTHREAD_CREATE_JOINABLE);
+
+       /* create and launch all waking threads */
+       for (i = 0; i < nwaking_threads; i++) {
+               /*
+                * Thread creation order will impact per-thread latency
+                * as it will affect the order to acquire the hb spinlock.
+                * For now let the scheduler decide.
+                */
+               if (pthread_create(&td[i].worker, &thread_attr,
+                                  waking_workerfn, (void *)&td[i]))
+                       err(EXIT_FAILURE, "pthread_create");
+       }
+
+       for (i = 0; i < nwaking_threads; i++)
+               if (pthread_join(td[i].worker, NULL))
+                       err(EXIT_FAILURE, "pthread_join");
+}
+
+static void *blocked_workerfn(void *arg __maybe_unused)
+{
+       pthread_mutex_lock(&thread_lock);
+       threads_starting--;
+       if (!threads_starting)
+               pthread_cond_signal(&thread_parent);
+       pthread_cond_wait(&thread_worker, &thread_lock);
+       pthread_mutex_unlock(&thread_lock);
+
+       while (1) { /* handle spurious wakeups */
+               if (futex_wait(&futex, 0, NULL, futex_flag) != EINTR)
+                       break;
+       }
+
+       pthread_exit(NULL);
+       return NULL;
+}
+
+static void block_threads(pthread_t *w, pthread_attr_t thread_attr)
+{
+       cpu_set_t cpu;
+       unsigned int i;
+
+       threads_starting = nblocked_threads;
+
+       /* create and block all threads */
+       for (i = 0; i < nblocked_threads; i++) {
+               CPU_ZERO(&cpu);
+               CPU_SET(i % ncpus, &cpu);
+
+               if (pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpu))
+                       err(EXIT_FAILURE, "pthread_attr_setaffinity_np");
+
+               if (pthread_create(&w[i], &thread_attr, blocked_workerfn, NULL))
+                       err(EXIT_FAILURE, "pthread_create");
+       }
+}
+
+static void print_run(struct thread_data *waking_worker, unsigned int run_num)
+{
+       unsigned int i, wakeup_avg;
+       double waketime_avg, waketime_stddev;
+       struct stats __waketime_stats, __wakeup_stats;
+
+       init_stats(&__wakeup_stats);
+       init_stats(&__waketime_stats);
+
+       for (i = 0; i < nwaking_threads; i++) {
+               update_stats(&__waketime_stats, waking_worker[i].runtime.tv_usec);
+               update_stats(&__wakeup_stats, waking_worker[i].nwoken);
+       }
+
+       waketime_avg = avg_stats(&__waketime_stats);
+       waketime_stddev = stddev_stats(&__waketime_stats);
+       wakeup_avg = avg_stats(&__wakeup_stats);
+
+       printf("[Run %d]: Avg per-thread latency (waking %d/%d threads) "
+              "in %.4f ms (+-%.2f%%)\n", run_num + 1, wakeup_avg,
+              nblocked_threads, waketime_avg/1e3,
+              rel_stddev_stats(waketime_stddev, waketime_avg));
+}
+
+static void print_summary(void)
+{
+       unsigned int wakeup_avg;
+       double waketime_avg, waketime_stddev;
+
+       waketime_avg = avg_stats(&waketime_stats);
+       waketime_stddev = stddev_stats(&waketime_stats);
+       wakeup_avg = avg_stats(&wakeup_stats);
+
+       printf("Avg per-thread latency (waking %d/%d threads) in %.4f ms (+-%.2f%%)\n",
+              wakeup_avg,
+              nblocked_threads,
+              waketime_avg/1e3,
+              rel_stddev_stats(waketime_stddev, waketime_avg));
+}
+
+
+static void do_run_stats(struct thread_data *waking_worker)
+{
+       unsigned int i;
+
+       for (i = 0; i < nwaking_threads; i++) {
+               update_stats(&waketime_stats, waking_worker[i].runtime.tv_usec);
+               update_stats(&wakeup_stats, waking_worker[i].nwoken);
+       }
+
+}
+
+static void toggle_done(int sig __maybe_unused,
+                       siginfo_t *info __maybe_unused,
+                       void *uc __maybe_unused)
+{
+       done = true;
+}
+
+int bench_futex_wake_parallel(int argc, const char **argv,
+                             const char *prefix __maybe_unused)
+{
+       int ret = 0;
+       unsigned int i, j;
+       struct sigaction act;
+       pthread_attr_t thread_attr;
+       struct thread_data *waking_worker;
+
+       argc = parse_options(argc, argv, options,
+                            bench_futex_wake_parallel_usage, 0);
+       if (argc) {
+               usage_with_options(bench_futex_wake_parallel_usage, options);
+               exit(EXIT_FAILURE);
+       }
+
+       sigfillset(&act.sa_mask);
+       act.sa_sigaction = toggle_done;
+       sigaction(SIGINT, &act, NULL);
+
+       ncpus = sysconf(_SC_NPROCESSORS_ONLN);
+       if (!nblocked_threads)
+               nblocked_threads = ncpus;
+
+       /* some sanity checks */
+       if (nwaking_threads > nblocked_threads || !nwaking_threads)
+               nwaking_threads = nblocked_threads;
+
+       if (nblocked_threads % nwaking_threads)
+               errx(EXIT_FAILURE, "Must be perfectly divisible");
+       /*
+        * Each thread will wakeup nwakes tasks in
+        * a single futex_wake call.
+        */
+       nwakes = nblocked_threads/nwaking_threads;
+
+       blocked_worker = calloc(nblocked_threads, sizeof(*blocked_worker));
+       if (!blocked_worker)
+               err(EXIT_FAILURE, "calloc");
+
+       if (!fshared)
+               futex_flag = FUTEX_PRIVATE_FLAG;
+
+       printf("Run summary [PID %d]: blocking on %d threads (at [%s] "
+              "futex %p), %d threads waking up %d at a time.\n\n",
+              getpid(), nblocked_threads, fshared ? "shared":"private",
+              &futex, nwaking_threads, nwakes);
+
+       init_stats(&wakeup_stats);
+       init_stats(&waketime_stats);
+
+       pthread_attr_init(&thread_attr);
+       pthread_mutex_init(&thread_lock, NULL);
+       pthread_cond_init(&thread_parent, NULL);
+       pthread_cond_init(&thread_worker, NULL);
+
+       for (j = 0; j < bench_repeat && !done; j++) {
+               waking_worker = calloc(nwaking_threads, sizeof(*waking_worker));
+               if (!waking_worker)
+                       err(EXIT_FAILURE, "calloc");
+
+               /* create, launch & block all threads */
+               block_threads(blocked_worker, thread_attr);
+
+               /* make sure all threads are already blocked */
+               pthread_mutex_lock(&thread_lock);
+               while (threads_starting)
+                       pthread_cond_wait(&thread_parent, &thread_lock);
+               pthread_cond_broadcast(&thread_worker);
+               pthread_mutex_unlock(&thread_lock);
+
+               usleep(100000);
+
+               /* Ok, all threads are patiently blocked, start waking folks up */
+               wakeup_threads(waking_worker, thread_attr);
+
+               for (i = 0; i < nblocked_threads; i++) {
+                       ret = pthread_join(blocked_worker[i], NULL);
+                       if (ret)
+                               err(EXIT_FAILURE, "pthread_join");
+               }
+
+               do_run_stats(waking_worker);
+               if (!silent)
+                       print_run(waking_worker, j);
+
+               free(waking_worker);
+       }
+
+       /* cleanup & report results */
+       pthread_cond_destroy(&thread_parent);
+       pthread_cond_destroy(&thread_worker);
+       pthread_mutex_destroy(&thread_lock);
+       pthread_attr_destroy(&thread_attr);
+
+       print_summary();
+
+       free(blocked_worker);
+       return ret;
+}
diff --git a/tools/perf/bench/futex-wake.c b/tools/perf/bench/futex-wake.c

index 929f762be47e9735058f5c57bd394c4f09360c45..e5e41d3bdce724230c16a3df2a199914b9cab29d 100644 (file)
--- a/tools/perf/bench/futex-wake.c
+++ b/tools/perf/bench/futex-wake.c
@@ -60,7 +60,12 @@ static void *workerfn(void *arg __maybe_unused)
         pthread_cond_wait(&thread_worker, &thread_lock);
         pthread_mutex_unlock(&thread_lock);
  
-       futex_wait(&futex1, 0, NULL, futex_flag);
+       while (1) {
+               if (futex_wait(&futex1, 0, NULL, futex_flag) != EINTR)
+                       break;
+       }
+
+       pthread_exit(NULL);
         return NULL;
  }
  
diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c

index ba5efa4710b558239ff79c08b025ddc2da06efc5..870b7e665a203264c1b7b27684a860cbe147c450 100644 (file)
--- a/tools/perf/bench/numa.c
+++ b/tools/perf/bench/numa.c
@@ -8,6 +8,7 @@
  #include "../builtin.h"
  #include "../util/util.h"
  #include "../util/parse-options.h"
+#include "../util/cloexec.h"
  
  #include "bench.h"
  
@@ -23,6 +24,7 @@
  #include <pthread.h>
  #include <sys/mman.h>
  #include <sys/time.h>
+#include <sys/resource.h>
  #include <sys/wait.h>
  #include <sys/prctl.h>
  #include <sys/types.h>
@@ -51,6 +53,9 @@ struct thread_data {
         unsigned int            loops_done;
         u64                     val;
         u64                     runtime_ns;
+       u64                     system_time_ns;
+       u64                     user_time_ns;
+       double                  speed_gbs;
         pthread_mutex_t         *process_lock;
  };
  
@@ -1042,6 +1047,7 @@ static void *worker_thread(void *__tdata)
         u64 bytes_done;
         long work_done;
         u32 l;
+       struct rusage rusage;
  
         bind_to_cpumask(td->bind_cpumask);
         bind_to_memnode(td->bind_node);
@@ -1194,6 +1200,13 @@ static void *worker_thread(void *__tdata)
         timersub(&stop, &start0, &diff);
         td->runtime_ns = diff.tv_sec * 1000000000ULL;
         td->runtime_ns += diff.tv_usec * 1000ULL;
+       td->speed_gbs = bytes_done / (td->runtime_ns / 1e9) / 1e9;
+
+       getrusage(RUSAGE_THREAD, &rusage);
+       td->system_time_ns = rusage.ru_stime.tv_sec * 1000000000ULL;
+       td->system_time_ns += rusage.ru_stime.tv_usec * 1000ULL;
+       td->user_time_ns = rusage.ru_utime.tv_sec * 1000000000ULL;
+       td->user_time_ns += rusage.ru_utime.tv_usec * 1000ULL;
  
         free_data(thread_data, g->p.bytes_thread);
  
@@ -1420,7 +1433,7 @@ static int __bench_numa(const char *name)
         double runtime_sec_min;
         int wait_stat;
         double bytes;
-       int i, t;
+       int i, t, p;
  
         if (init())
                 return -1;
@@ -1556,6 +1569,24 @@ static int __bench_numa(const char *name)
         print_res(name, bytes / runtime_sec_max / 1e9,
                 "GB/sec,", "total-speed",       "GB/sec total speed");
  
+       if (g->p.show_details >= 2) {
+               char tname[32];
+               struct thread_data *td;
+               for (p = 0; p < g->p.nr_proc; p++) {
+                       for (t = 0; t < g->p.nr_threads; t++) {
+                               memset(tname, 0, 32);
+                               td = g->threads + p*g->p.nr_threads + t;
+                               snprintf(tname, 32, "process%d:thread%d", p, t);
+                               print_res(tname, td->speed_gbs,
+                                       "GB/sec",       "thread-speed", "GB/sec/thread speed");
+                               print_res(tname, td->system_time_ns / 1e9,
+                                       "secs", "thread-system-time", "system CPU time/thread");
+                               print_res(tname, td->user_time_ns / 1e9,
+                                       "secs", "thread-user-time", "user CPU time/thread");
+                       }
+               }
+       }
+
         free(pids);
  
         deinit();
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c

index 71bf7451c0cad1bf1f43b946631bf949eed96c3c..2c1bec39c30ea191fecb46654df14dc6229d27d5 100644 (file)
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -59,6 +59,10 @@ static int perf_evsel__add_sample(struct perf_evsel *evsel,
             (al->sym == NULL ||
              strcmp(ann->sym_hist_filter, al->sym->name) != 0)) {
                 /* We're only interested in a symbol named sym_hist_filter */
+               /*
+                * FIXME: why isn't this done in the symbol_filter when loading
+                * the DSO?
+                */
                 if (al->sym != NULL) {
                         rb_erase(&al->sym->rb_node,
                                  &al->map->dso->symbols[al->map->type]);
@@ -84,6 +88,7 @@ static int process_sample_event(struct perf_tool *tool,
  {
         struct perf_annotate *ann = container_of(tool, struct perf_annotate, tool);
         struct addr_location al;
+       int ret = 0;
  
         if (perf_event__preprocess_sample(event, machine, &al, sample) < 0) {
                 pr_warning("problem processing %d event, skipping it.\n",
@@ -92,15 +97,16 @@ static int process_sample_event(struct perf_tool *tool,
         }
  
         if (ann->cpu_list && !test_bit(sample->cpu, ann->cpu_bitmap))
-               return 0;
+               goto out_put;
  
         if (!al.filtered && perf_evsel__add_sample(evsel, sample, &al, ann)) {
                 pr_warning("problem incrementing symbol count, "
                            "skipping event\n");
-               return -1;
+               ret = -1;
         }
-
-       return 0;
+out_put:
+       addr_location__put(&al);
+       return ret;
  }
  
  static int hist_entry__tty_annotate(struct hist_entry *he,
@@ -283,7 +289,6 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __maybe_unused)
                 },
         };
         struct perf_data_file file = {
-               .path  = input_name,
                 .mode  = PERF_DATA_MODE_READ,
         };
         const struct option options[] = {
@@ -324,6 +329,8 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __maybe_unused)
                    "objdump binary to use for disassembly and annotations"),
         OPT_BOOLEAN(0, "group", &symbol_conf.event_group,
                     "Show event group information together"),
+       OPT_BOOLEAN(0, "show-total-period", &symbol_conf.show_total_period,
+                   "Show a column with the sum of periods"),
         OPT_END()
         };
         int ret = hists__init();
@@ -340,6 +347,8 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __maybe_unused)
         else if (annotate.use_gtk)
                 use_browser = 2;
  
+       file.path  = input_name;
+
         setup_browser(true);
  
         annotate.session = perf_session__new(&file, false, &annotate.tool);
diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c

index b9a56fa8333065271a9242e98cf052d9c8100b87..b5314e452ec7f24a2e9e4e6b8473d39b8ed7a4a7 100644 (file)
--- a/tools/perf/builtin-bench.c
+++ b/tools/perf/builtin-bench.c
@@ -58,6 +58,7 @@ static struct bench mem_benchmarks[] = {
  static struct bench futex_benchmarks[] = {
         { "hash",       "Benchmark for futex hash table",               bench_futex_hash        },
         { "wake",       "Benchmark for futex wake calls",               bench_futex_wake        },
+       { "wake-parallel", "Benchmark for parallel futex wake calls",   bench_futex_wake_parallel },
         { "requeue",    "Benchmark for futex requeue calls",            bench_futex_requeue     },
         { "all",        "Test all futex benchmarks",                    NULL                    },
         { NULL,         NULL,                                           NULL                    }
diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c

index feb420f74c2d9fd34778a20e93764c2e5eaf47c8..9fe93c8d4fcff11b3c2638d901d8c62da5b921d2 100644 (file)
--- a/tools/perf/builtin-buildid-list.c
+++ b/tools/perf/builtin-buildid-list.c
@@ -69,6 +69,15 @@ static int perf_session__list_build_ids(bool force, bool with_hits)
         session = perf_session__new(&file, false, &build_id__mark_dso_hit_ops);
         if (session == NULL)
                 return -1;
+
+       /*
+        * We take all buildids when the file contains AUX area tracing data
+        * because we do not decode the trace, as doing so would take too long.
+        */
+       if (!perf_data_file__is_pipe(&file) &&
+           perf_header__has_feat(&session->header, HEADER_AUXTRACE))
+               with_hits = false;
+
         /*
          * in pipe-mode, the only way to get the buildids is to parse
          * the record stream. Buildids are stored as RECORD_HEADER_BUILD_ID
diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c

index df6307b4050aaa2ab7e51403d22563e0a1866279..daaa7dca9c3ba81e1e2a7c671dc7660fe34ba4a5 100644 (file)
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -328,6 +328,7 @@ static int diff__process_sample_event(struct perf_tool *tool __maybe_unused,
  {
         struct addr_location al;
         struct hists *hists = evsel__hists(evsel);
+       int ret = -1;
  
         if (perf_event__preprocess_sample(event, machine, &al, sample) < 0) {
                 pr_warning("problem processing %d event, skipping it.\n",
@@ -338,7 +339,7 @@ static int diff__process_sample_event(struct perf_tool *tool __maybe_unused,
         if (hists__add_entry(hists, &al, sample->period,
                              sample->weight, sample->transaction)) {
                 pr_warning("problem incrementing symbol period, skipping event\n");
-               return -1;
+               goto out_put;
         }
  
         /*
@@ -350,8 +351,10 @@ static int diff__process_sample_event(struct perf_tool *tool __maybe_unused,
         hists->stats.total_period += sample->period;
         if (!al.filtered)
                 hists->stats.total_non_filtered_period += sample->period;
-
-       return 0;
+       ret = 0;
+out_put:
+       addr_location__put(&al);
+       return ret;
  }
  
  static struct perf_tool tool = {
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c

index 40a33d7334cce224fb0ac8317554189f1821dfcf..52ec66b236076c46c1bd0666967d1d204d2c95c3 100644 (file)
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -16,6 +16,7 @@
  #include "util/debug.h"
  #include "util/build-id.h"
  #include "util/data.h"
+#include "util/auxtrace.h"
  
  #include "util/parse-options.h"
  
@@ -26,10 +27,12 @@ struct perf_inject {
         struct perf_session     *session;
         bool                    build_ids;
         bool                    sched_stat;
+       bool                    have_auxtrace;
         const char              *input_name;
         struct perf_data_file   output;
         u64                     bytes_written;
         struct list_head        samples;
+       struct itrace_synth_opts itrace_synth_opts;
  };
  
  struct event_entry {
@@ -38,14 +41,11 @@ struct event_entry {
         union perf_event event[0];
  };
  
-static int perf_event__repipe_synth(struct perf_tool *tool,
-                                   union perf_event *event)
+static int output_bytes(struct perf_inject *inject, void *buf, size_t sz)
  {
-       struct perf_inject *inject = container_of(tool, struct perf_inject, tool);
         ssize_t size;
  
-       size = perf_data_file__write(&inject->output, event,
-                                    event->header.size);
+       size = perf_data_file__write(&inject->output, buf, sz);
         if (size < 0)
                 return -errno;
  
@@ -53,6 +53,15 @@ static int perf_event__repipe_synth(struct perf_tool *tool,
         return 0;
  }
  
+static int perf_event__repipe_synth(struct perf_tool *tool,
+                                   union perf_event *event)
+{
+       struct perf_inject *inject = container_of(tool, struct perf_inject,
+                                                 tool);
+
+       return output_bytes(inject, event, event->header.size);
+}
+
  static int perf_event__repipe_oe_synth(struct perf_tool *tool,
                                        union perf_event *event,
                                        struct ordered_events *oe __maybe_unused)
@@ -86,6 +95,79 @@ static int perf_event__repipe_attr(struct perf_tool *tool,
         return perf_event__repipe_synth(tool, event);
  }
  
+#ifdef HAVE_AUXTRACE_SUPPORT
+
+static int copy_bytes(struct perf_inject *inject, int fd, off_t size)
+{
+       char buf[4096];
+       ssize_t ssz;
+       int ret;
+
+       while (size > 0) {
+               ssz = read(fd, buf, min(size, (off_t)sizeof(buf)));
+               if (ssz < 0)
+                       return -errno;
+               ret = output_bytes(inject, buf, ssz);
+               if (ret)
+                       return ret;
+               size -= ssz;
+       }
+
+       return 0;
+}
+
+static s64 perf_event__repipe_auxtrace(struct perf_tool *tool,
+                                      union perf_event *event,
+                                      struct perf_session *session
+                                      __maybe_unused)
+{
+       struct perf_inject *inject = container_of(tool, struct perf_inject,
+                                                 tool);
+       int ret;
+
+       inject->have_auxtrace = true;
+
+       if (!inject->output.is_pipe) {
+               off_t offset;
+
+               offset = lseek(inject->output.fd, 0, SEEK_CUR);
+               if (offset == -1)
+                       return -errno;
+               ret = auxtrace_index__auxtrace_event(&session->auxtrace_index,
+                                                    event, offset);
+               if (ret < 0)
+                       return ret;
+       }
+
+       if (perf_data_file__is_pipe(session->file) || !session->one_mmap) {
+               ret = output_bytes(inject, event, event->header.size);
+               if (ret < 0)
+                       return ret;
+               ret = copy_bytes(inject, perf_data_file__fd(session->file),
+                                event->auxtrace.size);
+       } else {
+               ret = output_bytes(inject, event,
+                                  event->header.size + event->auxtrace.size);
+       }
+       if (ret < 0)
+               return ret;
+
+       return event->auxtrace.size;
+}
+
+#else
+
+static s64
+perf_event__repipe_auxtrace(struct perf_tool *tool __maybe_unused,
+                           union perf_event *event __maybe_unused,
+                           struct perf_session *session __maybe_unused)
+{
+       pr_err("AUX area tracing not supported\n");
+       return -EINVAL;
+}
+
+#endif
+
  static int perf_event__repipe(struct perf_tool *tool,
                               union perf_event *event,
                               struct perf_sample *sample __maybe_unused,
@@ -155,6 +237,32 @@ static int perf_event__repipe_fork(struct perf_tool *tool,
         return err;
  }
  
+static int perf_event__repipe_comm(struct perf_tool *tool,
+                                  union perf_event *event,
+                                  struct perf_sample *sample,
+                                  struct machine *machine)
+{
+       int err;
+
+       err = perf_event__process_comm(tool, event, sample, machine);
+       perf_event__repipe(tool, event, sample, machine);
+
+       return err;
+}
+
+static int perf_event__repipe_exit(struct perf_tool *tool,
+                                  union perf_event *event,
+                                  struct perf_sample *sample,
+                                  struct machine *machine)
+{
+       int err;
+
+       err = perf_event__process_exit(tool, event, sample, machine);
+       perf_event__repipe(tool, event, sample, machine);
+
+       return err;
+}
+
  static int perf_event__repipe_tracing_data(struct perf_tool *tool,
                                            union perf_event *event,
                                            struct perf_session *session)
@@ -167,6 +275,18 @@ static int perf_event__repipe_tracing_data(struct perf_tool *tool,
         return err;
  }
  
+static int perf_event__repipe_id_index(struct perf_tool *tool,
+                                      union perf_event *event,
+                                      struct perf_session *session)
+{
+       int err;
+
+       perf_event__repipe_synth(tool, event);
+       err = perf_event__process_id_index(tool, event, session);
+
+       return err;
+}
+
  static int dso__read_build_id(struct dso *dso)
  {
         if (dso->has_build_id)
@@ -245,6 +365,7 @@ static int perf_event__inject_buildid(struct perf_tool *tool,
                 }
         }
  
+       thread__put(thread);
  repipe:
         perf_event__repipe(tool, event, sample, machine);
         return 0;
@@ -351,16 +472,20 @@ static int __cmd_inject(struct perf_inject *inject)
         struct perf_session *session = inject->session;
         struct perf_data_file *file_out = &inject->output;
         int fd = perf_data_file__fd(file_out);
+       u64 output_data_offset;
  
         signal(SIGINT, sig_handler);
  
-       if (inject->build_ids || inject->sched_stat) {
+       if (inject->build_ids || inject->sched_stat ||
+           inject->itrace_synth_opts.set) {
                 inject->tool.mmap         = perf_event__repipe_mmap;
                 inject->tool.mmap2        = perf_event__repipe_mmap2;
                 inject->tool.fork         = perf_event__repipe_fork;
                 inject->tool.tracing_data = perf_event__repipe_tracing_data;
         }
  
+       output_data_offset = session->header.data_offset;
+
         if (inject->build_ids) {
                 inject->tool.sample = perf_event__inject_buildid;
         } else if (inject->sched_stat) {
@@ -379,17 +504,43 @@ static int __cmd_inject(struct perf_inject *inject)
                         else if (!strncmp(name, "sched:sched_stat_", 17))
                                 evsel->handler = perf_inject__sched_stat;
                 }
+       } else if (inject->itrace_synth_opts.set) {
+               session->itrace_synth_opts = &inject->itrace_synth_opts;
+               inject->itrace_synth_opts.inject = true;
+               inject->tool.comm           = perf_event__repipe_comm;
+               inject->tool.exit           = perf_event__repipe_exit;
+               inject->tool.id_index       = perf_event__repipe_id_index;
+               inject->tool.auxtrace_info  = perf_event__process_auxtrace_info;
+               inject->tool.auxtrace       = perf_event__process_auxtrace;
+               inject->tool.ordered_events = true;
+               inject->tool.ordering_requires_timestamps = true;
+               /* Allow space in the header for new attributes */
+               output_data_offset = 4096;
         }
  
+       if (!inject->itrace_synth_opts.set)
+               auxtrace_index__free(&session->auxtrace_index);
+
         if (!file_out->is_pipe)
-               lseek(fd, session->header.data_offset, SEEK_SET);
+               lseek(fd, output_data_offset, SEEK_SET);
  
         ret = perf_session__process_events(session);
  
         if (!file_out->is_pipe) {
-               if (inject->build_ids)
+               if (inject->build_ids) {
                         perf_header__set_feat(&session->header,
                                               HEADER_BUILD_ID);
+                       if (inject->have_auxtrace)
+                               dsos__hit_all(session);
+               }
+               /*
+                * The AUX areas have been removed and replaced with
+                * synthesized hardware events, so clear the feature flag.
+                */
+               if (inject->itrace_synth_opts.set)
+                       perf_header__clear_feat(&session->header,
+                                               HEADER_AUXTRACE);
+               session->header.data_offset = output_data_offset;
                 session->header.data_size = inject->bytes_written;
                 perf_session__write_header(session, session->evlist, fd, true);
         }
@@ -408,11 +559,16 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused)
                         .fork           = perf_event__repipe,
                         .exit           = perf_event__repipe,
                         .lost           = perf_event__repipe,
+                       .aux            = perf_event__repipe,
+                       .itrace_start   = perf_event__repipe,
                         .read           = perf_event__repipe_sample,
                         .throttle       = perf_event__repipe,
                         .unthrottle     = perf_event__repipe,
                         .attr           = perf_event__repipe_attr,
                         .tracing_data   = perf_event__repipe_op2_synth,
+                       .auxtrace_info  = perf_event__repipe_op2_synth,
+                       .auxtrace       = perf_event__repipe_auxtrace,
+                       .auxtrace_error = perf_event__repipe_op2_synth,
                         .finished_round = perf_event__repipe_oe_synth,
                         .build_id       = perf_event__repipe_op2_synth,
                         .id_index       = perf_event__repipe_op2_synth,
@@ -444,6 +600,9 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused)
                 OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name, "file",
                            "kallsyms pathname"),
                 OPT_BOOLEAN('f', "force", &file.force, "don't complain, do it"),
+               OPT_CALLBACK_OPTARG(0, "itrace", &inject.itrace_synth_opts,
+                                   NULL, "opts", "Instruction Tracing options",
+                                   itrace_parse_synth_opts),
                 OPT_END()
         };
         const char * const inject_usage[] = {
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c

index 1634186d537cdc2eb2ee38b174891361ef13db9f..950f296dfcf7a402ebbad0df1edf16e4d62a52bd 100644 (file)
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -10,6 +10,7 @@
  #include "util/header.h"
  #include "util/session.h"
  #include "util/tool.h"
+#include "util/callchain.h"
  
  #include "util/parse-options.h"
  #include "util/trace-event.h"
@@ -21,14 +22,19 @@
  #include <linux/rbtree.h>
  #include <linux/string.h>
  #include <locale.h>
+#include <regex.h>
  
  static int     kmem_slab;
  static int     kmem_page;
  
  static long    kmem_page_size;
+/*
+ * Default analysis target.  It starts out as KMEM_SLAB; the code that reads
+ * or changes it is not part of this hunk (presumably the option parsing
+ * falls back to it when the user names neither slab nor page -- confirm
+ * at the use site).
+ */
+static enum {
+       KMEM_SLAB,
+       KMEM_PAGE,
+} kmem_default = KMEM_SLAB;  /* for backward compatibility */
  
  struct alloc_stat;
-typedef int (*sort_fn_t)(struct alloc_stat *, struct alloc_stat *);
+typedef int (*sort_fn_t)(void *, void *);
  
  static int                     alloc_flag;
  static int                     caller_flag;
@@ -179,8 +185,8 @@ static int perf_evsel__process_alloc_node_event(struct perf_evsel *evsel,
         return ret;
  }
  
-static int ptr_cmp(struct alloc_stat *, struct alloc_stat *);
-static int callsite_cmp(struct alloc_stat *, struct alloc_stat *);
+static int ptr_cmp(void *, void *);
+static int slab_callsite_cmp(void *, void *);
  
  static struct alloc_stat *search_alloc_stat(unsigned long ptr,
                                             unsigned long call_site,
@@ -221,7 +227,8 @@ static int perf_evsel__process_free_event(struct perf_evsel *evsel,
                 s_alloc->pingpong++;
  
                 s_caller = search_alloc_stat(0, s_alloc->call_site,
-                                            &root_caller_stat, callsite_cmp);
+                                            &root_caller_stat,
+                                            slab_callsite_cmp);
                 if (!s_caller)
                         return -1;
                 s_caller->pingpong++;
@@ -241,6 +248,8 @@ static unsigned long nr_page_fails;
  static unsigned long nr_page_nomatch;
  
  static bool use_pfn;
+static bool live_page;
+static struct perf_session *kmem_session;
  
  #define MAX_MIGRATE_TYPES  6
  #define MAX_PAGE_ORDER     11
@@ -250,6 +259,7 @@ static int order_stats[MAX_PAGE_ORDER][MAX_MIGRATE_TYPES];
  struct page_stat {
         struct rb_node  node;
         u64             page;
+       u64             callsite;
         int             order;
         unsigned        gfp_flags;
         unsigned        migrate_type;
@@ -259,13 +269,158 @@ struct page_stat {
         int             nr_free;
  };
  
-static struct rb_root page_tree;
+static struct rb_root page_live_tree;
  static struct rb_root page_alloc_tree;
  static struct rb_root page_alloc_sorted;
+static struct rb_root page_caller_tree;
+static struct rb_root page_caller_sorted;
  
-static struct page_stat *search_page(unsigned long page, bool create)
+/*
+ * One kernel function that is considered part of the page allocator.
+ * Entries are filled in from kernel symbols by build_alloc_func_list().
+ */
+struct alloc_func {
+       u64 start;      /* copied from the symbol's ->start */
+       u64 end;        /* copied from the symbol's ->end */
+       char *name;     /* points at the symbol's name; not a private copy */
+};
+
+/*
+ * Array of allocator functions and its element count.  The array is grown
+ * with realloc() and sorted by start address with funcmp(); find_callsite()
+ * searches it with bsearch()/callcmp().  A NULL list pointer is what makes
+ * find_callsite() call build_alloc_func_list().
+ */
+static int nr_alloc_funcs;
+static struct alloc_func *alloc_func_list;
+
+/*
+ * qsort() comparator: orders struct alloc_func entries by ascending start
+ * address.
+ *
+ * Returns a negative value, zero or a positive value.  Equal start
+ * addresses compare as 0 so that the ordering stays consistent: cmp(a, b)
+ * and cmp(b, a) must never both claim "less than".
+ */
+static int funcmp(const void *a, const void *b)
+{
+       const struct alloc_func *fa = a;
+       const struct alloc_func *fb = b;
+
+       if (fa->start > fb->start)
+               return 1;
+       if (fa->start < fb->start)
+               return -1;
+       return 0;
+}
+
+/*
+ * bsearch() comparator used by find_callsite() to test whether an address
+ * belongs to one of the functions in 'alloc_func_list'.
+ *
+ * @a is the search key, @b an element of the list.  The key matches when
+ * its start is not below the element's start and its end is below the
+ * element's end; otherwise the two are ordered by start address.
+ */
+static int callcmp(const void *a, const void *b)
+{
+       const struct alloc_func *key = a;
+       const struct alloc_func *func = b;
+
+       /* hit: the key lies inside this function's address range */
+       if (key->start >= func->start && key->end < func->end)
+               return 0;
+
+       return key->start > func->start ? 1 : -1;
+}
+
+/*
+ * Fill 'alloc_func_list' with every symbol of the host kernel map whose
+ * name matches the page allocator pattern below, then sort the list by
+ * start address so that find_callsite() can bsearch() it.
+ *
+ * Returns 0 on success, -EINVAL when the pattern does not compile, -ENOENT
+ * when the kernel map cannot be loaded and -ENOMEM when the list cannot be
+ * grown.  The compiled regex is released on every path, and whatever was
+ * collected before an allocation failure is still left sorted.
+ */
+static int build_alloc_func_list(void)
  {
-       struct rb_node **node = &page_tree.rb_node;
+       int ret;
+       struct map *kernel_map;
+       struct symbol *sym;
+       struct rb_node *node;
+       struct alloc_func *func;
+       struct machine *machine = &kmem_session->machines.host;
+       regex_t alloc_func_regex;
+       const char pattern[] = "^_?_?(alloc|get_free|get_zeroed)_pages?";
+
+       ret = regcomp(&alloc_func_regex, pattern, REG_EXTENDED);
+       if (ret) {
+               char err[BUFSIZ];
+
+               regerror(ret, &alloc_func_regex, err, sizeof(err));
+               pr_err("Invalid regex: %s\n%s", pattern, err);
+               return -EINVAL;
+       }
+
+       /* from here on 'ret' is 0 and the regex must be freed before returning */
+       kernel_map = machine->vmlinux_maps[MAP__FUNCTION];
+       if (map__load(kernel_map, NULL) < 0) {
+               pr_err("cannot load kernel map\n");
+               ret = -ENOENT;
+               goto out_regfree;
+       }
+
+       map__for_each_symbol(kernel_map, sym, node) {
+               if (regexec(&alloc_func_regex, sym->name, 0, NULL, 0))
+                       continue;
+
+               func = realloc(alloc_func_list,
+                              (nr_alloc_funcs + 1) * sizeof(*func));
+               if (func == NULL) {
+                       /* the old array is still valid; keep it, sorted below */
+                       ret = -ENOMEM;
+                       goto out_sort;
+               }
+
+               pr_debug("alloc func: %s\n", sym->name);
+               func[nr_alloc_funcs].start = sym->start;
+               func[nr_alloc_funcs].end   = sym->end;
+               func[nr_alloc_funcs].name  = sym->name;
+
+               alloc_func_list = func;
+               nr_alloc_funcs++;
+       }
+
+out_sort:
+       /* bsearch() in find_callsite() relies on ascending start order */
+       if (nr_alloc_funcs)
+               qsort(alloc_func_list, nr_alloc_funcs,
+                     sizeof(*alloc_func_list), funcmp);
+out_regfree:
+       regfree(&alloc_func_regex);
+       return ret;
+}
+
+/*
+ * Find first non-memory allocation function from callchain.
+ * The allocation functions are in the 'alloc_func_list'.
+ *
+ * Walks the resolved callchain of @sample and returns the address of the
+ * first entry that does not fall inside a function from 'alloc_func_list'
+ * (translated with map__unmap_ip() when the entry has a map).  Falls back
+ * to sample->ip when the list cannot be built, the thread cannot be found,
+ * or every callchain entry is an allocation function.
+ */
+static u64 find_callsite(struct perf_evsel *evsel, struct perf_sample *sample)
+{
+       struct addr_location al = { .thread = NULL, };
+       struct machine *machine = &kmem_session->machines.host;
+       struct callchain_cursor_node *node;
+       u64 addr = 0;
+       bool found = false;
+
+       if (alloc_func_list == NULL) {
+               if (build_alloc_func_list() < 0)
+                       goto out;
+       }
+
+       al.thread = machine__findnew_thread(machine, sample->pid, sample->tid);
+       if (al.thread == NULL)
+               goto out;
+
+       sample__resolve_callchain(sample, NULL, evsel, &al, 16);
+
+       callchain_cursor_commit(&callchain_cursor);
+       while (true) {
+               struct alloc_func key, *caller;
+
+               node = callchain_cursor_current(&callchain_cursor);
+               if (node == NULL)
+                       break;
+
+               /* a zero-length range at the entry's address, see callcmp() */
+               key.start = key.end = node->ip;
+               caller = bsearch(&key, alloc_func_list, nr_alloc_funcs,
+                                sizeof(key), callcmp);
+               if (!caller) {
+                       /* found */
+                       if (node->map)
+                               addr = map__unmap_ip(node->map, node->ip);
+                       else
+                               addr = node->ip;
+
+                       found = true;
+                       break;
+               }
+
+               pr_debug3("skipping alloc function: %s\n", caller->name);
+               callchain_cursor_advance(&callchain_cursor);
+       }
+
+       /*
+        * process_sample_event() pairs its machine__findnew_thread() call
+        * with thread__put(); release the reference obtained here as well.
+        */
+       thread__put(al.thread);
+
+       if (found)
+               return addr;
+out:
+       pr_debug2("unknown callsite: %"PRIx64 "\n", sample->ip);
+       return sample->ip;
+}
+
+/*
+ * One sort key: a name, the comparison callback implementing it, and the
+ * list linkage used to chain several keys into an ordered key list.
+ */
+struct sort_dimension {
+       const char              name[20];
+       sort_fn_t               cmp;
+       struct list_head        list;
+};
+
+/*
+ * Key lists walked while searching/inserting into the page trees:
+ * __page_stat__findnew_alloc() iterates page_alloc_sort_input and
+ * __page_stat__findnew_caller() iterates page_caller_sort_input, stopping
+ * at the first key whose cmp() is non-zero.  Where the lists get populated
+ * is not visible in this hunk.
+ */
+static LIST_HEAD(page_alloc_sort_input);
+static LIST_HEAD(page_caller_sort_input);
+
+static struct page_stat *
+__page_stat__findnew_page(struct page_stat *pstat, bool create)
+{
+       struct rb_node **node = &page_live_tree.rb_node;
         struct rb_node *parent = NULL;
         struct page_stat *data;
  
@@ -275,7 +430,7 @@ static struct page_stat *search_page(unsigned long page, bool create)
                 parent = *node;
                 data = rb_entry(*node, struct page_stat, node);
  
-               cmp = data->page - page;
+               cmp = data->page - pstat->page;
                 if (cmp < 0)
                         node = &parent->rb_left;
                 else if (cmp > 0)
@@ -289,49 +444,48 @@ static struct page_stat *search_page(unsigned long page, bool create)
  
         data = zalloc(sizeof(*data));
         if (data != NULL) {
-               data->page = page;
+               data->page = pstat->page;
+               data->order = pstat->order;
+               data->gfp_flags = pstat->gfp_flags;
+               data->migrate_type = pstat->migrate_type;
  
                 rb_link_node(&data->node, parent, node);
-               rb_insert_color(&data->node, &page_tree);
+               rb_insert_color(&data->node, &page_live_tree);
         }
  
         return data;
  }
  
-static int page_stat_cmp(struct page_stat *a, struct page_stat *b)
+static struct page_stat *page_stat__find_page(struct page_stat *pstat)
  {
-       if (a->page > b->page)
-               return -1;
-       if (a->page < b->page)
-               return 1;
-       if (a->order > b->order)
-               return -1;
-       if (a->order < b->order)
-               return 1;
-       if (a->migrate_type > b->migrate_type)
-               return -1;
-       if (a->migrate_type < b->migrate_type)
-               return 1;
-       if (a->gfp_flags > b->gfp_flags)
-               return -1;
-       if (a->gfp_flags < b->gfp_flags)
-               return 1;
-       return 0;
+       return __page_stat__findnew_page(pstat, false);
+}
+
+static struct page_stat *page_stat__findnew_page(struct page_stat *pstat)
+{
+       return __page_stat__findnew_page(pstat, true);
  }
  
-static struct page_stat *search_page_alloc_stat(struct page_stat *pstat, bool create)
+static struct page_stat *
+__page_stat__findnew_alloc(struct page_stat *pstat, bool create)
  {
         struct rb_node **node = &page_alloc_tree.rb_node;
         struct rb_node *parent = NULL;
         struct page_stat *data;
+       struct sort_dimension *sort;
  
         while (*node) {
-               s64 cmp;
+               int cmp = 0;
  
                 parent = *node;
                 data = rb_entry(*node, struct page_stat, node);
  
-               cmp = page_stat_cmp(data, pstat);
+               list_for_each_entry(sort, &page_alloc_sort_input, list) {
+                       cmp = sort->cmp(pstat, data);
+                       if (cmp)
+                               break;
+               }
+
                 if (cmp < 0)
                         node = &parent->rb_left;
                 else if (cmp > 0)
@@ -357,6 +511,71 @@ static struct page_stat *search_page_alloc_stat(struct page_stat *pstat, bool cr
         return data;
  }
  
+/*
+ * Lookup-only front end for __page_stat__findnew_alloc(): passes
+ * create=false, so no new entry is requested for @pstat.
+ */
+static struct page_stat *page_stat__find_alloc(struct page_stat *pstat)
+{
+       return __page_stat__findnew_alloc(pstat, false);
+}
+
+/*
+ * Find-or-create front end for __page_stat__findnew_alloc(): passes
+ * create=true.  The page alloc handler treats a NULL result as -ENOMEM.
+ */
+static struct page_stat *page_stat__findnew_alloc(struct page_stat *pstat)
+{
+       return __page_stat__findnew_alloc(pstat, true);
+}
+
+/*
+ * Search 'page_caller_tree' for the entry that compares equal to @pstat
+ * under every key of 'page_caller_sort_input' (the first key with a
+ * non-zero result decides the direction of descent).
+ *
+ * When no entry matches and @create is true, a zeroed entry carrying
+ * @pstat's callsite, order, gfp_flags and migrate_type is inserted and
+ * returned.  Returns NULL when nothing matched and @create is false, or
+ * when the allocation fails.
+ */
+static struct page_stat *
+__page_stat__findnew_caller(struct page_stat *pstat, bool create)
+{
+       struct rb_node **slot = &page_caller_tree.rb_node;
+       struct rb_node *parent = NULL;
+       struct page_stat *entry;
+       struct sort_dimension *dim;
+
+       while (*slot != NULL) {
+               int res = 0;
+
+               parent = *slot;
+               entry = rb_entry(parent, struct page_stat, node);
+
+               list_for_each_entry(dim, &page_caller_sort_input, list) {
+                       res = dim->cmp(pstat, entry);
+                       if (res != 0)
+                               break;
+               }
+
+               if (res == 0)
+                       return entry;
+
+               slot = res < 0 ? &parent->rb_left : &parent->rb_right;
+       }
+
+       if (!create)
+               return NULL;
+
+       entry = zalloc(sizeof(*entry));
+       if (entry == NULL)
+               return NULL;
+
+       entry->callsite = pstat->callsite;
+       entry->order = pstat->order;
+       entry->gfp_flags = pstat->gfp_flags;
+       entry->migrate_type = pstat->migrate_type;
+
+       rb_link_node(&entry->node, parent, slot);
+       rb_insert_color(&entry->node, &page_caller_tree);
+
+       return entry;
+}
+
+/*
+ * Lookup-only front end for __page_stat__findnew_caller(): returns the
+ * matching 'page_caller_tree' entry, or NULL when there is none.
+ */
+static struct page_stat *page_stat__find_caller(struct page_stat *pstat)
+{
+       return __page_stat__findnew_caller(pstat, false);
+}
+
+/*
+ * Find-or-create front end for __page_stat__findnew_caller(): returns the
+ * matching 'page_caller_tree' entry, inserting a new one when none exists.
+ * NULL means the new entry could not be allocated.
+ */
+static struct page_stat *page_stat__findnew_caller(struct page_stat *pstat)
+{
+       return __page_stat__findnew_caller(pstat, true);
+}
+
  static bool valid_page(u64 pfn_or_page)
  {
         if (use_pfn && pfn_or_page == -1UL)
@@ -366,6 +585,176 @@ static bool valid_page(u64 pfn_or_page)
         return true;
  }
  
+/*
+ * Text forms of one numeric gfp_flags value, as recorded by
+ * parse_gfp_flags().
+ */
+struct gfp_flag {
+       unsigned int flags;     /* numeric value this entry describes */
+       char *compact_str;      /* result of compact_gfp_flags() */
+       char *human_readable;   /* strdup() of the text after "gfp_flags=" */
+};
+
+/*
+ * Table of all gfp_flags values seen so far and its length.  The array is
+ * grown with realloc() and re-sorted with gfpcmp() after each insertion so
+ * that it can be searched with bsearch().
+ */
+static struct gfp_flag *gfps;
+static int nr_gfps;
+
+/*
+ * qsort()/bsearch() comparator for struct gfp_flag, ordering entries by
+ * their numeric 'flags' value.
+ *
+ * The values are compared explicitly instead of being subtracted: 'flags'
+ * is unsigned, and a difference larger than INT_MAX would come back with
+ * the wrong sign once converted to the int return type.
+ */
+static int gfpcmp(const void *a, const void *b)
+{
+       const struct gfp_flag *fa = a;
+       const struct gfp_flag *fb = b;
+
+       if (fa->flags < fb->flags)
+               return -1;
+       if (fa->flags > fb->flags)
+               return 1;
+       return 0;
+}
+
+/*
+ * Maps a flag name as it appears in the event's "gfp_flags=" text to the
+ * short form used in the report; compact_gfp_flags() matches names with an
+ * exact strcmp().
+ *
+ * see include/trace/events/gfpflags.h
+ *
+ * NOTE(review): "NF" is used for both GFP_NOFS and GFP_NOFAIL, so the two
+ * cannot be told apart in the compact column -- confirm whether that is
+ * intended.
+ */
+static const struct {
+       const char *original;
+       const char *compact;
+} gfp_compact_table[] = {
+       { "GFP_TRANSHUGE",              "THP" },
+       { "GFP_HIGHUSER_MOVABLE",       "HUM" },
+       { "GFP_HIGHUSER",               "HU" },
+       { "GFP_USER",                   "U" },
+       { "GFP_TEMPORARY",              "TMP" },
+       { "GFP_KERNEL",                 "K" },
+       { "GFP_NOFS",                   "NF" },
+       { "GFP_ATOMIC",                 "A" },
+       { "GFP_NOIO",                   "NI" },
+       { "GFP_HIGH",                   "H" },
+       { "GFP_WAIT",                   "W" },
+       { "GFP_IO",                     "I" },
+       { "GFP_COLD",                   "CO" },
+       { "GFP_NOWARN",                 "NWR" },
+       { "GFP_REPEAT",                 "R" },
+       { "GFP_NOFAIL",                 "NF" },
+       { "GFP_NORETRY",                "NR" },
+       { "GFP_COMP",                   "C" },
+       { "GFP_ZERO",                   "Z" },
+       { "GFP_NOMEMALLOC",             "NMA" },
+       { "GFP_MEMALLOC",               "MA" },
+       { "GFP_HARDWALL",               "HW" },
+       { "GFP_THISNODE",               "TN" },
+       { "GFP_RECLAIMABLE",            "RC" },
+       { "GFP_MOVABLE",                "M" },
+       { "GFP_NOTRACK",                "NT" },
+       { "GFP_NO_KSWAPD",              "NK" },
+       { "GFP_OTHER_NODE",             "ON" },
+       { "GFP_NOWAIT",                 "NW" },
+};
+
+/*
+ * Length of the longest compact string built by compact_gfp_flags() so
+ * far; the print functions use it as the width of the GFP flags column.
+ */
+static size_t max_gfp_len;
+
+/*
+ * Translate a '|'-separated list of flag names (@gfp_flags, left
+ * unmodified) into the same list written with the short names from
+ * 'gfp_compact_table'.  Names that are not in the table are dropped.
+ *
+ * Returns a newly allocated string owned by the caller, which is empty
+ * when no name was recognised.  NULL is returned only when an allocation
+ * fails, which is how the caller interprets it (-ENOMEM).  On success
+ * 'max_gfp_len' is raised to the length of the result if that is longer.
+ */
+static char *compact_gfp_flags(char *gfp_flags)
+{
+       char *orig_flags = strdup(gfp_flags);
+       char *new_flags = NULL;
+       char *str, *pos = NULL;
+       size_t len = 0;
+
+       if (orig_flags == NULL)
+               return NULL;
+
+       str = strtok_r(orig_flags, "|", &pos);
+       while (str) {
+               size_t i;
+               char *new;
+               const char *cpt;
+
+               for (i = 0; i < ARRAY_SIZE(gfp_compact_table); i++) {
+                       if (strcmp(gfp_compact_table[i].original, str))
+                               continue;
+
+                       cpt = gfp_compact_table[i].compact;
+                       /* room for the old text, a '|', the new name and NUL */
+                       new = realloc(new_flags, len + strlen(cpt) + 2);
+                       if (new == NULL) {
+                               free(new_flags);
+                               new_flags = NULL;
+                               goto out;
+                       }
+
+                       new_flags = new;
+
+                       if (!len) {
+                               strcpy(new_flags, cpt);
+                       } else {
+                               strcat(new_flags, "|");
+                               strcat(new_flags, cpt);
+                               len++;
+                       }
+
+                       len += strlen(cpt);
+                       /* table names are unique, no later entry can match */
+                       break;
+               }
+
+               str = strtok_r(NULL, "|", &pos);
+       }
+
+       /* nothing recognised: hand back "" so NULL keeps meaning "no memory" */
+       if (new_flags == NULL)
+               new_flags = strdup("");
+
+       if (max_gfp_len < len)
+               max_gfp_len = len;
+
+out:
+       free(orig_flags);
+       return new_flags;
+}
+
+/*
+ * Return the compact string recorded in 'gfps' for @gfp_flags, or NULL
+ * when that value has no entry.  The string stays owned by the table.
+ */
+static char *compact_gfp_string(unsigned long gfp_flags)
+{
+       struct gfp_flag wanted = {
+               .flags = gfp_flags,
+       };
+       struct gfp_flag *match;
+
+       match = bsearch(&wanted, gfps, nr_gfps, sizeof(*gfps), gfpcmp);
+
+       return match ? match->compact_str : NULL;
+}
+
+/*
+ * Make sure 'gfps' has an entry for @gfp_flags.
+ *
+ * When the value is not known yet, the event is formatted with
+ * pevent_event_info() and the text of its "gfp_flags=" token is stored in
+ * both the original and the compact form; the table is then re-sorted for
+ * bsearch().
+ *
+ * Returns 0 on success (including "already known") and -ENOMEM when an
+ * allocation fails.  A new entry is only added to the table once all of
+ * its strings exist, and the trace_seq buffer is released on every path.
+ */
+static int parse_gfp_flags(struct perf_evsel *evsel, struct perf_sample *sample,
+                          unsigned int gfp_flags)
+{
+       struct pevent_record record = {
+               .cpu = sample->cpu,
+               .data = sample->raw_data,
+               .size = sample->raw_size,
+       };
+       struct trace_seq seq;
+       char *str, *pos = NULL;
+       int ret = 0;
+
+       if (nr_gfps) {
+               struct gfp_flag key = {
+                       .flags = gfp_flags,
+               };
+
+               if (bsearch(&key, gfps, nr_gfps, sizeof(*gfps), gfpcmp))
+                       return 0;
+       }
+
+       trace_seq_init(&seq);
+       pevent_event_info(&seq, evsel->tp_format, &record);
+
+       str = strtok_r(seq.buffer, " ", &pos);
+       while (str) {
+               if (!strncmp(str, "gfp_flags=", 10)) {
+                       struct gfp_flag *new;
+                       char *human = strdup(str + 10);
+                       char *compact = compact_gfp_flags(str + 10);
+
+                       if (!human || !compact) {
+                               free(human);
+                               free(compact);
+                               ret = -ENOMEM;
+                               goto out;
+                       }
+
+                       new = realloc(gfps, (nr_gfps + 1) * sizeof(*gfps));
+                       if (new == NULL) {
+                               /* 'gfps' itself is untouched and still sorted */
+                               free(human);
+                               free(compact);
+                               ret = -ENOMEM;
+                               goto out;
+                       }
+
+                       gfps = new;
+                       new += nr_gfps++;
+
+                       new->flags = gfp_flags;
+                       new->human_readable = human;
+                       new->compact_str = compact;
+
+                       qsort(gfps, nr_gfps, sizeof(*gfps), gfpcmp);
+               }
+
+               str = strtok_r(NULL, " ", &pos);
+       }
+
+out:
+       trace_seq_destroy(&seq);
+       return ret;
+}
+
  static int perf_evsel__process_page_alloc_event(struct perf_evsel *evsel,
                                                 struct perf_sample *sample)
  {
@@ -375,6 +764,7 @@ static int perf_evsel__process_page_alloc_event(struct perf_evsel *evsel,
         unsigned int migrate_type = perf_evsel__intval(evsel, sample,
                                                        "migratetype");
         u64 bytes = kmem_page_size << order;
+       u64 callsite;
         struct page_stat *pstat;
         struct page_stat this = {
                 .order = order,
@@ -397,20 +787,36 @@ static int perf_evsel__process_page_alloc_event(struct perf_evsel *evsel,
                 return 0;
         }
  
+       if (parse_gfp_flags(evsel, sample, gfp_flags) < 0)
+               return -1;
+
+       callsite = find_callsite(evsel, sample);
+
         /*
          * This is to find the current page (with correct gfp flags and
          * migrate type) at free event.
          */
-       pstat = search_page(page, true);
+       this.page = page;
+       pstat = page_stat__findnew_page(&this);
         if (pstat == NULL)
                 return -ENOMEM;
  
-       pstat->order = order;
-       pstat->gfp_flags = gfp_flags;
-       pstat->migrate_type = migrate_type;
+       pstat->nr_alloc++;
+       pstat->alloc_bytes += bytes;
+       pstat->callsite = callsite;
+
+       if (!live_page) {
+               pstat = page_stat__findnew_alloc(&this);
+               if (pstat == NULL)
+                       return -ENOMEM;
  
-       this.page = page;
-       pstat = search_page_alloc_stat(&this, true);
+               pstat->nr_alloc++;
+               pstat->alloc_bytes += bytes;
+               pstat->callsite = callsite;
+       }
+
+       this.callsite = callsite;
+       pstat = page_stat__findnew_caller(&this);
         if (pstat == NULL)
                 return -ENOMEM;
  
@@ -441,7 +847,8 @@ static int perf_evsel__process_page_free_event(struct perf_evsel *evsel,
         nr_page_frees++;
         total_page_free_bytes += bytes;
  
-       pstat = search_page(page, false);
+       this.page = page;
+       pstat = page_stat__find_page(&this);
         if (pstat == NULL) {
                 pr_debug2("missing free at page %"PRIx64" (order: %d)\n",
                           page, order);
@@ -452,20 +859,41 @@ static int perf_evsel__process_page_free_event(struct perf_evsel *evsel,
                 return 0;
         }
  
-       this.page = page;
         this.gfp_flags = pstat->gfp_flags;
         this.migrate_type = pstat->migrate_type;
+       this.callsite = pstat->callsite;
  
-       rb_erase(&pstat->node, &page_tree);
+       rb_erase(&pstat->node, &page_live_tree);
         free(pstat);
  
-       pstat = search_page_alloc_stat(&this, false);
+       if (live_page) {
+               order_stats[this.order][this.migrate_type]--;
+       } else {
+               pstat = page_stat__find_alloc(&this);
+               if (pstat == NULL)
+                       return -ENOMEM;
+
+               pstat->nr_free++;
+               pstat->free_bytes += bytes;
+       }
+
+       pstat = page_stat__find_caller(&this);
         if (pstat == NULL)
                 return -ENOENT;
  
         pstat->nr_free++;
         pstat->free_bytes += bytes;
  
+       if (live_page) {
+               pstat->nr_alloc--;
+               pstat->alloc_bytes -= bytes;
+
+               if (pstat->nr_alloc == 0) {
+                       rb_erase(&pstat->node, &page_caller_tree);
+                       free(pstat);
+               }
+       }
+
         return 0;
  }
  
@@ -478,6 +906,7 @@ static int process_sample_event(struct perf_tool *tool __maybe_unused,
                                 struct perf_evsel *evsel,
                                 struct machine *machine)
  {
+       int err = 0;
         struct thread *thread = machine__findnew_thread(machine, sample->pid,
                                                         sample->tid);
  
@@ -491,10 +920,12 @@ static int process_sample_event(struct perf_tool *tool __maybe_unused,
  
         if (evsel->handler != NULL) {
                 tracepoint_handler f = evsel->handler;
-               return f(evsel, sample);
+               err = f(evsel, sample);
         }
  
-       return 0;
+       thread__put(thread);
+
+       return err;
  }
  
  static struct perf_tool perf_kmem = {
@@ -576,41 +1007,111 @@ static const char * const migrate_type_str[] = {
         "UNKNOWN",
  };
  
-static void __print_page_result(struct rb_root *root,
-                               struct perf_session *session __maybe_unused,
-                               int n_lines)
+static void __print_page_alloc_result(struct perf_session *session, int n_lines)
  {
-       struct rb_node *next = rb_first(root);
+       struct rb_node *next = rb_first(&page_alloc_sorted);
+       struct machine *machine = &session->machines.host;
         const char *format;
+       int gfp_len = max(strlen("GFP flags"), max_gfp_len);
  
-       printf("\n%.80s\n", graph_dotted_line);
-       printf(" %-16s | Total alloc (KB) | Hits      | Order | Mig.type | GFP flags\n",
-              use_pfn ? "PFN" : "Page");
-       printf("%.80s\n", graph_dotted_line);
+       printf("\n%.105s\n", graph_dotted_line);
+       printf(" %-16s | %5s alloc (KB) | Hits      | Order | Mig.type | %-*s | Callsite\n",
+              use_pfn ? "PFN" : "Page", live_page ? "Live" : "Total",
+              gfp_len, "GFP flags");
+       printf("%.105s\n", graph_dotted_line);
  
         if (use_pfn)
-               format = " %16llu | %'16llu | %'9d | %5d | %8s |  %08lx\n";
+               format = " %16llu | %'16llu | %'9d | %5d | %8s | %-*s | %s\n";
         else
-               format = " %016llx | %'16llu | %'9d | %5d | %8s |  %08lx\n";
+               format = " %016llx | %'16llu | %'9d | %5d | %8s | %-*s | %s\n";
  
         while (next && n_lines--) {
                 struct page_stat *data;
+               struct symbol *sym;
+               struct map *map;
+               char buf[32];
+               char *caller = buf;
  
                 data = rb_entry(next, struct page_stat, node);
+               sym = machine__find_kernel_function(machine, data->callsite,
+                                                   &map, NULL);
+               if (sym && sym->name)
+                       caller = sym->name;
+               else
+                       scnprintf(buf, sizeof(buf), "%"PRIx64, data->callsite);
  
                 printf(format, (unsigned long long)data->page,
                        (unsigned long long)data->alloc_bytes / 1024,
                        data->nr_alloc, data->order,
                        migrate_type_str[data->migrate_type],
-                      (unsigned long)data->gfp_flags);
+                      gfp_len, compact_gfp_string(data->gfp_flags), caller);
  
                 next = rb_next(next);
         }
  
-       if (n_lines == -1)
-               printf(" ...              | ...              | ...       | ...   | ...      | ...     \n");
+       if (n_lines == -1) {
+               printf(" ...              | ...              | ...       | ...   | ...      | %-*s | ...\n",
+                      gfp_len, "...");
+       }
+
+       printf("%.105s\n", graph_dotted_line);
+}
+
+/*
+ * Print the per-callsite page allocation table from 'page_caller_sorted':
+ * allocated KB, hit count, order, migrate type, compact GFP flags and the
+ * callsite (kernel symbol name, or the raw address in hex when no symbol
+ * is found).  At most @n_lines rows are printed.
+ */
+static void __print_page_caller_result(struct perf_session *session, int n_lines)
+{
+       struct machine *host = &session->machines.host;
+       struct rb_node *nd = rb_first(&page_caller_sorted);
+       int width = max(strlen("GFP flags"), max_gfp_len);
+
+       printf("\n%.105s\n", graph_dotted_line);
+       printf(" %5s alloc (KB) | Hits      | Order | Mig.type | %-*s | Callsite\n",
+              live_page ? "Live" : "Total", width, "GFP flags");
+       printf("%.105s\n", graph_dotted_line);
+
+       for (; nd && n_lines--; nd = rb_next(nd)) {
+               struct page_stat *entry = rb_entry(nd, struct page_stat, node);
+               struct symbol *sym;
+               struct map *map;
+               char hex[32];
+               char *caller;
+
+               sym = machine__find_kernel_function(host, entry->callsite,
+                                                   &map, NULL);
+               if (sym && sym->name) {
+                       caller = sym->name;
+               } else {
+                       scnprintf(hex, sizeof(hex), "%"PRIx64, entry->callsite);
+                       caller = hex;
+               }
+
+               printf(" %'16llu | %'9d | %5d | %8s | %-*s | %s\n",
+                      (unsigned long long)entry->alloc_bytes / 1024,
+                      entry->nr_alloc, entry->order,
+                      migrate_type_str[entry->migrate_type],
+                      width, compact_gfp_string(entry->gfp_flags), caller);
+       }
+
+       /*
+        * n_lines is -1 here when the budget reached zero while rows were
+        * left (or when it was -1 on entry and no row was printed).
+        */
+       if (n_lines == -1) {
+               printf(" ...              | ...       | ...   | ...      | %-*s | ...\n",
+                      width, "...");
+       }
  
-       printf("%.80s\n", graph_dotted_line);
+       printf("%.105s\n", graph_dotted_line);
+}
+
+/*
+ * Print the legend of every gfp_flags value recorded in 'gfps': the value
+ * in hex, its compact form (right-aligned to 'max_gfp_len') and the full
+ * text, each line prefixed with '#'.
+ */
+static void print_gfp_flags(void)
+{
+       int idx;
+
+       printf("#\n");
+       printf("# GFP flags\n");
+       printf("# ---------\n");
+
+       for (idx = 0; idx < nr_gfps; idx++) {
+               const struct gfp_flag *gfp = &gfps[idx];
+
+               printf("# %08x: %*s: %s\n", gfp->flags,
+                      (int) max_gfp_len, gfp->compact_str,
+                      gfp->human_readable);
+       }
  }
  
  static void print_slab_summary(void)
@@ -682,8 +1183,12 @@ static void print_slab_result(struct perf_session *session)
  
  static void print_page_result(struct perf_session *session)
  {
+       if (caller_flag || alloc_flag)
+               print_gfp_flags();
+       if (caller_flag)
+               __print_page_caller_result(session, caller_lines);
         if (alloc_flag)
-               __print_page_result(&page_alloc_sorted, session, alloc_lines);
+               __print_page_alloc_result(session, alloc_lines);
         print_page_summary();
  }
  
@@ -695,14 +1200,10 @@ static void print_result(struct perf_session *session)
                 print_page_result(session);
  }
  
-struct sort_dimension {
-       const char              name[20];
-       sort_fn_t               cmp;
-       struct list_head        list;
-};
-
-static LIST_HEAD(caller_sort);
-static LIST_HEAD(alloc_sort);
+static LIST_HEAD(slab_caller_sort);
+static LIST_HEAD(slab_alloc_sort);
+static LIST_HEAD(page_caller_sort);
+static LIST_HEAD(page_alloc_sort);
  
  static void sort_slab_insert(struct rb_root *root, struct alloc_stat *data,
                              struct list_head *sort_list)
@@ -751,10 +1252,12 @@ static void __sort_slab_result(struct rb_root *root, struct rb_root *root_sorted
         }
  }
  
-static void sort_page_insert(struct rb_root *root, struct page_stat *data)
+static void sort_page_insert(struct rb_root *root, struct page_stat *data,
+                            struct list_head *sort_list)
  {
         struct rb_node **new = &root->rb_node;
         struct rb_node *parent = NULL;
+       struct sort_dimension *sort;
  
         while (*new) {
                 struct page_stat *this;
@@ -763,8 +1266,11 @@ static void sort_page_insert(struct rb_root *root, struct page_stat *data)
                 this = rb_entry(*new, struct page_stat, node);
                 parent = *new;
  
-               /* TODO: support more sort key */
-               cmp = data->alloc_bytes - this->alloc_bytes;
+               list_for_each_entry(sort, sort_list, list) {
+                       cmp = sort->cmp(data, this);
+                       if (cmp)
+                               break;
+               }
  
                 if (cmp > 0)
                         new = &parent->rb_left;
@@ -776,7 +1282,8 @@ static void sort_page_insert(struct rb_root *root, struct page_stat *data)
         rb_insert_color(&data->node, root);
  }
  
-static void __sort_page_result(struct rb_root *root, struct rb_root *root_sorted)
+static void __sort_page_result(struct rb_root *root, struct rb_root *root_sorted,
+                              struct list_head *sort_list)
  {
         struct rb_node *node;
         struct page_stat *data;
@@ -788,7 +1295,7 @@ static void __sort_page_result(struct rb_root *root, struct rb_root *root_sorted
  
                 rb_erase(node, root);
                 data = rb_entry(node, struct page_stat, node);
-               sort_page_insert(root_sorted, data);
+               sort_page_insert(root_sorted, data, sort_list);
         }
  }
  
@@ -796,12 +1303,20 @@ static void sort_result(void)
  {
         if (kmem_slab) {
                 __sort_slab_result(&root_alloc_stat, &root_alloc_sorted,
-                                  &alloc_sort);
+                                  &slab_alloc_sort);
                 __sort_slab_result(&root_caller_stat, &root_caller_sorted,
-                                  &caller_sort);
+                                  &slab_caller_sort);
         }
         if (kmem_page) {
-               __sort_page_result(&page_alloc_tree, &page_alloc_sorted);
+               if (live_page)
+                       __sort_page_result(&page_live_tree, &page_alloc_sorted,
+                                          &page_alloc_sort);
+               else
+                       __sort_page_result(&page_alloc_tree, &page_alloc_sorted,
+                                          &page_alloc_sort);
+
+               __sort_page_result(&page_caller_tree, &page_caller_sorted,
+                                  &page_caller_sort);
         }
  }
  
@@ -850,8 +1365,12 @@ out:
         return err;
  }
  
-static int ptr_cmp(struct alloc_stat *l, struct alloc_stat *r)
+/* slab sort keys */
+static int ptr_cmp(void *a, void *b)
  {
+       struct alloc_stat *l = a;
+       struct alloc_stat *r = b;
+
         if (l->ptr < r->ptr)
                 return -1;
         else if (l->ptr > r->ptr)
@@ -864,8 +1383,11 @@ static struct sort_dimension ptr_sort_dimension = {
         .cmp    = ptr_cmp,
  };
  
-static int callsite_cmp(struct alloc_stat *l, struct alloc_stat *r)
+static int slab_callsite_cmp(void *a, void *b)
  {
+       struct alloc_stat *l = a;
+       struct alloc_stat *r = b;
+
         if (l->call_site < r->call_site)
                 return -1;
         else if (l->call_site > r->call_site)
@@ -875,11 +1397,14 @@ static int callsite_cmp(struct alloc_stat *l, struct alloc_stat *r)
  
  static struct sort_dimension callsite_sort_dimension = {
         .name   = "callsite",
-       .cmp    = callsite_cmp,
+       .cmp    = slab_callsite_cmp,
  };
  
-static int hit_cmp(struct alloc_stat *l, struct alloc_stat *r)
+static int hit_cmp(void *a, void *b)
  {
+       struct alloc_stat *l = a;
+       struct alloc_stat *r = b;
+
         if (l->hit < r->hit)
                 return -1;
         else if (l->hit > r->hit)
@@ -892,8 +1417,11 @@ static struct sort_dimension hit_sort_dimension = {
         .cmp    = hit_cmp,
  };
  
-static int bytes_cmp(struct alloc_stat *l, struct alloc_stat *r)
+static int bytes_cmp(void *a, void *b)
  {
+       struct alloc_stat *l = a;
+       struct alloc_stat *r = b;
+
         if (l->bytes_alloc < r->bytes_alloc)
                 return -1;
         else if (l->bytes_alloc > r->bytes_alloc)
@@ -906,9 +1434,11 @@ static struct sort_dimension bytes_sort_dimension = {
         .cmp    = bytes_cmp,
  };
  
-static int frag_cmp(struct alloc_stat *l, struct alloc_stat *r)
+static int frag_cmp(void *a, void *b)
  {
         double x, y;
+       struct alloc_stat *l = a;
+       struct alloc_stat *r = b;
  
         x = fragmentation(l->bytes_req, l->bytes_alloc);
         y = fragmentation(r->bytes_req, r->bytes_alloc);
@@ -925,8 +1455,11 @@ static struct sort_dimension frag_sort_dimension = {
         .cmp    = frag_cmp,
  };
  
-static int pingpong_cmp(struct alloc_stat *l, struct alloc_stat *r)
+static int pingpong_cmp(void *a, void *b)
  {
+       struct alloc_stat *l = a;
+       struct alloc_stat *r = b;
+
         if (l->pingpong < r->pingpong)
                 return -1;
         else if (l->pingpong > r->pingpong)
@@ -939,7 +1472,135 @@ static struct sort_dimension pingpong_sort_dimension = {
         .cmp    = pingpong_cmp,
  };
  
-static struct sort_dimension *avail_sorts[] = {
+/* page sort keys */
+
+/*
+ * Order two struct page_stat entries, handed in as void pointers, by
+ * their ->page field.  Returns -1, 0 or 1 for less, equal or greater.
+ */
+static int page_cmp(void *a, void *b)
+{
+       struct page_stat *lhs = a;
+       struct page_stat *rhs = b;
+
+       return (lhs->page > rhs->page) - (lhs->page < rhs->page);
+}
+
+/* Sort key "page": entries are compared with page_cmp(). */
+static struct sort_dimension page_sort_dimension = {
+       .cmp    = page_cmp,
+       .name   = "page",
+};
+
+/*
+ * Order two struct page_stat entries by their ->callsite field.
+ * Returns -1, 0 or 1 for less, equal or greater.
+ */
+static int page_callsite_cmp(void *a, void *b)
+{
+       struct page_stat *lhs = a;
+       struct page_stat *rhs = b;
+
+       if (lhs->callsite > rhs->callsite)
+               return 1;
+       if (lhs->callsite < rhs->callsite)
+               return -1;
+       return 0;
+}
+
+/* Sort key "callsite" for page statistics. */
+static struct sort_dimension page_callsite_sort_dimension = {
+       .cmp    = page_callsite_cmp,
+       .name   = "callsite",
+};
+
+/*
+ * Order two struct page_stat entries by their ->nr_alloc counter.
+ * Returns -1, 0 or 1 for less, equal or greater.
+ */
+static int page_hit_cmp(void *a, void *b)
+{
+       struct page_stat *lhs = a;
+       struct page_stat *rhs = b;
+
+       return (lhs->nr_alloc > rhs->nr_alloc) -
+              (lhs->nr_alloc < rhs->nr_alloc);
+}
+
+/* Sort key "hit" for page statistics. */
+static struct sort_dimension page_hit_sort_dimension = {
+       .cmp    = page_hit_cmp,
+       .name   = "hit",
+};
+
+/*
+ * Order two struct page_stat entries by their ->alloc_bytes total.
+ * Returns -1, 0 or 1 for less, equal or greater.
+ */
+static int page_bytes_cmp(void *a, void *b)
+{
+       struct page_stat *lhs = a;
+       struct page_stat *rhs = b;
+
+       if (lhs->alloc_bytes > rhs->alloc_bytes)
+               return 1;
+       if (lhs->alloc_bytes < rhs->alloc_bytes)
+               return -1;
+       return 0;
+}
+
+/* Sort key "bytes" for page statistics. */
+static struct sort_dimension page_bytes_sort_dimension = {
+       .cmp    = page_bytes_cmp,
+       .name   = "bytes",
+};
+
+/*
+ * Order two struct page_stat entries by their ->order field.
+ * Returns -1, 0 or 1 for less, equal or greater.
+ */
+static int page_order_cmp(void *a, void *b)
+{
+       struct page_stat *lhs = a;
+       struct page_stat *rhs = b;
+
+       return (lhs->order > rhs->order) - (lhs->order < rhs->order);
+}
+
+/* Sort key "order" for page statistics. */
+static struct sort_dimension page_order_sort_dimension = {
+       .cmp    = page_order_cmp,
+       .name   = "order",
+};
+
+/*
+ * Order two struct page_stat entries by their ->migrate_type field.
+ *
+ * Only the left-hand entry is checked for the -1U marker: when it carries
+ * that value the two entries are reported as equal regardless of the
+ * right-hand side.  Otherwise returns -1, 0 or 1.
+ */
+static int migrate_type_cmp(void *a, void *b)
+{
+       struct page_stat *lhs = a;
+       struct page_stat *rhs = b;
+
+       /* for internal use to find free'd page */
+       if (lhs->migrate_type == -1U)
+               return 0;
+
+       return (lhs->migrate_type > rhs->migrate_type) -
+              (lhs->migrate_type < rhs->migrate_type);
+}
+
+/* Sort key "migtype" for page statistics. */
+static struct sort_dimension migrate_type_sort_dimension = {
+       .cmp    = migrate_type_cmp,
+       .name   = "migtype",
+};
+
+/*
+ * Order two struct page_stat entries by their ->gfp_flags field.
+ *
+ * Only the left-hand entry is checked for the -1U marker: when it carries
+ * that value the two entries are reported as equal regardless of the
+ * right-hand side.  Otherwise returns -1, 0 or 1.
+ */
+static int gfp_flags_cmp(void *a, void *b)
+{
+       struct page_stat *lhs = a;
+       struct page_stat *rhs = b;
+
+       /* for internal use to find free'd page */
+       if (lhs->gfp_flags == -1U)
+               return 0;
+
+       if (lhs->gfp_flags > rhs->gfp_flags)
+               return 1;
+       if (lhs->gfp_flags < rhs->gfp_flags)
+               return -1;
+       return 0;
+}
+
+/* Sort key "gfp" for page statistics. */
+static struct sort_dimension gfp_flags_sort_dimension = {
+       .cmp    = gfp_flags_cmp,
+       .name   = "gfp",
+};
+
+static struct sort_dimension *slab_sorts[] = {
         &ptr_sort_dimension,
         &callsite_sort_dimension,
         &hit_sort_dimension,
@@ -948,16 +1609,44 @@ static struct sort_dimension *avail_sorts[] = {
         &pingpong_sort_dimension,
  };
  
-#define NUM_AVAIL_SORTS        ((int)ARRAY_SIZE(avail_sorts))
+static struct sort_dimension *page_sorts[] = { /* searched by ->name in page_sort_dimension__add() */
+       &page_sort_dimension,           /* "page" */
+       &page_callsite_sort_dimension,  /* "callsite" */
+       &page_hit_sort_dimension,       /* "hit" */
+       &page_bytes_sort_dimension,     /* "bytes" */
+       &page_order_sort_dimension,     /* "order" */
+       &migrate_type_sort_dimension,   /* "migtype" */
+       &gfp_flags_sort_dimension,      /* "gfp" */
+};
+
+/*
+ * Look up the slab sort key named @tok in slab_sorts[] and append a
+ * memdup()'d copy of the matching sort_dimension to the tail of @list.
+ *
+ * Returns 0 on success, -1 when no key has that name or when the copy
+ * cannot be allocated (the latter is also reported through pr_err()).
+ */
+static int slab_sort_dimension__add(const char *tok, struct list_head *list)
+{
+       struct sort_dimension *match = NULL;
+       struct sort_dimension *copy;
+       int idx;
+
+       /* first entry whose name equals tok wins */
+       for (idx = 0; idx < (int)ARRAY_SIZE(slab_sorts); idx++) {
+               if (strcmp(slab_sorts[idx]->name, tok) == 0) {
+                       match = slab_sorts[idx];
+                       break;
+               }
+       }
+       if (match == NULL)
+               return -1;
+
+       /* the list gets its own copy, the table entry stays untouched */
+       copy = memdup(match, sizeof(*match));
+       if (copy == NULL) {
+               pr_err("%s: memdup failed\n", __func__);
+               return -1;
+       }
+
+       list_add_tail(&copy->list, list);
+       return 0;
+}
  
-static int sort_dimension__add(const char *tok, struct list_head *list)
+static int page_sort_dimension__add(const char *tok, struct list_head *list)
  {
         struct sort_dimension *sort;
         int i;
  
-       for (i = 0; i < NUM_AVAIL_SORTS; i++) {
-               if (!strcmp(avail_sorts[i]->name, tok)) {
-                       sort = memdup(avail_sorts[i], sizeof(*avail_sorts[i]));
+       for (i = 0; i < (int)ARRAY_SIZE(page_sorts); i++) {
+               if (!strcmp(page_sorts[i]->name, tok)) {
+                       sort = memdup(page_sorts[i], sizeof(*page_sorts[i]));
                         if (!sort) {
                                 pr_err("%s: memdup failed\n", __func__);
                                 return -1;
@@ -970,7 +1659,33 @@ static int sort_dimension__add(const char *tok, struct list_head *list)
         return -1;
  }
  
-static int setup_sorting(struct list_head *sort_list, const char *arg)
+/*
+ * Split the comma separated key list @arg and add every key, in order, to
+ * @sort_list through slab_sort_dimension__add().
+ *
+ * Works on a strdup()'d copy of @arg because strsep() modifies the string
+ * it walks.  Stops at the first key that cannot be added and reports it
+ * with error(); keys added before that point stay on @sort_list.
+ *
+ * Returns 0 when all keys were added, -1 on allocation failure or on a
+ * rejected key.
+ */
+static int setup_slab_sorting(struct list_head *sort_list, const char *arg)
+{
+       char *keys = strdup(arg);
+       char *cursor = keys;
+       char *key;
+       int ret = 0;
+
+       if (keys == NULL) {
+               pr_err("%s: strdup failed\n", __func__);
+               return -1;
+       }
+
+       while ((key = strsep(&cursor, ",")) != NULL) {
+               if (slab_sort_dimension__add(key, sort_list) < 0) {
+                       error("Unknown slab --sort key: '%s'", key);
+                       ret = -1;
+                       break;
+               }
+       }
+
+       free(keys);
+       return ret;
+}
+
+static int setup_page_sorting(struct list_head *sort_list, const char *arg)
  {
         char *tok;
         char *str = strdup(arg);
@@ -985,8 +1700,8 @@ static int setup_sorting(struct list_head *sort_list, const char *arg)
                 tok = strsep(&pos, ",");
                 if (!tok)
                         break;
-               if (sort_dimension__add(tok, sort_list) < 0) {
-                       error("Unknown --sort key: '%s'", tok);
+               if (page_sort_dimension__add(tok, sort_list) < 0) {
+                       error("Unknown page --sort key: '%s'", tok);
                         free(str);
                         return -1;
                 }
@@ -1002,10 +1717,18 @@ static int parse_sort_opt(const struct option *opt __maybe_unused,
         if (!arg)
                 return -1;
  
-       if (caller_flag > alloc_flag)
-               return setup_sorting(&caller_sort, arg);
-       else
-               return setup_sorting(&alloc_sort, arg);
+       if (kmem_page > kmem_slab ||
+           (kmem_page == 0 && kmem_slab == 0 && kmem_default == KMEM_PAGE)) {
+               if (caller_flag > alloc_flag)
+                       return setup_page_sorting(&page_caller_sort, arg);
+               else
+                       return setup_page_sorting(&page_alloc_sort, arg);
+       } else {
+               if (caller_flag > alloc_flag)
+                       return setup_slab_sorting(&slab_caller_sort, arg);
+               else
+                       return setup_slab_sorting(&slab_alloc_sort, arg);
+       }
  
         return 0;
  }
@@ -1084,7 +1807,7 @@ static int __cmd_record(int argc, const char **argv)
         if (kmem_slab)
                 rec_argc += ARRAY_SIZE(slab_events);
         if (kmem_page)
-               rec_argc += ARRAY_SIZE(page_events);
+               rec_argc += ARRAY_SIZE(page_events) + 1; /* for -g */
  
         rec_argv = calloc(rec_argc + 1, sizeof(char *));
  
@@ -1099,6 +1822,8 @@ static int __cmd_record(int argc, const char **argv)
                         rec_argv[i] = strdup(slab_events[j]);
         }
         if (kmem_page) {
+               rec_argv[i++] = strdup("-g");
+
                 for (j = 0; j < ARRAY_SIZE(page_events); j++, i++)
                         rec_argv[i] = strdup(page_events[j]);
         }
@@ -1109,9 +1834,26 @@ static int __cmd_record(int argc, const char **argv)
         return cmd_record(i, rec_argv, NULL);
  }
  
+/*
+ * Config callback handed to perf_config() by cmd_kmem().
+ *
+ * "kmem.default" selects which analysis runs when neither --slab nor
+ * --page is given: "slab" stores KMEM_SLAB and "page" stores KMEM_PAGE in
+ * kmem_default.  Any other value, including a missing one, is reported
+ * with pr_err() and leaves kmem_default unchanged; 0 is returned in all
+ * of these cases so a bad setting is not fatal.
+ *
+ * Every other variable is forwarded to perf_default_config() and its
+ * return value is passed back.
+ */
+static int kmem_config(const char *var, const char *value, void *cb)
+{
+       if (strcmp(var, "kmem.default") != 0)
+               return perf_default_config(var, value, cb);
+
+       /*
+        * value may be NULL (presumably a key written without "= value";
+        * NOTE(review): confirm against the config parser), so never pass
+        * it to strcmp() or a "%s" conversion unchecked.
+        */
+       if (value && !strcmp(value, "slab"))
+               kmem_default = KMEM_SLAB;
+       else if (value && !strcmp(value, "page"))
+               kmem_default = KMEM_PAGE;
+       else
+               pr_err("invalid default value ('slab' or 'page' required): %s\n",
+                      value ? value : "(none)");
+
+       return 0;
+}
+
  int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused)
  {
-       const char * const default_sort_order = "frag,hit,bytes";
+       const char * const default_slab_sort = "frag,hit,bytes";
+       const char * const default_page_sort = "bytes,hit";
         struct perf_data_file file = {
                 .mode = PERF_DATA_MODE_READ,
         };
@@ -1124,8 +1866,8 @@ int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused)
         OPT_CALLBACK_NOOPT(0, "alloc", NULL, NULL,
                            "show per-allocation statistics", parse_alloc_opt),
         OPT_CALLBACK('s', "sort", NULL, "key[,key2...]",
-                    "sort by keys: ptr, call_site, bytes, hit, pingpong, frag",
-                    parse_sort_opt),
+                    "sort by keys: ptr, callsite, bytes, hit, pingpong, frag, "
+                    "page, order, migtype, gfp", parse_sort_opt),
         OPT_CALLBACK('l', "line", NULL, "num", "show n lines", parse_line_opt),
         OPT_BOOLEAN(0, "raw-ip", &raw_ip, "show raw ip instead of symbol"),
         OPT_BOOLEAN('f', "force", &file.force, "don't complain, do it"),
@@ -1133,6 +1875,7 @@ int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused)
                            parse_slab_opt),
         OPT_CALLBACK_NOOPT(0, "page", NULL, NULL, "Analyze page allocator",
                            parse_page_opt),
+       OPT_BOOLEAN(0, "live", &live_page, "Show live page stat"),
         OPT_END()
         };
         const char *const kmem_subcommands[] = { "record", "stat", NULL };
@@ -1142,15 +1885,21 @@ int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused)
         };
         struct perf_session *session;
         int ret = -1;
+       const char errmsg[] = "No %s allocation events found.  Have you run 'perf kmem record --%s'?\n";
  
+       perf_config(kmem_config, NULL);
         argc = parse_options_subcommand(argc, argv, kmem_options,
                                         kmem_subcommands, kmem_usage, 0);
  
         if (!argc)
                 usage_with_options(kmem_usage, kmem_options);
  
-       if (kmem_slab == 0 && kmem_page == 0)
-               kmem_slab = 1;  /* for backward compatibility */
+       if (kmem_slab == 0 && kmem_page == 0) {
+               if (kmem_default == KMEM_SLAB)
+                       kmem_slab = 1;
+               else
+                       kmem_page = 1;
+       }
  
         if (!strncmp(argv[0], "rec", 3)) {
                 symbol__init(NULL);
@@ -1159,19 +1908,30 @@ int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused)
  
         file.path = input_name;
  
-       session = perf_session__new(&file, false, &perf_kmem);
+       kmem_session = session = perf_session__new(&file, false, &perf_kmem);
         if (session == NULL)
                 return -1;
  
+       if (kmem_slab) {
+               if (!perf_evlist__find_tracepoint_by_name(session->evlist,
+                                                         "kmem:kmalloc")) {
+                       pr_err(errmsg, "slab", "slab");
+                       return -1;
+               }
+       }
+
         if (kmem_page) {
-               struct perf_evsel *evsel = perf_evlist__first(session->evlist);
+               struct perf_evsel *evsel;
  
-               if (evsel == NULL || evsel->tp_format == NULL) {
-                       pr_err("invalid event found.. aborting\n");
+               evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
+                                                            "kmem:mm_page_alloc");
+               if (evsel == NULL) {
+                       pr_err(errmsg, "page", "page");
                         return -1;
                 }
  
                 kmem_page_size = pevent_get_page_size(evsel->tp_format->pevent);
+               symbol_conf.use_callchain = true;
         }
  
         symbol__init(&session->header.env);
@@ -1182,11 +1942,21 @@ int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused)
                 if (cpu__setup_cpunode_map())
                         goto out_delete;
  
-               if (list_empty(&caller_sort))
-                       setup_sorting(&caller_sort, default_sort_order);
-               if (list_empty(&alloc_sort))
-                       setup_sorting(&alloc_sort, default_sort_order);
-
+               if (list_empty(&slab_caller_sort))
+                       setup_slab_sorting(&slab_caller_sort, default_slab_sort);
+               if (list_empty(&slab_alloc_sort))
+                       setup_slab_sorting(&slab_alloc_sort, default_slab_sort);
+               if (list_empty(&page_caller_sort))
+                       setup_page_sorting(&page_caller_sort, default_page_sort);
+               if (list_empty(&page_alloc_sort))
+                       setup_page_sorting(&page_alloc_sort, default_page_sort);
+
+               if (kmem_page) {
+                       setup_page_sorting(&page_alloc_sort_input,
+                                          "page,order,migtype,gfp");
+                       setup_page_sorting(&page_caller_sort_input,
+                                          "callsite,order,migtype,gfp");
+               }
                 ret = __cmd_kmem(session);
         } else
                 usage_with_options(kmem_usage, kmem_options);
diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c

index 1f9338f6109cdbe79f8f08e510c00e4a171d1f82..74878cd75078055e437396fc9a6b201603586076 100644 (file)
--- a/tools/perf/builtin-kvm.c
+++ b/tools/perf/builtin-kvm.c
@@ -651,6 +651,7 @@ static int process_sample_event(struct perf_tool *tool,
                                 struct perf_evsel *evsel,
                                 struct machine *machine)
  {
+       int err = 0;
         struct thread *thread;
         struct perf_kvm_stat *kvm = container_of(tool, struct perf_kvm_stat,
                                                  tool);
@@ -666,9 +667,10 @@ static int process_sample_event(struct perf_tool *tool,
         }
  
         if (!handle_kvm_event(kvm, thread, evsel, sample))
-               return -1;
+               err = -1;
  
-       return 0;
+       thread__put(thread);
+       return err;
  }
  
  static int cpu_isa_config(struct perf_kvm_stat *kvm)
@@ -1309,6 +1311,8 @@ static int kvm_events_live(struct perf_kvm_stat *kvm,
                         "show events other than"
                         " HLT (x86 only) or Wait state (s390 only)"
                         " that take longer than duration usecs"),
+               OPT_UINTEGER(0, "proc-map-timeout", &kvm->opts.proc_map_timeout,
+                               "per thread proc mmap processing timeout in ms"),
                 OPT_END()
         };
         const char * const live_usage[] = {
@@ -1336,6 +1340,7 @@ static int kvm_events_live(struct perf_kvm_stat *kvm,
         kvm->opts.target.uses_mmap = false;
         kvm->opts.target.uid_str = NULL;
         kvm->opts.target.uid = UINT_MAX;
+       kvm->opts.proc_map_timeout = 500;
  
         symbol__init(NULL);
         disable_buildid_cache();
@@ -1391,7 +1396,7 @@ static int kvm_events_live(struct perf_kvm_stat *kvm,
         perf_session__set_id_hdr_size(kvm->session);
         ordered_events__set_copy_on_queue(&kvm->session->ordered_events, true);
         machine__synthesize_threads(&kvm->session->machines.host, &kvm->opts.target,
-                                   kvm->evlist->threads, false);
+                                   kvm->evlist->threads, false, kvm->opts.proc_map_timeout);
         err = kvm_live_open_events(kvm);
         if (err)
                 goto out;
diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c

index d49c2ab85fc2dd1e3c6560391b71f7e3269cf31e..de16aaed516e6016b2a8d887f87727a8179acf19 100644 (file)
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -769,6 +769,7 @@ static void dump_threads(void)
                 t = perf_session__findnew(session, st->tid);
                 pr_info("%10d: %s\n", st->tid, thread__comm_str(t));
                 node = rb_next(node);
+               thread__put(t);
         };
  }
  
@@ -810,6 +811,7 @@ static int process_sample_event(struct perf_tool *tool __maybe_unused,
                                 struct perf_evsel *evsel,
                                 struct machine *machine)
  {
+       int err = 0;
         struct thread *thread = machine__findnew_thread(machine, sample->pid,
                                                         sample->tid);
  
@@ -821,10 +823,12 @@ static int process_sample_event(struct perf_tool *tool __maybe_unused,
  
         if (evsel->handler != NULL) {
                 tracepoint_handler f = evsel->handler;
-               return f(evsel, sample);
+               err = f(evsel, sample);
         }
  
-       return 0;
+       thread__put(thread);
+
+       return err;
  }
  
  static void sort_result(void)
diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c

index 675216e08bfcd04baf2336ece7da328e914e21fd..da2ec06f0742dc6acf98c1c9b74d7cf45ff0fcb2 100644 (file)
--- a/tools/perf/builtin-mem.c
+++ b/tools/perf/builtin-mem.c
@@ -74,7 +74,7 @@ dump_raw_samples(struct perf_tool *tool,
         }
  
         if (al.filtered || (mem->hide_unresolved && al.sym == NULL))
-               return 0;
+               goto out_put;
  
         if (al.map != NULL)
                 al.map->dso->hit = 1;
@@ -103,7 +103,8 @@ dump_raw_samples(struct perf_tool *tool,
                 symbol_conf.field_sep,
                 al.map ? (al.map->dso ? al.map->dso->long_name : "???") : "???",
                 al.sym ? al.sym->name : "???");
-
+out_put:
+       addr_location__put(&al);
         return 0;
  }
  
diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c

index f7b1af67e9f686d86f8bd1539b96a935b04a4559..1272559fa22d9eb60367f34594ebd18a7e76e8d3 100644 (file)
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -44,25 +44,19 @@
  
  #define DEFAULT_VAR_FILTER "!__k???tab_* & !__crc_*"
  #define DEFAULT_FUNC_FILTER "!_*"
+#define DEFAULT_LIST_FILTER "*:*"
  
  /* Session management structure */
  static struct {
+       int command;    /* Command short_name */
         bool list_events;
-       bool force_add;
-       bool show_lines;
-       bool show_vars;
-       bool show_ext_vars;
-       bool show_funcs;
-       bool mod_events;
         bool uprobes;
         bool quiet;
         bool target_used;
         int nevents;
         struct perf_probe_event events[MAX_PROBES];
-       struct strlist *dellist;
         struct line_range line_range;
         char *target;
-       int max_probe_points;
         struct strfilter *filter;
  } params;
  
@@ -93,6 +87,28 @@ static int parse_probe_event(const char *str)
         return ret;
  }
  
+/*
+ * Add the filter rule @str to params.filter.
+ *
+ * When no filter exists yet one is created with strfilter__new();
+ * otherwise @str is combined with the existing filter through
+ * strfilter__or().  A failed creation maps to -EINVAL when the parser
+ * reported an error position and to -ENOMEM when it did not.
+ *
+ * On -EINVAL the 1-based error offset, the source string and a '^'
+ * marker line are printed with pr_err().
+ *
+ * Returns 0 on success or the negative error code described above.
+ */
+static int params_add_filter(const char *str)
+{
+       const char *err = NULL;
+       int ret = 0;
+
+       pr_debug2("Add filter: %s\n", str);
+
+       if (params.filter) {
+               ret = strfilter__or(params.filter, str, &err);
+       } else {
+               params.filter = strfilter__new(str, &err);
+               if (!params.filter)
+                       ret = err ? -EINVAL : -ENOMEM;
+       }
+
+       /* only parse errors get the position report below */
+       if (ret != -EINVAL)
+               return ret;
+
+       pr_err("Filter parse error at %td.\n", err - str + 1);
+       pr_err("Source: \"%s\"\n", str);
+       pr_err("         %*c\n", (int)(err - str + 1), '^');
+
+       return ret;
+}
+
  static int set_target(const char *ptr)
  {
         int found = 0;
@@ -152,34 +168,11 @@ static int parse_probe_event_argv(int argc, const char **argv)
  
                 len += sprintf(&buf[len], "%s ", argv[i]);
         }
-       params.mod_events = true;
         ret = parse_probe_event(buf);
         free(buf);
         return ret;
  }
  
-static int opt_add_probe_event(const struct option *opt __maybe_unused,
-                             const char *str, int unset __maybe_unused)
-{
-       if (str) {
-               params.mod_events = true;
-               return parse_probe_event(str);
-       } else
-               return 0;
-}
-
-static int opt_del_probe_event(const struct option *opt __maybe_unused,
-                              const char *str, int unset __maybe_unused)
-{
-       if (str) {
-               params.mod_events = true;
-               if (!params.dellist)
-                       params.dellist = strlist__new(true, NULL);
-               strlist__add(params.dellist, str);
-       }
-       return 0;
-}
-
  static int opt_set_target(const struct option *opt, const char *str,
                         int unset __maybe_unused)
  {
@@ -217,8 +210,10 @@ static int opt_set_target(const struct option *opt, const char *str,
         return ret;
  }
  
+/* Command option callbacks */
+
  #ifdef HAVE_DWARF_SUPPORT
-static int opt_show_lines(const struct option *opt __maybe_unused,
+static int opt_show_lines(const struct option *opt,
                           const char *str, int unset __maybe_unused)
  {
         int ret = 0;
@@ -226,19 +221,19 @@ static int opt_show_lines(const struct option *opt __maybe_unused,
         if (!str)
                 return 0;
  
-       if (params.show_lines) {
+       if (params.command == 'L') {
                 pr_warning("Warning: more than one --line options are"
                            " detected. Only the first one is valid.\n");
                 return 0;
         }
  
-       params.show_lines = true;
+       params.command = opt->short_name;
         ret = parse_line_range_desc(str, &params.line_range);
  
         return ret;
  }
  
-static int opt_show_vars(const struct option *opt __maybe_unused,
+static int opt_show_vars(const struct option *opt,
                          const char *str, int unset __maybe_unused)
  {
         struct perf_probe_event *pev = &params.events[params.nevents];
@@ -252,29 +247,39 @@ static int opt_show_vars(const struct option *opt __maybe_unused,
                 pr_err("  Error: '--vars' doesn't accept arguments.\n");
                 return -EINVAL;
         }
-       params.show_vars = true;
+       params.command = opt->short_name;
  
         return ret;
  }
  #endif
+/*
+ * Option callback for adding a probe event.
+ *
+ * Without an argument nothing happens and 0 is returned.  Otherwise the
+ * option's short name is recorded in params.command and @str is handed to
+ * parse_probe_event(), whose result is returned.
+ */
+static int opt_add_probe_event(const struct option *opt,
+                             const char *str, int unset __maybe_unused)
+{
+       if (!str)
+               return 0;
+
+       params.command = opt->short_name;
+       return parse_probe_event(str);
+}
+
+
+/*
+ * Option callback that both selects a command and feeds its filter.
+ *
+ * Unless the option was given in its negated form (@unset non-zero) the
+ * option's short name is stored in params.command.  A non-NULL @str is
+ * then added as a filter rule via params_add_filter() and that result is
+ * returned; with no @str the callback returns 0.
+ */
+static int opt_set_filter_with_command(const struct option *opt,
+                                      const char *str, int unset)
+{
+       if (unset == 0)
+               params.command = opt->short_name;
+
+       return str ? params_add_filter(str) : 0;
+}
  
  static int opt_set_filter(const struct option *opt __maybe_unused,
                           const char *str, int unset __maybe_unused)
  {
-       const char *err;
-
-       if (str) {
-               pr_debug2("Set filter: %s\n", str);
-               if (params.filter)
-                       strfilter__delete(params.filter);
-               params.filter = strfilter__new(str, &err);
-               if (!params.filter) {
-                       pr_err("Filter parse error at %td.\n", err - str + 1);
-                       pr_err("Source: \"%s\"\n", str);
-                       pr_err("         %*c\n", (int)(err - str + 1), '^');
-                       return -EINVAL;
-               }
-       }
+       if (str)
+               return params_add_filter(str);
  
         return 0;
  }
@@ -290,8 +295,6 @@ static void cleanup_params(void)
  
         for (i = 0; i < params.nevents; i++)
                 clear_perf_probe_event(params.events + i);
-       if (params.dellist)
-               strlist__delete(params.dellist);
         line_range__clear(&params.line_range);
         free(params.target);
         if (params.filter)
@@ -316,22 +319,24 @@ __cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused)
                 "perf probe [<options>] 'PROBEDEF' ['PROBEDEF' ...]",
                 "perf probe [<options>] --add 'PROBEDEF' [--add 'PROBEDEF' ...]",
                 "perf probe [<options>] --del '[GROUP:]EVENT' ...",
-               "perf probe --list",
+               "perf probe --list [GROUP:]EVENT ...",
  #ifdef HAVE_DWARF_SUPPORT
                 "perf probe [<options>] --line 'LINEDESC'",
                 "perf probe [<options>] --vars 'PROBEPOINT'",
  #endif
+               "perf probe [<options>] --funcs",
                 NULL
-};
+       };
         struct option options[] = {
         OPT_INCR('v', "verbose", &verbose,
                     "be more verbose (show parsed arguments, etc)"),
         OPT_BOOLEAN('q', "quiet", &params.quiet,
                     "be quiet (do not show any mesages)"),
-       OPT_BOOLEAN('l', "list", &params.list_events,
-                   "list up current probe events"),
+       OPT_CALLBACK_DEFAULT('l', "list", NULL, "[GROUP:]EVENT",
+                            "list up probe events",
+                            opt_set_filter_with_command, DEFAULT_LIST_FILTER),
         OPT_CALLBACK('d', "del", NULL, "[GROUP:]EVENT", "delete a probe event.",
-               opt_del_probe_event),
+                    opt_set_filter_with_command),
         OPT_CALLBACK('a', "add", NULL,
  #ifdef HAVE_DWARF_SUPPORT
                 "[EVENT=]FUNC[@SRC][+OFF|%return|:RL|;PT]|SRC:AL|SRC;PT"
@@ -356,7 +361,7 @@ __cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused)
                 "\t\tARG:\tProbe argument (kprobe-tracer argument format.)\n",
  #endif
                 opt_add_probe_event),
-       OPT_BOOLEAN('f', "force", &params.force_add, "forcibly add events"
+       OPT_BOOLEAN('f', "force", &probe_conf.force_add, "forcibly add events"
                     " with existing name"),
  #ifdef HAVE_DWARF_SUPPORT
         OPT_CALLBACK('L', "line", NULL,
@@ -365,8 +370,10 @@ __cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused)
         OPT_CALLBACK('V', "vars", NULL,
                      "FUNC[@SRC][+OFF|%return|:RL|;PT]|SRC:AL|SRC;PT",
                      "Show accessible variables on PROBEDEF", opt_show_vars),
-       OPT_BOOLEAN('\0', "externs", &params.show_ext_vars,
+       OPT_BOOLEAN('\0', "externs", &probe_conf.show_ext_vars,
                     "Show external variables too (with --vars only)"),
+       OPT_BOOLEAN('\0', "range", &probe_conf.show_location_range,
+               "Show variables location range in scope (with --vars only)"),
         OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
                    "file", "vmlinux pathname"),
         OPT_STRING('s', "source", &symbol_conf.source_prefix,
@@ -374,12 +381,15 @@ __cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused)
         OPT_CALLBACK('m', "module", NULL, "modname|path",
                 "target module name (for online) or path (for offline)",
                 opt_set_target),
+       OPT_BOOLEAN('\0', "no-inlines", &probe_conf.no_inlines,
+               "Don't search inlined functions"),
  #endif
         OPT__DRY_RUN(&probe_event_dry_run),
-       OPT_INTEGER('\0', "max-probes", &params.max_probe_points,
+       OPT_INTEGER('\0', "max-probes", &probe_conf.max_probes,
                  "Set how many probe points can be found for a probe."),
-       OPT_BOOLEAN('F', "funcs", &params.show_funcs,
-                   "Show potential probe-able functions."),
+       OPT_CALLBACK_DEFAULT('F', "funcs", NULL, "[FILTER]",
+                            "Show potential probe-able functions.",
+                            opt_set_filter_with_command, DEFAULT_FUNC_FILTER),
         OPT_CALLBACK('\0', "filter", NULL,
                      "[!]FILTER", "Set a filter (with --vars/funcs only)\n"
                      "\t\t\t(default: \"" DEFAULT_VAR_FILTER "\" for --vars,\n"
@@ -402,6 +412,7 @@ __cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused)
         set_option_flag(options, 'L', "line", PARSE_OPT_EXCLUSIVE);
         set_option_flag(options, 'V', "vars", PARSE_OPT_EXCLUSIVE);
  #endif
+       set_option_flag(options, 'F', "funcs", PARSE_OPT_EXCLUSIVE);
  
         argc = parse_options(argc, argv, options, probe_usage,
                              PARSE_OPT_STOP_AT_NON_OPTION);
@@ -410,11 +421,16 @@ __cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused)
                         pr_warning("  Error: '-' is not supported.\n");
                         usage_with_options(probe_usage, options);
                 }
+               if (params.command && params.command != 'a') {
+                       pr_warning("  Error: another command except --add is set.\n");
+                       usage_with_options(probe_usage, options);
+               }
                 ret = parse_probe_event_argv(argc, argv);
                 if (ret < 0) {
                         pr_err_with_code("  Error: Command Parse Error.", ret);
                         return ret;
                 }
+               params.command = 'a';
         }
  
         if (params.quiet) {
@@ -425,89 +441,70 @@ __cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused)
                 verbose = -1;
         }
  
-       if (params.max_probe_points == 0)
-               params.max_probe_points = MAX_PROBES;
-
-       if ((!params.nevents && !params.dellist && !params.list_events &&
-            !params.show_lines && !params.show_funcs))
-               usage_with_options(probe_usage, options);
+       if (probe_conf.max_probes == 0)
+               probe_conf.max_probes = MAX_PROBES;
  
         /*
          * Only consider the user's kernel image path if given.
          */
         symbol_conf.try_vmlinux_path = (symbol_conf.vmlinux_name == NULL);
  
-       if (params.list_events) {
+       switch (params.command) {
+       case 'l':
                 if (params.uprobes) {
                         pr_warning("  Error: Don't use --list with --exec.\n");
                         usage_with_options(probe_usage, options);
                 }
-               ret = show_perf_probe_events();
+               ret = show_perf_probe_events(params.filter);
                 if (ret < 0)
                         pr_err_with_code("  Error: Failed to show event list.", ret);
                 return ret;
-       }
-       if (params.show_funcs) {
-               if (!params.filter)
-                       params.filter = strfilter__new(DEFAULT_FUNC_FILTER,
-                                                      NULL);
+       case 'F':
                 ret = show_available_funcs(params.target, params.filter,
                                         params.uprobes);
-               strfilter__delete(params.filter);
-               params.filter = NULL;
                 if (ret < 0)
                         pr_err_with_code("  Error: Failed to show functions.", ret);
                 return ret;
-       }
-
  #ifdef HAVE_DWARF_SUPPORT
-       if (params.show_lines) {
+       case 'L':
                 ret = show_line_range(&params.line_range, params.target,
                                       params.uprobes);
                 if (ret < 0)
                         pr_err_with_code("  Error: Failed to show lines.", ret);
                 return ret;
-       }
-       if (params.show_vars) {
+       case 'V':
                 if (!params.filter)
                         params.filter = strfilter__new(DEFAULT_VAR_FILTER,
                                                        NULL);
  
                 ret = show_available_vars(params.events, params.nevents,
-                                         params.max_probe_points,
-                                         params.target,
-                                         params.filter,
-                                         params.show_ext_vars);
-               strfilter__delete(params.filter);
-               params.filter = NULL;
+                                         params.filter);
                 if (ret < 0)
                         pr_err_with_code("  Error: Failed to show vars.", ret);
                 return ret;
-       }
  #endif
-
-       if (params.dellist) {
-               ret = del_perf_probe_events(params.dellist);
+       case 'd':
+               ret = del_perf_probe_events(params.filter);
                 if (ret < 0) {
                         pr_err_with_code("  Error: Failed to delete events.", ret);
                         return ret;
                 }
-       }
-
-       if (params.nevents) {
+               break;
+       case 'a':
                 /* Ensure the last given target is used */
                 if (params.target && !params.target_used) {
                         pr_warning("  Error: -x/-m must follow the probe definitions.\n");
                         usage_with_options(probe_usage, options);
                 }
  
-               ret = add_perf_probe_events(params.events, params.nevents,
-                                           params.max_probe_points,
-                                           params.force_add);
+               ret = add_perf_probe_events(params.events, params.nevents);
                 if (ret < 0) {
                         pr_err_with_code("  Error: Failed to add events.", ret);
                         return ret;
                 }
+               break;
+       default:
+               usage_with_options(probe_usage, options);
         }
         return 0;
  }
@@ -522,5 +519,5 @@ int cmd_probe(int argc, const char **argv, const char *prefix)
                 cleanup_params();
         }
  
-       return ret;
+       return ret < 0 ? ret : 0;
  }
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c

index c3efdfb630b5b664349ed9e40c41374d3863752d..de165a1b92402ac7a6267bd0a0c5aa30a0053c92 100644 (file)
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -27,6 +27,8 @@
  #include "util/cpumap.h"
  #include "util/thread_map.h"
  #include "util/data.h"
+#include "util/auxtrace.h"
+#include "util/parse-branch-options.h"
  
  #include <unistd.h>
  #include <sched.h>
@@ -38,6 +40,7 @@ struct record {
         struct record_opts      opts;
         u64                     bytes_written;
         struct perf_data_file   file;
+       struct auxtrace_record  *itr;
         struct perf_evlist      *evlist;
         struct perf_session     *session;
         const char              *progname;
@@ -110,9 +113,12 @@ out:
         return rc;
  }
  
-static volatile int done = 0;
+static volatile int done;
  static volatile int signr = -1;
-static volatile int child_finished = 0;
+static volatile int child_finished;
+static volatile int auxtrace_snapshot_enabled;
+static volatile int auxtrace_snapshot_err;
+static volatile int auxtrace_record__snapshot_started;
  
  static void sig_handler(int sig)
  {
@@ -133,6 +139,133 @@ static void record__sig_exit(void)
         raise(signr);
  }
  
+#ifdef HAVE_AUXTRACE_SUPPORT
+
+static int record__process_auxtrace(struct perf_tool *tool,
+                                   union perf_event *event, void *data1,
+                                   size_t len1, void *data2, size_t len2)
+{
+       struct record *rec = container_of(tool, struct record, tool);
+       struct perf_data_file *file = &rec->file;
+       size_t padding;
+       u8 pad[8] = {0};
+
+       if (!perf_data_file__is_pipe(file)) {
+               off_t file_offset;
+               int fd = perf_data_file__fd(file);
+               int err;
+
+               file_offset = lseek(fd, 0, SEEK_CUR);
+               if (file_offset == -1)
+                       return -1;
+               err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
+                                                    event, file_offset);
+               if (err)
+                       return err;
+       }
+
+       /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
+       padding = (len1 + len2) & 7;
+       if (padding)
+               padding = 8 - padding;
+
+       record__write(rec, event, event->header.size);
+       record__write(rec, data1, len1);
+       if (len2)
+               record__write(rec, data2, len2);
+       record__write(rec, &pad, padding);
+
+       return 0;
+}
+
+static int record__auxtrace_mmap_read(struct record *rec,
+                                     struct auxtrace_mmap *mm)
+{
+       int ret;
+
+       ret = auxtrace_mmap__read(mm, rec->itr, &rec->tool,
+                                 record__process_auxtrace);
+       if (ret < 0)
+               return ret;
+
+       if (ret)
+               rec->samples++;
+
+       return 0;
+}
+
+static int record__auxtrace_mmap_read_snapshot(struct record *rec,
+                                              struct auxtrace_mmap *mm)
+{
+       int ret;
+
+       ret = auxtrace_mmap__read_snapshot(mm, rec->itr, &rec->tool,
+                                          record__process_auxtrace,
+                                          rec->opts.auxtrace_snapshot_size);
+       if (ret < 0)
+               return ret;
+
+       if (ret)
+               rec->samples++;
+
+       return 0;
+}
+
+static int record__auxtrace_read_snapshot_all(struct record *rec)
+{
+       int i;
+       int rc = 0;
+
+       for (i = 0; i < rec->evlist->nr_mmaps; i++) {
+               struct auxtrace_mmap *mm =
+                               &rec->evlist->mmap[i].auxtrace_mmap;
+
+               if (!mm->base)
+                       continue;
+
+               if (record__auxtrace_mmap_read_snapshot(rec, mm) != 0) {
+                       rc = -1;
+                       goto out;
+               }
+       }
+out:
+       return rc;
+}
+
+static void record__read_auxtrace_snapshot(struct record *rec)
+{
+       pr_debug("Recording AUX area tracing snapshot\n");
+       if (record__auxtrace_read_snapshot_all(rec) < 0) {
+               auxtrace_snapshot_err = -1;
+       } else {
+               auxtrace_snapshot_err = auxtrace_record__snapshot_finish(rec->itr);
+               if (!auxtrace_snapshot_err)
+                       auxtrace_snapshot_enabled = 1;
+       }
+}
+
+#else
+
+static inline
+int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
+                              struct auxtrace_mmap *mm __maybe_unused)
+{
+       return 0;
+}
+
+static inline
+void record__read_auxtrace_snapshot(struct record *rec __maybe_unused)
+{
+}
+
+static inline
+int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
+{
+       return 0;
+}
+
+#endif
+
  static int record__open(struct record *rec)
  {
         char msg[512];
@@ -169,13 +302,16 @@ try_again:
                 goto out;
         }
  
-       if (perf_evlist__mmap(evlist, opts->mmap_pages, false) < 0) {
+       if (perf_evlist__mmap_ex(evlist, opts->mmap_pages, false,
+                                opts->auxtrace_mmap_pages,
+                                opts->auxtrace_snapshot_mode) < 0) {
                 if (errno == EPERM) {
                         pr_err("Permission error mapping pages.\n"
                                "Consider increasing "
                                "/proc/sys/kernel/perf_event_mlock_kb,\n"
                                "or try again with a smaller value of -m/--mmap_pages.\n"
-                              "(current value: %u)\n", opts->mmap_pages);
+                              "(current value: %u,%u)\n",
+                              opts->mmap_pages, opts->auxtrace_mmap_pages);
                         rc = -errno;
                 } else {
                         pr_err("failed to mmap with %d (%s)\n", errno,
@@ -209,12 +345,9 @@ static int process_buildids(struct record *rec)
         struct perf_data_file *file  = &rec->file;
         struct perf_session *session = rec->session;
  
-       u64 size = lseek(perf_data_file__fd(file), 0, SEEK_CUR);
-       if (size == 0)
+       if (file->size == 0)
                 return 0;
  
-       file->size = size;
-
         /*
          * During this process, it'll load kernel map and replace the
          * dso->long_name to a real pathname it found.  In this case
@@ -270,12 +403,20 @@ static int record__mmap_read_all(struct record *rec)
         int rc = 0;
  
         for (i = 0; i < rec->evlist->nr_mmaps; i++) {
+               struct auxtrace_mmap *mm = &rec->evlist->mmap[i].auxtrace_mmap;
+
                 if (rec->evlist->mmap[i].base) {
                         if (record__mmap_read(rec, i) != 0) {
                                 rc = -1;
                                 goto out;
                         }
                 }
+
+               if (mm->base && !rec->opts.auxtrace_snapshot_mode &&
+                   record__auxtrace_mmap_read(rec, mm) != 0) {
+                       rc = -1;
+                       goto out;
+               }
         }
  
         /*
@@ -305,6 +446,9 @@ static void record__init_features(struct record *rec)
  
         if (!rec->opts.branch_stack)
                 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
+
+       if (!rec->opts.full_auxtrace)
+               perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
  }
  
  static volatile int workload_exec_errno;
@@ -323,6 +467,8 @@ static void workload_exec_failed_signal(int signo __maybe_unused,
         child_finished = 1;
  }
  
+static void snapshot_sig_handler(int sig);
+
  static int __cmd_record(struct record *rec, int argc, const char **argv)
  {
         int err;
@@ -343,6 +489,10 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
         signal(SIGCHLD, sig_handler);
         signal(SIGINT, sig_handler);
         signal(SIGTERM, sig_handler);
+       if (rec->opts.auxtrace_snapshot_mode)
+               signal(SIGUSR2, snapshot_sig_handler);
+       else
+               signal(SIGUSR2, SIG_IGN);
  
         session = perf_session__new(file, false, tool);
         if (session == NULL) {
@@ -421,6 +571,13 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
                 }
         }
  
+       if (rec->opts.full_auxtrace) {
+               err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
+                                       session, process_synthesized_event);
+               if (err)
+                       goto out_delete_session;
+       }
+
         err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
                                                  machine);
         if (err < 0)
@@ -441,7 +598,8 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
         }
  
         err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads,
-                                           process_synthesized_event, opts->sample_address);
+                                           process_synthesized_event, opts->sample_address,
+                                           opts->proc_map_timeout);
         if (err != 0)
                 goto out_child;
  
@@ -475,14 +633,27 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
                 perf_evlist__enable(rec->evlist);
         }
  
+       auxtrace_snapshot_enabled = 1;
         for (;;) {
                 int hits = rec->samples;
  
                 if (record__mmap_read_all(rec) < 0) {
+                       auxtrace_snapshot_enabled = 0;
                         err = -1;
                         goto out_child;
                 }
  
+               if (auxtrace_record__snapshot_started) {
+                       auxtrace_record__snapshot_started = 0;
+                       if (!auxtrace_snapshot_err)
+                               record__read_auxtrace_snapshot(rec);
+                       if (auxtrace_snapshot_err) {
+                               pr_err("AUX area tracing snapshot failed\n");
+                               err = -1;
+                               goto out_child;
+                       }
+               }
+
                 if (hits == rec->samples) {
                         if (done || draining)
                                 break;
@@ -505,10 +676,12 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
                  * disable events in this case.
                  */
                 if (done && !disabled && !target__none(&opts->target)) {
+                       auxtrace_snapshot_enabled = 0;
                         perf_evlist__disable(rec->evlist);
                         disabled = true;
                 }
         }
+       auxtrace_snapshot_enabled = 0;
  
         if (forks && workload_exec_errno) {
                 char msg[STRERR_BUFSIZE];
@@ -544,16 +717,25 @@ out_child:
  
         if (!err && !file->is_pipe) {
                 rec->session->header.data_size += rec->bytes_written;
+               file->size = lseek(perf_data_file__fd(file), 0, SEEK_CUR);
  
-               if (!rec->no_buildid)
+               if (!rec->no_buildid) {
                         process_buildids(rec);
+                       /*
+                        * We take all buildids when the file contains
+                        * AUX area tracing data because we do not decode the
+                        * trace because it would take too long.
+                        */
+                       if (rec->opts.full_auxtrace)
+                               dsos__hit_all(rec->session);
+               }
                 perf_session__write_header(rec->session, rec->evlist, fd, true);
         }
  
         if (!err && !quiet) {
                 char samples[128];
  
-               if (rec->samples)
+               if (rec->samples && !rec->opts.full_auxtrace)
                         scnprintf(samples, sizeof(samples),
                                   " (%" PRIu64 " samples)", rec->samples);
                 else
@@ -569,94 +751,6 @@ out_delete_session:
         return status;
  }
  
-#define BRANCH_OPT(n, m) \
-       { .name = n, .mode = (m) }
-
-#define BRANCH_END { .name = NULL }
-
-struct branch_mode {
-       const char *name;
-       int mode;
-};
-
-static const struct branch_mode branch_modes[] = {
-       BRANCH_OPT("u", PERF_SAMPLE_BRANCH_USER),
-       BRANCH_OPT("k", PERF_SAMPLE_BRANCH_KERNEL),
-       BRANCH_OPT("hv", PERF_SAMPLE_BRANCH_HV),
-       BRANCH_OPT("any", PERF_SAMPLE_BRANCH_ANY),
-       BRANCH_OPT("any_call", PERF_SAMPLE_BRANCH_ANY_CALL),
-       BRANCH_OPT("any_ret", PERF_SAMPLE_BRANCH_ANY_RETURN),
-       BRANCH_OPT("ind_call", PERF_SAMPLE_BRANCH_IND_CALL),
-       BRANCH_OPT("abort_tx", PERF_SAMPLE_BRANCH_ABORT_TX),
-       BRANCH_OPT("in_tx", PERF_SAMPLE_BRANCH_IN_TX),
-       BRANCH_OPT("no_tx", PERF_SAMPLE_BRANCH_NO_TX),
-       BRANCH_OPT("cond", PERF_SAMPLE_BRANCH_COND),
-       BRANCH_END
-};
-
-static int
-parse_branch_stack(const struct option *opt, const char *str, int unset)
-{
-#define ONLY_PLM \
-       (PERF_SAMPLE_BRANCH_USER        |\
-        PERF_SAMPLE_BRANCH_KERNEL      |\
-        PERF_SAMPLE_BRANCH_HV)
-
-       uint64_t *mode = (uint64_t *)opt->value;
-       const struct branch_mode *br;
-       char *s, *os = NULL, *p;
-       int ret = -1;
-
-       if (unset)
-               return 0;
-
-       /*
-        * cannot set it twice, -b + --branch-filter for instance
-        */
-       if (*mode)
-               return -1;
-
-       /* str may be NULL in case no arg is passed to -b */
-       if (str) {
-               /* because str is read-only */
-               s = os = strdup(str);
-               if (!s)
-                       return -1;
-
-               for (;;) {
-                       p = strchr(s, ',');
-                       if (p)
-                               *p = '\0';
-
-                       for (br = branch_modes; br->name; br++) {
-                               if (!strcasecmp(s, br->name))
-                                       break;
-                       }
-                       if (!br->name) {
-                               ui__warning("unknown branch filter %s,"
-                                           " check man page\n", s);
-                               goto error;
-                       }
-
-                       *mode |= br->mode;
-
-                       if (!p)
-                               break;
-
-                       s = p + 1;
-               }
-       }
-       ret = 0;
-
-       /* default to any branch */
-       if ((*mode & ~ONLY_PLM) == 0) {
-               *mode = PERF_SAMPLE_BRANCH_ANY;
-       }
-error:
-       free(os);
-       return ret;
-}
-
  static void callchain_debug(void)
  {
         static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
@@ -795,6 +889,49 @@ static int parse_clockid(const struct option *opt, const char *str, int unset)
         return -1;
  }
  
+static int record__parse_mmap_pages(const struct option *opt,
+                                   const char *str,
+                                   int unset __maybe_unused)
+{
+       struct record_opts *opts = opt->value;
+       char *s, *p;
+       unsigned int mmap_pages;
+       int ret;
+
+       if (!str)
+               return -EINVAL;
+
+       s = strdup(str);
+       if (!s)
+               return -ENOMEM;
+
+       p = strchr(s, ',');
+       if (p)
+               *p = '\0';
+
+       if (*s) {
+               ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
+               if (ret)
+                       goto out_free;
+               opts->mmap_pages = mmap_pages;
+       }
+
+       if (!p) {
+               ret = 0;
+               goto out_free;
+       }
+
+       ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
+       if (ret)
+               goto out_free;
+
+       opts->auxtrace_mmap_pages = mmap_pages;
+
+out_free:
+       free(s);
+       return ret;
+}
+
  static const char * const __record_usage[] = {
         "perf record [<options>] [<command>]",
         "perf record [<options>] -- <command> [<options>]",
@@ -823,6 +960,7 @@ static struct record record = {
                         .uses_mmap   = true,
                         .default_per_cpu = true,
                 },
+               .proc_map_timeout     = 500,
         },
         .tool = {
                 .sample         = process_sample_event,
@@ -875,9 +1013,9 @@ struct option __record_options[] = {
                         &record.opts.no_inherit_set,
                         "child tasks do not inherit counters"),
         OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
-       OPT_CALLBACK('m', "mmap-pages", &record.opts.mmap_pages, "pages",
-                    "number of mmap data pages",
-                    perf_evlist__parse_mmap_pages),
+       OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
+                    "number of mmap data pages and AUX area tracing mmap pages",
+                    record__parse_mmap_pages),
         OPT_BOOLEAN(0, "group", &record.opts.group,
                     "put the counters into a counter group"),
         OPT_CALLBACK_NOOPT('g', NULL, &record.opts,
@@ -891,10 +1029,9 @@ struct option __record_options[] = {
         OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
         OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
                     "per thread counts"),
-       OPT_BOOLEAN('d', "data", &record.opts.sample_address,
-                   "Sample addresses"),
-       OPT_BOOLEAN('T', "timestamp", &record.opts.sample_time, "Sample timestamps"),
-       OPT_BOOLEAN('P', "period", &record.opts.period, "Sample period"),
+       OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
+       OPT_BOOLEAN('T', "timestamp", &record.opts.sample_time, "Record the sample timestamps"),
+       OPT_BOOLEAN('P', "period", &record.opts.period, "Record the sample period"),
         OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
                     "don't sample"),
         OPT_BOOLEAN('N', "no-buildid-cache", &record.no_buildid_cache,
@@ -929,6 +1066,10 @@ struct option __record_options[] = {
         OPT_CALLBACK('k', "clockid", &record.opts,
         "clockid", "clockid to use for events, see clock_gettime()",
         parse_clockid),
+       OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
+                         "opts", "AUX area tracing Snapshot Mode", ""),
+       OPT_UINTEGER(0, "proc-map-timeout", &record.opts.proc_map_timeout,
+                       "per thread proc mmap processing timeout in ms"),
         OPT_END()
  };
  
@@ -936,7 +1077,7 @@ struct option *record_options = __record_options;
  
  int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
  {
-       int err = -ENOMEM;
+       int err;
         struct record *rec = &record;
         char errbuf[BUFSIZ];
  
@@ -957,6 +1098,19 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
                 usage_with_options(record_usage, record_options);
         }
  
+       if (!rec->itr) {
+               rec->itr = auxtrace_record__init(rec->evlist, &err);
+               if (err)
+                       return err;
+       }
+
+       err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
+                                             rec->opts.auxtrace_snapshot_opts);
+       if (err)
+               return err;
+
+       err = -ENOMEM;
+
         symbol__init(NULL);
  
         if (symbol_conf.kptr_restrict)
@@ -1002,6 +1156,10 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
         if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
                 usage_with_options(record_usage, record_options);
  
+       err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
+       if (err)
+               goto out_symbol_exit;
+
         if (record_opts__config(&rec->opts)) {
                 err = -EINVAL;
                 goto out_symbol_exit;
@@ -1011,5 +1169,15 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
  out_symbol_exit:
         perf_evlist__delete(rec->evlist);
         symbol__exit();
+       auxtrace_record__free(rec->itr);
         return err;
  }
+
+static void snapshot_sig_handler(int sig __maybe_unused)
+{
+       if (!auxtrace_snapshot_enabled)
+               return;
+       auxtrace_snapshot_enabled = 0;
+       auxtrace_snapshot_err = auxtrace_record__snapshot_start(record.itr);
+       auxtrace_record__snapshot_started = 1;
+}
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c

index b63aeda719be0c7604da5229e1a3a0ec33253400..32626ea3e2276b11279db88207e42c29eeed391a 100644 (file)
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -36,6 +36,8 @@
  #include "util/data.h"
  #include "arch/common.h"
  
+#include "util/auxtrace.h"
+
  #include <dlfcn.h>
  #include <linux/bitmap.h>
  
@@ -137,10 +139,12 @@ static int process_sample_event(struct perf_tool *tool,
         struct report *rep = container_of(tool, struct report, tool);
         struct addr_location al;
         struct hist_entry_iter iter = {
-               .hide_unresolved = rep->hide_unresolved,
-               .add_entry_cb = hist_iter__report_callback,
+               .evsel                  = evsel,
+               .sample                 = sample,
+               .hide_unresolved        = rep->hide_unresolved,
+               .add_entry_cb           = hist_iter__report_callback,
         };
-       int ret;
+       int ret = 0;
  
         if (perf_event__preprocess_sample(event, machine, &al, sample) < 0) {
                 pr_debug("problem processing %d event, skipping it.\n",
@@ -149,10 +153,10 @@ static int process_sample_event(struct perf_tool *tool,
         }
  
         if (rep->hide_unresolved && al.sym == NULL)
-               return 0;
+               goto out_put;
  
         if (rep->cpu_list && !test_bit(sample->cpu, rep->cpu_bitmap))
-               return 0;
+               goto out_put;
  
         if (sort__mode == SORT_MODE__BRANCH)
                 iter.ops = &hist_iter_branch;
@@ -166,11 +170,11 @@ static int process_sample_event(struct perf_tool *tool,
         if (al.map != NULL)
                 al.map->dso->hit = 1;
  
-       ret = hist_entry_iter__add(&iter, &al, evsel, sample, rep->max_stack,
-                                  rep);
+       ret = hist_entry_iter__add(&iter, &al, rep->max_stack, rep);
         if (ret < 0)
                 pr_debug("problem adding hist entry, skipping event\n");
-
+out_put:
+       addr_location__put(&al);
         return ret;
  }
  
@@ -316,6 +320,7 @@ static int perf_evlist__tty_browse_hists(struct perf_evlist *evlist,
  {
         struct perf_evsel *pos;
  
+       fprintf(stdout, "#\n# Total Lost Samples: %" PRIu64 "\n#\n", evlist->stats.total_lost_samples);
         evlist__for_each(evlist, pos) {
                 struct hists *hists = evsel__hists(pos);
                 const char *evname = perf_evsel__name(pos);
@@ -330,15 +335,14 @@ static int perf_evlist__tty_browse_hists(struct perf_evlist *evlist,
         }
  
         if (sort_order == NULL &&
-           parent_pattern == default_parent_pattern) {
+           parent_pattern == default_parent_pattern)
                 fprintf(stdout, "#\n# (%s)\n#\n", help);
  
-               if (rep->show_threads) {
-                       bool style = !strcmp(rep->pretty_printing_style, "raw");
-                       perf_read_values_display(stdout, &rep->show_threads_values,
-                                                style);
-                       perf_read_values_destroy(&rep->show_threads_values);
-               }
+       if (rep->show_threads) {
+               bool style = !strcmp(rep->pretty_printing_style, "raw");
+               perf_read_values_display(stdout, &rep->show_threads_values,
+                                        style);
+               perf_read_values_destroy(&rep->show_threads_values);
         }
  
         return 0;
@@ -585,6 +589,7 @@ parse_percent_limit(const struct option *opt, const char *str,
  int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
  {
         struct perf_session *session;
+       struct itrace_synth_opts itrace_synth_opts = { .set = 0, };
         struct stat st;
         bool has_br_stack = false;
         int branch_mode = -1;
@@ -607,6 +612,9 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
                         .attr            = perf_event__process_attr,
                         .tracing_data    = perf_event__process_tracing_data,
                         .build_id        = perf_event__process_build_id,
+                       .id_index        = perf_event__process_id_index,
+                       .auxtrace_info   = perf_event__process_auxtrace_info,
+                       .auxtrace        = perf_event__process_auxtrace,
                         .ordered_events  = true,
                         .ordering_requires_timestamps = true,
                 },
@@ -717,6 +725,9 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
                      "Don't show entries under that percent", parse_percent_limit),
         OPT_CALLBACK(0, "percentage", NULL, "relative|absolute",
                      "how to display percentage of filtered entries", parse_filter_percentage),
+       OPT_CALLBACK_OPTARG(0, "itrace", &itrace_synth_opts, NULL, "opts",
+                           "Instruction Tracing options",
+                           itrace_parse_synth_opts),
         OPT_END()
         };
         struct perf_data_file file = {
@@ -761,6 +772,8 @@ repeat:
                                                report.queue_size);
         }
  
+       session->itrace_synth_opts = &itrace_synth_opts;
+
         report.session = session;
  
         has_br_stack = perf_header__has_feat(&session->header,
@@ -803,8 +816,8 @@ repeat:
                 goto error;
         }
  
-       /* Force tty output for header output. */
-       if (report.header || report.header_only)
+       /* Force tty output for header output and per-thread stat. */
+       if (report.header || report.header_only || report.show_threads)
                 use_browser = 0;
  
         if (strcmp(input_name, "-") != 0)
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c

index 5275bab703138cbeb9c40f1ff22174ac52ca2d13..33962612a5e9035ae42c83e15497a5713922556b 100644 (file)
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -95,6 +95,7 @@ struct work_atoms {
         u64                     total_lat;
         u64                     nb_atoms;
         u64                     total_runtime;
+       int                     num_merged;
  };
  
  typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);
@@ -168,9 +169,10 @@ struct perf_sched {
         u64              all_runtime;
         u64              all_count;
         u64              cpu_last_switched[MAX_CPUS];
-       struct rb_root   atom_root, sorted_atom_root;
+       struct rb_root   atom_root, sorted_atom_root, merged_atom_root;
         struct list_head sort_list, cmp_pid;
         bool force;
+       bool skip_merge;
  };
  
  static u64 get_nsecs(void)
@@ -770,7 +772,7 @@ static int replay_fork_event(struct perf_sched *sched,
         if (child == NULL || parent == NULL) {
                 pr_debug("thread does not exist on fork event: child %p, parent %p\n",
                                  child, parent);
-               return 0;
+               goto out_put;
         }
  
         if (verbose) {
@@ -781,6 +783,9 @@ static int replay_fork_event(struct perf_sched *sched,
  
         register_pid(sched, parent->tid, thread__comm_str(parent));
         register_pid(sched, child->tid, thread__comm_str(child));
+out_put:
+       thread__put(child);
+       thread__put(parent);
         return 0;
  }
  
@@ -957,7 +962,7 @@ static int latency_switch_event(struct perf_sched *sched,
         struct work_atoms *out_events, *in_events;
         struct thread *sched_out, *sched_in;
         u64 timestamp0, timestamp = sample->time;
-       int cpu = sample->cpu;
+       int cpu = sample->cpu, err = -1;
         s64 delta;
  
         BUG_ON(cpu >= MAX_CPUS || cpu < 0);
@@ -976,15 +981,17 @@ static int latency_switch_event(struct perf_sched *sched,
  
         sched_out = machine__findnew_thread(machine, -1, prev_pid);
         sched_in = machine__findnew_thread(machine, -1, next_pid);
+       if (sched_out == NULL || sched_in == NULL)
+               goto out_put;
  
         out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
         if (!out_events) {
                 if (thread_atoms_insert(sched, sched_out))
-                       return -1;
+                       goto out_put;
                 out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
                 if (!out_events) {
                         pr_err("out-event: Internal tree error");
-                       return -1;
+                       goto out_put;
                 }
         }
         if (add_sched_out_event(out_events, sched_out_state(prev_state), timestamp))
@@ -993,22 +1000,25 @@ static int latency_switch_event(struct perf_sched *sched,
         in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
         if (!in_events) {
                 if (thread_atoms_insert(sched, sched_in))
-                       return -1;
+                       goto out_put;
                 in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
                 if (!in_events) {
                         pr_err("in-event: Internal tree error");
-                       return -1;
+                       goto out_put;
                 }
                 /*
                  * Take came in we have not heard about yet,
                  * add in an initial atom in runnable state:
                  */
                 if (add_sched_out_event(in_events, 'R', timestamp))
-                       return -1;
+                       goto out_put;
         }
         add_sched_in_event(in_events, timestamp);
-
-       return 0;
+       err = 0;
+out_put:
+       thread__put(sched_out);
+       thread__put(sched_in);
+       return err;
  }
  
  static int latency_runtime_event(struct perf_sched *sched,
@@ -1021,23 +1031,29 @@ static int latency_runtime_event(struct perf_sched *sched,
         struct thread *thread = machine__findnew_thread(machine, -1, pid);
         struct work_atoms *atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
         u64 timestamp = sample->time;
-       int cpu = sample->cpu;
+       int cpu = sample->cpu, err = -1;
+
+       if (thread == NULL)
+               return -1;
  
         BUG_ON(cpu >= MAX_CPUS || cpu < 0);
         if (!atoms) {
                 if (thread_atoms_insert(sched, thread))
-                       return -1;
+                       goto out_put;
                 atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
                 if (!atoms) {
                         pr_err("in-event: Internal tree error");
-                       return -1;
+                       goto out_put;
                 }
                 if (add_sched_out_event(atoms, 'R', timestamp))
-                       return -1;
+                       goto out_put;
         }
  
         add_runtime_event(atoms, runtime, timestamp);
-       return 0;
+       err = 0;
+out_put:
+       thread__put(thread);
+       return err;
  }
  
  static int latency_wakeup_event(struct perf_sched *sched,
@@ -1050,19 +1066,22 @@ static int latency_wakeup_event(struct perf_sched *sched,
         struct work_atom *atom;
         struct thread *wakee;
         u64 timestamp = sample->time;
+       int err = -1;
  
         wakee = machine__findnew_thread(machine, -1, pid);
+       if (wakee == NULL)
+               return -1;
         atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
         if (!atoms) {
                 if (thread_atoms_insert(sched, wakee))
-                       return -1;
+                       goto out_put;
                 atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
                 if (!atoms) {
                         pr_err("wakeup-event: Internal tree error");
-                       return -1;
+                       goto out_put;
                 }
                 if (add_sched_out_event(atoms, 'S', timestamp))
-                       return -1;
+                       goto out_put;
         }
  
         BUG_ON(list_empty(&atoms->work_list));
@@ -1081,17 +1100,21 @@ static int latency_wakeup_event(struct perf_sched *sched,
          * skip in this case.
          */
         if (sched->profile_cpu == -1 && atom->state != THREAD_SLEEPING)
-               return 0;
+               goto out_ok;
  
         sched->nr_timestamps++;
         if (atom->sched_out_time > timestamp) {
                 sched->nr_unordered_timestamps++;
-               return 0;
+               goto out_ok;
         }
  
         atom->state = THREAD_WAIT_CPU;
         atom->wake_up_time = timestamp;
-       return 0;
+out_ok:
+       err = 0;
+out_put:
+       thread__put(wakee);
+       return err;
  }
  
  static int latency_migrate_task_event(struct perf_sched *sched,
@@ -1104,6 +1127,7 @@ static int latency_migrate_task_event(struct perf_sched *sched,
         struct work_atoms *atoms;
         struct work_atom *atom;
         struct thread *migrant;
+       int err = -1;
  
         /*
          * Only need to worry about migration when profiling one CPU.
@@ -1112,18 +1136,20 @@ static int latency_migrate_task_event(struct perf_sched *sched,
                 return 0;
  
         migrant = machine__findnew_thread(machine, -1, pid);
+       if (migrant == NULL)
+               return -1;
         atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
         if (!atoms) {
                 if (thread_atoms_insert(sched, migrant))
-                       return -1;
+                       goto out_put;
                 register_pid(sched, migrant->tid, thread__comm_str(migrant));
                 atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
                 if (!atoms) {
                         pr_err("migration-event: Internal tree error");
-                       return -1;
+                       goto out_put;
                 }
                 if (add_sched_out_event(atoms, 'R', timestamp))
-                       return -1;
+                       goto out_put;
         }
  
         BUG_ON(list_empty(&atoms->work_list));
@@ -1135,8 +1161,10 @@ static int latency_migrate_task_event(struct perf_sched *sched,
  
         if (atom->sched_out_time > timestamp)
                 sched->nr_unordered_timestamps++;
-
-       return 0;
+       err = 0;
+out_put:
+       thread__put(migrant);
+       return err;
  }
  
  static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_list)
@@ -1156,7 +1184,10 @@ static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_
         sched->all_runtime += work_list->total_runtime;
         sched->all_count   += work_list->nb_atoms;
  
-       ret = printf("  %s:%d ", thread__comm_str(work_list->thread), work_list->thread->tid);
+       if (work_list->num_merged > 1)
+               ret = printf("  %s:(%d) ", thread__comm_str(work_list->thread), work_list->num_merged);
+       else
+               ret = printf("  %s:%d ", thread__comm_str(work_list->thread), work_list->thread->tid);
  
         for (i = 0; i < 24 - ret; i++)
                 printf(" ");
@@ -1276,17 +1307,22 @@ static int sort_dimension__add(const char *tok, struct list_head *list)
  static void perf_sched__sort_lat(struct perf_sched *sched)
  {
         struct rb_node *node;
-
+       struct rb_root *root = &sched->atom_root;
+again:
         for (;;) {
                 struct work_atoms *data;
-               node = rb_first(&sched->atom_root);
+               node = rb_first(root);
                 if (!node)
                         break;
  
-               rb_erase(node, &sched->atom_root);
+               rb_erase(node, root);
                 data = rb_entry(node, struct work_atoms, node);
                 __thread_latency_insert(&sched->sorted_atom_root, data, &sched->sort_list);
         }
+       if (root == &sched->atom_root) {
+               root = &sched->merged_atom_root;
+               goto again;
+       }
  }
  
  static int process_sched_wakeup_event(struct perf_tool *tool,
@@ -1330,8 +1366,10 @@ static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel,
         }
  
         sched_in = machine__findnew_thread(machine, -1, next_pid);
+       if (sched_in == NULL)
+               return -1;
  
-       sched->curr_thread[this_cpu] = sched_in;
+       sched->curr_thread[this_cpu] = thread__get(sched_in);
  
         printf("  ");
  
@@ -1381,6 +1419,8 @@ static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel,
                 printf("\n");
         }
  
+       thread__put(sched_in);
+
         return 0;
  }
  
@@ -1542,6 +1582,59 @@ static void print_bad_events(struct perf_sched *sched)
         }
  }
  
+static void __merge_work_atoms(struct rb_root *root, struct work_atoms *data)
+{
+       struct rb_node **new = &(root->rb_node), *parent = NULL;
+       struct work_atoms *this;
+       const char *comm = thread__comm_str(data->thread), *this_comm;
+
+       while (*new) {
+               int cmp;
+
+               this = container_of(*new, struct work_atoms, node);
+               parent = *new;
+
+               this_comm = thread__comm_str(this->thread);
+               cmp = strcmp(comm, this_comm);
+               if (cmp > 0) {
+                       new = &((*new)->rb_left);
+               } else if (cmp < 0) {
+                       new = &((*new)->rb_right);
+               } else {
+                       this->num_merged++;
+                       this->total_runtime += data->total_runtime;
+                       this->nb_atoms += data->nb_atoms;
+                       this->total_lat += data->total_lat;
+                       list_splice(&data->work_list, &this->work_list);
+                       if (this->max_lat < data->max_lat) {
+                               this->max_lat = data->max_lat;
+                               this->max_lat_at = data->max_lat_at;
+                       }
+                       zfree(&data);
+                       return;
+               }
+       }
+
+       data->num_merged++;
+       rb_link_node(&data->node, parent, new);
+       rb_insert_color(&data->node, root);
+}
+
+static void perf_sched__merge_lat(struct perf_sched *sched)
+{
+       struct work_atoms *data;
+       struct rb_node *node;
+
+       if (sched->skip_merge)
+               return;
+
+       while ((node = rb_first(&sched->atom_root))) {
+               rb_erase(node, &sched->atom_root);
+               data = rb_entry(node, struct work_atoms, node);
+               __merge_work_atoms(&sched->merged_atom_root, data);
+       }
+}
+
  static int perf_sched__lat(struct perf_sched *sched)
  {
         struct rb_node *next;
@@ -1551,6 +1644,7 @@ static int perf_sched__lat(struct perf_sched *sched)
         if (perf_sched__read_events(sched))
                 return -1;
  
+       perf_sched__merge_lat(sched);
         perf_sched__sort_lat(sched);
  
         printf("\n -----------------------------------------------------------------------------------------------------------------\n");
@@ -1702,6 +1796,7 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
                 .profile_cpu          = -1,
                 .next_shortname1      = 'A',
                 .next_shortname2      = '0',
+               .skip_merge           = 0,
         };
         const struct option latency_options[] = {
         OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]",
@@ -1712,6 +1807,8 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
                     "CPU to profile on"),
         OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
                     "dump raw trace in ASCII"),
+       OPT_BOOLEAN('p', "pids", &sched.skip_merge,
+                   "latency stats per pid instead of per comm"),
         OPT_END()
         };
         const struct option replay_options[] = {
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c

index 58f10b8e6ff20d51429634b8f79628188fbc2dd4..24809787369f5a1303451de5264798d0e31792b6 100644 (file)
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -16,6 +16,7 @@
  #include "util/evsel.h"
  #include "util/sort.h"
  #include "util/data.h"
+#include "util/auxtrace.h"
  #include <linux/bitmap.h>
  
  static char const              *script_name;
@@ -26,6 +27,7 @@ static u64                    nr_unordered;
  static bool                    no_callchain;
  static bool                    latency_format;
  static bool                    system_wide;
+static bool                    print_flags;
  static const char              *cpu_list;
  static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
  
@@ -146,9 +148,10 @@ static const char *output_field2str(enum perf_output_field field)
  
  #define PRINT_FIELD(x)  (output[attr->type].fields & PERF_OUTPUT_##x)
  
-static int perf_evsel__check_stype(struct perf_evsel *evsel,
-                                  u64 sample_type, const char *sample_msg,
-                                  enum perf_output_field field)
+static int perf_evsel__do_check_stype(struct perf_evsel *evsel,
+                                     u64 sample_type, const char *sample_msg,
+                                     enum perf_output_field field,
+                                     bool allow_user_set)
  {
         struct perf_event_attr *attr = &evsel->attr;
         int type = attr->type;
@@ -158,6 +161,8 @@ static int perf_evsel__check_stype(struct perf_evsel *evsel,
                 return 0;
  
         if (output[type].user_set) {
+               if (allow_user_set)
+                       return 0;
                 evname = perf_evsel__name(evsel);
                 pr_err("Samples for '%s' event do not have %s attribute set. "
                        "Cannot print '%s' field.\n",
@@ -175,10 +180,22 @@ static int perf_evsel__check_stype(struct perf_evsel *evsel,
         return 0;
  }
  
+static int perf_evsel__check_stype(struct perf_evsel *evsel,
+                                  u64 sample_type, const char *sample_msg,
+                                  enum perf_output_field field)
+{
+       return perf_evsel__do_check_stype(evsel, sample_type, sample_msg, field,
+                                         false);
+}
+
  static int perf_evsel__check_attr(struct perf_evsel *evsel,
                                   struct perf_session *session)
  {
         struct perf_event_attr *attr = &evsel->attr;
+       bool allow_user_set;
+
+       allow_user_set = perf_header__has_feat(&session->header,
+                                              HEADER_AUXTRACE);
  
         if (PRINT_FIELD(TRACE) &&
                 !perf_session__has_traces(session, "record -R"))
@@ -191,8 +208,8 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel,
         }
  
         if (PRINT_FIELD(ADDR) &&
-               perf_evsel__check_stype(evsel, PERF_SAMPLE_ADDR, "ADDR",
-                                       PERF_OUTPUT_ADDR))
+               perf_evsel__do_check_stype(evsel, PERF_SAMPLE_ADDR, "ADDR",
+                                          PERF_OUTPUT_ADDR, allow_user_set))
                 return -EINVAL;
  
         if (PRINT_FIELD(SYM) && !PRINT_FIELD(IP) && !PRINT_FIELD(ADDR)) {
@@ -229,8 +246,8 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel,
                 return -EINVAL;
  
         if (PRINT_FIELD(CPU) &&
-               perf_evsel__check_stype(evsel, PERF_SAMPLE_CPU, "CPU",
-                                       PERF_OUTPUT_CPU))
+               perf_evsel__do_check_stype(evsel, PERF_SAMPLE_CPU, "CPU",
+                                          PERF_OUTPUT_CPU, allow_user_set))
                 return -EINVAL;
  
         if (PRINT_FIELD(PERIOD) &&
@@ -445,6 +462,25 @@ static void print_sample_bts(union perf_event *event,
         printf("\n");
  }
  
+static void print_sample_flags(u32 flags)
+{
+       const char *chars = PERF_IP_FLAG_CHARS;
+       const int n = strlen(PERF_IP_FLAG_CHARS);
+       char str[33];
+       int i, pos = 0;
+
+       for (i = 0; i < n; i++, flags >>= 1) {
+               if (flags & 1)
+                       str[pos++] = chars[i];
+       }
+       for (; i < 32; i++, flags >>= 1) {
+               if (flags & 1)
+                       str[pos++] = '?';
+       }
+       str[pos] = 0;
+       printf("  %-4s ", str);
+}
+
  static void process_event(union perf_event *event, struct perf_sample *sample,
                           struct perf_evsel *evsel, struct addr_location *al)
  {
@@ -464,6 +500,9 @@ static void process_event(union perf_event *event, struct perf_sample *sample,
                 printf("%s: ", evname ? evname : "[unknown]");
         }
  
+       if (print_flags)
+               print_sample_flags(sample->flags);
+
         if (is_bts_event(attr)) {
                 print_sample_bts(event, sample, evsel, thread, al);
                 return;
@@ -568,13 +607,14 @@ static int process_sample_event(struct perf_tool *tool __maybe_unused,
         }
  
         if (al.filtered)
-               return 0;
+               goto out_put;
  
         if (cpu_list && !test_bit(sample->cpu, cpu_bitmap))
-               return 0;
+               goto out_put;
  
         scripting_ops->process_event(event, sample, evsel, &al);
-
+out_put:
+       addr_location__put(&al);
         return 0;
  }
  
@@ -642,8 +682,8 @@ static int process_comm_event(struct perf_tool *tool,
         print_sample_start(sample, thread, evsel);
         perf_event__fprintf(event, stdout);
         ret = 0;
-
  out:
+       thread__put(thread);
         return ret;
  }
  
@@ -674,6 +714,7 @@ static int process_fork_event(struct perf_tool *tool,
         }
         print_sample_start(sample, thread, evsel);
         perf_event__fprintf(event, stdout);
+       thread__put(thread);
  
         return 0;
  }
@@ -682,6 +723,7 @@ static int process_exit_event(struct perf_tool *tool,
                               struct perf_sample *sample,
                               struct machine *machine)
  {
+       int err = 0;
         struct thread *thread;
         struct perf_script *script = container_of(tool, struct perf_script, tool);
         struct perf_session *session = script->session;
@@ -703,9 +745,10 @@ static int process_exit_event(struct perf_tool *tool,
         perf_event__fprintf(event, stdout);
  
         if (perf_event__process_exit(tool, event, sample, machine) < 0)
-               return -1;
+               err = -1;
  
-       return 0;
+       thread__put(thread);
+       return err;
  }
  
  static int process_mmap_event(struct perf_tool *tool,
@@ -735,7 +778,7 @@ static int process_mmap_event(struct perf_tool *tool,
         }
         print_sample_start(sample, thread, evsel);
         perf_event__fprintf(event, stdout);
-
+       thread__put(thread);
         return 0;
  }
  
@@ -766,7 +809,7 @@ static int process_mmap2_event(struct perf_tool *tool,
         }
         print_sample_start(sample, thread, evsel);
         perf_event__fprintf(event, stdout);
-
+       thread__put(thread);
         return 0;
  }
  
@@ -999,12 +1042,15 @@ static int parse_output_fields(const struct option *opt __maybe_unused,
                 }
         }
  
-       tok = strtok(tok, ",");
-       while (tok) {
+       for (tok = strtok(tok, ","); tok; tok = strtok(NULL, ",")) {
                 for (i = 0; i < imax; ++i) {
                         if (strcmp(tok, all_output_options[i].str) == 0)
                                 break;
                 }
+               if (i == imax && strcmp(tok, "flags") == 0) {
+                       print_flags = true;
+                       continue;
+               }
                 if (i == imax) {
                         fprintf(stderr, "Invalid field requested.\n");
                         rc = -EINVAL;
@@ -1032,8 +1078,6 @@ static int parse_output_fields(const struct option *opt __maybe_unused,
                         }
                         output[type].fields |= all_output_options[i].field;
                 }
-
-               tok = strtok(NULL, ",");
         }
  
         if (type >= 0) {
@@ -1497,6 +1541,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
         char *rec_script_path = NULL;
         char *rep_script_path = NULL;
         struct perf_session *session;
+       struct itrace_synth_opts itrace_synth_opts = { .set = false, };
         char *script_path = NULL;
         const char **__argv;
         int i, j, err = 0;
@@ -1511,6 +1556,10 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
                         .attr            = process_attr,
                         .tracing_data    = perf_event__process_tracing_data,
                         .build_id        = perf_event__process_build_id,
+                       .id_index        = perf_event__process_id_index,
+                       .auxtrace_info   = perf_event__process_auxtrace_info,
+                       .auxtrace        = perf_event__process_auxtrace,
+                       .auxtrace_error  = perf_event__process_auxtrace_error,
                         .ordered_events  = true,
                         .ordering_requires_timestamps = true,
                 },
@@ -1549,7 +1598,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
                      "comma separated output fields prepend with 'type:'. "
                      "Valid types: hw,sw,trace,raw. "
                      "Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso,"
-                    "addr,symoff,period", parse_output_fields),
+                    "addr,symoff,period,flags", parse_output_fields),
         OPT_BOOLEAN('a', "all-cpus", &system_wide,
                     "system-wide collection from all CPUs"),
         OPT_STRING('S', "symbols", &symbol_conf.sym_list_str, "symbol[,symbol...]",
@@ -1570,6 +1619,9 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
         OPT_BOOLEAN('\0', "show-mmap-events", &script.show_mmap_events,
                     "Show the mmap events"),
         OPT_BOOLEAN('f', "force", &file.force, "don't complain, do it"),
+       OPT_CALLBACK_OPTARG(0, "itrace", &itrace_synth_opts, NULL, "opts",
+                           "Instruction Tracing options",
+                           itrace_parse_synth_opts),
         OPT_END()
         };
         const char * const script_subcommands[] = { "record", "report", NULL };
@@ -1765,6 +1817,8 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
  
         script.session = session;
  
+       session->itrace_synth_opts = &itrace_synth_opts;
+
         if (cpu_list) {
                 err = perf_session__cpu_bitmap(session, cpu_list, cpu_bitmap);
                 if (err < 0)
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c

index f7b8218785f6fa8911bc9c8544d7eb14ff9e2f2f..fcf99bdeb19e1cf73c54e0b6edc6d4426dfe1f48 100644 (file)
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -73,8 +73,8 @@ static void print_counter(struct perf_evsel *counter, char *prefix);
  static void print_aggr(char *prefix);
  
  /* Default events used for perf stat -T */
-static const char * const transaction_attrs[] = {
-       "task-clock",
+static const char *transaction_attrs = {
+       "task-clock,"
         "{"
         "instructions,"
         "cycles,"
@@ -86,8 +86,8 @@ static const char * const transaction_attrs[] = {
  };
  
  /* More limited version when the CPU does not have all events. */
-static const char * const transaction_limited_attrs[] = {
-       "task-clock",
+static const char * transaction_limited_attrs = {
+       "task-clock,"
         "{"
         "instructions,"
         "cycles,"
@@ -96,30 +96,12 @@ static const char * const transaction_limited_attrs[] = {
         "}"
  };
  
-/* must match transaction_attrs and the beginning limited_attrs */
-enum {
-       T_TASK_CLOCK,
-       T_INSTRUCTIONS,
-       T_CYCLES,
-       T_CYCLES_IN_TX,
-       T_TRANSACTION_START,
-       T_ELISION_START,
-       T_CYCLES_IN_TX_CP,
-};
-
  static struct perf_evlist      *evsel_list;
  
  static struct target target = {
         .uid    = UINT_MAX,
  };
  
-enum aggr_mode {
-       AGGR_NONE,
-       AGGR_GLOBAL,
-       AGGR_SOCKET,
-       AGGR_CORE,
-};
-
  static int                     run_count                       =  1;
  static bool                    no_inherit                      = false;
  static bool                    scale                           =  true;
@@ -147,10 +129,6 @@ static int                 (*aggr_get_id)(struct cpu_map *m, int cpu);
  
  static volatile int done = 0;
  
-struct perf_stat {
-       struct stats      res_stats[3];
-};
-
  static inline void diff_timespec(struct timespec *r, struct timespec *a,
                                  struct timespec *b)
  {
@@ -180,6 +158,8 @@ static void perf_evsel__reset_stat_priv(struct perf_evsel *evsel)
  
         for (i = 0; i < 3; i++)
                 init_stats(&ps->res_stats[i]);
+
+       perf_stat_evsel_id_init(evsel);
  }
  
  static int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel)
@@ -198,24 +178,19 @@ static void perf_evsel__free_stat_priv(struct perf_evsel *evsel)
  
  static int perf_evsel__alloc_prev_raw_counts(struct perf_evsel *evsel)
  {
-       void *addr;
-       size_t sz;
+       struct perf_counts *counts;
  
-       sz = sizeof(*evsel->counts) +
-            (perf_evsel__nr_cpus(evsel) * sizeof(struct perf_counts_values));
+       counts = perf_counts__new(perf_evsel__nr_cpus(evsel));
+       if (counts)
+               evsel->prev_raw_counts = counts;
  
-       addr = zalloc(sz);
-       if (!addr)
-               return -ENOMEM;
-
-       evsel->prev_raw_counts =  addr;
-
-       return 0;
+       return counts ? 0 : -ENOMEM;
  }
  
  static void perf_evsel__free_prev_raw_counts(struct perf_evsel *evsel)
  {
-       zfree(&evsel->prev_raw_counts);
+       perf_counts__delete(evsel->prev_raw_counts);
+       evsel->prev_raw_counts = NULL;
  }
  
  static void perf_evlist__free_stats(struct perf_evlist *evlist)
@@ -247,22 +222,6 @@ out_free:
         return -1;
  }
  
-static struct stats runtime_nsecs_stats[MAX_NR_CPUS];
-static struct stats runtime_cycles_stats[MAX_NR_CPUS];
-static struct stats runtime_stalled_cycles_front_stats[MAX_NR_CPUS];
-static struct stats runtime_stalled_cycles_back_stats[MAX_NR_CPUS];
-static struct stats runtime_branches_stats[MAX_NR_CPUS];
-static struct stats runtime_cacherefs_stats[MAX_NR_CPUS];
-static struct stats runtime_l1_dcache_stats[MAX_NR_CPUS];
-static struct stats runtime_l1_icache_stats[MAX_NR_CPUS];
-static struct stats runtime_ll_cache_stats[MAX_NR_CPUS];
-static struct stats runtime_itlb_cache_stats[MAX_NR_CPUS];
-static struct stats runtime_dtlb_cache_stats[MAX_NR_CPUS];
-static struct stats runtime_cycles_in_tx_stats[MAX_NR_CPUS];
-static struct stats walltime_nsecs_stats;
-static struct stats runtime_transaction_stats[MAX_NR_CPUS];
-static struct stats runtime_elision_stats[MAX_NR_CPUS];
-
  static void perf_stat__reset_stats(struct perf_evlist *evlist)
  {
         struct perf_evsel *evsel;
@@ -272,23 +231,7 @@ static void perf_stat__reset_stats(struct perf_evlist *evlist)
                 perf_evsel__reset_counts(evsel, perf_evsel__nr_cpus(evsel));
         }
  
-       memset(runtime_nsecs_stats, 0, sizeof(runtime_nsecs_stats));
-       memset(runtime_cycles_stats, 0, sizeof(runtime_cycles_stats));
-       memset(runtime_stalled_cycles_front_stats, 0, sizeof(runtime_stalled_cycles_front_stats));
-       memset(runtime_stalled_cycles_back_stats, 0, sizeof(runtime_stalled_cycles_back_stats));
-       memset(runtime_branches_stats, 0, sizeof(runtime_branches_stats));
-       memset(runtime_cacherefs_stats, 0, sizeof(runtime_cacherefs_stats));
-       memset(runtime_l1_dcache_stats, 0, sizeof(runtime_l1_dcache_stats));
-       memset(runtime_l1_icache_stats, 0, sizeof(runtime_l1_icache_stats));
-       memset(runtime_ll_cache_stats, 0, sizeof(runtime_ll_cache_stats));
-       memset(runtime_itlb_cache_stats, 0, sizeof(runtime_itlb_cache_stats));
-       memset(runtime_dtlb_cache_stats, 0, sizeof(runtime_dtlb_cache_stats));
-       memset(runtime_cycles_in_tx_stats, 0,
-                       sizeof(runtime_cycles_in_tx_stats));
-       memset(runtime_transaction_stats, 0,
-               sizeof(runtime_transaction_stats));
-       memset(runtime_elision_stats, 0, sizeof(runtime_elision_stats));
-       memset(&walltime_nsecs_stats, 0, sizeof(walltime_nsecs_stats));
+       perf_stat__reset_shadow_stats();
  }
  
  static int create_perf_stat_counter(struct perf_evsel *evsel)
@@ -325,70 +268,6 @@ static inline int nsec_counter(struct perf_evsel *evsel)
         return 0;
  }
  
-static struct perf_evsel *nth_evsel(int n)
-{
-       static struct perf_evsel **array;
-       static int array_len;
-       struct perf_evsel *ev;
-       int j;
-
-       /* Assumes this only called when evsel_list does not change anymore. */
-       if (!array) {
-               evlist__for_each(evsel_list, ev)
-                       array_len++;
-               array = malloc(array_len * sizeof(void *));
-               if (!array)
-                       exit(ENOMEM);
-               j = 0;
-               evlist__for_each(evsel_list, ev)
-                       array[j++] = ev;
-       }
-       if (n < array_len)
-               return array[n];
-       return NULL;
-}
-
-/*
- * Update various tracking values we maintain to print
- * more semantic information such as miss/hit ratios,
- * instruction rates, etc:
- */
-static void update_shadow_stats(struct perf_evsel *counter, u64 *count,
-                               int cpu)
-{
-       if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK))
-               update_stats(&runtime_nsecs_stats[cpu], count[0]);
-       else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES))
-               update_stats(&runtime_cycles_stats[cpu], count[0]);
-       else if (transaction_run &&
-                perf_evsel__cmp(counter, nth_evsel(T_CYCLES_IN_TX)))
-               update_stats(&runtime_cycles_in_tx_stats[cpu], count[0]);
-       else if (transaction_run &&
-                perf_evsel__cmp(counter, nth_evsel(T_TRANSACTION_START)))
-               update_stats(&runtime_transaction_stats[cpu], count[0]);
-       else if (transaction_run &&
-                perf_evsel__cmp(counter, nth_evsel(T_ELISION_START)))
-               update_stats(&runtime_elision_stats[cpu], count[0]);
-       else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND))
-               update_stats(&runtime_stalled_cycles_front_stats[cpu], count[0]);
-       else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND))
-               update_stats(&runtime_stalled_cycles_back_stats[cpu], count[0]);
-       else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS))
-               update_stats(&runtime_branches_stats[cpu], count[0]);
-       else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES))
-               update_stats(&runtime_cacherefs_stats[cpu], count[0]);
-       else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1D))
-               update_stats(&runtime_l1_dcache_stats[cpu], count[0]);
-       else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1I))
-               update_stats(&runtime_l1_icache_stats[cpu], count[0]);
-       else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_LL))
-               update_stats(&runtime_ll_cache_stats[cpu], count[0]);
-       else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_DTLB))
-               update_stats(&runtime_dtlb_cache_stats[cpu], count[0]);
-       else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_ITLB))
-               update_stats(&runtime_itlb_cache_stats[cpu], count[0]);
-}
-
  static void zero_per_pkg(struct perf_evsel *counter)
  {
         if (counter->per_pkg_mask)
@@ -449,7 +328,7 @@ static int read_cb(struct perf_evsel *evsel, int cpu, int thread __maybe_unused,
                 perf_counts_values__scale(count, scale, NULL);
                 evsel->counts->cpu[cpu] = *count;
                 if (aggr_mode == AGGR_NONE)
-                       update_shadow_stats(evsel, count->values, cpu);
+                       perf_stat__update_shadow_stats(evsel, count->values, cpu);
                 break;
         case AGGR_GLOBAL:
                 aggr->val += count->val;
@@ -497,7 +376,7 @@ static int read_counter_aggr(struct perf_evsel *counter)
         /*
          * Save the full runtime - to allow normalization during printout:
          */
-       update_shadow_stats(counter, count, 0);
+       perf_stat__update_shadow_stats(counter, count, 0);
  
         return 0;
  }
@@ -665,7 +544,10 @@ static int __run_perf_stat(int argc, const char **argv)
                                         ui__warning("%s event is not supported by the kernel.\n",
                                                     perf_evsel__name(counter));
                                 counter->supported = false;
-                               continue;
+
+                               if ((counter->leader != counter) ||
+                                   !(counter->leader->nr_members > 1))
+                                       continue;
                         }
  
                         perf_evsel__open_strerror(counter, &target,
@@ -875,188 +757,8 @@ static void nsec_printout(int id, int nr, struct perf_evsel *evsel, double avg)
                 fprintf(output, "                                   ");
  }
  
-/* used for get_ratio_color() */
-enum grc_type {
-       GRC_STALLED_CYCLES_FE,
-       GRC_STALLED_CYCLES_BE,
-       GRC_CACHE_MISSES,
-       GRC_MAX_NR
-};
-
-static const char *get_ratio_color(enum grc_type type, double ratio)
-{
-       static const double grc_table[GRC_MAX_NR][3] = {
-               [GRC_STALLED_CYCLES_FE] = { 50.0, 30.0, 10.0 },
-               [GRC_STALLED_CYCLES_BE] = { 75.0, 50.0, 20.0 },
-               [GRC_CACHE_MISSES]      = { 20.0, 10.0, 5.0 },
-       };
-       const char *color = PERF_COLOR_NORMAL;
-
-       if (ratio > grc_table[type][0])
-               color = PERF_COLOR_RED;
-       else if (ratio > grc_table[type][1])
-               color = PERF_COLOR_MAGENTA;
-       else if (ratio > grc_table[type][2])
-               color = PERF_COLOR_YELLOW;
-
-       return color;
-}
-
-static void print_stalled_cycles_frontend(int cpu,
-                                         struct perf_evsel *evsel
-                                         __maybe_unused, double avg)
-{
-       double total, ratio = 0.0;
-       const char *color;
-
-       total = avg_stats(&runtime_cycles_stats[cpu]);
-
-       if (total)
-               ratio = avg / total * 100.0;
-
-       color = get_ratio_color(GRC_STALLED_CYCLES_FE, ratio);
-
-       fprintf(output, " #  ");
-       color_fprintf(output, color, "%6.2f%%", ratio);
-       fprintf(output, " frontend cycles idle   ");
-}
-
-static void print_stalled_cycles_backend(int cpu,
-                                        struct perf_evsel *evsel
-                                        __maybe_unused, double avg)
-{
-       double total, ratio = 0.0;
-       const char *color;
-
-       total = avg_stats(&runtime_cycles_stats[cpu]);
-
-       if (total)
-               ratio = avg / total * 100.0;
-
-       color = get_ratio_color(GRC_STALLED_CYCLES_BE, ratio);
-
-       fprintf(output, " #  ");
-       color_fprintf(output, color, "%6.2f%%", ratio);
-       fprintf(output, " backend  cycles idle   ");
-}
-
-static void print_branch_misses(int cpu,
-                               struct perf_evsel *evsel __maybe_unused,
-                               double avg)
-{
-       double total, ratio = 0.0;
-       const char *color;
-
-       total = avg_stats(&runtime_branches_stats[cpu]);
-
-       if (total)
-               ratio = avg / total * 100.0;
-
-       color = get_ratio_color(GRC_CACHE_MISSES, ratio);
-
-       fprintf(output, " #  ");
-       color_fprintf(output, color, "%6.2f%%", ratio);
-       fprintf(output, " of all branches        ");
-}
-
-static void print_l1_dcache_misses(int cpu,
-                                  struct perf_evsel *evsel __maybe_unused,
-                                  double avg)
-{
-       double total, ratio = 0.0;
-       const char *color;
-
-       total = avg_stats(&runtime_l1_dcache_stats[cpu]);
-
-       if (total)
-               ratio = avg / total * 100.0;
-
-       color = get_ratio_color(GRC_CACHE_MISSES, ratio);
-
-       fprintf(output, " #  ");
-       color_fprintf(output, color, "%6.2f%%", ratio);
-       fprintf(output, " of all L1-dcache hits  ");
-}
-
-static void print_l1_icache_misses(int cpu,
-                                  struct perf_evsel *evsel __maybe_unused,
-                                  double avg)
-{
-       double total, ratio = 0.0;
-       const char *color;
-
-       total = avg_stats(&runtime_l1_icache_stats[cpu]);
-
-       if (total)
-               ratio = avg / total * 100.0;
-
-       color = get_ratio_color(GRC_CACHE_MISSES, ratio);
-
-       fprintf(output, " #  ");
-       color_fprintf(output, color, "%6.2f%%", ratio);
-       fprintf(output, " of all L1-icache hits  ");
-}
-
-static void print_dtlb_cache_misses(int cpu,
-                                   struct perf_evsel *evsel __maybe_unused,
-                                   double avg)
-{
-       double total, ratio = 0.0;
-       const char *color;
-
-       total = avg_stats(&runtime_dtlb_cache_stats[cpu]);
-
-       if (total)
-               ratio = avg / total * 100.0;
-
-       color = get_ratio_color(GRC_CACHE_MISSES, ratio);
-
-       fprintf(output, " #  ");
-       color_fprintf(output, color, "%6.2f%%", ratio);
-       fprintf(output, " of all dTLB cache hits ");
-}
-
-static void print_itlb_cache_misses(int cpu,
-                                   struct perf_evsel *evsel __maybe_unused,
-                                   double avg)
-{
-       double total, ratio = 0.0;
-       const char *color;
-
-       total = avg_stats(&runtime_itlb_cache_stats[cpu]);
-
-       if (total)
-               ratio = avg / total * 100.0;
-
-       color = get_ratio_color(GRC_CACHE_MISSES, ratio);
-
-       fprintf(output, " #  ");
-       color_fprintf(output, color, "%6.2f%%", ratio);
-       fprintf(output, " of all iTLB cache hits ");
-}
-
-static void print_ll_cache_misses(int cpu,
-                                 struct perf_evsel *evsel __maybe_unused,
-                                 double avg)
-{
-       double total, ratio = 0.0;
-       const char *color;
-
-       total = avg_stats(&runtime_ll_cache_stats[cpu]);
-
-       if (total)
-               ratio = avg / total * 100.0;
-
-       color = get_ratio_color(GRC_CACHE_MISSES, ratio);
-
-       fprintf(output, " #  ");
-       color_fprintf(output, color, "%6.2f%%", ratio);
-       fprintf(output, " of all LL-cache hits   ");
-}
-
  static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg)
  {
-       double total, ratio = 0.0, total2;
         double sc =  evsel->scale;
         const char *fmt;
         int cpu = cpu_map__id_to_cpu(id);
@@ -1090,138 +792,7 @@ static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg)
         if (csv_output || interval)
                 return;
  
-       if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) {
-               total = avg_stats(&runtime_cycles_stats[cpu]);
-               if (total) {
-                       ratio = avg / total;
-                       fprintf(output, " #   %5.2f  insns per cycle        ", ratio);
-               } else {
-                       fprintf(output, "                                   ");
-               }
-               total = avg_stats(&runtime_stalled_cycles_front_stats[cpu]);
-               total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[cpu]));
-
-               if (total && avg) {
-                       ratio = total / avg;
-                       fprintf(output, "\n");
-                       if (aggr_mode == AGGR_NONE)
-                               fprintf(output, "        ");
-                       fprintf(output, "                                                  #   %5.2f  stalled cycles per insn", ratio);
-               }
-
-       } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) &&
-                       runtime_branches_stats[cpu].n != 0) {
-               print_branch_misses(cpu, evsel, avg);
-       } else if (
-               evsel->attr.type == PERF_TYPE_HW_CACHE &&
-               evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_L1D |
-                                       ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
-                                       ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
-                       runtime_l1_dcache_stats[cpu].n != 0) {
-               print_l1_dcache_misses(cpu, evsel, avg);
-       } else if (
-               evsel->attr.type == PERF_TYPE_HW_CACHE &&
-               evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_L1I |
-                                       ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
-                                       ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
-                       runtime_l1_icache_stats[cpu].n != 0) {
-               print_l1_icache_misses(cpu, evsel, avg);
-       } else if (
-               evsel->attr.type == PERF_TYPE_HW_CACHE &&
-               evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_DTLB |
-                                       ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
-                                       ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
-                       runtime_dtlb_cache_stats[cpu].n != 0) {
-               print_dtlb_cache_misses(cpu, evsel, avg);
-       } else if (
-               evsel->attr.type == PERF_TYPE_HW_CACHE &&
-               evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_ITLB |
-                                       ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
-                                       ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
-                       runtime_itlb_cache_stats[cpu].n != 0) {
-               print_itlb_cache_misses(cpu, evsel, avg);
-       } else if (
-               evsel->attr.type == PERF_TYPE_HW_CACHE &&
-               evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_LL |
-                                       ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
-                                       ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
-                       runtime_ll_cache_stats[cpu].n != 0) {
-               print_ll_cache_misses(cpu, evsel, avg);
-       } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) &&
-                       runtime_cacherefs_stats[cpu].n != 0) {
-               total = avg_stats(&runtime_cacherefs_stats[cpu]);
-
-               if (total)
-                       ratio = avg * 100 / total;
-
-               fprintf(output, " # %8.3f %% of all cache refs    ", ratio);
-
-       } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) {
-               print_stalled_cycles_frontend(cpu, evsel, avg);
-       } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) {
-               print_stalled_cycles_backend(cpu, evsel, avg);
-       } else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) {
-               total = avg_stats(&runtime_nsecs_stats[cpu]);
-
-               if (total) {
-                       ratio = avg / total;
-                       fprintf(output, " # %8.3f GHz                    ", ratio);
-               } else {
-                       fprintf(output, "                                   ");
-               }
-       } else if (transaction_run &&
-                  perf_evsel__cmp(evsel, nth_evsel(T_CYCLES_IN_TX))) {
-               total = avg_stats(&runtime_cycles_stats[cpu]);
-               if (total)
-                       fprintf(output,
-                               " #   %5.2f%% transactional cycles   ",
-                               100.0 * (avg / total));
-       } else if (transaction_run &&
-                  perf_evsel__cmp(evsel, nth_evsel(T_CYCLES_IN_TX_CP))) {
-               total = avg_stats(&runtime_cycles_stats[cpu]);
-               total2 = avg_stats(&runtime_cycles_in_tx_stats[cpu]);
-               if (total2 < avg)
-                       total2 = avg;
-               if (total)
-                       fprintf(output,
-                               " #   %5.2f%% aborted cycles         ",
-                               100.0 * ((total2-avg) / total));
-       } else if (transaction_run &&
-                  perf_evsel__cmp(evsel, nth_evsel(T_TRANSACTION_START)) &&
-                  avg > 0 &&
-                  runtime_cycles_in_tx_stats[cpu].n != 0) {
-               total = avg_stats(&runtime_cycles_in_tx_stats[cpu]);
-
-               if (total)
-                       ratio = total / avg;
-
-               fprintf(output, " # %8.0f cycles / transaction   ", ratio);
-       } else if (transaction_run &&
-                  perf_evsel__cmp(evsel, nth_evsel(T_ELISION_START)) &&
-                  avg > 0 &&
-                  runtime_cycles_in_tx_stats[cpu].n != 0) {
-               total = avg_stats(&runtime_cycles_in_tx_stats[cpu]);
-
-               if (total)
-                       ratio = total / avg;
-
-               fprintf(output, " # %8.0f cycles / elision       ", ratio);
-       } else if (runtime_nsecs_stats[cpu].n != 0) {
-               char unit = 'M';
-
-               total = avg_stats(&runtime_nsecs_stats[cpu]);
-
-               if (total)
-                       ratio = 1000.0 * avg / total;
-               if (ratio < 0.001) {
-                       ratio *= 1000;
-                       unit = 'K';
-               }
-
-               fprintf(output, " # %8.3f %c/sec                  ", ratio, unit);
-       } else {
-               fprintf(output, "                                   ");
-       }
+       perf_stat__print_shadow_stats(output, evsel, avg, cpu, aggr_mode);
  }
  
  static void print_aggr(char *prefix)
@@ -1536,17 +1107,6 @@ static int perf_stat_init_aggr_mode(void)
         return 0;
  }
  
-static int setup_events(const char * const *attrs, unsigned len)
-{
-       unsigned i;
-
-       for (i = 0; i < len; i++) {
-               if (parse_events(evsel_list, attrs[i]))
-                       return -1;
-       }
-       return 0;
-}
-
  /*
   * Add default attributes, if there were no attributes specified or
   * if -d/--detailed, -d -d or -d -d -d is used:
@@ -1668,12 +1228,10 @@ static int add_default_attributes(void)
                 int err;
                 if (pmu_have_event("cpu", "cycles-ct") &&
                     pmu_have_event("cpu", "el-start"))
-                       err = setup_events(transaction_attrs,
-                                       ARRAY_SIZE(transaction_attrs));
+                       err = parse_events(evsel_list, transaction_attrs, NULL);
                 else
-                       err = setup_events(transaction_limited_attrs,
-                                ARRAY_SIZE(transaction_limited_attrs));
-               if (err < 0) {
+                       err = parse_events(evsel_list, transaction_limited_attrs, NULL);
+               if (err) {
                         fprintf(stderr, "Cannot set up transaction events\n");
                         return -1;
                 }
diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c

index e50fe1187b0ba2ca808f6c0a0d3effd8156b4982..30e59620179daef63c272c3119f884a8caefd12f 100644 (file)
--- a/tools/perf/builtin-timechart.c
+++ b/tools/perf/builtin-timechart.c
@@ -61,13 +61,13 @@ struct timechart {
                                 tasks_only,
                                 with_backtrace,
                                 topology;
+       bool                    force;
         /* IO related settings */
-       u64                     io_events;
         bool                    io_only,
                                 skip_eagain;
+       u64                     io_events;
         u64                     min_time,
                                 merge_dist;
-       bool                    force;
  };
  
  struct per_pidcomm;
@@ -523,7 +523,7 @@ static const char *cat_backtrace(union perf_event *event,
                                  * Discard all.
                                  */
                                 zfree(&p);
-                               goto exit;
+                               goto exit_put;
                         }
                         continue;
                 }
@@ -538,7 +538,8 @@ static const char *cat_backtrace(union perf_event *event,
                 else
                         fprintf(f, "..... %016" PRIx64 "\n", ip);
         }
-
+exit_put:
+       addr_location__put(&al);
  exit:
         fclose(f);
  
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c

index 6a4d5d41c671d0ce176deb13d318de35acee0161..619a8696fda7c939cd0e6497abab5845813bda12 100644 (file)
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -235,10 +235,13 @@ static void perf_top__show_details(struct perf_top *top)
  
         more = symbol__annotate_printf(symbol, he->ms.map, top->sym_evsel,
                                        0, top->sym_pcnt_filter, top->print_entries, 4);
-       if (top->zero)
-               symbol__annotate_zero_histogram(symbol, top->sym_evsel->idx);
-       else
-               symbol__annotate_decay_histogram(symbol, top->sym_evsel->idx);
+
+       if (top->evlist->enabled) {
+               if (top->zero)
+                       symbol__annotate_zero_histogram(symbol, top->sym_evsel->idx);
+               else
+                       symbol__annotate_decay_histogram(symbol, top->sym_evsel->idx);
+       }
         if (more != 0)
                 printf("%d lines not displayed, maybe increase display entries [e]\n", more);
  out_unlock:
@@ -276,11 +279,13 @@ static void perf_top__print_sym_table(struct perf_top *top)
                 return;
         }
  
-       if (top->zero) {
-               hists__delete_entries(hists);
-       } else {
-               hists__decay_entries(hists, top->hide_user_symbols,
-                                    top->hide_kernel_symbols);
+       if (top->evlist->enabled) {
+               if (top->zero) {
+                       hists__delete_entries(hists);
+               } else {
+                       hists__decay_entries(hists, top->hide_user_symbols,
+                                            top->hide_kernel_symbols);
+               }
         }
  
         hists__collapse_resort(hists, NULL);
@@ -545,11 +550,13 @@ static void perf_top__sort_new_samples(void *arg)
  
         hists = evsel__hists(t->sym_evsel);
  
-       if (t->zero) {
-               hists__delete_entries(hists);
-       } else {
-               hists__decay_entries(hists, t->hide_user_symbols,
-                                    t->hide_kernel_symbols);
+       if (t->evlist->enabled) {
+               if (t->zero) {
+                       hists__delete_entries(hists);
+               } else {
+                       hists__decay_entries(hists, t->hide_user_symbols,
+                                            t->hide_kernel_symbols);
+               }
         }
  
         hists__collapse_resort(hists, NULL);
@@ -579,8 +586,27 @@ static void *display_thread_tui(void *arg)
                 hists->uid_filter_str = top->record_opts.target.uid_str;
         }
  
-       perf_evlist__tui_browse_hists(top->evlist, help, &hbt, top->min_percent,
-                                     &top->session->header.env);
+       while (true)  {
+               int key = perf_evlist__tui_browse_hists(top->evlist, help, &hbt,
+                                                       top->min_percent,
+                                                       &top->session->header.env);
+
+               if (key != 'f')
+                       break;
+
+               perf_evlist__toggle_enable(top->evlist);
+               /*
+                * No need to refresh, resort/decay histogram entries
+                * if we are not collecting samples:
+                */
+               if (top->evlist->enabled) {
+                       hbt.refresh = top->delay_secs;
+                       help = "Press 'f' to disable the events or 'h' to see other hotkeys";
+               } else {
+                       help = "Press 'f' again to re-enable the events";
+                       hbt.refresh = 0;
+               }
+       }
  
         done = 1;
         return NULL;
@@ -775,7 +801,9 @@ static void perf_event__process_sample(struct perf_tool *tool,
         if (al.sym == NULL || !al.sym->ignore) {
                 struct hists *hists = evsel__hists(evsel);
                 struct hist_entry_iter iter = {
-                       .add_entry_cb = hist_iter__top_callback,
+                       .evsel          = evsel,
+                       .sample         = sample,
+                       .add_entry_cb   = hist_iter__top_callback,
                 };
  
                 if (symbol_conf.cumulate_callchain)
@@ -785,15 +813,14 @@ static void perf_event__process_sample(struct perf_tool *tool,
  
                 pthread_mutex_lock(&hists->lock);
  
-               err = hist_entry_iter__add(&iter, &al, evsel, sample,
-                                          top->max_stack, top);
+               err = hist_entry_iter__add(&iter, &al, top->max_stack, top);
                 if (err < 0)
                         pr_err("Problem incrementing symbol period, skipping event\n");
  
                 pthread_mutex_unlock(&hists->lock);
         }
  
-       return;
+       addr_location__put(&al);
  }
  
  static void perf_top__mmap_read_idx(struct perf_top *top, int idx)
@@ -950,7 +977,7 @@ static int __cmd_top(struct perf_top *top)
                 goto out_delete;
  
         machine__synthesize_threads(&top->session->machines.host, &opts->target,
-                                   top->evlist->threads, false);
+                                   top->evlist->threads, false, opts->proc_map_timeout);
         ret = perf_top__start_counters(top);
         if (ret)
                 goto out_delete;
@@ -1060,6 +1087,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused)
                         .target         = {
                                 .uses_mmap   = true,
                         },
+                       .proc_map_timeout    = 500,
                 },
                 .max_stack           = PERF_MAX_STACK_DEPTH,
                 .sym_pcnt_filter     = 5,
@@ -1159,6 +1187,8 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused)
         OPT_STRING('w', "column-widths", &symbol_conf.col_width_list_str,
                    "width[,width...]",
                    "don't try to adjust column width, use these fixed values"),
+       OPT_UINTEGER(0, "proc-map-timeout", &opts->proc_map_timeout,
+                       "per thread proc mmap processing timeout in ms"),
         OPT_END()
         };
         const char * const top_usage[] = {
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c

index e122970361f21af6d07c321480aefa2cb90bf31d..de5d277d1ad7cb97cac2c5da67032fc8a12ffdf6 100644 (file)
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -16,7 +16,6 @@
  
  #include <libaudit.h>
  #include <stdlib.h>
-#include <sys/eventfd.h>
  #include <sys/mman.h>
  #include <linux/futex.h>
  
@@ -41,6 +40,51 @@
  # define EFD_SEMAPHORE         1
  #endif
  
+#ifndef EFD_NONBLOCK
+# define EFD_NONBLOCK          00004000
+#endif
+
+#ifndef EFD_CLOEXEC
+# define EFD_CLOEXEC           02000000
+#endif
+
+#ifndef O_CLOEXEC
+# define O_CLOEXEC             02000000
+#endif
+
+#ifndef SOCK_DCCP
+# define SOCK_DCCP             6
+#endif
+
+#ifndef SOCK_CLOEXEC
+# define SOCK_CLOEXEC          02000000
+#endif
+
+#ifndef SOCK_NONBLOCK
+# define SOCK_NONBLOCK         00004000
+#endif
+
+#ifndef MSG_CMSG_CLOEXEC
+# define MSG_CMSG_CLOEXEC      0x40000000
+#endif
+
+#ifndef PERF_FLAG_FD_NO_GROUP
+# define PERF_FLAG_FD_NO_GROUP         (1UL << 0)
+#endif
+
+#ifndef PERF_FLAG_FD_OUTPUT
+# define PERF_FLAG_FD_OUTPUT           (1UL << 1)
+#endif
+
+#ifndef PERF_FLAG_PID_CGROUP
+# define PERF_FLAG_PID_CGROUP          (1UL << 2) /* pid=cgroup id, per-cpu mode only */
+#endif
+
+#ifndef PERF_FLAG_FD_CLOEXEC
+# define PERF_FLAG_FD_CLOEXEC          (1UL << 3) /* O_CLOEXEC */
+#endif
+
+
  struct tp_field {
         int offset;
         union {
@@ -331,6 +375,14 @@ static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
  
  #define SCA_HEX syscall_arg__scnprintf_hex
  
+static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
+                                        struct syscall_arg *arg)
+{
+       return scnprintf(bf, size, "%d", arg->val);
+}
+
+#define SCA_INT syscall_arg__scnprintf_int
+
  static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size,
                                                struct syscall_arg *arg)
  {
@@ -783,6 +835,34 @@ static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size,
  
  #define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags
  
+static size_t syscall_arg__scnprintf_perf_flags(char *bf, size_t size,
+                                               struct syscall_arg *arg)
+{
+       int printed = 0, flags = arg->val;
+
+       if (flags == 0)
+               return 0;
+
+#define        P_FLAG(n) \
+       if (flags & PERF_FLAG_##n) { \
+               printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
+               flags &= ~PERF_FLAG_##n; \
+       }
+
+       P_FLAG(FD_NO_GROUP);
+       P_FLAG(FD_OUTPUT);
+       P_FLAG(PID_CGROUP);
+       P_FLAG(FD_CLOEXEC);
+#undef P_FLAG
+
+       if (flags)
+               printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
+
+       return printed;
+}
+
+#define SCA_PERF_FLAGS syscall_arg__scnprintf_perf_flags
+
  static size_t syscall_arg__scnprintf_eventfd_flags(char *bf, size_t size,
                                                    struct syscall_arg *arg)
  {
@@ -1050,6 +1130,11 @@ static struct syscall_fmt {
         { .name     = "openat",     .errmsg = true,
           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
+       { .name     = "perf_event_open", .errmsg = true,
+         .arg_scnprintf = { [1] = SCA_INT, /* pid */
+                            [2] = SCA_INT, /* cpu */
+                            [3] = SCA_FD,  /* group_fd */
+                            [4] = SCA_PERF_FLAGS,  /* flags */ }, },
         { .name     = "pipe2",      .errmsg = true,
           .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
         { .name     = "poll",       .errmsg = true, .timeout = true, },
@@ -1433,7 +1518,8 @@ static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
                 return -ENOMEM;
  
         err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
-                                           evlist->threads, trace__tool_process, false);
+                                           evlist->threads, trace__tool_process, false,
+                                           trace->opts.proc_map_timeout);
         if (err)
                 symbol__exit();
  
@@ -1712,7 +1798,7 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
         void *args;
         size_t printed = 0;
         struct thread *thread;
-       int id = perf_evsel__sc_tp_uint(evsel, id, sample);
+       int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
         struct syscall *sc = trace__syscall_info(trace, evsel, id);
         struct thread_trace *ttrace;
  
@@ -1725,14 +1811,14 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
         ttrace = thread__trace(thread, trace->output);
         if (ttrace == NULL)
-               return -1;
+               goto out_put;
  
         args = perf_evsel__sc_tp_ptr(evsel, args, sample);
  
         if (ttrace->entry_str == NULL) {
                 ttrace->entry_str = malloc(1024);
                 if (!ttrace->entry_str)
-                       return -1;
+                       goto out_put;
         }
  
         if (!trace->summary_only)
@@ -1757,8 +1843,10 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
                 thread__put(trace->current);
                 trace->current = thread__get(thread);
         }
-
-       return 0;
+       err = 0;
+out_put:
+       thread__put(thread);
+       return err;
  }
  
  static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
@@ -1768,7 +1856,7 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
         long ret;
         u64 duration = 0;
         struct thread *thread;
-       int id = perf_evsel__sc_tp_uint(evsel, id, sample);
+       int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
         struct syscall *sc = trace__syscall_info(trace, evsel, id);
         struct thread_trace *ttrace;
  
@@ -1781,7 +1869,7 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
         ttrace = thread__trace(thread, trace->output);
         if (ttrace == NULL)
-               return -1;
+               goto out_put;
  
         if (trace->summary)
                 thread__update_stats(ttrace, id, sample);
@@ -1835,8 +1923,10 @@ signed_print:
         fputc('\n', trace->output);
  out:
         ttrace->entry_pending = false;
-
-       return 0;
+       err = 0;
+out_put:
+       thread__put(thread);
+       return err;
  }
  
  static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
@@ -1863,6 +1953,7 @@ static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evs
  
         ttrace->runtime_ms += runtime_ms;
         trace->runtime_ms += runtime_ms;
+       thread__put(thread);
         return 0;
  
  out_dump:
@@ -1872,6 +1963,7 @@ out_dump:
                (pid_t)perf_evsel__intval(evsel, sample, "pid"),
                runtime,
                perf_evsel__intval(evsel, sample, "vruntime"));
+       thread__put(thread);
         return 0;
  }
  
@@ -1924,11 +2016,12 @@ static int trace__pgfault(struct trace *trace,
         struct addr_location al;
         char map_type = 'd';
         struct thread_trace *ttrace;
+       int err = -1;
  
         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
         ttrace = thread__trace(thread, trace->output);
         if (ttrace == NULL)
-               return -1;
+               goto out_put;
  
         if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
                 ttrace->pfmaj++;
@@ -1936,7 +2029,7 @@ static int trace__pgfault(struct trace *trace,
                 ttrace->pfmin++;
  
         if (trace->summary_only)
-               return 0;
+               goto out;
  
         thread__find_addr_location(thread, cpumode, MAP__FUNCTION,
                               sample->ip, &al);
@@ -1967,8 +2060,11 @@ static int trace__pgfault(struct trace *trace,
         print_location(trace->output, sample, &al, true, false);
  
         fprintf(trace->output, " (%c%c)\n", map_type, al.level);
-
-       return 0;
+out:
+       err = 0;
+out_put:
+       thread__put(thread);
+       return err;
  }
  
  static bool skip_sample(struct trace *trace, struct perf_sample *sample)
@@ -2652,6 +2748,7 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
                         .user_interval = ULLONG_MAX,
                         .no_buffering  = true,
                         .mmap_pages    = UINT_MAX,
+                       .proc_map_timeout  = 500,
                 },
                 .output = stdout,
                 .show_comm = true,
@@ -2666,16 +2763,15 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
         OPT_BOOLEAN(0, "comm", &trace.show_comm,
                     "show the thread COMM next to its id"),
         OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
-       OPT_STRING('e', "expr", &ev_qualifier_str, "expr",
-                   "list of events to trace"),
+       OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
         OPT_STRING('o', "output", &output_name, "file", "output file name"),
         OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
         OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
                     "trace events on existing process id"),
         OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
                     "trace events on existing thread id"),
-       OPT_CALLBACK(0, "filter-pids", &trace, "float",
-                    "show only events with duration > N.M ms", trace__set_filter_pids),
+       OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
+                    "pids to filter (by the kernel)", trace__set_filter_pids),
         OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
                     "system-wide collection from all CPUs"),
         OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
@@ -2702,6 +2798,8 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
                      "Trace pagefaults", parse_pagefaults, "maj"),
         OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
         OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
+       OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
+                       "per thread proc mmap processing timeout in ms"),
         OPT_END()
         };
         const char * const trace_subcommands[] = { "record", NULL };
@@ -2712,11 +2810,10 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
         signal(SIGFPE, sighandler_dump_stack);
  
         trace.evlist = perf_evlist__new();
-       if (trace.evlist == NULL)
-               return -ENOMEM;
  
         if (trace.evlist == NULL) {
                 pr_err("Not enough memory to run!\n");
+               err = -ENOMEM;
                 goto out;
         }
  
diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile

index 59a98c6432403874a564753dd6e790b8632b672e..317001c946608be1430d03d485fe19586f1e0446 100644 (file)
--- a/tools/perf/config/Makefile
+++ b/tools/perf/config/Makefile
@@ -32,7 +32,7 @@ ifeq ($(ARCH),x86)
      LIBUNWIND_LIBS = -lunwind -lunwind-x86_64
      $(call detected,CONFIG_X86_64)
    else
-    LIBUNWIND_LIBS = -lunwind -lunwind-x86
+    LIBUNWIND_LIBS = -lunwind-x86 -llzma -lunwind
    endif
    NO_PERF_REGS := 0
  endif
@@ -130,6 +130,8 @@ endif
  
  ifeq ($(DEBUG),0)
    CFLAGS += -O6
+else
+  CFLAGS += $(call cc-option,-Og,-O0)
  endif
  
  ifdef PARSER_DEBUG
@@ -268,6 +270,10 @@ else
    endif # libelf support
  endif # NO_LIBELF
  
+ifdef NO_DWARF
+  NO_LIBDW_DWARF_UNWIND := 1
+endif
+
  ifndef NO_LIBELF
    CFLAGS += -DHAVE_LIBELF_SUPPORT
    EXTLIBS += -lelf
@@ -610,6 +616,11 @@ ifdef LIBBABELTRACE
    endif
  endif
  
+ifndef NO_AUXTRACE
+  $(call detected,CONFIG_AUXTRACE)
+  CFLAGS += -DHAVE_AUXTRACE_SUPPORT
+endif
+
  # Among the variables below, these:
  #   perfexecdir
  #   template_dir
diff --git a/tools/perf/config/utilities.mak b/tools/perf/config/utilities.mak

index c16ce833079c0a307642f2ae0e75f9c0d577c4d8..0ebef09c0842f89e16df8404931f884d7796c1ca 100644 (file)
--- a/tools/perf/config/utilities.mak
+++ b/tools/perf/config/utilities.mak
@@ -177,3 +177,22 @@ $(if $($(1)),$(call _ge_attempt,$($(1)),$(1)),$(call _ge_attempt,$(2)))
  endef
  _ge_attempt = $(if $(get-executable),$(get-executable),$(call _gea_err,$(2)))
  _gea_err  = $(if $(1),$(error Please set '$(1)' appropriately))
+
+# try-run
+# Usage: option = $(call try-run, $(CC)...-o "$$TMP",option-ok,otherwise)
+# Exit code chooses option. "$$TMP" can be used as a temporary file and
+# is automatically cleaned up.
+try-run = $(shell set -e;              \
+       TMP="$(TMPOUT).$$$$.tmp";       \
+       TMPO="$(TMPOUT).$$$$.o";        \
+       if ($(1)) >/dev/null 2>&1;      \
+       then echo "$(2)";               \
+       else echo "$(3)";               \
+       fi;                             \
+       rm -f "$$TMP" "$$TMPO")
+
+# cc-option
+# Usage: cflags-y += $(call cc-option,-march=winchip-c6,-march=i586)
+
+cc-option = $(call try-run,\
+       $(CC) $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) $(1) -c -x c /dev/null -o "$$TMP",$(1),$(2))
diff --git a/tools/perf/perf-sys.h b/tools/perf/perf-sys.h

index 6ef68165c9db628d23bbe85b48945ac2581ec979..83a25cef82fdd2747ab0bc7f8b4fdc50c3c65b62 100644 (file)
--- a/tools/perf/perf-sys.h
+++ b/tools/perf/perf-sys.h
@@ -6,11 +6,9 @@
  #include <sys/syscall.h>
  #include <linux/types.h>
  #include <linux/perf_event.h>
+#include <asm/barrier.h>
  
  #if defined(__i386__)
-#define mb()           asm volatile("lock; addl $0,0(%%esp)" ::: "memory")
-#define wmb()          asm volatile("lock; addl $0,0(%%esp)" ::: "memory")
-#define rmb()          asm volatile("lock; addl $0,0(%%esp)" ::: "memory")
  #define cpu_relax()    asm volatile("rep; nop" ::: "memory");
  #define CPUINFO_PROC   {"model name"}
  #ifndef __NR_perf_event_open
@@ -25,9 +23,6 @@
  #endif
  
  #if defined(__x86_64__)
-#define mb()           asm volatile("mfence" ::: "memory")
-#define wmb()          asm volatile("sfence" ::: "memory")
-#define rmb()          asm volatile("lfence" ::: "memory")
  #define cpu_relax()    asm volatile("rep; nop" ::: "memory");
  #define CPUINFO_PROC   {"model name"}
  #ifndef __NR_perf_event_open
@@ -43,129 +38,63 @@
  
  #ifdef __powerpc__
  #include "../../arch/powerpc/include/uapi/asm/unistd.h"
-#define mb()           asm volatile ("sync" ::: "memory")
-#define wmb()          asm volatile ("sync" ::: "memory")
-#define rmb()          asm volatile ("sync" ::: "memory")
  #define CPUINFO_PROC   {"cpu"}
  #endif
  
  #ifdef __s390__
-#define mb()           asm volatile("bcr 15,0" ::: "memory")
-#define wmb()          asm volatile("bcr 15,0" ::: "memory")
-#define rmb()          asm volatile("bcr 15,0" ::: "memory")
  #define CPUINFO_PROC   {"vendor_id"}
  #endif
  
  #ifdef __sh__
-#if defined(__SH4A__) || defined(__SH5__)
-# define mb()          asm volatile("synco" ::: "memory")
-# define wmb()         asm volatile("synco" ::: "memory")
-# define rmb()         asm volatile("synco" ::: "memory")
-#else
-# define mb()          asm volatile("" ::: "memory")
-# define wmb()         asm volatile("" ::: "memory")
-# define rmb()         asm volatile("" ::: "memory")
-#endif
  #define CPUINFO_PROC   {"cpu type"}
  #endif
  
  #ifdef __hppa__
-#define mb()           asm volatile("" ::: "memory")
-#define wmb()          asm volatile("" ::: "memory")
-#define rmb()          asm volatile("" ::: "memory")
  #define CPUINFO_PROC   {"cpu"}
  #endif
  
  #ifdef __sparc__
-#ifdef __LP64__
-#define mb()           asm volatile("ba,pt %%xcc, 1f\n"        \
-                                    "membar #StoreLoad\n"      \
-                                    "1:\n":::"memory")
-#else
-#define mb()           asm volatile("":::"memory")
-#endif
-#define wmb()          asm volatile("":::"memory")
-#define rmb()          asm volatile("":::"memory")
  #define CPUINFO_PROC   {"cpu"}
  #endif
  
  #ifdef __alpha__
-#define mb()           asm volatile("mb" ::: "memory")
-#define wmb()          asm volatile("wmb" ::: "memory")
-#define rmb()          asm volatile("mb" ::: "memory")
  #define CPUINFO_PROC   {"cpu model"}
  #endif
  
  #ifdef __ia64__
-#define mb()           asm volatile ("mf" ::: "memory")
-#define wmb()          asm volatile ("mf" ::: "memory")
-#define rmb()          asm volatile ("mf" ::: "memory")
  #define cpu_relax()    asm volatile ("hint @pause" ::: "memory")
  #define CPUINFO_PROC   {"model name"}
  #endif
  
  #ifdef __arm__
-/*
- * Use the __kuser_memory_barrier helper in the CPU helper page. See
- * arch/arm/kernel/entry-armv.S in the kernel source for details.
- */
-#define mb()           ((void(*)(void))0xffff0fa0)()
-#define wmb()          ((void(*)(void))0xffff0fa0)()
-#define rmb()          ((void(*)(void))0xffff0fa0)()
  #define CPUINFO_PROC   {"model name", "Processor"}
  #endif
  
  #ifdef __aarch64__
-#define mb()           asm volatile("dmb ish" ::: "memory")
-#define wmb()          asm volatile("dmb ishst" ::: "memory")
-#define rmb()          asm volatile("dmb ishld" ::: "memory")
  #define cpu_relax()    asm volatile("yield" ::: "memory")
  #endif
  
  #ifdef __mips__
-#define mb()           asm volatile(                                   \
-                               ".set   mips2\n\t"                      \
-                               "sync\n\t"                              \
-                               ".set   mips0"                          \
-                               : /* no output */                       \
-                               : /* no input */                        \
-                               : "memory")
-#define wmb()  mb()
-#define rmb()  mb()
  #define CPUINFO_PROC   {"cpu model"}
  #endif
  
  #ifdef __arc__
-#define mb()           asm volatile("" ::: "memory")
-#define wmb()          asm volatile("" ::: "memory")
-#define rmb()          asm volatile("" ::: "memory")
  #define CPUINFO_PROC   {"Processor"}
  #endif
  
  #ifdef __metag__
-#define mb()           asm volatile("" ::: "memory")
-#define wmb()          asm volatile("" ::: "memory")
-#define rmb()          asm volatile("" ::: "memory")
  #define CPUINFO_PROC   {"CPU"}
  #endif
  
  #ifdef __xtensa__
-#define mb()           asm volatile("memw" ::: "memory")
-#define wmb()          asm volatile("memw" ::: "memory")
-#define rmb()          asm volatile("" ::: "memory")
  #define CPUINFO_PROC   {"core ID"}
  #endif
  
  #ifdef __tile__
-#define mb()           asm volatile ("mf" ::: "memory")
-#define wmb()          asm volatile ("mf" ::: "memory")
-#define rmb()          asm volatile ("mf" ::: "memory")
  #define cpu_relax()    asm volatile ("mfspr zero, PASS" ::: "memory")
  #define CPUINFO_PROC    {"model name"}
  #endif
  
-#define barrier() asm volatile ("" ::: "memory")
-
  #ifndef cpu_relax
  #define cpu_relax() barrier()
  #endif
diff --git a/tools/perf/perf.h b/tools/perf/perf.h

index e14bb637255cc351ac40850a148eba1461d55cd0..4a5827fff7993d2bbe87666af51aa939f6f9dd1d 100644 (file)
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -54,16 +54,22 @@ struct record_opts {
         bool         period;
         bool         sample_intr_regs;
         bool         running_time;
+       bool         full_auxtrace;
+       bool         auxtrace_snapshot_mode;
         unsigned int freq;
         unsigned int mmap_pages;
+       unsigned int auxtrace_mmap_pages;
         unsigned int user_freq;
         u64          branch_stack;
         u64          default_interval;
         u64          user_interval;
+       size_t       auxtrace_snapshot_size;
+       const char   *auxtrace_snapshot_opts;
         bool         sample_transaction;
         unsigned     initial_delay;
         bool         use_clockid;
         clockid_t    clockid;
+       unsigned int proc_map_timeout;
  };
  
  struct option;
diff --git a/tools/perf/tests/Build b/tools/perf/tests/Build

index 6a8801b32017018a09233390e7e5856afa4ae857..ee41e705b2eba7b726e417a1fdd73a421b7a985b 100644 (file)
--- a/tools/perf/tests/Build
+++ b/tools/perf/tests/Build
@@ -3,9 +3,9 @@ perf-y += parse-events.o
  perf-y += dso-data.o
  perf-y += attr.o
  perf-y += vmlinux-kallsyms.o
-perf-y += open-syscall.o
-perf-y += open-syscall-all-cpus.o
-perf-y += open-syscall-tp-fields.o
+perf-y += openat-syscall.o
+perf-y += openat-syscall-all-cpus.o
+perf-y += openat-syscall-tp-fields.o
  perf-y += mmap-basic.o
  perf-y += perf-record.o
  perf-y += rdpmc.o
@@ -34,7 +34,7 @@ perf-y += kmod-path.o
  
  perf-$(CONFIG_X86) += perf-time-to-tsc.o
  
-ifeq ($(ARCH),$(filter $(ARCH),x86 arm))
+ifeq ($(ARCH),$(filter $(ARCH),x86 arm arm64))
  perf-$(CONFIG_DWARF_UNWIND) += dwarf-unwind.o
  endif
  
diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c

index 4f409816711249a540c77eb4136ec7d24e94991c..87b9961646e4a5f08728e694549195d9eeb2ad86 100644 (file)
--- a/tools/perf/tests/builtin-test.c
+++ b/tools/perf/tests/builtin-test.c
@@ -23,12 +23,12 @@ static struct test {
                 .func = test__vmlinux_matches_kallsyms,
         },
         {
-               .desc = "detect open syscall event",
-               .func = test__open_syscall_event,
+               .desc = "detect openat syscall event",
+               .func = test__openat_syscall_event,
         },
         {
-               .desc = "detect open syscall event on all cpus",
-               .func = test__open_syscall_event_on_all_cpus,
+               .desc = "detect openat syscall event on all cpus",
+               .func = test__openat_syscall_event_on_all_cpus,
         },
         {
                 .desc = "read samples using the mmap interface",
@@ -73,8 +73,8 @@ static struct test {
                 .func = test__perf_evsel__tp_sched_test,
         },
         {
-               .desc = "Generate and check syscalls:sys_enter_open event fields",
-               .func = test__syscall_open_tp_fields,
+               .desc = "Generate and check syscalls:sys_enter_openat event fields",
+               .func = test__syscall_openat_tp_fields,
         },
         {
                 .desc = "struct perf_event_attr setup",
@@ -126,7 +126,7 @@ static struct test {
                 .desc = "Test parsing with no sample_id_all bit set",
                 .func = test__parse_no_sample_id_all,
         },
-#if defined(__x86_64__) || defined(__i386__) || defined(__arm__)
+#if defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__)
  #ifdef HAVE_DWARF_UNWIND_SUPPORT
         {
                 .desc = "Test dwarf unwind",
@@ -219,7 +219,7 @@ static int run_test(struct test *test)
         wait(&status);
  
         if (WIFEXITED(status)) {
-               err = WEXITSTATUS(status);
+               err = (signed char)WEXITSTATUS(status);
                 pr_debug("test child finished with %d\n", err);
         } else if (WIFSIGNALED(status)) {
                 err = -1;
diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c

index f671ec37a7c40c1346ebe92e77a6d201bc89305e..22f8a00446e1f1b3cb6b447dbc1bc21ccfda3108 100644 (file)
--- a/tools/perf/tests/code-reading.c
+++ b/tools/perf/tests/code-reading.c
@@ -248,6 +248,7 @@ static int process_sample_event(struct machine *machine,
         struct perf_sample sample;
         struct thread *thread;
         u8 cpumode;
+       int ret;
  
         if (perf_evlist__parse_sample(evlist, event, &sample)) {
                 pr_debug("perf_evlist__parse_sample failed\n");
@@ -262,7 +263,9 @@ static int process_sample_event(struct machine *machine,
  
         cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
  
-       return read_object_code(sample.ip, READLEN, cpumode, thread, state);
+       ret = read_object_code(sample.ip, READLEN, cpumode, thread, state);
+       thread__put(thread);
+       return ret;
  }
  
  static int process_event(struct machine *machine, struct perf_evlist *evlist,
@@ -448,7 +451,7 @@ static int do_test_code_reading(bool try_kcore)
         }
  
         ret = perf_event__synthesize_thread_map(NULL, threads,
-                                               perf_event__process, machine, false);
+                                               perf_event__process, machine, false, 500);
         if (ret < 0) {
                 pr_debug("perf_event__synthesize_thread_map failed\n");
                 goto out_err;
@@ -457,13 +460,13 @@ static int do_test_code_reading(bool try_kcore)
         thread = machine__findnew_thread(machine, pid, pid);
         if (!thread) {
                 pr_debug("machine__findnew_thread failed\n");
-               goto out_err;
+               goto out_put;
         }
  
         cpus = cpu_map__new(NULL);
         if (!cpus) {
                 pr_debug("cpu_map__new failed\n");
-               goto out_err;
+               goto out_put;
         }
  
         while (1) {
@@ -472,7 +475,7 @@ static int do_test_code_reading(bool try_kcore)
                 evlist = perf_evlist__new();
                 if (!evlist) {
                         pr_debug("perf_evlist__new failed\n");
-                       goto out_err;
+                       goto out_put;
                 }
  
                 perf_evlist__set_maps(evlist, cpus, threads);
@@ -482,10 +485,10 @@ static int do_test_code_reading(bool try_kcore)
                 else
                         str = "cycles";
                 pr_debug("Parsing event '%s'\n", str);
-               ret = parse_events(evlist, str);
+               ret = parse_events(evlist, str, NULL);
                 if (ret < 0) {
                         pr_debug("parse_events failed\n");
-                       goto out_err;
+                       goto out_put;
                 }
  
                 perf_evlist__config(evlist, &opts);
@@ -506,7 +509,7 @@ static int do_test_code_reading(bool try_kcore)
                                 continue;
                         }
                         pr_debug("perf_evlist__open failed\n");
-                       goto out_err;
+                       goto out_put;
                 }
                 break;
         }
@@ -514,7 +517,7 @@ static int do_test_code_reading(bool try_kcore)
         ret = perf_evlist__mmap(evlist, UINT_MAX, false);
         if (ret < 0) {
                 pr_debug("perf_evlist__mmap failed\n");
-               goto out_err;
+               goto out_put;
         }
  
         perf_evlist__enable(evlist);
@@ -525,7 +528,7 @@ static int do_test_code_reading(bool try_kcore)
  
         ret = process_events(machine, evlist, &state);
         if (ret < 0)
-               goto out_err;
+               goto out_put;
  
         if (!have_vmlinux && !have_kcore && !try_kcore)
                 err = TEST_CODE_READING_NO_KERNEL_OBJ;
@@ -535,7 +538,10 @@ static int do_test_code_reading(bool try_kcore)
                 err = TEST_CODE_READING_NO_ACCESS;
         else
                 err = TEST_CODE_READING_OK;
+out_put:
+       thread__put(thread);
  out_err:
+
         if (evlist) {
                 perf_evlist__delete(evlist);
         } else {
diff --git a/tools/perf/tests/dso-data.c b/tools/perf/tests/dso-data.c

index 513e5febbe5a5016ed5d9a2564bea1e5bc92e4e8..a218aeaf56a002396bf0d0db9ef0e457a7445c9f 100644 (file)
--- a/tools/perf/tests/dso-data.c
+++ b/tools/perf/tests/dso-data.c
@@ -99,6 +99,17 @@ struct test_data_offset offsets[] = {
         },
  };
  
+/* moved here from util/dso.c for compatibility */
+static int dso__data_fd(struct dso *dso, struct machine *machine)
+{
+       int fd = dso__data_get_fd(dso, machine);
+
+       if (fd >= 0)
+               dso__data_put_fd(dso);
+
+       return fd;
+}
+
  int test__dso_data(void)
  {
         struct machine machine;
@@ -155,7 +166,7 @@ int test__dso_data(void)
                 free(buf);
         }
  
-       dso__delete(dso);
+       dso__put(dso);
         unlink(file);
         return 0;
  }
@@ -215,7 +226,7 @@ static void dsos__delete(int cnt)
                 struct dso *dso = dsos[i];
  
                 unlink(dso->name);
-               dso__delete(dso);
+               dso__put(dso);
         }
  
         free(dsos);
diff --git a/tools/perf/tests/dwarf-unwind.c b/tools/perf/tests/dwarf-unwind.c

index 0bf06bec68c7e9786668990ad399b578326726c5..40b36c4624275a360d4a0f3226eaf850d6d0e825 100644 (file)
--- a/tools/perf/tests/dwarf-unwind.c
+++ b/tools/perf/tests/dwarf-unwind.c
@@ -28,7 +28,7 @@ static int init_live_machine(struct machine *machine)
         pid_t pid = getpid();
  
         return perf_event__synthesize_mmap_events(NULL, &event, pid, pid,
-                                                 mmap_handler, machine, true);
+                                                 mmap_handler, machine, true, 500);
  }
  
  #define MAX_STACK 8
@@ -170,6 +170,7 @@ int test__dwarf_unwind(void)
         }
  
         err = krava_1(thread);
+       thread__put(thread);
  
   out:
         machine__delete_threads(machine);
diff --git a/tools/perf/tests/evsel-roundtrip-name.c b/tools/perf/tests/evsel-roundtrip-name.c

index b8d8341b383e7bc123c29301eff6fbe6e98be332..3fa715987a5ec2693e2bcdb31a33e3f20616c136 100644 (file)
--- a/tools/perf/tests/evsel-roundtrip-name.c
+++ b/tools/perf/tests/evsel-roundtrip-name.c
@@ -23,7 +23,7 @@ static int perf_evsel__roundtrip_cache_name_test(void)
                         for (i = 0; i < PERF_COUNT_HW_CACHE_RESULT_MAX; i++) {
                                 __perf_evsel__hw_cache_type_op_res_name(type, op, i,
                                                                         name, sizeof(name));
-                               err = parse_events(evlist, name);
+                               err = parse_events(evlist, name, NULL);
                                 if (err)
                                         ret = err;
                         }
@@ -71,7 +71,7 @@ static int __perf_evsel__name_array_test(const char *names[], int nr_names)
                  return -ENOMEM;
  
         for (i = 0; i < nr_names; ++i) {
-               err = parse_events(evlist, names[i]);
+               err = parse_events(evlist, names[i], NULL);
                 if (err) {
                         pr_debug("failed to parse event '%s', err %d\n",
                                  names[i], err);
diff --git a/tools/perf/tests/hists_common.c b/tools/perf/tests/hists_common.c

index a62c091345163f70ea72ad2cb9dbc2297ae2da1a..ce80b274b097332d02b5502fb0c5b88fc6d6016a 100644 (file)
--- a/tools/perf/tests/hists_common.c
+++ b/tools/perf/tests/hists_common.c
@@ -96,6 +96,7 @@ struct machine *setup_fake_machine(struct machines *machines)
                         goto out;
  
                 thread__set_comm(thread, fake_threads[i].comm, 0);
+               thread__put(thread);
         }
  
         for (i = 0; i < ARRAY_SIZE(fake_mmap_info); i++) {
@@ -120,8 +121,7 @@ struct machine *setup_fake_machine(struct machines *machines)
                 size_t k;
                 struct dso *dso;
  
-               dso = __dsos__findnew(&machine->user_dsos,
-                                     fake_symbols[i].dso_name);
+               dso = machine__findnew_dso(machine, fake_symbols[i].dso_name);
                 if (dso == NULL)
                         goto out;
  
@@ -134,11 +134,15 @@ struct machine *setup_fake_machine(struct machines *machines)
  
                         sym = symbol__new(fsym->start, fsym->length,
                                           STB_GLOBAL, fsym->name);
-                       if (sym == NULL)
+                       if (sym == NULL) {
+                               dso__put(dso);
                                 goto out;
+                       }
  
                         symbols__insert(&dso->symbols[MAP__FUNCTION], sym);
                 }
+
+               dso__put(dso);
         }
  
         return machine;
diff --git a/tools/perf/tests/hists_cumulate.c b/tools/perf/tests/hists_cumulate.c

index 18619966454c572a0f3c0a1b2330818fc6e1ffe4..7d82c8be5e360da5f89b1ff7b569a798c5a34fe0 100644 (file)
--- a/tools/perf/tests/hists_cumulate.c
+++ b/tools/perf/tests/hists_cumulate.c
@@ -87,6 +87,8 @@ static int add_hist_entries(struct hists *hists, struct machine *machine)
                         },
                 };
                 struct hist_entry_iter iter = {
+                       .evsel = evsel,
+                       .sample = &sample,
                         .hide_unresolved = false,
                 };
  
@@ -104,9 +106,11 @@ static int add_hist_entries(struct hists *hists, struct machine *machine)
                                                   &sample) < 0)
                         goto out;
  
-               if (hist_entry_iter__add(&iter, &al, evsel, &sample,
-                                        PERF_MAX_STACK_DEPTH, NULL) < 0)
+               if (hist_entry_iter__add(&iter, &al, PERF_MAX_STACK_DEPTH,
+                                        NULL) < 0) {
+                       addr_location__put(&al);
                         goto out;
+               }
  
                 fake_samples[i].thread = al.thread;
                 fake_samples[i].map = al.map;
@@ -695,7 +699,7 @@ int test__hists_cumulate(void)
  
         TEST_ASSERT_VAL("No memory", evlist);
  
-       err = parse_events(evlist, "cpu-clock");
+       err = parse_events(evlist, "cpu-clock", NULL);
         if (err)
                 goto out;
  
diff --git a/tools/perf/tests/hists_filter.c b/tools/perf/tests/hists_filter.c

index 59e53db7914c0ad6100ab2e616cdf21e39efea46..ce48775e6ada13886000013183908f3f6b63f26d 100644 (file)
--- a/tools/perf/tests/hists_filter.c
+++ b/tools/perf/tests/hists_filter.c
@@ -63,6 +63,8 @@ static int add_hist_entries(struct perf_evlist *evlist,
                                 },
                         };
                         struct hist_entry_iter iter = {
+                               .evsel = evsel,
+                               .sample = &sample,
                                 .ops = &hist_iter_normal,
                                 .hide_unresolved = false,
                         };
@@ -81,9 +83,11 @@ static int add_hist_entries(struct perf_evlist *evlist,
                                                           &sample) < 0)
                                 goto out;
  
-                       if (hist_entry_iter__add(&iter, &al, evsel, &sample,
-                                                PERF_MAX_STACK_DEPTH, NULL) < 0)
+                       if (hist_entry_iter__add(&iter, &al,
+                                                PERF_MAX_STACK_DEPTH, NULL) < 0) {
+                               addr_location__put(&al);
                                 goto out;
+                       }
  
                         fake_samples[i].thread = al.thread;
                         fake_samples[i].map = al.map;
@@ -108,10 +112,10 @@ int test__hists_filter(void)
  
         TEST_ASSERT_VAL("No memory", evlist);
  
-       err = parse_events(evlist, "cpu-clock");
+       err = parse_events(evlist, "cpu-clock", NULL);
         if (err)
                 goto out;
-       err = parse_events(evlist, "task-clock");
+       err = parse_events(evlist, "task-clock", NULL);
         if (err)
                 goto out;
  
diff --git a/tools/perf/tests/hists_link.c b/tools/perf/tests/hists_link.c

index 278ba8344c236d000a0388c3ef1c18b60350f554..8c102b0114249708e4ae2059c81732ab23793c4b 100644 (file)
--- a/tools/perf/tests/hists_link.c
+++ b/tools/perf/tests/hists_link.c
@@ -91,8 +91,10 @@ static int add_hist_entries(struct perf_evlist *evlist, struct machine *machine)
  
                         he = __hists__add_entry(hists, &al, NULL,
                                                 NULL, NULL, 1, 1, 0, true);
-                       if (he == NULL)
+                       if (he == NULL) {
+                               addr_location__put(&al);
                                 goto out;
+                       }
  
                         fake_common_samples[k].thread = al.thread;
                         fake_common_samples[k].map = al.map;
@@ -115,8 +117,10 @@ static int add_hist_entries(struct perf_evlist *evlist, struct machine *machine)
  
                         he = __hists__add_entry(hists, &al, NULL,
                                                 NULL, NULL, 1, 1, 0, true);
-                       if (he == NULL)
+                       if (he == NULL) {
+                               addr_location__put(&al);
                                 goto out;
+                       }
  
                         fake_samples[i][k].thread = al.thread;
                         fake_samples[i][k].map = al.map;
@@ -282,10 +286,10 @@ int test__hists_link(void)
         if (evlist == NULL)
                  return -ENOMEM;
  
-       err = parse_events(evlist, "cpu-clock");
+       err = parse_events(evlist, "cpu-clock", NULL);
         if (err)
                 goto out;
-       err = parse_events(evlist, "task-clock");
+       err = parse_events(evlist, "task-clock", NULL);
         if (err)
                 goto out;
  
diff --git a/tools/perf/tests/hists_output.c b/tools/perf/tests/hists_output.c

index b52c9faea22450ed4092d67acdb1eb15ce15c6a8..adbebc852cc8b58886a2618f973815f9284b7b3b 100644 (file)
--- a/tools/perf/tests/hists_output.c
+++ b/tools/perf/tests/hists_output.c
@@ -57,6 +57,8 @@ static int add_hist_entries(struct hists *hists, struct machine *machine)
                         },
                 };
                 struct hist_entry_iter iter = {
+                       .evsel = evsel,
+                       .sample = &sample,
                         .ops = &hist_iter_normal,
                         .hide_unresolved = false,
                 };
@@ -70,9 +72,11 @@ static int add_hist_entries(struct hists *hists, struct machine *machine)
                                                   &sample) < 0)
                         goto out;
  
-               if (hist_entry_iter__add(&iter, &al, evsel, &sample,
-                                        PERF_MAX_STACK_DEPTH, NULL) < 0)
+               if (hist_entry_iter__add(&iter, &al, PERF_MAX_STACK_DEPTH,
+                                        NULL) < 0) {
+                       addr_location__put(&al);
                         goto out;
+               }
  
                 fake_samples[i].thread = al.thread;
                 fake_samples[i].map = al.map;
@@ -590,7 +594,7 @@ int test__hists_output(void)
  
         TEST_ASSERT_VAL("No memory", evlist);
  
-       err = parse_events(evlist, "cpu-clock");
+       err = parse_events(evlist, "cpu-clock", NULL);
         if (err)
                 goto out;
  
diff --git a/tools/perf/tests/keep-tracking.c b/tools/perf/tests/keep-tracking.c

index 7a5ab7b0b8f698146794be7584af453a446f79ee..5b171d1e338bdd26bcf1343f58e8b0bdb314b71c 100644 (file)
--- a/tools/perf/tests/keep-tracking.c
+++ b/tools/perf/tests/keep-tracking.c
@@ -78,8 +78,8 @@ int test__keep_tracking(void)
  
         perf_evlist__set_maps(evlist, cpus, threads);
  
-       CHECK__(parse_events(evlist, "dummy:u"));
-       CHECK__(parse_events(evlist, "cycles:u"));
+       CHECK__(parse_events(evlist, "dummy:u", NULL));
+       CHECK__(parse_events(evlist, "cycles:u", NULL));
  
         perf_evlist__config(evlist, &opts);
  
diff --git a/tools/perf/tests/kmod-path.c b/tools/perf/tests/kmod-path.c

index e8d7cbb9320c58c987de7743bf4c08bcd8314361..08c433b4bf4f30c8f69307ba3fd5b0ec21802e2f 100644 (file)
--- a/tools/perf/tests/kmod-path.c
+++ b/tools/perf/tests/kmod-path.c
@@ -34,9 +34,21 @@ static int test(const char *path, bool alloc_name, bool alloc_ext,
         return 0;
  }
  
+static int test_is_kernel_module(const char *path, int cpumode, bool expect)
+{
+       TEST_ASSERT_VAL("is_kernel_module",
+                       (!!is_kernel_module(path, cpumode)) == (!!expect));
+       pr_debug("%s (cpumode: %d) - is_kernel_module: %s\n",
+                       path, cpumode, expect ? "true" : "false");
+       return 0;
+}
+
  #define T(path, an, ae, k, c, n, e) \
         TEST_ASSERT_VAL("failed", !test(path, an, ae, k, c, n, e))
  
+#define M(path, c, e) \
+       TEST_ASSERT_VAL("failed", !test_is_kernel_module(path, c, e))
+
  int test__kmod_path__parse(void)
  {
         /* path                alloc_name  alloc_ext   kmod  comp   name     ext */
@@ -44,30 +56,90 @@ int test__kmod_path__parse(void)
         T("/xxxx/xxxx/x-x.ko", false     , true      , true, false, NULL   , NULL);
         T("/xxxx/xxxx/x-x.ko", true      , false     , true, false, "[x_x]", NULL);
         T("/xxxx/xxxx/x-x.ko", false     , false     , true, false, NULL   , NULL);
+       M("/xxxx/xxxx/x-x.ko", PERF_RECORD_MISC_CPUMODE_UNKNOWN, true);
+       M("/xxxx/xxxx/x-x.ko", PERF_RECORD_MISC_KERNEL, true);
+       M("/xxxx/xxxx/x-x.ko", PERF_RECORD_MISC_USER, false);
  
         /* path                alloc_name  alloc_ext   kmod  comp  name   ext */
         T("/xxxx/xxxx/x.ko.gz", true     , true      , true, true, "[x]", "gz");
         T("/xxxx/xxxx/x.ko.gz", false    , true      , true, true, NULL , "gz");
         T("/xxxx/xxxx/x.ko.gz", true     , false     , true, true, "[x]", NULL);
         T("/xxxx/xxxx/x.ko.gz", false    , false     , true, true, NULL , NULL);
+       M("/xxxx/xxxx/x.ko.gz", PERF_RECORD_MISC_CPUMODE_UNKNOWN, true);
+       M("/xxxx/xxxx/x.ko.gz", PERF_RECORD_MISC_KERNEL, true);
+       M("/xxxx/xxxx/x.ko.gz", PERF_RECORD_MISC_USER, false);
  
         /* path              alloc_name  alloc_ext  kmod   comp  name    ext */
         T("/xxxx/xxxx/x.gz", true      , true     , false, true, "x.gz" ,"gz");
         T("/xxxx/xxxx/x.gz", false     , true     , false, true, NULL   ,"gz");
         T("/xxxx/xxxx/x.gz", true      , false    , false, true, "x.gz" , NULL);
         T("/xxxx/xxxx/x.gz", false     , false    , false, true, NULL   , NULL);
+       M("/xxxx/xxxx/x.gz", PERF_RECORD_MISC_CPUMODE_UNKNOWN, false);
+       M("/xxxx/xxxx/x.gz", PERF_RECORD_MISC_KERNEL, false);
+       M("/xxxx/xxxx/x.gz", PERF_RECORD_MISC_USER, false);
  
         /* path   alloc_name  alloc_ext  kmod   comp  name     ext */
         T("x.gz", true      , true     , false, true, "x.gz", "gz");
         T("x.gz", false     , true     , false, true, NULL  , "gz");
         T("x.gz", true      , false    , false, true, "x.gz", NULL);
         T("x.gz", false     , false    , false, true, NULL  , NULL);
+       M("x.gz", PERF_RECORD_MISC_CPUMODE_UNKNOWN, false);
+       M("x.gz", PERF_RECORD_MISC_KERNEL, false);
+       M("x.gz", PERF_RECORD_MISC_USER, false);
  
         /* path      alloc_name  alloc_ext  kmod  comp  name  ext */
         T("x.ko.gz", true      , true     , true, true, "[x]", "gz");
         T("x.ko.gz", false     , true     , true, true, NULL , "gz");
         T("x.ko.gz", true      , false    , true, true, "[x]", NULL);
         T("x.ko.gz", false     , false    , true, true, NULL , NULL);
+       M("x.ko.gz", PERF_RECORD_MISC_CPUMODE_UNKNOWN, true);
+       M("x.ko.gz", PERF_RECORD_MISC_KERNEL, true);
+       M("x.ko.gz", PERF_RECORD_MISC_USER, false);
+
+       /* path            alloc_name  alloc_ext  kmod  comp   name             ext */
+       T("[test_module]", true      , true     , true, false, "[test_module]", NULL);
+       T("[test_module]", false     , true     , true, false, NULL           , NULL);
+       T("[test_module]", true      , false    , true, false, "[test_module]", NULL);
+       T("[test_module]", false     , false    , true, false, NULL           , NULL);
+       M("[test_module]", PERF_RECORD_MISC_CPUMODE_UNKNOWN, true);
+       M("[test_module]", PERF_RECORD_MISC_KERNEL, true);
+       M("[test_module]", PERF_RECORD_MISC_USER, false);
+
+       /* path            alloc_name  alloc_ext  kmod  comp   name             ext */
+       T("[test.module]", true      , true     , true, false, "[test.module]", NULL);
+       T("[test.module]", false     , true     , true, false, NULL           , NULL);
+       T("[test.module]", true      , false    , true, false, "[test.module]", NULL);
+       T("[test.module]", false     , false    , true, false, NULL           , NULL);
+       M("[test.module]", PERF_RECORD_MISC_CPUMODE_UNKNOWN, true);
+       M("[test.module]", PERF_RECORD_MISC_KERNEL, true);
+       M("[test.module]", PERF_RECORD_MISC_USER, false);
+
+       /* path     alloc_name  alloc_ext  kmod   comp   name      ext */
+       T("[vdso]", true      , true     , false, false, "[vdso]", NULL);
+       T("[vdso]", false     , true     , false, false, NULL    , NULL);
+       T("[vdso]", true      , false    , false, false, "[vdso]", NULL);
+       T("[vdso]", false     , false    , false, false, NULL    , NULL);
+       M("[vdso]", PERF_RECORD_MISC_CPUMODE_UNKNOWN, false);
+       M("[vdso]", PERF_RECORD_MISC_KERNEL, false);
+       M("[vdso]", PERF_RECORD_MISC_USER, false);
+
+       /* path         alloc_name  alloc_ext  kmod   comp   name          ext */
+       T("[vsyscall]", true      , true     , false, false, "[vsyscall]", NULL);
+       T("[vsyscall]", false     , true     , false, false, NULL        , NULL);
+       T("[vsyscall]", true      , false    , false, false, "[vsyscall]", NULL);
+       T("[vsyscall]", false     , false    , false, false, NULL        , NULL);
+       M("[vsyscall]", PERF_RECORD_MISC_CPUMODE_UNKNOWN, false);
+       M("[vsyscall]", PERF_RECORD_MISC_KERNEL, false);
+       M("[vsyscall]", PERF_RECORD_MISC_USER, false);
+
+       /* path                alloc_name  alloc_ext  kmod   comp   name      ext */
+       T("[kernel.kallsyms]", true      , true     , false, false, "[kernel.kallsyms]", NULL);
+       T("[kernel.kallsyms]", false     , true     , false, false, NULL               , NULL);
+       T("[kernel.kallsyms]", true      , false    , false, false, "[kernel.kallsyms]", NULL);
+       T("[kernel.kallsyms]", false     , false    , false, false, NULL               , NULL);
+       M("[kernel.kallsyms]", PERF_RECORD_MISC_CPUMODE_UNKNOWN, false);
+       M("[kernel.kallsyms]", PERF_RECORD_MISC_KERNEL, false);
+       M("[kernel.kallsyms]", PERF_RECORD_MISC_USER, false);
  
         return 0;
  }
diff --git a/tools/perf/tests/make b/tools/perf/tests/make

index bff85324f799bd1eeba79413ebf1433faac0a2e1..65280d28662e4c72177a20a3cfa56f968a10c359 100644 (file)
--- a/tools/perf/tests/make
+++ b/tools/perf/tests/make
@@ -32,6 +32,7 @@ make_no_backtrace   := NO_BACKTRACE=1
  make_no_libnuma     := NO_LIBNUMA=1
  make_no_libaudit    := NO_LIBAUDIT=1
  make_no_libbionic   := NO_LIBBIONIC=1
+make_no_auxtrace    := NO_AUXTRACE=1
  make_tags           := tags
  make_cscope         := cscope
  make_help           := help
@@ -52,7 +53,7 @@ make_static         := LDFLAGS=-static
  make_minimal        := NO_LIBPERL=1 NO_LIBPYTHON=1 NO_NEWT=1 NO_GTK2=1
  make_minimal        += NO_DEMANGLE=1 NO_LIBELF=1 NO_LIBUNWIND=1 NO_BACKTRACE=1
  make_minimal        += NO_LIBNUMA=1 NO_LIBAUDIT=1 NO_LIBBIONIC=1
-make_minimal        += NO_LIBDW_DWARF_UNWIND=1
+make_minimal        += NO_LIBDW_DWARF_UNWIND=1 NO_AUXTRACE=1
  
  # $(run) contains all available tests
  run := make_pure
@@ -74,6 +75,7 @@ run += make_no_backtrace
  run += make_no_libnuma
  run += make_no_libaudit
  run += make_no_libbionic
+run += make_no_auxtrace
  run += make_help
  run += make_doc
  run += make_perf_o
@@ -223,7 +225,19 @@ tarpkg:
         echo "- $@: $$cmd" && echo $$cmd > $@ && \
         ( eval $$cmd ) >> $@ 2>&1
  
-all: $(run) $(run_O) tarpkg
+make_kernelsrc:
+       @echo " - make -C <kernelsrc> tools/perf"
+       $(call clean); \
+       (make -C ../.. tools/perf) > $@ 2>&1 && \
+       test -x perf && rm -f $@ || (cat $@ ; false)
+
+make_kernelsrc_tools:
+       @echo " - make -C <kernelsrc>/tools perf"
+       $(call clean); \
+       (make -C ../../tools perf) > $@ 2>&1 && \
+       test -x perf && rm -f $@ || (cat $@ ; false)
+
+all: $(run) $(run_O) tarpkg make_kernelsrc make_kernelsrc_tools
         @echo OK
  
  out: $(run_O)
diff --git a/tools/perf/tests/mmap-basic.c b/tools/perf/tests/mmap-basic.c

index 9b9622a33932dadf2e98bd249850c62f16ac66e5..5855cf47121003479ae63e859059a5ad8809c5ec 100644 (file)
--- a/tools/perf/tests/mmap-basic.c
+++ b/tools/perf/tests/mmap-basic.c
@@ -23,10 +23,8 @@ int test__basic_mmap(void)
         struct cpu_map *cpus;
         struct perf_evlist *evlist;
         cpu_set_t cpu_set;
-       const char *syscall_names[] = { "getsid", "getppid", "getpgrp",
-                                       "getpgid", };
-       pid_t (*syscalls[])(void) = { (void *)getsid, getppid, getpgrp,
-                                     (void*)getpgid };
+       const char *syscall_names[] = { "getsid", "getppid", "getpgid", };
+       pid_t (*syscalls[])(void) = { (void *)getsid, getppid, (void*)getpgid };
  #define nsyscalls ARRAY_SIZE(syscall_names)
         unsigned int nr_events[nsyscalls],
                      expected_nr_events[nsyscalls], i, j;
diff --git a/tools/perf/tests/mmap-thread-lookup.c b/tools/perf/tests/mmap-thread-lookup.c

index 2113f1c8611fb569b0fb6bc7f676477e7de0a666..7f48efa7e295f63a72f0aa083857658ce68c45cb 100644 (file)
--- a/tools/perf/tests/mmap-thread-lookup.c
+++ b/tools/perf/tests/mmap-thread-lookup.c
@@ -129,7 +129,7 @@ static int synth_all(struct machine *machine)
  {
         return perf_event__synthesize_threads(NULL,
                                               perf_event__process,
-                                             machine, 0);
+                                             machine, 0, 500);
  }
  
  static int synth_process(struct machine *machine)
@@ -141,7 +141,7 @@ static int synth_process(struct machine *machine)
  
         err = perf_event__synthesize_thread_map(NULL, map,
                                                 perf_event__process,
-                                               machine, 0);
+                                               machine, 0, 500);
  
         thread_map__delete(map);
         return err;
@@ -191,6 +191,8 @@ static int mmap_events(synth_cb synth)
                                       PERF_RECORD_MISC_USER, MAP__FUNCTION,
                                       (unsigned long) (td->map + 1), &al);
  
+               thread__put(thread);
+
                 if (!al.map) {
                         pr_debug("failed, couldn't find map\n");
                         err = -1;
diff --git a/tools/perf/tests/open-syscall-all-cpus.c b/tools/perf/tests/open-syscall-all-cpus.c

deleted file mode 100644 (file)

index 3ec885c..0000000
--- a/tools/perf/tests/open-syscall-all-cpus.c
+++ /dev/null
@@ -1,115 +0,0 @@
-#include "evsel.h"
-#include "tests.h"
-#include "thread_map.h"
-#include "cpumap.h"
-#include "debug.h"
-
-int test__open_syscall_event_on_all_cpus(void)
-{
-       int err = -1, fd, cpu;
-       struct cpu_map *cpus;
-       struct perf_evsel *evsel;
-       unsigned int nr_open_calls = 111, i;
-       cpu_set_t cpu_set;
-       struct thread_map *threads = thread_map__new(-1, getpid(), UINT_MAX);
-       char sbuf[STRERR_BUFSIZE];
-
-       if (threads == NULL) {
-               pr_debug("thread_map__new\n");
-               return -1;
-       }
-
-       cpus = cpu_map__new(NULL);
-       if (cpus == NULL) {
-               pr_debug("cpu_map__new\n");
-               goto out_thread_map_delete;
-       }
-
-       CPU_ZERO(&cpu_set);
-
-       evsel = perf_evsel__newtp("syscalls", "sys_enter_open");
-       if (evsel == NULL) {
-               if (tracefs_configured())
-                       pr_debug("is tracefs mounted on /sys/kernel/tracing?\n");
-               else if (debugfs_configured())
-                       pr_debug("is debugfs mounted on /sys/kernel/debug?\n");
-               else
-                       pr_debug("Neither tracefs or debugfs is enabled in this kernel\n");
-               goto out_thread_map_delete;
-       }
-
-       if (perf_evsel__open(evsel, cpus, threads) < 0) {
-               pr_debug("failed to open counter: %s, "
-                        "tweak /proc/sys/kernel/perf_event_paranoid?\n",
-                        strerror_r(errno, sbuf, sizeof(sbuf)));
-               goto out_evsel_delete;
-       }
-
-       for (cpu = 0; cpu < cpus->nr; ++cpu) {
-               unsigned int ncalls = nr_open_calls + cpu;
-               /*
-                * XXX eventually lift this restriction in a way that
-                * keeps perf building on older glibc installations
-                * without CPU_ALLOC. 1024 cpus in 2010 still seems
-                * a reasonable upper limit tho :-)
-                */
-               if (cpus->map[cpu] >= CPU_SETSIZE) {
-                       pr_debug("Ignoring CPU %d\n", cpus->map[cpu]);
-                       continue;
-               }
-
-               CPU_SET(cpus->map[cpu], &cpu_set);
-               if (sched_setaffinity(0, sizeof(cpu_set), &cpu_set) < 0) {
-                       pr_debug("sched_setaffinity() failed on CPU %d: %s ",
-                                cpus->map[cpu],
-                                strerror_r(errno, sbuf, sizeof(sbuf)));
-                       goto out_close_fd;
-               }
-               for (i = 0; i < ncalls; ++i) {
-                       fd = open("/etc/passwd", O_RDONLY);
-                       close(fd);
-               }
-               CPU_CLR(cpus->map[cpu], &cpu_set);
-       }
-
-       /*
-        * Here we need to explicitely preallocate the counts, as if
-        * we use the auto allocation it will allocate just for 1 cpu,
-        * as we start by cpu 0.
-        */
-       if (perf_evsel__alloc_counts(evsel, cpus->nr) < 0) {
-               pr_debug("perf_evsel__alloc_counts(ncpus=%d)\n", cpus->nr);
-               goto out_close_fd;
-       }
-
-       err = 0;
-
-       for (cpu = 0; cpu < cpus->nr; ++cpu) {
-               unsigned int expected;
-
-               if (cpus->map[cpu] >= CPU_SETSIZE)
-                       continue;
-
-               if (perf_evsel__read_on_cpu(evsel, cpu, 0) < 0) {
-                       pr_debug("perf_evsel__read_on_cpu\n");
-                       err = -1;
-                       break;
-               }
-
-               expected = nr_open_calls + cpu;
-               if (evsel->counts->cpu[cpu].val != expected) {
-                       pr_debug("perf_evsel__read_on_cpu: expected to intercept %d calls on cpu %d, got %" PRIu64 "\n",
-                                expected, cpus->map[cpu], evsel->counts->cpu[cpu].val);
-                       err = -1;
-               }
-       }
-
-       perf_evsel__free_counts(evsel);
-out_close_fd:
-       perf_evsel__close_fd(evsel, 1, threads->nr);
-out_evsel_delete:
-       perf_evsel__delete(evsel);
-out_thread_map_delete:
-       thread_map__delete(threads);
-       return err;
-}
diff --git a/tools/perf/tests/open-syscall-tp-fields.c b/tools/perf/tests/open-syscall-tp-fields.c

deleted file mode 100644 (file)

index 127dcae..0000000
--- a/tools/perf/tests/open-syscall-tp-fields.c
+++ /dev/null
@@ -1,121 +0,0 @@
-#include "perf.h"
-#include "evlist.h"
-#include "evsel.h"
-#include "thread_map.h"
-#include "tests.h"
-#include "debug.h"
-
-int test__syscall_open_tp_fields(void)
-{
-       struct record_opts opts = {
-               .target = {
-                       .uid = UINT_MAX,
-                       .uses_mmap = true,
-               },
-               .no_buffering = true,
-               .freq         = 1,
-               .mmap_pages   = 256,
-               .raw_samples  = true,
-       };
-       const char *filename = "/etc/passwd";
-       int flags = O_RDONLY | O_DIRECTORY;
-       struct perf_evlist *evlist = perf_evlist__new();
-       struct perf_evsel *evsel;
-       int err = -1, i, nr_events = 0, nr_polls = 0;
-       char sbuf[STRERR_BUFSIZE];
-
-       if (evlist == NULL) {
-               pr_debug("%s: perf_evlist__new\n", __func__);
-               goto out;
-       }
-
-       evsel = perf_evsel__newtp("syscalls", "sys_enter_open");
-       if (evsel == NULL) {
-               pr_debug("%s: perf_evsel__newtp\n", __func__);
-               goto out_delete_evlist;
-       }
-
-       perf_evlist__add(evlist, evsel);
-
-       err = perf_evlist__create_maps(evlist, &opts.target);
-       if (err < 0) {
-               pr_debug("%s: perf_evlist__create_maps\n", __func__);
-               goto out_delete_evlist;
-       }
-
-       perf_evsel__config(evsel, &opts);
-
-       evlist->threads->map[0] = getpid();
-
-       err = perf_evlist__open(evlist);
-       if (err < 0) {
-               pr_debug("perf_evlist__open: %s\n",
-                        strerror_r(errno, sbuf, sizeof(sbuf)));
-               goto out_delete_evlist;
-       }
-
-       err = perf_evlist__mmap(evlist, UINT_MAX, false);
-       if (err < 0) {
-               pr_debug("perf_evlist__mmap: %s\n",
-                        strerror_r(errno, sbuf, sizeof(sbuf)));
-               goto out_delete_evlist;
-       }
-
-       perf_evlist__enable(evlist);
-
-       /*
-        * Generate the event:
-        */
-       open(filename, flags);
-
-       while (1) {
-               int before = nr_events;
-
-               for (i = 0; i < evlist->nr_mmaps; i++) {
-                       union perf_event *event;
-
-                       while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
-                               const u32 type = event->header.type;
-                               int tp_flags;
-                               struct perf_sample sample;
-
-                               ++nr_events;
-
-                               if (type != PERF_RECORD_SAMPLE) {
-                                       perf_evlist__mmap_consume(evlist, i);
-                                       continue;
-                               }
-
-                               err = perf_evsel__parse_sample(evsel, event, &sample);
-                               if (err) {
-                                       pr_err("Can't parse sample, err = %d\n", err);
-                                       goto out_delete_evlist;
-                               }
-
-                               tp_flags = perf_evsel__intval(evsel, &sample, "flags");
-
-                               if (flags != tp_flags) {
-                                       pr_debug("%s: Expected flags=%#x, got %#x\n",
-                                                __func__, flags, tp_flags);
-                                       goto out_delete_evlist;
-                               }
-
-                               goto out_ok;
-                       }
-               }
-
-               if (nr_events == before)
-                       perf_evlist__poll(evlist, 10);
-
-               if (++nr_polls > 5) {
-                       pr_debug("%s: no events!\n", __func__);
-                       goto out_delete_evlist;
-               }
-       }
-out_ok:
-       err = 0;
-out_delete_evlist:
-       perf_evlist__delete(evlist);
-out:
-       return err;
-}
diff --git a/tools/perf/tests/open-syscall.c b/tools/perf/tests/open-syscall.c

deleted file mode 100644 (file)

index 07aa319..0000000
--- a/tools/perf/tests/open-syscall.c
+++ /dev/null
@@ -1,61 +0,0 @@
-#include "thread_map.h"
-#include "evsel.h"
-#include "debug.h"
-#include "tests.h"
-
-int test__open_syscall_event(void)
-{
-       int err = -1, fd;
-       struct perf_evsel *evsel;
-       unsigned int nr_open_calls = 111, i;
-       struct thread_map *threads = thread_map__new(-1, getpid(), UINT_MAX);
-       char sbuf[STRERR_BUFSIZE];
-
-       if (threads == NULL) {
-               pr_debug("thread_map__new\n");
-               return -1;
-       }
-
-       evsel = perf_evsel__newtp("syscalls", "sys_enter_open");
-       if (evsel == NULL) {
-               if (tracefs_configured())
-                       pr_debug("is tracefs mounted on /sys/kernel/tracing?\n");
-               else if (debugfs_configured())
-                       pr_debug("is debugfs mounted on /sys/kernel/debug?\n");
-               else
-                       pr_debug("Neither tracefs or debugfs is enabled in this kernel\n");
-               goto out_thread_map_delete;
-       }
-
-       if (perf_evsel__open_per_thread(evsel, threads) < 0) {
-               pr_debug("failed to open counter: %s, "
-                        "tweak /proc/sys/kernel/perf_event_paranoid?\n",
-                        strerror_r(errno, sbuf, sizeof(sbuf)));
-               goto out_evsel_delete;
-       }
-
-       for (i = 0; i < nr_open_calls; ++i) {
-               fd = open("/etc/passwd", O_RDONLY);
-               close(fd);
-       }
-
-       if (perf_evsel__read_on_cpu(evsel, 0, 0) < 0) {
-               pr_debug("perf_evsel__read_on_cpu\n");
-               goto out_close_fd;
-       }
-
-       if (evsel->counts->cpu[0].val != nr_open_calls) {
-               pr_debug("perf_evsel__read_on_cpu: expected to intercept %d calls, got %" PRIu64 "\n",
-                        nr_open_calls, evsel->counts->cpu[0].val);
-               goto out_close_fd;
-       }
-
-       err = 0;
-out_close_fd:
-       perf_evsel__close_fd(evsel, 1, threads->nr);
-out_evsel_delete:
-       perf_evsel__delete(evsel);
-out_thread_map_delete:
-       thread_map__delete(threads);
-       return err;
-}
diff --git a/tools/perf/tests/openat-syscall-all-cpus.c b/tools/perf/tests/openat-syscall-all-cpus.c

new file mode 100644 (file)

index 0000000..9a7a116
--- /dev/null
+++ b/tools/perf/tests/openat-syscall-all-cpus.c
@@ -0,0 +1,116 @@
+#include "evsel.h"
+#include "tests.h"
+#include "thread_map.h"
+#include "cpumap.h"
+#include "debug.h"
+#include "stat.h"
+
+int test__openat_syscall_event_on_all_cpus(void)
+{
+       int err = -1, fd, cpu;
+       struct cpu_map *cpus;
+       struct perf_evsel *evsel;
+       unsigned int nr_openat_calls = 111, i;
+       cpu_set_t cpu_set;
+       struct thread_map *threads = thread_map__new(-1, getpid(), UINT_MAX);
+       char sbuf[STRERR_BUFSIZE];
+
+       if (threads == NULL) {
+               pr_debug("thread_map__new\n");
+               return -1;
+       }
+
+       cpus = cpu_map__new(NULL);
+       if (cpus == NULL) {
+               pr_debug("cpu_map__new\n");
+               goto out_thread_map_delete;
+       }
+
+       CPU_ZERO(&cpu_set);
+
+       evsel = perf_evsel__newtp("syscalls", "sys_enter_openat");
+       if (evsel == NULL) {
+               if (tracefs_configured())
+                       pr_debug("is tracefs mounted on /sys/kernel/tracing?\n");
+               else if (debugfs_configured())
+                       pr_debug("is debugfs mounted on /sys/kernel/debug?\n");
+               else
+                       pr_debug("Neither tracefs or debugfs is enabled in this kernel\n");
+               goto out_thread_map_delete;
+       }
+
+       if (perf_evsel__open(evsel, cpus, threads) < 0) {
+               pr_debug("failed to open counter: %s, "
+                        "tweak /proc/sys/kernel/perf_event_paranoid?\n",
+                        strerror_r(errno, sbuf, sizeof(sbuf)));
+               goto out_evsel_delete;
+       }
+
+       for (cpu = 0; cpu < cpus->nr; ++cpu) {
+               unsigned int ncalls = nr_openat_calls + cpu;
+               /*
+                * XXX eventually lift this restriction in a way that
+                * keeps perf building on older glibc installations
+                * without CPU_ALLOC. 1024 cpus in 2010 still seems
+                * a reasonable upper limit tho :-)
+                */
+               if (cpus->map[cpu] >= CPU_SETSIZE) {
+                       pr_debug("Ignoring CPU %d\n", cpus->map[cpu]);
+                       continue;
+               }
+
+               CPU_SET(cpus->map[cpu], &cpu_set);
+               if (sched_setaffinity(0, sizeof(cpu_set), &cpu_set) < 0) {
+                       pr_debug("sched_setaffinity() failed on CPU %d: %s ",
+                                cpus->map[cpu],
+                                strerror_r(errno, sbuf, sizeof(sbuf)));
+                       goto out_close_fd;
+               }
+               for (i = 0; i < ncalls; ++i) {
+                       fd = openat(0, "/etc/passwd", O_RDONLY);
+                       close(fd);
+               }
+               CPU_CLR(cpus->map[cpu], &cpu_set);
+       }
+
+       /*
+        * Here we need to explicitly preallocate the counts, as if
+        * we use the auto allocation it will allocate just for 1 cpu,
+        * as we start by cpu 0.
+        */
+       if (perf_evsel__alloc_counts(evsel, cpus->nr) < 0) {
+               pr_debug("perf_evsel__alloc_counts(ncpus=%d)\n", cpus->nr);
+               goto out_close_fd;
+       }
+
+       err = 0;
+
+       for (cpu = 0; cpu < cpus->nr; ++cpu) {
+               unsigned int expected;
+
+               if (cpus->map[cpu] >= CPU_SETSIZE)
+                       continue;
+
+               if (perf_evsel__read_on_cpu(evsel, cpu, 0) < 0) {
+                       pr_debug("perf_evsel__read_on_cpu\n");
+                       err = -1;
+                       break;
+               }
+
+               expected = nr_openat_calls + cpu;
+               if (evsel->counts->cpu[cpu].val != expected) {
+                       pr_debug("perf_evsel__read_on_cpu: expected to intercept %d calls on cpu %d, got %" PRIu64 "\n",
+                                expected, cpus->map[cpu], evsel->counts->cpu[cpu].val);
+                       err = -1;
+               }
+       }
+
+       perf_evsel__free_counts(evsel);
+out_close_fd:
+       perf_evsel__close_fd(evsel, 1, threads->nr);
+out_evsel_delete:
+       perf_evsel__delete(evsel);
+out_thread_map_delete:
+       thread_map__delete(threads);
+       return err;
+}
diff --git a/tools/perf/tests/openat-syscall-tp-fields.c b/tools/perf/tests/openat-syscall-tp-fields.c

new file mode 100644 (file)

index 0000000..6245221
--- /dev/null
+++ b/tools/perf/tests/openat-syscall-tp-fields.c
@@ -0,0 +1,121 @@
+#include "perf.h"
+#include "evlist.h"
+#include "evsel.h"
+#include "thread_map.h"
+#include "tests.h"
+#include "debug.h"
+
+int test__syscall_openat_tp_fields(void)
+{
+       struct record_opts opts = {
+               .target = {
+                       .uid = UINT_MAX,
+                       .uses_mmap = true,
+               },
+               .no_buffering = true,
+               .freq         = 1,
+               .mmap_pages   = 256,
+               .raw_samples  = true,
+       };
+       const char *filename = "/etc/passwd";
+       int flags = O_RDONLY | O_DIRECTORY;
+       struct perf_evlist *evlist = perf_evlist__new();
+       struct perf_evsel *evsel;
+       int err = -1, i, nr_events = 0, nr_polls = 0;
+       char sbuf[STRERR_BUFSIZE];
+
+       if (evlist == NULL) {
+               pr_debug("%s: perf_evlist__new\n", __func__);
+               goto out;
+       }
+
+       evsel = perf_evsel__newtp("syscalls", "sys_enter_openat");
+       if (evsel == NULL) {
+               pr_debug("%s: perf_evsel__newtp\n", __func__);
+               goto out_delete_evlist;
+       }
+
+       perf_evlist__add(evlist, evsel);
+
+       err = perf_evlist__create_maps(evlist, &opts.target);
+       if (err < 0) {
+               pr_debug("%s: perf_evlist__create_maps\n", __func__);
+               goto out_delete_evlist;
+       }
+
+       perf_evsel__config(evsel, &opts);
+
+       evlist->threads->map[0] = getpid();
+
+       err = perf_evlist__open(evlist);
+       if (err < 0) {
+               pr_debug("perf_evlist__open: %s\n",
+                        strerror_r(errno, sbuf, sizeof(sbuf)));
+               goto out_delete_evlist;
+       }
+
+       err = perf_evlist__mmap(evlist, UINT_MAX, false);
+       if (err < 0) {
+               pr_debug("perf_evlist__mmap: %s\n",
+                        strerror_r(errno, sbuf, sizeof(sbuf)));
+               goto out_delete_evlist;
+       }
+
+       perf_evlist__enable(evlist);
+
+       /*
+        * Generate the event:
+        */
+       openat(AT_FDCWD, filename, flags);
+
+       while (1) {
+               int before = nr_events;
+
+               for (i = 0; i < evlist->nr_mmaps; i++) {
+                       union perf_event *event;
+
+                       while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
+                               const u32 type = event->header.type;
+                               int tp_flags;
+                               struct perf_sample sample;
+
+                               ++nr_events;
+
+                               if (type != PERF_RECORD_SAMPLE) {
+                                       perf_evlist__mmap_consume(evlist, i);
+                                       continue;
+                               }
+
+                               err = perf_evsel__parse_sample(evsel, event, &sample);
+                               if (err) {
+                                       pr_err("Can't parse sample, err = %d\n", err);
+                                       goto out_delete_evlist;
+                               }
+
+                               tp_flags = perf_evsel__intval(evsel, &sample, "flags");
+
+                               if (flags != tp_flags) {
+                                       pr_debug("%s: Expected flags=%#x, got %#x\n",
+                                                __func__, flags, tp_flags);
+                                       goto out_delete_evlist;
+                               }
+
+                               goto out_ok;
+                       }
+               }
+
+               if (nr_events == before)
+                       perf_evlist__poll(evlist, 10);
+
+               if (++nr_polls > 5) {
+                       pr_debug("%s: no events!\n", __func__);
+                       goto out_delete_evlist;
+               }
+       }
+out_ok:
+       err = 0;
+out_delete_evlist:
+       perf_evlist__delete(evlist);
+out:
+       return err;
+}
diff --git a/tools/perf/tests/openat-syscall.c b/tools/perf/tests/openat-syscall.c

new file mode 100644 (file)

index 0000000..9f9491b
--- /dev/null
+++ b/tools/perf/tests/openat-syscall.c
@@ -0,0 +1,61 @@
+#include "thread_map.h"
+#include "evsel.h"
+#include "debug.h"
+#include "tests.h"
+
+int test__openat_syscall_event(void)
+{
+       int err = -1, fd;
+       struct perf_evsel *evsel;
+       unsigned int nr_openat_calls = 111, i;
+       struct thread_map *threads = thread_map__new(-1, getpid(), UINT_MAX);
+       char sbuf[STRERR_BUFSIZE];
+
+       if (threads == NULL) {
+               pr_debug("thread_map__new\n");
+               return -1;
+       }
+
+       evsel = perf_evsel__newtp("syscalls", "sys_enter_openat");
+       if (evsel == NULL) {
+               if (tracefs_configured())
+                       pr_debug("is tracefs mounted on /sys/kernel/tracing?\n");
+               else if (debugfs_configured())
+                       pr_debug("is debugfs mounted on /sys/kernel/debug?\n");
+               else
+                       pr_debug("Neither tracefs or debugfs is enabled in this kernel\n");
+               goto out_thread_map_delete;
+       }
+
+       if (perf_evsel__open_per_thread(evsel, threads) < 0) {
+               pr_debug("failed to open counter: %s, "
+                        "tweak /proc/sys/kernel/perf_event_paranoid?\n",
+                        strerror_r(errno, sbuf, sizeof(sbuf)));
+               goto out_evsel_delete;
+       }
+
+       for (i = 0; i < nr_openat_calls; ++i) {
+               fd = openat(0, "/etc/passwd", O_RDONLY);
+               close(fd);
+       }
+
+       if (perf_evsel__read_on_cpu(evsel, 0, 0) < 0) {
+               pr_debug("perf_evsel__read_on_cpu\n");
+               goto out_close_fd;
+       }
+
+       if (evsel->counts->cpu[0].val != nr_openat_calls) {
+               pr_debug("perf_evsel__read_on_cpu: expected to intercept %d calls, got %" PRIu64 "\n",
+                        nr_openat_calls, evsel->counts->cpu[0].val);
+               goto out_close_fd;
+       }
+
+       err = 0;
+out_close_fd:
+       perf_evsel__close_fd(evsel, 1, threads->nr);
+out_evsel_delete:
+       perf_evsel__delete(evsel);
+out_thread_map_delete:
+       thread_map__delete(threads);
+       return err;
+}
diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c

index 3de744961739c2c1502e0c0367c357b2f39c90a6..d76963f7ad3d4a0a117af25f69bcc364d475fa11 100644 (file)
--- a/tools/perf/tests/parse-events.c
+++ b/tools/perf/tests/parse-events.c
@@ -427,7 +427,7 @@ static int test__checkevent_list(struct perf_evlist *evlist)
         TEST_ASSERT_VAL("wrong exclude_hv", !evsel->attr.exclude_hv);
         TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
  
-       /* syscalls:sys_enter_open:k */
+       /* syscalls:sys_enter_openat:k */
         evsel = perf_evsel__next(evsel);
         TEST_ASSERT_VAL("wrong type", PERF_TYPE_TRACEPOINT == evsel->attr.type);
         TEST_ASSERT_VAL("wrong sample_type",
@@ -665,7 +665,7 @@ static int test__group3(struct perf_evlist *evlist __maybe_unused)
         TEST_ASSERT_VAL("wrong number of entries", 5 == evlist->nr_entries);
         TEST_ASSERT_VAL("wrong number of groups", 2 == evlist->nr_groups);
  
-       /* group1 syscalls:sys_enter_open:H */
+       /* group1 syscalls:sys_enter_openat:H */
         evsel = leader = perf_evlist__first(evlist);
         TEST_ASSERT_VAL("wrong type", PERF_TYPE_TRACEPOINT == evsel->attr.type);
         TEST_ASSERT_VAL("wrong sample_type",
@@ -1293,7 +1293,7 @@ struct evlist_test {
  
  static struct evlist_test test__events[] = {
         {
-               .name  = "syscalls:sys_enter_open",
+               .name  = "syscalls:sys_enter_openat",
                 .check = test__checkevent_tracepoint,
                 .id    = 0,
         },
@@ -1353,7 +1353,7 @@ static struct evlist_test test__events[] = {
                 .id    = 11,
         },
         {
-               .name  = "syscalls:sys_enter_open:k",
+               .name  = "syscalls:sys_enter_openat:k",
                 .check = test__checkevent_tracepoint_modifier,
                 .id    = 12,
         },
@@ -1408,7 +1408,7 @@ static struct evlist_test test__events[] = {
                 .id    = 22,
         },
         {
-               .name  = "r1,syscalls:sys_enter_open:k,1:1:hp",
+               .name  = "r1,syscalls:sys_enter_openat:k,1:1:hp",
                 .check = test__checkevent_list,
                 .id    = 23,
         },
@@ -1443,7 +1443,7 @@ static struct evlist_test test__events[] = {
                 .id    = 29,
         },
         {
-               .name  = "group1{syscalls:sys_enter_open:H,cycles:kppp},group2{cycles,1:3}:G,instructions:u",
+               .name  = "group1{syscalls:sys_enter_openat:H,cycles:kppp},group2{cycles,1:3}:G,instructions:u",
                 .check = test__group3,
                 .id    = 30,
         },
@@ -1571,7 +1571,7 @@ static int test_event(struct evlist_test *e)
         if (evlist == NULL)
                 return -ENOMEM;
  
-       ret = parse_events(evlist, e->name);
+       ret = parse_events(evlist, e->name, NULL);
         if (ret) {
                 pr_debug("failed to parse event '%s', err %d\n",
                          e->name, ret);
diff --git a/tools/perf/tests/perf-time-to-tsc.c b/tools/perf/tests/perf-time-to-tsc.c

index f238442b238a297d11e0622f275bf847e5b7db94..5f49484f1abc03bed80d035c508501914a577caf 100644 (file)
--- a/tools/perf/tests/perf-time-to-tsc.c
+++ b/tools/perf/tests/perf-time-to-tsc.c
@@ -68,7 +68,7 @@ int test__perf_time_to_tsc(void)
  
         perf_evlist__set_maps(evlist, cpus, threads);
  
-       CHECK__(parse_events(evlist, "cycles:u"));
+       CHECK__(parse_events(evlist, "cycles:u", NULL));
  
         perf_evlist__config(evlist, &opts);
  
diff --git a/tools/perf/tests/pmu.c b/tools/perf/tests/pmu.c

index eeb68bb1972d44e41bafa5fc10809700e4afc630..faa04e9d5d5fc751a1ac8082522fb045f108a060 100644 (file)
--- a/tools/perf/tests/pmu.c
+++ b/tools/perf/tests/pmu.c
@@ -152,7 +152,8 @@ int test__pmu(void)
                 if (ret)
                         break;
  
-               ret = perf_pmu__config_terms(&formats, &attr, terms, false);
+               ret = perf_pmu__config_terms(&formats, &attr, terms,
+                                            false, NULL);
                 if (ret)
                         break;
  
diff --git a/tools/perf/tests/switch-tracking.c b/tools/perf/tests/switch-tracking.c

index cc68648c7c555210c17c7c3d8d6b61eb14a39e73..0d31403ea593c7d2e689056af1670a18423a39ed 100644 (file)
--- a/tools/perf/tests/switch-tracking.c
+++ b/tools/perf/tests/switch-tracking.c
@@ -347,7 +347,7 @@ int test__switch_tracking(void)
         perf_evlist__set_maps(evlist, cpus, threads);
  
         /* First event */
-       err = parse_events(evlist, "cpu-clock:u");
+       err = parse_events(evlist, "cpu-clock:u", NULL);
         if (err) {
                 pr_debug("Failed to parse event dummy:u\n");
                 goto out_err;
@@ -356,7 +356,7 @@ int test__switch_tracking(void)
         cpu_clocks_evsel = perf_evlist__last(evlist);
  
         /* Second event */
-       err = parse_events(evlist, "cycles:u");
+       err = parse_events(evlist, "cycles:u", NULL);
         if (err) {
                 pr_debug("Failed to parse event cycles:u\n");
                 goto out_err;
@@ -371,7 +371,7 @@ int test__switch_tracking(void)
                 goto out;
         }
  
-       err = parse_events(evlist, sched_switch);
+       err = parse_events(evlist, sched_switch, NULL);
         if (err) {
                 pr_debug("Failed to parse event %s\n", sched_switch);
                 goto out_err;
@@ -401,7 +401,7 @@ int test__switch_tracking(void)
         perf_evsel__set_sample_bit(cycles_evsel, TIME);
  
         /* Fourth event */
-       err = parse_events(evlist, "dummy:u");
+       err = parse_events(evlist, "dummy:u", NULL);
         if (err) {
                 pr_debug("Failed to parse event dummy:u\n");
                 goto out_err;
diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h

index 52758a33f64c5679bb39a2a545320d9a3a44d303..8e5038b48ba8dfe3313d9c508156ae9b4ecb8c5a 100644 (file)
--- a/tools/perf/tests/tests.h
+++ b/tools/perf/tests/tests.h
@@ -9,6 +9,15 @@ do {                                                                    \
         }                                                                \
  } while (0)
  
+#define TEST_ASSERT_EQUAL(text, val, expected)                          \
+do {                                                                    \
+       if (val != expected) {                                           \
+               pr_debug("FAILED %s:%d %s (%d != %d)\n",                 \
+                        __FILE__, __LINE__, text, val, expected);       \
+               return -1;                                               \
+       }                                                                \
+} while (0)
+
  enum {
         TEST_OK   =  0,
         TEST_FAIL = -1,
@@ -17,14 +26,14 @@ enum {
  
  /* Tests */
  int test__vmlinux_matches_kallsyms(void);
-int test__open_syscall_event(void);
-int test__open_syscall_event_on_all_cpus(void);
+int test__openat_syscall_event(void);
+int test__openat_syscall_event_on_all_cpus(void);
  int test__basic_mmap(void);
  int test__PERF_RECORD(void);
  int test__rdpmc(void);
  int test__perf_evsel__roundtrip_name_test(void);
  int test__perf_evsel__tp_sched_test(void);
-int test__syscall_open_tp_fields(void);
+int test__syscall_openat_tp_fields(void);
  int test__pmu(void);
  int test__attr(void);
  int test__dso_data(void);
@@ -53,7 +62,7 @@ int test__fdarray__filter(void);
  int test__fdarray__add(void);
  int test__kmod_path__parse(void);
  
-#if defined(__x86_64__) || defined(__i386__) || defined(__arm__)
+#if defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__)
  #ifdef HAVE_DWARF_UNWIND_SUPPORT
  struct thread;
  struct perf_sample;
diff --git a/tools/perf/tests/thread-mg-share.c b/tools/perf/tests/thread-mg-share.c

index b028499dd3cf0f5ee7530a6c869aaa40ef802521..01fabb19d74607bb9157f0dbddfd160d21c8cebf 100644 (file)
--- a/tools/perf/tests/thread-mg-share.c
+++ b/tools/perf/tests/thread-mg-share.c
@@ -43,7 +43,7 @@ int test__thread_mg_share(void)
                         leader && t1 && t2 && t3 && other);
  
         mg = leader->mg;
-       TEST_ASSERT_VAL("wrong refcnt", mg->refcnt == 4);
+       TEST_ASSERT_EQUAL("wrong refcnt", atomic_read(&mg->refcnt), 4);
  
         /* test the map groups pointer is shared */
         TEST_ASSERT_VAL("map groups don't match", mg == t1->mg);
@@ -58,33 +58,40 @@ int test__thread_mg_share(void)
         other_leader = machine__find_thread(machine, 4, 4);
         TEST_ASSERT_VAL("failed to find other leader", other_leader);
  
+       /*
+        * Ok, now that all the rbtree related operations were done,
+        * let's remove all of them from there so that we can do the
+        * refcounting tests.
+        */
+       machine__remove_thread(machine, leader);
+       machine__remove_thread(machine, t1);
+       machine__remove_thread(machine, t2);
+       machine__remove_thread(machine, t3);
+       machine__remove_thread(machine, other);
+       machine__remove_thread(machine, other_leader);
+
         other_mg = other->mg;
-       TEST_ASSERT_VAL("wrong refcnt", other_mg->refcnt == 2);
+       TEST_ASSERT_EQUAL("wrong refcnt", atomic_read(&other_mg->refcnt), 2);
  
         TEST_ASSERT_VAL("map groups don't match", other_mg == other_leader->mg);
  
         /* release thread group */
-       thread__delete(leader);
-       TEST_ASSERT_VAL("wrong refcnt", mg->refcnt == 3);
+       thread__put(leader);
+       TEST_ASSERT_EQUAL("wrong refcnt", atomic_read(&mg->refcnt), 3);
  
-       thread__delete(t1);
-       TEST_ASSERT_VAL("wrong refcnt", mg->refcnt == 2);
+       thread__put(t1);
+       TEST_ASSERT_EQUAL("wrong refcnt", atomic_read(&mg->refcnt), 2);
  
-       thread__delete(t2);
-       TEST_ASSERT_VAL("wrong refcnt", mg->refcnt == 1);
+       thread__put(t2);
+       TEST_ASSERT_EQUAL("wrong refcnt", atomic_read(&mg->refcnt), 1);
  
-       thread__delete(t3);
+       thread__put(t3);
  
         /* release other group  */
-       thread__delete(other_leader);
-       TEST_ASSERT_VAL("wrong refcnt", other_mg->refcnt == 1);
+       thread__put(other_leader);
+       TEST_ASSERT_EQUAL("wrong refcnt", atomic_read(&other_mg->refcnt), 1);
  
-       thread__delete(other);
-
-       /*
-        * Cannot call machine__delete_threads(machine) now,
-        * because we've already released all the threads.
-        */
+       thread__put(other);
  
         machines__exit(&machines);
         return 0;
diff --git a/tools/perf/tests/vmlinux-kallsyms.c b/tools/perf/tests/vmlinux-kallsyms.c

index 3d9088003a5b6d16da0038d0abbfa1fc427ed65b..b34c5fc829ae2b0da7389bd31649dd0389e40604 100644 (file)
--- a/tools/perf/tests/vmlinux-kallsyms.c
+++ b/tools/perf/tests/vmlinux-kallsyms.c
@@ -23,9 +23,10 @@ int test__vmlinux_matches_kallsyms(void)
         int err = -1;
         struct rb_node *nd;
         struct symbol *sym;
-       struct map *kallsyms_map, *vmlinux_map;
+       struct map *kallsyms_map, *vmlinux_map, *map;
         struct machine kallsyms, vmlinux;
         enum map_type type = MAP__FUNCTION;
+       struct maps *maps = &vmlinux.kmaps.maps[type];
         u64 mem_start, mem_end;
  
         /*
@@ -184,8 +185,8 @@ detour:
  
         pr_info("Maps only in vmlinux:\n");
  
-       for (nd = rb_first(&vmlinux.kmaps.maps[type]); nd; nd = rb_next(nd)) {
-               struct map *pos = rb_entry(nd, struct map, rb_node), *pair;
+       for (map = maps__first(maps); map; map = map__next(map)) {
+               struct map *
                 /*
                  * If it is the kernel, kallsyms is always "[kernel.kallsyms]", while
                  * the kernel will have the path for the vmlinux file being used,
@@ -193,22 +194,22 @@ detour:
                  * both cases.
                  */
                 pair = map_groups__find_by_name(&kallsyms.kmaps, type,
-                                               (pos->dso->kernel ?
-                                                       pos->dso->short_name :
-                                                       pos->dso->name));
+                                               (map->dso->kernel ?
+                                                       map->dso->short_name :
+                                                       map->dso->name));
                 if (pair)
                         pair->priv = 1;
                 else
-                       map__fprintf(pos, stderr);
+                       map__fprintf(map, stderr);
         }
  
         pr_info("Maps in vmlinux with a different name in kallsyms:\n");
  
-       for (nd = rb_first(&vmlinux.kmaps.maps[type]); nd; nd = rb_next(nd)) {
-               struct map *pos = rb_entry(nd, struct map, rb_node), *pair;
+       for (map = maps__first(maps); map; map = map__next(map)) {
+               struct map *pair;
  
-               mem_start = vmlinux_map->unmap_ip(vmlinux_map, pos->start);
-               mem_end = vmlinux_map->unmap_ip(vmlinux_map, pos->end);
+               mem_start = vmlinux_map->unmap_ip(vmlinux_map, map->start);
+               mem_end = vmlinux_map->unmap_ip(vmlinux_map, map->end);
  
                 pair = map_groups__find(&kallsyms.kmaps, type, mem_start);
                 if (pair == NULL || pair->priv)
@@ -217,7 +218,7 @@ detour:
                 if (pair->start == mem_start) {
                         pair->priv = 1;
                         pr_info(" %" PRIx64 "-%" PRIx64 " %" PRIx64 " %s in kallsyms as",
-                               pos->start, pos->end, pos->pgoff, pos->dso->name);
+                               map->start, map->end, map->pgoff, map->dso->name);
                         if (mem_end != pair->end)
                                 pr_info(":\n*%" PRIx64 "-%" PRIx64 " %" PRIx64,
                                         pair->start, pair->end, pair->pgoff);
@@ -228,12 +229,11 @@ detour:
  
         pr_info("Maps only in kallsyms:\n");
  
-       for (nd = rb_first(&kallsyms.kmaps.maps[type]);
-            nd; nd = rb_next(nd)) {
-               struct map *pos = rb_entry(nd, struct map, rb_node);
+       maps = &kallsyms.kmaps.maps[type];
  
-               if (!pos->priv)
-                       map__fprintf(pos, stderr);
+       for (map = maps__first(maps); map; map = map__next(map)) {
+               if (!map->priv)
+                       map__fprintf(map, stderr);
         }
  out:
         machine__exit(&kallsyms);
diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c

index e5250eb2dd57866b1051736767dbe598bb4c78d2..5995a8bd7c6971dc4300f9ecc508135645420e7e 100644 (file)
--- a/tools/perf/ui/browsers/annotate.c
+++ b/tools/perf/ui/browsers/annotate.c
@@ -11,16 +11,21 @@
  #include "../../util/evsel.h"
  #include <pthread.h>
  
+struct disasm_line_samples {
+       double          percent;
+       u64             nr;
+};
+
  struct browser_disasm_line {
-       struct rb_node  rb_node;
-       u32             idx;
-       int             idx_asm;
-       int             jump_sources;
+       struct rb_node                  rb_node;
+       u32                             idx;
+       int                             idx_asm;
+       int                             jump_sources;
         /*
          * actual length of this array is saved on the nr_events field
          * of the struct annotate_browser
          */
-       double          percent[1];
+       struct disasm_line_samples      samples[1];
  };
  
  static struct annotate_browser_opt {
@@ -28,7 +33,8 @@ static struct annotate_browser_opt {
              use_offset,
              jump_arrows,
              show_linenr,
-            show_nr_jumps;
+            show_nr_jumps,
+            show_total_period;
  } annotate_browser__opts = {
         .use_offset     = true,
         .jump_arrows    = true,
@@ -105,15 +111,20 @@ static void annotate_browser__write(struct ui_browser *browser, void *entry, int
         char bf[256];
  
         for (i = 0; i < ab->nr_events; i++) {
-               if (bdl->percent[i] > percent_max)
-                       percent_max = bdl->percent[i];
+               if (bdl->samples[i].percent > percent_max)
+                       percent_max = bdl->samples[i].percent;
         }
  
         if (dl->offset != -1 && percent_max != 0.0) {
                 for (i = 0; i < ab->nr_events; i++) {
-                       ui_browser__set_percent_color(browser, bdl->percent[i],
+                       ui_browser__set_percent_color(browser,
+                                                     bdl->samples[i].percent,
                                                       current_entry);
-                       slsmg_printf("%6.2f ", bdl->percent[i]);
+                       if (annotate_browser__opts.show_total_period)
+                               slsmg_printf("%6" PRIu64 " ",
+                                            bdl->samples[i].nr);
+                       else
+                               slsmg_printf("%6.2f ", bdl->samples[i].percent);
                 }
         } else {
                 ui_browser__set_percent_color(browser, 0, current_entry);
@@ -273,9 +284,9 @@ static int disasm__cmp(struct browser_disasm_line *a,
         int i;
  
         for (i = 0; i < nr_pcnt; i++) {
-               if (a->percent[i] == b->percent[i])
+               if (a->samples[i].percent == b->samples[i].percent)
                         continue;
-               return a->percent[i] < b->percent[i];
+               return a->samples[i].percent < b->samples[i].percent;
         }
         return 0;
  }
@@ -366,14 +377,17 @@ static void annotate_browser__calc_percent(struct annotate_browser *browser,
                 next = disasm__get_next_ip_line(&notes->src->source, pos);
  
                 for (i = 0; i < browser->nr_events; i++) {
-                       bpos->percent[i] = disasm__calc_percent(notes,
+                       u64 nr_samples;
+
+                       bpos->samples[i].percent = disasm__calc_percent(notes,
                                                 evsel->idx + i,
                                                 pos->offset,
                                                 next ? next->offset : len,
-                                               &path);
+                                               &path, &nr_samples);
+                       bpos->samples[i].nr = nr_samples;
  
-                       if (max_percent < bpos->percent[i])
-                               max_percent = bpos->percent[i];
+                       if (max_percent < bpos->samples[i].percent)
+                               max_percent = bpos->samples[i].percent;
                 }
  
                 if (max_percent < 0.01) {
@@ -737,6 +751,7 @@ static int annotate_browser__run(struct annotate_browser *browser,
                 "n             Search next string\n"
                 "o             Toggle disassembler output/simplified view\n"
                 "s             Toggle source code view\n"
+               "t             Toggle total period view\n"
                 "/             Search string\n"
                 "k             Toggle line numbers\n"
                 "r             Run available scripts\n"
@@ -812,6 +827,11 @@ show_sup_ins:
                                 ui_helpline__puts("Actions are only available for 'callq', 'retq' & jump instructions.");
                         }
                         continue;
+               case 't':
+                       annotate_browser__opts.show_total_period =
+                         !annotate_browser__opts.show_total_period;
+                       annotate_browser__update_addr_width(browser);
+                       continue;
                 case K_LEFT:
                 case K_ESC:
                 case 'q':
@@ -832,12 +852,20 @@ out:
  int map_symbol__tui_annotate(struct map_symbol *ms, struct perf_evsel *evsel,
                              struct hist_browser_timer *hbt)
  {
+       /* Set default value for show_total_period.  */
+       annotate_browser__opts.show_total_period =
+         symbol_conf.show_total_period;
+
         return symbol__tui_annotate(ms->sym, ms->map, evsel, hbt);
  }
  
  int hist_entry__tui_annotate(struct hist_entry *he, struct perf_evsel *evsel,
                              struct hist_browser_timer *hbt)
  {
+       /* reset abort key so that it can get Ctrl-C as a key */
+       SLang_reset_tty();
+       SLang_init_tty(0, 0, 0);
+
         return map_symbol__tui_annotate(&he->ms, evsel, hbt);
  }
  
@@ -925,7 +953,8 @@ int symbol__tui_annotate(struct symbol *sym, struct map *map,
  
         if (perf_evsel__is_group_event(evsel)) {
                 nr_pcnt = evsel->nr_members;
-               sizeof_bdl += sizeof(double) * (nr_pcnt - 1);
+               sizeof_bdl += sizeof(struct disasm_line_samples) *
+                 (nr_pcnt - 1);
         }
  
         if (symbol__annotate(sym, map, sizeof_bdl) < 0) {
@@ -1002,6 +1031,7 @@ static struct annotate_config {
         ANNOTATE_CFG(show_linenr),
         ANNOTATE_CFG(show_nr_jumps),
         ANNOTATE_CFG(use_offset),
+       ANNOTATE_CFG(show_total_period),
  };
  
  #undef ANNOTATE_CFG
diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c

index 995b7a8596b1420e9764f08f18326d319f2a0a31..c42adb6000914554bf0d109e02d9ad5cec801313 100644 (file)
--- a/tools/perf/ui/browsers/hists.c
+++ b/tools/perf/ui/browsers/hists.c
@@ -25,6 +25,9 @@ struct hist_browser {
         struct hists        *hists;
         struct hist_entry   *he_selection;
         struct map_symbol   *selection;
+       struct hist_browser_timer *hbt;
+       struct pstack       *pstack;
+       struct perf_session_env *env;
         int                  print_seq;
         bool                 show_dso;
         bool                 show_headers;
@@ -60,7 +63,7 @@ static int hist_browser__get_folding(struct hist_browser *browser)
                 struct hist_entry *he =
                         rb_entry(nd, struct hist_entry, rb_node);
  
-               if (he->ms.unfolded)
+               if (he->unfolded)
                         unfolded_rows += he->nr_rows;
         }
         return unfolded_rows;
@@ -136,24 +139,19 @@ static char tree__folded_sign(bool unfolded)
         return unfolded ? '-' : '+';
  }
  
-static char map_symbol__folded(const struct map_symbol *ms)
-{
-       return ms->has_children ? tree__folded_sign(ms->unfolded) : ' ';
-}
-
  static char hist_entry__folded(const struct hist_entry *he)
  {
-       return map_symbol__folded(&he->ms);
+       return he->has_children ? tree__folded_sign(he->unfolded) : ' ';
  }
  
  static char callchain_list__folded(const struct callchain_list *cl)
  {
-       return map_symbol__folded(&cl->ms);
+       return cl->has_children ? tree__folded_sign(cl->unfolded) : ' ';
  }
  
-static void map_symbol__set_folding(struct map_symbol *ms, bool unfold)
+static void callchain_list__set_folding(struct callchain_list *cl, bool unfold)
  {
-       ms->unfolded = unfold ? ms->has_children : false;
+       cl->unfolded = unfold ? cl->has_children : false;
  }
  
  static int callchain_node__count_rows_rb_tree(struct callchain_node *node)
@@ -189,7 +187,7 @@ static int callchain_node__count_rows(struct callchain_node *node)
  
         list_for_each_entry(chain, &node->val, list) {
                 ++n;
-               unfolded = chain->ms.unfolded;
+               unfolded = chain->unfolded;
         }
  
         if (unfolded)
@@ -211,15 +209,27 @@ static int callchain__count_rows(struct rb_root *chain)
         return n;
  }
  
-static bool map_symbol__toggle_fold(struct map_symbol *ms)
+static bool hist_entry__toggle_fold(struct hist_entry *he)
  {
-       if (!ms)
+       if (!he)
                 return false;
  
-       if (!ms->has_children)
+       if (!he->has_children)
                 return false;
  
-       ms->unfolded = !ms->unfolded;
+       he->unfolded = !he->unfolded;
+       return true;
+}
+
+static bool callchain_list__toggle_fold(struct callchain_list *cl)
+{
+       if (!cl)
+               return false;
+
+       if (!cl->has_children)
+               return false;
+
+       cl->unfolded = !cl->unfolded;
         return true;
  }
  
@@ -235,10 +245,10 @@ static void callchain_node__init_have_children_rb_tree(struct callchain_node *no
                 list_for_each_entry(chain, &child->val, list) {
                         if (first) {
                                 first = false;
-                               chain->ms.has_children = chain->list.next != &child->val ||
+                               chain->has_children = chain->list.next != &child->val ||
                                                          !RB_EMPTY_ROOT(&child->rb_root);
                         } else
-                               chain->ms.has_children = chain->list.next == &child->val &&
+                               chain->has_children = chain->list.next == &child->val &&
                                                          !RB_EMPTY_ROOT(&child->rb_root);
                 }
  
@@ -252,11 +262,11 @@ static void callchain_node__init_have_children(struct callchain_node *node,
         struct callchain_list *chain;
  
         chain = list_entry(node->val.next, struct callchain_list, list);
-       chain->ms.has_children = has_sibling;
+       chain->has_children = has_sibling;
  
         if (!list_empty(&node->val)) {
                 chain = list_entry(node->val.prev, struct callchain_list, list);
-               chain->ms.has_children = !RB_EMPTY_ROOT(&node->rb_root);
+               chain->has_children = !RB_EMPTY_ROOT(&node->rb_root);
         }
  
         callchain_node__init_have_children_rb_tree(node);
@@ -276,7 +286,7 @@ static void callchain__init_have_children(struct rb_root *root)
  static void hist_entry__init_have_children(struct hist_entry *he)
  {
         if (!he->init_have_children) {
-               he->ms.has_children = !RB_EMPTY_ROOT(&he->sorted_chain);
+               he->has_children = !RB_EMPTY_ROOT(&he->sorted_chain);
                 callchain__init_have_children(&he->sorted_chain);
                 he->init_have_children = true;
         }
@@ -284,14 +294,22 @@ static void hist_entry__init_have_children(struct hist_entry *he)
  
  static bool hist_browser__toggle_fold(struct hist_browser *browser)
  {
-       if (map_symbol__toggle_fold(browser->selection)) {
-               struct hist_entry *he = browser->he_selection;
+       struct hist_entry *he = browser->he_selection;
+       struct map_symbol *ms = browser->selection;
+       struct callchain_list *cl = container_of(ms, struct callchain_list, ms);
+       bool has_children;
  
+       if (ms == &he->ms)
+               has_children = hist_entry__toggle_fold(he);
+       else
+               has_children = callchain_list__toggle_fold(cl);
+
+       if (has_children) {
                 hist_entry__init_have_children(he);
                 browser->b.nr_entries -= he->nr_rows;
                 browser->nr_callchain_rows -= he->nr_rows;
  
-               if (he->ms.unfolded)
+               if (he->unfolded)
                         he->nr_rows = callchain__count_rows(&he->sorted_chain);
                 else
                         he->nr_rows = 0;
@@ -318,8 +336,8 @@ static int callchain_node__set_folding_rb_tree(struct callchain_node *node, bool
  
                 list_for_each_entry(chain, &child->val, list) {
                         ++n;
-                       map_symbol__set_folding(&chain->ms, unfold);
-                       has_children = chain->ms.has_children;
+                       callchain_list__set_folding(chain, unfold);
+                       has_children = chain->has_children;
                 }
  
                 if (has_children)
@@ -337,8 +355,8 @@ static int callchain_node__set_folding(struct callchain_node *node, bool unfold)
  
         list_for_each_entry(chain, &node->val, list) {
                 ++n;
-               map_symbol__set_folding(&chain->ms, unfold);
-               has_children = chain->ms.has_children;
+               callchain_list__set_folding(chain, unfold);
+               has_children = chain->has_children;
         }
  
         if (has_children)
@@ -363,9 +381,9 @@ static int callchain__set_folding(struct rb_root *chain, bool unfold)
  static void hist_entry__set_folding(struct hist_entry *he, bool unfold)
  {
         hist_entry__init_have_children(he);
-       map_symbol__set_folding(&he->ms, unfold);
+       he->unfolded = unfold ? he->has_children : false;
  
-       if (he->ms.has_children) {
+       if (he->has_children) {
                 int n = callchain__set_folding(&he->sorted_chain, unfold);
                 he->nr_rows = unfold ? n : 0;
         } else
@@ -406,11 +424,11 @@ static void ui_browser__warn_lost_events(struct ui_browser *browser)
                 "Or reduce the sampling frequency.");
  }
  
-static int hist_browser__run(struct hist_browser *browser,
-                            struct hist_browser_timer *hbt)
+static int hist_browser__run(struct hist_browser *browser, const char *help)
  {
         int key;
         char title[160];
+       struct hist_browser_timer *hbt = browser->hbt;
         int delay_secs = hbt ? hbt->refresh : 0;
  
         browser->b.entries = &browser->hists->entries;
@@ -418,8 +436,7 @@ static int hist_browser__run(struct hist_browser *browser,
  
         hists__browser_title(browser->hists, hbt, title, sizeof(title));
  
-       if (ui_browser__show(&browser->b, title,
-                            "Press '?' for help on key bindings") < 0)
+       if (ui_browser__show(&browser->b, title, help) < 0)
                 return -1;
  
         while (1) {
@@ -1016,7 +1033,7 @@ do_offset:
         if (offset > 0) {
                 do {
                         h = rb_entry(nd, struct hist_entry, rb_node);
-                       if (h->ms.unfolded) {
+                       if (h->unfolded) {
                                 u16 remaining = h->nr_rows - h->row_offset;
                                 if (offset > remaining) {
                                         offset -= remaining;
@@ -1037,7 +1054,7 @@ do_offset:
         } else if (offset < 0) {
                 while (1) {
                         h = rb_entry(nd, struct hist_entry, rb_node);
-                       if (h->ms.unfolded) {
+                       if (h->unfolded) {
                                 if (first) {
                                         if (-offset > h->row_offset) {
                                                 offset += h->row_offset;
@@ -1074,7 +1091,7 @@ do_offset:
                                  * row_offset at its last entry.
                                  */
                                 h = rb_entry(nd, struct hist_entry, rb_node);
-                               if (h->ms.unfolded)
+                               if (h->unfolded)
                                         h->row_offset = h->nr_rows;
                                 break;
                         }
@@ -1195,7 +1212,9 @@ static int hist_browser__dump(struct hist_browser *browser)
         return 0;
  }
  
-static struct hist_browser *hist_browser__new(struct hists *hists)
+static struct hist_browser *hist_browser__new(struct hists *hists,
+                                             struct hist_browser_timer *hbt,
+                                             struct perf_session_env *env)
  {
         struct hist_browser *browser = zalloc(sizeof(*browser));
  
@@ -1206,6 +1225,8 @@ static struct hist_browser *hist_browser__new(struct hists *hists)
                 browser->b.seek = ui_browser__hists_seek;
                 browser->b.use_navkeypressed = true;
                 browser->show_headers = symbol_conf.show_hist_headers;
+               browser->hbt = hbt;
+               browser->env = env;
         }
  
         return browser;
@@ -1395,6 +1416,257 @@ close_file_and_continue:
         return ret;
  }
  
+/*
+ * One selectable entry of the hists browser popup menu: the handler to run
+ * plus the context (thread, DSO, map/symbol) that handler operates on.
+ */
+struct popup_action {
+       /* thread used by the thread-zoom and run-script handlers */
+       struct thread           *thread;
+       /* DSO used by the DSO-zoom handler */
+       struct dso              *dso;
+       /* map/symbol used by the annotate, browse-map and run-script handlers */
+       struct map_symbol       ms;
+
+       /* handler invoked for this entry; it is passed the entry itself */
+       int (*fn)(struct hist_browser *browser, struct popup_action *act);
+};
+
+/*
+ * Open the TUI annotation browser for the map/symbol stored in @act->ms.
+ *
+ * Returns 1 when annotation was left with 'q' or Ctrl-C and the currently
+ * selected entry has branch info, which tells the popup loop to show the
+ * menu again so the other branch end can be annotated.  Returns 0 in every
+ * other case, including when perf_session_env__lookup_objdump() reports a
+ * non-zero result or the symbol has no annotation source.
+ */
+static int
+do_annotate(struct hist_browser *browser, struct popup_action *act)
+{
+       struct perf_evsel *evsel;
+       struct annotation *notes;
+       struct hist_entry *he;
+       int err;
+
+       if (!objdump_path && perf_session_env__lookup_objdump(browser->env))
+               return 0;
+
+       notes = symbol__annotation(act->ms.sym);
+       if (!notes->src)
+               return 0;
+
+       evsel = hists_to_evsel(browser->hists);
+       err = map_symbol__tui_annotate(&act->ms, evsel, browser->hbt);
+       he = hist_browser__selected_entry(browser);
+       /*
+        * Offer the option to annotate the other branch source or target
+        * (if they exist) when returning from annotate.  The selected entry
+        * may be NULL, so check it before looking at its branch info.
+        */
+       if ((err == 'q' || err == CTRL('c')) && he != NULL && he->branch_info)
+               return 1;
+
+       ui_browser__update_nr_entries(&browser->b, browser->hists->nr_entries);
+       if (err)
+               ui_browser__handle_resize(&browser->b);
+       return 0;
+}
+
+/*
+ * Fill @act/@optstr with an "Annotate <symbol>" popup entry.
+ *
+ * No entry is added when there is no symbol, when the map's DSO is flagged
+ * annotate_warned, when the symbol has no annotation source (do_annotate()
+ * would do nothing for it), or when the label cannot be allocated.
+ *
+ * Returns 1 if an entry was added, 0 otherwise, so the caller can add the
+ * result straight to its option count.
+ */
+static int
+add_annotate_opt(struct hist_browser *browser __maybe_unused,
+                struct popup_action *act, char **optstr,
+                struct map *map, struct symbol *sym)
+{
+       if (sym == NULL || map->dso->annotate_warned)
+               return 0;
+
+       /* don't offer a menu entry that do_annotate() would silently ignore */
+       if (symbol__annotation(sym)->src == NULL)
+               return 0;
+
+       if (asprintf(optstr, "Annotate %s", sym->name) < 0)
+               return 0;
+
+       act->ms.map = map;
+       act->ms.sym = sym;
+       act->fn = do_annotate;
+       return 1;
+}
+
+/*
+ * Toggle the thread filter of the browser's hists.
+ *
+ * With a filter active it is removed (zoom out): the filter is taken off
+ * the browser's pstack, the thread reference is dropped with thread__zput()
+ * and the helpline is popped.  Otherwise @act->thread becomes the new
+ * filter (zoom in).  The hists are re-filtered and the browser reset after
+ * either change.  Always returns 0.
+ */
+static int
+do_zoom_thread(struct hist_browser *browser, struct popup_action *act)
+{
+       struct thread *thread = act->thread;
+
+       if (browser->hists->thread_filter) {
+               pstack__remove(browser->pstack, &browser->hists->thread_filter);
+               perf_hpp__set_elide(HISTC_THREAD, false);
+               thread__zput(browser->hists->thread_filter);
+               ui_helpline__pop();
+       } else {
+               /*
+                * Nothing to zoom into, e.g. 't' pressed with no entry
+                * selected; same guard as do_zoom_dso().
+                */
+               if (thread == NULL)
+                       return 0;
+               ui_helpline__fpush("To zoom out press <- or -> + \"Zoom out of %s(%d) thread\"",
+                                  thread->comm_set ? thread__comm_str(thread) : "",
+                                  thread->tid);
+               browser->hists->thread_filter = thread__get(thread);
+               perf_hpp__set_elide(HISTC_THREAD, false);
+               pstack__push(browser->pstack, &browser->hists->thread_filter);
+       }
+
+       hists__filter_by_thread(browser->hists);
+       hist_browser__reset(browser);
+       return 0;
+}
+
+/*
+ * Add a "Zoom into/out of <comm>(<tid>) thread" popup entry for @thread.
+ * The wording depends on whether a thread filter is currently set.
+ * Returns 1 when the entry was added, 0 if @thread is NULL or the label
+ * could not be built.
+ */
+static int
+add_thread_opt(struct hist_browser *browser, struct popup_action *act,
+              char **optstr, struct thread *thread)
+{
+       const char *direction;
+       const char *comm;
+
+       if (!thread)
+               return 0;
+
+       direction = browser->hists->thread_filter ? "out of" : "into";
+       comm = thread->comm_set ? thread__comm_str(thread) : "";
+
+       if (asprintf(optstr, "Zoom %s %s(%d) thread", direction, comm,
+                    thread->tid) < 0)
+               return 0;
+
+       act->thread = thread;
+       act->fn = do_zoom_thread;
+       return 1;
+}
+
+/*
+ * Toggle the DSO filter of the browser's hists.
+ *
+ * An active filter is cleared (zoom out); otherwise @act->dso, if any, is
+ * installed as the filter (zoom in) and the DSO column is elided.  With no
+ * filter set and no DSO given nothing happens.  Always returns 0.
+ */
+static int
+do_zoom_dso(struct hist_browser *browser, struct popup_action *act)
+{
+       struct hists *hists = browser->hists;
+       struct dso *target = act->dso;
+
+       if (hists->dso_filter != NULL) {
+               /* a filter is set: this request is a zoom out */
+               pstack__remove(browser->pstack, &hists->dso_filter);
+               perf_hpp__set_elide(HISTC_DSO, false);
+               hists->dso_filter = NULL;
+               ui_helpline__pop();
+       } else if (target != NULL) {
+               ui_helpline__fpush("To zoom out press <- or -> + \"Zoom out of %s DSO\"",
+                                  target->kernel ? "the Kernel" : target->short_name);
+               hists->dso_filter = target;
+               perf_hpp__set_elide(HISTC_DSO, true);
+               pstack__push(browser->pstack, &hists->dso_filter);
+       } else {
+               return 0;
+       }
+
+       hists__filter_by_dso(hists);
+       hist_browser__reset(browser);
+       return 0;
+}
+
+/*
+ * Add a "Zoom into/out of <dso> DSO" popup entry for @dso; kernel DSOs are
+ * labelled "the Kernel".  Returns 1 when the entry was added, 0 if @dso is
+ * NULL or the label could not be built.
+ */
+static int
+add_dso_opt(struct hist_browser *browser, struct popup_action *act,
+           char **optstr, struct dso *dso)
+{
+       const char *direction;
+       const char *name;
+
+       if (!dso)
+               return 0;
+
+       direction = browser->hists->dso_filter ? "out of" : "into";
+       name = dso->kernel ? "the Kernel" : dso->short_name;
+
+       if (asprintf(optstr, "Zoom %s %s DSO", direction, name) < 0)
+               return 0;
+
+       act->dso = dso;
+       act->fn = do_zoom_dso;
+       return 1;
+}
+
+/* Popup handler: run map__browse() on the map stored in @act->ms. */
+static int
+do_browse_map(struct hist_browser *browser __maybe_unused,
+             struct popup_action *act)
+{
+       struct map *map = act->ms.map;
+
+       map__browse(map);
+       return 0;
+}
+
+/*
+ * Add the "Browse map details" popup entry for @map.  Returns 1 when the
+ * entry was added, 0 if @map is NULL or the label could not be built.
+ */
+static int
+add_map_opt(struct hist_browser *browser __maybe_unused,
+           struct popup_action *act, char **optstr, struct map *map)
+{
+       if (!map || asprintf(optstr, "Browse map details") < 0)
+               return 0;
+
+       act->fn = do_browse_map;
+       act->ms.map = map;
+       return 1;
+}
+
+/*
+ * Popup handler: call script_browse() with an option string built from
+ * @act.  A thread in @act gives " -c <comm> "; failing that, a symbol
+ * gives " -S <name> "; with neither the option string is empty.
+ */
+static int
+do_run_script(struct hist_browser *browser __maybe_unused,
+             struct popup_action *act)
+{
+       char script_opt[64] = { 0 };
+
+       if (act->thread != NULL)
+               scnprintf(script_opt, sizeof(script_opt), " -c %s ",
+                         thread__comm_str(act->thread));
+       else if (act->ms.sym != NULL)
+               scnprintf(script_opt, sizeof(script_opt), " -S %s ",
+                         act->ms.sym->name);
+
+       script_browse(script_opt);
+       return 0;
+}
+
+/*
+ * Add a "Run scripts ..." popup entry.  The label names @thread when one is
+ * given, else @sym when one is given, else it covers all samples.  Both
+ * @thread and @sym are stored in @act as passed.  Returns 1 when the entry
+ * was added, 0 if the label could not be built.
+ */
+static int
+add_script_opt(struct hist_browser *browser __maybe_unused,
+              struct popup_action *act, char **optstr,
+              struct thread *thread, struct symbol *sym)
+{
+       int len;
+
+       if (thread != NULL)
+               len = asprintf(optstr, "Run scripts for samples of thread [%s]",
+                              thread__comm_str(thread));
+       else if (sym != NULL)
+               len = asprintf(optstr, "Run scripts for samples of symbol [%s]",
+                              sym->name);
+       else
+               len = asprintf(optstr, "Run scripts for all samples");
+
+       if (len < 0)
+               return 0;
+
+       act->fn = do_run_script;
+       act->thread = thread;
+       act->ms.sym = sym;
+       return 1;
+}
+
+/*
+ * Popup handler: call switch_data_file().  Returns K_SWITCH_INPUT_DATA when
+ * it returns zero; otherwise warns the user and returns 0.
+ */
+static int
+do_switch_data(struct hist_browser *browser __maybe_unused,
+              struct popup_action *act __maybe_unused)
+{
+       if (!switch_data_file())
+               return K_SWITCH_INPUT_DATA;
+
+       ui__warning("Won't switch the data files due to\n"
+                   "no valid data file get selected!\n");
+       return 0;
+}
+
+/*
+ * Add the "Switch to another data file in PWD" popup entry.  Returns 1
+ * when the entry was added, 0 if is_report_browser() is false for the
+ * browser's timer or the label could not be built.
+ */
+static int
+add_switch_opt(struct hist_browser *browser,
+              struct popup_action *act, char **optstr)
+{
+       if (!is_report_browser(browser->hbt) ||
+           asprintf(optstr, "Switch to another data file in PWD") < 0)
+               return 0;
+
+       act->fn = do_switch_data;
+       return 1;
+}
+
+/* Popup handler for the "Exit" entry: does nothing and returns 0. */
+static int
+do_exit_browser(struct hist_browser *browser __maybe_unused,
+               struct popup_action *act __maybe_unused)
+{
+       return 0;
+}
+
+/*
+ * Add the "Exit" popup entry.  Returns 1 when the entry was added, 0 if
+ * the label could not be built.
+ */
+static int
+add_exit_opt(struct hist_browser *browser __maybe_unused,
+            struct popup_action *act, char **optstr)
+{
+       int len = asprintf(optstr, "Exit");
+
+       if (len < 0)
+               return 0;
+
+       act->fn = do_exit_browser;
+       return 1;
+}
+
  static void hist_browser__update_nr_entries(struct hist_browser *hb)
  {
         u64 nr_entries = 0;
@@ -1421,14 +1693,14 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
                                     struct perf_session_env *env)
  {
         struct hists *hists = evsel__hists(evsel);
-       struct hist_browser *browser = hist_browser__new(hists);
+       struct hist_browser *browser = hist_browser__new(hists, hbt, env);
         struct branch_info *bi;
-       struct pstack *fstack;
-       char *options[16];
+#define MAX_OPTIONS  16
+       char *options[MAX_OPTIONS];
+       struct popup_action actions[MAX_OPTIONS];
         int nr_options = 0;
         int key = -1;
         char buf[64];
-       char script_opt[64];
         int delay_secs = hbt ? hbt->refresh : 0;
         struct perf_hpp_fmt *fmt;
  
@@ -1463,23 +1735,29 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
         "t             Zoom into current Thread\n"
         "V             Verbose (DSO names in callchains, etc)\n"
         "z             Toggle zeroing of samples\n"
+       "f             Enable/Disable events\n"
         "/             Filter symbol by name";
  
         if (browser == NULL)
                 return -1;
  
+       /* reset abort key so that it can get Ctrl-C as a key */
+       SLang_reset_tty();
+       SLang_init_tty(0, 0, 0);
+
         if (min_pcnt) {
                 browser->min_pcnt = min_pcnt;
                 hist_browser__update_nr_entries(browser);
         }
  
-       fstack = pstack__new(2);
-       if (fstack == NULL)
+       browser->pstack = pstack__new(2);
+       if (browser->pstack == NULL)
                 goto out;
  
         ui_helpline__push(helpline);
  
         memset(options, 0, sizeof(options));
+       memset(actions, 0, sizeof(actions));
  
         perf_hpp__for_each_format(fmt)
                 perf_hpp__reset_width(fmt, hists);
@@ -1489,16 +1767,12 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
  
         while (1) {
                 struct thread *thread = NULL;
-               const struct dso *dso = NULL;
-               int choice = 0,
-                   annotate = -2, zoom_dso = -2, zoom_thread = -2,
-                   annotate_f = -2, annotate_t = -2, browse_map = -2;
-               int scripts_comm = -2, scripts_symbol = -2,
-                   scripts_all = -2, switch_data = -2;
+               struct dso *dso = NULL;
+               int choice = 0;
  
                 nr_options = 0;
  
-               key = hist_browser__run(browser, hbt);
+               key = hist_browser__run(browser, helpline);
  
                 if (browser->he_selection != NULL) {
                         thread = hist_browser__selected_thread(browser);
@@ -1526,17 +1800,25 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
                             browser->selection->sym == NULL ||
                             browser->selection->map->dso->annotate_warned)
                                 continue;
-                       goto do_annotate;
+
+                       actions->ms.map = browser->selection->map;
+                       actions->ms.sym = browser->selection->sym;
+                       do_annotate(browser, actions);
+                       continue;
                 case 'P':
                         hist_browser__dump(browser);
                         continue;
                 case 'd':
-                       goto zoom_dso;
+                       actions->dso = dso;
+                       do_zoom_dso(browser, actions);
+                       continue;
                 case 'V':
                         browser->show_dso = !browser->show_dso;
                         continue;
                 case 't':
-                       goto zoom_thread;
+                       actions->thread = thread;
+                       do_zoom_thread(browser, actions);
+                       continue;
                 case '/':
                         if (ui_browser__input_window("Symbol to show",
                                         "Please enter the name of symbol you want to see",
@@ -1548,12 +1830,18 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
                         }
                         continue;
                 case 'r':
-                       if (is_report_browser(hbt))
-                               goto do_scripts;
+                       if (is_report_browser(hbt)) {
+                               actions->thread = NULL;
+                               actions->ms.sym = NULL;
+                               do_run_script(browser, actions);
+                       }
                         continue;
                 case 's':
-                       if (is_report_browser(hbt))
-                               goto do_data_switch;
+                       if (is_report_browser(hbt)) {
+                               key = do_switch_data(browser, actions);
+                               if (key == K_SWITCH_INPUT_DATA)
+                                       goto out_free_stack;
+                       }
                         continue;
                 case 'i':
                         /* env->arch is NULL for live-mode (i.e. perf top) */
@@ -1583,7 +1871,7 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
                 case K_LEFT: {
                         const void *top;
  
-                       if (pstack__empty(fstack)) {
+                       if (pstack__empty(browser->pstack)) {
                                 /*
                                  * Go back to the perf_evsel_menu__run or other user
                                  */
@@ -1591,11 +1879,17 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
                                         goto out_free_stack;
                                 continue;
                         }
-                       top = pstack__pop(fstack);
-                       if (top == &browser->hists->dso_filter)
-                               goto zoom_out_dso;
+                       top = pstack__peek(browser->pstack);
+                       if (top == &browser->hists->dso_filter) {
+                               /*
+                                * No need to set actions->dso here since
+                                * it's just to remove the current filter.
+                                * Ditto for thread below.
+                                */
+                               do_zoom_dso(browser, actions);
+                       }
                         if (top == &browser->hists->thread_filter)
-                               goto zoom_out_thread;
+                               do_zoom_thread(browser, actions);
                         continue;
                 }
                 case K_ESC:
@@ -1607,7 +1901,12 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
                 case 'q':
                 case CTRL('c'):
                         goto out_free_stack;
+               case 'f':
+                       if (!is_report_browser(hbt))
+                               goto out_free_stack;
+                       /* Fall thru */
                 default:
+                       helpline = "Press '?' for help on key bindings";
                         continue;
                 }
  
@@ -1623,196 +1922,71 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
                         if (bi == NULL)
                                 goto skip_annotation;
  
-                       if (bi->from.sym != NULL &&
-                           !bi->from.map->dso->annotate_warned &&
-                           asprintf(&options[nr_options], "Annotate %s", bi->from.sym->name) > 0) {
-                               annotate_f = nr_options++;
-                       }
-
-                       if (bi->to.sym != NULL &&
-                           !bi->to.map->dso->annotate_warned &&
-                           (bi->to.sym != bi->from.sym ||
-                            bi->to.map->dso != bi->from.map->dso) &&
-                           asprintf(&options[nr_options], "Annotate %s", bi->to.sym->name) > 0) {
-                               annotate_t = nr_options++;
-                       }
+                       nr_options += add_annotate_opt(browser,
+                                                      &actions[nr_options],
+                                                      &options[nr_options],
+                                                      bi->from.map,
+                                                      bi->from.sym);
+                       if (bi->to.sym != bi->from.sym)
+                               nr_options += add_annotate_opt(browser,
+                                                       &actions[nr_options],
+                                                       &options[nr_options],
+                                                       bi->to.map,
+                                                       bi->to.sym);
                 } else {
-                       if (browser->selection->sym != NULL &&
-                           !browser->selection->map->dso->annotate_warned) {
-                               struct annotation *notes;
-
-                               notes = symbol__annotation(browser->selection->sym);
-
-                               if (notes->src &&
-                                   asprintf(&options[nr_options], "Annotate %s",
-                                                browser->selection->sym->name) > 0) {
-                                       annotate = nr_options++;
-                               }
-                       }
+                       nr_options += add_annotate_opt(browser,
+                                                      &actions[nr_options],
+                                                      &options[nr_options],
+                                                      browser->selection->map,
+                                                      browser->selection->sym);
                 }
  skip_annotation:
-               if (thread != NULL &&
-                   asprintf(&options[nr_options], "Zoom %s %s(%d) thread",
-                            (browser->hists->thread_filter ? "out of" : "into"),
-                            (thread->comm_set ? thread__comm_str(thread) : ""),
-                            thread->tid) > 0)
-                       zoom_thread = nr_options++;
-
-               if (dso != NULL &&
-                   asprintf(&options[nr_options], "Zoom %s %s DSO",
-                            (browser->hists->dso_filter ? "out of" : "into"),
-                            (dso->kernel ? "the Kernel" : dso->short_name)) > 0)
-                       zoom_dso = nr_options++;
-
-               if (browser->selection != NULL &&
-                   browser->selection->map != NULL &&
-                   asprintf(&options[nr_options], "Browse map details") > 0)
-                       browse_map = nr_options++;
+               nr_options += add_thread_opt(browser, &actions[nr_options],
+                                            &options[nr_options], thread);
+               nr_options += add_dso_opt(browser, &actions[nr_options],
+                                         &options[nr_options], dso);
+               nr_options += add_map_opt(browser, &actions[nr_options],
+                                         &options[nr_options],
+                                         browser->selection->map);
  
                 /* perf script support */
                 if (browser->he_selection) {
-                       struct symbol *sym;
-
-                       if (asprintf(&options[nr_options], "Run scripts for samples of thread [%s]",
-                                    thread__comm_str(browser->he_selection->thread)) > 0)
-                               scripts_comm = nr_options++;
-
-                       sym = browser->he_selection->ms.sym;
-                       if (sym && sym->namelen &&
-                               asprintf(&options[nr_options], "Run scripts for samples of symbol [%s]",
-                                               sym->name) > 0)
-                               scripts_symbol = nr_options++;
+                       nr_options += add_script_opt(browser,
+                                                    &actions[nr_options],
+                                                    &options[nr_options],
+                                                    thread, NULL);
+                       nr_options += add_script_opt(browser,
+                                                    &actions[nr_options],
+                                                    &options[nr_options],
+                                                    NULL, browser->selection->sym);
                 }
-
-               if (asprintf(&options[nr_options], "Run scripts for all samples") > 0)
-                       scripts_all = nr_options++;
-
-               if (is_report_browser(hbt) && asprintf(&options[nr_options],
-                               "Switch to another data file in PWD") > 0)
-                       switch_data = nr_options++;
+               nr_options += add_script_opt(browser, &actions[nr_options],
+                                            &options[nr_options], NULL, NULL);
+               nr_options += add_switch_opt(browser, &actions[nr_options],
+                                            &options[nr_options]);
  add_exit_option:
-               options[nr_options++] = (char *)"Exit";
-retry_popup_menu:
-               choice = ui__popup_menu(nr_options, options);
-
-               if (choice == nr_options - 1)
-                       break;
-
-               if (choice == -1) {
-                       free_popup_options(options, nr_options - 1);
-                       continue;
-               }
-
-               if (choice == annotate || choice == annotate_t || choice == annotate_f) {
-                       struct hist_entry *he;
-                       struct annotation *notes;
-                       struct map_symbol ms;
-                       int err;
-do_annotate:
-                       if (!objdump_path && perf_session_env__lookup_objdump(env))
-                               continue;
-
-                       he = hist_browser__selected_entry(browser);
-                       if (he == NULL)
-                               continue;
-
-                       if (choice == annotate_f) {
-                               ms.map = he->branch_info->from.map;
-                               ms.sym = he->branch_info->from.sym;
-                       } else if (choice == annotate_t) {
-                               ms.map = he->branch_info->to.map;
-                               ms.sym = he->branch_info->to.sym;
-                       } else {
-                               ms = *browser->selection;
-                       }
+               nr_options += add_exit_opt(browser, &actions[nr_options],
+                                          &options[nr_options]);
  
-                       notes = symbol__annotation(ms.sym);
-                       if (!notes->src)
-                               continue;
-
-                       err = map_symbol__tui_annotate(&ms, evsel, hbt);
-                       /*
-                        * offer option to annotate the other branch source or target
-                        * (if they exists) when returning from annotate
-                        */
-                       if ((err == 'q' || err == CTRL('c'))
-                           && annotate_t != -2 && annotate_f != -2)
-                               goto retry_popup_menu;
-
-                       ui_browser__update_nr_entries(&browser->b, browser->hists->nr_entries);
-                       if (err)
-                               ui_browser__handle_resize(&browser->b);
-
-               } else if (choice == browse_map)
-                       map__browse(browser->selection->map);
-               else if (choice == zoom_dso) {
-zoom_dso:
-                       if (browser->hists->dso_filter) {
-                               pstack__remove(fstack, &browser->hists->dso_filter);
-zoom_out_dso:
-                               ui_helpline__pop();
-                               browser->hists->dso_filter = NULL;
-                               perf_hpp__set_elide(HISTC_DSO, false);
-                       } else {
-                               if (dso == NULL)
-                                       continue;
-                               ui_helpline__fpush("To zoom out press <- or -> + \"Zoom out of %s DSO\"",
-                                                  dso->kernel ? "the Kernel" : dso->short_name);
-                               browser->hists->dso_filter = dso;
-                               perf_hpp__set_elide(HISTC_DSO, true);
-                               pstack__push(fstack, &browser->hists->dso_filter);
-                       }
-                       hists__filter_by_dso(hists);
-                       hist_browser__reset(browser);
-               } else if (choice == zoom_thread) {
-zoom_thread:
-                       if (browser->hists->thread_filter) {
-                               pstack__remove(fstack, &browser->hists->thread_filter);
-zoom_out_thread:
-                               ui_helpline__pop();
-                               thread__zput(browser->hists->thread_filter);
-                               perf_hpp__set_elide(HISTC_THREAD, false);
-                       } else {
-                               ui_helpline__fpush("To zoom out press <- or -> + \"Zoom out of %s(%d) thread\"",
-                                                  thread->comm_set ? thread__comm_str(thread) : "",
-                                                  thread->tid);
-                               browser->hists->thread_filter = thread__get(thread);
-                               perf_hpp__set_elide(HISTC_THREAD, false);
-                               pstack__push(fstack, &browser->hists->thread_filter);
-                       }
-                       hists__filter_by_thread(hists);
-                       hist_browser__reset(browser);
-               }
-               /* perf scripts support */
-               else if (choice == scripts_all || choice == scripts_comm ||
-                               choice == scripts_symbol) {
-do_scripts:
-                       memset(script_opt, 0, 64);
+               do {
+                       struct popup_action *act;
  
-                       if (choice == scripts_comm)
-                               sprintf(script_opt, " -c %s ", thread__comm_str(browser->he_selection->thread));
+                       choice = ui__popup_menu(nr_options, options);
+                       if (choice == -1 || choice >= nr_options)
+                               break;
  
-                       if (choice == scripts_symbol)
-                               sprintf(script_opt, " -S %s ", browser->he_selection->ms.sym->name);
+                       act = &actions[choice];
+                       key = act->fn(browser, act);
+               } while (key == 1);
  
-                       script_browse(script_opt);
-               }
-               /* Switch to another data file */
-               else if (choice == switch_data) {
-do_data_switch:
-                       if (!switch_data_file()) {
-                               key = K_SWITCH_INPUT_DATA;
-                               break;
-                       } else
-                               ui__warning("Won't switch the data files due to\n"
-                                       "no valid data file get selected!\n");
-               }
+               if (key == K_SWITCH_INPUT_DATA)
+                       break;
         }
  out_free_stack:
-       pstack__delete(fstack);
+       pstack__delete(browser->pstack);
  out:
         hist_browser__delete(browser);
-       free_popup_options(options, nr_options - 1);
+       free_popup_options(options, MAX_OPTIONS);
         return key;
  }
  
diff --git a/tools/perf/ui/tui/setup.c b/tools/perf/ui/tui/setup.c

index b77e1d7713637c711e144886c9914fe02cb110ca..60d1f29b4b50a9fedf0a163855056edfce1ed22b 100644 (file)
--- a/tools/perf/ui/tui/setup.c
+++ b/tools/perf/ui/tui/setup.c
@@ -129,7 +129,7 @@ int ui__init(void)
         err = SLsmg_init_smg();
         if (err < 0)
                 goto out;
-       err = SLang_init_tty(0, 0, 0);
+       err = SLang_init_tty(-1, 0, 0);
         if (err < 0)
                 goto out;
  
diff --git a/tools/perf/util/Build b/tools/perf/util/Build

index 797490a40075600c47e0341378e4ad9e24ef225e..586a59d46022a9fc8901807f5c02be4e2551db25 100644 (file)
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -68,12 +68,15 @@ libperf-y += rblist.o
  libperf-y += intlist.o
  libperf-y += vdso.o
  libperf-y += stat.o
+libperf-y += stat-shadow.o
  libperf-y += record.o
  libperf-y += srcline.o
  libperf-y += data.o
  libperf-$(CONFIG_X86) += tsc.o
  libperf-y += cloexec.o
  libperf-y += thread-stack.o
+libperf-$(CONFIG_AUXTRACE) += auxtrace.o
+libperf-y += parse-branch-options.o
  
  libperf-$(CONFIG_LIBELF) += symbol-elf.o
  libperf-$(CONFIG_LIBELF) += probe-event.o
@@ -101,23 +104,23 @@ CFLAGS_exec_cmd.o += -DPERF_EXEC_PATH="BUILD_STR($(perfexecdir_SQ))" -DPREFIX="B
  
  $(OUTPUT)util/parse-events-flex.c: util/parse-events.l $(OUTPUT)util/parse-events-bison.c
         $(call rule_mkdir)
-       @$(call echo-cmd,flex)$(FLEX) -o $@ --header-file=$(OUTPUT)util/parse-events-flex.h $(PARSER_DEBUG_FLEX) util/parse-events.l
+       $(Q)$(call echo-cmd,flex)$(FLEX) -o $@ --header-file=$(OUTPUT)util/parse-events-flex.h $(PARSER_DEBUG_FLEX) util/parse-events.l
  
  $(OUTPUT)util/parse-events-bison.c: util/parse-events.y
         $(call rule_mkdir)
-       @$(call echo-cmd,bison)$(BISON) -v util/parse-events.y -d $(PARSER_DEBUG_BISON) -o $@ -p parse_events_
+       $(Q)$(call echo-cmd,bison)$(BISON) -v util/parse-events.y -d $(PARSER_DEBUG_BISON) -o $@ -p parse_events_
  
  $(OUTPUT)util/pmu-flex.c: util/pmu.l $(OUTPUT)util/pmu-bison.c
         $(call rule_mkdir)
-       @$(call echo-cmd,flex)$(FLEX) -o $@ --header-file=$(OUTPUT)util/pmu-flex.h util/pmu.l
+       $(Q)$(call echo-cmd,flex)$(FLEX) -o $@ --header-file=$(OUTPUT)util/pmu-flex.h util/pmu.l
  
  $(OUTPUT)util/pmu-bison.c: util/pmu.y
         $(call rule_mkdir)
-       @$(call echo-cmd,bison)$(BISON) -v util/pmu.y -d -o $@ -p perf_pmu_
+       $(Q)$(call echo-cmd,bison)$(BISON) -v util/pmu.y -d -o $@ -p perf_pmu_
  
  CFLAGS_parse-events-flex.o  += -w
  CFLAGS_pmu-flex.o           += -w
-CFLAGS_parse-events-bison.o += -DYYENABLE_NLS=0 -DYYLTYPE_IS_TRIVIAL=0 -w
+CFLAGS_parse-events-bison.o += -DYYENABLE_NLS=0 -w
  CFLAGS_pmu-bison.o          += -DYYENABLE_NLS=0 -DYYLTYPE_IS_TRIVIAL=0 -w
  
  $(OUTPUT)util/parse-events.o: $(OUTPUT)util/parse-events-flex.c $(OUTPUT)util/parse-events-bison.c
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c

index 7f5bdfc9bc87d1d1828efeb9f57755071b378e7a..03b7bc70eb66032d4502ec8bfda2e15a9d44cd57 100644 (file)
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -506,6 +506,17 @@ static int __symbol__inc_addr_samples(struct symbol *sym, struct map *map,
         return 0;
  }
  
+/*
+ * Return the annotation of @sym.  When its ->src is not set yet,
+ * symbol__alloc_hist() is called first; NULL is returned if that call
+ * returns a negative value.
+ */
+static struct annotation *symbol__get_annotation(struct symbol *sym)
+{
+       struct annotation *notes = symbol__annotation(sym);
+
+       if (notes->src != NULL)
+               return notes;
+
+       return symbol__alloc_hist(sym) < 0 ? NULL : notes;
+}
+
  static int symbol__inc_addr_samples(struct symbol *sym, struct map *map,
                                     int evidx, u64 addr)
  {
@@ -513,13 +524,9 @@ static int symbol__inc_addr_samples(struct symbol *sym, struct map *map,
  
         if (sym == NULL)
                 return 0;
-
-       notes = symbol__annotation(sym);
-       if (notes->src == NULL) {
-               if (symbol__alloc_hist(sym) < 0)
-                       return -ENOMEM;
-       }
-
+       notes = symbol__get_annotation(sym);
+       if (notes == NULL)
+               return -ENOMEM;
         return __symbol__inc_addr_samples(sym, map, notes, evidx, addr);
  }
  
@@ -647,14 +654,15 @@ struct disasm_line *disasm__get_next_ip_line(struct list_head *head, struct disa
  }
  
  double disasm__calc_percent(struct annotation *notes, int evidx, s64 offset,
-                           s64 end, const char **path)
+                           s64 end, const char **path, u64 *nr_samples)
  {
         struct source_line *src_line = notes->src->lines;
         double percent = 0.0;
+       *nr_samples = 0;
  
         if (src_line) {
                 size_t sizeof_src_line = sizeof(*src_line) +
-                               sizeof(src_line->p) * (src_line->nr_pcnt - 1);
+                               sizeof(src_line->samples) * (src_line->nr_pcnt - 1);
  
                 while (offset < end) {
                         src_line = (void *)notes->src->lines +
@@ -663,7 +671,8 @@ double disasm__calc_percent(struct annotation *notes, int evidx, s64 offset,
                         if (*path == NULL)
                                 *path = src_line->path;
  
-                       percent += src_line->p[evidx].percent;
+                       percent += src_line->samples[evidx].percent;
+                       *nr_samples += src_line->samples[evidx].nr;
                         offset++;
                 }
         } else {
@@ -673,8 +682,10 @@ double disasm__calc_percent(struct annotation *notes, int evidx, s64 offset,
                 while (offset < end)
                         hits += h->addr[offset++];
  
-               if (h->sum)
+               if (h->sum) {
+                       *nr_samples = hits;
                         percent = 100.0 * hits / h->sum;
+               }
         }
  
         return percent;
@@ -689,8 +700,10 @@ static int disasm_line__print(struct disasm_line *dl, struct symbol *sym, u64 st
  
         if (dl->offset != -1) {
                 const char *path = NULL;
+               u64 nr_samples;
                 double percent, max_percent = 0.0;
                 double *ppercents = &percent;
+               u64 *psamples = &nr_samples;
                 int i, nr_percent = 1;
                 const char *color;
                 struct annotation *notes = symbol__annotation(sym);
@@ -703,8 +716,10 @@ static int disasm_line__print(struct disasm_line *dl, struct symbol *sym, u64 st
                 if (perf_evsel__is_group_event(evsel)) {
                         nr_percent = evsel->nr_members;
                         ppercents = calloc(nr_percent, sizeof(double));
-                       if (ppercents == NULL)
+                       psamples = calloc(nr_percent, sizeof(u64));
+                       if (ppercents == NULL || psamples == NULL) {
                                 return -1;
+                       }
                 }
  
                 for (i = 0; i < nr_percent; i++) {
@@ -712,9 +727,10 @@ static int disasm_line__print(struct disasm_line *dl, struct symbol *sym, u64 st
                                         notes->src->lines ? i : evsel->idx + i,
                                         offset,
                                         next ? next->offset : (s64) len,
-                                       &path);
+                                       &path, &nr_samples);
  
                         ppercents[i] = percent;
+                       psamples[i] = nr_samples;
                         if (percent > max_percent)
                                 max_percent = percent;
                 }
@@ -752,8 +768,14 @@ static int disasm_line__print(struct disasm_line *dl, struct symbol *sym, u64 st
  
                 for (i = 0; i < nr_percent; i++) {
                         percent = ppercents[i];
+                       nr_samples = psamples[i];
                         color = get_percent_color(percent);
-                       color_fprintf(stdout, color, " %7.2f", percent);
+
+                       if (symbol_conf.show_total_period)
+                               color_fprintf(stdout, color, " %7" PRIu64,
+                                             nr_samples);
+                       else
+                               color_fprintf(stdout, color, " %7.2f", percent);
                 }
  
                 printf(" :      ");
@@ -763,6 +785,9 @@ static int disasm_line__print(struct disasm_line *dl, struct symbol *sym, u64 st
                 if (ppercents != &percent)
                         free(ppercents);
  
+               if (psamples != &nr_samples)
+                       free(psamples);
+
         } else if (max_lines && printed >= max_lines)
                 return 1;
         else {
@@ -1096,7 +1121,7 @@ static void insert_source_line(struct rb_root *root, struct source_line *src_lin
                 ret = strcmp(iter->path, src_line->path);
                 if (ret == 0) {
                         for (i = 0; i < src_line->nr_pcnt; i++)
-                               iter->p[i].percent_sum += src_line->p[i].percent;
+                               iter->samples[i].percent_sum += src_line->samples[i].percent;
                         return;
                 }
  
@@ -1107,7 +1132,7 @@ static void insert_source_line(struct rb_root *root, struct source_line *src_lin
         }
  
         for (i = 0; i < src_line->nr_pcnt; i++)
-               src_line->p[i].percent_sum = src_line->p[i].percent;
+               src_line->samples[i].percent_sum = src_line->samples[i].percent;
  
         rb_link_node(&src_line->node, parent, p);
         rb_insert_color(&src_line->node, root);
@@ -1118,9 +1143,9 @@ static int cmp_source_line(struct source_line *a, struct source_line *b)
         int i;
  
         for (i = 0; i < a->nr_pcnt; i++) {
-               if (a->p[i].percent_sum == b->p[i].percent_sum)
+               if (a->samples[i].percent_sum == b->samples[i].percent_sum)
                         continue;
-               return a->p[i].percent_sum > b->p[i].percent_sum;
+               return a->samples[i].percent_sum > b->samples[i].percent_sum;
         }
  
         return 0;
@@ -1172,7 +1197,7 @@ static void symbol__free_source_line(struct symbol *sym, int len)
         int i;
  
         sizeof_src_line = sizeof(*src_line) +
-                         (sizeof(src_line->p) * (src_line->nr_pcnt - 1));
+                         (sizeof(src_line->samples) * (src_line->nr_pcnt - 1));
  
         for (i = 0; i < len; i++) {
                 free_srcline(src_line->path);
@@ -1204,7 +1229,7 @@ static int symbol__get_source_line(struct symbol *sym, struct map *map,
                         h_sum += h->sum;
                 }
                 nr_pcnt = evsel->nr_members;
-               sizeof_src_line += (nr_pcnt - 1) * sizeof(src_line->p);
+               sizeof_src_line += (nr_pcnt - 1) * sizeof(src_line->samples);
         }
  
         if (!h_sum)
@@ -1224,10 +1249,10 @@ static int symbol__get_source_line(struct symbol *sym, struct map *map,
  
                 for (k = 0; k < nr_pcnt; k++) {
                         h = annotation__histogram(notes, evidx + k);
-                       src_line->p[k].percent = 100.0 * h->addr[i] / h->sum;
+                       src_line->samples[k].percent = 100.0 * h->addr[i] / h->sum;
  
-                       if (src_line->p[k].percent > percent_max)
-                               percent_max = src_line->p[k].percent;
+                       if (src_line->samples[k].percent > percent_max)
+                               percent_max = src_line->samples[k].percent;
                 }
  
                 if (percent_max <= 0.5)
@@ -1267,7 +1292,7 @@ static void print_summary(struct rb_root *root, const char *filename)
  
                 src_line = rb_entry(node, struct source_line, node);
                 for (i = 0; i < src_line->nr_pcnt; i++) {
-                       percent = src_line->p[i].percent_sum;
+                       percent = src_line->samples[i].percent_sum;
                         color = get_percent_color(percent);
                         color_fprintf(stdout, color, " %7.2f", percent);
  
diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h

index cadbdc90a5cbf319385cb67aa5a88c06cdf107dc..7e78e6c270783475acb6dc897109254d6d266b35 100644 (file)
--- a/tools/perf/util/annotate.h
+++ b/tools/perf/util/annotate.h
@@ -72,23 +72,24 @@ struct disasm_line *disasm__get_next_ip_line(struct list_head *head, struct disa
  int disasm_line__scnprintf(struct disasm_line *dl, char *bf, size_t size, bool raw);
  size_t disasm__fprintf(struct list_head *head, FILE *fp);
  double disasm__calc_percent(struct annotation *notes, int evidx, s64 offset,
-                           s64 end, const char **path);
+                           s64 end, const char **path, u64 *nr_samples);
  
  struct sym_hist {
         u64             sum;
         u64             addr[0];
  };
  
-struct source_line_percent {
+struct source_line_samples {
         double          percent;
         double          percent_sum;
+       double          nr;
  };
  
  struct source_line {
         struct rb_node  node;
         char            *path;
         int             nr_pcnt;
-       struct source_line_percent p[1];
+       struct source_line_samples samples[1];
  };
  
  /** struct annotated_source - symbols with hits have this attached as in sannotation
diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c

new file mode 100644 (file)

index 0000000..df66966
--- /dev/null
+++ b/tools/perf/util/auxtrace.c
@@ -0,0 +1,1352 @@
+/*
+ * auxtrace.c: AUX area trace support
+ * Copyright (c) 2013-2015, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <stdbool.h>
+
+#include <linux/kernel.h>
+#include <linux/perf_event.h>
+#include <linux/types.h>
+#include <linux/bitops.h>
+#include <linux/log2.h>
+#include <linux/string.h>
+
+#include <sys/param.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <limits.h>
+#include <errno.h>
+#include <linux/list.h>
+
+#include "../perf.h"
+#include "util.h"
+#include "evlist.h"
+#include "cpumap.h"
+#include "thread_map.h"
+#include "asm/bug.h"
+#include "auxtrace.h"
+
+#include <linux/hash.h>
+
+#include "event.h"
+#include "session.h"
+#include "debug.h"
+#include "parse-options.h"
+
+/*
+ * Initialise @mm from the parameters in @mp and, when mp->len is non-zero,
+ * map the AUX area of @fd.
+ *
+ * @userpg is stored in @mm and is also written through as a
+ * struct perf_event_mmap_page (aux_offset / aux_size) before the mmap() call.
+ *
+ * Returns 0 on success (including the "nothing to map" case where mp->len
+ * is 0) and -1 on failure; mm->base is NULL whenever no mapping exists.
+ */
+int auxtrace_mmap__mmap(struct auxtrace_mmap *mm,
+                       struct auxtrace_mmap_params *mp,
+                       void *userpg, int fd)
+{
+       struct perf_event_mmap_page *pc = userpg;
+
+       /*
+        * Builds that are not 64-bit and lack
+        * HAVE_SYNC_COMPARE_AND_SWAP_SUPPORT bail out unconditionally here.
+        */
+#if BITS_PER_LONG != 64 && !defined(HAVE_SYNC_COMPARE_AND_SWAP_SUPPORT)
+       pr_err("Cannot use AUX area tracing mmaps\n");
+       return -1;
+#endif
+
+       /* A non-NULL base on entry means the caller did not reset @mm. */
+       WARN_ONCE(mm->base, "Uninitialized auxtrace_mmap\n");
+
+       mm->userpg = userpg;
+       mm->mask = mp->mask;
+       mm->len = mp->len;
+       mm->prev = 0;
+       mm->idx = mp->idx;
+       mm->tid = mp->tid;
+       mm->cpu = mp->cpu;
+
+       /* Zero length: leave @mm initialised but unmapped. */
+       if (!mp->len) {
+               mm->base = NULL;
+               return 0;
+       }
+
+       /* Record offset and size in the user page before mapping. */
+       pc->aux_offset = mp->offset;
+       pc->aux_size = mp->len;
+
+       mm->base = mmap(NULL, mp->len, mp->prot, MAP_SHARED, fd, mp->offset);
+       if (mm->base == MAP_FAILED) {
+               pr_debug2("failed to mmap AUX area\n");
+               /* Keep the "NULL means unmapped" convention used by munmap. */
+               mm->base = NULL;
+               return -1;
+       }
+
+       return 0;
+}
+
+/* Unmap the AUX area of @mm, if one is mapped, and reset mm->base to NULL. */
+void auxtrace_mmap__munmap(struct auxtrace_mmap *mm)
+{
+       if (!mm->base)
+               return;
+
+       munmap(mm->base, mm->len);
+       mm->base = NULL;
+}
+
+/*
+ * Fill in @mp for an AUX area of @auxtrace_pages pages at @auxtrace_offset.
+ *
+ * With zero pages only mp->len is set (to 0).  Otherwise the mask is
+ * len - 1 when the length is a power of two and 0 when it is not, and the
+ * mapping is writable unless @auxtrace_overwrite is set.
+ */
+void auxtrace_mmap_params__init(struct auxtrace_mmap_params *mp,
+                               off_t auxtrace_offset,
+                               unsigned int auxtrace_pages,
+                               bool auxtrace_overwrite)
+{
+       if (!auxtrace_pages) {
+               mp->len = 0;
+               return;
+       }
+
+       mp->offset = auxtrace_offset;
+       mp->len = auxtrace_pages * (size_t)page_size;
+
+       if (is_power_of_2(mp->len))
+               mp->mask = mp->len - 1;
+       else
+               mp->mask = 0;
+
+       if (auxtrace_overwrite)
+               mp->prot = PROT_READ;
+       else
+               mp->prot = PROT_READ | PROT_WRITE;
+
+       pr_debug2("AUX area mmap length %zu\n", mp->len);
+}
+
+/*
+ * Record which mmap index @mp belongs to and derive its cpu/tid from @evlist.
+ *
+ * Per-cpu: cpu comes from the cpu map at @idx, tid is the first thread of
+ * the thread map (or -1 without a thread map).
+ * Per-thread: cpu is -1 and tid comes from the thread map at @idx.
+ */
+void auxtrace_mmap_params__set_idx(struct auxtrace_mmap_params *mp,
+                                  struct perf_evlist *evlist, int idx,
+                                  bool per_cpu)
+{
+       mp->idx = idx;
+
+       if (!per_cpu) {
+               mp->cpu = -1;
+               mp->tid = evlist->threads->map[idx];
+               return;
+       }
+
+       mp->cpu = evlist->cpus->map[idx];
+       mp->tid = evlist->threads ? evlist->threads->map[0] : -1;
+}
+
+#define AUXTRACE_INIT_NR_QUEUES        32
+
+/*
+ * Allocate a zeroed array of @nr_queues queues, each with an empty buffer
+ * list and no private data.
+ *
+ * Returns NULL if @nr_queues exceeds UINT_MAX / sizeof(struct auxtrace_queue)
+ * or if the allocation fails.
+ */
+static struct auxtrace_queue *auxtrace_alloc_queue_array(unsigned int nr_queues)
+{
+       struct auxtrace_queue *array, *q;
+
+       if (nr_queues > UINT_MAX / sizeof(*array))
+               return NULL;
+
+       array = calloc(nr_queues, sizeof(*array));
+       if (!array)
+               return NULL;
+
+       for (q = array; q < array + nr_queues; q++) {
+               INIT_LIST_HEAD(&q->head);
+               q->priv = NULL;
+       }
+
+       return array;
+}
+
+/*
+ * Set up @queues with AUXTRACE_INIT_NR_QUEUES empty queues.
+ * Returns 0 on success or -ENOMEM if the array cannot be allocated.
+ */
+int auxtrace_queues__init(struct auxtrace_queues *queues)
+{
+       queues->nr_queues = AUXTRACE_INIT_NR_QUEUES;
+       queues->queue_array = auxtrace_alloc_queue_array(queues->nr_queues);
+
+       return queues->queue_array ? 0 : -ENOMEM;
+}
+
+/*
+ * Grow the queue array so that it holds at least @new_nr_queues queues.
+ *
+ * The size is doubled, starting from the current size (or
+ * AUXTRACE_INIT_NR_QUEUES when there are no queues yet), until it covers
+ * @new_nr_queues.  Buffers already queued are spliced onto the new array
+ * and the per-queue state (tid, cpu, set, priv) is carried over.
+ *
+ * Returns 0 on success, -EINVAL if doubling wrapped around before reaching
+ * @new_nr_queues, or -ENOMEM if the new array cannot be allocated.
+ */
+static int auxtrace_queues__grow(struct auxtrace_queues *queues,
+                                unsigned int new_nr_queues)
+{
+       unsigned int nr_queues = queues->nr_queues;
+       struct auxtrace_queue *queue_array;
+       unsigned int i;
+
+       if (!nr_queues)
+               nr_queues = AUXTRACE_INIT_NR_QUEUES;
+
+       /* The shift yields 0 on wrap-around, which ends the loop. */
+       while (nr_queues && nr_queues < new_nr_queues)
+               nr_queues <<= 1;
+
+       if (nr_queues < queues->nr_queues || nr_queues < new_nr_queues)
+               return -EINVAL;
+
+       queue_array = auxtrace_alloc_queue_array(nr_queues);
+       if (!queue_array)
+               return -ENOMEM;
+
+       for (i = 0; i < queues->nr_queues; i++) {
+               list_splice_tail(&queues->queue_array[i].head,
+                                &queue_array[i].head);
+               /*
+                * Keep the identity of the queue as well: without 'set',
+                * 'tid' and 'cpu' auxtrace_queues__add_buffer() would treat
+                * an already populated queue as new and skip its cpu/tid
+                * conflict check.
+                */
+               queue_array[i].tid = queues->queue_array[i].tid;
+               queue_array[i].cpu = queues->queue_array[i].cpu;
+               queue_array[i].set = queues->queue_array[i].set;
+               queue_array[i].priv = queues->queue_array[i].priv;
+       }
+
+       /*
+        * NOTE(review): the previous array is not released here.  Confirm
+        * that no caller keeps pointers into it before adding a free().
+        */
+       queues->nr_queues = nr_queues;
+       queues->queue_array = queue_array;
+
+       return 0;
+}
+
+/*
+ * Read @size bytes from the session's data file descriptor into a freshly
+ * allocated buffer.
+ *
+ * Returns the buffer (to be freed by the caller), or NULL when @size
+ * exceeds SSIZE_MAX, the allocation fails, or fewer than @size bytes are
+ * read.
+ */
+static void *auxtrace_copy_data(u64 size, struct perf_session *session)
+{
+       int fd = perf_data_file__fd(session->file);
+       void *data;
+
+       if (size > SSIZE_MAX)
+               return NULL;
+
+       data = malloc(size);
+       if (!data)
+               return NULL;
+
+       if (readn(fd, data, size) == (ssize_t)size)
+               return data;
+
+       free(data);
+       return NULL;
+}
+
+/*
+ * Append @buffer to queue number @idx, growing the queue array if needed.
+ *
+ * The first buffer added to a queue fixes that queue's cpu and tid; later
+ * buffers must match both or -EINVAL is returned.  On success the buffer
+ * gets the next buffer number and the queues are flagged as populated with
+ * new data.  Errors from auxtrace_queues__grow() are passed through.
+ */
+static int auxtrace_queues__add_buffer(struct auxtrace_queues *queues,
+                                      unsigned int idx,
+                                      struct auxtrace_buffer *buffer)
+{
+       struct auxtrace_queue *q;
+
+       if (idx >= queues->nr_queues) {
+               int err = auxtrace_queues__grow(queues, idx + 1);
+
+               if (err)
+                       return err;
+       }
+
+       q = &queues->queue_array[idx];
+
+       if (q->set) {
+               if (buffer->cpu != q->cpu || buffer->tid != q->tid) {
+                       pr_err("auxtrace queue conflict: cpu %d, tid %d vs cpu %d, tid %d\n",
+                              q->cpu, q->tid, buffer->cpu, buffer->tid);
+                       return -EINVAL;
+               }
+       } else {
+               q->set = true;
+               q->tid = buffer->tid;
+               q->cpu = buffer->cpu;
+       }
+
+       buffer->buffer_nr = queues->next_buffer_nr++;
+       list_add_tail(&buffer->list, &q->head);
+
+       queues->new_data = true;
+       queues->populated = true;
+
+       return 0;
+}
+
+/* Limit buffers to 32MiB on 32-bit */
+#define BUFFER_LIMIT_FOR_32_BIT (32 * 1024 * 1024)
+
+/*
+ * Break @buffer into pieces of BUFFER_LIMIT_FOR_32_BIT bytes.
+ *
+ * Each full-sized piece is a copy of @buffer that is queued immediately;
+ * @buffer itself is advanced past the pieces and left holding the
+ * remainder, which the caller is expected to queue.  Every piece after the
+ * first (and the remainder, if anything was split off) is marked
+ * 'consecutive'.
+ *
+ * Returns 0 on success, -ENOMEM if a copy cannot be allocated, or the
+ * error from auxtrace_queues__add_buffer().
+ */
+static int auxtrace_queues__split_buffer(struct auxtrace_queues *queues,
+                                        unsigned int idx,
+                                        struct auxtrace_buffer *buffer)
+{
+       u64 remaining;
+       bool split = false;
+
+       for (remaining = buffer->size;
+            remaining > BUFFER_LIMIT_FOR_32_BIT;
+            remaining -= BUFFER_LIMIT_FOR_32_BIT) {
+               struct auxtrace_buffer *piece;
+               int err;
+
+               piece = memdup(buffer, sizeof(*buffer));
+               if (!piece)
+                       return -ENOMEM;
+
+               piece->size = BUFFER_LIMIT_FOR_32_BIT;
+               piece->consecutive = split;
+
+               err = auxtrace_queues__add_buffer(queues, idx, piece);
+               if (err) {
+                       auxtrace_buffer__free(piece);
+                       return err;
+               }
+
+               buffer->data_offset += BUFFER_LIMIT_FOR_32_BIT;
+               split = true;
+       }
+
+       buffer->size = remaining;
+       buffer->consecutive = split;
+
+       return 0;
+}
+
+/*
+ * Prepare the data of @buffer according to how the session is read, then
+ * queue it on queue @idx:
+ *
+ *  - single mmap of the file: point straight into that mapping;
+ *  - pipe input: copy the data out of the pipe (freed with the buffer);
+ *  - 32-bit build with an oversized buffer: split it into smaller pieces.
+ *
+ * Returns 0 on success, -ENOMEM if the pipe data cannot be copied, or the
+ * error from splitting / queueing.
+ */
+static int auxtrace_queues__add_event_buffer(struct auxtrace_queues *queues,
+                                            struct perf_session *session,
+                                            unsigned int idx,
+                                            struct auxtrace_buffer *buffer)
+{
+       if (session->one_mmap) {
+               buffer->data = buffer->data_offset - session->one_mmap_offset +
+                              session->one_mmap_addr;
+               return auxtrace_queues__add_buffer(queues, idx, buffer);
+       }
+
+       if (perf_data_file__is_pipe(session->file)) {
+               buffer->data = auxtrace_copy_data(buffer->size, session);
+               if (!buffer->data)
+                       return -ENOMEM;
+               buffer->data_needs_freeing = true;
+               return auxtrace_queues__add_buffer(queues, idx, buffer);
+       }
+
+       if (BITS_PER_LONG == 32 && buffer->size > BUFFER_LIMIT_FOR_32_BIT) {
+               int err = auxtrace_queues__split_buffer(queues, idx, buffer);
+
+               if (err)
+                       return err;
+       }
+
+       return auxtrace_queues__add_buffer(queues, idx, buffer);
+}
+
+/*
+ * Create a buffer describing the AUX data of the auxtrace @event, located
+ * at @data_offset in the file, and queue it on the queue named by the
+ * event's idx.
+ *
+ * On success 0 is returned and, if @buffer_ptr is not NULL, the new buffer
+ * is stored there.  On failure the buffer is freed and -ENOMEM or the
+ * queueing error is returned.
+ */
+int auxtrace_queues__add_event(struct auxtrace_queues *queues,
+                              struct perf_session *session,
+                              union perf_event *event, off_t data_offset,
+                              struct auxtrace_buffer **buffer_ptr)
+{
+       struct auxtrace_buffer *buf;
+       unsigned int queue_idx;
+       int err;
+
+       buf = zalloc(sizeof(*buf));
+       if (!buf)
+               return -ENOMEM;
+
+       buf->pid = -1;
+       buf->tid = event->auxtrace.tid;
+       buf->cpu = event->auxtrace.cpu;
+       buf->data_offset = data_offset;
+       buf->offset = event->auxtrace.offset;
+       buf->reference = event->auxtrace.reference;
+       buf->size = event->auxtrace.size;
+       queue_idx = event->auxtrace.idx;
+
+       err = auxtrace_queues__add_event_buffer(queues, session, queue_idx,
+                                               buf);
+       if (err) {
+               auxtrace_buffer__free(buf);
+               return err;
+       }
+
+       if (buffer_ptr)
+               *buffer_ptr = buf;
+
+       return 0;
+}
+
+/*
+ * Peek at the event stored at @file_offset and, if it is a
+ * PERF_RECORD_AUXTRACE event, queue the AUX data that follows it.
+ *
+ * Events of any other type are ignored (0 is returned).  -EINVAL is
+ * returned when the event is smaller than struct auxtrace_event or its
+ * size differs from the indexed size @sz; peek and queueing errors are
+ * passed through.
+ */
+static int auxtrace_queues__add_indexed_event(struct auxtrace_queues *queues,
+                                             struct perf_session *session,
+                                             off_t file_offset, size_t sz)
+{
+       char buf[PERF_SAMPLE_MAX_SIZE];
+       union perf_event *event;
+       int err;
+
+       err = perf_session__peek_event(session, file_offset, buf,
+                                      PERF_SAMPLE_MAX_SIZE, &event, NULL);
+       if (err)
+               return err;
+
+       if (event->header.type != PERF_RECORD_AUXTRACE)
+               return 0;
+
+       if (event->header.size < sizeof(struct auxtrace_event) ||
+           event->header.size != sz)
+               return -EINVAL;
+
+       /* The AUX data starts right after the event itself. */
+       file_offset += event->header.size;
+
+       return auxtrace_queues__add_event(queues, session, event, file_offset,
+                                         NULL);
+}
+
+/*
+ * Free every buffer on every queue, then the queue array itself, and
+ * reset the queue count to 0.
+ */
+void auxtrace_queues__free(struct auxtrace_queues *queues)
+{
+       unsigned int i;
+
+       for (i = 0; i < queues->nr_queues; i++) {
+               struct list_head *head = &queues->queue_array[i].head;
+
+               while (!list_empty(head)) {
+                       struct auxtrace_buffer *first;
+
+                       first = list_entry(head->next, struct auxtrace_buffer,
+                                          list);
+                       list_del(&first->list);
+                       auxtrace_buffer__free(first);
+               }
+       }
+
+       zfree(&queues->queue_array);
+       queues->nr_queues = 0;
+}
+
+/*
+ * Place (@queue_nr, @ordinal) into @heap_array, starting from the free
+ * slot @pos and moving towards index 0.
+ *
+ * Every parent whose ordinal is greater than @ordinal is moved down one
+ * level, so on return no parent on that path has a larger ordinal than
+ * its child (smallest ordinal ends up nearest index 0).
+ */
+static void auxtrace_heapify(struct auxtrace_heap_item *heap_array,
+                            unsigned int pos, unsigned int queue_nr,
+                            u64 ordinal)
+{
+       unsigned int parent;
+
+       while (pos) {
+               /* Parent of slot pos in the implicit binary tree. */
+               parent = (pos - 1) >> 1;
+               if (heap_array[parent].ordinal <= ordinal)
+                       break;
+               /* Parent is larger: pull it down and continue from its slot. */
+               heap_array[pos] = heap_array[parent];
+               pos = parent;
+       }
+       heap_array[pos].queue_nr = queue_nr;
+       heap_array[pos].ordinal = ordinal;
+}
+
+/*
+ * Add queue @queue_nr with key @ordinal to @heap.
+ *
+ * The array is sized by queue number: when @queue_nr does not fit, it is
+ * reallocated to the first power-of-two multiple of
+ * AUXTRACE_INIT_NR_QUEUES that is greater than @queue_nr.
+ *
+ * Returns 0 on success or -ENOMEM if the array cannot be grown (the heap
+ * is left unchanged in that case).
+ */
+int auxtrace_heap__add(struct auxtrace_heap *heap, unsigned int queue_nr,
+                      u64 ordinal)
+{
+       unsigned int slot;
+
+       if (queue_nr >= heap->heap_sz) {
+               struct auxtrace_heap_item *grown;
+               unsigned int new_sz;
+
+               for (new_sz = AUXTRACE_INIT_NR_QUEUES; new_sz <= queue_nr;
+                    new_sz <<= 1)
+                       ;
+
+               grown = realloc(heap->heap_array, new_sz * sizeof(*grown));
+               if (!grown)
+                       return -ENOMEM;
+
+               heap->heap_array = grown;
+               heap->heap_sz = new_sz;
+       }
+
+       slot = heap->heap_cnt++;
+       auxtrace_heapify(heap->heap_array, slot, queue_nr, ordinal);
+
+       return 0;
+}
+
+/* Release the heap array and reset @heap to the empty state. */
+void auxtrace_heap__free(struct auxtrace_heap *heap)
+{
+       heap->heap_sz = 0;
+       heap->heap_cnt = 0;
+       zfree(&heap->heap_array);
+}
+
+/*
+ * Remove the item at index 0 of @heap (a no-op when the heap is empty).
+ *
+ * The hole left at index 0 is walked down the tree, each time promoting
+ * the child with the smaller ordinal, until it reaches a slot that does
+ * not have two children.  The last item of the array is then re-inserted
+ * from that slot with auxtrace_heapify().
+ */
+void auxtrace_heap__pop(struct auxtrace_heap *heap)
+{
+       /* heap_cnt keeps the count from before the removal. */
+       unsigned int pos, last, heap_cnt = heap->heap_cnt;
+       struct auxtrace_heap_item *heap_array;
+
+       if (!heap_cnt)
+               return;
+
+       heap->heap_cnt -= 1;
+
+       heap_array = heap->heap_array;
+
+       pos = 0;
+       while (1) {
+               unsigned int left, right;
+
+               left = (pos << 1) + 1;
+               if (left >= heap_cnt)
+                       break;
+               right = left + 1;
+               if (right >= heap_cnt) {
+                       /*
+                        * Only a left child exists, so it is the last item
+                        * of the array: moving it up completes the removal.
+                        */
+                       heap_array[pos] = heap_array[left];
+                       return;
+               }
+               /* Promote the smaller child; ties go to the right child. */
+               if (heap_array[left].ordinal < heap_array[right].ordinal) {
+                       heap_array[pos] = heap_array[left];
+                       pos = left;
+               } else {
+                       heap_array[pos] = heap_array[right];
+                       pos = right;
+               }
+       }
+
+       /* Re-insert the former last item starting from the hole. */
+       last = heap_cnt - 1;
+       auxtrace_heapify(heap_array, pos, heap_array[last].queue_nr,
+                        heap_array[last].ordinal);
+}
+
+/* Return the value of itr->info_priv_size(), or 0 when @itr is NULL. */
+size_t auxtrace_record__info_priv_size(struct auxtrace_record *itr)
+{
+       return itr ? itr->info_priv_size(itr) : 0;
+}
+
+/* Report that AUX area tracing is unavailable; always returns -EINVAL. */
+static int auxtrace_not_supported(void)
+{
+       pr_err("AUX area tracing is not supported on this architecture\n");
+       return -EINVAL;
+}
+
+/*
+ * Forward to itr->info_fill() and return its result.  Without a recorder
+ * (@itr is NULL) the "not supported" error is reported and -EINVAL is
+ * returned.
+ */
+int auxtrace_record__info_fill(struct auxtrace_record *itr,
+                              struct perf_session *session,
+                              struct auxtrace_info_event *auxtrace_info,
+                              size_t priv_size)
+{
+       if (!itr)
+               return auxtrace_not_supported();
+
+       return itr->info_fill(itr, session, auxtrace_info, priv_size);
+}
+
+/* Invoke the recorder's own free() callback; a NULL @itr is ignored. */
+void auxtrace_record__free(struct auxtrace_record *itr)
+{
+       if (!itr)
+               return;
+
+       itr->free(itr);
+}
+
+/*
+ * Call the optional snapshot_start() callback of @itr and return its
+ * result; returns 0 when there is no recorder or no such callback.
+ */
+int auxtrace_record__snapshot_start(struct auxtrace_record *itr)
+{
+       if (!itr || !itr->snapshot_start)
+               return 0;
+
+       return itr->snapshot_start(itr);
+}
+
+/*
+ * Call the optional snapshot_finish() callback of @itr and return its
+ * result; returns 0 when there is no recorder or no such callback.
+ */
+int auxtrace_record__snapshot_finish(struct auxtrace_record *itr)
+{
+       if (!itr || !itr->snapshot_finish)
+               return 0;
+
+       return itr->snapshot_finish(itr);
+}
+
+/*
+ * Call the optional find_snapshot() callback of @itr with the remaining
+ * arguments unchanged and return its result; returns 0 when there is no
+ * recorder or no such callback.
+ */
+int auxtrace_record__find_snapshot(struct auxtrace_record *itr, int idx,
+                                  struct auxtrace_mmap *mm,
+                                  unsigned char *data, u64 *head, u64 *old)
+{
+       if (!itr || !itr->find_snapshot)
+               return 0;
+
+       return itr->find_snapshot(itr, idx, mm, data, head, old);
+}
+
+/*
+ * Forward to itr->recording_options() and return its result; returns 0
+ * when @itr is NULL.
+ */
+int auxtrace_record__options(struct auxtrace_record *itr,
+                            struct perf_evlist *evlist,
+                            struct record_opts *opts)
+{
+       if (!itr)
+               return 0;
+
+       return itr->recording_options(itr, evlist, opts);
+}
+
+/* Return the value of itr->reference(), or 0 when @itr is NULL. */
+u64 auxtrace_record__reference(struct auxtrace_record *itr)
+{
+       return itr ? itr->reference(itr) : 0;
+}
+
+/*
+ * Hand the snapshot option string @str to itr->parse_snapshot_options().
+ *
+ * Returns 0 when @str is NULL, -EINVAL (with an error message) when a
+ * string was given but there is no recorder, and the callback's result
+ * otherwise.
+ */
+int auxtrace_parse_snapshot_options(struct auxtrace_record *itr,
+                                   struct record_opts *opts, const char *str)
+{
+       if (!str)
+               return 0;
+
+       if (!itr) {
+               pr_err("No AUX area tracing to snapshot\n");
+               return -EINVAL;
+       }
+
+       return itr->parse_snapshot_options(itr, opts, str);
+}
+
+/*
+ * Weak default: sets *err to 0 and returns NULL, i.e. no recorder and no
+ * error.  Being __weak, another definition of the same symbol replaces it
+ * at link time.
+ */
+struct auxtrace_record *__weak
+auxtrace_record__init(struct perf_evlist *evlist __maybe_unused, int *err)
+{
+       *err = 0;
+       return NULL;
+}
+
+/*
+ * Allocate an empty index block (nr == 0) and append it to @head.
+ * Returns 0 on success or -ENOMEM.
+ */
+static int auxtrace_index__alloc(struct list_head *head)
+{
+       struct auxtrace_index *blk = malloc(sizeof(*blk));
+
+       if (!blk)
+               return -ENOMEM;
+
+       blk->nr = 0;
+       INIT_LIST_HEAD(&blk->list);
+       list_add_tail(&blk->list, head);
+
+       return 0;
+}
+
+/* Unlink and free every index block on @head, leaving the list empty. */
+void auxtrace_index__free(struct list_head *head)
+{
+       while (!list_empty(head)) {
+               struct auxtrace_index *first;
+
+               first = list_entry(head->next, struct auxtrace_index, list);
+               list_del(&first->list);
+               free(first);
+       }
+}
+
+/*
+ * Return the last index block on @head that still has room for an entry.
+ *
+ * A new block is allocated and appended when the list is empty or when
+ * the current last block already holds PERF_AUXTRACE_INDEX_ENTRY_COUNT
+ * entries.  Returns NULL if that allocation fails.
+ */
+static struct auxtrace_index *auxtrace_index__last(struct list_head *head)
+{
+       struct auxtrace_index *tail;
+
+       if (list_empty(head) && auxtrace_index__alloc(head))
+               return NULL;
+
+       tail = list_entry(head->prev, struct auxtrace_index, list);
+       if (tail->nr < PERF_AUXTRACE_INDEX_ENTRY_COUNT)
+               return tail;
+
+       if (auxtrace_index__alloc(head))
+               return NULL;
+
+       return list_entry(head->prev, struct auxtrace_index, list);
+}
+
+/*
+ * Append an index entry recording that @event (of size
+ * event->header.size) is located at @file_offset.
+ * Returns 0 on success or -ENOMEM if no index block is available.
+ */
+int auxtrace_index__auxtrace_event(struct list_head *head,
+                                  union perf_event *event, off_t file_offset)
+{
+       struct auxtrace_index *tail = auxtrace_index__last(head);
+       struct auxtrace_index_entry *slot;
+
+       if (!tail)
+               return -ENOMEM;
+
+       slot = &tail->entries[tail->nr];
+       slot->file_offset = file_offset;
+       slot->sz = event->header.size;
+       tail->nr += 1;
+
+       return 0;
+}
+
+/*
+ * Write the entries of one index block to @fd, one
+ * struct auxtrace_index_entry at a time.
+ * Returns 0 on success or -errno from the first short write.
+ */
+static int auxtrace_index__do_write(int fd,
+                                   struct auxtrace_index *auxtrace_index)
+{
+       struct auxtrace_index_entry *src = auxtrace_index->entries;
+       struct auxtrace_index_entry *end = src + auxtrace_index->nr;
+
+       for (; src < end; src++) {
+               struct auxtrace_index_entry ent;
+
+               ent.file_offset = src->file_offset;
+               ent.sz = src->sz;
+
+               if (writen(fd, &ent, sizeof(ent)) != sizeof(ent))
+                       return -errno;
+       }
+
+       return 0;
+}
+
+/*
+ * Write the whole index to @fd: first the total number of entries as a
+ * u64, then the entries of every block in list order.
+ * Returns 0 on success or a negative errno value on a failed write.
+ */
+int auxtrace_index__write(int fd, struct list_head *head)
+{
+       struct auxtrace_index *blk;
+       u64 total = 0;
+
+       list_for_each_entry(blk, head, list)
+               total += blk->nr;
+
+       if (writen(fd, &total, sizeof(total)) != sizeof(total))
+               return -errno;
+
+       list_for_each_entry(blk, head, list) {
+               int err = auxtrace_index__do_write(fd, blk);
+
+               if (err)
+                       return err;
+       }
+
+       return 0;
+}
+
+/*
+ * Read one index entry from @fd and append it to the index on @head,
+ * byte-swapping both fields when @needs_swap is set.
+ * Returns 0 on success, -1 on a short read or if no index block is
+ * available.
+ */
+static int auxtrace_index__process_entry(int fd, struct list_head *head,
+                                        bool needs_swap)
+{
+       struct auxtrace_index_entry ent, *slot;
+       struct auxtrace_index *tail;
+
+       if (readn(fd, &ent, sizeof(ent)) != sizeof(ent))
+               return -1;
+
+       tail = auxtrace_index__last(head);
+       if (!tail)
+               return -1;
+
+       slot = &tail->entries[tail->nr];
+       slot->file_offset = needs_swap ? bswap_64(ent.file_offset) :
+                                        ent.file_offset;
+       slot->sz = needs_swap ? bswap_64(ent.sz) : ent.sz;
+       tail->nr += 1;
+
+       return 0;
+}
+
+/*
+ * Read an auxtrace index of at most @size bytes from @fd into
+ * session->auxtrace_index.
+ *
+ * The on-disk layout is a u64 entry count followed by that many
+ * struct auxtrace_index_entry records; values are byte-swapped when
+ * @needs_swap is set.
+ *
+ * Returns 0 on success, -1 on a short read, when the entry count does not
+ * fit in @size, or when an entry cannot be stored.
+ */
+int auxtrace_index__process(int fd, u64 size, struct perf_session *session,
+                           bool needs_swap)
+{
+       struct list_head *head = &session->auxtrace_index;
+       u64 nr;
+
+       if (readn(fd, &nr, sizeof(u64)) != sizeof(u64))
+               return -1;
+
+       if (needs_swap)
+               nr = bswap_64(nr);
+
+       /*
+        * nr comes from the file.  Compare by division so that a huge count
+        * cannot wrap the multiplication and slip past the size check.
+        */
+       if (size < sizeof(u64) ||
+           nr > (size - sizeof(u64)) / sizeof(struct auxtrace_index_entry))
+               return -1;
+
+       while (nr--) {
+               int err;
+
+               err = auxtrace_index__process_entry(fd, head, needs_swap);
+               if (err)
+                       return -1;
+       }
+
+       return 0;
+}
+
+/*
+ * Queue the event described by index entry @ent: forwards its file offset
+ * and size to auxtrace_queues__add_indexed_event() and returns its result.
+ */
+static int auxtrace_queues__process_index_entry(struct auxtrace_queues *queues,
+                                               struct perf_session *session,
+                                               struct auxtrace_index_entry *ent)
+{
+       return auxtrace_queues__add_indexed_event(queues, session,
+                                                 ent->file_offset, ent->sz);
+}
+
+/*
+ * Walk every used entry of every auxtrace_index on the session's list and
+ * hand it to auxtrace_queues__process_index_entry().
+ *
+ * Stops at the first non-zero result and returns it; returns 0 when every
+ * entry was processed.
+ */
+int auxtrace_queues__process_index(struct auxtrace_queues *queues,
+                                  struct perf_session *session)
+{
+       struct auxtrace_index *index;
+       size_t n;
+
+       list_for_each_entry(index, &session->auxtrace_index, list) {
+               for (n = 0; n < index->nr; n++) {
+                       int ret;
+
+                       ret = auxtrace_queues__process_index_entry(queues,
+                                                       session,
+                                                       &index->entries[n]);
+                       if (ret)
+                               return ret;
+               }
+       }
+       return 0;
+}
+
+/*
+ * Return the buffer that follows @buffer on @queue, or the first buffer on
+ * the queue when @buffer is NULL.  Returns NULL when there is no such buffer
+ * (empty queue, or @buffer is the last one).
+ */
+struct auxtrace_buffer *auxtrace_buffer__next(struct auxtrace_queue *queue,
+                                             struct auxtrace_buffer *buffer)
+{
+       struct list_head *pos;
+
+       if (!buffer) {
+               if (list_empty(&queue->head))
+                       return NULL;
+               pos = queue->head.next;
+       } else {
+               if (list_is_last(&buffer->list, &queue->head))
+                       return NULL;
+               pos = buffer->list.next;
+       }
+
+       return list_entry(pos, struct auxtrace_buffer, list);
+}
+
+/*
+ * Return a pointer to the buffer's data.  If the data is not already
+ * present it is mapped read-only from @fd at the buffer's data_offset and
+ * the mapping is recorded in mmap_addr / mmap_size.
+ *
+ * Returns NULL if mmap() fails.
+ */
+void *auxtrace_buffer__get_data(struct auxtrace_buffer *buffer, int fd)
+{
+       size_t adj, size;
+       off_t file_offset;
+       void *addr;
+
+       if (buffer->data)
+               return buffer->data;
+
+       /*
+        * Start the mapping at data_offset rounded down to a page_size
+        * boundary and extend the length by the same adjustment.
+        */
+       adj = buffer->data_offset & (page_size - 1);
+       size = buffer->size + adj;
+       file_offset = buffer->data_offset - adj;
+
+       addr = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, file_offset);
+       if (addr == MAP_FAILED)
+               return NULL;
+
+       buffer->mmap_addr = addr;
+       buffer->mmap_size = size;
+       buffer->data = addr + adj;
+
+       return buffer->data;
+}
+
+/*
+ * Unmap the buffer's data if it is currently mmapped and clear the fields
+ * that referred to the mapping.  Does nothing when there is no data or no
+ * mapping.
+ */
+void auxtrace_buffer__put_data(struct auxtrace_buffer *buffer)
+{
+       if (buffer->data && buffer->mmap_addr) {
+               munmap(buffer->mmap_addr, buffer->mmap_size);
+               buffer->mmap_addr = NULL;
+               buffer->mmap_size = 0;
+               buffer->data = NULL;
+               buffer->use_data = NULL;
+       }
+}
+
+/*
+ * Release the buffer's data however it is held: unmap it if mmapped, and
+ * free it (also resetting the size to 0) if it was flagged as needing
+ * freeing.
+ */
+void auxtrace_buffer__drop_data(struct auxtrace_buffer *buffer)
+{
+       auxtrace_buffer__put_data(buffer);
+
+       if (!buffer->data_needs_freeing)
+               return;
+
+       buffer->data_needs_freeing = false;
+       zfree(&buffer->data);
+       buffer->use_data = NULL;
+       buffer->size = 0;
+}
+
+/* Drop any data held by @buffer, then free the buffer structure itself. */
+void auxtrace_buffer__free(struct auxtrace_buffer *buffer)
+{
+       auxtrace_buffer__drop_data(buffer);
+
+       free(buffer);
+}
+
+/*
+ * Fill in @auxtrace_error as a PERF_RECORD_AUXTRACE_ERROR event.
+ *
+ * The event is zeroed first, @msg is copied (truncated to
+ * MAX_AUXTRACE_ERROR_MSG) and header.size is set to cover everything up to
+ * and including the message's terminating NUL, rounded up to a multiple of
+ * sizeof(u64).
+ */
+void auxtrace_synth_error(struct auxtrace_error_event *auxtrace_error, int type,
+                         int code, int cpu, pid_t pid, pid_t tid, u64 ip,
+                         const char *msg)
+{
+       size_t msg_off, used;
+
+       memset(auxtrace_error, 0, sizeof(*auxtrace_error));
+
+       auxtrace_error->header.type = PERF_RECORD_AUXTRACE_ERROR;
+       auxtrace_error->type = type;
+       auxtrace_error->code = code;
+       auxtrace_error->cpu = cpu;
+       auxtrace_error->pid = pid;
+       auxtrace_error->tid = tid;
+       auxtrace_error->ip = ip;
+       strlcpy(auxtrace_error->msg, msg, MAX_AUXTRACE_ERROR_MSG);
+
+       /* Bytes from the start of the event to the end of the message */
+       msg_off = (char *)auxtrace_error->msg - (char *)auxtrace_error;
+       used = msg_off + strlen(auxtrace_error->msg) + 1;
+       auxtrace_error->header.size = PERF_ALIGN(used, sizeof(u64));
+}
+
+/*
+ * Build a PERF_RECORD_AUXTRACE_INFO event sized for @itr's private data, let
+ * auxtrace_record__info_fill() populate it and pass it to @process.
+ *
+ * Returns -ENOMEM if the event cannot be allocated, otherwise the result of
+ * the fill step if that fails, otherwise the result of @process.  The
+ * temporary event is always freed before returning.
+ */
+int perf_event__synthesize_auxtrace_info(struct auxtrace_record *itr,
+                                        struct perf_tool *tool,
+                                        struct perf_session *session,
+                                        perf_event__handler_t process)
+{
+       size_t priv_size, ev_size;
+       union perf_event *ev;
+       int err;
+
+       pr_debug2("Synthesizing auxtrace information\n");
+
+       priv_size = auxtrace_record__info_priv_size(itr);
+       ev_size = sizeof(struct auxtrace_info_event) + priv_size;
+
+       ev = zalloc(ev_size);
+       if (!ev)
+               return -ENOMEM;
+
+       ev->auxtrace_info.header.type = PERF_RECORD_AUXTRACE_INFO;
+       ev->auxtrace_info.header.size = ev_size;
+
+       err = auxtrace_record__info_fill(itr, session, &ev->auxtrace_info,
+                                        priv_size);
+       if (!err)
+               err = process(tool, ev, NULL, NULL);
+
+       free(ev);
+       return err;
+}
+
+/*
+ * Decoding is skipped when the session has no itrace synthesis options at
+ * all, or when those options have dont_decode set.
+ */
+static bool auxtrace__dont_decode(struct perf_session *session)
+{
+       if (!session->itrace_synth_opts)
+               return true;
+
+       return session->itrace_synth_opts->dont_decode;
+}
+
+/*
+ * Handle a PERF_RECORD_AUXTRACE_INFO event: print its type when dump_trace
+ * is set, then dispatch on the type.  No type is handled here, so the
+ * function always returns -EINVAL.
+ */
+int perf_event__process_auxtrace_info(struct perf_tool *tool __maybe_unused,
+                                     union perf_event *event,
+                                     struct perf_session *session __maybe_unused)
+{
+       enum auxtrace_type type = event->auxtrace_info.type;
+       int err = -EINVAL;
+
+       if (dump_trace)
+               fprintf(stdout, " type: %u\n", type);
+
+       switch (type) {
+       case PERF_AUXTRACE_UNKNOWN:
+       default:
+               break;
+       }
+
+       return err;
+}
+
+/*
+ * Handle a PERF_RECORD_AUXTRACE event.
+ *
+ * Prints the event fields when dump_trace is set.  Unless decoding is
+ * disabled, the event is passed to the session's process_auxtrace_event
+ * callback.
+ *
+ * Returns the event's auxtrace.size on success, -EINVAL if decoding is wanted
+ * but the session has no auxtrace callbacks or the event type is wrong, or
+ * the callback's negative error.
+ */
+s64 perf_event__process_auxtrace(struct perf_tool *tool,
+                                union perf_event *event,
+                                struct perf_session *session)
+{
+       if (dump_trace)
+               fprintf(stdout, " size: %#"PRIx64"  offset: %#"PRIx64"  ref: %#"PRIx64"  idx: %u  tid: %d  cpu: %d\n",
+                       event->auxtrace.size, event->auxtrace.offset,
+                       event->auxtrace.reference, event->auxtrace.idx,
+                       event->auxtrace.tid, event->auxtrace.cpu);
+
+       if (!auxtrace__dont_decode(session)) {
+               s64 err;
+
+               if (!session->auxtrace ||
+                   event->header.type != PERF_RECORD_AUXTRACE)
+                       return -EINVAL;
+
+               err = session->auxtrace->process_auxtrace_event(session, event,
+                                                               tool);
+               if (err < 0)
+                       return err;
+       }
+
+       return event->auxtrace.size;
+}
+
+/*
+ * Defaults and limits used by itrace_synth_opts__set_default() and
+ * itrace_parse_synth_opts(): the default 'instructions' period is 100000 in
+ * the default period type (nanoseconds); callchain sizes default to 16 and
+ * are accepted up to 1024.
+ */
+#define PERF_ITRACE_DEFAULT_PERIOD_TYPE                PERF_ITRACE_PERIOD_NANOSECS
+#define PERF_ITRACE_DEFAULT_PERIOD             100000
+#define PERF_ITRACE_DEFAULT_CALLCHAIN_SZ       16
+#define PERF_ITRACE_MAX_CALLCHAIN_SZ           1024
+
+/*
+ * Apply the default synthesis options: synthesize instructions, branches,
+ * transactions and errors, using the default period, period type and
+ * callchain size.  Other fields of @synth_opts are left as they are.
+ */
+void itrace_synth_opts__set_default(struct itrace_synth_opts *synth_opts)
+{
+       /* What to synthesize */
+       synth_opts->instructions = true;
+       synth_opts->branches = true;
+       synth_opts->transactions = true;
+       synth_opts->errors = true;
+
+       /* How to synthesize it */
+       synth_opts->period = PERF_ITRACE_DEFAULT_PERIOD;
+       synth_opts->period_type = PERF_ITRACE_DEFAULT_PERIOD_TYPE;
+       synth_opts->callchain_sz = PERF_ITRACE_DEFAULT_CALLCHAIN_SZ;
+}
+
+/*
+ * Parse the Instruction Tracing options string into opt->value, which is a
+ * struct itrace_synth_opts.
+ *
+ * Please check tools/perf/Documentation/perf-script.txt for information
+ * about the options parsed here, which is introduced after this cset,
+ * when support in 'perf script' for these options is introduced.
+ *
+ * Option letters, which may be separated by spaces or commas:
+ *   i[period[unit]]  synthesize instructions events; unit is one of
+ *                    i (instructions), t (ticks), ms, us or ns
+ *   b                synthesize branches events
+ *   x                synthesize transactions events
+ *   e                synthesize error events
+ *   d                write a decoding log
+ *   c                synthesize branches events, calls only
+ *   r                synthesize branches events, returns only
+ *   g[size]          add callchains, with size in the range
+ *                    1..PERF_ITRACE_MAX_CALLCHAIN_SZ
+ *
+ * If @unset is non-zero only dont_decode is set.  If @str is NULL the
+ * defaults are applied.  Returns 0 on success or -EINVAL for a malformed
+ * string.
+ */
+int itrace_parse_synth_opts(const struct option *opt, const char *str,
+                           int unset)
+{
+       struct itrace_synth_opts *synth_opts = opt->value;
+       const char *p;
+       char *endptr;
+       /*
+        * PERF_ITRACE_PERIOD_INSTRUCTIONS has the value 0, so the period type
+        * field alone cannot tell "not given" from "instructions".  Remember
+        * explicitly whether a period type was parsed.
+        */
+       bool period_type_set = false;
+
+       synth_opts->set = true;
+
+       if (unset) {
+               synth_opts->dont_decode = true;
+               return 0;
+       }
+
+       if (!str) {
+               itrace_synth_opts__set_default(synth_opts);
+               return 0;
+       }
+
+       for (p = str; *p;) {
+               switch (*p++) {
+               case 'i':
+                       synth_opts->instructions = true;
+                       while (*p == ' ' || *p == ',')
+                               p += 1;
+                       if (isdigit(*p)) {
+                               synth_opts->period = strtoull(p, &endptr, 10);
+                               p = endptr;
+                               while (*p == ' ' || *p == ',')
+                                       p += 1;
+                               switch (*p++) {
+                               case 'i':
+                                       synth_opts->period_type =
+                                               PERF_ITRACE_PERIOD_INSTRUCTIONS;
+                                       period_type_set = true;
+                                       break;
+                               case 't':
+                                       synth_opts->period_type =
+                                               PERF_ITRACE_PERIOD_TICKS;
+                                       period_type_set = true;
+                                       break;
+                               case 'm':
+                                       /* ms: scale to ns via the us case */
+                                       synth_opts->period *= 1000;
+                                       /* Fall through */
+                               case 'u':
+                                       synth_opts->period *= 1000;
+                                       /* Fall through */
+                               case 'n':
+                                       if (*p++ != 's')
+                                               goto out_err;
+                                       synth_opts->period_type =
+                                               PERF_ITRACE_PERIOD_NANOSECS;
+                                       period_type_set = true;
+                                       break;
+                               case '\0':
+                                       /* Period with no unit ends the string */
+                                       goto out;
+                               default:
+                                       goto out_err;
+                               }
+                       }
+                       break;
+               case 'b':
+                       synth_opts->branches = true;
+                       break;
+               case 'x':
+                       synth_opts->transactions = true;
+                       break;
+               case 'e':
+                       synth_opts->errors = true;
+                       break;
+               case 'd':
+                       synth_opts->log = true;
+                       break;
+               case 'c':
+                       synth_opts->branches = true;
+                       synth_opts->calls = true;
+                       break;
+               case 'r':
+                       synth_opts->branches = true;
+                       synth_opts->returns = true;
+                       break;
+               case 'g':
+                       synth_opts->callchain = true;
+                       synth_opts->callchain_sz =
+                                       PERF_ITRACE_DEFAULT_CALLCHAIN_SZ;
+                       while (*p == ' ' || *p == ',')
+                               p += 1;
+                       if (isdigit(*p)) {
+                               unsigned int val;
+
+                               val = strtoul(p, &endptr, 10);
+                               p = endptr;
+                               if (!val || val > PERF_ITRACE_MAX_CALLCHAIN_SZ)
+                                       goto out_err;
+                               synth_opts->callchain_sz = val;
+                       }
+                       break;
+               case ' ':
+               case ',':
+                       break;
+               default:
+                       goto out_err;
+               }
+       }
+out:
+       if (synth_opts->instructions) {
+               /* Only default the type if the user did not choose one */
+               if (!period_type_set && !synth_opts->period_type)
+                       synth_opts->period_type =
+                                       PERF_ITRACE_DEFAULT_PERIOD_TYPE;
+               if (!synth_opts->period)
+                       synth_opts->period = PERF_ITRACE_DEFAULT_PERIOD;
+       }
+
+       return 0;
+
+out_err:
+       pr_err("Bad Instruction Tracing options '%s'\n", str);
+       return -EINVAL;
+}
+
+/*
+ * Printable names for AUX area tracing error types, indexed by error type.
+ * Types without an entry are NULL and are reported as "unknown AUX" by
+ * auxtrace_error_name().
+ */
+static const char * const auxtrace_error_type_name[] = {
+       [PERF_AUXTRACE_ERROR_ITRACE] = "instruction trace",
+};
+
+/*
+ * Map an AUX area tracing error type to a printable name.
+ *
+ * Returns the table entry for @type, or "unknown AUX" when @type is outside
+ * 0..PERF_AUXTRACE_ERROR_MAX-1 or has no name in the table.
+ */
+static const char *auxtrace_error_name(int type)
+{
+       const char *error_type_name = NULL;
+
+       /*
+        * @type is signed and callers pass values taken from event data, so
+        * reject negative values as well as too-large ones before indexing.
+        */
+       if (type >= 0 && type < PERF_AUXTRACE_ERROR_MAX)
+               error_type_name = auxtrace_error_type_name[type];
+       if (!error_type_name)
+               error_type_name = "unknown AUX";
+       return error_type_name;
+}
+
+/*
+ * Print a one-line description of an auxtrace error event to @fp and return
+ * the sum of the two fprintf() results.
+ */
+size_t perf_event__fprintf_auxtrace_error(union perf_event *event, FILE *fp)
+{
+       struct auxtrace_error_event *e = &event->auxtrace_error;
+       int printed = 0;
+
+       printed += fprintf(fp, " %s error type %u",
+                          auxtrace_error_name(e->type), e->type);
+       printed += fprintf(fp,
+                          " cpu %d pid %d tid %d ip %#"PRIx64" code %u: %s\n",
+                          e->cpu, e->pid, e->tid, e->ip, e->code, e->msg);
+
+       return printed;
+}
+
+/*
+ * Count an auxtrace error event in the session's evlist statistics.  Events
+ * whose type is not below PERF_AUXTRACE_ERROR_MAX are not counted.
+ */
+void perf_session__auxtrace_error_inc(struct perf_session *session,
+                                     union perf_event *event)
+{
+       struct auxtrace_error_event *e = &event->auxtrace_error;
+
+       if (e->type >= PERF_AUXTRACE_ERROR_MAX)
+               return;
+
+       session->evlist->stats.nr_auxtrace_errors[e->type]++;
+}
+
+/*
+ * Emit one warning per error type that has a non-zero count in @stats,
+ * giving the count and the type's name.
+ */
+void events_stats__auxtrace_error_warn(const struct events_stats *stats)
+{
+       int type;
+
+       for (type = 0; type < PERF_AUXTRACE_ERROR_MAX; type++) {
+               if (stats->nr_auxtrace_errors[type])
+                       ui__warning("%u %s errors\n",
+                                   stats->nr_auxtrace_errors[type],
+                                   auxtrace_error_name(type));
+       }
+}
+
+/*
+ * Handle an auxtrace error event by printing it to stdout, unless decoding
+ * is disabled for the session.  Always returns 0.
+ */
+int perf_event__process_auxtrace_error(struct perf_tool *tool __maybe_unused,
+                                      union perf_event *event,
+                                      struct perf_session *session)
+{
+       if (!auxtrace__dont_decode(session))
+               perf_event__fprintf_auxtrace_error(event, stdout);
+
+       return 0;
+}
+
+/*
+ * Read the data written to the AUX area mmap @mm between the previously
+ * seen head (mm->prev) and the current head, and pass it to @fn together
+ * with a PERF_RECORD_AUXTRACE event describing it.
+ *
+ * @mm: the auxtrace mmap to read
+ * @itr: recording callbacks (snapshot lookup, reference, read_finish)
+ * @tool: passed through to @fn
+ * @fn: consumer of the event and of up to two data fragments
+ * @snapshot: read in snapshot mode
+ * @snapshot_size: in snapshot mode, the maximum number of bytes to take
+ *
+ * Returns 0 when there is no new data, 1 when data was passed to @fn, -1 if
+ * the snapshot lookup or @fn fails, or the negative error from the
+ * read_finish callback.
+ */
+static int __auxtrace_mmap__read(struct auxtrace_mmap *mm,
+                                struct auxtrace_record *itr,
+                                struct perf_tool *tool, process_auxtrace_t fn,
+                                bool snapshot, size_t snapshot_size)
+{
+       u64 head, old = mm->prev, offset, ref;
+       unsigned char *data = mm->base;
+       size_t size, head_off, old_off, len1, len2, padding;
+       union perf_event ev;
+       void *data1, *data2;
+
+       if (snapshot) {
+               head = auxtrace_mmap__read_snapshot_head(mm);
+               /* The callback may adjust both 'head' and 'old' */
+               if (auxtrace_record__find_snapshot(itr, mm->idx, mm, data,
+                                                  &head, &old))
+                       return -1;
+       } else {
+               head = auxtrace_mmap__read_head(mm);
+       }
+
+       /* Nothing new since the last read */
+       if (old == head)
+               return 0;
+
+       pr_debug3("auxtrace idx %d old %#"PRIx64" head %#"PRIx64" diff %#"PRIx64"\n",
+                 mm->idx, old, head, head - old);
+
+       /* Reduce head and old to offsets within the buffer */
+       if (mm->mask) {
+               head_off = head & mm->mask;
+               old_off = old & mm->mask;
+       } else {
+               head_off = head % mm->len;
+               old_off = old % mm->len;
+       }
+
+       /* If head is not ahead of old within the buffer, the data wraps */
+       if (head_off > old_off)
+               size = head_off - old_off;
+       else
+               size = mm->len - (old_off - head_off);
+
+       if (snapshot && size > snapshot_size)
+               size = snapshot_size;
+
+       ref = auxtrace_record__reference(itr);
+
+       if (head > old || size <= head || mm->mask) {
+               offset = head - size;
+       } else {
+               /*
+                * When the buffer size is not a power of 2, 'head' wraps at the
+                * highest multiple of the buffer size, so we have to subtract
+                * the remainder here.
+                */
+               u64 rem = (0ULL - mm->len) % mm->len;
+
+               offset = head - size - rem;
+       }
+
+       /*
+        * The data ends at head_off.  If it is longer than head_off it started
+        * before the end of the buffer and wrapped: fragment 1 is the tail of
+        * the buffer and fragment 2 runs from the start up to head_off.
+        */
+       if (size > head_off) {
+               len1 = size - head_off;
+               data1 = &data[mm->len - len1];
+               len2 = head_off;
+               data2 = &data[0];
+       } else {
+               len1 = size;
+               data1 = &data[head_off - len1];
+               len2 = 0;
+               data2 = NULL;
+       }
+
+       /* padding must be written by fn() e.g. record__process_auxtrace() */
+       padding = size & 7;
+       if (padding)
+               padding = 8 - padding;
+
+       /* The event's size field includes the padding up to 8 bytes */
+       memset(&ev, 0, sizeof(ev));
+       ev.auxtrace.header.type = PERF_RECORD_AUXTRACE;
+       ev.auxtrace.header.size = sizeof(ev.auxtrace);
+       ev.auxtrace.size = size + padding;
+       ev.auxtrace.offset = offset;
+       ev.auxtrace.reference = ref;
+       ev.auxtrace.idx = mm->idx;
+       ev.auxtrace.tid = mm->tid;
+       ev.auxtrace.cpu = mm->cpu;
+
+       if (fn(tool, &ev, data1, len1, data2, len2))
+               return -1;
+
+       /* Remember how far we have read, only once fn() has succeeded */
+       mm->prev = head;
+
+       /* The tail is written and read_finish called only outside snapshot mode */
+       if (!snapshot) {
+               auxtrace_mmap__write_tail(mm, head);
+               if (itr->read_finish) {
+                       int err;
+
+                       err = itr->read_finish(itr, mm->idx);
+                       if (err < 0)
+                               return err;
+               }
+       }
+
+       return 1;
+}
+
+/*
+ * Read new data from an auxtrace mmap in normal (non-snapshot) mode.
+ * Returns whatever __auxtrace_mmap__read() returns.
+ */
+int auxtrace_mmap__read(struct auxtrace_mmap *mm, struct auxtrace_record *itr,
+                       struct perf_tool *tool, process_auxtrace_t fn)
+{
+       const bool snapshot = false;
+
+       return __auxtrace_mmap__read(mm, itr, tool, fn, snapshot, 0);
+}
+
+/*
+ * Read data from an auxtrace mmap in snapshot mode, taking at most
+ * @snapshot_size bytes.  Returns whatever __auxtrace_mmap__read() returns.
+ */
+int auxtrace_mmap__read_snapshot(struct auxtrace_mmap *mm,
+                                struct auxtrace_record *itr,
+                                struct perf_tool *tool, process_auxtrace_t fn,
+                                size_t snapshot_size)
+{
+       const bool snapshot = true;
+
+       return __auxtrace_mmap__read(mm, itr, tool, fn, snapshot,
+                                    snapshot_size);
+}
+
+/**
+ * struct auxtrace_cache - hash table to implement a cache
+ * @hashtable: the hashtable
+ * @sz: hashtable size (number of hlists)
+ * @entry_size: size of an entry, as allocated by
+ *              auxtrace_cache__alloc_entry()
+ * @limit: limit the number of entries to this maximum, when reached the cache
+ *         is dropped and caching begins again with an empty cache; %0 means
+ *         no limit is applied
+ * @cnt: current number of entries
+ * @bits: hashtable size (@sz = 2^@bits)
+ */
+struct auxtrace_cache {
+       struct hlist_head *hashtable;
+       size_t sz;
+       size_t entry_size;
+       size_t limit;
+       size_t cnt;
+       unsigned int bits;
+};
+
+/*
+ * Allocate a cache with 2^@bits hash buckets for entries of @entry_size
+ * bytes.  The entry limit is @limit_percent percent of the bucket count.
+ *
+ * Returns the new cache, or NULL if either allocation fails.
+ */
+struct auxtrace_cache *auxtrace_cache__new(unsigned int bits, size_t entry_size,
+                                          unsigned int limit_percent)
+{
+       struct auxtrace_cache *c = zalloc(sizeof(*c));
+       size_t nr_buckets = 1UL << bits;
+       size_t n;
+
+       if (!c)
+               return NULL;
+
+       c->hashtable = calloc(nr_buckets, sizeof(struct hlist_head));
+       if (!c->hashtable) {
+               free(c);
+               return NULL;
+       }
+
+       for (n = 0; n < nr_buckets; n++)
+               INIT_HLIST_HEAD(&c->hashtable[n]);
+
+       c->sz = nr_buckets;
+       c->entry_size = entry_size;
+       c->limit = (nr_buckets * limit_percent) / 100;
+       c->bits = bits;
+
+       return c;
+}
+
+/*
+ * Remove and free every entry in the cache and reset the entry count,
+ * leaving the (now empty) hash table in place.  A NULL cache is ignored.
+ */
+static void auxtrace_cache__drop(struct auxtrace_cache *c)
+{
+       struct auxtrace_cache_entry *pos;
+       struct hlist_node *next;
+       size_t bucket;
+
+       if (!c)
+               return;
+
+       for (bucket = 0; bucket < c->sz; bucket++) {
+               struct hlist_head *hlist = &c->hashtable[bucket];
+
+               hlist_for_each_entry_safe(pos, next, hlist, hash) {
+                       hlist_del(&pos->hash);
+                       auxtrace_cache__free_entry(c, pos);
+               }
+       }
+
+       c->cnt = 0;
+}
+
+/*
+ * Free a cache: all of its entries, the hash table and the cache structure.
+ * A NULL cache is ignored.
+ */
+void auxtrace_cache__free(struct auxtrace_cache *c)
+{
+       if (c) {
+               auxtrace_cache__drop(c);
+               free(c->hashtable);
+               free(c);
+       }
+}
+
+/*
+ * Allocate uninitialized memory for one cache entry of the cache's
+ * entry_size.  Returns NULL if malloc() fails.
+ */
+void *auxtrace_cache__alloc_entry(struct auxtrace_cache *c)
+{
+       size_t sz = c->entry_size;
+
+       return malloc(sz);
+}
+
+/* Free the memory of one cache entry; the cache itself is not consulted. */
+void auxtrace_cache__free_entry(struct auxtrace_cache *c __maybe_unused,
+                               void *entry)
+{
+       free(entry);
+}
+
+/*
+ * Add @entry to the cache under @key.
+ *
+ * When a limit is configured the entry count is incremented first, and if it
+ * then exceeds the limit the whole cache is dropped before the new entry is
+ * inserted.  Always returns 0.
+ */
+int auxtrace_cache__add(struct auxtrace_cache *c, u32 key,
+                       struct auxtrace_cache_entry *entry)
+{
+       struct hlist_head *bucket;
+
+       if (c->limit) {
+               c->cnt += 1;
+               if (c->cnt > c->limit)
+                       auxtrace_cache__drop(c);
+       }
+
+       bucket = &c->hashtable[hash_32(key, c->bits)];
+       entry->key = key;
+       hlist_add_head(&entry->hash, bucket);
+
+       return 0;
+}
+
+/*
+ * Find the cache entry stored under @key.  Returns the entry, or NULL if
+ * there is none or if @c is NULL.
+ */
+void *auxtrace_cache__lookup(struct auxtrace_cache *c, u32 key)
+{
+       struct auxtrace_cache_entry *pos;
+       u32 bucket;
+
+       if (!c)
+               return NULL;
+
+       bucket = hash_32(key, c->bits);
+       hlist_for_each_entry(pos, &c->hashtable[bucket], hash) {
+               if (pos->key == key)
+                       return pos;
+       }
+
+       return NULL;
+}
diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h

new file mode 100644 (file)

index 0000000..a171abb
--- /dev/null
+++ b/tools/perf/util/auxtrace.h
@@ -0,0 +1,643 @@
+/*
+ * auxtrace.h: AUX area trace support
+ * Copyright (c) 2013-2015, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#ifndef __PERF_AUXTRACE_H
+#define __PERF_AUXTRACE_H
+
+#include <sys/types.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <linux/list.h>
+#include <linux/perf_event.h>
+#include <linux/types.h>
+
+#include "../perf.h"
+#include "event.h"
+#include "session.h"
+#include "debug.h"
+
+union perf_event;
+struct perf_session;
+struct perf_evlist;
+struct perf_tool;
+struct option;
+struct record_opts;
+struct auxtrace_info_event;
+struct events_stats;
+
+/* Type of AUX area trace data, as carried in auxtrace_info_event. */
+enum auxtrace_type {
+       PERF_AUXTRACE_UNKNOWN,
+};
+
+/*
+ * Unit in which the 'instructions' events period is expressed.  Note that
+ * PERF_ITRACE_PERIOD_INSTRUCTIONS has the value 0, so a zero period type is
+ * a valid choice and cannot by itself be used to mean "not set".
+ */
+enum itrace_period_type {
+       PERF_ITRACE_PERIOD_INSTRUCTIONS,
+       PERF_ITRACE_PERIOD_TICKS,
+       PERF_ITRACE_PERIOD_NANOSECS,
+};
+
+/**
+ * struct itrace_synth_opts - AUX area tracing synthesis options.
+ * @set: indicates whether or not options have been set
+ * @inject: indicates the event (not just the sample) must be fully synthesized
+ *          because 'perf inject' will write it out
+ * @instructions: whether to synthesize 'instructions' events
+ * @branches: whether to synthesize 'branches' events
+ * @transactions: whether to synthesize events for transactions
+ * @errors: whether to synthesize decoder error events
+ * @dont_decode: whether to skip decoding entirely
+ * @log: write a decoding log
+ * @calls: limit branch samples to calls (can be combined with @returns)
+ * @returns: limit branch samples to returns (can be combined with @calls)
+ * @callchain: add callchain to 'instructions' events
+ * @callchain_sz: maximum callchain size
+ * @period: 'instructions' events period
+ * @period_type: 'instructions' events period type
+ */
+struct itrace_synth_opts {
+       bool                    set;
+       bool                    inject;
+       bool                    instructions;
+       bool                    branches;
+       bool                    transactions;
+       bool                    errors;
+       bool                    dont_decode;
+       bool                    log;
+       bool                    calls;
+       bool                    returns;
+       bool                    callchain;
+       unsigned int            callchain_sz;
+       unsigned long long      period;
+       enum itrace_period_type period_type;
+};
+
+/**
+ * struct auxtrace_index_entry - indexes an AUX area tracing event within a
+ *                               perf.data file.
+ * @file_offset: offset within the perf.data file
+ * @sz: size of the event
+ */
+struct auxtrace_index_entry {
+       u64                     file_offset;
+       u64                     sz;
+};
+
+/* Number of entries held by each struct auxtrace_index */
+#define PERF_AUXTRACE_INDEX_ENTRY_COUNT 256
+
+/**
+ * struct auxtrace_index - index of AUX area tracing events within a perf.data
+ *                         file.
+ * @list: linking a number of arrays of entries
+ * @nr: number of entries in use in @entries
+ * @entries: array of entries
+ */
+struct auxtrace_index {
+       struct list_head        list;
+       size_t                  nr;
+       struct auxtrace_index_entry entries[PERF_AUXTRACE_INDEX_ENTRY_COUNT];
+};
+
+/**
+ * struct auxtrace - session callbacks to allow AUX area data decoding.
+ * @process_event: lets the decoder see all session events
+ * @process_auxtrace_event: process a PERF_RECORD_AUXTRACE event; a negative
+ *                          return value is treated as an error
+ * @flush_events: process any remaining data
+ * @free_events: free resources associated with event processing
+ * @free: free resources associated with the session
+ */
+struct auxtrace {
+       int (*process_event)(struct perf_session *session,
+                            union perf_event *event,
+                            struct perf_sample *sample,
+                            struct perf_tool *tool);
+       int (*process_auxtrace_event)(struct perf_session *session,
+                                     union perf_event *event,
+                                     struct perf_tool *tool);
+       int (*flush_events)(struct perf_session *session,
+                           struct perf_tool *tool);
+       void (*free_events)(struct perf_session *session);
+       void (*free)(struct perf_session *session);
+};
+
+/**
+ * struct auxtrace_buffer - a buffer containing AUX area tracing data.
+ * @list: buffers are queued in a list held by struct auxtrace_queue
+ * @size: size of the buffer in bytes
+ * @pid: in per-thread mode, the pid this buffer is associated with
+ * @tid: in per-thread mode, the tid this buffer is associated with
+ * @cpu: in per-cpu mode, the cpu this buffer is associated with
+ * @data: actual buffer data (can be null if the data has not been loaded)
+ * @data_offset: file offset at which the buffer can be read
+ * @mmap_addr: mmap address at which the buffer can be read
+ * @mmap_size: size of the mmap at @mmap_addr
+ * @data_needs_freeing: @data was malloc'd so free it when it is no longer
+ *                      needed
+ * @consecutive: the original data was split up and this buffer is consecutive
+ *               to the previous buffer
+ * @offset: offset as determined by aux_head / aux_tail members of struct
+ *          perf_event_mmap_page
+ * @reference: an implementation-specific reference determined when the data is
+ *             recorded
+ * @buffer_nr: used to number each buffer
+ * @use_size: implementation actually only uses this number of bytes
+ * @use_data: implementation actually only uses data starting at this address
+ */
+struct auxtrace_buffer {
+       struct list_head        list;
+       size_t                  size;
+       pid_t                   pid;
+       pid_t                   tid;
+       int                     cpu;
+       void                    *data;
+       off_t                   data_offset;
+       void                    *mmap_addr;
+       size_t                  mmap_size;
+       bool                    data_needs_freeing;
+       bool                    consecutive;
+       u64                     offset;
+       u64                     reference;
+       u64                     buffer_nr;
+       size_t                  use_size;
+       void                    *use_data;
+};
+
+/**
+ * struct auxtrace_queue - a queue of AUX area tracing data buffers.
+ * @head: head of buffer list
+ * @tid: in per-thread mode, the tid this queue is associated with
+ * @cpu: in per-cpu mode, the cpu this queue is associated with
+ * @set: %true once this queue has been dedicated to a specific thread or cpu
+ * @priv: implementation-specific data
+ */
+struct auxtrace_queue {
+       struct list_head        head;
+       pid_t                   tid;
+       int                     cpu;
+       bool                    set;
+       void                    *priv;
+};
+
+/**
+ * struct auxtrace_queues - an array of AUX area tracing queues.
+ * @queue_array: array of queues
+ * @nr_queues: number of queues
+ * @new_data: set whenever new data is queued
+ * @populated: queues have been fully populated using the auxtrace_index
+ * @next_buffer_nr: used to number each buffer
+ */
+struct auxtrace_queues {
+       struct auxtrace_queue   *queue_array;
+       unsigned int            nr_queues;
+       bool                    new_data;
+       bool                    populated;
+       u64                     next_buffer_nr;
+};
+
+/**
+ * struct auxtrace_heap_item - element of struct auxtrace_heap.
+ * @queue_nr: queue number
+ * @ordinal: value used for sorting (lowest ordinal is top of the heap) expected
+ *           to be a timestamp
+ */
+struct auxtrace_heap_item {
+       unsigned int            queue_nr;
+       u64                     ordinal;
+};
+
+/**
+ * struct auxtrace_heap - a heap suitable for sorting AUX area tracing queues.
+ * @heap_array: the heap
+ * @heap_cnt: the number of elements in the heap
+ * @heap_sz: maximum number of elements (grows as needed)
+ */
+struct auxtrace_heap {
+       struct auxtrace_heap_item       *heap_array;
+       unsigned int            heap_cnt;
+       unsigned int            heap_sz;
+};
+
+/**
+ * struct auxtrace_mmap - records an mmap of the auxtrace buffer.
+ * @base: address of mapped area
+ * @userpg: pointer to buffer's perf_event_mmap_page
+ * @mask: %0 if @len is not a power of two, otherwise (@len - %1)
+ * @len: size of mapped area
+ * @prev: previous aux_head, i.e. the head value up to which data has already
+ *        been read
+ * @idx: index of this mmap
+ * @tid: tid for a per-thread mmap (also set if there is only 1 tid on a per-cpu
+ *       mmap) otherwise %0
+ * @cpu: cpu number for a per-cpu mmap otherwise %-1
+ */
+struct auxtrace_mmap {
+       void            *base;
+       void            *userpg;
+       size_t          mask;
+       size_t          len;
+       u64             prev;
+       int             idx;
+       pid_t           tid;
+       int             cpu;
+};
+
+/**
+ * struct auxtrace_mmap_params - parameters to set up struct auxtrace_mmap.
+ * @mask: %0 if @len is not a power of two, otherwise (@len - %1)
+ * @offset: file offset of mapped area
+ * @len: size of mapped area
+ * @prot: mmap memory protection
+ * @idx: index of this mmap
+ * @tid: tid for a per-thread mmap (also set if there is only 1 tid on a per-cpu
+ *       mmap) otherwise %0
+ * @cpu: cpu number for a per-cpu mmap otherwise %-1
+ */
+struct auxtrace_mmap_params {
+       size_t          mask;
+       off_t           offset;
+       size_t          len;
+       int             prot;
+       int             idx;
+       pid_t           tid;
+       int             cpu;
+};
+
+/**
+ * struct auxtrace_record - callbacks for recording AUX area data.
+ * @recording_options: validate and process recording options
+ * @info_priv_size: return the size of the private data in auxtrace_info_event
+ * @info_fill: fill-in the private data in auxtrace_info_event
+ * @free: free this auxtrace record structure
+ * @snapshot_start: starting a snapshot
+ * @snapshot_finish: finishing a snapshot
+ * @find_snapshot: find data to snapshot within auxtrace mmap; may update the
+ *                 head and old positions it is passed
+ * @parse_snapshot_options: parse snapshot options
+ * @reference: provide a 64-bit reference number for auxtrace_event
+ * @read_finish: called after reading from an auxtrace mmap (optional, and not
+ *               called when reading in snapshot mode)
+ */
+struct auxtrace_record {
+       int (*recording_options)(struct auxtrace_record *itr,
+                                struct perf_evlist *evlist,
+                                struct record_opts *opts);
+       size_t (*info_priv_size)(struct auxtrace_record *itr);
+       int (*info_fill)(struct auxtrace_record *itr,
+                        struct perf_session *session,
+                        struct auxtrace_info_event *auxtrace_info,
+                        size_t priv_size);
+       void (*free)(struct auxtrace_record *itr);
+       int (*snapshot_start)(struct auxtrace_record *itr);
+       int (*snapshot_finish)(struct auxtrace_record *itr);
+       int (*find_snapshot)(struct auxtrace_record *itr, int idx,
+                            struct auxtrace_mmap *mm, unsigned char *data,
+                            u64 *head, u64 *old);
+       int (*parse_snapshot_options)(struct auxtrace_record *itr,
+                                     struct record_opts *opts,
+                                     const char *str);
+       u64 (*reference)(struct auxtrace_record *itr);
+       int (*read_finish)(struct auxtrace_record *itr, int idx);
+};
+
+#ifdef HAVE_AUXTRACE_SUPPORT
+
+/*
+ * In snapshot mode the mmapped page is read-only which makes using
+ * __sync_val_compare_and_swap() problematic.  However, snapshot mode expects
+ * the buffer is not updated while the snapshot is made (e.g. Intel PT disables
+ * the event) so there is not a race anyway.
+ */
+static inline u64 auxtrace_mmap__read_snapshot_head(struct auxtrace_mmap *mm)
+{
+       struct perf_event_mmap_page *pc = mm->userpg;
+       u64 head = ACCESS_ONCE(pc->aux_head);
+
+       /* Ensure all reads are done after we read the head */
+       rmb();
+       return head;
+}
+
+static inline u64 auxtrace_mmap__read_head(struct auxtrace_mmap *mm)
+{
+       struct perf_event_mmap_page *pc = mm->userpg;
+#if BITS_PER_LONG == 64 || !defined(HAVE_SYNC_COMPARE_AND_SWAP_SUPPORT)
+       u64 head = ACCESS_ONCE(pc->aux_head);
+#else
+       u64 head = __sync_val_compare_and_swap(&pc->aux_head, 0, 0);
+#endif
+
+       /* Ensure all reads are done after we read the head */
+       rmb();
+       return head;
+}
+
+static inline void auxtrace_mmap__write_tail(struct auxtrace_mmap *mm, u64 tail)
+{
+       struct perf_event_mmap_page *pc = mm->userpg;
+#if BITS_PER_LONG != 64 && defined(HAVE_SYNC_COMPARE_AND_SWAP_SUPPORT)
+       u64 old_tail;
+#endif
+
+       /* Ensure all reads are done before we write the tail out */
+       mb();
+#if BITS_PER_LONG == 64 || !defined(HAVE_SYNC_COMPARE_AND_SWAP_SUPPORT)
+       pc->aux_tail = tail;
+#else
+       do {
+               old_tail = __sync_val_compare_and_swap(&pc->aux_tail, 0, 0);
+       } while (!__sync_bool_compare_and_swap(&pc->aux_tail, old_tail, tail));
+#endif
+}
+
+int auxtrace_mmap__mmap(struct auxtrace_mmap *mm,
+                       struct auxtrace_mmap_params *mp,
+                       void *userpg, int fd);
+void auxtrace_mmap__munmap(struct auxtrace_mmap *mm);
+void auxtrace_mmap_params__init(struct auxtrace_mmap_params *mp,
+                               off_t auxtrace_offset,
+                               unsigned int auxtrace_pages,
+                               bool auxtrace_overwrite);
+void auxtrace_mmap_params__set_idx(struct auxtrace_mmap_params *mp,
+                                  struct perf_evlist *evlist, int idx,
+                                  bool per_cpu);
+
+typedef int (*process_auxtrace_t)(struct perf_tool *tool,
+                                 union perf_event *event, void *data1,
+                                 size_t len1, void *data2, size_t len2);
+
+int auxtrace_mmap__read(struct auxtrace_mmap *mm, struct auxtrace_record *itr,
+                       struct perf_tool *tool, process_auxtrace_t fn);
+
+int auxtrace_mmap__read_snapshot(struct auxtrace_mmap *mm,
+                                struct auxtrace_record *itr,
+                                struct perf_tool *tool, process_auxtrace_t fn,
+                                size_t snapshot_size);
+
+int auxtrace_queues__init(struct auxtrace_queues *queues);
+int auxtrace_queues__add_event(struct auxtrace_queues *queues,
+                              struct perf_session *session,
+                              union perf_event *event, off_t data_offset,
+                              struct auxtrace_buffer **buffer_ptr);
+void auxtrace_queues__free(struct auxtrace_queues *queues);
+int auxtrace_queues__process_index(struct auxtrace_queues *queues,
+                                  struct perf_session *session);
+struct auxtrace_buffer *auxtrace_buffer__next(struct auxtrace_queue *queue,
+                                             struct auxtrace_buffer *buffer);
+void *auxtrace_buffer__get_data(struct auxtrace_buffer *buffer, int fd);
+void auxtrace_buffer__put_data(struct auxtrace_buffer *buffer);
+void auxtrace_buffer__drop_data(struct auxtrace_buffer *buffer);
+void auxtrace_buffer__free(struct auxtrace_buffer *buffer);
+
+int auxtrace_heap__add(struct auxtrace_heap *heap, unsigned int queue_nr,
+                      u64 ordinal);
+void auxtrace_heap__pop(struct auxtrace_heap *heap);
+void auxtrace_heap__free(struct auxtrace_heap *heap);
+
+struct auxtrace_cache_entry {
+       struct hlist_node hash;
+       u32 key;
+};
+
+struct auxtrace_cache *auxtrace_cache__new(unsigned int bits, size_t entry_size,
+                                          unsigned int limit_percent);
+void auxtrace_cache__free(struct auxtrace_cache *auxtrace_cache);
+void *auxtrace_cache__alloc_entry(struct auxtrace_cache *c);
+void auxtrace_cache__free_entry(struct auxtrace_cache *c, void *entry);
+int auxtrace_cache__add(struct auxtrace_cache *c, u32 key,
+                       struct auxtrace_cache_entry *entry);
+void *auxtrace_cache__lookup(struct auxtrace_cache *c, u32 key);
+
+struct auxtrace_record *auxtrace_record__init(struct perf_evlist *evlist,
+                                             int *err);
+
+int auxtrace_parse_snapshot_options(struct auxtrace_record *itr,
+                                   struct record_opts *opts,
+                                   const char *str);
+int auxtrace_record__options(struct auxtrace_record *itr,
+                            struct perf_evlist *evlist,
+                            struct record_opts *opts);
+size_t auxtrace_record__info_priv_size(struct auxtrace_record *itr);
+int auxtrace_record__info_fill(struct auxtrace_record *itr,
+                              struct perf_session *session,
+                              struct auxtrace_info_event *auxtrace_info,
+                              size_t priv_size);
+void auxtrace_record__free(struct auxtrace_record *itr);
+int auxtrace_record__snapshot_start(struct auxtrace_record *itr);
+int auxtrace_record__snapshot_finish(struct auxtrace_record *itr);
+int auxtrace_record__find_snapshot(struct auxtrace_record *itr, int idx,
+                                  struct auxtrace_mmap *mm,
+                                  unsigned char *data, u64 *head, u64 *old);
+u64 auxtrace_record__reference(struct auxtrace_record *itr);
+
+int auxtrace_index__auxtrace_event(struct list_head *head, union perf_event *event,
+                                  off_t file_offset);
+int auxtrace_index__write(int fd, struct list_head *head);
+int auxtrace_index__process(int fd, u64 size, struct perf_session *session,
+                           bool needs_swap);
+void auxtrace_index__free(struct list_head *head);
+
+void auxtrace_synth_error(struct auxtrace_error_event *auxtrace_error, int type,
+                         int code, int cpu, pid_t pid, pid_t tid, u64 ip,
+                         const char *msg);
+
+int perf_event__synthesize_auxtrace_info(struct auxtrace_record *itr,
+                                        struct perf_tool *tool,
+                                        struct perf_session *session,
+                                        perf_event__handler_t process);
+int perf_event__process_auxtrace_info(struct perf_tool *tool,
+                                     union perf_event *event,
+                                     struct perf_session *session);
+s64 perf_event__process_auxtrace(struct perf_tool *tool,
+                                union perf_event *event,
+                                struct perf_session *session);
+int perf_event__process_auxtrace_error(struct perf_tool *tool,
+                                      union perf_event *event,
+                                      struct perf_session *session);
+int itrace_parse_synth_opts(const struct option *opt, const char *str,
+                           int unset);
+void itrace_synth_opts__set_default(struct itrace_synth_opts *synth_opts);
+
+size_t perf_event__fprintf_auxtrace_error(union perf_event *event, FILE *fp);
+void perf_session__auxtrace_error_inc(struct perf_session *session,
+                                     union perf_event *event);
+void events_stats__auxtrace_error_warn(const struct events_stats *stats);
+
+static inline int auxtrace__process_event(struct perf_session *session,
+                                         union perf_event *event,
+                                         struct perf_sample *sample,
+                                         struct perf_tool *tool)
+{
+       if (!session->auxtrace)
+               return 0;
+
+       return session->auxtrace->process_event(session, event, sample, tool);
+}
+
+static inline int auxtrace__flush_events(struct perf_session *session,
+                                        struct perf_tool *tool)
+{
+       if (!session->auxtrace)
+               return 0;
+
+       return session->auxtrace->flush_events(session, tool);
+}
+
+/* Call ->free_events() on the session's auxtrace, if one is attached. */
+static inline void auxtrace__free_events(struct perf_session *session)
+{
+       if (!session->auxtrace)
+               return;
+       session->auxtrace->free_events(session);
+}
+
+/* Call ->free() on the session's auxtrace, if one is attached. */
+static inline void auxtrace__free(struct perf_session *session)
+{
+       if (!session->auxtrace)
+               return;
+       session->auxtrace->free(session);
+}
+
+#else
+
+static inline struct auxtrace_record *
+auxtrace_record__init(struct perf_evlist *evlist __maybe_unused,
+                     int *err __maybe_unused)
+{
+       *err = 0;
+       return NULL;
+}
+
+static inline
+void auxtrace_record__free(struct auxtrace_record *itr __maybe_unused)
+{
+}
+
+static inline int
+perf_event__synthesize_auxtrace_info(struct auxtrace_record *itr __maybe_unused,
+                                    struct perf_tool *tool __maybe_unused,
+                                    struct perf_session *session __maybe_unused,
+                                    perf_event__handler_t process __maybe_unused)
+{
+       return -EINVAL;
+}
+
+static inline
+int auxtrace_record__options(struct auxtrace_record *itr __maybe_unused,
+                            struct perf_evlist *evlist __maybe_unused,
+                            struct record_opts *opts __maybe_unused)
+{
+       return 0;
+}
+
+#define perf_event__process_auxtrace_info              0
+#define perf_event__process_auxtrace                   0
+#define perf_event__process_auxtrace_error             0
+
+static inline
+void perf_session__auxtrace_error_inc(struct perf_session *session
+                                     __maybe_unused,
+                                     union perf_event *event
+                                     __maybe_unused)
+{
+}
+
+static inline
+void events_stats__auxtrace_error_warn(const struct events_stats *stats
+                                      __maybe_unused)
+{
+}
+
+static inline
+int itrace_parse_synth_opts(const struct option *opt __maybe_unused,
+                           const char *str __maybe_unused,
+                           int unset __maybe_unused)
+{
+       pr_err("AUX area tracing not supported\n");
+       return -EINVAL;
+}
+
+static inline
+int auxtrace_parse_snapshot_options(struct auxtrace_record *itr __maybe_unused,
+                                   struct record_opts *opts __maybe_unused,
+                                   const char *str)
+{
+       if (!str)
+               return 0;
+       pr_err("AUX area tracing not supported\n");
+       return -EINVAL;
+}
+
+static inline
+int auxtrace__process_event(struct perf_session *session __maybe_unused,
+                           union perf_event *event __maybe_unused,
+                           struct perf_sample *sample __maybe_unused,
+                           struct perf_tool *tool __maybe_unused)
+{
+       return 0;
+}
+
+static inline
+int auxtrace__flush_events(struct perf_session *session __maybe_unused,
+                          struct perf_tool *tool __maybe_unused)
+{
+       return 0;
+}
+
+static inline
+void auxtrace__free_events(struct perf_session *session __maybe_unused)
+{
+}
+
+static inline
+void auxtrace_cache__free(struct auxtrace_cache *auxtrace_cache __maybe_unused)
+{
+}
+
+static inline
+void auxtrace__free(struct perf_session *session __maybe_unused)
+{
+}
+
+static inline
+int auxtrace_index__write(int fd __maybe_unused,
+                         struct list_head *head __maybe_unused)
+{
+       return -EINVAL;
+}
+
+static inline
+int auxtrace_index__process(int fd __maybe_unused,
+                           u64 size __maybe_unused,
+                           struct perf_session *session __maybe_unused,
+                           bool needs_swap __maybe_unused)
+{
+       return -EINVAL;
+}
+
+static inline
+void auxtrace_index__free(struct list_head *head __maybe_unused)
+{
+}
+
+int auxtrace_mmap__mmap(struct auxtrace_mmap *mm,
+                       struct auxtrace_mmap_params *mp,
+                       void *userpg, int fd);
+void auxtrace_mmap__munmap(struct auxtrace_mmap *mm);
+void auxtrace_mmap_params__init(struct auxtrace_mmap_params *mp,
+                               off_t auxtrace_offset,
+                               unsigned int auxtrace_pages,
+                               bool auxtrace_overwrite);
+void auxtrace_mmap_params__set_idx(struct auxtrace_mmap_params *mp,
+                                  struct perf_evlist *evlist, int idx,
+                                  bool per_cpu);
+
+#endif
+
+#endif
diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c

index 61867dff5d5aa6dea4079d060514f1bd57a63630..1f6fc2323ef97d5e9fdea6f70a9028db1d14a37e 100644 (file)
--- a/tools/perf/util/build-id.c
+++ b/tools/perf/util/build-id.c
@@ -43,6 +43,7 @@ int build_id__mark_dso_hit(struct perf_tool *tool __maybe_unused,
         if (al.map != NULL)
                 al.map->dso->hit = 1;
  
+       thread__put(thread);
         return 0;
  }
  
@@ -59,8 +60,10 @@ static int perf_event__exit_del_thread(struct perf_tool *tool __maybe_unused,
         dump_printf("(%d:%d):(%d:%d)\n", event->fork.pid, event->fork.tid,
                     event->fork.ppid, event->fork.ptid);
  
-       if (thread)
+       if (thread) {
                 machine__remove_thread(machine, thread);
+               thread__put(thread);
+       }
  
         return 0;
  }
@@ -159,15 +162,20 @@ static int write_buildid(const char *name, size_t name_len, u8 *build_id,
         return write_padded(fd, name, name_len + 1, len);
  }
  
-static int __dsos__write_buildid_table(struct list_head *head,
-                                      struct machine *machine,
-                                      pid_t pid, u16 misc, int fd)
+static int machine__write_buildid_table(struct machine *machine, int fd)
  {
+       int err = 0;
         char nm[PATH_MAX];
         struct dso *pos;
+       u16 kmisc = PERF_RECORD_MISC_KERNEL,
+           umisc = PERF_RECORD_MISC_USER;
  
-       dsos__for_each_with_build_id(pos, head) {
-               int err;
+       if (!machine__is_host(machine)) {
+               kmisc = PERF_RECORD_MISC_GUEST_KERNEL;
+               umisc = PERF_RECORD_MISC_GUEST_USER;
+       }
+
+       dsos__for_each_with_build_id(pos, &machine->dsos.head) {
                 const char *name;
                 size_t name_len;
  
@@ -186,32 +194,12 @@ static int __dsos__write_buildid_table(struct list_head *head,
                         name_len = pos->long_name_len + 1;
                 }
  
-               err = write_buildid(name, name_len, pos->build_id,
-                                   pid, misc, fd);
+               err = write_buildid(name, name_len, pos->build_id, machine->pid,
+                                   pos->kernel ? kmisc : umisc, fd);
                 if (err)
-                       return err;
-       }
-
-       return 0;
-}
-
-static int machine__write_buildid_table(struct machine *machine, int fd)
-{
-       int err;
-       u16 kmisc = PERF_RECORD_MISC_KERNEL,
-           umisc = PERF_RECORD_MISC_USER;
-
-       if (!machine__is_host(machine)) {
-               kmisc = PERF_RECORD_MISC_GUEST_KERNEL;
-               umisc = PERF_RECORD_MISC_GUEST_USER;
+                       break;
         }
  
-       err = __dsos__write_buildid_table(&machine->kernel_dsos.head, machine,
-                                         machine->pid, kmisc, fd);
-       if (err == 0)
-               err = __dsos__write_buildid_table(&machine->user_dsos.head,
-                                                 machine, machine->pid, umisc,
-                                                 fd);
         return err;
  }
  
@@ -244,13 +232,7 @@ static int __dsos__hit_all(struct list_head *head)
  
  static int machine__hit_all_dsos(struct machine *machine)
  {
-       int err;
-
-       err = __dsos__hit_all(&machine->kernel_dsos.head);
-       if (err)
-               return err;
-
-       return __dsos__hit_all(&machine->user_dsos.head);
+       return __dsos__hit_all(&machine->dsos.head);
  }
  
  int dsos__hit_all(struct perf_session *session)
@@ -490,9 +472,7 @@ static int __dsos__cache_build_ids(struct list_head *head,
  
  static int machine__cache_build_ids(struct machine *machine)
  {
-       int ret = __dsos__cache_build_ids(&machine->kernel_dsos.head, machine);
-       ret |= __dsos__cache_build_ids(&machine->user_dsos.head, machine);
-       return ret;
+       return __dsos__cache_build_ids(&machine->dsos.head, machine);
  }
  
  int perf_session__cache_build_ids(struct perf_session *session)
@@ -517,11 +497,7 @@ int perf_session__cache_build_ids(struct perf_session *session)
  
  static bool machine__read_build_ids(struct machine *machine, bool with_hits)
  {
-       bool ret;
-
-       ret  = __dsos__read_build_ids(&machine->kernel_dsos.head, with_hits);
-       ret |= __dsos__read_build_ids(&machine->user_dsos.head, with_hits);
-       return ret;
+       return __dsos__read_build_ids(&machine->dsos.head, with_hits);
  }
  
  bool perf_session__read_build_ids(struct perf_session *session, bool with_hits)
diff --git a/tools/perf/util/cache.h b/tools/perf/util/cache.h

index fbcca21d66ab9b6887f084b19bdff98109201612..c861373aaed33dafd233a7bf5eebefa4dfd9874b 100644 (file)
--- a/tools/perf/util/cache.h
+++ b/tools/perf/util/cache.h
@@ -30,7 +30,6 @@ extern const char *perf_config_dirname(const char *, const char *);
  
  /* pager.c */
  extern void setup_pager(void);
-extern const char *pager_program;
  extern int pager_in_use(void);
  extern int pager_use_color;
  
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h

index 6033a0a212ca5c255434ae2cf34c7370d785d858..679c2c6d8ade7daeace3d55300049aca7fe68a0b 100644 (file)
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -72,6 +72,10 @@ extern struct callchain_param callchain_param;
  struct callchain_list {
         u64                     ip;
         struct map_symbol       ms;
+       struct /* for TUI */ {
+               bool            unfolded;
+               bool            has_children;
+       };
         char                   *srcline;
         struct list_head        list;
  };
diff --git a/tools/perf/util/cgroup.c b/tools/perf/util/cgroup.c

index 88f7be3994321f8ef717262909ac6c38853fc691..32e12ecfe9c576767f18a3cb42e6c5dedfc3f048 100644 (file)
--- a/tools/perf/util/cgroup.c
+++ b/tools/perf/util/cgroup.c
@@ -115,23 +115,19 @@ static int add_cgroup(struct perf_evlist *evlist, char *str)
                         goto found;
                 n++;
         }
-       if (cgrp->refcnt == 0)
+       if (atomic_read(&cgrp->refcnt) == 0)
                 free(cgrp);
  
         return -1;
  found:
-       cgrp->refcnt++;
+       atomic_inc(&cgrp->refcnt);
         counter->cgrp = cgrp;
         return 0;
  }
  
  void close_cgroup(struct cgroup_sel *cgrp)
  {
-       if (!cgrp)
-               return;
-
-       /* XXX: not reentrant */
-       if (--cgrp->refcnt == 0) {
+       if (cgrp && atomic_dec_and_test(&cgrp->refcnt)) {
                 close(cgrp->fd);
                 zfree(&cgrp->name);
                 free(cgrp);
diff --git a/tools/perf/util/cgroup.h b/tools/perf/util/cgroup.h

index 89acd6debdc5fc06d28889cb95eb30d7af98642f..b4b8cb42fe5e04b7458fe3b57f764b0bda6c18db 100644 (file)
--- a/tools/perf/util/cgroup.h
+++ b/tools/perf/util/cgroup.h
@@ -1,12 +1,14 @@
  #ifndef __CGROUP_H__
  #define __CGROUP_H__
  
+#include <linux/atomic.h>
+
  struct option;
  
  struct cgroup_sel {
         char *name;
         int fd;
-       int refcnt;
+       atomic_t refcnt;
  };
  
  
diff --git a/tools/perf/util/comm.c b/tools/perf/util/comm.c

index b2bb59df65e10c10cfe6dd1aacb495d4804a6a4f..21b7ff382c3f0dfb2e0bff43074adae3c0b10973 100644 (file)
--- a/tools/perf/util/comm.c
+++ b/tools/perf/util/comm.c
@@ -2,24 +2,27 @@
  #include "util.h"
  #include <stdlib.h>
  #include <stdio.h>
+#include <linux/atomic.h>
  
  struct comm_str {
         char *str;
         struct rb_node rb_node;
-       int ref;
+       atomic_t refcnt;
  };
  
  /* Should perhaps be moved to struct machine */
  static struct rb_root comm_str_root;
  
-static void comm_str__get(struct comm_str *cs)
+static struct comm_str *comm_str__get(struct comm_str *cs)
  {
-       cs->ref++;
+       if (cs)
+               atomic_inc(&cs->refcnt);
+       return cs;
  }
  
  static void comm_str__put(struct comm_str *cs)
  {
-       if (!--cs->ref) {
+       if (cs && atomic_dec_and_test(&cs->refcnt)) {
                 rb_erase(&cs->rb_node, &comm_str_root);
                 zfree(&cs->str);
                 free(cs);
@@ -40,6 +43,8 @@ static struct comm_str *comm_str__alloc(const char *str)
                 return NULL;
         }
  
+       atomic_set(&cs->refcnt, 0);
+
         return cs;
  }
  
diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c

index dd17c9a32fbcfcf3c55c8cc33a44349582f4bb89..5bfc1198ab465c1873c7a112eefaa97bf286ee6f 100644 (file)
--- a/tools/perf/util/data-convert-bt.c
+++ b/tools/perf/util/data-convert-bt.c
@@ -14,6 +14,7 @@
  #include <babeltrace/ctf-writer/event.h>
  #include <babeltrace/ctf-writer/event-types.h>
  #include <babeltrace/ctf-writer/event-fields.h>
+#include <babeltrace/ctf-ir/utils.h>
  #include <babeltrace/ctf/events.h>
  #include <traceevent/event-parse.h>
  #include "asm/bug.h"
@@ -38,12 +39,21 @@ struct evsel_priv {
         struct bt_ctf_event_class *event_class;
  };
  
+#define MAX_CPUS       4096
+
+struct ctf_stream {
+       struct bt_ctf_stream *stream;
+       int cpu;
+       u32 count;
+};
+
  struct ctf_writer {
         /* writer primitives */
-       struct bt_ctf_writer            *writer;
-       struct bt_ctf_stream            *stream;
-       struct bt_ctf_stream_class      *stream_class;
-       struct bt_ctf_clock             *clock;
+       struct bt_ctf_writer             *writer;
+       struct ctf_stream               **stream;
+       int                               stream_cnt;
+       struct bt_ctf_stream_class       *stream_class;
+       struct bt_ctf_clock              *clock;
  
         /* data types */
         union {
@@ -65,6 +75,9 @@ struct convert {
  
         u64                     events_size;
         u64                     events_count;
+
+       /* Ordered events configured queue size. */
+       u64                     queue_size;
  };
  
  static int value_set(struct bt_ctf_field_type *type,
@@ -153,6 +166,43 @@ get_tracepoint_field_type(struct ctf_writer *cw, struct format_field *field)
                 return cw->data.u32;
  }
  
+static unsigned long long adjust_signedness(unsigned long long value_int, int size)
+{
+       unsigned long long value_mask;
+
+       /*
+        * value_mask = (1 << (size * 8 - 1)) - 1.
+        * Directly set value_mask for code readers.
+        */
+       switch (size) {
+       case 1:
+               value_mask = 0x7fULL;
+               break;
+       case 2:
+               value_mask = 0x7fffULL;
+               break;
+       case 4:
+               value_mask = 0x7fffffffULL;
+               break;
+       case 8:
+               /*
+                * For a 64-bit value, return it as is. There is no need
+                * to fill the high bits.
+                */
+               /* Fall through */
+       default:
+               /* BUG! */
+               return value_int;
+       }
+
+       /* If it is a positive value, don't adjust. */
+       if ((value_int & (~0ULL - value_mask)) == 0)
+               return value_int;
+
+       /* Fill upper part of value_int with 1 to make it a negative long long. */
+       return (value_int & value_mask) | ~value_mask;
+}
+
  static int add_tracepoint_field_value(struct ctf_writer *cw,
                                       struct bt_ctf_event_class *event_class,
                                       struct bt_ctf_event *event,
@@ -164,7 +214,6 @@ static int add_tracepoint_field_value(struct ctf_writer *cw,
         struct bt_ctf_field *field;
         const char *name = fmtf->name;
         void *data = sample->raw_data;
-       unsigned long long value_int;
         unsigned long flags = fmtf->flags;
         unsigned int n_items;
         unsigned int i;
@@ -172,6 +221,7 @@ static int add_tracepoint_field_value(struct ctf_writer *cw,
         unsigned int len;
         int ret;
  
+       name = fmtf->alias;
         offset = fmtf->offset;
         len = fmtf->size;
         if (flags & FIELD_IS_STRING)
@@ -208,11 +258,6 @@ static int add_tracepoint_field_value(struct ctf_writer *cw,
         type = get_tracepoint_field_type(cw, fmtf);
  
         for (i = 0; i < n_items; i++) {
-               if (!(flags & FIELD_IS_STRING))
-                       value_int = pevent_read_number(
-                                       fmtf->event->pevent,
-                                       data + offset + i * len, len);
-
                 if (flags & FIELD_IS_ARRAY)
                         field = bt_ctf_field_array_get_field(array_field, i);
                 else
@@ -226,12 +271,21 @@ static int add_tracepoint_field_value(struct ctf_writer *cw,
                 if (flags & FIELD_IS_STRING)
                         ret = bt_ctf_field_string_set_value(field,
                                         data + offset + i * len);
-               else if (!(flags & FIELD_IS_SIGNED))
-                       ret = bt_ctf_field_unsigned_integer_set_value(
-                                       field, value_int);
-               else
-                       ret = bt_ctf_field_signed_integer_set_value(
-                                       field, value_int);
+               else {
+                       unsigned long long value_int;
+
+                       value_int = pevent_read_number(
+                                       fmtf->event->pevent,
+                                       data + offset + i * len, len);
+
+                       if (!(flags & FIELD_IS_SIGNED))
+                               ret = bt_ctf_field_unsigned_integer_set_value(
+                                               field, value_int);
+                       else
+                               ret = bt_ctf_field_signed_integer_set_value(
+                                               field, adjust_signedness(value_int, len));
+               }
+
                 if (ret) {
                         pr_err("failed to set file value %s\n", name);
                         goto err_put_field;
@@ -346,12 +400,6 @@ static int add_generic_values(struct ctf_writer *cw,
                         return -1;
         }
  
-       if (type & PERF_SAMPLE_CPU) {
-               ret = value_set_u32(cw, event, "perf_cpu", sample->cpu);
-               if (ret)
-                       return -1;
-       }
-
         if (type & PERF_SAMPLE_PERIOD) {
                 ret = value_set_u64(cw, event, "perf_period", sample->period);
                 if (ret)
@@ -381,6 +429,129 @@ static int add_generic_values(struct ctf_writer *cw,
         return 0;
  }
  
+static int ctf_stream__flush(struct ctf_stream *cs)
+{
+       int err = 0;
+
+       if (cs) {
+               err = bt_ctf_stream_flush(cs->stream);
+               if (err)
+                       pr_err("CTF stream %d flush failed\n", cs->cpu);
+
+               pr("Flush stream for cpu %d (%u samples)\n",
+                  cs->cpu, cs->count);
+
+               cs->count = 0;
+       }
+
+       return err;
+}
+
+static struct ctf_stream *ctf_stream__create(struct ctf_writer *cw, int cpu)
+{
+       struct ctf_stream *cs;
+       struct bt_ctf_field *pkt_ctx   = NULL;
+       struct bt_ctf_field *cpu_field = NULL;
+       struct bt_ctf_stream *stream   = NULL;
+       int ret;
+
+       cs = zalloc(sizeof(*cs));
+       if (!cs) {
+               pr_err("Failed to allocate ctf stream\n");
+               return NULL;
+       }
+
+       stream = bt_ctf_writer_create_stream(cw->writer, cw->stream_class);
+       if (!stream) {
+               pr_err("Failed to create CTF stream\n");
+               goto out;
+       }
+
+       pkt_ctx = bt_ctf_stream_get_packet_context(stream);
+       if (!pkt_ctx) {
+               pr_err("Failed to obtain packet context\n");
+               goto out;
+       }
+
+       cpu_field = bt_ctf_field_structure_get_field(pkt_ctx, "cpu_id");
+       bt_ctf_field_put(pkt_ctx);
+       if (!cpu_field) {
+               pr_err("Failed to obtain cpu field\n");
+               goto out;
+       }
+
+       ret = bt_ctf_field_unsigned_integer_set_value(cpu_field, (u32) cpu);
+       if (ret) {
+               pr_err("Failed to update CPU number\n");
+               goto out;
+       }
+
+       bt_ctf_field_put(cpu_field);
+
+       cs->cpu    = cpu;
+       cs->stream = stream;
+       return cs;
+
+out:
+       if (cpu_field)
+               bt_ctf_field_put(cpu_field);
+       if (stream)
+               bt_ctf_stream_put(stream);
+
+       free(cs);
+       return NULL;
+}
+
+static void ctf_stream__delete(struct ctf_stream *cs)
+{
+       if (cs) {
+               bt_ctf_stream_put(cs->stream);
+               free(cs);
+       }
+}
+
+static struct ctf_stream *ctf_stream(struct ctf_writer *cw, int cpu)
+{
+       struct ctf_stream *cs = cw->stream[cpu];
+
+       if (!cs) {
+               cs = ctf_stream__create(cw, cpu);
+               cw->stream[cpu] = cs;
+       }
+
+       return cs;
+}
+
+static int get_sample_cpu(struct ctf_writer *cw, struct perf_sample *sample,
+                         struct perf_evsel *evsel)
+{
+       int cpu = 0;
+
+       if (evsel->attr.sample_type & PERF_SAMPLE_CPU)
+               cpu = sample->cpu;
+       /* Result indexes cw->stream[]; presumably stream_cnt is its size. */
+       if (cpu < 0 || cpu >= cw->stream_cnt) {
+               pr_err("Event was recorded for CPU %d, limit is at %d.\n",
+                       cpu, cw->stream_cnt);
+               cpu = 0;
+       }
+
+       return cpu;
+}
+
+#define STREAM_FLUSH_COUNT 100000
+
+/*
+ * Currently we have no other way to determine the
+ * time for the stream flush other than keep track
+ * of the number of events and check it against
+ * threshold.
+ */
+static bool is_flush_needed(struct ctf_stream *cs)
+{
+       return cs->count >= STREAM_FLUSH_COUNT;
+}
+
  static int process_sample_event(struct perf_tool *tool,
                                 union perf_event *_event __maybe_unused,
                                 struct perf_sample *sample,
@@ -390,6 +561,7 @@ static int process_sample_event(struct perf_tool *tool,
         struct convert *c = container_of(tool, struct convert, tool);
         struct evsel_priv *priv = evsel->priv;
         struct ctf_writer *cw = &c->writer;
+       struct ctf_stream *cs;
         struct bt_ctf_event_class *event_class;
         struct bt_ctf_event *event;
         int ret;
@@ -424,9 +596,93 @@ static int process_sample_event(struct perf_tool *tool,
                         return -1;
         }
  
-       bt_ctf_stream_append_event(cw->stream, event);
+       cs = ctf_stream(cw, get_sample_cpu(cw, sample, evsel));
+       if (cs) {
+               if (is_flush_needed(cs))
+                       ctf_stream__flush(cs);
+
+               cs->count++;
+               bt_ctf_stream_append_event(cs->stream, event);
+       }
+
         bt_ctf_event_put(event);
-       return 0;
+       return cs ? 0 : -1;
+}
+
+/*
+ * Build a replacement field name in freshly malloc()ed memory.
+ *
+ *  dup < 0       : "_<name>"             ('_' prefix for a potential keyword)
+ *  0 <= dup < 10 : "<orig_name>_dupl_<dup>"
+ *  dup >= 10     : nothing is built, NULL is returned
+ *
+ * A NULL @name stands for @orig_name.  Whenever @name is not @orig_name it
+ * is free()d before returning, on success and on failure alike.
+ */
+static char *change_name(char *name, char *orig_name, int dup)
+{
+       char *result = NULL;
+       size_t size;
+
+       if (!name)
+               name = orig_name;
+
+       if (dup < 10) {
+               /*
+                * According to Mathieu Desnoyers
+                * (https://lkml.org/lkml/2015/1/23/652), further CTF spec
+                * updating may require us to use '$' instead of '_'.
+                */
+               if (dup < 0)
+                       size = strlen(name) + sizeof("_");
+               else
+                       size = strlen(orig_name) + sizeof("_dupl_X");
+
+               result = malloc(size);
+               if (result) {
+                       if (dup < 0)
+                               snprintf(result, size, "_%s", name);
+                       else
+                               snprintf(result, size, "%s_dupl_%d",
+                                        orig_name, dup);
+               }
+       }
+
+       if (name != orig_name)
+               free(name);
+       return result;
+}
+
+/*
+ * Add @field to @event_class under a name that passes
+ * bt_ctf_validate_identifier() and is not yet used in the class.
+ *
+ * If an alias was already assigned (field->alias != field->name) it is
+ * reused as is.  Otherwise a name rejected by the validator gets a '_'
+ * prefix, and a name already present in the class gets a "_dupl_N"
+ * suffix.  On success the name finally used is stored in field->alias.
+ *
+ * Returns 0 on success, -1 when no usable name could be built, or the
+ * non-zero result of bt_ctf_event_class_add_field().
+ */
+static int event_class_add_field(struct bt_ctf_event_class *event_class,
+               struct bt_ctf_field_type *type,
+               struct format_field *field)
+{
+       struct bt_ctf_field_type *t = NULL;
+       char *name;
+       int dup = 1;
+       int ret;
+
+       /* alias was already assigned */
+       if (field->alias != field->name)
+               return bt_ctf_event_class_add_field(event_class, type,
+                               (char *)field->alias);
+
+       name = field->name;
+
+       /* If 'name' is a keyword, add prefix. */
+       if (bt_ctf_validate_identifier(name))
+               name = change_name(name, field->name, -1);
+
+       if (!name) {
+               pr_err("Failed to fix invalid identifier.\n");
+               return -1;
+       }
+       while ((t = bt_ctf_event_class_get_field_by_name(event_class, name))) {
+               bt_ctf_field_type_put(t);
+               name = change_name(name, field->name, dup++);
+               if (!name) {
+                       pr_err("Failed to create dup name for '%s'\n", field->name);
+                       return -1;
+               }
+       }
+
+       ret = bt_ctf_event_class_add_field(event_class, type, name);
+       if (!ret) {
+               field->alias = name;
+       } else if (name != field->name) {
+               /* The generated name is not recorded anywhere on failure. */
+               free(name);
+       }
+
+       return ret;
  }
  
  static int add_tracepoint_fields_types(struct ctf_writer *cw,
@@ -457,14 +713,14 @@ static int add_tracepoint_fields_types(struct ctf_writer *cw,
                 if (flags & FIELD_IS_ARRAY)
                         type = bt_ctf_field_type_array_create(type, field->arraylen);
  
-               ret = bt_ctf_event_class_add_field(event_class, type,
-                               field->name);
+               ret = event_class_add_field(event_class, type, field);
  
                 if (flags & FIELD_IS_ARRAY)
                         bt_ctf_field_type_put(type);
  
                 if (ret) {
-                       pr_err("Failed to add field '%s\n", field->name);
+                       pr_err("Failed to add field '%s': %d\n",
+                                       field->name, ret);
                         return -1;
                 }
         }
@@ -508,7 +764,7 @@ static int add_generic_types(struct ctf_writer *cw, struct perf_evsel *evsel,
         do {                                                            \
                 pr2("  field '%s'\n", n);                               \
                 if (bt_ctf_event_class_add_field(cl, t, n)) {           \
-                       pr_err("Failed to add field '%s;\n", n);        \
+                       pr_err("Failed to add field '%s';\n", n);       \
                         return -1;                                      \
                 }                                                       \
         } while (0)
@@ -528,9 +784,6 @@ static int add_generic_types(struct ctf_writer *cw, struct perf_evsel *evsel,
         if (type & PERF_SAMPLE_STREAM_ID)
                 ADD_FIELD(event_class, cw->data.u64, "perf_stream_id");
  
-       if (type & PERF_SAMPLE_CPU)
-               ADD_FIELD(event_class, cw->data.u32, "perf_cpu");
-
         if (type & PERF_SAMPLE_PERIOD)
                 ADD_FIELD(event_class, cw->data.u64, "perf_period");
  
@@ -604,6 +857,39 @@ static int setup_events(struct ctf_writer *cw, struct perf_session *session)
         return 0;
  }
  
+/*
+ * Allocate the cw->stream[] table with zalloc() and record its size in
+ * cw->stream_cnt.  The slot count is the CPU count stored in the data
+ * file header, or MAX_CPUS when the header holds none.
+ *
+ * Returns 0 on success, -ENOMEM when the table cannot be allocated.
+ */
+static int setup_streams(struct ctf_writer *cw, struct perf_session *session)
+{
+       int ncpus = session->header.env.nr_cpus_avail;
+       struct ctf_stream **table;
+
+       /* No CPU count in the data file: fall back to MAX_CPUS. */
+       if (!ncpus)
+               ncpus = MAX_CPUS;
+
+       table = zalloc(sizeof(*table) * ncpus);
+       if (table == NULL) {
+               pr_err("Failed to allocate streams.\n");
+               return -ENOMEM;
+       }
+
+       cw->stream_cnt = ncpus;
+       cw->stream     = table;
+       return 0;
+}
+
+/*
+ * Delete every slot of cw->stream[] (empty slots are handled by
+ * ctf_stream__delete()) and then free the table itself.
+ */
+static void free_streams(struct ctf_writer *cw)
+{
+       int idx = 0;
+
+       while (idx < cw->stream_cnt) {
+               ctf_stream__delete(cw->stream[idx]);
+               idx++;
+       }
+
+       free(cw->stream);
+}
+
  static int ctf_writer__setup_env(struct ctf_writer *cw,
                                  struct perf_session *session)
  {
@@ -713,7 +999,7 @@ static void ctf_writer__cleanup(struct ctf_writer *cw)
         ctf_writer__cleanup_data(cw);
  
         bt_ctf_clock_put(cw->clock);
-       bt_ctf_stream_put(cw->stream);
+       free_streams(cw);
         bt_ctf_stream_class_put(cw->stream_class);
         bt_ctf_writer_put(cw->writer);
  
@@ -725,8 +1011,9 @@ static int ctf_writer__init(struct ctf_writer *cw, const char *path)
  {
         struct bt_ctf_writer            *writer;
         struct bt_ctf_stream_class      *stream_class;
-       struct bt_ctf_stream            *stream;
         struct bt_ctf_clock             *clock;
+       struct bt_ctf_field_type        *pkt_ctx_type;
+       int                             ret;
  
         /* CTF writer */
         writer = bt_ctf_writer_create(path);
@@ -767,14 +1054,15 @@ static int ctf_writer__init(struct ctf_writer *cw, const char *path)
         if (ctf_writer__init_data(cw))
                 goto err_cleanup;
  
-       /* CTF stream instance */
-       stream = bt_ctf_writer_create_stream(writer, stream_class);
-       if (!stream) {
-               pr("Failed to create CTF stream.\n");
+       /* Add cpu_id for packet context */
+       pkt_ctx_type = bt_ctf_stream_class_get_packet_context_type(stream_class);
+       if (!pkt_ctx_type)
                 goto err_cleanup;
-       }
  
-       cw->stream = stream;
+       ret = bt_ctf_field_type_structure_add_field(pkt_ctx_type, cw->data.u32, "cpu_id");
+       bt_ctf_field_type_put(pkt_ctx_type);
+       if (ret)
+               goto err_cleanup;
  
         /* CTF clock writer setup */
         if (bt_ctf_writer_add_clock(writer, clock)) {
@@ -791,6 +1079,28 @@ err:
         return -1;
  }
  
+/*
+ * Call ctf_stream__flush() on each slot of cw->stream[] in index order.
+ * Stops at the first non-zero result and returns it; returns 0 when
+ * every call returned 0.
+ */
+static int ctf_writer__flush_streams(struct ctf_writer *cw)
+{
+       int cpu;
+
+       for (cpu = 0; cpu < cw->stream_cnt; cpu++) {
+               int ret = ctf_stream__flush(cw->stream[cpu]);
+
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
+/*
+ * perf_config() callback: "convert.queue-size" is parsed with
+ * perf_config_u64() into the queue_size member of the struct convert
+ * passed as @cb; every other variable is forwarded to
+ * perf_default_config().
+ */
+static int convert__config(const char *var, const char *value, void *cb)
+{
+       struct convert *c = cb;
+
+       if (strcmp(var, "convert.queue-size") != 0)
+               return perf_default_config(var, value, cb);
+
+       c->queue_size = perf_config_u64(var, value);
+       return 0;
+}
+
  int bt_convert__perf2ctf(const char *input, const char *path, bool force)
  {
         struct perf_session *session;
@@ -817,6 +1127,8 @@ int bt_convert__perf2ctf(const char *input, const char *path, bool force)
         struct ctf_writer *cw = &c.writer;
         int err = -1;
  
+       perf_config(convert__config, &c);
+
         /* CTF writer */
         if (ctf_writer__init(cw, path))
                 return -1;
@@ -826,6 +1138,11 @@ int bt_convert__perf2ctf(const char *input, const char *path, bool force)
         if (!session)
                 goto free_writer;
  
+       if (c.queue_size) {
+               ordered_events__set_alloc_size(&session->ordered_events,
+                                              c.queue_size);
+       }
+
         /* CTF writer env/clock setup  */
         if (ctf_writer__setup_env(cw, session))
                 goto free_session;
@@ -834,9 +1151,14 @@ int bt_convert__perf2ctf(const char *input, const char *path, bool force)
         if (setup_events(cw, session))
                 goto free_session;
  
+       if (setup_streams(cw, session))
+               goto free_session;
+
         err = perf_session__process_events(session);
         if (!err)
-               err = bt_ctf_stream_flush(cw->stream);
+               err = ctf_writer__flush_streams(cw);
+       else
+               pr_err("Error during conversion.\n");
  
         fprintf(stderr,
                 "[ perf data convert: Converted '%s' into CTF data '%s' ]\n",
@@ -847,11 +1169,15 @@ int bt_convert__perf2ctf(const char *input, const char *path, bool force)
                 (double) c.events_size / 1024.0 / 1024.0,
                 c.events_count);
  
-       /* its all good */
-free_session:
         perf_session__delete(session);
+       ctf_writer__cleanup(cw);
+
+       return err;
  
+free_session:
+       perf_session__delete(session);
  free_writer:
         ctf_writer__cleanup(cw);
+       pr_err("Error during conversion setup.\n");
         return err;
  }
diff --git a/tools/perf/util/db-export.c b/tools/perf/util/db-export.c

index bb39a3ffc70b3951f88f9260f01836cc4e959fc9..1c9689e4cc179a3e931e1b3e0b427accb7553cba 100644 (file)
--- a/tools/perf/util/db-export.c
+++ b/tools/perf/util/db-export.c
@@ -122,6 +122,7 @@ int db_export__machine(struct db_export *dbe, struct machine *machine)
  int db_export__thread(struct db_export *dbe, struct thread *thread,
                       struct machine *machine, struct comm *comm)
  {
+       struct thread *main_thread;
         u64 main_thread_db_id = 0;
         int err;
  
@@ -131,8 +132,6 @@ int db_export__thread(struct db_export *dbe, struct thread *thread,
         thread->db_id = ++dbe->thread_last_db_id;
  
         if (thread->pid_ != -1) {
-               struct thread *main_thread;
-
                 if (thread->pid_ == thread->tid) {
                         main_thread = thread;
                 } else {
@@ -144,14 +143,16 @@ int db_export__thread(struct db_export *dbe, struct thread *thread,
                         err = db_export__thread(dbe, main_thread, machine,
                                                 comm);
                         if (err)
-                               return err;
+                               goto out_put;
                         if (comm) {
                                 err = db_export__comm_thread(dbe, comm, thread);
                                 if (err)
-                                       return err;
+                                       goto out_put;
                         }
                 }
                 main_thread_db_id = main_thread->db_id;
+               if (main_thread != thread)
+                       thread__put(main_thread);
         }
  
         if (dbe->export_thread)
@@ -159,6 +160,10 @@ int db_export__thread(struct db_export *dbe, struct thread *thread,
                                           machine);
  
         return 0;
+
+out_put:
+       thread__put(main_thread);
+       return err;
  }
  
  int db_export__comm(struct db_export *dbe, struct comm *comm,
@@ -229,7 +234,7 @@ int db_export__symbol(struct db_export *dbe, struct symbol *sym,
  static struct thread *get_main_thread(struct machine *machine, struct thread *thread)
  {
         if (thread->pid_ == thread->tid)
-               return thread;
+               return thread__get(thread);
  
         if (thread->pid_ == -1)
                 return NULL;
@@ -309,12 +314,12 @@ int db_export__sample(struct db_export *dbe, union perf_event *event,
  
         err = db_export__thread(dbe, thread, al->machine, comm);
         if (err)
-               return err;
+               goto out_put;
  
         if (comm) {
                 err = db_export__comm(dbe, comm, main_thread);
                 if (err)
-                       return err;
+                       goto out_put;
                 es.comm_db_id = comm->db_id;
         }
  
@@ -322,7 +327,7 @@ int db_export__sample(struct db_export *dbe, union perf_event *event,
  
         err = db_ids_from_al(dbe, al, &es.dso_db_id, &es.sym_db_id, &es.offset);
         if (err)
-               return err;
+               goto out_put;
  
         if ((evsel->attr.sample_type & PERF_SAMPLE_ADDR) &&
             sample_addr_correlates_sym(&evsel->attr)) {
@@ -332,20 +337,22 @@ int db_export__sample(struct db_export *dbe, union perf_event *event,
                 err = db_ids_from_al(dbe, &addr_al, &es.addr_dso_db_id,
                                      &es.addr_sym_db_id, &es.addr_offset);
                 if (err)
-                       return err;
+                       goto out_put;
                 if (dbe->crp) {
                         err = thread_stack__process(thread, comm, sample, al,
                                                     &addr_al, es.db_id,
                                                     dbe->crp);
                         if (err)
-                               return err;
+                               goto out_put;
                 }
         }
  
         if (dbe->export_sample)
-               return dbe->export_sample(dbe, &es);
+               err = dbe->export_sample(dbe, &es);
  
-       return 0;
+out_put:
+       thread__put(main_thread);
+       return err;
  }
  
  static struct {
diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c

index fc0ddd5792a97f884e7ed142ad12fe87c2bfe408..7c0c08386a1d9d6fb5e8cba6c838e5c3c1949bd7 100644 (file)
--- a/tools/perf/util/dso.c
+++ b/tools/perf/util/dso.c
@@ -4,6 +4,7 @@
  #include "symbol.h"
  #include "dso.h"
  #include "machine.h"
+#include "auxtrace.h"
  #include "util.h"
  #include "debug.h"
  
@@ -165,12 +166,28 @@ bool is_supported_compression(const char *ext)
         return false;
  }
  
-bool is_kernel_module(const char *pathname)
+bool is_kernel_module(const char *pathname, int cpumode)
  {
         struct kmod_path m;
-
-       if (kmod_path__parse(&m, pathname))
-               return NULL;
+       int mode = cpumode & PERF_RECORD_MISC_CPUMODE_MASK;
+
+       WARN_ONCE(mode != cpumode,
+                 "Internal error: passing unmasked cpumode (%x) to is_kernel_module",
+                 cpumode);
+
+       switch (mode) {
+       case PERF_RECORD_MISC_USER:
+       case PERF_RECORD_MISC_HYPERVISOR:
+       case PERF_RECORD_MISC_GUEST_USER:
+               return false;
+       /* Treat PERF_RECORD_MISC_CPUMODE_UNKNOWN as kernel */
+       default:
+               if (kmod_path__parse(&m, pathname)) {
+                       pr_err("Failed to check whether %s is a kernel module or not. Assume it is.",
+                                       pathname);
+                       return true;
+               }
+       }
  
         return m.kmod;
  }
@@ -214,12 +231,33 @@ int __kmod_path__parse(struct kmod_path *m, const char *path,
  {
         const char *name = strrchr(path, '/');
         const char *ext  = strrchr(path, '.');
+       bool is_simple_name = false;
  
         memset(m, 0x0, sizeof(*m));
         name = name ? name + 1 : path;
  
+       /*
+        * '.' is also a valid character for module name. For example:
+        * [aaa.bbb] is a valid module name. '[' should have higher
+        * priority than '.ko' suffix.
+        *
+        * The kernel names are from machine__mmap_name. Such
+        * name should belong to kernel itself, not kernel module.
+        */
+       if (name[0] == '[') {
+               is_simple_name = true;
+               if ((strncmp(name, "[kernel.kallsyms]", 17) == 0) ||
+                   (strncmp(name, "[guest.kernel.kallsyms", 22) == 0) ||
+                   (strncmp(name, "[vdso]", 6) == 0) ||
+                   (strncmp(name, "[vsyscall]", 10) == 0)) {
+                       m->kmod = false;
+
+               } else
+                       m->kmod = true;
+       }
+
         /* No extension, just return name. */
-       if (ext == NULL) {
+       if ((ext == NULL) || is_simple_name) {
                 if (alloc_name) {
                         m->name = strdup(name);
                         return m->name ? 0 : -ENOMEM;
@@ -264,6 +302,7 @@ int __kmod_path__parse(struct kmod_path *m, const char *path,
   */
  static LIST_HEAD(dso__data_open);
  static long dso__data_open_cnt;
+static pthread_mutex_t dso__data_open_lock = PTHREAD_MUTEX_INITIALIZER;
  
  static void dso__list_add(struct dso *dso)
  {
@@ -433,18 +472,12 @@ static void check_data_close(void)
   */
  void dso__data_close(struct dso *dso)
  {
+       pthread_mutex_lock(&dso__data_open_lock);
         close_dso(dso);
+       pthread_mutex_unlock(&dso__data_open_lock);
  }
  
-/**
- * dso__data_fd - Get dso's data file descriptor
- * @dso: dso object
- * @machine: machine object
- *
- * External interface to find dso's file, open it and
- * returns file descriptor.
- */
-int dso__data_fd(struct dso *dso, struct machine *machine)
+static void try_to_open_dso(struct dso *dso, struct machine *machine)
  {
         enum dso_binary_type binary_type_data[] = {
                 DSO_BINARY_TYPE__BUILD_ID_CACHE,
@@ -453,11 +486,8 @@ int dso__data_fd(struct dso *dso, struct machine *machine)
         };
         int i = 0;
  
-       if (dso->data.status == DSO_DATA_STATUS_ERROR)
-               return -1;
-
         if (dso->data.fd >= 0)
-               goto out;
+               return;
  
         if (dso->binary_type != DSO_BINARY_TYPE__NOT_FOUND) {
                 dso->data.fd = open_dso(dso, machine);
@@ -477,10 +507,38 @@ out:
                 dso->data.status = DSO_DATA_STATUS_OK;
         else
                 dso->data.status = DSO_DATA_STATUS_ERROR;
+}
+
+/**
+ * dso__data_get_fd - Get dso's data file descriptor
+ * @dso: dso object
+ * @machine: machine object
+ *
+ * External interface to find dso's file, open it and
+ * returns file descriptor.  It should be paired with
+ * dso__data_put_fd() if it returns non-negative value.
+ *
+ * When a non-negative descriptor is returned, dso__data_open_lock is
+ * still held; dso__data_put_fd() is what releases it.  On a negative
+ * return the lock is not held.
+ */
+int dso__data_get_fd(struct dso *dso, struct machine *machine)
+{
+       if (dso->data.status == DSO_DATA_STATUS_ERROR)
+               return -1;
+
+       /* pthread_mutex_lock() returns 0 or a positive error number. */
+       if (pthread_mutex_lock(&dso__data_open_lock) != 0)
+               return -1;
+
+       try_to_open_dso(dso, machine);
+
+       if (dso->data.fd < 0)
+               pthread_mutex_unlock(&dso__data_open_lock);
  
         return dso->data.fd;
  }
  
+void dso__data_put_fd(struct dso *dso __maybe_unused)
+{
+       pthread_mutex_unlock(&dso__data_open_lock); /* left held by dso__data_get_fd() */
+}
+
  bool dso__data_status_seen(struct dso *dso, enum dso_data_status_seen by)
  {
         u32 flag = 1 << by;
@@ -494,10 +552,12 @@ bool dso__data_status_seen(struct dso *dso, enum dso_data_status_seen by)
  }
  
  static void
-dso_cache__free(struct rb_root *root)
+dso_cache__free(struct dso *dso)
  {
+       struct rb_root *root = &dso->data.cache;
         struct rb_node *next = rb_first(root);
  
+       pthread_mutex_lock(&dso->lock);
         while (next) {
                 struct dso_cache *cache;
  
@@ -506,10 +566,12 @@ dso_cache__free(struct rb_root *root)
                 rb_erase(&cache->rb_node, root);
                 free(cache);
         }
+       pthread_mutex_unlock(&dso->lock);
  }
  
-static struct dso_cache *dso_cache__find(const struct rb_root *root, u64 offset)
+static struct dso_cache *dso_cache__find(struct dso *dso, u64 offset)
  {
+       const struct rb_root *root = &dso->data.cache;
         struct rb_node * const *p = &root->rb_node;
         const struct rb_node *parent = NULL;
         struct dso_cache *cache;
@@ -528,17 +590,20 @@ static struct dso_cache *dso_cache__find(const struct rb_root *root, u64 offset)
                 else
                         return cache;
         }
+
         return NULL;
  }
  
-static void
-dso_cache__insert(struct rb_root *root, struct dso_cache *new)
+static struct dso_cache *
+dso_cache__insert(struct dso *dso, struct dso_cache *new)
  {
+       struct rb_root *root = &dso->data.cache;
         struct rb_node **p = &root->rb_node;
         struct rb_node *parent = NULL;
         struct dso_cache *cache;
         u64 offset = new->offset;
  
+       pthread_mutex_lock(&dso->lock);
         while (*p != NULL) {
                 u64 end;
  
@@ -550,10 +615,17 @@ dso_cache__insert(struct rb_root *root, struct dso_cache *new)
                         p = &(*p)->rb_left;
                 else if (offset >= end)
                         p = &(*p)->rb_right;
+               else
+                       goto out;
         }
  
         rb_link_node(&new->rb_node, parent, p);
         rb_insert_color(&new->rb_node, root);
+
+       cache = NULL;
+out:
+       pthread_mutex_unlock(&dso->lock);
+       return cache;
  }
  
  static ssize_t
@@ -568,19 +640,33 @@ dso_cache__memcpy(struct dso_cache *cache, u64 offset,
  }
  
  static ssize_t
-dso_cache__read(struct dso *dso, u64 offset, u8 *data, ssize_t size)
+dso_cache__read(struct dso *dso, struct machine *machine,
+               u64 offset, u8 *data, ssize_t size)
  {
         struct dso_cache *cache;
+       struct dso_cache *old;
         ssize_t ret;
  
         do {
                 u64 cache_offset;
  
-               ret = -ENOMEM;
-
                 cache = zalloc(sizeof(*cache) + DSO__DATA_CACHE_SIZE);
                 if (!cache)
+                       return -ENOMEM;
+
+               pthread_mutex_lock(&dso__data_open_lock);
+
+               /*
+                * dso->data.fd might be closed if other thread opened another
+                * file (dso) due to open file limit (RLIMIT_NOFILE).
+                */
+               try_to_open_dso(dso, machine);
+
+               if (dso->data.fd < 0) {
+                       ret = -errno;
+                       dso->data.status = DSO_DATA_STATUS_ERROR;
                         break;
+               }
  
                 cache_offset = offset & DSO__DATA_CACHE_MASK;
  
@@ -590,11 +676,20 @@ dso_cache__read(struct dso *dso, u64 offset, u8 *data, ssize_t size)
  
                 cache->offset = cache_offset;
                 cache->size   = ret;
-               dso_cache__insert(&dso->data.cache, cache);
+       } while (0);
  
-               ret = dso_cache__memcpy(cache, offset, data, size);
+       pthread_mutex_unlock(&dso__data_open_lock);
  
-       } while (0);
+       if (ret > 0) {
+               old = dso_cache__insert(dso, cache);
+               if (old) {
+                       /* we lose the race */
+                       free(cache);
+                       cache = old;
+               }
+
+               ret = dso_cache__memcpy(cache, offset, data, size);
+       }
  
         if (ret <= 0)
                 free(cache);
@@ -602,16 +697,16 @@ dso_cache__read(struct dso *dso, u64 offset, u8 *data, ssize_t size)
         return ret;
  }
  
-static ssize_t dso_cache_read(struct dso *dso, u64 offset,
-                             u8 *data, ssize_t size)
+static ssize_t dso_cache_read(struct dso *dso, struct machine *machine,
+                             u64 offset, u8 *data, ssize_t size)
  {
         struct dso_cache *cache;
  
-       cache = dso_cache__find(&dso->data.cache, offset);
+       cache = dso_cache__find(dso, offset);
         if (cache)
                 return dso_cache__memcpy(cache, offset, data, size);
         else
-               return dso_cache__read(dso, offset, data, size);
+               return dso_cache__read(dso, machine, offset, data, size);
  }
  
  /*
@@ -619,7 +714,8 @@ static ssize_t dso_cache_read(struct dso *dso, u64 offset,
   * in the rb_tree. Any read to already cached data is served
   * by cached data.
   */
-static ssize_t cached_read(struct dso *dso, u64 offset, u8 *data, ssize_t size)
+static ssize_t cached_read(struct dso *dso, struct machine *machine,
+                          u64 offset, u8 *data, ssize_t size)
  {
         ssize_t r = 0;
         u8 *p = data;
@@ -627,7 +723,7 @@ static ssize_t cached_read(struct dso *dso, u64 offset, u8 *data, ssize_t size)
         do {
                 ssize_t ret;
  
-               ret = dso_cache_read(dso, offset, p, size);
+               ret = dso_cache_read(dso, machine, offset, p, size);
                 if (ret < 0)
                         return ret;
  
@@ -647,21 +743,44 @@ static ssize_t cached_read(struct dso *dso, u64 offset, u8 *data, ssize_t size)
         return r;
  }
  
-static int data_file_size(struct dso *dso)
+static int data_file_size(struct dso *dso, struct machine *machine)
  {
+       int ret = 0;
         struct stat st;
         char sbuf[STRERR_BUFSIZE];
  
-       if (!dso->data.file_size) {
-               if (fstat(dso->data.fd, &st)) {
-                       pr_err("dso mmap failed, fstat: %s\n",
-                               strerror_r(errno, sbuf, sizeof(sbuf)));
-                       return -1;
-               }
-               dso->data.file_size = st.st_size;
+       if (dso->data.file_size)
+               return 0;
+
+       if (dso->data.status == DSO_DATA_STATUS_ERROR)
+               return -1;
+
+       pthread_mutex_lock(&dso__data_open_lock);
+
+       /*
+        * dso->data.fd might be closed if other thread opened another
+        * file (dso) due to open file limit (RLIMIT_NOFILE).
+        */
+       try_to_open_dso(dso, machine);
+
+       if (dso->data.fd < 0) {
+               ret = -errno;
+               dso->data.status = DSO_DATA_STATUS_ERROR;
+               goto out;
         }
  
-       return 0;
+       if (fstat(dso->data.fd, &st) < 0) {
+               ret = -errno;
+               pr_err("dso cache fstat failed: %s\n",
+                      strerror_r(errno, sbuf, sizeof(sbuf)));
+               dso->data.status = DSO_DATA_STATUS_ERROR;
+               goto out;
+       }
+       dso->data.file_size = st.st_size;
+
+out:
+       pthread_mutex_unlock(&dso__data_open_lock);
+       return ret;
  }
  
  /**
@@ -673,23 +792,17 @@ static int data_file_size(struct dso *dso)
   */
  off_t dso__data_size(struct dso *dso, struct machine *machine)
  {
-       int fd;
-
-       fd = dso__data_fd(dso, machine);
-       if (fd < 0)
-               return fd;
-
-       if (data_file_size(dso))
+       if (data_file_size(dso, machine))
                 return -1;
  
         /* For now just estimate dso data size is close to file size */
         return dso->data.file_size;
  }
  
-static ssize_t data_read_offset(struct dso *dso, u64 offset,
-                               u8 *data, ssize_t size)
+static ssize_t data_read_offset(struct dso *dso, struct machine *machine,
+                               u64 offset, u8 *data, ssize_t size)
  {
-       if (data_file_size(dso))
+       if (data_file_size(dso, machine))
                 return -1;
  
         /* Check the offset sanity. */
@@ -699,7 +812,7 @@ static ssize_t data_read_offset(struct dso *dso, u64 offset,
         if (offset + size < offset)
                 return -1;
  
-       return cached_read(dso, offset, data, size);
+       return cached_read(dso, machine, offset, data, size);
  }
  
  /**
@@ -716,10 +829,10 @@ static ssize_t data_read_offset(struct dso *dso, u64 offset,
  ssize_t dso__data_read_offset(struct dso *dso, struct machine *machine,
                               u64 offset, u8 *data, ssize_t size)
  {
-       if (dso__data_fd(dso, machine) < 0)
+       if (dso->data.status == DSO_DATA_STATUS_ERROR)
                 return -1;
  
-       return data_read_offset(dso, offset, data, size);
+       return data_read_offset(dso, machine, offset, data, size);
  }
  
  /**
@@ -751,13 +864,13 @@ struct map *dso__new_map(const char *name)
         return map;
  }
  
-struct dso *dso__kernel_findnew(struct machine *machine, const char *name,
-                   const char *short_name, int dso_type)
+struct dso *machine__findnew_kernel(struct machine *machine, const char *name,
+                                   const char *short_name, int dso_type)
  {
         /*
          * The kernel dso could be created by build_id processing.
          */
-       struct dso *dso = __dsos__findnew(&machine->kernel_dsos, name);
+       struct dso *dso = machine__findnew_dso(machine, name);
  
         /*
          * We need to run this in all cases, since during the build_id
@@ -776,8 +889,8 @@ struct dso *dso__kernel_findnew(struct machine *machine, const char *name,
   * Either one of the dso or name parameter must be non-NULL or the
   * function will not work.
   */
-static struct dso *dso__findlink_by_longname(struct rb_root *root,
-                                            struct dso *dso, const char *name)
+static struct dso *__dso__findlink_by_longname(struct rb_root *root,
+                                              struct dso *dso, const char *name)
  {
         struct rb_node **p = &root->rb_node;
         struct rb_node  *parent = NULL;
@@ -824,10 +937,10 @@ static struct dso *dso__findlink_by_longname(struct rb_root *root,
         return NULL;
  }
  
-static inline struct dso *
-dso__find_by_longname(const struct rb_root *root, const char *name)
+static inline struct dso *__dso__find_by_longname(struct rb_root *root,
+                                                 const char *name)
  {
-       return dso__findlink_by_longname((struct rb_root *)root, NULL, name);
+       return __dso__findlink_by_longname(root, NULL, name);
  }
  
  void dso__set_long_name(struct dso *dso, const char *name, bool name_allocated)
@@ -935,6 +1048,8 @@ struct dso *dso__new(const char *name)
                 RB_CLEAR_NODE(&dso->rb_node);
                 INIT_LIST_HEAD(&dso->node);
                 INIT_LIST_HEAD(&dso->data.open_entry);
+               pthread_mutex_init(&dso->lock, NULL);
+               atomic_set(&dso->refcnt, 1);
         }
  
         return dso;
@@ -961,12 +1076,27 @@ void dso__delete(struct dso *dso)
         }
  
         dso__data_close(dso);
-       dso_cache__free(&dso->data.cache);
+       auxtrace_cache__free(dso->auxtrace_cache);
+       dso_cache__free(dso);
         dso__free_a2l(dso);
         zfree(&dso->symsrc_filename);
+       pthread_mutex_destroy(&dso->lock);
         free(dso);
  }
  
+/*
+ * Take a reference on @dso by incrementing dso->refcnt and return @dso.
+ * A NULL @dso is passed through untouched.
+ */
+struct dso *dso__get(struct dso *dso)
+{
+       if (dso == NULL)
+               return NULL;
+
+       atomic_inc(&dso->refcnt);
+       return dso;
+}
+
+/*
+ * Drop a reference on @dso; when atomic_dec_and_test() reports that
+ * dso->refcnt reached zero the dso is destroyed with dso__delete().
+ * A NULL @dso is ignored.
+ */
+void dso__put(struct dso *dso)
+{
+       if (!dso)
+               return;
+
+       if (atomic_dec_and_test(&dso->refcnt))
+               dso__delete(dso);
+}
+
  void dso__set_build_id(struct dso *dso, void *build_id)
  {
         memcpy(dso->build_id, build_id, sizeof(dso->build_id));
@@ -1033,14 +1163,41 @@ bool __dsos__read_build_ids(struct list_head *head, bool with_hits)
         return have_build_id;
  }
  
-void dsos__add(struct dsos *dsos, struct dso *dso)
+void __dsos__add(struct dsos *dsos, struct dso *dso)
  {
         list_add_tail(&dso->node, &dsos->head);
-       dso__findlink_by_longname(&dsos->root, dso, NULL);
+       __dso__findlink_by_longname(&dsos->root, dso, NULL);
+       /*
+        * It is now in the linked list, grab a reference, then garbage collect
+        * this when needing memory, by looking at LRU dso instances in the
+        * list with atomic_read(&dso->refcnt) == 1, i.e. no references
+        * anywhere besides the one for the list, do, under a lock for the
+        * list: remove it from the list, then a dso__put(), that probably will
+        * be the last and will then call dso__delete(), end of life.
+        *
+        * That, or at the end of the 'struct machine' lifetime, when all
+        * 'struct dso' instances will be removed from the list, in
+        * dsos__exit(), if they have no other reference from some other data
+        * structure.
+        *
+        * E.g.: after processing a 'perf.data' file and storing references
+        * to objects instantiated while processing events, we will have
+        * references to the 'thread', 'map', 'dso' structs all from 'struct
+        * hist_entry' instances, but we may not need anything not referenced,
+        * so we might as well call machines__exit()/machines__delete() and
+        * garbage collect it.
+        */
+       dso__get(dso);
+}
+
+void dsos__add(struct dsos *dsos, struct dso *dso)
+{
+       pthread_rwlock_wrlock(&dsos->lock);
+       __dsos__add(dsos, dso);
+       pthread_rwlock_unlock(&dsos->lock);
  }
  
-struct dso *dsos__find(const struct dsos *dsos, const char *name,
-                      bool cmp_short)
+struct dso *__dsos__find(struct dsos *dsos, const char *name, bool cmp_short)
  {
         struct dso *pos;
  
@@ -1050,15 +1207,24 @@ struct dso *dsos__find(const struct dsos *dsos, const char *name,
                                 return pos;
                 return NULL;
         }
-       return dso__find_by_longname(&dsos->root, name);
+       return __dso__find_by_longname(&dsos->root, name);
+}
+
+struct dso *dsos__find(struct dsos *dsos, const char *name, bool cmp_short)
+{
+       struct dso *dso;
+       pthread_rwlock_rdlock(&dsos->lock);
+       dso = __dsos__find(dsos, name, cmp_short);
+       pthread_rwlock_unlock(&dsos->lock);
+       return dso;
  }
  
-struct dso *dsos__addnew(struct dsos *dsos, const char *name)
+struct dso *__dsos__addnew(struct dsos *dsos, const char *name)
  {
         struct dso *dso = dso__new(name);
  
         if (dso != NULL) {
-               dsos__add(dsos, dso);
+               __dsos__add(dsos, dso);
                 dso__set_basename(dso);
         }
         return dso;
@@ -1066,9 +1232,18 @@ struct dso *dsos__addnew(struct dsos *dsos, const char *name)
  
  struct dso *__dsos__findnew(struct dsos *dsos, const char *name)
  {
-       struct dso *dso = dsos__find(dsos, name, false);
+       struct dso *dso = __dsos__find(dsos, name, false);
  
-       return dso ? dso : dsos__addnew(dsos, name);
+       return dso ? dso : __dsos__addnew(dsos, name);
+}
+
+struct dso *dsos__findnew(struct dsos *dsos, const char *name)
+{
+       struct dso *dso;
+       pthread_rwlock_wrlock(&dsos->lock);
+       dso = dso__get(__dsos__findnew(dsos, name));
+       pthread_rwlock_unlock(&dsos->lock);
+       return dso;
  }
  
  size_t __dsos__fprintf_buildid(struct list_head *head, FILE *fp,
@@ -1130,12 +1305,15 @@ size_t dso__fprintf(struct dso *dso, enum map_type type, FILE *fp)
  enum dso_type dso__type(struct dso *dso, struct machine *machine)
  {
         int fd;
+       enum dso_type type = DSO__TYPE_UNKNOWN;
  
-       fd = dso__data_fd(dso, machine);
-       if (fd < 0)
-               return DSO__TYPE_UNKNOWN;
+       fd = dso__data_get_fd(dso, machine);
+       if (fd >= 0) {
+               type = dso__type_fd(fd);
+               dso__data_put_fd(dso);
+       }
  
-       return dso__type_fd(fd);
+       return type;
  }
  
  int dso__strerror_load(struct dso *dso, char *buf, size_t buflen)
diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h

index e0901b4ed8de0d08b1c6ad93f20ca371b9adef5f..2fe98bb0e95b0d4b88fb58f2fc28daee875a13bb 100644 (file)
--- a/tools/perf/util/dso.h
+++ b/tools/perf/util/dso.h
@@ -1,9 +1,11 @@
  #ifndef __PERF_DSO
  #define __PERF_DSO
  
+#include <linux/atomic.h>
  #include <linux/types.h>
  #include <linux/rbtree.h>
  #include <stdbool.h>
+#include <pthread.h>
  #include <linux/types.h>
  #include <linux/bitops.h>
  #include "map.h"
@@ -124,9 +126,13 @@ struct dso_cache {
  struct dsos {
         struct list_head head;
         struct rb_root   root;  /* rbtree root sorted by long name */
+       pthread_rwlock_t lock;
  };
  
+struct auxtrace_cache;
+
  struct dso {
+       pthread_mutex_t  lock;
         struct list_head node;
         struct rb_node   rb_node;       /* rbtree node sorted by long name */
         struct rb_root   symbols[MAP__NR_TYPES];
@@ -156,6 +162,7 @@ struct dso {
         u16              long_name_len;
         u16              short_name_len;
         void            *dwfl;                  /* DWARF debug info */
+       struct auxtrace_cache *auxtrace_cache;
  
         /* dso data file */
         struct {
@@ -173,7 +180,7 @@ struct dso {
                 void     *priv;
                 u64      db_id;
         };
-
+       atomic_t         refcnt;
         char             name[0];
  };
  
@@ -200,6 +207,17 @@ void dso__set_long_name(struct dso *dso, const char *name, bool name_allocated);
  
  int dso__name_len(const struct dso *dso);
  
+struct dso *dso__get(struct dso *dso);
+void dso__put(struct dso *dso);
+
+static inline void __dso__zput(struct dso **dso)
+{
+       dso__put(*dso);
+       *dso = NULL;
+}
+
+#define dso__zput(dso) __dso__zput(&dso)
+
  bool dso__loaded(const struct dso *dso, enum map_type type);
  
  bool dso__sorted_by_name(const struct dso *dso, enum map_type type);
@@ -216,7 +234,7 @@ char dso__symtab_origin(const struct dso *dso);
  int dso__read_binary_type_filename(const struct dso *dso, enum dso_binary_type type,
                                    char *root_dir, char *filename, size_t size);
  bool is_supported_compression(const char *ext);
-bool is_kernel_module(const char *pathname);
+bool is_kernel_module(const char *pathname, int cpumode);
  bool decompress_to_file(const char *ext, const char *filename, int output_fd);
  bool dso__needs_decompress(struct dso *dso);
  
@@ -236,7 +254,8 @@ int __kmod_path__parse(struct kmod_path *m, const char *path,
  
  /*
   * The dso__data_* external interface provides following functions:
- *   dso__data_fd
+ *   dso__data_get_fd
+ *   dso__data_put_fd
   *   dso__data_close
   *   dso__data_size
   *   dso__data_read_offset
@@ -253,8 +272,11 @@ int __kmod_path__parse(struct kmod_path *m, const char *path,
   * The current usage of the dso__data_* interface is as follows:
   *
   * Get DSO's fd:
- *   int fd = dso__data_fd(dso, machine);
- *   USE 'fd' SOMEHOW
+ *   int fd = dso__data_get_fd(dso, machine);
+ *   if (fd >= 0) {
+ *       USE 'fd' SOMEHOW
+ *       dso__data_put_fd(dso);
+ *   }
   *
   * Read DSO's data:
   *   n = dso__data_read_offset(dso_0, &machine, 0, buf, BUFSIZE);
@@ -273,7 +295,8 @@ int __kmod_path__parse(struct kmod_path *m, const char *path,
   *
   * TODO
  */
-int dso__data_fd(struct dso *dso, struct machine *machine);
+int dso__data_get_fd(struct dso *dso, struct machine *machine);
+void dso__data_put_fd(struct dso *dso __maybe_unused);
  void dso__data_close(struct dso *dso);
  
  off_t dso__data_size(struct dso *dso, struct machine *machine);
@@ -285,14 +308,16 @@ ssize_t dso__data_read_addr(struct dso *dso, struct map *map,
  bool dso__data_status_seen(struct dso *dso, enum dso_data_status_seen by);
  
  struct map *dso__new_map(const char *name);
-struct dso *dso__kernel_findnew(struct machine *machine, const char *name,
-                               const char *short_name, int dso_type);
+struct dso *machine__findnew_kernel(struct machine *machine, const char *name,
+                                   const char *short_name, int dso_type);
  
+void __dsos__add(struct dsos *dsos, struct dso *dso);
  void dsos__add(struct dsos *dsos, struct dso *dso);
-struct dso *dsos__addnew(struct dsos *dsos, const char *name);
-struct dso *dsos__find(const struct dsos *dsos, const char *name,
-                      bool cmp_short);
+struct dso *__dsos__addnew(struct dsos *dsos, const char *name);
+struct dso *__dsos__find(struct dsos *dsos, const char *name, bool cmp_short);
+struct dso *dsos__find(struct dsos *dsos, const char *name, bool cmp_short);
  struct dso *__dsos__findnew(struct dsos *dsos, const char *name);
+struct dso *dsos__findnew(struct dsos *dsos, const char *name);
  bool __dsos__read_build_ids(struct list_head *head, bool with_hits);
  
  size_t __dsos__fprintf_buildid(struct list_head *head, FILE *fp,
diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c

index c34e024020c7c58602a9459f407283faea884b20..57f3ef41c2bc3e6261c03f54dc7f99791a003108 100644 (file)
--- a/tools/perf/util/dwarf-aux.c
+++ b/tools/perf/util/dwarf-aux.c
@@ -139,10 +139,26 @@ int cu_walk_functions_at(Dwarf_Die *cu_die, Dwarf_Addr addr,
  bool die_compare_name(Dwarf_Die *dw_die, const char *tname)
  {
         const char *name;
+
         name = dwarf_diename(dw_die);
         return name ? (strcmp(tname, name) == 0) : false;
  }
  
+/**
+ * die_match_name - Match diename and glob
+ * @dw_die: a DIE
+ * @glob: a string of target glob pattern
+ *
+ * Glob-match the name of @dw_die against @glob. Return false if matching fails.
+ */
+bool die_match_name(Dwarf_Die *dw_die, const char *glob)
+{
+       const char *name;
+
+       name = dwarf_diename(dw_die);
+       return name ? strglobmatch(name, glob) : false;
+}
+
  /**
   * die_get_call_lineno - Get callsite line number of inline-function instance
   * @in_die: a DIE of an inlined function instance
@@ -417,6 +433,43 @@ struct __addr_die_search_param {
         Dwarf_Die       *die_mem;
  };
  
+static int __die_search_func_tail_cb(Dwarf_Die *fn_die, void *data)
+{
+       struct __addr_die_search_param *ad = data;
+       Dwarf_Addr addr = 0;
+
+       if (dwarf_tag(fn_die) == DW_TAG_subprogram &&
+           !dwarf_highpc(fn_die, &addr) &&
+           addr == ad->addr) {
+               memcpy(ad->die_mem, fn_die, sizeof(Dwarf_Die));
+               return DWARF_CB_ABORT;
+       }
+       return DWARF_CB_OK;
+}
+
+/**
+ * die_find_tailfunc - Search for a non-inlined function with tail call at
+ * given address
+ * @cu_die: a CU DIE which includes @addr
+ * @addr: target address
+ * @die_mem: a buffer for result DIE
+ *
+ * Search for a non-inlined function DIE with tail call at @addr. Stores the
+ * DIE to @die_mem and returns it if found. Returns NULL if failed.
+ */
+Dwarf_Die *die_find_tailfunc(Dwarf_Die *cu_die, Dwarf_Addr addr,
+                                   Dwarf_Die *die_mem)
+{
+       struct __addr_die_search_param ad;
+       ad.addr = addr;
+       ad.die_mem = die_mem;
+       /* dwarf_getscopes can't find subprogram. */
+       if (!dwarf_getfuncs(cu_die, __die_search_func_tail_cb, &ad, 0))
+               return NULL;
+       else
+               return die_mem;
+}
+
  /* die_find callback for non-inlined function search */
  static int __die_search_func_cb(Dwarf_Die *fn_die, void *data)
  {
@@ -832,19 +885,17 @@ Dwarf_Die *die_find_member(Dwarf_Die *st_die, const char *name,
  /**
   * die_get_typename - Get the name of given variable DIE
   * @vr_die: a variable DIE
- * @buf: a buffer for result type name
- * @len: a max-length of @buf
+ * @buf: a strbuf for result type name
   *
- * Get the name of @vr_die and stores it to @buf. Return the actual length
- * of type name if succeeded. Return -E2BIG if @len is not enough long, and
- * Return -ENOENT if failed to find type name.
+ * Get the name of @vr_die and store it to @buf. Return 0 if succeeded,
+ * and return -ENOENT if failed to find type name.
   * Note that the result will stores typedef name if possible, and stores
   * "*(function_type)" if the type is a function pointer.
   */
-int die_get_typename(Dwarf_Die *vr_die, char *buf, int len)
+int die_get_typename(Dwarf_Die *vr_die, struct strbuf *buf)
  {
         Dwarf_Die type;
-       int tag, ret, ret2;
+       int tag, ret;
         const char *tmp = "";
  
         if (__die_get_real_type(vr_die, &type) == NULL)
@@ -855,8 +906,8 @@ int die_get_typename(Dwarf_Die *vr_die, char *buf, int len)
                 tmp = "*";
         else if (tag == DW_TAG_subroutine_type) {
                 /* Function pointer */
-               ret = snprintf(buf, len, "(function_type)");
-               return (ret >= len) ? -E2BIG : ret;
+               strbuf_addf(buf, "(function_type)");
+               return 0;
         } else {
                 if (!dwarf_diename(&type))
                         return -ENOENT;
@@ -867,39 +918,156 @@ int die_get_typename(Dwarf_Die *vr_die, char *buf, int len)
                 else if (tag == DW_TAG_enumeration_type)
                         tmp = "enum ";
                 /* Write a base name */
-               ret = snprintf(buf, len, "%s%s", tmp, dwarf_diename(&type));
-               return (ret >= len) ? -E2BIG : ret;
-       }
-       ret = die_get_typename(&type, buf, len);
-       if (ret > 0) {
-               ret2 = snprintf(buf + ret, len - ret, "%s", tmp);
-               ret = (ret2 >= len - ret) ? -E2BIG : ret2 + ret;
+               strbuf_addf(buf, "%s%s", tmp, dwarf_diename(&type));
+               return 0;
         }
+       ret = die_get_typename(&type, buf);
+       if (ret == 0)
+               strbuf_addf(buf, "%s", tmp);
+
         return ret;
  }
  
  /**
   * die_get_varname - Get the name and type of given variable DIE
   * @vr_die: a variable DIE
- * @buf: a buffer for type and variable name
- * @len: the max-length of @buf
+ * @buf: a strbuf for type and variable name
   *
   * Get the name and type of @vr_die and stores it in @buf as "type\tname".
   */
-int die_get_varname(Dwarf_Die *vr_die, char *buf, int len)
+int die_get_varname(Dwarf_Die *vr_die, struct strbuf *buf)
  {
-       int ret, ret2;
+       int ret;
  
-       ret = die_get_typename(vr_die, buf, len);
+       ret = die_get_typename(vr_die, buf);
         if (ret < 0) {
                 pr_debug("Failed to get type, make it unknown.\n");
-               ret = snprintf(buf, len, "(unknown_type)");
+               strbuf_addf(buf, "(unknown_type)");
         }
-       if (ret > 0) {
-               ret2 = snprintf(buf + ret, len - ret, "\t%s",
-                               dwarf_diename(vr_die));
-               ret = (ret2 >= len - ret) ? -E2BIG : ret2 + ret;
+
+       strbuf_addf(buf, "\t%s", dwarf_diename(vr_die));
+
+       return 0;
+}
+
+/**
+ * die_get_var_innermost_scope - Get innermost scope range of given variable DIE
+ * @sp_die: a subprogram DIE
+ * @vr_die: a variable DIE
+ * @buf: a strbuf for variable byte offset range
+ *
+ * Get the innermost scope range of @vr_die and store it in @buf as
+ * "@<function_name+[NN-NN,NN-NN]>".
+ */
+static int die_get_var_innermost_scope(Dwarf_Die *sp_die, Dwarf_Die *vr_die,
+                               struct strbuf *buf)
+{
+       Dwarf_Die *scopes = NULL; /* stays NULL (safe to free) if lookup fails */
+       int count;
+       size_t offset = 0;
+       Dwarf_Addr base;
+       Dwarf_Addr start, end;
+       Dwarf_Addr entry;
+       int ret;
+       bool first = true;
+       const char *name;
+
+       ret = dwarf_entrypc(sp_die, &entry);
+       if (ret)
+               return ret;
+
+       name = dwarf_diename(sp_die);
+       if (!name)
+               return -ENOENT;
+
+       count = dwarf_getscopes_die(vr_die, &scopes);
+
+       /* (*SCOPES)[1] is the DIE for the scope containing that scope */
+       if (count <= 1) {
+               ret = -EINVAL;
+               goto out;
         }
+
+       while ((offset = dwarf_ranges(&scopes[1], offset, &base,
+                               &start, &end)) > 0) {
+               start -= entry;
+               end -= entry;
+
+               if (first) {
+                       strbuf_addf(buf, "@<%s+[%" PRIu64 "-%" PRIu64,
+                               name, start, end);
+                       first = false;
+               } else {
+                       strbuf_addf(buf, ",%" PRIu64 "-%" PRIu64,
+                               start, end);
+               }
+       }
+
+       if (!first)
+               strbuf_addf(buf, "]>");
+
+out:
+       free(scopes);
         return ret;
  }
  
+/**
+ * die_get_var_range - Get byte offset range of given variable DIE
+ * @sp_die: a subprogram DIE
+ * @vr_die: a variable DIE
+ * @buf: a strbuf for type and variable name and byte offset range
+ *
+ * Get the byte offset range of @vr_die and stores it in @buf as
+ * "@<function_name+[NN-NN,NN-NN]>".
+ */
+int die_get_var_range(Dwarf_Die *sp_die, Dwarf_Die *vr_die, struct strbuf *buf)
+{
+       int ret = 0;
+       Dwarf_Addr base;
+       Dwarf_Addr start, end;
+       Dwarf_Addr entry;
+       Dwarf_Op *op;
+       size_t nops;
+       size_t offset = 0;
+       Dwarf_Attribute attr;
+       bool first = true;
+       const char *name;
+
+       ret = dwarf_entrypc(sp_die, &entry);
+       if (ret)
+               return ret;
+
+       name = dwarf_diename(sp_die);
+       if (!name)
+               return -ENOENT;
+
+       if (dwarf_attr(vr_die, DW_AT_location, &attr) == NULL)
+               return -EINVAL;
+
+       while ((offset = dwarf_getlocations(
+                               &attr, offset, &base,
+                               &start, &end, &op, &nops)) > 0) {
+               if (start == 0) {
+                       /* Single Location Descriptions */
+                       ret = die_get_var_innermost_scope(sp_die, vr_die, buf);
+                       return ret;
+               }
+
+               /* Location Lists */
+               start -= entry;
+               end -= entry;
+               if (first) {
+                       strbuf_addf(buf, "@<%s+[%" PRIu64 "-%" PRIu64,
+                               name, start, end);
+                       first = false;
+               } else {
+                       strbuf_addf(buf, ",%" PRIu64 "-%" PRIu64,
+                               start, end);
+               }
+       }
+
+       if (!first)
+               strbuf_addf(buf, "]>");
+
+       return ret;
+}
diff --git a/tools/perf/util/dwarf-aux.h b/tools/perf/util/dwarf-aux.h

index af7dbcd5f929947cfdd0b146f20ad65d45059bb8..c42ec366f2a72fb7999e22f257ebc82765ee4919 100644 (file)
--- a/tools/perf/util/dwarf-aux.h
+++ b/tools/perf/util/dwarf-aux.h
@@ -47,6 +47,9 @@ extern bool die_is_func_instance(Dwarf_Die *dw_die);
  /* Compare diename and tname */
  extern bool die_compare_name(Dwarf_Die *dw_die, const char *tname);
  
+/* Matching diename with glob pattern */
+extern bool die_match_name(Dwarf_Die *dw_die, const char *glob);
+
  /* Get callsite line number of inline-function instance */
  extern int die_get_call_lineno(Dwarf_Die *in_die);
  
@@ -82,6 +85,10 @@ extern Dwarf_Die *die_find_child(Dwarf_Die *rt_die,
  extern Dwarf_Die *die_find_realfunc(Dwarf_Die *cu_die, Dwarf_Addr addr,
                                     Dwarf_Die *die_mem);
  
+/* Search a non-inlined function with tail call at given address */
+Dwarf_Die *die_find_tailfunc(Dwarf_Die *cu_die, Dwarf_Addr addr,
+                                   Dwarf_Die *die_mem);
+
  /* Search the top inlined function including given address */
  extern Dwarf_Die *die_find_top_inlinefunc(Dwarf_Die *sp_die, Dwarf_Addr addr,
                                           Dwarf_Die *die_mem);
@@ -114,8 +121,10 @@ extern Dwarf_Die *die_find_member(Dwarf_Die *st_die, const char *name,
                                   Dwarf_Die *die_mem);
  
  /* Get the name of given variable DIE */
-extern int die_get_typename(Dwarf_Die *vr_die, char *buf, int len);
+extern int die_get_typename(Dwarf_Die *vr_die, struct strbuf *buf);
  
  /* Get the name and type of given variable DIE, stored as "type\tname" */
-extern int die_get_varname(Dwarf_Die *vr_die, char *buf, int len);
+extern int die_get_varname(Dwarf_Die *vr_die, struct strbuf *buf);
+extern int die_get_var_range(Dwarf_Die *sp_die, Dwarf_Die *vr_die,
+                       struct strbuf *buf);
  #endif
diff --git a/tools/perf/util/environment.c b/tools/perf/util/environment.c

index 275b0ee345f5eab4b1dcaebc18d5068698d9e437..7405123692f14919bc1928c67f4819ef3503e1ba 100644 (file)
--- a/tools/perf/util/environment.c
+++ b/tools/perf/util/environment.c
@@ -5,5 +5,4 @@
   */
  #include "cache.h"
  
-const char *pager_program;
  int pager_use_color = 1;
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c

index ff866c4d2e2f09ea4abc1650d7c413776b3bb93f..d7d986d8f23e5f890cc295b3b830470343f34063 100644 (file)
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -23,12 +23,18 @@ static const char *perf_event__names[] = {
         [PERF_RECORD_FORK]                      = "FORK",
         [PERF_RECORD_READ]                      = "READ",
         [PERF_RECORD_SAMPLE]                    = "SAMPLE",
+       [PERF_RECORD_AUX]                       = "AUX",
+       [PERF_RECORD_ITRACE_START]              = "ITRACE_START",
+       [PERF_RECORD_LOST_SAMPLES]              = "LOST_SAMPLES",
         [PERF_RECORD_HEADER_ATTR]               = "ATTR",
         [PERF_RECORD_HEADER_EVENT_TYPE]         = "EVENT_TYPE",
         [PERF_RECORD_HEADER_TRACING_DATA]       = "TRACING_DATA",
         [PERF_RECORD_HEADER_BUILD_ID]           = "BUILD_ID",
         [PERF_RECORD_FINISHED_ROUND]            = "FINISHED_ROUND",
         [PERF_RECORD_ID_INDEX]                  = "ID_INDEX",
+       [PERF_RECORD_AUXTRACE_INFO]             = "AUXTRACE_INFO",
+       [PERF_RECORD_AUXTRACE]                  = "AUXTRACE",
+       [PERF_RECORD_AUXTRACE_ERROR]            = "AUXTRACE_ERROR",
  };
  
  const char *perf_event__name(unsigned int id)
@@ -212,10 +218,14 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool,
                                        pid_t pid, pid_t tgid,
                                        perf_event__handler_t process,
                                        struct machine *machine,
-                                      bool mmap_data)
+                                      bool mmap_data,
+                                      unsigned int proc_map_timeout)
  {
         char filename[PATH_MAX];
         FILE *fp;
+       unsigned long long t;
+       bool truncation = false;
+       unsigned long long timeout = proc_map_timeout * 1000000ULL;
         int rc = 0;
  
         if (machine__is_default_guest(machine))
@@ -234,6 +244,7 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool,
         }
  
         event->header.type = PERF_RECORD_MMAP2;
+       t = rdclock();
  
         while (1) {
                 char bf[BUFSIZ];
@@ -247,6 +258,15 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool,
                 if (fgets(bf, sizeof(bf), fp) == NULL)
                         break;
  
+               if ((rdclock() - t) > timeout) {
+                       pr_warning("Reading %s time out. "
+                                  "You may want to increase "
+                                  "the time limit by --proc-map-timeout\n",
+                                  filename);
+                       truncation = true;
+                       goto out;
+               }
+
                 /* ensure null termination since stack will be reused. */
                 strcpy(execname, "");
  
@@ -295,6 +315,10 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool,
                         event->header.misc |= PERF_RECORD_MISC_MMAP_DATA;
                 }
  
+out:
+               if (truncation)
+                       event->header.misc |= PERF_RECORD_MISC_PROC_MAP_PARSE_TIMEOUT;
+
                 if (!strcmp(execname, ""))
                         strcpy(execname, anonstr);
  
@@ -313,6 +337,9 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool,
                         rc = -1;
                         break;
                 }
+
+               if (truncation)
+                       break;
         }
  
         fclose(fp);
@@ -324,8 +351,9 @@ int perf_event__synthesize_modules(struct perf_tool *tool,
                                    struct machine *machine)
  {
         int rc = 0;
-       struct rb_node *nd;
+       struct map *pos;
         struct map_groups *kmaps = &machine->kmaps;
+       struct maps *maps = &kmaps->maps[MAP__FUNCTION];
         union perf_event *event = zalloc((sizeof(event->mmap) +
                                           machine->id_hdr_size));
         if (event == NULL) {
@@ -345,10 +373,8 @@ int perf_event__synthesize_modules(struct perf_tool *tool,
         else
                 event->header.misc = PERF_RECORD_MISC_GUEST_KERNEL;
  
-       for (nd = rb_first(&kmaps->maps[MAP__FUNCTION]);
-            nd; nd = rb_next(nd)) {
+       for (pos = maps__first(maps); pos; pos = map__next(pos)) {
                 size_t size;
-               struct map *pos = rb_entry(nd, struct map, rb_node);
  
                 if (pos->dso->kernel)
                         continue;
@@ -381,7 +407,9 @@ static int __event__synthesize_thread(union perf_event *comm_event,
                                       pid_t pid, int full,
                                           perf_event__handler_t process,
                                       struct perf_tool *tool,
-                                     struct machine *machine, bool mmap_data)
+                                     struct machine *machine,
+                                     bool mmap_data,
+                                     unsigned int proc_map_timeout)
  {
         char filename[PATH_MAX];
         DIR *tasks;
@@ -398,7 +426,8 @@ static int __event__synthesize_thread(union perf_event *comm_event,
                         return -1;
  
                 return perf_event__synthesize_mmap_events(tool, mmap_event, pid, tgid,
-                                                         process, machine, mmap_data);
+                                                         process, machine, mmap_data,
+                                                         proc_map_timeout);
         }
  
         if (machine__is_default_guest(machine))
@@ -439,7 +468,7 @@ static int __event__synthesize_thread(union perf_event *comm_event,
                 if (_pid == pid) {
                         /* process the parent's maps too */
                         rc = perf_event__synthesize_mmap_events(tool, mmap_event, pid, tgid,
-                                               process, machine, mmap_data);
+                                               process, machine, mmap_data, proc_map_timeout);
                         if (rc)
                                 break;
                 }
@@ -453,7 +482,8 @@ int perf_event__synthesize_thread_map(struct perf_tool *tool,
                                       struct thread_map *threads,
                                       perf_event__handler_t process,
                                       struct machine *machine,
-                                     bool mmap_data)
+                                     bool mmap_data,
+                                     unsigned int proc_map_timeout)
  {
         union perf_event *comm_event, *mmap_event, *fork_event;
         int err = -1, thread, j;
@@ -476,7 +506,7 @@ int perf_event__synthesize_thread_map(struct perf_tool *tool,
                                                fork_event,
                                                threads->map[thread], 0,
                                                process, tool, machine,
-                                              mmap_data)) {
+                                              mmap_data, proc_map_timeout)) {
                         err = -1;
                         break;
                 }
@@ -502,7 +532,7 @@ int perf_event__synthesize_thread_map(struct perf_tool *tool,
                                                        fork_event,
                                                        comm_event->comm.pid, 0,
                                                        process, tool, machine,
-                                                      mmap_data)) {
+                                                      mmap_data, proc_map_timeout)) {
                                 err = -1;
                                 break;
                         }
@@ -519,7 +549,9 @@ out:
  
  int perf_event__synthesize_threads(struct perf_tool *tool,
                                    perf_event__handler_t process,
-                                  struct machine *machine, bool mmap_data)
+                                  struct machine *machine,
+                                  bool mmap_data,
+                                  unsigned int proc_map_timeout)
  {
         DIR *proc;
         char proc_path[PATH_MAX];
@@ -559,7 +591,8 @@ int perf_event__synthesize_threads(struct perf_tool *tool,
                  * one thread couldn't be synthesized.
                  */
                 __event__synthesize_thread(comm_event, mmap_event, fork_event, pid,
-                                          1, process, tool, machine, mmap_data);
+                                          1, process, tool, machine, mmap_data,
+                                          proc_map_timeout);
         }
  
         err = 0;
@@ -692,6 +725,30 @@ int perf_event__process_lost(struct perf_tool *tool __maybe_unused,
         return machine__process_lost_event(machine, event, sample);
  }
  
+int perf_event__process_aux(struct perf_tool *tool __maybe_unused,
+                           union perf_event *event,
+                           struct perf_sample *sample __maybe_unused,
+                           struct machine *machine)
+{
+       return machine__process_aux_event(machine, event);
+}
+
+int perf_event__process_itrace_start(struct perf_tool *tool __maybe_unused,
+                                    union perf_event *event,
+                                    struct perf_sample *sample __maybe_unused,
+                                    struct machine *machine)
+{
+       return machine__process_itrace_start_event(machine, event);
+}
+
+int perf_event__process_lost_samples(struct perf_tool *tool __maybe_unused,
+                                    union perf_event *event,
+                                    struct perf_sample *sample,
+                                    struct machine *machine)
+{
+       return machine__process_lost_samples_event(machine, event, sample);
+}
+
  size_t perf_event__fprintf_mmap(union perf_event *event, FILE *fp)
  {
         return fprintf(fp, " %d/%d: [%#" PRIx64 "(%#" PRIx64 ") @ %#" PRIx64 "]: %c %s\n",
@@ -755,6 +812,21 @@ int perf_event__process_exit(struct perf_tool *tool __maybe_unused,
         return machine__process_exit_event(machine, event, sample);
  }
  
+size_t perf_event__fprintf_aux(union perf_event *event, FILE *fp)
+{
+       return fprintf(fp, " offset: %#"PRIx64" size: %#"PRIx64" flags: %#"PRIx64" [%s%s]\n",
+                      event->aux.aux_offset, event->aux.aux_size,
+                      event->aux.flags,
+                      event->aux.flags & PERF_AUX_FLAG_TRUNCATED ? "T" : "",
+                      event->aux.flags & PERF_AUX_FLAG_OVERWRITE ? "O" : "");
+}
+
+size_t perf_event__fprintf_itrace_start(union perf_event *event, FILE *fp)
+{
+       return fprintf(fp, " pid: %u tid: %u\n",
+                      event->itrace_start.pid, event->itrace_start.tid);
+}
+
  size_t perf_event__fprintf(union perf_event *event, FILE *fp)
  {
         size_t ret = fprintf(fp, "PERF_RECORD_%s",
@@ -774,6 +846,12 @@ size_t perf_event__fprintf(union perf_event *event, FILE *fp)
         case PERF_RECORD_MMAP2:
                 ret += perf_event__fprintf_mmap2(event, fp);
                 break;
+       case PERF_RECORD_AUX:
+               ret += perf_event__fprintf_aux(event, fp);
+               break;
+       case PERF_RECORD_ITRACE_START:
+               ret += perf_event__fprintf_itrace_start(event, fp);
+               break;
         default:
                 ret += fprintf(fp, "\n");
         }
@@ -877,6 +955,10 @@ void thread__find_addr_location(struct thread *thread,
                 al->sym = NULL;
  }
  
+/*
+ * Callers need to drop the reference to al->thread, obtained in
+ * machine__findnew_thread()
+ */
  int perf_event__preprocess_sample(const union perf_event *event,
                                   struct machine *machine,
                                   struct addr_location *al,
@@ -937,6 +1019,17 @@ int perf_event__preprocess_sample(const union perf_event *event,
         return 0;
  }
  
+/*
+ * The preprocess_sample method returns holding reference counts on the
+ * entries in the addr_location. When done using them (perhaps after taking
+ * extra ref counts to keep a pointer to one of those entries), pair it with
+ * addr_location__put(), so that the refcounts can be decremented.
+ */
+void addr_location__put(struct addr_location *al)
+{
+       thread__zput(al->thread);
+}
+
  bool is_bts_event(struct perf_event_attr *attr)
  {
         return attr->type == PERF_TYPE_HARDWARE &&
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h

index 09b9e8d3fcf7fae705afcc0d3ea688f2aca77665..c53f36384b64532abec852c9e6d7a496a0d982a6 100644 (file)
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -52,6 +52,11 @@ struct lost_event {
         u64 lost;
  };
  
+struct lost_samples_event {
+       struct perf_event_header header;
+       u64 lost;
+};
+
  /*
   * PERF_FORMAT_ENABLED | PERF_FORMAT_RUNNING | PERF_FORMAT_ID
   */
@@ -157,6 +162,8 @@ enum {
         PERF_IP_FLAG_IN_TX              = 1ULL << 10,
  };
  
+#define PERF_IP_FLAG_CHARS "bcrosyiABEx"
+
  #define PERF_BRANCH_MASK               (\
         PERF_IP_FLAG_BRANCH             |\
         PERF_IP_FLAG_CALL               |\
@@ -215,9 +222,17 @@ enum perf_user_event_type { /* above any possible kernel type */
         PERF_RECORD_HEADER_BUILD_ID             = 67,
         PERF_RECORD_FINISHED_ROUND              = 68,
         PERF_RECORD_ID_INDEX                    = 69,
+       PERF_RECORD_AUXTRACE_INFO               = 70,
+       PERF_RECORD_AUXTRACE                    = 71,
+       PERF_RECORD_AUXTRACE_ERROR              = 72,
         PERF_RECORD_HEADER_MAX
  };
  
+enum auxtrace_error_type {
+       PERF_AUXTRACE_ERROR_ITRACE  = 1,
+       PERF_AUXTRACE_ERROR_MAX
+};
+
  /*
   * The kernel collects the number of events it couldn't send in a stretch and
   * when possible sends this number in a PERF_RECORD_LOST event. The number of
@@ -225,6 +240,12 @@ enum perf_user_event_type { /* above any possible kernel type */
   * total_lost tells exactly how many events the kernel in fact lost, i.e. it is
   * the sum of all struct lost_event.lost fields reported.
   *
+ * The kernel discards mixed up samples and sends the number in a
+ * PERF_RECORD_LOST_SAMPLES event. The number of lost-samples events is stored
+ * in .nr_events[PERF_RECORD_LOST_SAMPLES] while total_lost_samples tells
+ * exactly how many samples the kernel in fact dropped, i.e. it is the sum of
+ * all struct lost_samples_event.lost fields reported.
+ *
   * The total_period is needed because by default auto-freq is used, so
   * multipling nr_events[PERF_EVENT_SAMPLE] by a frequency isn't possible to get
   * the total number of low level events, it is necessary to to sum all struct
@@ -234,6 +255,7 @@ struct events_stats {
         u64 total_period;
         u64 total_non_filtered_period;
         u64 total_lost;
+       u64 total_lost_samples;
         u64 total_invalid_chains;
         u32 nr_events[PERF_RECORD_HEADER_MAX];
         u32 nr_non_filtered_samples;
@@ -242,6 +264,8 @@ struct events_stats {
         u32 nr_invalid_chains;
         u32 nr_unknown_id;
         u32 nr_unprocessable_samples;
+       u32 nr_auxtrace_errors[PERF_AUXTRACE_ERROR_MAX];
+       u32 nr_proc_map_timeout;
  };
  
  struct attr_event {
@@ -280,6 +304,50 @@ struct id_index_event {
         struct id_index_entry entries[0];
  };
  
+struct auxtrace_info_event {
+       struct perf_event_header header;
+       u32 type;
+       u32 reserved__; /* For alignment */
+       u64 priv[];
+};
+
+struct auxtrace_event {
+       struct perf_event_header header;
+       u64 size;
+       u64 offset;
+       u64 reference;
+       u32 idx;
+       u32 tid;
+       u32 cpu;
+       u32 reserved__; /* For alignment */
+};
+
+#define MAX_AUXTRACE_ERROR_MSG 64
+
+struct auxtrace_error_event {
+       struct perf_event_header header;
+       u32 type;
+       u32 code;
+       u32 cpu;
+       u32 pid;
+       u32 tid;
+       u32 reserved__; /* For alignment */
+       u64 ip;
+       char msg[MAX_AUXTRACE_ERROR_MSG];
+};
+
+struct aux_event {
+       struct perf_event_header header;
+       u64     aux_offset;
+       u64     aux_size;
+       u64     flags;
+};
+
+struct itrace_start_event {
+       struct perf_event_header header;
+       u32 pid, tid;
+};
+
  union perf_event {
         struct perf_event_header        header;
         struct mmap_event               mmap;
@@ -287,6 +355,7 @@ union perf_event {
         struct comm_event               comm;
         struct fork_event               fork;
         struct lost_event               lost;
+       struct lost_samples_event       lost_samples;
         struct read_event               read;
         struct throttle_event           throttle;
         struct sample_event             sample;
@@ -295,6 +364,11 @@ union perf_event {
         struct tracing_data_event       tracing_data;
         struct build_id_event           build_id;
         struct id_index_event           id_index;
+       struct auxtrace_info_event      auxtrace_info;
+       struct auxtrace_event           auxtrace;
+       struct auxtrace_error_event     auxtrace_error;
+       struct aux_event                aux;
+       struct itrace_start_event       itrace_start;
  };
  
  void perf_event__print_totals(void);
@@ -310,10 +384,12 @@ typedef int (*perf_event__handler_t)(struct perf_tool *tool,
  int perf_event__synthesize_thread_map(struct perf_tool *tool,
                                       struct thread_map *threads,
                                       perf_event__handler_t process,
-                                     struct machine *machine, bool mmap_data);
+                                     struct machine *machine, bool mmap_data,
+                                     unsigned int proc_map_timeout);
  int perf_event__synthesize_threads(struct perf_tool *tool,
                                    perf_event__handler_t process,
-                                  struct machine *machine, bool mmap_data);
+                                  struct machine *machine, bool mmap_data,
+                                  unsigned int proc_map_timeout);
  int perf_event__synthesize_kernel_mmap(struct perf_tool *tool,
                                        perf_event__handler_t process,
                                        struct machine *machine);
@@ -330,6 +406,18 @@ int perf_event__process_lost(struct perf_tool *tool,
                              union perf_event *event,
                              struct perf_sample *sample,
                              struct machine *machine);
+int perf_event__process_lost_samples(struct perf_tool *tool,
+                                    union perf_event *event,
+                                    struct perf_sample *sample,
+                                    struct machine *machine);
+int perf_event__process_aux(struct perf_tool *tool,
+                           union perf_event *event,
+                           struct perf_sample *sample,
+                           struct machine *machine);
+int perf_event__process_itrace_start(struct perf_tool *tool,
+                                    union perf_event *event,
+                                    struct perf_sample *sample,
+                                    struct machine *machine);
  int perf_event__process_mmap(struct perf_tool *tool,
                              union perf_event *event,
                              struct perf_sample *sample,
@@ -358,6 +446,8 @@ int perf_event__preprocess_sample(const union perf_event *event,
                                   struct addr_location *al,
                                   struct perf_sample *sample);
  
+void addr_location__put(struct addr_location *al);
+
  struct thread;
  
  bool is_bts_event(struct perf_event_attr *attr);
@@ -381,12 +471,15 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool,
                                        pid_t pid, pid_t tgid,
                                        perf_event__handler_t process,
                                        struct machine *machine,
-                                      bool mmap_data);
+                                      bool mmap_data,
+                                      unsigned int proc_map_timeout);
  
  size_t perf_event__fprintf_comm(union perf_event *event, FILE *fp);
  size_t perf_event__fprintf_mmap(union perf_event *event, FILE *fp);
  size_t perf_event__fprintf_mmap2(union perf_event *event, FILE *fp);
  size_t perf_event__fprintf_task(union perf_event *event, FILE *fp);
+size_t perf_event__fprintf_aux(union perf_event *event, FILE *fp);
+size_t perf_event__fprintf_itrace_start(union perf_event *event, FILE *fp);
  size_t perf_event__fprintf(union perf_event *event, FILE *fp);
  
  u64 kallsyms__get_function_start(const char *kallsyms_filename,
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c

index 080be93eea969f9893bd789cde845bf4a94a37bd..8366511b45f8327a65dc44e80544f6b08e0df24a 100644 (file)
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -297,6 +297,8 @@ void perf_evlist__disable(struct perf_evlist *evlist)
                                       PERF_EVENT_IOC_DISABLE, 0);
                 }
         }
+
+       evlist->enabled = false;
  }
  
  void perf_evlist__enable(struct perf_evlist *evlist)
@@ -316,6 +318,13 @@ void perf_evlist__enable(struct perf_evlist *evlist)
                                       PERF_EVENT_IOC_ENABLE, 0);
                 }
         }
+
+       evlist->enabled = true;
+}
+
+void perf_evlist__toggle_enable(struct perf_evlist *evlist)
+{
+       (evlist->enabled ? perf_evlist__disable : perf_evlist__enable)(evlist);
  }
  
  int perf_evlist__disable_event(struct perf_evlist *evlist,
@@ -634,11 +643,18 @@ static struct perf_evsel *perf_evlist__event2evsel(struct perf_evlist *evlist,
  union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx)
  {
         struct perf_mmap *md = &evlist->mmap[idx];
-       u64 head = perf_mmap__read_head(md);
+       u64 head;
         u64 old = md->prev;
         unsigned char *data = md->base + page_size;
         union perf_event *event = NULL;
  
+       /*
+        * Check if event was unmapped due to a POLLHUP/POLLERR.
+        */
+       if (!atomic_read(&md->refcnt))
+               return NULL;
+
+       head = perf_mmap__read_head(md);
         if (evlist->overwrite) {
                 /*
                  * If we're further behind than half the buffer, there's a chance
@@ -695,19 +711,19 @@ union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx)
  
  static bool perf_mmap__empty(struct perf_mmap *md)
  {
-       return perf_mmap__read_head(md) == md->prev;
+       return perf_mmap__read_head(md) == md->prev && !md->auxtrace_mmap.base;
  }
  
  static void perf_evlist__mmap_get(struct perf_evlist *evlist, int idx)
  {
-       ++evlist->mmap[idx].refcnt;
+       atomic_inc(&evlist->mmap[idx].refcnt);
  }
  
  static void perf_evlist__mmap_put(struct perf_evlist *evlist, int idx)
  {
-       BUG_ON(evlist->mmap[idx].refcnt == 0);
+       BUG_ON(atomic_read(&evlist->mmap[idx].refcnt) == 0);
  
-       if (--evlist->mmap[idx].refcnt == 0)
+       if (atomic_dec_and_test(&evlist->mmap[idx].refcnt))
                 __perf_evlist__munmap(evlist, idx);
  }
  
@@ -721,17 +737,46 @@ void perf_evlist__mmap_consume(struct perf_evlist *evlist, int idx)
                 perf_mmap__write_tail(md, old);
         }
  
-       if (md->refcnt == 1 && perf_mmap__empty(md))
+       if (atomic_read(&md->refcnt) == 1 && perf_mmap__empty(md))
                 perf_evlist__mmap_put(evlist, idx);
  }
  
+int __weak auxtrace_mmap__mmap(struct auxtrace_mmap *mm __maybe_unused,
+                              struct auxtrace_mmap_params *mp __maybe_unused,
+                              void *userpg __maybe_unused,
+                              int fd __maybe_unused)
+{
+       return 0;
+}
+
+void __weak auxtrace_mmap__munmap(struct auxtrace_mmap *mm __maybe_unused)
+{
+}
+
+void __weak auxtrace_mmap_params__init(
+                       struct auxtrace_mmap_params *mp __maybe_unused,
+                       off_t auxtrace_offset __maybe_unused,
+                       unsigned int auxtrace_pages __maybe_unused,
+                       bool auxtrace_overwrite __maybe_unused)
+{
+}
+
+void __weak auxtrace_mmap_params__set_idx(
+                       struct auxtrace_mmap_params *mp __maybe_unused,
+                       struct perf_evlist *evlist __maybe_unused,
+                       int idx __maybe_unused,
+                       bool per_cpu __maybe_unused)
+{
+}
+
  static void __perf_evlist__munmap(struct perf_evlist *evlist, int idx)
  {
         if (evlist->mmap[idx].base != NULL) {
                 munmap(evlist->mmap[idx].base, evlist->mmap_len);
                 evlist->mmap[idx].base = NULL;
-               evlist->mmap[idx].refcnt = 0;
+               atomic_set(&evlist->mmap[idx].refcnt, 0);
         }
+       auxtrace_mmap__munmap(&evlist->mmap[idx].auxtrace_mmap);
  }
  
  void perf_evlist__munmap(struct perf_evlist *evlist)
@@ -759,6 +804,7 @@ static int perf_evlist__alloc_mmap(struct perf_evlist *evlist)
  struct mmap_params {
         int prot;
         int mask;
+       struct auxtrace_mmap_params auxtrace_mp;
  };
  
  static int __perf_evlist__mmap(struct perf_evlist *evlist, int idx,
@@ -777,7 +823,7 @@ static int __perf_evlist__mmap(struct perf_evlist *evlist, int idx,
          * evlist layer can't just drop it when filtering events in
          * perf_evlist__filter_pollfd().
          */
-       evlist->mmap[idx].refcnt = 2;
+       atomic_set(&evlist->mmap[idx].refcnt, 2);
         evlist->mmap[idx].prev = 0;
         evlist->mmap[idx].mask = mp->mask;
         evlist->mmap[idx].base = mmap(NULL, evlist->mmap_len, mp->prot,
@@ -789,6 +835,10 @@ static int __perf_evlist__mmap(struct perf_evlist *evlist, int idx,
                 return -1;
         }
  
+       if (auxtrace_mmap__mmap(&evlist->mmap[idx].auxtrace_mmap,
+                               &mp->auxtrace_mp, evlist->mmap[idx].base, fd))
+               return -1;
+
         return 0;
  }
  
@@ -853,6 +903,9 @@ static int perf_evlist__mmap_per_cpu(struct perf_evlist *evlist,
         for (cpu = 0; cpu < nr_cpus; cpu++) {
                 int output = -1;
  
+               auxtrace_mmap_params__set_idx(&mp->auxtrace_mp, evlist, cpu,
+                                             true);
+
                 for (thread = 0; thread < nr_threads; thread++) {
                         if (perf_evlist__mmap_per_evsel(evlist, cpu, mp, cpu,
                                                         thread, &output))
@@ -878,6 +931,9 @@ static int perf_evlist__mmap_per_thread(struct perf_evlist *evlist,
         for (thread = 0; thread < nr_threads; thread++) {
                 int output = -1;
  
+               auxtrace_mmap_params__set_idx(&mp->auxtrace_mp, evlist, thread,
+                                             false);
+
                 if (perf_evlist__mmap_per_evsel(evlist, thread, mp, 0, thread,
                                                 &output))
                         goto out_unmap;
@@ -960,10 +1016,8 @@ static long parse_pages_arg(const char *str, unsigned long min,
         return pages;
  }
  
-int perf_evlist__parse_mmap_pages(const struct option *opt, const char *str,
-                                 int unset __maybe_unused)
+int __perf_evlist__parse_mmap_pages(unsigned int *mmap_pages, const char *str)
  {
-       unsigned int *mmap_pages = opt->value;
         unsigned long max = UINT_MAX;
         long pages;
  
@@ -980,20 +1034,32 @@ int perf_evlist__parse_mmap_pages(const struct option *opt, const char *str,
         return 0;
  }
  
+int perf_evlist__parse_mmap_pages(const struct option *opt, const char *str,
+                                 int unset __maybe_unused)
+{
+       return __perf_evlist__parse_mmap_pages(opt->value, str);
+}
+
  /**
- * perf_evlist__mmap - Create mmaps to receive events.
+ * perf_evlist__mmap_ex - Create mmaps to receive events.
   * @evlist: list of events
   * @pages: map length in pages
   * @overwrite: overwrite older events?
+ * @auxtrace_pages: auxtrace map length in pages
+ * @auxtrace_overwrite: overwrite older auxtrace data?
   *
   * If @overwrite is %false the user needs to signal event consumption using
   * perf_mmap__write_tail().  Using perf_evlist__mmap_read() does this
   * automatically.
   *
+ * Similarly, if @auxtrace_overwrite is %false the user needs to signal data
+ * consumption using auxtrace_mmap__write_tail().
+ *
   * Return: %0 on success, negative error code otherwise.
   */
-int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages,
-                     bool overwrite)
+int perf_evlist__mmap_ex(struct perf_evlist *evlist, unsigned int pages,
+                        bool overwrite, unsigned int auxtrace_pages,
+                        bool auxtrace_overwrite)
  {
         struct perf_evsel *evsel;
         const struct cpu_map *cpus = evlist->cpus;
@@ -1013,6 +1079,9 @@ int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages,
         pr_debug("mmap size %zuB\n", evlist->mmap_len);
         mp.mask = evlist->mmap_len - page_size - 1;
  
+       auxtrace_mmap_params__init(&mp.auxtrace_mp, evlist->mmap_len,
+                                  auxtrace_pages, auxtrace_overwrite);
+
         evlist__for_each(evlist, evsel) {
                 if ((evsel->attr.read_format & PERF_FORMAT_ID) &&
                     evsel->sample_id == NULL &&
@@ -1026,6 +1095,12 @@ int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages,
         return perf_evlist__mmap_per_cpu(evlist, &mp);
  }
  
+int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages,
+                     bool overwrite)
+{
+       return perf_evlist__mmap_ex(evlist, pages, overwrite, 0, false);
+}
+
  int perf_evlist__create_maps(struct perf_evlist *evlist, struct target *target)
  {
         evlist->threads = thread_map__new_str(target->pid, target->tid,
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h

index b5cce95d644e0c3af3c04d5edd91cdaf3aabb20d..a8489b9d2812baecf1ba4709ff3b6da9787adda0 100644 (file)
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -1,6 +1,7 @@
  #ifndef __PERF_EVLIST_H
  #define __PERF_EVLIST_H 1
  
+#include <linux/atomic.h>
  #include <linux/list.h>
  #include <api/fd/array.h>
  #include <stdio.h>
@@ -8,6 +9,7 @@
  #include "event.h"
  #include "evsel.h"
  #include "util.h"
+#include "auxtrace.h"
  #include <unistd.h>
  
  struct pollfd;
@@ -26,8 +28,9 @@ struct record_opts;
  struct perf_mmap {
         void             *base;
         int              mask;
-       int              refcnt;
+       atomic_t         refcnt;
         u64              prev;
+       struct auxtrace_mmap auxtrace_mmap;
         char             event_copy[PERF_SAMPLE_MAX_SIZE] __attribute__((aligned(8)));
  };
  
@@ -37,6 +40,8 @@ struct perf_evlist {
         int              nr_entries;
         int              nr_groups;
         int              nr_mmaps;
+       bool             overwrite;
+       bool             enabled;
         size_t           mmap_len;
         int              id_pos;
         int              is_pos;
@@ -45,7 +50,6 @@ struct perf_evlist {
                 int     cork_fd;
                 pid_t   pid;
         } workload;
-       bool             overwrite;
         struct fdarray   pollfd;
         struct perf_mmap *mmap;
         struct thread_map *threads;
@@ -122,16 +126,21 @@ int perf_evlist__start_workload(struct perf_evlist *evlist);
  
  struct option;
  
+int __perf_evlist__parse_mmap_pages(unsigned int *mmap_pages, const char *str);
  int perf_evlist__parse_mmap_pages(const struct option *opt,
                                   const char *str,
                                   int unset);
  
+int perf_evlist__mmap_ex(struct perf_evlist *evlist, unsigned int pages,
+                        bool overwrite, unsigned int auxtrace_pages,
+                        bool auxtrace_overwrite);
  int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages,
                       bool overwrite);
  void perf_evlist__munmap(struct perf_evlist *evlist);
  
  void perf_evlist__disable(struct perf_evlist *evlist);
  void perf_evlist__enable(struct perf_evlist *evlist);
+void perf_evlist__toggle_enable(struct perf_evlist *evlist);
  
  int perf_evlist__disable_event(struct perf_evlist *evlist,
                                struct perf_evsel *evsel);
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c

index 33e3fd8c2e682d19c8c8dbda12b9815199ef47bf..33449decf7bd2c24d981fdf30c10042063a503ee 100644 (file)
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -26,6 +26,7 @@
  #include "perf_regs.h"
  #include "debug.h"
  #include "trace-event.h"
+#include "stat.h"
  
  static struct {
         bool sample_id_all;
@@ -851,19 +852,6 @@ int perf_evsel__alloc_id(struct perf_evsel *evsel, int ncpus, int nthreads)
         return 0;
  }
  
-void perf_evsel__reset_counts(struct perf_evsel *evsel, int ncpus)
-{
-       memset(evsel->counts, 0, (sizeof(*evsel->counts) +
-                                (ncpus * sizeof(struct perf_counts_values))));
-}
-
-int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus)
-{
-       evsel->counts = zalloc((sizeof(*evsel->counts) +
-                               (ncpus * sizeof(struct perf_counts_values))));
-       return evsel->counts != NULL ? 0 : -ENOMEM;
-}
-
  static void perf_evsel__free_fd(struct perf_evsel *evsel)
  {
         xyarray__delete(evsel->fd);
@@ -891,11 +879,6 @@ void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads)
                 }
  }
  
-void perf_evsel__free_counts(struct perf_evsel *evsel)
-{
-       zfree(&evsel->counts);
-}
-
  void perf_evsel__exit(struct perf_evsel *evsel)
  {
         assert(list_empty(&evsel->node));
@@ -1058,7 +1041,7 @@ static void __p_read_format(char *buf, size_t size, u64 value)
  
  #define BUF_SIZE               1024
  
-#define p_hex(val)             snprintf(buf, BUF_SIZE, "%"PRIx64, (uint64_t)(val))
+#define p_hex(val)             snprintf(buf, BUF_SIZE, "%#"PRIx64, (uint64_t)(val))
  #define p_unsigned(val)                snprintf(buf, BUF_SIZE, "%"PRIu64, (uint64_t)(val))
  #define p_signed(val)          snprintf(buf, BUF_SIZE, "%"PRId64, (int64_t)(val))
  #define p_sample_type(val)     __p_sample_type(buf, BUF_SIZE, val)
@@ -1121,6 +1104,7 @@ int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr,
         PRINT_ATTRf(sample_stack_user, p_unsigned);
         PRINT_ATTRf(clockid, p_signed);
         PRINT_ATTRf(sample_regs_intr, p_hex);
+       PRINT_ATTRf(aux_watermark, p_unsigned);
  
         return ret;
  }
@@ -2148,7 +2132,9 @@ int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target,
         case EMFILE:
                 return scnprintf(msg, size, "%s",
                          "Too many events are opened.\n"
-                        "Try again after reducing the number of events.");
+                        "Probably the maximum number of open file descriptors has been reached.\n"
+                        "Hint: Try again after reducing the number of events.\n"
+                        "Hint: Try increasing the limit with 'ulimit -n <limit>'");
         case ENODEV:
                 if (target->cpu_list)
                         return scnprintf(msg, size, "%s",
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h

index e486151b03089720eed68436bbae7a454defb3ec..bb0579e8a10a4556119c5aa313cd22ddacdbc220 100644 (file)
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -73,7 +73,6 @@ struct perf_evsel {
         char                    *name;
         double                  scale;
         const char              *unit;
-       bool                    snapshot;
         struct event_format     *tp_format;
         union {
                 void            *priv;
@@ -86,6 +85,7 @@ struct perf_evsel {
         unsigned int            sample_size;
         int                     id_pos;
         int                     is_pos;
+       bool                    snapshot;
         bool                    supported;
         bool                    needs_swap;
         bool                    no_aux_samples;
@@ -93,11 +93,11 @@ struct perf_evsel {
         bool                    system_wide;
         bool                    tracking;
         bool                    per_pkg;
-       unsigned long           *per_pkg_mask;
         /* parse modifier helper */
         int                     exclude_GH;
         int                     nr_members;
         int                     sample_read;
+       unsigned long           *per_pkg_mask;
         struct perf_evsel       *leader;
         char                    *group_name;
  };
@@ -170,9 +170,6 @@ const char *perf_evsel__group_name(struct perf_evsel *evsel);
  int perf_evsel__group_desc(struct perf_evsel *evsel, char *buf, size_t size);
  
  int perf_evsel__alloc_id(struct perf_evsel *evsel, int ncpus, int nthreads);
-int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus);
-void perf_evsel__reset_counts(struct perf_evsel *evsel, int ncpus);
-void perf_evsel__free_counts(struct perf_evsel *evsel);
  void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads);
  
  void __perf_evsel__set_sample_bit(struct perf_evsel *evsel,
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c

index 918fd8ae2d80bca007a8b6f7e2f8488fed0771a3..21a77e7a171e8aa0664d5caf0737f141bc96e62f 100644 (file)
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -869,6 +869,20 @@ static int write_branch_stack(int fd __maybe_unused,
         return 0;
  }
  
+static int write_auxtrace(int fd, struct perf_header *h,
+                         struct perf_evlist *evlist __maybe_unused)
+{
+       struct perf_session *session;
+       int err;
+
+       session = container_of(h, struct perf_session, header);
+
+       err = auxtrace_index__write(fd, &session->auxtrace_index);
+       if (err < 0)
+               pr_err("Failed to write auxtrace index\n");
+       return err;
+}
+
  static void print_hostname(struct perf_header *ph, int fd __maybe_unused,
                            FILE *fp)
  {
@@ -1151,6 +1165,12 @@ static void print_branch_stack(struct perf_header *ph __maybe_unused,
         fprintf(fp, "# contains samples with branch stack\n");
  }
  
+static void print_auxtrace(struct perf_header *ph __maybe_unused,
+                          int fd __maybe_unused, FILE *fp)
+{
+       fprintf(fp, "# contains AUX area data (e.g. instruction trace)\n");
+}
+
  static void print_pmu_mappings(struct perf_header *ph, int fd __maybe_unused,
                                FILE *fp)
  {
@@ -1218,9 +1238,8 @@ static int __event_process_build_id(struct build_id_event *bev,
                                     struct perf_session *session)
  {
         int err = -1;
-       struct dsos *dsos;
         struct machine *machine;
-       u16 misc;
+       u16 cpumode;
         struct dso *dso;
         enum dso_kernel_type dso_type;
  
@@ -1228,39 +1247,37 @@ static int __event_process_build_id(struct build_id_event *bev,
         if (!machine)
                 goto out;
  
-       misc = bev->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
+       cpumode = bev->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
  
-       switch (misc) {
+       switch (cpumode) {
         case PERF_RECORD_MISC_KERNEL:
                 dso_type = DSO_TYPE_KERNEL;
-               dsos = &machine->kernel_dsos;
                 break;
         case PERF_RECORD_MISC_GUEST_KERNEL:
                 dso_type = DSO_TYPE_GUEST_KERNEL;
-               dsos = &machine->kernel_dsos;
                 break;
         case PERF_RECORD_MISC_USER:
         case PERF_RECORD_MISC_GUEST_USER:
                 dso_type = DSO_TYPE_USER;
-               dsos = &machine->user_dsos;
                 break;
         default:
                 goto out;
         }
  
-       dso = __dsos__findnew(dsos, filename);
+       dso = machine__findnew_dso(machine, filename);
         if (dso != NULL) {
                 char sbuild_id[BUILD_ID_SIZE * 2 + 1];
  
                 dso__set_build_id(dso, &bev->build_id);
  
-               if (!is_kernel_module(filename))
+               if (!is_kernel_module(filename, cpumode))
                         dso->kernel = dso_type;
  
                 build_id__sprintf(dso->build_id, sizeof(dso->build_id),
                                   sbuild_id);
                 pr_debug("build id event received for %s: %s\n",
                          dso->long_name, sbuild_id);
+               dso__put(dso);
         }
  
         err = 0;
@@ -1821,6 +1838,22 @@ out_free:
         return ret;
  }
  
+static int process_auxtrace(struct perf_file_section *section,
+                           struct perf_header *ph, int fd,
+                           void *data __maybe_unused)
+{
+       struct perf_session *session;
+       int err;
+
+       session = container_of(ph, struct perf_session, header);
+
+       err = auxtrace_index__process(fd, section->size, session,
+                                     ph->needs_swap);
+       if (err < 0)
+               pr_err("Failed to process auxtrace index\n");
+       return err;
+}
+
  struct feature_ops {
         int (*write)(int fd, struct perf_header *h, struct perf_evlist *evlist);
         void (*print)(struct perf_header *h, int fd, FILE *fp);
@@ -1861,6 +1894,7 @@ static const struct feature_ops feat_ops[HEADER_LAST_FEATURE] = {
         FEAT_OPA(HEADER_BRANCH_STACK,   branch_stack),
         FEAT_OPP(HEADER_PMU_MAPPINGS,   pmu_mappings),
         FEAT_OPP(HEADER_GROUP_DESC,     group_desc),
+       FEAT_OPP(HEADER_AUXTRACE,       auxtrace),
  };
  
  struct header_print_data {
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h

index 3bb90ac172a1ba0c8429494f4ebf501909a55274..d4d57962c59129d6121b61921b1c1bf58fe2d12a 100644 (file)
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -30,6 +30,7 @@ enum {
         HEADER_BRANCH_STACK,
         HEADER_PMU_MAPPINGS,
         HEADER_GROUP_DESC,
+       HEADER_AUXTRACE,
         HEADER_LAST_FEATURE,
         HEADER_FEAT_BITS        = 256,
  };
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c

index cc22b9158b93c41fd0d44cd451ba189c56bc19dd..6f28d53d4e46093293e71363d9aa5e7c1e0b23f5 100644 (file)
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -313,8 +313,7 @@ static struct hist_entry *hist_entry__new(struct hist_entry *template,
                                 memset(&he->stat, 0, sizeof(he->stat));
                 }
  
-               if (he->ms.map)
-                       he->ms.map->referenced = true;
+               map__get(he->ms.map);
  
                 if (he->branch_info) {
                         /*
@@ -324,6 +323,7 @@ static struct hist_entry *hist_entry__new(struct hist_entry *template,
                          */
                         he->branch_info = malloc(sizeof(*he->branch_info));
                         if (he->branch_info == NULL) {
+                               map__zput(he->ms.map);
                                 free(he->stat_acc);
                                 free(he);
                                 return NULL;
@@ -332,17 +332,13 @@ static struct hist_entry *hist_entry__new(struct hist_entry *template,
                         memcpy(he->branch_info, template->branch_info,
                                sizeof(*he->branch_info));
  
-                       if (he->branch_info->from.map)
-                               he->branch_info->from.map->referenced = true;
-                       if (he->branch_info->to.map)
-                               he->branch_info->to.map->referenced = true;
+                       map__get(he->branch_info->from.map);
+                       map__get(he->branch_info->to.map);
                 }
  
                 if (he->mem_info) {
-                       if (he->mem_info->iaddr.map)
-                               he->mem_info->iaddr.map->referenced = true;
-                       if (he->mem_info->daddr.map)
-                               he->mem_info->daddr.map->referenced = true;
+                       map__get(he->mem_info->iaddr.map);
+                       map__get(he->mem_info->daddr.map);
                 }
  
                 if (symbol_conf.use_callchain)
@@ -362,10 +358,10 @@ static u8 symbol__parent_filter(const struct symbol *parent)
         return 0;
  }
  
-static struct hist_entry *add_hist_entry(struct hists *hists,
-                                        struct hist_entry *entry,
-                                        struct addr_location *al,
-                                        bool sample_self)
+static struct hist_entry *hists__findnew_entry(struct hists *hists,
+                                              struct hist_entry *entry,
+                                              struct addr_location *al,
+                                              bool sample_self)
  {
         struct rb_node **p;
         struct rb_node *parent = NULL;
@@ -407,9 +403,8 @@ static struct hist_entry *add_hist_entry(struct hists *hists,
                          * the history counter to increment.
                          */
                         if (he->ms.map != entry->ms.map) {
-                               he->ms.map = entry->ms.map;
-                               if (he->ms.map)
-                                       he->ms.map->referenced = true;
+                               map__put(he->ms.map);
+                               he->ms.map = map__get(entry->ms.map);
                         }
                         goto out;
                 }
@@ -468,7 +463,7 @@ struct hist_entry *__hists__add_entry(struct hists *hists,
                 .transaction = transaction,
         };
  
-       return add_hist_entry(hists, &entry, al, sample_self);
+       return hists__findnew_entry(hists, &entry, al, sample_self);
  }
  
  static int
@@ -548,9 +543,9 @@ iter_finish_mem_entry(struct hist_entry_iter *iter,
  
  out:
         /*
-        * We don't need to free iter->priv (mem_info) here since
-        * the mem info was either already freed in add_hist_entry() or
-        * passed to a new hist entry by hist_entry__new().
+        * We don't need to free iter->priv (mem_info) here since the mem info
+        * was either already freed in hists__findnew_entry() or passed to a
+        * new hist entry by hist_entry__new().
          */
         iter->priv = NULL;
  
@@ -851,19 +846,15 @@ const struct hist_iter_ops hist_iter_cumulative = {
  };
  
  int hist_entry_iter__add(struct hist_entry_iter *iter, struct addr_location *al,
-                        struct perf_evsel *evsel, struct perf_sample *sample,
                          int max_stack_depth, void *arg)
  {
         int err, err2;
  
-       err = sample__resolve_callchain(sample, &iter->parent, evsel, al,
-                                       max_stack_depth);
+       err = sample__resolve_callchain(iter->sample, &iter->parent,
+                                       iter->evsel, al, max_stack_depth);
         if (err)
                 return err;
  
-       iter->evsel = evsel;
-       iter->sample = sample;
-
         err = iter->ops->prepare_entry(iter, al);
         if (err)
                 goto out;
@@ -937,8 +928,20 @@ hist_entry__collapse(struct hist_entry *left, struct hist_entry *right)
  void hist_entry__delete(struct hist_entry *he)
  {
         thread__zput(he->thread);
-       zfree(&he->branch_info);
-       zfree(&he->mem_info);
+       map__zput(he->ms.map);
+
+       if (he->branch_info) {
+               map__zput(he->branch_info->from.map);
+               map__zput(he->branch_info->to.map);
+               zfree(&he->branch_info);
+       }
+
+       if (he->mem_info) {
+               map__zput(he->mem_info->iaddr.map);
+               map__zput(he->mem_info->daddr.map);
+               zfree(&he->mem_info);
+       }
+
         zfree(&he->stat_acc);
         free_srcline(he->srcline);
         free_callchain(he->callchain);
@@ -1163,7 +1166,7 @@ static void hists__remove_entry_filter(struct hists *hists, struct hist_entry *h
                 return;
  
         /* force fold unfiltered entry for simplicity */
-       h->ms.unfolded = false;
+       h->unfolded = false;
         h->row_offset = 0;
         h->nr_rows = 0;
  
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h

index 9f31b89a527a2e8f9d02da1993cdff13709b84c1..5ed8d9c229814d9c6942ce3528898bbd9de1cb79 100644 (file)
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -111,7 +111,6 @@ struct hist_entry *__hists__add_entry(struct hists *hists,
                                       u64 weight, u64 transaction,
                                       bool sample_self);
  int hist_entry_iter__add(struct hist_entry_iter *iter, struct addr_location *al,
-                        struct perf_evsel *evsel, struct perf_sample *sample,
                          int max_stack_depth, void *arg);
  
  int64_t hist_entry__cmp(struct hist_entry *left, struct hist_entry *right);
diff --git a/tools/perf/util/include/linux/kernel.h b/tools/perf/util/include/linux/kernel.h

deleted file mode 100644 (file)

index 09e8e7a..0000000
--- a/tools/perf/util/include/linux/kernel.h
+++ /dev/null
@@ -1,107 +0,0 @@
-#ifndef PERF_LINUX_KERNEL_H_
-#define PERF_LINUX_KERNEL_H_
-
-#include <stdarg.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-
-#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
-
-#define PERF_ALIGN(x, a)       __PERF_ALIGN_MASK(x, (typeof(x))(a)-1)
-#define __PERF_ALIGN_MASK(x, mask)     (((x)+(mask))&~(mask))
-
-#ifndef offsetof
-#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
-#endif
-
-#ifndef container_of
-/**
- * container_of - cast a member of a structure out to the containing structure
- * @ptr:       the pointer to the member.
- * @type:      the type of the container struct this is embedded in.
- * @member:    the name of the member within the struct.
- *
- */
-#define container_of(ptr, type, member) ({                     \
-       const typeof(((type *)0)->member) * __mptr = (ptr);     \
-       (type *)((char *)__mptr - offsetof(type, member)); })
-#endif
-
-#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); }))
-
-#ifndef max
-#define max(x, y) ({                           \
-       typeof(x) _max1 = (x);                  \
-       typeof(y) _max2 = (y);                  \
-       (void) (&_max1 == &_max2);              \
-       _max1 > _max2 ? _max1 : _max2; })
-#endif
-
-#ifndef min
-#define min(x, y) ({                           \
-       typeof(x) _min1 = (x);                  \
-       typeof(y) _min2 = (y);                  \
-       (void) (&_min1 == &_min2);              \
-       _min1 < _min2 ? _min1 : _min2; })
-#endif
-
-#ifndef roundup
-#define roundup(x, y) (                                \
-{                                                      \
-       const typeof(y) __y = y;                       \
-       (((x) + (__y - 1)) / __y) * __y;               \
-}                                                      \
-)
-#endif
-
-#ifndef BUG_ON
-#ifdef NDEBUG
-#define BUG_ON(cond) do { if (cond) {} } while (0)
-#else
-#define BUG_ON(cond) assert(!(cond))
-#endif
-#endif
-
-/*
- * Both need more care to handle endianness
- * (Don't use bitmap_copy_le() for now)
- */
-#define cpu_to_le64(x) (x)
-#define cpu_to_le32(x) (x)
-
-static inline int
-vscnprintf(char *buf, size_t size, const char *fmt, va_list args)
-{
-       int i;
-       ssize_t ssize = size;
-
-       i = vsnprintf(buf, size, fmt, args);
-
-       return (i >= ssize) ? (ssize - 1) : i;
-}
-
-static inline int scnprintf(char * buf, size_t size, const char * fmt, ...)
-{
-       va_list args;
-       ssize_t ssize = size;
-       int i;
-
-       va_start(args, fmt);
-       i = vsnprintf(buf, size, fmt, args);
-       va_end(args);
-
-       return (i >= ssize) ? (ssize - 1) : i;
-}
-
-/*
- * This looks more complex than it should be. But we need to
- * get the type for the ~ right in round_down (it needs to be
- * as wide as the result!), and we want to evaluate the macro
- * arguments just once each.
- */
-#define __round_mask(x, y) ((__typeof__(x))((y)-1))
-#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
-#define round_down(x, y) ((x) & ~__round_mask(x, y))
-
-#endif
diff --git a/tools/perf/util/include/linux/list.h b/tools/perf/util/include/linux/list.h

deleted file mode 100644 (file)

index 76ddbc7..0000000
--- a/tools/perf/util/include/linux/list.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/types.h>
-
-#include "../../../../include/linux/list.h"
-
-#ifndef PERF_LIST_H
-#define PERF_LIST_H
-/**
- * list_del_range - deletes range of entries from list.
- * @begin: first element in the range to delete from the list.
- * @end: last element in the range to delete from the list.
- * Note: list_empty on the range of entries does not return true after this,
- * the entries is in an undefined state.
- */
-static inline void list_del_range(struct list_head *begin,
-                                 struct list_head *end)
-{
-       begin->prev->next = end->next;
-       end->next->prev = begin->prev;
-}
-
-/**
- * list_for_each_from  -       iterate over a list from one of its nodes
- * @pos:  the &struct list_head to use as a loop cursor, from where to start
- * @head: the head for your list.
- */
-#define list_for_each_from(pos, head) \
-       for (; pos != (head); pos = pos->next)
-#endif
diff --git a/tools/perf/util/include/linux/poison.h b/tools/perf/util/include/linux/poison.h

deleted file mode 100644 (file)

index fef6dbc..0000000
--- a/tools/perf/util/include/linux/poison.h
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../../include/linux/poison.h"
diff --git a/tools/perf/util/include/linux/rbtree.h b/tools/perf/util/include/linux/rbtree.h

index 2a030c5af3aa2062082d87b1604b9980a1006844..f06d89f0b8678d8407c91db17e8c26250376c5a1 100644 (file)
--- a/tools/perf/util/include/linux/rbtree.h
+++ b/tools/perf/util/include/linux/rbtree.h
@@ -1,2 +1,16 @@
+#ifndef __TOOLS_LINUX_PERF_RBTREE_H
+#define __TOOLS_LINUX_PERF_RBTREE_H
  #include <stdbool.h>
  #include "../../../../include/linux/rbtree.h"
+
+/*
+ * rb_erase_init - erase a node from an rbtree and leave it cleared
+ * @n:    node to erase
+ * @root: tree that @n is erased from
+ *
+ * Calls rb_erase() and then RB_CLEAR_NODE() on @n.
+ *
+ * Handy for checking that we are not deleting an entry that is
+ * already in a list, found in block/{blk-throttle,cfq-iosched}.c,
+ * probably should be moved to lib/rbtree.c...
+ */
+static inline void rb_erase_init(struct rb_node *n, struct rb_root *root)
+{
+       rb_erase(n, root);
+       RB_CLEAR_NODE(n);
+}
+#endif /* __TOOLS_LINUX_PERF_RBTREE_H */
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c

index 527e032e24f6e648e258b08b379e55fc6dcf8e5a..4744673aff1b287de3a091a40edade2a709a8e52 100644 (file)
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -14,20 +14,23 @@
  #include "unwind.h"
  #include "linux/hash.h"
  
+static void __machine__remove_thread(struct machine *machine, struct thread *th, bool lock);
+
  static void dsos__init(struct dsos *dsos)
  {
         INIT_LIST_HEAD(&dsos->head);
         dsos->root = RB_ROOT;
+       pthread_rwlock_init(&dsos->lock, NULL);
  }
  
  int machine__init(struct machine *machine, const char *root_dir, pid_t pid)
  {
         map_groups__init(&machine->kmaps, machine);
         RB_CLEAR_NODE(&machine->rb_node);
-       dsos__init(&machine->user_dsos);
-       dsos__init(&machine->kernel_dsos);
+       dsos__init(&machine->dsos);
  
         machine->threads = RB_ROOT;
+       pthread_rwlock_init(&machine->threads_lock, NULL);
         INIT_LIST_HEAD(&machine->dead_threads);
         machine->last_match = NULL;
  
@@ -54,6 +57,7 @@ int machine__init(struct machine *machine, const char *root_dir, pid_t pid)
  
                 snprintf(comm, sizeof(comm), "[guest/%d]", pid);
                 thread__set_comm(thread, comm, 0);
+               thread__put(thread);
         }
  
         machine->current_tid = NULL;
@@ -78,37 +82,50 @@ out_delete:
         return NULL;
  }
  
-static void dsos__delete(struct dsos *dsos)
+static void dsos__purge(struct dsos *dsos)
  {
         struct dso *pos, *n;
  
+       pthread_rwlock_wrlock(&dsos->lock);
+
         list_for_each_entry_safe(pos, n, &dsos->head, node) {
                 RB_CLEAR_NODE(&pos->rb_node);
-               list_del(&pos->node);
-               dso__delete(pos);
+               list_del_init(&pos->node);
+               dso__put(pos);
         }
+
+       pthread_rwlock_unlock(&dsos->lock);
+}
+
+/*
+ * Tear down @dsos: dsos__purge() empties the list (dropping one reference per
+ * entry with dso__put()), then the rwlock set up in dsos__init() is destroyed.
+ */
+static void dsos__exit(struct dsos *dsos)
+{
+       dsos__purge(dsos);
+       pthread_rwlock_destroy(&dsos->lock);
+}
  
  void machine__delete_threads(struct machine *machine)
  {
-       struct rb_node *nd = rb_first(&machine->threads);
+       struct rb_node *nd;
  
+       pthread_rwlock_wrlock(&machine->threads_lock);
+       nd = rb_first(&machine->threads);
         while (nd) {
                 struct thread *t = rb_entry(nd, struct thread, rb_node);
  
                 nd = rb_next(nd);
-               machine__remove_thread(machine, t);
+               __machine__remove_thread(machine, t, false);
         }
+       pthread_rwlock_unlock(&machine->threads_lock);
  }
  
  void machine__exit(struct machine *machine)
  {
         map_groups__exit(&machine->kmaps);
-       dsos__delete(&machine->user_dsos);
-       dsos__delete(&machine->kernel_dsos);
-       vdso__exit(machine);
+       dsos__exit(&machine->dsos);
+       machine__exit_vdso(machine);
         zfree(&machine->root_dir);
         zfree(&machine->current_tid);
+       pthread_rwlock_destroy(&machine->threads_lock);
  }
  
  void machine__delete(struct machine *machine)
@@ -303,7 +320,7 @@ static void machine__update_thread_pid(struct machine *machine,
         if (th->pid_ == th->tid)
                 return;
  
-       leader = machine__findnew_thread(machine, th->pid_, th->pid_);
+       leader = __machine__findnew_thread(machine, th->pid_, th->pid_);
         if (!leader)
                 goto out_err;
  
@@ -325,7 +342,7 @@ static void machine__update_thread_pid(struct machine *machine,
                 if (!map_groups__empty(th->mg))
                         pr_err("Discarding thread maps for %d:%d\n",
                                th->pid_, th->tid);
-               map_groups__delete(th->mg);
+               map_groups__put(th->mg);
         }
  
         th->mg = map_groups__get(leader->mg);
@@ -336,9 +353,9 @@ out_err:
         pr_err("Failed to join map groups for %d:%d\n", th->pid_, th->tid);
  }
  
-static struct thread *__machine__findnew_thread(struct machine *machine,
-                                               pid_t pid, pid_t tid,
-                                               bool create)
+static struct thread *____machine__findnew_thread(struct machine *machine,
+                                                 pid_t pid, pid_t tid,
+                                                 bool create)
  {
         struct rb_node **p = &machine->threads.rb_node;
         struct rb_node *parent = NULL;
@@ -356,7 +373,7 @@ static struct thread *__machine__findnew_thread(struct machine *machine,
                         return th;
                 }
  
-               thread__zput(machine->last_match);
+               machine->last_match = NULL;
         }
  
         while (*p != NULL) {
@@ -364,7 +381,7 @@ static struct thread *__machine__findnew_thread(struct machine *machine,
                 th = rb_entry(parent, struct thread, rb_node);
  
                 if (th->tid == tid) {
-                       machine->last_match = thread__get(th);
+                       machine->last_match = th;
                         machine__update_thread_pid(machine, th, pid);
                         return th;
                 }
@@ -392,7 +409,8 @@ static struct thread *__machine__findnew_thread(struct machine *machine,
                  * leader and that would screwed the rb tree.
                  */
                 if (thread__init_map_groups(th, machine)) {
-                       rb_erase(&th->rb_node, &machine->threads);
+                       rb_erase_init(&th->rb_node, &machine->threads);
+                       RB_CLEAR_NODE(&th->rb_node);
                         thread__delete(th);
                         return NULL;
                 }
@@ -400,22 +418,36 @@ static struct thread *__machine__findnew_thread(struct machine *machine,
                  * It is now in the rbtree, get a ref
                  */
                 thread__get(th);
-               machine->last_match = thread__get(th);
+               machine->last_match = th;
         }
  
         return th;
  }
  
+/*
+ * Find-or-create variant of ____machine__findnew_thread() (create == true).
+ *
+ * Takes no lock and adds no thread__get() of its own on the returned thread;
+ * machine__findnew_thread() is the wrapper that does both around this call.
+ */
+struct thread *__machine__findnew_thread(struct machine *machine, pid_t pid, pid_t tid)
+{
+       return ____machine__findnew_thread(machine, pid, tid, true);
+}
+
  struct thread *machine__findnew_thread(struct machine *machine, pid_t pid,
                                        pid_t tid)
  {
-       return __machine__findnew_thread(machine, pid, tid, true);
+       struct thread *th;
+
+       pthread_rwlock_wrlock(&machine->threads_lock);
+       th = thread__get(__machine__findnew_thread(machine, pid, tid));
+       pthread_rwlock_unlock(&machine->threads_lock);
+       return th;
  }
  
  struct thread *machine__find_thread(struct machine *machine, pid_t pid,
                                     pid_t tid)
  {
-       return __machine__findnew_thread(machine, pid, tid, false);
+       struct thread *th;
+       pthread_rwlock_rdlock(&machine->threads_lock);
+       th =  thread__get(____machine__findnew_thread(machine, pid, tid, false));
+       pthread_rwlock_unlock(&machine->threads_lock);
+       return th;
  }
  
  struct comm *machine__thread_exec_comm(struct machine *machine,
@@ -434,6 +466,7 @@ int machine__process_comm_event(struct machine *machine, union perf_event *event
                                                         event->comm.pid,
                                                         event->comm.tid);
         bool exec = event->header.misc & PERF_RECORD_MISC_COMM_EXEC;
+       int err = 0;
  
         if (exec)
                 machine->comm_exec = true;
@@ -444,10 +477,12 @@ int machine__process_comm_event(struct machine *machine, union perf_event *event
         if (thread == NULL ||
             __thread__set_comm(thread, event->comm.comm, sample->time, exec)) {
                 dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n");
-               return -1;
+               err = -1;
         }
  
-       return 0;
+       thread__put(thread);
+
+       return err;
  }
  
  int machine__process_lost_event(struct machine *machine __maybe_unused,
@@ -458,17 +493,27 @@ int machine__process_lost_event(struct machine *machine __maybe_unused,
         return 0;
  }
  
-static struct dso*
-machine__module_dso(struct machine *machine, struct kmod_path *m,
-                   const char *filename)
+/*
+ * Dump the sample id and the event's lost_samples.lost count through
+ * dump_printf().  @machine is unused and 0 is always returned.
+ */
+int machine__process_lost_samples_event(struct machine *machine __maybe_unused,
+                                       union perf_event *event, struct perf_sample *sample)
+{
+       dump_printf(": id:%" PRIu64 ": lost samples :%" PRIu64 "\n",
+                   sample->id, event->lost_samples.lost);
+       return 0;
+}
+
+static struct dso *machine__findnew_module_dso(struct machine *machine,
+                                              struct kmod_path *m,
+                                              const char *filename)
  {
         struct dso *dso;
  
-       dso = dsos__find(&machine->kernel_dsos, m->name, true);
+       pthread_rwlock_wrlock(&machine->dsos.lock);
+
+       dso = __dsos__find(&machine->dsos, m->name, true);
         if (!dso) {
-               dso = dsos__addnew(&machine->kernel_dsos, m->name);
+               dso = __dsos__addnew(&machine->dsos, m->name);
                 if (dso == NULL)
-                       return NULL;
+                       goto out_unlock;
  
                 if (machine__is_host(machine))
                         dso->symtab_type = DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE;
@@ -483,11 +528,30 @@ machine__module_dso(struct machine *machine, struct kmod_path *m,
                 dso__set_long_name(dso, strdup(filename), true);
         }
  
+       dso__get(dso);
+out_unlock:
+       pthread_rwlock_unlock(&machine->dsos.lock);
         return dso;
  }
  
-struct map *machine__new_module(struct machine *machine, u64 start,
-                               const char *filename)
+/*
+ * Print @event to stdout with perf_event__fprintf_aux() when dump_trace is
+ * set; otherwise do nothing.  @machine is unused.  Always returns 0.
+ */
+int machine__process_aux_event(struct machine *machine __maybe_unused,
+                              union perf_event *event)
+{
+       if (!dump_trace)
+               return 0;
+
+       perf_event__fprintf_aux(event, stdout);
+       return 0;
+}
+
+/*
+ * Print @event to stdout with perf_event__fprintf_itrace_start() when
+ * dump_trace is set; otherwise do nothing.  @machine is unused.
+ * Always returns 0.
+ */
+int machine__process_itrace_start_event(struct machine *machine __maybe_unused,
+                                       union perf_event *event)
+{
+       if (!dump_trace)
+               return 0;
+
+       perf_event__fprintf_itrace_start(event, stdout);
+       return 0;
+}
+
+struct map *machine__findnew_module_map(struct machine *machine, u64 start,
+                                       const char *filename)
  {
         struct map *map = NULL;
         struct dso *dso;
@@ -501,7 +565,7 @@ struct map *machine__new_module(struct machine *machine, u64 start,
         if (map)
                 goto out;
  
-       dso = machine__module_dso(machine, &m, filename);
+       dso = machine__findnew_module_dso(machine, &m, filename);
         if (dso == NULL)
                 goto out;
  
@@ -519,13 +583,11 @@ out:
  size_t machines__fprintf_dsos(struct machines *machines, FILE *fp)
  {
         struct rb_node *nd;
-       size_t ret = __dsos__fprintf(&machines->host.kernel_dsos.head, fp) +
-                    __dsos__fprintf(&machines->host.user_dsos.head, fp);
+       size_t ret = __dsos__fprintf(&machines->host.dsos.head, fp);
  
         for (nd = rb_first(&machines->guests); nd; nd = rb_next(nd)) {
                 struct machine *pos = rb_entry(nd, struct machine, rb_node);
-               ret += __dsos__fprintf(&pos->kernel_dsos.head, fp);
-               ret += __dsos__fprintf(&pos->user_dsos.head, fp);
+               ret += __dsos__fprintf(&pos->dsos.head, fp);
         }
  
         return ret;
@@ -534,8 +596,7 @@ size_t machines__fprintf_dsos(struct machines *machines, FILE *fp)
  size_t machine__fprintf_dsos_buildid(struct machine *m, FILE *fp,
                                      bool (skip)(struct dso *dso, int parm), int parm)
  {
-       return __dsos__fprintf_buildid(&m->kernel_dsos.head, fp, skip, parm) +
-              __dsos__fprintf_buildid(&m->user_dsos.head, fp, skip, parm);
+       return __dsos__fprintf_buildid(&m->dsos.head, fp, skip, parm);
  }
  
  size_t machines__fprintf_dsos_buildid(struct machines *machines, FILE *fp,
@@ -575,12 +636,16 @@ size_t machine__fprintf(struct machine *machine, FILE *fp)
         size_t ret = 0;
         struct rb_node *nd;
  
+       pthread_rwlock_rdlock(&machine->threads_lock);
+
         for (nd = rb_first(&machine->threads); nd; nd = rb_next(nd)) {
                 struct thread *pos = rb_entry(nd, struct thread, rb_node);
  
                 ret += thread__fprintf(pos, fp);
         }
  
+       pthread_rwlock_unlock(&machine->threads_lock);
+
         return ret;
  }
  
@@ -594,9 +659,8 @@ static struct dso *machine__get_kernel(struct machine *machine)
                 if (!vmlinux_name)
                         vmlinux_name = "[kernel.kallsyms]";
  
-               kernel = dso__kernel_findnew(machine, vmlinux_name,
-                                            "[kernel]",
-                                            DSO_TYPE_KERNEL);
+               kernel = machine__findnew_kernel(machine, vmlinux_name,
+                                                "[kernel]", DSO_TYPE_KERNEL);
         } else {
                 char bf[PATH_MAX];
  
@@ -606,9 +670,9 @@ static struct dso *machine__get_kernel(struct machine *machine)
                         vmlinux_name = machine__mmap_name(machine, bf,
                                                           sizeof(bf));
  
-               kernel = dso__kernel_findnew(machine, vmlinux_name,
-                                            "[guest.kernel]",
-                                            DSO_TYPE_GUEST_KERNEL);
+               kernel = machine__findnew_kernel(machine, vmlinux_name,
+                                                "[guest.kernel]",
+                                                DSO_TYPE_GUEST_KERNEL);
         }
  
         if (kernel != NULL && (!kernel->has_build_id))
@@ -713,7 +777,6 @@ void machine__destroy_kernel_maps(struct machine *machine)
                                 kmap->ref_reloc_sym = NULL;
                 }
  
-               map__delete(machine->vmlinux_maps[type]);
                 machine->vmlinux_maps[type] = NULL;
         }
  }
@@ -970,7 +1033,7 @@ static int machine__create_module(void *arg, const char *name, u64 start)
         struct machine *machine = arg;
         struct map *map;
  
-       map = machine__new_module(machine, start, name);
+       map = machine__findnew_module_map(machine, start, name);
         if (map == NULL)
                 return -1;
  
@@ -1062,7 +1125,7 @@ static bool machine__uses_kcore(struct machine *machine)
  {
         struct dso *dso;
  
-       list_for_each_entry(dso, &machine->kernel_dsos.head, node) {
+       list_for_each_entry(dso, &machine->dsos.head, node) {
                 if (dso__is_kcore(dso))
                         return true;
         }
@@ -1093,8 +1156,8 @@ static int machine__process_kernel_mmap_event(struct machine *machine,
                                 strlen(kmmap_prefix) - 1) == 0;
         if (event->mmap.filename[0] == '/' ||
             (!is_kernel_mmap && event->mmap.filename[0] == '[')) {
-               map = machine__new_module(machine, event->mmap.start,
-                                         event->mmap.filename);
+               map = machine__findnew_module_map(machine, event->mmap.start,
+                                                 event->mmap.filename);
                 if (map == NULL)
                         goto out_problem;
  
@@ -1109,23 +1172,48 @@ static int machine__process_kernel_mmap_event(struct machine *machine,
                 struct dso *kernel = NULL;
                 struct dso *dso;
  
-               list_for_each_entry(dso, &machine->kernel_dsos.head, node) {
-                       if (is_kernel_module(dso->long_name))
+               pthread_rwlock_rdlock(&machine->dsos.lock);
+
+               list_for_each_entry(dso, &machine->dsos.head, node) {
+
+                       /*
+                        * The cpumode passed to is_kernel_module() is not the
+                        * cpumode of *this* event. If we insisted on passing
+                        * the correct cpumode to is_kernel_module(), we would
+                        * have to record the cpumode when adding this dso to
+                        * the linked list.
+                        *
+                        * However, we don't really need to pass the correct
+                        * cpumode.  We know the correct cpumode must be kernel
+                        * mode, because dsos that are not marked as kernel
+                        * ones are skipped by the !dso->kernel test below.
+                        *
+                        * Therefore, we pass PERF_RECORD_MISC_CPUMODE_UNKNOWN.
+                        * is_kernel_module() treats it as a kernel cpumode.
+                        */
+
+                       if (!dso->kernel ||
+                           is_kernel_module(dso->long_name,
+                                            PERF_RECORD_MISC_CPUMODE_UNKNOWN))
                                 continue;
  
+
                         kernel = dso;
                         break;
                 }
  
+               pthread_rwlock_unlock(&machine->dsos.lock);
+
                 if (kernel == NULL)
-                       kernel = __dsos__findnew(&machine->kernel_dsos,
-                                                kmmap_prefix);
+                       kernel = machine__findnew_dso(machine, kmmap_prefix);
                 if (kernel == NULL)
                         goto out_problem;
  
                 kernel->kernel = kernel_type;
-               if (__machine__create_kernel_maps(machine, kernel) < 0)
+               if (__machine__create_kernel_maps(machine, kernel) < 0) {
+                       dso__put(kernel);
                         goto out_problem;
+               }
  
                 if (strstr(kernel->long_name, "vmlinux"))
                         dso__set_short_name(kernel, "[kernel.vmlinux]", false);
@@ -1197,11 +1285,15 @@ int machine__process_mmap2_event(struct machine *machine,
                         event->mmap2.filename, type, thread);
  
         if (map == NULL)
-               goto out_problem;
+               goto out_problem_map;
  
         thread__insert_map(thread, map);
+       thread__put(thread);
+       map__put(map);
         return 0;
  
+out_problem_map:
+       thread__put(thread);
  out_problem:
         dump_printf("problem processing PERF_RECORD_MMAP2, skipping event.\n");
         return 0;
@@ -1244,31 +1336,46 @@ int machine__process_mmap_event(struct machine *machine, union perf_event *event
                         type, thread);
  
         if (map == NULL)
-               goto out_problem;
+               goto out_problem_map;
  
         thread__insert_map(thread, map);
+       thread__put(thread);
+       map__put(map);
         return 0;
  
+out_problem_map:
+       thread__put(thread);
  out_problem:
         dump_printf("problem processing PERF_RECORD_MMAP, skipping event.\n");
         return 0;
  }
  
-void machine__remove_thread(struct machine *machine, struct thread *th)
+static void __machine__remove_thread(struct machine *machine, struct thread *th, bool lock)
  {
         if (machine->last_match == th)
-               thread__zput(machine->last_match);
+               machine->last_match = NULL;
  
-       rb_erase(&th->rb_node, &machine->threads);
+       BUG_ON(atomic_read(&th->refcnt) == 0);
+       if (lock)
+               pthread_rwlock_wrlock(&machine->threads_lock);
+       rb_erase_init(&th->rb_node, &machine->threads);
+       RB_CLEAR_NODE(&th->rb_node);
         /*
          * Move it first to the dead_threads list, then drop the reference,
          * if this is the last reference, then the thread__delete destructor
          * will be called and we will remove it from the dead_threads list.
          */
         list_add_tail(&th->node, &machine->dead_threads);
+       if (lock)
+               pthread_rwlock_unlock(&machine->threads_lock);
         thread__put(th);
  }
  
+/*
+ * Remove @th from @machine, with locking.
+ *
+ * Calls __machine__remove_thread() with lock == true, so that helper takes
+ * machine->threads_lock for writing around the rbtree/dead_threads update.
+ *
+ * The helper returns void, so it is called as a plain statement: ISO C does
+ * not allow 'return <expression>;' in a function returning void.
+ */
+void machine__remove_thread(struct machine *machine, struct thread *th)
+{
+       __machine__remove_thread(machine, th, true);
+}
+
  int machine__process_fork_event(struct machine *machine, union perf_event *event,
                                 struct perf_sample *sample)
  {
@@ -1278,10 +1385,13 @@ int machine__process_fork_event(struct machine *machine, union perf_event *event
         struct thread *parent = machine__findnew_thread(machine,
                                                         event->fork.ppid,
                                                         event->fork.ptid);
+       int err = 0;
  
         /* if a thread currently exists for the thread id remove it */
-       if (thread != NULL)
+       if (thread != NULL) {
                 machine__remove_thread(machine, thread);
+               thread__put(thread);
+       }
  
         thread = machine__findnew_thread(machine, event->fork.pid,
                                          event->fork.tid);
@@ -1291,10 +1401,12 @@ int machine__process_fork_event(struct machine *machine, union perf_event *event
         if (thread == NULL || parent == NULL ||
             thread__fork(thread, parent, sample->time) < 0) {
                 dump_printf("problem processing PERF_RECORD_FORK, skipping event.\n");
-               return -1;
+               err = -1;
         }
+       thread__put(thread);
+       thread__put(parent);
  
-       return 0;
+       return err;
  }
  
  int machine__process_exit_event(struct machine *machine, union perf_event *event,
@@ -1307,8 +1419,10 @@ int machine__process_exit_event(struct machine *machine, union perf_event *event
         if (dump_trace)
                 perf_event__fprintf_task(event, stdout);
  
-       if (thread != NULL)
+       if (thread != NULL) {
                 thread__exited(thread);
+               thread__put(thread);
+       }
  
         return 0;
  }
@@ -1331,6 +1445,13 @@ int machine__process_event(struct machine *machine, union perf_event *event,
                 ret = machine__process_exit_event(machine, event, sample); break;
         case PERF_RECORD_LOST:
                 ret = machine__process_lost_event(machine, event, sample); break;
+       case PERF_RECORD_AUX:
+               ret = machine__process_aux_event(machine, event); break;
+       case PERF_RECORD_ITRACE_START:
+               ret = machine__process_itrace_start_event(machine, event);
+       case PERF_RECORD_LOST_SAMPLES:
+               ret = machine__process_lost_samples_event(machine, event, sample); break;
+               break;
         default:
                 ret = -1;
                 break;
@@ -1769,14 +1890,36 @@ int machine__for_each_thread(struct machine *machine,
         return rc;
  }
  
+int machines__for_each_thread(struct machines *machines,
+                             int (*fn)(struct thread *thread, void *p),
+                             void *priv)
+{
+       struct rb_node *nd;
+       int rc = 0;
+
+       rc = machine__for_each_thread(&machines->host, fn, priv);
+       if (rc != 0)
+               return rc;
+
+       for (nd = rb_first(&machines->guests); nd; nd = rb_next(nd)) {
+               struct machine *machine = rb_entry(nd, struct machine, rb_node);
+
+               rc = machine__for_each_thread(machine, fn, priv);
+               if (rc != 0)
+                       return rc;
+       }
+       return rc;
+}
+
  int __machine__synthesize_threads(struct machine *machine, struct perf_tool *tool,
                                   struct target *target, struct thread_map *threads,
-                                 perf_event__handler_t process, bool data_mmap)
+                                 perf_event__handler_t process, bool data_mmap,
+                                 unsigned int proc_map_timeout)
  {
         if (target__has_task(target))
-               return perf_event__synthesize_thread_map(tool, threads, process, machine, data_mmap);
+               return perf_event__synthesize_thread_map(tool, threads, process, machine, data_mmap, proc_map_timeout);
         else if (target__has_cpu(target))
-               return perf_event__synthesize_threads(tool, process, machine, data_mmap);
+               return perf_event__synthesize_threads(tool, process, machine, data_mmap, proc_map_timeout);
         /* command specified */
         return 0;
  }
@@ -1820,6 +1963,7 @@ int machine__set_current_tid(struct machine *machine, int cpu, pid_t pid,
                 return -ENOMEM;
  
         thread->cpu = cpu;
+       thread__put(thread);
  
         return 0;
  }
@@ -1845,3 +1989,8 @@ int machine__get_kernel_start(struct machine *machine)
         }
         return err;
  }
+
+struct dso *machine__findnew_dso(struct machine *machine, const char *filename)
+{
+       return dsos__findnew(&machine->dsos, filename);
+}
diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h

index 6d64cedb9d1e8f6fb255068b27761f6c52735b5e..887798e511e9f3dd382109927860269b03744246 100644 (file)
--- a/tools/perf/util/machine.h
+++ b/tools/perf/util/machine.h
@@ -30,11 +30,11 @@ struct machine {
         bool              comm_exec;
         char              *root_dir;
         struct rb_root    threads;
+       pthread_rwlock_t  threads_lock;
         struct list_head  dead_threads;
         struct thread     *last_match;
         struct vdso_info  *vdso_info;
-       struct dsos       user_dsos;
-       struct dsos       kernel_dsos;
+       struct dsos       dsos;
         struct map_groups kmaps;
         struct map        *vmlinux_maps[MAP__NR_TYPES];
         u64               kernel_start;
@@ -81,6 +81,12 @@ int machine__process_fork_event(struct machine *machine, union perf_event *event
                                 struct perf_sample *sample);
  int machine__process_lost_event(struct machine *machine, union perf_event *event,
                                 struct perf_sample *sample);
+int machine__process_lost_samples_event(struct machine *machine, union perf_event *event,
+                                       struct perf_sample *sample);
+int machine__process_aux_event(struct machine *machine,
+                              union perf_event *event);
+int machine__process_itrace_start_event(struct machine *machine,
+                                       union perf_event *event);
  int machine__process_mmap_event(struct machine *machine, union perf_event *event,
                                 struct perf_sample *sample);
  int machine__process_mmap2_event(struct machine *machine, union perf_event *event,
@@ -147,8 +153,10 @@ static inline bool machine__is_host(struct machine *machine)
         return machine ? machine->pid == HOST_KERNEL_ID : false;
  }
  
-struct thread *machine__findnew_thread(struct machine *machine, pid_t pid,
-                                      pid_t tid);
+struct thread *__machine__findnew_thread(struct machine *machine, pid_t pid, pid_t tid);
+struct thread *machine__findnew_thread(struct machine *machine, pid_t pid, pid_t tid);
+
+struct dso *machine__findnew_dso(struct machine *machine, const char *filename);
  
  size_t machine__fprintf(struct machine *machine, FILE *fp);
  
@@ -181,8 +189,8 @@ struct symbol *machine__find_kernel_function_by_name(struct machine *machine,
                                                  filter);
  }
  
-struct map *machine__new_module(struct machine *machine, u64 start,
-                               const char *filename);
+struct map *machine__findnew_module_map(struct machine *machine, u64 start,
+                                       const char *filename);
  
  int machine__load_kallsyms(struct machine *machine, const char *filename,
                            enum map_type type, symbol_filter_t filter);
@@ -208,16 +216,22 @@ size_t machine__fprintf_vmlinux_path(struct machine *machine, FILE *fp);
  int machine__for_each_thread(struct machine *machine,
                              int (*fn)(struct thread *thread, void *p),
                              void *priv);
+int machines__for_each_thread(struct machines *machines,
+                             int (*fn)(struct thread *thread, void *p),
+                             void *priv);
  
  int __machine__synthesize_threads(struct machine *machine, struct perf_tool *tool,
                                   struct target *target, struct thread_map *threads,
-                                 perf_event__handler_t process, bool data_mmap);
+                                 perf_event__handler_t process, bool data_mmap,
+                                 unsigned int proc_map_timeout);
  static inline
  int machine__synthesize_threads(struct machine *machine, struct target *target,
-                               struct thread_map *threads, bool data_mmap)
+                               struct thread_map *threads, bool data_mmap,
+                               unsigned int proc_map_timeout)
  {
         return __machine__synthesize_threads(machine, NULL, target, threads,
-                                            perf_event__process, data_mmap);
+                                            perf_event__process, data_mmap,
+                                            proc_map_timeout);
  }
  
  pid_t machine__get_current_tid(struct machine *machine, int cpu);
diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c

index a14f08f416863944527412b82a6b3cfaeda3c603..b5a5e9c024379652fccd722d7da2739169e0ab5d 100644 (file)
--- a/tools/perf/util/map.c
+++ b/tools/perf/util/map.c
@@ -16,6 +16,8 @@
  #include "machine.h"
  #include <linux/string.h>
  
+static void __maps__insert(struct maps *maps, struct map *map);
+
  const char *map_type__name[MAP__NR_TYPES] = {
         [MAP__FUNCTION] = "Functions",
         [MAP__VARIABLE] = "Variables",
@@ -130,13 +132,13 @@ void map__init(struct map *map, enum map_type type,
         map->end      = end;
         map->pgoff    = pgoff;
         map->reloc    = 0;
-       map->dso      = dso;
+       map->dso      = dso__get(dso);
         map->map_ip   = map__map_ip;
         map->unmap_ip = map__unmap_ip;
         RB_CLEAR_NODE(&map->rb_node);
         map->groups   = NULL;
-       map->referenced = false;
         map->erange_warned = false;
+       atomic_set(&map->refcnt, 1);
  }
  
  struct map *map__new(struct machine *machine, u64 start, u64 len,
@@ -175,9 +177,9 @@ struct map *map__new(struct machine *machine, u64 start, u64 len,
  
                 if (vdso) {
                         pgoff = 0;
-                       dso = vdso__dso_findnew(machine, thread);
+                       dso = machine__findnew_vdso(machine, thread);
                 } else
-                       dso = __dsos__findnew(&machine->user_dsos, filename);
+                       dso = machine__findnew_dso(machine, filename);
  
                 if (dso == NULL)
                         goto out_delete;
@@ -195,6 +197,7 @@ struct map *map__new(struct machine *machine, u64 start, u64 len,
                         if (type != MAP__FUNCTION)
                                 dso__set_loaded(dso, map->type);
                 }
+               dso__put(dso);
         }
         return map;
  out_delete:
@@ -221,11 +224,24 @@ struct map *map__new2(u64 start, struct dso *dso, enum map_type type)
         return map;
  }
  
+static void map__exit(struct map *map)
+{
+       BUG_ON(!RB_EMPTY_NODE(&map->rb_node));
+       dso__zput(map->dso);
+}
+
  void map__delete(struct map *map)
  {
+       map__exit(map);
         free(map);
  }
  
+void map__put(struct map *map)
+{
+       if (map && atomic_dec_and_test(&map->refcnt))
+               map__delete(map);
+}
+
  void map__fixup_start(struct map *map)
  {
         struct rb_root *symbols = &map->dso->symbols[map->type];
@@ -292,6 +308,11 @@ int map__load(struct map *map, symbol_filter_t filter)
         return 0;
  }
  
+int __weak arch__compare_symbol_names(const char *namea, const char *nameb)
+{
+       return strcmp(namea, nameb);
+}
+
  struct symbol *map__find_symbol(struct map *map, u64 addr,
                                 symbol_filter_t filter)
  {
@@ -413,48 +434,49 @@ u64 map__objdump_2mem(struct map *map, u64 ip)
         return ip + map->reloc;
  }
  
+static void maps__init(struct maps *maps)
+{
+       maps->entries = RB_ROOT;
+       pthread_rwlock_init(&maps->lock, NULL);
+}
+
  void map_groups__init(struct map_groups *mg, struct machine *machine)
  {
         int i;
         for (i = 0; i < MAP__NR_TYPES; ++i) {
-               mg->maps[i] = RB_ROOT;
-               INIT_LIST_HEAD(&mg->removed_maps[i]);
+               maps__init(&mg->maps[i]);
         }
         mg->machine = machine;
-       mg->refcnt = 1;
+       atomic_set(&mg->refcnt, 1);
  }
  
-static void maps__delete(struct rb_root *maps)
+static void __maps__purge(struct maps *maps)
  {
-       struct rb_node *next = rb_first(maps);
+       struct rb_root *root = &maps->entries;
+       struct rb_node *next = rb_first(root);
  
         while (next) {
                 struct map *pos = rb_entry(next, struct map, rb_node);
  
                 next = rb_next(&pos->rb_node);
-               rb_erase(&pos->rb_node, maps);
-               map__delete(pos);
+               rb_erase_init(&pos->rb_node, root);
+               map__put(pos);
         }
  }
  
-static void maps__delete_removed(struct list_head *maps)
+static void maps__exit(struct maps *maps)
  {
-       struct map *pos, *n;
-
-       list_for_each_entry_safe(pos, n, maps, node) {
-               list_del(&pos->node);
-               map__delete(pos);
-       }
+       pthread_rwlock_wrlock(&maps->lock);
+       __maps__purge(maps);
+       pthread_rwlock_unlock(&maps->lock);
  }
  
  void map_groups__exit(struct map_groups *mg)
  {
         int i;
  
-       for (i = 0; i < MAP__NR_TYPES; ++i) {
-               maps__delete(&mg->maps[i]);
-               maps__delete_removed(&mg->removed_maps[i]);
-       }
+       for (i = 0; i < MAP__NR_TYPES; ++i)
+               maps__exit(&mg->maps[i]);
  }
  
  bool map_groups__empty(struct map_groups *mg)
@@ -464,8 +486,6 @@ bool map_groups__empty(struct map_groups *mg)
         for (i = 0; i < MAP__NR_TYPES; ++i) {
                 if (maps__first(&mg->maps[i]))
                         return false;
-               if (!list_empty(&mg->removed_maps[i]))
-                       return false;
         }
  
         return true;
@@ -489,32 +509,10 @@ void map_groups__delete(struct map_groups *mg)
  
  void map_groups__put(struct map_groups *mg)
  {
-       if (--mg->refcnt == 0)
+       if (mg && atomic_dec_and_test(&mg->refcnt))
                 map_groups__delete(mg);
  }
  
-void map_groups__flush(struct map_groups *mg)
-{
-       int type;
-
-       for (type = 0; type < MAP__NR_TYPES; type++) {
-               struct rb_root *root = &mg->maps[type];
-               struct rb_node *next = rb_first(root);
-
-               while (next) {
-                       struct map *pos = rb_entry(next, struct map, rb_node);
-                       next = rb_next(&pos->rb_node);
-                       rb_erase(&pos->rb_node, root);
-                       /*
-                        * We may have references to this map, for
-                        * instance in some hist_entry instances, so
-                        * just move them to a separate list.
-                        */
-                       list_add_tail(&pos->node, &mg->removed_maps[pos->type]);
-               }
-       }
-}
-
  struct symbol *map_groups__find_symbol(struct map_groups *mg,
                                        enum map_type type, u64 addr,
                                        struct map **mapp,
@@ -538,20 +536,28 @@ struct symbol *map_groups__find_symbol_by_name(struct map_groups *mg,
                                                struct map **mapp,
                                                symbol_filter_t filter)
  {
+       struct maps *maps = &mg->maps[type];
+       struct symbol *sym;
         struct rb_node *nd;
  
-       for (nd = rb_first(&mg->maps[type]); nd; nd = rb_next(nd)) {
+       pthread_rwlock_rdlock(&maps->lock);
+
+       for (nd = rb_first(&maps->entries); nd; nd = rb_next(nd)) {
                 struct map *pos = rb_entry(nd, struct map, rb_node);
-               struct symbol *sym = map__find_symbol_by_name(pos, name, filter);
+
+               sym = map__find_symbol_by_name(pos, name, filter);
  
                 if (sym == NULL)
                         continue;
                 if (mapp != NULL)
                         *mapp = pos;
-               return sym;
+               goto out;
         }
  
-       return NULL;
+       sym = NULL;
+out:
+       pthread_rwlock_unlock(&maps->lock);
+       return sym;
  }
  
  int map_groups__find_ams(struct addr_map_symbol *ams, symbol_filter_t filter)
@@ -571,73 +577,54 @@ int map_groups__find_ams(struct addr_map_symbol *ams, symbol_filter_t filter)
         return ams->sym ? 0 : -1;
  }
  
-size_t __map_groups__fprintf_maps(struct map_groups *mg, enum map_type type,
-                                 FILE *fp)
+static size_t maps__fprintf(struct maps *maps, FILE *fp)
  {
-       size_t printed = fprintf(fp, "%s:\n", map_type__name[type]);
+       size_t printed = 0;
         struct rb_node *nd;
  
-       for (nd = rb_first(&mg->maps[type]); nd; nd = rb_next(nd)) {
+       pthread_rwlock_rdlock(&maps->lock);
+
+       for (nd = rb_first(&maps->entries); nd; nd = rb_next(nd)) {
                 struct map *pos = rb_entry(nd, struct map, rb_node);
                 printed += fprintf(fp, "Map:");
                 printed += map__fprintf(pos, fp);
                 if (verbose > 2) {
-                       printed += dso__fprintf(pos->dso, type, fp);
+                       printed += dso__fprintf(pos->dso, pos->type, fp);
                         printed += fprintf(fp, "--\n");
                 }
         }
  
-       return printed;
-}
+       pthread_rwlock_unlock(&maps->lock);
  
-static size_t map_groups__fprintf_maps(struct map_groups *mg, FILE *fp)
-{
-       size_t printed = 0, i;
-       for (i = 0; i < MAP__NR_TYPES; ++i)
-               printed += __map_groups__fprintf_maps(mg, i, fp);
         return printed;
  }
  
-static size_t __map_groups__fprintf_removed_maps(struct map_groups *mg,
-                                                enum map_type type, FILE *fp)
+size_t __map_groups__fprintf_maps(struct map_groups *mg, enum map_type type,
+                                 FILE *fp)
  {
-       struct map *pos;
-       size_t printed = 0;
-
-       list_for_each_entry(pos, &mg->removed_maps[type], node) {
-               printed += fprintf(fp, "Map:");
-               printed += map__fprintf(pos, fp);
-               if (verbose > 1) {
-                       printed += dso__fprintf(pos->dso, type, fp);
-                       printed += fprintf(fp, "--\n");
-               }
-       }
-       return printed;
+       size_t printed = fprintf(fp, "%s:\n", map_type__name[type]);
+       return printed += maps__fprintf(&mg->maps[type], fp);
  }
  
-static size_t map_groups__fprintf_removed_maps(struct map_groups *mg,
-                                              FILE *fp)
+size_t map_groups__fprintf(struct map_groups *mg, FILE *fp)
  {
         size_t printed = 0, i;
         for (i = 0; i < MAP__NR_TYPES; ++i)
-               printed += __map_groups__fprintf_removed_maps(mg, i, fp);
+               printed += __map_groups__fprintf_maps(mg, i, fp);
         return printed;
  }
  
-size_t map_groups__fprintf(struct map_groups *mg, FILE *fp)
+static int maps__fixup_overlappings(struct maps *maps, struct map *map, FILE *fp)
  {
-       size_t printed = map_groups__fprintf_maps(mg, fp);
-       printed += fprintf(fp, "Removed maps:\n");
-       return printed + map_groups__fprintf_removed_maps(mg, fp);
-}
-
-int map_groups__fixup_overlappings(struct map_groups *mg, struct map *map,
-                                  FILE *fp)
-{
-       struct rb_root *root = &mg->maps[map->type];
-       struct rb_node *next = rb_first(root);
+       struct rb_root *root;
+       struct rb_node *next;
         int err = 0;
  
+       pthread_rwlock_wrlock(&maps->lock);
+
+       root = &maps->entries;
+       next = rb_first(root);
+
         while (next) {
                 struct map *pos = rb_entry(next, struct map, rb_node);
                 next = rb_next(&pos->rb_node);
@@ -651,7 +638,7 @@ int map_groups__fixup_overlappings(struct map_groups *mg, struct map *map,
                         map__fprintf(pos, fp);
                 }
  
-               rb_erase(&pos->rb_node, root);
+               rb_erase_init(&pos->rb_node, root);
                 /*
                  * Now check if we need to create new maps for areas not
                  * overlapped by the new map:
@@ -661,11 +648,11 @@ int map_groups__fixup_overlappings(struct map_groups *mg, struct map *map,
  
                         if (before == NULL) {
                                 err = -ENOMEM;
-                               goto move_map;
+                               goto put_map;
                         }
  
                         before->end = map->start;
-                       map_groups__insert(mg, before);
+                       __maps__insert(maps, before);
                         if (verbose >= 2)
                                 map__fprintf(before, fp);
                 }
@@ -675,28 +662,31 @@ int map_groups__fixup_overlappings(struct map_groups *mg, struct map *map,
  
                         if (after == NULL) {
                                 err = -ENOMEM;
-                               goto move_map;
+                               goto put_map;
                         }
  
                         after->start = map->end;
-                       map_groups__insert(mg, after);
+                       __maps__insert(maps, after);
                         if (verbose >= 2)
                                 map__fprintf(after, fp);
                 }
-move_map:
-               /*
-                * If we have references, just move them to a separate list.
-                */
-               if (pos->referenced)
-                       list_add_tail(&pos->node, &mg->removed_maps[map->type]);
-               else
-                       map__delete(pos);
+put_map:
+               map__put(pos);
  
                 if (err)
-                       return err;
+                       goto out;
         }
  
-       return 0;
+       err = 0;
+out:
+       pthread_rwlock_unlock(&maps->lock);
+       return err;
+}
+
+int map_groups__fixup_overlappings(struct map_groups *mg, struct map *map,
+                                  FILE *fp)
+{
+       return maps__fixup_overlappings(&mg->maps[map->type], map, fp);
  }
  
  /*
@@ -705,20 +695,28 @@ move_map:
  int map_groups__clone(struct map_groups *mg,
                       struct map_groups *parent, enum map_type type)
  {
-       struct rb_node *nd;
-       for (nd = rb_first(&parent->maps[type]); nd; nd = rb_next(nd)) {
-               struct map *map = rb_entry(nd, struct map, rb_node);
+       int err = -ENOMEM;
+       struct map *map;
+       struct maps *maps = &parent->maps[type];
+
+       pthread_rwlock_rdlock(&maps->lock);
+
+       for (map = maps__first(maps); map; map = map__next(map)) {
                 struct map *new = map__clone(map);
                 if (new == NULL)
-                       return -ENOMEM;
+                       goto out_unlock;
                 map_groups__insert(mg, new);
         }
-       return 0;
+
+       err = 0;
+out_unlock:
+       pthread_rwlock_unlock(&maps->lock);
+       return err;
  }
  
-void maps__insert(struct rb_root *maps, struct map *map)
+static void __maps__insert(struct maps *maps, struct map *map)
  {
-       struct rb_node **p = &maps->rb_node;
+       struct rb_node **p = &maps->entries.rb_node;
         struct rb_node *parent = NULL;
         const u64 ip = map->start;
         struct map *m;
@@ -733,20 +731,38 @@ void maps__insert(struct rb_root *maps, struct map *map)
         }
  
         rb_link_node(&map->rb_node, parent, p);
-       rb_insert_color(&map->rb_node, maps);
+       rb_insert_color(&map->rb_node, &maps->entries);
+       map__get(map);
  }
  
-void maps__remove(struct rb_root *maps, struct map *map)
+void maps__insert(struct maps *maps, struct map *map)
  {
-       rb_erase(&map->rb_node, maps);
+       pthread_rwlock_wrlock(&maps->lock);
+       __maps__insert(maps, map);
+       pthread_rwlock_unlock(&maps->lock);
  }
  
-struct map *maps__find(struct rb_root *maps, u64 ip)
+static void __maps__remove(struct maps *maps, struct map *map)
  {
-       struct rb_node **p = &maps->rb_node;
-       struct rb_node *parent = NULL;
+       rb_erase_init(&map->rb_node, &maps->entries);
+       map__put(map);
+}
+
+void maps__remove(struct maps *maps, struct map *map)
+{
+       pthread_rwlock_wrlock(&maps->lock);
+       __maps__remove(maps, map);
+       pthread_rwlock_unlock(&maps->lock);
+}
+
+struct map *maps__find(struct maps *maps, u64 ip)
+{
+       struct rb_node **p, *parent = NULL;
         struct map *m;
  
+       pthread_rwlock_rdlock(&maps->lock);
+
+       p = &maps->entries.rb_node;
         while (*p != NULL) {
                 parent = *p;
                 m = rb_entry(parent, struct map, rb_node);
@@ -755,22 +771,25 @@ struct map *maps__find(struct rb_root *maps, u64 ip)
                 else if (ip >= m->end)
                         p = &(*p)->rb_right;
                 else
-                       return m;
+                       goto out;
         }
  
-       return NULL;
+       m = NULL;
+out:
+       pthread_rwlock_unlock(&maps->lock);
+       return m;
  }
  
-struct map *maps__first(struct rb_root *maps)
+struct map *maps__first(struct maps *maps)
  {
-       struct rb_node *first = rb_first(maps);
+       struct rb_node *first = rb_first(&maps->entries);
  
         if (first)
                 return rb_entry(first, struct map, rb_node);
         return NULL;
  }
  
-struct map *maps__next(struct map *map)
+struct map *map__next(struct map *map)
  {
         struct rb_node *next = rb_next(&map->rb_node);
  
diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h

index ec19c59ca38e07deba4a8c2c254ec3a4c71c6c91..d73e687b224e4e0d3b427f1244695ea4e19c4fcb 100644 (file)
--- a/tools/perf/util/map.h
+++ b/tools/perf/util/map.h
@@ -1,9 +1,11 @@
  #ifndef __PERF_MAP_H
  #define __PERF_MAP_H
  
+#include <linux/atomic.h>
  #include <linux/compiler.h>
  #include <linux/list.h>
  #include <linux/rbtree.h>
+#include <pthread.h>
  #include <stdio.h>
  #include <stdbool.h>
  #include <linux/types.h>
@@ -32,7 +34,6 @@ struct map {
         u64                     start;
         u64                     end;
         u8 /* enum map_type */  type;
-       bool                    referenced;
         bool                    erange_warned;
         u32                     priv;
         u32                     prot;
@@ -50,6 +51,7 @@ struct map {
  
         struct dso              *dso;
         struct map_groups       *groups;
+       atomic_t                refcnt;
  };
  
  struct kmap {
@@ -57,11 +59,15 @@ struct kmap {
         struct map_groups       *kmaps;
  };
  
+struct maps {
+       struct rb_root   entries;
+       pthread_rwlock_t lock;
+};
+
  struct map_groups {
-       struct rb_root   maps[MAP__NR_TYPES];
-       struct list_head removed_maps[MAP__NR_TYPES];
+       struct maps      maps[MAP__NR_TYPES];
         struct machine   *machine;
-       int              refcnt;
+       atomic_t         refcnt;
  };
  
  struct map_groups *map_groups__new(struct machine *machine);
@@ -70,7 +76,8 @@ bool map_groups__empty(struct map_groups *mg);
  
  static inline struct map_groups *map_groups__get(struct map_groups *mg)
  {
-       ++mg->refcnt;
+       if (mg)
+               atomic_inc(&mg->refcnt);
         return mg;
  }
  
@@ -124,7 +131,7 @@ struct thread;
   */
  #define __map__for_each_symbol_by_name(map, sym_name, pos, filter)     \
         for (pos = map__find_symbol_by_name(map, sym_name, filter);     \
-            pos && strcmp(pos->name, sym_name) == 0;           \
+            pos && arch__compare_symbol_names(pos->name, sym_name) == 0;       \
              pos = symbol__next_by_name(pos))
  
  #define map__for_each_symbol_by_name(map, sym_name, pos)               \
@@ -132,6 +139,7 @@ struct thread;
  
  typedef int (*symbol_filter_t)(struct map *map, struct symbol *sym);
  
+int arch__compare_symbol_names(const char *namea, const char *nameb);
  void map__init(struct map *map, enum map_type type,
                u64 start, u64 end, u64 pgoff, struct dso *dso);
  struct map *map__new(struct machine *machine, u64 start, u64 len,
@@ -141,6 +149,24 @@ struct map *map__new(struct machine *machine, u64 start, u64 len,
  struct map *map__new2(u64 start, struct dso *dso, enum map_type type);
  void map__delete(struct map *map);
  struct map *map__clone(struct map *map);
+
+static inline struct map *map__get(struct map *map)
+{
+       if (map)
+               atomic_inc(&map->refcnt);
+       return map;
+}
+
+void map__put(struct map *map);
+
+static inline void __map__zput(struct map **map)
+{
+       map__put(*map);
+       *map = NULL;
+}
+
+#define map__zput(map) __map__zput(&map)
+
  int map__overlap(struct map *l, struct map *r);
  size_t map__fprintf(struct map *map, FILE *fp);
  size_t map__fprintf_dsoname(struct map *map, FILE *fp);
@@ -159,11 +185,11 @@ void map__reloc_vmlinux(struct map *map);
  
  size_t __map_groups__fprintf_maps(struct map_groups *mg, enum map_type type,
                                   FILE *fp);
-void maps__insert(struct rb_root *maps, struct map *map);
-void maps__remove(struct rb_root *maps, struct map *map);
-struct map *maps__find(struct rb_root *maps, u64 addr);
-struct map *maps__first(struct rb_root *maps);
-struct map *maps__next(struct map *map);
+void maps__insert(struct maps *maps, struct map *map);
+void maps__remove(struct maps *maps, struct map *map);
+struct map *maps__find(struct maps *maps, u64 addr);
+struct map *maps__first(struct maps *maps);
+struct map *map__next(struct map *map);
  void map_groups__init(struct map_groups *mg, struct machine *machine);
  void map_groups__exit(struct map_groups *mg);
  int map_groups__clone(struct map_groups *mg,
@@ -198,7 +224,7 @@ static inline struct map *map_groups__first(struct map_groups *mg,
  
  static inline struct map *map_groups__next(struct map *map)
  {
-       return maps__next(map);
+       return map__next(map);
  }
  
  struct symbol *map_groups__find_symbol(struct map_groups *mg,
@@ -230,6 +256,4 @@ int map_groups__fixup_overlappings(struct map_groups *mg, struct map *map,
  struct map *map_groups__find_by_name(struct map_groups *mg,
                                      enum map_type type, const char *name);
  
-void map_groups__flush(struct map_groups *mg);
-
  #endif /* __PERF_MAP_H */
diff --git a/tools/perf/util/pager.c b/tools/perf/util/pager.c

index 31ee02d4e988a7ea4b0101c0ef4e1ae74f349023..53ef006a951c3f3c90ce8c62e9a5cd73b7750a74 100644 (file)
--- a/tools/perf/util/pager.c
+++ b/tools/perf/util/pager.c
@@ -50,11 +50,6 @@ void setup_pager(void)
  
         if (!isatty(1))
                 return;
-       if (!pager) {
-               if (!pager_program)
-                       perf_config(perf_default_config, NULL);
-               pager = pager_program;
-       }
         if (!pager)
                 pager = getenv("PAGER");
         if (!(pager || access("/usr/bin/pager", X_OK)))
diff --git a/tools/perf/util/parse-branch-options.c b/tools/perf/util/parse-branch-options.c

new file mode 100644 (file)

index 0000000..a3b1e13
--- /dev/null
+++ b/tools/perf/util/parse-branch-options.c
@@ -0,0 +1,94 @@
+#include "perf.h"
+#include "util/util.h"
+#include "util/debug.h"
+#include "util/parse-options.h"
+#include "util/parse-branch-options.h"
+
+#define BRANCH_OPT(n, m) \
+       { .name = n, .mode = (m) }
+
+#define BRANCH_END { .name = NULL }
+
+struct branch_mode {
+       const char *name;
+       int mode;
+};
+
+static const struct branch_mode branch_modes[] = {
+       BRANCH_OPT("u", PERF_SAMPLE_BRANCH_USER),
+       BRANCH_OPT("k", PERF_SAMPLE_BRANCH_KERNEL),
+       BRANCH_OPT("hv", PERF_SAMPLE_BRANCH_HV),
+       BRANCH_OPT("any", PERF_SAMPLE_BRANCH_ANY),
+       BRANCH_OPT("any_call", PERF_SAMPLE_BRANCH_ANY_CALL),
+       BRANCH_OPT("any_ret", PERF_SAMPLE_BRANCH_ANY_RETURN),
+       BRANCH_OPT("ind_call", PERF_SAMPLE_BRANCH_IND_CALL),
+       BRANCH_OPT("abort_tx", PERF_SAMPLE_BRANCH_ABORT_TX),
+       BRANCH_OPT("in_tx", PERF_SAMPLE_BRANCH_IN_TX),
+       BRANCH_OPT("no_tx", PERF_SAMPLE_BRANCH_NO_TX),
+       BRANCH_OPT("cond", PERF_SAMPLE_BRANCH_COND),
+       BRANCH_OPT("ind_jmp", PERF_SAMPLE_BRANCH_IND_JUMP),
+       BRANCH_END
+};
+
+int
+parse_branch_stack(const struct option *opt, const char *str, int unset)
+{
+#define ONLY_PLM \
+       (PERF_SAMPLE_BRANCH_USER        |\
+        PERF_SAMPLE_BRANCH_KERNEL      |\
+        PERF_SAMPLE_BRANCH_HV)
+
+       uint64_t *mode = (uint64_t *)opt->value;
+       const struct branch_mode *br;
+       char *s, *os = NULL, *p;
+       int ret = -1;
+
+       if (unset)
+               return 0;
+
+       /*
+        * cannot set it twice, -b + --branch-filter for instance
+        */
+       if (*mode)
+               return -1;
+
+       /* str may be NULL in case no arg is passed to -b */
+       if (str) {
+               /* because str is read-only */
+               s = os = strdup(str);
+               if (!s)
+                       return -1;
+
+               for (;;) {
+                       p = strchr(s, ',');
+                       if (p)
+                               *p = '\0';
+
+                       for (br = branch_modes; br->name; br++) {
+                               if (!strcasecmp(s, br->name))
+                                       break;
+                       }
+                       if (!br->name) {
+                               ui__warning("unknown branch filter %s,"
+                                           " check man page\n", s);
+                               goto error;
+                       }
+
+                       *mode |= br->mode;
+
+                       if (!p)
+                               break;
+
+                       s = p + 1;
+               }
+       }
+       ret = 0;
+
+       /* default to any branch */
+       if ((*mode & ~ONLY_PLM) == 0) {
+               *mode = PERF_SAMPLE_BRANCH_ANY;
+       }
+error:
+       free(os);
+       return ret;
+}
diff --git a/tools/perf/util/parse-branch-options.h b/tools/perf/util/parse-branch-options.h

new file mode 100644 (file)

index 0000000..b9d9470
--- /dev/null
+++ b/tools/perf/util/parse-branch-options.h
@@ -0,0 +1,5 @@
+#ifndef _PERF_PARSE_BRANCH_OPTIONS_H
+#define _PERF_PARSE_BRANCH_OPTIONS_H 1
+struct option;
+int parse_branch_stack(const struct option *opt, const char *str, int unset);
+#endif /* _PERF_PARSE_BRANCH_OPTIONS_H */
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c

index be0655388b38e4238d69782ee0e9c8357276a574..2a4d1ec028464757d6723bbd08c2d0ce0a010a14 100644 (file)
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -17,6 +17,7 @@
  #include "parse-events-flex.h"
  #include "pmu.h"
  #include "thread_map.h"
+#include "asm/bug.h"
  
  #define MAX_NAME_LEN 100
  
@@ -538,16 +539,40 @@ int parse_events_add_breakpoint(struct list_head *list, int *idx,
         return add_event(list, idx, &attr, NULL);
  }
  
+static int check_type_val(struct parse_events_term *term,
+                         struct parse_events_error *err,
+                         int type)
+{
+       if (type == term->type_val)
+               return 0;
+
+       if (err) {
+               err->idx = term->err_val;
+               if (type == PARSE_EVENTS__TERM_TYPE_NUM)
+                       err->str = strdup("expected numeric value");
+               else
+                       err->str = strdup("expected string value");
+       }
+       return -EINVAL;
+}
+
  static int config_term(struct perf_event_attr *attr,
-                      struct parse_events_term *term)
+                      struct parse_events_term *term,
+                      struct parse_events_error *err)
  {
-#define CHECK_TYPE_VAL(type)                                   \
-do {                                                           \
-       if (PARSE_EVENTS__TERM_TYPE_ ## type != term->type_val) \
-               return -EINVAL;                                 \
+#define CHECK_TYPE_VAL(type)                                              \
+do {                                                                      \
+       if (check_type_val(term, err, PARSE_EVENTS__TERM_TYPE_ ## type)) \
+               return -EINVAL;                                            \
  } while (0)
  
         switch (term->type_term) {
+       case PARSE_EVENTS__TERM_TYPE_USER:
+               /*
+                * Always succeed for sysfs terms, as we don't know
+                * at this point what type they need to have.
+                */
+               return 0;
         case PARSE_EVENTS__TERM_TYPE_CONFIG:
                 CHECK_TYPE_VAL(NUM);
                 attr->config = term->val.num;
@@ -582,18 +607,20 @@ do {                                                              \
  }
  
  static int config_attr(struct perf_event_attr *attr,
-                      struct list_head *head, int fail)
+                      struct list_head *head,
+                      struct parse_events_error *err)
  {
         struct parse_events_term *term;
  
         list_for_each_entry(term, head, list)
-               if (config_term(attr, term) && fail)
+               if (config_term(attr, term, err))
                         return -EINVAL;
  
         return 0;
  }
  
-int parse_events_add_numeric(struct list_head *list, int *idx,
+int parse_events_add_numeric(struct parse_events_evlist *data,
+                            struct list_head *list,
                              u32 type, u64 config,
                              struct list_head *head_config)
  {
@@ -604,10 +631,10 @@ int parse_events_add_numeric(struct list_head *list, int *idx,
         attr.config = config;
  
         if (head_config &&
-           config_attr(&attr, head_config, 1))
+           config_attr(&attr, head_config, data->error))
                 return -EINVAL;
  
-       return add_event(list, idx, &attr, NULL);
+       return add_event(list, &data->idx, &attr, NULL);
  }
  
  static int parse_events__is_name_term(struct parse_events_term *term)
@@ -626,8 +653,9 @@ static char *pmu_event_name(struct list_head *head_terms)
         return NULL;
  }
  
-int parse_events_add_pmu(struct list_head *list, int *idx,
-                        char *name, struct list_head *head_config)
+int parse_events_add_pmu(struct parse_events_evlist *data,
+                        struct list_head *list, char *name,
+                        struct list_head *head_config)
  {
         struct perf_event_attr attr;
         struct perf_pmu_info info;
@@ -647,7 +675,7 @@ int parse_events_add_pmu(struct list_head *list, int *idx,
  
         if (!head_config) {
                 attr.type = pmu->type;
-               evsel = __add_event(list, idx, &attr, NULL, pmu->cpus);
+               evsel = __add_event(list, &data->idx, &attr, NULL, pmu->cpus);
                 return evsel ? 0 : -ENOMEM;
         }
  
@@ -658,13 +686,14 @@ int parse_events_add_pmu(struct list_head *list, int *idx,
          * Configure hardcoded terms first, no need to check
          * return value when called with fail == 0 ;)
          */
-       config_attr(&attr, head_config, 0);
+       if (config_attr(&attr, head_config, data->error))
+               return -EINVAL;
  
-       if (perf_pmu__config(pmu, &attr, head_config))
+       if (perf_pmu__config(pmu, &attr, head_config, data->error))
                 return -EINVAL;
  
-       evsel = __add_event(list, idx, &attr, pmu_event_name(head_config),
-                           pmu->cpus);
+       evsel = __add_event(list, &data->idx, &attr,
+                           pmu_event_name(head_config), pmu->cpus);
         if (evsel) {
                 evsel->unit = info.unit;
                 evsel->scale = info.scale;
@@ -1019,11 +1048,13 @@ int parse_events_terms(struct list_head *terms, const char *str)
         return ret;
  }
  
-int parse_events(struct perf_evlist *evlist, const char *str)
+int parse_events(struct perf_evlist *evlist, const char *str,
+                struct parse_events_error *err)
  {
         struct parse_events_evlist data = {
-               .list = LIST_HEAD_INIT(data.list),
-               .idx  = evlist->nr_entries,
+               .list  = LIST_HEAD_INIT(data.list),
+               .idx   = evlist->nr_entries,
+               .error = err,
         };
         int ret;
  
@@ -1044,16 +1075,87 @@ int parse_events(struct perf_evlist *evlist, const char *str)
         return ret;
  }
  
+#define MAX_WIDTH 1000
+static int get_term_width(void)
+{
+       struct winsize ws;
+
+       get_term_dimensions(&ws);
+       return ws.ws_col > MAX_WIDTH ? MAX_WIDTH : ws.ws_col;
+}
+
+static void parse_events_print_error(struct parse_events_error *err,
+                                    const char *event)
+{
+       const char *str = "invalid or unsupported event: ";
+       char _buf[MAX_WIDTH];
+       char *buf = (char *) event;
+       int idx = 0;
+
+       if (err->str) {
+               /* -2 for extra '' in the final fprintf */
+               int width       = get_term_width() - 2;
+               int len_event   = strlen(event);
+               int len_str, max_len, cut = 0;
+
+               /*
+                * Maximum error index indent, we will cut
+                * the event string if it's bigger.
+                */
+               int max_err_idx = 10;
+
+               /*
+                * Let's be specific with the message when
+                * we have the precise error.
+                */
+               str     = "event syntax error: ";
+               len_str = strlen(str);
+               max_len = width - len_str;
+
+               buf = _buf;
+
+               /* We're cutting from the beginning. */
+               if (err->idx > max_err_idx)
+                       cut = err->idx - max_err_idx;
+
+               strncpy(buf, event + cut, max_len);
+
+               /* Mark cut parts with '..' on both sides. */
+               if (cut)
+                       buf[0] = buf[1] = '.';
+
+               if ((len_event - cut) > max_len) {
+                       buf[max_len - 1] = buf[max_len - 2] = '.';
+                       buf[max_len] = 0;
+               }
+
+               idx = len_str + err->idx - cut;
+       }
+
+       fprintf(stderr, "%s'%s'\n", str, buf);
+       if (idx) {
+               fprintf(stderr, "%*s\\___ %s\n", idx + 1, "", err->str);
+               if (err->help)
+                       fprintf(stderr, "\n%s\n", err->help);
+               free(err->str);
+               free(err->help);
+       }
+
+       fprintf(stderr, "Run 'perf list' for a list of valid events\n");
+}
+
+#undef MAX_WIDTH
+
  int parse_events_option(const struct option *opt, const char *str,
                         int unset __maybe_unused)
  {
         struct perf_evlist *evlist = *(struct perf_evlist **)opt->value;
-       int ret = parse_events(evlist, str);
+       struct parse_events_error err = { .idx = 0, };
+       int ret = parse_events(evlist, str, &err);
+
+       if (ret)
+               parse_events_print_error(&err, str);
  
-       if (ret) {
-               fprintf(stderr, "invalid or unsupported event: '%s'\n", str);
-               fprintf(stderr, "Run 'perf list' for a list of valid events\n");
-       }
         return ret;
  }
  
@@ -1460,7 +1562,7 @@ int parse_events__is_hardcoded_term(struct parse_events_term *term)
  
  static int new_term(struct parse_events_term **_term, int type_val,
                     int type_term, char *config,
-                   char *str, u64 num)
+                   char *str, u64 num, int err_term, int err_val)
  {
         struct parse_events_term *term;
  
@@ -1472,6 +1574,8 @@ static int new_term(struct parse_events_term **_term, int type_val,
         term->type_val  = type_val;
         term->type_term = type_term;
         term->config = config;
+       term->err_term = err_term;
+       term->err_val  = err_val;
  
         switch (type_val) {
         case PARSE_EVENTS__TERM_TYPE_NUM:
@@ -1490,17 +1594,29 @@ static int new_term(struct parse_events_term **_term, int type_val,
  }
  
  int parse_events_term__num(struct parse_events_term **term,
-                          int type_term, char *config, u64 num)
+                          int type_term, char *config, u64 num,
+                          void *loc_term_, void *loc_val_)
  {
+       YYLTYPE *loc_term = loc_term_;
+       YYLTYPE *loc_val = loc_val_;
+
         return new_term(term, PARSE_EVENTS__TERM_TYPE_NUM, type_term,
-                       config, NULL, num);
+                       config, NULL, num,
+                       loc_term ? loc_term->first_column : 0,
+                       loc_val ? loc_val->first_column : 0);
  }
  
  int parse_events_term__str(struct parse_events_term **term,
-                          int type_term, char *config, char *str)
+                          int type_term, char *config, char *str,
+                          void *loc_term_, void *loc_val_)
  {
+       YYLTYPE *loc_term = loc_term_;
+       YYLTYPE *loc_val = loc_val_;
+
         return new_term(term, PARSE_EVENTS__TERM_TYPE_STR, type_term,
-                       config, str, 0);
+                       config, str, 0,
+                       loc_term ? loc_term->first_column : 0,
+                       loc_val ? loc_val->first_column : 0);
  }
  
  int parse_events_term__sym_hw(struct parse_events_term **term,
@@ -1514,18 +1630,20 @@ int parse_events_term__sym_hw(struct parse_events_term **term,
         if (config)
                 return new_term(term, PARSE_EVENTS__TERM_TYPE_STR,
                                 PARSE_EVENTS__TERM_TYPE_USER, config,
-                               (char *) sym->symbol, 0);
+                               (char *) sym->symbol, 0, 0, 0);
         else
                 return new_term(term, PARSE_EVENTS__TERM_TYPE_STR,
                                 PARSE_EVENTS__TERM_TYPE_USER,
-                               (char *) "event", (char *) sym->symbol, 0);
+                               (char *) "event", (char *) sym->symbol,
+                               0, 0, 0);
  }
  
  int parse_events_term__clone(struct parse_events_term **new,
                              struct parse_events_term *term)
  {
         return new_term(new, term->type_val, term->type_term, term->config,
-                       term->val.str, term->val.num);
+                       term->val.str, term->val.num,
+                       term->err_term, term->err_val);
  }
  
  void parse_events__free_terms(struct list_head *terms)
@@ -1535,3 +1653,15 @@ void parse_events__free_terms(struct list_head *terms)
         list_for_each_entry_safe(term, h, terms, list)
                 free(term);
  }
+
+void parse_events_evlist_error(struct parse_events_evlist *data,
+                              int idx, const char *str)
+{
+       struct parse_events_error *err = data->error;
+
+       if (!err)
+               return;
+       err->idx = idx;
+       err->str = strdup(str);
+       WARN_ONCE(!err->str, "WARNING: failed to allocate error string");
+}
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h

index 52a2dda4f954a7682d43376bd3ac2ee91d127447..131f29b2f13258d647276820026207526a08ef02 100644 (file)
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -12,6 +12,7 @@
  struct list_head;
  struct perf_evsel;
  struct perf_evlist;
+struct parse_events_error;
  
  struct option;
  
@@ -29,7 +30,8 @@ const char *event_type(int type);
  
  extern int parse_events_option(const struct option *opt, const char *str,
                                int unset);
-extern int parse_events(struct perf_evlist *evlist, const char *str);
+extern int parse_events(struct perf_evlist *evlist, const char *str,
+                       struct parse_events_error *error);
  extern int parse_events_terms(struct list_head *terms, const char *str);
  extern int parse_filter(const struct option *opt, const char *str, int unset);
  
@@ -72,12 +74,23 @@ struct parse_events_term {
         int type_term;
         struct list_head list;
         bool used;
+
+       /* error string indexes within the parsed string */
+       int err_term;
+       int err_val;
+};
+
+struct parse_events_error {
+       int   idx;      /* index in the parsed string */
+       char *str;      /* string to display at the index */
+       char *help;     /* optional help string */
  };
  
  struct parse_events_evlist {
-       struct list_head list;
-       int idx;
-       int nr_groups;
+       struct list_head           list;
+       int                        idx;
+       int                        nr_groups;
+       struct parse_events_error *error;
  };
  
  struct parse_events_terms {
@@ -85,10 +98,12 @@ struct parse_events_terms {
  };
  
  int parse_events__is_hardcoded_term(struct parse_events_term *term);
-int parse_events_term__num(struct parse_events_term **_term,
-                          int type_term, char *config, u64 num);
-int parse_events_term__str(struct parse_events_term **_term,
-                          int type_term, char *config, char *str);
+int parse_events_term__num(struct parse_events_term **term,
+                          int type_term, char *config, u64 num,
+                          void *loc_term, void *loc_val);
+int parse_events_term__str(struct parse_events_term **term,
+                          int type_term, char *config, char *str,
+                          void *loc_term, void *loc_val);
  int parse_events_term__sym_hw(struct parse_events_term **term,
                               char *config, unsigned idx);
  int parse_events_term__clone(struct parse_events_term **new,
@@ -99,21 +114,24 @@ int parse_events__modifier_group(struct list_head *list, char *event_mod);
  int parse_events_name(struct list_head *list, char *name);
  int parse_events_add_tracepoint(struct list_head *list, int *idx,
                                 char *sys, char *event);
-int parse_events_add_numeric(struct list_head *list, int *idx,
+int parse_events_add_numeric(struct parse_events_evlist *data,
+                            struct list_head *list,
                              u32 type, u64 config,
                              struct list_head *head_config);
  int parse_events_add_cache(struct list_head *list, int *idx,
                            char *type, char *op_result1, char *op_result2);
  int parse_events_add_breakpoint(struct list_head *list, int *idx,
                                 void *ptr, char *type, u64 len);
-int parse_events_add_pmu(struct list_head *list, int *idx,
-                        char *pmu , struct list_head *head_config);
+int parse_events_add_pmu(struct parse_events_evlist *data,
+                        struct list_head *list, char *name,
+                        struct list_head *head_config);
  enum perf_pmu_event_symbol_type
  perf_pmu__parse_check(const char *name);
  void parse_events__set_leader(char *name, struct list_head *list);
  void parse_events_update_lists(struct list_head *list_event,
                                struct list_head *list_all);
-void parse_events_error(void *data, void *scanner, char const *msg);
+void parse_events_evlist_error(struct parse_events_evlist *data,
+                              int idx, const char *str);
  
  void print_events(const char *event_glob, bool name_only);
  
diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l

index 8895cf3132ab242c078c70c6f7713f52030c9f6a..09e738fe9ea2790a1c304f2015cdb20c03c20614 100644 (file)
--- a/tools/perf/util/parse-events.l
+++ b/tools/perf/util/parse-events.l
@@ -3,6 +3,8 @@
  %option bison-bridge
  %option prefix="parse_events_"
  %option stack
+%option bison-locations
+%option yylineno
  
  %{
  #include <errno.h>
@@ -51,6 +53,18 @@ static int str(yyscan_t scanner, int token)
         return token;
  }
  
+#define REWIND(__alloc)                                \
+do {                                                           \
+       YYSTYPE *__yylval = parse_events_get_lval(yyscanner);   \
+       char *text = parse_events_get_text(yyscanner);          \
+                                                               \
+       if (__alloc)                                            \
+               __yylval->str = strdup(text);                   \
+                                                               \
+       yycolumn -= strlen(text);                               \
+       yyless(0);                                              \
+} while (0)
+
  static int pmu_str_check(yyscan_t scanner)
  {
         YYSTYPE *yylval = parse_events_get_lval(scanner);
@@ -85,6 +99,13 @@ static int term(yyscan_t scanner, int type)
         return PE_TERM;
  }
  
+#define YY_USER_ACTION                                 \
+do {                                                   \
+       yylloc->last_column  = yylloc->first_column;    \
+       yylloc->first_column = yycolumn;                \
+       yycolumn += yyleng;                             \
+} while (0);
+
  %}
  
  %x mem
@@ -119,6 +140,12 @@ modifier_bp        [rwx]{1,3}
  
                 if (start_token) {
                         parse_events_set_extra(NULL, yyscanner);
+                       /*
+                        * The flex parser does not init the locations variable
+                        * via the scan_string interface, so we need to do the
+                        * init here.
+                        */
+                       yycolumn = 0;
                         return start_token;
                 }
           }
@@ -127,24 +154,30 @@ modifier_bp       [rwx]{1,3}
  <event>{
  
  {group}                {
-                       BEGIN(INITIAL); yyless(0);
+                       BEGIN(INITIAL);
+                       REWIND(0);
                 }
  
  {event_pmu}    |
  {event}                {
-                       str(yyscanner, PE_EVENT_NAME);
-                       BEGIN(INITIAL); yyless(0);
+                       BEGIN(INITIAL);
+                       REWIND(1);
                         return PE_EVENT_NAME;
                 }
  
  .              |
  <<EOF>>                {
-                       BEGIN(INITIAL); yyless(0);
+                       BEGIN(INITIAL);
+                       REWIND(0);
                 }
  
  }
  
  <config>{
+       /*
+        * Please update formats_error_string any time
+        * a new static term is added.
+        */
  config                 { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CONFIG); }
  config1                        { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CONFIG1); }
  config2                        { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CONFIG2); }
diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y

index 72def077dbbfda149dfe893135dd3940ca2ed648..591905a02b926b6029447a372f2e5f7b7d34864a 100644 (file)
--- a/tools/perf/util/parse-events.y
+++ b/tools/perf/util/parse-events.y
@@ -2,6 +2,7 @@
  %parse-param {void *_data}
  %parse-param {void *scanner}
  %lex-param {void* scanner}
+%locations
  
  %{
  
@@ -14,8 +15,6 @@
  #include "parse-events.h"
  #include "parse-events-bison.h"
  
-extern int parse_events_lex (YYSTYPE* lvalp, void* scanner);
-
  #define ABORT_ON(val) \
  do { \
         if (val) \
@@ -208,7 +207,7 @@ PE_NAME '/' event_config '/'
         struct list_head *list;
  
         ALLOC_LIST(list);
-       ABORT_ON(parse_events_add_pmu(list, &data->idx, $1, $3));
+       ABORT_ON(parse_events_add_pmu(data, list, $1, $3));
         parse_events__free_terms($3);
         $$ = list;
  }
@@ -219,7 +218,7 @@ PE_NAME '/' '/'
         struct list_head *list;
  
         ALLOC_LIST(list);
-       ABORT_ON(parse_events_add_pmu(list, &data->idx, $1, NULL));
+       ABORT_ON(parse_events_add_pmu(data, list, $1, NULL));
         $$ = list;
  }
  |
@@ -232,11 +231,11 @@ PE_KERNEL_PMU_EVENT sep_dc
  
         ALLOC_LIST(head);
         ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER,
-                                       $1, 1));
+                                       $1, 1, &@1, NULL));
         list_add_tail(&term->list, head);
  
         ALLOC_LIST(list);
-       ABORT_ON(parse_events_add_pmu(list, &data->idx, "cpu", head));
+       ABORT_ON(parse_events_add_pmu(data, list, "cpu", head));
         parse_events__free_terms(head);
         $$ = list;
  }
@@ -252,7 +251,7 @@ PE_PMU_EVENT_PRE '-' PE_PMU_EVENT_SUF sep_dc
  
         ALLOC_LIST(head);
         ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER,
-                                       &pmu_name, 1));
+                                       &pmu_name, 1, &@1, NULL));
         list_add_tail(&term->list, head);
  
         ALLOC_LIST(list);
@@ -275,8 +274,7 @@ value_sym '/' event_config '/'
         int config = $1 & 255;
  
         ALLOC_LIST(list);
-       ABORT_ON(parse_events_add_numeric(list, &data->idx,
-                                         type, config, $3));
+       ABORT_ON(parse_events_add_numeric(data, list, type, config, $3));
         parse_events__free_terms($3);
         $$ = list;
  }
@@ -289,8 +287,7 @@ value_sym sep_slash_dc
         int config = $1 & 255;
  
         ALLOC_LIST(list);
-       ABORT_ON(parse_events_add_numeric(list, &data->idx,
-                                         type, config, NULL));
+       ABORT_ON(parse_events_add_numeric(data, list, type, config, NULL));
         $$ = list;
  }
  
@@ -389,7 +386,15 @@ PE_NAME ':' PE_NAME
         struct list_head *list;
  
         ALLOC_LIST(list);
-       ABORT_ON(parse_events_add_tracepoint(list, &data->idx, $1, $3));
+       if (parse_events_add_tracepoint(list, &data->idx, $1, $3)) {
+               struct parse_events_error *error = data->error;
+
+               if (error) {
+                       error->idx = @1.first_column;
+                       error->str = strdup("unknown tracepoint");
+               }
+               return -1;
+       }
         $$ = list;
  }
  
@@ -400,7 +405,7 @@ PE_VALUE ':' PE_VALUE
         struct list_head *list;
  
         ALLOC_LIST(list);
-       ABORT_ON(parse_events_add_numeric(list, &data->idx, (u32)$1, $3, NULL));
+       ABORT_ON(parse_events_add_numeric(data, list, (u32)$1, $3, NULL));
         $$ = list;
  }
  
@@ -411,8 +416,7 @@ PE_RAW
         struct list_head *list;
  
         ALLOC_LIST(list);
-       ABORT_ON(parse_events_add_numeric(list, &data->idx,
-                                         PERF_TYPE_RAW, $1, NULL));
+       ABORT_ON(parse_events_add_numeric(data, list, PERF_TYPE_RAW, $1, NULL));
         $$ = list;
  }
  
@@ -450,7 +454,7 @@ PE_NAME '=' PE_NAME
         struct parse_events_term *term;
  
         ABORT_ON(parse_events_term__str(&term, PARSE_EVENTS__TERM_TYPE_USER,
-                                       $1, $3));
+                                       $1, $3, &@1, &@3));
         $$ = term;
  }
  |
@@ -459,7 +463,7 @@ PE_NAME '=' PE_VALUE
         struct parse_events_term *term;
  
         ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER,
-                                       $1, $3));
+                                       $1, $3, &@1, &@3));
         $$ = term;
  }
  |
@@ -477,7 +481,7 @@ PE_NAME
         struct parse_events_term *term;
  
         ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER,
-                                       $1, 1));
+                                       $1, 1, &@1, NULL));
         $$ = term;
  }
  |
@@ -494,7 +498,7 @@ PE_TERM '=' PE_NAME
  {
         struct parse_events_term *term;
  
-       ABORT_ON(parse_events_term__str(&term, (int)$1, NULL, $3));
+       ABORT_ON(parse_events_term__str(&term, (int)$1, NULL, $3, &@1, &@3));
         $$ = term;
  }
  |
@@ -502,7 +506,7 @@ PE_TERM '=' PE_VALUE
  {
         struct parse_events_term *term;
  
-       ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, $3));
+       ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, $3, &@1, &@3));
         $$ = term;
  }
  |
@@ -510,7 +514,7 @@ PE_TERM
  {
         struct parse_events_term *term;
  
-       ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, 1));
+       ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, 1, &@1, NULL));
         $$ = term;
  }
  
@@ -520,7 +524,9 @@ sep_slash_dc: '/' | ':' |
  
  %%
  
-void parse_events_error(void *data __maybe_unused, void *scanner __maybe_unused,
+void parse_events_error(YYLTYPE *loc, void *data,
+                       void *scanner __maybe_unused,
                         char const *msg __maybe_unused)
  {
+       parse_events_evlist_error(data, loc->last_column, "parser error");
  }
diff --git a/tools/perf/util/parse-options.h b/tools/perf/util/parse-options.h

index 59561fd86278276040fcde6c075b334008e13d67..367d8b816cc7e7ae0a99bb8200e0bd81b7999546 100644 (file)
--- a/tools/perf/util/parse-options.h
+++ b/tools/perf/util/parse-options.h
@@ -123,6 +123,10 @@ struct option {
  #define OPT_LONG(s, l, v, h)        { .type = OPTION_LONG, .short_name = (s), .long_name = (l), .value = check_vtype(v, long *), .help = (h) }
  #define OPT_U64(s, l, v, h)         { .type = OPTION_U64, .short_name = (s), .long_name = (l), .value = check_vtype(v, u64 *), .help = (h) }
  #define OPT_STRING(s, l, v, a, h)   { .type = OPTION_STRING,  .short_name = (s), .long_name = (l), .value = check_vtype(v, const char **), (a), .help = (h) }
+#define OPT_STRING_OPTARG(s, l, v, a, h, d) \
+       { .type = OPTION_STRING,  .short_name = (s), .long_name = (l), \
+         .value = check_vtype(v, const char **), (a), .help = (h), \
+         .flags = PARSE_OPT_OPTARG, .defval = (intptr_t)(d) }
  #define OPT_STRING_NOEMPTY(s, l, v, a, h)   { .type = OPTION_STRING,  .short_name = (s), .long_name = (l), .value = check_vtype(v, const char **), (a), .help = (h), .flags = PARSE_OPT_NOEMPTY}
  #define OPT_DATE(s, l, v, h) \
         { .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), .argh = "time", .help = (h), .callback = parse_opt_approxidate_cb }
diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c

index 48411674da0f9cef6c87ba74a08bec513b112d6c..0fcc624eb76767b1c3fe211f678a981a71b744ce 100644 (file)
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -112,7 +112,11 @@ static int perf_pmu__parse_scale(struct perf_pmu_alias *alias, char *dir, char *
         if (sret < 0)
                 goto error;
  
-       scale[sret] = '\0';
+       if (scale[sret - 1] == '\n')
+               scale[sret - 1] = '\0';
+       else
+               scale[sret] = '\0';
+
         /*
          * save current locale
          */
@@ -154,7 +158,10 @@ static int perf_pmu__parse_unit(struct perf_pmu_alias *alias, char *dir, char *n
  
         close(fd);
  
-       alias->unit[sret] = '\0';
+       if (alias->unit[sret - 1] == '\n')
+               alias->unit[sret - 1] = '\0';
+       else
+               alias->unit[sret] = '\0';
  
         return 0;
  error:
@@ -442,6 +449,10 @@ static struct perf_pmu *pmu_lookup(const char *name)
         LIST_HEAD(aliases);
         __u32 type;
  
+       /* No support for intel_bts or intel_pt so disallow them */
+       if (!strcmp(name, "intel_bts") || !strcmp(name, "intel_pt"))
+               return NULL;
+
         /*
          * The pmu data we store & need consists of the pmu
          * type value and format definitions. Load both right
@@ -579,6 +590,38 @@ static int pmu_resolve_param_term(struct parse_events_term *term,
         return -1;
  }
  
+static char *formats_error_string(struct list_head *formats)
+{
+       struct perf_pmu_format *format;
+       char *err, *str;
+       static const char *static_terms = "config,config1,config2,name,period,branch_type\n";
+       unsigned i = 0;
+
+       /* Build the help text; note asprintf() fails with -1, never 0 */
+       if (asprintf(&str, "valid terms:") < 0)
+               return NULL;
+       /* sysfs exported terms; err keeps the previous string until freed */
+       list_for_each_entry(format, formats, list) {
+               char c = i++ ? ',' : ' ';
+
+               err = str;
+               if (asprintf(&str, "%s%c%s", err, c, format->name) < 0)
+                       goto fail;
+               free(err);
+       }
+
+       /* static terms */
+       err = str;
+       if (asprintf(&str, "%s,%s", err, static_terms) < 0)
+               goto fail;
+
+       free(err);
+       return str;     /* heap string, to be freed by the caller */
+fail:
+       free(err);      /* str is undefined after a failed asprintf() */
+       return NULL;
+}
+
  /*
   * Setup one of config[12] attr members based on the
   * user input data - term parameter.
@@ -587,7 +630,7 @@ static int pmu_config_term(struct list_head *formats,
                            struct perf_event_attr *attr,
                            struct parse_events_term *term,
                            struct list_head *head_terms,
-                          bool zero)
+                          bool zero, struct parse_events_error *err)
  {
         struct perf_pmu_format *format;
         __u64 *vp;
@@ -611,6 +654,11 @@ static int pmu_config_term(struct list_head *formats,
         if (!format) {
                 if (verbose)
                         printf("Invalid event/parameter '%s'\n", term->config);
+               if (err) {
+                       err->idx  = term->err_term;
+                       err->str  = strdup("unknown term");
+                       err->help = formats_error_string(formats);
+               }
                 return -EINVAL;
         }
  
@@ -636,9 +684,14 @@ static int pmu_config_term(struct list_head *formats,
                 val = term->val.num;
         else if (term->type_val == PARSE_EVENTS__TERM_TYPE_STR) {
                 if (strcmp(term->val.str, "?")) {
-                       if (verbose)
+                       if (verbose) {
                                 pr_info("Invalid sysfs entry %s=%s\n",
                                                 term->config, term->val.str);
+                       }
+                       if (err) {
+                               err->idx = term->err_val;
+                               err->str = strdup("expected numeric value");
+                       }
                         return -EINVAL;
                 }
  
@@ -654,12 +707,13 @@ static int pmu_config_term(struct list_head *formats,
  int perf_pmu__config_terms(struct list_head *formats,
                            struct perf_event_attr *attr,
                            struct list_head *head_terms,
-                          bool zero)
+                          bool zero, struct parse_events_error *err)
  {
         struct parse_events_term *term;
  
         list_for_each_entry(term, head_terms, list) {
-               if (pmu_config_term(formats, attr, term, head_terms, zero))
+               if (pmu_config_term(formats, attr, term, head_terms,
+                                   zero, err))
                         return -EINVAL;
         }
  
@@ -672,12 +726,14 @@ int perf_pmu__config_terms(struct list_head *formats,
   * 2) pmu format definitions - specified by pmu parameter
   */
  int perf_pmu__config(struct perf_pmu *pmu, struct perf_event_attr *attr,
-                    struct list_head *head_terms)
+                    struct list_head *head_terms,
+                    struct parse_events_error *err)
  {
         bool zero = !!pmu->default_config;
  
         attr->type = pmu->type;
-       return perf_pmu__config_terms(&pmu->format, attr, head_terms, zero);
+       return perf_pmu__config_terms(&pmu->format, attr, head_terms,
+                                     zero, err);
  }
  
  static struct perf_pmu_alias *pmu_find_alias(struct perf_pmu *pmu,
diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h

index 6b1249fbdb5f3a736c7c55a4005bf2c679addfa7..7b9c8cf8ae3e590578abb0f71a41739cb8fc968a 100644 (file)
--- a/tools/perf/util/pmu.h
+++ b/tools/perf/util/pmu.h
@@ -4,6 +4,7 @@
  #include <linux/bitmap.h>
  #include <linux/perf_event.h>
  #include <stdbool.h>
+#include "parse-events.h"
  
  enum {
         PERF_PMU_FORMAT_VALUE_CONFIG,
@@ -47,11 +48,12 @@ struct perf_pmu_alias {
  
  struct perf_pmu *perf_pmu__find(const char *name);
  int perf_pmu__config(struct perf_pmu *pmu, struct perf_event_attr *attr,
-                    struct list_head *head_terms);
+                    struct list_head *head_terms,
+                    struct parse_events_error *error);
  int perf_pmu__config_terms(struct list_head *formats,
                            struct perf_event_attr *attr,
                            struct list_head *head_terms,
-                          bool zero);
+                          bool zero, struct parse_events_error *error);
  int perf_pmu__check_alias(struct perf_pmu *pmu, struct list_head *head_terms,
                           struct perf_pmu_info *info);
  struct list_head *perf_pmu__alias(struct perf_pmu *pmu,
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c

index d05b77cf35f77051354b9d08acc035cf4575dd5b..076527b639bdbcab38b4e196f1d388352f7e22c1 100644 (file)
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -51,6 +51,7 @@
  #define PERFPROBE_GROUP "probe"
  
  bool probe_event_dry_run;      /* Dry run flag */
+struct probe_conf probe_conf;
  
  #define semantic_error(msg ...) pr_err("Semantic error :" msg)
  
@@ -161,18 +162,18 @@ static u64 kernel_get_symbol_address_by_name(const char *name, bool reloc)
  
  static struct map *kernel_get_module_map(const char *module)
  {
-       struct rb_node *nd;
         struct map_groups *grp = &host_machine->kmaps;
+       struct maps *maps = &grp->maps[MAP__FUNCTION];
+       struct map *pos;
  
         /* A file path -- this is an offline module */
         if (module && strchr(module, '/'))
-               return machine__new_module(host_machine, 0, module);
+               return machine__findnew_module_map(host_machine, 0, module);
  
         if (!module)
                 module = "kernel";
  
-       for (nd = rb_first(&grp->maps[MAP__FUNCTION]); nd; nd = rb_next(nd)) {
-               struct map *pos = rb_entry(nd, struct map, rb_node);
+       for (pos = maps__first(maps); pos; pos = map__next(pos)) {
                 if (strncmp(pos->dso->short_name + 1, module,
                             pos->dso->short_name_len - 2) == 0) {
                         return pos;
@@ -194,52 +195,11 @@ static void put_target_map(struct map *map, bool user)
  {
         if (map && user) {
                 /* Only the user map needs to be released */
-               dso__delete(map->dso);
-               map__delete(map);
+               map__put(map);
         }
  }
  
  
-static struct dso *kernel_get_module_dso(const char *module)
-{
-       struct dso *dso;
-       struct map *map;
-       const char *vmlinux_name;
-
-       if (module) {
-               list_for_each_entry(dso, &host_machine->kernel_dsos.head,
-                                   node) {
-                       if (strncmp(dso->short_name + 1, module,
-                                   dso->short_name_len - 2) == 0)
-                               goto found;
-               }
-               pr_debug("Failed to find module %s.\n", module);
-               return NULL;
-       }
-
-       map = host_machine->vmlinux_maps[MAP__FUNCTION];
-       dso = map->dso;
-
-       vmlinux_name = symbol_conf.vmlinux_name;
-       if (vmlinux_name) {
-               if (dso__load_vmlinux(dso, map, vmlinux_name, false, NULL) <= 0)
-                       return NULL;
-       } else {
-               if (dso__load_vmlinux_path(dso, map, NULL) <= 0) {
-                       pr_debug("Failed to load kernel map.\n");
-                       return NULL;
-               }
-       }
-found:
-       return dso;
-}
-
-const char *kernel_get_module_path(const char *module)
-{
-       struct dso *dso = kernel_get_module_dso(module);
-       return (dso) ? dso->long_name : NULL;
-}
-
  static int convert_exec_to_group(const char *exec, char **result)
  {
         char *ptr1, *ptr2, *exec_copy;
@@ -286,7 +246,55 @@ static void clear_probe_trace_events(struct probe_trace_event *tevs, int ntevs)
                 clear_probe_trace_event(tevs + i);
  }
  
+static bool kprobe_blacklist__listed(unsigned long address);
+static bool kprobe_warn_out_range(const char *symbol, unsigned long address)
+{
+       /* Get the address of _etext for checking non-probable text symbol */
+       if (kernel_get_symbol_address_by_name("_etext", false) < address)
+               pr_warning("%s is out of .text, skip it.\n", symbol);
+       else if (kprobe_blacklist__listed(address))
+               pr_warning("%s is blacklisted function, skip it.\n", symbol);
+       else
+               return false;
+
+       return true;
+}
+
  #ifdef HAVE_DWARF_SUPPORT
+
+static int kernel_get_module_dso(const char *module, struct dso **pdso)
+{
+       struct dso *dso;
+       struct map *map;
+       const char *vmlinux_name;
+       int ret = 0;
+
+       if (module) {
+               list_for_each_entry(dso, &host_machine->dsos.head, node) {
+                       if (!dso->kernel)
+                               continue;
+                       if (strncmp(dso->short_name + 1, module,
+                                   dso->short_name_len - 2) == 0)
+                               goto found;
+               }
+               pr_debug("Failed to find module %s.\n", module);
+               return -ENOENT;
+       }
+
+       map = host_machine->vmlinux_maps[MAP__FUNCTION];
+       dso = map->dso;
+
+       vmlinux_name = symbol_conf.vmlinux_name;
+       dso->load_errno = 0;
+       if (vmlinux_name)
+               ret = dso__load_vmlinux(dso, map, vmlinux_name, false, NULL);
+       else
+               ret = dso__load_vmlinux_path(dso, map, NULL);
+found:
+       *pdso = dso;
+       return ret;
+}
+
  /*
   * Some binaries like glibc have special symbols which are on the symbol
   * table, but not in the debuginfo. If we can find the address of the
@@ -344,15 +352,14 @@ out:
  
  static int get_alternative_probe_event(struct debuginfo *dinfo,
                                        struct perf_probe_event *pev,
-                                      struct perf_probe_point *tmp,
-                                      const char *target)
+                                      struct perf_probe_point *tmp)
  {
         int ret;
  
         memcpy(tmp, &pev->point, sizeof(*tmp));
         memset(&pev->point, 0, sizeof(pev->point));
         ret = find_alternative_probe_point(dinfo, tmp, &pev->point,
-                                          target, pev->uprobes);
+                                          pev->target, pev->uprobes);
         if (ret < 0)
                 memcpy(&pev->point, tmp, sizeof(*tmp));
  
@@ -390,16 +397,25 @@ static int get_alternative_line_range(struct debuginfo *dinfo,
  static struct debuginfo *open_debuginfo(const char *module, bool silent)
  {
         const char *path = module;
-       struct debuginfo *ret;
+       char reason[STRERR_BUFSIZE];
+       struct debuginfo *ret = NULL;
+       struct dso *dso = NULL;
+       int err;
  
         if (!module || !strchr(module, '/')) {
-               path = kernel_get_module_path(module);
-               if (!path) {
+               err = kernel_get_module_dso(module, &dso);
+               if (err < 0) {
+                       if (!dso || dso->load_errno == 0) {
+                               if (!strerror_r(-err, reason, STRERR_BUFSIZE))
+                                       strcpy(reason, "(unknown)");
+                       } else
+                               dso__strerror_load(dso, reason, STRERR_BUFSIZE);
                         if (!silent)
-                               pr_err("Failed to find path of %s module.\n",
-                                      module ?: "kernel");
+                               pr_err("Failed to find the path for %s: %s\n",
+                                       module ?: "kernel", reason);
                         return NULL;
                 }
+               path = dso->long_name;
         }
         ret = debuginfo__new(path);
         if (!ret && !silent) {
@@ -413,6 +429,41 @@ static struct debuginfo *open_debuginfo(const char *module, bool silent)
         return ret;
  }
  
+/* For caching the last debuginfo */
+static struct debuginfo *debuginfo_cache;
+static char *debuginfo_cache_path;
+
+static struct debuginfo *debuginfo_cache__open(const char *module, bool silent)
+{
+       /* Hit: same module path, or the kernel (NULL module) asked again */
+       if ((debuginfo_cache_path && module &&
+            !strcmp(debuginfo_cache_path, module)) ||
+           (!debuginfo_cache_path && !module && debuginfo_cache))
+               goto out;
+
+       /* Miss: drop the stale entry first so nothing leaks or dangles */
+       debuginfo__delete(debuginfo_cache);
+       debuginfo_cache = NULL;
+       zfree(&debuginfo_cache_path);
+       if (module) {
+               debuginfo_cache_path = strdup(module);
+               if (!debuginfo_cache_path)
+                       goto out;       /* returns NULL, cache left empty */
+       }
+       debuginfo_cache = open_debuginfo(module, silent);
+       if (!debuginfo_cache)
+               zfree(&debuginfo_cache_path);
+out:
+       return debuginfo_cache;
+}
+
+static void debuginfo_cache__exit(void)
+{
+       debuginfo__delete(debuginfo_cache);
+       debuginfo_cache = NULL;
+       zfree(&debuginfo_cache_path);
+}
+
  
  static int get_text_start_address(const char *exec, unsigned long *address)
  {
@@ -474,12 +525,11 @@ static int find_perf_probe_point_from_dwarf(struct probe_trace_point *tp,
         pr_debug("try to find information at %" PRIx64 " in %s\n", addr,
                  tp->module ? : "kernel");
  
-       dinfo = open_debuginfo(tp->module, verbose == 0);
-       if (dinfo) {
+       dinfo = debuginfo_cache__open(tp->module, verbose == 0);
+       if (dinfo)
                 ret = debuginfo__find_probe_point(dinfo,
                                                  (unsigned long)addr, pp);
-               debuginfo__delete(dinfo);
-       } else
+       else
                 ret = -ENOENT;
  
         if (ret > 0) {
@@ -558,7 +608,7 @@ static int post_process_probe_trace_events(struct probe_trace_event *tevs,
  {
         struct ref_reloc_sym *reloc_sym;
         char *tmp;
-       int i;
+       int i, skipped = 0;
  
         if (uprobe)
                 return add_exec_to_probe_trace_events(tevs, ntevs, module);
@@ -574,31 +624,40 @@ static int post_process_probe_trace_events(struct probe_trace_event *tevs,
         }
  
         for (i = 0; i < ntevs; i++) {
-               if (tevs[i].point.address && !tevs[i].point.retprobe) {
+               if (!tevs[i].point.address || tevs[i].point.retprobe)
+                       continue;
+               /* If we found a wrong one, mark it by NULL symbol */
+               if (kprobe_warn_out_range(tevs[i].point.symbol,
+                                         tevs[i].point.address)) {
+                       tmp = NULL;
+                       skipped++;
+               } else {
                         tmp = strdup(reloc_sym->name);
                         if (!tmp)
                                 return -ENOMEM;
-                       free(tevs[i].point.symbol);
-                       tevs[i].point.symbol = tmp;
-                       tevs[i].point.offset = tevs[i].point.address -
-                                              reloc_sym->unrelocated_addr;
                 }
+               /* If we have no realname, use symbol for it */
+               if (!tevs[i].point.realname)
+                       tevs[i].point.realname = tevs[i].point.symbol;
+               else
+                       free(tevs[i].point.symbol);
+               tevs[i].point.symbol = tmp;
+               tevs[i].point.offset = tevs[i].point.address -
+                                      reloc_sym->unrelocated_addr;
         }
-       return 0;
+       return skipped;
  }
  
  /* Try to find perf_probe_event with debuginfo */
  static int try_to_find_probe_trace_events(struct perf_probe_event *pev,
-                                         struct probe_trace_event **tevs,
-                                         int max_tevs, const char *target)
+                                         struct probe_trace_event **tevs)
  {
         bool need_dwarf = perf_probe_event_need_dwarf(pev);
         struct perf_probe_point tmp;
         struct debuginfo *dinfo;
         int ntevs, ret = 0;
  
-       dinfo = open_debuginfo(target, !need_dwarf);
-
+       dinfo = open_debuginfo(pev->target, !need_dwarf);
         if (!dinfo) {
                 if (need_dwarf)
                         return -ENOENT;
@@ -608,13 +667,12 @@ static int try_to_find_probe_trace_events(struct perf_probe_event *pev,
  
         pr_debug("Try to find probe point from debuginfo.\n");
         /* Searching trace events corresponding to a probe event */
-       ntevs = debuginfo__find_trace_events(dinfo, pev, tevs, max_tevs);
+       ntevs = debuginfo__find_trace_events(dinfo, pev, tevs);
  
         if (ntevs == 0) {  /* Not found, retry with an alternative */
-               ret = get_alternative_probe_event(dinfo, pev, &tmp, target);
+               ret = get_alternative_probe_event(dinfo, pev, &tmp);
                 if (!ret) {
-                       ntevs = debuginfo__find_trace_events(dinfo, pev,
-                                                            tevs, max_tevs);
+                       ntevs = debuginfo__find_trace_events(dinfo, pev, tevs);
                         /*
                          * Write back to the original probe_event for
                          * setting appropriate (user given) event name
@@ -629,12 +687,15 @@ static int try_to_find_probe_trace_events(struct perf_probe_event *pev,
         if (ntevs > 0) {        /* Succeeded to find trace events */
                 pr_debug("Found %d probe_trace_events.\n", ntevs);
                 ret = post_process_probe_trace_events(*tevs, ntevs,
-                                                       target, pev->uprobes);
-               if (ret < 0) {
+                                               pev->target, pev->uprobes);
+               if (ret < 0 || ret == ntevs) {
                         clear_probe_trace_events(*tevs, ntevs);
                         zfree(tevs);
                 }
-               return ret < 0 ? ret : ntevs;
+               if (ret != ntevs)
+                       return ret < 0 ? ret : ntevs;
+               ntevs = 0;
+               /* Fall through */
         }
  
         if (ntevs == 0) {       /* No error but failed to find probe point. */
@@ -809,8 +870,7 @@ int show_line_range(struct line_range *lr, const char *module, bool user)
  
  static int show_available_vars_at(struct debuginfo *dinfo,
                                   struct perf_probe_event *pev,
-                                 int max_vls, struct strfilter *_filter,
-                                 bool externs, const char *target)
+                                 struct strfilter *_filter)
  {
         char *buf;
         int ret, i, nvars;
@@ -824,13 +884,12 @@ static int show_available_vars_at(struct debuginfo *dinfo,
                 return -EINVAL;
         pr_debug("Searching variables at %s\n", buf);
  
-       ret = debuginfo__find_available_vars_at(dinfo, pev, &vls,
-                                               max_vls, externs);
+       ret = debuginfo__find_available_vars_at(dinfo, pev, &vls);
         if (!ret) {  /* Not found, retry with an alternative */
-               ret = get_alternative_probe_event(dinfo, pev, &tmp, target);
+               ret = get_alternative_probe_event(dinfo, pev, &tmp);
                 if (!ret) {
                         ret = debuginfo__find_available_vars_at(dinfo, pev,
-                                               &vls, max_vls, externs);
+                                                               &vls);
                         /* Release the old probe_point */
                         clear_perf_probe_point(&tmp);
                 }
@@ -877,8 +936,7 @@ end:
  
  /* Show available variables on given probe point */
  int show_available_vars(struct perf_probe_event *pevs, int npevs,
-                       int max_vls, const char *module,
-                       struct strfilter *_filter, bool externs)
+                       struct strfilter *_filter)
  {
         int i, ret = 0;
         struct debuginfo *dinfo;
@@ -887,7 +945,7 @@ int show_available_vars(struct perf_probe_event *pevs, int npevs,
         if (ret < 0)
                 return ret;
  
-       dinfo = open_debuginfo(module, false);
+       dinfo = open_debuginfo(pevs->target, false);
         if (!dinfo) {
                 ret = -ENOENT;
                 goto out;
@@ -896,8 +954,7 @@ int show_available_vars(struct perf_probe_event *pevs, int npevs,
         setup_pager();
  
         for (i = 0; i < npevs && ret >= 0; i++)
-               ret = show_available_vars_at(dinfo, &pevs[i], max_vls, _filter,
-                                            externs, module);
+               ret = show_available_vars_at(dinfo, &pevs[i], _filter);
  
         debuginfo__delete(dinfo);
  out:
@@ -907,6 +964,10 @@ out:
  
  #else  /* !HAVE_DWARF_SUPPORT */
  
+static void debuginfo_cache__exit(void)
+{
+}
+
  static int
  find_perf_probe_point_from_dwarf(struct probe_trace_point *tp __maybe_unused,
                                  struct perf_probe_point *pp __maybe_unused,
@@ -916,9 +977,7 @@ find_perf_probe_point_from_dwarf(struct probe_trace_point *tp __maybe_unused,
  }
  
  static int try_to_find_probe_trace_events(struct perf_probe_event *pev,
-                               struct probe_trace_event **tevs __maybe_unused,
-                               int max_tevs __maybe_unused,
-                               const char *target __maybe_unused)
+                               struct probe_trace_event **tevs __maybe_unused)
  {
         if (perf_probe_event_need_dwarf(pev)) {
                 pr_warning("Debuginfo-analysis is not supported.\n");
@@ -937,10 +996,8 @@ int show_line_range(struct line_range *lr __maybe_unused,
  }
  
  int show_available_vars(struct perf_probe_event *pevs __maybe_unused,
-                       int npevs __maybe_unused, int max_vls __maybe_unused,
-                       const char *module __maybe_unused,
-                       struct strfilter *filter __maybe_unused,
-                       bool externs __maybe_unused)
+                       int npevs __maybe_unused,
+                       struct strfilter *filter __maybe_unused)
  {
         pr_warning("Debuginfo-analysis is not supported.\n");
         return -ENOSYS;
@@ -980,6 +1037,18 @@ static int parse_line_num(char **ptr, int *val, const char *what)
         return 0;
  }
  
+/* True if name is a C identifier, i.e. good for event, group or function */
+static bool is_c_func_name(const char *name)
+{
+       if (!isalpha(*name) && *name != '_')
+               return false;
+       while (*++name != '\0') {
+               if (!isalpha(*name) && !isdigit(*name) && *name != '_')
+                       return false;
+       }
+       return true;
+}
+
  /*
   * Stuff 'lr' according to the line range described by 'arg'.
   * The line range syntax is described by:
@@ -1048,10 +1117,15 @@ int parse_line_range_desc(const char *arg, struct line_range *lr)
                         goto err;
                 }
                 lr->function = name;
-       } else if (strchr(name, '.'))
+       } else if (strchr(name, '/') || strchr(name, '.'))
                 lr->file = name;
-       else
+       else if (is_c_func_name(name))/* We reuse it for checking funcname */
                 lr->function = name;
+       else {  /* Invalid name */
+               semantic_error("'%s' is not a valid function name.\n", name);
+               err = -EINVAL;
+               goto err;
+       }
  
         return 0;
  err:
@@ -1059,24 +1133,13 @@ err:
         return err;
  }
  
-/* Check the name is good for event/group */
-static bool check_event_name(const char *name)
-{
-       if (!isalpha(*name) && *name != '_')
-               return false;
-       while (*++name != '\0') {
-               if (!isalpha(*name) && !isdigit(*name) && *name != '_')
-                       return false;
-       }
-       return true;
-}
-
  /* Parse probepoint definition. */
  static int parse_perf_probe_point(char *arg, struct perf_probe_event *pev)
  {
         struct perf_probe_point *pp = &pev->point;
         char *ptr, *tmp;
         char c, nc = 0;
+       bool file_spec = false;
         /*
          * <Syntax>
          * perf probe [EVENT=]SRC[:LN|;PTN]
@@ -1095,7 +1158,7 @@ static int parse_perf_probe_point(char *arg, struct perf_probe_event *pev)
                         semantic_error("Group name is not supported yet.\n");
                         return -ENOTSUP;
                 }
-               if (!check_event_name(arg)) {
+               if (!is_c_func_name(arg)) {
                         semantic_error("%s is bad for event name -it must "
                                        "follow C symbol-naming rule.\n", arg);
                         return -EINVAL;
@@ -1107,6 +1170,23 @@ static int parse_perf_probe_point(char *arg, struct perf_probe_event *pev)
                 arg = tmp;
         }
  
+       /*
+        * Check arg is function or file name and copy it.
+        *
+        * We consider arg to be a file spec if and only if it satisfies
+        * all of the following criteria:
+        * - it does not include any of "+@%",
+        * - it includes one of ":;", and
+        * - it has a period '.' in the name.
+        *
+        * Otherwise, we consider arg to be a function specification.
+        */
+       if (!strpbrk(arg, "+@%") && (ptr = strpbrk(arg, ";:")) != NULL) {
+               /* This is a file spec if it includes a '.' before ; or : */
+               if (memchr(arg, '.', ptr - arg))
+                       file_spec = true;
+       }
+
         ptr = strpbrk(arg, ";:+@%");
         if (ptr) {
                 nc = *ptr;
@@ -1117,10 +1197,9 @@ static int parse_perf_probe_point(char *arg, struct perf_probe_event *pev)
         if (tmp == NULL)
                 return -ENOMEM;
  
-       /* Check arg is function or file and copy it */
-       if (strchr(tmp, '.'))   /* File */
+       if (file_spec)
                 pp->file = tmp;
-       else                    /* Function */
+       else
                 pp->function = tmp;
  
         /* Parse other options */
@@ -1762,8 +1841,7 @@ static int find_perf_probe_point_from_map(struct probe_trace_point *tp,
  
  out:
         if (map && !is_kprobe) {
-               dso__delete(map->dso);
-               map__delete(map);
+               map__put(map);
         }
  
         return ret;
@@ -1877,6 +1955,7 @@ static void clear_probe_trace_event(struct probe_trace_event *tev)
         free(tev->event);
         free(tev->group);
         free(tev->point.symbol);
+       free(tev->point.realname);
         free(tev->point.module);
         for (i = 0; i < tev->nargs; i++) {
                 free(tev->args[i].name);
@@ -1954,7 +2033,7 @@ static int open_probe_events(const char *trace_file, bool readwrite)
         if (ret >= 0) {
                 pr_debug("Opening %s write=%d\n", buf, readwrite);
                 if (readwrite && !probe_event_dry_run)
-                       ret = open(buf, O_RDWR, O_APPEND);
+                       ret = open(buf, O_RDWR | O_APPEND, 0);
                 else
                         ret = open(buf, O_RDONLY, 0);
  
@@ -2095,9 +2174,31 @@ kprobe_blacklist__find_by_address(struct list_head *blacklist,
         return NULL;
  }
  
-/* Show an event */
-static int show_perf_probe_event(struct perf_probe_event *pev,
-                                const char *module)
+static LIST_HEAD(kprobe_blacklist);
+
+static void kprobe_blacklist__init(void)
+{
+       if (!list_empty(&kprobe_blacklist))
+               return;
+
+       if (kprobe_blacklist__load(&kprobe_blacklist) < 0)
+               pr_debug("No kprobe blacklist support, ignored\n");
+}
+
+static void kprobe_blacklist__release(void)
+{
+       kprobe_blacklist__delete(&kprobe_blacklist);
+}
+
+static bool kprobe_blacklist__listed(unsigned long address)
+{
+       return !!kprobe_blacklist__find_by_address(&kprobe_blacklist, address);
+}
+
+static int perf_probe_event__sprintf(const char *group, const char *event,
+                                    struct perf_probe_event *pev,
+                                    const char *module,
+                                    struct strbuf *result)
  {
         int i, ret;
         char buf[128];
@@ -2108,30 +2209,67 @@ static int show_perf_probe_event(struct perf_probe_event *pev,
         if (!place)
                 return -EINVAL;
  
-       ret = e_snprintf(buf, 128, "%s:%s", pev->group, pev->event);
+       ret = e_snprintf(buf, 128, "%s:%s", group, event);
         if (ret < 0)
-               return ret;
+               goto out;
  
-       pr_info("  %-20s (on %s", buf, place);
+       strbuf_addf(result, "  %-20s (on %s", buf, place);
         if (module)
-               pr_info(" in %s", module);
+               strbuf_addf(result, " in %s", module);
  
         if (pev->nargs > 0) {
-               pr_info(" with");
+               strbuf_addstr(result, " with");
                 for (i = 0; i < pev->nargs; i++) {
                         ret = synthesize_perf_probe_arg(&pev->args[i],
                                                         buf, 128);
                         if (ret < 0)
-                               break;
-                       pr_info(" %s", buf);
+                               goto out;
+                       strbuf_addf(result, " %s", buf);
                 }
         }
-       pr_info(")\n");
+       strbuf_addch(result, ')');
+out:
         free(place);
         return ret;
  }
  
-static int __show_perf_probe_events(int fd, bool is_kprobe)
+/* Show an event */
+static int show_perf_probe_event(const char *group, const char *event,
+                                struct perf_probe_event *pev,
+                                const char *module, bool use_stdout)
+{
+       struct strbuf buf = STRBUF_INIT;
+       int ret;
+
+       ret = perf_probe_event__sprintf(group, event, pev, module, &buf);
+       if (ret >= 0) {
+               if (use_stdout)
+                       printf("%s\n", buf.buf);
+               else
+                       pr_info("%s\n", buf.buf);
+       }
+       strbuf_release(&buf);
+
+       return ret;
+}
+
+static bool filter_probe_trace_event(struct probe_trace_event *tev,
+                                    struct strfilter *filter)
+{
+       char tmp[128];
+
+       /* At first, check the event name itself */
+       if (strfilter__compare(filter, tev->event))
+               return true;
+
+       /* Next, check the combination of name and group */
+       if (e_snprintf(tmp, 128, "%s:%s", tev->group, tev->event) < 0)
+               return false;
+       return strfilter__compare(filter, tmp);
+}
+
+static int __show_perf_probe_events(int fd, bool is_kprobe,
+                                   struct strfilter *filter)
  {
         int ret = 0;
         struct probe_trace_event tev;
@@ -2149,24 +2287,31 @@ static int __show_perf_probe_events(int fd, bool is_kprobe)
         strlist__for_each(ent, rawlist) {
                 ret = parse_probe_trace_command(ent->s, &tev);
                 if (ret >= 0) {
+                       if (!filter_probe_trace_event(&tev, filter))
+                               goto next;
                         ret = convert_to_perf_probe_event(&tev, &pev,
                                                                 is_kprobe);
-                       if (ret >= 0)
-                               ret = show_perf_probe_event(&pev,
-                                                           tev.point.module);
+                       if (ret < 0)
+                               goto next;
+                       ret = show_perf_probe_event(pev.group, pev.event,
+                                                   &pev, tev.point.module,
+                                                   true);
                 }
+next:
                 clear_perf_probe_event(&pev);
                 clear_probe_trace_event(&tev);
                 if (ret < 0)
                         break;
         }
         strlist__delete(rawlist);
+       /* Cleanup cached debuginfo if needed */
+       debuginfo_cache__exit();
  
         return ret;
  }
  
  /* List up current perf-probe events */
-int show_perf_probe_events(void)
+int show_perf_probe_events(struct strfilter *filter)
  {
         int kp_fd, up_fd, ret;
  
@@ -2178,7 +2323,7 @@ int show_perf_probe_events(void)
  
         kp_fd = open_kprobe_events(false);
         if (kp_fd >= 0) {
-               ret = __show_perf_probe_events(kp_fd, true);
+               ret = __show_perf_probe_events(kp_fd, true, filter);
                 close(kp_fd);
                 if (ret < 0)
                         goto out;
@@ -2192,7 +2337,7 @@ int show_perf_probe_events(void)
         }
  
         if (up_fd >= 0) {
-               ret = __show_perf_probe_events(up_fd, false);
+               ret = __show_perf_probe_events(up_fd, false, filter);
                 close(up_fd);
         }
  out:
@@ -2266,6 +2411,10 @@ static int get_new_event_name(char *buf, size_t len, const char *base,
                               struct strlist *namelist, bool allow_suffix)
  {
         int i, ret;
+       char *p;
+
+       if (*base == '.')
+               base++;
  
         /* Try no suffix */
         ret = e_snprintf(buf, len, "%s", base);
@@ -2273,6 +2422,10 @@ static int get_new_event_name(char *buf, size_t len, const char *base,
                 pr_debug("snprintf() failed: %d\n", ret);
                 return ret;
         }
+       /* Cut off the postfixes (e.g. .const, .isra) */
+       p = strchr(buf, '.');
+       if (p && p != buf)
+               *p = '\0';
         if (!strlist__has_entry(namelist, buf))
                 return 0;
  
@@ -2328,10 +2481,9 @@ static int __add_probe_trace_events(struct perf_probe_event *pev,
         int i, fd, ret;
         struct probe_trace_event *tev = NULL;
         char buf[64];
-       const char *event, *group;
+       const char *event = NULL, *group = NULL;
         struct strlist *namelist;
-       LIST_HEAD(blacklist);
-       struct kprobe_blacklist_node *node;
+       bool safename;
  
         if (pev->uprobes)
                 fd = open_uprobe_events(true);
@@ -2347,34 +2499,26 @@ static int __add_probe_trace_events(struct perf_probe_event *pev,
         namelist = get_probe_trace_event_names(fd, false);
         if (!namelist) {
                 pr_debug("Failed to get current event list.\n");
-               return -EIO;
-       }
-       /* Get kprobe blacklist if exists */
-       if (!pev->uprobes) {
-               ret = kprobe_blacklist__load(&blacklist);
-               if (ret < 0)
-                       pr_debug("No kprobe blacklist support, ignored\n");
+               ret = -ENOMEM;
+               goto close_out;
         }
  
+       safename = (pev->point.function && !strisglob(pev->point.function));
         ret = 0;
         pr_info("Added new event%s\n", (ntevs > 1) ? "s:" : ":");
         for (i = 0; i < ntevs; i++) {
                 tev = &tevs[i];
-               /* Ensure that the address is NOT blacklisted */
-               node = kprobe_blacklist__find_by_address(&blacklist,
-                                                        tev->point.address);
-               if (node) {
-                       pr_warning("Warning: Skipped probing on blacklisted function: %s\n", node->symbol);
+               /* Skip if the symbol is out of .text or blacklisted */
+               if (!tev->point.symbol)
                         continue;
-               }
  
                 if (pev->event)
                         event = pev->event;
                 else
-                       if (pev->point.function)
+                       if (safename)
                                 event = pev->point.function;
                         else
-                               event = tev->point.symbol;
+                               event = tev->point.realname;
                 if (pev->group)
                         group = pev->group;
                 else
@@ -2399,15 +2543,12 @@ static int __add_probe_trace_events(struct perf_probe_event *pev,
                 /* Add added event name to namelist */
                 strlist__add(namelist, event);
  
-               /* Trick here - save current event/group */
-               event = pev->event;
-               group = pev->group;
-               pev->event = tev->event;
-               pev->group = tev->group;
-               show_perf_probe_event(pev, tev->point.module);
-               /* Trick here - restore current event/group */
-               pev->event = (char *)event;
-               pev->group = (char *)group;
+               /* We use tev's name for showing new events */
+               show_perf_probe_event(tev->group, tev->event, pev,
+                                     tev->point.module, false);
+               /* Save the last valid name */
+               event = tev->event;
+               group = tev->group;
  
                 /*
                  * Probes after the first probe which comes from same
@@ -2421,26 +2562,34 @@ static int __add_probe_trace_events(struct perf_probe_event *pev,
                 warn_uprobe_event_compat(tev);
  
         /* Note that it is possible to skip all events because of blacklist */
-       if (ret >= 0 && tev->event) {
+       if (ret >= 0 && event) {
                 /* Show how to use the event. */
                 pr_info("\nYou can now use it in all perf tools, such as:\n\n");
-               pr_info("\tperf record -e %s:%s -aR sleep 1\n\n", tev->group,
-                        tev->event);
+               pr_info("\tperf record -e %s:%s -aR sleep 1\n\n", group, event);
         }
  
-       kprobe_blacklist__delete(&blacklist);
         strlist__delete(namelist);
+close_out:
         close(fd);
         return ret;
  }
  
-static int find_probe_functions(struct map *map, char *name)
+static int find_probe_functions(struct map *map, char *name,
+                               struct symbol **syms)
  {
         int found = 0;
         struct symbol *sym;
+       struct rb_node *tmp;
+
+       if (map__load(map, NULL) < 0)
+               return 0;
  
-       map__for_each_symbol_by_name(map, name, sym) {
-               found++;
+       map__for_each_symbol(map, sym, tmp) {
+               if (strglobmatch(sym->name, name)) {
+                       found++;
+                       if (syms && found <= probe_conf.max_probes)
+                               syms[found - 1] = sym;
+               }
         }
  
         return found;
@@ -2449,42 +2598,52 @@ static int find_probe_functions(struct map *map, char *name)
  #define strdup_or_goto(str, label)     \
         ({ char *__p = strdup(str); if (!__p) goto label; __p; })
  
+void __weak arch__fix_tev_from_maps(struct perf_probe_event *pev __maybe_unused,
+                               struct probe_trace_event *tev __maybe_unused,
+                               struct map *map __maybe_unused) { }
+
  /*
   * Find probe function addresses from map.
   * Return an error or the number of found probe_trace_event
   */
  static int find_probe_trace_events_from_map(struct perf_probe_event *pev,
-                                           struct probe_trace_event **tevs,
-                                           int max_tevs, const char *target)
+                                           struct probe_trace_event **tevs)
  {
         struct map *map = NULL;
         struct ref_reloc_sym *reloc_sym = NULL;
         struct symbol *sym;
+       struct symbol **syms = NULL;
         struct probe_trace_event *tev;
         struct perf_probe_point *pp = &pev->point;
         struct probe_trace_point *tp;
         int num_matched_functions;
-       int ret, i;
+       int ret, i, j, skipped = 0;
  
-       map = get_target_map(target, pev->uprobes);
+       map = get_target_map(pev->target, pev->uprobes);
         if (!map) {
                 ret = -EINVAL;
                 goto out;
         }
  
+       syms = malloc(sizeof(struct symbol *) * probe_conf.max_probes);
+       if (!syms) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
         /*
          * Load matched symbols: Since the different local symbols may have
          * same name but different addresses, this lists all the symbols.
          */
-       num_matched_functions = find_probe_functions(map, pp->function);
+       num_matched_functions = find_probe_functions(map, pp->function, syms);
         if (num_matched_functions == 0) {
                 pr_err("Failed to find symbol %s in %s\n", pp->function,
-                       target ? : "kernel");
+                       pev->target ? : "kernel");
                 ret = -ENOENT;
                 goto out;
-       } else if (num_matched_functions > max_tevs) {
+       } else if (num_matched_functions > probe_conf.max_probes) {
                 pr_err("Too many functions matched in %s\n",
-                       target ? : "kernel");
+                       pev->target ? : "kernel");
                 ret = -E2BIG;
                 goto out;
         }
@@ -2507,7 +2666,9 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev,
  
         ret = 0;
  
-       map__for_each_symbol_by_name(map, pp->function, sym) {
+       for (j = 0; j < num_matched_functions; j++) {
+               sym = syms[j];
+
                 tev = (*tevs) + ret;
                 tp = &tev->point;
                 if (ret == num_matched_functions) {
@@ -2524,16 +2685,24 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev,
                 }
                 /* Add one probe point */
                 tp->address = map->unmap_ip(map, sym->start) + pp->offset;
-               if (reloc_sym) {
+               /* If we found a wrong one, mark it with a NULL symbol */
+               if (!pev->uprobes &&
+                   kprobe_warn_out_range(sym->name, tp->address)) {
+                       tp->symbol = NULL;      /* Skip it */
+                       skipped++;
+               } else if (reloc_sym) {
                         tp->symbol = strdup_or_goto(reloc_sym->name, nomem_out);
                         tp->offset = tp->address - reloc_sym->addr;
                 } else {
                         tp->symbol = strdup_or_goto(sym->name, nomem_out);
                         tp->offset = pp->offset;
                 }
+               tp->realname = strdup_or_goto(sym->name, nomem_out);
+
                 tp->retprobe = pp->retprobe;
-               if (target)
-                       tev->point.module = strdup_or_goto(target, nomem_out);
+               if (pev->target)
+                       tev->point.module = strdup_or_goto(pev->target,
+                                                          nomem_out);
                 tev->uprobes = pev->uprobes;
                 tev->nargs = pev->nargs;
                 if (tev->nargs) {
@@ -2555,10 +2724,16 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev,
                                         strdup_or_goto(pev->args[i].type,
                                                         nomem_out);
                 }
+               arch__fix_tev_from_maps(pev, tev, map);
+       }
+       if (ret == skipped) {
+               ret = -ENOENT;
+               goto err_out;
         }
  
  out:
         put_target_map(map, pev->uprobes);
+       free(syms);
         return ret;
  
  nomem_out:
@@ -2569,27 +2744,34 @@ err_out:
         goto out;
  }
  
+bool __weak arch__prefers_symtab(void) { return false; }
+
  static int convert_to_probe_trace_events(struct perf_probe_event *pev,
-                                         struct probe_trace_event **tevs,
-                                         int max_tevs, const char *target)
+                                        struct probe_trace_event **tevs)
  {
         int ret;
  
         if (pev->uprobes && !pev->group) {
                 /* Replace group name if not given */
-               ret = convert_exec_to_group(target, &pev->group);
+               ret = convert_exec_to_group(pev->target, &pev->group);
                 if (ret != 0) {
                         pr_warning("Failed to make a group name.\n");
                         return ret;
                 }
         }
  
+       if (arch__prefers_symtab() && !perf_probe_event_need_dwarf(pev)) {
+               ret = find_probe_trace_events_from_map(pev, tevs);
+               if (ret > 0)
+                       return ret; /* Found in symbol table */
+       }
+
         /* Convert perf_probe_event with debuginfo */
-       ret = try_to_find_probe_trace_events(pev, tevs, max_tevs, target);
+       ret = try_to_find_probe_trace_events(pev, tevs);
         if (ret != 0)
                 return ret;     /* Found in debuginfo or got an error */
  
-       return find_probe_trace_events_from_map(pev, tevs, max_tevs, target);
+       return find_probe_trace_events_from_map(pev, tevs);
  }
  
  struct __event_package {
@@ -2598,8 +2780,7 @@ struct __event_package {
         int                             ntevs;
  };
  
-int add_perf_probe_events(struct perf_probe_event *pevs, int npevs,
-                         int max_tevs, bool force_add)
+int add_perf_probe_events(struct perf_probe_event *pevs, int npevs)
  {
         int i, j, ret;
         struct __event_package *pkgs;
@@ -2619,20 +2800,24 @@ int add_perf_probe_events(struct perf_probe_event *pevs, int npevs,
         /* Loop 1: convert all events */
         for (i = 0; i < npevs; i++) {
                 pkgs[i].pev = &pevs[i];
+               /* Init kprobe blacklist if needed */
+               if (!pkgs[i].pev->uprobes)
+                       kprobe_blacklist__init();
                 /* Convert with or without debuginfo */
                 ret  = convert_to_probe_trace_events(pkgs[i].pev,
-                                                    &pkgs[i].tevs,
-                                                    max_tevs,
-                                                    pkgs[i].pev->target);
+                                                    &pkgs[i].tevs);
                 if (ret < 0)
                         goto end;
                 pkgs[i].ntevs = ret;
         }
+       /* This releases the blacklist only if it was allocated */
+       kprobe_blacklist__release();
  
         /* Loop 2: add all events */
         for (i = 0; i < npevs; i++) {
                 ret = __add_probe_trace_events(pkgs[i].pev, pkgs[i].tevs,
-                                               pkgs[i].ntevs, force_add);
+                                              pkgs[i].ntevs,
+                                              probe_conf.force_add);
                 if (ret < 0)
                         break;
         }
@@ -2684,40 +2869,39 @@ error:
         return ret;
  }
  
-static int del_trace_probe_event(int fd, const char *buf,
-                                                 struct strlist *namelist)
+static int del_trace_probe_events(int fd, struct strfilter *filter,
+                                 struct strlist *namelist)
  {
-       struct str_node *ent, *n;
-       int ret = -1;
+       struct str_node *ent;
+       const char *p;
+       int ret = -ENOENT;
  
-       if (strpbrk(buf, "*?")) { /* Glob-exp */
-               strlist__for_each_safe(ent, n, namelist)
-                       if (strglobmatch(ent->s, buf)) {
-                               ret = __del_trace_probe_event(fd, ent);
-                               if (ret < 0)
-                                       break;
-                               strlist__remove(namelist, ent);
-                       }
-       } else {
-               ent = strlist__find(namelist, buf);
-               if (ent) {
+       if (!namelist)
+               return -ENOENT;
+
+       strlist__for_each(ent, namelist) {
+               p = strchr(ent->s, ':');
+               if ((p && strfilter__compare(filter, p + 1)) ||
+                   strfilter__compare(filter, ent->s)) {
                         ret = __del_trace_probe_event(fd, ent);
-                       if (ret >= 0)
-                               strlist__remove(namelist, ent);
+                       if (ret < 0)
+                               break;
                 }
         }
  
         return ret;
  }
  
-int del_perf_probe_events(struct strlist *dellist)
+int del_perf_probe_events(struct strfilter *filter)
  {
-       int ret = -1, ufd = -1, kfd = -1;
-       char buf[128];
-       const char *group, *event;
-       char *p, *str;
-       struct str_node *ent;
+       int ret, ret2, ufd = -1, kfd = -1;
         struct strlist *namelist = NULL, *unamelist = NULL;
+       char *str = strfilter__string(filter);
+
+       if (!str)
+               return -EINVAL;
+
+       pr_debug("Delete filter: \'%s\'\n", str);
  
         /* Get current event names */
         kfd = open_kprobe_events(true);
@@ -2730,49 +2914,23 @@ int del_perf_probe_events(struct strlist *dellist)
  
         if (kfd < 0 && ufd < 0) {
                 print_both_open_warning(kfd, ufd);
+               ret = kfd;
                 goto error;
         }
  
-       if (namelist == NULL && unamelist == NULL)
+       ret = del_trace_probe_events(kfd, filter, namelist);
+       if (ret < 0 && ret != -ENOENT)
                 goto error;
  
-       strlist__for_each(ent, dellist) {
-               str = strdup(ent->s);
-               if (str == NULL) {
-                       ret = -ENOMEM;
-                       goto error;
-               }
-               pr_debug("Parsing: %s\n", str);
-               p = strchr(str, ':');
-               if (p) {
-                       group = str;
-                       *p = '\0';
-                       event = p + 1;
-               } else {
-                       group = "*";
-                       event = str;
-               }
-
-               ret = e_snprintf(buf, 128, "%s:%s", group, event);
-               if (ret < 0) {
-                       pr_err("Failed to copy event.");
-                       free(str);
-                       goto error;
-               }
-
-               pr_debug("Group: %s, Event: %s\n", group, event);
-
-               if (namelist)
-                       ret = del_trace_probe_event(kfd, buf, namelist);
-
-               if (unamelist && ret != 0)
-                       ret = del_trace_probe_event(ufd, buf, unamelist);
-
-               if (ret != 0)
-                       pr_info("Info: Event \"%s\" does not exist.\n", buf);
-
-               free(str);
+       ret2 = del_trace_probe_events(ufd, filter, unamelist);
+       if (ret2 < 0 && ret2 != -ENOENT) {
+               ret = ret2;
+               goto error;
         }
+       if (ret == -ENOENT && ret2 == -ENOENT)
+               pr_debug("\"%s\" does not hit any event.\n", str);
+       /* A filter that hits no event is silently ignored, not an error */
+       ret = 0;
  
  error:
         if (kfd >= 0) {
@@ -2784,6 +2942,7 @@ error:
                 strlist__delete(unamelist);
                 close(ufd);
         }
+       free(str);
  
         return ret;
  }
@@ -2837,8 +2996,7 @@ int show_available_funcs(const char *target, struct strfilter *_filter,
         dso__fprintf_symbols_by_name(map->dso, map->type, stdout);
  end:
         if (user) {
-               dso__delete(map->dso);
-               map__delete(map);
+               map__put(map);
         }
         exit_symbol_maps();
  
diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h

index d6b783447be95d6b8764187f05692e0559eed371..31db6ee7db5478139dabfc97537568c0ec104736 100644 (file)
--- a/tools/perf/util/probe-event.h
+++ b/tools/perf/util/probe-event.h
@@ -6,10 +6,20 @@
  #include "strlist.h"
  #include "strfilter.h"
  
+/* Probe related configurations */
+struct probe_conf {
+       bool    show_ext_vars;
+       bool    show_location_range;
+       bool    force_add;
+       bool    no_inlines;
+       int     max_probes;
+};
+extern struct probe_conf probe_conf;
  extern bool probe_event_dry_run;
  
  /* kprobe-tracer and uprobe-tracer tracing point */
  struct probe_trace_point {
+       char            *realname;      /* function real name (if needed) */
         char            *symbol;        /* Base symbol */
         char            *module;        /* Module name */
         unsigned long   offset;         /* Offset from symbol */
@@ -121,20 +131,18 @@ extern void line_range__clear(struct line_range *lr);
  /* Initialize line range */
  extern int line_range__init(struct line_range *lr);
  
-/* Internal use: Return kernel/module path */
-extern const char *kernel_get_module_path(const char *module);
-
-extern int add_perf_probe_events(struct perf_probe_event *pevs, int npevs,
-                                int max_probe_points, bool force_add);
-extern int del_perf_probe_events(struct strlist *dellist);
-extern int show_perf_probe_events(void);
+extern int add_perf_probe_events(struct perf_probe_event *pevs, int npevs);
+extern int del_perf_probe_events(struct strfilter *filter);
+extern int show_perf_probe_events(struct strfilter *filter);
  extern int show_line_range(struct line_range *lr, const char *module,
                            bool user);
  extern int show_available_vars(struct perf_probe_event *pevs, int npevs,
-                              int max_probe_points, const char *module,
-                              struct strfilter *filter, bool externs);
+                              struct strfilter *filter);
  extern int show_available_funcs(const char *module, struct strfilter *filter,
                                 bool user);
+bool arch__prefers_symtab(void);
+void arch__fix_tev_from_maps(struct perf_probe_event *pev,
+                            struct probe_trace_event *tev, struct map *map);
  
  /* Maximum index number of event-name postfix */
  #define MAX_EVENT_INDEX        1024
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c

index 2a76e14db73289d196a0171f4830693b46445e23..2da65a7108932857bd585681eb0fac195f0c850b 100644 (file)
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -130,7 +130,7 @@ struct debuginfo *debuginfo__new(const char *path)
                         continue;
                 dinfo = __debuginfo__new(buf);
         }
-       dso__delete(dso);
+       dso__put(dso);
  
  out:
         /* if failed to open all distro debuginfo, open given binary */
@@ -177,7 +177,7 @@ static int convert_variable_location(Dwarf_Die *vr_die, Dwarf_Addr addr,
         Dwarf_Word offs = 0;
         bool ref = false;
         const char *regs;
-       int ret;
+       int ret, ret2 = 0;
  
         if (dwarf_attr(vr_die, DW_AT_external, &attr) != NULL)
                 goto static_var;
@@ -187,9 +187,19 @@ static int convert_variable_location(Dwarf_Die *vr_die, Dwarf_Addr addr,
                 return -EINVAL; /* Broken DIE ? */
         if (dwarf_getlocation_addr(&attr, addr, &op, &nops, 1) <= 0) {
                 ret = dwarf_entrypc(sp_die, &tmp);
-               if (ret || addr != tmp ||
-                   dwarf_tag(vr_die) != DW_TAG_formal_parameter ||
-                   dwarf_highpc(sp_die, &tmp))
+               if (ret)
+                       return -ENOENT;
+
+               if (probe_conf.show_location_range &&
+                       (dwarf_tag(vr_die) == DW_TAG_variable)) {
+                       ret2 = -ERANGE;
+               } else if (addr != tmp ||
+                       dwarf_tag(vr_die) != DW_TAG_formal_parameter) {
+                       return -ENOENT;
+               }
+
+               ret = dwarf_highpc(sp_die, &tmp);
+               if (ret)
                         return -ENOENT;
                 /*
                  * This is fuzzed by fentry mcount. We try to find the
@@ -210,7 +220,7 @@ found:
         if (op->atom == DW_OP_addr) {
  static_var:
                 if (!tvar)
-                       return 0;
+                       return ret2;
                 /* Static variables on memory (not stack), make @varname */
                 ret = strlen(dwarf_diename(vr_die));
                 tvar->value = zalloc(ret + 2);
@@ -220,7 +230,7 @@ static_var:
                 tvar->ref = alloc_trace_arg_ref((long)offs);
                 if (tvar->ref == NULL)
                         return -ENOMEM;
-               return 0;
+               return ret2;
         }
  
         /* If this is based on frame buffer, set the offset */
@@ -250,14 +260,14 @@ static_var:
         }
  
         if (!tvar)
-               return 0;
+               return ret2;
  
         regs = get_arch_regstr(regn);
         if (!regs) {
                 /* This should be a bug in DWARF or this tool */
                 pr_warning("Mapping for the register number %u "
                            "missing on this architecture.\n", regn);
-               return -ERANGE;
+               return -ENOTSUP;
         }
  
         tvar->value = strdup(regs);
@@ -269,7 +279,7 @@ static_var:
                 if (tvar->ref == NULL)
                         return -ENOMEM;
         }
-       return 0;
+       return ret2;
  }
  
  #define BYTES_TO_BITS(nb)      ((nb) * BITS_PER_LONG / sizeof(long))
@@ -517,10 +527,12 @@ static int convert_variable(Dwarf_Die *vr_die, struct probe_finder *pf)
  
         ret = convert_variable_location(vr_die, pf->addr, pf->fb_ops,
                                         &pf->sp_die, pf->tvar);
-       if (ret == -ENOENT || ret == -EINVAL)
-               pr_err("Failed to find the location of %s at this address.\n"
-                      " Perhaps, it has been optimized out.\n", pf->pvar->var);
-       else if (ret == -ENOTSUP)
+       if (ret == -ENOENT || ret == -EINVAL) {
+               pr_err("Failed to find the location of the '%s' variable at this address.\n"
+                      " Perhaps it has been optimized out.\n"
+                      " Use -V with the --range option to show '%s' location range.\n",
+                      pf->pvar->var, pf->pvar->var);
+       } else if (ret == -ENOTSUP)
                 pr_err("Sorry, we don't support this variable location yet.\n");
         else if (ret == 0 && pf->pvar->field) {
                 ret = convert_variable_fields(vr_die, pf->pvar->var,
@@ -662,9 +674,15 @@ static int call_probe_finder(Dwarf_Die *sc_die, struct probe_finder *pf)
         /* If not a real subprogram, find a real one */
         if (!die_is_func_def(sc_die)) {
                 if (!die_find_realfunc(&pf->cu_die, pf->addr, &pf->sp_die)) {
-                       pr_warning("Failed to find probe point in any "
-                                  "functions.\n");
-                       return -ENOENT;
+                       if (die_find_tailfunc(&pf->cu_die, pf->addr, &pf->sp_die)) {
+                               pr_warning("Ignoring tail call from %s\n",
+                                               dwarf_diename(&pf->sp_die));
+                               return 0;
+                       } else {
+                               pr_warning("Failed to find probe point in any "
+                                          "functions.\n");
+                               return -ENOENT;
+                       }
                 }
         } else
                 memcpy(&pf->sp_die, sc_die, sizeof(Dwarf_Die));
@@ -719,7 +737,7 @@ static int find_best_scope_cb(Dwarf_Die *fn_die, void *data)
         }
         /* If the function name is given, that's what user expects */
         if (fsp->function) {
-               if (die_compare_name(fn_die, fsp->function)) {
+               if (die_match_name(fn_die, fsp->function)) {
                         memcpy(fsp->die_mem, fn_die, sizeof(Dwarf_Die));
                         fsp->found = true;
                         return 1;
@@ -922,13 +940,14 @@ static int probe_point_search_cb(Dwarf_Die *sp_die, void *data)
  
         /* Check tag and diename */
         if (!die_is_func_def(sp_die) ||
-           !die_compare_name(sp_die, pp->function))
+           !die_match_name(sp_die, pp->function))
                 return DWARF_CB_OK;
  
         /* Check declared file */
         if (pp->file && strtailcmp(pp->file, dwarf_decl_file(sp_die)))
                 return DWARF_CB_OK;
  
+       pr_debug("Matched function: %s\n", dwarf_diename(sp_die));
         pf->fname = dwarf_decl_file(sp_die);
         if (pp->line) { /* Function relative line */
                 dwarf_decl_line(sp_die, &pf->lno);
@@ -945,10 +964,20 @@ static int probe_point_search_cb(Dwarf_Die *sp_die, void *data)
                         /* TODO: Check the address in this function */
                         param->retval = call_probe_finder(sp_die, pf);
                 }
-       } else
+       } else if (!probe_conf.no_inlines) {
                 /* Inlined function: search instances */
                 param->retval = die_walk_instances(sp_die,
                                         probe_point_inline_cb, (void *)pf);
+               /* This could be a non-existent inline definition */
+               if (param->retval == -ENOENT && strisglob(pp->function))
+                       param->retval = 0;
+       }
+
+       /* We need to find other candidates */
+       if (strisglob(pp->function) && param->retval >= 0) {
+               param->retval = 0;      /* We have to clear the result */
+               return DWARF_CB_OK;
+       }
  
         return DWARF_CB_ABORT; /* Exit; no same symbol in this CU. */
  }
@@ -977,7 +1006,7 @@ static int pubname_search_cb(Dwarf *dbg, Dwarf_Global *gl, void *data)
                 if (dwarf_tag(param->sp_die) != DW_TAG_subprogram)
                         return DWARF_CB_OK;
  
-               if (die_compare_name(param->sp_die, param->function)) {
+               if (die_match_name(param->sp_die, param->function)) {
                         if (!dwarf_offdie(dbg, gl->cu_offset, param->cu_die))
                                 return DWARF_CB_OK;
  
@@ -1030,7 +1059,7 @@ static int debuginfo__find_probes(struct debuginfo *dbg,
                 return -ENOMEM;
  
         /* Fastpath: lookup by function name from .debug_pubnames section */
-       if (pp->function) {
+       if (pp->function && !strisglob(pp->function)) {
                 struct pubname_callback_param pubname_param = {
                         .function = pp->function,
                         .file     = pp->file,
@@ -1089,6 +1118,7 @@ found:
  struct local_vars_finder {
         struct probe_finder *pf;
         struct perf_probe_arg *args;
+       bool vars;
         int max_args;
         int nargs;
         int ret;
@@ -1103,7 +1133,7 @@ static int copy_variables_cb(Dwarf_Die *die_mem, void *data)
  
         tag = dwarf_tag(die_mem);
         if (tag == DW_TAG_formal_parameter ||
-           tag == DW_TAG_variable) {
+           (tag == DW_TAG_variable && vf->vars)) {
                 if (convert_variable_location(die_mem, vf->pf->addr,
                                               vf->pf->fb_ops, &pf->sp_die,
                                               NULL) == 0) {
@@ -1129,26 +1159,28 @@ static int expand_probe_args(Dwarf_Die *sc_die, struct probe_finder *pf,
         Dwarf_Die die_mem;
         int i;
         int n = 0;
-       struct local_vars_finder vf = {.pf = pf, .args = args,
+       struct local_vars_finder vf = {.pf = pf, .args = args, .vars = false,
                                 .max_args = MAX_PROBE_ARGS, .ret = 0};
  
         for (i = 0; i < pf->pev->nargs; i++) {
                 /* var never be NULL */
-               if (strcmp(pf->pev->args[i].var, "$vars") == 0) {
-                       pr_debug("Expanding $vars into:");
-                       vf.nargs = n;
-                       /* Special local variables */
-                       die_find_child(sc_die, copy_variables_cb, (void *)&vf,
-                                      &die_mem);
-                       pr_debug(" (%d)\n", vf.nargs - n);
-                       if (vf.ret < 0)
-                               return vf.ret;
-                       n = vf.nargs;
-               } else {
+               if (strcmp(pf->pev->args[i].var, PROBE_ARG_VARS) == 0)
+                       vf.vars = true;
+               else if (strcmp(pf->pev->args[i].var, PROBE_ARG_PARAMS) != 0) {
                         /* Copy normal argument */
                         args[n] = pf->pev->args[i];
                         n++;
+                       continue;
                 }
+               pr_debug("Expanding %s into:", pf->pev->args[i].var);
+               vf.nargs = n;
+               /* Special local variables */
+               die_find_child(sc_die, copy_variables_cb, (void *)&vf,
+                              &die_mem);
+               pr_debug(" (%d)\n", vf.nargs - n);
+               if (vf.ret < 0)
+                       return vf.ret;
+               n = vf.nargs;
         }
         return n;
  }
@@ -1176,6 +1208,10 @@ static int add_probe_trace_event(Dwarf_Die *sc_die, struct probe_finder *pf)
         if (ret < 0)
                 return ret;
  
+       tev->point.realname = strdup(dwarf_diename(sc_die));
+       if (!tev->point.realname)
+               return -ENOMEM;
+
         pr_debug("Probe point found: %s+%lu\n", tev->point.symbol,
                  tev->point.offset);
  
@@ -1213,15 +1249,15 @@ end:
  /* Find probe_trace_events specified by perf_probe_event from debuginfo */
  int debuginfo__find_trace_events(struct debuginfo *dbg,
                                  struct perf_probe_event *pev,
-                                struct probe_trace_event **tevs, int max_tevs)
+                                struct probe_trace_event **tevs)
  {
         struct trace_event_finder tf = {
                         .pf = {.pev = pev, .callback = add_probe_trace_event},
-                       .mod = dbg->mod, .max_tevs = max_tevs};
+                       .max_tevs = probe_conf.max_probes, .mod = dbg->mod};
         int ret;
  
         /* Allocate result tevs array */
-       *tevs = zalloc(sizeof(struct probe_trace_event) * max_tevs);
+       *tevs = zalloc(sizeof(struct probe_trace_event) * tf.max_tevs);
         if (*tevs == NULL)
                 return -ENOMEM;
  
@@ -1237,14 +1273,11 @@ int debuginfo__find_trace_events(struct debuginfo *dbg,
         return (ret < 0) ? ret : tf.ntevs;
  }
  
-#define MAX_VAR_LEN 64
-
  /* Collect available variables in this scope */
  static int collect_variables_cb(Dwarf_Die *die_mem, void *data)
  {
         struct available_var_finder *af = data;
         struct variable_list *vl;
-       char buf[MAX_VAR_LEN];
         int tag, ret;
  
         vl = &af->vls[af->nvls - 1];
@@ -1255,11 +1288,38 @@ static int collect_variables_cb(Dwarf_Die *die_mem, void *data)
                 ret = convert_variable_location(die_mem, af->pf.addr,
                                                 af->pf.fb_ops, &af->pf.sp_die,
                                                 NULL);
-               if (ret == 0) {
-                       ret = die_get_varname(die_mem, buf, MAX_VAR_LEN);
-                       pr_debug2("Add new var: %s\n", buf);
-                       if (ret > 0)
-                               strlist__add(vl->vars, buf);
+               if (ret == 0 || ret == -ERANGE) {
+                       int ret2;
+                       bool externs = !af->child;
+                       struct strbuf buf;
+
+                       strbuf_init(&buf, 64);
+
+                       if (probe_conf.show_location_range) {
+                               if (!externs) {
+                                       if (ret)
+                                               strbuf_addf(&buf, "[INV]\t");
+                                       else
+                                               strbuf_addf(&buf, "[VAL]\t");
+                               } else
+                                       strbuf_addf(&buf, "[EXT]\t");
+                       }
+
+                       ret2 = die_get_varname(die_mem, &buf);
+
+                       if (!ret2 && probe_conf.show_location_range &&
+                               !externs) {
+                               strbuf_addf(&buf, "\t");
+                               ret2 = die_get_var_range(&af->pf.sp_die,
+                                                       die_mem, &buf);
+                       }
+
+                       pr_debug("Add new var: %s\n", buf.buf);
+                       if (ret2 == 0) {
+                               strlist__add(vl->vars,
+                                       strbuf_detach(&buf, NULL));
+                       }
+                       strbuf_release(&buf);
                 }
         }
  
@@ -1302,9 +1362,9 @@ static int add_available_vars(Dwarf_Die *sc_die, struct probe_finder *pf)
         die_find_child(sc_die, collect_variables_cb, (void *)af, &die_mem);
  
         /* Find external variables */
-       if (!af->externs)
+       if (!probe_conf.show_ext_vars)
                 goto out;
-       /* Don't need to search child DIE for externs. */
+       /* Don't need to search child DIE for external vars. */
         af->child = false;
         die_find_child(&pf->cu_die, collect_variables_cb, (void *)af, &die_mem);
  
@@ -1324,17 +1384,16 @@ out:
   */
  int debuginfo__find_available_vars_at(struct debuginfo *dbg,
                                       struct perf_probe_event *pev,
-                                     struct variable_list **vls,
-                                     int max_vls, bool externs)
+                                     struct variable_list **vls)
  {
         struct available_var_finder af = {
                         .pf = {.pev = pev, .callback = add_available_vars},
                         .mod = dbg->mod,
-                       .max_vls = max_vls, .externs = externs};
+                       .max_vls = probe_conf.max_probes};
         int ret;
  
         /* Allocate result vls array */
-       *vls = zalloc(sizeof(struct variable_list) * max_vls);
+       *vls = zalloc(sizeof(struct variable_list) * af.max_vls);
         if (*vls == NULL)
                 return -ENOMEM;
  
@@ -1535,7 +1594,7 @@ static int line_range_search_cb(Dwarf_Die *sp_die, void *data)
                 return DWARF_CB_OK;
  
         if (die_is_func_def(sp_die) &&
-           die_compare_name(sp_die, lr->function)) {
+           die_match_name(sp_die, lr->function)) {
                 lf->fname = dwarf_decl_file(sp_die);
                 dwarf_decl_line(sp_die, &lr->offset);
                 pr_debug("fname: %s, lineno:%d\n", lf->fname, lr->offset);
diff --git a/tools/perf/util/probe-finder.h b/tools/perf/util/probe-finder.h

index ebf8c8c814531ff4efaf5a7461c2ef6a7414df69..bed82716e1b44960a0ebc435d0ba1e94ed30730d 100644 (file)
--- a/tools/perf/util/probe-finder.h
+++ b/tools/perf/util/probe-finder.h
@@ -10,6 +10,9 @@
  #define MAX_PROBES              128
  #define MAX_PROBE_ARGS          128
  
+#define PROBE_ARG_VARS         "$vars"
+#define PROBE_ARG_PARAMS       "$params"
+
  static inline int is_c_varname(const char *name)
  {
         /* TODO */
@@ -37,8 +40,7 @@ extern void debuginfo__delete(struct debuginfo *dbg);
  /* Find probe_trace_events specified by perf_probe_event from debuginfo */
  extern int debuginfo__find_trace_events(struct debuginfo *dbg,
                                         struct perf_probe_event *pev,
-                                       struct probe_trace_event **tevs,
-                                       int max_tevs);
+                                       struct probe_trace_event **tevs);
  
  /* Find a perf_probe_point from debuginfo */
  extern int debuginfo__find_probe_point(struct debuginfo *dbg,
@@ -52,8 +54,7 @@ extern int debuginfo__find_line_range(struct debuginfo *dbg,
  /* Find available variables */
  extern int debuginfo__find_available_vars_at(struct debuginfo *dbg,
                                              struct perf_probe_event *pev,
-                                            struct variable_list **vls,
-                                            int max_points, bool externs);
+                                            struct variable_list **vls);
  
  /* Find a src file from a DWARF tag path */
  int get_real_path(const char *raw_path, const char *comp_dir,
@@ -96,7 +97,6 @@ struct available_var_finder {
         struct variable_list    *vls;           /* Found variable lists */
         int                     nvls;           /* Number of variable lists */
         int                     max_vls;        /* Max no. of variable lists */
-       bool                    externs;        /* Find external vars too */
         bool                    child;          /* Search child scopes */
  };
  
diff --git a/tools/perf/util/pstack.c b/tools/perf/util/pstack.c

index a126e6cc6e73ad8554e21a611b373a25599f5a9a..b234a6e3d0d4f378ff2fd4899159bd3190956a99 100644 (file)
--- a/tools/perf/util/pstack.c
+++ b/tools/perf/util/pstack.c
@@ -74,3 +74,10 @@ void *pstack__pop(struct pstack *pstack)
         pstack->entries[pstack->top] = NULL;
         return ret;
  }
+
+void *pstack__peek(struct pstack *pstack)
+{
+       if (pstack->top == 0)
+               return NULL;
+       return pstack->entries[pstack->top - 1];
+}
diff --git a/tools/perf/util/pstack.h b/tools/perf/util/pstack.h

index c3cb6584d52763f24c3074886de50c9ff741c71c..ded7f2e36624a9ff2d6ebe49c92aed2018c097dc 100644 (file)
--- a/tools/perf/util/pstack.h
+++ b/tools/perf/util/pstack.h
@@ -10,5 +10,6 @@ bool pstack__empty(const struct pstack *pstack);
  void pstack__remove(struct pstack *pstack, void *key);
  void pstack__push(struct pstack *pstack, void *key);
  void *pstack__pop(struct pstack *pstack);
+void *pstack__peek(struct pstack *pstack);
  
  #endif /* _PERF_PSTACK_ */
diff --git a/tools/perf/util/python-ext-sources b/tools/perf/util/python-ext-sources

index 4d28624a1ecaa6c6dc5d70fb2f0d43e91c56d0cf..5925fec90562fc355514489ae3d33814800f3896 100644 (file)
--- a/tools/perf/util/python-ext-sources
+++ b/tools/perf/util/python-ext-sources
@@ -16,6 +16,7 @@ util/util.c
  util/xyarray.c
  util/cgroup.c
  util/rblist.c
+util/stat.c
  util/strlist.c
  util/trace-event.c
  ../../lib/rbtree.c
diff --git a/tools/perf/util/record.c b/tools/perf/util/record.c

index 8acd0df88b5c4b75063d7ad2f83a55ef4f4f6286..d457c523a33d8bb7d00669dbd09e589eecf6de4b 100644 (file)
--- a/tools/perf/util/record.c
+++ b/tools/perf/util/record.c
@@ -20,7 +20,7 @@ static int perf_do_probe_api(setup_probe_fn_t fn, int cpu, const char *str)
         if (!evlist)
                 return -ENOMEM;
  
-       if (parse_events(evlist, str))
+       if (parse_events(evlist, str, NULL))
                 goto out_delete;
  
         evsel = perf_evlist__first(evlist);
@@ -119,7 +119,16 @@ void perf_evlist__config(struct perf_evlist *evlist, struct record_opts *opts)
                         evsel->attr.comm_exec = 1;
         }
  
-       if (evlist->nr_entries > 1) {
+       if (opts->full_auxtrace) {
+               /*
+                * Need to be able to synthesize and parse selected events with
+                * arbitrary sample types, which requires always being able to
+                * match the id.
+                */
+               use_sample_identifier = perf_can_sample_identifier();
+               evlist__for_each(evlist, evsel)
+                       perf_evsel__set_sample_id(evsel, use_sample_identifier);
+       } else if (evlist->nr_entries > 1) {
                 struct perf_evsel *first = perf_evlist__first(evlist);
  
                 evlist__for_each(evlist, evsel) {
@@ -207,7 +216,7 @@ bool perf_evlist__can_select_event(struct perf_evlist *evlist, const char *str)
         if (!temp_evlist)
                 return false;
  
-       err = parse_events(temp_evlist, str);
+       err = parse_events(temp_evlist, str, NULL);
         if (err)
                 goto out_delete;
  
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c

index 0c74012575ac925648c3f2eb350d7fb95bd154ce..aa482c10469d748fb2c6379ff854ba80f53b2b73 100644 (file)
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -15,12 +15,14 @@
  #include "cpumap.h"
  #include "perf_regs.h"
  #include "asm/bug.h"
+#include "auxtrace.h"
+#include "thread-stack.h"
  
-static int machines__deliver_event(struct machines *machines,
-                                  struct perf_evlist *evlist,
-                                  union perf_event *event,
-                                  struct perf_sample *sample,
-                                  struct perf_tool *tool, u64 file_offset);
+static int perf_session__deliver_event(struct perf_session *session,
+                                      union perf_event *event,
+                                      struct perf_sample *sample,
+                                      struct perf_tool *tool,
+                                      u64 file_offset);
  
  static int perf_session__open(struct perf_session *session)
  {
@@ -105,8 +107,8 @@ static int ordered_events__deliver_event(struct ordered_events *oe,
                 return ret;
         }
  
-       return machines__deliver_event(&session->machines, session->evlist, event->event,
-                                      &sample, session->tool, event->file_offset);
+       return perf_session__deliver_event(session, event->event, &sample,
+                                          session->tool, event->file_offset);
  }
  
  struct perf_session *perf_session__new(struct perf_data_file *file,
@@ -119,6 +121,7 @@ struct perf_session *perf_session__new(struct perf_data_file *file,
  
         session->repipe = repipe;
         session->tool   = tool;
+       INIT_LIST_HEAD(&session->auxtrace_index);
         machines__init(&session->machines);
         ordered_events__init(&session->ordered_events, ordered_events__deliver_event);
  
@@ -185,6 +188,8 @@ static void perf_session_env__delete(struct perf_session_env *env)
  
  void perf_session__delete(struct perf_session *session)
  {
+       auxtrace__free(session);
+       auxtrace_index__free(&session->auxtrace_index);
         perf_session__destroy_kernel_maps(session);
         perf_session__delete_threads(session);
         perf_session_env__delete(&session->header.env);
@@ -262,6 +267,49 @@ static int process_id_index_stub(struct perf_tool *tool __maybe_unused,
         return 0;
  }
  
+static int process_event_auxtrace_info_stub(struct perf_tool *tool __maybe_unused,
+                               union perf_event *event __maybe_unused,
+                               struct perf_session *session __maybe_unused)
+{
+       dump_printf(": unhandled!\n");
+       return 0;
+}
+
+static int skipn(int fd, off_t n)
+{
+       char buf[4096];
+       ssize_t ret;
+
+       while (n > 0) {
+               ret = read(fd, buf, min(n, (off_t)sizeof(buf)));
+               if (ret <= 0)
+                       return ret;
+               n -= ret;
+       }
+
+       return 0;
+}
+
+static s64 process_event_auxtrace_stub(struct perf_tool *tool __maybe_unused,
+                                      union perf_event *event,
+                                      struct perf_session *session
+                                      __maybe_unused)
+{
+       dump_printf(": unhandled!\n");
+       if (perf_data_file__is_pipe(session->file))
+               skipn(perf_data_file__fd(session->file), event->auxtrace.size);
+       return event->auxtrace.size;
+}
+
+static
+int process_event_auxtrace_error_stub(struct perf_tool *tool __maybe_unused,
+                                     union perf_event *event __maybe_unused,
+                                     struct perf_session *session __maybe_unused)
+{
+       dump_printf(": unhandled!\n");
+       return 0;
+}
+
  void perf_tool__fill_defaults(struct perf_tool *tool)
  {
         if (tool->sample == NULL)
@@ -278,6 +326,12 @@ void perf_tool__fill_defaults(struct perf_tool *tool)
                 tool->exit = process_event_stub;
         if (tool->lost == NULL)
                 tool->lost = perf_event__process_lost;
+       if (tool->lost_samples == NULL)
+               tool->lost_samples = perf_event__process_lost_samples;
+       if (tool->aux == NULL)
+               tool->aux = perf_event__process_aux;
+       if (tool->itrace_start == NULL)
+               tool->itrace_start = perf_event__process_itrace_start;
         if (tool->read == NULL)
                 tool->read = process_event_sample_stub;
         if (tool->throttle == NULL)
@@ -298,6 +352,12 @@ void perf_tool__fill_defaults(struct perf_tool *tool)
         }
         if (tool->id_index == NULL)
                 tool->id_index = process_id_index_stub;
+       if (tool->auxtrace_info == NULL)
+               tool->auxtrace_info = process_event_auxtrace_info_stub;
+       if (tool->auxtrace == NULL)
+               tool->auxtrace = process_event_auxtrace_stub;
+       if (tool->auxtrace_error == NULL)
+               tool->auxtrace_error = process_event_auxtrace_error_stub;
  }
  
  static void swap_sample_id_all(union perf_event *event, void *data)
@@ -390,6 +450,26 @@ static void perf_event__read_swap(union perf_event *event, bool sample_id_all)
                 swap_sample_id_all(event, &event->read + 1);
  }
  
+static void perf_event__aux_swap(union perf_event *event, bool sample_id_all)
+{
+       event->aux.aux_offset = bswap_64(event->aux.aux_offset);
+       event->aux.aux_size   = bswap_64(event->aux.aux_size);
+       event->aux.flags      = bswap_64(event->aux.flags);
+
+       if (sample_id_all)
+               swap_sample_id_all(event, &event->aux + 1);
+}
+
+static void perf_event__itrace_start_swap(union perf_event *event,
+                                         bool sample_id_all)
+{
+       event->itrace_start.pid  = bswap_32(event->itrace_start.pid);
+       event->itrace_start.tid  = bswap_32(event->itrace_start.tid);
+
+       if (sample_id_all)
+               swap_sample_id_all(event, &event->itrace_start + 1);
+}
+
  static void perf_event__throttle_swap(union perf_event *event,
                                       bool sample_id_all)
  {
@@ -438,19 +518,42 @@ void perf_event__attr_swap(struct perf_event_attr *attr)
  {
         attr->type              = bswap_32(attr->type);
         attr->size              = bswap_32(attr->size);
-       attr->config            = bswap_64(attr->config);
-       attr->sample_period     = bswap_64(attr->sample_period);
-       attr->sample_type       = bswap_64(attr->sample_type);
-       attr->read_format       = bswap_64(attr->read_format);
-       attr->wakeup_events     = bswap_32(attr->wakeup_events);
-       attr->bp_type           = bswap_32(attr->bp_type);
-       attr->bp_addr           = bswap_64(attr->bp_addr);
-       attr->bp_len            = bswap_64(attr->bp_len);
-       attr->branch_sample_type = bswap_64(attr->branch_sample_type);
-       attr->sample_regs_user   = bswap_64(attr->sample_regs_user);
-       attr->sample_stack_user  = bswap_32(attr->sample_stack_user);
  
-       swap_bitfield((u8 *) (&attr->read_format + 1), sizeof(u64));
+#define bswap_safe(f, n)                                       \
+       (attr->size > (offsetof(struct perf_event_attr, f) +    \
+                      sizeof(attr->f) * (n)))
+#define bswap_field(f, sz)                     \
+do {                                           \
+       if (bswap_safe(f, 0))                   \
+               attr->f = bswap_##sz(attr->f);  \
+} while(0)
+#define bswap_field_32(f) bswap_field(f, 32)
+#define bswap_field_64(f) bswap_field(f, 64)
+
+       bswap_field_64(config);
+       bswap_field_64(sample_period);
+       bswap_field_64(sample_type);
+       bswap_field_64(read_format);
+       bswap_field_32(wakeup_events);
+       bswap_field_32(bp_type);
+       bswap_field_64(bp_addr);
+       bswap_field_64(bp_len);
+       bswap_field_64(branch_sample_type);
+       bswap_field_64(sample_regs_user);
+       bswap_field_32(sample_stack_user);
+       bswap_field_32(aux_watermark);
+
+       /*
+        * After read_format are bitfields. Check read_format because
+        * we are unable to use offsetof on bitfield.
+        */
+       if (bswap_safe(read_format, 1))
+               swap_bitfield((u8 *) (&attr->read_format + 1),
+                             sizeof(u64));
+#undef bswap_field_64
+#undef bswap_field_32
+#undef bswap_field
+#undef bswap_safe
  }
  
  static void perf_event__hdr_attr_swap(union perf_event *event,
@@ -478,6 +581,40 @@ static void perf_event__tracing_data_swap(union perf_event *event,
         event->tracing_data.size = bswap_32(event->tracing_data.size);
  }
  
+static void perf_event__auxtrace_info_swap(union perf_event *event,
+                                          bool sample_id_all __maybe_unused)
+{
+       size_t size;
+
+       event->auxtrace_info.type = bswap_32(event->auxtrace_info.type);
+
+       size = event->header.size;
+       size -= (void *)&event->auxtrace_info.priv - (void *)event;
+       mem_bswap_64(event->auxtrace_info.priv, size);
+}
+
+static void perf_event__auxtrace_swap(union perf_event *event,
+                                     bool sample_id_all __maybe_unused)
+{
+       event->auxtrace.size      = bswap_64(event->auxtrace.size);
+       event->auxtrace.offset    = bswap_64(event->auxtrace.offset);
+       event->auxtrace.reference = bswap_64(event->auxtrace.reference);
+       event->auxtrace.idx       = bswap_32(event->auxtrace.idx);
+       event->auxtrace.tid       = bswap_32(event->auxtrace.tid);
+       event->auxtrace.cpu       = bswap_32(event->auxtrace.cpu);
+}
+
+static void perf_event__auxtrace_error_swap(union perf_event *event,
+                                           bool sample_id_all __maybe_unused)
+{
+       event->auxtrace_error.type = bswap_32(event->auxtrace_error.type);
+       event->auxtrace_error.code = bswap_32(event->auxtrace_error.code);
+       event->auxtrace_error.cpu  = bswap_32(event->auxtrace_error.cpu);
+       event->auxtrace_error.pid  = bswap_32(event->auxtrace_error.pid);
+       event->auxtrace_error.tid  = bswap_32(event->auxtrace_error.tid);
+       event->auxtrace_error.ip   = bswap_64(event->auxtrace_error.ip);
+}
+
  typedef void (*perf_event__swap_op)(union perf_event *event,
                                     bool sample_id_all);
  
@@ -492,11 +629,17 @@ static perf_event__swap_op perf_event__swap_ops[] = {
         [PERF_RECORD_THROTTLE]            = perf_event__throttle_swap,
         [PERF_RECORD_UNTHROTTLE]          = perf_event__throttle_swap,
         [PERF_RECORD_SAMPLE]              = perf_event__all64_swap,
+       [PERF_RECORD_AUX]                 = perf_event__aux_swap,
+       [PERF_RECORD_ITRACE_START]        = perf_event__itrace_start_swap,
+       [PERF_RECORD_LOST_SAMPLES]        = perf_event__all64_swap,
         [PERF_RECORD_HEADER_ATTR]         = perf_event__hdr_attr_swap,
         [PERF_RECORD_HEADER_EVENT_TYPE]   = perf_event__event_type_swap,
         [PERF_RECORD_HEADER_TRACING_DATA] = perf_event__tracing_data_swap,
         [PERF_RECORD_HEADER_BUILD_ID]     = NULL,
         [PERF_RECORD_ID_INDEX]            = perf_event__all64_swap,
+       [PERF_RECORD_AUXTRACE_INFO]       = perf_event__auxtrace_info_swap,
+       [PERF_RECORD_AUXTRACE]            = perf_event__auxtrace_swap,
+       [PERF_RECORD_AUXTRACE_ERROR]      = perf_event__auxtrace_error_swap,
         [PERF_RECORD_HEADER_MAX]          = NULL,
  };
  
@@ -921,6 +1064,8 @@ static int machines__deliver_event(struct machines *machines,
         case PERF_RECORD_MMAP:
                 return tool->mmap(tool, event, sample, machine);
         case PERF_RECORD_MMAP2:
+               if (event->header.misc & PERF_RECORD_MISC_PROC_MAP_PARSE_TIMEOUT)
+                       ++evlist->stats.nr_proc_map_timeout;
                 return tool->mmap2(tool, event, sample, machine);
         case PERF_RECORD_COMM:
                 return tool->comm(tool, event, sample, machine);
@@ -932,18 +1077,44 @@ static int machines__deliver_event(struct machines *machines,
                 if (tool->lost == perf_event__process_lost)
                         evlist->stats.total_lost += event->lost.lost;
                 return tool->lost(tool, event, sample, machine);
+       case PERF_RECORD_LOST_SAMPLES:
+               if (tool->lost_samples == perf_event__process_lost_samples)
+                       evlist->stats.total_lost_samples += event->lost_samples.lost;
+               return tool->lost_samples(tool, event, sample, machine);
         case PERF_RECORD_READ:
                 return tool->read(tool, event, sample, evsel, machine);
         case PERF_RECORD_THROTTLE:
                 return tool->throttle(tool, event, sample, machine);
         case PERF_RECORD_UNTHROTTLE:
                 return tool->unthrottle(tool, event, sample, machine);
+       case PERF_RECORD_AUX:
+               return tool->aux(tool, event, sample, machine);
+       case PERF_RECORD_ITRACE_START:
+               return tool->itrace_start(tool, event, sample, machine);
         default:
                 ++evlist->stats.nr_unknown_events;
                 return -1;
         }
  }
  
+static int perf_session__deliver_event(struct perf_session *session,
+                                      union perf_event *event,
+                                      struct perf_sample *sample,
+                                      struct perf_tool *tool,
+                                      u64 file_offset)
+{
+       int ret;
+
+       ret = auxtrace__process_event(session, event, sample, tool);
+       if (ret < 0)
+               return ret;
+       if (ret > 0)
+               return 0;
+
+       return machines__deliver_event(&session->machines, session->evlist,
+                                      event, sample, tool, file_offset);
+}
+
  static s64 perf_session__process_user_event(struct perf_session *session,
                                             union perf_event *event,
                                             u64 file_offset)
@@ -980,6 +1151,15 @@ static s64 perf_session__process_user_event(struct perf_session *session,
                 return tool->finished_round(tool, event, oe);
         case PERF_RECORD_ID_INDEX:
                 return tool->id_index(tool, event, session);
+       case PERF_RECORD_AUXTRACE_INFO:
+               return tool->auxtrace_info(tool, event, session);
+       case PERF_RECORD_AUXTRACE:
+               /* setup for reading amidst mmap */
+               lseek(fd, file_offset + event->header.size, SEEK_SET);
+               return tool->auxtrace(tool, event, session);
+       case PERF_RECORD_AUXTRACE_ERROR:
+               perf_session__auxtrace_error_inc(session, event);
+               return tool->auxtrace_error(tool, event, session);
         default:
                 return -EINVAL;
         }
@@ -1034,7 +1214,7 @@ int perf_session__peek_event(struct perf_session *session, off_t file_offset,
                 return -1;
  
         if (lseek(fd, file_offset, SEEK_SET) == (off_t)-1 ||
-           readn(fd, &buf, hdr_sz) != (ssize_t)hdr_sz)
+           readn(fd, buf, hdr_sz) != (ssize_t)hdr_sz)
                 return -1;
  
         event = (union perf_event *)buf;
@@ -1042,12 +1222,12 @@ int perf_session__peek_event(struct perf_session *session, off_t file_offset,
         if (session->header.needs_swap)
                 perf_event_header__bswap(&event->header);
  
-       if (event->header.size < hdr_sz)
+       if (event->header.size < hdr_sz || event->header.size > buf_sz)
                 return -1;
  
         rest = event->header.size - hdr_sz;
  
-       if (readn(fd, &buf, rest) != (ssize_t)rest)
+       if (readn(fd, buf, rest) != (ssize_t)rest)
                 return -1;
  
         if (session->header.needs_swap)
@@ -1096,8 +1276,8 @@ static s64 perf_session__process_event(struct perf_session *session,
                         return ret;
         }
  
-       return machines__deliver_event(&session->machines, evlist, event,
-                                      &sample, tool, file_offset);
+       return perf_session__deliver_event(session, event, &sample, tool,
+                                          file_offset);
  }
  
  void perf_event_header__bswap(struct perf_event_header *hdr)
@@ -1138,6 +1318,18 @@ static void perf_session__warn_about_errors(const struct perf_session *session)
                             stats->nr_events[PERF_RECORD_LOST]);
         }
  
+       if (session->tool->lost_samples == perf_event__process_lost_samples) {
+               double drop_rate;
+
+               drop_rate = (double)stats->total_lost_samples /
+                           (double) (stats->nr_events[PERF_RECORD_SAMPLE] + stats->total_lost_samples);
+               if (drop_rate > 0.05) {
+                       ui__warning("Processed %" PRIu64 " samples and lost %3.2f%% samples!\n\n",
+                                   stats->nr_events[PERF_RECORD_SAMPLE] + stats->total_lost_samples,
+                                   drop_rate * 100.0);
+               }
+       }
+
         if (stats->nr_unknown_events != 0) {
                 ui__warning("Found %u unknown events!\n\n"
                             "Is this an older tool processing a perf.data "
@@ -1168,6 +1360,32 @@ static void perf_session__warn_about_errors(const struct perf_session *session)
  
         if (oe->nr_unordered_events != 0)
                 ui__warning("%u out of order events recorded.\n", oe->nr_unordered_events);
+
+       events_stats__auxtrace_error_warn(stats);
+
+       if (stats->nr_proc_map_timeout != 0) {
+               ui__warning("%d map information files for pre-existing threads were\n"
+                           "not processed, if there are samples for addresses they\n"
+                           "will not be resolved, you may find out which are these\n"
+                           "threads by running with -v and redirecting the output\n"
+                           "to a file.\n"
+                           "The time limit to process proc map is too short?\n"
+                           "Increase it by --proc-map-timeout\n",
+                           stats->nr_proc_map_timeout);
+       }
+}
+
+static int perf_session__flush_thread_stack(struct thread *thread,
+                                           void *p __maybe_unused)
+{
+       return thread_stack__flush(thread);
+}
+
+static int perf_session__flush_thread_stacks(struct perf_session *session)
+{
+       return machines__for_each_thread(&session->machines,
+                                        perf_session__flush_thread_stack,
+                                        NULL);
  }
  
  volatile int session_done;
@@ -1256,10 +1474,17 @@ more:
  done:
         /* do the final flush for ordered samples */
         err = ordered_events__flush(oe, OE_FLUSH__FINAL);
+       if (err)
+               goto out_err;
+       err = auxtrace__flush_events(session, tool);
+       if (err)
+               goto out_err;
+       err = perf_session__flush_thread_stacks(session);
  out_err:
         free(buf);
         perf_session__warn_about_errors(session);
         ordered_events__free(&session->ordered_events);
+       auxtrace__free_events(session);
         return err;
  }
  
@@ -1402,10 +1627,17 @@ more:
  out:
         /* do the final flush for ordered samples */
         err = ordered_events__flush(oe, OE_FLUSH__FINAL);
+       if (err)
+               goto out_err;
+       err = auxtrace__flush_events(session, tool);
+       if (err)
+               goto out_err;
+       err = perf_session__flush_thread_stacks(session);
  out_err:
         ui_progress__finish();
         perf_session__warn_about_errors(session);
         ordered_events__free(&session->ordered_events);
+       auxtrace__free_events(session);
         session->one_mmap = false;
         return err;
  }
@@ -1488,7 +1720,13 @@ size_t perf_session__fprintf_dsos_buildid(struct perf_session *session, FILE *fp
  
  size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp)
  {
-       size_t ret = fprintf(fp, "Aggregated stats:\n");
+       size_t ret;
+       const char *msg = "";
+
+       if (perf_header__has_feat(&session->header, HEADER_AUXTRACE))
+               msg = " (excludes AUX area (e.g. instruction trace) decoded / synthesized events)";
+
+       ret = fprintf(fp, "Aggregated stats:%s\n", msg);
  
         ret += events_stats__fprintf(&session->evlist->stats, fp);
         return ret;
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h

index d5fa7b7916ef40dd5216e533a5ce04bfdfc8a7d2..b44afc75d1cc51feb05f943973d0813e8556122c 100644 (file)
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -15,10 +15,16 @@
  struct ip_callchain;
  struct thread;
  
+struct auxtrace;
+struct itrace_synth_opts;
+
  struct perf_session {
         struct perf_header      header;
         struct machines         machines;
         struct perf_evlist      *evlist;
+       struct auxtrace         *auxtrace;
+       struct itrace_synth_opts *itrace_synth_opts;
+       struct list_head        auxtrace_index;
         struct trace_event      tevent;
         bool                    repipe;
         bool                    one_mmap;
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c

index 4593f36ecc4c651bef30112b008979b336894e0c..4c65a143a34c96747ab7c6d39284f264f0f8d41e 100644 (file)
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -89,14 +89,14 @@ static int64_t
  sort__comm_cmp(struct hist_entry *left, struct hist_entry *right)
  {
         /* Compare the addr that should be unique among comm */
-       return comm__str(right->comm) - comm__str(left->comm);
+       return strcmp(comm__str(right->comm), comm__str(left->comm));
  }
  
  static int64_t
  sort__comm_collapse(struct hist_entry *left, struct hist_entry *right)
  {
         /* Compare the addr that should be unique among comm */
-       return comm__str(right->comm) - comm__str(left->comm);
+       return strcmp(comm__str(right->comm), comm__str(left->comm));
  }
  
  static int64_t
@@ -182,18 +182,16 @@ static int64_t _sort__addr_cmp(u64 left_ip, u64 right_ip)
  
  static int64_t _sort__sym_cmp(struct symbol *sym_l, struct symbol *sym_r)
  {
-       u64 ip_l, ip_r;
-
         if (!sym_l || !sym_r)
                 return cmp_null(sym_l, sym_r);
  
         if (sym_l == sym_r)
                 return 0;
  
-       ip_l = sym_l->start;
-       ip_r = sym_r->start;
+       if (sym_l->start != sym_r->start)
+               return (int64_t)(sym_r->start - sym_l->start);
  
-       return (int64_t)(ip_r - ip_l);
+       return (int64_t)(sym_r->end - sym_l->end);
  }
  
  static int64_t
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h

index 846036a921dc9152623fd5e70671b5f196e4b3de..e97cd476d336f2a9cad2a1eeb9daba34d08ed3a4 100644 (file)
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -58,15 +58,16 @@ struct he_stat {
  
  struct hist_entry_diff {
         bool    computed;
+       union {
+               /* PERF_HPP__DELTA */
+               double  period_ratio_delta;
  
-       /* PERF_HPP__DELTA */
-       double  period_ratio_delta;
-
-       /* PERF_HPP__RATIO */
-       double  period_ratio;
+               /* PERF_HPP__RATIO */
+               double  period_ratio;
  
-       /* HISTC_WEIGHTED_DIFF */
-       s64     wdiff;
+               /* HISTC_WEIGHTED_DIFF */
+               s64     wdiff;
+       };
  };
  
  /**
@@ -92,21 +93,28 @@ struct hist_entry {
         s32                     cpu;
         u8                      cpumode;
  
-       struct hist_entry_diff  diff;
-
         /* We are added by hists__add_dummy_entry. */
         bool                    dummy;
  
-       /* XXX These two should move to some tree widget lib */
-       u16                     row_offset;
-       u16                     nr_rows;
-
-       bool                    init_have_children;
         char                    level;
         u8                      filtered;
+       union {
+               /*
+                * Since perf diff only supports the stdio output, TUI
+                * fields are only accessed from perf report (or perf
+                * top).  So make it a union to reduce memory usage.
+                */
+               struct hist_entry_diff  diff;
+               struct /* for TUI */ {
+                       u16     row_offset;
+                       u16     nr_rows;
+                       bool    init_have_children;
+                       bool    unfolded;
+                       bool    has_children;
+               };
+       };
         char                    *srcline;
         struct symbol           *parent;
-       unsigned long           position;
         struct rb_root          sorted_chain;
         struct branch_info      *branch_info;
         struct hists            *hists;
diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c

new file mode 100644 (file)

index 0000000..53e8bb7
--- /dev/null
+++ b/tools/perf/util/stat-shadow.c
@@ -0,0 +1,434 @@
+#include <stdio.h>
+#include "evsel.h"
+#include "stat.h"
+#include "color.h"
+
+enum {
+       CTX_BIT_USER    = 1 << 0,
+       CTX_BIT_KERNEL  = 1 << 1,
+       CTX_BIT_HV      = 1 << 2,
+       CTX_BIT_HOST    = 1 << 3,
+       CTX_BIT_IDLE    = 1 << 4,
+       CTX_BIT_MAX     = 1 << 5,
+};
+
+#define NUM_CTX CTX_BIT_MAX
+
+static struct stats runtime_nsecs_stats[MAX_NR_CPUS];
+static struct stats runtime_cycles_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_stalled_cycles_front_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_stalled_cycles_back_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_branches_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_cacherefs_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_l1_dcache_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_l1_icache_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_ll_cache_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_itlb_cache_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_dtlb_cache_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_cycles_in_tx_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_transaction_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_elision_stats[NUM_CTX][MAX_NR_CPUS];
+
+struct stats walltime_nsecs_stats;
+
+static int evsel_context(struct perf_evsel *evsel)
+{
+       int ctx = 0;
+
+       if (evsel->attr.exclude_kernel)
+               ctx |= CTX_BIT_KERNEL;
+       if (evsel->attr.exclude_user)
+               ctx |= CTX_BIT_USER;
+       if (evsel->attr.exclude_hv)
+               ctx |= CTX_BIT_HV;
+       if (evsel->attr.exclude_host)
+               ctx |= CTX_BIT_HOST;
+       if (evsel->attr.exclude_idle)
+               ctx |= CTX_BIT_IDLE;
+
+       return ctx;
+}
+
+void perf_stat__reset_shadow_stats(void)
+{
+       memset(runtime_nsecs_stats, 0, sizeof(runtime_nsecs_stats));
+       memset(runtime_cycles_stats, 0, sizeof(runtime_cycles_stats));
+       memset(runtime_stalled_cycles_front_stats, 0, sizeof(runtime_stalled_cycles_front_stats));
+       memset(runtime_stalled_cycles_back_stats, 0, sizeof(runtime_stalled_cycles_back_stats));
+       memset(runtime_branches_stats, 0, sizeof(runtime_branches_stats));
+       memset(runtime_cacherefs_stats, 0, sizeof(runtime_cacherefs_stats));
+       memset(runtime_l1_dcache_stats, 0, sizeof(runtime_l1_dcache_stats));
+       memset(runtime_l1_icache_stats, 0, sizeof(runtime_l1_icache_stats));
+       memset(runtime_ll_cache_stats, 0, sizeof(runtime_ll_cache_stats));
+       memset(runtime_itlb_cache_stats, 0, sizeof(runtime_itlb_cache_stats));
+       memset(runtime_dtlb_cache_stats, 0, sizeof(runtime_dtlb_cache_stats));
+       memset(runtime_cycles_in_tx_stats, 0,
+                       sizeof(runtime_cycles_in_tx_stats));
+       memset(runtime_transaction_stats, 0,
+               sizeof(runtime_transaction_stats));
+       memset(runtime_elision_stats, 0, sizeof(runtime_elision_stats));
+       memset(&walltime_nsecs_stats, 0, sizeof(walltime_nsecs_stats));
+}
+
+/*
+ * Feed count[0] of @counter into the shadow stats slot for its event type
+ * (indexed by @cpu and, except for task-clock, by the evsel's exclude_*
+ * context), for the ratios printed by perf_stat__print_shadow_stats().
+ */
+void perf_stat__update_shadow_stats(struct perf_evsel *counter, u64 *count,
+                                   int cpu)
+{
+       int ctx = evsel_context(counter);
+
+       if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK))
+               update_stats(&runtime_nsecs_stats[cpu], count[0]);
+       else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES))
+               update_stats(&runtime_cycles_stats[ctx][cpu], count[0]);
+       else if (perf_stat_evsel__is(counter, CYCLES_IN_TX))
+               update_stats(&runtime_cycles_in_tx_stats[ctx][cpu], count[0]);
+       else if (perf_stat_evsel__is(counter, TRANSACTION_START))
+               update_stats(&runtime_transaction_stats[ctx][cpu], count[0]);
+       else if (perf_stat_evsel__is(counter, ELISION_START))
+               update_stats(&runtime_elision_stats[ctx][cpu], count[0]);
+       else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND))
+               update_stats(&runtime_stalled_cycles_front_stats[ctx][cpu], count[0]);
+       else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND))
+               update_stats(&runtime_stalled_cycles_back_stats[ctx][cpu], count[0]);
+       else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS))
+               update_stats(&runtime_branches_stats[ctx][cpu], count[0]);
+       else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES))
+               update_stats(&runtime_cacherefs_stats[ctx][cpu], count[0]);
+       else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1D))
+               update_stats(&runtime_l1_dcache_stats[ctx][cpu], count[0]);
+       else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1I))
+               update_stats(&runtime_l1_icache_stats[ctx][cpu], count[0]);
+       else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_LL))
+               update_stats(&runtime_ll_cache_stats[ctx][cpu], count[0]);
+       else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_DTLB))
+               update_stats(&runtime_dtlb_cache_stats[ctx][cpu], count[0]);
+       else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_ITLB))
+               update_stats(&runtime_itlb_cache_stats[ctx][cpu], count[0]);
+}
+
+/* used for get_ratio_color() */
+enum grc_type {
+       GRC_STALLED_CYCLES_FE,
+       GRC_STALLED_CYCLES_BE,
+       GRC_CACHE_MISSES,
+       GRC_MAX_NR
+};
+
+static const char *get_ratio_color(enum grc_type type, double ratio)
+{
+       static const double grc_table[GRC_MAX_NR][3] = {
+               [GRC_STALLED_CYCLES_FE] = { 50.0, 30.0, 10.0 },
+               [GRC_STALLED_CYCLES_BE] = { 75.0, 50.0, 20.0 },
+               [GRC_CACHE_MISSES]      = { 20.0, 10.0, 5.0 },
+       };
+       const char *color = PERF_COLOR_NORMAL;
+
+       if (ratio > grc_table[type][0])
+               color = PERF_COLOR_RED;
+       else if (ratio > grc_table[type][1])
+               color = PERF_COLOR_MAGENTA;
+       else if (ratio > grc_table[type][2])
+               color = PERF_COLOR_YELLOW;
+
+       return color;
+}
+
+static void print_stalled_cycles_frontend(FILE *out, int cpu,
+                                         struct perf_evsel *evsel
+                                         __maybe_unused, double avg)
+{
+       double total, ratio = 0.0;
+       const char *color;
+       int ctx = evsel_context(evsel);
+
+       total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
+
+       if (total)
+               ratio = avg / total * 100.0;
+
+       color = get_ratio_color(GRC_STALLED_CYCLES_FE, ratio);
+
+       fprintf(out, " #  ");
+       color_fprintf(out, color, "%6.2f%%", ratio);
+       fprintf(out, " frontend cycles idle   ");
+}
+
+static void print_stalled_cycles_backend(FILE *out, int cpu,
+                                        struct perf_evsel *evsel
+                                        __maybe_unused, double avg)
+{
+       double total, ratio = 0.0;
+       const char *color;
+       int ctx = evsel_context(evsel);
+
+       total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
+
+       if (total)
+               ratio = avg / total * 100.0;
+
+       color = get_ratio_color(GRC_STALLED_CYCLES_BE, ratio);
+
+       fprintf(out, " #  ");
+       color_fprintf(out, color, "%6.2f%%", ratio);
+       fprintf(out, " backend  cycles idle   ");
+}
+
+static void print_branch_misses(FILE *out, int cpu,
+                               struct perf_evsel *evsel __maybe_unused,
+                               double avg)
+{
+       double total, ratio = 0.0;
+       const char *color;
+       int ctx = evsel_context(evsel);
+
+       total = avg_stats(&runtime_branches_stats[ctx][cpu]);
+
+       if (total)
+               ratio = avg / total * 100.0;
+
+       color = get_ratio_color(GRC_CACHE_MISSES, ratio);
+
+       fprintf(out, " #  ");
+       color_fprintf(out, color, "%6.2f%%", ratio);
+       fprintf(out, " of all branches        ");
+}
+
+static void print_l1_dcache_misses(FILE *out, int cpu,
+                                  struct perf_evsel *evsel __maybe_unused,
+                                  double avg)
+{
+       double total, ratio = 0.0;
+       const char *color;
+       int ctx = evsel_context(evsel);
+
+       total = avg_stats(&runtime_l1_dcache_stats[ctx][cpu]);
+
+       if (total)
+               ratio = avg / total * 100.0;
+
+       color = get_ratio_color(GRC_CACHE_MISSES, ratio);
+
+       fprintf(out, " #  ");
+       color_fprintf(out, color, "%6.2f%%", ratio);
+       fprintf(out, " of all L1-dcache hits  ");
+}
+
+static void print_l1_icache_misses(FILE *out, int cpu,
+                                  struct perf_evsel *evsel __maybe_unused,
+                                  double avg)
+{
+       double total, ratio = 0.0;
+       const char *color;
+       int ctx = evsel_context(evsel);
+
+       total = avg_stats(&runtime_l1_icache_stats[ctx][cpu]);
+
+       if (total)
+               ratio = avg / total * 100.0;
+
+       color = get_ratio_color(GRC_CACHE_MISSES, ratio);
+
+       fprintf(out, " #  ");
+       color_fprintf(out, color, "%6.2f%%", ratio);
+       fprintf(out, " of all L1-icache hits  ");
+}
+
+static void print_dtlb_cache_misses(FILE *out, int cpu,
+                                   struct perf_evsel *evsel __maybe_unused,
+                                   double avg)
+{
+       double total, ratio = 0.0;
+       const char *color;
+       int ctx = evsel_context(evsel);
+
+       total = avg_stats(&runtime_dtlb_cache_stats[ctx][cpu]);
+
+       if (total)
+               ratio = avg / total * 100.0;
+
+       color = get_ratio_color(GRC_CACHE_MISSES, ratio);
+
+       fprintf(out, " #  ");
+       color_fprintf(out, color, "%6.2f%%", ratio);
+       fprintf(out, " of all dTLB cache hits ");
+}
+
+static void print_itlb_cache_misses(FILE *out, int cpu,
+                                   struct perf_evsel *evsel __maybe_unused,
+                                   double avg)
+{
+       double total, ratio = 0.0;
+       const char *color;
+       int ctx = evsel_context(evsel);
+
+       total = avg_stats(&runtime_itlb_cache_stats[ctx][cpu]);
+
+       if (total)
+               ratio = avg / total * 100.0;
+
+       color = get_ratio_color(GRC_CACHE_MISSES, ratio);
+
+       fprintf(out, " #  ");
+       color_fprintf(out, color, "%6.2f%%", ratio);
+       fprintf(out, " of all iTLB cache hits ");
+}
+
+static void print_ll_cache_misses(FILE *out, int cpu,
+                                 struct perf_evsel *evsel __maybe_unused,
+                                 double avg)
+{
+       double total, ratio = 0.0;
+       const char *color;
+       int ctx = evsel_context(evsel);
+
+       total = avg_stats(&runtime_ll_cache_stats[ctx][cpu]);
+
+       if (total)
+               ratio = avg / total * 100.0;
+
+       color = get_ratio_color(GRC_CACHE_MISSES, ratio);
+
+       fprintf(out, " #  ");
+       color_fprintf(out, color, "%6.2f%%", ratio);
+       fprintf(out, " of all LL-cache hits   ");
+}
+
+void perf_stat__print_shadow_stats(FILE *out, struct perf_evsel *evsel,
+                                  double avg, int cpu, enum aggr_mode aggr)
+{
+       double total, ratio = 0.0, total2;
+       int ctx = evsel_context(evsel);
+
+       if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) {
+               total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
+               if (total) {
+                       ratio = avg / total;
+                       fprintf(out, " #   %5.2f  insns per cycle        ", ratio);
+               } else {
+                       fprintf(out, "                                   ");
+               }
+               total = avg_stats(&runtime_stalled_cycles_front_stats[ctx][cpu]);
+               total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[ctx][cpu]));
+
+               if (total && avg) {
+                       ratio = total / avg;
+                       fprintf(out, "\n");
+                       if (aggr == AGGR_NONE)
+                               fprintf(out, "        ");
+                       fprintf(out, "                                                  #   %5.2f  stalled cycles per insn", ratio);
+               }
+
+       } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) &&
+                       runtime_branches_stats[ctx][cpu].n != 0) {
+               print_branch_misses(out, cpu, evsel, avg);
+       } else if (
+               evsel->attr.type == PERF_TYPE_HW_CACHE &&
+               evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_L1D |
+                                       ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
+                                       ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
+                       runtime_l1_dcache_stats[ctx][cpu].n != 0) {
+               print_l1_dcache_misses(out, cpu, evsel, avg);
+       } else if (
+               evsel->attr.type == PERF_TYPE_HW_CACHE &&
+               evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_L1I |
+                                       ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
+                                       ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
+                       runtime_l1_icache_stats[ctx][cpu].n != 0) {
+               print_l1_icache_misses(out, cpu, evsel, avg);
+       } else if (
+               evsel->attr.type == PERF_TYPE_HW_CACHE &&
+               evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_DTLB |
+                                       ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
+                                       ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
+                       runtime_dtlb_cache_stats[ctx][cpu].n != 0) {
+               print_dtlb_cache_misses(out, cpu, evsel, avg);
+       } else if (
+               evsel->attr.type == PERF_TYPE_HW_CACHE &&
+               evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_ITLB |
+                                       ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
+                                       ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
+                       runtime_itlb_cache_stats[ctx][cpu].n != 0) {
+               print_itlb_cache_misses(out, cpu, evsel, avg);
+       } else if (
+               evsel->attr.type == PERF_TYPE_HW_CACHE &&
+               evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_LL |
+                                       ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
+                                       ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
+                       runtime_ll_cache_stats[ctx][cpu].n != 0) {
+               print_ll_cache_misses(out, cpu, evsel, avg);
+       } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) &&
+                       runtime_cacherefs_stats[ctx][cpu].n != 0) {
+               total = avg_stats(&runtime_cacherefs_stats[ctx][cpu]);
+
+               if (total)
+                       ratio = avg * 100 / total;
+
+               fprintf(out, " # %8.3f %% of all cache refs    ", ratio);
+
+       } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) {
+               print_stalled_cycles_frontend(out, cpu, evsel, avg);
+       } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) {
+               print_stalled_cycles_backend(out, cpu, evsel, avg);
+       } else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) {
+               total = avg_stats(&runtime_nsecs_stats[cpu]);
+
+               if (total) {
+                       ratio = avg / total;
+                       fprintf(out, " # %8.3f GHz                    ", ratio);
+               } else {
+                       fprintf(out, "                                   ");
+               }
+       } else if (perf_stat_evsel__is(evsel, CYCLES_IN_TX)) {
+               total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
+               if (total)
+                       fprintf(out,
+                               " #   %5.2f%% transactional cycles   ",
+                               100.0 * (avg / total));
+       } else if (perf_stat_evsel__is(evsel, CYCLES_IN_TX_CP)) {
+               total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
+               total2 = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]);
+               if (total2 < avg)
+                       total2 = avg;
+               if (total)
+                       fprintf(out,
+                               " #   %5.2f%% aborted cycles         ",
+                               100.0 * ((total2-avg) / total));
+       } else if (perf_stat_evsel__is(evsel, TRANSACTION_START) &&
+                  avg > 0 &&
+                  runtime_cycles_in_tx_stats[ctx][cpu].n != 0) {
+               total = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]);
+
+               if (total)
+                       ratio = total / avg;
+
+               fprintf(out, " # %8.0f cycles / transaction   ", ratio);
+       } else if (perf_stat_evsel__is(evsel, ELISION_START) &&
+                  avg > 0 &&
+                  runtime_cycles_in_tx_stats[ctx][cpu].n != 0) {
+               total = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]);
+
+               if (total)
+                       ratio = total / avg;
+
+               fprintf(out, " # %8.0f cycles / elision       ", ratio);
+       } else if (runtime_nsecs_stats[cpu].n != 0) {
+               char unit = 'M';
+
+               total = avg_stats(&runtime_nsecs_stats[cpu]);
+
+               if (total)
+                       ratio = 1000.0 * avg / total;
+               if (ratio < 0.001) {
+                       ratio *= 1000;
+                       unit = 'K';
+               }
+
+               fprintf(out, " # %8.3f %c/sec                  ", ratio, unit);
+       } else {
+               fprintf(out, "                                   ");
+       }
+}
diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c

index 6506b3dfb6059f71aa6c345df21fcbc16e651604..4014b709f956b96b86dab3526f3b35fec6d8c166 100644 (file)
--- a/tools/perf/util/stat.c
+++ b/tools/perf/util/stat.c
@@ -1,6 +1,6 @@
  #include <math.h>
-
  #include "stat.h"
+#include "evsel.h"
  
  void update_stats(struct stats *stats, u64 val)
  {
@@ -61,3 +61,72 @@ double rel_stddev_stats(double stddev, double avg)
  
         return pct;
  }
+
+bool __perf_evsel_stat__is(struct perf_evsel *evsel,
+                          enum perf_stat_evsel_id id)
+{
+       struct perf_stat *ps = evsel->priv;
+
+       return ps->id == id;
+}
+
+#define ID(id, name) [PERF_STAT_EVSEL_ID__##id] = #name
+static const char *id_str[PERF_STAT_EVSEL_ID__MAX] = {
+       ID(NONE,                x),
+       ID(CYCLES_IN_TX,        cpu/cycles-t/),
+       ID(TRANSACTION_START,   cpu/tx-start/),
+       ID(ELISION_START,       cpu/el-start/),
+       ID(CYCLES_IN_TX_CP,     cpu/cycles-ct/),
+};
+#undef ID
+
+void perf_stat_evsel_id_init(struct perf_evsel *evsel)
+{
+       struct perf_stat *ps = evsel->priv;
+       int i;
+
+       /* ps->id is 0 hence PERF_STAT_EVSEL_ID__NONE by default */
+
+       for (i = 0; i < PERF_STAT_EVSEL_ID__MAX; i++) {
+               if (!strcmp(perf_evsel__name(evsel), id_str[i])) {
+                       ps->id = i;
+                       break;
+               }
+       }
+}
+
+struct perf_counts *perf_counts__new(int ncpus)
+{
+       int size = sizeof(struct perf_counts) +
+                  ncpus * sizeof(struct perf_counts_values);
+
+       return zalloc(size);
+}
+
+void perf_counts__delete(struct perf_counts *counts)
+{
+       free(counts);
+}
+
+static void perf_counts__reset(struct perf_counts *counts, int ncpus)
+{
+       memset(counts, 0, (sizeof(*counts) +
+              (ncpus * sizeof(struct perf_counts_values))));
+}
+
+void perf_evsel__reset_counts(struct perf_evsel *evsel, int ncpus)
+{
+       perf_counts__reset(evsel->counts, ncpus);
+}
+
+int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus)
+{
+       evsel->counts = perf_counts__new(ncpus);
+       return evsel->counts != NULL ? 0 : -ENOMEM;
+}
+
+void perf_evsel__free_counts(struct perf_evsel *evsel)
+{
+       perf_counts__delete(evsel->counts);
+       evsel->counts = NULL;
+}
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h

index 5667fc3e39cf45fe31f099e702a68eeeb202d16f..093dc3cb28dd3f62cb593095dfdc9c59317e0446 100644 (file)
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -2,6 +2,7 @@
  #define __PERF_STATS_H
  
  #include <linux/types.h>
+#include <stdio.h>
  
  struct stats
  {
@@ -9,6 +10,27 @@ struct stats
         u64 max, min;
  };
  
+enum perf_stat_evsel_id {
+       PERF_STAT_EVSEL_ID__NONE = 0,
+       PERF_STAT_EVSEL_ID__CYCLES_IN_TX,
+       PERF_STAT_EVSEL_ID__TRANSACTION_START,
+       PERF_STAT_EVSEL_ID__ELISION_START,
+       PERF_STAT_EVSEL_ID__CYCLES_IN_TX_CP,
+       PERF_STAT_EVSEL_ID__MAX,
+};
+
+struct perf_stat {
+       struct stats            res_stats[3];
+       enum perf_stat_evsel_id id;
+};
+
+enum aggr_mode {
+       AGGR_NONE,
+       AGGR_GLOBAL,
+       AGGR_SOCKET,
+       AGGR_CORE,
+};
+
  void update_stats(struct stats *stats, u64 val);
  double avg_stats(struct stats *stats);
  double stddev_stats(struct stats *stats);
@@ -22,4 +44,28 @@ static inline void init_stats(struct stats *stats)
         stats->min  = (u64) -1;
         stats->max  = 0;
  }
+
+struct perf_evsel;
+bool __perf_evsel_stat__is(struct perf_evsel *evsel,
+                          enum perf_stat_evsel_id id);
+
+#define perf_stat_evsel__is(evsel, id) \
+       __perf_evsel_stat__is(evsel, PERF_STAT_EVSEL_ID__ ## id)
+
+void perf_stat_evsel_id_init(struct perf_evsel *evsel);
+
+extern struct stats walltime_nsecs_stats;
+
+void perf_stat__reset_shadow_stats(void);
+void perf_stat__update_shadow_stats(struct perf_evsel *counter, u64 *count,
+                                   int cpu);
+void perf_stat__print_shadow_stats(FILE *out, struct perf_evsel *evsel,
+                                  double avg, int cpu, enum aggr_mode aggr);
+
+struct perf_counts *perf_counts__new(int ncpus);
+void perf_counts__delete(struct perf_counts *counts);
+
+void perf_evsel__reset_counts(struct perf_evsel *evsel, int ncpus);
+int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus);
+void perf_evsel__free_counts(struct perf_evsel *evsel);
  #endif
diff --git a/tools/perf/util/strfilter.c b/tools/perf/util/strfilter.c

index 79a757a2a15c22db5eec8494d662071cde57a12e..bcae659b65462cddff5c03c2c41a8fc675ad05bc 100644 (file)
--- a/tools/perf/util/strfilter.c
+++ b/tools/perf/util/strfilter.c
@@ -170,6 +170,46 @@ struct strfilter *strfilter__new(const char *rules, const char **err)
         return filter;
  }
  
+static int strfilter__append(struct strfilter *filter, bool _or,
+                            const char *rules, const char **err)
+{
+       struct strfilter_node *right, *root;
+       const char *ep = NULL;
+
+       if (!filter || !rules)
+               return -EINVAL;
+
+       right = strfilter_node__new(rules, &ep);
+       if (!right || *ep != '\0') {
+               if (err)
+                       *err = ep;
+               goto error;
+       }
+       root = strfilter_node__alloc(_or ? OP_or : OP_and, filter->root, right);
+       if (!root) {
+               ep = NULL;
+               goto error;
+       }
+
+       filter->root = root;
+       return 0;
+
+error:
+       strfilter_node__delete(right);
+       return ep ? -EINVAL : -ENOMEM;
+}
+
+int strfilter__or(struct strfilter *filter, const char *rules, const char **err)
+{
+       return strfilter__append(filter, true, rules, err);
+}
+
+int strfilter__and(struct strfilter *filter, const char *rules,
+                  const char **err)
+{
+       return strfilter__append(filter, false, rules, err);
+}
+
  static bool strfilter_node__compare(struct strfilter_node *node,
                                     const char *str)
  {
@@ -197,3 +237,70 @@ bool strfilter__compare(struct strfilter *filter, const char *str)
                 return false;
         return strfilter_node__compare(filter->root, str);
  }
+
+static int strfilter_node__sprint(struct strfilter_node *node, char *buf);
+
+/* sprint node in parenthesis if needed */
+static int strfilter_node__sprint_pt(struct strfilter_node *node, char *buf)
+{
+       int len;
+       int pt = node->r ? 2 : 0;       /* don't need to check node->l */
+
+       if (buf && pt)
+               *buf++ = '(';
+       len = strfilter_node__sprint(node, buf);
+       if (len < 0)
+               return len;
+       if (buf && pt)
+               *(buf + len) = ')';
+       return len + pt;
+}
+
+static int strfilter_node__sprint(struct strfilter_node *node, char *buf)
+{
+       int len = 0, rlen;
+
+       if (!node || !node->p)
+               return -EINVAL;
+
+       switch (*node->p) {
+       case '|':
+       case '&':
+               len = strfilter_node__sprint_pt(node->l, buf);
+               if (len < 0)
+                       return len;
+       case '!':
+               if (buf) {
+                       *(buf + len++) = *node->p;
+                       buf += len;
+               } else
+                       len++;
+               rlen = strfilter_node__sprint_pt(node->r, buf);
+               if (rlen < 0)
+                       return rlen;
+               len += rlen;
+               break;
+       default:
+               len = strlen(node->p);
+               if (buf)
+                       strcpy(buf, node->p);
+       }
+
+       return len;
+}
+
+char *strfilter__string(struct strfilter *filter)
+{
+       int len;
+       char *ret = NULL;
+
+       len = strfilter_node__sprint(filter->root, NULL);
+       if (len < 0)
+               return NULL;
+       /* Zeroed: the sprint pass may end on ')' and write no NUL itself */
+       ret = calloc(1, len + 1);
+       if (ret)
+               strfilter_node__sprint(filter->root, ret);
+
+       return ret;
+}
diff --git a/tools/perf/util/strfilter.h b/tools/perf/util/strfilter.h

index fe611f3c9e3965007a7ea5ed9f3fbbc768b5f271..cff5eda88728b20e9ebae22d7476ed14f367869c 100644 (file)
--- a/tools/perf/util/strfilter.h
+++ b/tools/perf/util/strfilter.h
@@ -28,6 +28,32 @@ struct strfilter {
   */
  struct strfilter *strfilter__new(const char *rules, const char **err);
  
+/**
+ * strfilter__or - Append an additional rule by logical-or
+ * @filter: Original string filter
+ * @rules: Filter rule to be appended to the left of the root of
+ *         @filter by using logical-or.
+ * @err: Pointer which points to an error detected in @rules
+ *
+ * Parse @rules and join it to the @filter by using logical-or.
+ * Return 0 on success, or return the error code.
+ */
+int strfilter__or(struct strfilter *filter,
+                 const char *rules, const char **err);
+
+/**
+ * strfilter__and - Append an additional rule by logical-and
+ * @filter: Original string filter
+ * @rules: Filter rule to be appended to the left of the root of
+ *         @filter by using logical-and.
+ * @err: Pointer which points to an error detected in @rules
+ *
+ * Parse @rules and join it to the @filter by using logical-and.
+ * Return 0 on success, or return the error code.
+ */
+int strfilter__and(struct strfilter *filter,
+                  const char *rules, const char **err);
+
  /**
   * strfilter__compare - compare given string and a string filter
   * @filter: String filter
@@ -45,4 +71,13 @@ bool strfilter__compare(struct strfilter *filter, const char *str);
   */
  void strfilter__delete(struct strfilter *filter);
  
+/**
+ * strfilter__string - Reconstruct a rule string from filter
+ * @filter: String filter to reconstruct
+ *
+ * Reconstruct a rule string from @filter. This is useful for
+ * debug messages. Note that the returned string must be freed afterward.
+ */
+char *strfilter__string(struct strfilter *filter);
+
  #endif
diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c

index a7ab6063e0389488a420680db46b96743366c1c2..65f7e389ae0996cae131cbfbcd2196181f15f1e7 100644 (file)
--- a/tools/perf/util/symbol-elf.c
+++ b/tools/perf/util/symbol-elf.c
@@ -630,6 +630,11 @@ void symsrc__destroy(struct symsrc *ss)
         close(ss->fd);
  }
  
+/*
+ * Default (weak) policy: symbols need adjusting for ET_EXEC and ET_REL
+ * objects only.
+ */
+bool __weak elf__needs_adjust_symbols(GElf_Ehdr ehdr)
+{
+       switch (ehdr.e_type) {
+       case ET_EXEC:
+       case ET_REL:
+               return true;
+       default:
+               return false;
+       }
+}
+
  int symsrc__init(struct symsrc *ss, struct dso *dso, const char *name,
                  enum dso_binary_type type)
  {
@@ -678,6 +683,7 @@ int symsrc__init(struct symsrc *ss, struct dso *dso, const char *name,
                 }
  
                 if (!dso__build_id_equal(dso, build_id)) {
+                       pr_debug("%s: build id mismatch for %s.\n", __func__, name);
                         dso->load_errno = DSO_LOAD_ERRNO__MISMATCHING_BUILDID;
                         goto out_elf_end;
                 }
@@ -711,8 +717,7 @@ int symsrc__init(struct symsrc *ss, struct dso *dso, const char *name,
                                                      ".gnu.prelink_undo",
                                                      NULL) != NULL);
         } else {
-               ss->adjust_symbols = ehdr.e_type == ET_EXEC ||
-                                    ehdr.e_type == ET_REL;
+               ss->adjust_symbols = elf__needs_adjust_symbols(ehdr);
         }
  
         ss->name   = strdup(name);
@@ -771,6 +776,8 @@ static bool want_demangle(bool is_kernel_sym)
         return is_kernel_sym ? symbol_conf.demangle_kernel : symbol_conf.demangle;
  }
  
+void __weak arch__elf_sym_adjust(GElf_Sym *sym __maybe_unused) { /* weak default: leave @sym untouched */ }
+
  int dso__load_sym(struct dso *dso, struct map *map,
                   struct symsrc *syms_ss, struct symsrc *runtime_ss,
                   symbol_filter_t filter, int kmodule)
@@ -935,6 +942,8 @@ int dso__load_sym(struct dso *dso, struct map *map,
                     (sym.st_value & 1))
                         --sym.st_value;
  
+               arch__elf_sym_adjust(&sym);
+
                 if (dso->kernel || kmodule) {
                         char dso_name[PATH_MAX];
  
@@ -963,8 +972,10 @@ int dso__load_sym(struct dso *dso, struct map *map,
                                         map->unmap_ip = map__unmap_ip;
                                         /* Ensure maps are correctly ordered */
                                         if (kmaps) {
+                                               map__get(map);
                                                 map_groups__remove(kmaps, map);
                                                 map_groups__insert(kmaps, map);
+                                               map__put(map);
                                         }
                                 }
  
@@ -1005,7 +1016,7 @@ int dso__load_sym(struct dso *dso, struct map *map,
                                 curr_map = map__new2(start, curr_dso,
                                                      map->type);
                                 if (curr_map == NULL) {
-                                       dso__delete(curr_dso);
+                                       dso__put(curr_dso);
                                         goto out_elf_end;
                                 }
                                 if (adjust_kernel_syms) {
@@ -1020,11 +1031,7 @@ int dso__load_sym(struct dso *dso, struct map *map,
                                 }
                                 curr_dso->symtab_type = dso->symtab_type;
                                 map_groups__insert(kmaps, curr_map);
-                               /*
-                                * The new DSO should go to the kernel DSOS
-                                */
-                               dsos__add(&map->groups->machine->kernel_dsos,
-                                         curr_dso);
+                               dsos__add(&map->groups->machine->dsos, curr_dso);
                                 dso__set_loaded(curr_dso, map->type);
                         } else
                                 curr_dso = curr_map->dso;
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c

index 201f6c4ca738ddffb46d5270a876cf96e8da9322..504f2d73b7eefe2699349ac75c5cc3b875961375 100644 (file)
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -85,8 +85,17 @@ static int prefix_underscores_count(const char *str)
         return tail - str;
  }
  
-#define SYMBOL_A 0
-#define SYMBOL_B 1
+/*
+ * Default (weak) tie-breaker between two symbols: prefer @symb when
+ * @syma is a "SyS"/"compat_SyS" kernel syscall alias, otherwise keep
+ * @syma.
+ */
+int __weak arch__choose_best_symbol(struct symbol *syma,
+                                   struct symbol *symb __maybe_unused)
+{
+       const char *name = syma->name;
+       size_t name_len = strlen(name);
+       bool is_sys_alias;
+
+       /* Avoid "SyS" kernel syscall aliases */
+       is_sys_alias = (name_len >= 3 && strncmp(name, "SyS", 3) == 0) ||
+                      (name_len >= 10 && strncmp(name, "compat_SyS", 10) == 0);
+
+       return is_sys_alias ? SYMBOL_B : SYMBOL_A;
+}
  
  static int choose_best_symbol(struct symbol *syma, struct symbol *symb)
  {
@@ -134,13 +143,7 @@ static int choose_best_symbol(struct symbol *syma, struct symbol *symb)
         else if (na < nb)
                 return SYMBOL_B;
  
-       /* Avoid "SyS" kernel syscall aliases */
-       if (na >= 3 && !strncmp(syma->name, "SyS", 3))
-               return SYMBOL_B;
-       if (na >= 10 && !strncmp(syma->name, "compat_SyS", 10))
-               return SYMBOL_B;
-
-       return SYMBOL_A;
+       return arch__choose_best_symbol(syma, symb);
  }
  
  void symbols__fixup_duplicate(struct rb_root *symbols)
@@ -199,18 +202,18 @@ void symbols__fixup_end(struct rb_root *symbols)
  
  void __map_groups__fixup_end(struct map_groups *mg, enum map_type type)
  {
-       struct map *prev, *curr;
-       struct rb_node *nd, *prevnd = rb_first(&mg->maps[type]);
+       struct maps *maps = &mg->maps[type];
+       struct map *next, *curr;
  
-       if (prevnd == NULL)
-               return;
+       pthread_rwlock_wrlock(&maps->lock);
  
-       curr = rb_entry(prevnd, struct map, rb_node);
+       curr = maps__first(maps);
+       if (curr == NULL)
+               goto out_unlock;
  
-       for (nd = rb_next(prevnd); nd; nd = rb_next(nd)) {
-               prev = curr;
-               curr = rb_entry(nd, struct map, rb_node);
-               prev->end = curr->start;
+       for (next = map__next(curr); next; next = map__next(curr)) {
+               curr->end = next->start;
+               curr = next;
         }
  
         /*
@@ -218,6 +221,9 @@ void __map_groups__fixup_end(struct map_groups *mg, enum map_type type)
          * last map final address.
          */
         curr->end = ~0ULL;
+
+out_unlock:
+       pthread_rwlock_unlock(&maps->lock);
  }
  
  struct symbol *symbol__new(u64 start, u64 len, u8 binding, const char *name)
@@ -397,7 +403,7 @@ static struct symbol *symbols__find_by_name(struct rb_root *symbols,
                                             const char *name)
  {
         struct rb_node *n;
-       struct symbol_name_rb_node *s;
+       struct symbol_name_rb_node *s = NULL;
  
         if (symbols == NULL)
                 return NULL;
@@ -408,7 +414,7 @@ static struct symbol *symbols__find_by_name(struct rb_root *symbols,
                 int cmp;
  
                 s = rb_entry(n, struct symbol_name_rb_node, rb_node);
-               cmp = strcmp(name, s->sym.name);
+               cmp = arch__compare_symbol_names(name, s->sym.name);
  
                 if (cmp < 0)
                         n = n->rb_left;
@@ -426,7 +432,7 @@ static struct symbol *symbols__find_by_name(struct rb_root *symbols,
                 struct symbol_name_rb_node *tmp;
  
                 tmp = rb_entry(n, struct symbol_name_rb_node, rb_node);
-               if (strcmp(tmp->sym.name, s->sym.name))
+               if (arch__compare_symbol_names(tmp->sym.name, s->sym.name))
                         break;
  
                 s = tmp;
@@ -653,14 +659,14 @@ static int dso__split_kallsyms_for_kcore(struct dso *dso, struct map *map,
                 curr_map = map_groups__find(kmaps, map->type, pos->start);
  
                 if (!curr_map || (filter && filter(curr_map, pos))) {
-                       rb_erase(&pos->rb_node, root);
+                       rb_erase_init(&pos->rb_node, root);
                         symbol__delete(pos);
                 } else {
                         pos->start -= curr_map->start - curr_map->pgoff;
                         if (pos->end)
                                 pos->end -= curr_map->start - curr_map->pgoff;
                         if (curr_map != map) {
-                               rb_erase(&pos->rb_node, root);
+                               rb_erase_init(&pos->rb_node, root);
                                 symbols__insert(
                                         &curr_map->dso->symbols[curr_map->type],
                                         pos);
@@ -780,7 +786,7 @@ static int dso__split_kallsyms(struct dso *dso, struct map *map, u64 delta,
  
                         curr_map = map__new2(pos->start, ndso, map->type);
                         if (curr_map == NULL) {
-                               dso__delete(ndso);
+                               dso__put(ndso);
                                 return -1;
                         }
  
@@ -1167,20 +1173,23 @@ static int dso__load_kcore(struct dso *dso, struct map *map,
         /* Add new maps */
         while (!list_empty(&md.maps)) {
                 new_map = list_entry(md.maps.next, struct map, node);
-               list_del(&new_map->node);
+               list_del_init(&new_map->node);
                 if (new_map == replacement_map) {
                         map->start      = new_map->start;
                         map->end        = new_map->end;
                         map->pgoff      = new_map->pgoff;
                         map->map_ip     = new_map->map_ip;
                         map->unmap_ip   = new_map->unmap_ip;
-                       map__delete(new_map);
                         /* Ensure maps are correctly ordered */
+                       map__get(map);
                         map_groups__remove(kmaps, map);
                         map_groups__insert(kmaps, map);
+                       map__put(map);
                 } else {
                         map_groups__insert(kmaps, new_map);
                 }
+
+               map__put(new_map);
         }
  
         /*
@@ -1205,8 +1214,8 @@ static int dso__load_kcore(struct dso *dso, struct map *map,
  out_err:
         while (!list_empty(&md.maps)) {
                 map = list_entry(md.maps.next, struct map, node);
-               list_del(&map->node);
-               map__delete(map);
+               list_del_init(&map->node);
+               map__put(map);
         }
         close(fd);
         return -EINVAL;
@@ -1355,7 +1364,7 @@ static bool dso__is_compatible_symtab_type(struct dso *dso, bool kmod,
         case DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE_COMP:
                 /*
                  * kernel modules know their symtab type - it's set when
-                * creating a module dso in machine__new_module().
+                * creating a module dso in machine__findnew_module_map().
                  */
                 return kmod && dso->symtab_type == type;
  
@@ -1380,12 +1389,22 @@ int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter)
         struct symsrc *syms_ss = NULL, *runtime_ss = NULL;
         bool kmod;
  
-       dso__set_loaded(dso, map->type);
+       pthread_mutex_lock(&dso->lock);
+
+       /* check again under the dso->lock */
+       if (dso__loaded(dso, map->type)) {
+               ret = 1;
+               goto out;
+       }
  
-       if (dso->kernel == DSO_TYPE_KERNEL)
-               return dso__load_kernel_sym(dso, map, filter);
-       else if (dso->kernel == DSO_TYPE_GUEST_KERNEL)
-               return dso__load_guest_kernel_sym(dso, map, filter);
+       if (dso->kernel) {
+               if (dso->kernel == DSO_TYPE_KERNEL)
+                       ret = dso__load_kernel_sym(dso, map, filter);
+               else if (dso->kernel == DSO_TYPE_GUEST_KERNEL)
+                       ret = dso__load_guest_kernel_sym(dso, map, filter);
+
+               goto out;
+       }
  
         if (map->groups && map->groups->machine)
                 machine = map->groups->machine;
@@ -1398,18 +1417,18 @@ int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter)
                 struct stat st;
  
                 if (lstat(dso->name, &st) < 0)
-                       return -1;
+                       goto out;
  
                 if (st.st_uid && (st.st_uid != geteuid())) {
                         pr_warning("File %s not owned by current user or root, "
                                 "ignoring it.\n", dso->name);
-                       return -1;
+                       goto out;
                 }
  
                 ret = dso__load_perf_map(dso, map, filter);
                 dso->symtab_type = ret > 0 ? DSO_BINARY_TYPE__JAVA_JIT :
                                              DSO_BINARY_TYPE__NOT_FOUND;
-               return ret;
+               goto out;
         }
  
         if (machine)
@@ -1417,7 +1436,7 @@ int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter)
  
         name = malloc(PATH_MAX);
         if (!name)
-               return -1;
+               goto out;
  
         kmod = dso->symtab_type == DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE ||
                 dso->symtab_type == DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE_COMP ||
@@ -1498,23 +1517,32 @@ int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter)
  out_free:
         free(name);
         if (ret < 0 && strstr(dso->name, " (deleted)") != NULL)
-               return 0;
+               ret = 0;
+out:
+       dso__set_loaded(dso, map->type);
+       pthread_mutex_unlock(&dso->lock);
+
         return ret;
  }
  
  struct map *map_groups__find_by_name(struct map_groups *mg,
                                      enum map_type type, const char *name)
  {
-       struct rb_node *nd;
+       struct maps *maps = &mg->maps[type];
+       struct map *map;
  
-       for (nd = rb_first(&mg->maps[type]); nd; nd = rb_next(nd)) {
-               struct map *map = rb_entry(nd, struct map, rb_node);
+       pthread_rwlock_rdlock(&maps->lock);
  
+       for (map = maps__first(maps); map; map = map__next(map)) {
                 if (map->dso && strcmp(map->dso->short_name, name) == 0)
-                       return map;
+                       goto out_unlock;
         }
  
-       return NULL;
+       map = NULL;
+
+out_unlock:
+       pthread_rwlock_unlock(&maps->lock);
+       return map;
  }
  
  int dso__load_vmlinux(struct dso *dso, struct map *map,
@@ -1802,6 +1830,7 @@ static void vmlinux_path__exit(void)
  {
         while (--vmlinux_path__nr_entries >= 0)
                 zfree(&vmlinux_path[vmlinux_path__nr_entries]);
+       vmlinux_path__nr_entries = 0;
  
         zfree(&vmlinux_path);
  }
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h

index 09561500164a07997b49ca744f7f557ee200f043..bef47ead1d9bd1efc5e9620f04b13714286ff616 100644 (file)
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -158,8 +158,6 @@ struct ref_reloc_sym {
  struct map_symbol {
         struct map    *map;
         struct symbol *sym;
-       bool          unfolded;
-       bool          has_children;
  };
  
  struct addr_map_symbol {
@@ -303,4 +301,14 @@ int setup_list(struct strlist **list, const char *list_str,
  int setup_intlist(struct intlist **list, const char *list_str,
                   const char *list_name);
  
+#ifdef HAVE_LIBELF_SUPPORT
+bool elf__needs_adjust_symbols(GElf_Ehdr ehdr);
+void arch__elf_sym_adjust(GElf_Sym *sym);
+#endif
+
+#define SYMBOL_A 0
+#define SYMBOL_B 1
+
+int arch__choose_best_symbol(struct symbol *syma, struct symbol *symb);
+
  #endif /* __PERF_SYMBOL */
diff --git a/tools/perf/util/thread-stack.c b/tools/perf/util/thread-stack.c

index 9ed59a452d1ff82966fd4eb99697d9bc3d9dd604..679688e70ae7e72e73d14cc7659cb965cc4d7016 100644 (file)
--- a/tools/perf/util/thread-stack.c
+++ b/tools/perf/util/thread-stack.c
@@ -219,7 +219,7 @@ static int thread_stack__call_return(struct thread *thread,
         return crp->process(&cr, crp->data);
  }
  
-static int thread_stack__flush(struct thread *thread, struct thread_stack *ts)
+static int __thread_stack__flush(struct thread *thread, struct thread_stack *ts)
  {
         struct call_return_processor *crp = ts->crp;
         int err;
@@ -242,6 +242,14 @@ static int thread_stack__flush(struct thread *thread, struct thread_stack *ts)
         return 0;
  }
  
+/*
+ * Flush the thread stack of @thread, if it has one.
+ *
+ * Returns the result of __thread_stack__flush(), or 0 when the thread has
+ * no stack attached.
+ */
+int thread_stack__flush(struct thread *thread)
+{
+       struct thread_stack *ts = thread->ts;
+
+       return ts ? __thread_stack__flush(thread, ts) : 0;
+}
+
  int thread_stack__event(struct thread *thread, u32 flags, u64 from_ip,
                         u64 to_ip, u16 insn_len, u64 trace_nr)
  {
@@ -264,7 +272,7 @@ int thread_stack__event(struct thread *thread, u32 flags, u64 from_ip,
          */
         if (trace_nr != thread->ts->trace_nr) {
                 if (thread->ts->trace_nr)
-                       thread_stack__flush(thread, thread->ts);
+                       __thread_stack__flush(thread, thread->ts);
                 thread->ts->trace_nr = trace_nr;
         }
  
@@ -297,7 +305,7 @@ void thread_stack__set_trace_nr(struct thread *thread, u64 trace_nr)
  
         if (trace_nr != thread->ts->trace_nr) {
                 if (thread->ts->trace_nr)
-                       thread_stack__flush(thread, thread->ts);
+                       __thread_stack__flush(thread, thread->ts);
                 thread->ts->trace_nr = trace_nr;
         }
  }
@@ -305,7 +313,7 @@ void thread_stack__set_trace_nr(struct thread *thread, u64 trace_nr)
  void thread_stack__free(struct thread *thread)
  {
         if (thread->ts) {
-               thread_stack__flush(thread, thread->ts);
+               __thread_stack__flush(thread, thread->ts);
                 zfree(&thread->ts->stack);
                 zfree(&thread->ts);
         }
@@ -689,7 +697,7 @@ int thread_stack__process(struct thread *thread, struct comm *comm,
  
         /* Flush stack on exec */
         if (ts->comm != comm && thread->pid_ == thread->tid) {
-               err = thread_stack__flush(thread, ts);
+               err = __thread_stack__flush(thread, ts);
                 if (err)
                         return err;
                 ts->comm = comm;
diff --git a/tools/perf/util/thread-stack.h b/tools/perf/util/thread-stack.h

index b843bbef8ba2177a66d3b2a3cebb8ac56d8b6f44..e1528f1374c3e5131efe8c2293ef9a6736ea3ed4 100644 (file)
--- a/tools/perf/util/thread-stack.h
+++ b/tools/perf/util/thread-stack.h
@@ -96,6 +96,7 @@ int thread_stack__event(struct thread *thread, u32 flags, u64 from_ip,
  void thread_stack__set_trace_nr(struct thread *thread, u64 trace_nr);
  void thread_stack__sample(struct thread *thread, struct ip_callchain *chain,
                           size_t sz, u64 ip);
+int thread_stack__flush(struct thread *thread);
  void thread_stack__free(struct thread *thread);
  
  struct call_return_processor *
diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c

index 1c8fbc9588c5fddc978e8a07562aecc167a2a752..28c4b746baa19bef9830814c4fe7c69f1be0b06b 100644 (file)
--- a/tools/perf/util/thread.c
+++ b/tools/perf/util/thread.c
@@ -18,7 +18,7 @@ int thread__init_map_groups(struct thread *thread, struct machine *machine)
         if (pid == thread->tid || pid == -1) {
                 thread->mg = map_groups__new(machine);
         } else {
-               leader = machine__findnew_thread(machine, pid, pid);
+               leader = __machine__findnew_thread(machine, pid, pid);
                 if (leader)
                         thread->mg = map_groups__get(leader->mg);
         }
@@ -53,7 +53,8 @@ struct thread *thread__new(pid_t pid, pid_t tid)
                         goto err_thread;
  
                 list_add(&comm->list, &thread->comm_list);
-
+               atomic_set(&thread->refcnt, 0);
+               RB_CLEAR_NODE(&thread->rb_node);
         }
  
         return thread;
@@ -67,6 +68,8 @@ void thread__delete(struct thread *thread)
  {
         struct comm *comm, *tmp;
  
+       BUG_ON(!RB_EMPTY_NODE(&thread->rb_node));
+
         thread_stack__free(thread);
  
         if (thread->mg) {
@@ -84,13 +87,14 @@ void thread__delete(struct thread *thread)
  
  struct thread *thread__get(struct thread *thread)
  {
-       ++thread->refcnt;
+       if (thread)
+               atomic_inc(&thread->refcnt);
         return thread;
  }
  
  void thread__put(struct thread *thread)
  {
-       if (thread && --thread->refcnt == 0) {
+       if (thread && atomic_dec_and_test(&thread->refcnt)) {
                 list_del_init(&thread->node);
                 thread__delete(thread);
         }
diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h

index 9b8a54dc34a81963d8026e226bcd3334713b1606..a0ac0317affb5ffc46f69dc00c4c258d0c40c684 100644 (file)
--- a/tools/perf/util/thread.h
+++ b/tools/perf/util/thread.h
@@ -1,6 +1,7 @@
  #ifndef __PERF_THREAD_H
  #define __PERF_THREAD_H
  
+#include <linux/atomic.h>
  #include <linux/rbtree.h>
  #include <linux/list.h>
  #include <unistd.h>
@@ -21,12 +22,12 @@ struct thread {
         pid_t                   tid;
         pid_t                   ppid;
         int                     cpu;
-       int                     refcnt;
+       atomic_t                refcnt;
         char                    shortname[3];
         bool                    comm_set;
+       int                     comm_len;
         bool                    dead; /* if set thread has exited */
         struct list_head        comm_list;
-       int                     comm_len;
         u64                     db_id;
  
         void                    *priv;
diff --git a/tools/perf/util/thread_map.c b/tools/perf/util/thread_map.c

index f93b9734735b9478d3b8da0a2edf6be03403079d..f4822bd03709af52aba3eed89d50f06bc53e4f86 100644 (file)
--- a/tools/perf/util/thread_map.c
+++ b/tools/perf/util/thread_map.c
@@ -20,6 +20,15 @@ static int filter(const struct dirent *dir)
                 return 1;
  }
  
+/*
+ * Resize @map so that it has room for @nr pid_t entries after the
+ * header.  A NULL @map makes this a plain allocation; the result is
+ * whatever realloc() returns.
+ */
+static struct thread_map *thread_map__realloc(struct thread_map *map, int nr)
+{
+       size_t entries_size = sizeof(pid_t) * nr;
+
+       return realloc(map, sizeof(*map) + entries_size);
+}
+
+#define thread_map__alloc(__nr) thread_map__realloc(NULL, __nr)
+
  struct thread_map *thread_map__new_by_pid(pid_t pid)
  {
         struct thread_map *threads;
@@ -33,7 +42,7 @@ struct thread_map *thread_map__new_by_pid(pid_t pid)
         if (items <= 0)
                 return NULL;
  
-       threads = malloc(sizeof(*threads) + sizeof(pid_t) * items);
+       threads = thread_map__alloc(items);
         if (threads != NULL) {
                 for (i = 0; i < items; i++)
                         threads->map[i] = atoi(namelist[i]->d_name);
@@ -49,7 +58,7 @@ struct thread_map *thread_map__new_by_pid(pid_t pid)
  
  struct thread_map *thread_map__new_by_tid(pid_t tid)
  {
-       struct thread_map *threads = malloc(sizeof(*threads) + sizeof(pid_t));
+       struct thread_map *threads = thread_map__alloc(1);
  
         if (threads != NULL) {
                 threads->map[0] = tid;
@@ -65,8 +74,8 @@ struct thread_map *thread_map__new_by_uid(uid_t uid)
         int max_threads = 32, items, i;
         char path[256];
         struct dirent dirent, *next, **namelist = NULL;
-       struct thread_map *threads = malloc(sizeof(*threads) +
-                                           max_threads * sizeof(pid_t));
+       struct thread_map *threads = thread_map__alloc(max_threads);
+
         if (threads == NULL)
                 goto out;
  
@@ -185,8 +194,7 @@ static struct thread_map *thread_map__new_by_pid_str(const char *pid_str)
                         goto out_free_threads;
  
                 total_tasks += items;
-               nt = realloc(threads, (sizeof(*threads) +
-                                      sizeof(pid_t) * total_tasks));
+               nt = thread_map__realloc(threads, total_tasks);
                 if (nt == NULL)
                         goto out_free_namelist;
  
@@ -216,7 +224,7 @@ out_free_threads:
  
  struct thread_map *thread_map__new_dummy(void)
  {
-       struct thread_map *threads = malloc(sizeof(*threads) + sizeof(pid_t));
+       struct thread_map *threads = thread_map__alloc(1);
  
         if (threads != NULL) {
                 threads->map[0] = -1;
@@ -253,7 +261,7 @@ static struct thread_map *thread_map__new_by_tid_str(const char *tid_str)
                         continue;
  
                 ntasks++;
-               nt = realloc(threads, sizeof(*threads) + sizeof(pid_t) * ntasks);
+               nt = thread_map__realloc(threads, ntasks);
  
                 if (nt == NULL)
                         goto out_free_threads;
diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h

index 51d9e56c0f841d89f730f00e72e984a867b9f4fc..c307dd4382863dd7f68314885115138f489c385b 100644 (file)
--- a/tools/perf/util/tool.h
+++ b/tools/perf/util/tool.h
@@ -3,6 +3,8 @@
  
  #include <stdbool.h>
  
+#include <linux/types.h>
+
  struct perf_session;
  union perf_event;
  struct perf_evlist;
@@ -29,6 +31,9 @@ typedef int (*event_op2)(struct perf_tool *tool, union perf_event *event,
  typedef int (*event_oe)(struct perf_tool *tool, union perf_event *event,
                         struct ordered_events *oe);
  
+typedef s64 (*event_op3)(struct perf_tool *tool, union perf_event *event,
+                        struct perf_session *session);
+
  struct perf_tool {
         event_sample    sample,
                         read;
@@ -38,13 +43,19 @@ struct perf_tool {
                         fork,
                         exit,
                         lost,
+                       lost_samples,
+                       aux,
+                       itrace_start,
                         throttle,
                         unthrottle;
         event_attr_op   attr;
         event_op2       tracing_data;
         event_oe        finished_round;
         event_op2       build_id,
-                       id_index;
+                       id_index,
+                       auxtrace_info,
+                       auxtrace_error;
+       event_op3       auxtrace;
         bool            ordered_events;
         bool            ordering_requires_timestamps;
  };
diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c

index 25d6c737be3e673db1904104a83286c5fbda0b40..d4957418657ec3ab88cce7f77b70095ff21d62c8 100644 (file)
--- a/tools/perf/util/trace-event-parse.c
+++ b/tools/perf/util/trace-event-parse.c
@@ -173,7 +173,7 @@ void parse_ftrace_printk(struct pevent *pevent,
         char *line;
         char *next = NULL;
         char *addr_str;
-       char *fmt;
+       char *fmt = NULL;
  
         line = strtok_r(file, "\n", &next);
         while (line) {
diff --git a/tools/perf/util/unwind-libunwind.c b/tools/perf/util/unwind-libunwind.c

index 7b09a443a280429d3de68f03ced731f66b36d3fc..4c00507ee3fd2ad488642def35226711cba821fe 100644 (file)
--- a/tools/perf/util/unwind-libunwind.c
+++ b/tools/perf/util/unwind-libunwind.c
@@ -269,13 +269,14 @@ static int read_unwind_spec_eh_frame(struct dso *dso, struct machine *machine,
         u64 offset = dso->data.eh_frame_hdr_offset;
  
         if (offset == 0) {
-               fd = dso__data_fd(dso, machine);
+               fd = dso__data_get_fd(dso, machine);
                 if (fd < 0)
                         return -EINVAL;
  
                 /* Check the .eh_frame section for unwinding info */
                 offset = elf_section_offset(fd, ".eh_frame_hdr");
                 dso->data.eh_frame_hdr_offset = offset;
+               dso__data_put_fd(dso);
         }
  
         if (offset)
@@ -294,13 +295,14 @@ static int read_unwind_spec_debug_frame(struct dso *dso,
         u64 ofs = dso->data.debug_frame_offset;
  
         if (ofs == 0) {
-               fd = dso__data_fd(dso, machine);
+               fd = dso__data_get_fd(dso, machine);
                 if (fd < 0)
                         return -EINVAL;
  
                 /* Check the .debug_frame section for unwinding info */
                 ofs = elf_section_offset(fd, ".debug_frame");
                 dso->data.debug_frame_offset = ofs;
+               dso__data_put_fd(dso);
         }
  
         *offset = ofs;
@@ -353,10 +355,13 @@ find_proc_info(unw_addr_space_t as, unw_word_t ip, unw_proc_info_t *pi,
  #ifndef NO_LIBUNWIND_DEBUG_FRAME
         /* Check the .debug_frame section for unwinding info */
         if (!read_unwind_spec_debug_frame(map->dso, ui->machine, &segbase)) {
-               int fd = dso__data_fd(map->dso, ui->machine);
+               int fd = dso__data_get_fd(map->dso, ui->machine);
                 int is_exec = elf_is_exec(fd, map->dso->name);
                 unw_word_t base = is_exec ? 0 : map->start;
  
+               if (fd >= 0)
+                       dso__data_put_fd(map->dso);
+
                 memset(&di, 0, sizeof(di));
                 if (dwarf_find_debug_frame(0, &di, ip, base, map->dso->name,
                                            map->start, map->end))
diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c

index 4ee6d0d4c9931752e76abe15dd098468c4e0c01f..edc2d633b33224530e9dcb7780877bf7d1be08b3 100644 (file)
--- a/tools/perf/util/util.c
+++ b/tools/perf/util/util.c
@@ -72,20 +72,60 @@ int mkdir_p(char *path, mode_t mode)
         return (stat(path, &st) && mkdir(path, mode)) ? -1 : 0;
  }
  
-static int slow_copyfile(const char *from, const char *to, mode_t mode)
+/*
+ * Recursively remove the directory @path and everything below it.
+ *
+ * Directory entries are examined with lstat(), so symbolic links are removed
+ * themselves and never followed.
+ *
+ * Returns 0 if @path cannot be opened with opendir() (nothing to walk),
+ * a negative value as soon as one entry cannot be examined or removed,
+ * and otherwise the result of the final rmdir(@path).
+ */
+int rm_rf(char *path)
+{
+       DIR *dir;
+       int ret = 0;
+       struct dirent *d;
+       char namebuf[PATH_MAX];
+
+       dir = opendir(path);
+       if (dir == NULL)
+               return 0;
+
+       while ((d = readdir(dir)) != NULL && !ret) {
+               struct stat statbuf;
+
+               if (!strcmp(d->d_name, ".") || !strcmp(d->d_name, ".."))
+                       continue;
+
+               scnprintf(namebuf, sizeof(namebuf), "%s/%s",
+                         path, d->d_name);
+
+               /*
+                * Look at the entry itself, not at what it points to:
+                * following a symlink to a directory would delete files
+                * that live outside @path, and a dangling symlink would
+                * make the whole removal fail.
+                */
+               ret = lstat(namebuf, &statbuf);
+               if (ret < 0) {
+                       pr_debug("stat failed: %s\n", namebuf);
+                       break;
+               }
+
+               if (S_ISREG(statbuf.st_mode) || S_ISLNK(statbuf.st_mode))
+                       ret = unlink(namebuf);
+               else if (S_ISDIR(statbuf.st_mode))
+                       ret = rm_rf(namebuf);
+               else {
+                       pr_debug("unknown file: %s\n", namebuf);
+                       ret = -1;
+               }
+       }
+       closedir(dir);
+
+       if (ret < 0)
+               return ret;
+
+       return rmdir(path);
+}
+
+static int slow_copyfile(const char *from, const char *to)
  {
         int err = -1;
         char *line = NULL;
         size_t n;
         FILE *from_fp = fopen(from, "r"), *to_fp;
-       mode_t old_umask;
  
         if (from_fp == NULL)
                 goto out;
  
-       old_umask = umask(mode ^ 0777);
         to_fp = fopen(to, "w");
-       umask(old_umask);
         if (to_fp == NULL)
                 goto out_fclose_from;
  
@@ -102,42 +142,81 @@ out:
         return err;
  }
  
+/*
+ * Copy @size bytes from offset @off_in of @ifd to offset @off_out of @ofd.
+ *
+ * The source range is mmap()ed read-only.  The mapping starts at @off_in
+ * rounded down to a multiple of page_size, and @off_in is then turned into
+ * an offset inside that mapping.
+ *
+ * Returns 0 if all @size bytes were written, -1 otherwise.
+ */
+int copyfile_offset(int ifd, loff_t off_in, int ofd, loff_t off_out, u64 size)
+{
+       void *ptr;
+       loff_t pgoff;
+
+       pgoff = off_in & ~(page_size - 1);
+       off_in -= pgoff;
+
+       ptr = mmap(NULL, off_in + size, PROT_READ, MAP_PRIVATE, ifd, pgoff);
+       if (ptr == MAP_FAILED)
+               return -1;
+
+       while (size) {
+               ssize_t ret = pwrite(ofd, ptr + off_in, size, off_out);
+               if (ret < 0 && errno == EINTR)
+                       continue;
+               if (ret <= 0)
+                       break;
+
+               /*
+                * Short write: advance the source and the destination
+                * cursors by the same amount so the next pwrite() continues
+                * right after the bytes just written.
+                */
+               size -= ret;
+               off_in += ret;
+               off_out += ret;
+       }
+       /* off_in + size never changes in the loop: it is the mapped length. */
+       munmap(ptr, off_in + size);
+
+       return size ? -1 : 0;
+}
+
+/*
+ * Copy the file @from to @to and give the copy the permission bits @mode.
+ *
+ * The data is written to a mkstemp() temporary named "<dir>/.<name>.XXXXXX"
+ * in the directory of @to; on success that temporary is link()ed to @to, and
+ * it is unlink()ed in either case.  @to must contain a '/', otherwise the
+ * copy is refused.
+ *
+ * A source reporting st_size == 0 is copied with slow_copyfile(), everything
+ * else with copyfile_offset().
+ *
+ * Returns 0 on success, non-zero on failure.
+ *
+ * NOTE(review): link() is expected to fail if @to already exists, so this
+ * would not overwrite an existing destination -- confirm callers rely on
+ * that.
+ */
  int copyfile_mode(const char *from, const char *to, mode_t mode)
  {
         int fromfd, tofd;
         struct stat st;
-       void *addr;
         int err = -1;
+       char *tmp = NULL, *ptr = NULL;
  
         if (stat(from, &st))
                 goto out;
  
-       if (st.st_size == 0) /* /proc? do it slowly... */
-               return slow_copyfile(from, to, mode);
-
-       fromfd = open(from, O_RDONLY);
-       if (fromfd < 0)
+       /* extra 'x' at the end is to reserve space for '.' */
+       if (asprintf(&tmp, "%s.XXXXXXx", to) < 0) {
+               tmp = NULL;
                 goto out;
+       }
+       ptr = strrchr(tmp, '/');
+       if (!ptr)
+               goto out;
+       /*
+        * Shift "/<name>.XXXXXX" one byte to the right (overwriting the spare
+        * 'x') and store '.' where the copied '/' landed, which turns the
+        * template into "<dir>/.<name>.XXXXXX".
+        */
+       ptr = memmove(ptr + 1, ptr, strlen(ptr) - 1);
+       *ptr = '.';
  
-       tofd = creat(to, mode);
+       tofd = mkstemp(tmp);
         if (tofd < 0)
-               goto out_close_from;
+               goto out;
+
+       if (fchmod(tofd, mode))
+               goto out_close_to;
+
+       if (st.st_size == 0) { /* /proc? do it slowly... */
+               err = slow_copyfile(from, tmp);
+               goto out_close_to;
+       }
  
-       addr = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fromfd, 0);
-       if (addr == MAP_FAILED)
+       fromfd = open(from, O_RDONLY);
+       if (fromfd < 0)
                 goto out_close_to;
  
-       if (write(tofd, addr, st.st_size) == st.st_size)
-               err = 0;
+       err = copyfile_offset(fromfd, 0, tofd, 0, st.st_size);
  
-       munmap(addr, st.st_size);
+       close(fromfd);
  out_close_to:
         close(tofd);
-       if (err)
-               unlink(to);
-out_close_from:
-       close(fromfd);
+       if (!err)
+               err = link(tmp, to);
+       unlink(tmp);
  out:
+       free(tmp);
         return err;
  }
  
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h

index 1ff23e04ad2730b99f78dbeef3af56ba1a45e9ec..8bce58b47a826918db8f395e3e6c2711e9bbba8e 100644 (file)
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -249,14 +249,20 @@ static inline int sane_case(int x, int high)
  }
  
  int mkdir_p(char *path, mode_t mode);
+int rm_rf(char *path);
  int copyfile(const char *from, const char *to);
  int copyfile_mode(const char *from, const char *to, mode_t mode);
+int copyfile_offset(int fromfd, loff_t from_ofs, int tofd, loff_t to_ofs, u64 size);
  
  s64 perf_atoll(const char *str);
  char **argv_split(const char *str, int *argcp);
  void argv_free(char **argv);
  bool strglobmatch(const char *str, const char *pat);
  bool strlazymatch(const char *str, const char *pat);
+/* Return true if @str contains at least one of the characters '*', '?' or '['. */
+static inline bool strisglob(const char *str)
+{
+       return strpbrk(str, "*?[") != NULL;
+}
  int strtailcmp(const char *s1, const char *s2);
  char *strxfrchar(char *s, char from, char to);
  unsigned long convert_unit(unsigned long value, char *unit);
diff --git a/tools/perf/util/vdso.c b/tools/perf/util/vdso.c

index 5c7dd796979d0d625df236c82c30b5e22f5ac7d9..4b89118f158db458ae29cca6d20b093d81392cf6 100644 (file)
--- a/tools/perf/util/vdso.c
+++ b/tools/perf/util/vdso.c
@@ -101,7 +101,7 @@ static char *get_file(struct vdso_file *vdso_file)
         return vdso;
  }
  
-void vdso__exit(struct machine *machine)
+void machine__exit_vdso(struct machine *machine)
  {
         struct vdso_info *vdso_info = machine->vdso_info;
  
@@ -120,14 +120,14 @@ void vdso__exit(struct machine *machine)
         zfree(&machine->vdso_info);
  }
  
-static struct dso *vdso__new(struct machine *machine, const char *short_name,
-                            const char *long_name)
+static struct dso *__machine__addnew_vdso(struct machine *machine, const char *short_name,
+                                         const char *long_name)
  {
         struct dso *dso;
  
         dso = dso__new(short_name);
         if (dso != NULL) {
-               dsos__add(&machine->user_dsos, dso);
+               __dsos__add(&machine->dsos, dso);
                 dso__set_long_name(dso, long_name, false);
         }
  
@@ -230,27 +230,31 @@ static const char *vdso__get_compat_file(struct vdso_file *vdso_file)
         return vdso_file->temp_file_name;
  }
  
-static struct dso *vdso__findnew_compat(struct machine *machine,
-                                       struct vdso_file *vdso_file)
+/*
+ * Look up the dso named vdso_file->dso_name in machine->dsos and, if it is
+ * not there yet, add one backed by the file that vdso__get_compat_file()
+ * provides.
+ *
+ * Must be called with machine->dsos.lock held for writing.
+ * machine__findnew_vdso() takes that lock before it reaches this function
+ * through __machine__findnew_vdso_compat(), so taking the rwlock again here
+ * would make the thread deadlock on itself.
+ *
+ * Returns the dso, or NULL if no compat vdso file could be obtained.
+ */
+static struct dso *__machine__findnew_compat(struct machine *machine,
+                                            struct vdso_file *vdso_file)
  {
         const char *file_name;
         struct dso *dso;
  
-       dso = dsos__find(&machine->user_dsos, vdso_file->dso_name, true);
+       dso = __dsos__find(&machine->dsos, vdso_file->dso_name, true);
         if (dso)
                 return dso;
  
         file_name = vdso__get_compat_file(vdso_file);
         if (!file_name)
                 return NULL;
  
-       return vdso__new(machine, vdso_file->dso_name, file_name);
+       return __machine__addnew_vdso(machine, vdso_file->dso_name, file_name);
  }
  
-static int vdso__dso_findnew_compat(struct machine *machine,
-                                   struct thread *thread,
-                                   struct vdso_info *vdso_info,
-                                   struct dso **dso)
+static int __machine__findnew_vdso_compat(struct machine *machine,
+                                         struct thread *thread,
+                                         struct vdso_info *vdso_info,
+                                         struct dso **dso)
  {
         enum dso_type dso_type;
  
@@ -267,10 +271,10 @@ static int vdso__dso_findnew_compat(struct machine *machine,
  
         switch (dso_type) {
         case DSO__TYPE_32BIT:
-               *dso = vdso__findnew_compat(machine, &vdso_info->vdso32);
+               *dso = __machine__findnew_compat(machine, &vdso_info->vdso32);
                 return 1;
         case DSO__TYPE_X32BIT:
-               *dso = vdso__findnew_compat(machine, &vdso_info->vdsox32);
+               *dso = __machine__findnew_compat(machine, &vdso_info->vdsox32);
                 return 1;
         case DSO__TYPE_UNKNOWN:
         case DSO__TYPE_64BIT:
@@ -281,35 +285,37 @@ static int vdso__dso_findnew_compat(struct machine *machine,
  
  #endif
  
-struct dso *vdso__dso_findnew(struct machine *machine,
-                             struct thread *thread __maybe_unused)
+/*
+ * Find the vdso dso of @machine, creating it (and machine->vdso_info) on
+ * first use.  The whole lookup runs with machine->dsos.lock held for
+ * writing.
+ *
+ * When BITS_PER_LONG == 64 the compat lookup is tried first; if it returns
+ * non-zero, the dso it stored is the result.  Otherwise the dso named
+ * DSO__NAME_VDSO is looked up and, if missing, added from the file returned
+ * by get_file().
+ *
+ * dso__get() is called on the result before the lock is dropped.
+ * NOTE(review): presumably this hands the caller a reference it has to
+ * drop again -- confirm.
+ * NOTE(review): dso can still be NULL at out_unlock, so dso__get() is
+ * assumed to tolerate NULL -- confirm.
+ *
+ * Returns the dso, or NULL on failure.
+ */
+struct dso *machine__findnew_vdso(struct machine *machine,
+                                 struct thread *thread __maybe_unused)
  {
         struct vdso_info *vdso_info;
-       struct dso *dso;
+       struct dso *dso = NULL;
  
+       pthread_rwlock_wrlock(&machine->dsos.lock);
         if (!machine->vdso_info)
                 machine->vdso_info = vdso_info__new();
  
         vdso_info = machine->vdso_info;
         if (!vdso_info)
-               return NULL;
+               goto out_unlock;
  
  #if BITS_PER_LONG == 64
-       if (vdso__dso_findnew_compat(machine, thread, vdso_info, &dso))
-               return dso;
+       if (__machine__findnew_vdso_compat(machine, thread, vdso_info, &dso))
+               goto out_unlock;
  #endif
  
-       dso = dsos__find(&machine->user_dsos, DSO__NAME_VDSO, true);
+       dso = __dsos__find(&machine->dsos, DSO__NAME_VDSO, true);
         if (!dso) {
                 char *file;
  
                 file = get_file(&vdso_info->vdso);
-               if (!file)
-                       return NULL;
-
-               dso = vdso__new(machine, DSO__NAME_VDSO, file);
+               if (file)
+                       dso = __machine__addnew_vdso(machine, DSO__NAME_VDSO, file);
         }
  
+out_unlock:
+       dso__get(dso);
+       pthread_rwlock_unlock(&machine->dsos.lock);
         return dso;
  }
  
diff --git a/tools/perf/util/vdso.h b/tools/perf/util/vdso.h

index d97da1616f0c5b658d8c1846b0285835b3a3478c..cdc4fabfc2124efa9c2c497dc35a5f6b77840a85 100644 (file)
--- a/tools/perf/util/vdso.h
+++ b/tools/perf/util/vdso.h
@@ -23,7 +23,7 @@ bool dso__is_vdso(struct dso *dso);
  struct machine;
  struct thread;
  
-struct dso *vdso__dso_findnew(struct machine *machine, struct thread *thread);
-void vdso__exit(struct machine *machine);
+struct dso *machine__findnew_vdso(struct machine *machine, struct thread *thread);
+void machine__exit_vdso(struct machine *machine);
  
  #endif /* __PERF_VDSO__ */
diff --git a/tools/perf/util/xyarray.c b/tools/perf/util/xyarray.c

index 22afbf6c536adbb291f4ee87a35edf970722fc5a..c10ba41ef3f6298eb77e624f7f1b14f41555c36c 100644 (file)
--- a/tools/perf/util/xyarray.c
+++ b/tools/perf/util/xyarray.c
@@ -9,11 +9,19 @@ struct xyarray *xyarray__new(int xlen, int ylen, size_t entry_size)
         if (xy != NULL) {
                 xy->entry_size = entry_size;
                 xy->row_size   = row_size;
+               xy->entries    = xlen * ylen;
         }
  
         return xy;
  }
  
+/* Zero the first xy->entries * xy->entry_size bytes of xy->contents. */
+void xyarray__reset(struct xyarray *xy)
+{
+       size_t n = xy->entries * xy->entry_size;
+
+       memset(xy->contents, 0, n);
+}
+
  void xyarray__delete(struct xyarray *xy)
  {
         free(xy);
diff --git a/tools/perf/util/xyarray.h b/tools/perf/util/xyarray.h

index c488a07275dd2783f13360a341063de1d87eab6c..7f30af371b7ee692f23b9bf4aeb354584f453722 100644 (file)
--- a/tools/perf/util/xyarray.h
+++ b/tools/perf/util/xyarray.h
@@ -6,11 +6,13 @@
  struct xyarray {
         size_t row_size;
         size_t entry_size;
+       size_t entries; /* xlen * ylen, as set by xyarray__new() */
         char contents[];
  };
  
  struct xyarray *xyarray__new(int xlen, int ylen, size_t entry_size);
  void xyarray__delete(struct xyarray *xy);
+void xyarray__reset(struct xyarray *xy);
  
  static inline void *xyarray__entry(struct xyarray *xy, int x, int y)
  {
diff --git a/tools/testing/selftests/rcutorture/bin/configinit.sh b/tools/testing/selftests/rcutorture/bin/configinit.sh

index 15f1a17ca96e69695f5f266cc58198102ba1a0c9..3f81a109520693ec6ebac0abe78c73388b8fc334 100755 (executable)
--- a/tools/testing/selftests/rcutorture/bin/configinit.sh
+++ b/tools/testing/selftests/rcutorture/bin/configinit.sh
@@ -66,7 +66,7 @@ make $buildloc $TORTURE_DEFCONFIG > $builddir/Make.defconfig.out 2>&1
  mv $builddir/.config $builddir/.config.sav
  sh $T/upd.sh < $builddir/.config.sav > $builddir/.config
  cp $builddir/.config $builddir/.config.new
-yes '' | make $buildloc oldconfig > $builddir/Make.modconfig.out 2>&1
+yes '' | make $buildloc oldconfig > $builddir/Make.oldconfig.out 2> $builddir/Make.oldconfig.err
  
  # verify new config matches specification.
  configcheck.sh $builddir/.config $c
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh

index 4f5b20f367a944f07f0443a480dd79d061031d87..d86bdd6b6cc2df3148adf7bacff4c6a014edc3cc 100755 (executable)
--- a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
@@ -43,6 +43,10 @@ do
                 if test -f "$i/console.log"
                 then
                         configcheck.sh $i/.config $i/ConfigFragment
+                       if test -r $i/Make.oldconfig.err
+                       then
+                               cat $i/Make.oldconfig.err
+                       fi
                         parse-build.sh $i/Make.out $configfile
                         parse-torture.sh $i/console.log $configfile
                         parse-console.sh $i/console.log $configfile
diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh

index dd2812ceb0baba2bb4f39343e428582e9619179f..fbe2dbff1e210c2f4711d2f581823d6b14c85799 100755 (executable)
--- a/tools/testing/selftests/rcutorture/bin/kvm.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm.sh
@@ -55,7 +55,7 @@ usage () {
         echo "       --bootargs kernel-boot-arguments"
         echo "       --bootimage relative-path-to-kernel-boot-image"
         echo "       --buildonly"
-       echo "       --configs \"config-file list\""
+       echo "       --configs \"config-file list w/ repeat factor (3*TINY01)\""
         echo "       --cpus N"
         echo "       --datestamp string"
         echo "       --defconfig string"
@@ -178,13 +178,26 @@ fi
  touch $T/cfgcpu
  for CF in $configs
  do
-       if test -f "$CONFIGFRAG/$CF"
+       case $CF in
+       [0-9]\**|[0-9][0-9]\**|[0-9][0-9][0-9]\**)
+               config_reps=`echo $CF | sed -e 's/\*.*$//'`
+               CF1=`echo $CF | sed -e 's/^[^*]*\*//'`
+               ;;
+       *)
+               config_reps=1
+               CF1=$CF
+               ;;
+       esac
+       if test -f "$CONFIGFRAG/$CF1"
         then
-               cpu_count=`configNR_CPUS.sh $CONFIGFRAG/$CF`
-               cpu_count=`configfrag_boot_cpus "$TORTURE_BOOTARGS" "$CONFIGFRAG/$CF" "$cpu_count"`
-               echo $CF $cpu_count >> $T/cfgcpu
+               cpu_count=`configNR_CPUS.sh $CONFIGFRAG/$CF1`
+               cpu_count=`configfrag_boot_cpus "$TORTURE_BOOTARGS" "$CONFIGFRAG/$CF1" "$cpu_count"`
+               for ((cur_rep=0;cur_rep<$config_reps;cur_rep++))
+               do
+                       echo $CF1 $cpu_count >> $T/cfgcpu
+               done
         else
-               echo "The --configs file $CF does not exist, terminating."
+               echo "The --configs file $CF1 does not exist, terminating."
                 exit 1
         fi
  done
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon

index 49701218dc620481ff32344863411413e208da66..f824b4c9d9d9132d9681c5fe51c26caa76d4d8de 100644 (file)
--- a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon
+++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon
@@ -1,3 +1,5 @@
  CONFIG_RCU_TORTURE_TEST=y
  CONFIG_PRINTK_TIME=y
+CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP=y
  CONFIG_RCU_TORTURE_TEST_SLOW_INIT=y
+CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N

index 9fbb41b9b3149ed62cb0e97aabe6731d7bdc3c6c..1a087c3c8bb861584c069142bfe1fd8b02415956 100644 (file)
--- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N
+++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N
@@ -5,3 +5,4 @@ CONFIG_HOTPLUG_CPU=y
  CONFIG_PREEMPT_NONE=y
  CONFIG_PREEMPT_VOLUNTARY=n
  CONFIG_PREEMPT=n
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P

index 4b6f272dba27f8483f45c99a4a28f419e1b9d69a..4837430a71c0c3456979eb02a635ac0b9a3e1318 100644 (file)
--- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P
+++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P
@@ -5,3 +5,4 @@ CONFIG_HOTPLUG_CPU=y
  CONFIG_PREEMPT_NONE=n
  CONFIG_PREEMPT_VOLUNTARY=n
  CONFIG_PREEMPT=y
+#CHECK#CONFIG_RCU_EXPERT=n
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P.boot b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P.boot

index 238bfe3bd0cccf5096cace6c1012dac22d111728..84a7d51b7481e7a4173b0e87f9d46b51b65ef1eb 100644 (file)
--- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P.boot
+++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P.boot
@@ -1 +1 @@
-rcutorture.torture_type=srcu
+rcutorture.torture_type=srcud
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TASKS01 b/tools/testing/selftests/rcutorture/configs/rcu/TASKS01

index 97f0a0b27ef7293ed4cb9fa364fbee81629807c2..2cc0e60eba6eed66c1ebf6f5d0c85a19b0564ec3 100644 (file)
--- a/tools/testing/selftests/rcutorture/configs/rcu/TASKS01
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TASKS01
@@ -5,5 +5,6 @@ CONFIG_PREEMPT_NONE=n
  CONFIG_PREEMPT_VOLUNTARY=n
  CONFIG_PREEMPT=y
  CONFIG_DEBUG_LOCK_ALLOC=y
-CONFIG_PROVE_RCU=y
-CONFIG_TASKS_RCU=y
+CONFIG_PROVE_LOCKING=n
+#CHECK#CONFIG_PROVE_RCU=n
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TASKS02 b/tools/testing/selftests/rcutorture/configs/rcu/TASKS02

index 696d2ea74d13bb92ba91ed45a0f0df951d7234b9..ad2be91e5ee7624e95df63885f70dbbc833afb64 100644 (file)
--- a/tools/testing/selftests/rcutorture/configs/rcu/TASKS02
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TASKS02
@@ -2,4 +2,3 @@ CONFIG_SMP=n
  CONFIG_PREEMPT_NONE=y
  CONFIG_PREEMPT_VOLUNTARY=n
  CONFIG_PREEMPT=n
-CONFIG_TASKS_RCU=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TASKS03 b/tools/testing/selftests/rcutorture/configs/rcu/TASKS03

index 9c60da5b5d1ddae021ef28ed3b5f5a008186dc11..c70c51d5ded15af901e1a2c267b144a9cc88ba4d 100644 (file)
--- a/tools/testing/selftests/rcutorture/configs/rcu/TASKS03
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TASKS03
@@ -6,8 +6,8 @@ CONFIG_HIBERNATION=n
  CONFIG_PREEMPT_NONE=n
  CONFIG_PREEMPT_VOLUNTARY=n
  CONFIG_PREEMPT=y
-CONFIG_TASKS_RCU=y
  CONFIG_HZ_PERIODIC=n
  CONFIG_NO_HZ_IDLE=n
  CONFIG_NO_HZ_FULL=y
  CONFIG_NO_HZ_FULL_ALL=y
+#CHECK#CONFIG_RCU_EXPERT=n
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TINY02 b/tools/testing/selftests/rcutorture/configs/rcu/TINY02

index 36e41df3d27aa6ad7abdf56dba609fd51bb1a9b1..f1892e0371c954bd5cfc800a6b9cc87297e9433e 100644 (file)
--- a/tools/testing/selftests/rcutorture/configs/rcu/TINY02
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TINY02
@@ -8,7 +8,7 @@ CONFIG_NO_HZ_IDLE=n
  CONFIG_NO_HZ_FULL=n
  CONFIG_RCU_TRACE=y
  CONFIG_PROVE_LOCKING=y
-CONFIG_PROVE_RCU=y
+#CHECK#CONFIG_PROVE_RCU=y
  CONFIG_DEBUG_LOCK_ALLOC=y
  CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
  CONFIG_PREEMPT_COUNT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TINY02.boot b/tools/testing/selftests/rcutorture/configs/rcu/TINY02.boot

index 0f0802730014c675b6f830601d4aed340aa1f4d3..6c1a292a65fb499968bd2495fa046aedb02b39f8 100644 (file)
--- a/tools/testing/selftests/rcutorture/configs/rcu/TINY02.boot
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TINY02.boot
@@ -1,2 +1,3 @@
  rcupdate.rcu_self_test=1
  rcupdate.rcu_self_test_bh=1
+rcutorture.torture_type=rcu_bh
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE01 b/tools/testing/selftests/rcutorture/configs/rcu/TREE01

index f8a10a7500c64f057987e3d5e3ab765d322973fb..8e9137f66831f2fa6f7ea1c2de7aba6fd8fa99c2 100644 (file)
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE01
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE01
@@ -16,3 +16,4 @@ CONFIG_DEBUG_LOCK_ALLOC=n
  CONFIG_RCU_CPU_STALL_INFO=n
  CONFIG_RCU_BOOST=n
  CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE02 b/tools/testing/selftests/rcutorture/configs/rcu/TREE02

index 629122fb8b4a12323e800f09453a3a06718927d8..aeea6a204d14b1ea87737d6d5024f75668f6dd68 100644 (file)
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE02
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE02
@@ -14,10 +14,10 @@ CONFIG_SUSPEND=n
  CONFIG_HIBERNATION=n
  CONFIG_RCU_FANOUT=3
  CONFIG_RCU_FANOUT_LEAF=3
-CONFIG_RCU_FANOUT_EXACT=n
  CONFIG_RCU_NOCB_CPU=n
  CONFIG_DEBUG_LOCK_ALLOC=y
  CONFIG_PROVE_LOCKING=n
  CONFIG_RCU_CPU_STALL_INFO=n
  CONFIG_RCU_BOOST=n
  CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE02-T b/tools/testing/selftests/rcutorture/configs/rcu/TREE02-T

index a25de47888a4fc11371adc7bc99a93fe2dfcf8d6..2ac9e68ea3d1481764b82eaba427039ed45fa992 100644 (file)
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE02-T
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE02-T
@@ -14,7 +14,6 @@ CONFIG_SUSPEND=n
  CONFIG_HIBERNATION=n
  CONFIG_RCU_FANOUT=3
  CONFIG_RCU_FANOUT_LEAF=3
-CONFIG_RCU_FANOUT_EXACT=n
  CONFIG_RCU_NOCB_CPU=n
  CONFIG_DEBUG_LOCK_ALLOC=y
  CONFIG_PROVE_LOCKING=n
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE03 b/tools/testing/selftests/rcutorture/configs/rcu/TREE03

index 53f24e0a0ab618c8470e1727ecec0a8d78ce5db3..72aa7d87ea99159db02b4f07976abdc99da3e71b 100644 (file)
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE03
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE03
@@ -1,5 +1,5 @@
  CONFIG_SMP=y
-CONFIG_NR_CPUS=8
+CONFIG_NR_CPUS=16
  CONFIG_PREEMPT_NONE=n
  CONFIG_PREEMPT_VOLUNTARY=n
  CONFIG_PREEMPT=y
@@ -9,12 +9,12 @@ CONFIG_NO_HZ_IDLE=n
  CONFIG_NO_HZ_FULL=n
  CONFIG_RCU_TRACE=y
  CONFIG_HOTPLUG_CPU=y
-CONFIG_RCU_FANOUT=4
-CONFIG_RCU_FANOUT_LEAF=4
-CONFIG_RCU_FANOUT_EXACT=n
+CONFIG_RCU_FANOUT=2
+CONFIG_RCU_FANOUT_LEAF=2
  CONFIG_RCU_NOCB_CPU=n
  CONFIG_DEBUG_LOCK_ALLOC=n
  CONFIG_RCU_CPU_STALL_INFO=n
  CONFIG_RCU_BOOST=y
  CONFIG_RCU_KTHREAD_PRIO=2
  CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot

new file mode 100644 (file)

index 0000000..120c0c8
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot
@@ -0,0 +1 @@
+rcutorture.onoff_interval=1 rcutorture.onoff_holdoff=30
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 b/tools/testing/selftests/rcutorture/configs/rcu/TREE04

index 0f84db35b36d6221b05a1a25d001f459b0870a08..3f5112751cda0d571af0967395d0b80e18e8543a 100644 (file)
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE04
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04
@@ -13,10 +13,10 @@ CONFIG_RCU_TRACE=y
  CONFIG_HOTPLUG_CPU=n
  CONFIG_SUSPEND=n
  CONFIG_HIBERNATION=n
-CONFIG_RCU_FANOUT=2
-CONFIG_RCU_FANOUT_LEAF=2
-CONFIG_RCU_FANOUT_EXACT=n
+CONFIG_RCU_FANOUT=4
+CONFIG_RCU_FANOUT_LEAF=4
  CONFIG_RCU_NOCB_CPU=n
  CONFIG_DEBUG_LOCK_ALLOC=n
-CONFIG_RCU_CPU_STALL_INFO=y
+CONFIG_RCU_CPU_STALL_INFO=n
  CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE05 b/tools/testing/selftests/rcutorture/configs/rcu/TREE05

index 212e3bfd2b2ac0b8d21bc261e247600733e3c5cc..c04dfea6fd217e5ddb45cb7b46257b4979454a44 100644 (file)
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE05
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE05
@@ -12,11 +12,11 @@ CONFIG_RCU_TRACE=n
  CONFIG_HOTPLUG_CPU=y
  CONFIG_RCU_FANOUT=6
  CONFIG_RCU_FANOUT_LEAF=6
-CONFIG_RCU_FANOUT_EXACT=n
  CONFIG_RCU_NOCB_CPU=y
  CONFIG_RCU_NOCB_CPU_NONE=y
  CONFIG_DEBUG_LOCK_ALLOC=y
  CONFIG_PROVE_LOCKING=y
-CONFIG_PROVE_RCU=y
+#CHECK#CONFIG_PROVE_RCU=y
  CONFIG_RCU_CPU_STALL_INFO=n
  CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE06 b/tools/testing/selftests/rcutorture/configs/rcu/TREE06

index 7eee63b442181267b6ae4ca0c723af59bcb2ac37..f51d2c73a68ec221f45d26d93e0a2743751447b7 100644 (file)
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE06
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE06
@@ -14,10 +14,10 @@ CONFIG_SUSPEND=n
  CONFIG_HIBERNATION=n
  CONFIG_RCU_FANOUT=6
  CONFIG_RCU_FANOUT_LEAF=6
-CONFIG_RCU_FANOUT_EXACT=y
  CONFIG_RCU_NOCB_CPU=n
  CONFIG_DEBUG_LOCK_ALLOC=y
  CONFIG_PROVE_LOCKING=y
-CONFIG_PROVE_RCU=y
+#CHECK#CONFIG_PROVE_RCU=y
  CONFIG_RCU_CPU_STALL_INFO=n
  CONFIG_DEBUG_OBJECTS_RCU_HEAD=y
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE06.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE06.boot

index da9a03a398db148effea02957c0e52cad42f0224..dd90f28ed700b31e897ef5f2eff1ed577f031d05 100644 (file)
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE06.boot
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE06.boot
@@ -1,3 +1,4 @@
  rcupdate.rcu_self_test=1
  rcupdate.rcu_self_test_bh=1
  rcupdate.rcu_self_test_sched=1
+rcutree.rcu_fanout_exact=1
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE07 b/tools/testing/selftests/rcutorture/configs/rcu/TREE07

index 92a97fa97dec1c4b06dabea9dcbea5e614bad965..f422af4ff5a31bf0c497c7d652b1c0b6e7d30a34 100644 (file)
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE07
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE07
@@ -15,8 +15,8 @@ CONFIG_RCU_TRACE=y
  CONFIG_HOTPLUG_CPU=y
  CONFIG_RCU_FANOUT=2
  CONFIG_RCU_FANOUT_LEAF=2
-CONFIG_RCU_FANOUT_EXACT=n
  CONFIG_RCU_NOCB_CPU=n
  CONFIG_DEBUG_LOCK_ALLOC=n
-CONFIG_RCU_CPU_STALL_INFO=y
+CONFIG_RCU_CPU_STALL_INFO=n
  CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE08 b/tools/testing/selftests/rcutorture/configs/rcu/TREE08

index 5812027d6f9ff043fecdcb2558a0c4befe83b50c..a24d2ca30646c3afe48417cdbb47e085a09d7486 100644 (file)
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE08
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE08
@@ -1,5 +1,5 @@
  CONFIG_SMP=y
-CONFIG_NR_CPUS=16
+CONFIG_NR_CPUS=8
  CONFIG_PREEMPT_NONE=n
  CONFIG_PREEMPT_VOLUNTARY=n
  CONFIG_PREEMPT=y
@@ -13,13 +13,13 @@ CONFIG_HOTPLUG_CPU=n
  CONFIG_SUSPEND=n
  CONFIG_HIBERNATION=n
  CONFIG_RCU_FANOUT=3
-CONFIG_RCU_FANOUT_EXACT=y
  CONFIG_RCU_FANOUT_LEAF=2
  CONFIG_RCU_NOCB_CPU=y
  CONFIG_RCU_NOCB_CPU_ALL=y
  CONFIG_DEBUG_LOCK_ALLOC=n
  CONFIG_PROVE_LOCKING=y
-CONFIG_PROVE_RCU=y
+#CHECK#CONFIG_PROVE_RCU=y
  CONFIG_RCU_CPU_STALL_INFO=n
  CONFIG_RCU_BOOST=n
  CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE08-T b/tools/testing/selftests/rcutorture/configs/rcu/TREE08-T

index 3eaeccacb08389117d7d2ace6e4381049a2e5e57..b2b8cea69dc9935cadf032524c64406a4aa92ef7 100644 (file)
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE08-T
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE08-T
@@ -13,7 +13,6 @@ CONFIG_HOTPLUG_CPU=n
  CONFIG_SUSPEND=n
  CONFIG_HIBERNATION=n
  CONFIG_RCU_FANOUT=3
-CONFIG_RCU_FANOUT_EXACT=y
  CONFIG_RCU_FANOUT_LEAF=2
  CONFIG_RCU_NOCB_CPU=y
  CONFIG_RCU_NOCB_CPU_ALL=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE08-T.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE08-T.boot

new file mode 100644 (file)

index 0000000..883149b
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE08-T.boot
@@ -0,0 +1 @@
+rcutree.rcu_fanout_exact=1
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot

index 2561daf605ad5f9a09920e98b18ab451d3ddada3..fb066dc82769fe4f910f4e86ec3c4e1541071adb 100644 (file)
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot
@@ -1,3 +1,4 @@
  rcutorture.torture_type=sched
  rcupdate.rcu_self_test=1
  rcupdate.rcu_self_test_sched=1
+rcutree.rcu_fanout_exact=1
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE09 b/tools/testing/selftests/rcutorture/configs/rcu/TREE09

index 6076b36f6c0b452c27ffb0c80e40fdacc4014eca..aa4ed08d999d9dbe8b438e39841ab8093d157c34 100644 (file)
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE09
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE09
@@ -16,3 +16,4 @@ CONFIG_DEBUG_LOCK_ALLOC=n
  CONFIG_RCU_CPU_STALL_INFO=n
  CONFIG_RCU_BOOST=n
  CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+#CHECK#CONFIG_RCU_EXPERT=n
diff --git a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt

index ec03c883db005192c3d5c644ff2595f3e3c6f673..b24c0004fc499868046eee81993caba27b5f1827 100644 (file)
--- a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
+++ b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
@@ -12,13 +12,12 @@ CONFIG_NO_HZ_IDLE -- Do those not otherwise specified. (Groups of two.)
  CONFIG_NO_HZ_FULL -- Do two, one with CONFIG_NO_HZ_FULL_SYSIDLE.
  CONFIG_NO_HZ_FULL_SYSIDLE -- Do one.
  CONFIG_PREEMPT -- Do half.  (First three and #8.)
-CONFIG_PROVE_LOCKING -- Do all but two, covering CONFIG_PROVE_RCU and not.
-CONFIG_PROVE_RCU -- Do all but one under CONFIG_PROVE_LOCKING.
+CONFIG_PROVE_LOCKING -- Do several, covering CONFIG_DEBUG_LOCK_ALLOC=y and not.
+CONFIG_PROVE_RCU -- Hardwired to CONFIG_PROVE_LOCKING.
  CONFIG_RCU_BOOST -- one of PREEMPT_RCU.
  CONFIG_RCU_KTHREAD_PRIO -- set to 2 for _BOOST testing.
-CONFIG_RCU_CPU_STALL_INFO -- Do one.
-CONFIG_RCU_FANOUT -- Cover hierarchy as currently, but overlap with others.
-CONFIG_RCU_FANOUT_EXACT -- Do one.
+CONFIG_RCU_CPU_STALL_INFO -- Now default, avoid at least twice.
+CONFIG_RCU_FANOUT -- Cover hierarchy, but overlap with others.
  CONFIG_RCU_FANOUT_LEAF -- Do one non-default.
  CONFIG_RCU_FAST_NO_HZ -- Do one, but not with CONFIG_RCU_NOCB_CPU_ALL.
  CONFIG_RCU_NOCB_CPU -- Do three, see below.
@@ -27,28 +26,19 @@ CONFIG_RCU_NOCB_CPU_NONE -- Do one.
  CONFIG_RCU_NOCB_CPU_ZERO -- Do one.
  CONFIG_RCU_TRACE -- Do half.
  CONFIG_SMP -- Need one !SMP for PREEMPT_RCU.
+!RCU_EXPERT -- Do a few, but these have to be vanilla configurations.
  RCU-bh: Do one with PREEMPT and one with !PREEMPT.
  RCU-sched: Do one with PREEMPT but not BOOST.
  
  
-Hierarchy:
-
-TREE01.        CONFIG_NR_CPUS=8, CONFIG_RCU_FANOUT=8, CONFIG_RCU_FANOUT_EXACT=n.
-TREE02.        CONFIG_NR_CPUS=8, CONFIG_RCU_FANOUT=3, CONFIG_RCU_FANOUT_EXACT=n,
-       CONFIG_RCU_FANOUT_LEAF=3.
-TREE03.        CONFIG_NR_CPUS=8, CONFIG_RCU_FANOUT=4, CONFIG_RCU_FANOUT_EXACT=n,
-       CONFIG_RCU_FANOUT_LEAF=4.
-TREE04.        CONFIG_NR_CPUS=8, CONFIG_RCU_FANOUT=2, CONFIG_RCU_FANOUT_EXACT=n,
-       CONFIG_RCU_FANOUT_LEAF=2.
-TREE05.        CONFIG_NR_CPUS=8, CONFIG_RCU_FANOUT=6, CONFIG_RCU_FANOUT_EXACT=n
-       CONFIG_RCU_FANOUT_LEAF=6.
-TREE06.        CONFIG_NR_CPUS=8, CONFIG_RCU_FANOUT=6, CONFIG_RCU_FANOUT_EXACT=y
-       CONFIG_RCU_FANOUT_LEAF=6.
-TREE07.        CONFIG_NR_CPUS=16, CONFIG_RCU_FANOUT=2, CONFIG_RCU_FANOUT_EXACT=n,
-       CONFIG_RCU_FANOUT_LEAF=2.
-TREE08.        CONFIG_NR_CPUS=16, CONFIG_RCU_FANOUT=3, CONFIG_RCU_FANOUT_EXACT=y,
-       CONFIG_RCU_FANOUT_LEAF=2.
-TREE09.        CONFIG_NR_CPUS=1.
+Boot parameters:
+
+nohz_full -- Do at least one.
+maxcpus -- Do at least one.
+rcupdate.rcu_self_test_bh -- Do at least one each, offloaded and not.
+rcupdate.rcu_self_test_sched -- Do at least one each, offloaded and not.
+rcupdate.rcu_self_test -- Do at least one each, offloaded and not.
+rcutree.rcu_fanout_exact -- Do at least one.
  
  
  Kconfig Parameters Ignored:
diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile

index 59d364aef1a835f25664c45d54c5e22da2d1ed2b..caa60d56d7d1f56d00c6cf087a5c48e863ea7be9 100644 (file)
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -5,8 +5,10 @@ include ../lib.mk
  .PHONY: all all_32 all_64 warn_32bit_failure clean
  
  TARGETS_C_BOTHBITS := sigreturn single_step_syscall sysret_ss_attrs
+TARGETS_C_32BIT_ONLY := entry_from_vm86
  
-BINARIES_32 := $(TARGETS_C_BOTHBITS:%=%_32)
+TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY)
+BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32)
  BINARIES_64 := $(TARGETS_C_BOTHBITS:%=%_64)
  
  CFLAGS := -O2 -g -std=gnu99 -pthread -Wall
@@ -32,7 +34,7 @@ all_64: $(BINARIES_64)
  clean:
         $(RM) $(BINARIES_32) $(BINARIES_64)
  
-$(TARGETS_C_BOTHBITS:%=%_32): %_32: %.c
+$(TARGETS_C_32BIT_ALL:%=%_32): %_32: %.c
         $(CC) -m32 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl
  
  $(TARGETS_C_BOTHBITS:%=%_64): %_64: %.c
diff --git a/tools/testing/selftests/x86/entry_from_vm86.c b/tools/testing/selftests/x86/entry_from_vm86.c

new file mode 100644 (file)

index 0000000..5c38a18
--- /dev/null
+++ b/tools/testing/selftests/x86/entry_from_vm86.c
@@ -0,0 +1,114 @@
+/*
+ * entry_from_vm86.c - tests kernel entries from vm86 mode
+ * Copyright (c) 2014-2015 Andrew Lutomirski
+ *
+ * This exercises a few paths that need to special-case vm86 mode.
+ *
+ * GPL v2.
+ */
+
+#define _GNU_SOURCE
+
+#include <assert.h>
+#include <stdlib.h>
+#include <sys/syscall.h>
+#include <sys/signal.h>
+#include <sys/ucontext.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <string.h>
+#include <inttypes.h>
+#include <sys/mman.h>
+#include <err.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <errno.h>
+#include <sys/vm86.h>
+
+static unsigned long load_addr = 0x10000;
+static int nerrs = 0;
+
+asm (
+       ".pushsection .rodata\n\t"
+       ".type vmcode_bound, @object\n\t"
+       "vmcode:\n\t"
+       "vmcode_bound:\n\t"
+       ".code16\n\t"
+       "bound %ax, (2048)\n\t"
+       "int3\n\t"
+       "vmcode_sysenter:\n\t"
+       "sysenter\n\t"
+       ".size vmcode, . - vmcode\n\t"
+       "end_vmcode:\n\t"
+       ".code32\n\t"
+       ".popsection"
+       );
+
+extern unsigned char vmcode[], end_vmcode[];
+extern unsigned char vmcode_bound[], vmcode_sysenter[];
+
+static void do_test(struct vm86plus_struct *v86, unsigned long eip,
+                   const char *text)
+{
+       long ret;
+
+       printf("[RUN]\t%s from vm86 mode\n", text);
+       v86->regs.eip = eip;
+       ret = vm86(VM86_ENTER, v86);
+
+       if (ret == -1 && errno == ENOSYS) {
+               printf("[SKIP]\tvm86 not supported\n");
+               return;
+       }
+
+       if (VM86_TYPE(ret) == VM86_INTx) {
+               char trapname[32];
+               int trapno = VM86_ARG(ret);
+               if (trapno == 13)
+                       strcpy(trapname, "GP");
+               else if (trapno == 5)
+                       strcpy(trapname, "BR");
+               else if (trapno == 14)
+                       strcpy(trapname, "PF");
+               else
+                       sprintf(trapname, "%d", trapno);
+
+               printf("[OK]\tExited vm86 mode due to #%s\n", trapname);
+       } else if (VM86_TYPE(ret) == VM86_UNKNOWN) {
+               printf("[OK]\tExited vm86 mode due to unhandled GP fault\n");
+       } else {
+               printf("[OK]\tExited vm86 mode due to type %ld, arg %ld\n",
+                      VM86_TYPE(ret), VM86_ARG(ret));
+       }
+}
+
+int main(void)
+{
+       struct vm86plus_struct v86;
+       unsigned char *addr = mmap((void *)load_addr, 4096,
+                                  PROT_READ | PROT_WRITE | PROT_EXEC,
+                                  MAP_ANONYMOUS | MAP_PRIVATE, -1,0);
+       if (addr != (unsigned char *)load_addr)
+               err(1, "mmap");
+
+       memcpy(addr, vmcode, end_vmcode - vmcode);
+       addr[2048] = 2;
+       addr[2050] = 3;
+
+       memset(&v86, 0, sizeof(v86));
+
+       v86.regs.cs = load_addr / 16;
+       v86.regs.ss = load_addr / 16;
+       v86.regs.ds = load_addr / 16;
+       v86.regs.es = load_addr / 16;
+
+       assert((v86.regs.cs & 3) == 0); /* Looks like RPL = 0 */
+
+       /* #BR -- should deliver SIG??? */
+       do_test(&v86, vmcode_bound - vmcode, "#BR");
+
+       /* SYSENTER -- should cause #GP or #UD depending on CPU */
+       do_test(&v86, vmcode_sysenter - vmcode, "SYSENTER");
+
+       return (nerrs == 0 ? 0 : 1);
+}
author	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 23 Jun 2015 00:59:09 +0000 (17:59 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 23 Jun 2015 00:59:09 +0000 (17:59 -0700)