Merge branch 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...

author Linus Torvalds <torvalds@linux-foundation.org>
Sat, 30 Dec 2017 01:02:49 +0000 (17:02 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 30 Dec 2017 01:02:49 +0000 (17:02 -0800)
diff --combined Documentation/admin-guide/kernel-parameters.txt

index 6571fbfdb2a1527c25b3a01e9c4228c84adce639,520fdec15bbb5b1b8f01719bf6714090152d0cea..e49311d5350496b1db6e106634da7a09b58fc4a3
--- 1/Documentation/admin-guide/kernel-parameters.txt
--- 2/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@@ -314,7 -314,7 +314,7 @@@
         amijoy.map=     [HW,JOY] Amiga joystick support
                         Map of devices attached to JOY0DAT and JOY1DAT
                         Format: <a>,<b>
- -                      See also Documentation/input/joystick.txt
+ +                      See also Documentation/input/joydev/joystick.rst
   
         analog.map=     [HW,JOY] Analog joystick and gamepad support
                         Specifies type or capabilities of an analog joystick
@@@ -439,7 -439,7 +439,7 @@@
         bttv.card=      [HW,V4L] bttv (bt848 + bt878 based grabber cards)
         bttv.radio=     Most important insmod options are available as
                         kernel args too.
- -      bttv.pll=       See Documentation/video4linux/bttv/Insmod-options
+ +      bttv.pll=       See Documentation/media/v4l-drivers/bttv.rst
         bttv.tuner=
   
         bulk_remove=off [PPC]  This parameter disables the use of the pSeries
@@@ -641,8 -641,8 +641,8 @@@
                 For now, only VisioBraille is supported.
   
         consoleblank=   [KNL] The console blank (screen saver) timeout in
- -                      seconds. Defaults to 10*60 = 10mins. A value of 0
- -                      disables the blank timer.
+ +                      seconds. A value of 0 disables the blank timer.
+ +                       Defaults to 0.
   
         coredump_filter=
                         [KNL] Change the default value for
@@@ -709,9 -709,6 +709,9 @@@
                         It will be ignored when crashkernel=X,high is not used
                         or memory reserved is below 4G.
   
+ +      crossrelease_fullstack
+ +                      [KNL] Allow to record full stack trace in cross-release
+ +
         cryptomgr.notests
                           [KNL] Disable crypto self-tests
   
@@@ -727,7 -724,7 +727,7 @@@
         db9.dev[2|3]=   [HW,JOY] Multisystem joystick support via parallel port
                         (one device per port)
                         Format: <port#>,<type>
- -                      See also Documentation/input/joystick-parport.txt
+ +                      See also Documentation/input/devices/joystick-parport.rst
   
         ddebug_query=   [KNL,DYNAMIC_DEBUG] Enable debug messages at early boot
                         time. See
@@@ -857,7 -854,7 +857,7 @@@
                         The filter can be disabled or changed to another
                         driver later using sysfs.
   
- -      drm_kms_helper.edid_firmware=[<connector>:]<file>[,[<connector>:]<file>]
+ +      drm.edid_firmware=[<connector>:]<file>[,[<connector>:]<file>]
                         Broken monitors, graphic adapters, KVMs and EDIDless
                         panels may send no or incorrect EDID data sets.
                         This parameter allows to specify an EDID data sets
@@@ -1223,7 -1220,7 +1223,7 @@@
                         [HW,JOY] Multisystem joystick and NES/SNES/PSX pad
                         support via parallel port (up to 5 devices per port)
                         Format: <port#>,<pad1>,<pad2>,<pad3>,<pad4>,<pad5>
- -                      See also Documentation/input/joystick-parport.txt
+ +                      See also Documentation/input/devices/joystick-parport.rst
   
         gamma=          [HW,DRM]
   
@@@ -1716,13 -1713,6 +1716,13 @@@
         irqaffinity=    [SMP] Set the default irq affinity mask
                         The argument is a cpu list, as described above.
   
+ +      irqchip.gicv2_force_probe=
+ +                      [ARM, ARM64]
+ +                      Format: <bool>
+ +                      Force the kernel to look for the second 4kB page
+ +                      of a GICv2 controller even if the memory range
+ +                      exposed by the device tree is too small.
+ +
         irqfixup        [HW]
                         When an interrupt is not handled search all handlers
                         for it. Intended to get systems with badly broken
@@@ -1737,33 -1727,20 +1737,33 @@@
         isapnp=         [ISAPNP]
                         Format: <RDP>,<reset>,<pci_scan>,<verbosity>
   
- -      isolcpus=       [KNL,SMP] Isolate CPUs from the general scheduler.
- -                      The argument is a cpu list, as described above.
+ +      isolcpus=       [KNL,SMP] Isolate a given set of CPUs from disturbance.
+ +                      [Deprecated - use cpusets instead]
+ +                      Format: [flag-list,]<cpu-list>
+ +
+ +                      Specify one or more CPUs to isolate from disturbances
+ +                      specified in the flag list (default: domain):
+ +
+ +                      nohz
+ +                        Disable the tick when a single task runs.
+ +                      domain
+ +                        Isolate from the general SMP balancing and scheduling
+ +                        algorithms. Note that performing domain isolation this way
+ +                        is irreversible: it's not possible to bring back a CPU to
+ +                        the domains once isolated through isolcpus. It's strongly
+ +                        advised to use cpusets instead to disable scheduler load
+ +                        balancing through the "cpuset.sched_load_balance" file.
+ +                        It offers a much more flexible interface where CPUs can
+ +                        move in and out of an isolated set anytime.
+ +
+ +                        You can move a process onto or off an "isolated" CPU via
+ +                        the CPU affinity syscalls or cpuset.
+ +                        <cpu number> begins at 0 and the maximum value is
+ +                        "number of CPUs in system - 1".
+ +
+ +                      The format of <cpu-list> is described above.
   
- -                      This option can be used to specify one or more CPUs
- -                      to isolate from the general SMP balancing and scheduling
- -                      algorithms. You can move a process onto or off an
- -                      "isolated" CPU via the CPU affinity syscalls or cpuset.
- -                      <cpu number> begins at 0 and the maximum value is
- -                      "number of CPUs in system - 1".
   
- -                      This option is the preferred way to isolate CPUs. The
- -                      alternative -- manually setting the CPU mask of all
- -                      tasks in the system -- can cause problems and
- -                      suboptimal load balancer performance.
   
         iucv=           [HW,NET]
   
@@@ -1789,7 -1766,7 +1789,7 @@@
                                 ivrs_acpihid[00:14.5]=AMD0020:0
   
         js=             [HW,JOY] Analog joystick
- -                      See Documentation/input/joystick.txt.
+ +                      See Documentation/input/joydev/joystick.rst.
   
         nokaslr         [KNL]
                         When CONFIG_RANDOMIZE_BASE is set, this disables
@@@ -1864,6 -1841,13 +1864,6 @@@
                         Built with CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF=y,
                         the default is off.
   
- -      kmemcheck=      [X86] Boot-time kmemcheck enable/disable/one-shot mode
- -                      Valid arguments: 0, 1, 2
- -                      kmemcheck=0 (disabled)
- -                      kmemcheck=1 (enabled)
- -                      kmemcheck=2 (one-shot mode)
- -                      Default: 2 (one-shot mode)
- -
         kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs.
                         Default is 0 (don't ignore, but inject #GP)
   
@@@ -1890,10 -1874,6 +1890,10 @@@
                         [KVM,ARM] Trap guest accesses to GICv3 common
                         system registers
   
+ +      kvm-arm.vgic_v4_enable=
+ +                      [KVM,ARM] Allow use of GICv4 for direct injection of
+ +                      LPIs.
+ +
         kvm-intel.ept=  [KVM,Intel] Disable extended page tables
                         (virtualized MMU) support on capable Intel chips.
                         Default is 1 (enabled)
@@@ -2268,10 -2248,10 +2268,10 @@@
                         s2idle  - Suspend-To-Idle
                         shallow - Power-On Suspend or equivalent (if supported)
                         deep    - Suspend-To-RAM or equivalent (if supported)
- -                      See Documentation/power/states.txt.
+ +                      See Documentation/admin-guide/pm/sleep-states.rst.
   
         meye.*=         [HW] Set MotionEye Camera parameters
- -                      See Documentation/video4linux/meye.txt.
+ +                      See Documentation/media/v4l-drivers/meye.rst.
   
         mfgpt_irq=      [IA-32] Specify the IRQ to use for the
                         Multi-Function General Purpose Timers on AMD Geode
@@@ -2568,9 -2548,6 +2568,9 @@@
   
         noalign         [KNL,ARM]
   
+ +      noaltinstr      [S390] Disables alternative instructions patching
+ +                      (CPU alternatives feature).
+ +
         noapic          [SMP,APIC] Tells the kernel to not make use of any
                         IOAPICs that may be present in the system.
   
@@@ -2708,6 -2685,8 +2708,8 @@@
                         steal time is computed, but won't influence scheduler
                         behaviour
   
+       nopti           [X86-64] Disable kernel page table isolation
+ 
         nolapic         [X86-32,APIC] Do not enable or use the local APIC.
   
         nolapic_timer   [X86-32,APIC] Do not use the local APIC timer.
@@@ -3157,7 -3136,7 +3159,7 @@@
   
         plip=           [PPT,NET] Parallel port network link
                         Format: { parport<nr> | timid | 0 }
- -                      See also Documentation/parport.txt.
+ +                      See also Documentation/admin-guide/parport.rst.
   
         pmtmr=          [X86] Manual setup of pmtmr I/O Port.
                         Override pmtimer IOPort with a hex value.
@@@ -3208,10 -3187,6 +3210,10 @@@
                         allowed (eg kernel_enable_fpu()/kernel_disable_fpu()).
                         There is some performance impact when enabling this.
   
+ +      ppc_tm=         [PPC]
+ +                      Format: {"off"}
+ +                      Disable Hardware Transactional Memory
+ +
         print-fatal-signals=
                         [KNL] debug: print fatal signals
   
@@@ -3250,15 -3225,13 +3252,15 @@@
                         instead using the legacy FADT method
   
         profile=        [KNL] Enable kernel profiling via /proc/profile
- -                      Format: [schedule,]<number>
+ +                      Format: [<profiletype>,]<number>
+ +                      Param: <profiletype>: "schedule", "sleep", or "kvm"
+ +                              [defaults to kernel profiling]
                         Param: "schedule" - profile schedule points.
- -                      Param: <number> - step/bucket size as a power of 2 for
- -                              statistical time based profiling.
                         Param: "sleep" - profile D-state sleeping (millisecs).
                                 Requires CONFIG_SCHEDSTATS
                         Param: "kvm" - profile VM exits.
+ +                      Param: <number> - step/bucket size as a power of 2 for
+ +                              statistical time based profiling.
   
         prompt_ramdisk= [RAM] List of RAM disks to prompt for floppy disk
                         before loading.
@@@ -3282,6 -3255,12 +3284,12 @@@
         pt.             [PARIDE]
                         See Documentation/blockdev/paride.txt.
   
+       pti=            [X86_64]
+                       Control user/kernel address space isolation:
+                       on - enable
+                       off - disable
+                       auto - default setting
+ 
         pty.legacy_count=
                         [KNL] Number of legacy pty's. Overwrites compiled-in
                         default number.
@@@ -3568,9 -3547,6 +3576,9 @@@
         rcutorture.stall_cpu_holdoff= [KNL]
                         Time to wait (s) after boot before inducing stall.
   
+ +      rcutorture.stall_cpu_irqsoff= [KNL]
+ +                      Disable interrupts while stalling if set.
+ +
         rcutorture.stat_interval= [KNL]
                         Time (s) between statistics printk()s.
   
@@@ -3917,12 -3893,6 +3925,12 @@@
                         [KNL] Should the soft-lockup detector generate panics.
                         Format: <integer>
   
+ +                      A nonzero value instructs the soft-lockup detector
+ +                      to panic the machine when a soft-lockup occurs. This
+ +                      is also controlled by CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC
+ +                      which is the respective build-time switch to that
+ +                      functionality.
+ +
         softlockup_all_cpu_backtrace=
                         [KNL] Should the soft-lockup detector generate
                         backtraces on all cpus.
@@@ -4232,15 -4202,12 +4240,15 @@@
                         Used to run time disable IRQ_TIME_ACCOUNTING on any
                         platforms where RDTSC is slow and this accounting
                         can add overhead.
+ +                      [x86] unstable: mark the TSC clocksource as unstable, this
+ +                      marks the TSC unconditionally unstable at bootup and
+ +                      avoids any further wobbles once the TSC watchdog notices.
   
         turbografx.map[2|3]=    [HW,JOY]
                         TurboGraFX parallel port interface
                         Format:
                         <port#>,<js1>,<js2>,<js3>,<js4>,<js5>,<js6>,<js7>
- -                      See also Documentation/input/joystick-parport.txt
+ +                      See also Documentation/input/devices/joystick-parport.rst
   
         udbg-immortal   [PPC] When debugging early kernel crashes that
                         happen after console_init() and before a proper
diff --combined arch/x86/boot/compressed/pagetable.c

index d5364ca2e3f9290d0ba36606b7e25369f872c144,e691ff734cb5adb5c9e5ff2c0da1c4bd790617ee..b5e5e02f8cde7fa9123dc56981c3a3a98f45843c
--- 1/arch/x86/boot/compressed/pagetable.c
--- 2/arch/x86/boot/compressed/pagetable.c
+++ b/arch/x86/boot/compressed/pagetable.c
@@@ -23,6 -23,9 +23,9 @@@
    */
   #undef CONFIG_AMD_MEM_ENCRYPT
   
+ /* No PAGE_TABLE_ISOLATION support needed either: */
+ #undef CONFIG_PAGE_TABLE_ISOLATION
+ 
   #include "misc.h"
   
   /* These actually do the work of building the kernel identity maps. */
@@@ -77,18 -80,16 +80,18 @@@ static unsigned long top_level_pgt
    * Mapping information structure passed to kernel_ident_mapping_init().
    * Due to relocation, pointers must be assigned at run time not build time.
    */
- -static struct x86_mapping_info mapping_info = {
- -      .page_flag       = __PAGE_KERNEL_LARGE_EXEC,
- -};
+ +static struct x86_mapping_info mapping_info;
   
   /* Locates and clears a region for a new top level page table. */
   void initialize_identity_maps(void)
   {
+ +      unsigned long sev_me_mask = get_sev_encryption_mask();
+ +
         /* Init mapping_info with run-time function/buffer pointers. */
         mapping_info.alloc_pgt_page = alloc_pgt_page;
         mapping_info.context = &pgt_data;
+ +      mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sev_me_mask;
+ +      mapping_info.kernpg_flag = _KERNPG_TABLE | sev_me_mask;
   
         /*
          * It should be impossible for this not to already be true,
diff --combined arch/x86/entry/entry_64.S

index 3d19c830e1b1ab3c7e3115014039a35eb9607214,ed31d00dc5eef2f163267341467e2543414bed5a..f048e384ff54e06530b657efc3b00cdf50f1ce5b
--- 1/arch/x86/entry/entry_64.S
--- 2/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@@ -23,7 -23,6 +23,6 @@@
   #include <asm/segment.h>
   #include <asm/cache.h>
   #include <asm/errno.h>
- #include "calling.h"
   #include <asm/asm-offsets.h>
   #include <asm/msr.h>
   #include <asm/unistd.h>
@@@ -40,6 -39,8 +39,8 @@@
   #include <asm/frame.h>
   #include <linux/err.h>
   
+ #include "calling.h"
+ 
   .code64
   .section .entry.text, "ax"
   
@@@ -51,19 -52,15 +52,19 @@@ ENTRY(native_usergs_sysret64
   END(native_usergs_sysret64)
   #endif /* CONFIG_PARAVIRT */
   
- -.macro TRACE_IRQS_IRETQ
+ +.macro TRACE_IRQS_FLAGS flags:req
   #ifdef CONFIG_TRACE_IRQFLAGS
- -      bt      $9, EFLAGS(%rsp)                /* interrupts off? */
+ +      bt      $9, \flags              /* interrupts off? */
         jnc     1f
         TRACE_IRQS_ON
   1:
   #endif
   .endm
   
+ +.macro TRACE_IRQS_IRETQ
+ +      TRACE_IRQS_FLAGS EFLAGS(%rsp)
+ +.endm
+ +
   /*
    * When dynamic function tracer is enabled it will add a breakpoint
    * to all locations that it is about to modify, sync CPUs, update
@@@ -168,6 -165,9 +169,9 @@@ ENTRY(entry_SYSCALL_64_trampoline
         /* Stash the user RSP. */
         movq    %rsp, RSP_SCRATCH
   
+       /* Note: using %rsp as a scratch reg. */
+       SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
+ 
         /* Load the top of the task stack into RSP */
         movq    CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
   
@@@ -207,9 -207,15 +211,13 @@@ ENTRY(entry_SYSCALL_64
          */
   
         swapgs
+       /*
+        * This path is not taken when PAGE_TABLE_ISOLATION is disabled so it
+        * is not required to switch CR3.
+        */
         movq    %rsp, PER_CPU_VAR(rsp_scratch)
         movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp
   
- -      TRACE_IRQS_OFF
- -
         /* Construct struct pt_regs on stack */
         pushq   $__USER_DS                      /* pt_regs->ss */
         pushq   PER_CPU_VAR(rsp_scratch)        /* pt_regs->sp */
@@@ -230,8 -236,6 +238,8 @@@ GLOBAL(entry_SYSCALL_64_after_hwframe
         sub     $(6*8), %rsp                    /* pt_regs->bp, bx, r12-15 not saved */
         UNWIND_HINT_REGS extra=0
   
+ +      TRACE_IRQS_OFF
+ +
         /*
          * If we need to do entry work or if we guess we'll need to do
          * exit work, go straight to the slow path.
@@@ -403,6 -407,7 +411,7 @@@ syscall_return_via_sysret
          * We are on the trampoline stack.  All regs except RDI are live.
          * We can do future final exit work right here.
          */
+       SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
   
         popq    %rdi
         popq    %rsp
@@@ -740,6 -745,8 +749,8 @@@ GLOBAL(swapgs_restore_regs_and_return_t
          * We can do future final exit work right here.
          */
   
+       SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
+ 
         /* Restore RDI. */
         popq    %rdi
         SWAPGS
@@@ -822,7 -829,9 +833,9 @@@ native_irq_return_ldt
          */
   
         pushq   %rdi                            /* Stash user RDI */
-       SWAPGS
+       SWAPGS                                  /* to kernel GS */
+       SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi   /* to kernel CR3 */
+ 
         movq    PER_CPU_VAR(espfix_waddr), %rdi
         movq    %rax, (0*8)(%rdi)               /* user RAX */
         movq    (1*8)(%rsp), %rax               /* user RIP */
@@@ -838,7 -847,6 +851,6 @@@
         /* Now RAX == RSP. */
   
         andl    $0xffff0000, %eax               /* RAX = (RSP & 0xffff0000) */
-       popq    %rdi                            /* Restore user RDI */
   
         /*
          * espfix_stack[31:16] == 0.  The page tables are set up such that
@@@ -849,7 -857,11 +861,11 @@@
          * still points to an RO alias of the ESPFIX stack.
          */
         orq     PER_CPU_VAR(espfix_stack), %rax
-       SWAPGS
+ 
+       SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
+       SWAPGS                                  /* to user GS */
+       popq    %rdi                            /* Restore user RDI */
+ 
         movq    %rax, %rsp
         UNWIND_HINT_IRET_REGS offset=8
   
@@@ -949,6 -961,8 +965,8 @@@ ENTRY(switch_to_thread_stack
         UNWIND_HINT_FUNC
   
         pushq   %rdi
+       /* Need to switch before accessing the thread stack. */
+       SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
         movq    %rsp, %rdi
         movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp
         UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI
@@@ -1078,13 -1092,11 +1096,13 @@@ ENTRY(native_load_gs_index
         FRAME_BEGIN
         pushfq
         DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
+ +      TRACE_IRQS_OFF
         SWAPGS
   .Lgs_change:
         movl    %edi, %gs
   2:    ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE
         SWAPGS
+ +      TRACE_IRQS_FLAGS (%rsp)
         popfq
         FRAME_END
         ret
@@@ -1250,7 -1262,11 +1268,11 @@@ ENTRY(paranoid_entry
         js      1f                              /* negative -> in kernel */
         SWAPGS
         xorl    %ebx, %ebx
- 1:    ret
+ 
+ 1:
+       SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
+ 
+       ret
   END(paranoid_entry)
   
   /*
@@@ -1272,6 -1288,7 +1294,7 @@@ ENTRY(paranoid_exit
         testl   %ebx, %ebx                      /* swapgs needed? */
         jnz     .Lparanoid_exit_no_swapgs
         TRACE_IRQS_IRETQ
+       RESTORE_CR3     scratch_reg=%rbx save_reg=%r14
         SWAPGS_UNSAFE_STACK
         jmp     .Lparanoid_exit_restore
   .Lparanoid_exit_no_swapgs:
@@@ -1299,6 -1316,8 +1322,8 @@@ ENTRY(error_entry
          * from user mode due to an IRET fault.
          */
         SWAPGS
+       /* We have user CR3.  Change to kernel CR3. */
+       SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
   
   .Lerror_entry_from_usermode_after_swapgs:
         /* Put us onto the real thread stack. */
@@@ -1345,6 -1364,7 +1370,7 @@@
          * .Lgs_change's error handler with kernel gsbase.
          */
         SWAPGS
+       SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
         jmp .Lerror_entry_done
   
   .Lbstep_iret:
@@@ -1354,10 -1374,11 +1380,11 @@@
   
   .Lerror_bad_iret:
         /*
-        * We came from an IRET to user mode, so we have user gsbase.
-        * Switch to kernel gsbase:
+        * We came from an IRET to user mode, so we have user
+        * gsbase and CR3.  Switch to kernel gsbase and CR3:
          */
         SWAPGS
+       SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
   
         /*
          * Pretend that the exception came from user mode: set up pt_regs
@@@ -1389,6 -1410,10 +1416,10 @@@ END(error_exit
   /*
    * Runs on exception stack.  Xen PV does not go through this path at all,
    * so we can use real assembly here.
+  *
+  * Registers:
+  *    %r14: Used to save/restore the CR3 of the interrupted context
+  *          when PAGE_TABLE_ISOLATION is in use.  Do not clobber.
    */
   ENTRY(nmi)
         UNWIND_HINT_IRET_REGS
@@@ -1452,6 -1477,7 +1483,7 @@@
   
         swapgs
         cld
+       SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
         movq    %rsp, %rdx
         movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp
         UNWIND_HINT_IRET_REGS base=%rdx offset=8
@@@ -1704,6 -1730,8 +1736,8 @@@ end_repeat_nmi
         movq    $-1, %rsi
         call    do_nmi
   
+       RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
+ 
         testl   %ebx, %ebx                      /* swapgs needed? */
         jnz     nmi_restore
   nmi_swapgs:
diff --combined arch/x86/include/asm/desc.h

index ec8be07c0cda5c9b240d351ca583409713c58406,85e23bb7b34e31538034ddf9a9e8df773fe7b9b0..13c5ee878a477902b8494532b24853b6c156a170
--- 1/arch/x86/include/asm/desc.h
--- 2/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@@ -21,6 -21,8 +21,8 @@@ static inline void fill_ldt(struct desc
   
         desc->type              = (info->read_exec_only ^ 1) << 1;
         desc->type             |= info->contents << 2;
+       /* Set the ACCESS bit so it can be mapped RO */
+       desc->type             |= 1;
   
         desc->s                 = 1;
         desc->dpl               = 0x3;
@@@ -387,7 -389,7 +389,7 @@@ static inline void set_desc_limit(struc
   void update_intr_gate(unsigned int n, const void *addr);
   void alloc_intr_gate(unsigned int n, const void *addr);
   
- -extern unsigned long used_vectors[];
+ +extern unsigned long system_vectors[];
   
   #ifdef CONFIG_X86_64
   DECLARE_PER_CPU(u32, debug_idt_ctr);
diff --combined arch/x86/include/asm/disabled-features.h

index 14d6d50073142b0f49b06850ccd0d394546479ee,e428e16dd822471510418715d3d1cd36e0905647..b027633e73003e121d7c043438ac7dbd10fc07a4
--- 1/arch/x86/include/asm/disabled-features.h
--- 2/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@@ -16,12 -16,6 +16,12 @@@
   # define DISABLE_MPX  (1<<(X86_FEATURE_MPX & 31))
   #endif
   
+ +#ifdef CONFIG_X86_INTEL_UMIP
+ +# define DISABLE_UMIP 0
+ +#else
+ +# define DISABLE_UMIP (1<<(X86_FEATURE_UMIP & 31))
+ +#endif
+ +
   #ifdef CONFIG_X86_64
   # define DISABLE_VME          (1<<(X86_FEATURE_VME & 31))
   # define DISABLE_K6_MTRR      (1<<(X86_FEATURE_K6_MTRR & 31))
@@@ -50,6 -44,12 +50,12 @@@
   # define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31))
   #endif
   
+ #ifdef CONFIG_PAGE_TABLE_ISOLATION
+ # define DISABLE_PTI          0
+ #else
+ # define DISABLE_PTI          (1 << (X86_FEATURE_PTI & 31))
+ #endif
+ 
   /*
    * Make sure to add features to the correct mask
    */
@@@ -60,7 -60,7 +66,7 @@@
   #define DISABLED_MASK4        (DISABLE_PCID)
   #define DISABLED_MASK5        0
   #define DISABLED_MASK6        0
- #define DISABLED_MASK7        0
+ #define DISABLED_MASK7        (DISABLE_PTI)
   #define DISABLED_MASK8        0
   #define DISABLED_MASK9        (DISABLE_MPX)
   #define DISABLED_MASK10       0
@@@ -69,7 -69,7 +75,7 @@@
   #define DISABLED_MASK13       0
   #define DISABLED_MASK14       0
   #define DISABLED_MASK15       0
- -#define DISABLED_MASK16       (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57)
+ +#define DISABLED_MASK16       (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP)
   #define DISABLED_MASK17       0
   #define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18)
   
diff --combined arch/x86/include/asm/pgtable.h

index 95e2dfd755218ccfaf6417b44c822b545a35568e,6b43d677f8ca743f09f64a92b21a5b2976c49c29..e42b8943cb1a311a00ddceb36129ede3012489ef
--- 1/arch/x86/include/asm/pgtable.h
--- 2/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@@ -28,6 -28,7 +28,7 @@@ extern pgd_t early_top_pgt[PTRS_PER_PGD
   int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
   
   void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
+ void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user);
   void ptdump_walk_pgd_level_checkwx(void);
   
   #ifdef CONFIG_DEBUG_WX
@@@ -667,6 -668,11 +668,6 @@@ static inline bool pte_accessible(struc
         return false;
   }
   
- -static inline int pte_hidden(pte_t pte)
- -{
- -      return pte_flags(pte) & _PAGE_HIDDEN;
- -}
- -
   static inline int pmd_present(pmd_t pmd)
   {
         /*
@@@ -841,7 -847,12 +842,12 @@@ static inline pud_t *pud_offset(p4d_t *
   
   static inline int p4d_bad(p4d_t p4d)
   {
-       return (p4d_flags(p4d) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0;
+       unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER;
+ 
+       if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+               ignore_flags |= _PAGE_NX;
+ 
+       return (p4d_flags(p4d) & ~ignore_flags) != 0;
   }
   #endif  /* CONFIG_PGTABLE_LEVELS > 3 */
   
@@@ -875,7 -886,12 +881,12 @@@ static inline p4d_t *p4d_offset(pgd_t *
   
   static inline int pgd_bad(pgd_t pgd)
   {
-       return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
+       unsigned long ignore_flags = _PAGE_USER;
+ 
+       if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+               ignore_flags |= _PAGE_NX;
+ 
+       return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
   }
   
   static inline int pgd_none(pgd_t pgd)
@@@ -904,7 -920,11 +915,11 @@@
    * pgd_offset() returns a (pgd_t *)
    * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
    */
- #define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
+ #define pgd_offset_pgd(pgd, address) (pgd + pgd_index((address)))
+ /*
+  * a shortcut to get a pgd_t in a given mm
+  */
+ #define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address))
   /*
    * a shortcut which implies the use of the kernel's pgd, instead
    * of a process's
@@@ -1061,7 -1081,7 +1076,7 @@@ extern int pmdp_clear_flush_young(struc
                                   unsigned long address, pmd_t *pmdp);
   
   
- -#define __HAVE_ARCH_PMD_WRITE
+ +#define pmd_write pmd_write
   static inline int pmd_write(pmd_t pmd)
   {
         return pmd_flags(pmd) & _PAGE_RW;
@@@ -1088,12 -1108,6 +1103,12 @@@ static inline void pmdp_set_wrprotect(s
         clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp);
   }
   
+ +#define pud_write pud_write
+ +static inline int pud_write(pud_t pud)
+ +{
+ +      return pud_flags(pud) & _PAGE_RW;
+ +}
+ +
   /*
    * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
    *
@@@ -1106,7 -1120,14 +1121,14 @@@
    */
   static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
   {
-        memcpy(dst, src, count * sizeof(pgd_t));
+       memcpy(dst, src, count * sizeof(pgd_t));
+ #ifdef CONFIG_PAGE_TABLE_ISOLATION
+       if (!static_cpu_has(X86_FEATURE_PTI))
+               return;
+       /* Clone the user space pgd as well */
+       memcpy(kernel_to_user_pgdp(dst), kernel_to_user_pgdp(src),
+              count * sizeof(pgd_t));
+ #endif
   }
   
   #define PTE_SHIFT ilog2(PTRS_PER_PTE)
diff --combined arch/x86/include/asm/processor.h

index cad8dab266bceefcd91a830371716d48679c7cc7,9c18da64daa920c09f037d745bd9ac9390ae55aa..d3a67fba200ae2a5c03f52a7815dca00b43c63ad
--- 1/arch/x86/include/asm/processor.h
--- 2/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@@ -132,7 -132,6 +132,7 @@@ struct cpuinfo_x86 
         /* Index into per_cpu list: */
         u16                     cpu_index;
         u32                     microcode;
+ +      unsigned                initialized : 1;
   } __randomize_layout;
   
   struct cpuid_regs {
@@@ -852,13 -851,22 +852,22 @@@ static inline void spin_lock_prefetch(c
   
   #else
   /*
-  * User space process size. 47bits minus one guard page.  The guard
-  * page is necessary on Intel CPUs: if a SYSCALL instruction is at
-  * the highest possible canonical userspace address, then that
-  * syscall will enter the kernel with a non-canonical return
-  * address, and SYSRET will explode dangerously.  We avoid this
-  * particular problem by preventing anything from being mapped
-  * at the maximum canonical address.
+  * User space process size.  This is the first address outside the user range.
+  * There are a few constraints that determine this:
+  *
+  * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
+  * address, then that syscall will enter the kernel with a
+  * non-canonical return address, and SYSRET will explode dangerously.
+  * We avoid this particular problem by preventing anything executable
+  * from being mapped at the maximum canonical address.
+  *
+  * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
+  * CPUs malfunction if they execute code from the highest canonical page.
+  * They'll speculate right off the end of the canonical space, and
+  * bad things happen.  This is worked around in the same way as the
+  * Intel problem.
+  *
+  * With page table isolation enabled, we map the LDT in ... [stay tuned]
    */
   #define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
   
diff --combined arch/x86/include/asm/tlbflush.h

index e1884cf35257b8133ca97f50d146ae3ebfcaa30f,b519da4fc03c7c26c4f722ea2c5d97b091cbfd15..f68f9c836cca09bcd7fc7c5795e219dcaa3f216b
--- 1/arch/x86/include/asm/tlbflush.h
--- 2/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@@ -10,38 -10,90 +10,90 @@@
   #include <asm/special_insns.h>
   #include <asm/smp.h>
   #include <asm/invpcid.h>
+ #include <asm/pti.h>
+ #include <asm/processor-flags.h>
   
- static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
- {
-       /*
-        * Bump the generation count.  This also serves as a full barrier
-        * that synchronizes with switch_mm(): callers are required to order
-        * their read of mm_cpumask after their writes to the paging
-        * structures.
-        */
-       return atomic64_inc_return(&mm->context.tlb_gen);
- }
+ /*
+  * The x86 feature is called PCID (Process Context IDentifier). It is similar
+  * to what is traditionally called ASID on the RISC processors.
+  *
+  * We don't use the traditional ASID implementation, where each process/mm gets
+  * its own ASID and flush/restart when we run out of ASID space.
+  *
+  * Instead we have a small per-cpu array of ASIDs and cache the last few mm's
+  * that came by on this CPU, allowing cheaper switch_mm between processes on
+  * this CPU.
+  *
+  * We end up with different spaces for different things. To avoid confusion we
+  * use different names for each of them:
+  *
+  * ASID  - [0, TLB_NR_DYN_ASIDS-1]
+  *         the canonical identifier for an mm
+  *
+  * kPCID - [1, TLB_NR_DYN_ASIDS]
+  *         the value we write into the PCID part of CR3; corresponds to the
+  *         ASID+1, because PCID 0 is special.
+  *
+  * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
+  *         for KPTI each mm has two address spaces and thus needs two
+  *         PCID values, but we can still do with a single ASID denomination
+  *         for each mm. Corresponds to kPCID + 2048.
+  *
+  */
   
   /* There are 12 bits of space for ASIDS in CR3 */
   #define CR3_HW_ASID_BITS              12
+ 
   /*
    * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
    * user/kernel switches
    */
- #define PTI_CONSUMED_ASID_BITS                0
+ #ifdef CONFIG_PAGE_TABLE_ISOLATION
+ # define PTI_CONSUMED_PCID_BITS       1
+ #else
+ # define PTI_CONSUMED_PCID_BITS       0
+ #endif
+ 
+ #define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
   
- #define CR3_AVAIL_ASID_BITS (CR3_HW_ASID_BITS - PTI_CONSUMED_ASID_BITS)
   /*
    * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid.  -1 below to account
-  * for them being zero-based.  Another -1 is because ASID 0 is reserved for
+  * for them being zero-based.  Another -1 is because PCID 0 is reserved for
    * use by non-PCID-aware users.
    */
- #define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_ASID_BITS) - 2)
+ #define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
   
+ /*
+  * 6 because 6 should be plenty and struct tlb_state will fit in two cache
+  * lines.
+  */
+ #define TLB_NR_DYN_ASIDS      6
+ 
+ /*
+  * Given @asid, compute kPCID
+  */
   static inline u16 kern_pcid(u16 asid)
   {
         VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+ 
+ #ifdef CONFIG_PAGE_TABLE_ISOLATION
+       /*
+        * Make sure that the dynamic ASID space does not conflict with the
+        * bit we are using to switch between user and kernel ASIDs.
+        */
+       BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_SWITCH_BIT));
+ 
+       /*
+        * The ASID being passed in here should have respected the
+        * MAX_ASID_AVAILABLE and thus never have the switch bit set.
+        */
+       VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_SWITCH_BIT));
+ #endif
         /*
+        * The dynamically-assigned ASIDs that get passed in are small
+        * (<TLB_NR_DYN_ASIDS).  They never have the high switch bit set,
+        * so do not bother to clear it.
+        *
          * If PCID is on, ASID-aware code paths put the ASID+1 into the
          * PCID bits.  This serves two purposes.  It prevents a nasty
          * situation in which PCID-unaware code saves CR3, loads some other
@@@ -53,6 -105,18 +105,18 @@@
         return asid + 1;
   }
   
+ /*
+  * Given @asid, compute uPCID
+  */
+ static inline u16 user_pcid(u16 asid)
+ {
+       u16 ret = kern_pcid(asid);
+ #ifdef CONFIG_PAGE_TABLE_ISOLATION
+       ret |= 1 << X86_CR3_PTI_SWITCH_BIT;
+ #endif
+       return ret;
+ }
+ 
   struct pgd_t;
   static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
   {
@@@ -95,12 -159,6 +159,6 @@@ static inline bool tlb_defer_switch_to_
         return !static_cpu_has(X86_FEATURE_PCID);
   }
   
- /*
-  * 6 because 6 should be plenty and struct tlb_state will fit in
-  * two cache lines.
-  */
- #define TLB_NR_DYN_ASIDS 6
- 
   struct tlb_context {
         u64 ctx_id;
         u64 tlb_gen;
@@@ -134,6 -192,24 +192,24 @@@ struct tlb_state 
          */
         bool is_lazy;
   
+       /*
+        * If set we changed the page tables in such a way that we
+        * needed an invalidation of all contexts (aka. PCIDs / ASIDs).
+        * This tells us to go invalidate all the non-loaded ctxs[]
+        * on the next context switch.
+        *
+        * The current ctx was kept up-to-date as it ran and does not
+        * need to be invalidated.
+        */
+       bool invalidate_other;
+ 
+       /*
+        * Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate
+        * the corresponding user PCID needs a flush next time we
+        * switch to it; see SWITCH_TO_USER_CR3.
+        */
+       unsigned short user_pcid_flush_mask;
+ 
         /*
          * Access to this CR4 shadow and to H/W CR4 is protected by
          * disabling interrupts when modifying either one.
@@@ -169,43 -245,40 +245,43 @@@ static inline void cr4_init_shadow(void
         this_cpu_write(cpu_tlbstate.cr4, __read_cr4());
   }
   
+ +static inline void __cr4_set(unsigned long cr4)
+ +{
+ +      lockdep_assert_irqs_disabled();
+ +      this_cpu_write(cpu_tlbstate.cr4, cr4);
+ +      __write_cr4(cr4);
+ +}
+ +
   /* Set in this cpu's CR4. */
   static inline void cr4_set_bits(unsigned long mask)
   {
- -      unsigned long cr4;
+ +      unsigned long cr4, flags;
   
+ +      local_irq_save(flags);
         cr4 = this_cpu_read(cpu_tlbstate.cr4);
- -      if ((cr4 | mask) != cr4) {
- -              cr4 |= mask;
- -              this_cpu_write(cpu_tlbstate.cr4, cr4);
- -              __write_cr4(cr4);
- -      }
+ +      if ((cr4 | mask) != cr4)
+ +              __cr4_set(cr4 | mask);
+ +      local_irq_restore(flags);
   }
   
   /* Clear in this cpu's CR4. */
   static inline void cr4_clear_bits(unsigned long mask)
   {
- -      unsigned long cr4;
+ +      unsigned long cr4, flags;
   
+ +      local_irq_save(flags);
         cr4 = this_cpu_read(cpu_tlbstate.cr4);
- -      if ((cr4 & ~mask) != cr4) {
- -              cr4 &= ~mask;
- -              this_cpu_write(cpu_tlbstate.cr4, cr4);
- -              __write_cr4(cr4);
- -      }
+ +      if ((cr4 & ~mask) != cr4)
+ +              __cr4_set(cr4 & ~mask);
+ +      local_irq_restore(flags);
   }
   
- -static inline void cr4_toggle_bits(unsigned long mask)
+ +static inline void cr4_toggle_bits_irqsoff(unsigned long mask)
   {
         unsigned long cr4;
   
         cr4 = this_cpu_read(cpu_tlbstate.cr4);
- -      cr4 ^= mask;
- -      this_cpu_write(cpu_tlbstate.cr4, cr4);
- -      __write_cr4(cr4);
+ +      __cr4_set(cr4 ^ mask);
   }
   
   /* Read the CR4 shadow. */
@@@ -214,6 -287,14 +290,14 @@@ static inline unsigned long cr4_read_sh
         return this_cpu_read(cpu_tlbstate.cr4);
   }
   
+ /*
+  * Mark all other ASIDs as invalid, preserving the current one.
+  */
+ static inline void invalidate_other_asid(void)
+ {
+       this_cpu_write(cpu_tlbstate.invalidate_other, true);
+ }
+ 
   /*
    * Save some of cr4 feature set we're using (e.g.  Pentium 4MB
    * enable and PPro Global page enable), so that any CPU's that boot
@@@ -233,15 -314,42 +317,42 @@@ static inline void cr4_set_bits_and_upd
   
   extern void initialize_tlbstate_and_flush(void);
   
+ /*
+  * Given an ASID, flush the corresponding user ASID.  We can delay this
+  * until the next time we switch to it.
+  *
+  * See SWITCH_TO_USER_CR3.
+  */
+ static inline void invalidate_user_asid(u16 asid)
+ {
+       /* There is no user ASID if address space separation is off */
+       if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+               return;
+ 
+       /*
+        * We only have a single ASID if PCID is off and the CR3
+        * write will have flushed it.
+        */
+       if (!cpu_feature_enabled(X86_FEATURE_PCID))
+               return;
+ 
+       if (!static_cpu_has(X86_FEATURE_PTI))
+               return;
+ 
+       __set_bit(kern_pcid(asid),
+                 (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
+ }
+ 
   /*
    * flush the entire current user mapping
    */
   static inline void __native_flush_tlb(void)
   {
+       invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
         /*
-        * If current->mm == NULL then we borrow a mm which may change during a
-        * task switch and therefore we must not be preempted while we write CR3
-        * back:
+        * If current->mm == NULL then we borrow a mm which may change
+        * during a task switch and therefore we must not be preempted
+        * while we write CR3 back:
          */
         preempt_disable();
         native_write_cr3(__native_read_cr3());
@@@ -259,6 -367,8 +370,8 @@@ static inline void __native_flush_tlb_g
                 /*
                  * Using INVPCID is considerably faster than a pair of writes
                  * to CR4 sandwiched inside an IRQ flag save/restore.
+                *
+                * Note, this works with CR4.PCIDE=0 or 1.
                  */
                 invpcid_flush_all();
                 return;
@@@ -285,7 -395,21 +398,21 @@@
    */
   static inline void __native_flush_tlb_single(unsigned long addr)
   {
+       u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+ 
         asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+ 
+       if (!static_cpu_has(X86_FEATURE_PTI))
+               return;
+ 
+       /*
+        * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1.
+        * Just use invalidate_user_asid() in case we are called early.
+        */
+       if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE))
+               invalidate_user_asid(loaded_mm_asid);
+       else
+               invpcid_flush_one(user_pcid(loaded_mm_asid), addr);
   }
   
   /*
@@@ -301,14 -425,6 +428,6 @@@ static inline void __flush_tlb_all(void
                  */
                 __flush_tlb();
         }
- 
-       /*
-        * Note: if we somehow had PCID but not PGE, then this wouldn't work --
-        * we'd end up flushing kernel translations for the current ASID but
-        * we might fail to flush kernel translations for other cached ASIDs.
-        *
-        * To avoid this issue, we force PCID off if PGE is off.
-        */
   }
   
   /*
@@@ -318,6 -434,16 +437,16 @@@ static inline void __flush_tlb_one(unsi
   {
         count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
         __flush_tlb_single(addr);
+ 
+       if (!static_cpu_has(X86_FEATURE_PTI))
+               return;
+ 
+       /*
+        * __flush_tlb_single() will have cleared the TLB entry for this ASID,
+        * but since kernel space is replicated across all, we must also
+        * invalidate all others.
+        */
+       invalidate_other_asid();
   }
   
   #define TLB_FLUSH_ALL -1UL
@@@ -378,6 -504,17 +507,17 @@@ static inline void flush_tlb_page(struc
   void native_flush_tlb_others(const struct cpumask *cpumask,
                              const struct flush_tlb_info *info);
   
+ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
+ {
+       /*
+        * Bump the generation count.  This also serves as a full barrier
+        * that synchronizes with switch_mm(): callers are required to order
+        * their read of mm_cpumask after their writes to the paging
+        * structures.
+        */
+       return atomic64_inc_return(&mm->context.tlb_gen);
+ }
+ 
   static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
                                         struct mm_struct *mm)
   {
diff --combined arch/x86/include/uapi/asm/processor-flags.h

index 7e1e730396ae08f5a267adaccf0c3ba46448f780,97abdaab9535703d21ed95de307ad2152b74d39b..bcba3c643e63dced1c873ee5e1cdbfdd5d307928
--- 1/arch/x86/include/uapi/asm/processor-flags.h
--- 2/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@@ -78,7 -78,12 +78,12 @@@
   #define X86_CR3_PWT           _BITUL(X86_CR3_PWT_BIT)
   #define X86_CR3_PCD_BIT               4 /* Page Cache Disable */
   #define X86_CR3_PCD           _BITUL(X86_CR3_PCD_BIT)
- #define X86_CR3_PCID_MASK     _AC(0x00000fff,UL) /* PCID Mask */
+ 
+ #define X86_CR3_PCID_BITS     12
+ #define X86_CR3_PCID_MASK     (_AC((1UL << X86_CR3_PCID_BITS) - 1, UL))
+ 
+ #define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
+ #define X86_CR3_PCID_NOFLUSH    _BITULL(X86_CR3_PCID_NOFLUSH_BIT)
   
   /*
    * Intel CPU features in CR4
@@@ -105,8 -110,6 +110,8 @@@
   #define X86_CR4_OSFXSR                _BITUL(X86_CR4_OSFXSR_BIT)
   #define X86_CR4_OSXMMEXCPT_BIT        10 /* enable unmasked SSE exceptions */
   #define X86_CR4_OSXMMEXCPT    _BITUL(X86_CR4_OSXMMEXCPT_BIT)
+ +#define X86_CR4_UMIP_BIT      11 /* enable UMIP support */
+ +#define X86_CR4_UMIP          _BITUL(X86_CR4_UMIP_BIT)
   #define X86_CR4_LA57_BIT      12 /* enable 5-level page tables */
   #define X86_CR4_LA57          _BITUL(X86_CR4_LA57_BIT)
   #define X86_CR4_VMXE_BIT      13 /* enable VMX virtualization */
diff --combined arch/x86/kernel/cpu/common.c

index c9757f07d738af73ce3bd14c51780c71a512395f,f2a94dfb434e9a7c61d6eacc28dd0129f6e844e1..c47de4ebf63a3e84a64511662c08d5b20faa94db
--- 1/arch/x86/kernel/cpu/common.c
--- 2/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@@ -329,30 -329,6 +329,30 @@@ static __always_inline void setup_smap(
         }
   }
   
+ +static __always_inline void setup_umip(struct cpuinfo_x86 *c)
+ +{
+ +      /* Check the boot processor, plus build option for UMIP. */
+ +      if (!cpu_feature_enabled(X86_FEATURE_UMIP))
+ +              goto out;
+ +
+ +      /* Check the current processor's cpuid bits. */
+ +      if (!cpu_has(c, X86_FEATURE_UMIP))
+ +              goto out;
+ +
+ +      cr4_set_bits(X86_CR4_UMIP);
+ +
+ +      pr_info("x86/cpu: Activated the Intel User Mode Instruction Prevention (UMIP) CPU feature\n");
+ +
+ +      return;
+ +
+ +out:
+ +      /*
+ +       * Make sure UMIP is disabled in case it was enabled in a
+ +       * previous boot (e.g., via kexec).
+ +       */
+ +      cr4_clear_bits(X86_CR4_UMIP);
+ +}
+ +
   /*
    * Protection Keys are not available in 32-bit mode.
    */
@@@ -882,8 -858,8 +882,8 @@@ static void identify_cpu_without_cpuid(
    * cache alignment.
    * The others are not touched to avoid unwanted side effects.
    *
- - * WARNING: this function is only called on the BP.  Don't add code here
- - * that is supposed to run on all CPUs.
+ + * WARNING: this function is only called on the boot CPU.  Don't add code
+ + * here that is supposed to run on all CPUs.
    */
   static void __init early_identify_cpu(struct cpuinfo_x86 *c)
   {
@@@ -922,6 -898,10 +922,10 @@@
         }
   
         setup_force_cpu_cap(X86_FEATURE_ALWAYS);
+ 
+       /* Assume for now that ALL x86 CPUs are insecure */
+       setup_force_cpu_bug(X86_BUG_CPU_INSECURE);
+ 
         fpu__init_system(c);
   
   #ifdef CONFIG_X86_32
@@@ -1166,10 -1146,9 +1170,10 @@@ static void identify_cpu(struct cpuinfo
         /* Disable the PN if appropriate */
         squash_the_stupid_serial_number(c);
   
- -      /* Set up SMEP/SMAP */
+ +      /* Set up SMEP/SMAP/UMIP */
         setup_smep(c);
         setup_smap(c);
+ +      setup_umip(c);
   
         /*
          * The vendor-specific functions might have changed features.
@@@ -1360,7 -1339,10 +1364,10 @@@ void syscall_init(void
                 (entry_SYSCALL_64_trampoline - _entry_trampoline);
   
         wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
-       wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
+       if (static_cpu_has(X86_FEATURE_PTI))
+               wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
+       else
+               wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
   
   #ifdef CONFIG_IA32_EMULATION
         wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
diff --combined arch/x86/mm/Makefile

index 52195ee3f6d50ebd2005aa040b1cf0023edd6b33,52906808e277575eccfe8a24e8b9f983dec8e274..27e9e90a8d3572b900ccaacae1623de803072b17
--- 1/arch/x86/mm/Makefile
--- 2/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@@ -29,6 -29,8 +29,6 @@@ obj-$(CONFIG_X86_PTDUMP)      += debug_paget
   
   obj-$(CONFIG_HIGHMEM)         += highmem_32.o
   
- -obj-$(CONFIG_KMEMCHECK)               += kmemcheck/
- -
   KASAN_SANITIZE_kasan_init_$(BITS).o := n
   obj-$(CONFIG_KASAN)           += kasan_init_$(BITS).o
   
@@@ -41,9 -43,10 +41,10 @@@ obj-$(CONFIG_AMD_NUMA)              += amdtopology.
   obj-$(CONFIG_ACPI_NUMA)               += srat.o
   obj-$(CONFIG_NUMA_EMU)                += numa_emulation.o
   
- obj-$(CONFIG_X86_INTEL_MPX)   += mpx.o
- obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
- obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
+ obj-$(CONFIG_X86_INTEL_MPX)                   += mpx.o
+ obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS)        += pkeys.o
+ obj-$(CONFIG_RANDOMIZE_MEMORY)                        += kaslr.o
+ obj-$(CONFIG_PAGE_TABLE_ISOLATION)            += pti.o
   
   obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o
   obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o
diff --combined arch/x86/mm/init.c

index 6fdf91ef130a4737ab434ce98f77b2583fe1840d,80259ad8c386015c896aaf305609c5a06c1c87e3..8ca324d072828e19700ba094dbeae433b767c964
--- 1/arch/x86/mm/init.c
--- 2/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@@ -20,6 -20,7 +20,7 @@@
   #include <asm/kaslr.h>
   #include <asm/hypervisor.h>
   #include <asm/cpufeature.h>
+ #include <asm/pti.h>
   
   /*
    * We need to define the tracepoints somewhere, and tlb.c
@@@ -92,7 -93,8 +93,7 @@@ __ref void *alloc_low_pages(unsigned in
                 unsigned int order;
   
                 order = get_order((unsigned long)num << PAGE_SHIFT);
- -              return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK |
- -                                              __GFP_ZERO, order);
+ +              return (void *)__get_free_pages(GFP_ATOMIC | __GFP_ZERO, order);
         }
   
         if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
@@@ -160,14 -162,21 +161,20 @@@ struct map_range 
   
   static int page_size_mask;
   
+ static void enable_global_pages(void)
+ {
+       if (!static_cpu_has(X86_FEATURE_PTI))
+               __supported_pte_mask |= _PAGE_GLOBAL;
+ }
+ 
   static void __init probe_page_size_mask(void)
   {
         /*
- -       * For CONFIG_KMEMCHECK or pagealloc debugging, identity mapping will
- -       * use small pages.
+ +       * For pagealloc debugging, identity mapping will use small pages.
          * This will simplify cpa(), which otherwise needs to support splitting
          * large pages into small in interrupt context, etc.
          */
- -      if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled() && !IS_ENABLED(CONFIG_KMEMCHECK))
+ +      if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled())
                 page_size_mask |= 1 << PG_LEVEL_2M;
         else
                 direct_gbpages = 0;
@@@ -177,11 -186,11 +184,11 @@@
                 cr4_set_bits_and_update_boot(X86_CR4_PSE);
   
         /* Enable PGE if available */
+       __supported_pte_mask &= ~_PAGE_GLOBAL;
         if (boot_cpu_has(X86_FEATURE_PGE)) {
                 cr4_set_bits_and_update_boot(X86_CR4_PGE);
-               __supported_pte_mask |= _PAGE_GLOBAL;
-       } else
-               __supported_pte_mask &= ~_PAGE_GLOBAL;
+               enable_global_pages();
+       }
   
         /* Enable 1 GB linear kernel mappings if available: */
         if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) {
@@@ -194,34 -203,44 +201,44 @@@
   
   static void setup_pcid(void)
   {
- #ifdef CONFIG_X86_64
-       if (boot_cpu_has(X86_FEATURE_PCID)) {
-               if (boot_cpu_has(X86_FEATURE_PGE)) {
-                       /*
-                        * This can't be cr4_set_bits_and_update_boot() --
-                        * the trampoline code can't handle CR4.PCIDE and
-                        * it wouldn't do any good anyway.  Despite the name,
-                        * cr4_set_bits_and_update_boot() doesn't actually
-                        * cause the bits in question to remain set all the
-                        * way through the secondary boot asm.
-                        *
-                        * Instead, we brute-force it and set CR4.PCIDE
-                        * manually in start_secondary().
-                        */
-                       cr4_set_bits(X86_CR4_PCIDE);
-               } else {
-                       /*
-                        * flush_tlb_all(), as currently implemented, won't
-                        * work if PCID is on but PGE is not.  Since that
-                        * combination doesn't exist on real hardware, there's
-                        * no reason to try to fully support it, but it's
-                        * polite to avoid corrupting data if we're on
-                        * an improperly configured VM.
-                        */
-                       setup_clear_cpu_cap(X86_FEATURE_PCID);
-               }
+       if (!IS_ENABLED(CONFIG_X86_64))
+               return;
+ 
+       if (!boot_cpu_has(X86_FEATURE_PCID))
+               return;
+ 
+       if (boot_cpu_has(X86_FEATURE_PGE)) {
+               /*
+                * This can't be cr4_set_bits_and_update_boot() -- the
+                * trampoline code can't handle CR4.PCIDE and it wouldn't
+                * do any good anyway.  Despite the name,
+                * cr4_set_bits_and_update_boot() doesn't actually cause
+                * the bits in question to remain set all the way through
+                * the secondary boot asm.
+                *
+                * Instead, we brute-force it and set CR4.PCIDE manually in
+                * start_secondary().
+                */
+               cr4_set_bits(X86_CR4_PCIDE);
+ 
+               /*
+                * INVPCID's single-context modes (2/3) only work if we set
+                * X86_CR4_PCIDE, *and* we have INVPCID support.  It's unusable
+                * on systems that have X86_CR4_PCIDE clear, or that have
+                * no INVPCID support at all.
+                */
+               if (boot_cpu_has(X86_FEATURE_INVPCID))
+                       setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE);
+       } else {
+               /*
+                * flush_tlb_all(), as currently implemented, won't work if
+                * PCID is on but PGE is not.  Since that combination
+                * doesn't exist on real hardware, there's no reason to try
+                * to fully support it, but it's polite to avoid corrupting
+                * data if we're on an improperly configured VM.
+                */
+               setup_clear_cpu_cap(X86_FEATURE_PCID);
         }
- #endif
   }
   
   #ifdef CONFIG_X86_32
@@@ -622,6 -641,7 +639,7 @@@ void __init init_mem_mapping(void
   {
         unsigned long end;
   
+       pti_check_boottime_disable();
         probe_page_size_mask();
         setup_pcid();
   
@@@ -845,7 -865,7 +863,7 @@@ void __init zone_sizes_init(void
         free_area_init_nodes(max_zone_pfns);
   }
   
- DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
+ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
         .loaded_mm = &init_mm,
         .next_asid = 1,
         .cr4 = ~0UL,    /* fail hard if we screw up cr4 shadow initialization */
diff --combined arch/x86/mm/pgtable.c

index 96d456a94b0342eb967f918e4233498ac24e4349,9b7bcbd33cc246070446381db0ebf944435514d8..004abf9ebf1222c169448090f7f1c570635bce41
--- 1/arch/x86/mm/pgtable.c
--- 2/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@@ -7,7 -7,7 +7,7 @@@
   #include <asm/fixmap.h>
   #include <asm/mtrr.h>
   
- -#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | __GFP_ZERO)
+ +#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO)
   
   #ifdef CONFIG_HIGHPTE
   #define PGALLOC_USER_GFP __GFP_HIGHMEM
@@@ -355,14 -355,15 +355,15 @@@ static inline void _pgd_free(pgd_t *pgd
                 kmem_cache_free(pgd_cache, pgd);
   }
   #else
+ 
   static inline pgd_t *_pgd_alloc(void)
   {
-       return (pgd_t *)__get_free_page(PGALLOC_GFP);
+       return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
   }
   
   static inline void _pgd_free(pgd_t *pgd)
   {
-       free_page((unsigned long)pgd);
+       free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
   }
   #endif /* CONFIG_X86_PAE */
   
diff --combined arch/x86/platform/efi/efi_64.c

index 6a151ce70e865caadde95c859855c4b63283ad4b,39c4b35ac7a4a9cc0d2b54cbc7b7ea58997da09f..d87ac96e37ede3ea93dabb67d99538fdadc03de0
--- 1/arch/x86/platform/efi/efi_64.c
--- 2/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@@ -33,7 -33,6 +33,7 @@@
   #include <linux/reboot.h>
   #include <linux/slab.h>
   #include <linux/ucs2_string.h>
+ +#include <linux/mem_encrypt.h>
   
   #include <asm/setup.h>
   #include <asm/page.h>
@@@ -196,6 -195,9 +196,9 @@@ static pgd_t *efi_pgd
    * because we want to avoid inserting EFI region mappings (EFI_VA_END
    * to EFI_VA_START) into the standard kernel page tables. Everything
    * else can be shared, see efi_sync_low_kernel_mappings().
+  *
+  * We don't want the pgd on the pgd_list and cannot use pgd_alloc() for the
+  * allocation.
    */
   int __init efi_alloc_page_tables(void)
   {
@@@ -207,8 -209,8 +210,8 @@@
         if (efi_enabled(EFI_OLD_MEMMAP))
                 return 0;
   
- -      gfp_mask = GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO;
+ +      gfp_mask = GFP_KERNEL | __GFP_ZERO;
-       efi_pgd = (pgd_t *)__get_free_page(gfp_mask);
+       efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, PGD_ALLOCATION_ORDER);
         if (!efi_pgd)
                 return -ENOMEM;
   
@@@ -371,11 -373,7 +374,11 @@@ int __init efi_setup_page_tables(unsign
          * as trim_bios_range() will reserve the first page and isolate it away
          * from memory allocators anyway.
          */
- -      if (kernel_map_pages_in_pgd(pgd, 0x0, 0x0, 1, _PAGE_RW)) {
+ +      pf = _PAGE_RW;
+ +      if (sev_active())
+ +              pf |= _PAGE_ENC;
+ +
+ +      if (kernel_map_pages_in_pgd(pgd, 0x0, 0x0, 1, pf)) {
                 pr_err("Failed to create 1:1 mapping for the first page!\n");
                 return 1;
         }
@@@ -418,9 -416,6 +421,9 @@@ static void __init __map_region(efi_mem
         if (!(md->attribute & EFI_MEMORY_WB))
                 flags |= _PAGE_PCD;
   
+ +      if (sev_active())
+ +              flags |= _PAGE_ENC;
+ +
         pfn = md->phys_addr >> PAGE_SHIFT;
         if (kernel_map_pages_in_pgd(pgd, pfn, va, md->num_pages, flags))
                 pr_warn("Error mapping PA 0x%llx -> VA 0x%llx!\n",
@@@ -547,9 -542,6 +550,9 @@@ static int __init efi_update_mem_attr(s
         if (!(md->attribute & EFI_MEMORY_RO))
                 pf |= _PAGE_RW;
   
+ +      if (sev_active())
+ +              pf |= _PAGE_ENC;
+ +
         return efi_update_mappings(md, pf);
   }
   
@@@ -601,9 -593,6 +604,9 @@@ void __init efi_runtime_update_mappings
                         (md->type != EFI_RUNTIME_SERVICES_CODE))
                         pf |= _PAGE_RW;
   
+ +              if (sev_active())
+ +                      pf |= _PAGE_ENC;
+ +
                 efi_update_mappings(md, pf);
         }
   }
diff --combined init/main.c

index 7b606fc4848264f3eb52a86bc2f6480585cb5654,b32ec72cdf3dd8731b53b57975d96d2edd6cbf0a..a8100b9548398e8b102052f2c1418b21ea423825
--- 1/init/main.c
--- 2/init/main.c
+++ b/init/main.c
@@@ -46,7 -46,6 +46,7 @@@
   #include <linux/cgroup.h>
   #include <linux/efi.h>
   #include <linux/tick.h>
+ +#include <linux/sched/isolation.h>
   #include <linux/interrupt.h>
   #include <linux/taskstats_kern.h>
   #include <linux/delayacct.h>
@@@ -70,11 -69,13 +70,12 @@@
   #include <linux/kgdb.h>
   #include <linux/ftrace.h>
   #include <linux/async.h>
- -#include <linux/kmemcheck.h>
   #include <linux/sfi.h>
   #include <linux/shmem_fs.h>
   #include <linux/slab.h>
   #include <linux/perf_event.h>
   #include <linux/ptrace.h>
+ #include <linux/pti.h>
   #include <linux/blkdev.h>
   #include <linux/elevator.h>
   #include <linux/sched_clock.h>
@@@ -506,6 -507,8 +507,8 @@@ static void __init mm_init(void
         ioremap_huge_init();
         /* Should be run before the first non-init thread is created */
         init_espfix_bsp();
+       /* Should be run after espfix64 is set up. */
+       pti_init();
   }
   
   asmlinkage __visible void __init start_kernel(void)
@@@ -564,6 -567,7 +567,6 @@@
          * kmem_cache_init()
          */
         setup_log_buf(0);
- -      pidhash_init();
         vfs_caches_init_early();
         sort_main_extable();
         trap_init();
@@@ -590,12 -594,6 +593,12 @@@
                 local_irq_disable();
         radix_tree_init();
   
+ +      /*
+ +       * Set up housekeeping before setting up workqueues to allow the unbound
+ +       * workqueue to take non-housekeeping into account.
+ +       */
+ +      housekeeping_init();
+ +
         /*
          * Allow workqueue creation and work item queueing/cancelling
          * early.  Work item execution depends on kthreads and starts after
@@@ -671,12 -669,12 +674,12 @@@
         debug_objects_mem_init();
         setup_per_cpu_pageset();
         numa_policy_init();
+ +      acpi_early_init();
         if (late_time_init)
                 late_time_init();
         calibrate_delay();
- -      pidmap_init();
+ +      pid_idr_init();
         anon_vma_init();
- -      acpi_early_init();
   #ifdef CONFIG_X86
         if (efi_enabled(EFI_RUNTIME_SERVICES))
                 efi_enter_virtual_mode();
author	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 30 Dec 2017 01:02:49 +0000 (17:02 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 30 Dec 2017 01:02:49 +0000 (17:02 -0800)
		1	2
Documentation/admin-guide/kernel-parameters.txt	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/boot/compressed/pagetable.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/entry/entry_64.S	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/include/asm/desc.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/include/asm/disabled-features.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/include/asm/pgtable.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/include/asm/processor.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/include/asm/tlbflush.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/include/uapi/asm/processor-flags.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/cpu/common.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/mm/Makefile	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/mm/init.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/mm/pgtable.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/platform/efi/efi_64.c	patch \|	diff1 \|	diff2 \|	blob \| history
init/main.c	patch \|	diff1 \|	diff2 \|	blob \| history