x86/mm: Replace compile-time checks for 5-level paging with runtime checks
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 5bfe61a5e8e3c672bf3ba65d17903a2bdaa20d92..92cb8a901c364db1a74f624711618d756be01879 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -6,13 +6,14 @@
 #include <linux/interrupt.h>
 #include <linux/export.h>
 #include <linux/cpu.h>
+#include <linux/debugfs.h>
 
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
+#include <asm/nospec-branch.h>
 #include <asm/cache.h>
 #include <asm/apic.h>
 #include <asm/uv/uv.h>
-#include <linux/debugfs.h>
 
 /*
  *     TLB flushing, formerly SMP-only
@@ -156,7 +157,7 @@ static void sync_current_stack_to_mm(struct mm_struct *mm)
        unsigned long sp = current_stack_pointer;
        pgd_t *pgd = pgd_offset(mm, sp);
 
-       if (CONFIG_PGTABLE_LEVELS > 4) {
+       if (pgtable_l5_enabled) {
                if (unlikely(pgd_none(*pgd))) {
                        pgd_t *pgd_ref = pgd_offset_k(sp);
 
@@ -228,6 +229,12 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 #endif
        this_cpu_write(cpu_tlbstate.is_lazy, false);
 
+       /*
+        * The membarrier system call requires a full memory barrier and
+        * core serialization before returning to user-space, after
+        * storing to rq->curr. Writing to CR3 provides that full
+        * memory barrier and core serializing instruction.
+        */
        if (real_prev == next) {
                VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
                           next->context.ctx_id);
@@ -247,6 +254,27 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
        } else {
                u16 new_asid;
                bool need_flush;
+               u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
+
+               /*
+                * Avoid user/user BTB poisoning by flushing the branch
+                * predictor when switching between processes. This stops
+                * one process from doing Spectre-v2 attacks on another.
+                *
+                * As an optimization, flush indirect branches only when
+                * switching into processes that disable dumping. This
+                * protects high value processes like gpg, without having
+                * too high performance overhead. IBPB is *expensive*!
+                *
+                * This will not flush branches when switching into kernel
+                * threads. It will also not flush if we switch to idle
+                * thread and back to the same process. It will flush if we
+                * switch to a different non-dumpable process.
+                */
+               if (tsk && tsk->mm &&
+                   tsk->mm->context.ctx_id != last_ctx_id &&
+                   get_dumpable(tsk->mm) != SUID_DUMP_USER)
+                       indirect_branch_prediction_barrier();
 
                if (IS_ENABLED(CONFIG_VMAP_STACK)) {
                        /*
@@ -292,6 +320,14 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
                        trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
                }
 
+               /*
+                * Record last user mm's context id, so we can avoid
+                * flushing branch buffer with IBPB if we switch back
+                * to the same user.
+                */
+               if (next != &init_mm)
+                       this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
+
                this_cpu_write(cpu_tlbstate.loaded_mm, next);
                this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
        }
@@ -369,6 +405,7 @@ void initialize_tlbstate_and_flush(void)
        write_cr3(build_cr3(mm->pgd, 0));
 
        /* Reinitialize tlbstate. */
+       this_cpu_write(cpu_tlbstate.last_ctx_id, mm->context.ctx_id);
        this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
        this_cpu_write(cpu_tlbstate.next_asid, 1);
        this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
@@ -576,7 +613,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 {
        int cpu;
 
-       struct flush_tlb_info info = {
+       struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = {
                .mm = mm,
        };