From: Fabian Grünbichler Date: Fri, 19 Jan 2018 11:43:16 +0000 (+0100) Subject: fix #1622: i40e memory leak X-Git-Url: https://git.proxmox.com/?p=pve-kernel.git;a=commitdiff_plain;h=a0f7ab8a6a11503e0ce26a89992d368bf25f052a fix #1622: i40e memory leak cherry-pick from upstream 4.14 --- diff --git a/patches/kernel/0017-i40e-Fix-memory-leak-related-filter-programming-stat.patch b/patches/kernel/0017-i40e-Fix-memory-leak-related-filter-programming-stat.patch new file mode 100644 index 0000000..e318a18 --- /dev/null +++ b/patches/kernel/0017-i40e-Fix-memory-leak-related-filter-programming-stat.patch @@ -0,0 +1,127 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Alexander Duyck +Date: Wed, 4 Oct 2017 08:44:43 -0700 +Subject: [PATCH] i40e: Fix memory leak related filter programming status +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +It looks like we weren't correctly placing the pages from buffers that had +been used to return a filter programming status back on the ring. As a +result they were being overwritten and tracking of the pages was lost. + +This change works to correct that by incorporating part of +i40e_put_rx_buffer into the programming status handler code. As a result we +should now be correctly placing the pages for those buffers on the +re-allocation list instead of letting them stay in place. + +Fixes: 0e626ff7ccbf ("i40e: Fix support for flow director programming status") +Reported-by: Anders K. 
Pedersen +Signed-off-by: Alexander Duyck +Tested-by: Anders K Pedersen +Signed-off-by: Jeff Kirsher +(cherry picked from commit 2b9478ffc550f17c6cd8c69057234e91150f5972) +Signed-off-by: Fabian Grünbichler +--- + drivers/net/ethernet/intel/i40e/i40e_txrx.c | 63 ++++++++++++++++------------- + 1 file changed, 36 insertions(+), 27 deletions(-) + +diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c +index 2194960d5855..391b1878c24b 100644 +--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c +@@ -1042,6 +1042,32 @@ static bool i40e_set_new_dynamic_itr(struct i40e_ring_container *rc) + return false; + } + ++/** ++ * i40e_reuse_rx_page - page flip buffer and store it back on the ring ++ * @rx_ring: rx descriptor ring to store buffers on ++ * @old_buff: donor buffer to have page reused ++ * ++ * Synchronizes page for reuse by the adapter ++ **/ ++static void i40e_reuse_rx_page(struct i40e_ring *rx_ring, ++ struct i40e_rx_buffer *old_buff) ++{ ++ struct i40e_rx_buffer *new_buff; ++ u16 nta = rx_ring->next_to_alloc; ++ ++ new_buff = &rx_ring->rx_bi[nta]; ++ ++ /* update, and store next to alloc */ ++ nta++; ++ rx_ring->next_to_alloc = (nta < rx_ring->count) ? 
nta : 0; ++ ++ /* transfer page from old buffer to new buffer */ ++ new_buff->dma = old_buff->dma; ++ new_buff->page = old_buff->page; ++ new_buff->page_offset = old_buff->page_offset; ++ new_buff->pagecnt_bias = old_buff->pagecnt_bias; ++} ++ + /** + * i40e_rx_is_programming_status - check for programming status descriptor + * @qw: qword representing status_error_len in CPU ordering +@@ -1076,15 +1102,24 @@ static void i40e_clean_programming_status(struct i40e_ring *rx_ring, + union i40e_rx_desc *rx_desc, + u64 qw) + { +- u32 ntc = rx_ring->next_to_clean + 1; ++ struct i40e_rx_buffer *rx_buffer; ++ u32 ntc = rx_ring->next_to_clean; + u8 id; + + /* fetch, update, and store next to clean */ ++ rx_buffer = &rx_ring->rx_bi[ntc++]; + ntc = (ntc < rx_ring->count) ? ntc : 0; + rx_ring->next_to_clean = ntc; + + prefetch(I40E_RX_DESC(rx_ring, ntc)); + ++ /* place unused page back on the ring */ ++ i40e_reuse_rx_page(rx_ring, rx_buffer); ++ rx_ring->rx_stats.page_reuse_count++; ++ ++ /* clear contents of buffer_info */ ++ rx_buffer->page = NULL; ++ + id = (qw & I40E_RX_PROG_STATUS_DESC_QW1_PROGID_MASK) >> + I40E_RX_PROG_STATUS_DESC_QW1_PROGID_SHIFT; + +@@ -1643,32 +1678,6 @@ static bool i40e_cleanup_headers(struct i40e_ring *rx_ring, struct sk_buff *skb, + return false; + } + +-/** +- * i40e_reuse_rx_page - page flip buffer and store it back on the ring +- * @rx_ring: rx descriptor ring to store buffers on +- * @old_buff: donor buffer to have page reused +- * +- * Synchronizes page for reuse by the adapter +- **/ +-static void i40e_reuse_rx_page(struct i40e_ring *rx_ring, +- struct i40e_rx_buffer *old_buff) +-{ +- struct i40e_rx_buffer *new_buff; +- u16 nta = rx_ring->next_to_alloc; +- +- new_buff = &rx_ring->rx_bi[nta]; +- +- /* update, and store next to alloc */ +- nta++; +- rx_ring->next_to_alloc = (nta < rx_ring->count) ? 
nta : 0; +- +- /* transfer page from old buffer to new buffer */ +- new_buff->dma = old_buff->dma; +- new_buff->page = old_buff->page; +- new_buff->page_offset = old_buff->page_offset; +- new_buff->pagecnt_bias = old_buff->pagecnt_bias; +-} +- + /** + * i40e_page_is_reusable - check if any reuse is possible + * @page: page struct to check +-- +2.14.2 + diff --git a/patches/kernel/0017-x86-mm-Add-the-nopcid-boot-option-to-turn-off-PCID.patch b/patches/kernel/0017-x86-mm-Add-the-nopcid-boot-option-to-turn-off-PCID.patch deleted file mode 100644 index bc566a5..0000000 --- a/patches/kernel/0017-x86-mm-Add-the-nopcid-boot-option-to-turn-off-PCID.patch +++ /dev/null @@ -1,83 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Thu, 29 Jun 2017 08:53:20 -0700 -Subject: [PATCH] x86/mm: Add the 'nopcid' boot option to turn off PCID -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -The parameter is only present on x86_64 systems to save a few bytes, -as PCID is always disabled on x86_32. 
- -Signed-off-by: Andy Lutomirski -Reviewed-by: Nadav Amit -Reviewed-by: Borislav Petkov -Reviewed-by: Thomas Gleixner -Cc: Andrew Morton -Cc: Arjan van de Ven -Cc: Borislav Petkov -Cc: Dave Hansen -Cc: Linus Torvalds -Cc: Mel Gorman -Cc: Peter Zijlstra -Cc: Rik van Riel -Cc: linux-mm@kvack.org -Link: http://lkml.kernel.org/r/8bbb2e65bcd249a5f18bfb8128b4689f08ac2b60.1498751203.git.luto@kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit 0790c9aad84901ca1bdc14746175549c8b5da215) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 62d3a63645c17611fe8ccc0c5adc5e840d9cff7b) -Signed-off-by: Fabian Grünbichler ---- - Documentation/admin-guide/kernel-parameters.txt | 2 ++ - arch/x86/kernel/cpu/common.c | 18 ++++++++++++++++++ - 2 files changed, 20 insertions(+) - -diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index 73fd6abac39b..3510e255ef4c 100644 ---- a/Documentation/admin-guide/kernel-parameters.txt -+++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -2700,6 +2700,8 @@ - nopat [X86] Disable PAT (page attribute table extension of - pagetables) support. - -+ nopcid [X86-64] Disable the PCID cpu feature. -+ - norandmaps Don't use address space randomization. 
Equivalent to - echo 0 > /proc/sys/kernel/randomize_va_space - -diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c -index c8b39870f33e..904485e7b230 100644 ---- a/arch/x86/kernel/cpu/common.c -+++ b/arch/x86/kernel/cpu/common.c -@@ -168,6 +168,24 @@ static int __init x86_mpx_setup(char *s) - } - __setup("nompx", x86_mpx_setup); - -+#ifdef CONFIG_X86_64 -+static int __init x86_pcid_setup(char *s) -+{ -+ /* require an exact match without trailing characters */ -+ if (strlen(s)) -+ return 0; -+ -+ /* do not emit a message if the feature is not present */ -+ if (!boot_cpu_has(X86_FEATURE_PCID)) -+ return 1; -+ -+ setup_clear_cpu_cap(X86_FEATURE_PCID); -+ pr_info("nopcid: PCID feature disabled\n"); -+ return 1; -+} -+__setup("nopcid", x86_pcid_setup); -+#endif -+ - static int __init x86_noinvpcid_setup(char *s) - { - /* noinvpcid doesn't accept parameters */ --- -2.14.2 - diff --git a/patches/kernel/0018-x86-mm-Add-the-nopcid-boot-option-to-turn-off-PCID.patch b/patches/kernel/0018-x86-mm-Add-the-nopcid-boot-option-to-turn-off-PCID.patch new file mode 100644 index 0000000..bc566a5 --- /dev/null +++ b/patches/kernel/0018-x86-mm-Add-the-nopcid-boot-option-to-turn-off-PCID.patch @@ -0,0 +1,83 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Thu, 29 Jun 2017 08:53:20 -0700 +Subject: [PATCH] x86/mm: Add the 'nopcid' boot option to turn off PCID +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +The parameter is only present on x86_64 systems to save a few bytes, +as PCID is always disabled on x86_32. 
+ +Signed-off-by: Andy Lutomirski +Reviewed-by: Nadav Amit +Reviewed-by: Borislav Petkov +Reviewed-by: Thomas Gleixner +Cc: Andrew Morton +Cc: Arjan van de Ven +Cc: Borislav Petkov +Cc: Dave Hansen +Cc: Linus Torvalds +Cc: Mel Gorman +Cc: Peter Zijlstra +Cc: Rik van Riel +Cc: linux-mm@kvack.org +Link: http://lkml.kernel.org/r/8bbb2e65bcd249a5f18bfb8128b4689f08ac2b60.1498751203.git.luto@kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit 0790c9aad84901ca1bdc14746175549c8b5da215) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 62d3a63645c17611fe8ccc0c5adc5e840d9cff7b) +Signed-off-by: Fabian Grünbichler +--- + Documentation/admin-guide/kernel-parameters.txt | 2 ++ + arch/x86/kernel/cpu/common.c | 18 ++++++++++++++++++ + 2 files changed, 20 insertions(+) + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index 73fd6abac39b..3510e255ef4c 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -2700,6 +2700,8 @@ + nopat [X86] Disable PAT (page attribute table extension of + pagetables) support. + ++ nopcid [X86-64] Disable the PCID cpu feature. ++ + norandmaps Don't use address space randomization. 
Equivalent to + echo 0 > /proc/sys/kernel/randomize_va_space + +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index c8b39870f33e..904485e7b230 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -168,6 +168,24 @@ static int __init x86_mpx_setup(char *s) + } + __setup("nompx", x86_mpx_setup); + ++#ifdef CONFIG_X86_64 ++static int __init x86_pcid_setup(char *s) ++{ ++ /* require an exact match without trailing characters */ ++ if (strlen(s)) ++ return 0; ++ ++ /* do not emit a message if the feature is not present */ ++ if (!boot_cpu_has(X86_FEATURE_PCID)) ++ return 1; ++ ++ setup_clear_cpu_cap(X86_FEATURE_PCID); ++ pr_info("nopcid: PCID feature disabled\n"); ++ return 1; ++} ++__setup("nopcid", x86_pcid_setup); ++#endif ++ + static int __init x86_noinvpcid_setup(char *s) + { + /* noinvpcid doesn't accept parameters */ +-- +2.14.2 + diff --git a/patches/kernel/0018-x86-mm-Enable-CR4.PCIDE-on-supported-systems.patch b/patches/kernel/0018-x86-mm-Enable-CR4.PCIDE-on-supported-systems.patch deleted file mode 100644 index a718862..0000000 --- a/patches/kernel/0018-x86-mm-Enable-CR4.PCIDE-on-supported-systems.patch +++ /dev/null @@ -1,120 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Thu, 29 Jun 2017 08:53:21 -0700 -Subject: [PATCH] x86/mm: Enable CR4.PCIDE on supported systems -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -We can use PCID if the CPU has PCID and PGE and we're not on Xen. - -By itself, this has no effect. A followup patch will start using PCID. 
- -Signed-off-by: Andy Lutomirski -Reviewed-by: Nadav Amit -Reviewed-by: Boris Ostrovsky -Reviewed-by: Thomas Gleixner -Cc: Andrew Morton -Cc: Arjan van de Ven -Cc: Borislav Petkov -Cc: Dave Hansen -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Mel Gorman -Cc: Peter Zijlstra -Cc: Rik van Riel -Cc: linux-mm@kvack.org -Link: http://lkml.kernel.org/r/6327ecd907b32f79d5aa0d466f04503bbec5df88.1498751203.git.luto@kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit 660da7c9228f685b2ebe664f9fd69aaddcc420b5) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 7d6bbe5528395f18de50bd2532843546c849883d) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/tlbflush.h | 8 ++++++++ - arch/x86/kernel/cpu/common.c | 22 ++++++++++++++++++++++ - arch/x86/xen/enlighten_pv.c | 6 ++++++ - 3 files changed, 36 insertions(+) - -diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h -index 50ea3482e1d1..2b3d68093235 100644 ---- a/arch/x86/include/asm/tlbflush.h -+++ b/arch/x86/include/asm/tlbflush.h -@@ -207,6 +207,14 @@ static inline void __flush_tlb_all(void) - __flush_tlb_global(); - else - __flush_tlb(); -+ -+ /* -+ * Note: if we somehow had PCID but not PGE, then this wouldn't work -- -+ * we'd end up flushing kernel translations for the current ASID but -+ * we might fail to flush kernel translations for other cached ASIDs. -+ * -+ * To avoid this issue, we force PCID off if PGE is off. 
-+ */ - } - - static inline void __flush_tlb_one(unsigned long addr) -diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c -index 904485e7b230..b95cd94ca97b 100644 ---- a/arch/x86/kernel/cpu/common.c -+++ b/arch/x86/kernel/cpu/common.c -@@ -329,6 +329,25 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) - } - } - -+static void setup_pcid(struct cpuinfo_x86 *c) -+{ -+ if (cpu_has(c, X86_FEATURE_PCID)) { -+ if (cpu_has(c, X86_FEATURE_PGE)) { -+ cr4_set_bits(X86_CR4_PCIDE); -+ } else { -+ /* -+ * flush_tlb_all(), as currently implemented, won't -+ * work if PCID is on but PGE is not. Since that -+ * combination doesn't exist on real hardware, there's -+ * no reason to try to fully support it, but it's -+ * polite to avoid corrupting data if we're on -+ * an improperly configured VM. -+ */ -+ clear_cpu_cap(c, X86_FEATURE_PCID); -+ } -+ } -+} -+ - /* - * Protection Keys are not available in 32-bit mode. - */ -@@ -1143,6 +1162,9 @@ static void identify_cpu(struct cpuinfo_x86 *c) - setup_smep(c); - setup_smap(c); - -+ /* Set up PCID */ -+ setup_pcid(c); -+ - /* - * The vendor-specific functions might have changed features. - * Now we do "generic changes." -diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c -index 811e4ddb3f37..290bc5ac9852 100644 ---- a/arch/x86/xen/enlighten_pv.c -+++ b/arch/x86/xen/enlighten_pv.c -@@ -264,6 +264,12 @@ static void __init xen_init_capabilities(void) - setup_clear_cpu_cap(X86_FEATURE_ACC); - setup_clear_cpu_cap(X86_FEATURE_X2APIC); - -+ /* -+ * Xen PV would need some work to support PCID: CR3 handling as well -+ * as xen_flush_tlb_others() would need updating. 
-+ */ -+ setup_clear_cpu_cap(X86_FEATURE_PCID); -+ - if (!xen_initial_domain()) - setup_clear_cpu_cap(X86_FEATURE_ACPI); - --- -2.14.2 - diff --git a/patches/kernel/0019-x86-mm-Document-how-CR4.PCIDE-restore-works.patch b/patches/kernel/0019-x86-mm-Document-how-CR4.PCIDE-restore-works.patch deleted file mode 100644 index 03ccd7a..0000000 --- a/patches/kernel/0019-x86-mm-Document-how-CR4.PCIDE-restore-works.patch +++ /dev/null @@ -1,54 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Wed, 6 Sep 2017 19:54:54 -0700 -Subject: [PATCH] x86/mm: Document how CR4.PCIDE restore works -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -While debugging a problem, I thought that using -cr4_set_bits_and_update_boot() to restore CR4.PCIDE would be -helpful. It turns out to be counterproductive. - -Add a comment documenting how this works. - -Signed-off-by: Andy Lutomirski -Signed-off-by: Linus Torvalds -(cherry picked from commit 1c9fe4409ce3e9c78b1ed96ee8ed699d4f03bf33) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 0d69e4c4a2db42a9bac6609a3df15bd91163f8b9) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/cpu/common.c | 13 +++++++++++++ - 1 file changed, 13 insertions(+) - -diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c -index b95cd94ca97b..0b80ed14ff52 100644 ---- a/arch/x86/kernel/cpu/common.c -+++ b/arch/x86/kernel/cpu/common.c -@@ -333,6 +333,19 @@ static void setup_pcid(struct cpuinfo_x86 *c) - { - if (cpu_has(c, X86_FEATURE_PCID)) { - if (cpu_has(c, X86_FEATURE_PGE)) { -+ /* -+ * We'd like to use cr4_set_bits_and_update_boot(), -+ * but we can't. CR4.PCIDE is special and can only -+ * be set in long mode, and the early CPU init code -+ * doesn't know this and would try to restore CR4.PCIDE -+ * prior to entering long mode. 
-+ * -+ * Instead, we rely on the fact that hotplug, resume, -+ * etc all fully restore CR4 before they write anything -+ * that could have nonzero PCID bits to CR3. CR4.PCIDE -+ * has no effect on the page tables themselves, so we -+ * don't need it to be restored early. -+ */ - cr4_set_bits(X86_CR4_PCIDE); - } else { - /* --- -2.14.2 - diff --git a/patches/kernel/0019-x86-mm-Enable-CR4.PCIDE-on-supported-systems.patch b/patches/kernel/0019-x86-mm-Enable-CR4.PCIDE-on-supported-systems.patch new file mode 100644 index 0000000..a718862 --- /dev/null +++ b/patches/kernel/0019-x86-mm-Enable-CR4.PCIDE-on-supported-systems.patch @@ -0,0 +1,120 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Thu, 29 Jun 2017 08:53:21 -0700 +Subject: [PATCH] x86/mm: Enable CR4.PCIDE on supported systems +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +We can use PCID if the CPU has PCID and PGE and we're not on Xen. + +By itself, this has no effect. A followup patch will start using PCID. 
+ +Signed-off-by: Andy Lutomirski +Reviewed-by: Nadav Amit +Reviewed-by: Boris Ostrovsky +Reviewed-by: Thomas Gleixner +Cc: Andrew Morton +Cc: Arjan van de Ven +Cc: Borislav Petkov +Cc: Dave Hansen +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Mel Gorman +Cc: Peter Zijlstra +Cc: Rik van Riel +Cc: linux-mm@kvack.org +Link: http://lkml.kernel.org/r/6327ecd907b32f79d5aa0d466f04503bbec5df88.1498751203.git.luto@kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit 660da7c9228f685b2ebe664f9fd69aaddcc420b5) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 7d6bbe5528395f18de50bd2532843546c849883d) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/tlbflush.h | 8 ++++++++ + arch/x86/kernel/cpu/common.c | 22 ++++++++++++++++++++++ + arch/x86/xen/enlighten_pv.c | 6 ++++++ + 3 files changed, 36 insertions(+) + +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index 50ea3482e1d1..2b3d68093235 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -207,6 +207,14 @@ static inline void __flush_tlb_all(void) + __flush_tlb_global(); + else + __flush_tlb(); ++ ++ /* ++ * Note: if we somehow had PCID but not PGE, then this wouldn't work -- ++ * we'd end up flushing kernel translations for the current ASID but ++ * we might fail to flush kernel translations for other cached ASIDs. ++ * ++ * To avoid this issue, we force PCID off if PGE is off. 
++ */ + } + + static inline void __flush_tlb_one(unsigned long addr) +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 904485e7b230..b95cd94ca97b 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -329,6 +329,25 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) + } + } + ++static void setup_pcid(struct cpuinfo_x86 *c) ++{ ++ if (cpu_has(c, X86_FEATURE_PCID)) { ++ if (cpu_has(c, X86_FEATURE_PGE)) { ++ cr4_set_bits(X86_CR4_PCIDE); ++ } else { ++ /* ++ * flush_tlb_all(), as currently implemented, won't ++ * work if PCID is on but PGE is not. Since that ++ * combination doesn't exist on real hardware, there's ++ * no reason to try to fully support it, but it's ++ * polite to avoid corrupting data if we're on ++ * an improperly configured VM. ++ */ ++ clear_cpu_cap(c, X86_FEATURE_PCID); ++ } ++ } ++} ++ + /* + * Protection Keys are not available in 32-bit mode. + */ +@@ -1143,6 +1162,9 @@ static void identify_cpu(struct cpuinfo_x86 *c) + setup_smep(c); + setup_smap(c); + ++ /* Set up PCID */ ++ setup_pcid(c); ++ + /* + * The vendor-specific functions might have changed features. + * Now we do "generic changes." +diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c +index 811e4ddb3f37..290bc5ac9852 100644 +--- a/arch/x86/xen/enlighten_pv.c ++++ b/arch/x86/xen/enlighten_pv.c +@@ -264,6 +264,12 @@ static void __init xen_init_capabilities(void) + setup_clear_cpu_cap(X86_FEATURE_ACC); + setup_clear_cpu_cap(X86_FEATURE_X2APIC); + ++ /* ++ * Xen PV would need some work to support PCID: CR3 handling as well ++ * as xen_flush_tlb_others() would need updating. 
++ */ ++ setup_clear_cpu_cap(X86_FEATURE_PCID); ++ + if (!xen_initial_domain()) + setup_clear_cpu_cap(X86_FEATURE_ACPI); + +-- +2.14.2 + diff --git a/patches/kernel/0020-x86-entry-64-Refactor-IRQ-stacks-and-make-them-NMI-s.patch b/patches/kernel/0020-x86-entry-64-Refactor-IRQ-stacks-and-make-them-NMI-s.patch deleted file mode 100644 index edabecd..0000000 --- a/patches/kernel/0020-x86-entry-64-Refactor-IRQ-stacks-and-make-them-NMI-s.patch +++ /dev/null @@ -1,201 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Tue, 11 Jul 2017 10:33:38 -0500 -Subject: [PATCH] x86/entry/64: Refactor IRQ stacks and make them NMI-safe -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -This will allow IRQ stacks to nest inside NMIs or similar entries -that can happen during IRQ stack setup or teardown. - -The new macros won't work correctly if they're invoked with IRQs on. -Add a check under CONFIG_DEBUG_ENTRY to detect that. - -Signed-off-by: Andy Lutomirski -[ Use %r10 instead of %r11 in xen_do_hypervisor_callback to make objtool - and ORC unwinder's lives a little easier. ] -Signed-off-by: Josh Poimboeuf -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Denys Vlasenko -Cc: H. 
Peter Anvin -Cc: Jiri Slaby -Cc: Linus Torvalds -Cc: Mike Galbraith -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Cc: live-patching@vger.kernel.org -Link: http://lkml.kernel.org/r/b0b2ff5fb97d2da2e1d7e1f380190c92545c8bb5.1499786555.git.jpoimboe@redhat.com -Signed-off-by: Ingo Molnar -(cherry picked from commit 1d3e53e8624a3ec85f4041ca6d973da7c1575938) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit be58b042e135d0ee777a54798f33015857d7e2e0) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/process_64.c | 3 ++ - arch/x86/Kconfig.debug | 2 -- - arch/x86/entry/entry_64.S | 85 +++++++++++++++++++++++++++++++------------- - 3 files changed, 64 insertions(+), 26 deletions(-) - -diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c -index fe56e6f93cbb..1e7701c4cd80 100644 ---- a/arch/x86/kernel/process_64.c -+++ b/arch/x86/kernel/process_64.c -@@ -404,6 +404,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) - int cpu = smp_processor_id(); - struct tss_struct *tss = &per_cpu(cpu_tss, cpu); - -+ WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && -+ this_cpu_read(irq_count) != -1); -+ - switch_fpu_prepare(prev_fpu, cpu); - - /* We must save %fs and %gs before load_TLS() because -diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug -index cd20ca0b4043..1fc519f3c49e 100644 ---- a/arch/x86/Kconfig.debug -+++ b/arch/x86/Kconfig.debug -@@ -305,8 +305,6 @@ config DEBUG_ENTRY - Some of these sanity checks may slow down kernel entries and - exits or otherwise impact performance. - -- This is currently used to help test NMI code. -- - If unsure, say N. 
- - config DEBUG_NMI_SELFTEST -diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S -index 6d078b89a5e8..07b4056af8a8 100644 ---- a/arch/x86/entry/entry_64.S -+++ b/arch/x86/entry/entry_64.S -@@ -447,6 +447,59 @@ ENTRY(irq_entries_start) - .endr - END(irq_entries_start) - -+.macro DEBUG_ENTRY_ASSERT_IRQS_OFF -+#ifdef CONFIG_DEBUG_ENTRY -+ pushfq -+ testl $X86_EFLAGS_IF, (%rsp) -+ jz .Lokay_\@ -+ ud2 -+.Lokay_\@: -+ addq $8, %rsp -+#endif -+.endm -+ -+/* -+ * Enters the IRQ stack if we're not already using it. NMI-safe. Clobbers -+ * flags and puts old RSP into old_rsp, and leaves all other GPRs alone. -+ * Requires kernel GSBASE. -+ * -+ * The invariant is that, if irq_count != -1, then the IRQ stack is in use. -+ */ -+.macro ENTER_IRQ_STACK old_rsp -+ DEBUG_ENTRY_ASSERT_IRQS_OFF -+ movq %rsp, \old_rsp -+ incl PER_CPU_VAR(irq_count) -+ -+ /* -+ * Right now, if we just incremented irq_count to zero, we've -+ * claimed the IRQ stack but we haven't switched to it yet. -+ * -+ * If anything is added that can interrupt us here without using IST, -+ * it must be *extremely* careful to limit its stack usage. This -+ * could include kprobes and a hypothetical future IST-less #DB -+ * handler. -+ */ -+ -+ cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp -+ pushq \old_rsp -+.endm -+ -+/* -+ * Undoes ENTER_IRQ_STACK. -+ */ -+.macro LEAVE_IRQ_STACK -+ DEBUG_ENTRY_ASSERT_IRQS_OFF -+ /* We need to be off the IRQ stack before decrementing irq_count. */ -+ popq %rsp -+ -+ /* -+ * As in ENTER_IRQ_STACK, irq_count == 0, we are still claiming -+ * the irq stack but we're not on it. -+ */ -+ -+ decl PER_CPU_VAR(irq_count) -+.endm -+ - /* - * Interrupt entry/exit. - * -@@ -485,17 +538,7 @@ END(irq_entries_start) - CALL_enter_from_user_mode - - 1: -- /* -- * Save previous stack pointer, optionally switch to interrupt stack. -- * irq_count is used to check if a CPU is already on an interrupt stack -- * or not. 
While this is essentially redundant with preempt_count it is -- * a little cheaper to use a separate counter in the PDA (short of -- * moving irq_enter into assembly, which would be too much work) -- */ -- movq %rsp, %rdi -- incl PER_CPU_VAR(irq_count) -- cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp -- pushq %rdi -+ ENTER_IRQ_STACK old_rsp=%rdi - /* We entered an interrupt context - irqs are off: */ - TRACE_IRQS_OFF - -@@ -515,10 +558,8 @@ common_interrupt: - ret_from_intr: - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_OFF -- decl PER_CPU_VAR(irq_count) - -- /* Restore saved previous stack */ -- popq %rsp -+ LEAVE_IRQ_STACK - - testb $3, CS(%rsp) - jz retint_kernel -@@ -892,12 +933,10 @@ bad_gs: - ENTRY(do_softirq_own_stack) - pushq %rbp - mov %rsp, %rbp -- incl PER_CPU_VAR(irq_count) -- cmove PER_CPU_VAR(irq_stack_ptr), %rsp -- push %rbp /* frame pointer backlink */ -+ ENTER_IRQ_STACK old_rsp=%r11 - call __do_softirq -+ LEAVE_IRQ_STACK - leaveq -- decl PER_CPU_VAR(irq_count) - ret - END(do_softirq_own_stack) - -@@ -924,13 +963,11 @@ ENTRY(xen_do_hypervisor_callback) /* do_hypervisor_callback(struct *pt_regs) */ - * see the correct pointer to the pt_regs - */ - movq %rdi, %rsp /* we don't return, adjust the stack frame */ --11: incl PER_CPU_VAR(irq_count) -- movq %rsp, %rbp -- cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp -- pushq %rbp /* frame pointer backlink */ -+ -+ ENTER_IRQ_STACK old_rsp=%r10 - call xen_evtchn_do_upcall -- popq %rsp -- decl PER_CPU_VAR(irq_count) -+ LEAVE_IRQ_STACK -+ - #ifndef CONFIG_PREEMPT - call xen_maybe_preempt_hcall - #endif --- -2.14.2 - diff --git a/patches/kernel/0020-x86-mm-Document-how-CR4.PCIDE-restore-works.patch b/patches/kernel/0020-x86-mm-Document-how-CR4.PCIDE-restore-works.patch new file mode 100644 index 0000000..03ccd7a --- /dev/null +++ b/patches/kernel/0020-x86-mm-Document-how-CR4.PCIDE-restore-works.patch @@ -0,0 +1,54 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Wed, 6 
Sep 2017 19:54:54 -0700 +Subject: [PATCH] x86/mm: Document how CR4.PCIDE restore works +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +While debugging a problem, I thought that using +cr4_set_bits_and_update_boot() to restore CR4.PCIDE would be +helpful. It turns out to be counterproductive. + +Add a comment documenting how this works. + +Signed-off-by: Andy Lutomirski +Signed-off-by: Linus Torvalds +(cherry picked from commit 1c9fe4409ce3e9c78b1ed96ee8ed699d4f03bf33) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 0d69e4c4a2db42a9bac6609a3df15bd91163f8b9) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/cpu/common.c | 13 +++++++++++++ + 1 file changed, 13 insertions(+) + +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index b95cd94ca97b..0b80ed14ff52 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -333,6 +333,19 @@ static void setup_pcid(struct cpuinfo_x86 *c) + { + if (cpu_has(c, X86_FEATURE_PCID)) { + if (cpu_has(c, X86_FEATURE_PGE)) { ++ /* ++ * We'd like to use cr4_set_bits_and_update_boot(), ++ * but we can't. CR4.PCIDE is special and can only ++ * be set in long mode, and the early CPU init code ++ * doesn't know this and would try to restore CR4.PCIDE ++ * prior to entering long mode. ++ * ++ * Instead, we rely on the fact that hotplug, resume, ++ * etc all fully restore CR4 before they write anything ++ * that could have nonzero PCID bits to CR3. CR4.PCIDE ++ * has no effect on the page tables themselves, so we ++ * don't need it to be restored early. 
++ */ + cr4_set_bits(X86_CR4_PCIDE); + } else { + /* +-- +2.14.2 + diff --git a/patches/kernel/0021-x86-entry-64-Initialize-the-top-of-the-IRQ-stack-bef.patch b/patches/kernel/0021-x86-entry-64-Initialize-the-top-of-the-IRQ-stack-bef.patch deleted file mode 100644 index f50fc39..0000000 --- a/patches/kernel/0021-x86-entry-64-Initialize-the-top-of-the-IRQ-stack-bef.patch +++ /dev/null @@ -1,94 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Tue, 11 Jul 2017 10:33:39 -0500 -Subject: [PATCH] x86/entry/64: Initialize the top of the IRQ stack before - switching stacks -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -The OOPS unwinder wants the word at the top of the IRQ stack to -point back to the previous stack at all times when the IRQ stack -is in use. There's currently a one-instruction window in ENTER_IRQ_STACK -during which this isn't the case. Fix it by writing the old RSP to the -top of the IRQ stack before jumping. - -This currently writes the pointer to the stack twice, which is a bit -ugly. We could get rid of this by replacing irq_stack_ptr with -irq_stack_ptr_minus_eight (better name welcome). OTOH, there may be -all kinds of odd microarchitectural considerations in play that -affect performance by a few cycles here. - -Reported-by: Mike Galbraith -Reported-by: Josh Poimboeuf -Signed-off-by: Andy Lutomirski -Signed-off-by: Josh Poimboeuf -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Denys Vlasenko -Cc: H. 
Peter Anvin -Cc: Jiri Slaby -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Cc: live-patching@vger.kernel.org -Link: http://lkml.kernel.org/r/aae7e79e49914808440ad5310ace138ced2179ca.1499786555.git.jpoimboe@redhat.com -Signed-off-by: Ingo Molnar -(cherry picked from commit 2995590964da93e1fd9a91550f9c9d9fab28f160) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit a753ff654dfd07a7f8d6f39a27126589eac7e55f) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/entry/entry_64.S | 24 +++++++++++++++++++++++- - 1 file changed, 23 insertions(+), 1 deletion(-) - -diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S -index 07b4056af8a8..184b70712545 100644 ---- a/arch/x86/entry/entry_64.S -+++ b/arch/x86/entry/entry_64.S -@@ -469,6 +469,7 @@ END(irq_entries_start) - DEBUG_ENTRY_ASSERT_IRQS_OFF - movq %rsp, \old_rsp - incl PER_CPU_VAR(irq_count) -+ jnz .Lirq_stack_push_old_rsp_\@ - - /* - * Right now, if we just incremented irq_count to zero, we've -@@ -478,9 +479,30 @@ END(irq_entries_start) - * it must be *extremely* careful to limit its stack usage. This - * could include kprobes and a hypothetical future IST-less #DB - * handler. -+ * -+ * The OOPS unwinder relies on the word at the top of the IRQ -+ * stack linking back to the previous RSP for the entire time we're -+ * on the IRQ stack. For this to work reliably, we need to write -+ * it before we actually move ourselves to the IRQ stack. -+ */ -+ -+ movq \old_rsp, PER_CPU_VAR(irq_stack_union + IRQ_STACK_SIZE - 8) -+ movq PER_CPU_VAR(irq_stack_ptr), %rsp -+ -+#ifdef CONFIG_DEBUG_ENTRY -+ /* -+ * If the first movq above becomes wrong due to IRQ stack layout -+ * changes, the only way we'll notice is if we try to unwind right -+ * here. Assert that we set up the stack right to catch this type -+ * of bug quickly. 
- */ -+ cmpq -8(%rsp), \old_rsp -+ je .Lirq_stack_okay\@ -+ ud2 -+ .Lirq_stack_okay\@: -+#endif - -- cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp -+.Lirq_stack_push_old_rsp_\@: - pushq \old_rsp - .endm - --- -2.14.2 - diff --git a/patches/kernel/0021-x86-entry-64-Refactor-IRQ-stacks-and-make-them-NMI-s.patch b/patches/kernel/0021-x86-entry-64-Refactor-IRQ-stacks-and-make-them-NMI-s.patch new file mode 100644 index 0000000..edabecd --- /dev/null +++ b/patches/kernel/0021-x86-entry-64-Refactor-IRQ-stacks-and-make-them-NMI-s.patch @@ -0,0 +1,201 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Tue, 11 Jul 2017 10:33:38 -0500 +Subject: [PATCH] x86/entry/64: Refactor IRQ stacks and make them NMI-safe +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +This will allow IRQ stacks to nest inside NMIs or similar entries +that can happen during IRQ stack setup or teardown. + +The new macros won't work correctly if they're invoked with IRQs on. +Add a check under CONFIG_DEBUG_ENTRY to detect that. + +Signed-off-by: Andy Lutomirski +[ Use %r10 instead of %r11 in xen_do_hypervisor_callback to make objtool + and ORC unwinder's lives a little easier. ] +Signed-off-by: Josh Poimboeuf +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Denys Vlasenko +Cc: H. 
Peter Anvin +Cc: Jiri Slaby +Cc: Linus Torvalds +Cc: Mike Galbraith +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: live-patching@vger.kernel.org +Link: http://lkml.kernel.org/r/b0b2ff5fb97d2da2e1d7e1f380190c92545c8bb5.1499786555.git.jpoimboe@redhat.com +Signed-off-by: Ingo Molnar +(cherry picked from commit 1d3e53e8624a3ec85f4041ca6d973da7c1575938) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit be58b042e135d0ee777a54798f33015857d7e2e0) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/process_64.c | 3 ++ + arch/x86/Kconfig.debug | 2 -- + arch/x86/entry/entry_64.S | 85 +++++++++++++++++++++++++++++++------------- + 3 files changed, 64 insertions(+), 26 deletions(-) + +diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c +index fe56e6f93cbb..1e7701c4cd80 100644 +--- a/arch/x86/kernel/process_64.c ++++ b/arch/x86/kernel/process_64.c +@@ -404,6 +404,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) + int cpu = smp_processor_id(); + struct tss_struct *tss = &per_cpu(cpu_tss, cpu); + ++ WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && ++ this_cpu_read(irq_count) != -1); ++ + switch_fpu_prepare(prev_fpu, cpu); + + /* We must save %fs and %gs before load_TLS() because +diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug +index cd20ca0b4043..1fc519f3c49e 100644 +--- a/arch/x86/Kconfig.debug ++++ b/arch/x86/Kconfig.debug +@@ -305,8 +305,6 @@ config DEBUG_ENTRY + Some of these sanity checks may slow down kernel entries and + exits or otherwise impact performance. + +- This is currently used to help test NMI code. +- + If unsure, say N. 
+ + config DEBUG_NMI_SELFTEST +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index 6d078b89a5e8..07b4056af8a8 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -447,6 +447,59 @@ ENTRY(irq_entries_start) + .endr + END(irq_entries_start) + ++.macro DEBUG_ENTRY_ASSERT_IRQS_OFF ++#ifdef CONFIG_DEBUG_ENTRY ++ pushfq ++ testl $X86_EFLAGS_IF, (%rsp) ++ jz .Lokay_\@ ++ ud2 ++.Lokay_\@: ++ addq $8, %rsp ++#endif ++.endm ++ ++/* ++ * Enters the IRQ stack if we're not already using it. NMI-safe. Clobbers ++ * flags and puts old RSP into old_rsp, and leaves all other GPRs alone. ++ * Requires kernel GSBASE. ++ * ++ * The invariant is that, if irq_count != -1, then the IRQ stack is in use. ++ */ ++.macro ENTER_IRQ_STACK old_rsp ++ DEBUG_ENTRY_ASSERT_IRQS_OFF ++ movq %rsp, \old_rsp ++ incl PER_CPU_VAR(irq_count) ++ ++ /* ++ * Right now, if we just incremented irq_count to zero, we've ++ * claimed the IRQ stack but we haven't switched to it yet. ++ * ++ * If anything is added that can interrupt us here without using IST, ++ * it must be *extremely* careful to limit its stack usage. This ++ * could include kprobes and a hypothetical future IST-less #DB ++ * handler. ++ */ ++ ++ cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp ++ pushq \old_rsp ++.endm ++ ++/* ++ * Undoes ENTER_IRQ_STACK. ++ */ ++.macro LEAVE_IRQ_STACK ++ DEBUG_ENTRY_ASSERT_IRQS_OFF ++ /* We need to be off the IRQ stack before decrementing irq_count. */ ++ popq %rsp ++ ++ /* ++ * As in ENTER_IRQ_STACK, irq_count == 0, we are still claiming ++ * the irq stack but we're not on it. ++ */ ++ ++ decl PER_CPU_VAR(irq_count) ++.endm ++ + /* + * Interrupt entry/exit. + * +@@ -485,17 +538,7 @@ END(irq_entries_start) + CALL_enter_from_user_mode + + 1: +- /* +- * Save previous stack pointer, optionally switch to interrupt stack. +- * irq_count is used to check if a CPU is already on an interrupt stack +- * or not. 
While this is essentially redundant with preempt_count it is +- * a little cheaper to use a separate counter in the PDA (short of +- * moving irq_enter into assembly, which would be too much work) +- */ +- movq %rsp, %rdi +- incl PER_CPU_VAR(irq_count) +- cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp +- pushq %rdi ++ ENTER_IRQ_STACK old_rsp=%rdi + /* We entered an interrupt context - irqs are off: */ + TRACE_IRQS_OFF + +@@ -515,10 +558,8 @@ common_interrupt: + ret_from_intr: + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF +- decl PER_CPU_VAR(irq_count) + +- /* Restore saved previous stack */ +- popq %rsp ++ LEAVE_IRQ_STACK + + testb $3, CS(%rsp) + jz retint_kernel +@@ -892,12 +933,10 @@ bad_gs: + ENTRY(do_softirq_own_stack) + pushq %rbp + mov %rsp, %rbp +- incl PER_CPU_VAR(irq_count) +- cmove PER_CPU_VAR(irq_stack_ptr), %rsp +- push %rbp /* frame pointer backlink */ ++ ENTER_IRQ_STACK old_rsp=%r11 + call __do_softirq ++ LEAVE_IRQ_STACK + leaveq +- decl PER_CPU_VAR(irq_count) + ret + END(do_softirq_own_stack) + +@@ -924,13 +963,11 @@ ENTRY(xen_do_hypervisor_callback) /* do_hypervisor_callback(struct *pt_regs) */ + * see the correct pointer to the pt_regs + */ + movq %rdi, %rsp /* we don't return, adjust the stack frame */ +-11: incl PER_CPU_VAR(irq_count) +- movq %rsp, %rbp +- cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp +- pushq %rbp /* frame pointer backlink */ ++ ++ ENTER_IRQ_STACK old_rsp=%r10 + call xen_evtchn_do_upcall +- popq %rsp +- decl PER_CPU_VAR(irq_count) ++ LEAVE_IRQ_STACK ++ + #ifndef CONFIG_PREEMPT + call xen_maybe_preempt_hcall + #endif +-- +2.14.2 + diff --git a/patches/kernel/0022-x86-entry-64-Add-unwind-hint-annotations.patch b/patches/kernel/0022-x86-entry-64-Add-unwind-hint-annotations.patch deleted file mode 100644 index 428fcf3..0000000 --- a/patches/kernel/0022-x86-entry-64-Add-unwind-hint-annotations.patch +++ /dev/null @@ -1,463 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Josh Poimboeuf -Date: Tue, 11 Jul 
2017 10:33:44 -0500 -Subject: [PATCH] x86/entry/64: Add unwind hint annotations -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Add unwind hint annotations to entry_64.S. This will enable the ORC -unwinder to unwind through any location in the entry code including -syscalls, interrupts, and exceptions. - -Signed-off-by: Josh Poimboeuf -Cc: Andy Lutomirski -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Denys Vlasenko -Cc: H. Peter Anvin -Cc: Jiri Slaby -Cc: Linus Torvalds -Cc: Mike Galbraith -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Cc: live-patching@vger.kernel.org -Link: http://lkml.kernel.org/r/b9f6d478aadf68ba57c739dcfac34ec0dc021c4c.1499786555.git.jpoimboe@redhat.com -Signed-off-by: Ingo Molnar -(cherry picked from commit 8c1f75587a18ca032da8f6376d1ed882d7095289) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit a8448e6971c1e71b22c651131d14f8be76e6d399) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/entry/Makefile | 1 - - arch/x86/entry/calling.h | 5 ++++ - arch/x86/entry/entry_64.S | 71 ++++++++++++++++++++++++++++++++++++++++------- - 3 files changed, 66 insertions(+), 11 deletions(-) - -diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile -index 9976fcecd17e..af28a8a24366 100644 ---- a/arch/x86/entry/Makefile -+++ b/arch/x86/entry/Makefile -@@ -2,7 +2,6 @@ - # Makefile for the x86 low level entry code - # - --OBJECT_FILES_NON_STANDARD_entry_$(BITS).o := y - OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y - - CFLAGS_syscall_64.o += $(call cc-option,-Wno-override-init,) -diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h -index 05ed3d393da7..640aafebdc00 100644 ---- a/arch/x86/entry/calling.h -+++ b/arch/x86/entry/calling.h -@@ -1,4 +1,5 @@ - #include -+#include - - /* - -@@ -112,6 +113,7 @@ For 32-bit we have the following conventions - kernel is built with - movq %rdx, 12*8+\offset(%rsp) - movq %rsi, 13*8+\offset(%rsp) 
- movq %rdi, 14*8+\offset(%rsp) -+ UNWIND_HINT_REGS offset=\offset extra=0 - .endm - .macro SAVE_C_REGS offset=0 - SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1 -@@ -136,6 +138,7 @@ For 32-bit we have the following conventions - kernel is built with - movq %r12, 3*8+\offset(%rsp) - movq %rbp, 4*8+\offset(%rsp) - movq %rbx, 5*8+\offset(%rsp) -+ UNWIND_HINT_REGS offset=\offset - .endm - - .macro RESTORE_EXTRA_REGS offset=0 -@@ -145,6 +148,7 @@ For 32-bit we have the following conventions - kernel is built with - movq 3*8+\offset(%rsp), %r12 - movq 4*8+\offset(%rsp), %rbp - movq 5*8+\offset(%rsp), %rbx -+ UNWIND_HINT_REGS offset=\offset extra=0 - .endm - - .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 -@@ -167,6 +171,7 @@ For 32-bit we have the following conventions - kernel is built with - .endif - movq 13*8(%rsp), %rsi - movq 14*8(%rsp), %rdi -+ UNWIND_HINT_IRET_REGS offset=16*8 - .endm - .macro RESTORE_C_REGS - RESTORE_C_REGS_HELPER 1,1,1,1,1 -diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S -index 184b70712545..64b233ab7cad 100644 ---- a/arch/x86/entry/entry_64.S -+++ b/arch/x86/entry/entry_64.S -@@ -36,6 +36,7 @@ - #include - #include - #include -+#include - #include - - .code64 -@@ -43,9 +44,10 @@ - - #ifdef CONFIG_PARAVIRT - ENTRY(native_usergs_sysret64) -+ UNWIND_HINT_EMPTY - swapgs - sysretq --ENDPROC(native_usergs_sysret64) -+END(native_usergs_sysret64) - #endif /* CONFIG_PARAVIRT */ - - .macro TRACE_IRQS_IRETQ -@@ -134,6 +136,7 @@ ENDPROC(native_usergs_sysret64) - */ - - ENTRY(entry_SYSCALL_64) -+ UNWIND_HINT_EMPTY - /* - * Interrupts are off on entry. 
- * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, -@@ -169,6 +172,7 @@ GLOBAL(entry_SYSCALL_64_after_swapgs) - pushq %r10 /* pt_regs->r10 */ - pushq %r11 /* pt_regs->r11 */ - sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ -+ UNWIND_HINT_REGS extra=0 - - /* - * If we need to do entry work or if we guess we'll need to do -@@ -223,6 +227,7 @@ entry_SYSCALL_64_fastpath: - movq EFLAGS(%rsp), %r11 - RESTORE_C_REGS_EXCEPT_RCX_R11 - movq RSP(%rsp), %rsp -+ UNWIND_HINT_EMPTY - USERGS_SYSRET64 - - 1: -@@ -316,6 +321,7 @@ syscall_return_via_sysret: - /* rcx and r11 are already restored (see code above) */ - RESTORE_C_REGS_EXCEPT_RCX_R11 - movq RSP(%rsp), %rsp -+ UNWIND_HINT_EMPTY - USERGS_SYSRET64 - - opportunistic_sysret_failed: -@@ -343,6 +349,7 @@ ENTRY(stub_ptregs_64) - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_OFF - popq %rax -+ UNWIND_HINT_REGS extra=0 - jmp entry_SYSCALL64_slow_path - - 1: -@@ -351,6 +358,7 @@ END(stub_ptregs_64) - - .macro ptregs_stub func - ENTRY(ptregs_\func) -+ UNWIND_HINT_FUNC - leaq \func(%rip), %rax - jmp stub_ptregs_64 - END(ptregs_\func) -@@ -367,6 +375,7 @@ END(ptregs_\func) - * %rsi: next task - */ - ENTRY(__switch_to_asm) -+ UNWIND_HINT_FUNC - /* - * Save callee-saved registers - * This must match the order in inactive_task_frame -@@ -406,6 +415,7 @@ END(__switch_to_asm) - * r12: kernel thread arg - */ - ENTRY(ret_from_fork) -+ UNWIND_HINT_EMPTY - movq %rax, %rdi - call schedule_tail /* rdi: 'prev' task parameter */ - -@@ -413,6 +423,7 @@ ENTRY(ret_from_fork) - jnz 1f /* kernel threads are uncommon */ - - 2: -+ UNWIND_HINT_REGS - movq %rsp, %rdi - call syscall_return_slowpath /* returns with IRQs disabled */ - TRACE_IRQS_ON /* user mode is traced as IRQS on */ -@@ -440,10 +451,11 @@ END(ret_from_fork) - ENTRY(irq_entries_start) - vector=FIRST_EXTERNAL_VECTOR - .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) -+ UNWIND_HINT_IRET_REGS - pushq $(~vector+0x80) /* Note: always in signed byte range */ -- 
vector=vector+1 - jmp common_interrupt - .align 8 -+ vector=vector+1 - .endr - END(irq_entries_start) - -@@ -465,9 +477,14 @@ END(irq_entries_start) - * - * The invariant is that, if irq_count != -1, then the IRQ stack is in use. - */ --.macro ENTER_IRQ_STACK old_rsp -+.macro ENTER_IRQ_STACK regs=1 old_rsp - DEBUG_ENTRY_ASSERT_IRQS_OFF - movq %rsp, \old_rsp -+ -+ .if \regs -+ UNWIND_HINT_REGS base=\old_rsp -+ .endif -+ - incl PER_CPU_VAR(irq_count) - jnz .Lirq_stack_push_old_rsp_\@ - -@@ -504,16 +521,24 @@ END(irq_entries_start) - - .Lirq_stack_push_old_rsp_\@: - pushq \old_rsp -+ -+ .if \regs -+ UNWIND_HINT_REGS indirect=1 -+ .endif - .endm - - /* - * Undoes ENTER_IRQ_STACK. - */ --.macro LEAVE_IRQ_STACK -+.macro LEAVE_IRQ_STACK regs=1 - DEBUG_ENTRY_ASSERT_IRQS_OFF - /* We need to be off the IRQ stack before decrementing irq_count. */ - popq %rsp - -+ .if \regs -+ UNWIND_HINT_REGS -+ .endif -+ - /* - * As in ENTER_IRQ_STACK, irq_count == 0, we are still claiming - * the irq stack but we're not on it. -@@ -624,6 +649,7 @@ restore_c_regs_and_iret: - INTERRUPT_RETURN - - ENTRY(native_iret) -+ UNWIND_HINT_IRET_REGS - /* - * Are we returning to a stack segment from the LDT? Note: in - * 64-bit mode SS:RSP on the exception stack is always valid. 
-@@ -696,6 +722,7 @@ native_irq_return_ldt: - orq PER_CPU_VAR(espfix_stack), %rax - SWAPGS - movq %rax, %rsp -+ UNWIND_HINT_IRET_REGS offset=8 - - /* - * At this point, we cannot write to the stack any more, but we can -@@ -717,6 +744,7 @@ END(common_interrupt) - */ - .macro apicinterrupt3 num sym do_sym - ENTRY(\sym) -+ UNWIND_HINT_IRET_REGS - ASM_CLAC - pushq $~(\num) - .Lcommon_\sym: -@@ -803,6 +831,8 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt - - .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 - ENTRY(\sym) -+ UNWIND_HINT_IRET_REGS offset=8 -+ - /* Sanity check */ - .if \shift_ist != -1 && \paranoid == 0 - .error "using shift_ist requires paranoid=1" -@@ -826,6 +856,7 @@ ENTRY(\sym) - .else - call error_entry - .endif -+ UNWIND_HINT_REGS - /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ - - .if \paranoid -@@ -923,6 +954,7 @@ idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 - * edi: new selector - */ - ENTRY(native_load_gs_index) -+ FRAME_BEGIN - pushfq - DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) - SWAPGS -@@ -931,8 +963,9 @@ ENTRY(native_load_gs_index) - 2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE - SWAPGS - popfq -+ FRAME_END - ret --END(native_load_gs_index) -+ENDPROC(native_load_gs_index) - EXPORT_SYMBOL(native_load_gs_index) - - _ASM_EXTABLE(.Lgs_change, bad_gs) -@@ -955,12 +988,12 @@ bad_gs: - ENTRY(do_softirq_own_stack) - pushq %rbp - mov %rsp, %rbp -- ENTER_IRQ_STACK old_rsp=%r11 -+ ENTER_IRQ_STACK regs=0 old_rsp=%r11 - call __do_softirq -- LEAVE_IRQ_STACK -+ LEAVE_IRQ_STACK regs=0 - leaveq - ret --END(do_softirq_own_stack) -+ENDPROC(do_softirq_own_stack) - - #ifdef CONFIG_XEN - idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0 -@@ -984,7 +1017,9 @@ ENTRY(xen_do_hypervisor_callback) /* do_hypervisor_callback(struct *pt_regs) */ - * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will - * see the correct 
pointer to the pt_regs - */ -+ UNWIND_HINT_FUNC - movq %rdi, %rsp /* we don't return, adjust the stack frame */ -+ UNWIND_HINT_REGS - - ENTER_IRQ_STACK old_rsp=%r10 - call xen_evtchn_do_upcall -@@ -1010,6 +1045,7 @@ END(xen_do_hypervisor_callback) - * with its current contents: any discrepancy means we in category 1. - */ - ENTRY(xen_failsafe_callback) -+ UNWIND_HINT_EMPTY - movl %ds, %ecx - cmpw %cx, 0x10(%rsp) - jne 1f -@@ -1029,11 +1065,13 @@ ENTRY(xen_failsafe_callback) - pushq $0 /* RIP */ - pushq %r11 - pushq %rcx -+ UNWIND_HINT_IRET_REGS offset=8 - jmp general_protection - 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ - movq (%rsp), %rcx - movq 8(%rsp), %r11 - addq $0x30, %rsp -+ UNWIND_HINT_IRET_REGS - pushq $-1 /* orig_ax = -1 => not a system call */ - ALLOC_PT_GPREGS_ON_STACK - SAVE_C_REGS -@@ -1079,6 +1117,7 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vec - * Return: ebx=0: need swapgs on exit, ebx=1: otherwise - */ - ENTRY(paranoid_entry) -+ UNWIND_HINT_FUNC - cld - SAVE_C_REGS 8 - SAVE_EXTRA_REGS 8 -@@ -1106,6 +1145,7 @@ END(paranoid_entry) - * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) - */ - ENTRY(paranoid_exit) -+ UNWIND_HINT_REGS - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_OFF_DEBUG - testl %ebx, %ebx /* swapgs needed? */ -@@ -1127,6 +1167,7 @@ END(paranoid_exit) - * Return: EBX=0: came from user mode; EBX=1: otherwise - */ - ENTRY(error_entry) -+ UNWIND_HINT_FUNC - cld - SAVE_C_REGS 8 - SAVE_EXTRA_REGS 8 -@@ -1211,6 +1252,7 @@ END(error_entry) - * 0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode - */ - ENTRY(error_exit) -+ UNWIND_HINT_REGS - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_OFF - testl %ebx, %ebx -@@ -1220,6 +1262,7 @@ END(error_exit) - - /* Runs on exception stack */ - ENTRY(nmi) -+ UNWIND_HINT_IRET_REGS - /* - * Fix up the exception frame if we're on Xen. 
- * PARAVIRT_ADJUST_EXCEPTION_FRAME is guaranteed to push at most -@@ -1293,11 +1336,13 @@ ENTRY(nmi) - cld - movq %rsp, %rdx - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp -+ UNWIND_HINT_IRET_REGS base=%rdx offset=8 - pushq 5*8(%rdx) /* pt_regs->ss */ - pushq 4*8(%rdx) /* pt_regs->rsp */ - pushq 3*8(%rdx) /* pt_regs->flags */ - pushq 2*8(%rdx) /* pt_regs->cs */ - pushq 1*8(%rdx) /* pt_regs->rip */ -+ UNWIND_HINT_IRET_REGS - pushq $-1 /* pt_regs->orig_ax */ - pushq %rdi /* pt_regs->di */ - pushq %rsi /* pt_regs->si */ -@@ -1314,6 +1359,7 @@ ENTRY(nmi) - pushq %r13 /* pt_regs->r13 */ - pushq %r14 /* pt_regs->r14 */ - pushq %r15 /* pt_regs->r15 */ -+ UNWIND_HINT_REGS - ENCODE_FRAME_POINTER - - /* -@@ -1468,6 +1514,7 @@ first_nmi: - .rept 5 - pushq 11*8(%rsp) - .endr -+ UNWIND_HINT_IRET_REGS - - /* Everything up to here is safe from nested NMIs */ - -@@ -1483,6 +1530,7 @@ first_nmi: - pushq $__KERNEL_CS /* CS */ - pushq $1f /* RIP */ - INTERRUPT_RETURN /* continues at repeat_nmi below */ -+ UNWIND_HINT_IRET_REGS - 1: - #endif - -@@ -1532,6 +1580,7 @@ end_repeat_nmi: - * exceptions might do. - */ - call paranoid_entry -+ UNWIND_HINT_REGS - - /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ - movq %rsp, %rdi -@@ -1569,17 +1618,19 @@ nmi_restore: - END(nmi) - - ENTRY(ignore_sysret) -+ UNWIND_HINT_EMPTY - mov $-ENOSYS, %eax - sysret - END(ignore_sysret) - - ENTRY(rewind_stack_do_exit) -+ UNWIND_HINT_FUNC - /* Prevent any naive code from trying to unwind to our caller. 
*/ - xorl %ebp, %ebp - - movq PER_CPU_VAR(cpu_current_top_of_stack), %rax -- leaq -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%rax), %rsp -+ leaq -PTREGS_SIZE(%rax), %rsp -+ UNWIND_HINT_FUNC sp_offset=PTREGS_SIZE - - call do_exit --1: jmp 1b - END(rewind_stack_do_exit) --- -2.14.2 - diff --git a/patches/kernel/0022-x86-entry-64-Initialize-the-top-of-the-IRQ-stack-bef.patch b/patches/kernel/0022-x86-entry-64-Initialize-the-top-of-the-IRQ-stack-bef.patch new file mode 100644 index 0000000..f50fc39 --- /dev/null +++ b/patches/kernel/0022-x86-entry-64-Initialize-the-top-of-the-IRQ-stack-bef.patch @@ -0,0 +1,94 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Tue, 11 Jul 2017 10:33:39 -0500 +Subject: [PATCH] x86/entry/64: Initialize the top of the IRQ stack before + switching stacks +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +The OOPS unwinder wants the word at the top of the IRQ stack to +point back to the previous stack at all times when the IRQ stack +is in use. There's currently a one-instruction window in ENTER_IRQ_STACK +during which this isn't the case. Fix it by writing the old RSP to the +top of the IRQ stack before jumping. + +This currently writes the pointer to the stack twice, which is a bit +ugly. We could get rid of this by replacing irq_stack_ptr with +irq_stack_ptr_minus_eight (better name welcome). OTOH, there may be +all kinds of odd microarchitectural considerations in play that +affect performance by a few cycles here. + +Reported-by: Mike Galbraith +Reported-by: Josh Poimboeuf +Signed-off-by: Andy Lutomirski +Signed-off-by: Josh Poimboeuf +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Denys Vlasenko +Cc: H. 
Peter Anvin +Cc: Jiri Slaby +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: live-patching@vger.kernel.org +Link: http://lkml.kernel.org/r/aae7e79e49914808440ad5310ace138ced2179ca.1499786555.git.jpoimboe@redhat.com +Signed-off-by: Ingo Molnar +(cherry picked from commit 2995590964da93e1fd9a91550f9c9d9fab28f160) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit a753ff654dfd07a7f8d6f39a27126589eac7e55f) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/entry/entry_64.S | 24 +++++++++++++++++++++++- + 1 file changed, 23 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index 07b4056af8a8..184b70712545 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -469,6 +469,7 @@ END(irq_entries_start) + DEBUG_ENTRY_ASSERT_IRQS_OFF + movq %rsp, \old_rsp + incl PER_CPU_VAR(irq_count) ++ jnz .Lirq_stack_push_old_rsp_\@ + + /* + * Right now, if we just incremented irq_count to zero, we've +@@ -478,9 +479,30 @@ END(irq_entries_start) + * it must be *extremely* careful to limit its stack usage. This + * could include kprobes and a hypothetical future IST-less #DB + * handler. ++ * ++ * The OOPS unwinder relies on the word at the top of the IRQ ++ * stack linking back to the previous RSP for the entire time we're ++ * on the IRQ stack. For this to work reliably, we need to write ++ * it before we actually move ourselves to the IRQ stack. ++ */ ++ ++ movq \old_rsp, PER_CPU_VAR(irq_stack_union + IRQ_STACK_SIZE - 8) ++ movq PER_CPU_VAR(irq_stack_ptr), %rsp ++ ++#ifdef CONFIG_DEBUG_ENTRY ++ /* ++ * If the first movq above becomes wrong due to IRQ stack layout ++ * changes, the only way we'll notice is if we try to unwind right ++ * here. Assert that we set up the stack right to catch this type ++ * of bug quickly. 
+ */ ++ cmpq -8(%rsp), \old_rsp ++ je .Lirq_stack_okay\@ ++ ud2 ++ .Lirq_stack_okay\@: ++#endif + +- cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp ++.Lirq_stack_push_old_rsp_\@: + pushq \old_rsp + .endm + +-- +2.14.2 + diff --git a/patches/kernel/0023-x86-entry-64-Add-unwind-hint-annotations.patch b/patches/kernel/0023-x86-entry-64-Add-unwind-hint-annotations.patch new file mode 100644 index 0000000..428fcf3 --- /dev/null +++ b/patches/kernel/0023-x86-entry-64-Add-unwind-hint-annotations.patch @@ -0,0 +1,463 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf +Date: Tue, 11 Jul 2017 10:33:44 -0500 +Subject: [PATCH] x86/entry/64: Add unwind hint annotations +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Add unwind hint annotations to entry_64.S. This will enable the ORC +unwinder to unwind through any location in the entry code including +syscalls, interrupts, and exceptions. + +Signed-off-by: Josh Poimboeuf +Cc: Andy Lutomirski +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Denys Vlasenko +Cc: H. 
Peter Anvin +Cc: Jiri Slaby +Cc: Linus Torvalds +Cc: Mike Galbraith +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: live-patching@vger.kernel.org +Link: http://lkml.kernel.org/r/b9f6d478aadf68ba57c739dcfac34ec0dc021c4c.1499786555.git.jpoimboe@redhat.com +Signed-off-by: Ingo Molnar +(cherry picked from commit 8c1f75587a18ca032da8f6376d1ed882d7095289) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit a8448e6971c1e71b22c651131d14f8be76e6d399) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/entry/Makefile | 1 - + arch/x86/entry/calling.h | 5 ++++ + arch/x86/entry/entry_64.S | 71 ++++++++++++++++++++++++++++++++++++++++------- + 3 files changed, 66 insertions(+), 11 deletions(-) + +diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile +index 9976fcecd17e..af28a8a24366 100644 +--- a/arch/x86/entry/Makefile ++++ b/arch/x86/entry/Makefile +@@ -2,7 +2,6 @@ + # Makefile for the x86 low level entry code + # + +-OBJECT_FILES_NON_STANDARD_entry_$(BITS).o := y + OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y + + CFLAGS_syscall_64.o += $(call cc-option,-Wno-override-init,) +diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h +index 05ed3d393da7..640aafebdc00 100644 +--- a/arch/x86/entry/calling.h ++++ b/arch/x86/entry/calling.h +@@ -1,4 +1,5 @@ + #include ++#include + + /* + +@@ -112,6 +113,7 @@ For 32-bit we have the following conventions - kernel is built with + movq %rdx, 12*8+\offset(%rsp) + movq %rsi, 13*8+\offset(%rsp) + movq %rdi, 14*8+\offset(%rsp) ++ UNWIND_HINT_REGS offset=\offset extra=0 + .endm + .macro SAVE_C_REGS offset=0 + SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1 +@@ -136,6 +138,7 @@ For 32-bit we have the following conventions - kernel is built with + movq %r12, 3*8+\offset(%rsp) + movq %rbp, 4*8+\offset(%rsp) + movq %rbx, 5*8+\offset(%rsp) ++ UNWIND_HINT_REGS offset=\offset + .endm + + .macro RESTORE_EXTRA_REGS offset=0 +@@ -145,6 +148,7 @@ For 32-bit we have the following conventions - 
kernel is built with + movq 3*8+\offset(%rsp), %r12 + movq 4*8+\offset(%rsp), %rbp + movq 5*8+\offset(%rsp), %rbx ++ UNWIND_HINT_REGS offset=\offset extra=0 + .endm + + .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 +@@ -167,6 +171,7 @@ For 32-bit we have the following conventions - kernel is built with + .endif + movq 13*8(%rsp), %rsi + movq 14*8(%rsp), %rdi ++ UNWIND_HINT_IRET_REGS offset=16*8 + .endm + .macro RESTORE_C_REGS + RESTORE_C_REGS_HELPER 1,1,1,1,1 +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index 184b70712545..64b233ab7cad 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -36,6 +36,7 @@ + #include + #include + #include ++#include + #include + + .code64 +@@ -43,9 +44,10 @@ + + #ifdef CONFIG_PARAVIRT + ENTRY(native_usergs_sysret64) ++ UNWIND_HINT_EMPTY + swapgs + sysretq +-ENDPROC(native_usergs_sysret64) ++END(native_usergs_sysret64) + #endif /* CONFIG_PARAVIRT */ + + .macro TRACE_IRQS_IRETQ +@@ -134,6 +136,7 @@ ENDPROC(native_usergs_sysret64) + */ + + ENTRY(entry_SYSCALL_64) ++ UNWIND_HINT_EMPTY + /* + * Interrupts are off on entry. 
+ * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, +@@ -169,6 +172,7 @@ GLOBAL(entry_SYSCALL_64_after_swapgs) + pushq %r10 /* pt_regs->r10 */ + pushq %r11 /* pt_regs->r11 */ + sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ ++ UNWIND_HINT_REGS extra=0 + + /* + * If we need to do entry work or if we guess we'll need to do +@@ -223,6 +227,7 @@ entry_SYSCALL_64_fastpath: + movq EFLAGS(%rsp), %r11 + RESTORE_C_REGS_EXCEPT_RCX_R11 + movq RSP(%rsp), %rsp ++ UNWIND_HINT_EMPTY + USERGS_SYSRET64 + + 1: +@@ -316,6 +321,7 @@ syscall_return_via_sysret: + /* rcx and r11 are already restored (see code above) */ + RESTORE_C_REGS_EXCEPT_RCX_R11 + movq RSP(%rsp), %rsp ++ UNWIND_HINT_EMPTY + USERGS_SYSRET64 + + opportunistic_sysret_failed: +@@ -343,6 +349,7 @@ ENTRY(stub_ptregs_64) + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + popq %rax ++ UNWIND_HINT_REGS extra=0 + jmp entry_SYSCALL64_slow_path + + 1: +@@ -351,6 +358,7 @@ END(stub_ptregs_64) + + .macro ptregs_stub func + ENTRY(ptregs_\func) ++ UNWIND_HINT_FUNC + leaq \func(%rip), %rax + jmp stub_ptregs_64 + END(ptregs_\func) +@@ -367,6 +375,7 @@ END(ptregs_\func) + * %rsi: next task + */ + ENTRY(__switch_to_asm) ++ UNWIND_HINT_FUNC + /* + * Save callee-saved registers + * This must match the order in inactive_task_frame +@@ -406,6 +415,7 @@ END(__switch_to_asm) + * r12: kernel thread arg + */ + ENTRY(ret_from_fork) ++ UNWIND_HINT_EMPTY + movq %rax, %rdi + call schedule_tail /* rdi: 'prev' task parameter */ + +@@ -413,6 +423,7 @@ ENTRY(ret_from_fork) + jnz 1f /* kernel threads are uncommon */ + + 2: ++ UNWIND_HINT_REGS + movq %rsp, %rdi + call syscall_return_slowpath /* returns with IRQs disabled */ + TRACE_IRQS_ON /* user mode is traced as IRQS on */ +@@ -440,10 +451,11 @@ END(ret_from_fork) + ENTRY(irq_entries_start) + vector=FIRST_EXTERNAL_VECTOR + .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) ++ UNWIND_HINT_IRET_REGS + pushq $(~vector+0x80) /* Note: always in signed byte range */ +- 
vector=vector+1 + jmp common_interrupt + .align 8 ++ vector=vector+1 + .endr + END(irq_entries_start) + +@@ -465,9 +477,14 @@ END(irq_entries_start) + * + * The invariant is that, if irq_count != -1, then the IRQ stack is in use. + */ +-.macro ENTER_IRQ_STACK old_rsp ++.macro ENTER_IRQ_STACK regs=1 old_rsp + DEBUG_ENTRY_ASSERT_IRQS_OFF + movq %rsp, \old_rsp ++ ++ .if \regs ++ UNWIND_HINT_REGS base=\old_rsp ++ .endif ++ + incl PER_CPU_VAR(irq_count) + jnz .Lirq_stack_push_old_rsp_\@ + +@@ -504,16 +521,24 @@ END(irq_entries_start) + + .Lirq_stack_push_old_rsp_\@: + pushq \old_rsp ++ ++ .if \regs ++ UNWIND_HINT_REGS indirect=1 ++ .endif + .endm + + /* + * Undoes ENTER_IRQ_STACK. + */ +-.macro LEAVE_IRQ_STACK ++.macro LEAVE_IRQ_STACK regs=1 + DEBUG_ENTRY_ASSERT_IRQS_OFF + /* We need to be off the IRQ stack before decrementing irq_count. */ + popq %rsp + ++ .if \regs ++ UNWIND_HINT_REGS ++ .endif ++ + /* + * As in ENTER_IRQ_STACK, irq_count == 0, we are still claiming + * the irq stack but we're not on it. +@@ -624,6 +649,7 @@ restore_c_regs_and_iret: + INTERRUPT_RETURN + + ENTRY(native_iret) ++ UNWIND_HINT_IRET_REGS + /* + * Are we returning to a stack segment from the LDT? Note: in + * 64-bit mode SS:RSP on the exception stack is always valid. 
+@@ -696,6 +722,7 @@ native_irq_return_ldt: + orq PER_CPU_VAR(espfix_stack), %rax + SWAPGS + movq %rax, %rsp ++ UNWIND_HINT_IRET_REGS offset=8 + + /* + * At this point, we cannot write to the stack any more, but we can +@@ -717,6 +744,7 @@ END(common_interrupt) + */ + .macro apicinterrupt3 num sym do_sym + ENTRY(\sym) ++ UNWIND_HINT_IRET_REGS + ASM_CLAC + pushq $~(\num) + .Lcommon_\sym: +@@ -803,6 +831,8 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt + + .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 + ENTRY(\sym) ++ UNWIND_HINT_IRET_REGS offset=8 ++ + /* Sanity check */ + .if \shift_ist != -1 && \paranoid == 0 + .error "using shift_ist requires paranoid=1" +@@ -826,6 +856,7 @@ ENTRY(\sym) + .else + call error_entry + .endif ++ UNWIND_HINT_REGS + /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ + + .if \paranoid +@@ -923,6 +954,7 @@ idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 + * edi: new selector + */ + ENTRY(native_load_gs_index) ++ FRAME_BEGIN + pushfq + DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) + SWAPGS +@@ -931,8 +963,9 @@ ENTRY(native_load_gs_index) + 2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE + SWAPGS + popfq ++ FRAME_END + ret +-END(native_load_gs_index) ++ENDPROC(native_load_gs_index) + EXPORT_SYMBOL(native_load_gs_index) + + _ASM_EXTABLE(.Lgs_change, bad_gs) +@@ -955,12 +988,12 @@ bad_gs: + ENTRY(do_softirq_own_stack) + pushq %rbp + mov %rsp, %rbp +- ENTER_IRQ_STACK old_rsp=%r11 ++ ENTER_IRQ_STACK regs=0 old_rsp=%r11 + call __do_softirq +- LEAVE_IRQ_STACK ++ LEAVE_IRQ_STACK regs=0 + leaveq + ret +-END(do_softirq_own_stack) ++ENDPROC(do_softirq_own_stack) + + #ifdef CONFIG_XEN + idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0 +@@ -984,7 +1017,9 @@ ENTRY(xen_do_hypervisor_callback) /* do_hypervisor_callback(struct *pt_regs) */ + * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will + * see the correct 
pointer to the pt_regs + */ ++ UNWIND_HINT_FUNC + movq %rdi, %rsp /* we don't return, adjust the stack frame */ ++ UNWIND_HINT_REGS + + ENTER_IRQ_STACK old_rsp=%r10 + call xen_evtchn_do_upcall +@@ -1010,6 +1045,7 @@ END(xen_do_hypervisor_callback) + * with its current contents: any discrepancy means we in category 1. + */ + ENTRY(xen_failsafe_callback) ++ UNWIND_HINT_EMPTY + movl %ds, %ecx + cmpw %cx, 0x10(%rsp) + jne 1f +@@ -1029,11 +1065,13 @@ ENTRY(xen_failsafe_callback) + pushq $0 /* RIP */ + pushq %r11 + pushq %rcx ++ UNWIND_HINT_IRET_REGS offset=8 + jmp general_protection + 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ + movq (%rsp), %rcx + movq 8(%rsp), %r11 + addq $0x30, %rsp ++ UNWIND_HINT_IRET_REGS + pushq $-1 /* orig_ax = -1 => not a system call */ + ALLOC_PT_GPREGS_ON_STACK + SAVE_C_REGS +@@ -1079,6 +1117,7 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vec + * Return: ebx=0: need swapgs on exit, ebx=1: otherwise + */ + ENTRY(paranoid_entry) ++ UNWIND_HINT_FUNC + cld + SAVE_C_REGS 8 + SAVE_EXTRA_REGS 8 +@@ -1106,6 +1145,7 @@ END(paranoid_entry) + * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) + */ + ENTRY(paranoid_exit) ++ UNWIND_HINT_REGS + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF_DEBUG + testl %ebx, %ebx /* swapgs needed? */ +@@ -1127,6 +1167,7 @@ END(paranoid_exit) + * Return: EBX=0: came from user mode; EBX=1: otherwise + */ + ENTRY(error_entry) ++ UNWIND_HINT_FUNC + cld + SAVE_C_REGS 8 + SAVE_EXTRA_REGS 8 +@@ -1211,6 +1252,7 @@ END(error_entry) + * 0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode + */ + ENTRY(error_exit) ++ UNWIND_HINT_REGS + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + testl %ebx, %ebx +@@ -1220,6 +1262,7 @@ END(error_exit) + + /* Runs on exception stack */ + ENTRY(nmi) ++ UNWIND_HINT_IRET_REGS + /* + * Fix up the exception frame if we're on Xen. 
+ * PARAVIRT_ADJUST_EXCEPTION_FRAME is guaranteed to push at most +@@ -1293,11 +1336,13 @@ ENTRY(nmi) + cld + movq %rsp, %rdx + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp ++ UNWIND_HINT_IRET_REGS base=%rdx offset=8 + pushq 5*8(%rdx) /* pt_regs->ss */ + pushq 4*8(%rdx) /* pt_regs->rsp */ + pushq 3*8(%rdx) /* pt_regs->flags */ + pushq 2*8(%rdx) /* pt_regs->cs */ + pushq 1*8(%rdx) /* pt_regs->rip */ ++ UNWIND_HINT_IRET_REGS + pushq $-1 /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ +@@ -1314,6 +1359,7 @@ ENTRY(nmi) + pushq %r13 /* pt_regs->r13 */ + pushq %r14 /* pt_regs->r14 */ + pushq %r15 /* pt_regs->r15 */ ++ UNWIND_HINT_REGS + ENCODE_FRAME_POINTER + + /* +@@ -1468,6 +1514,7 @@ first_nmi: + .rept 5 + pushq 11*8(%rsp) + .endr ++ UNWIND_HINT_IRET_REGS + + /* Everything up to here is safe from nested NMIs */ + +@@ -1483,6 +1530,7 @@ first_nmi: + pushq $__KERNEL_CS /* CS */ + pushq $1f /* RIP */ + INTERRUPT_RETURN /* continues at repeat_nmi below */ ++ UNWIND_HINT_IRET_REGS + 1: + #endif + +@@ -1532,6 +1580,7 @@ end_repeat_nmi: + * exceptions might do. + */ + call paranoid_entry ++ UNWIND_HINT_REGS + + /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ + movq %rsp, %rdi +@@ -1569,17 +1618,19 @@ nmi_restore: + END(nmi) + + ENTRY(ignore_sysret) ++ UNWIND_HINT_EMPTY + mov $-ENOSYS, %eax + sysret + END(ignore_sysret) + + ENTRY(rewind_stack_do_exit) ++ UNWIND_HINT_FUNC + /* Prevent any naive code from trying to unwind to our caller. 
*/ + xorl %ebp, %ebp + + movq PER_CPU_VAR(cpu_current_top_of_stack), %rax +- leaq -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%rax), %rsp ++ leaq -PTREGS_SIZE(%rax), %rsp ++ UNWIND_HINT_FUNC sp_offset=PTREGS_SIZE + + call do_exit +-1: jmp 1b + END(rewind_stack_do_exit) +-- +2.14.2 + diff --git a/patches/kernel/0023-xen-x86-Remove-SME-feature-in-PV-guests.patch b/patches/kernel/0023-xen-x86-Remove-SME-feature-in-PV-guests.patch deleted file mode 100644 index bba2e33..0000000 --- a/patches/kernel/0023-xen-x86-Remove-SME-feature-in-PV-guests.patch +++ /dev/null @@ -1,70 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Tom Lendacky -Date: Mon, 17 Jul 2017 16:10:29 -0500 -Subject: [PATCH] xen/x86: Remove SME feature in PV guests -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Xen does not currently support SME for PV guests. Clear the SME CPU -capability in order to avoid any ambiguity. - -Signed-off-by: Tom Lendacky -Reviewed-by: Thomas Gleixner -Reviewed-by: Borislav Petkov -Reviewed-by: Juergen Gross -Cc: -Cc: Alexander Potapenko -Cc: Andrey Ryabinin -Cc: Andy Lutomirski -Cc: Arnd Bergmann -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brijesh Singh -Cc: Dave Young -Cc: Dmitry Vyukov -Cc: Jonathan Corbet -Cc: Konrad Rzeszutek Wilk -Cc: Larry Woodman -Cc: Linus Torvalds -Cc: Matt Fleming -Cc: Michael S. 
Tsirkin -Cc: Paolo Bonzini -Cc: Peter Zijlstra -Cc: Radim Krčmář -Cc: Rik van Riel -Cc: Toshimitsu Kani -Cc: kasan-dev@googlegroups.com -Cc: kvm@vger.kernel.org -Cc: linux-arch@vger.kernel.org -Cc: linux-doc@vger.kernel.org -Cc: linux-efi@vger.kernel.org -Cc: linux-mm@kvack.org -Link: http://lkml.kernel.org/r/3b605622a9fae5e588e5a13967120a18ec18071b.1500319216.git.thomas.lendacky@amd.com -Signed-off-by: Ingo Molnar -(cherry picked from commit f2f931c6819467af5260a21c59fb787ce2863f92) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 8370907399392a637a2e51b4db3368fb594db3a6) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/xen/enlighten_pv.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c -index 290bc5ac9852..df1921751aa5 100644 ---- a/arch/x86/xen/enlighten_pv.c -+++ b/arch/x86/xen/enlighten_pv.c -@@ -263,6 +263,7 @@ static void __init xen_init_capabilities(void) - setup_clear_cpu_cap(X86_FEATURE_MTRR); - setup_clear_cpu_cap(X86_FEATURE_ACC); - setup_clear_cpu_cap(X86_FEATURE_X2APIC); -+ setup_clear_cpu_cap(X86_FEATURE_SME); - - /* - * Xen PV would need some work to support PCID: CR3 handling as well --- -2.14.2 - diff --git a/patches/kernel/0024-x86-xen-64-Rearrange-the-SYSCALL-entries.patch b/patches/kernel/0024-x86-xen-64-Rearrange-the-SYSCALL-entries.patch deleted file mode 100644 index c6898df..0000000 --- a/patches/kernel/0024-x86-xen-64-Rearrange-the-SYSCALL-entries.patch +++ /dev/null @@ -1,152 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Mon, 7 Aug 2017 20:59:21 -0700 -Subject: [PATCH] x86/xen/64: Rearrange the SYSCALL entries -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Xen's raw SYSCALL entries are much less weird than native. 
Rather -than fudging them to look like native entries, use the Xen-provided -stack frame directly. - -This lets us eliminate entry_SYSCALL_64_after_swapgs and two uses of -the SWAPGS_UNSAFE_STACK paravirt hook. The SYSENTER code would -benefit from similar treatment. - -This makes one change to the native code path: the compat -instruction that clears the high 32 bits of %rax is moved slightly -later. I'd be surprised if this affects performance at all. - -Tested-by: Juergen Gross -Signed-off-by: Andy Lutomirski -Reviewed-by: Juergen Gross -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Cc: xen-devel@lists.xenproject.org -Link: http://lkml.kernel.org/r/7c88ed36805d36841ab03ec3b48b4122c4418d71.1502164668.git.luto@kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit 8a9949bc71a71b3dd633255ebe8f8869b1f73474) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit b8cec41ee5f30df5032cfe8c86103f7d92a89590) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/entry/entry_64.S | 9 ++------- - arch/x86/entry/entry_64_compat.S | 7 +++---- - arch/x86/xen/xen-asm_64.S | 23 +++++++++-------------- - 3 files changed, 14 insertions(+), 25 deletions(-) - -diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S -index 64b233ab7cad..4dbb336a1fdd 100644 ---- a/arch/x86/entry/entry_64.S -+++ b/arch/x86/entry/entry_64.S -@@ -142,14 +142,8 @@ ENTRY(entry_SYSCALL_64) - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ -- SWAPGS_UNSAFE_STACK -- /* -- * A hypervisor implementation might want to use a label -- * after the swapgs, so that it can do the swapgs -- * for the guest and jump here on syscall. 
-- */ --GLOBAL(entry_SYSCALL_64_after_swapgs) - -+ swapgs - movq %rsp, PER_CPU_VAR(rsp_scratch) - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - -@@ -161,6 +155,7 @@ GLOBAL(entry_SYSCALL_64_after_swapgs) - pushq %r11 /* pt_regs->flags */ - pushq $__USER_CS /* pt_regs->cs */ - pushq %rcx /* pt_regs->ip */ -+GLOBAL(entry_SYSCALL_64_after_hwframe) - pushq %rax /* pt_regs->orig_ax */ - pushq %rdi /* pt_regs->di */ - pushq %rsi /* pt_regs->si */ -diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S -index e1721dafbcb1..5314d7b8e5ad 100644 ---- a/arch/x86/entry/entry_64_compat.S -+++ b/arch/x86/entry/entry_64_compat.S -@@ -183,21 +183,20 @@ ENDPROC(entry_SYSENTER_compat) - */ - ENTRY(entry_SYSCALL_compat) - /* Interrupts are off on entry. */ -- SWAPGS_UNSAFE_STACK -+ swapgs - - /* Stash user ESP and switch to the kernel stack. */ - movl %esp, %r8d - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - -- /* Zero-extending 32-bit regs, do not remove */ -- movl %eax, %eax -- - /* Construct struct pt_regs on stack */ - pushq $__USER32_DS /* pt_regs->ss */ - pushq %r8 /* pt_regs->sp */ - pushq %r11 /* pt_regs->flags */ - pushq $__USER32_CS /* pt_regs->cs */ - pushq %rcx /* pt_regs->ip */ -+GLOBAL(entry_SYSCALL_compat_after_hwframe) -+ movl %eax, %eax /* discard orig_ax high bits */ - pushq %rax /* pt_regs->orig_ax */ - pushq %rdi /* pt_regs->di */ - pushq %rsi /* pt_regs->si */ -diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S -index c3df43141e70..a8a4f4c460a6 100644 ---- a/arch/x86/xen/xen-asm_64.S -+++ b/arch/x86/xen/xen-asm_64.S -@@ -82,34 +82,29 @@ RELOC(xen_sysret64, 1b+1) - * rip - * r11 - * rsp->rcx -- * -- * In all the entrypoints, we undo all that to make it look like a -- * CPU-generated syscall/sysenter and jump to the normal entrypoint. 
- */ - --.macro undo_xen_syscall -- mov 0*8(%rsp), %rcx -- mov 1*8(%rsp), %r11 -- mov 5*8(%rsp), %rsp --.endm -- - /* Normal 64-bit system call target */ - ENTRY(xen_syscall_target) -- undo_xen_syscall -- jmp entry_SYSCALL_64_after_swapgs -+ popq %rcx -+ popq %r11 -+ jmp entry_SYSCALL_64_after_hwframe - ENDPROC(xen_syscall_target) - - #ifdef CONFIG_IA32_EMULATION - - /* 32-bit compat syscall target */ - ENTRY(xen_syscall32_target) -- undo_xen_syscall -- jmp entry_SYSCALL_compat -+ popq %rcx -+ popq %r11 -+ jmp entry_SYSCALL_compat_after_hwframe - ENDPROC(xen_syscall32_target) - - /* 32-bit compat sysenter target */ - ENTRY(xen_sysenter_target) -- undo_xen_syscall -+ mov 0*8(%rsp), %rcx -+ mov 1*8(%rsp), %r11 -+ mov 5*8(%rsp), %rsp - jmp entry_SYSENTER_compat - ENDPROC(xen_sysenter_target) - --- -2.14.2 - diff --git a/patches/kernel/0024-xen-x86-Remove-SME-feature-in-PV-guests.patch b/patches/kernel/0024-xen-x86-Remove-SME-feature-in-PV-guests.patch new file mode 100644 index 0000000..bba2e33 --- /dev/null +++ b/patches/kernel/0024-xen-x86-Remove-SME-feature-in-PV-guests.patch @@ -0,0 +1,70 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Tom Lendacky +Date: Mon, 17 Jul 2017 16:10:29 -0500 +Subject: [PATCH] xen/x86: Remove SME feature in PV guests +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Xen does not currently support SME for PV guests. Clear the SME CPU +capability in order to avoid any ambiguity. + +Signed-off-by: Tom Lendacky +Reviewed-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Reviewed-by: Juergen Gross +Cc: +Cc: Alexander Potapenko +Cc: Andrey Ryabinin +Cc: Andy Lutomirski +Cc: Arnd Bergmann +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brijesh Singh +Cc: Dave Young +Cc: Dmitry Vyukov +Cc: Jonathan Corbet +Cc: Konrad Rzeszutek Wilk +Cc: Larry Woodman +Cc: Linus Torvalds +Cc: Matt Fleming +Cc: Michael S. 
Tsirkin +Cc: Paolo Bonzini +Cc: Peter Zijlstra +Cc: Radim Krčmář +Cc: Rik van Riel +Cc: Toshimitsu Kani +Cc: kasan-dev@googlegroups.com +Cc: kvm@vger.kernel.org +Cc: linux-arch@vger.kernel.org +Cc: linux-doc@vger.kernel.org +Cc: linux-efi@vger.kernel.org +Cc: linux-mm@kvack.org +Link: http://lkml.kernel.org/r/3b605622a9fae5e588e5a13967120a18ec18071b.1500319216.git.thomas.lendacky@amd.com +Signed-off-by: Ingo Molnar +(cherry picked from commit f2f931c6819467af5260a21c59fb787ce2863f92) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 8370907399392a637a2e51b4db3368fb594db3a6) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/xen/enlighten_pv.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c +index 290bc5ac9852..df1921751aa5 100644 +--- a/arch/x86/xen/enlighten_pv.c ++++ b/arch/x86/xen/enlighten_pv.c +@@ -263,6 +263,7 @@ static void __init xen_init_capabilities(void) + setup_clear_cpu_cap(X86_FEATURE_MTRR); + setup_clear_cpu_cap(X86_FEATURE_ACC); + setup_clear_cpu_cap(X86_FEATURE_X2APIC); ++ setup_clear_cpu_cap(X86_FEATURE_SME); + + /* + * Xen PV would need some work to support PCID: CR3 handling as well +-- +2.14.2 + diff --git a/patches/kernel/0025-irq-Make-the-irqentry-text-section-unconditional.patch b/patches/kernel/0025-irq-Make-the-irqentry-text-section-unconditional.patch deleted file mode 100644 index 4be6064..0000000 --- a/patches/kernel/0025-irq-Make-the-irqentry-text-section-unconditional.patch +++ /dev/null @@ -1,223 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Masami Hiramatsu -Date: Thu, 3 Aug 2017 11:38:21 +0900 -Subject: [PATCH] irq: Make the irqentry text section unconditional -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Generate irqentry and softirqentry text sections without -any Kconfig dependencies. 
This will add extra sections, but -there should be no performace impact. - -Suggested-by: Ingo Molnar -Signed-off-by: Masami Hiramatsu -Cc: Ananth N Mavinakayanahalli -Cc: Anil S Keshavamurthy -Cc: Chris Zankel -Cc: David S . Miller -Cc: Francis Deslauriers -Cc: Jesper Nilsson -Cc: Linus Torvalds -Cc: Max Filippov -Cc: Mikael Starvik -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Cc: Yoshinori Sato -Cc: linux-arch@vger.kernel.org -Cc: linux-cris-kernel@axis.com -Cc: mathieu.desnoyers@efficios.com -Link: http://lkml.kernel.org/r/150172789110.27216.3955739126693102122.stgit@devbox -Signed-off-by: Ingo Molnar -(cherry picked from commit 229a71860547ec856b156179a9c6bef2de426f66) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 8fd2f68cc93ae772cfddf4151d13448ff17d0229) -Signed-off-by: Fabian Grünbichler ---- - arch/arm/include/asm/traps.h | 7 ------- - arch/arm64/include/asm/traps.h | 7 ------- - include/asm-generic/sections.h | 4 ++++ - include/asm-generic/vmlinux.lds.h | 8 -------- - include/linux/interrupt.h | 14 +------------- - arch/x86/kernel/unwind_frame.c | 2 -- - arch/x86/entry/entry_64.S | 9 ++------- - 7 files changed, 7 insertions(+), 44 deletions(-) - -diff --git a/arch/arm/include/asm/traps.h b/arch/arm/include/asm/traps.h -index f555bb3664dc..683d9230984a 100644 ---- a/arch/arm/include/asm/traps.h -+++ b/arch/arm/include/asm/traps.h -@@ -18,7 +18,6 @@ struct undef_hook { - void register_undef_hook(struct undef_hook *hook); - void unregister_undef_hook(struct undef_hook *hook); - --#ifdef CONFIG_FUNCTION_GRAPH_TRACER - static inline int __in_irqentry_text(unsigned long ptr) - { - extern char __irqentry_text_start[]; -@@ -27,12 +26,6 @@ static inline int __in_irqentry_text(unsigned long ptr) - return ptr >= (unsigned long)&__irqentry_text_start && - ptr < (unsigned long)&__irqentry_text_end; - } --#else --static inline int __in_irqentry_text(unsigned long ptr) --{ -- return 0; --} --#endif - - static inline int 
in_exception_text(unsigned long ptr) - { -diff --git a/arch/arm64/include/asm/traps.h b/arch/arm64/include/asm/traps.h -index 02e9035b0685..47a9066f7c86 100644 ---- a/arch/arm64/include/asm/traps.h -+++ b/arch/arm64/include/asm/traps.h -@@ -37,18 +37,11 @@ void unregister_undef_hook(struct undef_hook *hook); - - void arm64_notify_segfault(struct pt_regs *regs, unsigned long addr); - --#ifdef CONFIG_FUNCTION_GRAPH_TRACER - static inline int __in_irqentry_text(unsigned long ptr) - { - return ptr >= (unsigned long)&__irqentry_text_start && - ptr < (unsigned long)&__irqentry_text_end; - } --#else --static inline int __in_irqentry_text(unsigned long ptr) --{ -- return 0; --} --#endif - - static inline int in_exception_text(unsigned long ptr) - { -diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h -index 532372c6cf15..e5da44eddd2f 100644 ---- a/include/asm-generic/sections.h -+++ b/include/asm-generic/sections.h -@@ -27,6 +27,8 @@ - * __kprobes_text_start, __kprobes_text_end - * __entry_text_start, __entry_text_end - * __ctors_start, __ctors_end -+ * __irqentry_text_start, __irqentry_text_end -+ * __softirqentry_text_start, __softirqentry_text_end - */ - extern char _text[], _stext[], _etext[]; - extern char _data[], _sdata[], _edata[]; -@@ -39,6 +41,8 @@ extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[]; - extern char __kprobes_text_start[], __kprobes_text_end[]; - extern char __entry_text_start[], __entry_text_end[]; - extern char __start_rodata[], __end_rodata[]; -+extern char __irqentry_text_start[], __irqentry_text_end[]; -+extern char __softirqentry_text_start[], __softirqentry_text_end[]; - - /* Start and end of .ctors section - used for constructor calls. 
*/ - extern char __ctors_start[], __ctors_end[]; -diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h -index 9623d78f8494..e7e955d4ab9e 100644 ---- a/include/asm-generic/vmlinux.lds.h -+++ b/include/asm-generic/vmlinux.lds.h -@@ -497,25 +497,17 @@ - *(.entry.text) \ - VMLINUX_SYMBOL(__entry_text_end) = .; - --#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) - #define IRQENTRY_TEXT \ - ALIGN_FUNCTION(); \ - VMLINUX_SYMBOL(__irqentry_text_start) = .; \ - *(.irqentry.text) \ - VMLINUX_SYMBOL(__irqentry_text_end) = .; --#else --#define IRQENTRY_TEXT --#endif - --#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) - #define SOFTIRQENTRY_TEXT \ - ALIGN_FUNCTION(); \ - VMLINUX_SYMBOL(__softirqentry_text_start) = .; \ - *(.softirqentry.text) \ - VMLINUX_SYMBOL(__softirqentry_text_end) = .; --#else --#define SOFTIRQENTRY_TEXT --#endif - - /* Section used for early init (in .S files) */ - #define HEAD_TEXT *(.head.text) -diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h -index a2fddddb0d60..59ba11661b6e 100644 ---- a/include/linux/interrupt.h -+++ b/include/linux/interrupt.h -@@ -18,6 +18,7 @@ - #include - #include - #include -+#include - - /* - * These correspond to the IORESOURCE_IRQ_* defines in -@@ -726,7 +727,6 @@ extern int early_irq_init(void); - extern int arch_probe_nr_irqs(void); - extern int arch_early_irq_init(void); - --#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) - /* - * We want to know which function is an entrypoint of a hardirq or a softirq. 
- */ -@@ -734,16 +734,4 @@ extern int arch_early_irq_init(void); - #define __softirq_entry \ - __attribute__((__section__(".softirqentry.text"))) - --/* Limits of hardirq entrypoints */ --extern char __irqentry_text_start[]; --extern char __irqentry_text_end[]; --/* Limits of softirq entrypoints */ --extern char __softirqentry_text_start[]; --extern char __softirqentry_text_end[]; -- --#else --#define __irq_entry --#define __softirq_entry --#endif -- - #endif -diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c -index b9389d72b2f7..c29e5bc7e9c9 100644 ---- a/arch/x86/kernel/unwind_frame.c -+++ b/arch/x86/kernel/unwind_frame.c -@@ -91,10 +91,8 @@ static bool in_entry_code(unsigned long ip) - if (addr >= __entry_text_start && addr < __entry_text_end) - return true; - --#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) - if (addr >= __irqentry_text_start && addr < __irqentry_text_end) - return true; --#endif - - return false; - } -diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S -index 4dbb336a1fdd..ca0b250eefc4 100644 ---- a/arch/x86/entry/entry_64.S -+++ b/arch/x86/entry/entry_64.S -@@ -761,13 +761,8 @@ apicinterrupt3 \num trace(\sym) smp_trace(\sym) - #endif - - /* Make sure APIC interrupt handlers end up in the irqentry section: */ --#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) --# define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax" --# define POP_SECTION_IRQENTRY .popsection --#else --# define PUSH_SECTION_IRQENTRY --# define POP_SECTION_IRQENTRY --#endif -+#define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax" -+#define POP_SECTION_IRQENTRY .popsection - - .macro apicinterrupt num sym do_sym - PUSH_SECTION_IRQENTRY --- -2.14.2 - diff --git a/patches/kernel/0025-x86-xen-64-Rearrange-the-SYSCALL-entries.patch b/patches/kernel/0025-x86-xen-64-Rearrange-the-SYSCALL-entries.patch new file mode 100644 index 0000000..c6898df --- /dev/null +++ 
b/patches/kernel/0025-x86-xen-64-Rearrange-the-SYSCALL-entries.patch @@ -0,0 +1,152 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Mon, 7 Aug 2017 20:59:21 -0700 +Subject: [PATCH] x86/xen/64: Rearrange the SYSCALL entries +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Xen's raw SYSCALL entries are much less weird than native. Rather +than fudging them to look like native entries, use the Xen-provided +stack frame directly. + +This lets us eliminate entry_SYSCALL_64_after_swapgs and two uses of +the SWAPGS_UNSAFE_STACK paravirt hook. The SYSENTER code would +benefit from similar treatment. + +This makes one change to the native code path: the compat +instruction that clears the high 32 bits of %rax is moved slightly +later. I'd be surprised if this affects performance at all. + +Tested-by: Juergen Gross +Signed-off-by: Andy Lutomirski +Reviewed-by: Juergen Gross +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: xen-devel@lists.xenproject.org +Link: http://lkml.kernel.org/r/7c88ed36805d36841ab03ec3b48b4122c4418d71.1502164668.git.luto@kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit 8a9949bc71a71b3dd633255ebe8f8869b1f73474) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit b8cec41ee5f30df5032cfe8c86103f7d92a89590) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/entry/entry_64.S | 9 ++------- + arch/x86/entry/entry_64_compat.S | 7 +++---- + arch/x86/xen/xen-asm_64.S | 23 +++++++++-------------- + 3 files changed, 14 insertions(+), 25 deletions(-) + +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index 64b233ab7cad..4dbb336a1fdd 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -142,14 +142,8 @@ ENTRY(entry_SYSCALL_64) + * We do not frame this tiny irq-off block with 
TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ +- SWAPGS_UNSAFE_STACK +- /* +- * A hypervisor implementation might want to use a label +- * after the swapgs, so that it can do the swapgs +- * for the guest and jump here on syscall. +- */ +-GLOBAL(entry_SYSCALL_64_after_swapgs) + ++ swapgs + movq %rsp, PER_CPU_VAR(rsp_scratch) + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + +@@ -161,6 +155,7 @@ GLOBAL(entry_SYSCALL_64_after_swapgs) + pushq %r11 /* pt_regs->flags */ + pushq $__USER_CS /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ ++GLOBAL(entry_SYSCALL_64_after_hwframe) + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ +diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S +index e1721dafbcb1..5314d7b8e5ad 100644 +--- a/arch/x86/entry/entry_64_compat.S ++++ b/arch/x86/entry/entry_64_compat.S +@@ -183,21 +183,20 @@ ENDPROC(entry_SYSENTER_compat) + */ + ENTRY(entry_SYSCALL_compat) + /* Interrupts are off on entry. */ +- SWAPGS_UNSAFE_STACK ++ swapgs + + /* Stash user ESP and switch to the kernel stack. 
*/ + movl %esp, %r8d + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + +- /* Zero-extending 32-bit regs, do not remove */ +- movl %eax, %eax +- + /* Construct struct pt_regs on stack */ + pushq $__USER32_DS /* pt_regs->ss */ + pushq %r8 /* pt_regs->sp */ + pushq %r11 /* pt_regs->flags */ + pushq $__USER32_CS /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ ++GLOBAL(entry_SYSCALL_compat_after_hwframe) ++ movl %eax, %eax /* discard orig_ax high bits */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ +diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S +index c3df43141e70..a8a4f4c460a6 100644 +--- a/arch/x86/xen/xen-asm_64.S ++++ b/arch/x86/xen/xen-asm_64.S +@@ -82,34 +82,29 @@ RELOC(xen_sysret64, 1b+1) + * rip + * r11 + * rsp->rcx +- * +- * In all the entrypoints, we undo all that to make it look like a +- * CPU-generated syscall/sysenter and jump to the normal entrypoint. + */ + +-.macro undo_xen_syscall +- mov 0*8(%rsp), %rcx +- mov 1*8(%rsp), %r11 +- mov 5*8(%rsp), %rsp +-.endm +- + /* Normal 64-bit system call target */ + ENTRY(xen_syscall_target) +- undo_xen_syscall +- jmp entry_SYSCALL_64_after_swapgs ++ popq %rcx ++ popq %r11 ++ jmp entry_SYSCALL_64_after_hwframe + ENDPROC(xen_syscall_target) + + #ifdef CONFIG_IA32_EMULATION + + /* 32-bit compat syscall target */ + ENTRY(xen_syscall32_target) +- undo_xen_syscall +- jmp entry_SYSCALL_compat ++ popq %rcx ++ popq %r11 ++ jmp entry_SYSCALL_compat_after_hwframe + ENDPROC(xen_syscall32_target) + + /* 32-bit compat sysenter target */ + ENTRY(xen_sysenter_target) +- undo_xen_syscall ++ mov 0*8(%rsp), %rcx ++ mov 1*8(%rsp), %r11 ++ mov 5*8(%rsp), %rsp + jmp entry_SYSENTER_compat + ENDPROC(xen_sysenter_target) + +-- +2.14.2 + diff --git a/patches/kernel/0026-irq-Make-the-irqentry-text-section-unconditional.patch b/patches/kernel/0026-irq-Make-the-irqentry-text-section-unconditional.patch new file mode 100644 index 0000000..4be6064 --- /dev/null +++ 
b/patches/kernel/0026-irq-Make-the-irqentry-text-section-unconditional.patch @@ -0,0 +1,223 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Masami Hiramatsu +Date: Thu, 3 Aug 2017 11:38:21 +0900 +Subject: [PATCH] irq: Make the irqentry text section unconditional +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Generate irqentry and softirqentry text sections without +any Kconfig dependencies. This will add extra sections, but +there should be no performace impact. + +Suggested-by: Ingo Molnar +Signed-off-by: Masami Hiramatsu +Cc: Ananth N Mavinakayanahalli +Cc: Anil S Keshavamurthy +Cc: Chris Zankel +Cc: David S . Miller +Cc: Francis Deslauriers +Cc: Jesper Nilsson +Cc: Linus Torvalds +Cc: Max Filippov +Cc: Mikael Starvik +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: Yoshinori Sato +Cc: linux-arch@vger.kernel.org +Cc: linux-cris-kernel@axis.com +Cc: mathieu.desnoyers@efficios.com +Link: http://lkml.kernel.org/r/150172789110.27216.3955739126693102122.stgit@devbox +Signed-off-by: Ingo Molnar +(cherry picked from commit 229a71860547ec856b156179a9c6bef2de426f66) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 8fd2f68cc93ae772cfddf4151d13448ff17d0229) +Signed-off-by: Fabian Grünbichler +--- + arch/arm/include/asm/traps.h | 7 ------- + arch/arm64/include/asm/traps.h | 7 ------- + include/asm-generic/sections.h | 4 ++++ + include/asm-generic/vmlinux.lds.h | 8 -------- + include/linux/interrupt.h | 14 +------------- + arch/x86/kernel/unwind_frame.c | 2 -- + arch/x86/entry/entry_64.S | 9 ++------- + 7 files changed, 7 insertions(+), 44 deletions(-) + +diff --git a/arch/arm/include/asm/traps.h b/arch/arm/include/asm/traps.h +index f555bb3664dc..683d9230984a 100644 +--- a/arch/arm/include/asm/traps.h ++++ b/arch/arm/include/asm/traps.h +@@ -18,7 +18,6 @@ struct undef_hook { + void register_undef_hook(struct undef_hook *hook); + 
void unregister_undef_hook(struct undef_hook *hook); + +-#ifdef CONFIG_FUNCTION_GRAPH_TRACER + static inline int __in_irqentry_text(unsigned long ptr) + { + extern char __irqentry_text_start[]; +@@ -27,12 +26,6 @@ static inline int __in_irqentry_text(unsigned long ptr) + return ptr >= (unsigned long)&__irqentry_text_start && + ptr < (unsigned long)&__irqentry_text_end; + } +-#else +-static inline int __in_irqentry_text(unsigned long ptr) +-{ +- return 0; +-} +-#endif + + static inline int in_exception_text(unsigned long ptr) + { +diff --git a/arch/arm64/include/asm/traps.h b/arch/arm64/include/asm/traps.h +index 02e9035b0685..47a9066f7c86 100644 +--- a/arch/arm64/include/asm/traps.h ++++ b/arch/arm64/include/asm/traps.h +@@ -37,18 +37,11 @@ void unregister_undef_hook(struct undef_hook *hook); + + void arm64_notify_segfault(struct pt_regs *regs, unsigned long addr); + +-#ifdef CONFIG_FUNCTION_GRAPH_TRACER + static inline int __in_irqentry_text(unsigned long ptr) + { + return ptr >= (unsigned long)&__irqentry_text_start && + ptr < (unsigned long)&__irqentry_text_end; + } +-#else +-static inline int __in_irqentry_text(unsigned long ptr) +-{ +- return 0; +-} +-#endif + + static inline int in_exception_text(unsigned long ptr) + { +diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h +index 532372c6cf15..e5da44eddd2f 100644 +--- a/include/asm-generic/sections.h ++++ b/include/asm-generic/sections.h +@@ -27,6 +27,8 @@ + * __kprobes_text_start, __kprobes_text_end + * __entry_text_start, __entry_text_end + * __ctors_start, __ctors_end ++ * __irqentry_text_start, __irqentry_text_end ++ * __softirqentry_text_start, __softirqentry_text_end + */ + extern char _text[], _stext[], _etext[]; + extern char _data[], _sdata[], _edata[]; +@@ -39,6 +41,8 @@ extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[]; + extern char __kprobes_text_start[], __kprobes_text_end[]; + extern char __entry_text_start[], __entry_text_end[]; + extern char 
__start_rodata[], __end_rodata[]; ++extern char __irqentry_text_start[], __irqentry_text_end[]; ++extern char __softirqentry_text_start[], __softirqentry_text_end[]; + + /* Start and end of .ctors section - used for constructor calls. */ + extern char __ctors_start[], __ctors_end[]; +diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h +index 9623d78f8494..e7e955d4ab9e 100644 +--- a/include/asm-generic/vmlinux.lds.h ++++ b/include/asm-generic/vmlinux.lds.h +@@ -497,25 +497,17 @@ + *(.entry.text) \ + VMLINUX_SYMBOL(__entry_text_end) = .; + +-#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) + #define IRQENTRY_TEXT \ + ALIGN_FUNCTION(); \ + VMLINUX_SYMBOL(__irqentry_text_start) = .; \ + *(.irqentry.text) \ + VMLINUX_SYMBOL(__irqentry_text_end) = .; +-#else +-#define IRQENTRY_TEXT +-#endif + +-#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) + #define SOFTIRQENTRY_TEXT \ + ALIGN_FUNCTION(); \ + VMLINUX_SYMBOL(__softirqentry_text_start) = .; \ + *(.softirqentry.text) \ + VMLINUX_SYMBOL(__softirqentry_text_end) = .; +-#else +-#define SOFTIRQENTRY_TEXT +-#endif + + /* Section used for early init (in .S files) */ + #define HEAD_TEXT *(.head.text) +diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h +index a2fddddb0d60..59ba11661b6e 100644 +--- a/include/linux/interrupt.h ++++ b/include/linux/interrupt.h +@@ -18,6 +18,7 @@ + #include + #include + #include ++#include + + /* + * These correspond to the IORESOURCE_IRQ_* defines in +@@ -726,7 +727,6 @@ extern int early_irq_init(void); + extern int arch_probe_nr_irqs(void); + extern int arch_early_irq_init(void); + +-#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) + /* + * We want to know which function is an entrypoint of a hardirq or a softirq. 
+ */ +@@ -734,16 +734,4 @@ extern int arch_early_irq_init(void); + #define __softirq_entry \ + __attribute__((__section__(".softirqentry.text"))) + +-/* Limits of hardirq entrypoints */ +-extern char __irqentry_text_start[]; +-extern char __irqentry_text_end[]; +-/* Limits of softirq entrypoints */ +-extern char __softirqentry_text_start[]; +-extern char __softirqentry_text_end[]; +- +-#else +-#define __irq_entry +-#define __softirq_entry +-#endif +- + #endif +diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c +index b9389d72b2f7..c29e5bc7e9c9 100644 +--- a/arch/x86/kernel/unwind_frame.c ++++ b/arch/x86/kernel/unwind_frame.c +@@ -91,10 +91,8 @@ static bool in_entry_code(unsigned long ip) + if (addr >= __entry_text_start && addr < __entry_text_end) + return true; + +-#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) + if (addr >= __irqentry_text_start && addr < __irqentry_text_end) + return true; +-#endif + + return false; + } +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index 4dbb336a1fdd..ca0b250eefc4 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -761,13 +761,8 @@ apicinterrupt3 \num trace(\sym) smp_trace(\sym) + #endif + + /* Make sure APIC interrupt handlers end up in the irqentry section: */ +-#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) +-# define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax" +-# define POP_SECTION_IRQENTRY .popsection +-#else +-# define PUSH_SECTION_IRQENTRY +-# define POP_SECTION_IRQENTRY +-#endif ++#define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax" ++#define POP_SECTION_IRQENTRY .popsection + + .macro apicinterrupt num sym do_sym + PUSH_SECTION_IRQENTRY +-- +2.14.2 + diff --git a/patches/kernel/0026-x86-xen-64-Fix-the-reported-SS-and-CS-in-SYSCALL.patch b/patches/kernel/0026-x86-xen-64-Fix-the-reported-SS-and-CS-in-SYSCALL.patch deleted file mode 100644 index 345a513..0000000 --- 
a/patches/kernel/0026-x86-xen-64-Fix-the-reported-SS-and-CS-in-SYSCALL.patch +++ /dev/null @@ -1,84 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Mon, 14 Aug 2017 22:36:19 -0700 -Subject: [PATCH] x86/xen/64: Fix the reported SS and CS in SYSCALL -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -When I cleaned up the Xen SYSCALL entries, I inadvertently changed -the reported segment registers. Before my patch, regs->ss was -__USER(32)_DS and regs->cs was __USER(32)_CS. After the patch, they -are FLAT_USER_CS/DS(32). - -This had a couple unfortunate effects. It confused the -opportunistic fast return logic. It also significantly increased -the risk of triggering a nasty glibc bug: - - https://sourceware.org/bugzilla/show_bug.cgi?id=21269 - -Update the Xen entry code to change it back. - -Reported-by: Brian Gerst -Signed-off-by: Andy Lutomirski -Cc: Andrew Cooper -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Cc: xen-devel@lists.xenproject.org -Fixes: 8a9949bc71a7 ("x86/xen/64: Rearrange the SYSCALL entries") -Link: http://lkml.kernel.org/r/daba8351ea2764bb30272296ab9ce08a81bd8264.1502775273.git.luto@kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit fa2016a8e7d846b306e431646d250500e1da0c33) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 69a6ef3aeb274efe86fd74771830354f303ccc2f) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/xen/xen-asm_64.S | 18 ++++++++++++++++++ - 1 file changed, 18 insertions(+) - -diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S -index a8a4f4c460a6..c5fee2680abc 100644 ---- a/arch/x86/xen/xen-asm_64.S -+++ b/arch/x86/xen/xen-asm_64.S -@@ -88,6 +88,15 @@ RELOC(xen_sysret64, 1b+1) - ENTRY(xen_syscall_target) - popq %rcx - popq %r11 -+ -+ /* -+ * Neither Xen 
nor the kernel really knows what the old SS and -+ * CS were. The kernel expects __USER_DS and __USER_CS, so -+ * report those values even though Xen will guess its own values. -+ */ -+ movq $__USER_DS, 4*8(%rsp) -+ movq $__USER_CS, 1*8(%rsp) -+ - jmp entry_SYSCALL_64_after_hwframe - ENDPROC(xen_syscall_target) - -@@ -97,6 +106,15 @@ ENDPROC(xen_syscall_target) - ENTRY(xen_syscall32_target) - popq %rcx - popq %r11 -+ -+ /* -+ * Neither Xen nor the kernel really knows what the old SS and -+ * CS were. The kernel expects __USER32_DS and __USER32_CS, so -+ * report those values even though Xen will guess its own values. -+ */ -+ movq $__USER32_DS, 4*8(%rsp) -+ movq $__USER32_CS, 1*8(%rsp) -+ - jmp entry_SYSCALL_compat_after_hwframe - ENDPROC(xen_syscall32_target) - --- -2.14.2 - diff --git a/patches/kernel/0027-x86-paravirt-xen-Remove-xen_patch.patch b/patches/kernel/0027-x86-paravirt-xen-Remove-xen_patch.patch deleted file mode 100644 index 009b9b3..0000000 --- a/patches/kernel/0027-x86-paravirt-xen-Remove-xen_patch.patch +++ /dev/null @@ -1,360 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Juergen Gross -Date: Wed, 16 Aug 2017 19:31:56 +0200 -Subject: [PATCH] x86/paravirt/xen: Remove xen_patch() -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Xen's paravirt patch function xen_patch() does some special casing for -irq_ops functions to apply relocations when those functions can be -patched inline instead of calls. - -Unfortunately none of the special case function replacements is small -enough to be patched inline, so the special case never applies. - -As xen_patch() will call paravirt_patch_default() in all cases it can -be just dropped. xen-asm.h doesn't seem necessary without xen_patch() -as the only thing left in it would be the definition of XEN_EFLAGS_NMI -used only once. So move that definition and remove xen-asm.h. 
- -Signed-off-by: Juergen Gross -Reviewed-by: Josh Poimboeuf -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Cc: boris.ostrovsky@oracle.com -Cc: lguest@lists.ozlabs.org -Cc: rusty@rustcorp.com.au -Cc: xen-devel@lists.xenproject.org -Link: http://lkml.kernel.org/r/20170816173157.8633-2-jgross@suse.com -Signed-off-by: Ingo Molnar -(cherry picked from commit edcb5cf84f05e5d2e2af25422a72ccde359fcca9) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit c96c9c712136a9e24a7aaf0aac4c149eee01bd8e) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/xen/xen-asm.h | 12 --------- - arch/x86/xen/xen-ops.h | 15 +++--------- - arch/x86/xen/enlighten_pv.c | 59 +-------------------------------------------- - arch/x86/xen/xen-asm.S | 26 +++++--------------- - arch/x86/xen/xen-asm_32.S | 27 ++++----------------- - arch/x86/xen/xen-asm_64.S | 20 ++++----------- - 6 files changed, 21 insertions(+), 138 deletions(-) - delete mode 100644 arch/x86/xen/xen-asm.h - -diff --git a/arch/x86/xen/xen-asm.h b/arch/x86/xen/xen-asm.h -deleted file mode 100644 -index 465276467a47..000000000000 ---- a/arch/x86/xen/xen-asm.h -+++ /dev/null -@@ -1,12 +0,0 @@ --#ifndef _XEN_XEN_ASM_H --#define _XEN_XEN_ASM_H -- --#include -- --#define RELOC(x, v) .globl x##_reloc; x##_reloc=v --#define ENDPATCH(x) .globl x##_end; x##_end=. -- --/* Pseudo-flag used for virtual NMI, which we don't implement yet */ --#define XEN_EFLAGS_NMI 0x80000000 -- --#endif -diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h -index 0d5004477db6..70301ac0d414 100644 ---- a/arch/x86/xen/xen-ops.h -+++ b/arch/x86/xen/xen-ops.h -@@ -129,17 +129,10 @@ static inline void __init xen_efi_init(void) - } - #endif - --/* Declare an asm function, along with symbols needed to make it -- inlineable */ --#define DECL_ASM(ret, name, ...) 
\ -- __visible ret name(__VA_ARGS__); \ -- extern char name##_end[] __visible; \ -- extern char name##_reloc[] __visible -- --DECL_ASM(void, xen_irq_enable_direct, void); --DECL_ASM(void, xen_irq_disable_direct, void); --DECL_ASM(unsigned long, xen_save_fl_direct, void); --DECL_ASM(void, xen_restore_fl_direct, unsigned long); -+__visible void xen_irq_enable_direct(void); -+__visible void xen_irq_disable_direct(void); -+__visible unsigned long xen_save_fl_direct(void); -+__visible void xen_restore_fl_direct(unsigned long); - - /* These are not functions, and cannot be called normally */ - __visible void xen_iret(void); -diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c -index df1921751aa5..6c279c8f0a0e 100644 ---- a/arch/x86/xen/enlighten_pv.c -+++ b/arch/x86/xen/enlighten_pv.c -@@ -988,59 +988,6 @@ void __ref xen_setup_vcpu_info_placement(void) - } - } - --static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, -- unsigned long addr, unsigned len) --{ -- char *start, *end, *reloc; -- unsigned ret; -- -- start = end = reloc = NULL; -- --#define SITE(op, x) \ -- case PARAVIRT_PATCH(op.x): \ -- if (xen_have_vcpu_info_placement) { \ -- start = (char *)xen_##x##_direct; \ -- end = xen_##x##_direct_end; \ -- reloc = xen_##x##_direct_reloc; \ -- } \ -- goto patch_site -- -- switch (type) { -- SITE(pv_irq_ops, irq_enable); -- SITE(pv_irq_ops, irq_disable); -- SITE(pv_irq_ops, save_fl); -- SITE(pv_irq_ops, restore_fl); --#undef SITE -- -- patch_site: -- if (start == NULL || (end-start) > len) -- goto default_patch; -- -- ret = paravirt_patch_insns(insnbuf, len, start, end); -- -- /* Note: because reloc is assigned from something that -- appears to be an array, gcc assumes it's non-null, -- but doesn't know its relationship with start and -- end. 
*/ -- if (reloc > start && reloc < end) { -- int reloc_off = reloc - start; -- long *relocp = (long *)(insnbuf + reloc_off); -- long delta = start - (char *)addr; -- -- *relocp += delta; -- } -- break; -- -- default_patch: -- default: -- ret = paravirt_patch_default(type, clobbers, insnbuf, -- addr, len); -- break; -- } -- -- return ret; --} -- - static const struct pv_info xen_info __initconst = { - .shared_kernel_pmd = 0, - -@@ -1050,10 +997,6 @@ static const struct pv_info xen_info __initconst = { - .name = "Xen", - }; - --static const struct pv_init_ops xen_init_ops __initconst = { -- .patch = xen_patch, --}; -- - static const struct pv_cpu_ops xen_cpu_ops __initconst = { - .cpuid = xen_cpuid, - -@@ -1251,7 +1194,7 @@ asmlinkage __visible void __init xen_start_kernel(void) - - /* Install Xen paravirt ops */ - pv_info = xen_info; -- pv_init_ops = xen_init_ops; -+ pv_init_ops.patch = paravirt_patch_default; - pv_cpu_ops = xen_cpu_ops; - - x86_platform.get_nmi_reason = xen_get_nmi_reason; -diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S -index eff224df813f..dcd31fa39b5d 100644 ---- a/arch/x86/xen/xen-asm.S -+++ b/arch/x86/xen/xen-asm.S -@@ -1,14 +1,8 @@ - /* -- * Asm versions of Xen pv-ops, suitable for either direct use or -- * inlining. The inline versions are the same as the direct-use -- * versions, with the pre- and post-amble chopped off. -- * -- * This code is encoded for size rather than absolute efficiency, with -- * a view to being able to inline as much as possible. -+ * Asm versions of Xen pv-ops, suitable for direct use. - * - * We only bother with direct forms (ie, vcpu in percpu data) of the -- * operations here; the indirect forms are better handled in C, since -- * they're generally too large to inline anyway. -+ * operations here; the indirect forms are better handled in C. - */ - - #include -@@ -16,7 +10,7 @@ - #include - #include - --#include "xen-asm.h" -+#include - - /* - * Enable events. 
This clears the event mask and tests the pending -@@ -38,13 +32,11 @@ ENTRY(xen_irq_enable_direct) - testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending - jz 1f - --2: call check_events -+ call check_events - 1: --ENDPATCH(xen_irq_enable_direct) - FRAME_END - ret - ENDPROC(xen_irq_enable_direct) -- RELOC(xen_irq_enable_direct, 2b+1) - - - /* -@@ -53,10 +45,8 @@ ENDPATCH(xen_irq_enable_direct) - */ - ENTRY(xen_irq_disable_direct) - movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask --ENDPATCH(xen_irq_disable_direct) - ret -- ENDPROC(xen_irq_disable_direct) -- RELOC(xen_irq_disable_direct, 0) -+ENDPROC(xen_irq_disable_direct) - - /* - * (xen_)save_fl is used to get the current interrupt enable status. -@@ -71,10 +61,8 @@ ENTRY(xen_save_fl_direct) - testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask - setz %ah - addb %ah, %ah --ENDPATCH(xen_save_fl_direct) - ret - ENDPROC(xen_save_fl_direct) -- RELOC(xen_save_fl_direct, 0) - - - /* -@@ -101,13 +89,11 @@ ENTRY(xen_restore_fl_direct) - /* check for unmasked and pending */ - cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending - jnz 1f --2: call check_events -+ call check_events - 1: --ENDPATCH(xen_restore_fl_direct) - FRAME_END - ret - ENDPROC(xen_restore_fl_direct) -- RELOC(xen_restore_fl_direct, 2b+1) - - - /* -diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S -index feb6d40a0860..1200e262a116 100644 ---- a/arch/x86/xen/xen-asm_32.S -+++ b/arch/x86/xen/xen-asm_32.S -@@ -1,14 +1,8 @@ - /* -- * Asm versions of Xen pv-ops, suitable for either direct use or -- * inlining. The inline versions are the same as the direct-use -- * versions, with the pre- and post-amble chopped off. -- * -- * This code is encoded for size rather than absolute efficiency, with -- * a view to being able to inline as much as possible. -+ * Asm versions of Xen pv-ops, suitable for direct use. 
- * - * We only bother with direct forms (ie, vcpu in pda) of the -- * operations here; the indirect forms are better handled in C, since -- * they're generally too large to inline anyway. -+ * operations here; the indirect forms are better handled in C. - */ - - #include -@@ -18,21 +12,10 @@ - - #include - --#include "xen-asm.h" -+#include - --/* -- * Force an event check by making a hypercall, but preserve regs -- * before making the call. -- */ --check_events: -- push %eax -- push %ecx -- push %edx -- call xen_force_evtchn_callback -- pop %edx -- pop %ecx -- pop %eax -- ret -+/* Pseudo-flag used for virtual NMI, which we don't implement yet */ -+#define XEN_EFLAGS_NMI 0x80000000 - - /* - * This is run where a normal iret would be run, with the same stack setup: -diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S -index c5fee2680abc..3a3b6a211584 100644 ---- a/arch/x86/xen/xen-asm_64.S -+++ b/arch/x86/xen/xen-asm_64.S -@@ -1,14 +1,8 @@ - /* -- * Asm versions of Xen pv-ops, suitable for either direct use or -- * inlining. The inline versions are the same as the direct-use -- * versions, with the pre- and post-amble chopped off. -- * -- * This code is encoded for size rather than absolute efficiency, with -- * a view to being able to inline as much as possible. -+ * Asm versions of Xen pv-ops, suitable for direct use. - * - * We only bother with direct forms (ie, vcpu in pda) of the -- * operations here; the indirect forms are better handled in C, since -- * they're generally too large to inline anyway. -+ * operations here; the indirect forms are better handled in C. 
- */ - - #include -@@ -20,7 +14,7 @@ - - #include - --#include "xen-asm.h" -+#include - - ENTRY(xen_adjust_exception_frame) - mov 8+0(%rsp), %rcx -@@ -46,9 +40,7 @@ hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 - */ - ENTRY(xen_iret) - pushq $0 --1: jmp hypercall_iret --ENDPATCH(xen_iret) --RELOC(xen_iret, 1b+1) -+ jmp hypercall_iret - - ENTRY(xen_sysret64) - /* -@@ -65,9 +57,7 @@ ENTRY(xen_sysret64) - pushq %rcx - - pushq $VGCF_in_syscall --1: jmp hypercall_iret --ENDPATCH(xen_sysret64) --RELOC(xen_sysret64, 1b+1) -+ jmp hypercall_iret - - /* - * Xen handles syscall callbacks much like ordinary exceptions, which --- -2.14.2 - diff --git a/patches/kernel/0027-x86-xen-64-Fix-the-reported-SS-and-CS-in-SYSCALL.patch b/patches/kernel/0027-x86-xen-64-Fix-the-reported-SS-and-CS-in-SYSCALL.patch new file mode 100644 index 0000000..345a513 --- /dev/null +++ b/patches/kernel/0027-x86-xen-64-Fix-the-reported-SS-and-CS-in-SYSCALL.patch @@ -0,0 +1,84 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Mon, 14 Aug 2017 22:36:19 -0700 +Subject: [PATCH] x86/xen/64: Fix the reported SS and CS in SYSCALL +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +When I cleaned up the Xen SYSCALL entries, I inadvertently changed +the reported segment registers. Before my patch, regs->ss was +__USER(32)_DS and regs->cs was __USER(32)_CS. After the patch, they +are FLAT_USER_CS/DS(32). + +This had a couple unfortunate effects. It confused the +opportunistic fast return logic. It also significantly increased +the risk of triggering a nasty glibc bug: + + https://sourceware.org/bugzilla/show_bug.cgi?id=21269 + +Update the Xen entry code to change it back. 
+ +Reported-by: Brian Gerst +Signed-off-by: Andy Lutomirski +Cc: Andrew Cooper +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: xen-devel@lists.xenproject.org +Fixes: 8a9949bc71a7 ("x86/xen/64: Rearrange the SYSCALL entries") +Link: http://lkml.kernel.org/r/daba8351ea2764bb30272296ab9ce08a81bd8264.1502775273.git.luto@kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit fa2016a8e7d846b306e431646d250500e1da0c33) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 69a6ef3aeb274efe86fd74771830354f303ccc2f) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/xen/xen-asm_64.S | 18 ++++++++++++++++++ + 1 file changed, 18 insertions(+) + +diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S +index a8a4f4c460a6..c5fee2680abc 100644 +--- a/arch/x86/xen/xen-asm_64.S ++++ b/arch/x86/xen/xen-asm_64.S +@@ -88,6 +88,15 @@ RELOC(xen_sysret64, 1b+1) + ENTRY(xen_syscall_target) + popq %rcx + popq %r11 ++ ++ /* ++ * Neither Xen nor the kernel really knows what the old SS and ++ * CS were. The kernel expects __USER_DS and __USER_CS, so ++ * report those values even though Xen will guess its own values. ++ */ ++ movq $__USER_DS, 4*8(%rsp) ++ movq $__USER_CS, 1*8(%rsp) ++ + jmp entry_SYSCALL_64_after_hwframe + ENDPROC(xen_syscall_target) + +@@ -97,6 +106,15 @@ ENDPROC(xen_syscall_target) + ENTRY(xen_syscall32_target) + popq %rcx + popq %r11 ++ ++ /* ++ * Neither Xen nor the kernel really knows what the old SS and ++ * CS were. The kernel expects __USER32_DS and __USER32_CS, so ++ * report those values even though Xen will guess its own values. 
++ */ ++ movq $__USER32_DS, 4*8(%rsp) ++ movq $__USER32_CS, 1*8(%rsp) ++ + jmp entry_SYSCALL_compat_after_hwframe + ENDPROC(xen_syscall32_target) + +-- +2.14.2 + diff --git a/patches/kernel/0028-x86-paravirt-xen-Remove-xen_patch.patch b/patches/kernel/0028-x86-paravirt-xen-Remove-xen_patch.patch new file mode 100644 index 0000000..009b9b3 --- /dev/null +++ b/patches/kernel/0028-x86-paravirt-xen-Remove-xen_patch.patch @@ -0,0 +1,360 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Juergen Gross +Date: Wed, 16 Aug 2017 19:31:56 +0200 +Subject: [PATCH] x86/paravirt/xen: Remove xen_patch() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Xen's paravirt patch function xen_patch() does some special casing for +irq_ops functions to apply relocations when those functions can be +patched inline instead of calls. + +Unfortunately none of the special case function replacements is small +enough to be patched inline, so the special case never applies. + +As xen_patch() will call paravirt_patch_default() in all cases it can +be just dropped. xen-asm.h doesn't seem necessary without xen_patch() +as the only thing left in it would be the definition of XEN_EFLAGS_NMI +used only once. So move that definition and remove xen-asm.h. 
+ +Signed-off-by: Juergen Gross +Reviewed-by: Josh Poimboeuf +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: boris.ostrovsky@oracle.com +Cc: lguest@lists.ozlabs.org +Cc: rusty@rustcorp.com.au +Cc: xen-devel@lists.xenproject.org +Link: http://lkml.kernel.org/r/20170816173157.8633-2-jgross@suse.com +Signed-off-by: Ingo Molnar +(cherry picked from commit edcb5cf84f05e5d2e2af25422a72ccde359fcca9) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit c96c9c712136a9e24a7aaf0aac4c149eee01bd8e) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/xen/xen-asm.h | 12 --------- + arch/x86/xen/xen-ops.h | 15 +++--------- + arch/x86/xen/enlighten_pv.c | 59 +-------------------------------------------- + arch/x86/xen/xen-asm.S | 26 +++++--------------- + arch/x86/xen/xen-asm_32.S | 27 ++++----------------- + arch/x86/xen/xen-asm_64.S | 20 ++++----------- + 6 files changed, 21 insertions(+), 138 deletions(-) + delete mode 100644 arch/x86/xen/xen-asm.h + +diff --git a/arch/x86/xen/xen-asm.h b/arch/x86/xen/xen-asm.h +deleted file mode 100644 +index 465276467a47..000000000000 +--- a/arch/x86/xen/xen-asm.h ++++ /dev/null +@@ -1,12 +0,0 @@ +-#ifndef _XEN_XEN_ASM_H +-#define _XEN_XEN_ASM_H +- +-#include +- +-#define RELOC(x, v) .globl x##_reloc; x##_reloc=v +-#define ENDPATCH(x) .globl x##_end; x##_end=. +- +-/* Pseudo-flag used for virtual NMI, which we don't implement yet */ +-#define XEN_EFLAGS_NMI 0x80000000 +- +-#endif +diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h +index 0d5004477db6..70301ac0d414 100644 +--- a/arch/x86/xen/xen-ops.h ++++ b/arch/x86/xen/xen-ops.h +@@ -129,17 +129,10 @@ static inline void __init xen_efi_init(void) + } + #endif + +-/* Declare an asm function, along with symbols needed to make it +- inlineable */ +-#define DECL_ASM(ret, name, ...) 
\ +- __visible ret name(__VA_ARGS__); \ +- extern char name##_end[] __visible; \ +- extern char name##_reloc[] __visible +- +-DECL_ASM(void, xen_irq_enable_direct, void); +-DECL_ASM(void, xen_irq_disable_direct, void); +-DECL_ASM(unsigned long, xen_save_fl_direct, void); +-DECL_ASM(void, xen_restore_fl_direct, unsigned long); ++__visible void xen_irq_enable_direct(void); ++__visible void xen_irq_disable_direct(void); ++__visible unsigned long xen_save_fl_direct(void); ++__visible void xen_restore_fl_direct(unsigned long); + + /* These are not functions, and cannot be called normally */ + __visible void xen_iret(void); +diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c +index df1921751aa5..6c279c8f0a0e 100644 +--- a/arch/x86/xen/enlighten_pv.c ++++ b/arch/x86/xen/enlighten_pv.c +@@ -988,59 +988,6 @@ void __ref xen_setup_vcpu_info_placement(void) + } + } + +-static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, +- unsigned long addr, unsigned len) +-{ +- char *start, *end, *reloc; +- unsigned ret; +- +- start = end = reloc = NULL; +- +-#define SITE(op, x) \ +- case PARAVIRT_PATCH(op.x): \ +- if (xen_have_vcpu_info_placement) { \ +- start = (char *)xen_##x##_direct; \ +- end = xen_##x##_direct_end; \ +- reloc = xen_##x##_direct_reloc; \ +- } \ +- goto patch_site +- +- switch (type) { +- SITE(pv_irq_ops, irq_enable); +- SITE(pv_irq_ops, irq_disable); +- SITE(pv_irq_ops, save_fl); +- SITE(pv_irq_ops, restore_fl); +-#undef SITE +- +- patch_site: +- if (start == NULL || (end-start) > len) +- goto default_patch; +- +- ret = paravirt_patch_insns(insnbuf, len, start, end); +- +- /* Note: because reloc is assigned from something that +- appears to be an array, gcc assumes it's non-null, +- but doesn't know its relationship with start and +- end. 
*/ +- if (reloc > start && reloc < end) { +- int reloc_off = reloc - start; +- long *relocp = (long *)(insnbuf + reloc_off); +- long delta = start - (char *)addr; +- +- *relocp += delta; +- } +- break; +- +- default_patch: +- default: +- ret = paravirt_patch_default(type, clobbers, insnbuf, +- addr, len); +- break; +- } +- +- return ret; +-} +- + static const struct pv_info xen_info __initconst = { + .shared_kernel_pmd = 0, + +@@ -1050,10 +997,6 @@ static const struct pv_info xen_info __initconst = { + .name = "Xen", + }; + +-static const struct pv_init_ops xen_init_ops __initconst = { +- .patch = xen_patch, +-}; +- + static const struct pv_cpu_ops xen_cpu_ops __initconst = { + .cpuid = xen_cpuid, + +@@ -1251,7 +1194,7 @@ asmlinkage __visible void __init xen_start_kernel(void) + + /* Install Xen paravirt ops */ + pv_info = xen_info; +- pv_init_ops = xen_init_ops; ++ pv_init_ops.patch = paravirt_patch_default; + pv_cpu_ops = xen_cpu_ops; + + x86_platform.get_nmi_reason = xen_get_nmi_reason; +diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S +index eff224df813f..dcd31fa39b5d 100644 +--- a/arch/x86/xen/xen-asm.S ++++ b/arch/x86/xen/xen-asm.S +@@ -1,14 +1,8 @@ + /* +- * Asm versions of Xen pv-ops, suitable for either direct use or +- * inlining. The inline versions are the same as the direct-use +- * versions, with the pre- and post-amble chopped off. +- * +- * This code is encoded for size rather than absolute efficiency, with +- * a view to being able to inline as much as possible. ++ * Asm versions of Xen pv-ops, suitable for direct use. + * + * We only bother with direct forms (ie, vcpu in percpu data) of the +- * operations here; the indirect forms are better handled in C, since +- * they're generally too large to inline anyway. ++ * operations here; the indirect forms are better handled in C. + */ + + #include +@@ -16,7 +10,7 @@ + #include + #include + +-#include "xen-asm.h" ++#include + + /* + * Enable events. 
This clears the event mask and tests the pending +@@ -38,13 +32,11 @@ ENTRY(xen_irq_enable_direct) + testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending + jz 1f + +-2: call check_events ++ call check_events + 1: +-ENDPATCH(xen_irq_enable_direct) + FRAME_END + ret + ENDPROC(xen_irq_enable_direct) +- RELOC(xen_irq_enable_direct, 2b+1) + + + /* +@@ -53,10 +45,8 @@ ENDPATCH(xen_irq_enable_direct) + */ + ENTRY(xen_irq_disable_direct) + movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask +-ENDPATCH(xen_irq_disable_direct) + ret +- ENDPROC(xen_irq_disable_direct) +- RELOC(xen_irq_disable_direct, 0) ++ENDPROC(xen_irq_disable_direct) + + /* + * (xen_)save_fl is used to get the current interrupt enable status. +@@ -71,10 +61,8 @@ ENTRY(xen_save_fl_direct) + testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask + setz %ah + addb %ah, %ah +-ENDPATCH(xen_save_fl_direct) + ret + ENDPROC(xen_save_fl_direct) +- RELOC(xen_save_fl_direct, 0) + + + /* +@@ -101,13 +89,11 @@ ENTRY(xen_restore_fl_direct) + /* check for unmasked and pending */ + cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending + jnz 1f +-2: call check_events ++ call check_events + 1: +-ENDPATCH(xen_restore_fl_direct) + FRAME_END + ret + ENDPROC(xen_restore_fl_direct) +- RELOC(xen_restore_fl_direct, 2b+1) + + + /* +diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S +index feb6d40a0860..1200e262a116 100644 +--- a/arch/x86/xen/xen-asm_32.S ++++ b/arch/x86/xen/xen-asm_32.S +@@ -1,14 +1,8 @@ + /* +- * Asm versions of Xen pv-ops, suitable for either direct use or +- * inlining. The inline versions are the same as the direct-use +- * versions, with the pre- and post-amble chopped off. +- * +- * This code is encoded for size rather than absolute efficiency, with +- * a view to being able to inline as much as possible. ++ * Asm versions of Xen pv-ops, suitable for direct use. 
+ * + * We only bother with direct forms (ie, vcpu in pda) of the +- * operations here; the indirect forms are better handled in C, since +- * they're generally too large to inline anyway. ++ * operations here; the indirect forms are better handled in C. + */ + + #include +@@ -18,21 +12,10 @@ + + #include + +-#include "xen-asm.h" ++#include + +-/* +- * Force an event check by making a hypercall, but preserve regs +- * before making the call. +- */ +-check_events: +- push %eax +- push %ecx +- push %edx +- call xen_force_evtchn_callback +- pop %edx +- pop %ecx +- pop %eax +- ret ++/* Pseudo-flag used for virtual NMI, which we don't implement yet */ ++#define XEN_EFLAGS_NMI 0x80000000 + + /* + * This is run where a normal iret would be run, with the same stack setup: +diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S +index c5fee2680abc..3a3b6a211584 100644 +--- a/arch/x86/xen/xen-asm_64.S ++++ b/arch/x86/xen/xen-asm_64.S +@@ -1,14 +1,8 @@ + /* +- * Asm versions of Xen pv-ops, suitable for either direct use or +- * inlining. The inline versions are the same as the direct-use +- * versions, with the pre- and post-amble chopped off. +- * +- * This code is encoded for size rather than absolute efficiency, with +- * a view to being able to inline as much as possible. ++ * Asm versions of Xen pv-ops, suitable for direct use. + * + * We only bother with direct forms (ie, vcpu in pda) of the +- * operations here; the indirect forms are better handled in C, since +- * they're generally too large to inline anyway. ++ * operations here; the indirect forms are better handled in C. 
+ */ + + #include +@@ -20,7 +14,7 @@ + + #include + +-#include "xen-asm.h" ++#include + + ENTRY(xen_adjust_exception_frame) + mov 8+0(%rsp), %rcx +@@ -46,9 +40,7 @@ hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 + */ + ENTRY(xen_iret) + pushq $0 +-1: jmp hypercall_iret +-ENDPATCH(xen_iret) +-RELOC(xen_iret, 1b+1) ++ jmp hypercall_iret + + ENTRY(xen_sysret64) + /* +@@ -65,9 +57,7 @@ ENTRY(xen_sysret64) + pushq %rcx + + pushq $VGCF_in_syscall +-1: jmp hypercall_iret +-ENDPATCH(xen_sysret64) +-RELOC(xen_sysret64, 1b+1) ++ jmp hypercall_iret + + /* + * Xen handles syscall callbacks much like ordinary exceptions, which +-- +2.14.2 + diff --git a/patches/kernel/0028-x86-traps-Simplify-pagefault-tracing-logic.patch b/patches/kernel/0028-x86-traps-Simplify-pagefault-tracing-logic.patch deleted file mode 100644 index 801d82f..0000000 --- a/patches/kernel/0028-x86-traps-Simplify-pagefault-tracing-logic.patch +++ /dev/null @@ -1,218 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Mon, 28 Aug 2017 08:47:22 +0200 -Subject: [PATCH] x86/traps: Simplify pagefault tracing logic -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Make use of the new irqvector tracing static key and remove the duplicated -trace_do_pagefault() implementation. - -If irq vector tracing is disabled, then the overhead of this is a single -NOP5, which is a reasonable tradeoff to avoid duplicated code and the -unholy macro mess. 
- -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Borislav Petkov -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Steven Rostedt -Link: http://lkml.kernel.org/r/20170828064956.672965407@linutronix.de -Signed-off-by: Ingo Molnar -(cherry picked from commit 11a7ffb01703c3bbb1e9b968893f4487a1b0b5a8) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 8478bb5608747fd64c9fd4a2f5422fb4af756a50) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/traps.h | 10 +-------- - arch/x86/kernel/kvm.c | 2 +- - arch/x86/mm/fault.c | 49 ++++++++++++-------------------------------- - arch/x86/entry/entry_32.S | 8 -------- - arch/x86/entry/entry_64.S | 13 +----------- - 5 files changed, 16 insertions(+), 66 deletions(-) - -diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h -index 01fd0a7f48cd..b4f322d6c95f 100644 ---- a/arch/x86/include/asm/traps.h -+++ b/arch/x86/include/asm/traps.h -@@ -39,7 +39,6 @@ asmlinkage void machine_check(void); - asmlinkage void simd_coprocessor_error(void); - - #ifdef CONFIG_TRACING --asmlinkage void trace_page_fault(void); - #define trace_stack_segment stack_segment - #define trace_divide_error divide_error - #define trace_bounds bounds -@@ -54,6 +53,7 @@ asmlinkage void trace_page_fault(void); - #define trace_alignment_check alignment_check - #define trace_simd_coprocessor_error simd_coprocessor_error - #define trace_async_page_fault async_page_fault -+#define trace_page_fault page_fault - #endif - - dotraplinkage void do_divide_error(struct pt_regs *, long); -@@ -74,14 +74,6 @@ asmlinkage struct pt_regs *sync_regs(struct pt_regs *); - #endif - dotraplinkage void do_general_protection(struct pt_regs *, long); - dotraplinkage void do_page_fault(struct pt_regs *, unsigned long); --#ifdef CONFIG_TRACING --dotraplinkage void trace_do_page_fault(struct pt_regs *, unsigned long); --#else --static inline void trace_do_page_fault(struct pt_regs *regs, unsigned long error) 
--{ -- do_page_fault(regs, error); --} --#endif - dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long); - dotraplinkage void do_coprocessor_error(struct pt_regs *, long); - dotraplinkage void do_alignment_check(struct pt_regs *, long); -diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c -index e5e4306e4546..9e3798b00e40 100644 ---- a/arch/x86/kernel/kvm.c -+++ b/arch/x86/kernel/kvm.c -@@ -270,7 +270,7 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code) - - switch (kvm_read_and_reset_pf_reason()) { - default: -- trace_do_page_fault(regs, error_code); -+ do_page_fault(regs, error_code); - break; - case KVM_PV_REASON_PAGE_NOT_PRESENT: - /* page is swapped out by the host. */ -diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c -index 955be01dd9cc..4ee9eb916826 100644 ---- a/arch/x86/mm/fault.c -+++ b/arch/x86/mm/fault.c -@@ -1253,10 +1253,6 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs) - * This routine handles page faults. It determines the address, - * and the problem, and then passes it off to one of the appropriate - * routines. -- * -- * This function must have noinline because both callers -- * {,trace_}do_page_fault() have notrace on. Having this an actual function -- * guarantees there's a function trace entry. - */ - static noinline void - __do_page_fault(struct pt_regs *regs, unsigned long error_code, -@@ -1491,27 +1487,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, - } - NOKPROBE_SYMBOL(__do_page_fault); - --dotraplinkage void notrace --do_page_fault(struct pt_regs *regs, unsigned long error_code) --{ -- unsigned long address = read_cr2(); /* Get the faulting address */ -- enum ctx_state prev_state; -- -- /* -- * We must have this function tagged with __kprobes, notrace and call -- * read_cr2() before calling anything else. To avoid calling any kind -- * of tracing machinery before we've observed the CR2 value. 
-- * -- * exception_{enter,exit}() contain all sorts of tracepoints. -- */ -- -- prev_state = exception_enter(); -- __do_page_fault(regs, error_code, address); -- exception_exit(prev_state); --} --NOKPROBE_SYMBOL(do_page_fault); -- --#ifdef CONFIG_TRACING - static nokprobe_inline void - trace_page_fault_entries(unsigned long address, struct pt_regs *regs, - unsigned long error_code) -@@ -1522,22 +1497,24 @@ trace_page_fault_entries(unsigned long address, struct pt_regs *regs, - trace_page_fault_kernel(address, regs, error_code); - } - -+/* -+ * We must have this function blacklisted from kprobes, tagged with notrace -+ * and call read_cr2() before calling anything else. To avoid calling any -+ * kind of tracing machinery before we've observed the CR2 value. -+ * -+ * exception_{enter,exit}() contains all sorts of tracepoints. -+ */ - dotraplinkage void notrace --trace_do_page_fault(struct pt_regs *regs, unsigned long error_code) -+do_page_fault(struct pt_regs *regs, unsigned long error_code) - { -- /* -- * The exception_enter and tracepoint processing could -- * trigger another page faults (user space callchain -- * reading) and destroy the original cr2 value, so read -- * the faulting address now. 
-- */ -- unsigned long address = read_cr2(); -+ unsigned long address = read_cr2(); /* Get the faulting address */ - enum ctx_state prev_state; - - prev_state = exception_enter(); -- trace_page_fault_entries(address, regs, error_code); -+ if (trace_irqvectors_enabled()) -+ trace_page_fault_entries(address, regs, error_code); -+ - __do_page_fault(regs, error_code, address); - exception_exit(prev_state); - } --NOKPROBE_SYMBOL(trace_do_page_fault); --#endif /* CONFIG_TRACING */ -+NOKPROBE_SYMBOL(do_page_fault); -diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S -index 48ef7bb32c42..0092da1c056f 100644 ---- a/arch/x86/entry/entry_32.S -+++ b/arch/x86/entry/entry_32.S -@@ -891,14 +891,6 @@ BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR, - - #endif /* CONFIG_HYPERV */ - --#ifdef CONFIG_TRACING --ENTRY(trace_page_fault) -- ASM_CLAC -- pushl $trace_do_page_fault -- jmp common_exception --END(trace_page_fault) --#endif -- - ENTRY(page_fault) - ASM_CLAC - pushl $do_page_fault -diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S -index ca0b250eefc4..dfabcbf8e813 100644 ---- a/arch/x86/entry/entry_64.S -+++ b/arch/x86/entry/entry_64.S -@@ -913,17 +913,6 @@ ENTRY(\sym) - END(\sym) - .endm - --#ifdef CONFIG_TRACING --.macro trace_idtentry sym do_sym has_error_code:req --idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code --idtentry \sym \do_sym has_error_code=\has_error_code --.endm --#else --.macro trace_idtentry sym do_sym has_error_code:req --idtentry \sym \do_sym has_error_code=\has_error_code --.endm --#endif -- - idtentry divide_error do_divide_error has_error_code=0 - idtentry overflow do_overflow has_error_code=0 - idtentry bounds do_bounds has_error_code=0 -@@ -1091,7 +1080,7 @@ idtentry xen_stack_segment do_stack_segment has_error_code=1 - #endif - - idtentry general_protection do_general_protection has_error_code=1 --trace_idtentry page_fault do_page_fault has_error_code=1 -+idtentry page_fault 
do_page_fault has_error_code=1 - - #ifdef CONFIG_KVM_GUEST - idtentry async_page_fault do_async_page_fault has_error_code=1 --- -2.14.2 - diff --git a/patches/kernel/0029-x86-idt-Unify-gate_struct-handling-for-32-64-bit-ker.patch b/patches/kernel/0029-x86-idt-Unify-gate_struct-handling-for-32-64-bit-ker.patch deleted file mode 100644 index 4cfc341..0000000 --- a/patches/kernel/0029-x86-idt-Unify-gate_struct-handling-for-32-64-bit-ker.patch +++ /dev/null @@ -1,262 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Mon, 28 Aug 2017 08:47:37 +0200 -Subject: [PATCH] x86/idt: Unify gate_struct handling for 32/64-bit kernels -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -The first 32 bits of gate struct are the same for 32 and 64 bit kernels. - -The 32-bit version uses desc_struct and no designated data structure, -so we need different accessors for 32 and 64 bit kernels. - -Aside of that the macros which are necessary to build the 32-bit -gate descriptor are horrible to read. - -Unify the gate structs and switch all code fiddling with it over. - -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Denys Vlasenko -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Steven Rostedt -Link: http://lkml.kernel.org/r/20170828064957.861974317@linutronix.de -Signed-off-by: Ingo Molnar -(cherry picked from commit 64b163fab684e3de47aa8db6cc08ae7d2e194373) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 587719b1926757eb7531e0631d63fb93cd60d0d3) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/desc.h | 45 ++++++++++++++----------------- - arch/x86/include/asm/desc_defs.h | 57 ++++++++++++++++++++++++++-------------- - arch/x86/kvm/vmx.c | 2 +- - arch/x86/xen/enlighten_pv.c | 12 ++++----- - 4 files changed, 63 insertions(+), 53 deletions(-) - -diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h -index d0a21b12dd58..57e502a4e92f 100644 ---- a/arch/x86/include/asm/desc.h -+++ b/arch/x86/include/asm/desc.h -@@ -83,33 +83,25 @@ static inline phys_addr_t get_cpu_gdt_paddr(unsigned int cpu) - return per_cpu_ptr_to_phys(get_cpu_gdt_rw(cpu)); - } - --#ifdef CONFIG_X86_64 -- - static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func, - unsigned dpl, unsigned ist, unsigned seg) - { -- gate->offset_low = PTR_LOW(func); -+ gate->offset_low = (u16) func; -+ gate->bits.p = 1; -+ gate->bits.dpl = dpl; -+ gate->bits.zero = 0; -+ gate->bits.type = type; -+ gate->offset_middle = (u16) (func >> 16); -+#ifdef CONFIG_X86_64 - gate->segment = __KERNEL_CS; -- gate->ist = ist; -- gate->p = 1; -- gate->dpl = dpl; -- gate->zero0 = 0; -- gate->zero1 = 0; -- gate->type = type; -- gate->offset_middle = PTR_MIDDLE(func); -- gate->offset_high = PTR_HIGH(func); --} -- -+ gate->bits.ist = ist; -+ gate->reserved = 0; -+ gate->offset_high = (u32) (func >> 32); - #else --static inline void pack_gate(gate_desc *gate, unsigned char type, -- unsigned long base, unsigned dpl, unsigned flags, -- unsigned short seg) --{ -- gate->a = (seg << 16) | (base & 0xffff); -- gate->b = (base & 0xffff0000) | 
(((0x80 | type | (dpl << 5)) & 0xff) << 8); --} -- -+ gate->segment = seg; -+ gate->bits.ist = 0; - #endif -+} - - static inline int desc_empty(const void *ptr) - { -@@ -185,7 +177,8 @@ static inline void pack_descriptor(struct desc_struct *desc, unsigned long base, - } - - --static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned type, unsigned size) -+static inline void set_tssldt_descriptor(void *d, unsigned long addr, -+ unsigned type, unsigned size) - { - #ifdef CONFIG_X86_64 - struct ldttss_desc64 *desc = d; -@@ -193,13 +186,13 @@ static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned t - memset(desc, 0, sizeof(*desc)); - - desc->limit0 = size & 0xFFFF; -- desc->base0 = PTR_LOW(addr); -- desc->base1 = PTR_MIDDLE(addr) & 0xFF; -+ desc->base0 = (u16) addr; -+ desc->base1 = (addr >> 16) & 0xFF; - desc->type = type; - desc->p = 1; - desc->limit1 = (size >> 16) & 0xF; -- desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF; -- desc->base3 = PTR_HIGH(addr); -+ desc->base2 = (addr >> 24) & 0xFF; -+ desc->base3 = (u32) (addr >> 32); - #else - pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0); - #endif -diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h -index 49265345d4d2..d684bee8a59a 100644 ---- a/arch/x86/include/asm/desc_defs.h -+++ b/arch/x86/include/asm/desc_defs.h -@@ -47,20 +47,6 @@ enum { - GATE_TASK = 0x5, - }; - --/* 16byte gate */ --struct gate_struct64 { -- u16 offset_low; -- u16 segment; -- unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1; -- u16 offset_middle; -- u32 offset_high; -- u32 zero1; --} __attribute__((packed)); -- --#define PTR_LOW(x) ((unsigned long long)(x) & 0xFFFF) --#define PTR_MIDDLE(x) (((unsigned long long)(x) >> 16) & 0xFFFF) --#define PTR_HIGH(x) ((unsigned long long)(x) >> 32) -- - enum { - DESC_TSS = 0x9, - DESC_LDT = 0x2, -@@ -77,20 +63,51 @@ struct ldttss_desc64 { - u32 zero1; - } __attribute__((packed)); - -+ - #ifdef CONFIG_X86_64 
--typedef struct gate_struct64 gate_desc; - typedef struct ldttss_desc64 ldt_desc; - typedef struct ldttss_desc64 tss_desc; --#define gate_offset(g) ((g).offset_low | ((unsigned long)(g).offset_middle << 16) | ((unsigned long)(g).offset_high << 32)) --#define gate_segment(g) ((g).segment) - #else --typedef struct desc_struct gate_desc; - typedef struct desc_struct ldt_desc; - typedef struct desc_struct tss_desc; --#define gate_offset(g) (((g).b & 0xffff0000) | ((g).a & 0x0000ffff)) --#define gate_segment(g) ((g).a >> 16) - #endif - -+struct idt_bits { -+ u16 ist : 3, -+ zero : 5, -+ type : 5, -+ dpl : 2, -+ p : 1; -+} __attribute__((packed)); -+ -+struct gate_struct { -+ u16 offset_low; -+ u16 segment; -+ struct idt_bits bits; -+ u16 offset_middle; -+#ifdef CONFIG_X86_64 -+ u32 offset_high; -+ u32 reserved; -+#endif -+} __attribute__((packed)); -+ -+typedef struct gate_struct gate_desc; -+ -+static inline unsigned long gate_offset(const gate_desc *g) -+{ -+#ifdef CONFIG_X86_64 -+ return g->offset_low | ((unsigned long)g->offset_middle << 16) | -+ ((unsigned long) g->offset_high << 32); -+#else -+ return g->offset_low | ((unsigned long)g->offset_middle << 16); -+#endif -+} -+ -+static inline unsigned long gate_segment(const gate_desc *g) -+{ -+ return g->segment; -+} -+ - struct desc_ptr { - unsigned short size; - unsigned long address; -diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c -index a2c95522ac99..7b447d126d17 100644 ---- a/arch/x86/kvm/vmx.c -+++ b/arch/x86/kvm/vmx.c -@@ -8838,7 +8838,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) - - vector = exit_intr_info & INTR_INFO_VECTOR_MASK; - desc = (gate_desc *)vmx->host_idt_base + vector; -- entry = gate_offset(*desc); -+ entry = gate_offset(desc); - asm volatile( - #ifdef CONFIG_X86_64 - "mov %%" _ASM_SP ", %[sp]\n\t" -diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c -index 6c279c8f0a0e..49ee3315b9f7 100644 ---- a/arch/x86/xen/enlighten_pv.c -+++ 
b/arch/x86/xen/enlighten_pv.c -@@ -591,12 +591,12 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val, - { - unsigned long addr; - -- if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT) -+ if (val->bits.type != GATE_TRAP && val->bits.type != GATE_INTERRUPT) - return 0; - - info->vector = vector; - -- addr = gate_offset(*val); -+ addr = gate_offset(val); - #ifdef CONFIG_X86_64 - /* - * Look for known traps using IST, and substitute them -@@ -629,16 +629,16 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val, - ; - else { - /* Some other trap using IST? */ -- if (WARN_ON(val->ist != 0)) -+ if (WARN_ON(val->bits.ist != 0)) - return 0; - } - #endif /* CONFIG_X86_64 */ - info->address = addr; - -- info->cs = gate_segment(*val); -- info->flags = val->dpl; -+ info->cs = gate_segment(val); -+ info->flags = val->bits.dpl; - /* interrupt gates clear IF */ -- if (val->type == GATE_INTERRUPT) -+ if (val->bits.type == GATE_INTERRUPT) - info->flags |= 1 << 2; - - return 1; --- -2.14.2 - diff --git a/patches/kernel/0029-x86-traps-Simplify-pagefault-tracing-logic.patch b/patches/kernel/0029-x86-traps-Simplify-pagefault-tracing-logic.patch new file mode 100644 index 0000000..801d82f --- /dev/null +++ b/patches/kernel/0029-x86-traps-Simplify-pagefault-tracing-logic.patch @@ -0,0 +1,218 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Mon, 28 Aug 2017 08:47:22 +0200 +Subject: [PATCH] x86/traps: Simplify pagefault tracing logic +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Make use of the new irqvector tracing static key and remove the duplicated +trace_do_pagefault() implementation. + +If irq vector tracing is disabled, then the overhead of this is a single +NOP5, which is a reasonable tradeoff to avoid duplicated code and the +unholy macro mess. 
+ +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Borislav Petkov +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Steven Rostedt +Link: http://lkml.kernel.org/r/20170828064956.672965407@linutronix.de +Signed-off-by: Ingo Molnar +(cherry picked from commit 11a7ffb01703c3bbb1e9b968893f4487a1b0b5a8) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 8478bb5608747fd64c9fd4a2f5422fb4af756a50) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/traps.h | 10 +-------- + arch/x86/kernel/kvm.c | 2 +- + arch/x86/mm/fault.c | 49 ++++++++++++-------------------------------- + arch/x86/entry/entry_32.S | 8 -------- + arch/x86/entry/entry_64.S | 13 +----------- + 5 files changed, 16 insertions(+), 66 deletions(-) + +diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h +index 01fd0a7f48cd..b4f322d6c95f 100644 +--- a/arch/x86/include/asm/traps.h ++++ b/arch/x86/include/asm/traps.h +@@ -39,7 +39,6 @@ asmlinkage void machine_check(void); + asmlinkage void simd_coprocessor_error(void); + + #ifdef CONFIG_TRACING +-asmlinkage void trace_page_fault(void); + #define trace_stack_segment stack_segment + #define trace_divide_error divide_error + #define trace_bounds bounds +@@ -54,6 +53,7 @@ asmlinkage void trace_page_fault(void); + #define trace_alignment_check alignment_check + #define trace_simd_coprocessor_error simd_coprocessor_error + #define trace_async_page_fault async_page_fault ++#define trace_page_fault page_fault + #endif + + dotraplinkage void do_divide_error(struct pt_regs *, long); +@@ -74,14 +74,6 @@ asmlinkage struct pt_regs *sync_regs(struct pt_regs *); + #endif + dotraplinkage void do_general_protection(struct pt_regs *, long); + dotraplinkage void do_page_fault(struct pt_regs *, unsigned long); +-#ifdef CONFIG_TRACING +-dotraplinkage void trace_do_page_fault(struct pt_regs *, unsigned long); +-#else +-static inline void trace_do_page_fault(struct pt_regs *regs, unsigned long error) 
+-{ +- do_page_fault(regs, error); +-} +-#endif + dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long); + dotraplinkage void do_coprocessor_error(struct pt_regs *, long); + dotraplinkage void do_alignment_check(struct pt_regs *, long); +diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c +index e5e4306e4546..9e3798b00e40 100644 +--- a/arch/x86/kernel/kvm.c ++++ b/arch/x86/kernel/kvm.c +@@ -270,7 +270,7 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code) + + switch (kvm_read_and_reset_pf_reason()) { + default: +- trace_do_page_fault(regs, error_code); ++ do_page_fault(regs, error_code); + break; + case KVM_PV_REASON_PAGE_NOT_PRESENT: + /* page is swapped out by the host. */ +diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c +index 955be01dd9cc..4ee9eb916826 100644 +--- a/arch/x86/mm/fault.c ++++ b/arch/x86/mm/fault.c +@@ -1253,10 +1253,6 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs) + * This routine handles page faults. It determines the address, + * and the problem, and then passes it off to one of the appropriate + * routines. +- * +- * This function must have noinline because both callers +- * {,trace_}do_page_fault() have notrace on. Having this an actual function +- * guarantees there's a function trace entry. + */ + static noinline void + __do_page_fault(struct pt_regs *regs, unsigned long error_code, +@@ -1491,27 +1487,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, + } + NOKPROBE_SYMBOL(__do_page_fault); + +-dotraplinkage void notrace +-do_page_fault(struct pt_regs *regs, unsigned long error_code) +-{ +- unsigned long address = read_cr2(); /* Get the faulting address */ +- enum ctx_state prev_state; +- +- /* +- * We must have this function tagged with __kprobes, notrace and call +- * read_cr2() before calling anything else. To avoid calling any kind +- * of tracing machinery before we've observed the CR2 value. 
+- * +- * exception_{enter,exit}() contain all sorts of tracepoints. +- */ +- +- prev_state = exception_enter(); +- __do_page_fault(regs, error_code, address); +- exception_exit(prev_state); +-} +-NOKPROBE_SYMBOL(do_page_fault); +- +-#ifdef CONFIG_TRACING + static nokprobe_inline void + trace_page_fault_entries(unsigned long address, struct pt_regs *regs, + unsigned long error_code) +@@ -1522,22 +1497,24 @@ trace_page_fault_entries(unsigned long address, struct pt_regs *regs, + trace_page_fault_kernel(address, regs, error_code); + } + ++/* ++ * We must have this function blacklisted from kprobes, tagged with notrace ++ * and call read_cr2() before calling anything else. To avoid calling any ++ * kind of tracing machinery before we've observed the CR2 value. ++ * ++ * exception_{enter,exit}() contains all sorts of tracepoints. ++ */ + dotraplinkage void notrace +-trace_do_page_fault(struct pt_regs *regs, unsigned long error_code) ++do_page_fault(struct pt_regs *regs, unsigned long error_code) + { +- /* +- * The exception_enter and tracepoint processing could +- * trigger another page faults (user space callchain +- * reading) and destroy the original cr2 value, so read +- * the faulting address now. 
+- */ +- unsigned long address = read_cr2(); ++ unsigned long address = read_cr2(); /* Get the faulting address */ + enum ctx_state prev_state; + + prev_state = exception_enter(); +- trace_page_fault_entries(address, regs, error_code); ++ if (trace_irqvectors_enabled()) ++ trace_page_fault_entries(address, regs, error_code); ++ + __do_page_fault(regs, error_code, address); + exception_exit(prev_state); + } +-NOKPROBE_SYMBOL(trace_do_page_fault); +-#endif /* CONFIG_TRACING */ ++NOKPROBE_SYMBOL(do_page_fault); +diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S +index 48ef7bb32c42..0092da1c056f 100644 +--- a/arch/x86/entry/entry_32.S ++++ b/arch/x86/entry/entry_32.S +@@ -891,14 +891,6 @@ BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR, + + #endif /* CONFIG_HYPERV */ + +-#ifdef CONFIG_TRACING +-ENTRY(trace_page_fault) +- ASM_CLAC +- pushl $trace_do_page_fault +- jmp common_exception +-END(trace_page_fault) +-#endif +- + ENTRY(page_fault) + ASM_CLAC + pushl $do_page_fault +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index ca0b250eefc4..dfabcbf8e813 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -913,17 +913,6 @@ ENTRY(\sym) + END(\sym) + .endm + +-#ifdef CONFIG_TRACING +-.macro trace_idtentry sym do_sym has_error_code:req +-idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code +-idtentry \sym \do_sym has_error_code=\has_error_code +-.endm +-#else +-.macro trace_idtentry sym do_sym has_error_code:req +-idtentry \sym \do_sym has_error_code=\has_error_code +-.endm +-#endif +- + idtentry divide_error do_divide_error has_error_code=0 + idtentry overflow do_overflow has_error_code=0 + idtentry bounds do_bounds has_error_code=0 +@@ -1091,7 +1080,7 @@ idtentry xen_stack_segment do_stack_segment has_error_code=1 + #endif + + idtentry general_protection do_general_protection has_error_code=1 +-trace_idtentry page_fault do_page_fault has_error_code=1 ++idtentry page_fault 
do_page_fault has_error_code=1 + + #ifdef CONFIG_KVM_GUEST + idtentry async_page_fault do_async_page_fault has_error_code=1 +-- +2.14.2 + diff --git a/patches/kernel/0030-x86-asm-Replace-access-to-desc_struct-a-b-fields.patch b/patches/kernel/0030-x86-asm-Replace-access-to-desc_struct-a-b-fields.patch deleted file mode 100644 index 11a91a1..0000000 --- a/patches/kernel/0030-x86-asm-Replace-access-to-desc_struct-a-b-fields.patch +++ /dev/null @@ -1,93 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Mon, 28 Aug 2017 08:47:40 +0200 -Subject: [PATCH] x86/asm: Replace access to desc_struct:a/b fields -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -The union inside of desc_struct which allows access to the raw u32 parts of -the descriptors. This raw access part is about to go away. - -Replace the few code parts which access those fields. - -Signed-off-by: Thomas Gleixner -Reviewed-by: Boris Ostrovsky -Cc: Andy Lutomirski -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Denys Vlasenko -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Steven Rostedt -Link: http://lkml.kernel.org/r/20170828064958.120214366@linutronix.de -Signed-off-by: Ingo Molnar -(cherry picked from commit 9a98e7780022aa7cd201eb8a88a4f1d607b73cde) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 8469c76c61ea9c3b86b596352d1148bace5ea706) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/xen/hypercall.h | 6 ++++-- - arch/x86/kernel/tls.c | 2 +- - arch/x86/xen/enlighten_pv.c | 2 +- - 3 files changed, 6 insertions(+), 4 deletions(-) - -diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h -index 11071fcd630e..9606688caa4b 100644 ---- a/arch/x86/include/asm/xen/hypercall.h -+++ b/arch/x86/include/asm/xen/hypercall.h -@@ -552,6 +552,8 @@ static inline void - MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr, - struct desc_struct desc) - { -+ u32 *p = (u32 *) &desc; -+ - mcl->op = __HYPERVISOR_update_descriptor; - if (sizeof(maddr) == sizeof(long)) { - mcl->args[0] = maddr; -@@ -559,8 +561,8 @@ MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr, - } else { - mcl->args[0] = maddr; - mcl->args[1] = maddr >> 32; -- mcl->args[2] = desc.a; -- mcl->args[3] = desc.b; -+ mcl->args[2] = *p++; -+ mcl->args[3] = *p; - } - - trace_xen_mc_entry(mcl, sizeof(maddr) == sizeof(long) ? 
2 : 4); -diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c -index dcd699baea1b..a106b9719c58 100644 ---- a/arch/x86/kernel/tls.c -+++ b/arch/x86/kernel/tls.c -@@ -93,7 +93,7 @@ static void set_tls_desc(struct task_struct *p, int idx, - - while (n-- > 0) { - if (LDT_empty(info) || LDT_zero(info)) { -- desc->a = desc->b = 0; -+ memset(desc, 0, sizeof(*desc)); - } else { - fill_ldt(desc, info); - -diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c -index 49ee3315b9f7..c76f5ff4d0d7 100644 ---- a/arch/x86/xen/enlighten_pv.c -+++ b/arch/x86/xen/enlighten_pv.c -@@ -501,7 +501,7 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr) - static inline bool desc_equal(const struct desc_struct *d1, - const struct desc_struct *d2) - { -- return d1->a == d2->a && d1->b == d2->b; -+ return !memcmp(d1, d2, sizeof(*d1)); - } - - static void load_TLS_descriptor(struct thread_struct *t, --- -2.14.2 - diff --git a/patches/kernel/0030-x86-idt-Unify-gate_struct-handling-for-32-64-bit-ker.patch b/patches/kernel/0030-x86-idt-Unify-gate_struct-handling-for-32-64-bit-ker.patch new file mode 100644 index 0000000..4cfc341 --- /dev/null +++ b/patches/kernel/0030-x86-idt-Unify-gate_struct-handling-for-32-64-bit-ker.patch @@ -0,0 +1,262 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Mon, 28 Aug 2017 08:47:37 +0200 +Subject: [PATCH] x86/idt: Unify gate_struct handling for 32/64-bit kernels +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +The first 32 bits of gate struct are the same for 32 and 64 bit kernels. + +The 32-bit version uses desc_struct and no designated data structure, +so we need different accessors for 32 and 64 bit kernels. + +Aside of that the macros which are necessary to build the 32-bit +gate descriptor are horrible to read. + +Unify the gate structs and switch all code fiddling with it over. 
+ +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Denys Vlasenko +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Steven Rostedt +Link: http://lkml.kernel.org/r/20170828064957.861974317@linutronix.de +Signed-off-by: Ingo Molnar +(cherry picked from commit 64b163fab684e3de47aa8db6cc08ae7d2e194373) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 587719b1926757eb7531e0631d63fb93cd60d0d3) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/desc.h | 45 ++++++++++++++----------------- + arch/x86/include/asm/desc_defs.h | 57 ++++++++++++++++++++++++++-------------- + arch/x86/kvm/vmx.c | 2 +- + arch/x86/xen/enlighten_pv.c | 12 ++++----- + 4 files changed, 63 insertions(+), 53 deletions(-) + +diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h +index d0a21b12dd58..57e502a4e92f 100644 +--- a/arch/x86/include/asm/desc.h ++++ b/arch/x86/include/asm/desc.h +@@ -83,33 +83,25 @@ static inline phys_addr_t get_cpu_gdt_paddr(unsigned int cpu) + return per_cpu_ptr_to_phys(get_cpu_gdt_rw(cpu)); + } + +-#ifdef CONFIG_X86_64 +- + static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func, + unsigned dpl, unsigned ist, unsigned seg) + { +- gate->offset_low = PTR_LOW(func); ++ gate->offset_low = (u16) func; ++ gate->bits.p = 1; ++ gate->bits.dpl = dpl; ++ gate->bits.zero = 0; ++ gate->bits.type = type; ++ gate->offset_middle = (u16) (func >> 16); ++#ifdef CONFIG_X86_64 + gate->segment = __KERNEL_CS; +- gate->ist = ist; +- gate->p = 1; +- gate->dpl = dpl; +- gate->zero0 = 0; +- gate->zero1 = 0; +- gate->type = type; +- gate->offset_middle = PTR_MIDDLE(func); +- gate->offset_high = PTR_HIGH(func); +-} +- ++ gate->bits.ist = ist; ++ gate->reserved = 0; ++ gate->offset_high = (u32) (func >> 32); + #else +-static inline void pack_gate(gate_desc *gate, unsigned char type, +- unsigned long base, unsigned dpl, 
unsigned flags, +- unsigned short seg) +-{ +- gate->a = (seg << 16) | (base & 0xffff); +- gate->b = (base & 0xffff0000) | (((0x80 | type | (dpl << 5)) & 0xff) << 8); +-} +- ++ gate->segment = seg; ++ gate->bits.ist = 0; + #endif ++} + + static inline int desc_empty(const void *ptr) + { +@@ -185,7 +177,8 @@ static inline void pack_descriptor(struct desc_struct *desc, unsigned long base, + } + + +-static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned type, unsigned size) ++static inline void set_tssldt_descriptor(void *d, unsigned long addr, ++ unsigned type, unsigned size) + { + #ifdef CONFIG_X86_64 + struct ldttss_desc64 *desc = d; +@@ -193,13 +186,13 @@ static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned t + memset(desc, 0, sizeof(*desc)); + + desc->limit0 = size & 0xFFFF; +- desc->base0 = PTR_LOW(addr); +- desc->base1 = PTR_MIDDLE(addr) & 0xFF; ++ desc->base0 = (u16) addr; ++ desc->base1 = (addr >> 16) & 0xFF; + desc->type = type; + desc->p = 1; + desc->limit1 = (size >> 16) & 0xF; +- desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF; +- desc->base3 = PTR_HIGH(addr); ++ desc->base2 = (addr >> 24) & 0xFF; ++ desc->base3 = (u32) (addr >> 32); + #else + pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0); + #endif +diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h +index 49265345d4d2..d684bee8a59a 100644 +--- a/arch/x86/include/asm/desc_defs.h ++++ b/arch/x86/include/asm/desc_defs.h +@@ -47,20 +47,6 @@ enum { + GATE_TASK = 0x5, + }; + +-/* 16byte gate */ +-struct gate_struct64 { +- u16 offset_low; +- u16 segment; +- unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1; +- u16 offset_middle; +- u32 offset_high; +- u32 zero1; +-} __attribute__((packed)); +- +-#define PTR_LOW(x) ((unsigned long long)(x) & 0xFFFF) +-#define PTR_MIDDLE(x) (((unsigned long long)(x) >> 16) & 0xFFFF) +-#define PTR_HIGH(x) ((unsigned long long)(x) >> 32) +- + enum { + DESC_TSS = 0x9, + DESC_LDT = 
0x2, +@@ -77,20 +63,51 @@ struct ldttss_desc64 { + u32 zero1; + } __attribute__((packed)); + ++ + #ifdef CONFIG_X86_64 +-typedef struct gate_struct64 gate_desc; + typedef struct ldttss_desc64 ldt_desc; + typedef struct ldttss_desc64 tss_desc; +-#define gate_offset(g) ((g).offset_low | ((unsigned long)(g).offset_middle << 16) | ((unsigned long)(g).offset_high << 32)) +-#define gate_segment(g) ((g).segment) + #else +-typedef struct desc_struct gate_desc; + typedef struct desc_struct ldt_desc; + typedef struct desc_struct tss_desc; +-#define gate_offset(g) (((g).b & 0xffff0000) | ((g).a & 0x0000ffff)) +-#define gate_segment(g) ((g).a >> 16) + #endif + ++struct idt_bits { ++ u16 ist : 3, ++ zero : 5, ++ type : 5, ++ dpl : 2, ++ p : 1; ++} __attribute__((packed)); ++ ++struct gate_struct { ++ u16 offset_low; ++ u16 segment; ++ struct idt_bits bits; ++ u16 offset_middle; ++#ifdef CONFIG_X86_64 ++ u32 offset_high; ++ u32 reserved; ++#endif ++} __attribute__((packed)); ++ ++typedef struct gate_struct gate_desc; ++ ++static inline unsigned long gate_offset(const gate_desc *g) ++{ ++#ifdef CONFIG_X86_64 ++ return g->offset_low | ((unsigned long)g->offset_middle << 16) | ++ ((unsigned long) g->offset_high << 32); ++#else ++ return g->offset_low | ((unsigned long)g->offset_middle << 16); ++#endif ++} ++ ++static inline unsigned long gate_segment(const gate_desc *g) ++{ ++ return g->segment; ++} ++ + struct desc_ptr { + unsigned short size; + unsigned long address; +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c +index a2c95522ac99..7b447d126d17 100644 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -8838,7 +8838,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) + + vector = exit_intr_info & INTR_INFO_VECTOR_MASK; + desc = (gate_desc *)vmx->host_idt_base + vector; +- entry = gate_offset(*desc); ++ entry = gate_offset(desc); + asm volatile( + #ifdef CONFIG_X86_64 + "mov %%" _ASM_SP ", %[sp]\n\t" +diff --git a/arch/x86/xen/enlighten_pv.c 
b/arch/x86/xen/enlighten_pv.c +index 6c279c8f0a0e..49ee3315b9f7 100644 +--- a/arch/x86/xen/enlighten_pv.c ++++ b/arch/x86/xen/enlighten_pv.c +@@ -591,12 +591,12 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val, + { + unsigned long addr; + +- if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT) ++ if (val->bits.type != GATE_TRAP && val->bits.type != GATE_INTERRUPT) + return 0; + + info->vector = vector; + +- addr = gate_offset(*val); ++ addr = gate_offset(val); + #ifdef CONFIG_X86_64 + /* + * Look for known traps using IST, and substitute them +@@ -629,16 +629,16 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val, + ; + else { + /* Some other trap using IST? */ +- if (WARN_ON(val->ist != 0)) ++ if (WARN_ON(val->bits.ist != 0)) + return 0; + } + #endif /* CONFIG_X86_64 */ + info->address = addr; + +- info->cs = gate_segment(*val); +- info->flags = val->dpl; ++ info->cs = gate_segment(val); ++ info->flags = val->bits.dpl; + /* interrupt gates clear IF */ +- if (val->type == GATE_INTERRUPT) ++ if (val->bits.type == GATE_INTERRUPT) + info->flags |= 1 << 2; + + return 1; +-- +2.14.2 + diff --git a/patches/kernel/0031-x86-asm-Replace-access-to-desc_struct-a-b-fields.patch b/patches/kernel/0031-x86-asm-Replace-access-to-desc_struct-a-b-fields.patch new file mode 100644 index 0000000..11a91a1 --- /dev/null +++ b/patches/kernel/0031-x86-asm-Replace-access-to-desc_struct-a-b-fields.patch @@ -0,0 +1,93 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Mon, 28 Aug 2017 08:47:40 +0200 +Subject: [PATCH] x86/asm: Replace access to desc_struct:a/b fields +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +The union inside of desc_struct which allows access to the raw u32 parts of +the descriptors. This raw access part is about to go away. + +Replace the few code parts which access those fields. 
+ +Signed-off-by: Thomas Gleixner +Reviewed-by: Boris Ostrovsky +Cc: Andy Lutomirski +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Denys Vlasenko +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Steven Rostedt +Link: http://lkml.kernel.org/r/20170828064958.120214366@linutronix.de +Signed-off-by: Ingo Molnar +(cherry picked from commit 9a98e7780022aa7cd201eb8a88a4f1d607b73cde) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 8469c76c61ea9c3b86b596352d1148bace5ea706) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/xen/hypercall.h | 6 ++++-- + arch/x86/kernel/tls.c | 2 +- + arch/x86/xen/enlighten_pv.c | 2 +- + 3 files changed, 6 insertions(+), 4 deletions(-) + +diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h +index 11071fcd630e..9606688caa4b 100644 +--- a/arch/x86/include/asm/xen/hypercall.h ++++ b/arch/x86/include/asm/xen/hypercall.h +@@ -552,6 +552,8 @@ static inline void + MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr, + struct desc_struct desc) + { ++ u32 *p = (u32 *) &desc; ++ + mcl->op = __HYPERVISOR_update_descriptor; + if (sizeof(maddr) == sizeof(long)) { + mcl->args[0] = maddr; +@@ -559,8 +561,8 @@ MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr, + } else { + mcl->args[0] = maddr; + mcl->args[1] = maddr >> 32; +- mcl->args[2] = desc.a; +- mcl->args[3] = desc.b; ++ mcl->args[2] = *p++; ++ mcl->args[3] = *p; + } + + trace_xen_mc_entry(mcl, sizeof(maddr) == sizeof(long) ? 
2 : 4); +diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c +index dcd699baea1b..a106b9719c58 100644 +--- a/arch/x86/kernel/tls.c ++++ b/arch/x86/kernel/tls.c +@@ -93,7 +93,7 @@ static void set_tls_desc(struct task_struct *p, int idx, + + while (n-- > 0) { + if (LDT_empty(info) || LDT_zero(info)) { +- desc->a = desc->b = 0; ++ memset(desc, 0, sizeof(*desc)); + } else { + fill_ldt(desc, info); + +diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c +index 49ee3315b9f7..c76f5ff4d0d7 100644 +--- a/arch/x86/xen/enlighten_pv.c ++++ b/arch/x86/xen/enlighten_pv.c +@@ -501,7 +501,7 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr) + static inline bool desc_equal(const struct desc_struct *d1, + const struct desc_struct *d2) + { +- return d1->a == d2->a && d1->b == d2->b; ++ return !memcmp(d1, d2, sizeof(*d1)); + } + + static void load_TLS_descriptor(struct thread_struct *t, +-- +2.14.2 + diff --git a/patches/kernel/0031-x86-xen-Get-rid-of-paravirt-op-adjust_exception_fram.patch b/patches/kernel/0031-x86-xen-Get-rid-of-paravirt-op-adjust_exception_fram.patch deleted file mode 100644 index c58c59c..0000000 --- a/patches/kernel/0031-x86-xen-Get-rid-of-paravirt-op-adjust_exception_fram.patch +++ /dev/null @@ -1,436 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Juergen Gross -Date: Thu, 31 Aug 2017 19:42:49 +0200 -Subject: [PATCH] x86/xen: Get rid of paravirt op adjust_exception_frame -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -When running as Xen pv-guest the exception frame on the stack contains -%r11 and %rcx additional to the other data pushed by the processor. - -Instead of having a paravirt op being called for each exception type -prepend the Xen specific code to each exception entry. 
When running as -Xen pv-guest just use the exception entry with prepended instructions, -otherwise use the entry without the Xen specific code. - -[ tglx: Merged through tip to avoid ugly merge conflict ] - -Signed-off-by: Juergen Gross -Signed-off-by: Thomas Gleixner -Cc: xen-devel@lists.xenproject.org -Cc: boris.ostrovsky@oracle.com -Cc: luto@amacapital.net -Link: http://lkml.kernel.org/r/20170831174249.26853-1-jg@pfupf.net -(backported from commit 5878d5d6fdef6447d73b0acc121ba445bef37f53) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 9a6fb927deb3ebbe831741ca82081714637181a7) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/paravirt.h | 5 -- - arch/x86/include/asm/paravirt_types.h | 3 -- - arch/x86/include/asm/proto.h | 3 ++ - arch/x86/include/asm/traps.h | 28 ++++++++-- - arch/x86/xen/xen-ops.h | 1 - - arch/x86/kernel/asm-offsets_64.c | 1 - - arch/x86/kernel/paravirt.c | 3 -- - arch/x86/xen/enlighten_pv.c | 98 +++++++++++++++++++++++------------ - arch/x86/xen/irq.c | 3 -- - arch/x86/entry/entry_64.S | 23 ++------ - arch/x86/entry/entry_64_compat.S | 1 - - arch/x86/xen/xen-asm_64.S | 41 +++++++++++++-- - 12 files changed, 133 insertions(+), 77 deletions(-) - -diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h -index 9ccac1926587..c25dd22f7c70 100644 ---- a/arch/x86/include/asm/paravirt.h -+++ b/arch/x86/include/asm/paravirt.h -@@ -960,11 +960,6 @@ extern void default_banner(void); - #define GET_CR2_INTO_RAX \ - call PARA_INDIRECT(pv_mmu_ops+PV_MMU_read_cr2) - --#define PARAVIRT_ADJUST_EXCEPTION_FRAME \ -- PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_adjust_exception_frame), \ -- CLBR_NONE, \ -- call PARA_INDIRECT(pv_irq_ops+PV_IRQ_adjust_exception_frame)) -- - #define USERGS_SYSRET64 \ - PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \ - CLBR_NONE, \ -diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h -index 
9ffc36bfe4cd..6b64fc6367f2 100644 ---- a/arch/x86/include/asm/paravirt_types.h -+++ b/arch/x86/include/asm/paravirt_types.h -@@ -196,9 +196,6 @@ struct pv_irq_ops { - void (*safe_halt)(void); - void (*halt)(void); - --#ifdef CONFIG_X86_64 -- void (*adjust_exception_frame)(void); --#endif - } __no_randomize_layout; - - struct pv_mmu_ops { -diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h -index 8d3964fc5f91..b408b1886195 100644 ---- a/arch/x86/include/asm/proto.h -+++ b/arch/x86/include/asm/proto.h -@@ -24,6 +24,9 @@ void entry_SYSENTER_compat(void); - void __end_entry_SYSENTER_compat(void); - void entry_SYSCALL_compat(void); - void entry_INT80_compat(void); -+#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) -+void xen_entry_INT80_compat(void); -+#endif - #endif - - void x86_configure_nx(void); -diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h -index b4f322d6c95f..feb89dbe359d 100644 ---- a/arch/x86/include/asm/traps.h -+++ b/arch/x86/include/asm/traps.h -@@ -13,9 +13,6 @@ asmlinkage void divide_error(void); - asmlinkage void debug(void); - asmlinkage void nmi(void); - asmlinkage void int3(void); --asmlinkage void xen_debug(void); --asmlinkage void xen_int3(void); --asmlinkage void xen_stack_segment(void); - asmlinkage void overflow(void); - asmlinkage void bounds(void); - asmlinkage void invalid_op(void); -@@ -56,6 +53,31 @@ asmlinkage void simd_coprocessor_error(void); - #define trace_page_fault page_fault - #endif - -+#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) -+asmlinkage void xen_divide_error(void); -+asmlinkage void xen_xendebug(void); -+asmlinkage void xen_xenint3(void); -+asmlinkage void xen_nmi(void); -+asmlinkage void xen_overflow(void); -+asmlinkage void xen_bounds(void); -+asmlinkage void xen_invalid_op(void); -+asmlinkage void xen_device_not_available(void); -+asmlinkage void xen_double_fault(void); -+asmlinkage void xen_coprocessor_segment_overrun(void); -+asmlinkage void 
xen_invalid_TSS(void); -+asmlinkage void xen_segment_not_present(void); -+asmlinkage void xen_stack_segment(void); -+asmlinkage void xen_general_protection(void); -+asmlinkage void xen_page_fault(void); -+asmlinkage void xen_spurious_interrupt_bug(void); -+asmlinkage void xen_coprocessor_error(void); -+asmlinkage void xen_alignment_check(void); -+#ifdef CONFIG_X86_MCE -+asmlinkage void xen_machine_check(void); -+#endif /* CONFIG_X86_MCE */ -+asmlinkage void xen_simd_coprocessor_error(void); -+#endif -+ - dotraplinkage void do_divide_error(struct pt_regs *, long); - dotraplinkage void do_debug(struct pt_regs *, long); - dotraplinkage void do_nmi(struct pt_regs *, long); -diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h -index 70301ac0d414..c8a6d224f7ed 100644 ---- a/arch/x86/xen/xen-ops.h -+++ b/arch/x86/xen/xen-ops.h -@@ -138,7 +138,6 @@ __visible void xen_restore_fl_direct(unsigned long); - __visible void xen_iret(void); - __visible void xen_sysret32(void); - __visible void xen_sysret64(void); --__visible void xen_adjust_exception_frame(void); - - extern int xen_panic_handler_init(void); - -diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c -index 99332f550c48..cf42206926af 100644 ---- a/arch/x86/kernel/asm-offsets_64.c -+++ b/arch/x86/kernel/asm-offsets_64.c -@@ -20,7 +20,6 @@ static char syscalls_ia32[] = { - int main(void) - { - #ifdef CONFIG_PARAVIRT -- OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame); - OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); - OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); - BLANK(); -diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c -index bc0a849589bb..a14df9eecfed 100644 ---- a/arch/x86/kernel/paravirt.c -+++ b/arch/x86/kernel/paravirt.c -@@ -319,9 +319,6 @@ __visible struct pv_irq_ops pv_irq_ops = { - .irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable), - .safe_halt = native_safe_halt, - .halt = native_halt, --#ifdef CONFIG_X86_64 -- 
.adjust_exception_frame = paravirt_nop, --#endif - }; - - __visible struct pv_cpu_ops pv_cpu_ops = { -diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c -index c76f5ff4d0d7..ae2a2e2d6362 100644 ---- a/arch/x86/xen/enlighten_pv.c -+++ b/arch/x86/xen/enlighten_pv.c -@@ -586,6 +586,70 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, - preempt_enable(); - } - -+#ifdef CONFIG_X86_64 -+struct trap_array_entry { -+ void (*orig)(void); -+ void (*xen)(void); -+ bool ist_okay; -+}; -+ -+static struct trap_array_entry trap_array[] = { -+ { debug, xen_xendebug, true }, -+ { int3, xen_xenint3, true }, -+ { double_fault, xen_double_fault, true }, -+#ifdef CONFIG_X86_MCE -+ { machine_check, xen_machine_check, true }, -+#endif -+ { nmi, xen_nmi, true }, -+ { overflow, xen_overflow, false }, -+#ifdef CONFIG_IA32_EMULATION -+ { entry_INT80_compat, xen_entry_INT80_compat, false }, -+#endif -+ { page_fault, xen_page_fault, false }, -+ { divide_error, xen_divide_error, false }, -+ { bounds, xen_bounds, false }, -+ { invalid_op, xen_invalid_op, false }, -+ { device_not_available, xen_device_not_available, false }, -+ { coprocessor_segment_overrun, xen_coprocessor_segment_overrun, false }, -+ { invalid_TSS, xen_invalid_TSS, false }, -+ { segment_not_present, xen_segment_not_present, false }, -+ { stack_segment, xen_stack_segment, false }, -+ { general_protection, xen_general_protection, false }, -+ { spurious_interrupt_bug, xen_spurious_interrupt_bug, false }, -+ { coprocessor_error, xen_coprocessor_error, false }, -+ { alignment_check, xen_alignment_check, false }, -+ { simd_coprocessor_error, xen_simd_coprocessor_error, false }, -+}; -+ -+static bool get_trap_addr(void **addr, unsigned int ist) -+{ -+ unsigned int nr; -+ bool ist_okay = false; -+ -+ /* -+ * Replace trap handler addresses by Xen specific ones. -+ * Check for known traps using IST and whitelist them. -+ * The debugger ones are the only ones we care about. 
-+ * Xen will handle faults like double_fault, * so we should never see -+ * them. Warn if there's an unexpected IST-using fault handler. -+ */ -+ for (nr = 0; nr < ARRAY_SIZE(trap_array); nr++) { -+ struct trap_array_entry *entry = trap_array + nr; -+ -+ if (*addr == entry->orig) { -+ *addr = entry->xen; -+ ist_okay = entry->ist_okay; -+ break; -+ } -+ } -+ -+ if (WARN_ON(ist != 0 && !ist_okay)) -+ return false; -+ -+ return true; -+} -+#endif -+ - static int cvt_gate_to_trap(int vector, const gate_desc *val, - struct trap_info *info) - { -@@ -598,40 +662,8 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val, - - addr = gate_offset(val); - #ifdef CONFIG_X86_64 -- /* -- * Look for known traps using IST, and substitute them -- * appropriately. The debugger ones are the only ones we care -- * about. Xen will handle faults like double_fault, -- * so we should never see them. Warn if -- * there's an unexpected IST-using fault handler. -- */ -- if (addr == (unsigned long)debug) -- addr = (unsigned long)xen_debug; -- else if (addr == (unsigned long)int3) -- addr = (unsigned long)xen_int3; -- else if (addr == (unsigned long)stack_segment) -- addr = (unsigned long)xen_stack_segment; -- else if (addr == (unsigned long)double_fault) { -- /* Don't need to handle these */ -+ if (!get_trap_addr((void **)&addr, val->bits.ist)) - return 0; --#ifdef CONFIG_X86_MCE -- } else if (addr == (unsigned long)machine_check) { -- /* -- * when xen hypervisor inject vMCE to guest, -- * use native mce handler to handle it -- */ -- ; --#endif -- } else if (addr == (unsigned long)nmi) -- /* -- * Use the native version as well. -- */ -- ; -- else { -- /* Some other trap using IST? 
*/ -- if (WARN_ON(val->bits.ist != 0)) -- return 0; -- } - #endif /* CONFIG_X86_64 */ - info->address = addr; - -diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c -index 33e92955e09d..d4eff5676cfa 100644 ---- a/arch/x86/xen/irq.c -+++ b/arch/x86/xen/irq.c -@@ -123,9 +123,6 @@ static const struct pv_irq_ops xen_irq_ops __initconst = { - - .safe_halt = xen_safe_halt, - .halt = xen_halt, --#ifdef CONFIG_X86_64 -- .adjust_exception_frame = xen_adjust_exception_frame, --#endif - }; - - void __init xen_init_irq_ops(void) -diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S -index dfabcbf8e813..c12260ef3e4b 100644 ---- a/arch/x86/entry/entry_64.S -+++ b/arch/x86/entry/entry_64.S -@@ -829,7 +829,6 @@ ENTRY(\sym) - .endif - - ASM_CLAC -- PARAVIRT_ADJUST_EXCEPTION_FRAME - - .ifeq \has_error_code - pushq $-1 /* ORIG_RAX: no syscall to restart */ -@@ -975,7 +974,7 @@ ENTRY(do_softirq_own_stack) - ENDPROC(do_softirq_own_stack) - - #ifdef CONFIG_XEN --idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0 -+idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0 - - /* - * A note on the "critical region" in our callback handler. -@@ -1042,8 +1041,6 @@ ENTRY(xen_failsafe_callback) - movq 8(%rsp), %r11 - addq $0x30, %rsp - pushq $0 /* RIP */ -- pushq %r11 -- pushq %rcx - UNWIND_HINT_IRET_REGS offset=8 - jmp general_protection - 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. 
*/ -@@ -1074,9 +1071,8 @@ idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK - idtentry stack_segment do_stack_segment has_error_code=1 - - #ifdef CONFIG_XEN --idtentry xen_debug do_debug has_error_code=0 --idtentry xen_int3 do_int3 has_error_code=0 --idtentry xen_stack_segment do_stack_segment has_error_code=1 -+idtentry xendebug do_debug has_error_code=0 -+idtentry xenint3 do_int3 has_error_code=0 - #endif - - idtentry general_protection do_general_protection has_error_code=1 -@@ -1240,20 +1236,9 @@ ENTRY(error_exit) - END(error_exit) - - /* Runs on exception stack */ -+/* XXX: broken on Xen PV */ - ENTRY(nmi) - UNWIND_HINT_IRET_REGS -- /* -- * Fix up the exception frame if we're on Xen. -- * PARAVIRT_ADJUST_EXCEPTION_FRAME is guaranteed to push at most -- * one value to the stack on native, so it may clobber the rdx -- * scratch slot, but it won't clobber any of the important -- * slots past it. -- * -- * Xen is a different story, because the Xen frame itself overlaps -- * the "NMI executing" variable. -- */ -- PARAVIRT_ADJUST_EXCEPTION_FRAME -- - /* - * We allow breakpoints in NMIs. If a breakpoint occurs, then - * the iretq it performs will take us out of NMI context. -diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S -index 5314d7b8e5ad..d8468ba24be0 100644 ---- a/arch/x86/entry/entry_64_compat.S -+++ b/arch/x86/entry/entry_64_compat.S -@@ -293,7 +293,6 @@ ENTRY(entry_INT80_compat) - /* - * Interrupts are off on entry. 
- */ -- PARAVIRT_ADJUST_EXCEPTION_FRAME - ASM_CLAC /* Do this early to minimize exposure */ - SWAPGS - -diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S -index 3a3b6a211584..dae2cc33afb5 100644 ---- a/arch/x86/xen/xen-asm_64.S -+++ b/arch/x86/xen/xen-asm_64.S -@@ -16,11 +16,42 @@ - - #include - --ENTRY(xen_adjust_exception_frame) -- mov 8+0(%rsp), %rcx -- mov 8+8(%rsp), %r11 -- ret $16 --ENDPROC(xen_adjust_exception_frame) -+.macro xen_pv_trap name -+ENTRY(xen_\name) -+ pop %rcx -+ pop %r11 -+ jmp \name -+END(xen_\name) -+.endm -+ -+xen_pv_trap divide_error -+xen_pv_trap debug -+xen_pv_trap xendebug -+xen_pv_trap int3 -+xen_pv_trap xenint3 -+xen_pv_trap nmi -+xen_pv_trap overflow -+xen_pv_trap bounds -+xen_pv_trap invalid_op -+xen_pv_trap device_not_available -+xen_pv_trap double_fault -+xen_pv_trap coprocessor_segment_overrun -+xen_pv_trap invalid_TSS -+xen_pv_trap segment_not_present -+xen_pv_trap stack_segment -+xen_pv_trap general_protection -+xen_pv_trap page_fault -+xen_pv_trap spurious_interrupt_bug -+xen_pv_trap coprocessor_error -+xen_pv_trap alignment_check -+#ifdef CONFIG_X86_MCE -+xen_pv_trap machine_check -+#endif /* CONFIG_X86_MCE */ -+xen_pv_trap simd_coprocessor_error -+#ifdef CONFIG_IA32_EMULATION -+xen_pv_trap entry_INT80_compat -+#endif -+xen_pv_trap hypervisor_callback - - hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 - /* --- -2.14.2 - diff --git a/patches/kernel/0032-x86-paravirt-Remove-no-longer-used-paravirt-function.patch b/patches/kernel/0032-x86-paravirt-Remove-no-longer-used-paravirt-function.patch deleted file mode 100644 index 516eb30..0000000 --- a/patches/kernel/0032-x86-paravirt-Remove-no-longer-used-paravirt-function.patch +++ /dev/null @@ -1,390 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Juergen Gross -Date: Mon, 4 Sep 2017 12:25:27 +0200 -Subject: [PATCH] x86/paravirt: Remove no longer used paravirt functions -MIME-Version: 1.0 -Content-Type: text/plain; 
charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -With removal of lguest some of the paravirt functions are no longer -needed: - - ->read_cr4() - ->store_idt() - ->set_pmd_at() - ->set_pud_at() - ->pte_update() - -Remove them. - -Signed-off-by: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Cc: akataria@vmware.com -Cc: boris.ostrovsky@oracle.com -Cc: chrisw@sous-sol.org -Cc: jeremy@goop.org -Cc: rusty@rustcorp.com.au -Cc: virtualization@lists.linux-foundation.org -Cc: xen-devel@lists.xenproject.org -Link: http://lkml.kernel.org/r/20170904102527.25409-1-jgross@suse.com -Signed-off-by: Ingo Molnar -(cherry picked from commit 87930019c713873a1c3b9bd55dde46e81f70c8f1) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit edf3ab0080a6e79a300753e66929b0b7499eaec5) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/desc.h | 3 +-- - arch/x86/include/asm/paravirt.h | 37 ----------------------------------- - arch/x86/include/asm/paravirt_types.h | 9 --------- - arch/x86/include/asm/pgtable.h | 27 ++++--------------------- - arch/x86/include/asm/special_insns.h | 10 +++++----- - arch/x86/kernel/paravirt.c | 5 ----- - arch/x86/kvm/vmx.c | 2 +- - arch/x86/mm/pgtable.c | 7 +------ - arch/x86/xen/enlighten_pv.c | 2 -- - arch/x86/xen/mmu_pv.c | 2 -- - 10 files changed, 12 insertions(+), 92 deletions(-) - -diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h -index 57e502a4e92f..f995e5a09136 100644 ---- a/arch/x86/include/asm/desc.h -+++ b/arch/x86/include/asm/desc.h -@@ -120,7 +120,6 @@ static inline int desc_empty(const void *ptr) - #define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt)) - - #define store_gdt(dtr) native_store_gdt(dtr) --#define store_idt(dtr) native_store_idt(dtr) - #define store_tr(tr) (tr = native_store_tr()) - - #define load_TLS(t, cpu) native_load_tls(t, cpu) -@@ -241,7 +240,7 @@ static inline void native_store_gdt(struct desc_ptr *dtr) 
- asm volatile("sgdt %0":"=m" (*dtr)); - } - --static inline void native_store_idt(struct desc_ptr *dtr) -+static inline void store_idt(struct desc_ptr *dtr) - { - asm volatile("sidt %0":"=m" (*dtr)); - } -diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h -index c25dd22f7c70..12deec722cf0 100644 ---- a/arch/x86/include/asm/paravirt.h -+++ b/arch/x86/include/asm/paravirt.h -@@ -71,11 +71,6 @@ static inline void write_cr3(unsigned long x) - PVOP_VCALL1(pv_mmu_ops.write_cr3, x); - } - --static inline unsigned long __read_cr4(void) --{ -- return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4); --} -- - static inline void __write_cr4(unsigned long x) - { - PVOP_VCALL1(pv_cpu_ops.write_cr4, x); -@@ -228,10 +223,6 @@ static inline void set_ldt(const void *addr, unsigned entries) - { - PVOP_VCALL2(pv_cpu_ops.set_ldt, addr, entries); - } --static inline void store_idt(struct desc_ptr *dtr) --{ -- PVOP_VCALL1(pv_cpu_ops.store_idt, dtr); --} - static inline unsigned long paravirt_store_tr(void) - { - return PVOP_CALL0(unsigned long, pv_cpu_ops.store_tr); -@@ -365,12 +356,6 @@ static inline void paravirt_release_p4d(unsigned long pfn) - PVOP_VCALL1(pv_mmu_ops.release_p4d, pfn); - } - --static inline void pte_update(struct mm_struct *mm, unsigned long addr, -- pte_t *ptep) --{ -- PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep); --} -- - static inline pte_t __pte(pteval_t val) - { - pteval_t ret; -@@ -472,28 +457,6 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, - PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte); - } - --static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, -- pmd_t *pmdp, pmd_t pmd) --{ -- if (sizeof(pmdval_t) > sizeof(long)) -- /* 5 arg words */ -- pv_mmu_ops.set_pmd_at(mm, addr, pmdp, pmd); -- else -- PVOP_VCALL4(pv_mmu_ops.set_pmd_at, mm, addr, pmdp, -- native_pmd_val(pmd)); --} -- --static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, -- pud_t *pudp, 
pud_t pud) --{ -- if (sizeof(pudval_t) > sizeof(long)) -- /* 5 arg words */ -- pv_mmu_ops.set_pud_at(mm, addr, pudp, pud); -- else -- PVOP_VCALL4(pv_mmu_ops.set_pud_at, mm, addr, pudp, -- native_pud_val(pud)); --} -- - static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) - { - pmdval_t val = native_pmd_val(pmd); -diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h -index 6b64fc6367f2..42873edd9f9d 100644 ---- a/arch/x86/include/asm/paravirt_types.h -+++ b/arch/x86/include/asm/paravirt_types.h -@@ -107,7 +107,6 @@ struct pv_cpu_ops { - unsigned long (*read_cr0)(void); - void (*write_cr0)(unsigned long); - -- unsigned long (*read_cr4)(void); - void (*write_cr4)(unsigned long); - - #ifdef CONFIG_X86_64 -@@ -119,8 +118,6 @@ struct pv_cpu_ops { - void (*load_tr_desc)(void); - void (*load_gdt)(const struct desc_ptr *); - void (*load_idt)(const struct desc_ptr *); -- /* store_gdt has been removed. */ -- void (*store_idt)(struct desc_ptr *); - void (*set_ldt)(const void *desc, unsigned entries); - unsigned long (*store_tr)(void); - void (*load_tls)(struct thread_struct *t, unsigned int cpu); -@@ -245,12 +242,6 @@ struct pv_mmu_ops { - void (*set_pte_at)(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval); - void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); -- void (*set_pmd_at)(struct mm_struct *mm, unsigned long addr, -- pmd_t *pmdp, pmd_t pmdval); -- void (*set_pud_at)(struct mm_struct *mm, unsigned long addr, -- pud_t *pudp, pud_t pudval); -- void (*pte_update)(struct mm_struct *mm, unsigned long addr, -- pte_t *ptep); - - pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, - pte_t *ptep); -diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h -index 77037b6f1caa..bb8e9ea7deb4 100644 ---- a/arch/x86/include/asm/pgtable.h -+++ b/arch/x86/include/asm/pgtable.h -@@ -43,8 +43,6 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page); - #else /* !CONFIG_PARAVIRT */ - #define 
set_pte(ptep, pte) native_set_pte(ptep, pte) - #define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte) --#define set_pmd_at(mm, addr, pmdp, pmd) native_set_pmd_at(mm, addr, pmdp, pmd) --#define set_pud_at(mm, addr, pudp, pud) native_set_pud_at(mm, addr, pudp, pud) - - #define set_pte_atomic(ptep, pte) \ - native_set_pte_atomic(ptep, pte) -@@ -75,8 +73,6 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page); - #define pte_clear(mm, addr, ptep) native_pte_clear(mm, addr, ptep) - #define pmd_clear(pmd) native_pmd_clear(pmd) - --#define pte_update(mm, addr, ptep) do { } while (0) -- - #define pgd_val(x) native_pgd_val(x) - #define __pgd(x) native_make_pgd(x) - -@@ -965,31 +961,18 @@ static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr, - native_set_pte(ptep, pte); - } - --static inline void native_set_pmd_at(struct mm_struct *mm, unsigned long addr, -- pmd_t *pmdp , pmd_t pmd) -+static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, -+ pmd_t *pmdp, pmd_t pmd) - { - native_set_pmd(pmdp, pmd); - } - --static inline void native_set_pud_at(struct mm_struct *mm, unsigned long addr, -- pud_t *pudp, pud_t pud) -+static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, -+ pud_t *pudp, pud_t pud) - { - native_set_pud(pudp, pud); - } - --#ifndef CONFIG_PARAVIRT --/* -- * Rules for using pte_update - it must be called after any PTE update which -- * has not been done using the set_pte / clear_pte interfaces. It is used by -- * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE -- * updates should either be sets, clears, or set_pte_atomic for P->P -- * transitions, which means this hook should only be called for user PTEs. -- * This hook implies a P->P protection or access change has taken place, which -- * requires a subsequent TLB flush. 
-- */ --#define pte_update(mm, addr, ptep) do { } while (0) --#endif -- - /* - * We only update the dirty/accessed state if we set - * the dirty bit by hand in the kernel, since the hardware -@@ -1017,7 +1000,6 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) - { - pte_t pte = native_ptep_get_and_clear(ptep); -- pte_update(mm, addr, ptep); - return pte; - } - -@@ -1044,7 +1026,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) - { - clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte); -- pte_update(mm, addr, ptep); - } - - #define flush_tlb_fix_spurious_fault(vma, address) do { } while (0) -diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h -index 9efaabf5b54b..a24dfcf79f4a 100644 ---- a/arch/x86/include/asm/special_insns.h -+++ b/arch/x86/include/asm/special_insns.h -@@ -135,6 +135,11 @@ static inline void native_wbinvd(void) - - extern asmlinkage void native_load_gs_index(unsigned); - -+static inline unsigned long __read_cr4(void) -+{ -+ return native_read_cr4(); -+} -+ - #ifdef CONFIG_PARAVIRT - #include - #else -@@ -173,11 +178,6 @@ static inline void write_cr3(unsigned long x) - native_write_cr3(x); - } - --static inline unsigned long __read_cr4(void) --{ -- return native_read_cr4(); --} -- - static inline void __write_cr4(unsigned long x) - { - native_write_cr4(x); -diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c -index a14df9eecfed..19a3e8f961c7 100644 ---- a/arch/x86/kernel/paravirt.c -+++ b/arch/x86/kernel/paravirt.c -@@ -327,7 +327,6 @@ __visible struct pv_cpu_ops pv_cpu_ops = { - .set_debugreg = native_set_debugreg, - .read_cr0 = native_read_cr0, - .write_cr0 = native_write_cr0, -- .read_cr4 = native_read_cr4, - .write_cr4 = native_write_cr4, - #ifdef CONFIG_X86_64 - .read_cr8 = native_read_cr8, -@@ -343,7 +342,6 @@ __visible struct pv_cpu_ops pv_cpu_ops = { - .set_ldt = native_set_ldt, - .load_gdt 
= native_load_gdt, - .load_idt = native_load_idt, -- .store_idt = native_store_idt, - .store_tr = native_store_tr, - .load_tls = native_load_tls, - #ifdef CONFIG_X86_64 -@@ -411,8 +409,6 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = { - .set_pte = native_set_pte, - .set_pte_at = native_set_pte_at, - .set_pmd = native_set_pmd, -- .set_pmd_at = native_set_pmd_at, -- .pte_update = paravirt_nop, - - .ptep_modify_prot_start = __ptep_modify_prot_start, - .ptep_modify_prot_commit = __ptep_modify_prot_commit, -@@ -424,7 +420,6 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = { - .pmd_clear = native_pmd_clear, - #endif - .set_pud = native_set_pud, -- .set_pud_at = native_set_pud_at, - - .pmd_val = PTE_IDENT, - .make_pmd = PTE_IDENT, -diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c -index 7b447d126d17..dd4996a96c71 100644 ---- a/arch/x86/kvm/vmx.c -+++ b/arch/x86/kvm/vmx.c -@@ -5174,7 +5174,7 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) - vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ - vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ - -- native_store_idt(&dt); -+ store_idt(&dt); - vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ - vmx->host_idt_base = dt.address; - -diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c -index 508a708eb9a6..942391b5b639 100644 ---- a/arch/x86/mm/pgtable.c -+++ b/arch/x86/mm/pgtable.c -@@ -426,10 +426,8 @@ int ptep_set_access_flags(struct vm_area_struct *vma, - { - int changed = !pte_same(*ptep, entry); - -- if (changed && dirty) { -+ if (changed && dirty) - *ptep = entry; -- pte_update(vma->vm_mm, address, ptep); -- } - - return changed; - } -@@ -486,9 +484,6 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma, - ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, - (unsigned long *) &ptep->pte); - -- if (ret) -- pte_update(vma->vm_mm, addr, ptep); -- - return ret; - } - -diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c -index ae2a2e2d6362..69b9deff7e5c 
100644 ---- a/arch/x86/xen/enlighten_pv.c -+++ b/arch/x86/xen/enlighten_pv.c -@@ -1038,7 +1038,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { - .read_cr0 = xen_read_cr0, - .write_cr0 = xen_write_cr0, - -- .read_cr4 = native_read_cr4, - .write_cr4 = xen_write_cr4, - - #ifdef CONFIG_X86_64 -@@ -1073,7 +1072,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { - .alloc_ldt = xen_alloc_ldt, - .free_ldt = xen_free_ldt, - -- .store_idt = native_store_idt, - .store_tr = xen_store_tr, - - .write_ldt_entry = xen_write_ldt_entry, -diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c -index cab28cf2cffb..5f61b7e2e6b2 100644 ---- a/arch/x86/xen/mmu_pv.c -+++ b/arch/x86/xen/mmu_pv.c -@@ -2430,8 +2430,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { - .flush_tlb_single = xen_flush_tlb_single, - .flush_tlb_others = xen_flush_tlb_others, - -- .pte_update = paravirt_nop, -- - .pgd_alloc = xen_pgd_alloc, - .pgd_free = xen_pgd_free, - --- -2.14.2 - diff --git a/patches/kernel/0032-x86-xen-Get-rid-of-paravirt-op-adjust_exception_fram.patch b/patches/kernel/0032-x86-xen-Get-rid-of-paravirt-op-adjust_exception_fram.patch new file mode 100644 index 0000000..c58c59c --- /dev/null +++ b/patches/kernel/0032-x86-xen-Get-rid-of-paravirt-op-adjust_exception_fram.patch @@ -0,0 +1,436 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Juergen Gross +Date: Thu, 31 Aug 2017 19:42:49 +0200 +Subject: [PATCH] x86/xen: Get rid of paravirt op adjust_exception_frame +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +When running as Xen pv-guest the exception frame on the stack contains +%r11 and %rcx additional to the other data pushed by the processor. + +Instead of having a paravirt op being called for each exception type +prepend the Xen specific code to each exception entry. 
When running as +Xen pv-guest just use the exception entry with prepended instructions, +otherwise use the entry without the Xen specific code. + +[ tglx: Merged through tip to avoid ugly merge conflict ] + +Signed-off-by: Juergen Gross +Signed-off-by: Thomas Gleixner +Cc: xen-devel@lists.xenproject.org +Cc: boris.ostrovsky@oracle.com +Cc: luto@amacapital.net +Link: http://lkml.kernel.org/r/20170831174249.26853-1-jg@pfupf.net +(backported from commit 5878d5d6fdef6447d73b0acc121ba445bef37f53) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 9a6fb927deb3ebbe831741ca82081714637181a7) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/paravirt.h | 5 -- + arch/x86/include/asm/paravirt_types.h | 3 -- + arch/x86/include/asm/proto.h | 3 ++ + arch/x86/include/asm/traps.h | 28 ++++++++-- + arch/x86/xen/xen-ops.h | 1 - + arch/x86/kernel/asm-offsets_64.c | 1 - + arch/x86/kernel/paravirt.c | 3 -- + arch/x86/xen/enlighten_pv.c | 98 +++++++++++++++++++++++------------ + arch/x86/xen/irq.c | 3 -- + arch/x86/entry/entry_64.S | 23 ++------ + arch/x86/entry/entry_64_compat.S | 1 - + arch/x86/xen/xen-asm_64.S | 41 +++++++++++++-- + 12 files changed, 133 insertions(+), 77 deletions(-) + +diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h +index 9ccac1926587..c25dd22f7c70 100644 +--- a/arch/x86/include/asm/paravirt.h ++++ b/arch/x86/include/asm/paravirt.h +@@ -960,11 +960,6 @@ extern void default_banner(void); + #define GET_CR2_INTO_RAX \ + call PARA_INDIRECT(pv_mmu_ops+PV_MMU_read_cr2) + +-#define PARAVIRT_ADJUST_EXCEPTION_FRAME \ +- PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_adjust_exception_frame), \ +- CLBR_NONE, \ +- call PARA_INDIRECT(pv_irq_ops+PV_IRQ_adjust_exception_frame)) +- + #define USERGS_SYSRET64 \ + PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \ + CLBR_NONE, \ +diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h +index 
9ffc36bfe4cd..6b64fc6367f2 100644 +--- a/arch/x86/include/asm/paravirt_types.h ++++ b/arch/x86/include/asm/paravirt_types.h +@@ -196,9 +196,6 @@ struct pv_irq_ops { + void (*safe_halt)(void); + void (*halt)(void); + +-#ifdef CONFIG_X86_64 +- void (*adjust_exception_frame)(void); +-#endif + } __no_randomize_layout; + + struct pv_mmu_ops { +diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h +index 8d3964fc5f91..b408b1886195 100644 +--- a/arch/x86/include/asm/proto.h ++++ b/arch/x86/include/asm/proto.h +@@ -24,6 +24,9 @@ void entry_SYSENTER_compat(void); + void __end_entry_SYSENTER_compat(void); + void entry_SYSCALL_compat(void); + void entry_INT80_compat(void); ++#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) ++void xen_entry_INT80_compat(void); ++#endif + #endif + + void x86_configure_nx(void); +diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h +index b4f322d6c95f..feb89dbe359d 100644 +--- a/arch/x86/include/asm/traps.h ++++ b/arch/x86/include/asm/traps.h +@@ -13,9 +13,6 @@ asmlinkage void divide_error(void); + asmlinkage void debug(void); + asmlinkage void nmi(void); + asmlinkage void int3(void); +-asmlinkage void xen_debug(void); +-asmlinkage void xen_int3(void); +-asmlinkage void xen_stack_segment(void); + asmlinkage void overflow(void); + asmlinkage void bounds(void); + asmlinkage void invalid_op(void); +@@ -56,6 +53,31 @@ asmlinkage void simd_coprocessor_error(void); + #define trace_page_fault page_fault + #endif + ++#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) ++asmlinkage void xen_divide_error(void); ++asmlinkage void xen_xendebug(void); ++asmlinkage void xen_xenint3(void); ++asmlinkage void xen_nmi(void); ++asmlinkage void xen_overflow(void); ++asmlinkage void xen_bounds(void); ++asmlinkage void xen_invalid_op(void); ++asmlinkage void xen_device_not_available(void); ++asmlinkage void xen_double_fault(void); ++asmlinkage void xen_coprocessor_segment_overrun(void); ++asmlinkage void 
xen_invalid_TSS(void); ++asmlinkage void xen_segment_not_present(void); ++asmlinkage void xen_stack_segment(void); ++asmlinkage void xen_general_protection(void); ++asmlinkage void xen_page_fault(void); ++asmlinkage void xen_spurious_interrupt_bug(void); ++asmlinkage void xen_coprocessor_error(void); ++asmlinkage void xen_alignment_check(void); ++#ifdef CONFIG_X86_MCE ++asmlinkage void xen_machine_check(void); ++#endif /* CONFIG_X86_MCE */ ++asmlinkage void xen_simd_coprocessor_error(void); ++#endif ++ + dotraplinkage void do_divide_error(struct pt_regs *, long); + dotraplinkage void do_debug(struct pt_regs *, long); + dotraplinkage void do_nmi(struct pt_regs *, long); +diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h +index 70301ac0d414..c8a6d224f7ed 100644 +--- a/arch/x86/xen/xen-ops.h ++++ b/arch/x86/xen/xen-ops.h +@@ -138,7 +138,6 @@ __visible void xen_restore_fl_direct(unsigned long); + __visible void xen_iret(void); + __visible void xen_sysret32(void); + __visible void xen_sysret64(void); +-__visible void xen_adjust_exception_frame(void); + + extern int xen_panic_handler_init(void); + +diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c +index 99332f550c48..cf42206926af 100644 +--- a/arch/x86/kernel/asm-offsets_64.c ++++ b/arch/x86/kernel/asm-offsets_64.c +@@ -20,7 +20,6 @@ static char syscalls_ia32[] = { + int main(void) + { + #ifdef CONFIG_PARAVIRT +- OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame); + OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); + OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); + BLANK(); +diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c +index bc0a849589bb..a14df9eecfed 100644 +--- a/arch/x86/kernel/paravirt.c ++++ b/arch/x86/kernel/paravirt.c +@@ -319,9 +319,6 @@ __visible struct pv_irq_ops pv_irq_ops = { + .irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable), + .safe_halt = native_safe_halt, + .halt = native_halt, +-#ifdef CONFIG_X86_64 +- 
.adjust_exception_frame = paravirt_nop, +-#endif + }; + + __visible struct pv_cpu_ops pv_cpu_ops = { +diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c +index c76f5ff4d0d7..ae2a2e2d6362 100644 +--- a/arch/x86/xen/enlighten_pv.c ++++ b/arch/x86/xen/enlighten_pv.c +@@ -586,6 +586,70 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, + preempt_enable(); + } + ++#ifdef CONFIG_X86_64 ++struct trap_array_entry { ++ void (*orig)(void); ++ void (*xen)(void); ++ bool ist_okay; ++}; ++ ++static struct trap_array_entry trap_array[] = { ++ { debug, xen_xendebug, true }, ++ { int3, xen_xenint3, true }, ++ { double_fault, xen_double_fault, true }, ++#ifdef CONFIG_X86_MCE ++ { machine_check, xen_machine_check, true }, ++#endif ++ { nmi, xen_nmi, true }, ++ { overflow, xen_overflow, false }, ++#ifdef CONFIG_IA32_EMULATION ++ { entry_INT80_compat, xen_entry_INT80_compat, false }, ++#endif ++ { page_fault, xen_page_fault, false }, ++ { divide_error, xen_divide_error, false }, ++ { bounds, xen_bounds, false }, ++ { invalid_op, xen_invalid_op, false }, ++ { device_not_available, xen_device_not_available, false }, ++ { coprocessor_segment_overrun, xen_coprocessor_segment_overrun, false }, ++ { invalid_TSS, xen_invalid_TSS, false }, ++ { segment_not_present, xen_segment_not_present, false }, ++ { stack_segment, xen_stack_segment, false }, ++ { general_protection, xen_general_protection, false }, ++ { spurious_interrupt_bug, xen_spurious_interrupt_bug, false }, ++ { coprocessor_error, xen_coprocessor_error, false }, ++ { alignment_check, xen_alignment_check, false }, ++ { simd_coprocessor_error, xen_simd_coprocessor_error, false }, ++}; ++ ++static bool get_trap_addr(void **addr, unsigned int ist) ++{ ++ unsigned int nr; ++ bool ist_okay = false; ++ ++ /* ++ * Replace trap handler addresses by Xen specific ones. ++ * Check for known traps using IST and whitelist them. ++ * The debugger ones are the only ones we care about. 
++ * Xen will handle faults like double_fault, * so we should never see ++ * them. Warn if there's an unexpected IST-using fault handler. ++ */ ++ for (nr = 0; nr < ARRAY_SIZE(trap_array); nr++) { ++ struct trap_array_entry *entry = trap_array + nr; ++ ++ if (*addr == entry->orig) { ++ *addr = entry->xen; ++ ist_okay = entry->ist_okay; ++ break; ++ } ++ } ++ ++ if (WARN_ON(ist != 0 && !ist_okay)) ++ return false; ++ ++ return true; ++} ++#endif ++ + static int cvt_gate_to_trap(int vector, const gate_desc *val, + struct trap_info *info) + { +@@ -598,40 +662,8 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val, + + addr = gate_offset(val); + #ifdef CONFIG_X86_64 +- /* +- * Look for known traps using IST, and substitute them +- * appropriately. The debugger ones are the only ones we care +- * about. Xen will handle faults like double_fault, +- * so we should never see them. Warn if +- * there's an unexpected IST-using fault handler. +- */ +- if (addr == (unsigned long)debug) +- addr = (unsigned long)xen_debug; +- else if (addr == (unsigned long)int3) +- addr = (unsigned long)xen_int3; +- else if (addr == (unsigned long)stack_segment) +- addr = (unsigned long)xen_stack_segment; +- else if (addr == (unsigned long)double_fault) { +- /* Don't need to handle these */ ++ if (!get_trap_addr((void **)&addr, val->bits.ist)) + return 0; +-#ifdef CONFIG_X86_MCE +- } else if (addr == (unsigned long)machine_check) { +- /* +- * when xen hypervisor inject vMCE to guest, +- * use native mce handler to handle it +- */ +- ; +-#endif +- } else if (addr == (unsigned long)nmi) +- /* +- * Use the native version as well. +- */ +- ; +- else { +- /* Some other trap using IST? 
*/ +- if (WARN_ON(val->bits.ist != 0)) +- return 0; +- } + #endif /* CONFIG_X86_64 */ + info->address = addr; + +diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c +index 33e92955e09d..d4eff5676cfa 100644 +--- a/arch/x86/xen/irq.c ++++ b/arch/x86/xen/irq.c +@@ -123,9 +123,6 @@ static const struct pv_irq_ops xen_irq_ops __initconst = { + + .safe_halt = xen_safe_halt, + .halt = xen_halt, +-#ifdef CONFIG_X86_64 +- .adjust_exception_frame = xen_adjust_exception_frame, +-#endif + }; + + void __init xen_init_irq_ops(void) +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index dfabcbf8e813..c12260ef3e4b 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -829,7 +829,6 @@ ENTRY(\sym) + .endif + + ASM_CLAC +- PARAVIRT_ADJUST_EXCEPTION_FRAME + + .ifeq \has_error_code + pushq $-1 /* ORIG_RAX: no syscall to restart */ +@@ -975,7 +974,7 @@ ENTRY(do_softirq_own_stack) + ENDPROC(do_softirq_own_stack) + + #ifdef CONFIG_XEN +-idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0 ++idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0 + + /* + * A note on the "critical region" in our callback handler. +@@ -1042,8 +1041,6 @@ ENTRY(xen_failsafe_callback) + movq 8(%rsp), %r11 + addq $0x30, %rsp + pushq $0 /* RIP */ +- pushq %r11 +- pushq %rcx + UNWIND_HINT_IRET_REGS offset=8 + jmp general_protection + 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. 
*/ +@@ -1074,9 +1071,8 @@ idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK + idtentry stack_segment do_stack_segment has_error_code=1 + + #ifdef CONFIG_XEN +-idtentry xen_debug do_debug has_error_code=0 +-idtentry xen_int3 do_int3 has_error_code=0 +-idtentry xen_stack_segment do_stack_segment has_error_code=1 ++idtentry xendebug do_debug has_error_code=0 ++idtentry xenint3 do_int3 has_error_code=0 + #endif + + idtentry general_protection do_general_protection has_error_code=1 +@@ -1240,20 +1236,9 @@ ENTRY(error_exit) + END(error_exit) + + /* Runs on exception stack */ ++/* XXX: broken on Xen PV */ + ENTRY(nmi) + UNWIND_HINT_IRET_REGS +- /* +- * Fix up the exception frame if we're on Xen. +- * PARAVIRT_ADJUST_EXCEPTION_FRAME is guaranteed to push at most +- * one value to the stack on native, so it may clobber the rdx +- * scratch slot, but it won't clobber any of the important +- * slots past it. +- * +- * Xen is a different story, because the Xen frame itself overlaps +- * the "NMI executing" variable. +- */ +- PARAVIRT_ADJUST_EXCEPTION_FRAME +- + /* + * We allow breakpoints in NMIs. If a breakpoint occurs, then + * the iretq it performs will take us out of NMI context. +diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S +index 5314d7b8e5ad..d8468ba24be0 100644 +--- a/arch/x86/entry/entry_64_compat.S ++++ b/arch/x86/entry/entry_64_compat.S +@@ -293,7 +293,6 @@ ENTRY(entry_INT80_compat) + /* + * Interrupts are off on entry. 
+ */ +- PARAVIRT_ADJUST_EXCEPTION_FRAME + ASM_CLAC /* Do this early to minimize exposure */ + SWAPGS + +diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S +index 3a3b6a211584..dae2cc33afb5 100644 +--- a/arch/x86/xen/xen-asm_64.S ++++ b/arch/x86/xen/xen-asm_64.S +@@ -16,11 +16,42 @@ + + #include + +-ENTRY(xen_adjust_exception_frame) +- mov 8+0(%rsp), %rcx +- mov 8+8(%rsp), %r11 +- ret $16 +-ENDPROC(xen_adjust_exception_frame) ++.macro xen_pv_trap name ++ENTRY(xen_\name) ++ pop %rcx ++ pop %r11 ++ jmp \name ++END(xen_\name) ++.endm ++ ++xen_pv_trap divide_error ++xen_pv_trap debug ++xen_pv_trap xendebug ++xen_pv_trap int3 ++xen_pv_trap xenint3 ++xen_pv_trap nmi ++xen_pv_trap overflow ++xen_pv_trap bounds ++xen_pv_trap invalid_op ++xen_pv_trap device_not_available ++xen_pv_trap double_fault ++xen_pv_trap coprocessor_segment_overrun ++xen_pv_trap invalid_TSS ++xen_pv_trap segment_not_present ++xen_pv_trap stack_segment ++xen_pv_trap general_protection ++xen_pv_trap page_fault ++xen_pv_trap spurious_interrupt_bug ++xen_pv_trap coprocessor_error ++xen_pv_trap alignment_check ++#ifdef CONFIG_X86_MCE ++xen_pv_trap machine_check ++#endif /* CONFIG_X86_MCE */ ++xen_pv_trap simd_coprocessor_error ++#ifdef CONFIG_IA32_EMULATION ++xen_pv_trap entry_INT80_compat ++#endif ++xen_pv_trap hypervisor_callback + + hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 + /* +-- +2.14.2 + diff --git a/patches/kernel/0033-x86-entry-Fix-idtentry-unwind-hint.patch b/patches/kernel/0033-x86-entry-Fix-idtentry-unwind-hint.patch deleted file mode 100644 index 13fb2c6..0000000 --- a/patches/kernel/0033-x86-entry-Fix-idtentry-unwind-hint.patch +++ /dev/null @@ -1,53 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Josh Poimboeuf -Date: Fri, 20 Oct 2017 11:21:33 -0500 -Subject: [PATCH] x86/entry: Fix idtentry unwind hint -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -This 
fixes the following ORC warning in the 'int3' entry code: - - WARNING: can't dereference iret registers at ffff8801c5f17fe0 for ip ffffffff95f0d94b - -The ORC metadata had the wrong stack offset for the iret registers. - -Their location on the stack is dependent on whether the exception has an -error code. - -Reported-and-tested-by: Andrei Vagin -Signed-off-by: Josh Poimboeuf -Cc: Andy Lutomirski -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Fixes: 8c1f75587a18 ("x86/entry/64: Add unwind hint annotations") -Link: http://lkml.kernel.org/r/931d57f0551ed7979d5e7e05370d445c8e5137f8.1508516398.git.jpoimboe@redhat.com -Signed-off-by: Ingo Molnar -(cherry picked from commit 98990a33b77dda9babf91cb235654f6729e5702e) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 266be2a5053230f6d0b6f27d3e8e9f28df40dd7e) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/entry/entry_64.S | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S -index c12260ef3e4b..2e4fc6425f47 100644 ---- a/arch/x86/entry/entry_64.S -+++ b/arch/x86/entry/entry_64.S -@@ -821,7 +821,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt - - .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 - ENTRY(\sym) -- UNWIND_HINT_IRET_REGS offset=8 -+ UNWIND_HINT_IRET_REGS offset=\has_error_code*8 - - /* Sanity check */ - .if \shift_ist != -1 && \paranoid == 0 --- -2.14.2 - diff --git a/patches/kernel/0033-x86-paravirt-Remove-no-longer-used-paravirt-function.patch b/patches/kernel/0033-x86-paravirt-Remove-no-longer-used-paravirt-function.patch new file mode 100644 index 0000000..516eb30 --- /dev/null +++ b/patches/kernel/0033-x86-paravirt-Remove-no-longer-used-paravirt-function.patch @@ -0,0 +1,390 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Juergen Gross +Date: Mon, 4 Sep 2017 12:25:27 +0200 +Subject: 
[PATCH] x86/paravirt: Remove no longer used paravirt functions +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +With removal of lguest some of the paravirt functions are no longer +needed: + + ->read_cr4() + ->store_idt() + ->set_pmd_at() + ->set_pud_at() + ->pte_update() + +Remove them. + +Signed-off-by: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: akataria@vmware.com +Cc: boris.ostrovsky@oracle.com +Cc: chrisw@sous-sol.org +Cc: jeremy@goop.org +Cc: rusty@rustcorp.com.au +Cc: virtualization@lists.linux-foundation.org +Cc: xen-devel@lists.xenproject.org +Link: http://lkml.kernel.org/r/20170904102527.25409-1-jgross@suse.com +Signed-off-by: Ingo Molnar +(cherry picked from commit 87930019c713873a1c3b9bd55dde46e81f70c8f1) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit edf3ab0080a6e79a300753e66929b0b7499eaec5) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/desc.h | 3 +-- + arch/x86/include/asm/paravirt.h | 37 ----------------------------------- + arch/x86/include/asm/paravirt_types.h | 9 --------- + arch/x86/include/asm/pgtable.h | 27 ++++--------------------- + arch/x86/include/asm/special_insns.h | 10 +++++----- + arch/x86/kernel/paravirt.c | 5 ----- + arch/x86/kvm/vmx.c | 2 +- + arch/x86/mm/pgtable.c | 7 +------ + arch/x86/xen/enlighten_pv.c | 2 -- + arch/x86/xen/mmu_pv.c | 2 -- + 10 files changed, 12 insertions(+), 92 deletions(-) + +diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h +index 57e502a4e92f..f995e5a09136 100644 +--- a/arch/x86/include/asm/desc.h ++++ b/arch/x86/include/asm/desc.h +@@ -120,7 +120,6 @@ static inline int desc_empty(const void *ptr) + #define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt)) + + #define store_gdt(dtr) native_store_gdt(dtr) +-#define store_idt(dtr) native_store_idt(dtr) + #define store_tr(tr) (tr = native_store_tr()) + + #define 
load_TLS(t, cpu) native_load_tls(t, cpu) +@@ -241,7 +240,7 @@ static inline void native_store_gdt(struct desc_ptr *dtr) + asm volatile("sgdt %0":"=m" (*dtr)); + } + +-static inline void native_store_idt(struct desc_ptr *dtr) ++static inline void store_idt(struct desc_ptr *dtr) + { + asm volatile("sidt %0":"=m" (*dtr)); + } +diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h +index c25dd22f7c70..12deec722cf0 100644 +--- a/arch/x86/include/asm/paravirt.h ++++ b/arch/x86/include/asm/paravirt.h +@@ -71,11 +71,6 @@ static inline void write_cr3(unsigned long x) + PVOP_VCALL1(pv_mmu_ops.write_cr3, x); + } + +-static inline unsigned long __read_cr4(void) +-{ +- return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4); +-} +- + static inline void __write_cr4(unsigned long x) + { + PVOP_VCALL1(pv_cpu_ops.write_cr4, x); +@@ -228,10 +223,6 @@ static inline void set_ldt(const void *addr, unsigned entries) + { + PVOP_VCALL2(pv_cpu_ops.set_ldt, addr, entries); + } +-static inline void store_idt(struct desc_ptr *dtr) +-{ +- PVOP_VCALL1(pv_cpu_ops.store_idt, dtr); +-} + static inline unsigned long paravirt_store_tr(void) + { + return PVOP_CALL0(unsigned long, pv_cpu_ops.store_tr); +@@ -365,12 +356,6 @@ static inline void paravirt_release_p4d(unsigned long pfn) + PVOP_VCALL1(pv_mmu_ops.release_p4d, pfn); + } + +-static inline void pte_update(struct mm_struct *mm, unsigned long addr, +- pte_t *ptep) +-{ +- PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep); +-} +- + static inline pte_t __pte(pteval_t val) + { + pteval_t ret; +@@ -472,28 +457,6 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, + PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte); + } + +-static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, +- pmd_t *pmdp, pmd_t pmd) +-{ +- if (sizeof(pmdval_t) > sizeof(long)) +- /* 5 arg words */ +- pv_mmu_ops.set_pmd_at(mm, addr, pmdp, pmd); +- else +- PVOP_VCALL4(pv_mmu_ops.set_pmd_at, mm, addr, pmdp, +- 
native_pmd_val(pmd)); +-} +- +-static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, +- pud_t *pudp, pud_t pud) +-{ +- if (sizeof(pudval_t) > sizeof(long)) +- /* 5 arg words */ +- pv_mmu_ops.set_pud_at(mm, addr, pudp, pud); +- else +- PVOP_VCALL4(pv_mmu_ops.set_pud_at, mm, addr, pudp, +- native_pud_val(pud)); +-} +- + static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) + { + pmdval_t val = native_pmd_val(pmd); +diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h +index 6b64fc6367f2..42873edd9f9d 100644 +--- a/arch/x86/include/asm/paravirt_types.h ++++ b/arch/x86/include/asm/paravirt_types.h +@@ -107,7 +107,6 @@ struct pv_cpu_ops { + unsigned long (*read_cr0)(void); + void (*write_cr0)(unsigned long); + +- unsigned long (*read_cr4)(void); + void (*write_cr4)(unsigned long); + + #ifdef CONFIG_X86_64 +@@ -119,8 +118,6 @@ struct pv_cpu_ops { + void (*load_tr_desc)(void); + void (*load_gdt)(const struct desc_ptr *); + void (*load_idt)(const struct desc_ptr *); +- /* store_gdt has been removed. 
*/ +- void (*store_idt)(struct desc_ptr *); + void (*set_ldt)(const void *desc, unsigned entries); + unsigned long (*store_tr)(void); + void (*load_tls)(struct thread_struct *t, unsigned int cpu); +@@ -245,12 +242,6 @@ struct pv_mmu_ops { + void (*set_pte_at)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval); + void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); +- void (*set_pmd_at)(struct mm_struct *mm, unsigned long addr, +- pmd_t *pmdp, pmd_t pmdval); +- void (*set_pud_at)(struct mm_struct *mm, unsigned long addr, +- pud_t *pudp, pud_t pudval); +- void (*pte_update)(struct mm_struct *mm, unsigned long addr, +- pte_t *ptep); + + pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep); +diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h +index 77037b6f1caa..bb8e9ea7deb4 100644 +--- a/arch/x86/include/asm/pgtable.h ++++ b/arch/x86/include/asm/pgtable.h +@@ -43,8 +43,6 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page); + #else /* !CONFIG_PARAVIRT */ + #define set_pte(ptep, pte) native_set_pte(ptep, pte) + #define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte) +-#define set_pmd_at(mm, addr, pmdp, pmd) native_set_pmd_at(mm, addr, pmdp, pmd) +-#define set_pud_at(mm, addr, pudp, pud) native_set_pud_at(mm, addr, pudp, pud) + + #define set_pte_atomic(ptep, pte) \ + native_set_pte_atomic(ptep, pte) +@@ -75,8 +73,6 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page); + #define pte_clear(mm, addr, ptep) native_pte_clear(mm, addr, ptep) + #define pmd_clear(pmd) native_pmd_clear(pmd) + +-#define pte_update(mm, addr, ptep) do { } while (0) +- + #define pgd_val(x) native_pgd_val(x) + #define __pgd(x) native_make_pgd(x) + +@@ -965,31 +961,18 @@ static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr, + native_set_pte(ptep, pte); + } + +-static inline void native_set_pmd_at(struct mm_struct *mm, unsigned long addr, +- pmd_t *pmdp , pmd_t 
pmd) ++static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, ++ pmd_t *pmdp, pmd_t pmd) + { + native_set_pmd(pmdp, pmd); + } + +-static inline void native_set_pud_at(struct mm_struct *mm, unsigned long addr, +- pud_t *pudp, pud_t pud) ++static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, ++ pud_t *pudp, pud_t pud) + { + native_set_pud(pudp, pud); + } + +-#ifndef CONFIG_PARAVIRT +-/* +- * Rules for using pte_update - it must be called after any PTE update which +- * has not been done using the set_pte / clear_pte interfaces. It is used by +- * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE +- * updates should either be sets, clears, or set_pte_atomic for P->P +- * transitions, which means this hook should only be called for user PTEs. +- * This hook implies a P->P protection or access change has taken place, which +- * requires a subsequent TLB flush. +- */ +-#define pte_update(mm, addr, ptep) do { } while (0) +-#endif +- + /* + * We only update the dirty/accessed state if we set + * the dirty bit by hand in the kernel, since the hardware +@@ -1017,7 +1000,6 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) + { + pte_t pte = native_ptep_get_and_clear(ptep); +- pte_update(mm, addr, ptep); + return pte; + } + +@@ -1044,7 +1026,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) + { + clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte); +- pte_update(mm, addr, ptep); + } + + #define flush_tlb_fix_spurious_fault(vma, address) do { } while (0) +diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h +index 9efaabf5b54b..a24dfcf79f4a 100644 +--- a/arch/x86/include/asm/special_insns.h ++++ b/arch/x86/include/asm/special_insns.h +@@ -135,6 +135,11 @@ static inline void native_wbinvd(void) + + extern asmlinkage void native_load_gs_index(unsigned); + ++static inline unsigned long 
__read_cr4(void) ++{ ++ return native_read_cr4(); ++} ++ + #ifdef CONFIG_PARAVIRT + #include + #else +@@ -173,11 +178,6 @@ static inline void write_cr3(unsigned long x) + native_write_cr3(x); + } + +-static inline unsigned long __read_cr4(void) +-{ +- return native_read_cr4(); +-} +- + static inline void __write_cr4(unsigned long x) + { + native_write_cr4(x); +diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c +index a14df9eecfed..19a3e8f961c7 100644 +--- a/arch/x86/kernel/paravirt.c ++++ b/arch/x86/kernel/paravirt.c +@@ -327,7 +327,6 @@ __visible struct pv_cpu_ops pv_cpu_ops = { + .set_debugreg = native_set_debugreg, + .read_cr0 = native_read_cr0, + .write_cr0 = native_write_cr0, +- .read_cr4 = native_read_cr4, + .write_cr4 = native_write_cr4, + #ifdef CONFIG_X86_64 + .read_cr8 = native_read_cr8, +@@ -343,7 +342,6 @@ __visible struct pv_cpu_ops pv_cpu_ops = { + .set_ldt = native_set_ldt, + .load_gdt = native_load_gdt, + .load_idt = native_load_idt, +- .store_idt = native_store_idt, + .store_tr = native_store_tr, + .load_tls = native_load_tls, + #ifdef CONFIG_X86_64 +@@ -411,8 +409,6 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = { + .set_pte = native_set_pte, + .set_pte_at = native_set_pte_at, + .set_pmd = native_set_pmd, +- .set_pmd_at = native_set_pmd_at, +- .pte_update = paravirt_nop, + + .ptep_modify_prot_start = __ptep_modify_prot_start, + .ptep_modify_prot_commit = __ptep_modify_prot_commit, +@@ -424,7 +420,6 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = { + .pmd_clear = native_pmd_clear, + #endif + .set_pud = native_set_pud, +- .set_pud_at = native_set_pud_at, + + .pmd_val = PTE_IDENT, + .make_pmd = PTE_IDENT, +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c +index 7b447d126d17..dd4996a96c71 100644 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -5174,7 +5174,7 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) + vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ + vmcs_write16(HOST_TR_SELECTOR, 
GDT_ENTRY_TSS*8); /* 22.2.4 */ + +- native_store_idt(&dt); ++ store_idt(&dt); + vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ + vmx->host_idt_base = dt.address; + +diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c +index 508a708eb9a6..942391b5b639 100644 +--- a/arch/x86/mm/pgtable.c ++++ b/arch/x86/mm/pgtable.c +@@ -426,10 +426,8 @@ int ptep_set_access_flags(struct vm_area_struct *vma, + { + int changed = !pte_same(*ptep, entry); + +- if (changed && dirty) { ++ if (changed && dirty) + *ptep = entry; +- pte_update(vma->vm_mm, address, ptep); +- } + + return changed; + } +@@ -486,9 +484,6 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma, + ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, + (unsigned long *) &ptep->pte); + +- if (ret) +- pte_update(vma->vm_mm, addr, ptep); +- + return ret; + } + +diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c +index ae2a2e2d6362..69b9deff7e5c 100644 +--- a/arch/x86/xen/enlighten_pv.c ++++ b/arch/x86/xen/enlighten_pv.c +@@ -1038,7 +1038,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { + .read_cr0 = xen_read_cr0, + .write_cr0 = xen_write_cr0, + +- .read_cr4 = native_read_cr4, + .write_cr4 = xen_write_cr4, + + #ifdef CONFIG_X86_64 +@@ -1073,7 +1072,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { + .alloc_ldt = xen_alloc_ldt, + .free_ldt = xen_free_ldt, + +- .store_idt = native_store_idt, + .store_tr = xen_store_tr, + + .write_ldt_entry = xen_write_ldt_entry, +diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c +index cab28cf2cffb..5f61b7e2e6b2 100644 +--- a/arch/x86/xen/mmu_pv.c ++++ b/arch/x86/xen/mmu_pv.c +@@ -2430,8 +2430,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { + .flush_tlb_single = xen_flush_tlb_single, + .flush_tlb_others = xen_flush_tlb_others, + +- .pte_update = paravirt_nop, +- + .pgd_alloc = xen_pgd_alloc, + .pgd_free = xen_pgd_free, + +-- +2.14.2 + diff --git 
a/patches/kernel/0034-x86-entry-Fix-idtentry-unwind-hint.patch b/patches/kernel/0034-x86-entry-Fix-idtentry-unwind-hint.patch new file mode 100644 index 0000000..13fb2c6 --- /dev/null +++ b/patches/kernel/0034-x86-entry-Fix-idtentry-unwind-hint.patch @@ -0,0 +1,53 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf +Date: Fri, 20 Oct 2017 11:21:33 -0500 +Subject: [PATCH] x86/entry: Fix idtentry unwind hint +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +This fixes the following ORC warning in the 'int3' entry code: + + WARNING: can't dereference iret registers at ffff8801c5f17fe0 for ip ffffffff95f0d94b + +The ORC metadata had the wrong stack offset for the iret registers. + +Their location on the stack is dependent on whether the exception has an +error code. + +Reported-and-tested-by: Andrei Vagin +Signed-off-by: Josh Poimboeuf +Cc: Andy Lutomirski +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Fixes: 8c1f75587a18 ("x86/entry/64: Add unwind hint annotations") +Link: http://lkml.kernel.org/r/931d57f0551ed7979d5e7e05370d445c8e5137f8.1508516398.git.jpoimboe@redhat.com +Signed-off-by: Ingo Molnar +(cherry picked from commit 98990a33b77dda9babf91cb235654f6729e5702e) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 266be2a5053230f6d0b6f27d3e8e9f28df40dd7e) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/entry/entry_64.S | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index c12260ef3e4b..2e4fc6425f47 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -821,7 +821,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt + + .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 + ENTRY(\sym) +- UNWIND_HINT_IRET_REGS offset=8 ++ UNWIND_HINT_IRET_REGS 
offset=\has_error_code*8 + + /* Sanity check */ + .if \shift_ist != -1 && \paranoid == 0 +-- +2.14.2 + diff --git a/patches/kernel/0034-x86-mm-64-Initialize-CR4.PCIDE-early.patch b/patches/kernel/0034-x86-mm-64-Initialize-CR4.PCIDE-early.patch deleted file mode 100644 index 15f8a3e..0000000 --- a/patches/kernel/0034-x86-mm-64-Initialize-CR4.PCIDE-early.patch +++ /dev/null @@ -1,237 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Sun, 10 Sep 2017 17:48:27 -0700 -Subject: [PATCH] x86/mm/64: Initialize CR4.PCIDE early -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -cpu_init() is weird: it's called rather late (after early -identification and after most MMU state is initialized) on the boot -CPU but is called extremely early (before identification) on secondary -CPUs. It's called just late enough on the boot CPU that its CR4 value -isn't propagated to mmu_cr4_features. - -Even if we put CR4.PCIDE into mmu_cr4_features, we'd hit two -problems. First, we'd crash in the trampoline code. That's -fixable, and I tried that. It turns out that mmu_cr4_features is -totally ignored by secondary_start_64(), though, so even with the -trampoline code fixed, it wouldn't help. - -This means that we don't currently have CR4.PCIDE reliably initialized -before we start playing with cpu_tlbstate. This is very fragile and -tends to cause boot failures if I make even small changes to the TLB -handling code. - -Make it more robust: initialize CR4.PCIDE earlier on the boot CPU -and propagate it to secondary CPUs in start_secondary(). - -( Yes, this is ugly. I think we should have improved mmu_cr4_features - to actually control CR4 during secondary bootup, but that would be - fairly intrusive at this stage. 
) - -Signed-off-by: Andy Lutomirski -Reported-by: Sai Praneeth Prakhya -Tested-by: Sai Praneeth Prakhya -Cc: Borislav Petkov -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Cc: linux-kernel@vger.kernel.org -Fixes: 660da7c9228f ("x86/mm: Enable CR4.PCIDE on supported systems") -Signed-off-by: Ingo Molnar -(cherry picked from commit c7ad5ad297e644601747d6dbee978bf85e14f7bc) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 0e6a37a43aa876327e7d21881c09977da2d5c270) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/cpu/common.c | 49 +++++++------------------------------------- - arch/x86/kernel/setup.c | 5 ++++- - arch/x86/kernel/smpboot.c | 8 +++++--- - arch/x86/mm/init.c | 34 ++++++++++++++++++++++++++++++ - 4 files changed, 50 insertions(+), 46 deletions(-) - -diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c -index 0b80ed14ff52..4be7b209a3d6 100644 ---- a/arch/x86/kernel/cpu/common.c -+++ b/arch/x86/kernel/cpu/common.c -@@ -169,21 +169,21 @@ static int __init x86_mpx_setup(char *s) - __setup("nompx", x86_mpx_setup); - - #ifdef CONFIG_X86_64 --static int __init x86_pcid_setup(char *s) -+static int __init x86_nopcid_setup(char *s) - { -- /* require an exact match without trailing characters */ -- if (strlen(s)) -- return 0; -+ /* nopcid doesn't accept parameters */ -+ if (s) -+ return -EINVAL; - - /* do not emit a message if the feature is not present */ - if (!boot_cpu_has(X86_FEATURE_PCID)) -- return 1; -+ return 0; - - setup_clear_cpu_cap(X86_FEATURE_PCID); - pr_info("nopcid: PCID feature disabled\n"); -- return 1; -+ return 0; - } --__setup("nopcid", x86_pcid_setup); -+early_param("nopcid", x86_nopcid_setup); - #endif - - static int __init x86_noinvpcid_setup(char *s) -@@ -329,38 +329,6 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) - } - } - --static void setup_pcid(struct cpuinfo_x86 *c) --{ -- if (cpu_has(c, X86_FEATURE_PCID)) { -- if (cpu_has(c, 
X86_FEATURE_PGE)) { -- /* -- * We'd like to use cr4_set_bits_and_update_boot(), -- * but we can't. CR4.PCIDE is special and can only -- * be set in long mode, and the early CPU init code -- * doesn't know this and would try to restore CR4.PCIDE -- * prior to entering long mode. -- * -- * Instead, we rely on the fact that hotplug, resume, -- * etc all fully restore CR4 before they write anything -- * that could have nonzero PCID bits to CR3. CR4.PCIDE -- * has no effect on the page tables themselves, so we -- * don't need it to be restored early. -- */ -- cr4_set_bits(X86_CR4_PCIDE); -- } else { -- /* -- * flush_tlb_all(), as currently implemented, won't -- * work if PCID is on but PGE is not. Since that -- * combination doesn't exist on real hardware, there's -- * no reason to try to fully support it, but it's -- * polite to avoid corrupting data if we're on -- * an improperly configured VM. -- */ -- clear_cpu_cap(c, X86_FEATURE_PCID); -- } -- } --} -- - /* - * Protection Keys are not available in 32-bit mode. - */ -@@ -1175,9 +1143,6 @@ static void identify_cpu(struct cpuinfo_x86 *c) - setup_smep(c); - setup_smap(c); - -- /* Set up PCID */ -- setup_pcid(c); -- - /* - * The vendor-specific functions might have changed features. - * Now we do "generic changes." -diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c -index d7e8b983aa72..f964bfddfefd 100644 ---- a/arch/x86/kernel/setup.c -+++ b/arch/x86/kernel/setup.c -@@ -1174,8 +1174,11 @@ void __init setup_arch(char **cmdline_p) - * with the current CR4 value. This may not be necessary, but - * auditing all the early-boot CR4 manipulation would be needed to - * rule it out. -+ * -+ * Mask off features that don't work outside long mode (just -+ * PCIDE for now). 
- */ -- mmu_cr4_features = __read_cr4(); -+ mmu_cr4_features = __read_cr4() & ~X86_CR4_PCIDE; - - memblock_set_current_limit(get_max_mapped()); - -diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c -index 893fd8c849e2..d05006f6c31c 100644 ---- a/arch/x86/kernel/smpboot.c -+++ b/arch/x86/kernel/smpboot.c -@@ -227,10 +227,12 @@ static int enable_start_cpu0; - static void notrace start_secondary(void *unused) - { - /* -- * Don't put *anything* before cpu_init(), SMP booting is too -- * fragile that we want to limit the things done here to the -- * most necessary things. -+ * Don't put *anything* except direct CPU state initialization -+ * before cpu_init(), SMP booting is too fragile that we want to -+ * limit the things done here to the most necessary things. - */ -+ if (boot_cpu_has(X86_FEATURE_PCID)) -+ __write_cr4(__read_cr4() | X86_CR4_PCIDE); - cpu_init(); - x86_cpuinit.early_percpu_clock_init(); - preempt_disable(); -diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c -index bf3f1065d6ad..df2624b091a7 100644 ---- a/arch/x86/mm/init.c -+++ b/arch/x86/mm/init.c -@@ -19,6 +19,7 @@ - #include - #include - #include -+#include - - /* - * We need to define the tracepoints somewhere, and tlb.c -@@ -193,6 +194,38 @@ static void __init probe_page_size_mask(void) - } - } - -+static void setup_pcid(void) -+{ -+#ifdef CONFIG_X86_64 -+ if (boot_cpu_has(X86_FEATURE_PCID)) { -+ if (boot_cpu_has(X86_FEATURE_PGE)) { -+ /* -+ * This can't be cr4_set_bits_and_update_boot() -- -+ * the trampoline code can't handle CR4.PCIDE and -+ * it wouldn't do any good anyway. Despite the name, -+ * cr4_set_bits_and_update_boot() doesn't actually -+ * cause the bits in question to remain set all the -+ * way through the secondary boot asm. -+ * -+ * Instead, we brute-force it and set CR4.PCIDE -+ * manually in start_secondary(). -+ */ -+ cr4_set_bits(X86_CR4_PCIDE); -+ } else { -+ /* -+ * flush_tlb_all(), as currently implemented, won't -+ * work if PCID is on but PGE is not. 
Since that -+ * combination doesn't exist on real hardware, there's -+ * no reason to try to fully support it, but it's -+ * polite to avoid corrupting data if we're on -+ * an improperly configured VM. -+ */ -+ setup_clear_cpu_cap(X86_FEATURE_PCID); -+ } -+ } -+#endif -+} -+ - #ifdef CONFIG_X86_32 - #define NR_RANGE_MR 3 - #else /* CONFIG_X86_64 */ -@@ -592,6 +625,7 @@ void __init init_mem_mapping(void) - unsigned long end; - - probe_page_size_mask(); -+ setup_pcid(); - - #ifdef CONFIG_X86_64 - end = max_pfn << PAGE_SHIFT; --- -2.14.2 - diff --git a/patches/kernel/0035-objtool-Add-ORC-unwind-table-generation.patch b/patches/kernel/0035-objtool-Add-ORC-unwind-table-generation.patch deleted file mode 100644 index f4bce26..0000000 --- a/patches/kernel/0035-objtool-Add-ORC-unwind-table-generation.patch +++ /dev/null @@ -1,1339 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Josh Poimboeuf -Date: Tue, 11 Jul 2017 10:33:42 -0500 -Subject: [PATCH] objtool: Add ORC unwind table generation -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Now that objtool knows the states of all registers on the stack for each -instruction, it's straightforward to generate debuginfo for an unwinder -to use. - -Instead of generating DWARF, generate a new format called ORC, which is -more suitable for an in-kernel unwinder. See -Documentation/x86/orc-unwinder.txt for a more detailed description of -this new debuginfo format and why it's preferable to DWARF. - -Signed-off-by: Josh Poimboeuf -Cc: Andy Lutomirski -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Denys Vlasenko -Cc: H. 
Peter Anvin -Cc: Jiri Slaby -Cc: Linus Torvalds -Cc: Mike Galbraith -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Cc: live-patching@vger.kernel.org -Link: http://lkml.kernel.org/r/c9b9f01ba6c5ed2bdc9bb0957b78167fdbf9632e.1499786555.git.jpoimboe@redhat.com -Signed-off-by: Ingo Molnar -(cherry picked from commit 627fce14809ba5610b0cb476cd0186d3fcedecfc) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 9460f7766786ad0f8330f78f22b81842632a5398) -Signed-off-by: Fabian Grünbichler ---- - tools/objtool/Documentation/stack-validation.txt | 56 ++---- - tools/objtool/builtin.h | 1 + - tools/objtool/check.h | 15 +- - tools/objtool/elf.h | 15 +- - tools/objtool/orc.h | 30 ++++ - tools/objtool/orc_types.h | 85 +++++++++ - tools/objtool/builtin-check.c | 2 +- - tools/objtool/builtin-orc.c | 70 ++++++++ - tools/objtool/check.c | 58 +++++- - tools/objtool/elf.c | 212 ++++++++++++++++++++-- - tools/objtool/objtool.c | 3 +- - tools/objtool/orc_dump.c | 212 ++++++++++++++++++++++ - tools/objtool/orc_gen.c | 214 +++++++++++++++++++++++ - tools/objtool/Build | 3 + - 14 files changed, 916 insertions(+), 60 deletions(-) - create mode 100644 tools/objtool/orc.h - create mode 100644 tools/objtool/orc_types.h - create mode 100644 tools/objtool/builtin-orc.c - create mode 100644 tools/objtool/orc_dump.c - create mode 100644 tools/objtool/orc_gen.c - -diff --git a/tools/objtool/Documentation/stack-validation.txt b/tools/objtool/Documentation/stack-validation.txt -index 17c1195f11f4..6a1af43862df 100644 ---- a/tools/objtool/Documentation/stack-validation.txt -+++ b/tools/objtool/Documentation/stack-validation.txt -@@ -11,9 +11,6 @@ analyzes every .o file and ensures the validity of its stack metadata. - It enforces a set of rules on asm code and C inline assembly code so - that stack traces can be reliable. - --Currently it only checks frame pointer usage, but there are plans to add --CFI validation for C files and CFI generation for asm files. 
-- - For each function, it recursively follows all possible code paths and - validates the correct frame pointer state at each instruction. - -@@ -23,6 +20,10 @@ alternative execution paths to a given instruction (or set of - instructions). Similarly, it knows how to follow switch statements, for - which gcc sometimes uses jump tables. - -+(Objtool also has an 'orc generate' subcommand which generates debuginfo -+for the ORC unwinder. See Documentation/x86/orc-unwinder.txt in the -+kernel tree for more details.) -+ - - Why do we need stack metadata validation? - ----------------------------------------- -@@ -93,37 +94,14 @@ a) More reliable stack traces for frame pointer enabled kernels - or at the very end of the function after the stack frame has been - destroyed. This is an inherent limitation of frame pointers. - --b) 100% reliable stack traces for DWARF enabled kernels -- -- (NOTE: This is not yet implemented) -- -- As an alternative to frame pointers, DWARF Call Frame Information -- (CFI) metadata can be used to walk the stack. Unlike frame pointers, -- CFI metadata is out of band. So it doesn't affect runtime -- performance and it can be reliable even when interrupts or exceptions -- are involved. -- -- For C code, gcc automatically generates DWARF CFI metadata. But for -- asm code, generating CFI is a tedious manual approach which requires -- manually placed .cfi assembler macros to be scattered throughout the -- code. It's clumsy and very easy to get wrong, and it makes the real -- code harder to read. -- -- Stacktool will improve this situation in several ways. For code -- which already has CFI annotations, it will validate them. For code -- which doesn't have CFI annotations, it will generate them. So an -- architecture can opt to strip out all the manual .cfi annotations -- from their asm code and have objtool generate them instead. 
-+b) ORC (Oops Rewind Capability) unwind table generation - -- We might also add a runtime stack validation debug option where we -- periodically walk the stack from schedule() and/or an NMI to ensure -- that the stack metadata is sane and that we reach the bottom of the -- stack. -+ An alternative to frame pointers and DWARF, ORC unwind data can be -+ used to walk the stack. Unlike frame pointers, ORC data is out of -+ band. So it doesn't affect runtime performance and it can be -+ reliable even when interrupts or exceptions are involved. - -- So the benefit of objtool here will be that external tooling should -- always show perfect stack traces. And the same will be true for -- kernel warning/oops traces if the architecture has a runtime DWARF -- unwinder. -+ For more details, see Documentation/x86/orc-unwinder.txt. - - c) Higher live patching compatibility rate - -@@ -211,7 +189,7 @@ they mean, and suggestions for how to fix them. - function, add proper frame pointer logic using the FRAME_BEGIN and - FRAME_END macros. Otherwise, if it's not a callable function, remove - its ELF function annotation by changing ENDPROC to END, and instead -- use the manual CFI hint macros in asm/undwarf.h. -+ use the manual unwind hint macros in asm/unwind_hints.h. - - If it's a GCC-compiled .c file, the error may be because the function - uses an inline asm() statement which has a "call" instruction. An -@@ -231,8 +209,8 @@ they mean, and suggestions for how to fix them. - If the error is for an asm file, and the instruction is inside (or - reachable from) a callable function, the function should be annotated - with the ENTRY/ENDPROC macros (ENDPROC is the important one). 
-- Otherwise, the code should probably be annotated with the CFI hint -- macros in asm/undwarf.h so objtool and the unwinder can know the -+ Otherwise, the code should probably be annotated with the unwind hint -+ macros in asm/unwind_hints.h so objtool and the unwinder can know the - stack state associated with the code. - - If you're 100% sure the code won't affect stack traces, or if you're -@@ -258,7 +236,7 @@ they mean, and suggestions for how to fix them. - instructions aren't allowed in a callable function, and are most - likely part of the kernel entry code. They should usually not have - the callable function annotation (ENDPROC) and should always be -- annotated with the CFI hint macros in asm/undwarf.h. -+ annotated with the unwind hint macros in asm/unwind_hints.h. - - - 6. file.o: warning: objtool: func()+0x26: sibling call from callable instruction with modified stack frame -@@ -272,7 +250,7 @@ they mean, and suggestions for how to fix them. - - If the instruction is not actually in a callable function (e.g. - kernel entry code), change ENDPROC to END and annotate manually with -- the CFI hint macros in asm/undwarf.h. -+ the unwind hint macros in asm/unwind_hints.h. - - - 7. file: warning: objtool: func()+0x5c: stack state mismatch -@@ -288,8 +266,8 @@ they mean, and suggestions for how to fix them. - - Another possibility is that the code has some asm or inline asm which - does some unusual things to the stack or the frame pointer. In such -- cases it's probably appropriate to use the CFI hint macros in -- asm/undwarf.h. -+ cases it's probably appropriate to use the unwind hint macros in -+ asm/unwind_hints.h. - - - 8. 
file.o: warning: objtool: funcA() falls through to next function funcB() -diff --git a/tools/objtool/builtin.h b/tools/objtool/builtin.h -index 34d2ba78a616..dd526067fed5 100644 ---- a/tools/objtool/builtin.h -+++ b/tools/objtool/builtin.h -@@ -18,5 +18,6 @@ - #define _BUILTIN_H - - extern int cmd_check(int argc, const char **argv); -+extern int cmd_orc(int argc, const char **argv); - - #endif /* _BUILTIN_H */ -diff --git a/tools/objtool/check.h b/tools/objtool/check.h -index da85f5b00ec6..046874bbe226 100644 ---- a/tools/objtool/check.h -+++ b/tools/objtool/check.h -@@ -22,12 +22,14 @@ - #include "elf.h" - #include "cfi.h" - #include "arch.h" -+#include "orc.h" - #include - - struct insn_state { - struct cfi_reg cfa; - struct cfi_reg regs[CFI_NUM_REGS]; - int stack_size; -+ unsigned char type; - bool bp_scratch; - bool drap; - int drap_reg; -@@ -48,6 +50,7 @@ struct instruction { - struct symbol *func; - struct stack_op stack_op; - struct insn_state state; -+ struct orc_entry orc; - }; - - struct objtool_file { -@@ -58,9 +61,19 @@ struct objtool_file { - bool ignore_unreachables, c_file; - }; - --int check(const char *objname, bool nofp); -+int check(const char *objname, bool nofp, bool orc); -+ -+struct instruction *find_insn(struct objtool_file *file, -+ struct section *sec, unsigned long offset); - - #define for_each_insn(file, insn) \ - list_for_each_entry(insn, &file->insn_list, list) - -+#define sec_for_each_insn(file, sec, insn) \ -+ for (insn = find_insn(file, sec, 0); \ -+ insn && &insn->list != &file->insn_list && \ -+ insn->sec == sec; \ -+ insn = list_next_entry(insn, list)) -+ -+ - #endif /* _CHECK_H */ -diff --git a/tools/objtool/elf.h b/tools/objtool/elf.h -index 343968b778cb..d86e2ff14466 100644 ---- a/tools/objtool/elf.h -+++ b/tools/objtool/elf.h -@@ -28,6 +28,13 @@ - # define elf_getshdrstrndx elf_getshstrndx - #endif - -+/* -+ * Fallback for systems without this "read, mmaping if possible" cmd. 
-+ */ -+#ifndef ELF_C_READ_MMAP -+#define ELF_C_READ_MMAP ELF_C_READ -+#endif -+ - struct section { - struct list_head list; - GElf_Shdr sh; -@@ -41,6 +48,7 @@ struct section { - char *name; - int idx; - unsigned int len; -+ bool changed, text; - }; - - struct symbol { -@@ -75,7 +83,7 @@ struct elf { - }; - - --struct elf *elf_open(const char *name); -+struct elf *elf_open(const char *name, int flags); - struct section *find_section_by_name(struct elf *elf, const char *name); - struct symbol *find_symbol_by_offset(struct section *sec, unsigned long offset); - struct symbol *find_symbol_containing(struct section *sec, unsigned long offset); -@@ -83,6 +91,11 @@ struct rela *find_rela_by_dest(struct section *sec, unsigned long offset); - struct rela *find_rela_by_dest_range(struct section *sec, unsigned long offset, - unsigned int len); - struct symbol *find_containing_func(struct section *sec, unsigned long offset); -+struct section *elf_create_section(struct elf *elf, const char *name, size_t -+ entsize, int nr); -+struct section *elf_create_rela_section(struct elf *elf, struct section *base); -+int elf_rebuild_rela_section(struct section *sec); -+int elf_write(struct elf *elf); - void elf_close(struct elf *elf); - - #define for_each_sec(file, sec) \ -diff --git a/tools/objtool/orc.h b/tools/objtool/orc.h -new file mode 100644 -index 000000000000..a4139e386ef3 ---- /dev/null -+++ b/tools/objtool/orc.h -@@ -0,0 +1,30 @@ -+/* -+ * Copyright (C) 2017 Josh Poimboeuf -+ * -+ * This program is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU General Public License -+ * as published by the Free Software Foundation; either version 2 -+ * of the License, or (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, see . -+ */ -+ -+#ifndef _ORC_H -+#define _ORC_H -+ -+#include "orc_types.h" -+ -+struct objtool_file; -+ -+int create_orc(struct objtool_file *file); -+int create_orc_sections(struct objtool_file *file); -+ -+int orc_dump(const char *objname); -+ -+#endif /* _ORC_H */ -diff --git a/tools/objtool/orc_types.h b/tools/objtool/orc_types.h -new file mode 100644 -index 000000000000..fc5cf6cffd9a ---- /dev/null -+++ b/tools/objtool/orc_types.h -@@ -0,0 +1,85 @@ -+/* -+ * Copyright (C) 2017 Josh Poimboeuf -+ * -+ * This program is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU General Public License -+ * as published by the Free Software Foundation; either version 2 -+ * of the License, or (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, see . -+ */ -+ -+#ifndef _ORC_TYPES_H -+#define _ORC_TYPES_H -+ -+#include -+#include -+ -+/* -+ * The ORC_REG_* registers are base registers which are used to find other -+ * registers on the stack. -+ * -+ * ORC_REG_PREV_SP, also known as DWARF Call Frame Address (CFA), is the -+ * address of the previous frame: the caller's SP before it called the current -+ * function. -+ * -+ * ORC_REG_UNDEFINED means the corresponding register's value didn't change in -+ * the current frame. -+ * -+ * The most commonly used base registers are SP and BP -- which the previous SP -+ * is usually based on -- and PREV_SP and UNDEFINED -- which the previous BP is -+ * usually based on. 
-+ * -+ * The rest of the base registers are needed for special cases like entry code -+ * and GCC realigned stacks. -+ */ -+#define ORC_REG_UNDEFINED 0 -+#define ORC_REG_PREV_SP 1 -+#define ORC_REG_DX 2 -+#define ORC_REG_DI 3 -+#define ORC_REG_BP 4 -+#define ORC_REG_SP 5 -+#define ORC_REG_R10 6 -+#define ORC_REG_R13 7 -+#define ORC_REG_BP_INDIRECT 8 -+#define ORC_REG_SP_INDIRECT 9 -+#define ORC_REG_MAX 15 -+ -+/* -+ * ORC_TYPE_CALL: Indicates that sp_reg+sp_offset resolves to PREV_SP (the -+ * caller's SP right before it made the call). Used for all callable -+ * functions, i.e. all C code and all callable asm functions. -+ * -+ * ORC_TYPE_REGS: Used in entry code to indicate that sp_reg+sp_offset points -+ * to a fully populated pt_regs from a syscall, interrupt, or exception. -+ * -+ * ORC_TYPE_REGS_IRET: Used in entry code to indicate that sp_reg+sp_offset -+ * points to the iret return frame. -+ */ -+#define ORC_TYPE_CALL 0 -+#define ORC_TYPE_REGS 1 -+#define ORC_TYPE_REGS_IRET 2 -+ -+/* -+ * This struct is more or less a vastly simplified version of the DWARF Call -+ * Frame Information standard. It contains only the necessary parts of DWARF -+ * CFI, simplified for ease of access by the in-kernel unwinder. It tells the -+ * unwinder how to find the previous SP and BP (and sometimes entry regs) on -+ * the stack for a given code address. Each instance of the struct corresponds -+ * to one or more code locations. 
-+ */ -+struct orc_entry { -+ s16 sp_offset; -+ s16 bp_offset; -+ unsigned sp_reg:4; -+ unsigned bp_reg:4; -+ unsigned type:2; -+} __packed; -+ -+#endif /* _ORC_TYPES_H */ -diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c -index 365c34ecab26..eedf089b1495 100644 ---- a/tools/objtool/builtin-check.c -+++ b/tools/objtool/builtin-check.c -@@ -52,5 +52,5 @@ int cmd_check(int argc, const char **argv) - - objname = argv[0]; - -- return check(objname, nofp); -+ return check(objname, nofp, false); - } -diff --git a/tools/objtool/builtin-orc.c b/tools/objtool/builtin-orc.c -new file mode 100644 -index 000000000000..5ca41ab0df48 ---- /dev/null -+++ b/tools/objtool/builtin-orc.c -@@ -0,0 +1,70 @@ -+/* -+ * Copyright (C) 2017 Josh Poimboeuf -+ * -+ * This program is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU General Public License -+ * as published by the Free Software Foundation; either version 2 -+ * of the License, or (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, see . -+ */ -+ -+/* -+ * objtool orc: -+ * -+ * This command analyzes a .o file and adds .orc_unwind and .orc_unwind_ip -+ * sections to it, which is used by the in-kernel ORC unwinder. -+ * -+ * This command is a superset of "objtool check". 
-+ */ -+ -+#include -+#include -+#include "builtin.h" -+#include "check.h" -+ -+ -+static const char *orc_usage[] = { -+ "objtool orc generate [] file.o", -+ "objtool orc dump file.o", -+ NULL, -+}; -+ -+extern const struct option check_options[]; -+extern bool nofp; -+ -+int cmd_orc(int argc, const char **argv) -+{ -+ const char *objname; -+ -+ argc--; argv++; -+ if (!strncmp(argv[0], "gen", 3)) { -+ argc = parse_options(argc, argv, check_options, orc_usage, 0); -+ if (argc != 1) -+ usage_with_options(orc_usage, check_options); -+ -+ objname = argv[0]; -+ -+ return check(objname, nofp, true); -+ -+ } -+ -+ if (!strcmp(argv[0], "dump")) { -+ if (argc != 2) -+ usage_with_options(orc_usage, check_options); -+ -+ objname = argv[1]; -+ -+ return orc_dump(objname); -+ } -+ -+ usage_with_options(orc_usage, check_options); -+ -+ return 0; -+} -diff --git a/tools/objtool/check.c b/tools/objtool/check.c -index 2c6d74880403..cb57c526ba17 100644 ---- a/tools/objtool/check.c -+++ b/tools/objtool/check.c -@@ -36,8 +36,8 @@ const char *objname; - static bool nofp; - struct cfi_state initial_func_cfi; - --static struct instruction *find_insn(struct objtool_file *file, -- struct section *sec, unsigned long offset) -+struct instruction *find_insn(struct objtool_file *file, -+ struct section *sec, unsigned long offset) - { - struct instruction *insn; - -@@ -259,6 +259,11 @@ static int decode_instructions(struct objtool_file *file) - if (!(sec->sh.sh_flags & SHF_EXECINSTR)) - continue; - -+ if (strcmp(sec->name, ".altinstr_replacement") && -+ strcmp(sec->name, ".altinstr_aux") && -+ strncmp(sec->name, ".discard.", 9)) -+ sec->text = true; -+ - for (offset = 0; offset < sec->len; offset += insn->len) { - insn = malloc(sizeof(*insn)); - if (!insn) { -@@ -947,6 +952,30 @@ static bool has_valid_stack_frame(struct insn_state *state) - return false; - } - -+static int update_insn_state_regs(struct instruction *insn, struct insn_state *state) -+{ -+ struct cfi_reg *cfa = &state->cfa; -+ 
struct stack_op *op = &insn->stack_op; -+ -+ if (cfa->base != CFI_SP) -+ return 0; -+ -+ /* push */ -+ if (op->dest.type == OP_DEST_PUSH) -+ cfa->offset += 8; -+ -+ /* pop */ -+ if (op->src.type == OP_SRC_POP) -+ cfa->offset -= 8; -+ -+ /* add immediate to sp */ -+ if (op->dest.type == OP_DEST_REG && op->src.type == OP_SRC_ADD && -+ op->dest.reg == CFI_SP && op->src.reg == CFI_SP) -+ cfa->offset -= op->src.offset; -+ -+ return 0; -+} -+ - static void save_reg(struct insn_state *state, unsigned char reg, int base, - int offset) - { -@@ -1032,6 +1061,9 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state) - return 0; - } - -+ if (state->type == ORC_TYPE_REGS || state->type == ORC_TYPE_REGS_IRET) -+ return update_insn_state_regs(insn, state); -+ - switch (op->dest.type) { - - case OP_DEST_REG: -@@ -1323,6 +1355,10 @@ static bool insn_state_match(struct instruction *insn, struct insn_state *state) - break; - } - -+ } else if (state1->type != state2->type) { -+ WARN_FUNC("stack state mismatch: type1=%d type2=%d", -+ insn->sec, insn->offset, state1->type, state2->type); -+ - } else if (state1->drap != state2->drap || - (state1->drap && state1->drap_reg != state2->drap_reg)) { - WARN_FUNC("stack state mismatch: drap1=%d(%d) drap2=%d(%d)", -@@ -1613,7 +1649,7 @@ static void cleanup(struct objtool_file *file) - elf_close(file->elf); - } - --int check(const char *_objname, bool _nofp) -+int check(const char *_objname, bool _nofp, bool orc) - { - struct objtool_file file; - int ret, warnings = 0; -@@ -1621,7 +1657,7 @@ int check(const char *_objname, bool _nofp) - objname = _objname; - nofp = _nofp; - -- file.elf = elf_open(objname); -+ file.elf = elf_open(objname, orc ? 
O_RDWR : O_RDONLY); - if (!file.elf) - return 1; - -@@ -1654,6 +1690,20 @@ int check(const char *_objname, bool _nofp) - warnings += ret; - } - -+ if (orc) { -+ ret = create_orc(&file); -+ if (ret < 0) -+ goto out; -+ -+ ret = create_orc_sections(&file); -+ if (ret < 0) -+ goto out; -+ -+ ret = elf_write(file.elf); -+ if (ret < 0) -+ goto out; -+ } -+ - out: - cleanup(&file); - -diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c -index 1a7e8aa2af58..6e9f980a7d26 100644 ---- a/tools/objtool/elf.c -+++ b/tools/objtool/elf.c -@@ -30,16 +30,6 @@ - #include "elf.h" - #include "warn.h" - --/* -- * Fallback for systems without this "read, mmaping if possible" cmd. -- */ --#ifndef ELF_C_READ_MMAP --#define ELF_C_READ_MMAP ELF_C_READ --#endif -- --#define WARN_ELF(format, ...) \ -- WARN(format ": %s", ##__VA_ARGS__, elf_errmsg(-1)) -- - struct section *find_section_by_name(struct elf *elf, const char *name) - { - struct section *sec; -@@ -349,9 +339,10 @@ static int read_relas(struct elf *elf) - return 0; - } - --struct elf *elf_open(const char *name) -+struct elf *elf_open(const char *name, int flags) - { - struct elf *elf; -+ Elf_Cmd cmd; - - elf_version(EV_CURRENT); - -@@ -364,13 +355,20 @@ struct elf *elf_open(const char *name) - - INIT_LIST_HEAD(&elf->sections); - -- elf->fd = open(name, O_RDONLY); -+ elf->fd = open(name, flags); - if (elf->fd == -1) { - perror("open"); - goto err; - } - -- elf->elf = elf_begin(elf->fd, ELF_C_READ_MMAP, NULL); -+ if ((flags & O_ACCMODE) == O_RDONLY) -+ cmd = ELF_C_READ_MMAP; -+ else if ((flags & O_ACCMODE) == O_RDWR) -+ cmd = ELF_C_RDWR; -+ else /* O_WRONLY */ -+ cmd = ELF_C_WRITE; -+ -+ elf->elf = elf_begin(elf->fd, cmd, NULL); - if (!elf->elf) { - WARN_ELF("elf_begin"); - goto err; -@@ -397,6 +395,194 @@ struct elf *elf_open(const char *name) - return NULL; - } - -+struct section *elf_create_section(struct elf *elf, const char *name, -+ size_t entsize, int nr) -+{ -+ struct section *sec, *shstrtab; -+ size_t size = entsize * nr; 
-+ struct Elf_Scn *s; -+ Elf_Data *data; -+ -+ sec = malloc(sizeof(*sec)); -+ if (!sec) { -+ perror("malloc"); -+ return NULL; -+ } -+ memset(sec, 0, sizeof(*sec)); -+ -+ INIT_LIST_HEAD(&sec->symbol_list); -+ INIT_LIST_HEAD(&sec->rela_list); -+ hash_init(sec->rela_hash); -+ hash_init(sec->symbol_hash); -+ -+ list_add_tail(&sec->list, &elf->sections); -+ -+ s = elf_newscn(elf->elf); -+ if (!s) { -+ WARN_ELF("elf_newscn"); -+ return NULL; -+ } -+ -+ sec->name = strdup(name); -+ if (!sec->name) { -+ perror("strdup"); -+ return NULL; -+ } -+ -+ sec->idx = elf_ndxscn(s); -+ sec->len = size; -+ sec->changed = true; -+ -+ sec->data = elf_newdata(s); -+ if (!sec->data) { -+ WARN_ELF("elf_newdata"); -+ return NULL; -+ } -+ -+ sec->data->d_size = size; -+ sec->data->d_align = 1; -+ -+ if (size) { -+ sec->data->d_buf = malloc(size); -+ if (!sec->data->d_buf) { -+ perror("malloc"); -+ return NULL; -+ } -+ memset(sec->data->d_buf, 0, size); -+ } -+ -+ if (!gelf_getshdr(s, &sec->sh)) { -+ WARN_ELF("gelf_getshdr"); -+ return NULL; -+ } -+ -+ sec->sh.sh_size = size; -+ sec->sh.sh_entsize = entsize; -+ sec->sh.sh_type = SHT_PROGBITS; -+ sec->sh.sh_addralign = 1; -+ sec->sh.sh_flags = SHF_ALLOC; -+ -+ -+ /* Add section name to .shstrtab */ -+ shstrtab = find_section_by_name(elf, ".shstrtab"); -+ if (!shstrtab) { -+ WARN("can't find .shstrtab section"); -+ return NULL; -+ } -+ -+ s = elf_getscn(elf->elf, shstrtab->idx); -+ if (!s) { -+ WARN_ELF("elf_getscn"); -+ return NULL; -+ } -+ -+ data = elf_newdata(s); -+ if (!data) { -+ WARN_ELF("elf_newdata"); -+ return NULL; -+ } -+ -+ data->d_buf = sec->name; -+ data->d_size = strlen(name) + 1; -+ data->d_align = 1; -+ -+ sec->sh.sh_name = shstrtab->len; -+ -+ shstrtab->len += strlen(name) + 1; -+ shstrtab->changed = true; -+ -+ return sec; -+} -+ -+struct section *elf_create_rela_section(struct elf *elf, struct section *base) -+{ -+ char *relaname; -+ struct section *sec; -+ -+ relaname = malloc(strlen(base->name) + strlen(".rela") + 1); 
-+ if (!relaname) { -+ perror("malloc"); -+ return NULL; -+ } -+ strcpy(relaname, ".rela"); -+ strcat(relaname, base->name); -+ -+ sec = elf_create_section(elf, relaname, sizeof(GElf_Rela), 0); -+ if (!sec) -+ return NULL; -+ -+ base->rela = sec; -+ sec->base = base; -+ -+ sec->sh.sh_type = SHT_RELA; -+ sec->sh.sh_addralign = 8; -+ sec->sh.sh_link = find_section_by_name(elf, ".symtab")->idx; -+ sec->sh.sh_info = base->idx; -+ sec->sh.sh_flags = SHF_INFO_LINK; -+ -+ return sec; -+} -+ -+int elf_rebuild_rela_section(struct section *sec) -+{ -+ struct rela *rela; -+ int nr, idx = 0, size; -+ GElf_Rela *relas; -+ -+ nr = 0; -+ list_for_each_entry(rela, &sec->rela_list, list) -+ nr++; -+ -+ size = nr * sizeof(*relas); -+ relas = malloc(size); -+ if (!relas) { -+ perror("malloc"); -+ return -1; -+ } -+ -+ sec->data->d_buf = relas; -+ sec->data->d_size = size; -+ -+ sec->sh.sh_size = size; -+ -+ idx = 0; -+ list_for_each_entry(rela, &sec->rela_list, list) { -+ relas[idx].r_offset = rela->offset; -+ relas[idx].r_addend = rela->addend; -+ relas[idx].r_info = GELF_R_INFO(rela->sym->idx, rela->type); -+ idx++; -+ } -+ -+ return 0; -+} -+ -+int elf_write(struct elf *elf) -+{ -+ struct section *sec; -+ Elf_Scn *s; -+ -+ list_for_each_entry(sec, &elf->sections, list) { -+ if (sec->changed) { -+ s = elf_getscn(elf->elf, sec->idx); -+ if (!s) { -+ WARN_ELF("elf_getscn"); -+ return -1; -+ } -+ if (!gelf_update_shdr (s, &sec->sh)) { -+ WARN_ELF("gelf_update_shdr"); -+ return -1; -+ } -+ } -+ } -+ -+ if (elf_update(elf->elf, ELF_C_WRITE) < 0) { -+ WARN_ELF("elf_update"); -+ return -1; -+ } -+ -+ return 0; -+} -+ - void elf_close(struct elf *elf) - { - struct section *sec, *tmpsec; -diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c -index ecc5b1b5d15d..31e0f9143840 100644 ---- a/tools/objtool/objtool.c -+++ b/tools/objtool/objtool.c -@@ -42,10 +42,11 @@ struct cmd_struct { - }; - - static const char objtool_usage_string[] = -- "objtool [OPTIONS] COMMAND [ARGS]"; -+ 
"objtool COMMAND [ARGS]"; - - static struct cmd_struct objtool_cmds[] = { - {"check", cmd_check, "Perform stack metadata validation on an object file" }, -+ {"orc", cmd_orc, "Generate in-place ORC unwind tables for an object file" }, - }; - - bool help; -diff --git a/tools/objtool/orc_dump.c b/tools/objtool/orc_dump.c -new file mode 100644 -index 000000000000..36c5bf6a2675 ---- /dev/null -+++ b/tools/objtool/orc_dump.c -@@ -0,0 +1,212 @@ -+/* -+ * Copyright (C) 2017 Josh Poimboeuf -+ * -+ * This program is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU General Public License -+ * as published by the Free Software Foundation; either version 2 -+ * of the License, or (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, see . 
-+ */ -+ -+#include -+#include "orc.h" -+#include "warn.h" -+ -+static const char *reg_name(unsigned int reg) -+{ -+ switch (reg) { -+ case ORC_REG_PREV_SP: -+ return "prevsp"; -+ case ORC_REG_DX: -+ return "dx"; -+ case ORC_REG_DI: -+ return "di"; -+ case ORC_REG_BP: -+ return "bp"; -+ case ORC_REG_SP: -+ return "sp"; -+ case ORC_REG_R10: -+ return "r10"; -+ case ORC_REG_R13: -+ return "r13"; -+ case ORC_REG_BP_INDIRECT: -+ return "bp(ind)"; -+ case ORC_REG_SP_INDIRECT: -+ return "sp(ind)"; -+ default: -+ return "?"; -+ } -+} -+ -+static const char *orc_type_name(unsigned int type) -+{ -+ switch (type) { -+ case ORC_TYPE_CALL: -+ return "call"; -+ case ORC_TYPE_REGS: -+ return "regs"; -+ case ORC_TYPE_REGS_IRET: -+ return "iret"; -+ default: -+ return "?"; -+ } -+} -+ -+static void print_reg(unsigned int reg, int offset) -+{ -+ if (reg == ORC_REG_BP_INDIRECT) -+ printf("(bp%+d)", offset); -+ else if (reg == ORC_REG_SP_INDIRECT) -+ printf("(sp%+d)", offset); -+ else if (reg == ORC_REG_UNDEFINED) -+ printf("(und)"); -+ else -+ printf("%s%+d", reg_name(reg), offset); -+} -+ -+int orc_dump(const char *_objname) -+{ -+ int fd, nr_entries, i, *orc_ip = NULL, orc_size = 0; -+ struct orc_entry *orc = NULL; -+ char *name; -+ unsigned long nr_sections, orc_ip_addr = 0; -+ size_t shstrtab_idx; -+ Elf *elf; -+ Elf_Scn *scn; -+ GElf_Shdr sh; -+ GElf_Rela rela; -+ GElf_Sym sym; -+ Elf_Data *data, *symtab = NULL, *rela_orc_ip = NULL; -+ -+ -+ objname = _objname; -+ -+ elf_version(EV_CURRENT); -+ -+ fd = open(objname, O_RDONLY); -+ if (fd == -1) { -+ perror("open"); -+ return -1; -+ } -+ -+ elf = elf_begin(fd, ELF_C_READ_MMAP, NULL); -+ if (!elf) { -+ WARN_ELF("elf_begin"); -+ return -1; -+ } -+ -+ if (elf_getshdrnum(elf, &nr_sections)) { -+ WARN_ELF("elf_getshdrnum"); -+ return -1; -+ } -+ -+ if (elf_getshdrstrndx(elf, &shstrtab_idx)) { -+ WARN_ELF("elf_getshdrstrndx"); -+ return -1; -+ } -+ -+ for (i = 0; i < nr_sections; i++) { -+ scn = elf_getscn(elf, i); -+ if (!scn) { -+ 
WARN_ELF("elf_getscn"); -+ return -1; -+ } -+ -+ if (!gelf_getshdr(scn, &sh)) { -+ WARN_ELF("gelf_getshdr"); -+ return -1; -+ } -+ -+ name = elf_strptr(elf, shstrtab_idx, sh.sh_name); -+ if (!name) { -+ WARN_ELF("elf_strptr"); -+ return -1; -+ } -+ -+ data = elf_getdata(scn, NULL); -+ if (!data) { -+ WARN_ELF("elf_getdata"); -+ return -1; -+ } -+ -+ if (!strcmp(name, ".symtab")) { -+ symtab = data; -+ } else if (!strcmp(name, ".orc_unwind")) { -+ orc = data->d_buf; -+ orc_size = sh.sh_size; -+ } else if (!strcmp(name, ".orc_unwind_ip")) { -+ orc_ip = data->d_buf; -+ orc_ip_addr = sh.sh_addr; -+ } else if (!strcmp(name, ".rela.orc_unwind_ip")) { -+ rela_orc_ip = data; -+ } -+ } -+ -+ if (!symtab || !orc || !orc_ip) -+ return 0; -+ -+ if (orc_size % sizeof(*orc) != 0) { -+ WARN("bad .orc_unwind section size"); -+ return -1; -+ } -+ -+ nr_entries = orc_size / sizeof(*orc); -+ for (i = 0; i < nr_entries; i++) { -+ if (rela_orc_ip) { -+ if (!gelf_getrela(rela_orc_ip, i, &rela)) { -+ WARN_ELF("gelf_getrela"); -+ return -1; -+ } -+ -+ if (!gelf_getsym(symtab, GELF_R_SYM(rela.r_info), &sym)) { -+ WARN_ELF("gelf_getsym"); -+ return -1; -+ } -+ -+ scn = elf_getscn(elf, sym.st_shndx); -+ if (!scn) { -+ WARN_ELF("elf_getscn"); -+ return -1; -+ } -+ -+ if (!gelf_getshdr(scn, &sh)) { -+ WARN_ELF("gelf_getshdr"); -+ return -1; -+ } -+ -+ name = elf_strptr(elf, shstrtab_idx, sh.sh_name); -+ if (!name || !*name) { -+ WARN_ELF("elf_strptr"); -+ return -1; -+ } -+ -+ printf("%s+%lx:", name, rela.r_addend); -+ -+ } else { -+ printf("%lx:", orc_ip_addr + (i * sizeof(int)) + orc_ip[i]); -+ } -+ -+ -+ printf(" sp:"); -+ -+ print_reg(orc[i].sp_reg, orc[i].sp_offset); -+ -+ printf(" bp:"); -+ -+ print_reg(orc[i].bp_reg, orc[i].bp_offset); -+ -+ printf(" type:%s\n", orc_type_name(orc[i].type)); -+ } -+ -+ elf_end(elf); -+ close(fd); -+ -+ return 0; -+} -diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c -new file mode 100644 -index 000000000000..e5ca31429c9b ---- /dev/null -+++ 
b/tools/objtool/orc_gen.c -@@ -0,0 +1,214 @@ -+/* -+ * Copyright (C) 2017 Josh Poimboeuf -+ * -+ * This program is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU General Public License -+ * as published by the Free Software Foundation; either version 2 -+ * of the License, or (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, see . -+ */ -+ -+#include -+#include -+ -+#include "orc.h" -+#include "check.h" -+#include "warn.h" -+ -+int create_orc(struct objtool_file *file) -+{ -+ struct instruction *insn; -+ -+ for_each_insn(file, insn) { -+ struct orc_entry *orc = &insn->orc; -+ struct cfi_reg *cfa = &insn->state.cfa; -+ struct cfi_reg *bp = &insn->state.regs[CFI_BP]; -+ -+ if (cfa->base == CFI_UNDEFINED) { -+ orc->sp_reg = ORC_REG_UNDEFINED; -+ continue; -+ } -+ -+ switch (cfa->base) { -+ case CFI_SP: -+ orc->sp_reg = ORC_REG_SP; -+ break; -+ case CFI_SP_INDIRECT: -+ orc->sp_reg = ORC_REG_SP_INDIRECT; -+ break; -+ case CFI_BP: -+ orc->sp_reg = ORC_REG_BP; -+ break; -+ case CFI_BP_INDIRECT: -+ orc->sp_reg = ORC_REG_BP_INDIRECT; -+ break; -+ case CFI_R10: -+ orc->sp_reg = ORC_REG_R10; -+ break; -+ case CFI_R13: -+ orc->sp_reg = ORC_REG_R13; -+ break; -+ case CFI_DI: -+ orc->sp_reg = ORC_REG_DI; -+ break; -+ case CFI_DX: -+ orc->sp_reg = ORC_REG_DX; -+ break; -+ default: -+ WARN_FUNC("unknown CFA base reg %d", -+ insn->sec, insn->offset, cfa->base); -+ return -1; -+ } -+ -+ switch(bp->base) { -+ case CFI_UNDEFINED: -+ orc->bp_reg = ORC_REG_UNDEFINED; -+ break; -+ case CFI_CFA: -+ orc->bp_reg = ORC_REG_PREV_SP; -+ break; -+ case CFI_BP: -+ orc->bp_reg = ORC_REG_BP; -+ break; -+ 
default: -+ WARN_FUNC("unknown BP base reg %d", -+ insn->sec, insn->offset, bp->base); -+ return -1; -+ } -+ -+ orc->sp_offset = cfa->offset; -+ orc->bp_offset = bp->offset; -+ orc->type = insn->state.type; -+ } -+ -+ return 0; -+} -+ -+static int create_orc_entry(struct section *u_sec, struct section *ip_relasec, -+ unsigned int idx, struct section *insn_sec, -+ unsigned long insn_off, struct orc_entry *o) -+{ -+ struct orc_entry *orc; -+ struct rela *rela; -+ -+ /* populate ORC data */ -+ orc = (struct orc_entry *)u_sec->data->d_buf + idx; -+ memcpy(orc, o, sizeof(*orc)); -+ -+ /* populate rela for ip */ -+ rela = malloc(sizeof(*rela)); -+ if (!rela) { -+ perror("malloc"); -+ return -1; -+ } -+ memset(rela, 0, sizeof(*rela)); -+ -+ rela->sym = insn_sec->sym; -+ rela->addend = insn_off; -+ rela->type = R_X86_64_PC32; -+ rela->offset = idx * sizeof(int); -+ -+ list_add_tail(&rela->list, &ip_relasec->rela_list); -+ hash_add(ip_relasec->rela_hash, &rela->hash, rela->offset); -+ -+ return 0; -+} -+ -+int create_orc_sections(struct objtool_file *file) -+{ -+ struct instruction *insn, *prev_insn; -+ struct section *sec, *u_sec, *ip_relasec; -+ unsigned int idx; -+ -+ struct orc_entry empty = { -+ .sp_reg = ORC_REG_UNDEFINED, -+ .bp_reg = ORC_REG_UNDEFINED, -+ .type = ORC_TYPE_CALL, -+ }; -+ -+ sec = find_section_by_name(file->elf, ".orc_unwind"); -+ if (sec) { -+ WARN("file already has .orc_unwind section, skipping"); -+ return -1; -+ } -+ -+ /* count the number of needed orcs */ -+ idx = 0; -+ for_each_sec(file, sec) { -+ if (!sec->text) -+ continue; -+ -+ prev_insn = NULL; -+ sec_for_each_insn(file, sec, insn) { -+ if (!prev_insn || -+ memcmp(&insn->orc, &prev_insn->orc, -+ sizeof(struct orc_entry))) { -+ idx++; -+ } -+ prev_insn = insn; -+ } -+ -+ /* section terminator */ -+ if (prev_insn) -+ idx++; -+ } -+ if (!idx) -+ return -1; -+ -+ -+ /* create .orc_unwind_ip and .rela.orc_unwind_ip sections */ -+ sec = elf_create_section(file->elf, ".orc_unwind_ip", 
sizeof(int), idx); -+ -+ ip_relasec = elf_create_rela_section(file->elf, sec); -+ if (!ip_relasec) -+ return -1; -+ -+ /* create .orc_unwind section */ -+ u_sec = elf_create_section(file->elf, ".orc_unwind", -+ sizeof(struct orc_entry), idx); -+ -+ /* populate sections */ -+ idx = 0; -+ for_each_sec(file, sec) { -+ if (!sec->text) -+ continue; -+ -+ prev_insn = NULL; -+ sec_for_each_insn(file, sec, insn) { -+ if (!prev_insn || memcmp(&insn->orc, &prev_insn->orc, -+ sizeof(struct orc_entry))) { -+ -+ if (create_orc_entry(u_sec, ip_relasec, idx, -+ insn->sec, insn->offset, -+ &insn->orc)) -+ return -1; -+ -+ idx++; -+ } -+ prev_insn = insn; -+ } -+ -+ /* section terminator */ -+ if (prev_insn) { -+ if (create_orc_entry(u_sec, ip_relasec, idx, -+ prev_insn->sec, -+ prev_insn->offset + prev_insn->len, -+ &empty)) -+ return -1; -+ -+ idx++; -+ } -+ } -+ -+ if (elf_rebuild_rela_section(ip_relasec)) -+ return -1; -+ -+ return 0; -+} -diff --git a/tools/objtool/Build b/tools/objtool/Build -index 6f2e1987c4d9..749becdf5b90 100644 ---- a/tools/objtool/Build -+++ b/tools/objtool/Build -@@ -1,6 +1,9 @@ - objtool-y += arch/$(SRCARCH)/ - objtool-y += builtin-check.o -+objtool-y += builtin-orc.o - objtool-y += check.o -+objtool-y += orc_gen.o -+objtool-y += orc_dump.o - objtool-y += elf.o - objtool-y += special.o - objtool-y += objtool.o --- -2.14.2 - diff --git a/patches/kernel/0035-x86-mm-64-Initialize-CR4.PCIDE-early.patch b/patches/kernel/0035-x86-mm-64-Initialize-CR4.PCIDE-early.patch new file mode 100644 index 0000000..15f8a3e --- /dev/null +++ b/patches/kernel/0035-x86-mm-64-Initialize-CR4.PCIDE-early.patch @@ -0,0 +1,237 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Sun, 10 Sep 2017 17:48:27 -0700 +Subject: [PATCH] x86/mm/64: Initialize CR4.PCIDE early +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +cpu_init() is weird: it's called rather late (after 
early +identification and after most MMU state is initialized) on the boot +CPU but is called extremely early (before identification) on secondary +CPUs. It's called just late enough on the boot CPU that its CR4 value +isn't propagated to mmu_cr4_features. + +Even if we put CR4.PCIDE into mmu_cr4_features, we'd hit two +problems. First, we'd crash in the trampoline code. That's +fixable, and I tried that. It turns out that mmu_cr4_features is +totally ignored by secondary_start_64(), though, so even with the +trampoline code fixed, it wouldn't help. + +This means that we don't currently have CR4.PCIDE reliably initialized +before we start playing with cpu_tlbstate. This is very fragile and +tends to cause boot failures if I make even small changes to the TLB +handling code. + +Make it more robust: initialize CR4.PCIDE earlier on the boot CPU +and propagate it to secondary CPUs in start_secondary(). + +( Yes, this is ugly. I think we should have improved mmu_cr4_features + to actually control CR4 during secondary bootup, but that would be + fairly intrusive at this stage. 
) + +Signed-off-by: Andy Lutomirski +Reported-by: Sai Praneeth Prakhya +Tested-by: Sai Praneeth Prakhya +Cc: Borislav Petkov +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: linux-kernel@vger.kernel.org +Fixes: 660da7c9228f ("x86/mm: Enable CR4.PCIDE on supported systems") +Signed-off-by: Ingo Molnar +(cherry picked from commit c7ad5ad297e644601747d6dbee978bf85e14f7bc) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 0e6a37a43aa876327e7d21881c09977da2d5c270) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/cpu/common.c | 49 +++++++------------------------------------- + arch/x86/kernel/setup.c | 5 ++++- + arch/x86/kernel/smpboot.c | 8 +++++--- + arch/x86/mm/init.c | 34 ++++++++++++++++++++++++++++++ + 4 files changed, 50 insertions(+), 46 deletions(-) + +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 0b80ed14ff52..4be7b209a3d6 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -169,21 +169,21 @@ static int __init x86_mpx_setup(char *s) + __setup("nompx", x86_mpx_setup); + + #ifdef CONFIG_X86_64 +-static int __init x86_pcid_setup(char *s) ++static int __init x86_nopcid_setup(char *s) + { +- /* require an exact match without trailing characters */ +- if (strlen(s)) +- return 0; ++ /* nopcid doesn't accept parameters */ ++ if (s) ++ return -EINVAL; + + /* do not emit a message if the feature is not present */ + if (!boot_cpu_has(X86_FEATURE_PCID)) +- return 1; ++ return 0; + + setup_clear_cpu_cap(X86_FEATURE_PCID); + pr_info("nopcid: PCID feature disabled\n"); +- return 1; ++ return 0; + } +-__setup("nopcid", x86_pcid_setup); ++early_param("nopcid", x86_nopcid_setup); + #endif + + static int __init x86_noinvpcid_setup(char *s) +@@ -329,38 +329,6 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) + } + } + +-static void setup_pcid(struct cpuinfo_x86 *c) +-{ +- if (cpu_has(c, X86_FEATURE_PCID)) { +- if (cpu_has(c, 
X86_FEATURE_PGE)) { +- /* +- * We'd like to use cr4_set_bits_and_update_boot(), +- * but we can't. CR4.PCIDE is special and can only +- * be set in long mode, and the early CPU init code +- * doesn't know this and would try to restore CR4.PCIDE +- * prior to entering long mode. +- * +- * Instead, we rely on the fact that hotplug, resume, +- * etc all fully restore CR4 before they write anything +- * that could have nonzero PCID bits to CR3. CR4.PCIDE +- * has no effect on the page tables themselves, so we +- * don't need it to be restored early. +- */ +- cr4_set_bits(X86_CR4_PCIDE); +- } else { +- /* +- * flush_tlb_all(), as currently implemented, won't +- * work if PCID is on but PGE is not. Since that +- * combination doesn't exist on real hardware, there's +- * no reason to try to fully support it, but it's +- * polite to avoid corrupting data if we're on +- * an improperly configured VM. +- */ +- clear_cpu_cap(c, X86_FEATURE_PCID); +- } +- } +-} +- + /* + * Protection Keys are not available in 32-bit mode. + */ +@@ -1175,9 +1143,6 @@ static void identify_cpu(struct cpuinfo_x86 *c) + setup_smep(c); + setup_smap(c); + +- /* Set up PCID */ +- setup_pcid(c); +- + /* + * The vendor-specific functions might have changed features. + * Now we do "generic changes." +diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c +index d7e8b983aa72..f964bfddfefd 100644 +--- a/arch/x86/kernel/setup.c ++++ b/arch/x86/kernel/setup.c +@@ -1174,8 +1174,11 @@ void __init setup_arch(char **cmdline_p) + * with the current CR4 value. This may not be necessary, but + * auditing all the early-boot CR4 manipulation would be needed to + * rule it out. ++ * ++ * Mask off features that don't work outside long mode (just ++ * PCIDE for now). 
+ */ +- mmu_cr4_features = __read_cr4(); ++ mmu_cr4_features = __read_cr4() & ~X86_CR4_PCIDE; + + memblock_set_current_limit(get_max_mapped()); + +diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c +index 893fd8c849e2..d05006f6c31c 100644 +--- a/arch/x86/kernel/smpboot.c ++++ b/arch/x86/kernel/smpboot.c +@@ -227,10 +227,12 @@ static int enable_start_cpu0; + static void notrace start_secondary(void *unused) + { + /* +- * Don't put *anything* before cpu_init(), SMP booting is too +- * fragile that we want to limit the things done here to the +- * most necessary things. ++ * Don't put *anything* except direct CPU state initialization ++ * before cpu_init(), SMP booting is too fragile that we want to ++ * limit the things done here to the most necessary things. + */ ++ if (boot_cpu_has(X86_FEATURE_PCID)) ++ __write_cr4(__read_cr4() | X86_CR4_PCIDE); + cpu_init(); + x86_cpuinit.early_percpu_clock_init(); + preempt_disable(); +diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c +index bf3f1065d6ad..df2624b091a7 100644 +--- a/arch/x86/mm/init.c ++++ b/arch/x86/mm/init.c +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + + /* + * We need to define the tracepoints somewhere, and tlb.c +@@ -193,6 +194,38 @@ static void __init probe_page_size_mask(void) + } + } + ++static void setup_pcid(void) ++{ ++#ifdef CONFIG_X86_64 ++ if (boot_cpu_has(X86_FEATURE_PCID)) { ++ if (boot_cpu_has(X86_FEATURE_PGE)) { ++ /* ++ * This can't be cr4_set_bits_and_update_boot() -- ++ * the trampoline code can't handle CR4.PCIDE and ++ * it wouldn't do any good anyway. Despite the name, ++ * cr4_set_bits_and_update_boot() doesn't actually ++ * cause the bits in question to remain set all the ++ * way through the secondary boot asm. ++ * ++ * Instead, we brute-force it and set CR4.PCIDE ++ * manually in start_secondary(). ++ */ ++ cr4_set_bits(X86_CR4_PCIDE); ++ } else { ++ /* ++ * flush_tlb_all(), as currently implemented, won't ++ * work if PCID is on but PGE is not. 
Since that ++ * combination doesn't exist on real hardware, there's ++ * no reason to try to fully support it, but it's ++ * polite to avoid corrupting data if we're on ++ * an improperly configured VM. ++ */ ++ setup_clear_cpu_cap(X86_FEATURE_PCID); ++ } ++ } ++#endif ++} ++ + #ifdef CONFIG_X86_32 + #define NR_RANGE_MR 3 + #else /* CONFIG_X86_64 */ +@@ -592,6 +625,7 @@ void __init init_mem_mapping(void) + unsigned long end; + + probe_page_size_mask(); ++ setup_pcid(); + + #ifdef CONFIG_X86_64 + end = max_pfn << PAGE_SHIFT; +-- +2.14.2 + diff --git a/patches/kernel/0036-objtool-Add-ORC-unwind-table-generation.patch b/patches/kernel/0036-objtool-Add-ORC-unwind-table-generation.patch new file mode 100644 index 0000000..f4bce26 --- /dev/null +++ b/patches/kernel/0036-objtool-Add-ORC-unwind-table-generation.patch @@ -0,0 +1,1339 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf +Date: Tue, 11 Jul 2017 10:33:42 -0500 +Subject: [PATCH] objtool: Add ORC unwind table generation +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Now that objtool knows the states of all registers on the stack for each +instruction, it's straightforward to generate debuginfo for an unwinder +to use. + +Instead of generating DWARF, generate a new format called ORC, which is +more suitable for an in-kernel unwinder. See +Documentation/x86/orc-unwinder.txt for a more detailed description of +this new debuginfo format and why it's preferable to DWARF. + +Signed-off-by: Josh Poimboeuf +Cc: Andy Lutomirski +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Denys Vlasenko +Cc: H. 
Peter Anvin +Cc: Jiri Slaby +Cc: Linus Torvalds +Cc: Mike Galbraith +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: live-patching@vger.kernel.org +Link: http://lkml.kernel.org/r/c9b9f01ba6c5ed2bdc9bb0957b78167fdbf9632e.1499786555.git.jpoimboe@redhat.com +Signed-off-by: Ingo Molnar +(cherry picked from commit 627fce14809ba5610b0cb476cd0186d3fcedecfc) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 9460f7766786ad0f8330f78f22b81842632a5398) +Signed-off-by: Fabian Grünbichler +--- + tools/objtool/Documentation/stack-validation.txt | 56 ++---- + tools/objtool/builtin.h | 1 + + tools/objtool/check.h | 15 +- + tools/objtool/elf.h | 15 +- + tools/objtool/orc.h | 30 ++++ + tools/objtool/orc_types.h | 85 +++++++++ + tools/objtool/builtin-check.c | 2 +- + tools/objtool/builtin-orc.c | 70 ++++++++ + tools/objtool/check.c | 58 +++++- + tools/objtool/elf.c | 212 ++++++++++++++++++++-- + tools/objtool/objtool.c | 3 +- + tools/objtool/orc_dump.c | 212 ++++++++++++++++++++++ + tools/objtool/orc_gen.c | 214 +++++++++++++++++++++++ + tools/objtool/Build | 3 + + 14 files changed, 916 insertions(+), 60 deletions(-) + create mode 100644 tools/objtool/orc.h + create mode 100644 tools/objtool/orc_types.h + create mode 100644 tools/objtool/builtin-orc.c + create mode 100644 tools/objtool/orc_dump.c + create mode 100644 tools/objtool/orc_gen.c + +diff --git a/tools/objtool/Documentation/stack-validation.txt b/tools/objtool/Documentation/stack-validation.txt +index 17c1195f11f4..6a1af43862df 100644 +--- a/tools/objtool/Documentation/stack-validation.txt ++++ b/tools/objtool/Documentation/stack-validation.txt +@@ -11,9 +11,6 @@ analyzes every .o file and ensures the validity of its stack metadata. + It enforces a set of rules on asm code and C inline assembly code so + that stack traces can be reliable. + +-Currently it only checks frame pointer usage, but there are plans to add +-CFI validation for C files and CFI generation for asm files. 
+- + For each function, it recursively follows all possible code paths and + validates the correct frame pointer state at each instruction. + +@@ -23,6 +20,10 @@ alternative execution paths to a given instruction (or set of + instructions). Similarly, it knows how to follow switch statements, for + which gcc sometimes uses jump tables. + ++(Objtool also has an 'orc generate' subcommand which generates debuginfo ++for the ORC unwinder. See Documentation/x86/orc-unwinder.txt in the ++kernel tree for more details.) ++ + + Why do we need stack metadata validation? + ----------------------------------------- +@@ -93,37 +94,14 @@ a) More reliable stack traces for frame pointer enabled kernels + or at the very end of the function after the stack frame has been + destroyed. This is an inherent limitation of frame pointers. + +-b) 100% reliable stack traces for DWARF enabled kernels +- +- (NOTE: This is not yet implemented) +- +- As an alternative to frame pointers, DWARF Call Frame Information +- (CFI) metadata can be used to walk the stack. Unlike frame pointers, +- CFI metadata is out of band. So it doesn't affect runtime +- performance and it can be reliable even when interrupts or exceptions +- are involved. +- +- For C code, gcc automatically generates DWARF CFI metadata. But for +- asm code, generating CFI is a tedious manual approach which requires +- manually placed .cfi assembler macros to be scattered throughout the +- code. It's clumsy and very easy to get wrong, and it makes the real +- code harder to read. +- +- Stacktool will improve this situation in several ways. For code +- which already has CFI annotations, it will validate them. For code +- which doesn't have CFI annotations, it will generate them. So an +- architecture can opt to strip out all the manual .cfi annotations +- from their asm code and have objtool generate them instead. 
++b) ORC (Oops Rewind Capability) unwind table generation + +- We might also add a runtime stack validation debug option where we +- periodically walk the stack from schedule() and/or an NMI to ensure +- that the stack metadata is sane and that we reach the bottom of the +- stack. ++ An alternative to frame pointers and DWARF, ORC unwind data can be ++ used to walk the stack. Unlike frame pointers, ORC data is out of ++ band. So it doesn't affect runtime performance and it can be ++ reliable even when interrupts or exceptions are involved. + +- So the benefit of objtool here will be that external tooling should +- always show perfect stack traces. And the same will be true for +- kernel warning/oops traces if the architecture has a runtime DWARF +- unwinder. ++ For more details, see Documentation/x86/orc-unwinder.txt. + + c) Higher live patching compatibility rate + +@@ -211,7 +189,7 @@ they mean, and suggestions for how to fix them. + function, add proper frame pointer logic using the FRAME_BEGIN and + FRAME_END macros. Otherwise, if it's not a callable function, remove + its ELF function annotation by changing ENDPROC to END, and instead +- use the manual CFI hint macros in asm/undwarf.h. ++ use the manual unwind hint macros in asm/unwind_hints.h. + + If it's a GCC-compiled .c file, the error may be because the function + uses an inline asm() statement which has a "call" instruction. An +@@ -231,8 +209,8 @@ they mean, and suggestions for how to fix them. + If the error is for an asm file, and the instruction is inside (or + reachable from) a callable function, the function should be annotated + with the ENTRY/ENDPROC macros (ENDPROC is the important one). 
+- Otherwise, the code should probably be annotated with the CFI hint +- macros in asm/undwarf.h so objtool and the unwinder can know the ++ Otherwise, the code should probably be annotated with the unwind hint ++ macros in asm/unwind_hints.h so objtool and the unwinder can know the + stack state associated with the code. + + If you're 100% sure the code won't affect stack traces, or if you're +@@ -258,7 +236,7 @@ they mean, and suggestions for how to fix them. + instructions aren't allowed in a callable function, and are most + likely part of the kernel entry code. They should usually not have + the callable function annotation (ENDPROC) and should always be +- annotated with the CFI hint macros in asm/undwarf.h. ++ annotated with the unwind hint macros in asm/unwind_hints.h. + + + 6. file.o: warning: objtool: func()+0x26: sibling call from callable instruction with modified stack frame +@@ -272,7 +250,7 @@ they mean, and suggestions for how to fix them. + + If the instruction is not actually in a callable function (e.g. + kernel entry code), change ENDPROC to END and annotate manually with +- the CFI hint macros in asm/undwarf.h. ++ the unwind hint macros in asm/unwind_hints.h. + + + 7. file: warning: objtool: func()+0x5c: stack state mismatch +@@ -288,8 +266,8 @@ they mean, and suggestions for how to fix them. + + Another possibility is that the code has some asm or inline asm which + does some unusual things to the stack or the frame pointer. In such +- cases it's probably appropriate to use the CFI hint macros in +- asm/undwarf.h. ++ cases it's probably appropriate to use the unwind hint macros in ++ asm/unwind_hints.h. + + + 8. 
file.o: warning: objtool: funcA() falls through to next function funcB() +diff --git a/tools/objtool/builtin.h b/tools/objtool/builtin.h +index 34d2ba78a616..dd526067fed5 100644 +--- a/tools/objtool/builtin.h ++++ b/tools/objtool/builtin.h +@@ -18,5 +18,6 @@ + #define _BUILTIN_H + + extern int cmd_check(int argc, const char **argv); ++extern int cmd_orc(int argc, const char **argv); + + #endif /* _BUILTIN_H */ +diff --git a/tools/objtool/check.h b/tools/objtool/check.h +index da85f5b00ec6..046874bbe226 100644 +--- a/tools/objtool/check.h ++++ b/tools/objtool/check.h +@@ -22,12 +22,14 @@ + #include "elf.h" + #include "cfi.h" + #include "arch.h" ++#include "orc.h" + #include + + struct insn_state { + struct cfi_reg cfa; + struct cfi_reg regs[CFI_NUM_REGS]; + int stack_size; ++ unsigned char type; + bool bp_scratch; + bool drap; + int drap_reg; +@@ -48,6 +50,7 @@ struct instruction { + struct symbol *func; + struct stack_op stack_op; + struct insn_state state; ++ struct orc_entry orc; + }; + + struct objtool_file { +@@ -58,9 +61,19 @@ struct objtool_file { + bool ignore_unreachables, c_file; + }; + +-int check(const char *objname, bool nofp); ++int check(const char *objname, bool nofp, bool orc); ++ ++struct instruction *find_insn(struct objtool_file *file, ++ struct section *sec, unsigned long offset); + + #define for_each_insn(file, insn) \ + list_for_each_entry(insn, &file->insn_list, list) + ++#define sec_for_each_insn(file, sec, insn) \ ++ for (insn = find_insn(file, sec, 0); \ ++ insn && &insn->list != &file->insn_list && \ ++ insn->sec == sec; \ ++ insn = list_next_entry(insn, list)) ++ ++ + #endif /* _CHECK_H */ +diff --git a/tools/objtool/elf.h b/tools/objtool/elf.h +index 343968b778cb..d86e2ff14466 100644 +--- a/tools/objtool/elf.h ++++ b/tools/objtool/elf.h +@@ -28,6 +28,13 @@ + # define elf_getshdrstrndx elf_getshstrndx + #endif + ++/* ++ * Fallback for systems without this "read, mmaping if possible" cmd. 
++ */ ++#ifndef ELF_C_READ_MMAP ++#define ELF_C_READ_MMAP ELF_C_READ ++#endif ++ + struct section { + struct list_head list; + GElf_Shdr sh; +@@ -41,6 +48,7 @@ struct section { + char *name; + int idx; + unsigned int len; ++ bool changed, text; + }; + + struct symbol { +@@ -75,7 +83,7 @@ struct elf { + }; + + +-struct elf *elf_open(const char *name); ++struct elf *elf_open(const char *name, int flags); + struct section *find_section_by_name(struct elf *elf, const char *name); + struct symbol *find_symbol_by_offset(struct section *sec, unsigned long offset); + struct symbol *find_symbol_containing(struct section *sec, unsigned long offset); +@@ -83,6 +91,11 @@ struct rela *find_rela_by_dest(struct section *sec, unsigned long offset); + struct rela *find_rela_by_dest_range(struct section *sec, unsigned long offset, + unsigned int len); + struct symbol *find_containing_func(struct section *sec, unsigned long offset); ++struct section *elf_create_section(struct elf *elf, const char *name, size_t ++ entsize, int nr); ++struct section *elf_create_rela_section(struct elf *elf, struct section *base); ++int elf_rebuild_rela_section(struct section *sec); ++int elf_write(struct elf *elf); + void elf_close(struct elf *elf); + + #define for_each_sec(file, sec) \ +diff --git a/tools/objtool/orc.h b/tools/objtool/orc.h +new file mode 100644 +index 000000000000..a4139e386ef3 +--- /dev/null ++++ b/tools/objtool/orc.h +@@ -0,0 +1,30 @@ ++/* ++ * Copyright (C) 2017 Josh Poimboeuf ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version 2 ++ * of the License, or (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, see . ++ */ ++ ++#ifndef _ORC_H ++#define _ORC_H ++ ++#include "orc_types.h" ++ ++struct objtool_file; ++ ++int create_orc(struct objtool_file *file); ++int create_orc_sections(struct objtool_file *file); ++ ++int orc_dump(const char *objname); ++ ++#endif /* _ORC_H */ +diff --git a/tools/objtool/orc_types.h b/tools/objtool/orc_types.h +new file mode 100644 +index 000000000000..fc5cf6cffd9a +--- /dev/null ++++ b/tools/objtool/orc_types.h +@@ -0,0 +1,85 @@ ++/* ++ * Copyright (C) 2017 Josh Poimboeuf ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version 2 ++ * of the License, or (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, see . ++ */ ++ ++#ifndef _ORC_TYPES_H ++#define _ORC_TYPES_H ++ ++#include ++#include ++ ++/* ++ * The ORC_REG_* registers are base registers which are used to find other ++ * registers on the stack. ++ * ++ * ORC_REG_PREV_SP, also known as DWARF Call Frame Address (CFA), is the ++ * address of the previous frame: the caller's SP before it called the current ++ * function. ++ * ++ * ORC_REG_UNDEFINED means the corresponding register's value didn't change in ++ * the current frame. ++ * ++ * The most commonly used base registers are SP and BP -- which the previous SP ++ * is usually based on -- and PREV_SP and UNDEFINED -- which the previous BP is ++ * usually based on. 
++ * ++ * The rest of the base registers are needed for special cases like entry code ++ * and GCC realigned stacks. ++ */ ++#define ORC_REG_UNDEFINED 0 ++#define ORC_REG_PREV_SP 1 ++#define ORC_REG_DX 2 ++#define ORC_REG_DI 3 ++#define ORC_REG_BP 4 ++#define ORC_REG_SP 5 ++#define ORC_REG_R10 6 ++#define ORC_REG_R13 7 ++#define ORC_REG_BP_INDIRECT 8 ++#define ORC_REG_SP_INDIRECT 9 ++#define ORC_REG_MAX 15 ++ ++/* ++ * ORC_TYPE_CALL: Indicates that sp_reg+sp_offset resolves to PREV_SP (the ++ * caller's SP right before it made the call). Used for all callable ++ * functions, i.e. all C code and all callable asm functions. ++ * ++ * ORC_TYPE_REGS: Used in entry code to indicate that sp_reg+sp_offset points ++ * to a fully populated pt_regs from a syscall, interrupt, or exception. ++ * ++ * ORC_TYPE_REGS_IRET: Used in entry code to indicate that sp_reg+sp_offset ++ * points to the iret return frame. ++ */ ++#define ORC_TYPE_CALL 0 ++#define ORC_TYPE_REGS 1 ++#define ORC_TYPE_REGS_IRET 2 ++ ++/* ++ * This struct is more or less a vastly simplified version of the DWARF Call ++ * Frame Information standard. It contains only the necessary parts of DWARF ++ * CFI, simplified for ease of access by the in-kernel unwinder. It tells the ++ * unwinder how to find the previous SP and BP (and sometimes entry regs) on ++ * the stack for a given code address. Each instance of the struct corresponds ++ * to one or more code locations. 
++ */ ++struct orc_entry { ++ s16 sp_offset; ++ s16 bp_offset; ++ unsigned sp_reg:4; ++ unsigned bp_reg:4; ++ unsigned type:2; ++} __packed; ++ ++#endif /* _ORC_TYPES_H */ +diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c +index 365c34ecab26..eedf089b1495 100644 +--- a/tools/objtool/builtin-check.c ++++ b/tools/objtool/builtin-check.c +@@ -52,5 +52,5 @@ int cmd_check(int argc, const char **argv) + + objname = argv[0]; + +- return check(objname, nofp); ++ return check(objname, nofp, false); + } +diff --git a/tools/objtool/builtin-orc.c b/tools/objtool/builtin-orc.c +new file mode 100644 +index 000000000000..5ca41ab0df48 +--- /dev/null ++++ b/tools/objtool/builtin-orc.c +@@ -0,0 +1,70 @@ ++/* ++ * Copyright (C) 2017 Josh Poimboeuf ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version 2 ++ * of the License, or (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, see . ++ */ ++ ++/* ++ * objtool orc: ++ * ++ * This command analyzes a .o file and adds .orc_unwind and .orc_unwind_ip ++ * sections to it, which is used by the in-kernel ORC unwinder. ++ * ++ * This command is a superset of "objtool check". 
++ */ ++ ++#include ++#include ++#include "builtin.h" ++#include "check.h" ++ ++ ++static const char *orc_usage[] = { ++ "objtool orc generate [] file.o", ++ "objtool orc dump file.o", ++ NULL, ++}; ++ ++extern const struct option check_options[]; ++extern bool nofp; ++ ++int cmd_orc(int argc, const char **argv) ++{ ++ const char *objname; ++ ++ argc--; argv++; ++ if (!strncmp(argv[0], "gen", 3)) { ++ argc = parse_options(argc, argv, check_options, orc_usage, 0); ++ if (argc != 1) ++ usage_with_options(orc_usage, check_options); ++ ++ objname = argv[0]; ++ ++ return check(objname, nofp, true); ++ ++ } ++ ++ if (!strcmp(argv[0], "dump")) { ++ if (argc != 2) ++ usage_with_options(orc_usage, check_options); ++ ++ objname = argv[1]; ++ ++ return orc_dump(objname); ++ } ++ ++ usage_with_options(orc_usage, check_options); ++ ++ return 0; ++} +diff --git a/tools/objtool/check.c b/tools/objtool/check.c +index 2c6d74880403..cb57c526ba17 100644 +--- a/tools/objtool/check.c ++++ b/tools/objtool/check.c +@@ -36,8 +36,8 @@ const char *objname; + static bool nofp; + struct cfi_state initial_func_cfi; + +-static struct instruction *find_insn(struct objtool_file *file, +- struct section *sec, unsigned long offset) ++struct instruction *find_insn(struct objtool_file *file, ++ struct section *sec, unsigned long offset) + { + struct instruction *insn; + +@@ -259,6 +259,11 @@ static int decode_instructions(struct objtool_file *file) + if (!(sec->sh.sh_flags & SHF_EXECINSTR)) + continue; + ++ if (strcmp(sec->name, ".altinstr_replacement") && ++ strcmp(sec->name, ".altinstr_aux") && ++ strncmp(sec->name, ".discard.", 9)) ++ sec->text = true; ++ + for (offset = 0; offset < sec->len; offset += insn->len) { + insn = malloc(sizeof(*insn)); + if (!insn) { +@@ -947,6 +952,30 @@ static bool has_valid_stack_frame(struct insn_state *state) + return false; + } + ++static int update_insn_state_regs(struct instruction *insn, struct insn_state *state) ++{ ++ struct cfi_reg *cfa = &state->cfa; ++ 
struct stack_op *op = &insn->stack_op; ++ ++ if (cfa->base != CFI_SP) ++ return 0; ++ ++ /* push */ ++ if (op->dest.type == OP_DEST_PUSH) ++ cfa->offset += 8; ++ ++ /* pop */ ++ if (op->src.type == OP_SRC_POP) ++ cfa->offset -= 8; ++ ++ /* add immediate to sp */ ++ if (op->dest.type == OP_DEST_REG && op->src.type == OP_SRC_ADD && ++ op->dest.reg == CFI_SP && op->src.reg == CFI_SP) ++ cfa->offset -= op->src.offset; ++ ++ return 0; ++} ++ + static void save_reg(struct insn_state *state, unsigned char reg, int base, + int offset) + { +@@ -1032,6 +1061,9 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state) + return 0; + } + ++ if (state->type == ORC_TYPE_REGS || state->type == ORC_TYPE_REGS_IRET) ++ return update_insn_state_regs(insn, state); ++ + switch (op->dest.type) { + + case OP_DEST_REG: +@@ -1323,6 +1355,10 @@ static bool insn_state_match(struct instruction *insn, struct insn_state *state) + break; + } + ++ } else if (state1->type != state2->type) { ++ WARN_FUNC("stack state mismatch: type1=%d type2=%d", ++ insn->sec, insn->offset, state1->type, state2->type); ++ + } else if (state1->drap != state2->drap || + (state1->drap && state1->drap_reg != state2->drap_reg)) { + WARN_FUNC("stack state mismatch: drap1=%d(%d) drap2=%d(%d)", +@@ -1613,7 +1649,7 @@ static void cleanup(struct objtool_file *file) + elf_close(file->elf); + } + +-int check(const char *_objname, bool _nofp) ++int check(const char *_objname, bool _nofp, bool orc) + { + struct objtool_file file; + int ret, warnings = 0; +@@ -1621,7 +1657,7 @@ int check(const char *_objname, bool _nofp) + objname = _objname; + nofp = _nofp; + +- file.elf = elf_open(objname); ++ file.elf = elf_open(objname, orc ? 
O_RDWR : O_RDONLY); + if (!file.elf) + return 1; + +@@ -1654,6 +1690,20 @@ int check(const char *_objname, bool _nofp) + warnings += ret; + } + ++ if (orc) { ++ ret = create_orc(&file); ++ if (ret < 0) ++ goto out; ++ ++ ret = create_orc_sections(&file); ++ if (ret < 0) ++ goto out; ++ ++ ret = elf_write(file.elf); ++ if (ret < 0) ++ goto out; ++ } ++ + out: + cleanup(&file); + +diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c +index 1a7e8aa2af58..6e9f980a7d26 100644 +--- a/tools/objtool/elf.c ++++ b/tools/objtool/elf.c +@@ -30,16 +30,6 @@ + #include "elf.h" + #include "warn.h" + +-/* +- * Fallback for systems without this "read, mmaping if possible" cmd. +- */ +-#ifndef ELF_C_READ_MMAP +-#define ELF_C_READ_MMAP ELF_C_READ +-#endif +- +-#define WARN_ELF(format, ...) \ +- WARN(format ": %s", ##__VA_ARGS__, elf_errmsg(-1)) +- + struct section *find_section_by_name(struct elf *elf, const char *name) + { + struct section *sec; +@@ -349,9 +339,10 @@ static int read_relas(struct elf *elf) + return 0; + } + +-struct elf *elf_open(const char *name) ++struct elf *elf_open(const char *name, int flags) + { + struct elf *elf; ++ Elf_Cmd cmd; + + elf_version(EV_CURRENT); + +@@ -364,13 +355,20 @@ struct elf *elf_open(const char *name) + + INIT_LIST_HEAD(&elf->sections); + +- elf->fd = open(name, O_RDONLY); ++ elf->fd = open(name, flags); + if (elf->fd == -1) { + perror("open"); + goto err; + } + +- elf->elf = elf_begin(elf->fd, ELF_C_READ_MMAP, NULL); ++ if ((flags & O_ACCMODE) == O_RDONLY) ++ cmd = ELF_C_READ_MMAP; ++ else if ((flags & O_ACCMODE) == O_RDWR) ++ cmd = ELF_C_RDWR; ++ else /* O_WRONLY */ ++ cmd = ELF_C_WRITE; ++ ++ elf->elf = elf_begin(elf->fd, cmd, NULL); + if (!elf->elf) { + WARN_ELF("elf_begin"); + goto err; +@@ -397,6 +395,194 @@ struct elf *elf_open(const char *name) + return NULL; + } + ++struct section *elf_create_section(struct elf *elf, const char *name, ++ size_t entsize, int nr) ++{ ++ struct section *sec, *shstrtab; ++ size_t size = entsize * nr; 
++ struct Elf_Scn *s; ++ Elf_Data *data; ++ ++ sec = malloc(sizeof(*sec)); ++ if (!sec) { ++ perror("malloc"); ++ return NULL; ++ } ++ memset(sec, 0, sizeof(*sec)); ++ ++ INIT_LIST_HEAD(&sec->symbol_list); ++ INIT_LIST_HEAD(&sec->rela_list); ++ hash_init(sec->rela_hash); ++ hash_init(sec->symbol_hash); ++ ++ list_add_tail(&sec->list, &elf->sections); ++ ++ s = elf_newscn(elf->elf); ++ if (!s) { ++ WARN_ELF("elf_newscn"); ++ return NULL; ++ } ++ ++ sec->name = strdup(name); ++ if (!sec->name) { ++ perror("strdup"); ++ return NULL; ++ } ++ ++ sec->idx = elf_ndxscn(s); ++ sec->len = size; ++ sec->changed = true; ++ ++ sec->data = elf_newdata(s); ++ if (!sec->data) { ++ WARN_ELF("elf_newdata"); ++ return NULL; ++ } ++ ++ sec->data->d_size = size; ++ sec->data->d_align = 1; ++ ++ if (size) { ++ sec->data->d_buf = malloc(size); ++ if (!sec->data->d_buf) { ++ perror("malloc"); ++ return NULL; ++ } ++ memset(sec->data->d_buf, 0, size); ++ } ++ ++ if (!gelf_getshdr(s, &sec->sh)) { ++ WARN_ELF("gelf_getshdr"); ++ return NULL; ++ } ++ ++ sec->sh.sh_size = size; ++ sec->sh.sh_entsize = entsize; ++ sec->sh.sh_type = SHT_PROGBITS; ++ sec->sh.sh_addralign = 1; ++ sec->sh.sh_flags = SHF_ALLOC; ++ ++ ++ /* Add section name to .shstrtab */ ++ shstrtab = find_section_by_name(elf, ".shstrtab"); ++ if (!shstrtab) { ++ WARN("can't find .shstrtab section"); ++ return NULL; ++ } ++ ++ s = elf_getscn(elf->elf, shstrtab->idx); ++ if (!s) { ++ WARN_ELF("elf_getscn"); ++ return NULL; ++ } ++ ++ data = elf_newdata(s); ++ if (!data) { ++ WARN_ELF("elf_newdata"); ++ return NULL; ++ } ++ ++ data->d_buf = sec->name; ++ data->d_size = strlen(name) + 1; ++ data->d_align = 1; ++ ++ sec->sh.sh_name = shstrtab->len; ++ ++ shstrtab->len += strlen(name) + 1; ++ shstrtab->changed = true; ++ ++ return sec; ++} ++ ++struct section *elf_create_rela_section(struct elf *elf, struct section *base) ++{ ++ char *relaname; ++ struct section *sec; ++ ++ relaname = malloc(strlen(base->name) + strlen(".rela") + 1); 
++ if (!relaname) { ++ perror("malloc"); ++ return NULL; ++ } ++ strcpy(relaname, ".rela"); ++ strcat(relaname, base->name); ++ ++ sec = elf_create_section(elf, relaname, sizeof(GElf_Rela), 0); ++ if (!sec) ++ return NULL; ++ ++ base->rela = sec; ++ sec->base = base; ++ ++ sec->sh.sh_type = SHT_RELA; ++ sec->sh.sh_addralign = 8; ++ sec->sh.sh_link = find_section_by_name(elf, ".symtab")->idx; ++ sec->sh.sh_info = base->idx; ++ sec->sh.sh_flags = SHF_INFO_LINK; ++ ++ return sec; ++} ++ ++int elf_rebuild_rela_section(struct section *sec) ++{ ++ struct rela *rela; ++ int nr, idx = 0, size; ++ GElf_Rela *relas; ++ ++ nr = 0; ++ list_for_each_entry(rela, &sec->rela_list, list) ++ nr++; ++ ++ size = nr * sizeof(*relas); ++ relas = malloc(size); ++ if (!relas) { ++ perror("malloc"); ++ return -1; ++ } ++ ++ sec->data->d_buf = relas; ++ sec->data->d_size = size; ++ ++ sec->sh.sh_size = size; ++ ++ idx = 0; ++ list_for_each_entry(rela, &sec->rela_list, list) { ++ relas[idx].r_offset = rela->offset; ++ relas[idx].r_addend = rela->addend; ++ relas[idx].r_info = GELF_R_INFO(rela->sym->idx, rela->type); ++ idx++; ++ } ++ ++ return 0; ++} ++ ++int elf_write(struct elf *elf) ++{ ++ struct section *sec; ++ Elf_Scn *s; ++ ++ list_for_each_entry(sec, &elf->sections, list) { ++ if (sec->changed) { ++ s = elf_getscn(elf->elf, sec->idx); ++ if (!s) { ++ WARN_ELF("elf_getscn"); ++ return -1; ++ } ++ if (!gelf_update_shdr (s, &sec->sh)) { ++ WARN_ELF("gelf_update_shdr"); ++ return -1; ++ } ++ } ++ } ++ ++ if (elf_update(elf->elf, ELF_C_WRITE) < 0) { ++ WARN_ELF("elf_update"); ++ return -1; ++ } ++ ++ return 0; ++} ++ + void elf_close(struct elf *elf) + { + struct section *sec, *tmpsec; +diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c +index ecc5b1b5d15d..31e0f9143840 100644 +--- a/tools/objtool/objtool.c ++++ b/tools/objtool/objtool.c +@@ -42,10 +42,11 @@ struct cmd_struct { + }; + + static const char objtool_usage_string[] = +- "objtool [OPTIONS] COMMAND [ARGS]"; ++ 
"objtool COMMAND [ARGS]"; + + static struct cmd_struct objtool_cmds[] = { + {"check", cmd_check, "Perform stack metadata validation on an object file" }, ++ {"orc", cmd_orc, "Generate in-place ORC unwind tables for an object file" }, + }; + + bool help; +diff --git a/tools/objtool/orc_dump.c b/tools/objtool/orc_dump.c +new file mode 100644 +index 000000000000..36c5bf6a2675 +--- /dev/null ++++ b/tools/objtool/orc_dump.c +@@ -0,0 +1,212 @@ ++/* ++ * Copyright (C) 2017 Josh Poimboeuf ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version 2 ++ * of the License, or (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, see . 
++ */ ++ ++#include ++#include "orc.h" ++#include "warn.h" ++ ++static const char *reg_name(unsigned int reg) ++{ ++ switch (reg) { ++ case ORC_REG_PREV_SP: ++ return "prevsp"; ++ case ORC_REG_DX: ++ return "dx"; ++ case ORC_REG_DI: ++ return "di"; ++ case ORC_REG_BP: ++ return "bp"; ++ case ORC_REG_SP: ++ return "sp"; ++ case ORC_REG_R10: ++ return "r10"; ++ case ORC_REG_R13: ++ return "r13"; ++ case ORC_REG_BP_INDIRECT: ++ return "bp(ind)"; ++ case ORC_REG_SP_INDIRECT: ++ return "sp(ind)"; ++ default: ++ return "?"; ++ } ++} ++ ++static const char *orc_type_name(unsigned int type) ++{ ++ switch (type) { ++ case ORC_TYPE_CALL: ++ return "call"; ++ case ORC_TYPE_REGS: ++ return "regs"; ++ case ORC_TYPE_REGS_IRET: ++ return "iret"; ++ default: ++ return "?"; ++ } ++} ++ ++static void print_reg(unsigned int reg, int offset) ++{ ++ if (reg == ORC_REG_BP_INDIRECT) ++ printf("(bp%+d)", offset); ++ else if (reg == ORC_REG_SP_INDIRECT) ++ printf("(sp%+d)", offset); ++ else if (reg == ORC_REG_UNDEFINED) ++ printf("(und)"); ++ else ++ printf("%s%+d", reg_name(reg), offset); ++} ++ ++int orc_dump(const char *_objname) ++{ ++ int fd, nr_entries, i, *orc_ip = NULL, orc_size = 0; ++ struct orc_entry *orc = NULL; ++ char *name; ++ unsigned long nr_sections, orc_ip_addr = 0; ++ size_t shstrtab_idx; ++ Elf *elf; ++ Elf_Scn *scn; ++ GElf_Shdr sh; ++ GElf_Rela rela; ++ GElf_Sym sym; ++ Elf_Data *data, *symtab = NULL, *rela_orc_ip = NULL; ++ ++ ++ objname = _objname; ++ ++ elf_version(EV_CURRENT); ++ ++ fd = open(objname, O_RDONLY); ++ if (fd == -1) { ++ perror("open"); ++ return -1; ++ } ++ ++ elf = elf_begin(fd, ELF_C_READ_MMAP, NULL); ++ if (!elf) { ++ WARN_ELF("elf_begin"); ++ return -1; ++ } ++ ++ if (elf_getshdrnum(elf, &nr_sections)) { ++ WARN_ELF("elf_getshdrnum"); ++ return -1; ++ } ++ ++ if (elf_getshdrstrndx(elf, &shstrtab_idx)) { ++ WARN_ELF("elf_getshdrstrndx"); ++ return -1; ++ } ++ ++ for (i = 0; i < nr_sections; i++) { ++ scn = elf_getscn(elf, i); ++ if (!scn) { ++ 
WARN_ELF("elf_getscn"); ++ return -1; ++ } ++ ++ if (!gelf_getshdr(scn, &sh)) { ++ WARN_ELF("gelf_getshdr"); ++ return -1; ++ } ++ ++ name = elf_strptr(elf, shstrtab_idx, sh.sh_name); ++ if (!name) { ++ WARN_ELF("elf_strptr"); ++ return -1; ++ } ++ ++ data = elf_getdata(scn, NULL); ++ if (!data) { ++ WARN_ELF("elf_getdata"); ++ return -1; ++ } ++ ++ if (!strcmp(name, ".symtab")) { ++ symtab = data; ++ } else if (!strcmp(name, ".orc_unwind")) { ++ orc = data->d_buf; ++ orc_size = sh.sh_size; ++ } else if (!strcmp(name, ".orc_unwind_ip")) { ++ orc_ip = data->d_buf; ++ orc_ip_addr = sh.sh_addr; ++ } else if (!strcmp(name, ".rela.orc_unwind_ip")) { ++ rela_orc_ip = data; ++ } ++ } ++ ++ if (!symtab || !orc || !orc_ip) ++ return 0; ++ ++ if (orc_size % sizeof(*orc) != 0) { ++ WARN("bad .orc_unwind section size"); ++ return -1; ++ } ++ ++ nr_entries = orc_size / sizeof(*orc); ++ for (i = 0; i < nr_entries; i++) { ++ if (rela_orc_ip) { ++ if (!gelf_getrela(rela_orc_ip, i, &rela)) { ++ WARN_ELF("gelf_getrela"); ++ return -1; ++ } ++ ++ if (!gelf_getsym(symtab, GELF_R_SYM(rela.r_info), &sym)) { ++ WARN_ELF("gelf_getsym"); ++ return -1; ++ } ++ ++ scn = elf_getscn(elf, sym.st_shndx); ++ if (!scn) { ++ WARN_ELF("elf_getscn"); ++ return -1; ++ } ++ ++ if (!gelf_getshdr(scn, &sh)) { ++ WARN_ELF("gelf_getshdr"); ++ return -1; ++ } ++ ++ name = elf_strptr(elf, shstrtab_idx, sh.sh_name); ++ if (!name || !*name) { ++ WARN_ELF("elf_strptr"); ++ return -1; ++ } ++ ++ printf("%s+%lx:", name, rela.r_addend); ++ ++ } else { ++ printf("%lx:", orc_ip_addr + (i * sizeof(int)) + orc_ip[i]); ++ } ++ ++ ++ printf(" sp:"); ++ ++ print_reg(orc[i].sp_reg, orc[i].sp_offset); ++ ++ printf(" bp:"); ++ ++ print_reg(orc[i].bp_reg, orc[i].bp_offset); ++ ++ printf(" type:%s\n", orc_type_name(orc[i].type)); ++ } ++ ++ elf_end(elf); ++ close(fd); ++ ++ return 0; ++} +diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c +new file mode 100644 +index 000000000000..e5ca31429c9b +--- /dev/null ++++ 
b/tools/objtool/orc_gen.c +@@ -0,0 +1,214 @@ ++/* ++ * Copyright (C) 2017 Josh Poimboeuf ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version 2 ++ * of the License, or (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, see . ++ */ ++ ++#include ++#include ++ ++#include "orc.h" ++#include "check.h" ++#include "warn.h" ++ ++int create_orc(struct objtool_file *file) ++{ ++ struct instruction *insn; ++ ++ for_each_insn(file, insn) { ++ struct orc_entry *orc = &insn->orc; ++ struct cfi_reg *cfa = &insn->state.cfa; ++ struct cfi_reg *bp = &insn->state.regs[CFI_BP]; ++ ++ if (cfa->base == CFI_UNDEFINED) { ++ orc->sp_reg = ORC_REG_UNDEFINED; ++ continue; ++ } ++ ++ switch (cfa->base) { ++ case CFI_SP: ++ orc->sp_reg = ORC_REG_SP; ++ break; ++ case CFI_SP_INDIRECT: ++ orc->sp_reg = ORC_REG_SP_INDIRECT; ++ break; ++ case CFI_BP: ++ orc->sp_reg = ORC_REG_BP; ++ break; ++ case CFI_BP_INDIRECT: ++ orc->sp_reg = ORC_REG_BP_INDIRECT; ++ break; ++ case CFI_R10: ++ orc->sp_reg = ORC_REG_R10; ++ break; ++ case CFI_R13: ++ orc->sp_reg = ORC_REG_R13; ++ break; ++ case CFI_DI: ++ orc->sp_reg = ORC_REG_DI; ++ break; ++ case CFI_DX: ++ orc->sp_reg = ORC_REG_DX; ++ break; ++ default: ++ WARN_FUNC("unknown CFA base reg %d", ++ insn->sec, insn->offset, cfa->base); ++ return -1; ++ } ++ ++ switch(bp->base) { ++ case CFI_UNDEFINED: ++ orc->bp_reg = ORC_REG_UNDEFINED; ++ break; ++ case CFI_CFA: ++ orc->bp_reg = ORC_REG_PREV_SP; ++ break; ++ case CFI_BP: ++ orc->bp_reg = ORC_REG_BP; ++ break; ++ 
default: ++ WARN_FUNC("unknown BP base reg %d", ++ insn->sec, insn->offset, bp->base); ++ return -1; ++ } ++ ++ orc->sp_offset = cfa->offset; ++ orc->bp_offset = bp->offset; ++ orc->type = insn->state.type; ++ } ++ ++ return 0; ++} ++ ++static int create_orc_entry(struct section *u_sec, struct section *ip_relasec, ++ unsigned int idx, struct section *insn_sec, ++ unsigned long insn_off, struct orc_entry *o) ++{ ++ struct orc_entry *orc; ++ struct rela *rela; ++ ++ /* populate ORC data */ ++ orc = (struct orc_entry *)u_sec->data->d_buf + idx; ++ memcpy(orc, o, sizeof(*orc)); ++ ++ /* populate rela for ip */ ++ rela = malloc(sizeof(*rela)); ++ if (!rela) { ++ perror("malloc"); ++ return -1; ++ } ++ memset(rela, 0, sizeof(*rela)); ++ ++ rela->sym = insn_sec->sym; ++ rela->addend = insn_off; ++ rela->type = R_X86_64_PC32; ++ rela->offset = idx * sizeof(int); ++ ++ list_add_tail(&rela->list, &ip_relasec->rela_list); ++ hash_add(ip_relasec->rela_hash, &rela->hash, rela->offset); ++ ++ return 0; ++} ++ ++int create_orc_sections(struct objtool_file *file) ++{ ++ struct instruction *insn, *prev_insn; ++ struct section *sec, *u_sec, *ip_relasec; ++ unsigned int idx; ++ ++ struct orc_entry empty = { ++ .sp_reg = ORC_REG_UNDEFINED, ++ .bp_reg = ORC_REG_UNDEFINED, ++ .type = ORC_TYPE_CALL, ++ }; ++ ++ sec = find_section_by_name(file->elf, ".orc_unwind"); ++ if (sec) { ++ WARN("file already has .orc_unwind section, skipping"); ++ return -1; ++ } ++ ++ /* count the number of needed orcs */ ++ idx = 0; ++ for_each_sec(file, sec) { ++ if (!sec->text) ++ continue; ++ ++ prev_insn = NULL; ++ sec_for_each_insn(file, sec, insn) { ++ if (!prev_insn || ++ memcmp(&insn->orc, &prev_insn->orc, ++ sizeof(struct orc_entry))) { ++ idx++; ++ } ++ prev_insn = insn; ++ } ++ ++ /* section terminator */ ++ if (prev_insn) ++ idx++; ++ } ++ if (!idx) ++ return -1; ++ ++ ++ /* create .orc_unwind_ip and .rela.orc_unwind_ip sections */ ++ sec = elf_create_section(file->elf, ".orc_unwind_ip", 
sizeof(int), idx); ++ ++ ip_relasec = elf_create_rela_section(file->elf, sec); ++ if (!ip_relasec) ++ return -1; ++ ++ /* create .orc_unwind section */ ++ u_sec = elf_create_section(file->elf, ".orc_unwind", ++ sizeof(struct orc_entry), idx); ++ ++ /* populate sections */ ++ idx = 0; ++ for_each_sec(file, sec) { ++ if (!sec->text) ++ continue; ++ ++ prev_insn = NULL; ++ sec_for_each_insn(file, sec, insn) { ++ if (!prev_insn || memcmp(&insn->orc, &prev_insn->orc, ++ sizeof(struct orc_entry))) { ++ ++ if (create_orc_entry(u_sec, ip_relasec, idx, ++ insn->sec, insn->offset, ++ &insn->orc)) ++ return -1; ++ ++ idx++; ++ } ++ prev_insn = insn; ++ } ++ ++ /* section terminator */ ++ if (prev_insn) { ++ if (create_orc_entry(u_sec, ip_relasec, idx, ++ prev_insn->sec, ++ prev_insn->offset + prev_insn->len, ++ &empty)) ++ return -1; ++ ++ idx++; ++ } ++ } ++ ++ if (elf_rebuild_rela_section(ip_relasec)) ++ return -1; ++ ++ return 0; ++} +diff --git a/tools/objtool/Build b/tools/objtool/Build +index 6f2e1987c4d9..749becdf5b90 100644 +--- a/tools/objtool/Build ++++ b/tools/objtool/Build +@@ -1,6 +1,9 @@ + objtool-y += arch/$(SRCARCH)/ + objtool-y += builtin-check.o ++objtool-y += builtin-orc.o + objtool-y += check.o ++objtool-y += orc_gen.o ++objtool-y += orc_dump.o + objtool-y += elf.o + objtool-y += special.o + objtool-y += objtool.o +-- +2.14.2 + diff --git a/patches/kernel/0036-objtool-x86-Add-facility-for-asm-code-to-provide-unw.patch b/patches/kernel/0036-objtool-x86-Add-facility-for-asm-code-to-provide-unw.patch deleted file mode 100644 index 3c4000c..0000000 --- a/patches/kernel/0036-objtool-x86-Add-facility-for-asm-code-to-provide-unw.patch +++ /dev/null @@ -1,641 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Josh Poimboeuf -Date: Tue, 11 Jul 2017 10:33:43 -0500 -Subject: [PATCH] objtool, x86: Add facility for asm code to provide unwind - hints -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 
-Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Some asm (and inline asm) code does special things to the stack which -objtool can't understand. (Nor can GCC or GNU assembler, for that -matter.) In such cases we need a facility for the code to provide -annotations, so the unwinder can unwind through it. - -This provides such a facility, in the form of unwind hints. They're -similar to the GNU assembler .cfi* directives, but they give more -information, and are needed in far fewer places, because objtool can -fill in the blanks by following branches and adjusting the stack pointer -for pushes and pops. - -Signed-off-by: Josh Poimboeuf -Cc: Andy Lutomirski -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Denys Vlasenko -Cc: H. Peter Anvin -Cc: Jiri Slaby -Cc: Linus Torvalds -Cc: Mike Galbraith -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Cc: live-patching@vger.kernel.org -Link: http://lkml.kernel.org/r/0f5f3c9104fca559ff4088bece1d14ae3bca52d5.1499786555.git.jpoimboe@redhat.com -Signed-off-by: Ingo Molnar -(cherry picked from commit 39358a033b2e4432052265c1fa0f36f572d8cfb5) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit a1fed2e10e84d48643a09861c2d127968621813e) -Signed-off-by: Fabian Grünbichler ---- - tools/objtool/Makefile | 3 + - arch/x86/include/asm/orc_types.h | 107 ++++++++++++++++++++ - arch/x86/include/asm/unwind_hints.h | 103 +++++++++++++++++++ - tools/objtool/check.h | 4 +- - tools/objtool/orc_types.h | 22 +++++ - tools/objtool/check.c | 191 +++++++++++++++++++++++++++++++++--- - 6 files changed, 417 insertions(+), 13 deletions(-) - create mode 100644 arch/x86/include/asm/orc_types.h - create mode 100644 arch/x86/include/asm/unwind_hints.h - -diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile -index 0e2765e243c0..3a6425fefc43 100644 ---- a/tools/objtool/Makefile -+++ b/tools/objtool/Makefile -@@ -52,6 +52,9 @@ $(OBJTOOL): $(LIBSUBCMD) $(OBJTOOL_IN) - diff -I'^#include' arch/x86/insn/inat.h 
../../arch/x86/include/asm/inat.h >/dev/null && \ - diff -I'^#include' arch/x86/insn/inat_types.h ../../arch/x86/include/asm/inat_types.h >/dev/null) \ - || echo "warning: objtool: x86 instruction decoder differs from kernel" >&2 )) || true -+ @(test -d ../../kernel -a -d ../../tools -a -d ../objtool && (( \ -+ diff ../../arch/x86/include/asm/orc_types.h orc_types.h >/dev/null) \ -+ || echo "warning: objtool: orc_types.h differs from kernel" >&2 )) || true - $(QUIET_LINK)$(CC) $(OBJTOOL_IN) $(LDFLAGS) -o $@ - - -diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h -new file mode 100644 -index 000000000000..7dc777a6cb40 ---- /dev/null -+++ b/arch/x86/include/asm/orc_types.h -@@ -0,0 +1,107 @@ -+/* -+ * Copyright (C) 2017 Josh Poimboeuf -+ * -+ * This program is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU General Public License -+ * as published by the Free Software Foundation; either version 2 -+ * of the License, or (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, see . -+ */ -+ -+#ifndef _ORC_TYPES_H -+#define _ORC_TYPES_H -+ -+#include -+#include -+ -+/* -+ * The ORC_REG_* registers are base registers which are used to find other -+ * registers on the stack. -+ * -+ * ORC_REG_PREV_SP, also known as DWARF Call Frame Address (CFA), is the -+ * address of the previous frame: the caller's SP before it called the current -+ * function. -+ * -+ * ORC_REG_UNDEFINED means the corresponding register's value didn't change in -+ * the current frame. 
-+ * -+ * The most commonly used base registers are SP and BP -- which the previous SP -+ * is usually based on -- and PREV_SP and UNDEFINED -- which the previous BP is -+ * usually based on. -+ * -+ * The rest of the base registers are needed for special cases like entry code -+ * and GCC realigned stacks. -+ */ -+#define ORC_REG_UNDEFINED 0 -+#define ORC_REG_PREV_SP 1 -+#define ORC_REG_DX 2 -+#define ORC_REG_DI 3 -+#define ORC_REG_BP 4 -+#define ORC_REG_SP 5 -+#define ORC_REG_R10 6 -+#define ORC_REG_R13 7 -+#define ORC_REG_BP_INDIRECT 8 -+#define ORC_REG_SP_INDIRECT 9 -+#define ORC_REG_MAX 15 -+ -+/* -+ * ORC_TYPE_CALL: Indicates that sp_reg+sp_offset resolves to PREV_SP (the -+ * caller's SP right before it made the call). Used for all callable -+ * functions, i.e. all C code and all callable asm functions. -+ * -+ * ORC_TYPE_REGS: Used in entry code to indicate that sp_reg+sp_offset points -+ * to a fully populated pt_regs from a syscall, interrupt, or exception. -+ * -+ * ORC_TYPE_REGS_IRET: Used in entry code to indicate that sp_reg+sp_offset -+ * points to the iret return frame. -+ * -+ * The UNWIND_HINT macros are used only for the unwind_hint struct. They -+ * aren't used in struct orc_entry due to size and complexity constraints. -+ * Objtool converts them to real types when it converts the hints to orc -+ * entries. -+ */ -+#define ORC_TYPE_CALL 0 -+#define ORC_TYPE_REGS 1 -+#define ORC_TYPE_REGS_IRET 2 -+#define UNWIND_HINT_TYPE_SAVE 3 -+#define UNWIND_HINT_TYPE_RESTORE 4 -+ -+#ifndef __ASSEMBLY__ -+/* -+ * This struct is more or less a vastly simplified version of the DWARF Call -+ * Frame Information standard. It contains only the necessary parts of DWARF -+ * CFI, simplified for ease of access by the in-kernel unwinder. It tells the -+ * unwinder how to find the previous SP and BP (and sometimes entry regs) on -+ * the stack for a given code address. Each instance of the struct corresponds -+ * to one or more code locations. 
-+ */ -+struct orc_entry { -+ s16 sp_offset; -+ s16 bp_offset; -+ unsigned sp_reg:4; -+ unsigned bp_reg:4; -+ unsigned type:2; -+}; -+ -+/* -+ * This struct is used by asm and inline asm code to manually annotate the -+ * location of registers on the stack for the ORC unwinder. -+ * -+ * Type can be either ORC_TYPE_* or UNWIND_HINT_TYPE_*. -+ */ -+struct unwind_hint { -+ u32 ip; -+ s16 sp_offset; -+ u8 sp_reg; -+ u8 type; -+}; -+#endif /* __ASSEMBLY__ */ -+ -+#endif /* _ORC_TYPES_H */ -diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h -new file mode 100644 -index 000000000000..5e02b11c9b86 ---- /dev/null -+++ b/arch/x86/include/asm/unwind_hints.h -@@ -0,0 +1,103 @@ -+#ifndef _ASM_X86_UNWIND_HINTS_H -+#define _ASM_X86_UNWIND_HINTS_H -+ -+#include "orc_types.h" -+ -+#ifdef __ASSEMBLY__ -+ -+/* -+ * In asm, there are two kinds of code: normal C-type callable functions and -+ * the rest. The normal callable functions can be called by other code, and -+ * don't do anything unusual with the stack. Such normal callable functions -+ * are annotated with the ENTRY/ENDPROC macros. Most asm code falls in this -+ * category. In this case, no special debugging annotations are needed because -+ * objtool can automatically generate the ORC data for the ORC unwinder to read -+ * at runtime. -+ * -+ * Anything which doesn't fall into the above category, such as syscall and -+ * interrupt handlers, tends to not be called directly by other functions, and -+ * often does unusual non-C-function-type things with the stack pointer. Such -+ * code needs to be annotated such that objtool can understand it. The -+ * following CFI hint macros are for this type of code. -+ * -+ * These macros provide hints to objtool about the state of the stack at each -+ * instruction. Objtool starts from the hints and follows the code flow, -+ * making automatic CFI adjustments when it sees pushes and pops, filling out -+ * the debuginfo as necessary. 
It will also warn if it sees any -+ * inconsistencies. -+ */ -+.macro UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=0 type=ORC_TYPE_CALL -+#ifdef CONFIG_STACK_VALIDATION -+.Lunwind_hint_ip_\@: -+ .pushsection .discard.unwind_hints -+ /* struct unwind_hint */ -+ .long .Lunwind_hint_ip_\@ - . -+ .short \sp_offset -+ .byte \sp_reg -+ .byte \type -+ .popsection -+#endif -+.endm -+ -+.macro UNWIND_HINT_EMPTY -+ UNWIND_HINT sp_reg=ORC_REG_UNDEFINED -+.endm -+ -+.macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 iret=0 -+ .if \base == %rsp && \indirect -+ .set sp_reg, ORC_REG_SP_INDIRECT -+ .elseif \base == %rsp -+ .set sp_reg, ORC_REG_SP -+ .elseif \base == %rbp -+ .set sp_reg, ORC_REG_BP -+ .elseif \base == %rdi -+ .set sp_reg, ORC_REG_DI -+ .elseif \base == %rdx -+ .set sp_reg, ORC_REG_DX -+ .elseif \base == %r10 -+ .set sp_reg, ORC_REG_R10 -+ .else -+ .error "UNWIND_HINT_REGS: bad base register" -+ .endif -+ -+ .set sp_offset, \offset -+ -+ .if \iret -+ .set type, ORC_TYPE_REGS_IRET -+ .elseif \extra == 0 -+ .set type, ORC_TYPE_REGS_IRET -+ .set sp_offset, \offset + (16*8) -+ .else -+ .set type, ORC_TYPE_REGS -+ .endif -+ -+ UNWIND_HINT sp_reg=sp_reg sp_offset=sp_offset type=type -+.endm -+ -+.macro UNWIND_HINT_IRET_REGS base=%rsp offset=0 -+ UNWIND_HINT_REGS base=\base offset=\offset iret=1 -+.endm -+ -+.macro UNWIND_HINT_FUNC sp_offset=8 -+ UNWIND_HINT sp_offset=\sp_offset -+.endm -+ -+#else /* !__ASSEMBLY__ */ -+ -+#define UNWIND_HINT(sp_reg, sp_offset, type) \ -+ "987: \n\t" \ -+ ".pushsection .discard.unwind_hints\n\t" \ -+ /* struct unwind_hint */ \ -+ ".long 987b - .\n\t" \ -+ ".short " __stringify(sp_offset) "\n\t" \ -+ ".byte " __stringify(sp_reg) "\n\t" \ -+ ".byte " __stringify(type) "\n\t" \ -+ ".popsection\n\t" -+ -+#define UNWIND_HINT_SAVE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_SAVE) -+ -+#define UNWIND_HINT_RESTORE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_RESTORE) -+ -+#endif /* __ASSEMBLY__ */ -+ -+#endif /* _ASM_X86_UNWIND_HINTS_H */ -diff --git 
a/tools/objtool/check.h b/tools/objtool/check.h -index 046874bbe226..ac3d4b13f17b 100644 ---- a/tools/objtool/check.h -+++ b/tools/objtool/check.h -@@ -43,7 +43,7 @@ struct instruction { - unsigned int len; - unsigned char type; - unsigned long immediate; -- bool alt_group, visited, dead_end, ignore; -+ bool alt_group, visited, dead_end, ignore, hint, save, restore; - struct symbol *call_dest; - struct instruction *jump_dest; - struct list_head alts; -@@ -58,7 +58,7 @@ struct objtool_file { - struct list_head insn_list; - DECLARE_HASHTABLE(insn_hash, 16); - struct section *rodata, *whitelist; -- bool ignore_unreachables, c_file; -+ bool ignore_unreachables, c_file, hints; - }; - - int check(const char *objname, bool nofp, bool orc); -diff --git a/tools/objtool/orc_types.h b/tools/objtool/orc_types.h -index fc5cf6cffd9a..9c9dc579bd7d 100644 ---- a/tools/objtool/orc_types.h -+++ b/tools/objtool/orc_types.h -@@ -61,11 +61,19 @@ - * - * ORC_TYPE_REGS_IRET: Used in entry code to indicate that sp_reg+sp_offset - * points to the iret return frame. -+ * -+ * The UNWIND_HINT macros are used only for the unwind_hint struct. They -+ * aren't used in struct orc_entry due to size and complexity constraints. -+ * Objtool converts them to real types when it converts the hints to orc -+ * entries. - */ - #define ORC_TYPE_CALL 0 - #define ORC_TYPE_REGS 1 - #define ORC_TYPE_REGS_IRET 2 -+#define UNWIND_HINT_TYPE_SAVE 3 -+#define UNWIND_HINT_TYPE_RESTORE 4 - -+#ifndef __ASSEMBLY__ - /* - * This struct is more or less a vastly simplified version of the DWARF Call - * Frame Information standard. It contains only the necessary parts of DWARF -@@ -82,4 +90,18 @@ struct orc_entry { - unsigned type:2; - } __packed; - -+/* -+ * This struct is used by asm and inline asm code to manually annotate the -+ * location of registers on the stack for the ORC unwinder. -+ * -+ * Type can be either ORC_TYPE_* or UNWIND_HINT_TYPE_*. 
-+ */ -+struct unwind_hint { -+ u32 ip; -+ s16 sp_offset; -+ u8 sp_reg; -+ u8 type; -+}; -+#endif /* __ASSEMBLY__ */ -+ - #endif /* _ORC_TYPES_H */ -diff --git a/tools/objtool/check.c b/tools/objtool/check.c -index cb57c526ba17..368275de5f23 100644 ---- a/tools/objtool/check.c -+++ b/tools/objtool/check.c -@@ -100,7 +100,6 @@ static bool gcov_enabled(struct objtool_file *file) - static bool ignore_func(struct objtool_file *file, struct symbol *func) - { - struct rela *rela; -- struct instruction *insn; - - /* check for STACK_FRAME_NON_STANDARD */ - if (file->whitelist && file->whitelist->rela) -@@ -113,11 +112,6 @@ static bool ignore_func(struct objtool_file *file, struct symbol *func) - return true; - } - -- /* check if it has a context switching instruction */ -- func_for_each_insn(file, func, insn) -- if (insn->type == INSN_CONTEXT_SWITCH) -- return true; -- - return false; - } - -@@ -879,6 +873,99 @@ static int add_switch_table_alts(struct objtool_file *file) - return 0; - } - -+static int read_unwind_hints(struct objtool_file *file) -+{ -+ struct section *sec, *relasec; -+ struct rela *rela; -+ struct unwind_hint *hint; -+ struct instruction *insn; -+ struct cfi_reg *cfa; -+ int i; -+ -+ sec = find_section_by_name(file->elf, ".discard.unwind_hints"); -+ if (!sec) -+ return 0; -+ -+ relasec = sec->rela; -+ if (!relasec) { -+ WARN("missing .rela.discard.unwind_hints section"); -+ return -1; -+ } -+ -+ if (sec->len % sizeof(struct unwind_hint)) { -+ WARN("struct unwind_hint size mismatch"); -+ return -1; -+ } -+ -+ file->hints = true; -+ -+ for (i = 0; i < sec->len / sizeof(struct unwind_hint); i++) { -+ hint = (struct unwind_hint *)sec->data->d_buf + i; -+ -+ rela = find_rela_by_dest(sec, i * sizeof(*hint)); -+ if (!rela) { -+ WARN("can't find rela for unwind_hints[%d]", i); -+ return -1; -+ } -+ -+ insn = find_insn(file, rela->sym->sec, rela->addend); -+ if (!insn) { -+ WARN("can't find insn for unwind_hints[%d]", i); -+ return -1; -+ } -+ -+ cfa = 
&insn->state.cfa; -+ -+ if (hint->type == UNWIND_HINT_TYPE_SAVE) { -+ insn->save = true; -+ continue; -+ -+ } else if (hint->type == UNWIND_HINT_TYPE_RESTORE) { -+ insn->restore = true; -+ insn->hint = true; -+ continue; -+ } -+ -+ insn->hint = true; -+ -+ switch (hint->sp_reg) { -+ case ORC_REG_UNDEFINED: -+ cfa->base = CFI_UNDEFINED; -+ break; -+ case ORC_REG_SP: -+ cfa->base = CFI_SP; -+ break; -+ case ORC_REG_BP: -+ cfa->base = CFI_BP; -+ break; -+ case ORC_REG_SP_INDIRECT: -+ cfa->base = CFI_SP_INDIRECT; -+ break; -+ case ORC_REG_R10: -+ cfa->base = CFI_R10; -+ break; -+ case ORC_REG_R13: -+ cfa->base = CFI_R13; -+ break; -+ case ORC_REG_DI: -+ cfa->base = CFI_DI; -+ break; -+ case ORC_REG_DX: -+ cfa->base = CFI_DX; -+ break; -+ default: -+ WARN_FUNC("unsupported unwind_hint sp base reg %d", -+ insn->sec, insn->offset, hint->sp_reg); -+ return -1; -+ } -+ -+ cfa->offset = hint->sp_offset; -+ insn->state.type = hint->type; -+ } -+ -+ return 0; -+} -+ - static int decode_sections(struct objtool_file *file) - { - int ret; -@@ -909,6 +996,10 @@ static int decode_sections(struct objtool_file *file) - if (ret) - return ret; - -+ ret = read_unwind_hints(file); -+ if (ret) -+ return ret; -+ - return 0; - } - -@@ -1382,7 +1473,7 @@ static int validate_branch(struct objtool_file *file, struct instruction *first, - struct insn_state state) - { - struct alternative *alt; -- struct instruction *insn; -+ struct instruction *insn, *next_insn; - struct section *sec; - struct symbol *func = NULL; - int ret; -@@ -1397,6 +1488,8 @@ static int validate_branch(struct objtool_file *file, struct instruction *first, - } - - while (1) { -+ next_insn = next_insn_same_sec(file, insn); -+ - if (file->c_file && insn->func) { - if (func && func != insn->func) { - WARN("%s() falls through to next function %s()", -@@ -1414,13 +1507,54 @@ static int validate_branch(struct objtool_file *file, struct instruction *first, - } - - if (insn->visited) { -- if (!!insn_state_match(insn, &state)) -+ if 
(!insn->hint && !insn_state_match(insn, &state)) - return 1; - - return 0; - } - -- insn->state = state; -+ if (insn->hint) { -+ if (insn->restore) { -+ struct instruction *save_insn, *i; -+ -+ i = insn; -+ save_insn = NULL; -+ func_for_each_insn_continue_reverse(file, func, i) { -+ if (i->save) { -+ save_insn = i; -+ break; -+ } -+ } -+ -+ if (!save_insn) { -+ WARN_FUNC("no corresponding CFI save for CFI restore", -+ sec, insn->offset); -+ return 1; -+ } -+ -+ if (!save_insn->visited) { -+ /* -+ * Oops, no state to copy yet. -+ * Hopefully we can reach this -+ * instruction from another branch -+ * after the save insn has been -+ * visited. -+ */ -+ if (insn == first) -+ return 0; -+ -+ WARN_FUNC("objtool isn't smart enough to handle this CFI save/restore combo", -+ sec, insn->offset); -+ return 1; -+ } -+ -+ insn->state = save_insn->state; -+ } -+ -+ state = insn->state; -+ -+ } else -+ insn->state = state; - - insn->visited = true; - -@@ -1497,6 +1631,14 @@ static int validate_branch(struct objtool_file *file, struct instruction *first, - - return 0; - -+ case INSN_CONTEXT_SWITCH: -+ if (func && (!next_insn || !next_insn->hint)) { -+ WARN_FUNC("unsupported instruction in callable function", -+ sec, insn->offset); -+ return 1; -+ } -+ return 0; -+ - case INSN_STACK: - if (update_insn_state(insn, &state)) - return -1; -@@ -1510,7 +1652,7 @@ static int validate_branch(struct objtool_file *file, struct instruction *first, - if (insn->dead_end) - return 0; - -- insn = next_insn_same_sec(file, insn); -+ insn = next_insn; - if (!insn) { - WARN("%s: unexpected end of section", sec->name); - return 1; -@@ -1520,6 +1662,27 @@ static int validate_branch(struct objtool_file *file, struct instruction *first, - return 0; - } - -+static int validate_unwind_hints(struct objtool_file *file) -+{ -+ struct instruction *insn; -+ int ret, warnings = 0; -+ struct insn_state state; -+ -+ if (!file->hints) -+ return 0; -+ -+ clear_insn_state(&state); -+ -+ for_each_insn(file, insn) { 
-+ if (insn->hint && !insn->visited) { -+ ret = validate_branch(file, insn, state); -+ warnings += ret; -+ } -+ } -+ -+ return warnings; -+} -+ - static bool is_kasan_insn(struct instruction *insn) - { - return (insn->type == INSN_CALL && -@@ -1665,8 +1828,9 @@ int check(const char *_objname, bool _nofp, bool orc) - hash_init(file.insn_hash); - file.whitelist = find_section_by_name(file.elf, ".discard.func_stack_frame_non_standard"); - file.rodata = find_section_by_name(file.elf, ".rodata"); -- file.ignore_unreachables = false; - file.c_file = find_section_by_name(file.elf, ".comment"); -+ file.ignore_unreachables = false; -+ file.hints = false; - - arch_initial_func_cfi_state(&initial_func_cfi); - -@@ -1683,6 +1847,11 @@ int check(const char *_objname, bool _nofp, bool orc) - goto out; - warnings += ret; - -+ ret = validate_unwind_hints(&file); -+ if (ret < 0) -+ goto out; -+ warnings += ret; -+ - if (!warnings) { - ret = validate_reachable_instructions(&file); - if (ret < 0) --- -2.14.2 - diff --git a/patches/kernel/0037-objtool-x86-Add-facility-for-asm-code-to-provide-unw.patch b/patches/kernel/0037-objtool-x86-Add-facility-for-asm-code-to-provide-unw.patch new file mode 100644 index 0000000..3c4000c --- /dev/null +++ b/patches/kernel/0037-objtool-x86-Add-facility-for-asm-code-to-provide-unw.patch @@ -0,0 +1,641 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf +Date: Tue, 11 Jul 2017 10:33:43 -0500 +Subject: [PATCH] objtool, x86: Add facility for asm code to provide unwind + hints +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Some asm (and inline asm) code does special things to the stack which +objtool can't understand. (Nor can GCC or GNU assembler, for that +matter.) In such cases we need a facility for the code to provide +annotations, so the unwinder can unwind through it. + +This provides such a facility, in the form of unwind hints. 
They're +similar to the GNU assembler .cfi* directives, but they give more +information, and are needed in far fewer places, because objtool can +fill in the blanks by following branches and adjusting the stack pointer +for pushes and pops. + +Signed-off-by: Josh Poimboeuf +Cc: Andy Lutomirski +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Denys Vlasenko +Cc: H. Peter Anvin +Cc: Jiri Slaby +Cc: Linus Torvalds +Cc: Mike Galbraith +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: live-patching@vger.kernel.org +Link: http://lkml.kernel.org/r/0f5f3c9104fca559ff4088bece1d14ae3bca52d5.1499786555.git.jpoimboe@redhat.com +Signed-off-by: Ingo Molnar +(cherry picked from commit 39358a033b2e4432052265c1fa0f36f572d8cfb5) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit a1fed2e10e84d48643a09861c2d127968621813e) +Signed-off-by: Fabian Grünbichler +--- + tools/objtool/Makefile | 3 + + arch/x86/include/asm/orc_types.h | 107 ++++++++++++++++++++ + arch/x86/include/asm/unwind_hints.h | 103 +++++++++++++++++++ + tools/objtool/check.h | 4 +- + tools/objtool/orc_types.h | 22 +++++ + tools/objtool/check.c | 191 +++++++++++++++++++++++++++++++++--- + 6 files changed, 417 insertions(+), 13 deletions(-) + create mode 100644 arch/x86/include/asm/orc_types.h + create mode 100644 arch/x86/include/asm/unwind_hints.h + +diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile +index 0e2765e243c0..3a6425fefc43 100644 +--- a/tools/objtool/Makefile ++++ b/tools/objtool/Makefile +@@ -52,6 +52,9 @@ $(OBJTOOL): $(LIBSUBCMD) $(OBJTOOL_IN) + diff -I'^#include' arch/x86/insn/inat.h ../../arch/x86/include/asm/inat.h >/dev/null && \ + diff -I'^#include' arch/x86/insn/inat_types.h ../../arch/x86/include/asm/inat_types.h >/dev/null) \ + || echo "warning: objtool: x86 instruction decoder differs from kernel" >&2 )) || true ++ @(test -d ../../kernel -a -d ../../tools -a -d ../objtool && (( \ ++ diff ../../arch/x86/include/asm/orc_types.h orc_types.h >/dev/null) 
\ ++ || echo "warning: objtool: orc_types.h differs from kernel" >&2 )) || true + $(QUIET_LINK)$(CC) $(OBJTOOL_IN) $(LDFLAGS) -o $@ + + +diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h +new file mode 100644 +index 000000000000..7dc777a6cb40 +--- /dev/null ++++ b/arch/x86/include/asm/orc_types.h +@@ -0,0 +1,107 @@ ++/* ++ * Copyright (C) 2017 Josh Poimboeuf ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version 2 ++ * of the License, or (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, see . ++ */ ++ ++#ifndef _ORC_TYPES_H ++#define _ORC_TYPES_H ++ ++#include ++#include ++ ++/* ++ * The ORC_REG_* registers are base registers which are used to find other ++ * registers on the stack. ++ * ++ * ORC_REG_PREV_SP, also known as DWARF Call Frame Address (CFA), is the ++ * address of the previous frame: the caller's SP before it called the current ++ * function. ++ * ++ * ORC_REG_UNDEFINED means the corresponding register's value didn't change in ++ * the current frame. ++ * ++ * The most commonly used base registers are SP and BP -- which the previous SP ++ * is usually based on -- and PREV_SP and UNDEFINED -- which the previous BP is ++ * usually based on. ++ * ++ * The rest of the base registers are needed for special cases like entry code ++ * and GCC realigned stacks. 
++ */ ++#define ORC_REG_UNDEFINED 0 ++#define ORC_REG_PREV_SP 1 ++#define ORC_REG_DX 2 ++#define ORC_REG_DI 3 ++#define ORC_REG_BP 4 ++#define ORC_REG_SP 5 ++#define ORC_REG_R10 6 ++#define ORC_REG_R13 7 ++#define ORC_REG_BP_INDIRECT 8 ++#define ORC_REG_SP_INDIRECT 9 ++#define ORC_REG_MAX 15 ++ ++/* ++ * ORC_TYPE_CALL: Indicates that sp_reg+sp_offset resolves to PREV_SP (the ++ * caller's SP right before it made the call). Used for all callable ++ * functions, i.e. all C code and all callable asm functions. ++ * ++ * ORC_TYPE_REGS: Used in entry code to indicate that sp_reg+sp_offset points ++ * to a fully populated pt_regs from a syscall, interrupt, or exception. ++ * ++ * ORC_TYPE_REGS_IRET: Used in entry code to indicate that sp_reg+sp_offset ++ * points to the iret return frame. ++ * ++ * The UNWIND_HINT macros are used only for the unwind_hint struct. They ++ * aren't used in struct orc_entry due to size and complexity constraints. ++ * Objtool converts them to real types when it converts the hints to orc ++ * entries. ++ */ ++#define ORC_TYPE_CALL 0 ++#define ORC_TYPE_REGS 1 ++#define ORC_TYPE_REGS_IRET 2 ++#define UNWIND_HINT_TYPE_SAVE 3 ++#define UNWIND_HINT_TYPE_RESTORE 4 ++ ++#ifndef __ASSEMBLY__ ++/* ++ * This struct is more or less a vastly simplified version of the DWARF Call ++ * Frame Information standard. It contains only the necessary parts of DWARF ++ * CFI, simplified for ease of access by the in-kernel unwinder. It tells the ++ * unwinder how to find the previous SP and BP (and sometimes entry regs) on ++ * the stack for a given code address. Each instance of the struct corresponds ++ * to one or more code locations. ++ */ ++struct orc_entry { ++ s16 sp_offset; ++ s16 bp_offset; ++ unsigned sp_reg:4; ++ unsigned bp_reg:4; ++ unsigned type:2; ++}; ++ ++/* ++ * This struct is used by asm and inline asm code to manually annotate the ++ * location of registers on the stack for the ORC unwinder. 
++ * ++ * Type can be either ORC_TYPE_* or UNWIND_HINT_TYPE_*. ++ */ ++struct unwind_hint { ++ u32 ip; ++ s16 sp_offset; ++ u8 sp_reg; ++ u8 type; ++}; ++#endif /* __ASSEMBLY__ */ ++ ++#endif /* _ORC_TYPES_H */ +diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h +new file mode 100644 +index 000000000000..5e02b11c9b86 +--- /dev/null ++++ b/arch/x86/include/asm/unwind_hints.h +@@ -0,0 +1,103 @@ ++#ifndef _ASM_X86_UNWIND_HINTS_H ++#define _ASM_X86_UNWIND_HINTS_H ++ ++#include "orc_types.h" ++ ++#ifdef __ASSEMBLY__ ++ ++/* ++ * In asm, there are two kinds of code: normal C-type callable functions and ++ * the rest. The normal callable functions can be called by other code, and ++ * don't do anything unusual with the stack. Such normal callable functions ++ * are annotated with the ENTRY/ENDPROC macros. Most asm code falls in this ++ * category. In this case, no special debugging annotations are needed because ++ * objtool can automatically generate the ORC data for the ORC unwinder to read ++ * at runtime. ++ * ++ * Anything which doesn't fall into the above category, such as syscall and ++ * interrupt handlers, tends to not be called directly by other functions, and ++ * often does unusual non-C-function-type things with the stack pointer. Such ++ * code needs to be annotated such that objtool can understand it. The ++ * following CFI hint macros are for this type of code. ++ * ++ * These macros provide hints to objtool about the state of the stack at each ++ * instruction. Objtool starts from the hints and follows the code flow, ++ * making automatic CFI adjustments when it sees pushes and pops, filling out ++ * the debuginfo as necessary. It will also warn if it sees any ++ * inconsistencies. ++ */ ++.macro UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=0 type=ORC_TYPE_CALL ++#ifdef CONFIG_STACK_VALIDATION ++.Lunwind_hint_ip_\@: ++ .pushsection .discard.unwind_hints ++ /* struct unwind_hint */ ++ .long .Lunwind_hint_ip_\@ - . 
++ .short \sp_offset ++ .byte \sp_reg ++ .byte \type ++ .popsection ++#endif ++.endm ++ ++.macro UNWIND_HINT_EMPTY ++ UNWIND_HINT sp_reg=ORC_REG_UNDEFINED ++.endm ++ ++.macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 iret=0 ++ .if \base == %rsp && \indirect ++ .set sp_reg, ORC_REG_SP_INDIRECT ++ .elseif \base == %rsp ++ .set sp_reg, ORC_REG_SP ++ .elseif \base == %rbp ++ .set sp_reg, ORC_REG_BP ++ .elseif \base == %rdi ++ .set sp_reg, ORC_REG_DI ++ .elseif \base == %rdx ++ .set sp_reg, ORC_REG_DX ++ .elseif \base == %r10 ++ .set sp_reg, ORC_REG_R10 ++ .else ++ .error "UNWIND_HINT_REGS: bad base register" ++ .endif ++ ++ .set sp_offset, \offset ++ ++ .if \iret ++ .set type, ORC_TYPE_REGS_IRET ++ .elseif \extra == 0 ++ .set type, ORC_TYPE_REGS_IRET ++ .set sp_offset, \offset + (16*8) ++ .else ++ .set type, ORC_TYPE_REGS ++ .endif ++ ++ UNWIND_HINT sp_reg=sp_reg sp_offset=sp_offset type=type ++.endm ++ ++.macro UNWIND_HINT_IRET_REGS base=%rsp offset=0 ++ UNWIND_HINT_REGS base=\base offset=\offset iret=1 ++.endm ++ ++.macro UNWIND_HINT_FUNC sp_offset=8 ++ UNWIND_HINT sp_offset=\sp_offset ++.endm ++ ++#else /* !__ASSEMBLY__ */ ++ ++#define UNWIND_HINT(sp_reg, sp_offset, type) \ ++ "987: \n\t" \ ++ ".pushsection .discard.unwind_hints\n\t" \ ++ /* struct unwind_hint */ \ ++ ".long 987b - .\n\t" \ ++ ".short " __stringify(sp_offset) "\n\t" \ ++ ".byte " __stringify(sp_reg) "\n\t" \ ++ ".byte " __stringify(type) "\n\t" \ ++ ".popsection\n\t" ++ ++#define UNWIND_HINT_SAVE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_SAVE) ++ ++#define UNWIND_HINT_RESTORE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_RESTORE) ++ ++#endif /* __ASSEMBLY__ */ ++ ++#endif /* _ASM_X86_UNWIND_HINTS_H */ +diff --git a/tools/objtool/check.h b/tools/objtool/check.h +index 046874bbe226..ac3d4b13f17b 100644 +--- a/tools/objtool/check.h ++++ b/tools/objtool/check.h +@@ -43,7 +43,7 @@ struct instruction { + unsigned int len; + unsigned char type; + unsigned long immediate; +- bool alt_group, visited, dead_end, 
ignore; ++ bool alt_group, visited, dead_end, ignore, hint, save, restore; + struct symbol *call_dest; + struct instruction *jump_dest; + struct list_head alts; +@@ -58,7 +58,7 @@ struct objtool_file { + struct list_head insn_list; + DECLARE_HASHTABLE(insn_hash, 16); + struct section *rodata, *whitelist; +- bool ignore_unreachables, c_file; ++ bool ignore_unreachables, c_file, hints; + }; + + int check(const char *objname, bool nofp, bool orc); +diff --git a/tools/objtool/orc_types.h b/tools/objtool/orc_types.h +index fc5cf6cffd9a..9c9dc579bd7d 100644 +--- a/tools/objtool/orc_types.h ++++ b/tools/objtool/orc_types.h +@@ -61,11 +61,19 @@ + * + * ORC_TYPE_REGS_IRET: Used in entry code to indicate that sp_reg+sp_offset + * points to the iret return frame. ++ * ++ * The UNWIND_HINT macros are used only for the unwind_hint struct. They ++ * aren't used in struct orc_entry due to size and complexity constraints. ++ * Objtool converts them to real types when it converts the hints to orc ++ * entries. + */ + #define ORC_TYPE_CALL 0 + #define ORC_TYPE_REGS 1 + #define ORC_TYPE_REGS_IRET 2 ++#define UNWIND_HINT_TYPE_SAVE 3 ++#define UNWIND_HINT_TYPE_RESTORE 4 + ++#ifndef __ASSEMBLY__ + /* + * This struct is more or less a vastly simplified version of the DWARF Call + * Frame Information standard. It contains only the necessary parts of DWARF +@@ -82,4 +90,18 @@ struct orc_entry { + unsigned type:2; + } __packed; + ++/* ++ * This struct is used by asm and inline asm code to manually annotate the ++ * location of registers on the stack for the ORC unwinder. ++ * ++ * Type can be either ORC_TYPE_* or UNWIND_HINT_TYPE_*. 
++ */ ++struct unwind_hint { ++ u32 ip; ++ s16 sp_offset; ++ u8 sp_reg; ++ u8 type; ++}; ++#endif /* __ASSEMBLY__ */ ++ + #endif /* _ORC_TYPES_H */ +diff --git a/tools/objtool/check.c b/tools/objtool/check.c +index cb57c526ba17..368275de5f23 100644 +--- a/tools/objtool/check.c ++++ b/tools/objtool/check.c +@@ -100,7 +100,6 @@ static bool gcov_enabled(struct objtool_file *file) + static bool ignore_func(struct objtool_file *file, struct symbol *func) + { + struct rela *rela; +- struct instruction *insn; + + /* check for STACK_FRAME_NON_STANDARD */ + if (file->whitelist && file->whitelist->rela) +@@ -113,11 +112,6 @@ static bool ignore_func(struct objtool_file *file, struct symbol *func) + return true; + } + +- /* check if it has a context switching instruction */ +- func_for_each_insn(file, func, insn) +- if (insn->type == INSN_CONTEXT_SWITCH) +- return true; +- + return false; + } + +@@ -879,6 +873,99 @@ static int add_switch_table_alts(struct objtool_file *file) + return 0; + } + ++static int read_unwind_hints(struct objtool_file *file) ++{ ++ struct section *sec, *relasec; ++ struct rela *rela; ++ struct unwind_hint *hint; ++ struct instruction *insn; ++ struct cfi_reg *cfa; ++ int i; ++ ++ sec = find_section_by_name(file->elf, ".discard.unwind_hints"); ++ if (!sec) ++ return 0; ++ ++ relasec = sec->rela; ++ if (!relasec) { ++ WARN("missing .rela.discard.unwind_hints section"); ++ return -1; ++ } ++ ++ if (sec->len % sizeof(struct unwind_hint)) { ++ WARN("struct unwind_hint size mismatch"); ++ return -1; ++ } ++ ++ file->hints = true; ++ ++ for (i = 0; i < sec->len / sizeof(struct unwind_hint); i++) { ++ hint = (struct unwind_hint *)sec->data->d_buf + i; ++ ++ rela = find_rela_by_dest(sec, i * sizeof(*hint)); ++ if (!rela) { ++ WARN("can't find rela for unwind_hints[%d]", i); ++ return -1; ++ } ++ ++ insn = find_insn(file, rela->sym->sec, rela->addend); ++ if (!insn) { ++ WARN("can't find insn for unwind_hints[%d]", i); ++ return -1; ++ } ++ ++ cfa = 
&insn->state.cfa; ++ ++ if (hint->type == UNWIND_HINT_TYPE_SAVE) { ++ insn->save = true; ++ continue; ++ ++ } else if (hint->type == UNWIND_HINT_TYPE_RESTORE) { ++ insn->restore = true; ++ insn->hint = true; ++ continue; ++ } ++ ++ insn->hint = true; ++ ++ switch (hint->sp_reg) { ++ case ORC_REG_UNDEFINED: ++ cfa->base = CFI_UNDEFINED; ++ break; ++ case ORC_REG_SP: ++ cfa->base = CFI_SP; ++ break; ++ case ORC_REG_BP: ++ cfa->base = CFI_BP; ++ break; ++ case ORC_REG_SP_INDIRECT: ++ cfa->base = CFI_SP_INDIRECT; ++ break; ++ case ORC_REG_R10: ++ cfa->base = CFI_R10; ++ break; ++ case ORC_REG_R13: ++ cfa->base = CFI_R13; ++ break; ++ case ORC_REG_DI: ++ cfa->base = CFI_DI; ++ break; ++ case ORC_REG_DX: ++ cfa->base = CFI_DX; ++ break; ++ default: ++ WARN_FUNC("unsupported unwind_hint sp base reg %d", ++ insn->sec, insn->offset, hint->sp_reg); ++ return -1; ++ } ++ ++ cfa->offset = hint->sp_offset; ++ insn->state.type = hint->type; ++ } ++ ++ return 0; ++} ++ + static int decode_sections(struct objtool_file *file) + { + int ret; +@@ -909,6 +996,10 @@ static int decode_sections(struct objtool_file *file) + if (ret) + return ret; + ++ ret = read_unwind_hints(file); ++ if (ret) ++ return ret; ++ + return 0; + } + +@@ -1382,7 +1473,7 @@ static int validate_branch(struct objtool_file *file, struct instruction *first, + struct insn_state state) + { + struct alternative *alt; +- struct instruction *insn; ++ struct instruction *insn, *next_insn; + struct section *sec; + struct symbol *func = NULL; + int ret; +@@ -1397,6 +1488,8 @@ static int validate_branch(struct objtool_file *file, struct instruction *first, + } + + while (1) { ++ next_insn = next_insn_same_sec(file, insn); ++ + if (file->c_file && insn->func) { + if (func && func != insn->func) { + WARN("%s() falls through to next function %s()", +@@ -1414,13 +1507,54 @@ static int validate_branch(struct objtool_file *file, struct instruction *first, + } + + if (insn->visited) { +- if (!!insn_state_match(insn, &state)) ++ if 
(!insn->hint && !insn_state_match(insn, &state)) + return 1; + + return 0; + } + +- insn->state = state; ++ if (insn->hint) { ++ if (insn->restore) { ++ struct instruction *save_insn, *i; ++ ++ i = insn; ++ save_insn = NULL; ++ func_for_each_insn_continue_reverse(file, func, i) { ++ if (i->save) { ++ save_insn = i; ++ break; ++ } ++ } ++ ++ if (!save_insn) { ++ WARN_FUNC("no corresponding CFI save for CFI restore", ++ sec, insn->offset); ++ return 1; ++ } ++ ++ if (!save_insn->visited) { ++ /* ++ * Oops, no state to copy yet. ++ * Hopefully we can reach this ++ * instruction from another branch ++ * after the save insn has been ++ * visited. ++ */ ++ if (insn == first) ++ return 0; ++ ++ WARN_FUNC("objtool isn't smart enough to handle this CFI save/restore combo", ++ sec, insn->offset); ++ return 1; ++ } ++ ++ insn->state = save_insn->state; ++ } ++ ++ state = insn->state; ++ ++ } else ++ insn->state = state; + + insn->visited = true; + +@@ -1497,6 +1631,14 @@ static int validate_branch(struct objtool_file *file, struct instruction *first, + + return 0; + ++ case INSN_CONTEXT_SWITCH: ++ if (func && (!next_insn || !next_insn->hint)) { ++ WARN_FUNC("unsupported instruction in callable function", ++ sec, insn->offset); ++ return 1; ++ } ++ return 0; ++ + case INSN_STACK: + if (update_insn_state(insn, &state)) + return -1; +@@ -1510,7 +1652,7 @@ static int validate_branch(struct objtool_file *file, struct instruction *first, + if (insn->dead_end) + return 0; + +- insn = next_insn_same_sec(file, insn); ++ insn = next_insn; + if (!insn) { + WARN("%s: unexpected end of section", sec->name); + return 1; +@@ -1520,6 +1662,27 @@ static int validate_branch(struct objtool_file *file, struct instruction *first, + return 0; + } + ++static int validate_unwind_hints(struct objtool_file *file) ++{ ++ struct instruction *insn; ++ int ret, warnings = 0; ++ struct insn_state state; ++ ++ if (!file->hints) ++ return 0; ++ ++ clear_insn_state(&state); ++ ++ for_each_insn(file, insn) { 
++ if (insn->hint && !insn->visited) { ++ ret = validate_branch(file, insn, state); ++ warnings += ret; ++ } ++ } ++ ++ return warnings; ++} ++ + static bool is_kasan_insn(struct instruction *insn) + { + return (insn->type == INSN_CALL && +@@ -1665,8 +1828,9 @@ int check(const char *_objname, bool _nofp, bool orc) + hash_init(file.insn_hash); + file.whitelist = find_section_by_name(file.elf, ".discard.func_stack_frame_non_standard"); + file.rodata = find_section_by_name(file.elf, ".rodata"); +- file.ignore_unreachables = false; + file.c_file = find_section_by_name(file.elf, ".comment"); ++ file.ignore_unreachables = false; ++ file.hints = false; + + arch_initial_func_cfi_state(&initial_func_cfi); + +@@ -1683,6 +1847,11 @@ int check(const char *_objname, bool _nofp, bool orc) + goto out; + warnings += ret; + ++ ret = validate_unwind_hints(&file); ++ if (ret < 0) ++ goto out; ++ warnings += ret; ++ + if (!warnings) { + ret = validate_reachable_instructions(&file); + if (ret < 0) +-- +2.14.2 + diff --git a/patches/kernel/0037-x86-unwind-Add-the-ORC-unwinder.patch b/patches/kernel/0037-x86-unwind-Add-the-ORC-unwinder.patch deleted file mode 100644 index b8f0318..0000000 --- a/patches/kernel/0037-x86-unwind-Add-the-ORC-unwinder.patch +++ /dev/null @@ -1,1407 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Josh Poimboeuf -Date: Mon, 24 Jul 2017 18:36:57 -0500 -Subject: [PATCH] x86/unwind: Add the ORC unwinder -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Add the new ORC unwinder which is enabled by CONFIG_ORC_UNWINDER=y. -It plugs into the existing x86 unwinder framework. - -It relies on objtool to generate the needed .orc_unwind and -.orc_unwind_ip sections. 
- -For more details on why ORC is used instead of DWARF, see -Documentation/x86/orc-unwinder.txt - but the short version is -that it's a simplified, fundamentally more robust debugninfo -data structure, which also allows up to two orders of magnitude -faster lookups than the DWARF unwinder - which matters to -profiling workloads like perf. - -Thanks to Andy Lutomirski for the performance improvement ideas: -splitting the ORC unwind table into two parallel arrays and creating a -fast lookup table to search a subset of the unwind table. - -Signed-off-by: Josh Poimboeuf -Cc: Andy Lutomirski -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Denys Vlasenko -Cc: H. Peter Anvin -Cc: Jiri Slaby -Cc: Linus Torvalds -Cc: Mike Galbraith -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Cc: live-patching@vger.kernel.org -Link: http://lkml.kernel.org/r/0a6cbfb40f8da99b7a45a1a8302dc6aef16ec812.1500938583.git.jpoimboe@redhat.com -[ Extended the changelog. ] -Signed-off-by: Ingo Molnar -(cherry picked from commit ee9f8fce99640811b2b8e79d0d1dbe8bab69ba67) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit dccbf63d7a6cc431af23a86e28275a74904545cd) -Signed-off-by: Fabian Grünbichler ---- - Documentation/x86/orc-unwinder.txt | 179 ++++++++++++ - arch/x86/kernel/Makefile | 8 +- - scripts/Makefile.build | 14 +- - arch/um/include/asm/unwind.h | 8 + - arch/x86/include/asm/module.h | 9 + - arch/x86/include/asm/orc_lookup.h | 46 +++ - arch/x86/include/asm/orc_types.h | 2 +- - arch/x86/include/asm/unwind.h | 76 +++-- - include/asm-generic/vmlinux.lds.h | 27 +- - arch/x86/kernel/module.c | 11 +- - arch/x86/kernel/setup.c | 3 + - arch/x86/kernel/unwind_frame.c | 39 +-- - arch/x86/kernel/unwind_guess.c | 5 + - arch/x86/kernel/unwind_orc.c | 582 +++++++++++++++++++++++++++++++++++++ - arch/x86/Kconfig | 1 + - arch/x86/Kconfig.debug | 25 ++ - arch/x86/kernel/vmlinux.lds.S | 3 + - lib/Kconfig.debug | 3 + - 18 files changed, 977 insertions(+), 64 deletions(-) - 
create mode 100644 Documentation/x86/orc-unwinder.txt - create mode 100644 arch/um/include/asm/unwind.h - create mode 100644 arch/x86/include/asm/orc_lookup.h - create mode 100644 arch/x86/kernel/unwind_orc.c - -diff --git a/Documentation/x86/orc-unwinder.txt b/Documentation/x86/orc-unwinder.txt -new file mode 100644 -index 000000000000..af0c9a4c65a6 ---- /dev/null -+++ b/Documentation/x86/orc-unwinder.txt -@@ -0,0 +1,179 @@ -+ORC unwinder -+============ -+ -+Overview -+-------- -+ -+The kernel CONFIG_ORC_UNWINDER option enables the ORC unwinder, which is -+similar in concept to a DWARF unwinder. The difference is that the -+format of the ORC data is much simpler than DWARF, which in turn allows -+the ORC unwinder to be much simpler and faster. -+ -+The ORC data consists of unwind tables which are generated by objtool. -+They contain out-of-band data which is used by the in-kernel ORC -+unwinder. Objtool generates the ORC data by first doing compile-time -+stack metadata validation (CONFIG_STACK_VALIDATION). After analyzing -+all the code paths of a .o file, it determines information about the -+stack state at each instruction address in the file and outputs that -+information to the .orc_unwind and .orc_unwind_ip sections. -+ -+The per-object ORC sections are combined at link time and are sorted and -+post-processed at boot time. The unwinder uses the resulting data to -+correlate instruction addresses with their stack states at run time. -+ -+ -+ORC vs frame pointers -+--------------------- -+ -+With frame pointers enabled, GCC adds instrumentation code to every -+function in the kernel. The kernel's .text size increases by about -+3.2%, resulting in a broad kernel-wide slowdown. Measurements by Mel -+Gorman [1] have shown a slowdown of 5-10% for some workloads. -+ -+In contrast, the ORC unwinder has no effect on text size or runtime -+performance, because the debuginfo is out of band. 
So if you disable -+frame pointers and enable the ORC unwinder, you get a nice performance -+improvement across the board, and still have reliable stack traces. -+ -+Ingo Molnar says: -+ -+ "Note that it's not just a performance improvement, but also an -+ instruction cache locality improvement: 3.2% .text savings almost -+ directly transform into a similarly sized reduction in cache -+ footprint. That can transform to even higher speedups for workloads -+ whose cache locality is borderline." -+ -+Another benefit of ORC compared to frame pointers is that it can -+reliably unwind across interrupts and exceptions. Frame pointer based -+unwinds can sometimes skip the caller of the interrupted function, if it -+was a leaf function or if the interrupt hit before the frame pointer was -+saved. -+ -+The main disadvantage of the ORC unwinder compared to frame pointers is -+that it needs more memory to store the ORC unwind tables: roughly 2-4MB -+depending on the kernel config. -+ -+ -+ORC vs DWARF -+------------ -+ -+ORC debuginfo's advantage over DWARF itself is that it's much simpler. -+It gets rid of the complex DWARF CFI state machine and also gets rid of -+the tracking of unnecessary registers. This allows the unwinder to be -+much simpler, meaning fewer bugs, which is especially important for -+mission critical oops code. -+ -+The simpler debuginfo format also enables the unwinder to be much faster -+than DWARF, which is important for perf and lockdep. In a basic -+performance test by Jiri Slaby [2], the ORC unwinder was about 20x -+faster than an out-of-tree DWARF unwinder. (Note: That measurement was -+taken before some performance tweaks were added, which doubled -+performance, so the speedup over DWARF may be closer to 40x.) -+ -+The ORC data format does have a few downsides compared to DWARF. ORC -+unwind tables take up ~50% more RAM (+1.3MB on an x86 defconfig kernel) -+than DWARF-based eh_frame tables. 
-+ -+Another potential downside is that, as GCC evolves, it's conceivable -+that the ORC data may end up being *too* simple to describe the state of -+the stack for certain optimizations. But IMO this is unlikely because -+GCC saves the frame pointer for any unusual stack adjustments it does, -+so I suspect we'll really only ever need to keep track of the stack -+pointer and the frame pointer between call frames. But even if we do -+end up having to track all the registers DWARF tracks, at least we will -+still be able to control the format, e.g. no complex state machines. -+ -+ -+ORC unwind table generation -+--------------------------- -+ -+The ORC data is generated by objtool. With the existing compile-time -+stack metadata validation feature, objtool already follows all code -+paths, and so it already has all the information it needs to be able to -+generate ORC data from scratch. So it's an easy step to go from stack -+validation to ORC data generation. -+ -+It should be possible to instead generate the ORC data with a simple -+tool which converts DWARF to ORC data. However, such a solution would -+be incomplete due to the kernel's extensive use of asm, inline asm, and -+special sections like exception tables. -+ -+That could be rectified by manually annotating those special code paths -+using GNU assembler .cfi annotations in .S files, and homegrown -+annotations for inline asm in .c files. But asm annotations were tried -+in the past and were found to be unmaintainable. They were often -+incorrect/incomplete and made the code harder to read and keep updated. -+And based on looking at glibc code, annotating inline asm in .c files -+might be even worse. -+ -+Objtool still needs a few annotations, but only in code which does -+unusual things to the stack like entry code. And even then, far fewer -+annotations are needed than what DWARF would need, so they're much more -+maintainable than DWARF CFI annotations. 
-+ -+So the advantages of using objtool to generate ORC data are that it -+gives more accurate debuginfo, with very few annotations. It also -+insulates the kernel from toolchain bugs which can be very painful to -+deal with in the kernel since we often have to workaround issues in -+older versions of the toolchain for years. -+ -+The downside is that the unwinder now becomes dependent on objtool's -+ability to reverse engineer GCC code flow. If GCC optimizations become -+too complicated for objtool to follow, the ORC data generation might -+stop working or become incomplete. (It's worth noting that livepatch -+already has such a dependency on objtool's ability to follow GCC code -+flow.) -+ -+If newer versions of GCC come up with some optimizations which break -+objtool, we may need to revisit the current implementation. Some -+possible solutions would be asking GCC to make the optimizations more -+palatable, or having objtool use DWARF as an additional input, or -+creating a GCC plugin to assist objtool with its analysis. But for now, -+objtool follows GCC code quite well. -+ -+ -+Unwinder implementation details -+------------------------------- -+ -+Objtool generates the ORC data by integrating with the compile-time -+stack metadata validation feature, which is described in detail in -+tools/objtool/Documentation/stack-validation.txt. After analyzing all -+the code paths of a .o file, it creates an array of orc_entry structs, -+and a parallel array of instruction addresses associated with those -+structs, and writes them to the .orc_unwind and .orc_unwind_ip sections -+respectively. -+ -+The ORC data is split into the two arrays for performance reasons, to -+make the searchable part of the data (.orc_unwind_ip) more compact. The -+arrays are sorted in parallel at boot time. -+ -+Performance is further improved by the use of a fast lookup table which -+is created at runtime. 
The fast lookup table associates a given address -+with a range of indices for the .orc_unwind table, so that only a small -+subset of the table needs to be searched. -+ -+ -+Etymology -+--------- -+ -+Orcs, fearsome creatures of medieval folklore, are the Dwarves' natural -+enemies. Similarly, the ORC unwinder was created in opposition to the -+complexity and slowness of DWARF. -+ -+"Although Orcs rarely consider multiple solutions to a problem, they do -+excel at getting things done because they are creatures of action, not -+thought." [3] Similarly, unlike the esoteric DWARF unwinder, the -+veracious ORC unwinder wastes no time or siloconic effort decoding -+variable-length zero-extended unsigned-integer byte-coded -+state-machine-based debug information entries. -+ -+Similar to how Orcs frequently unravel the well-intentioned plans of -+their adversaries, the ORC unwinder frequently unravels stacks with -+brutal, unyielding efficiency. -+ -+ORC stands for Oops Rewind Capability. -+ -+ -+[1] https://lkml.kernel.org/r/20170602104048.jkkzssljsompjdwy@suse.de -+[2] https://lkml.kernel.org/r/d2ca5435-6386-29b8-db87-7f227c2b713a@suse.cz -+[3] http://dustin.wikidot.com/half-orcs-and-orcs -diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile -index a01892bdd61a..287eac7d207f 100644 ---- a/arch/x86/kernel/Makefile -+++ b/arch/x86/kernel/Makefile -@@ -126,11 +126,9 @@ obj-$(CONFIG_PERF_EVENTS) += perf_regs.o - obj-$(CONFIG_TRACING) += tracepoint.o - obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o - --ifdef CONFIG_FRAME_POINTER --obj-y += unwind_frame.o --else --obj-y += unwind_guess.o --endif -+obj-$(CONFIG_ORC_UNWINDER) += unwind_orc.o -+obj-$(CONFIG_FRAME_POINTER_UNWINDER) += unwind_frame.o -+obj-$(CONFIG_GUESS_UNWINDER) += unwind_guess.o - - ### - # 64 bit specific files -diff --git a/scripts/Makefile.build b/scripts/Makefile.build -index 273bc2228307..ab2c8ef43cdb 100644 ---- a/scripts/Makefile.build -+++ b/scripts/Makefile.build -@@ -258,7 +258,8 @@ ifneq 
($(SKIP_STACK_VALIDATION),1) - - __objtool_obj := $(objtree)/tools/objtool/objtool - --objtool_args = check -+objtool_args = $(if $(CONFIG_ORC_UNWINDER),orc generate,check) -+ - ifndef CONFIG_FRAME_POINTER - objtool_args += --no-fp - endif -@@ -276,6 +277,11 @@ objtool_obj = $(if $(patsubst y%,, \ - endif # SKIP_STACK_VALIDATION - endif # CONFIG_STACK_VALIDATION - -+# Rebuild all objects when objtool changes, or is enabled/disabled. -+objtool_dep = $(objtool_obj) \ -+ $(wildcard include/config/orc/unwinder.h \ -+ include/config/stack/validation.h) -+ - define rule_cc_o_c - $(call echo-cmd,checksrc) $(cmd_checksrc) \ - $(call cmd_and_fixdep,cc_o_c) \ -@@ -298,14 +304,14 @@ cmd_undef_syms = echo - endif - - # Built-in and composite module parts --$(obj)/%.o: $(src)/%.c $(recordmcount_source) $(objtool_obj) FORCE -+$(obj)/%.o: $(src)/%.c $(recordmcount_source) $(objtool_dep) FORCE - $(call cmd,force_checksrc) - $(call cmd,force_check_kmsg) - $(call if_changed_rule,cc_o_c) - - # Single-part modules are special since we need to mark them in $(MODVERDIR) - --$(single-used-m): $(obj)/%.o: $(src)/%.c $(recordmcount_source) $(objtool_obj) FORCE -+$(single-used-m): $(obj)/%.o: $(src)/%.c $(recordmcount_source) $(objtool_dep) FORCE - $(call cmd,force_checksrc) - $(call cmd,force_check_kmsg) - $(call if_changed_rule,cc_o_c) -@@ -401,7 +407,7 @@ cmd_modversions_S = \ - endif - endif - --$(obj)/%.o: $(src)/%.S $(objtool_obj) FORCE -+$(obj)/%.o: $(src)/%.S $(objtool_dep) FORCE - $(call if_changed_rule,as_o_S) - - targets += $(real-objs-y) $(real-objs-m) $(lib-y) -diff --git a/arch/um/include/asm/unwind.h b/arch/um/include/asm/unwind.h -new file mode 100644 -index 000000000000..7ffa5437b761 ---- /dev/null -+++ b/arch/um/include/asm/unwind.h -@@ -0,0 +1,8 @@ -+#ifndef _ASM_UML_UNWIND_H -+#define _ASM_UML_UNWIND_H -+ -+static inline void -+unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, -+ void *orc, size_t orc_size) {} -+ -+#endif /* _ASM_UML_UNWIND_H */ 
-diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h -index e3b7819caeef..9eb7c718aaf8 100644 ---- a/arch/x86/include/asm/module.h -+++ b/arch/x86/include/asm/module.h -@@ -2,6 +2,15 @@ - #define _ASM_X86_MODULE_H - - #include -+#include -+ -+struct mod_arch_specific { -+#ifdef CONFIG_ORC_UNWINDER -+ unsigned int num_orcs; -+ int *orc_unwind_ip; -+ struct orc_entry *orc_unwind; -+#endif -+}; - - #ifdef CONFIG_X86_64 - /* X86_64 does not define MODULE_PROC_FAMILY */ -diff --git a/arch/x86/include/asm/orc_lookup.h b/arch/x86/include/asm/orc_lookup.h -new file mode 100644 -index 000000000000..91c8d868424d ---- /dev/null -+++ b/arch/x86/include/asm/orc_lookup.h -@@ -0,0 +1,46 @@ -+/* -+ * Copyright (C) 2017 Josh Poimboeuf -+ * -+ * This program is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU General Public License -+ * as published by the Free Software Foundation; either version 2 -+ * of the License, or (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, see . -+ */ -+#ifndef _ORC_LOOKUP_H -+#define _ORC_LOOKUP_H -+ -+/* -+ * This is a lookup table for speeding up access to the .orc_unwind table. -+ * Given an input address offset, the corresponding lookup table entry -+ * specifies a subset of the .orc_unwind table to search. -+ * -+ * Each block represents the end of the previous range and the start of the -+ * next range. An extra block is added to give the last range an end. -+ * -+ * The block size should be a power of 2 to avoid a costly 'div' instruction. 
-+ * -+ * A block size of 256 was chosen because it roughly doubles unwinder -+ * performance while only adding ~5% to the ORC data footprint. -+ */ -+#define LOOKUP_BLOCK_ORDER 8 -+#define LOOKUP_BLOCK_SIZE (1 << LOOKUP_BLOCK_ORDER) -+ -+#ifndef LINKER_SCRIPT -+ -+extern unsigned int orc_lookup[]; -+extern unsigned int orc_lookup_end[]; -+ -+#define LOOKUP_START_IP (unsigned long)_stext -+#define LOOKUP_STOP_IP (unsigned long)_etext -+ -+#endif /* LINKER_SCRIPT */ -+ -+#endif /* _ORC_LOOKUP_H */ -diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h -index 7dc777a6cb40..9c9dc579bd7d 100644 ---- a/arch/x86/include/asm/orc_types.h -+++ b/arch/x86/include/asm/orc_types.h -@@ -88,7 +88,7 @@ struct orc_entry { - unsigned sp_reg:4; - unsigned bp_reg:4; - unsigned type:2; --}; -+} __packed; - - /* - * This struct is used by asm and inline asm code to manually annotate the -diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h -index e6676495b125..25b8d31a007d 100644 ---- a/arch/x86/include/asm/unwind.h -+++ b/arch/x86/include/asm/unwind.h -@@ -12,11 +12,14 @@ struct unwind_state { - struct task_struct *task; - int graph_idx; - bool error; --#ifdef CONFIG_FRAME_POINTER -+#if defined(CONFIG_ORC_UNWINDER) -+ bool signal, full_regs; -+ unsigned long sp, bp, ip; -+ struct pt_regs *regs; -+#elif defined(CONFIG_FRAME_POINTER) - bool got_irq; -- unsigned long *bp, *orig_sp; -+ unsigned long *bp, *orig_sp, ip; - struct pt_regs *regs; -- unsigned long ip; - #else - unsigned long *sp; - #endif -@@ -24,41 +27,30 @@ struct unwind_state { - - void __unwind_start(struct unwind_state *state, struct task_struct *task, - struct pt_regs *regs, unsigned long *first_frame); -- - bool unwind_next_frame(struct unwind_state *state); -- - unsigned long unwind_get_return_address(struct unwind_state *state); -+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state); - - static inline bool unwind_done(struct unwind_state *state) - { - 
return state->stack_info.type == STACK_TYPE_UNKNOWN; - } - --static inline --void unwind_start(struct unwind_state *state, struct task_struct *task, -- struct pt_regs *regs, unsigned long *first_frame) --{ -- first_frame = first_frame ? : get_stack_pointer(task, regs); -- -- __unwind_start(state, task, regs, first_frame); --} -- - static inline bool unwind_error(struct unwind_state *state) - { - return state->error; - } - --#ifdef CONFIG_FRAME_POINTER -- - static inline --unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) -+void unwind_start(struct unwind_state *state, struct task_struct *task, -+ struct pt_regs *regs, unsigned long *first_frame) - { -- if (unwind_done(state)) -- return NULL; -+ first_frame = first_frame ? : get_stack_pointer(task, regs); - -- return state->regs ? &state->regs->ip : state->bp + 1; -+ __unwind_start(state, task, regs, first_frame); - } - -+#if defined(CONFIG_ORC_UNWINDER) || defined(CONFIG_FRAME_POINTER) - static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) - { - if (unwind_done(state)) -@@ -66,20 +58,46 @@ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) - - return state->regs; - } -- --#else /* !CONFIG_FRAME_POINTER */ -- --static inline --unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) -+#else -+static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) - { - return NULL; - } -+#endif - --static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) -+#ifdef CONFIG_ORC_UNWINDER -+void unwind_init(void); -+void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, -+ void *orc, size_t orc_size); -+#else -+static inline void unwind_init(void) {} -+static inline -+void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, -+ void *orc, size_t orc_size) {} -+#endif -+ -+/* -+ * This disables KASAN checking when reading a value from another task's stack, 
-+ * since the other task could be running on another CPU and could have poisoned -+ * the stack in the meantime. -+ */ -+#define READ_ONCE_TASK_STACK(task, x) \ -+({ \ -+ unsigned long val; \ -+ if (task == current) \ -+ val = READ_ONCE(x); \ -+ else \ -+ val = READ_ONCE_NOCHECK(x); \ -+ val; \ -+}) -+ -+static inline bool task_on_another_cpu(struct task_struct *task) - { -- return NULL; -+#ifdef CONFIG_SMP -+ return task != current && task->on_cpu; -+#else -+ return false; -+#endif - } - --#endif /* CONFIG_FRAME_POINTER */ -- - #endif /* _ASM_X86_UNWIND_H */ -diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h -index e7e955d4ab9e..9fdb54a95976 100644 ---- a/include/asm-generic/vmlinux.lds.h -+++ b/include/asm-generic/vmlinux.lds.h -@@ -686,6 +686,31 @@ - #define BUG_TABLE - #endif - -+#ifdef CONFIG_ORC_UNWINDER -+#define ORC_UNWIND_TABLE \ -+ . = ALIGN(4); \ -+ .orc_unwind_ip : AT(ADDR(.orc_unwind_ip) - LOAD_OFFSET) { \ -+ VMLINUX_SYMBOL(__start_orc_unwind_ip) = .; \ -+ KEEP(*(.orc_unwind_ip)) \ -+ VMLINUX_SYMBOL(__stop_orc_unwind_ip) = .; \ -+ } \ -+ . = ALIGN(6); \ -+ .orc_unwind : AT(ADDR(.orc_unwind) - LOAD_OFFSET) { \ -+ VMLINUX_SYMBOL(__start_orc_unwind) = .; \ -+ KEEP(*(.orc_unwind)) \ -+ VMLINUX_SYMBOL(__stop_orc_unwind) = .; \ -+ } \ -+ . = ALIGN(4); \ -+ .orc_lookup : AT(ADDR(.orc_lookup) - LOAD_OFFSET) { \ -+ VMLINUX_SYMBOL(orc_lookup) = .; \ -+ . += (((SIZEOF(.text) + LOOKUP_BLOCK_SIZE - 1) / \ -+ LOOKUP_BLOCK_SIZE) + 1) * 4; \ -+ VMLINUX_SYMBOL(orc_lookup_end) = .; \ -+ } -+#else -+#define ORC_UNWIND_TABLE -+#endif -+ - #ifdef CONFIG_PM_TRACE - #define TRACEDATA \ - . = ALIGN(4); \ -@@ -872,7 +897,7 @@ - DATA_DATA \ - CONSTRUCTORS \ - } \ -- BUG_TABLE -+ BUG_TABLE \ - - #define INIT_TEXT_SECTION(inittext_align) \ - . 
= ALIGN(inittext_align); \ -diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c -index f67bd3205df7..62e7d70aadd5 100644 ---- a/arch/x86/kernel/module.c -+++ b/arch/x86/kernel/module.c -@@ -35,6 +35,7 @@ - #include - #include - #include -+#include - - #if 0 - #define DEBUGP(fmt, ...) \ -@@ -213,7 +214,7 @@ int module_finalize(const Elf_Ehdr *hdr, - struct module *me) - { - const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, -- *para = NULL; -+ *para = NULL, *orc = NULL, *orc_ip = NULL; - char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; - - for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { -@@ -225,6 +226,10 @@ int module_finalize(const Elf_Ehdr *hdr, - locks = s; - if (!strcmp(".parainstructions", secstrings + s->sh_name)) - para = s; -+ if (!strcmp(".orc_unwind", secstrings + s->sh_name)) -+ orc = s; -+ if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name)) -+ orc_ip = s; - } - - if (alt) { -@@ -248,6 +253,10 @@ int module_finalize(const Elf_Ehdr *hdr, - /* make jump label nops */ - jump_label_apply_nops(me); - -+ if (orc && orc_ip) -+ unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size, -+ (void *)orc->sh_addr, orc->sh_size); -+ - return 0; - } - -diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c -index f964bfddfefd..dd6e8707e969 100644 ---- a/arch/x86/kernel/setup.c -+++ b/arch/x86/kernel/setup.c -@@ -121,6 +121,7 @@ - #include - #include - #include -+#include - - /* - * max_low_pfn_mapped: highest direct mapped pfn under 4GB -@@ -1325,6 +1326,8 @@ void __init setup_arch(char **cmdline_p) - if (efi_enabled(EFI_BOOT)) - efi_apply_memmap_quirks(); - #endif -+ -+ unwind_init(); - } - - #ifdef CONFIG_X86_32 -diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c -index c29e5bc7e9c9..d145a0b1f529 100644 ---- a/arch/x86/kernel/unwind_frame.c -+++ b/arch/x86/kernel/unwind_frame.c -@@ -10,20 +10,22 @@ - - #define FRAME_HEADER_SIZE (sizeof(long) * 2) - --/* -- * This disables 
KASAN checking when reading a value from another task's stack, -- * since the other task could be running on another CPU and could have poisoned -- * the stack in the meantime. -- */ --#define READ_ONCE_TASK_STACK(task, x) \ --({ \ -- unsigned long val; \ -- if (task == current) \ -- val = READ_ONCE(x); \ -- else \ -- val = READ_ONCE_NOCHECK(x); \ -- val; \ --}) -+unsigned long unwind_get_return_address(struct unwind_state *state) -+{ -+ if (unwind_done(state)) -+ return 0; -+ -+ return __kernel_text_address(state->ip) ? state->ip : 0; -+} -+EXPORT_SYMBOL_GPL(unwind_get_return_address); -+ -+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) -+{ -+ if (unwind_done(state)) -+ return NULL; -+ -+ return state->regs ? &state->regs->ip : state->bp + 1; -+} - - static void unwind_dump(struct unwind_state *state) - { -@@ -66,15 +68,6 @@ static void unwind_dump(struct unwind_state *state) - } - } - --unsigned long unwind_get_return_address(struct unwind_state *state) --{ -- if (unwind_done(state)) -- return 0; -- -- return __kernel_text_address(state->ip) ? 
state->ip : 0; --} --EXPORT_SYMBOL_GPL(unwind_get_return_address); -- - static size_t regs_size(struct pt_regs *regs) - { - /* x86_32 regs from kernel mode are two words shorter: */ -diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c -index 039f36738e49..4f0e17b90463 100644 ---- a/arch/x86/kernel/unwind_guess.c -+++ b/arch/x86/kernel/unwind_guess.c -@@ -19,6 +19,11 @@ unsigned long unwind_get_return_address(struct unwind_state *state) - } - EXPORT_SYMBOL_GPL(unwind_get_return_address); - -+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) -+{ -+ return NULL; -+} -+ - bool unwind_next_frame(struct unwind_state *state) - { - struct stack_info *info = &state->stack_info; -diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c -new file mode 100644 -index 000000000000..570b70d3f604 ---- /dev/null -+++ b/arch/x86/kernel/unwind_orc.c -@@ -0,0 +1,582 @@ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#define orc_warn(fmt, ...) \ -+ printk_deferred_once(KERN_WARNING pr_fmt("WARNING: " fmt), ##__VA_ARGS__) -+ -+extern int __start_orc_unwind_ip[]; -+extern int __stop_orc_unwind_ip[]; -+extern struct orc_entry __start_orc_unwind[]; -+extern struct orc_entry __stop_orc_unwind[]; -+ -+static DEFINE_MUTEX(sort_mutex); -+int *cur_orc_ip_table = __start_orc_unwind_ip; -+struct orc_entry *cur_orc_table = __start_orc_unwind; -+ -+unsigned int lookup_num_blocks; -+bool orc_init; -+ -+static inline unsigned long orc_ip(const int *ip) -+{ -+ return (unsigned long)ip + *ip; -+} -+ -+static struct orc_entry *__orc_find(int *ip_table, struct orc_entry *u_table, -+ unsigned int num_entries, unsigned long ip) -+{ -+ int *first = ip_table; -+ int *last = ip_table + num_entries - 1; -+ int *mid = first, *found = first; -+ -+ if (!num_entries) -+ return NULL; -+ -+ /* -+ * Do a binary range search to find the rightmost duplicate of a given -+ * starting address. 
Some entries are section terminators which are -+ * "weak" entries for ensuring there are no gaps. They should be -+ * ignored when they conflict with a real entry. -+ */ -+ while (first <= last) { -+ mid = first + ((last - first) / 2); -+ -+ if (orc_ip(mid) <= ip) { -+ found = mid; -+ first = mid + 1; -+ } else -+ last = mid - 1; -+ } -+ -+ return u_table + (found - ip_table); -+} -+ -+#ifdef CONFIG_MODULES -+static struct orc_entry *orc_module_find(unsigned long ip) -+{ -+ struct module *mod; -+ -+ mod = __module_address(ip); -+ if (!mod || !mod->arch.orc_unwind || !mod->arch.orc_unwind_ip) -+ return NULL; -+ return __orc_find(mod->arch.orc_unwind_ip, mod->arch.orc_unwind, -+ mod->arch.num_orcs, ip); -+} -+#else -+static struct orc_entry *orc_module_find(unsigned long ip) -+{ -+ return NULL; -+} -+#endif -+ -+static struct orc_entry *orc_find(unsigned long ip) -+{ -+ if (!orc_init) -+ return NULL; -+ -+ /* For non-init vmlinux addresses, use the fast lookup table: */ -+ if (ip >= LOOKUP_START_IP && ip < LOOKUP_STOP_IP) { -+ unsigned int idx, start, stop; -+ -+ idx = (ip - LOOKUP_START_IP) / LOOKUP_BLOCK_SIZE; -+ -+ if (unlikely((idx >= lookup_num_blocks-1))) { -+ orc_warn("WARNING: bad lookup idx: idx=%u num=%u ip=%lx\n", -+ idx, lookup_num_blocks, ip); -+ return NULL; -+ } -+ -+ start = orc_lookup[idx]; -+ stop = orc_lookup[idx + 1] + 1; -+ -+ if (unlikely((__start_orc_unwind + start >= __stop_orc_unwind) || -+ (__start_orc_unwind + stop > __stop_orc_unwind))) { -+ orc_warn("WARNING: bad lookup value: idx=%u num=%u start=%u stop=%u ip=%lx\n", -+ idx, lookup_num_blocks, start, stop, ip); -+ return NULL; -+ } -+ -+ return __orc_find(__start_orc_unwind_ip + start, -+ __start_orc_unwind + start, stop - start, ip); -+ } -+ -+ /* vmlinux .init slow lookup: */ -+ if (ip >= (unsigned long)_sinittext && ip < (unsigned long)_einittext) -+ return __orc_find(__start_orc_unwind_ip, __start_orc_unwind, -+ __stop_orc_unwind_ip - __start_orc_unwind_ip, ip); -+ -+ /* Module 
lookup: */ -+ return orc_module_find(ip); -+} -+ -+static void orc_sort_swap(void *_a, void *_b, int size) -+{ -+ struct orc_entry *orc_a, *orc_b; -+ struct orc_entry orc_tmp; -+ int *a = _a, *b = _b, tmp; -+ int delta = _b - _a; -+ -+ /* Swap the .orc_unwind_ip entries: */ -+ tmp = *a; -+ *a = *b + delta; -+ *b = tmp - delta; -+ -+ /* Swap the corresponding .orc_unwind entries: */ -+ orc_a = cur_orc_table + (a - cur_orc_ip_table); -+ orc_b = cur_orc_table + (b - cur_orc_ip_table); -+ orc_tmp = *orc_a; -+ *orc_a = *orc_b; -+ *orc_b = orc_tmp; -+} -+ -+static int orc_sort_cmp(const void *_a, const void *_b) -+{ -+ struct orc_entry *orc_a; -+ const int *a = _a, *b = _b; -+ unsigned long a_val = orc_ip(a); -+ unsigned long b_val = orc_ip(b); -+ -+ if (a_val > b_val) -+ return 1; -+ if (a_val < b_val) -+ return -1; -+ -+ /* -+ * The "weak" section terminator entries need to always be on the left -+ * to ensure the lookup code skips them in favor of real entries. -+ * These terminator entries exist to handle any gaps created by -+ * whitelisted .o files which didn't get objtool generation. -+ */ -+ orc_a = cur_orc_table + (a - cur_orc_ip_table); -+ return orc_a->sp_reg == ORC_REG_UNDEFINED ? -1 : 1; -+} -+ -+#ifdef CONFIG_MODULES -+void unwind_module_init(struct module *mod, void *_orc_ip, size_t orc_ip_size, -+ void *_orc, size_t orc_size) -+{ -+ int *orc_ip = _orc_ip; -+ struct orc_entry *orc = _orc; -+ unsigned int num_entries = orc_ip_size / sizeof(int); -+ -+ WARN_ON_ONCE(orc_ip_size % sizeof(int) != 0 || -+ orc_size % sizeof(*orc) != 0 || -+ num_entries != orc_size / sizeof(*orc)); -+ -+ /* -+ * The 'cur_orc_*' globals allow the orc_sort_swap() callback to -+ * associate an .orc_unwind_ip table entry with its corresponding -+ * .orc_unwind entry so they can both be swapped. 
-+ */ -+ mutex_lock(&sort_mutex); -+ cur_orc_ip_table = orc_ip; -+ cur_orc_table = orc; -+ sort(orc_ip, num_entries, sizeof(int), orc_sort_cmp, orc_sort_swap); -+ mutex_unlock(&sort_mutex); -+ -+ mod->arch.orc_unwind_ip = orc_ip; -+ mod->arch.orc_unwind = orc; -+ mod->arch.num_orcs = num_entries; -+} -+#endif -+ -+void __init unwind_init(void) -+{ -+ size_t orc_ip_size = (void *)__stop_orc_unwind_ip - (void *)__start_orc_unwind_ip; -+ size_t orc_size = (void *)__stop_orc_unwind - (void *)__start_orc_unwind; -+ size_t num_entries = orc_ip_size / sizeof(int); -+ struct orc_entry *orc; -+ int i; -+ -+ if (!num_entries || orc_ip_size % sizeof(int) != 0 || -+ orc_size % sizeof(struct orc_entry) != 0 || -+ num_entries != orc_size / sizeof(struct orc_entry)) { -+ orc_warn("WARNING: Bad or missing .orc_unwind table. Disabling unwinder.\n"); -+ return; -+ } -+ -+ /* Sort the .orc_unwind and .orc_unwind_ip tables: */ -+ sort(__start_orc_unwind_ip, num_entries, sizeof(int), orc_sort_cmp, -+ orc_sort_swap); -+ -+ /* Initialize the fast lookup table: */ -+ lookup_num_blocks = orc_lookup_end - orc_lookup; -+ for (i = 0; i < lookup_num_blocks-1; i++) { -+ orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind, -+ num_entries, -+ LOOKUP_START_IP + (LOOKUP_BLOCK_SIZE * i)); -+ if (!orc) { -+ orc_warn("WARNING: Corrupt .orc_unwind table. Disabling unwinder.\n"); -+ return; -+ } -+ -+ orc_lookup[i] = orc - __start_orc_unwind; -+ } -+ -+ /* Initialize the ending block: */ -+ orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind, num_entries, -+ LOOKUP_STOP_IP); -+ if (!orc) { -+ orc_warn("WARNING: Corrupt .orc_unwind table. Disabling unwinder.\n"); -+ return; -+ } -+ orc_lookup[lookup_num_blocks-1] = orc - __start_orc_unwind; -+ -+ orc_init = true; -+} -+ -+unsigned long unwind_get_return_address(struct unwind_state *state) -+{ -+ if (unwind_done(state)) -+ return 0; -+ -+ return __kernel_text_address(state->ip) ? 
state->ip : 0; -+} -+EXPORT_SYMBOL_GPL(unwind_get_return_address); -+ -+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) -+{ -+ if (unwind_done(state)) -+ return NULL; -+ -+ if (state->regs) -+ return &state->regs->ip; -+ -+ if (state->sp) -+ return (unsigned long *)state->sp - 1; -+ -+ return NULL; -+} -+ -+static bool stack_access_ok(struct unwind_state *state, unsigned long addr, -+ size_t len) -+{ -+ struct stack_info *info = &state->stack_info; -+ -+ /* -+ * If the address isn't on the current stack, switch to the next one. -+ * -+ * We may have to traverse multiple stacks to deal with the possibility -+ * that info->next_sp could point to an empty stack and the address -+ * could be on a subsequent stack. -+ */ -+ while (!on_stack(info, (void *)addr, len)) -+ if (get_stack_info(info->next_sp, state->task, info, -+ &state->stack_mask)) -+ return false; -+ -+ return true; -+} -+ -+static bool deref_stack_reg(struct unwind_state *state, unsigned long addr, -+ unsigned long *val) -+{ -+ if (!stack_access_ok(state, addr, sizeof(long))) -+ return false; -+ -+ *val = READ_ONCE_TASK_STACK(state->task, *(unsigned long *)addr); -+ return true; -+} -+ -+#define REGS_SIZE (sizeof(struct pt_regs)) -+#define SP_OFFSET (offsetof(struct pt_regs, sp)) -+#define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip)) -+#define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip)) -+ -+static bool deref_stack_regs(struct unwind_state *state, unsigned long addr, -+ unsigned long *ip, unsigned long *sp, bool full) -+{ -+ size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE; -+ size_t sp_offset = full ? 
SP_OFFSET : IRET_SP_OFFSET; -+ struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE); -+ -+ if (IS_ENABLED(CONFIG_X86_64)) { -+ if (!stack_access_ok(state, addr, regs_size)) -+ return false; -+ -+ *ip = regs->ip; -+ *sp = regs->sp; -+ -+ return true; -+ } -+ -+ if (!stack_access_ok(state, addr, sp_offset)) -+ return false; -+ -+ *ip = regs->ip; -+ -+ if (user_mode(regs)) { -+ if (!stack_access_ok(state, addr + sp_offset, -+ REGS_SIZE - SP_OFFSET)) -+ return false; -+ -+ *sp = regs->sp; -+ } else -+ *sp = (unsigned long)&regs->sp; -+ -+ return true; -+} -+ -+bool unwind_next_frame(struct unwind_state *state) -+{ -+ unsigned long ip_p, sp, orig_ip, prev_sp = state->sp; -+ enum stack_type prev_type = state->stack_info.type; -+ struct orc_entry *orc; -+ struct pt_regs *ptregs; -+ bool indirect = false; -+ -+ if (unwind_done(state)) -+ return false; -+ -+ /* Don't let modules unload while we're reading their ORC data. */ -+ preempt_disable(); -+ -+ /* Have we reached the end? */ -+ if (state->regs && user_mode(state->regs)) -+ goto done; -+ -+ /* -+ * Find the orc_entry associated with the text address. -+ * -+ * Decrement call return addresses by one so they work for sibling -+ * calls and calls to noreturn functions. -+ */ -+ orc = orc_find(state->signal ? 
state->ip : state->ip - 1); -+ if (!orc || orc->sp_reg == ORC_REG_UNDEFINED) -+ goto done; -+ orig_ip = state->ip; -+ -+ /* Find the previous frame's stack: */ -+ switch (orc->sp_reg) { -+ case ORC_REG_SP: -+ sp = state->sp + orc->sp_offset; -+ break; -+ -+ case ORC_REG_BP: -+ sp = state->bp + orc->sp_offset; -+ break; -+ -+ case ORC_REG_SP_INDIRECT: -+ sp = state->sp + orc->sp_offset; -+ indirect = true; -+ break; -+ -+ case ORC_REG_BP_INDIRECT: -+ sp = state->bp + orc->sp_offset; -+ indirect = true; -+ break; -+ -+ case ORC_REG_R10: -+ if (!state->regs || !state->full_regs) { -+ orc_warn("missing regs for base reg R10 at ip %p\n", -+ (void *)state->ip); -+ goto done; -+ } -+ sp = state->regs->r10; -+ break; -+ -+ case ORC_REG_R13: -+ if (!state->regs || !state->full_regs) { -+ orc_warn("missing regs for base reg R13 at ip %p\n", -+ (void *)state->ip); -+ goto done; -+ } -+ sp = state->regs->r13; -+ break; -+ -+ case ORC_REG_DI: -+ if (!state->regs || !state->full_regs) { -+ orc_warn("missing regs for base reg DI at ip %p\n", -+ (void *)state->ip); -+ goto done; -+ } -+ sp = state->regs->di; -+ break; -+ -+ case ORC_REG_DX: -+ if (!state->regs || !state->full_regs) { -+ orc_warn("missing regs for base reg DX at ip %p\n", -+ (void *)state->ip); -+ goto done; -+ } -+ sp = state->regs->dx; -+ break; -+ -+ default: -+ orc_warn("unknown SP base reg %d for ip %p\n", -+ orc->sp_reg, (void *)state->ip); -+ goto done; -+ } -+ -+ if (indirect) { -+ if (!deref_stack_reg(state, sp, &sp)) -+ goto done; -+ } -+ -+ /* Find IP, SP and possibly regs: */ -+ switch (orc->type) { -+ case ORC_TYPE_CALL: -+ ip_p = sp - sizeof(long); -+ -+ if (!deref_stack_reg(state, ip_p, &state->ip)) -+ goto done; -+ -+ state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, -+ state->ip, (void *)ip_p); -+ -+ state->sp = sp; -+ state->regs = NULL; -+ state->signal = false; -+ break; -+ -+ case ORC_TYPE_REGS: -+ if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) { -+ 
orc_warn("can't dereference registers at %p for ip %p\n", -+ (void *)sp, (void *)orig_ip); -+ goto done; -+ } -+ -+ state->regs = (struct pt_regs *)sp; -+ state->full_regs = true; -+ state->signal = true; -+ break; -+ -+ case ORC_TYPE_REGS_IRET: -+ if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) { -+ orc_warn("can't dereference iret registers at %p for ip %p\n", -+ (void *)sp, (void *)orig_ip); -+ goto done; -+ } -+ -+ ptregs = container_of((void *)sp, struct pt_regs, ip); -+ if ((unsigned long)ptregs >= prev_sp && -+ on_stack(&state->stack_info, ptregs, REGS_SIZE)) { -+ state->regs = ptregs; -+ state->full_regs = false; -+ } else -+ state->regs = NULL; -+ -+ state->signal = true; -+ break; -+ -+ default: -+ orc_warn("unknown .orc_unwind entry type %d\n", orc->type); -+ break; -+ } -+ -+ /* Find BP: */ -+ switch (orc->bp_reg) { -+ case ORC_REG_UNDEFINED: -+ if (state->regs && state->full_regs) -+ state->bp = state->regs->bp; -+ break; -+ -+ case ORC_REG_PREV_SP: -+ if (!deref_stack_reg(state, sp + orc->bp_offset, &state->bp)) -+ goto done; -+ break; -+ -+ case ORC_REG_BP: -+ if (!deref_stack_reg(state, state->bp + orc->bp_offset, &state->bp)) -+ goto done; -+ break; -+ -+ default: -+ orc_warn("unknown BP base reg %d for ip %p\n", -+ orc->bp_reg, (void *)orig_ip); -+ goto done; -+ } -+ -+ /* Prevent a recursive loop due to bad ORC data: */ -+ if (state->stack_info.type == prev_type && -+ on_stack(&state->stack_info, (void *)state->sp, sizeof(long)) && -+ state->sp <= prev_sp) { -+ orc_warn("stack going in the wrong direction? 
ip=%p\n", -+ (void *)orig_ip); -+ goto done; -+ } -+ -+ preempt_enable(); -+ return true; -+ -+done: -+ preempt_enable(); -+ state->stack_info.type = STACK_TYPE_UNKNOWN; -+ return false; -+} -+EXPORT_SYMBOL_GPL(unwind_next_frame); -+ -+void __unwind_start(struct unwind_state *state, struct task_struct *task, -+ struct pt_regs *regs, unsigned long *first_frame) -+{ -+ memset(state, 0, sizeof(*state)); -+ state->task = task; -+ -+ /* -+ * Refuse to unwind the stack of a task while it's executing on another -+ * CPU. This check is racy, but that's ok: the unwinder has other -+ * checks to prevent it from going off the rails. -+ */ -+ if (task_on_another_cpu(task)) -+ goto done; -+ -+ if (regs) { -+ if (user_mode(regs)) -+ goto done; -+ -+ state->ip = regs->ip; -+ state->sp = kernel_stack_pointer(regs); -+ state->bp = regs->bp; -+ state->regs = regs; -+ state->full_regs = true; -+ state->signal = true; -+ -+ } else if (task == current) { -+ asm volatile("lea (%%rip), %0\n\t" -+ "mov %%rsp, %1\n\t" -+ "mov %%rbp, %2\n\t" -+ : "=r" (state->ip), "=r" (state->sp), -+ "=r" (state->bp)); -+ -+ } else { -+ struct inactive_task_frame *frame = (void *)task->thread.sp; -+ -+ state->sp = task->thread.sp; -+ state->bp = READ_ONCE_NOCHECK(frame->bp); -+ state->ip = READ_ONCE_NOCHECK(frame->ret_addr); -+ } -+ -+ if (get_stack_info((unsigned long *)state->sp, state->task, -+ &state->stack_info, &state->stack_mask)) -+ return; -+ -+ /* -+ * The caller can provide the address of the first frame directly -+ * (first_frame) or indirectly (regs->sp) to indicate which stack frame -+ * to start unwinding at. Skip ahead until we reach it. 
-+ */ -+ -+ /* When starting from regs, skip the regs frame: */ -+ if (regs) { -+ unwind_next_frame(state); -+ return; -+ } -+ -+ /* Otherwise, skip ahead to the user-specified starting frame: */ -+ while (!unwind_done(state) && -+ (!on_stack(&state->stack_info, first_frame, sizeof(long)) || -+ state->sp <= (unsigned long)first_frame)) -+ unwind_next_frame(state); -+ -+ return; -+ -+done: -+ state->stack_info.type = STACK_TYPE_UNKNOWN; -+ return; -+} -+EXPORT_SYMBOL_GPL(__unwind_start); -diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig -index 658fcf67862c..d6f45f6d1054 100644 ---- a/arch/x86/Kconfig -+++ b/arch/x86/Kconfig -@@ -158,6 +158,7 @@ config X86 - select HAVE_MEMBLOCK - select HAVE_MEMBLOCK_NODE_MAP - select HAVE_MIXED_BREAKPOINTS_REGS -+ select HAVE_MOD_ARCH_SPECIFIC - select HAVE_NMI - select HAVE_OPROFILE - select HAVE_OPTPROBES -diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug -index 1fc519f3c49e..d5bca2ec8a74 100644 ---- a/arch/x86/Kconfig.debug -+++ b/arch/x86/Kconfig.debug -@@ -356,4 +356,29 @@ config PUNIT_ATOM_DEBUG - The current power state can be read from - /sys/kernel/debug/punit_atom/dev_power_state - -+config ORC_UNWINDER -+ bool "ORC unwinder" -+ depends on X86_64 -+ select STACK_VALIDATION -+ ---help--- -+ This option enables the ORC (Oops Rewind Capability) unwinder for -+ unwinding kernel stack traces. It uses a custom data format which is -+ a simplified version of the DWARF Call Frame Information standard. -+ -+ This unwinder is more accurate across interrupt entry frames than the -+ frame pointer unwinder. It can also enable a 5-10% performance -+ improvement across the entire kernel if CONFIG_FRAME_POINTER is -+ disabled. -+ -+ Enabling this option will increase the kernel's runtime memory usage -+ by roughly 2-4MB, depending on your kernel config. 
-+ -+config FRAME_POINTER_UNWINDER -+ def_bool y -+ depends on !ORC_UNWINDER && FRAME_POINTER -+ -+config GUESS_UNWINDER -+ def_bool y -+ depends on !ORC_UNWINDER && !FRAME_POINTER -+ - endmenu -diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S -index c8a3b61be0aa..f05f00acac89 100644 ---- a/arch/x86/kernel/vmlinux.lds.S -+++ b/arch/x86/kernel/vmlinux.lds.S -@@ -24,6 +24,7 @@ - #include - #include - #include -+#include - #include - #include - -@@ -148,6 +149,8 @@ SECTIONS - - BUG_TABLE - -+ ORC_UNWIND_TABLE -+ - . = ALIGN(PAGE_SIZE); - __vvar_page = .; - -diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug -index c617b9d1d6cb..0b4d1b3880b0 100644 ---- a/lib/Kconfig.debug -+++ b/lib/Kconfig.debug -@@ -374,6 +374,9 @@ config STACK_VALIDATION - pointers (if CONFIG_FRAME_POINTER is enabled). This helps ensure - that runtime stack traces are more reliable. - -+ This is also a prerequisite for generation of ORC unwind data, which -+ is needed for CONFIG_ORC_UNWINDER. -+ - For more information, see - tools/objtool/Documentation/stack-validation.txt. - --- -2.14.2 - diff --git a/patches/kernel/0038-x86-kconfig-Consolidate-unwinders-into-multiple-choi.patch b/patches/kernel/0038-x86-kconfig-Consolidate-unwinders-into-multiple-choi.patch deleted file mode 100644 index f588b6e..0000000 --- a/patches/kernel/0038-x86-kconfig-Consolidate-unwinders-into-multiple-choi.patch +++ /dev/null @@ -1,171 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Josh Poimboeuf -Date: Tue, 25 Jul 2017 08:54:24 -0500 -Subject: [PATCH] x86/kconfig: Consolidate unwinders into multiple choice - selection -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -There are three mutually exclusive unwinders. 
Make that more obvious by -combining them into a multiple-choice selection: - - CONFIG_FRAME_POINTER_UNWINDER - CONFIG_ORC_UNWINDER - CONFIG_GUESS_UNWINDER (if CONFIG_EXPERT=y) - -Frame pointers are still the default (for now). - -The old CONFIG_FRAME_POINTER option is still used in some -arch-independent places, so keep it around, but make it -invisible to the user on x86 - it's now selected by -CONFIG_FRAME_POINTER_UNWINDER=y. - -Suggested-by: Ingo Molnar -Signed-off-by: Josh Poimboeuf -Cc: Andy Lutomirski -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Denys Vlasenko -Cc: H. Peter Anvin -Cc: Jiri Slaby -Cc: Linus Torvalds -Cc: Mike Galbraith -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Cc: live-patching@vger.kernel.org -Link: http://lkml.kernel.org/r/20170725135424.zukjmgpz3plf5pmt@treble -Signed-off-by: Ingo Molnar -(cherry picked from commit 81d387190039c14edac8de2b3ec789beb899afd9) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 26ddacc1e6333555e4a6bd63c4c935b323509f92) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/unwind.h | 4 ++-- - arch/x86/Kconfig | 3 +-- - arch/x86/Kconfig.debug | 45 +++++++++++++++++++++++++++++++++++++------ - arch/x86/configs/tiny.config | 2 ++ - 4 files changed, 44 insertions(+), 10 deletions(-) - -diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h -index 25b8d31a007d..e9f793e2df7a 100644 ---- a/arch/x86/include/asm/unwind.h -+++ b/arch/x86/include/asm/unwind.h -@@ -16,7 +16,7 @@ struct unwind_state { - bool signal, full_regs; - unsigned long sp, bp, ip; - struct pt_regs *regs; --#elif defined(CONFIG_FRAME_POINTER) -+#elif defined(CONFIG_FRAME_POINTER_UNWINDER) - bool got_irq; - unsigned long *bp, *orig_sp, ip; - struct pt_regs *regs; -@@ -50,7 +50,7 @@ void unwind_start(struct unwind_state *state, struct task_struct *task, - __unwind_start(state, task, regs, first_frame); - } - --#if defined(CONFIG_ORC_UNWINDER) || defined(CONFIG_FRAME_POINTER) 
-+#if defined(CONFIG_ORC_UNWINDER) || defined(CONFIG_FRAME_POINTER_UNWINDER) - static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) - { - if (unwind_done(state)) -diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig -index d6f45f6d1054..3a0b8cb57caf 100644 ---- a/arch/x86/Kconfig -+++ b/arch/x86/Kconfig -@@ -73,7 +73,6 @@ config X86 - select ARCH_USE_QUEUED_RWLOCKS - select ARCH_USE_QUEUED_SPINLOCKS - select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH -- select ARCH_WANT_FRAME_POINTERS - select ARCH_WANTS_DYNAMIC_TASK_STRUCT - select ARCH_WANTS_THP_SWAP if X86_64 - select BUILDTIME_EXTABLE_SORT -@@ -169,7 +168,7 @@ config X86 - select HAVE_PERF_REGS - select HAVE_PERF_USER_STACK_DUMP - select HAVE_REGS_AND_STACK_ACCESS_API -- select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER && STACK_VALIDATION -+ select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER_UNWINDER && STACK_VALIDATION - select HAVE_STACK_VALIDATION if X86_64 - select HAVE_SYSCALL_TRACEPOINTS - select HAVE_UNSTABLE_SCHED_CLOCK -diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug -index d5bca2ec8a74..c441b5d65ec8 100644 ---- a/arch/x86/Kconfig.debug -+++ b/arch/x86/Kconfig.debug -@@ -356,6 +356,29 @@ config PUNIT_ATOM_DEBUG - The current power state can be read from - /sys/kernel/debug/punit_atom/dev_power_state - -+choice -+ prompt "Choose kernel unwinder" -+ default FRAME_POINTER_UNWINDER -+ ---help--- -+ This determines which method will be used for unwinding kernel stack -+ traces for panics, oopses, bugs, warnings, perf, /proc/<pid>/stack, -+ livepatch, lockdep, and more. -+ -+config FRAME_POINTER_UNWINDER -+ bool "Frame pointer unwinder" -+ select FRAME_POINTER -+ ---help--- -+ This option enables the frame pointer unwinder for unwinding kernel -+ stack traces. -+ -+ The unwinder itself is fast and it uses less RAM than the ORC -+ unwinder, but the kernel text size will grow by ~3% and the kernel's -+ overall performance will degrade by roughly 5-10%. 
-+ -+ This option is recommended if you want to use the livepatch -+ consistency model, as this is currently the only way to get a -+ reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE). -+ - config ORC_UNWINDER - bool "ORC unwinder" - depends on X86_64 -@@ -373,12 +396,22 @@ config ORC_UNWINDER - Enabling this option will increase the kernel's runtime memory usage - by roughly 2-4MB, depending on your kernel config. - --config FRAME_POINTER_UNWINDER -- def_bool y -- depends on !ORC_UNWINDER && FRAME_POINTER -- - config GUESS_UNWINDER -- def_bool y -- depends on !ORC_UNWINDER && !FRAME_POINTER -+ bool "Guess unwinder" -+ depends on EXPERT -+ ---help--- -+ This option enables the "guess" unwinder for unwinding kernel stack -+ traces. It scans the stack and reports every kernel text address it -+ finds. Some of the addresses it reports may be incorrect. -+ -+ While this option often produces false positives, it can still be -+ useful in many cases. Unlike the other unwinders, it has no runtime -+ overhead. 
-+ -+endchoice -+ -+config FRAME_POINTER -+ depends on !ORC_UNWINDER && !GUESS_UNWINDER -+ bool - - endmenu -diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config -index 4b429df40d7a..550cd5012b73 100644 ---- a/arch/x86/configs/tiny.config -+++ b/arch/x86/configs/tiny.config -@@ -1,3 +1,5 @@ - CONFIG_NOHIGHMEM=y - # CONFIG_HIGHMEM4G is not set - # CONFIG_HIGHMEM64G is not set -+CONFIG_GUESS_UNWINDER=y -+# CONFIG_FRAME_POINTER_UNWINDER is not set --- -2.14.2 - diff --git a/patches/kernel/0038-x86-unwind-Add-the-ORC-unwinder.patch b/patches/kernel/0038-x86-unwind-Add-the-ORC-unwinder.patch new file mode 100644 index 0000000..b8f0318 --- /dev/null +++ b/patches/kernel/0038-x86-unwind-Add-the-ORC-unwinder.patch @@ -0,0 +1,1407 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf +Date: Mon, 24 Jul 2017 18:36:57 -0500 +Subject: [PATCH] x86/unwind: Add the ORC unwinder +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Add the new ORC unwinder which is enabled by CONFIG_ORC_UNWINDER=y. +It plugs into the existing x86 unwinder framework. + +It relies on objtool to generate the needed .orc_unwind and +.orc_unwind_ip sections. + +For more details on why ORC is used instead of DWARF, see +Documentation/x86/orc-unwinder.txt - but the short version is +that it's a simplified, fundamentally more robust debuginfo +data structure, which also allows up to two orders of magnitude +faster lookups than the DWARF unwinder - which matters to +profiling workloads like perf. + +Thanks to Andy Lutomirski for the performance improvement ideas: +splitting the ORC unwind table into two parallel arrays and creating a +fast lookup table to search a subset of the unwind table. + +Signed-off-by: Josh Poimboeuf +Cc: Andy Lutomirski +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Denys Vlasenko +Cc: H. 
Peter Anvin +Cc: Jiri Slaby +Cc: Linus Torvalds +Cc: Mike Galbraith +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: live-patching@vger.kernel.org +Link: http://lkml.kernel.org/r/0a6cbfb40f8da99b7a45a1a8302dc6aef16ec812.1500938583.git.jpoimboe@redhat.com +[ Extended the changelog. ] +Signed-off-by: Ingo Molnar +(cherry picked from commit ee9f8fce99640811b2b8e79d0d1dbe8bab69ba67) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit dccbf63d7a6cc431af23a86e28275a74904545cd) +Signed-off-by: Fabian Grünbichler +--- + Documentation/x86/orc-unwinder.txt | 179 ++++++++++++ + arch/x86/kernel/Makefile | 8 +- + scripts/Makefile.build | 14 +- + arch/um/include/asm/unwind.h | 8 + + arch/x86/include/asm/module.h | 9 + + arch/x86/include/asm/orc_lookup.h | 46 +++ + arch/x86/include/asm/orc_types.h | 2 +- + arch/x86/include/asm/unwind.h | 76 +++-- + include/asm-generic/vmlinux.lds.h | 27 +- + arch/x86/kernel/module.c | 11 +- + arch/x86/kernel/setup.c | 3 + + arch/x86/kernel/unwind_frame.c | 39 +-- + arch/x86/kernel/unwind_guess.c | 5 + + arch/x86/kernel/unwind_orc.c | 582 +++++++++++++++++++++++++++++++++++++ + arch/x86/Kconfig | 1 + + arch/x86/Kconfig.debug | 25 ++ + arch/x86/kernel/vmlinux.lds.S | 3 + + lib/Kconfig.debug | 3 + + 18 files changed, 977 insertions(+), 64 deletions(-) + create mode 100644 Documentation/x86/orc-unwinder.txt + create mode 100644 arch/um/include/asm/unwind.h + create mode 100644 arch/x86/include/asm/orc_lookup.h + create mode 100644 arch/x86/kernel/unwind_orc.c + +diff --git a/Documentation/x86/orc-unwinder.txt b/Documentation/x86/orc-unwinder.txt +new file mode 100644 +index 000000000000..af0c9a4c65a6 +--- /dev/null ++++ b/Documentation/x86/orc-unwinder.txt +@@ -0,0 +1,179 @@ ++ORC unwinder ++============ ++ ++Overview ++-------- ++ ++The kernel CONFIG_ORC_UNWINDER option enables the ORC unwinder, which is ++similar in concept to a DWARF unwinder. 
The difference is that the ++format of the ORC data is much simpler than DWARF, which in turn allows ++the ORC unwinder to be much simpler and faster. ++ ++The ORC data consists of unwind tables which are generated by objtool. ++They contain out-of-band data which is used by the in-kernel ORC ++unwinder. Objtool generates the ORC data by first doing compile-time ++stack metadata validation (CONFIG_STACK_VALIDATION). After analyzing ++all the code paths of a .o file, it determines information about the ++stack state at each instruction address in the file and outputs that ++information to the .orc_unwind and .orc_unwind_ip sections. ++ ++The per-object ORC sections are combined at link time and are sorted and ++post-processed at boot time. The unwinder uses the resulting data to ++correlate instruction addresses with their stack states at run time. ++ ++ ++ORC vs frame pointers ++--------------------- ++ ++With frame pointers enabled, GCC adds instrumentation code to every ++function in the kernel. The kernel's .text size increases by about ++3.2%, resulting in a broad kernel-wide slowdown. Measurements by Mel ++Gorman [1] have shown a slowdown of 5-10% for some workloads. ++ ++In contrast, the ORC unwinder has no effect on text size or runtime ++performance, because the debuginfo is out of band. So if you disable ++frame pointers and enable the ORC unwinder, you get a nice performance ++improvement across the board, and still have reliable stack traces. ++ ++Ingo Molnar says: ++ ++ "Note that it's not just a performance improvement, but also an ++ instruction cache locality improvement: 3.2% .text savings almost ++ directly transform into a similarly sized reduction in cache ++ footprint. That can transform to even higher speedups for workloads ++ whose cache locality is borderline." ++ ++Another benefit of ORC compared to frame pointers is that it can ++reliably unwind across interrupts and exceptions. 
Frame pointer based ++unwinds can sometimes skip the caller of the interrupted function, if it ++was a leaf function or if the interrupt hit before the frame pointer was ++saved. ++ ++The main disadvantage of the ORC unwinder compared to frame pointers is ++that it needs more memory to store the ORC unwind tables: roughly 2-4MB ++depending on the kernel config. ++ ++ ++ORC vs DWARF ++------------ ++ ++ORC debuginfo's advantage over DWARF itself is that it's much simpler. ++It gets rid of the complex DWARF CFI state machine and also gets rid of ++the tracking of unnecessary registers. This allows the unwinder to be ++much simpler, meaning fewer bugs, which is especially important for ++mission critical oops code. ++ ++The simpler debuginfo format also enables the unwinder to be much faster ++than DWARF, which is important for perf and lockdep. In a basic ++performance test by Jiri Slaby [2], the ORC unwinder was about 20x ++faster than an out-of-tree DWARF unwinder. (Note: That measurement was ++taken before some performance tweaks were added, which doubled ++performance, so the speedup over DWARF may be closer to 40x.) ++ ++The ORC data format does have a few downsides compared to DWARF. ORC ++unwind tables take up ~50% more RAM (+1.3MB on an x86 defconfig kernel) ++than DWARF-based eh_frame tables. ++ ++Another potential downside is that, as GCC evolves, it's conceivable ++that the ORC data may end up being *too* simple to describe the state of ++the stack for certain optimizations. But IMO this is unlikely because ++GCC saves the frame pointer for any unusual stack adjustments it does, ++so I suspect we'll really only ever need to keep track of the stack ++pointer and the frame pointer between call frames. But even if we do ++end up having to track all the registers DWARF tracks, at least we will ++still be able to control the format, e.g. no complex state machines. 
++ ++ ++ORC unwind table generation ++--------------------------- ++ ++The ORC data is generated by objtool. With the existing compile-time ++stack metadata validation feature, objtool already follows all code ++paths, and so it already has all the information it needs to be able to ++generate ORC data from scratch. So it's an easy step to go from stack ++validation to ORC data generation. ++ ++It should be possible to instead generate the ORC data with a simple ++tool which converts DWARF to ORC data. However, such a solution would ++be incomplete due to the kernel's extensive use of asm, inline asm, and ++special sections like exception tables. ++ ++That could be rectified by manually annotating those special code paths ++using GNU assembler .cfi annotations in .S files, and homegrown ++annotations for inline asm in .c files. But asm annotations were tried ++in the past and were found to be unmaintainable. They were often ++incorrect/incomplete and made the code harder to read and keep updated. ++And based on looking at glibc code, annotating inline asm in .c files ++might be even worse. ++ ++Objtool still needs a few annotations, but only in code which does ++unusual things to the stack like entry code. And even then, far fewer ++annotations are needed than what DWARF would need, so they're much more ++maintainable than DWARF CFI annotations. ++ ++So the advantages of using objtool to generate ORC data are that it ++gives more accurate debuginfo, with very few annotations. It also ++insulates the kernel from toolchain bugs which can be very painful to ++deal with in the kernel since we often have to workaround issues in ++older versions of the toolchain for years. ++ ++The downside is that the unwinder now becomes dependent on objtool's ++ability to reverse engineer GCC code flow. If GCC optimizations become ++too complicated for objtool to follow, the ORC data generation might ++stop working or become incomplete. 
(It's worth noting that livepatch ++already has such a dependency on objtool's ability to follow GCC code ++flow.) ++ ++If newer versions of GCC come up with some optimizations which break ++objtool, we may need to revisit the current implementation. Some ++possible solutions would be asking GCC to make the optimizations more ++palatable, or having objtool use DWARF as an additional input, or ++creating a GCC plugin to assist objtool with its analysis. But for now, ++objtool follows GCC code quite well. ++ ++ ++Unwinder implementation details ++------------------------------- ++ ++Objtool generates the ORC data by integrating with the compile-time ++stack metadata validation feature, which is described in detail in ++tools/objtool/Documentation/stack-validation.txt. After analyzing all ++the code paths of a .o file, it creates an array of orc_entry structs, ++and a parallel array of instruction addresses associated with those ++structs, and writes them to the .orc_unwind and .orc_unwind_ip sections ++respectively. ++ ++The ORC data is split into the two arrays for performance reasons, to ++make the searchable part of the data (.orc_unwind_ip) more compact. The ++arrays are sorted in parallel at boot time. ++ ++Performance is further improved by the use of a fast lookup table which ++is created at runtime. The fast lookup table associates a given address ++with a range of indices for the .orc_unwind table, so that only a small ++subset of the table needs to be searched. ++ ++ ++Etymology ++--------- ++ ++Orcs, fearsome creatures of medieval folklore, are the Dwarves' natural ++enemies. Similarly, the ORC unwinder was created in opposition to the ++complexity and slowness of DWARF. ++ ++"Although Orcs rarely consider multiple solutions to a problem, they do ++excel at getting things done because they are creatures of action, not ++thought." 
[3] Similarly, unlike the esoteric DWARF unwinder, the ++veracious ORC unwinder wastes no time or siloconic effort decoding ++variable-length zero-extended unsigned-integer byte-coded ++state-machine-based debug information entries. ++ ++Similar to how Orcs frequently unravel the well-intentioned plans of ++their adversaries, the ORC unwinder frequently unravels stacks with ++brutal, unyielding efficiency. ++ ++ORC stands for Oops Rewind Capability. ++ ++ ++[1] https://lkml.kernel.org/r/20170602104048.jkkzssljsompjdwy@suse.de ++[2] https://lkml.kernel.org/r/d2ca5435-6386-29b8-db87-7f227c2b713a@suse.cz ++[3] http://dustin.wikidot.com/half-orcs-and-orcs +diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile +index a01892bdd61a..287eac7d207f 100644 +--- a/arch/x86/kernel/Makefile ++++ b/arch/x86/kernel/Makefile +@@ -126,11 +126,9 @@ obj-$(CONFIG_PERF_EVENTS) += perf_regs.o + obj-$(CONFIG_TRACING) += tracepoint.o + obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o + +-ifdef CONFIG_FRAME_POINTER +-obj-y += unwind_frame.o +-else +-obj-y += unwind_guess.o +-endif ++obj-$(CONFIG_ORC_UNWINDER) += unwind_orc.o ++obj-$(CONFIG_FRAME_POINTER_UNWINDER) += unwind_frame.o ++obj-$(CONFIG_GUESS_UNWINDER) += unwind_guess.o + + ### + # 64 bit specific files +diff --git a/scripts/Makefile.build b/scripts/Makefile.build +index 273bc2228307..ab2c8ef43cdb 100644 +--- a/scripts/Makefile.build ++++ b/scripts/Makefile.build +@@ -258,7 +258,8 @@ ifneq ($(SKIP_STACK_VALIDATION),1) + + __objtool_obj := $(objtree)/tools/objtool/objtool + +-objtool_args = check ++objtool_args = $(if $(CONFIG_ORC_UNWINDER),orc generate,check) ++ + ifndef CONFIG_FRAME_POINTER + objtool_args += --no-fp + endif +@@ -276,6 +277,11 @@ objtool_obj = $(if $(patsubst y%,, \ + endif # SKIP_STACK_VALIDATION + endif # CONFIG_STACK_VALIDATION + ++# Rebuild all objects when objtool changes, or is enabled/disabled. 
++objtool_dep = $(objtool_obj) \ ++ $(wildcard include/config/orc/unwinder.h \ ++ include/config/stack/validation.h) ++ + define rule_cc_o_c + $(call echo-cmd,checksrc) $(cmd_checksrc) \ + $(call cmd_and_fixdep,cc_o_c) \ +@@ -298,14 +304,14 @@ cmd_undef_syms = echo + endif + + # Built-in and composite module parts +-$(obj)/%.o: $(src)/%.c $(recordmcount_source) $(objtool_obj) FORCE ++$(obj)/%.o: $(src)/%.c $(recordmcount_source) $(objtool_dep) FORCE + $(call cmd,force_checksrc) + $(call cmd,force_check_kmsg) + $(call if_changed_rule,cc_o_c) + + # Single-part modules are special since we need to mark them in $(MODVERDIR) + +-$(single-used-m): $(obj)/%.o: $(src)/%.c $(recordmcount_source) $(objtool_obj) FORCE ++$(single-used-m): $(obj)/%.o: $(src)/%.c $(recordmcount_source) $(objtool_dep) FORCE + $(call cmd,force_checksrc) + $(call cmd,force_check_kmsg) + $(call if_changed_rule,cc_o_c) +@@ -401,7 +407,7 @@ cmd_modversions_S = \ + endif + endif + +-$(obj)/%.o: $(src)/%.S $(objtool_obj) FORCE ++$(obj)/%.o: $(src)/%.S $(objtool_dep) FORCE + $(call if_changed_rule,as_o_S) + + targets += $(real-objs-y) $(real-objs-m) $(lib-y) +diff --git a/arch/um/include/asm/unwind.h b/arch/um/include/asm/unwind.h +new file mode 100644 +index 000000000000..7ffa5437b761 +--- /dev/null ++++ b/arch/um/include/asm/unwind.h +@@ -0,0 +1,8 @@ ++#ifndef _ASM_UML_UNWIND_H ++#define _ASM_UML_UNWIND_H ++ ++static inline void ++unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, ++ void *orc, size_t orc_size) {} ++ ++#endif /* _ASM_UML_UNWIND_H */ +diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h +index e3b7819caeef..9eb7c718aaf8 100644 +--- a/arch/x86/include/asm/module.h ++++ b/arch/x86/include/asm/module.h +@@ -2,6 +2,15 @@ + #define _ASM_X86_MODULE_H + + #include ++#include ++ ++struct mod_arch_specific { ++#ifdef CONFIG_ORC_UNWINDER ++ unsigned int num_orcs; ++ int *orc_unwind_ip; ++ struct orc_entry *orc_unwind; ++#endif ++}; + + #ifdef 
CONFIG_X86_64 + /* X86_64 does not define MODULE_PROC_FAMILY */ +diff --git a/arch/x86/include/asm/orc_lookup.h b/arch/x86/include/asm/orc_lookup.h +new file mode 100644 +index 000000000000..91c8d868424d +--- /dev/null ++++ b/arch/x86/include/asm/orc_lookup.h +@@ -0,0 +1,46 @@ ++/* ++ * Copyright (C) 2017 Josh Poimboeuf ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version 2 ++ * of the License, or (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, see <http://www.gnu.org/licenses/>. ++ */ ++#ifndef _ORC_LOOKUP_H ++#define _ORC_LOOKUP_H ++ ++/* ++ * This is a lookup table for speeding up access to the .orc_unwind table. ++ * Given an input address offset, the corresponding lookup table entry ++ * specifies a subset of the .orc_unwind table to search. ++ * ++ * Each block represents the end of the previous range and the start of the ++ * next range. An extra block is added to give the last range an end. ++ * ++ * The block size should be a power of 2 to avoid a costly 'div' instruction. ++ * ++ * A block size of 256 was chosen because it roughly doubles unwinder ++ * performance while only adding ~5% to the ORC data footprint. 
++ */ ++#define LOOKUP_BLOCK_ORDER 8 ++#define LOOKUP_BLOCK_SIZE (1 << LOOKUP_BLOCK_ORDER) ++ ++#ifndef LINKER_SCRIPT ++ ++extern unsigned int orc_lookup[]; ++extern unsigned int orc_lookup_end[]; ++ ++#define LOOKUP_START_IP (unsigned long)_stext ++#define LOOKUP_STOP_IP (unsigned long)_etext ++ ++#endif /* LINKER_SCRIPT */ ++ ++#endif /* _ORC_LOOKUP_H */ +diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h +index 7dc777a6cb40..9c9dc579bd7d 100644 +--- a/arch/x86/include/asm/orc_types.h ++++ b/arch/x86/include/asm/orc_types.h +@@ -88,7 +88,7 @@ struct orc_entry { + unsigned sp_reg:4; + unsigned bp_reg:4; + unsigned type:2; +-}; ++} __packed; + + /* + * This struct is used by asm and inline asm code to manually annotate the +diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h +index e6676495b125..25b8d31a007d 100644 +--- a/arch/x86/include/asm/unwind.h ++++ b/arch/x86/include/asm/unwind.h +@@ -12,11 +12,14 @@ struct unwind_state { + struct task_struct *task; + int graph_idx; + bool error; +-#ifdef CONFIG_FRAME_POINTER ++#if defined(CONFIG_ORC_UNWINDER) ++ bool signal, full_regs; ++ unsigned long sp, bp, ip; ++ struct pt_regs *regs; ++#elif defined(CONFIG_FRAME_POINTER) + bool got_irq; +- unsigned long *bp, *orig_sp; ++ unsigned long *bp, *orig_sp, ip; + struct pt_regs *regs; +- unsigned long ip; + #else + unsigned long *sp; + #endif +@@ -24,41 +27,30 @@ struct unwind_state { + + void __unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long *first_frame); +- + bool unwind_next_frame(struct unwind_state *state); +- + unsigned long unwind_get_return_address(struct unwind_state *state); ++unsigned long *unwind_get_return_address_ptr(struct unwind_state *state); + + static inline bool unwind_done(struct unwind_state *state) + { + return state->stack_info.type == STACK_TYPE_UNKNOWN; + } + +-static inline +-void unwind_start(struct unwind_state *state, struct task_struct 
*task, +- struct pt_regs *regs, unsigned long *first_frame) +-{ +- first_frame = first_frame ? : get_stack_pointer(task, regs); +- +- __unwind_start(state, task, regs, first_frame); +-} +- + static inline bool unwind_error(struct unwind_state *state) + { + return state->error; + } + +-#ifdef CONFIG_FRAME_POINTER +- + static inline +-unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) ++void unwind_start(struct unwind_state *state, struct task_struct *task, ++ struct pt_regs *regs, unsigned long *first_frame) + { +- if (unwind_done(state)) +- return NULL; ++ first_frame = first_frame ? : get_stack_pointer(task, regs); + +- return state->regs ? &state->regs->ip : state->bp + 1; ++ __unwind_start(state, task, regs, first_frame); + } + ++#if defined(CONFIG_ORC_UNWINDER) || defined(CONFIG_FRAME_POINTER) + static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) + { + if (unwind_done(state)) +@@ -66,20 +58,46 @@ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) + + return state->regs; + } +- +-#else /* !CONFIG_FRAME_POINTER */ +- +-static inline +-unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) ++#else ++static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) + { + return NULL; + } ++#endif + +-static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) ++#ifdef CONFIG_ORC_UNWINDER ++void unwind_init(void); ++void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, ++ void *orc, size_t orc_size); ++#else ++static inline void unwind_init(void) {} ++static inline ++void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, ++ void *orc, size_t orc_size) {} ++#endif ++ ++/* ++ * This disables KASAN checking when reading a value from another task's stack, ++ * since the other task could be running on another CPU and could have poisoned ++ * the stack in the meantime. 
++ */ ++#define READ_ONCE_TASK_STACK(task, x) \ ++({ \ ++ unsigned long val; \ ++ if (task == current) \ ++ val = READ_ONCE(x); \ ++ else \ ++ val = READ_ONCE_NOCHECK(x); \ ++ val; \ ++}) ++ ++static inline bool task_on_another_cpu(struct task_struct *task) + { +- return NULL; ++#ifdef CONFIG_SMP ++ return task != current && task->on_cpu; ++#else ++ return false; ++#endif + } + +-#endif /* CONFIG_FRAME_POINTER */ +- + #endif /* _ASM_X86_UNWIND_H */ +diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h +index e7e955d4ab9e..9fdb54a95976 100644 +--- a/include/asm-generic/vmlinux.lds.h ++++ b/include/asm-generic/vmlinux.lds.h +@@ -686,6 +686,31 @@ + #define BUG_TABLE + #endif + ++#ifdef CONFIG_ORC_UNWINDER ++#define ORC_UNWIND_TABLE \ ++ . = ALIGN(4); \ ++ .orc_unwind_ip : AT(ADDR(.orc_unwind_ip) - LOAD_OFFSET) { \ ++ VMLINUX_SYMBOL(__start_orc_unwind_ip) = .; \ ++ KEEP(*(.orc_unwind_ip)) \ ++ VMLINUX_SYMBOL(__stop_orc_unwind_ip) = .; \ ++ } \ ++ . = ALIGN(6); \ ++ .orc_unwind : AT(ADDR(.orc_unwind) - LOAD_OFFSET) { \ ++ VMLINUX_SYMBOL(__start_orc_unwind) = .; \ ++ KEEP(*(.orc_unwind)) \ ++ VMLINUX_SYMBOL(__stop_orc_unwind) = .; \ ++ } \ ++ . = ALIGN(4); \ ++ .orc_lookup : AT(ADDR(.orc_lookup) - LOAD_OFFSET) { \ ++ VMLINUX_SYMBOL(orc_lookup) = .; \ ++ . += (((SIZEOF(.text) + LOOKUP_BLOCK_SIZE - 1) / \ ++ LOOKUP_BLOCK_SIZE) + 1) * 4; \ ++ VMLINUX_SYMBOL(orc_lookup_end) = .; \ ++ } ++#else ++#define ORC_UNWIND_TABLE ++#endif ++ + #ifdef CONFIG_PM_TRACE + #define TRACEDATA \ + . = ALIGN(4); \ +@@ -872,7 +897,7 @@ + DATA_DATA \ + CONSTRUCTORS \ + } \ +- BUG_TABLE ++ BUG_TABLE \ + + #define INIT_TEXT_SECTION(inittext_align) \ + . = ALIGN(inittext_align); \ +diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c +index f67bd3205df7..62e7d70aadd5 100644 +--- a/arch/x86/kernel/module.c ++++ b/arch/x86/kernel/module.c +@@ -35,6 +35,7 @@ + #include + #include + #include ++#include + + #if 0 + #define DEBUGP(fmt, ...) 
\ +@@ -213,7 +214,7 @@ int module_finalize(const Elf_Ehdr *hdr, + struct module *me) + { + const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, +- *para = NULL; ++ *para = NULL, *orc = NULL, *orc_ip = NULL; + char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; + + for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { +@@ -225,6 +226,10 @@ int module_finalize(const Elf_Ehdr *hdr, + locks = s; + if (!strcmp(".parainstructions", secstrings + s->sh_name)) + para = s; ++ if (!strcmp(".orc_unwind", secstrings + s->sh_name)) ++ orc = s; ++ if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name)) ++ orc_ip = s; + } + + if (alt) { +@@ -248,6 +253,10 @@ int module_finalize(const Elf_Ehdr *hdr, + /* make jump label nops */ + jump_label_apply_nops(me); + ++ if (orc && orc_ip) ++ unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size, ++ (void *)orc->sh_addr, orc->sh_size); ++ + return 0; + } + +diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c +index f964bfddfefd..dd6e8707e969 100644 +--- a/arch/x86/kernel/setup.c ++++ b/arch/x86/kernel/setup.c +@@ -121,6 +121,7 @@ + #include + #include + #include ++#include + + /* + * max_low_pfn_mapped: highest direct mapped pfn under 4GB +@@ -1325,6 +1326,8 @@ void __init setup_arch(char **cmdline_p) + if (efi_enabled(EFI_BOOT)) + efi_apply_memmap_quirks(); + #endif ++ ++ unwind_init(); + } + + #ifdef CONFIG_X86_32 +diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c +index c29e5bc7e9c9..d145a0b1f529 100644 +--- a/arch/x86/kernel/unwind_frame.c ++++ b/arch/x86/kernel/unwind_frame.c +@@ -10,20 +10,22 @@ + + #define FRAME_HEADER_SIZE (sizeof(long) * 2) + +-/* +- * This disables KASAN checking when reading a value from another task's stack, +- * since the other task could be running on another CPU and could have poisoned +- * the stack in the meantime. 
+- */ +-#define READ_ONCE_TASK_STACK(task, x) \ +-({ \ +- unsigned long val; \ +- if (task == current) \ +- val = READ_ONCE(x); \ +- else \ +- val = READ_ONCE_NOCHECK(x); \ +- val; \ +-}) ++unsigned long unwind_get_return_address(struct unwind_state *state) ++{ ++ if (unwind_done(state)) ++ return 0; ++ ++ return __kernel_text_address(state->ip) ? state->ip : 0; ++} ++EXPORT_SYMBOL_GPL(unwind_get_return_address); ++ ++unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) ++{ ++ if (unwind_done(state)) ++ return NULL; ++ ++ return state->regs ? &state->regs->ip : state->bp + 1; ++} + + static void unwind_dump(struct unwind_state *state) + { +@@ -66,15 +68,6 @@ static void unwind_dump(struct unwind_state *state) + } + } + +-unsigned long unwind_get_return_address(struct unwind_state *state) +-{ +- if (unwind_done(state)) +- return 0; +- +- return __kernel_text_address(state->ip) ? state->ip : 0; +-} +-EXPORT_SYMBOL_GPL(unwind_get_return_address); +- + static size_t regs_size(struct pt_regs *regs) + { + /* x86_32 regs from kernel mode are two words shorter: */ +diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c +index 039f36738e49..4f0e17b90463 100644 +--- a/arch/x86/kernel/unwind_guess.c ++++ b/arch/x86/kernel/unwind_guess.c +@@ -19,6 +19,11 @@ unsigned long unwind_get_return_address(struct unwind_state *state) + } + EXPORT_SYMBOL_GPL(unwind_get_return_address); + ++unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) ++{ ++ return NULL; ++} ++ + bool unwind_next_frame(struct unwind_state *state) + { + struct stack_info *info = &state->stack_info; +diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c +new file mode 100644 +index 000000000000..570b70d3f604 +--- /dev/null ++++ b/arch/x86/kernel/unwind_orc.c +@@ -0,0 +1,582 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define orc_warn(fmt, ...) 
\ ++ printk_deferred_once(KERN_WARNING pr_fmt("WARNING: " fmt), ##__VA_ARGS__) ++ ++extern int __start_orc_unwind_ip[]; ++extern int __stop_orc_unwind_ip[]; ++extern struct orc_entry __start_orc_unwind[]; ++extern struct orc_entry __stop_orc_unwind[]; ++ ++static DEFINE_MUTEX(sort_mutex); ++int *cur_orc_ip_table = __start_orc_unwind_ip; ++struct orc_entry *cur_orc_table = __start_orc_unwind; ++ ++unsigned int lookup_num_blocks; ++bool orc_init; ++ ++static inline unsigned long orc_ip(const int *ip) ++{ ++ return (unsigned long)ip + *ip; ++} ++ ++static struct orc_entry *__orc_find(int *ip_table, struct orc_entry *u_table, ++ unsigned int num_entries, unsigned long ip) ++{ ++ int *first = ip_table; ++ int *last = ip_table + num_entries - 1; ++ int *mid = first, *found = first; ++ ++ if (!num_entries) ++ return NULL; ++ ++ /* ++ * Do a binary range search to find the rightmost duplicate of a given ++ * starting address. Some entries are section terminators which are ++ * "weak" entries for ensuring there are no gaps. They should be ++ * ignored when they conflict with a real entry. 
++ */ ++ while (first <= last) { ++ mid = first + ((last - first) / 2); ++ ++ if (orc_ip(mid) <= ip) { ++ found = mid; ++ first = mid + 1; ++ } else ++ last = mid - 1; ++ } ++ ++ return u_table + (found - ip_table); ++} ++ ++#ifdef CONFIG_MODULES ++static struct orc_entry *orc_module_find(unsigned long ip) ++{ ++ struct module *mod; ++ ++ mod = __module_address(ip); ++ if (!mod || !mod->arch.orc_unwind || !mod->arch.orc_unwind_ip) ++ return NULL; ++ return __orc_find(mod->arch.orc_unwind_ip, mod->arch.orc_unwind, ++ mod->arch.num_orcs, ip); ++} ++#else ++static struct orc_entry *orc_module_find(unsigned long ip) ++{ ++ return NULL; ++} ++#endif ++ ++static struct orc_entry *orc_find(unsigned long ip) ++{ ++ if (!orc_init) ++ return NULL; ++ ++ /* For non-init vmlinux addresses, use the fast lookup table: */ ++ if (ip >= LOOKUP_START_IP && ip < LOOKUP_STOP_IP) { ++ unsigned int idx, start, stop; ++ ++ idx = (ip - LOOKUP_START_IP) / LOOKUP_BLOCK_SIZE; ++ ++ if (unlikely((idx >= lookup_num_blocks-1))) { ++ orc_warn("WARNING: bad lookup idx: idx=%u num=%u ip=%lx\n", ++ idx, lookup_num_blocks, ip); ++ return NULL; ++ } ++ ++ start = orc_lookup[idx]; ++ stop = orc_lookup[idx + 1] + 1; ++ ++ if (unlikely((__start_orc_unwind + start >= __stop_orc_unwind) || ++ (__start_orc_unwind + stop > __stop_orc_unwind))) { ++ orc_warn("WARNING: bad lookup value: idx=%u num=%u start=%u stop=%u ip=%lx\n", ++ idx, lookup_num_blocks, start, stop, ip); ++ return NULL; ++ } ++ ++ return __orc_find(__start_orc_unwind_ip + start, ++ __start_orc_unwind + start, stop - start, ip); ++ } ++ ++ /* vmlinux .init slow lookup: */ ++ if (ip >= (unsigned long)_sinittext && ip < (unsigned long)_einittext) ++ return __orc_find(__start_orc_unwind_ip, __start_orc_unwind, ++ __stop_orc_unwind_ip - __start_orc_unwind_ip, ip); ++ ++ /* Module lookup: */ ++ return orc_module_find(ip); ++} ++ ++static void orc_sort_swap(void *_a, void *_b, int size) ++{ ++ struct orc_entry *orc_a, *orc_b; ++ struct orc_entry 
orc_tmp; ++ int *a = _a, *b = _b, tmp; ++ int delta = _b - _a; ++ ++ /* Swap the .orc_unwind_ip entries: */ ++ tmp = *a; ++ *a = *b + delta; ++ *b = tmp - delta; ++ ++ /* Swap the corresponding .orc_unwind entries: */ ++ orc_a = cur_orc_table + (a - cur_orc_ip_table); ++ orc_b = cur_orc_table + (b - cur_orc_ip_table); ++ orc_tmp = *orc_a; ++ *orc_a = *orc_b; ++ *orc_b = orc_tmp; ++} ++ ++static int orc_sort_cmp(const void *_a, const void *_b) ++{ ++ struct orc_entry *orc_a; ++ const int *a = _a, *b = _b; ++ unsigned long a_val = orc_ip(a); ++ unsigned long b_val = orc_ip(b); ++ ++ if (a_val > b_val) ++ return 1; ++ if (a_val < b_val) ++ return -1; ++ ++ /* ++ * The "weak" section terminator entries need to always be on the left ++ * to ensure the lookup code skips them in favor of real entries. ++ * These terminator entries exist to handle any gaps created by ++ * whitelisted .o files which didn't get objtool generation. ++ */ ++ orc_a = cur_orc_table + (a - cur_orc_ip_table); ++ return orc_a->sp_reg == ORC_REG_UNDEFINED ? -1 : 1; ++} ++ ++#ifdef CONFIG_MODULES ++void unwind_module_init(struct module *mod, void *_orc_ip, size_t orc_ip_size, ++ void *_orc, size_t orc_size) ++{ ++ int *orc_ip = _orc_ip; ++ struct orc_entry *orc = _orc; ++ unsigned int num_entries = orc_ip_size / sizeof(int); ++ ++ WARN_ON_ONCE(orc_ip_size % sizeof(int) != 0 || ++ orc_size % sizeof(*orc) != 0 || ++ num_entries != orc_size / sizeof(*orc)); ++ ++ /* ++ * The 'cur_orc_*' globals allow the orc_sort_swap() callback to ++ * associate an .orc_unwind_ip table entry with its corresponding ++ * .orc_unwind entry so they can both be swapped. 
++ */ ++ mutex_lock(&sort_mutex); ++ cur_orc_ip_table = orc_ip; ++ cur_orc_table = orc; ++ sort(orc_ip, num_entries, sizeof(int), orc_sort_cmp, orc_sort_swap); ++ mutex_unlock(&sort_mutex); ++ ++ mod->arch.orc_unwind_ip = orc_ip; ++ mod->arch.orc_unwind = orc; ++ mod->arch.num_orcs = num_entries; ++} ++#endif ++ ++void __init unwind_init(void) ++{ ++ size_t orc_ip_size = (void *)__stop_orc_unwind_ip - (void *)__start_orc_unwind_ip; ++ size_t orc_size = (void *)__stop_orc_unwind - (void *)__start_orc_unwind; ++ size_t num_entries = orc_ip_size / sizeof(int); ++ struct orc_entry *orc; ++ int i; ++ ++ if (!num_entries || orc_ip_size % sizeof(int) != 0 || ++ orc_size % sizeof(struct orc_entry) != 0 || ++ num_entries != orc_size / sizeof(struct orc_entry)) { ++ orc_warn("WARNING: Bad or missing .orc_unwind table. Disabling unwinder.\n"); ++ return; ++ } ++ ++ /* Sort the .orc_unwind and .orc_unwind_ip tables: */ ++ sort(__start_orc_unwind_ip, num_entries, sizeof(int), orc_sort_cmp, ++ orc_sort_swap); ++ ++ /* Initialize the fast lookup table: */ ++ lookup_num_blocks = orc_lookup_end - orc_lookup; ++ for (i = 0; i < lookup_num_blocks-1; i++) { ++ orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind, ++ num_entries, ++ LOOKUP_START_IP + (LOOKUP_BLOCK_SIZE * i)); ++ if (!orc) { ++ orc_warn("WARNING: Corrupt .orc_unwind table. Disabling unwinder.\n"); ++ return; ++ } ++ ++ orc_lookup[i] = orc - __start_orc_unwind; ++ } ++ ++ /* Initialize the ending block: */ ++ orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind, num_entries, ++ LOOKUP_STOP_IP); ++ if (!orc) { ++ orc_warn("WARNING: Corrupt .orc_unwind table. Disabling unwinder.\n"); ++ return; ++ } ++ orc_lookup[lookup_num_blocks-1] = orc - __start_orc_unwind; ++ ++ orc_init = true; ++} ++ ++unsigned long unwind_get_return_address(struct unwind_state *state) ++{ ++ if (unwind_done(state)) ++ return 0; ++ ++ return __kernel_text_address(state->ip) ? 
state->ip : 0; ++} ++EXPORT_SYMBOL_GPL(unwind_get_return_address); ++ ++unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) ++{ ++ if (unwind_done(state)) ++ return NULL; ++ ++ if (state->regs) ++ return &state->regs->ip; ++ ++ if (state->sp) ++ return (unsigned long *)state->sp - 1; ++ ++ return NULL; ++} ++ ++static bool stack_access_ok(struct unwind_state *state, unsigned long addr, ++ size_t len) ++{ ++ struct stack_info *info = &state->stack_info; ++ ++ /* ++ * If the address isn't on the current stack, switch to the next one. ++ * ++ * We may have to traverse multiple stacks to deal with the possibility ++ * that info->next_sp could point to an empty stack and the address ++ * could be on a subsequent stack. ++ */ ++ while (!on_stack(info, (void *)addr, len)) ++ if (get_stack_info(info->next_sp, state->task, info, ++ &state->stack_mask)) ++ return false; ++ ++ return true; ++} ++ ++static bool deref_stack_reg(struct unwind_state *state, unsigned long addr, ++ unsigned long *val) ++{ ++ if (!stack_access_ok(state, addr, sizeof(long))) ++ return false; ++ ++ *val = READ_ONCE_TASK_STACK(state->task, *(unsigned long *)addr); ++ return true; ++} ++ ++#define REGS_SIZE (sizeof(struct pt_regs)) ++#define SP_OFFSET (offsetof(struct pt_regs, sp)) ++#define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip)) ++#define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip)) ++ ++static bool deref_stack_regs(struct unwind_state *state, unsigned long addr, ++ unsigned long *ip, unsigned long *sp, bool full) ++{ ++ size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE; ++ size_t sp_offset = full ? 
SP_OFFSET : IRET_SP_OFFSET; ++ struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE); ++ ++ if (IS_ENABLED(CONFIG_X86_64)) { ++ if (!stack_access_ok(state, addr, regs_size)) ++ return false; ++ ++ *ip = regs->ip; ++ *sp = regs->sp; ++ ++ return true; ++ } ++ ++ if (!stack_access_ok(state, addr, sp_offset)) ++ return false; ++ ++ *ip = regs->ip; ++ ++ if (user_mode(regs)) { ++ if (!stack_access_ok(state, addr + sp_offset, ++ REGS_SIZE - SP_OFFSET)) ++ return false; ++ ++ *sp = regs->sp; ++ } else ++ *sp = (unsigned long)&regs->sp; ++ ++ return true; ++} ++ ++bool unwind_next_frame(struct unwind_state *state) ++{ ++ unsigned long ip_p, sp, orig_ip, prev_sp = state->sp; ++ enum stack_type prev_type = state->stack_info.type; ++ struct orc_entry *orc; ++ struct pt_regs *ptregs; ++ bool indirect = false; ++ ++ if (unwind_done(state)) ++ return false; ++ ++ /* Don't let modules unload while we're reading their ORC data. */ ++ preempt_disable(); ++ ++ /* Have we reached the end? */ ++ if (state->regs && user_mode(state->regs)) ++ goto done; ++ ++ /* ++ * Find the orc_entry associated with the text address. ++ * ++ * Decrement call return addresses by one so they work for sibling ++ * calls and calls to noreturn functions. ++ */ ++ orc = orc_find(state->signal ? 
state->ip : state->ip - 1); ++ if (!orc || orc->sp_reg == ORC_REG_UNDEFINED) ++ goto done; ++ orig_ip = state->ip; ++ ++ /* Find the previous frame's stack: */ ++ switch (orc->sp_reg) { ++ case ORC_REG_SP: ++ sp = state->sp + orc->sp_offset; ++ break; ++ ++ case ORC_REG_BP: ++ sp = state->bp + orc->sp_offset; ++ break; ++ ++ case ORC_REG_SP_INDIRECT: ++ sp = state->sp + orc->sp_offset; ++ indirect = true; ++ break; ++ ++ case ORC_REG_BP_INDIRECT: ++ sp = state->bp + orc->sp_offset; ++ indirect = true; ++ break; ++ ++ case ORC_REG_R10: ++ if (!state->regs || !state->full_regs) { ++ orc_warn("missing regs for base reg R10 at ip %p\n", ++ (void *)state->ip); ++ goto done; ++ } ++ sp = state->regs->r10; ++ break; ++ ++ case ORC_REG_R13: ++ if (!state->regs || !state->full_regs) { ++ orc_warn("missing regs for base reg R13 at ip %p\n", ++ (void *)state->ip); ++ goto done; ++ } ++ sp = state->regs->r13; ++ break; ++ ++ case ORC_REG_DI: ++ if (!state->regs || !state->full_regs) { ++ orc_warn("missing regs for base reg DI at ip %p\n", ++ (void *)state->ip); ++ goto done; ++ } ++ sp = state->regs->di; ++ break; ++ ++ case ORC_REG_DX: ++ if (!state->regs || !state->full_regs) { ++ orc_warn("missing regs for base reg DX at ip %p\n", ++ (void *)state->ip); ++ goto done; ++ } ++ sp = state->regs->dx; ++ break; ++ ++ default: ++ orc_warn("unknown SP base reg %d for ip %p\n", ++ orc->sp_reg, (void *)state->ip); ++ goto done; ++ } ++ ++ if (indirect) { ++ if (!deref_stack_reg(state, sp, &sp)) ++ goto done; ++ } ++ ++ /* Find IP, SP and possibly regs: */ ++ switch (orc->type) { ++ case ORC_TYPE_CALL: ++ ip_p = sp - sizeof(long); ++ ++ if (!deref_stack_reg(state, ip_p, &state->ip)) ++ goto done; ++ ++ state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, ++ state->ip, (void *)ip_p); ++ ++ state->sp = sp; ++ state->regs = NULL; ++ state->signal = false; ++ break; ++ ++ case ORC_TYPE_REGS: ++ if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) { ++ 
orc_warn("can't dereference registers at %p for ip %p\n", ++ (void *)sp, (void *)orig_ip); ++ goto done; ++ } ++ ++ state->regs = (struct pt_regs *)sp; ++ state->full_regs = true; ++ state->signal = true; ++ break; ++ ++ case ORC_TYPE_REGS_IRET: ++ if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) { ++ orc_warn("can't dereference iret registers at %p for ip %p\n", ++ (void *)sp, (void *)orig_ip); ++ goto done; ++ } ++ ++ ptregs = container_of((void *)sp, struct pt_regs, ip); ++ if ((unsigned long)ptregs >= prev_sp && ++ on_stack(&state->stack_info, ptregs, REGS_SIZE)) { ++ state->regs = ptregs; ++ state->full_regs = false; ++ } else ++ state->regs = NULL; ++ ++ state->signal = true; ++ break; ++ ++ default: ++ orc_warn("unknown .orc_unwind entry type %d\n", orc->type); ++ break; ++ } ++ ++ /* Find BP: */ ++ switch (orc->bp_reg) { ++ case ORC_REG_UNDEFINED: ++ if (state->regs && state->full_regs) ++ state->bp = state->regs->bp; ++ break; ++ ++ case ORC_REG_PREV_SP: ++ if (!deref_stack_reg(state, sp + orc->bp_offset, &state->bp)) ++ goto done; ++ break; ++ ++ case ORC_REG_BP: ++ if (!deref_stack_reg(state, state->bp + orc->bp_offset, &state->bp)) ++ goto done; ++ break; ++ ++ default: ++ orc_warn("unknown BP base reg %d for ip %p\n", ++ orc->bp_reg, (void *)orig_ip); ++ goto done; ++ } ++ ++ /* Prevent a recursive loop due to bad ORC data: */ ++ if (state->stack_info.type == prev_type && ++ on_stack(&state->stack_info, (void *)state->sp, sizeof(long)) && ++ state->sp <= prev_sp) { ++ orc_warn("stack going in the wrong direction? 
ip=%p\n", ++ (void *)orig_ip); ++ goto done; ++ } ++ ++ preempt_enable(); ++ return true; ++ ++done: ++ preempt_enable(); ++ state->stack_info.type = STACK_TYPE_UNKNOWN; ++ return false; ++} ++EXPORT_SYMBOL_GPL(unwind_next_frame); ++ ++void __unwind_start(struct unwind_state *state, struct task_struct *task, ++ struct pt_regs *regs, unsigned long *first_frame) ++{ ++ memset(state, 0, sizeof(*state)); ++ state->task = task; ++ ++ /* ++ * Refuse to unwind the stack of a task while it's executing on another ++ * CPU. This check is racy, but that's ok: the unwinder has other ++ * checks to prevent it from going off the rails. ++ */ ++ if (task_on_another_cpu(task)) ++ goto done; ++ ++ if (regs) { ++ if (user_mode(regs)) ++ goto done; ++ ++ state->ip = regs->ip; ++ state->sp = kernel_stack_pointer(regs); ++ state->bp = regs->bp; ++ state->regs = regs; ++ state->full_regs = true; ++ state->signal = true; ++ ++ } else if (task == current) { ++ asm volatile("lea (%%rip), %0\n\t" ++ "mov %%rsp, %1\n\t" ++ "mov %%rbp, %2\n\t" ++ : "=r" (state->ip), "=r" (state->sp), ++ "=r" (state->bp)); ++ ++ } else { ++ struct inactive_task_frame *frame = (void *)task->thread.sp; ++ ++ state->sp = task->thread.sp; ++ state->bp = READ_ONCE_NOCHECK(frame->bp); ++ state->ip = READ_ONCE_NOCHECK(frame->ret_addr); ++ } ++ ++ if (get_stack_info((unsigned long *)state->sp, state->task, ++ &state->stack_info, &state->stack_mask)) ++ return; ++ ++ /* ++ * The caller can provide the address of the first frame directly ++ * (first_frame) or indirectly (regs->sp) to indicate which stack frame ++ * to start unwinding at. Skip ahead until we reach it. 
++ */ ++ ++ /* When starting from regs, skip the regs frame: */ ++ if (regs) { ++ unwind_next_frame(state); ++ return; ++ } ++ ++ /* Otherwise, skip ahead to the user-specified starting frame: */ ++ while (!unwind_done(state) && ++ (!on_stack(&state->stack_info, first_frame, sizeof(long)) || ++ state->sp <= (unsigned long)first_frame)) ++ unwind_next_frame(state); ++ ++ return; ++ ++done: ++ state->stack_info.type = STACK_TYPE_UNKNOWN; ++ return; ++} ++EXPORT_SYMBOL_GPL(__unwind_start); +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 658fcf67862c..d6f45f6d1054 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -158,6 +158,7 @@ config X86 + select HAVE_MEMBLOCK + select HAVE_MEMBLOCK_NODE_MAP + select HAVE_MIXED_BREAKPOINTS_REGS ++ select HAVE_MOD_ARCH_SPECIFIC + select HAVE_NMI + select HAVE_OPROFILE + select HAVE_OPTPROBES +diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug +index 1fc519f3c49e..d5bca2ec8a74 100644 +--- a/arch/x86/Kconfig.debug ++++ b/arch/x86/Kconfig.debug +@@ -356,4 +356,29 @@ config PUNIT_ATOM_DEBUG + The current power state can be read from + /sys/kernel/debug/punit_atom/dev_power_state + ++config ORC_UNWINDER ++ bool "ORC unwinder" ++ depends on X86_64 ++ select STACK_VALIDATION ++ ---help--- ++ This option enables the ORC (Oops Rewind Capability) unwinder for ++ unwinding kernel stack traces. It uses a custom data format which is ++ a simplified version of the DWARF Call Frame Information standard. ++ ++ This unwinder is more accurate across interrupt entry frames than the ++ frame pointer unwinder. It can also enable a 5-10% performance ++ improvement across the entire kernel if CONFIG_FRAME_POINTER is ++ disabled. ++ ++ Enabling this option will increase the kernel's runtime memory usage ++ by roughly 2-4MB, depending on your kernel config. 
++ ++config FRAME_POINTER_UNWINDER ++ def_bool y ++ depends on !ORC_UNWINDER && FRAME_POINTER ++ ++config GUESS_UNWINDER ++ def_bool y ++ depends on !ORC_UNWINDER && !FRAME_POINTER ++ + endmenu +diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S +index c8a3b61be0aa..f05f00acac89 100644 +--- a/arch/x86/kernel/vmlinux.lds.S ++++ b/arch/x86/kernel/vmlinux.lds.S +@@ -24,6 +24,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -148,6 +149,8 @@ SECTIONS + + BUG_TABLE + ++ ORC_UNWIND_TABLE ++ + . = ALIGN(PAGE_SIZE); + __vvar_page = .; + +diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug +index c617b9d1d6cb..0b4d1b3880b0 100644 +--- a/lib/Kconfig.debug ++++ b/lib/Kconfig.debug +@@ -374,6 +374,9 @@ config STACK_VALIDATION + pointers (if CONFIG_FRAME_POINTER is enabled). This helps ensure + that runtime stack traces are more reliable. + ++ This is also a prerequisite for generation of ORC unwind data, which ++ is needed for CONFIG_ORC_UNWINDER. ++ + For more information, see + tools/objtool/Documentation/stack-validation.txt. + +-- +2.14.2 + diff --git a/patches/kernel/0039-objtool-Upgrade-libelf-devel-warning-to-error-for-CO.patch b/patches/kernel/0039-objtool-Upgrade-libelf-devel-warning-to-error-for-CO.patch deleted file mode 100644 index c2217fc..0000000 --- a/patches/kernel/0039-objtool-Upgrade-libelf-devel-warning-to-error-for-CO.patch +++ /dev/null @@ -1,51 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Josh Poimboeuf -Date: Tue, 3 Oct 2017 20:10:36 -0500 -Subject: [PATCH] objtool: Upgrade libelf-devel warning to error for - CONFIG_ORC_UNWINDER -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -With CONFIG_ORC_UNWINDER, if the user doesn't have libelf-devel -installed, and they don't see the make warning, their ORC unwinder will -be silently broken. Upgrade the warning to an error. 
- -Reported-and-tested-by: Borislav Petkov -Signed-off-by: Josh Poimboeuf -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/d9dfc39fb8240998820f9efb233d283a1ee96084.1507079417.git.jpoimboe@redhat.com -Signed-off-by: Ingo Molnar -(cherry picked from commit 3dd40cb320fee7c23b574ab821ce140ccd1281c9) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit c413466a72ca533ec126ebc0c5bb579ae0c96b1d) -Signed-off-by: Fabian Grünbichler ---- - Makefile | 6 +++++- - 1 file changed, 5 insertions(+), 1 deletion(-) - -diff --git a/Makefile b/Makefile -index 8e14a926fc94..490ce18685ea 100644 ---- a/Makefile -+++ b/Makefile -@@ -965,7 +965,11 @@ ifdef CONFIG_STACK_VALIDATION - ifeq ($(has_libelf),1) - objtool_target := tools/objtool FORCE - else -- $(warning "Cannot use CONFIG_STACK_VALIDATION, please install libelf-dev, libelf-devel or elfutils-libelf-devel") -+ ifdef CONFIG_ORC_UNWINDER -+ $(error "Cannot generate ORC metadata for CONFIG_ORC_UNWINDER=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel") -+ else -+ $(warning "Cannot use CONFIG_STACK_VALIDATION=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel") -+ endif - SKIP_STACK_VALIDATION := 1 - export SKIP_STACK_VALIDATION - endif --- -2.14.2 - diff --git a/patches/kernel/0039-x86-kconfig-Consolidate-unwinders-into-multiple-choi.patch b/patches/kernel/0039-x86-kconfig-Consolidate-unwinders-into-multiple-choi.patch new file mode 100644 index 0000000..f588b6e --- /dev/null +++ b/patches/kernel/0039-x86-kconfig-Consolidate-unwinders-into-multiple-choi.patch @@ -0,0 +1,171 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf +Date: Tue, 25 Jul 2017 08:54:24 -0500 +Subject: [PATCH] x86/kconfig: Consolidate unwinders into multiple choice + selection +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +There 
are three mutually exclusive unwinders. Make that more obvious by +combining them into a multiple-choice selection: + + CONFIG_FRAME_POINTER_UNWINDER + CONFIG_ORC_UNWINDER + CONFIG_GUESS_UNWINDER (if CONFIG_EXPERT=y) + +Frame pointers are still the default (for now). + +The old CONFIG_FRAME_POINTER option is still used in some +arch-independent places, so keep it around, but make it +invisible to the user on x86 - it's now selected by +CONFIG_FRAME_POINTER_UNWINDER=y. + +Suggested-by: Ingo Molnar +Signed-off-by: Josh Poimboeuf +Cc: Andy Lutomirski +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Denys Vlasenko +Cc: H. Peter Anvin +Cc: Jiri Slaby +Cc: Linus Torvalds +Cc: Mike Galbraith +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: live-patching@vger.kernel.org +Link: http://lkml.kernel.org/r/20170725135424.zukjmgpz3plf5pmt@treble +Signed-off-by: Ingo Molnar +(cherry picked from commit 81d387190039c14edac8de2b3ec789beb899afd9) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 26ddacc1e6333555e4a6bd63c4c935b323509f92) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/unwind.h | 4 ++-- + arch/x86/Kconfig | 3 +-- + arch/x86/Kconfig.debug | 45 +++++++++++++++++++++++++++++++++++++------ + arch/x86/configs/tiny.config | 2 ++ + 4 files changed, 44 insertions(+), 10 deletions(-) + +diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h +index 25b8d31a007d..e9f793e2df7a 100644 +--- a/arch/x86/include/asm/unwind.h ++++ b/arch/x86/include/asm/unwind.h +@@ -16,7 +16,7 @@ struct unwind_state { + bool signal, full_regs; + unsigned long sp, bp, ip; + struct pt_regs *regs; +-#elif defined(CONFIG_FRAME_POINTER) ++#elif defined(CONFIG_FRAME_POINTER_UNWINDER) + bool got_irq; + unsigned long *bp, *orig_sp, ip; + struct pt_regs *regs; +@@ -50,7 +50,7 @@ void unwind_start(struct unwind_state *state, struct task_struct *task, + __unwind_start(state, task, regs, first_frame); + } + +-#if 
defined(CONFIG_ORC_UNWINDER) || defined(CONFIG_FRAME_POINTER) ++#if defined(CONFIG_ORC_UNWINDER) || defined(CONFIG_FRAME_POINTER_UNWINDER) + static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) + { + if (unwind_done(state)) +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index d6f45f6d1054..3a0b8cb57caf 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -73,7 +73,6 @@ config X86 + select ARCH_USE_QUEUED_RWLOCKS + select ARCH_USE_QUEUED_SPINLOCKS + select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH +- select ARCH_WANT_FRAME_POINTERS + select ARCH_WANTS_DYNAMIC_TASK_STRUCT + select ARCH_WANTS_THP_SWAP if X86_64 + select BUILDTIME_EXTABLE_SORT +@@ -169,7 +168,7 @@ config X86 + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP + select HAVE_REGS_AND_STACK_ACCESS_API +- select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER && STACK_VALIDATION ++ select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER_UNWINDER && STACK_VALIDATION + select HAVE_STACK_VALIDATION if X86_64 + select HAVE_SYSCALL_TRACEPOINTS + select HAVE_UNSTABLE_SCHED_CLOCK +diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug +index d5bca2ec8a74..c441b5d65ec8 100644 +--- a/arch/x86/Kconfig.debug ++++ b/arch/x86/Kconfig.debug +@@ -356,6 +356,29 @@ config PUNIT_ATOM_DEBUG + The current power state can be read from + /sys/kernel/debug/punit_atom/dev_power_state + ++choice ++ prompt "Choose kernel unwinder" ++ default FRAME_POINTER_UNWINDER ++ ---help--- ++ This determines which method will be used for unwinding kernel stack ++ traces for panics, oopses, bugs, warnings, perf, /proc/<pid>/stack, ++ livepatch, lockdep, and more. ++ ++config FRAME_POINTER_UNWINDER ++ bool "Frame pointer unwinder" ++ select FRAME_POINTER ++ ---help--- ++ This option enables the frame pointer unwinder for unwinding kernel ++ stack traces. 
++ ++ The unwinder itself is fast and it uses less RAM than the ORC ++ unwinder, but the kernel text size will grow by ~3% and the kernel's ++ overall performance will degrade by roughly 5-10%. ++ ++ This option is recommended if you want to use the livepatch ++ consistency model, as this is currently the only way to get a ++ reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE). ++ + config ORC_UNWINDER + bool "ORC unwinder" + depends on X86_64 +@@ -373,12 +396,22 @@ config ORC_UNWINDER + Enabling this option will increase the kernel's runtime memory usage + by roughly 2-4MB, depending on your kernel config. + +-config FRAME_POINTER_UNWINDER +- def_bool y +- depends on !ORC_UNWINDER && FRAME_POINTER +- + config GUESS_UNWINDER +- def_bool y +- depends on !ORC_UNWINDER && !FRAME_POINTER ++ bool "Guess unwinder" ++ depends on EXPERT ++ ---help--- ++ This option enables the "guess" unwinder for unwinding kernel stack ++ traces. It scans the stack and reports every kernel text address it ++ finds. Some of the addresses it reports may be incorrect. ++ ++ While this option often produces false positives, it can still be ++ useful in many cases. Unlike the other unwinders, it has no runtime ++ overhead. 
++ ++endchoice ++ ++config FRAME_POINTER ++ depends on !ORC_UNWINDER && !GUESS_UNWINDER ++ bool + + endmenu +diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config +index 4b429df40d7a..550cd5012b73 100644 +--- a/arch/x86/configs/tiny.config ++++ b/arch/x86/configs/tiny.config +@@ -1,3 +1,5 @@ + CONFIG_NOHIGHMEM=y + # CONFIG_HIGHMEM4G is not set + # CONFIG_HIGHMEM64G is not set ++CONFIG_GUESS_UNWINDER=y ++# CONFIG_FRAME_POINTER_UNWINDER is not set +-- +2.14.2 + diff --git a/patches/kernel/0040-objtool-Upgrade-libelf-devel-warning-to-error-for-CO.patch b/patches/kernel/0040-objtool-Upgrade-libelf-devel-warning-to-error-for-CO.patch new file mode 100644 index 0000000..c2217fc --- /dev/null +++ b/patches/kernel/0040-objtool-Upgrade-libelf-devel-warning-to-error-for-CO.patch @@ -0,0 +1,51 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf +Date: Tue, 3 Oct 2017 20:10:36 -0500 +Subject: [PATCH] objtool: Upgrade libelf-devel warning to error for + CONFIG_ORC_UNWINDER +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +With CONFIG_ORC_UNWINDER, if the user doesn't have libelf-devel +installed, and they don't see the make warning, their ORC unwinder will +be silently broken. Upgrade the warning to an error. 
+ +Reported-and-tested-by: Borislav Petkov +Signed-off-by: Josh Poimboeuf +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/d9dfc39fb8240998820f9efb233d283a1ee96084.1507079417.git.jpoimboe@redhat.com +Signed-off-by: Ingo Molnar +(cherry picked from commit 3dd40cb320fee7c23b574ab821ce140ccd1281c9) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit c413466a72ca533ec126ebc0c5bb579ae0c96b1d) +Signed-off-by: Fabian Grünbichler +--- + Makefile | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/Makefile b/Makefile +index 8e14a926fc94..490ce18685ea 100644 +--- a/Makefile ++++ b/Makefile +@@ -965,7 +965,11 @@ ifdef CONFIG_STACK_VALIDATION + ifeq ($(has_libelf),1) + objtool_target := tools/objtool FORCE + else +- $(warning "Cannot use CONFIG_STACK_VALIDATION, please install libelf-dev, libelf-devel or elfutils-libelf-devel") ++ ifdef CONFIG_ORC_UNWINDER ++ $(error "Cannot generate ORC metadata for CONFIG_ORC_UNWINDER=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel") ++ else ++ $(warning "Cannot use CONFIG_STACK_VALIDATION=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel") ++ endif + SKIP_STACK_VALIDATION := 1 + export SKIP_STACK_VALIDATION + endif +-- +2.14.2 + diff --git a/patches/kernel/0040-x86-ldt-64-Refresh-DS-and-ES-when-modify_ldt-changes.patch b/patches/kernel/0040-x86-ldt-64-Refresh-DS-and-ES-when-modify_ldt-changes.patch deleted file mode 100644 index 91ade6a..0000000 --- a/patches/kernel/0040-x86-ldt-64-Refresh-DS-and-ES-when-modify_ldt-changes.patch +++ /dev/null @@ -1,82 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Wed, 26 Jul 2017 07:16:30 -0700 -Subject: [PATCH] x86/ldt/64: Refresh DS and ES when modify_ldt changes an - entry -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -On 
x86_32, modify_ldt() implicitly refreshes the cached DS and ES -segments because they are refreshed on return to usermode. - -On x86_64, they're not refreshed on return to usermode. To improve -determinism and match x86_32's behavior, refresh them when we update -the LDT. - -This avoids a situation in which the DS points to a descriptor that is -changed but the old cached segment persists until the next reschedule. -If this happens, then the user-visible state will change -nondeterministically some time after modify_ldt() returns, which is -unfortunate. - -Signed-off-by: Andy Lutomirski -Cc: Borislav Petkov -Cc: Chang Seok -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Signed-off-by: Ingo Molnar -(cherry picked from commit a632375764aa25c97b78beb56c71b0ba59d1cf83) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 295cb0b06150958ec84ee4b8844ef7e389e22c4e) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/ldt.c | 21 +++++++++++++++++++++ - 1 file changed, 21 insertions(+) - -diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c -index a870910c8565..f0e64db18ac8 100644 ---- a/arch/x86/kernel/ldt.c -+++ b/arch/x86/kernel/ldt.c -@@ -21,6 +21,25 @@ - #include - #include - -+static void refresh_ldt_segments(void) -+{ -+#ifdef CONFIG_X86_64 -+ unsigned short sel; -+ -+ /* -+ * Make sure that the cached DS and ES descriptors match the updated -+ * LDT. -+ */ -+ savesegment(ds, sel); -+ if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT) -+ loadsegment(ds, sel); -+ -+ savesegment(es, sel); -+ if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT) -+ loadsegment(es, sel); -+#endif -+} -+ - /* context.lock is held for us, so we don't need any locking. */ - static void flush_ldt(void *__mm) - { -@@ -32,6 +51,8 @@ static void flush_ldt(void *__mm) - - pc = &mm->context; - set_ldt(pc->ldt->entries, pc->ldt->nr_entries); -+ -+ refresh_ldt_segments(); - } - - /* The caller must call finalize_ldt_struct on the result. 
LDT starts zeroed. */ --- -2.14.2 - diff --git a/patches/kernel/0041-x86-ldt-64-Refresh-DS-and-ES-when-modify_ldt-changes.patch b/patches/kernel/0041-x86-ldt-64-Refresh-DS-and-ES-when-modify_ldt-changes.patch new file mode 100644 index 0000000..91ade6a --- /dev/null +++ b/patches/kernel/0041-x86-ldt-64-Refresh-DS-and-ES-when-modify_ldt-changes.patch @@ -0,0 +1,82 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Wed, 26 Jul 2017 07:16:30 -0700 +Subject: [PATCH] x86/ldt/64: Refresh DS and ES when modify_ldt changes an + entry +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +On x86_32, modify_ldt() implicitly refreshes the cached DS and ES +segments because they are refreshed on return to usermode. + +On x86_64, they're not refreshed on return to usermode. To improve +determinism and match x86_32's behavior, refresh them when we update +the LDT. + +This avoids a situation in which the DS points to a descriptor that is +changed but the old cached segment persists until the next reschedule. +If this happens, then the user-visible state will change +nondeterministically some time after modify_ldt() returns, which is +unfortunate. 
+ +Signed-off-by: Andy Lutomirski +Cc: Borislav Petkov +Cc: Chang Seok +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Signed-off-by: Ingo Molnar +(cherry picked from commit a632375764aa25c97b78beb56c71b0ba59d1cf83) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 295cb0b06150958ec84ee4b8844ef7e389e22c4e) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/ldt.c | 21 +++++++++++++++++++++ + 1 file changed, 21 insertions(+) + +diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c +index a870910c8565..f0e64db18ac8 100644 +--- a/arch/x86/kernel/ldt.c ++++ b/arch/x86/kernel/ldt.c +@@ -21,6 +21,25 @@ + #include + #include + ++static void refresh_ldt_segments(void) ++{ ++#ifdef CONFIG_X86_64 ++ unsigned short sel; ++ ++ /* ++ * Make sure that the cached DS and ES descriptors match the updated ++ * LDT. ++ */ ++ savesegment(ds, sel); ++ if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT) ++ loadsegment(ds, sel); ++ ++ savesegment(es, sel); ++ if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT) ++ loadsegment(es, sel); ++#endif ++} ++ + /* context.lock is held for us, so we don't need any locking. */ + static void flush_ldt(void *__mm) + { +@@ -32,6 +51,8 @@ static void flush_ldt(void *__mm) + + pc = &mm->context; + set_ldt(pc->ldt->entries, pc->ldt->nr_entries); ++ ++ refresh_ldt_segments(); + } + + /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. 
*/ +-- +2.14.2 + diff --git a/patches/kernel/0041-x86-mm-Give-each-mm-TLB-flush-generation-a-unique-ID.patch b/patches/kernel/0041-x86-mm-Give-each-mm-TLB-flush-generation-a-unique-ID.patch deleted file mode 100644 index d260739..0000000 --- a/patches/kernel/0041-x86-mm-Give-each-mm-TLB-flush-generation-a-unique-ID.patch +++ /dev/null @@ -1,182 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Thu, 29 Jun 2017 08:53:15 -0700 -Subject: [PATCH] x86/mm: Give each mm TLB flush generation a unique ID -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -This adds two new variables to mmu_context_t: ctx_id and tlb_gen. -ctx_id uniquely identifies the mm_struct and will never be reused. -For a given mm_struct (and hence ctx_id), tlb_gen is a monotonic -count of the number of times that a TLB flush has been requested. -The pair (ctx_id, tlb_gen) can be used as an identifier for TLB -flush actions and will be used in subsequent patches to reliably -determine whether all needed TLB flushes have occurred on a given -CPU. - -This patch is split out for ease of review. By itself, it has no -real effect other than creating and updating the new variables. 
- -Signed-off-by: Andy Lutomirski -Reviewed-by: Nadav Amit -Reviewed-by: Thomas Gleixner -Cc: Andrew Morton -Cc: Arjan van de Ven -Cc: Borislav Petkov -Cc: Dave Hansen -Cc: Linus Torvalds -Cc: Mel Gorman -Cc: Peter Zijlstra -Cc: Rik van Riel -Cc: linux-mm@kvack.org -Link: http://lkml.kernel.org/r/413a91c24dab3ed0caa5f4e4d017d87b0857f920.1498751203.git.luto@kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit f39681ed0f48498b80455095376f11535feea332) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit e566a0dfbb2a5f7ea90dd66ce384740372739e14) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/mmu.h | 25 +++++++++++++++++++++++-- - arch/x86/include/asm/mmu_context.h | 6 ++++++ - arch/x86/include/asm/tlbflush.h | 18 ++++++++++++++++++ - arch/x86/mm/tlb.c | 6 ++++-- - 4 files changed, 51 insertions(+), 4 deletions(-) - -diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h -index 79b647a7ebd0..bb8c597c2248 100644 ---- a/arch/x86/include/asm/mmu.h -+++ b/arch/x86/include/asm/mmu.h -@@ -3,12 +3,28 @@ - - #include - #include -+#include - - /* -- * The x86 doesn't have a mmu context, but -- * we put the segment information here. -+ * x86 has arch-specific MMU state beyond what lives in mm_struct. - */ - typedef struct { -+ /* -+ * ctx_id uniquely identifies this mm_struct. A ctx_id will never -+ * be reused, and zero is not a valid ctx_id. -+ */ -+ u64 ctx_id; -+ -+ /* -+ * Any code that needs to do any sort of TLB flushing for this -+ * mm will first make its changes to the page tables, then -+ * increment tlb_gen, then flush. This lets the low-level -+ * flushing code keep track of what needs flushing. -+ * -+ * This is not used on Xen PV. 
-+ */ -+ atomic64_t tlb_gen; -+ - #ifdef CONFIG_MODIFY_LDT_SYSCALL - struct ldt_struct *ldt; - #endif -@@ -37,6 +53,11 @@ typedef struct { - #endif - } mm_context_t; - -+#define INIT_MM_CONTEXT(mm) \ -+ .context = { \ -+ .ctx_id = 1, \ -+ } -+ - void leave_mm(int cpu); - - #endif /* _ASM_X86_MMU_H */ -diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h -index 7a234be7e298..6c05679c715b 100644 ---- a/arch/x86/include/asm/mmu_context.h -+++ b/arch/x86/include/asm/mmu_context.h -@@ -12,6 +12,9 @@ - #include - #include - #include -+ -+extern atomic64_t last_mm_ctx_id; -+ - #ifndef CONFIG_PARAVIRT - static inline void paravirt_activate_mm(struct mm_struct *prev, - struct mm_struct *next) -@@ -132,6 +135,9 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) - static inline int init_new_context(struct task_struct *tsk, - struct mm_struct *mm) - { -+ mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id); -+ atomic64_set(&mm->context.tlb_gen, 0); -+ - #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS - if (cpu_feature_enabled(X86_FEATURE_OSPKE)) { - /* pkey 0 is the default and always allocated */ -diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h -index 2b3d68093235..f1f2e73b7b77 100644 ---- a/arch/x86/include/asm/tlbflush.h -+++ b/arch/x86/include/asm/tlbflush.h -@@ -57,6 +57,23 @@ static inline void invpcid_flush_all_nonglobals(void) - __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); - } - -+static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) -+{ -+ u64 new_tlb_gen; -+ -+ /* -+ * Bump the generation count. This also serves as a full barrier -+ * that synchronizes with switch_mm(): callers are required to order -+ * their read of mm_cpumask after their writes to the paging -+ * structures. 
-+ */ -+ smp_mb__before_atomic(); -+ new_tlb_gen = atomic64_inc_return(&mm->context.tlb_gen); -+ smp_mb__after_atomic(); -+ -+ return new_tlb_gen; -+} -+ - #ifdef CONFIG_PARAVIRT - #include - #else -@@ -270,6 +287,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask, - static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, - struct mm_struct *mm) - { -+ inc_mm_tlb_gen(mm); - cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); - } - -diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c -index 014d07a80053..14f4f8f66aa8 100644 ---- a/arch/x86/mm/tlb.c -+++ b/arch/x86/mm/tlb.c -@@ -28,6 +28,8 @@ - * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi - */ - -+atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); -+ - void leave_mm(int cpu) - { - struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); -@@ -250,8 +252,8 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, - - cpu = get_cpu(); - -- /* Synchronize with switch_mm. */ -- smp_mb(); -+ /* This is also a barrier that synchronizes with switch_mm(). */ -+ inc_mm_tlb_gen(mm); - - /* Should we flush just the requested range? */ - if ((end != TLB_FLUSH_ALL) && --- -2.14.2 - diff --git a/patches/kernel/0042-x86-mm-Give-each-mm-TLB-flush-generation-a-unique-ID.patch b/patches/kernel/0042-x86-mm-Give-each-mm-TLB-flush-generation-a-unique-ID.patch new file mode 100644 index 0000000..d260739 --- /dev/null +++ b/patches/kernel/0042-x86-mm-Give-each-mm-TLB-flush-generation-a-unique-ID.patch @@ -0,0 +1,182 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Thu, 29 Jun 2017 08:53:15 -0700 +Subject: [PATCH] x86/mm: Give each mm TLB flush generation a unique ID +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +This adds two new variables to mmu_context_t: ctx_id and tlb_gen. +ctx_id uniquely identifies the mm_struct and will never be reused. 
+For a given mm_struct (and hence ctx_id), tlb_gen is a monotonic +count of the number of times that a TLB flush has been requested. +The pair (ctx_id, tlb_gen) can be used as an identifier for TLB +flush actions and will be used in subsequent patches to reliably +determine whether all needed TLB flushes have occurred on a given +CPU. + +This patch is split out for ease of review. By itself, it has no +real effect other than creating and updating the new variables. + +Signed-off-by: Andy Lutomirski +Reviewed-by: Nadav Amit +Reviewed-by: Thomas Gleixner +Cc: Andrew Morton +Cc: Arjan van de Ven +Cc: Borislav Petkov +Cc: Dave Hansen +Cc: Linus Torvalds +Cc: Mel Gorman +Cc: Peter Zijlstra +Cc: Rik van Riel +Cc: linux-mm@kvack.org +Link: http://lkml.kernel.org/r/413a91c24dab3ed0caa5f4e4d017d87b0857f920.1498751203.git.luto@kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit f39681ed0f48498b80455095376f11535feea332) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit e566a0dfbb2a5f7ea90dd66ce384740372739e14) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/mmu.h | 25 +++++++++++++++++++++++-- + arch/x86/include/asm/mmu_context.h | 6 ++++++ + arch/x86/include/asm/tlbflush.h | 18 ++++++++++++++++++ + arch/x86/mm/tlb.c | 6 ++++-- + 4 files changed, 51 insertions(+), 4 deletions(-) + +diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h +index 79b647a7ebd0..bb8c597c2248 100644 +--- a/arch/x86/include/asm/mmu.h ++++ b/arch/x86/include/asm/mmu.h +@@ -3,12 +3,28 @@ + + #include + #include ++#include + + /* +- * The x86 doesn't have a mmu context, but +- * we put the segment information here. ++ * x86 has arch-specific MMU state beyond what lives in mm_struct. + */ + typedef struct { ++ /* ++ * ctx_id uniquely identifies this mm_struct. A ctx_id will never ++ * be reused, and zero is not a valid ctx_id. 
++ */ ++ u64 ctx_id; ++ ++ /* ++ * Any code that needs to do any sort of TLB flushing for this ++ * mm will first make its changes to the page tables, then ++ * increment tlb_gen, then flush. This lets the low-level ++ * flushing code keep track of what needs flushing. ++ * ++ * This is not used on Xen PV. ++ */ ++ atomic64_t tlb_gen; ++ + #ifdef CONFIG_MODIFY_LDT_SYSCALL + struct ldt_struct *ldt; + #endif +@@ -37,6 +53,11 @@ typedef struct { + #endif + } mm_context_t; + ++#define INIT_MM_CONTEXT(mm) \ ++ .context = { \ ++ .ctx_id = 1, \ ++ } ++ + void leave_mm(int cpu); + + #endif /* _ASM_X86_MMU_H */ +diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h +index 7a234be7e298..6c05679c715b 100644 +--- a/arch/x86/include/asm/mmu_context.h ++++ b/arch/x86/include/asm/mmu_context.h +@@ -12,6 +12,9 @@ + #include + #include + #include ++ ++extern atomic64_t last_mm_ctx_id; ++ + #ifndef CONFIG_PARAVIRT + static inline void paravirt_activate_mm(struct mm_struct *prev, + struct mm_struct *next) +@@ -132,6 +135,9 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) + static inline int init_new_context(struct task_struct *tsk, + struct mm_struct *mm) + { ++ mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id); ++ atomic64_set(&mm->context.tlb_gen, 0); ++ + #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS + if (cpu_feature_enabled(X86_FEATURE_OSPKE)) { + /* pkey 0 is the default and always allocated */ +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index 2b3d68093235..f1f2e73b7b77 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -57,6 +57,23 @@ static inline void invpcid_flush_all_nonglobals(void) + __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); + } + ++static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) ++{ ++ u64 new_tlb_gen; ++ ++ /* ++ * Bump the generation count. 
This also serves as a full barrier ++ * that synchronizes with switch_mm(): callers are required to order ++ * their read of mm_cpumask after their writes to the paging ++ * structures. ++ */ ++ smp_mb__before_atomic(); ++ new_tlb_gen = atomic64_inc_return(&mm->context.tlb_gen); ++ smp_mb__after_atomic(); ++ ++ return new_tlb_gen; ++} ++ + #ifdef CONFIG_PARAVIRT + #include + #else +@@ -270,6 +287,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask, + static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, + struct mm_struct *mm) + { ++ inc_mm_tlb_gen(mm); + cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); + } + +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c +index 014d07a80053..14f4f8f66aa8 100644 +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -28,6 +28,8 @@ + * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi + */ + ++atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); ++ + void leave_mm(int cpu) + { + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); +@@ -250,8 +252,8 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, + + cpu = get_cpu(); + +- /* Synchronize with switch_mm. */ +- smp_mb(); ++ /* This is also a barrier that synchronizes with switch_mm(). */ ++ inc_mm_tlb_gen(mm); + + /* Should we flush just the requested range? 
*/ + if ((end != TLB_FLUSH_ALL) && +-- +2.14.2 + diff --git a/patches/kernel/0042-x86-mm-Track-the-TLB-s-tlb_gen-and-update-the-flushi.patch b/patches/kernel/0042-x86-mm-Track-the-TLB-s-tlb_gen-and-update-the-flushi.patch deleted file mode 100644 index 2630f26..0000000 --- a/patches/kernel/0042-x86-mm-Track-the-TLB-s-tlb_gen-and-update-the-flushi.patch +++ /dev/null @@ -1,279 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Thu, 29 Jun 2017 08:53:16 -0700 -Subject: [PATCH] x86/mm: Track the TLB's tlb_gen and update the flushing - algorithm -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -There are two kernel features that would benefit from tracking -how up-to-date each CPU's TLB is in the case where IPIs aren't keeping -it up to date in real time: - - - Lazy mm switching currently works by switching to init_mm when - it would otherwise flush. This is wasteful: there isn't fundamentally - any need to update CR3 at all when going lazy or when returning from - lazy mode, nor is there any need to receive flush IPIs at all. Instead, - we should just stop trying to keep the TLB coherent when we go lazy and, - when unlazying, check whether we missed any flushes. - - - PCID will let us keep recent user contexts alive in the TLB. If we - start doing this, we need a way to decide whether those contexts are - up to date. - -On some paravirt systems, remote TLBs can be flushed without IPIs. -This won't update the target CPUs' tlb_gens, which may cause -unnecessary local flushes later on. We can address this if it becomes -a problem by carefully updating the target CPU's tlb_gen directly. - -By itself, this patch is a very minor optimization that avoids -unnecessary flushes when multiple TLB flushes targetting the same CPU -race. The complexity in this patch would not be worth it on its own, -but it will enable improved lazy TLB tracking and PCID. 
- -Signed-off-by: Andy Lutomirski -Reviewed-by: Nadav Amit -Reviewed-by: Thomas Gleixner -Cc: Andrew Morton -Cc: Arjan van de Ven -Cc: Borislav Petkov -Cc: Dave Hansen -Cc: Linus Torvalds -Cc: Mel Gorman -Cc: Peter Zijlstra -Cc: Rik van Riel -Cc: linux-mm@kvack.org -Link: http://lkml.kernel.org/r/1210fb244bc9cbe7677f7f0b72db4d359675f24b.1498751203.git.luto@kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit b0579ade7cd82391360e959cc844e50a160e8a96) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit d34881c25f3c70228ed792fd62881185a25c4422) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/tlbflush.h | 43 +++++++++++++++-- - arch/x86/mm/tlb.c | 102 +++++++++++++++++++++++++++++++++++++--- - 2 files changed, 135 insertions(+), 10 deletions(-) - -diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h -index f1f2e73b7b77..3a167c214560 100644 ---- a/arch/x86/include/asm/tlbflush.h -+++ b/arch/x86/include/asm/tlbflush.h -@@ -82,6 +82,11 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) - #define __flush_tlb_single(addr) __native_flush_tlb_single(addr) - #endif - -+struct tlb_context { -+ u64 ctx_id; -+ u64 tlb_gen; -+}; -+ - struct tlb_state { - /* - * cpu_tlbstate.loaded_mm should match CR3 whenever interrupts -@@ -97,6 +102,21 @@ struct tlb_state { - * disabling interrupts when modifying either one. - */ - unsigned long cr4; -+ -+ /* -+ * This is a list of all contexts that might exist in the TLB. -+ * Since we don't yet use PCID, there is only one context. -+ * -+ * For each context, ctx_id indicates which mm the TLB's user -+ * entries came from. As an invariant, the TLB will never -+ * contain entries that are out-of-date as when that mm reached -+ * the tlb_gen in the list. -+ * -+ * To be clear, this means that it's legal for the TLB code to -+ * flush the TLB without updating tlb_gen. 
This can happen -+ * (for now, at least) due to paravirt remote flushes. -+ */ -+ struct tlb_context ctxs[1]; - }; - DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); - -@@ -256,9 +276,26 @@ static inline void __flush_tlb_one(unsigned long addr) - * and page-granular flushes are available only on i486 and up. - */ - struct flush_tlb_info { -- struct mm_struct *mm; -- unsigned long start; -- unsigned long end; -+ /* -+ * We support several kinds of flushes. -+ * -+ * - Fully flush a single mm. .mm will be set, .end will be -+ * TLB_FLUSH_ALL, and .new_tlb_gen will be the tlb_gen to -+ * which the IPI sender is trying to catch us up. -+ * -+ * - Partially flush a single mm. .mm will be set, .start and -+ * .end will indicate the range, and .new_tlb_gen will be set -+ * such that the changes between generation .new_tlb_gen-1 and -+ * .new_tlb_gen are entirely contained in the indicated range. -+ * -+ * - Fully flush all mms whose tlb_gens have been updated. .mm -+ * will be NULL, .end will be TLB_FLUSH_ALL, and .new_tlb_gen -+ * will be zero. 
-+ */ -+ struct mm_struct *mm; -+ unsigned long start; -+ unsigned long end; -+ u64 new_tlb_gen; - }; - - #define local_flush_tlb() __flush_tlb() -diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c -index 14f4f8f66aa8..4e5a5ddb9e4d 100644 ---- a/arch/x86/mm/tlb.c -+++ b/arch/x86/mm/tlb.c -@@ -105,6 +105,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, - } - - this_cpu_write(cpu_tlbstate.loaded_mm, next); -+ this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id); -+ this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, atomic64_read(&next->context.tlb_gen)); - - WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next))); - cpumask_set_cpu(cpu, mm_cpumask(next)); -@@ -155,25 +157,102 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, - switch_ldt(real_prev, next); - } - -+/* -+ * flush_tlb_func_common()'s memory ordering requirement is that any -+ * TLB fills that happen after we flush the TLB are ordered after we -+ * read active_mm's tlb_gen. We don't need any explicit barriers -+ * because all x86 flush operations are serializing and the -+ * atomic64_read operation won't be reordered by the compiler. -+ */ - static void flush_tlb_func_common(const struct flush_tlb_info *f, - bool local, enum tlb_flush_reason reason) - { -+ /* -+ * We have three different tlb_gen values in here. They are: -+ * -+ * - mm_tlb_gen: the latest generation. -+ * - local_tlb_gen: the generation that this CPU has already caught -+ * up to. -+ * - f->new_tlb_gen: the generation that the requester of the flush -+ * wants us to catch up to. -+ */ -+ struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); -+ u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen); -+ u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[0].tlb_gen); -+ - /* This code cannot presently handle being reentered. 
*/ - VM_WARN_ON(!irqs_disabled()); - -+ VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) != -+ loaded_mm->context.ctx_id); -+ - if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) { -+ /* -+ * leave_mm() is adequate to handle any type of flush, and -+ * we would prefer not to receive further IPIs. leave_mm() -+ * clears this CPU's bit in mm_cpumask(). -+ */ - leave_mm(smp_processor_id()); - return; - } - -- if (f->end == TLB_FLUSH_ALL) { -- local_flush_tlb(); -- if (local) -- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); -- trace_tlb_flush(reason, TLB_FLUSH_ALL); -- } else { -+ if (unlikely(local_tlb_gen == mm_tlb_gen)) { -+ /* -+ * There's nothing to do: we're already up to date. This can -+ * happen if two concurrent flushes happen -- the first flush to -+ * be handled can catch us all the way up, leaving no work for -+ * the second flush. -+ */ -+ return; -+ } -+ -+ WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen); -+ WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen); -+ -+ /* -+ * If we get to this point, we know that our TLB is out of date. -+ * This does not strictly imply that we need to flush (it's -+ * possible that f->new_tlb_gen <= local_tlb_gen), but we're -+ * going to need to flush in the very near future, so we might -+ * as well get it over with. -+ * -+ * The only question is whether to do a full or partial flush. -+ * -+ * We do a partial flush if requested and two extra conditions -+ * are met: -+ * -+ * 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that -+ * we've always done all needed flushes to catch up to -+ * local_tlb_gen. If, for example, local_tlb_gen == 2 and -+ * f->new_tlb_gen == 3, then we know that the flush needed to bring -+ * us up to date for tlb_gen 3 is the partial flush we're -+ * processing. -+ * -+ * As an example of why this check is needed, suppose that there -+ * are two concurrent flushes. The first is a full flush that -+ * changes context.tlb_gen from 1 to 2. 
The second is a partial -+ * flush that changes context.tlb_gen from 2 to 3. If they get -+ * processed on this CPU in reverse order, we'll see -+ * local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL. -+ * If we were to use __flush_tlb_single() and set local_tlb_gen to -+ * 3, we'd be break the invariant: we'd update local_tlb_gen above -+ * 1 without the full flush that's needed for tlb_gen 2. -+ * -+ * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimiation. -+ * Partial TLB flushes are not all that much cheaper than full TLB -+ * flushes, so it seems unlikely that it would be a performance win -+ * to do a partial flush if that won't bring our TLB fully up to -+ * date. By doing a full flush instead, we can increase -+ * local_tlb_gen all the way to mm_tlb_gen and we can probably -+ * avoid another flush in the very near future. -+ */ -+ if (f->end != TLB_FLUSH_ALL && -+ f->new_tlb_gen == local_tlb_gen + 1 && -+ f->new_tlb_gen == mm_tlb_gen) { -+ /* Partial flush */ - unsigned long addr; - unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT; -+ - addr = f->start; - while (addr < f->end) { - __flush_tlb_single(addr); -@@ -182,7 +261,16 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, - if (local) - count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages); - trace_tlb_flush(reason, nr_pages); -+ } else { -+ /* Full flush. */ -+ local_flush_tlb(); -+ if (local) -+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); -+ trace_tlb_flush(reason, TLB_FLUSH_ALL); - } -+ -+ /* Both paths above update our state to mm_tlb_gen. */ -+ this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, mm_tlb_gen); - } - - static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason) -@@ -253,7 +341,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, - cpu = get_cpu(); - - /* This is also a barrier that synchronizes with switch_mm(). 
*/ -- inc_mm_tlb_gen(mm); -+ info.new_tlb_gen = inc_mm_tlb_gen(mm); - - /* Should we flush just the requested range? */ - if ((end != TLB_FLUSH_ALL) && --- -2.14.2 - diff --git a/patches/kernel/0043-x86-mm-Rework-lazy-TLB-mode-and-TLB-freshness-tracki.patch b/patches/kernel/0043-x86-mm-Rework-lazy-TLB-mode-and-TLB-freshness-tracki.patch deleted file mode 100644 index 70f93ef..0000000 --- a/patches/kernel/0043-x86-mm-Rework-lazy-TLB-mode-and-TLB-freshness-tracki.patch +++ /dev/null @@ -1,453 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Thu, 29 Jun 2017 08:53:17 -0700 -Subject: [PATCH] x86/mm: Rework lazy TLB mode and TLB freshness tracking -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -x86's lazy TLB mode used to be fairly weak -- it would switch to -init_mm the first time it tried to flush a lazy TLB. This meant an -unnecessary CR3 write and, if the flush was remote, an unnecessary -IPI. - -Rewrite it entirely. When we enter lazy mode, we simply remove the -CPU from mm_cpumask. This means that we need a way to figure out -whether we've missed a flush when we switch back out of lazy mode. -I use the tlb_gen machinery to track whether a context is up to -date. - -Note to reviewers: this patch, my itself, looks a bit odd. I'm -using an array of length 1 containing (ctx_id, tlb_gen) rather than -just storing tlb_gen, and making it at array isn't necessary yet. -I'm doing this because the next few patches add PCID support, and, -with PCID, we need ctx_id, and the array will end up with a length -greater than 1. Making it an array now means that there will be -less churn and therefore less stress on your eyeballs. - -NB: This is dubious but, AFAICT, still correct on Xen and UV. -xen_exit_mmap() uses mm_cpumask() for nefarious purposes and this -patch changes the way that mm_cpumask() works. 
This should be okay, -since Xen *also* iterates all online CPUs to find all the CPUs it -needs to twiddle. - -The UV tlbflush code is rather dated and should be changed. - -Here are some benchmark results, done on a Skylake laptop at 2.3 GHz -(turbo off, intel_pstate requesting max performance) under KVM with -the guest using idle=poll (to avoid artifacts when bouncing between -CPUs). I haven't done any real statistics here -- I just ran them -in a loop and picked the fastest results that didn't look like -outliers. Unpatched means commit a4eb8b993554, so all the -bookkeeping overhead is gone. - -MADV_DONTNEED; touch the page; switch CPUs using sched_setaffinity. In -an unpatched kernel, MADV_DONTNEED will send an IPI to the previous CPU. -This is intended to be a nearly worst-case test. - - patched: 13.4µs - unpatched: 21.6µs - -Vitaly's pthread_mmap microbenchmark with 8 threads (on four cores), -nrounds = 100, 256M data - - patched: 1.1 seconds or so - unpatched: 1.9 seconds or so - -The sleepup on Vitaly's test appearss to be because it spends a lot -of time blocked on mmap_sem, and this patch avoids sending IPIs to -blocked CPUs. 
- -Signed-off-by: Andy Lutomirski -Reviewed-by: Nadav Amit -Reviewed-by: Thomas Gleixner -Cc: Andrew Banman -Cc: Andrew Morton -Cc: Arjan van de Ven -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Dave Hansen -Cc: Dimitri Sivanich -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Mel Gorman -Cc: Mike Travis -Cc: Peter Zijlstra -Cc: Rik van Riel -Cc: linux-mm@kvack.org -Link: http://lkml.kernel.org/r/ddf2c92962339f4ba39d8fc41b853936ec0b44f1.1498751203.git.luto@kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit 94b1b03b519b81c494900cb112aa00ed205cc2d9) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit b381b7ae452f2bc6384507a897247be7c93a71cc) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/mmu_context.h | 6 +- - arch/x86/include/asm/tlbflush.h | 4 - - arch/x86/mm/init.c | 1 - - arch/x86/mm/tlb.c | 197 ++++++++++++++++++++++--------------- - arch/x86/xen/mmu_pv.c | 5 +- - 5 files changed, 124 insertions(+), 89 deletions(-) - -diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h -index 6c05679c715b..d6b055b328f2 100644 ---- a/arch/x86/include/asm/mmu_context.h -+++ b/arch/x86/include/asm/mmu_context.h -@@ -128,8 +128,10 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next) - - static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) - { -- if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) -- this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY); -+ int cpu = smp_processor_id(); -+ -+ if (cpumask_test_cpu(cpu, mm_cpumask(mm))) -+ cpumask_clear_cpu(cpu, mm_cpumask(mm)); - } - - static inline int init_new_context(struct task_struct *tsk, -diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h -index 3a167c214560..6397275008db 100644 ---- a/arch/x86/include/asm/tlbflush.h -+++ b/arch/x86/include/asm/tlbflush.h -@@ -95,7 +95,6 @@ struct tlb_state { - * mode even if we've already switched back to 
swapper_pg_dir. - */ - struct mm_struct *loaded_mm; -- int state; - - /* - * Access to this CR4 shadow and to H/W CR4 is protected by -@@ -318,9 +317,6 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) - void native_flush_tlb_others(const struct cpumask *cpumask, - const struct flush_tlb_info *info); - --#define TLBSTATE_OK 1 --#define TLBSTATE_LAZY 2 -- - static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, - struct mm_struct *mm) - { -diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c -index df2624b091a7..c86dc071bb10 100644 ---- a/arch/x86/mm/init.c -+++ b/arch/x86/mm/init.c -@@ -849,7 +849,6 @@ void __init zone_sizes_init(void) - - DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { - .loaded_mm = &init_mm, -- .state = 0, - .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ - }; - EXPORT_SYMBOL_GPL(cpu_tlbstate); -diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c -index 4e5a5ddb9e4d..0982c997d36f 100644 ---- a/arch/x86/mm/tlb.c -+++ b/arch/x86/mm/tlb.c -@@ -45,8 +45,8 @@ void leave_mm(int cpu) - if (loaded_mm == &init_mm) - return; - -- if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) -- BUG(); -+ /* Warn if we're not lazy. */ -+ WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))); - - switch_mm(NULL, &init_mm, NULL); - } -@@ -65,94 +65,117 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, - void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, - struct task_struct *tsk) - { -- unsigned cpu = smp_processor_id(); - struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); -+ unsigned cpu = smp_processor_id(); -+ u64 next_tlb_gen; - - /* -- * NB: The scheduler will call us with prev == next when -- * switching from lazy TLB mode to normal mode if active_mm -- * isn't changing. When this happens, there is no guarantee -- * that CR3 (and hence cpu_tlbstate.loaded_mm) matches next. 
-+ * NB: The scheduler will call us with prev == next when switching -+ * from lazy TLB mode to normal mode if active_mm isn't changing. -+ * When this happens, we don't assume that CR3 (and hence -+ * cpu_tlbstate.loaded_mm) matches next. - * - * NB: leave_mm() calls us with prev == NULL and tsk == NULL. - */ - -- this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); -+ /* We don't want flush_tlb_func_* to run concurrently with us. */ -+ if (IS_ENABLED(CONFIG_PROVE_LOCKING)) -+ WARN_ON_ONCE(!irqs_disabled()); -+ -+ /* -+ * Verify that CR3 is what we think it is. This will catch -+ * hypothetical buggy code that directly switches to swapper_pg_dir -+ * without going through leave_mm() / switch_mm_irqs_off(). -+ */ -+ VM_BUG_ON(read_cr3_pa() != __pa(real_prev->pgd)); - - if (real_prev == next) { -- /* -- * There's nothing to do: we always keep the per-mm control -- * regs in sync with cpu_tlbstate.loaded_mm. Just -- * sanity-check mm_cpumask. -- */ -- if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(next)))) -- cpumask_set_cpu(cpu, mm_cpumask(next)); -- return; -- } -+ VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) != -+ next->context.ctx_id); -+ -+ if (cpumask_test_cpu(cpu, mm_cpumask(next))) { -+ /* -+ * There's nothing to do: we weren't lazy, and we -+ * aren't changing our mm. We don't need to flush -+ * anything, nor do we need to update CR3, CR4, or -+ * LDTR. -+ */ -+ return; -+ } -+ -+ /* Resume remote flushes and then read tlb_gen. */ -+ cpumask_set_cpu(cpu, mm_cpumask(next)); -+ next_tlb_gen = atomic64_read(&next->context.tlb_gen); -+ -+ if (this_cpu_read(cpu_tlbstate.ctxs[0].tlb_gen) < next_tlb_gen) { -+ /* -+ * Ideally, we'd have a flush_tlb() variant that -+ * takes the known CR3 value as input. This would -+ * be faster on Xen PV and on hypothetical CPUs -+ * on which INVPCID is fast. 
-+ */ -+ this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, -+ next_tlb_gen); -+ write_cr3(__pa(next->pgd)); -+ -+ /* -+ * This gets called via leave_mm() in the idle path -+ * where RCU functions differently. Tracing normally -+ * uses RCU, so we have to call the tracepoint -+ * specially here. -+ */ -+ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, -+ TLB_FLUSH_ALL); -+ } - -- if (IS_ENABLED(CONFIG_VMAP_STACK)) { - /* -- * If our current stack is in vmalloc space and isn't -- * mapped in the new pgd, we'll double-fault. Forcibly -- * map it. -+ * We just exited lazy mode, which means that CR4 and/or LDTR -+ * may be stale. (Changes to the required CR4 and LDTR states -+ * are not reflected in tlb_gen.) - */ -- unsigned int stack_pgd_index = pgd_index(current_stack_pointer()); -- -- pgd_t *pgd = next->pgd + stack_pgd_index; -- -- if (unlikely(pgd_none(*pgd))) -- set_pgd(pgd, init_mm.pgd[stack_pgd_index]); -- } -+ } else { -+ VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) == -+ next->context.ctx_id); -+ -+ if (IS_ENABLED(CONFIG_VMAP_STACK)) { -+ /* -+ * If our current stack is in vmalloc space and isn't -+ * mapped in the new pgd, we'll double-fault. Forcibly -+ * map it. -+ */ -+ unsigned int index = pgd_index(current_stack_pointer()); -+ pgd_t *pgd = next->pgd + index; -+ -+ if (unlikely(pgd_none(*pgd))) -+ set_pgd(pgd, init_mm.pgd[index]); -+ } - -- this_cpu_write(cpu_tlbstate.loaded_mm, next); -- this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id); -- this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, atomic64_read(&next->context.tlb_gen)); -+ /* Stop remote flushes for the previous mm */ -+ if (cpumask_test_cpu(cpu, mm_cpumask(real_prev))) -+ cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); - -- WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next))); -- cpumask_set_cpu(cpu, mm_cpumask(next)); -+ VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next))); - -- /* -- * Re-load page tables. 
-- * -- * This logic has an ordering constraint: -- * -- * CPU 0: Write to a PTE for 'next' -- * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI. -- * CPU 1: set bit 1 in next's mm_cpumask -- * CPU 1: load from the PTE that CPU 0 writes (implicit) -- * -- * We need to prevent an outcome in which CPU 1 observes -- * the new PTE value and CPU 0 observes bit 1 clear in -- * mm_cpumask. (If that occurs, then the IPI will never -- * be sent, and CPU 0's TLB will contain a stale entry.) -- * -- * The bad outcome can occur if either CPU's load is -- * reordered before that CPU's store, so both CPUs must -- * execute full barriers to prevent this from happening. -- * -- * Thus, switch_mm needs a full barrier between the -- * store to mm_cpumask and any operation that could load -- * from next->pgd. TLB fills are special and can happen -- * due to instruction fetches or for no reason at all, -- * and neither LOCK nor MFENCE orders them. -- * Fortunately, load_cr3() is serializing and gives the -- * ordering guarantee we need. -- */ -- load_cr3(next->pgd); -+ /* -+ * Start remote flushes and then read tlb_gen. -+ */ -+ cpumask_set_cpu(cpu, mm_cpumask(next)); -+ next_tlb_gen = atomic64_read(&next->context.tlb_gen); - -- /* -- * This gets called via leave_mm() in the idle path where RCU -- * functions differently. Tracing normally uses RCU, so we have to -- * call the tracepoint specially here. -- */ -- trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); -+ this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id); -+ this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, next_tlb_gen); -+ this_cpu_write(cpu_tlbstate.loaded_mm, next); -+ write_cr3(__pa(next->pgd)); - -- /* Stop flush ipis for the previous mm */ -- WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) && -- real_prev != &init_mm); -- cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); -+ /* -+ * This gets called via leave_mm() in the idle path where RCU -+ * functions differently. 
Tracing normally uses RCU, so we -+ * have to call the tracepoint specially here. -+ */ -+ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, -+ TLB_FLUSH_ALL); -+ } - -- /* Load per-mm CR4 and LDTR state */ - load_mm_cr4(next); - switch_ldt(real_prev, next); - } -@@ -186,13 +209,13 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, - VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) != - loaded_mm->context.ctx_id); - -- if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) { -+ if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) { - /* -- * leave_mm() is adequate to handle any type of flush, and -- * we would prefer not to receive further IPIs. leave_mm() -- * clears this CPU's bit in mm_cpumask(). -+ * We're in lazy mode -- don't flush. We can get here on -+ * remote flushes due to races and on local flushes if a -+ * kernel thread coincidentally flushes the mm it's lazily -+ * still using. - */ -- leave_mm(smp_processor_id()); - return; - } - -@@ -203,6 +226,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, - * be handled can catch us all the way up, leaving no work for - * the second flush. - */ -+ trace_tlb_flush(reason, 0); - return; - } - -@@ -304,6 +328,21 @@ void native_flush_tlb_others(const struct cpumask *cpumask, - (info->end - info->start) >> PAGE_SHIFT); - - if (is_uv_system()) { -+ /* -+ * This whole special case is confused. UV has a "Broadcast -+ * Assist Unit", which seems to be a fancy way to send IPIs. -+ * Back when x86 used an explicit TLB flush IPI, UV was -+ * optimized to use its own mechanism. These days, x86 uses -+ * smp_call_function_many(), but UV still uses a manual IPI, -+ * and that IPI's action is out of date -- it does a manual -+ * flush instead of calling flush_tlb_func_remote(). This -+ * means that the percpu tlb_gen variables won't be updated -+ * and we'll do pointless flushes on future context switches. 
-+ * -+ * Rather than hooking native_flush_tlb_others() here, I think -+ * that UV should be updated so that smp_call_function_many(), -+ * etc, are optimal on UV. -+ */ - unsigned int cpu; - - cpu = smp_processor_id(); -@@ -363,6 +402,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, - - if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) - flush_tlb_others(mm_cpumask(mm), &info); -+ - put_cpu(); - } - -@@ -371,8 +411,6 @@ static void do_flush_tlb_all(void *info) - { - count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); - __flush_tlb_all(); -- if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) -- leave_mm(smp_processor_id()); - } - - void flush_tlb_all(void) -@@ -425,6 +463,7 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) - - if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) - flush_tlb_others(&batch->cpumask, &info); -+ - cpumask_clear(&batch->cpumask); - - put_cpu(); -diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c -index 5f61b7e2e6b2..ba76f3ce997f 100644 ---- a/arch/x86/xen/mmu_pv.c -+++ b/arch/x86/xen/mmu_pv.c -@@ -1005,14 +1005,12 @@ static void xen_drop_mm_ref(struct mm_struct *mm) - /* Get the "official" set of cpus referring to our pagetable. */ - if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { - for_each_online_cpu(cpu) { -- if (!cpumask_test_cpu(cpu, mm_cpumask(mm)) -- && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) -+ if (per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) - continue; - smp_call_function_single(cpu, drop_mm_ref_this_cpu, mm, 1); - } - return; - } -- cpumask_copy(mask, mm_cpumask(mm)); - - /* - * It's possible that a vcpu may have a stale reference to our -@@ -1021,6 +1019,7 @@ static void xen_drop_mm_ref(struct mm_struct *mm) - * look at its actual current cr3 value, and force it to flush - * if needed. 
- */ -+ cpumask_clear(mask); - for_each_online_cpu(cpu) { - if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) - cpumask_set_cpu(cpu, mask); --- -2.14.2 - diff --git a/patches/kernel/0043-x86-mm-Track-the-TLB-s-tlb_gen-and-update-the-flushi.patch b/patches/kernel/0043-x86-mm-Track-the-TLB-s-tlb_gen-and-update-the-flushi.patch new file mode 100644 index 0000000..2630f26 --- /dev/null +++ b/patches/kernel/0043-x86-mm-Track-the-TLB-s-tlb_gen-and-update-the-flushi.patch @@ -0,0 +1,279 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Thu, 29 Jun 2017 08:53:16 -0700 +Subject: [PATCH] x86/mm: Track the TLB's tlb_gen and update the flushing + algorithm +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +There are two kernel features that would benefit from tracking +how up-to-date each CPU's TLB is in the case where IPIs aren't keeping +it up to date in real time: + + - Lazy mm switching currently works by switching to init_mm when + it would otherwise flush. This is wasteful: there isn't fundamentally + any need to update CR3 at all when going lazy or when returning from + lazy mode, nor is there any need to receive flush IPIs at all. Instead, + we should just stop trying to keep the TLB coherent when we go lazy and, + when unlazying, check whether we missed any flushes. + + - PCID will let us keep recent user contexts alive in the TLB. If we + start doing this, we need a way to decide whether those contexts are + up to date. + +On some paravirt systems, remote TLBs can be flushed without IPIs. +This won't update the target CPUs' tlb_gens, which may cause +unnecessary local flushes later on. We can address this if it becomes +a problem by carefully updating the target CPU's tlb_gen directly. + +By itself, this patch is a very minor optimization that avoids +unnecessary flushes when multiple TLB flushes targetting the same CPU +race. 
The complexity in this patch would not be worth it on its own, +but it will enable improved lazy TLB tracking and PCID. + +Signed-off-by: Andy Lutomirski +Reviewed-by: Nadav Amit +Reviewed-by: Thomas Gleixner +Cc: Andrew Morton +Cc: Arjan van de Ven +Cc: Borislav Petkov +Cc: Dave Hansen +Cc: Linus Torvalds +Cc: Mel Gorman +Cc: Peter Zijlstra +Cc: Rik van Riel +Cc: linux-mm@kvack.org +Link: http://lkml.kernel.org/r/1210fb244bc9cbe7677f7f0b72db4d359675f24b.1498751203.git.luto@kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit b0579ade7cd82391360e959cc844e50a160e8a96) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit d34881c25f3c70228ed792fd62881185a25c4422) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/tlbflush.h | 43 +++++++++++++++-- + arch/x86/mm/tlb.c | 102 +++++++++++++++++++++++++++++++++++++--- + 2 files changed, 135 insertions(+), 10 deletions(-) + +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index f1f2e73b7b77..3a167c214560 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -82,6 +82,11 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) + #define __flush_tlb_single(addr) __native_flush_tlb_single(addr) + #endif + ++struct tlb_context { ++ u64 ctx_id; ++ u64 tlb_gen; ++}; ++ + struct tlb_state { + /* + * cpu_tlbstate.loaded_mm should match CR3 whenever interrupts +@@ -97,6 +102,21 @@ struct tlb_state { + * disabling interrupts when modifying either one. + */ + unsigned long cr4; ++ ++ /* ++ * This is a list of all contexts that might exist in the TLB. ++ * Since we don't yet use PCID, there is only one context. ++ * ++ * For each context, ctx_id indicates which mm the TLB's user ++ * entries came from. As an invariant, the TLB will never ++ * contain entries that are out-of-date as when that mm reached ++ * the tlb_gen in the list. 
++ * ++ * To be clear, this means that it's legal for the TLB code to ++ * flush the TLB without updating tlb_gen. This can happen ++ * (for now, at least) due to paravirt remote flushes. ++ */ ++ struct tlb_context ctxs[1]; + }; + DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); + +@@ -256,9 +276,26 @@ static inline void __flush_tlb_one(unsigned long addr) + * and page-granular flushes are available only on i486 and up. + */ + struct flush_tlb_info { +- struct mm_struct *mm; +- unsigned long start; +- unsigned long end; ++ /* ++ * We support several kinds of flushes. ++ * ++ * - Fully flush a single mm. .mm will be set, .end will be ++ * TLB_FLUSH_ALL, and .new_tlb_gen will be the tlb_gen to ++ * which the IPI sender is trying to catch us up. ++ * ++ * - Partially flush a single mm. .mm will be set, .start and ++ * .end will indicate the range, and .new_tlb_gen will be set ++ * such that the changes between generation .new_tlb_gen-1 and ++ * .new_tlb_gen are entirely contained in the indicated range. ++ * ++ * - Fully flush all mms whose tlb_gens have been updated. .mm ++ * will be NULL, .end will be TLB_FLUSH_ALL, and .new_tlb_gen ++ * will be zero. 
++ */ ++ struct mm_struct *mm; ++ unsigned long start; ++ unsigned long end; ++ u64 new_tlb_gen; + }; + + #define local_flush_tlb() __flush_tlb() +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c +index 14f4f8f66aa8..4e5a5ddb9e4d 100644 +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -105,6 +105,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + } + + this_cpu_write(cpu_tlbstate.loaded_mm, next); ++ this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id); ++ this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, atomic64_read(&next->context.tlb_gen)); + + WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next))); + cpumask_set_cpu(cpu, mm_cpumask(next)); +@@ -155,25 +157,102 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + switch_ldt(real_prev, next); + } + ++/* ++ * flush_tlb_func_common()'s memory ordering requirement is that any ++ * TLB fills that happen after we flush the TLB are ordered after we ++ * read active_mm's tlb_gen. We don't need any explicit barriers ++ * because all x86 flush operations are serializing and the ++ * atomic64_read operation won't be reordered by the compiler. ++ */ + static void flush_tlb_func_common(const struct flush_tlb_info *f, + bool local, enum tlb_flush_reason reason) + { ++ /* ++ * We have three different tlb_gen values in here. They are: ++ * ++ * - mm_tlb_gen: the latest generation. ++ * - local_tlb_gen: the generation that this CPU has already caught ++ * up to. ++ * - f->new_tlb_gen: the generation that the requester of the flush ++ * wants us to catch up to. ++ */ ++ struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); ++ u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen); ++ u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[0].tlb_gen); ++ + /* This code cannot presently handle being reentered. 
*/ + VM_WARN_ON(!irqs_disabled()); + ++ VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) != ++ loaded_mm->context.ctx_id); ++ + if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) { ++ /* ++ * leave_mm() is adequate to handle any type of flush, and ++ * we would prefer not to receive further IPIs. leave_mm() ++ * clears this CPU's bit in mm_cpumask(). ++ */ + leave_mm(smp_processor_id()); + return; + } + +- if (f->end == TLB_FLUSH_ALL) { +- local_flush_tlb(); +- if (local) +- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); +- trace_tlb_flush(reason, TLB_FLUSH_ALL); +- } else { ++ if (unlikely(local_tlb_gen == mm_tlb_gen)) { ++ /* ++ * There's nothing to do: we're already up to date. This can ++ * happen if two concurrent flushes happen -- the first flush to ++ * be handled can catch us all the way up, leaving no work for ++ * the second flush. ++ */ ++ return; ++ } ++ ++ WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen); ++ WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen); ++ ++ /* ++ * If we get to this point, we know that our TLB is out of date. ++ * This does not strictly imply that we need to flush (it's ++ * possible that f->new_tlb_gen <= local_tlb_gen), but we're ++ * going to need to flush in the very near future, so we might ++ * as well get it over with. ++ * ++ * The only question is whether to do a full or partial flush. ++ * ++ * We do a partial flush if requested and two extra conditions ++ * are met: ++ * ++ * 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that ++ * we've always done all needed flushes to catch up to ++ * local_tlb_gen. If, for example, local_tlb_gen == 2 and ++ * f->new_tlb_gen == 3, then we know that the flush needed to bring ++ * us up to date for tlb_gen 3 is the partial flush we're ++ * processing. ++ * ++ * As an example of why this check is needed, suppose that there ++ * are two concurrent flushes. The first is a full flush that ++ * changes context.tlb_gen from 1 to 2. 
The second is a partial ++ * flush that changes context.tlb_gen from 2 to 3. If they get ++ * processed on this CPU in reverse order, we'll see ++ * local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL. ++ * If we were to use __flush_tlb_single() and set local_tlb_gen to ++ * 3, we'd be break the invariant: we'd update local_tlb_gen above ++ * 1 without the full flush that's needed for tlb_gen 2. ++ * ++ * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimiation. ++ * Partial TLB flushes are not all that much cheaper than full TLB ++ * flushes, so it seems unlikely that it would be a performance win ++ * to do a partial flush if that won't bring our TLB fully up to ++ * date. By doing a full flush instead, we can increase ++ * local_tlb_gen all the way to mm_tlb_gen and we can probably ++ * avoid another flush in the very near future. ++ */ ++ if (f->end != TLB_FLUSH_ALL && ++ f->new_tlb_gen == local_tlb_gen + 1 && ++ f->new_tlb_gen == mm_tlb_gen) { ++ /* Partial flush */ + unsigned long addr; + unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT; ++ + addr = f->start; + while (addr < f->end) { + __flush_tlb_single(addr); +@@ -182,7 +261,16 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, + if (local) + count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages); + trace_tlb_flush(reason, nr_pages); ++ } else { ++ /* Full flush. */ ++ local_flush_tlb(); ++ if (local) ++ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); ++ trace_tlb_flush(reason, TLB_FLUSH_ALL); + } ++ ++ /* Both paths above update our state to mm_tlb_gen. */ ++ this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, mm_tlb_gen); + } + + static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason) +@@ -253,7 +341,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, + cpu = get_cpu(); + + /* This is also a barrier that synchronizes with switch_mm(). 
*/ +- inc_mm_tlb_gen(mm); ++ info.new_tlb_gen = inc_mm_tlb_gen(mm); + + /* Should we flush just the requested range? */ + if ((end != TLB_FLUSH_ALL) && +-- +2.14.2 + diff --git a/patches/kernel/0044-x86-mm-Implement-PCID-based-optimization-try-to-pres.patch b/patches/kernel/0044-x86-mm-Implement-PCID-based-optimization-try-to-pres.patch deleted file mode 100644 index 0b9df59..0000000 --- a/patches/kernel/0044-x86-mm-Implement-PCID-based-optimization-try-to-pres.patch +++ /dev/null @@ -1,340 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Mon, 24 Jul 2017 21:41:38 -0700 -Subject: [PATCH] x86/mm: Implement PCID based optimization: try to preserve - old TLB entries using PCID -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -PCID is a "process context ID" -- it's what other architectures call -an address space ID. Every non-global TLB entry is tagged with a -PCID, only TLB entries that match the currently selected PCID are -used, and we can switch PGDs without flushing the TLB. x86's -PCID is 12 bits. - -This is an unorthodox approach to using PCID. x86's PCID is far too -short to uniquely identify a process, and we can't even really -uniquely identify a running process because there are monster -systems with over 4096 CPUs. To make matters worse, past attempts -to use all 12 PCID bits have resulted in slowdowns instead of -speedups. - -This patch uses PCID differently. We use a PCID to identify a -recently-used mm on a per-cpu basis. An mm has no fixed PCID -binding at all; instead, we give it a fresh PCID each time it's -loaded except in cases where we want to preserve the TLB, in which -case we reuse a recent value. - -Here are some benchmark results, done on a Skylake laptop at 2.3 GHz -(turbo off, intel_pstate requesting max performance) under KVM with -the guest using idle=poll (to avoid artifacts when bouncing between -CPUs). 
I haven't done any real statistics here -- I just ran them -in a loop and picked the fastest results that didn't look like -outliers. Unpatched means commit a4eb8b993554, so all the -bookkeeping overhead is gone. - -ping-pong between two mms on the same CPU using eventfd: - - patched: 1.22µs - patched, nopcid: 1.33µs - unpatched: 1.34µs - -Same ping-pong, but now touch 512 pages (all zero-page to minimize -cache misses) each iteration. dTLB misses are measured by -dtlb_load_misses.miss_causes_a_walk: - - patched: 1.8µs 11M dTLB misses - patched, nopcid: 6.2µs, 207M dTLB misses - unpatched: 6.1µs, 190M dTLB misses - -Signed-off-by: Andy Lutomirski -Reviewed-by: Nadav Amit -Cc: Andrew Morton -Cc: Arjan van de Ven -Cc: Borislav Petkov -Cc: Dave Hansen -Cc: Linus Torvalds -Cc: Mel Gorman -Cc: Peter Zijlstra -Cc: Rik van Riel -Cc: Thomas Gleixner -Cc: linux-mm@kvack.org -Link: http://lkml.kernel.org/r/9ee75f17a81770feed616358e6860d98a2a5b1e7.1500957502.git.luto@kernel.org -Signed-off-by: Ingo Molnar -(backported from commit 10af6235e0d327d42e1bad974385197817923dc1) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit d833a976288cdcf7fb1dabb48ebf614ebf6a311c) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/mmu_context.h | 3 ++ - arch/x86/include/asm/processor-flags.h | 2 + - arch/x86/include/asm/tlbflush.h | 18 +++++++- - arch/x86/mm/init.c | 1 + - arch/x86/mm/tlb.c | 84 +++++++++++++++++++++++++--------- - 5 files changed, 85 insertions(+), 23 deletions(-) - -diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h -index d6b055b328f2..7ae318c340d9 100644 ---- a/arch/x86/include/asm/mmu_context.h -+++ b/arch/x86/include/asm/mmu_context.h -@@ -298,6 +298,9 @@ static inline unsigned long __get_current_cr3_fast(void) - { - unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd); - -+ if (static_cpu_has(X86_FEATURE_PCID)) -+ cr3 |= 
this_cpu_read(cpu_tlbstate.loaded_mm_asid); -+ - /* For now, be very restrictive about when this can be called. */ - VM_WARN_ON(in_nmi() || preemptible()); - -diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h -index 79aa2f98398d..791b60199aa4 100644 ---- a/arch/x86/include/asm/processor-flags.h -+++ b/arch/x86/include/asm/processor-flags.h -@@ -35,6 +35,7 @@ - /* Mask off the address space ID bits. */ - #define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull - #define CR3_PCID_MASK 0xFFFull -+#define CR3_NOFLUSH (1UL << 63) - #else - /* - * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save -@@ -42,6 +43,7 @@ - */ - #define CR3_ADDR_MASK 0xFFFFFFFFull - #define CR3_PCID_MASK 0ull -+#define CR3_NOFLUSH 0 - #endif - - #endif /* _ASM_X86_PROCESSOR_FLAGS_H */ -diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h -index 6397275008db..d23e61dc0640 100644 ---- a/arch/x86/include/asm/tlbflush.h -+++ b/arch/x86/include/asm/tlbflush.h -@@ -82,6 +82,12 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) - #define __flush_tlb_single(addr) __native_flush_tlb_single(addr) - #endif - -+/* -+ * 6 because 6 should be plenty and struct tlb_state will fit in -+ * two cache lines. -+ */ -+#define TLB_NR_DYN_ASIDS 6 -+ - struct tlb_context { - u64 ctx_id; - u64 tlb_gen; -@@ -95,6 +101,8 @@ struct tlb_state { - * mode even if we've already switched back to swapper_pg_dir. - */ - struct mm_struct *loaded_mm; -+ u16 loaded_mm_asid; -+ u16 next_asid; - - /* - * Access to this CR4 shadow and to H/W CR4 is protected by -@@ -104,7 +112,8 @@ struct tlb_state { - - /* - * This is a list of all contexts that might exist in the TLB. -- * Since we don't yet use PCID, there is only one context. -+ * There is one per ASID that we use, and the ASID (what the -+ * CPU calls PCID) is the index into ctxts. - * - * For each context, ctx_id indicates which mm the TLB's user - * entries came from. 
As an invariant, the TLB will never -@@ -114,8 +123,13 @@ struct tlb_state { - * To be clear, this means that it's legal for the TLB code to - * flush the TLB without updating tlb_gen. This can happen - * (for now, at least) due to paravirt remote flushes. -+ * -+ * NB: context 0 is a bit special, since it's also used by -+ * various bits of init code. This is fine -- code that -+ * isn't aware of PCID will end up harmlessly flushing -+ * context 0. - */ -- struct tlb_context ctxs[1]; -+ struct tlb_context ctxs[TLB_NR_DYN_ASIDS]; - }; - DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); - -diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c -index c86dc071bb10..af5c1ed21d43 100644 ---- a/arch/x86/mm/init.c -+++ b/arch/x86/mm/init.c -@@ -849,6 +849,7 @@ void __init zone_sizes_init(void) - - DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { - .loaded_mm = &init_mm, -+ .next_asid = 1, - .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ - }; - EXPORT_SYMBOL_GPL(cpu_tlbstate); -diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c -index 0982c997d36f..57943b4d8f2e 100644 ---- a/arch/x86/mm/tlb.c -+++ b/arch/x86/mm/tlb.c -@@ -30,6 +30,40 @@ - - atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); - -+static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, -+ u16 *new_asid, bool *need_flush) -+{ -+ u16 asid; -+ -+ if (!static_cpu_has(X86_FEATURE_PCID)) { -+ *new_asid = 0; -+ *need_flush = true; -+ return; -+ } -+ -+ for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) { -+ if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) != -+ next->context.ctx_id) -+ continue; -+ -+ *new_asid = asid; -+ *need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) < -+ next_tlb_gen); -+ return; -+ } -+ -+ /* -+ * We don't currently own an ASID slot on this CPU. -+ * Allocate a slot. 
-+ */ -+ *new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1; -+ if (*new_asid >= TLB_NR_DYN_ASIDS) { -+ *new_asid = 0; -+ this_cpu_write(cpu_tlbstate.next_asid, 1); -+ } -+ *need_flush = true; -+} -+ - void leave_mm(int cpu) - { - struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); -@@ -66,6 +100,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, - struct task_struct *tsk) - { - struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); -+ u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); - unsigned cpu = smp_processor_id(); - u64 next_tlb_gen; - -@@ -85,12 +120,13 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, - /* - * Verify that CR3 is what we think it is. This will catch - * hypothetical buggy code that directly switches to swapper_pg_dir -- * without going through leave_mm() / switch_mm_irqs_off(). -+ * without going through leave_mm() / switch_mm_irqs_off() or that -+ * does something like write_cr3(read_cr3_pa()). - */ -- VM_BUG_ON(read_cr3_pa() != __pa(real_prev->pgd)); -+ VM_BUG_ON(__read_cr3() != (__sme_pa(real_prev->pgd) | prev_asid)); - - if (real_prev == next) { -- VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) != -+ VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != - next->context.ctx_id); - - if (cpumask_test_cpu(cpu, mm_cpumask(next))) { -@@ -107,16 +143,17 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, - cpumask_set_cpu(cpu, mm_cpumask(next)); - next_tlb_gen = atomic64_read(&next->context.tlb_gen); - -- if (this_cpu_read(cpu_tlbstate.ctxs[0].tlb_gen) < next_tlb_gen) { -+ if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) < -+ next_tlb_gen) { - /* - * Ideally, we'd have a flush_tlb() variant that - * takes the known CR3 value as input. This would - * be faster on Xen PV and on hypothetical CPUs - * on which INVPCID is fast. 
- */ -- this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, -+ this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen, - next_tlb_gen); -- write_cr3(__pa(next->pgd)); -+ write_cr3(__pa(next->pgd) | prev_asid); - - /* - * This gets called via leave_mm() in the idle path -@@ -134,8 +171,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, - * are not reflected in tlb_gen.) - */ - } else { -- VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) == -- next->context.ctx_id); -+ u16 new_asid; -+ bool need_flush; - - if (IS_ENABLED(CONFIG_VMAP_STACK)) { - /* -@@ -162,18 +199,22 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, - cpumask_set_cpu(cpu, mm_cpumask(next)); - next_tlb_gen = atomic64_read(&next->context.tlb_gen); - -- this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id); -- this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, next_tlb_gen); -- this_cpu_write(cpu_tlbstate.loaded_mm, next); -- write_cr3(__pa(next->pgd)); -+ choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); - -- /* -- * This gets called via leave_mm() in the idle path where RCU -- * functions differently. Tracing normally uses RCU, so we -- * have to call the tracepoint specially here. -- */ -- trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, -+ if (need_flush) { -+ this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); -+ this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); -+ write_cr3(__pa(next->pgd) | new_asid); -+ trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, - TLB_FLUSH_ALL); -+ } else { -+ /* The new ASID is already up to date. */ -+ write_cr3(__sme_pa(next->pgd) | new_asid | CR3_NOFLUSH); -+ trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0); -+ } -+ -+ this_cpu_write(cpu_tlbstate.loaded_mm, next); -+ this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); - } - - load_mm_cr4(next); -@@ -200,13 +241,14 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, - * wants us to catch up to. 
- */ - struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); -+ u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); - u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen); -- u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[0].tlb_gen); -+ u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen); - - /* This code cannot presently handle being reentered. */ - VM_WARN_ON(!irqs_disabled()); - -- VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) != -+ VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) != - loaded_mm->context.ctx_id); - - if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) { -@@ -294,7 +336,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, - } - - /* Both paths above update our state to mm_tlb_gen. */ -- this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, mm_tlb_gen); -+ this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen); - } - - static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason) --- -2.14.2 - diff --git a/patches/kernel/0044-x86-mm-Rework-lazy-TLB-mode-and-TLB-freshness-tracki.patch b/patches/kernel/0044-x86-mm-Rework-lazy-TLB-mode-and-TLB-freshness-tracki.patch new file mode 100644 index 0000000..70f93ef --- /dev/null +++ b/patches/kernel/0044-x86-mm-Rework-lazy-TLB-mode-and-TLB-freshness-tracki.patch @@ -0,0 +1,453 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Thu, 29 Jun 2017 08:53:17 -0700 +Subject: [PATCH] x86/mm: Rework lazy TLB mode and TLB freshness tracking +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +x86's lazy TLB mode used to be fairly weak -- it would switch to +init_mm the first time it tried to flush a lazy TLB. This meant an +unnecessary CR3 write and, if the flush was remote, an unnecessary +IPI. + +Rewrite it entirely. 
When we enter lazy mode, we simply remove the +CPU from mm_cpumask. This means that we need a way to figure out +whether we've missed a flush when we switch back out of lazy mode. +I use the tlb_gen machinery to track whether a context is up to +date. + +Note to reviewers: this patch, by itself, looks a bit odd. I'm +using an array of length 1 containing (ctx_id, tlb_gen) rather than +just storing tlb_gen, and making it an array isn't necessary yet. +I'm doing this because the next few patches add PCID support, and, +with PCID, we need ctx_id, and the array will end up with a length +greater than 1. Making it an array now means that there will be +less churn and therefore less stress on your eyeballs. + +NB: This is dubious but, AFAICT, still correct on Xen and UV. +xen_exit_mmap() uses mm_cpumask() for nefarious purposes and this +patch changes the way that mm_cpumask() works. This should be okay, +since Xen *also* iterates all online CPUs to find all the CPUs it +needs to twiddle. + +The UV tlbflush code is rather dated and should be changed. + +Here are some benchmark results, done on a Skylake laptop at 2.3 GHz +(turbo off, intel_pstate requesting max performance) under KVM with +the guest using idle=poll (to avoid artifacts when bouncing between +CPUs). I haven't done any real statistics here -- I just ran them +in a loop and picked the fastest results that didn't look like +outliers. Unpatched means commit a4eb8b993554, so all the +bookkeeping overhead is gone. + +MADV_DONTNEED; touch the page; switch CPUs using sched_setaffinity. In +an unpatched kernel, MADV_DONTNEED will send an IPI to the previous CPU. +This is intended to be a nearly worst-case test.
+ + patched: 13.4µs + unpatched: 21.6µs + +Vitaly's pthread_mmap microbenchmark with 8 threads (on four cores), +nrounds = 100, 256M data + + patched: 1.1 seconds or so + unpatched: 1.9 seconds or so + +The speedup on Vitaly's test appears to be because it spends a lot +of time blocked on mmap_sem, and this patch avoids sending IPIs to +blocked CPUs. + +Signed-off-by: Andy Lutomirski +Reviewed-by: Nadav Amit +Reviewed-by: Thomas Gleixner +Cc: Andrew Banman +Cc: Andrew Morton +Cc: Arjan van de Ven +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Dave Hansen +Cc: Dimitri Sivanich +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Mel Gorman +Cc: Mike Travis +Cc: Peter Zijlstra +Cc: Rik van Riel +Cc: linux-mm@kvack.org +Link: http://lkml.kernel.org/r/ddf2c92962339f4ba39d8fc41b853936ec0b44f1.1498751203.git.luto@kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit 94b1b03b519b81c494900cb112aa00ed205cc2d9) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit b381b7ae452f2bc6384507a897247be7c93a71cc) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/mmu_context.h | 6 +- + arch/x86/include/asm/tlbflush.h | 4 - + arch/x86/mm/init.c | 1 - + arch/x86/mm/tlb.c | 197 ++++++++++++++++++++++--------------- + arch/x86/xen/mmu_pv.c | 5 +- + 5 files changed, 124 insertions(+), 89 deletions(-) + +diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h +index 6c05679c715b..d6b055b328f2 100644 +--- a/arch/x86/include/asm/mmu_context.h ++++ b/arch/x86/include/asm/mmu_context.h +@@ -128,8 +128,10 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next) + + static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) + { +- if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) +- this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY); ++ int cpu = smp_processor_id(); ++ ++ if (cpumask_test_cpu(cpu, mm_cpumask(mm))) ++ cpumask_clear_cpu(cpu,
mm_cpumask(mm)); + } + + static inline int init_new_context(struct task_struct *tsk, +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index 3a167c214560..6397275008db 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -95,7 +95,6 @@ struct tlb_state { + * mode even if we've already switched back to swapper_pg_dir. + */ + struct mm_struct *loaded_mm; +- int state; + + /* + * Access to this CR4 shadow and to H/W CR4 is protected by +@@ -318,9 +317,6 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) + void native_flush_tlb_others(const struct cpumask *cpumask, + const struct flush_tlb_info *info); + +-#define TLBSTATE_OK 1 +-#define TLBSTATE_LAZY 2 +- + static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, + struct mm_struct *mm) + { +diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c +index df2624b091a7..c86dc071bb10 100644 +--- a/arch/x86/mm/init.c ++++ b/arch/x86/mm/init.c +@@ -849,7 +849,6 @@ void __init zone_sizes_init(void) + + DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { + .loaded_mm = &init_mm, +- .state = 0, + .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ + }; + EXPORT_SYMBOL_GPL(cpu_tlbstate); +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c +index 4e5a5ddb9e4d..0982c997d36f 100644 +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -45,8 +45,8 @@ void leave_mm(int cpu) + if (loaded_mm == &init_mm) + return; + +- if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) +- BUG(); ++ /* Warn if we're not lazy. 
*/ ++ WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))); + + switch_mm(NULL, &init_mm, NULL); + } +@@ -65,94 +65,117 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, + void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) + { +- unsigned cpu = smp_processor_id(); + struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); ++ unsigned cpu = smp_processor_id(); ++ u64 next_tlb_gen; + + /* +- * NB: The scheduler will call us with prev == next when +- * switching from lazy TLB mode to normal mode if active_mm +- * isn't changing. When this happens, there is no guarantee +- * that CR3 (and hence cpu_tlbstate.loaded_mm) matches next. ++ * NB: The scheduler will call us with prev == next when switching ++ * from lazy TLB mode to normal mode if active_mm isn't changing. ++ * When this happens, we don't assume that CR3 (and hence ++ * cpu_tlbstate.loaded_mm) matches next. + * + * NB: leave_mm() calls us with prev == NULL and tsk == NULL. + */ + +- this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); ++ /* We don't want flush_tlb_func_* to run concurrently with us. */ ++ if (IS_ENABLED(CONFIG_PROVE_LOCKING)) ++ WARN_ON_ONCE(!irqs_disabled()); ++ ++ /* ++ * Verify that CR3 is what we think it is. This will catch ++ * hypothetical buggy code that directly switches to swapper_pg_dir ++ * without going through leave_mm() / switch_mm_irqs_off(). ++ */ ++ VM_BUG_ON(read_cr3_pa() != __pa(real_prev->pgd)); + + if (real_prev == next) { +- /* +- * There's nothing to do: we always keep the per-mm control +- * regs in sync with cpu_tlbstate.loaded_mm. Just +- * sanity-check mm_cpumask. 
+- */ +- if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(next)))) +- cpumask_set_cpu(cpu, mm_cpumask(next)); +- return; +- } ++ VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) != ++ next->context.ctx_id); ++ ++ if (cpumask_test_cpu(cpu, mm_cpumask(next))) { ++ /* ++ * There's nothing to do: we weren't lazy, and we ++ * aren't changing our mm. We don't need to flush ++ * anything, nor do we need to update CR3, CR4, or ++ * LDTR. ++ */ ++ return; ++ } ++ ++ /* Resume remote flushes and then read tlb_gen. */ ++ cpumask_set_cpu(cpu, mm_cpumask(next)); ++ next_tlb_gen = atomic64_read(&next->context.tlb_gen); ++ ++ if (this_cpu_read(cpu_tlbstate.ctxs[0].tlb_gen) < next_tlb_gen) { ++ /* ++ * Ideally, we'd have a flush_tlb() variant that ++ * takes the known CR3 value as input. This would ++ * be faster on Xen PV and on hypothetical CPUs ++ * on which INVPCID is fast. ++ */ ++ this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, ++ next_tlb_gen); ++ write_cr3(__pa(next->pgd)); ++ ++ /* ++ * This gets called via leave_mm() in the idle path ++ * where RCU functions differently. Tracing normally ++ * uses RCU, so we have to call the tracepoint ++ * specially here. ++ */ ++ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, ++ TLB_FLUSH_ALL); ++ } + +- if (IS_ENABLED(CONFIG_VMAP_STACK)) { + /* +- * If our current stack is in vmalloc space and isn't +- * mapped in the new pgd, we'll double-fault. Forcibly +- * map it. ++ * We just exited lazy mode, which means that CR4 and/or LDTR ++ * may be stale. (Changes to the required CR4 and LDTR states ++ * are not reflected in tlb_gen.) 
+ */ +- unsigned int stack_pgd_index = pgd_index(current_stack_pointer()); +- +- pgd_t *pgd = next->pgd + stack_pgd_index; +- +- if (unlikely(pgd_none(*pgd))) +- set_pgd(pgd, init_mm.pgd[stack_pgd_index]); +- } ++ } else { ++ VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) == ++ next->context.ctx_id); ++ ++ if (IS_ENABLED(CONFIG_VMAP_STACK)) { ++ /* ++ * If our current stack is in vmalloc space and isn't ++ * mapped in the new pgd, we'll double-fault. Forcibly ++ * map it. ++ */ ++ unsigned int index = pgd_index(current_stack_pointer()); ++ pgd_t *pgd = next->pgd + index; ++ ++ if (unlikely(pgd_none(*pgd))) ++ set_pgd(pgd, init_mm.pgd[index]); ++ } + +- this_cpu_write(cpu_tlbstate.loaded_mm, next); +- this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id); +- this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, atomic64_read(&next->context.tlb_gen)); ++ /* Stop remote flushes for the previous mm */ ++ if (cpumask_test_cpu(cpu, mm_cpumask(real_prev))) ++ cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); + +- WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next))); +- cpumask_set_cpu(cpu, mm_cpumask(next)); ++ VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next))); + +- /* +- * Re-load page tables. +- * +- * This logic has an ordering constraint: +- * +- * CPU 0: Write to a PTE for 'next' +- * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI. +- * CPU 1: set bit 1 in next's mm_cpumask +- * CPU 1: load from the PTE that CPU 0 writes (implicit) +- * +- * We need to prevent an outcome in which CPU 1 observes +- * the new PTE value and CPU 0 observes bit 1 clear in +- * mm_cpumask. (If that occurs, then the IPI will never +- * be sent, and CPU 0's TLB will contain a stale entry.) +- * +- * The bad outcome can occur if either CPU's load is +- * reordered before that CPU's store, so both CPUs must +- * execute full barriers to prevent this from happening. 
+- * +- * Thus, switch_mm needs a full barrier between the +- * store to mm_cpumask and any operation that could load +- * from next->pgd. TLB fills are special and can happen +- * due to instruction fetches or for no reason at all, +- * and neither LOCK nor MFENCE orders them. +- * Fortunately, load_cr3() is serializing and gives the +- * ordering guarantee we need. +- */ +- load_cr3(next->pgd); ++ /* ++ * Start remote flushes and then read tlb_gen. ++ */ ++ cpumask_set_cpu(cpu, mm_cpumask(next)); ++ next_tlb_gen = atomic64_read(&next->context.tlb_gen); + +- /* +- * This gets called via leave_mm() in the idle path where RCU +- * functions differently. Tracing normally uses RCU, so we have to +- * call the tracepoint specially here. +- */ +- trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); ++ this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id); ++ this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, next_tlb_gen); ++ this_cpu_write(cpu_tlbstate.loaded_mm, next); ++ write_cr3(__pa(next->pgd)); + +- /* Stop flush ipis for the previous mm */ +- WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) && +- real_prev != &init_mm); +- cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); ++ /* ++ * This gets called via leave_mm() in the idle path where RCU ++ * functions differently. Tracing normally uses RCU, so we ++ * have to call the tracepoint specially here. 
++ */ ++ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, ++ TLB_FLUSH_ALL); ++ } + +- /* Load per-mm CR4 and LDTR state */ + load_mm_cr4(next); + switch_ldt(real_prev, next); + } +@@ -186,13 +209,13 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, + VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) != + loaded_mm->context.ctx_id); + +- if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) { ++ if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) { + /* +- * leave_mm() is adequate to handle any type of flush, and +- * we would prefer not to receive further IPIs. leave_mm() +- * clears this CPU's bit in mm_cpumask(). ++ * We're in lazy mode -- don't flush. We can get here on ++ * remote flushes due to races and on local flushes if a ++ * kernel thread coincidentally flushes the mm it's lazily ++ * still using. + */ +- leave_mm(smp_processor_id()); + return; + } + +@@ -203,6 +226,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, + * be handled can catch us all the way up, leaving no work for + * the second flush. + */ ++ trace_tlb_flush(reason, 0); + return; + } + +@@ -304,6 +328,21 @@ void native_flush_tlb_others(const struct cpumask *cpumask, + (info->end - info->start) >> PAGE_SHIFT); + + if (is_uv_system()) { ++ /* ++ * This whole special case is confused. UV has a "Broadcast ++ * Assist Unit", which seems to be a fancy way to send IPIs. ++ * Back when x86 used an explicit TLB flush IPI, UV was ++ * optimized to use its own mechanism. These days, x86 uses ++ * smp_call_function_many(), but UV still uses a manual IPI, ++ * and that IPI's action is out of date -- it does a manual ++ * flush instead of calling flush_tlb_func_remote(). This ++ * means that the percpu tlb_gen variables won't be updated ++ * and we'll do pointless flushes on future context switches. 
++ * ++ * Rather than hooking native_flush_tlb_others() here, I think ++ * that UV should be updated so that smp_call_function_many(), ++ * etc, are optimal on UV. ++ */ + unsigned int cpu; + + cpu = smp_processor_id(); +@@ -363,6 +402,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, + + if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) + flush_tlb_others(mm_cpumask(mm), &info); ++ + put_cpu(); + } + +@@ -371,8 +411,6 @@ static void do_flush_tlb_all(void *info) + { + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); + __flush_tlb_all(); +- if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) +- leave_mm(smp_processor_id()); + } + + void flush_tlb_all(void) +@@ -425,6 +463,7 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) + + if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) + flush_tlb_others(&batch->cpumask, &info); ++ + cpumask_clear(&batch->cpumask); + + put_cpu(); +diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c +index 5f61b7e2e6b2..ba76f3ce997f 100644 +--- a/arch/x86/xen/mmu_pv.c ++++ b/arch/x86/xen/mmu_pv.c +@@ -1005,14 +1005,12 @@ static void xen_drop_mm_ref(struct mm_struct *mm) + /* Get the "official" set of cpus referring to our pagetable. */ + if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { + for_each_online_cpu(cpu) { +- if (!cpumask_test_cpu(cpu, mm_cpumask(mm)) +- && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) ++ if (per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) + continue; + smp_call_function_single(cpu, drop_mm_ref_this_cpu, mm, 1); + } + return; + } +- cpumask_copy(mask, mm_cpumask(mm)); + + /* + * It's possible that a vcpu may have a stale reference to our +@@ -1021,6 +1019,7 @@ static void xen_drop_mm_ref(struct mm_struct *mm) + * look at its actual current cr3 value, and force it to flush + * if needed. 
+ */ ++ cpumask_clear(mask); + for_each_online_cpu(cpu) { + if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) + cpumask_set_cpu(cpu, mask); +-- +2.14.2 + diff --git a/patches/kernel/0045-x86-mm-Factor-out-CR3-building-code.patch b/patches/kernel/0045-x86-mm-Factor-out-CR3-building-code.patch deleted file mode 100644 index 01f7292..0000000 --- a/patches/kernel/0045-x86-mm-Factor-out-CR3-building-code.patch +++ /dev/null @@ -1,176 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Sun, 17 Sep 2017 09:03:48 -0700 -Subject: [PATCH] x86/mm: Factor out CR3-building code -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Current, the code that assembles a value to load into CR3 is -open-coded everywhere. Factor it out into helpers build_cr3() and -build_cr3_noflush(). - -This makes one semantic change: __get_current_cr3_fast() was wrong -on SME systems. No one noticed because the only caller is in the -VMX code, and there are no CPUs with both SME and VMX. 
- -Signed-off-by: Andy Lutomirski -Cc: Borislav Petkov -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Cc: Tom Lendacky -Link: http://lkml.kernel.org/r/ce350cf11e93e2842d14d0b95b0199c7d881f527.1505663533.git.luto@kernel.org -Signed-off-by: Ingo Molnar -(backported from commit 47061a24e2ee5bd8a40d473d47a5bd823fa0081f) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 72be211bac7be521f128d419d63cae38ba60ace8) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/mmu_context.h | 15 ++++++--- - arch/x86/mm/tlb.c | 68 +++++++++++++++++++++++++++++++++++--- - 2 files changed, 75 insertions(+), 8 deletions(-) - -diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h -index 7ae318c340d9..a999ba6b721f 100644 ---- a/arch/x86/include/asm/mmu_context.h -+++ b/arch/x86/include/asm/mmu_context.h -@@ -286,6 +286,15 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, - return __pkru_allows_pkey(vma_pkey(vma), write); - } - -+static inline unsigned long build_cr3(struct mm_struct *mm, u16 asid) -+{ -+ return __sme_pa(mm->pgd) | asid; -+} -+ -+static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid) -+{ -+ return __sme_pa(mm->pgd) | asid | CR3_NOFLUSH; -+} - - /* - * This can be used from process context to figure out what the value of -@@ -296,10 +305,8 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, - */ - static inline unsigned long __get_current_cr3_fast(void) - { -- unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd); -- -- if (static_cpu_has(X86_FEATURE_PCID)) -- cr3 |= this_cpu_read(cpu_tlbstate.loaded_mm_asid); -+ unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm), -+ this_cpu_read(cpu_tlbstate.loaded_mm_asid)); - - /* For now, be very restrictive about when this can be called. 
*/ - VM_WARN_ON(in_nmi() || preemptible()); -diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c -index 57943b4d8f2e..440400316c8a 100644 ---- a/arch/x86/mm/tlb.c -+++ b/arch/x86/mm/tlb.c -@@ -123,7 +123,23 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, - * without going through leave_mm() / switch_mm_irqs_off() or that - * does something like write_cr3(read_cr3_pa()). - */ -- VM_BUG_ON(__read_cr3() != (__sme_pa(real_prev->pgd) | prev_asid)); -+#ifdef CONFIG_DEBUG_VM -+ if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) { -+ /* -+ * If we were to BUG here, we'd be very likely to kill -+ * the system so hard that we don't see the call trace. -+ * Try to recover instead by ignoring the error and doing -+ * a global flush to minimize the chance of corruption. -+ * -+ * (This is far from being a fully correct recovery. -+ * Architecturally, the CPU could prefetch something -+ * back into an incorrect ASID slot and leave it there -+ * to cause trouble down the road. It's better than -+ * nothing, though.) -+ */ -+ __flush_tlb_all(); -+ } -+#endif - - if (real_prev == next) { - VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != -@@ -153,7 +169,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, - */ - this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen, - next_tlb_gen); -- write_cr3(__pa(next->pgd) | prev_asid); -+ write_cr3(build_cr3(next, prev_asid)); - - /* - * This gets called via leave_mm() in the idle path -@@ -204,12 +220,12 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, - if (need_flush) { - this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); - this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); -- write_cr3(__pa(next->pgd) | new_asid); -+ write_cr3(build_cr3(next, new_asid)); - trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, - TLB_FLUSH_ALL); - } else { - /* The new ASID is already up to date. 
*/ -- write_cr3(__sme_pa(next->pgd) | new_asid | CR3_NOFLUSH); -+ write_cr3(build_cr3_noflush(next, new_asid)); - trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0); - } - -@@ -221,6 +237,50 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, - switch_ldt(real_prev, next); - } - -+/* -+ * Call this when reinitializing a CPU. It fixes the following potential -+ * problems: -+ * -+ * - The ASID changed from what cpu_tlbstate thinks it is (most likely -+ * because the CPU was taken down and came back up with CR3's PCID -+ * bits clear. CPU hotplug can do this. -+ * -+ * - The TLB contains junk in slots corresponding to inactive ASIDs. -+ * -+ * - The CPU went so far out to lunch that it may have missed a TLB -+ * flush. -+ */ -+void initialize_tlbstate_and_flush(void) -+{ -+ int i; -+ struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm); -+ u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen); -+ unsigned long cr3 = __read_cr3(); -+ -+ /* Assert that CR3 already references the right mm. */ -+ WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd)); -+ -+ /* -+ * Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization -+ * doesn't work like other CR4 bits because it can only be set from -+ * long mode.) -+ */ -+ WARN_ON(boot_cpu_has(X86_FEATURE_PCID) && -+ !(cr4_read_shadow() & X86_CR4_PCIDE)); -+ -+ /* Force ASID 0 and force a TLB flush. */ -+ write_cr3(build_cr3(mm, 0)); -+ -+ /* Reinitialize tlbstate. 
*/ -+ this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0); -+ this_cpu_write(cpu_tlbstate.next_asid, 1); -+ this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id); -+ this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen); -+ -+ for (i = 1; i < TLB_NR_DYN_ASIDS; i++) -+ this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0); -+} -+ - /* - * flush_tlb_func_common()'s memory ordering requirement is that any - * TLB fills that happen after we flush the TLB are ordered after we --- -2.14.2 - diff --git a/patches/kernel/0045-x86-mm-Implement-PCID-based-optimization-try-to-pres.patch b/patches/kernel/0045-x86-mm-Implement-PCID-based-optimization-try-to-pres.patch new file mode 100644 index 0000000..0b9df59 --- /dev/null +++ b/patches/kernel/0045-x86-mm-Implement-PCID-based-optimization-try-to-pres.patch @@ -0,0 +1,340 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Mon, 24 Jul 2017 21:41:38 -0700 +Subject: [PATCH] x86/mm: Implement PCID based optimization: try to preserve + old TLB entries using PCID +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +PCID is a "process context ID" -- it's what other architectures call +an address space ID. Every non-global TLB entry is tagged with a +PCID, only TLB entries that match the currently selected PCID are +used, and we can switch PGDs without flushing the TLB. x86's +PCID is 12 bits. + +This is an unorthodox approach to using PCID. x86's PCID is far too +short to uniquely identify a process, and we can't even really +uniquely identify a running process because there are monster +systems with over 4096 CPUs. To make matters worse, past attempts +to use all 12 PCID bits have resulted in slowdowns instead of +speedups. + +This patch uses PCID differently. We use a PCID to identify a +recently-used mm on a per-cpu basis. 
An mm has no fixed PCID +binding at all; instead, we give it a fresh PCID each time it's +loaded except in cases where we want to preserve the TLB, in which +case we reuse a recent value. + +Here are some benchmark results, done on a Skylake laptop at 2.3 GHz +(turbo off, intel_pstate requesting max performance) under KVM with +the guest using idle=poll (to avoid artifacts when bouncing between +CPUs). I haven't done any real statistics here -- I just ran them +in a loop and picked the fastest results that didn't look like +outliers. Unpatched means commit a4eb8b993554, so all the +bookkeeping overhead is gone. + +ping-pong between two mms on the same CPU using eventfd: + + patched: 1.22µs + patched, nopcid: 1.33µs + unpatched: 1.34µs + +Same ping-pong, but now touch 512 pages (all zero-page to minimize +cache misses) each iteration. dTLB misses are measured by +dtlb_load_misses.miss_causes_a_walk: + + patched: 1.8µs 11M dTLB misses + patched, nopcid: 6.2µs, 207M dTLB misses + unpatched: 6.1µs, 190M dTLB misses + +Signed-off-by: Andy Lutomirski +Reviewed-by: Nadav Amit +Cc: Andrew Morton +Cc: Arjan van de Ven +Cc: Borislav Petkov +Cc: Dave Hansen +Cc: Linus Torvalds +Cc: Mel Gorman +Cc: Peter Zijlstra +Cc: Rik van Riel +Cc: Thomas Gleixner +Cc: linux-mm@kvack.org +Link: http://lkml.kernel.org/r/9ee75f17a81770feed616358e6860d98a2a5b1e7.1500957502.git.luto@kernel.org +Signed-off-by: Ingo Molnar +(backported from commit 10af6235e0d327d42e1bad974385197817923dc1) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit d833a976288cdcf7fb1dabb48ebf614ebf6a311c) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/mmu_context.h | 3 ++ + arch/x86/include/asm/processor-flags.h | 2 + + arch/x86/include/asm/tlbflush.h | 18 +++++++- + arch/x86/mm/init.c | 1 + + arch/x86/mm/tlb.c | 84 +++++++++++++++++++++++++--------- + 5 files changed, 85 insertions(+), 23 deletions(-) + +diff --git a/arch/x86/include/asm/mmu_context.h 
b/arch/x86/include/asm/mmu_context.h +index d6b055b328f2..7ae318c340d9 100644 +--- a/arch/x86/include/asm/mmu_context.h ++++ b/arch/x86/include/asm/mmu_context.h +@@ -298,6 +298,9 @@ static inline unsigned long __get_current_cr3_fast(void) + { + unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd); + ++ if (static_cpu_has(X86_FEATURE_PCID)) ++ cr3 |= this_cpu_read(cpu_tlbstate.loaded_mm_asid); ++ + /* For now, be very restrictive about when this can be called. */ + VM_WARN_ON(in_nmi() || preemptible()); + +diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h +index 79aa2f98398d..791b60199aa4 100644 +--- a/arch/x86/include/asm/processor-flags.h ++++ b/arch/x86/include/asm/processor-flags.h +@@ -35,6 +35,7 @@ + /* Mask off the address space ID bits. */ + #define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull + #define CR3_PCID_MASK 0xFFFull ++#define CR3_NOFLUSH (1UL << 63) + #else + /* + * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save +@@ -42,6 +43,7 @@ + */ + #define CR3_ADDR_MASK 0xFFFFFFFFull + #define CR3_PCID_MASK 0ull ++#define CR3_NOFLUSH 0 + #endif + + #endif /* _ASM_X86_PROCESSOR_FLAGS_H */ +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index 6397275008db..d23e61dc0640 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -82,6 +82,12 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) + #define __flush_tlb_single(addr) __native_flush_tlb_single(addr) + #endif + ++/* ++ * 6 because 6 should be plenty and struct tlb_state will fit in ++ * two cache lines. ++ */ ++#define TLB_NR_DYN_ASIDS 6 ++ + struct tlb_context { + u64 ctx_id; + u64 tlb_gen; +@@ -95,6 +101,8 @@ struct tlb_state { + * mode even if we've already switched back to swapper_pg_dir. 
+ */ + struct mm_struct *loaded_mm; ++ u16 loaded_mm_asid; ++ u16 next_asid; + + /* + * Access to this CR4 shadow and to H/W CR4 is protected by +@@ -104,7 +112,8 @@ struct tlb_state { + + /* + * This is a list of all contexts that might exist in the TLB. +- * Since we don't yet use PCID, there is only one context. ++ * There is one per ASID that we use, and the ASID (what the ++ * CPU calls PCID) is the index into ctxts. + * + * For each context, ctx_id indicates which mm the TLB's user + * entries came from. As an invariant, the TLB will never +@@ -114,8 +123,13 @@ struct tlb_state { + * To be clear, this means that it's legal for the TLB code to + * flush the TLB without updating tlb_gen. This can happen + * (for now, at least) due to paravirt remote flushes. ++ * ++ * NB: context 0 is a bit special, since it's also used by ++ * various bits of init code. This is fine -- code that ++ * isn't aware of PCID will end up harmlessly flushing ++ * context 0. + */ +- struct tlb_context ctxs[1]; ++ struct tlb_context ctxs[TLB_NR_DYN_ASIDS]; + }; + DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); + +diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c +index c86dc071bb10..af5c1ed21d43 100644 +--- a/arch/x86/mm/init.c ++++ b/arch/x86/mm/init.c +@@ -849,6 +849,7 @@ void __init zone_sizes_init(void) + + DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { + .loaded_mm = &init_mm, ++ .next_asid = 1, + .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ + }; + EXPORT_SYMBOL_GPL(cpu_tlbstate); +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c +index 0982c997d36f..57943b4d8f2e 100644 +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -30,6 +30,40 @@ + + atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); + ++static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, ++ u16 *new_asid, bool *need_flush) ++{ ++ u16 asid; ++ ++ if (!static_cpu_has(X86_FEATURE_PCID)) { ++ *new_asid = 0; ++ *need_flush = true; ++ return; ++ } ++ 
++ for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) { ++ if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) != ++ next->context.ctx_id) ++ continue; ++ ++ *new_asid = asid; ++ *need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) < ++ next_tlb_gen); ++ return; ++ } ++ ++ /* ++ * We don't currently own an ASID slot on this CPU. ++ * Allocate a slot. ++ */ ++ *new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1; ++ if (*new_asid >= TLB_NR_DYN_ASIDS) { ++ *new_asid = 0; ++ this_cpu_write(cpu_tlbstate.next_asid, 1); ++ } ++ *need_flush = true; ++} ++ + void leave_mm(int cpu) + { + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); +@@ -66,6 +100,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) + { + struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); ++ u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + unsigned cpu = smp_processor_id(); + u64 next_tlb_gen; + +@@ -85,12 +120,13 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + /* + * Verify that CR3 is what we think it is. This will catch + * hypothetical buggy code that directly switches to swapper_pg_dir +- * without going through leave_mm() / switch_mm_irqs_off(). ++ * without going through leave_mm() / switch_mm_irqs_off() or that ++ * does something like write_cr3(read_cr3_pa()). 
+ */ +- VM_BUG_ON(read_cr3_pa() != __pa(real_prev->pgd)); ++ VM_BUG_ON(__read_cr3() != (__sme_pa(real_prev->pgd) | prev_asid)); + + if (real_prev == next) { +- VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) != ++ VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != + next->context.ctx_id); + + if (cpumask_test_cpu(cpu, mm_cpumask(next))) { +@@ -107,16 +143,17 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + cpumask_set_cpu(cpu, mm_cpumask(next)); + next_tlb_gen = atomic64_read(&next->context.tlb_gen); + +- if (this_cpu_read(cpu_tlbstate.ctxs[0].tlb_gen) < next_tlb_gen) { ++ if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) < ++ next_tlb_gen) { + /* + * Ideally, we'd have a flush_tlb() variant that + * takes the known CR3 value as input. This would + * be faster on Xen PV and on hypothetical CPUs + * on which INVPCID is fast. + */ +- this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, ++ this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen, + next_tlb_gen); +- write_cr3(__pa(next->pgd)); ++ write_cr3(__pa(next->pgd) | prev_asid); + + /* + * This gets called via leave_mm() in the idle path +@@ -134,8 +171,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + * are not reflected in tlb_gen.) 
+ */ + } else { +- VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) == +- next->context.ctx_id); ++ u16 new_asid; ++ bool need_flush; + + if (IS_ENABLED(CONFIG_VMAP_STACK)) { + /* +@@ -162,18 +199,22 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + cpumask_set_cpu(cpu, mm_cpumask(next)); + next_tlb_gen = atomic64_read(&next->context.tlb_gen); + +- this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id); +- this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, next_tlb_gen); +- this_cpu_write(cpu_tlbstate.loaded_mm, next); +- write_cr3(__pa(next->pgd)); ++ choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); + +- /* +- * This gets called via leave_mm() in the idle path where RCU +- * functions differently. Tracing normally uses RCU, so we +- * have to call the tracepoint specially here. +- */ +- trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, ++ if (need_flush) { ++ this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); ++ this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); ++ write_cr3(__pa(next->pgd) | new_asid); ++ trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, + TLB_FLUSH_ALL); ++ } else { ++ /* The new ASID is already up to date. */ ++ write_cr3(__sme_pa(next->pgd) | new_asid | CR3_NOFLUSH); ++ trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0); ++ } ++ ++ this_cpu_write(cpu_tlbstate.loaded_mm, next); ++ this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); + } + + load_mm_cr4(next); +@@ -200,13 +241,14 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, + * wants us to catch up to. 
+ */ + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); ++ u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen); +- u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[0].tlb_gen); ++ u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen); + + /* This code cannot presently handle being reentered. */ + VM_WARN_ON(!irqs_disabled()); + +- VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) != ++ VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) != + loaded_mm->context.ctx_id); + + if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) { +@@ -294,7 +336,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, + } + + /* Both paths above update our state to mm_tlb_gen. */ +- this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, mm_tlb_gen); ++ this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen); + } + + static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason) +-- +2.14.2 + diff --git a/patches/kernel/0046-x86-mm-64-Stop-using-CR3.PCID-0-in-ASID-aware-code.patch b/patches/kernel/0046-x86-mm-64-Stop-using-CR3.PCID-0-in-ASID-aware-code.patch deleted file mode 100644 index 6ce824b..0000000 --- a/patches/kernel/0046-x86-mm-64-Stop-using-CR3.PCID-0-in-ASID-aware-code.patch +++ /dev/null @@ -1,85 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Sun, 17 Sep 2017 09:03:49 -0700 -Subject: [PATCH] x86/mm/64: Stop using CR3.PCID == 0 in ASID-aware code -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Putting the logical ASID into CR3's PCID bits directly means that we -have two cases to consider separately: ASID == 0 and ASID != 0. -This means that bugs that only hit in one of these cases trigger -nondeterministically. 
- -There were some bugs like this in the past, and I think there's -still one in current kernels. In particular, we have a number of -ASID-unware code paths that save CR3, write some special value, and -then restore CR3. This includes suspend/resume, hibernate, kexec, -EFI, and maybe other things I've missed. This is currently -dangerous: if ASID != 0, then this code sequence will leave garbage -in the TLB tagged for ASID 0. We could potentially see corruption -when switching back to ASID 0. In principle, an -initialize_tlbstate_and_flush() call after these sequences would -solve the problem, but EFI, at least, does not call this. (And it -probably shouldn't -- initialize_tlbstate_and_flush() is rather -expensive.) - -Signed-off-by: Andy Lutomirski -Cc: Borislav Petkov -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/cdc14bbe5d3c3ef2a562be09a6368ffe9bd947a6.1505663533.git.luto@kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit 52a2af400c1075219b3f0ce5c96fc961da44018a) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 15e474753e66e44da1365049f465427053a453ba) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/mmu_context.h | 21 +++++++++++++++++++-- - 1 file changed, 19 insertions(+), 2 deletions(-) - -diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h -index a999ba6b721f..c120b5db178a 100644 ---- a/arch/x86/include/asm/mmu_context.h -+++ b/arch/x86/include/asm/mmu_context.h -@@ -286,14 +286,31 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, - return __pkru_allows_pkey(vma_pkey(vma), write); - } - -+/* -+ * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID -+ * bits. This serves two purposes. 
It prevents a nasty situation in -+ * which PCID-unaware code saves CR3, loads some other value (with PCID -+ * == 0), and then restores CR3, thus corrupting the TLB for ASID 0 if -+ * the saved ASID was nonzero. It also means that any bugs involving -+ * loading a PCID-enabled CR3 with CR4.PCIDE off will trigger -+ * deterministically. -+ */ -+ - static inline unsigned long build_cr3(struct mm_struct *mm, u16 asid) - { -- return __sme_pa(mm->pgd) | asid; -+ if (static_cpu_has(X86_FEATURE_PCID)) { -+ VM_WARN_ON_ONCE(asid > 4094); -+ return __sme_pa(mm->pgd) | (asid + 1); -+ } else { -+ VM_WARN_ON_ONCE(asid != 0); -+ return __sme_pa(mm->pgd); -+ } - } - - static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid) - { -- return __sme_pa(mm->pgd) | asid | CR3_NOFLUSH; -+ VM_WARN_ON_ONCE(asid > 4094); -+ return __sme_pa(mm->pgd) | (asid + 1) | CR3_NOFLUSH; - } - - /* --- -2.14.2 - diff --git a/patches/kernel/0046-x86-mm-Factor-out-CR3-building-code.patch b/patches/kernel/0046-x86-mm-Factor-out-CR3-building-code.patch new file mode 100644 index 0000000..01f7292 --- /dev/null +++ b/patches/kernel/0046-x86-mm-Factor-out-CR3-building-code.patch @@ -0,0 +1,176 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Sun, 17 Sep 2017 09:03:48 -0700 +Subject: [PATCH] x86/mm: Factor out CR3-building code +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Current, the code that assembles a value to load into CR3 is +open-coded everywhere. Factor it out into helpers build_cr3() and +build_cr3_noflush(). + +This makes one semantic change: __get_current_cr3_fast() was wrong +on SME systems. No one noticed because the only caller is in the +VMX code, and there are no CPUs with both SME and VMX. 
+ +Signed-off-by: Andy Lutomirski +Cc: Borislav Petkov +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: Tom Lendacky +Link: http://lkml.kernel.org/r/ce350cf11e93e2842d14d0b95b0199c7d881f527.1505663533.git.luto@kernel.org +Signed-off-by: Ingo Molnar +(backported from commit 47061a24e2ee5bd8a40d473d47a5bd823fa0081f) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 72be211bac7be521f128d419d63cae38ba60ace8) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/mmu_context.h | 15 ++++++--- + arch/x86/mm/tlb.c | 68 +++++++++++++++++++++++++++++++++++--- + 2 files changed, 75 insertions(+), 8 deletions(-) + +diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h +index 7ae318c340d9..a999ba6b721f 100644 +--- a/arch/x86/include/asm/mmu_context.h ++++ b/arch/x86/include/asm/mmu_context.h +@@ -286,6 +286,15 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, + return __pkru_allows_pkey(vma_pkey(vma), write); + } + ++static inline unsigned long build_cr3(struct mm_struct *mm, u16 asid) ++{ ++ return __sme_pa(mm->pgd) | asid; ++} ++ ++static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid) ++{ ++ return __sme_pa(mm->pgd) | asid | CR3_NOFLUSH; ++} + + /* + * This can be used from process context to figure out what the value of +@@ -296,10 +305,8 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, + */ + static inline unsigned long __get_current_cr3_fast(void) + { +- unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd); +- +- if (static_cpu_has(X86_FEATURE_PCID)) +- cr3 |= this_cpu_read(cpu_tlbstate.loaded_mm_asid); ++ unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm), ++ this_cpu_read(cpu_tlbstate.loaded_mm_asid)); + + /* For now, be very restrictive about when this can be called. 
*/ + VM_WARN_ON(in_nmi() || preemptible()); +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c +index 57943b4d8f2e..440400316c8a 100644 +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -123,7 +123,23 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + * without going through leave_mm() / switch_mm_irqs_off() or that + * does something like write_cr3(read_cr3_pa()). + */ +- VM_BUG_ON(__read_cr3() != (__sme_pa(real_prev->pgd) | prev_asid)); ++#ifdef CONFIG_DEBUG_VM ++ if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) { ++ /* ++ * If we were to BUG here, we'd be very likely to kill ++ * the system so hard that we don't see the call trace. ++ * Try to recover instead by ignoring the error and doing ++ * a global flush to minimize the chance of corruption. ++ * ++ * (This is far from being a fully correct recovery. ++ * Architecturally, the CPU could prefetch something ++ * back into an incorrect ASID slot and leave it there ++ * to cause trouble down the road. It's better than ++ * nothing, though.) ++ */ ++ __flush_tlb_all(); ++ } ++#endif + + if (real_prev == next) { + VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != +@@ -153,7 +169,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + */ + this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen, + next_tlb_gen); +- write_cr3(__pa(next->pgd) | prev_asid); ++ write_cr3(build_cr3(next, prev_asid)); + + /* + * This gets called via leave_mm() in the idle path +@@ -204,12 +220,12 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + if (need_flush) { + this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); +- write_cr3(__pa(next->pgd) | new_asid); ++ write_cr3(build_cr3(next, new_asid)); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, + TLB_FLUSH_ALL); + } else { + /* The new ASID is already up to date. 
*/ +- write_cr3(__sme_pa(next->pgd) | new_asid | CR3_NOFLUSH); ++ write_cr3(build_cr3_noflush(next, new_asid)); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0); + } + +@@ -221,6 +237,50 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + switch_ldt(real_prev, next); + } + ++/* ++ * Call this when reinitializing a CPU. It fixes the following potential ++ * problems: ++ * ++ * - The ASID changed from what cpu_tlbstate thinks it is (most likely ++ * because the CPU was taken down and came back up with CR3's PCID ++ * bits clear. CPU hotplug can do this. ++ * ++ * - The TLB contains junk in slots corresponding to inactive ASIDs. ++ * ++ * - The CPU went so far out to lunch that it may have missed a TLB ++ * flush. ++ */ ++void initialize_tlbstate_and_flush(void) ++{ ++ int i; ++ struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm); ++ u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen); ++ unsigned long cr3 = __read_cr3(); ++ ++ /* Assert that CR3 already references the right mm. */ ++ WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd)); ++ ++ /* ++ * Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization ++ * doesn't work like other CR4 bits because it can only be set from ++ * long mode.) ++ */ ++ WARN_ON(boot_cpu_has(X86_FEATURE_PCID) && ++ !(cr4_read_shadow() & X86_CR4_PCIDE)); ++ ++ /* Force ASID 0 and force a TLB flush. */ ++ write_cr3(build_cr3(mm, 0)); ++ ++ /* Reinitialize tlbstate. 
*/ ++ this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0); ++ this_cpu_write(cpu_tlbstate.next_asid, 1); ++ this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id); ++ this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen); ++ ++ for (i = 1; i < TLB_NR_DYN_ASIDS; i++) ++ this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0); ++} ++ + /* + * flush_tlb_func_common()'s memory ordering requirement is that any + * TLB fills that happen after we flush the TLB are ordered after we +-- +2.14.2 + diff --git a/patches/kernel/0047-x86-mm-64-Stop-using-CR3.PCID-0-in-ASID-aware-code.patch b/patches/kernel/0047-x86-mm-64-Stop-using-CR3.PCID-0-in-ASID-aware-code.patch new file mode 100644 index 0000000..6ce824b --- /dev/null +++ b/patches/kernel/0047-x86-mm-64-Stop-using-CR3.PCID-0-in-ASID-aware-code.patch @@ -0,0 +1,85 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Sun, 17 Sep 2017 09:03:49 -0700 +Subject: [PATCH] x86/mm/64: Stop using CR3.PCID == 0 in ASID-aware code +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Putting the logical ASID into CR3's PCID bits directly means that we +have two cases to consider separately: ASID == 0 and ASID != 0. +This means that bugs that only hit in one of these cases trigger +nondeterministically. + +There were some bugs like this in the past, and I think there's +still one in current kernels. In particular, we have a number of +ASID-unware code paths that save CR3, write some special value, and +then restore CR3. This includes suspend/resume, hibernate, kexec, +EFI, and maybe other things I've missed. This is currently +dangerous: if ASID != 0, then this code sequence will leave garbage +in the TLB tagged for ASID 0. We could potentially see corruption +when switching back to ASID 0. In principle, an +initialize_tlbstate_and_flush() call after these sequences would +solve the problem, but EFI, at least, does not call this. 
(And it +probably shouldn't -- initialize_tlbstate_and_flush() is rather +expensive.) + +Signed-off-by: Andy Lutomirski +Cc: Borislav Petkov +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/cdc14bbe5d3c3ef2a562be09a6368ffe9bd947a6.1505663533.git.luto@kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit 52a2af400c1075219b3f0ce5c96fc961da44018a) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 15e474753e66e44da1365049f465427053a453ba) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/mmu_context.h | 21 +++++++++++++++++++-- + 1 file changed, 19 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h +index a999ba6b721f..c120b5db178a 100644 +--- a/arch/x86/include/asm/mmu_context.h ++++ b/arch/x86/include/asm/mmu_context.h +@@ -286,14 +286,31 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, + return __pkru_allows_pkey(vma_pkey(vma), write); + } + ++/* ++ * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID ++ * bits. This serves two purposes. It prevents a nasty situation in ++ * which PCID-unaware code saves CR3, loads some other value (with PCID ++ * == 0), and then restores CR3, thus corrupting the TLB for ASID 0 if ++ * the saved ASID was nonzero. It also means that any bugs involving ++ * loading a PCID-enabled CR3 with CR4.PCIDE off will trigger ++ * deterministically. 
++ */ ++ + static inline unsigned long build_cr3(struct mm_struct *mm, u16 asid) + { +- return __sme_pa(mm->pgd) | asid; ++ if (static_cpu_has(X86_FEATURE_PCID)) { ++ VM_WARN_ON_ONCE(asid > 4094); ++ return __sme_pa(mm->pgd) | (asid + 1); ++ } else { ++ VM_WARN_ON_ONCE(asid != 0); ++ return __sme_pa(mm->pgd); ++ } + } + + static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid) + { +- return __sme_pa(mm->pgd) | asid | CR3_NOFLUSH; ++ VM_WARN_ON_ONCE(asid > 4094); ++ return __sme_pa(mm->pgd) | (asid + 1) | CR3_NOFLUSH; + } + + /* +-- +2.14.2 + diff --git a/patches/kernel/0047-x86-mm-Flush-more-aggressively-in-lazy-TLB-mode.patch b/patches/kernel/0047-x86-mm-Flush-more-aggressively-in-lazy-TLB-mode.patch deleted file mode 100644 index 62c8c07..0000000 --- a/patches/kernel/0047-x86-mm-Flush-more-aggressively-in-lazy-TLB-mode.patch +++ /dev/null @@ -1,401 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Mon, 9 Oct 2017 09:50:49 -0700 -Subject: [PATCH] x86/mm: Flush more aggressively in lazy TLB mode -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Since commit: - - 94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking") - -x86's lazy TLB mode has been all the way lazy: when running a kernel thread -(including the idle thread), the kernel keeps using the last user mm's -page tables without attempting to maintain user TLB coherence at all. - -From a pure semantic perspective, this is fine -- kernel threads won't -attempt to access user pages, so having stale TLB entries doesn't matter. - -Unfortunately, I forgot about a subtlety. By skipping TLB flushes, -we also allow any paging-structure caches that may exist on the CPU -to become incoherent. 
This means that we can have a -paging-structure cache entry that references a freed page table, and -the CPU is within its rights to do a speculative page walk starting -at the freed page table. - -I can imagine this causing two different problems: - - - A speculative page walk starting from a bogus page table could read - IO addresses. I haven't seen any reports of this causing problems. - - - A speculative page walk that involves a bogus page table can install - garbage in the TLB. Such garbage would always be at a user VA, but - some AMD CPUs have logic that triggers a machine check when it notices - these bogus entries. I've seen a couple reports of this. - -Boris further explains the failure mode: - -> It is actually more of an optimization which assumes that paging-structure -> entries are in WB DRAM: -> -> "TlbCacheDis: cacheable memory disable. Read-write. 0=Enables -> performance optimization that assumes PML4, PDP, PDE, and PTE entries -> are in cacheable WB-DRAM; memory type checks may be bypassed, and -> addresses outside of WB-DRAM may result in undefined behavior or NB -> protocol errors. 1=Disables performance optimization and allows PML4, -> PDP, PDE and PTE entries to be in any memory type. Operating systems -> that maintain page tables in memory types other than WB- DRAM must set -> TlbCacheDis to insure proper operation." -> -> The MCE generated is an NB protocol error to signal that -> -> "Link: A specific coherent-only packet from a CPU was issued to an -> IO link. This may be caused by software which addresses page table -> structures in a memory type other than cacheable WB-DRAM without -> properly configuring MSRC001_0015[TlbCacheDis]. This may occur, for -> example, when page table structure addresses are above top of memory. In -> such cases, the NB will generate an MCE if it sees a mismatch between -> the memory operation generated by the core and the link type." 
-> -> I'm assuming coherent-only packets don't go out on IO links, thus the -> error. - -To fix this, reinstate TLB coherence in lazy mode. With this patch -applied, we do it in one of two ways: - - - If we have PCID, we simply switch back to init_mm's page tables - when we enter a kernel thread -- this seems to be quite cheap - except for the cost of serializing the CPU. - - - If we don't have PCID, then we set a flag and switch to init_mm - the first time we would otherwise need to flush the TLB. - -The /sys/kernel/debug/x86/tlb_use_lazy_mode debug switch can be changed -to override the default mode for benchmarking. - -In theory, we could optimize this better by only flushing the TLB in -lazy CPUs when a page table is freed. Doing that would require -auditing the mm code to make sure that all page table freeing goes -through tlb_remove_page() as well as reworking some data structures -to implement the improved flush logic. - -Reported-by: Markus Trippelsdorf -Reported-by: Adam Borowski -Signed-off-by: Andy Lutomirski -Signed-off-by: Borislav Petkov -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Daniel Borkmann -Cc: Eric Biggers -Cc: Johannes Hirte -Cc: Kees Cook -Cc: Kirill A. 
Shutemov -Cc: Linus Torvalds -Cc: Nadav Amit -Cc: Peter Zijlstra -Cc: Rik van Riel -Cc: Roman Kagan -Cc: Thomas Gleixner -Fixes: 94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking") -Link: http://lkml.kernel.org/r/20171009170231.fkpraqokz6e4zeco@pd.tnic -Signed-off-by: Ingo Molnar -(backported from commit b956575bed91ecfb136a8300742ecbbf451471ab) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit a4bb9409c548ece51ec246fc5113a32b8d130142) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/mmu_context.h | 8 +- - arch/x86/include/asm/tlbflush.h | 24 ++++++ - arch/x86/mm/tlb.c | 160 +++++++++++++++++++++++++------------ - 3 files changed, 136 insertions(+), 56 deletions(-) - -diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h -index c120b5db178a..3c856a15b98e 100644 ---- a/arch/x86/include/asm/mmu_context.h -+++ b/arch/x86/include/asm/mmu_context.h -@@ -126,13 +126,7 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next) - DEBUG_LOCKS_WARN_ON(preemptible()); - } - --static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) --{ -- int cpu = smp_processor_id(); -- -- if (cpumask_test_cpu(cpu, mm_cpumask(mm))) -- cpumask_clear_cpu(cpu, mm_cpumask(mm)); --} -+void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk); - - static inline int init_new_context(struct task_struct *tsk, - struct mm_struct *mm) -diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h -index d23e61dc0640..6533da3036c9 100644 ---- a/arch/x86/include/asm/tlbflush.h -+++ b/arch/x86/include/asm/tlbflush.h -@@ -82,6 +82,13 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) - #define __flush_tlb_single(addr) __native_flush_tlb_single(addr) - #endif - -+/* -+ * If tlb_use_lazy_mode is true, then we try to avoid switching CR3 to point -+ * to init_mm when we switch to a kernel thread (e.g. 
the idle thread). If -+ * it's false, then we immediately switch CR3 when entering a kernel thread. -+ */ -+DECLARE_STATIC_KEY_TRUE(tlb_use_lazy_mode); -+ - /* - * 6 because 6 should be plenty and struct tlb_state will fit in - * two cache lines. -@@ -104,6 +111,23 @@ struct tlb_state { - u16 loaded_mm_asid; - u16 next_asid; - -+ /* -+ * We can be in one of several states: -+ * -+ * - Actively using an mm. Our CPU's bit will be set in -+ * mm_cpumask(loaded_mm) and is_lazy == false; -+ * -+ * - Not using a real mm. loaded_mm == &init_mm. Our CPU's bit -+ * will not be set in mm_cpumask(&init_mm) and is_lazy == false. -+ * -+ * - Lazily using a real mm. loaded_mm != &init_mm, our bit -+ * is set in mm_cpumask(loaded_mm), but is_lazy == true. -+ * We're heuristically guessing that the CR3 load we -+ * skipped more than makes up for the overhead added by -+ * lazy mode. -+ */ -+ bool is_lazy; -+ - /* - * Access to this CR4 shadow and to H/W CR4 is protected by - * disabling interrupts when modifying either one. -diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c -index 440400316c8a..b27aceaf7ed1 100644 ---- a/arch/x86/mm/tlb.c -+++ b/arch/x86/mm/tlb.c -@@ -30,6 +30,8 @@ - - atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); - -+DEFINE_STATIC_KEY_TRUE(tlb_use_lazy_mode); -+ - static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, - u16 *new_asid, bool *need_flush) - { -@@ -80,7 +82,7 @@ void leave_mm(int cpu) - return; - - /* Warn if we're not lazy. 
*/ -- WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))); -+ WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy)); - - switch_mm(NULL, &init_mm, NULL); - } -@@ -140,52 +142,24 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, - __flush_tlb_all(); - } - #endif -+ this_cpu_write(cpu_tlbstate.is_lazy, false); - - if (real_prev == next) { - VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != - next->context.ctx_id); - -- if (cpumask_test_cpu(cpu, mm_cpumask(next))) { -- /* -- * There's nothing to do: we weren't lazy, and we -- * aren't changing our mm. We don't need to flush -- * anything, nor do we need to update CR3, CR4, or -- * LDTR. -- */ -- return; -- } -- -- /* Resume remote flushes and then read tlb_gen. */ -- cpumask_set_cpu(cpu, mm_cpumask(next)); -- next_tlb_gen = atomic64_read(&next->context.tlb_gen); -- -- if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) < -- next_tlb_gen) { -- /* -- * Ideally, we'd have a flush_tlb() variant that -- * takes the known CR3 value as input. This would -- * be faster on Xen PV and on hypothetical CPUs -- * on which INVPCID is fast. -- */ -- this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen, -- next_tlb_gen); -- write_cr3(build_cr3(next, prev_asid)); -- -- /* -- * This gets called via leave_mm() in the idle path -- * where RCU functions differently. Tracing normally -- * uses RCU, so we have to call the tracepoint -- * specially here. -- */ -- trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, -- TLB_FLUSH_ALL); -- } -- - /* -- * We just exited lazy mode, which means that CR4 and/or LDTR -- * may be stale. (Changes to the required CR4 and LDTR states -- * are not reflected in tlb_gen.) -+ * We don't currently support having a real mm loaded without -+ * our cpu set in mm_cpumask(). We have all the bookkeeping -+ * in place to figure out whether we would need to flush -+ * if our cpu were cleared in mm_cpumask(), but we don't -+ * currently use it. 
- */ -+ if (WARN_ON_ONCE(real_prev != &init_mm && -+ !cpumask_test_cpu(cpu, mm_cpumask(next)))) -+ cpumask_set_cpu(cpu, mm_cpumask(next)); -+ -+ return; - } else { - u16 new_asid; - bool need_flush; -@@ -204,10 +178,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, - } - - /* Stop remote flushes for the previous mm */ -- if (cpumask_test_cpu(cpu, mm_cpumask(real_prev))) -- cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); -- -- VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next))); -+ VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) && -+ real_prev != &init_mm); -+ cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); - - /* - * Start remote flushes and then read tlb_gen. -@@ -237,6 +210,37 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, - switch_ldt(real_prev, next); - } - -+/* -+ * enter_lazy_tlb() is a hint from the scheduler that we are entering a -+ * kernel thread or other context without an mm. Acceptable implementations -+ * include doing nothing whatsoever, switching to init_mm, or various clever -+ * lazy tricks to try to minimize TLB flushes. -+ * -+ * The scheduler reserves the right to call enter_lazy_tlb() several times -+ * in a row. It will notify us that we're going back to a real mm by -+ * calling switch_mm_irqs_off(). -+ */ -+void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) -+{ -+ if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) -+ return; -+ -+ if (static_branch_unlikely(&tlb_use_lazy_mode)) { -+ /* -+ * There's a significant optimization that may be possible -+ * here. We have accurate enough TLB flush tracking that we -+ * don't need to maintain coherence of TLB per se when we're -+ * lazy. We do, however, need to maintain coherence of -+ * paging-structure caches. We could, in principle, leave our -+ * old mm loaded and only switch to init_mm when -+ * tlb_remove_page() happens. 
-+ */ -+ this_cpu_write(cpu_tlbstate.is_lazy, true); -+ } else { -+ switch_mm(NULL, &init_mm, NULL); -+ } -+} -+ - /* - * Call this when reinitializing a CPU. It fixes the following potential - * problems: -@@ -308,16 +312,20 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, - /* This code cannot presently handle being reentered. */ - VM_WARN_ON(!irqs_disabled()); - -+ if (unlikely(loaded_mm == &init_mm)) -+ return; -+ - VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) != - loaded_mm->context.ctx_id); - -- if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) { -+ if (this_cpu_read(cpu_tlbstate.is_lazy)) { - /* -- * We're in lazy mode -- don't flush. We can get here on -- * remote flushes due to races and on local flushes if a -- * kernel thread coincidentally flushes the mm it's lazily -- * still using. -+ * We're in lazy mode. We need to at least flush our -+ * paging-structure cache to avoid speculatively reading -+ * garbage into our TLB. Since switching to init_mm is barely -+ * slower than a minimal flush, just switch to init_mm. - */ -+ switch_mm_irqs_off(NULL, &init_mm, NULL); - return; - } - -@@ -616,3 +624,57 @@ static int __init create_tlb_single_page_flush_ceiling(void) - return 0; - } - late_initcall(create_tlb_single_page_flush_ceiling); -+ -+static ssize_t tlblazy_read_file(struct file *file, char __user *user_buf, -+ size_t count, loff_t *ppos) -+{ -+ char buf[2]; -+ -+ buf[0] = static_branch_likely(&tlb_use_lazy_mode) ? 
'1' : '0'; -+ buf[1] = '\n'; -+ -+ return simple_read_from_buffer(user_buf, count, ppos, buf, 2); -+} -+ -+static ssize_t tlblazy_write_file(struct file *file, -+ const char __user *user_buf, size_t count, loff_t *ppos) -+{ -+ bool val; -+ -+ if (kstrtobool_from_user(user_buf, count, &val)) -+ return -EINVAL; -+ -+ if (val) -+ static_branch_enable(&tlb_use_lazy_mode); -+ else -+ static_branch_disable(&tlb_use_lazy_mode); -+ -+ return count; -+} -+ -+static const struct file_operations fops_tlblazy = { -+ .read = tlblazy_read_file, -+ .write = tlblazy_write_file, -+ .llseek = default_llseek, -+}; -+ -+static int __init init_tlb_use_lazy_mode(void) -+{ -+ if (boot_cpu_has(X86_FEATURE_PCID)) { -+ /* -+ * Heuristic: with PCID on, switching to and from -+ * init_mm is reasonably fast, but remote flush IPIs -+ * as expensive as ever, so turn off lazy TLB mode. -+ * -+ * We can't do this in setup_pcid() because static keys -+ * haven't been initialized yet, and it would blow up -+ * badly. -+ */ -+ static_branch_disable(&tlb_use_lazy_mode); -+ } -+ -+ debugfs_create_file("tlb_use_lazy_mode", S_IRUSR | S_IWUSR, -+ arch_debugfs_dir, NULL, &fops_tlblazy); -+ return 0; -+} -+late_initcall(init_tlb_use_lazy_mode); --- -2.14.2 - diff --git a/patches/kernel/0048-Revert-x86-mm-Stop-calling-leave_mm-in-idle-code.patch b/patches/kernel/0048-Revert-x86-mm-Stop-calling-leave_mm-in-idle-code.patch deleted file mode 100644 index ffd56ee..0000000 --- a/patches/kernel/0048-Revert-x86-mm-Stop-calling-leave_mm-in-idle-code.patch +++ /dev/null @@ -1,101 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Sat, 4 Nov 2017 04:16:12 -0700 -Subject: [PATCH] Revert "x86/mm: Stop calling leave_mm() in idle code" -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -This reverts commit 43858b4f25cf0adc5c2ca9cf5ce5fdf2532941e5. 
- -The reason I removed the leave_mm() calls in question is because the -heuristic wasn't needed after that patch. With the original version -of my PCID series, we never flushed a "lazy cpu" (i.e. a CPU running -kernel thread) due a flush on the loaded mm. - -Unfortunately, that caused architectural issues, so now I've -reinstated these flushes on non-PCID systems in: - - commit b956575bed91 ("x86/mm: Flush more aggressively in lazy TLB mode"). - -That, in turn, gives us a power management and occasionally -performance regression as compared to old kernels: a process that -goes into a deep idle state on a given CPU and gets its mm flushed -due to activity on a different CPU will wake the idle CPU. - -Reinstate the old ugly heuristic: if a CPU goes into ACPI C3 or an -intel_idle state that is likely to cause a TLB flush gets its mm -switched to init_mm before going idle. - -FWIW, this heuristic is lousy. Whether we should change CR3 before -idle isn't a good hint except insofar as the performance hit is a bit -lower if the TLB is getting flushed by the idle code anyway. What we -really want to know is whether we anticipate being idle long enough -that the mm is likely to be flushed before we wake up. This is more a -matter of the expected latency than the idle state that gets chosen. -This heuristic also completely fails on systems that don't know -whether the TLB will be flushed (e.g. AMD systems?). OTOH it may be a -bit obsolete anyway -- PCID systems don't presently benefit from this -heuristic at all. - -We also shouldn't do this callback from innermost bit of the idle code -due to the RCU nastiness it causes. All the information need is -available before rcu_idle_enter() needs to happen. - -Signed-off-by: Andy Lutomirski -Cc: Borislav Petkov -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Denys Vlasenko -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Fixes: 43858b4f25cf "x86/mm: Stop calling leave_mm() in idle code" -Link: http://lkml.kernel.org/r/c513bbd4e653747213e05bc7062de000bf0202a5.1509793738.git.luto@kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit 675357362aeba19688440eb1aaa7991067f73b12) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit b607843145fd0593fcd87e2596d1dc5a1d5f79a5) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/mm/tlb.c | 16 +++++++++++++--- - 1 file changed, 13 insertions(+), 3 deletions(-) - -diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c -index b27aceaf7ed1..ed06f1593390 100644 ---- a/arch/x86/mm/tlb.c -+++ b/arch/x86/mm/tlb.c -@@ -194,12 +194,22 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, - this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); - this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); - write_cr3(build_cr3(next, new_asid)); -- trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, -- TLB_FLUSH_ALL); -+ -+ /* -+ * NB: This gets called via leave_mm() in the idle path -+ * where RCU functions differently. Tracing normally -+ * uses RCU, so we need to use the _rcuidle variant. -+ * -+ * (There is no good reason for this. The idle code should -+ * be rearranged to call this before rcu_idle_enter().) -+ */ -+ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); - } else { - /* The new ASID is already up to date. */ - write_cr3(build_cr3_noflush(next, new_asid)); -- trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0); -+ -+ /* See above wrt _rcuidle. 
*/ -+ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); - } - - this_cpu_write(cpu_tlbstate.loaded_mm, next); --- -2.14.2 - diff --git a/patches/kernel/0048-x86-mm-Flush-more-aggressively-in-lazy-TLB-mode.patch b/patches/kernel/0048-x86-mm-Flush-more-aggressively-in-lazy-TLB-mode.patch new file mode 100644 index 0000000..62c8c07 --- /dev/null +++ b/patches/kernel/0048-x86-mm-Flush-more-aggressively-in-lazy-TLB-mode.patch @@ -0,0 +1,401 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Mon, 9 Oct 2017 09:50:49 -0700 +Subject: [PATCH] x86/mm: Flush more aggressively in lazy TLB mode +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Since commit: + + 94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking") + +x86's lazy TLB mode has been all the way lazy: when running a kernel thread +(including the idle thread), the kernel keeps using the last user mm's +page tables without attempting to maintain user TLB coherence at all. + +From a pure semantic perspective, this is fine -- kernel threads won't +attempt to access user pages, so having stale TLB entries doesn't matter. + +Unfortunately, I forgot about a subtlety. By skipping TLB flushes, +we also allow any paging-structure caches that may exist on the CPU +to become incoherent. This means that we can have a +paging-structure cache entry that references a freed page table, and +the CPU is within its rights to do a speculative page walk starting +at the freed page table. + +I can imagine this causing two different problems: + + - A speculative page walk starting from a bogus page table could read + IO addresses. I haven't seen any reports of this causing problems. + + - A speculative page walk that involves a bogus page table can install + garbage in the TLB. 
Such garbage would always be at a user VA, but + some AMD CPUs have logic that triggers a machine check when it notices + these bogus entries. I've seen a couple reports of this. + +Boris further explains the failure mode: + +> It is actually more of an optimization which assumes that paging-structure +> entries are in WB DRAM: +> +> "TlbCacheDis: cacheable memory disable. Read-write. 0=Enables +> performance optimization that assumes PML4, PDP, PDE, and PTE entries +> are in cacheable WB-DRAM; memory type checks may be bypassed, and +> addresses outside of WB-DRAM may result in undefined behavior or NB +> protocol errors. 1=Disables performance optimization and allows PML4, +> PDP, PDE and PTE entries to be in any memory type. Operating systems +> that maintain page tables in memory types other than WB- DRAM must set +> TlbCacheDis to insure proper operation." +> +> The MCE generated is an NB protocol error to signal that +> +> "Link: A specific coherent-only packet from a CPU was issued to an +> IO link. This may be caused by software which addresses page table +> structures in a memory type other than cacheable WB-DRAM without +> properly configuring MSRC001_0015[TlbCacheDis]. This may occur, for +> example, when page table structure addresses are above top of memory. In +> such cases, the NB will generate an MCE if it sees a mismatch between +> the memory operation generated by the core and the link type." +> +> I'm assuming coherent-only packets don't go out on IO links, thus the +> error. + +To fix this, reinstate TLB coherence in lazy mode. With this patch +applied, we do it in one of two ways: + + - If we have PCID, we simply switch back to init_mm's page tables + when we enter a kernel thread -- this seems to be quite cheap + except for the cost of serializing the CPU. + + - If we don't have PCID, then we set a flag and switch to init_mm + the first time we would otherwise need to flush the TLB. 
+ +The /sys/kernel/debug/x86/tlb_use_lazy_mode debug switch can be changed +to override the default mode for benchmarking. + +In theory, we could optimize this better by only flushing the TLB in +lazy CPUs when a page table is freed. Doing that would require +auditing the mm code to make sure that all page table freeing goes +through tlb_remove_page() as well as reworking some data structures +to implement the improved flush logic. + +Reported-by: Markus Trippelsdorf +Reported-by: Adam Borowski +Signed-off-by: Andy Lutomirski +Signed-off-by: Borislav Petkov +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Daniel Borkmann +Cc: Eric Biggers +Cc: Johannes Hirte +Cc: Kees Cook +Cc: Kirill A. Shutemov +Cc: Linus Torvalds +Cc: Nadav Amit +Cc: Peter Zijlstra +Cc: Rik van Riel +Cc: Roman Kagan +Cc: Thomas Gleixner +Fixes: 94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking") +Link: http://lkml.kernel.org/r/20171009170231.fkpraqokz6e4zeco@pd.tnic +Signed-off-by: Ingo Molnar +(backported from commit b956575bed91ecfb136a8300742ecbbf451471ab) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit a4bb9409c548ece51ec246fc5113a32b8d130142) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/mmu_context.h | 8 +- + arch/x86/include/asm/tlbflush.h | 24 ++++++ + arch/x86/mm/tlb.c | 160 +++++++++++++++++++++++++------------ + 3 files changed, 136 insertions(+), 56 deletions(-) + +diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h +index c120b5db178a..3c856a15b98e 100644 +--- a/arch/x86/include/asm/mmu_context.h ++++ b/arch/x86/include/asm/mmu_context.h +@@ -126,13 +126,7 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next) + DEBUG_LOCKS_WARN_ON(preemptible()); + } + +-static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) +-{ +- int cpu = smp_processor_id(); +- +- if (cpumask_test_cpu(cpu, mm_cpumask(mm))) +- 
cpumask_clear_cpu(cpu, mm_cpumask(mm)); +-} ++void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk); + + static inline int init_new_context(struct task_struct *tsk, + struct mm_struct *mm) +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index d23e61dc0640..6533da3036c9 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -82,6 +82,13 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) + #define __flush_tlb_single(addr) __native_flush_tlb_single(addr) + #endif + ++/* ++ * If tlb_use_lazy_mode is true, then we try to avoid switching CR3 to point ++ * to init_mm when we switch to a kernel thread (e.g. the idle thread). If ++ * it's false, then we immediately switch CR3 when entering a kernel thread. ++ */ ++DECLARE_STATIC_KEY_TRUE(tlb_use_lazy_mode); ++ + /* + * 6 because 6 should be plenty and struct tlb_state will fit in + * two cache lines. +@@ -104,6 +111,23 @@ struct tlb_state { + u16 loaded_mm_asid; + u16 next_asid; + ++ /* ++ * We can be in one of several states: ++ * ++ * - Actively using an mm. Our CPU's bit will be set in ++ * mm_cpumask(loaded_mm) and is_lazy == false; ++ * ++ * - Not using a real mm. loaded_mm == &init_mm. Our CPU's bit ++ * will not be set in mm_cpumask(&init_mm) and is_lazy == false. ++ * ++ * - Lazily using a real mm. loaded_mm != &init_mm, our bit ++ * is set in mm_cpumask(loaded_mm), but is_lazy == true. ++ * We're heuristically guessing that the CR3 load we ++ * skipped more than makes up for the overhead added by ++ * lazy mode. ++ */ ++ bool is_lazy; ++ + /* + * Access to this CR4 shadow and to H/W CR4 is protected by + * disabling interrupts when modifying either one. 
+diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c +index 440400316c8a..b27aceaf7ed1 100644 +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -30,6 +30,8 @@ + + atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); + ++DEFINE_STATIC_KEY_TRUE(tlb_use_lazy_mode); ++ + static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, + u16 *new_asid, bool *need_flush) + { +@@ -80,7 +82,7 @@ void leave_mm(int cpu) + return; + + /* Warn if we're not lazy. */ +- WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))); ++ WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy)); + + switch_mm(NULL, &init_mm, NULL); + } +@@ -140,52 +142,24 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + __flush_tlb_all(); + } + #endif ++ this_cpu_write(cpu_tlbstate.is_lazy, false); + + if (real_prev == next) { + VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != + next->context.ctx_id); + +- if (cpumask_test_cpu(cpu, mm_cpumask(next))) { +- /* +- * There's nothing to do: we weren't lazy, and we +- * aren't changing our mm. We don't need to flush +- * anything, nor do we need to update CR3, CR4, or +- * LDTR. +- */ +- return; +- } +- +- /* Resume remote flushes and then read tlb_gen. */ +- cpumask_set_cpu(cpu, mm_cpumask(next)); +- next_tlb_gen = atomic64_read(&next->context.tlb_gen); +- +- if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) < +- next_tlb_gen) { +- /* +- * Ideally, we'd have a flush_tlb() variant that +- * takes the known CR3 value as input. This would +- * be faster on Xen PV and on hypothetical CPUs +- * on which INVPCID is fast. +- */ +- this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen, +- next_tlb_gen); +- write_cr3(build_cr3(next, prev_asid)); +- +- /* +- * This gets called via leave_mm() in the idle path +- * where RCU functions differently. Tracing normally +- * uses RCU, so we have to call the tracepoint +- * specially here. 
+- */ +- trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, +- TLB_FLUSH_ALL); +- } +- + /* +- * We just exited lazy mode, which means that CR4 and/or LDTR +- * may be stale. (Changes to the required CR4 and LDTR states +- * are not reflected in tlb_gen.) ++ * We don't currently support having a real mm loaded without ++ * our cpu set in mm_cpumask(). We have all the bookkeeping ++ * in place to figure out whether we would need to flush ++ * if our cpu were cleared in mm_cpumask(), but we don't ++ * currently use it. + */ ++ if (WARN_ON_ONCE(real_prev != &init_mm && ++ !cpumask_test_cpu(cpu, mm_cpumask(next)))) ++ cpumask_set_cpu(cpu, mm_cpumask(next)); ++ ++ return; + } else { + u16 new_asid; + bool need_flush; +@@ -204,10 +178,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + } + + /* Stop remote flushes for the previous mm */ +- if (cpumask_test_cpu(cpu, mm_cpumask(real_prev))) +- cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); +- +- VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next))); ++ VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) && ++ real_prev != &init_mm); ++ cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); + + /* + * Start remote flushes and then read tlb_gen. +@@ -237,6 +210,37 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + switch_ldt(real_prev, next); + } + ++/* ++ * enter_lazy_tlb() is a hint from the scheduler that we are entering a ++ * kernel thread or other context without an mm. Acceptable implementations ++ * include doing nothing whatsoever, switching to init_mm, or various clever ++ * lazy tricks to try to minimize TLB flushes. ++ * ++ * The scheduler reserves the right to call enter_lazy_tlb() several times ++ * in a row. It will notify us that we're going back to a real mm by ++ * calling switch_mm_irqs_off(). 
++ */ ++void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) ++{ ++ if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) ++ return; ++ ++ if (static_branch_unlikely(&tlb_use_lazy_mode)) { ++ /* ++ * There's a significant optimization that may be possible ++ * here. We have accurate enough TLB flush tracking that we ++ * don't need to maintain coherence of TLB per se when we're ++ * lazy. We do, however, need to maintain coherence of ++ * paging-structure caches. We could, in principle, leave our ++ * old mm loaded and only switch to init_mm when ++ * tlb_remove_page() happens. ++ */ ++ this_cpu_write(cpu_tlbstate.is_lazy, true); ++ } else { ++ switch_mm(NULL, &init_mm, NULL); ++ } ++} ++ + /* + * Call this when reinitializing a CPU. It fixes the following potential + * problems: +@@ -308,16 +312,20 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, + /* This code cannot presently handle being reentered. */ + VM_WARN_ON(!irqs_disabled()); + ++ if (unlikely(loaded_mm == &init_mm)) ++ return; ++ + VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) != + loaded_mm->context.ctx_id); + +- if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) { ++ if (this_cpu_read(cpu_tlbstate.is_lazy)) { + /* +- * We're in lazy mode -- don't flush. We can get here on +- * remote flushes due to races and on local flushes if a +- * kernel thread coincidentally flushes the mm it's lazily +- * still using. ++ * We're in lazy mode. We need to at least flush our ++ * paging-structure cache to avoid speculatively reading ++ * garbage into our TLB. Since switching to init_mm is barely ++ * slower than a minimal flush, just switch to init_mm. 
+ */ ++ switch_mm_irqs_off(NULL, &init_mm, NULL); + return; + } + +@@ -616,3 +624,57 @@ static int __init create_tlb_single_page_flush_ceiling(void) + return 0; + } + late_initcall(create_tlb_single_page_flush_ceiling); ++ ++static ssize_t tlblazy_read_file(struct file *file, char __user *user_buf, ++ size_t count, loff_t *ppos) ++{ ++ char buf[2]; ++ ++ buf[0] = static_branch_likely(&tlb_use_lazy_mode) ? '1' : '0'; ++ buf[1] = '\n'; ++ ++ return simple_read_from_buffer(user_buf, count, ppos, buf, 2); ++} ++ ++static ssize_t tlblazy_write_file(struct file *file, ++ const char __user *user_buf, size_t count, loff_t *ppos) ++{ ++ bool val; ++ ++ if (kstrtobool_from_user(user_buf, count, &val)) ++ return -EINVAL; ++ ++ if (val) ++ static_branch_enable(&tlb_use_lazy_mode); ++ else ++ static_branch_disable(&tlb_use_lazy_mode); ++ ++ return count; ++} ++ ++static const struct file_operations fops_tlblazy = { ++ .read = tlblazy_read_file, ++ .write = tlblazy_write_file, ++ .llseek = default_llseek, ++}; ++ ++static int __init init_tlb_use_lazy_mode(void) ++{ ++ if (boot_cpu_has(X86_FEATURE_PCID)) { ++ /* ++ * Heuristic: with PCID on, switching to and from ++ * init_mm is reasonably fast, but remote flush IPIs ++ * as expensive as ever, so turn off lazy TLB mode. ++ * ++ * We can't do this in setup_pcid() because static keys ++ * haven't been initialized yet, and it would blow up ++ * badly. 
++ */ ++ static_branch_disable(&tlb_use_lazy_mode); ++ } ++ ++ debugfs_create_file("tlb_use_lazy_mode", S_IRUSR | S_IWUSR, ++ arch_debugfs_dir, NULL, &fops_tlblazy); ++ return 0; ++} ++late_initcall(init_tlb_use_lazy_mode); +-- +2.14.2 + diff --git a/patches/kernel/0049-Revert-x86-mm-Stop-calling-leave_mm-in-idle-code.patch b/patches/kernel/0049-Revert-x86-mm-Stop-calling-leave_mm-in-idle-code.patch new file mode 100644 index 0000000..ffd56ee --- /dev/null +++ b/patches/kernel/0049-Revert-x86-mm-Stop-calling-leave_mm-in-idle-code.patch @@ -0,0 +1,101 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Sat, 4 Nov 2017 04:16:12 -0700 +Subject: [PATCH] Revert "x86/mm: Stop calling leave_mm() in idle code" +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +This reverts commit 43858b4f25cf0adc5c2ca9cf5ce5fdf2532941e5. + +The reason I removed the leave_mm() calls in question is because the +heuristic wasn't needed after that patch. With the original version +of my PCID series, we never flushed a "lazy cpu" (i.e. a CPU running +kernel thread) due a flush on the loaded mm. + +Unfortunately, that caused architectural issues, so now I've +reinstated these flushes on non-PCID systems in: + + commit b956575bed91 ("x86/mm: Flush more aggressively in lazy TLB mode"). + +That, in turn, gives us a power management and occasionally +performance regression as compared to old kernels: a process that +goes into a deep idle state on a given CPU and gets its mm flushed +due to activity on a different CPU will wake the idle CPU. + +Reinstate the old ugly heuristic: if a CPU goes into ACPI C3 or an +intel_idle state that is likely to cause a TLB flush gets its mm +switched to init_mm before going idle. + +FWIW, this heuristic is lousy. 
Whether we should change CR3 before +idle isn't a good hint except insofar as the performance hit is a bit +lower if the TLB is getting flushed by the idle code anyway. What we +really want to know is whether we anticipate being idle long enough +that the mm is likely to be flushed before we wake up. This is more a +matter of the expected latency than the idle state that gets chosen. +This heuristic also completely fails on systems that don't know +whether the TLB will be flushed (e.g. AMD systems?). OTOH it may be a +bit obsolete anyway -- PCID systems don't presently benefit from this +heuristic at all. + +We also shouldn't do this callback from innermost bit of the idle code +due to the RCU nastiness it causes. All the information need is +available before rcu_idle_enter() needs to happen. + +Signed-off-by: Andy Lutomirski +Cc: Borislav Petkov +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Denys Vlasenko +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Fixes: 43858b4f25cf "x86/mm: Stop calling leave_mm() in idle code" +Link: http://lkml.kernel.org/r/c513bbd4e653747213e05bc7062de000bf0202a5.1509793738.git.luto@kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit 675357362aeba19688440eb1aaa7991067f73b12) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit b607843145fd0593fcd87e2596d1dc5a1d5f79a5) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/mm/tlb.c | 16 +++++++++++++--- + 1 file changed, 13 insertions(+), 3 deletions(-) + +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c +index b27aceaf7ed1..ed06f1593390 100644 +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -194,12 +194,22 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); + write_cr3(build_cr3(next, new_asid)); +- 
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, +- TLB_FLUSH_ALL); ++ ++ /* ++ * NB: This gets called via leave_mm() in the idle path ++ * where RCU functions differently. Tracing normally ++ * uses RCU, so we need to use the _rcuidle variant. ++ * ++ * (There is no good reason for this. The idle code should ++ * be rearranged to call this before rcu_idle_enter().) ++ */ ++ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + } else { + /* The new ASID is already up to date. */ + write_cr3(build_cr3_noflush(next, new_asid)); +- trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0); ++ ++ /* See above wrt _rcuidle. */ ++ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); + } + + this_cpu_write(cpu_tlbstate.loaded_mm, next); +-- +2.14.2 + diff --git a/patches/kernel/0049-kprobes-x86-Set-up-frame-pointer-in-kprobe-trampolin.patch b/patches/kernel/0049-kprobes-x86-Set-up-frame-pointer-in-kprobe-trampolin.patch deleted file mode 100644 index 4d673b0..0000000 --- a/patches/kernel/0049-kprobes-x86-Set-up-frame-pointer-in-kprobe-trampolin.patch +++ /dev/null @@ -1,85 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Josh Poimboeuf -Date: Tue, 3 Oct 2017 08:51:43 -0500 -Subject: [PATCH] kprobes/x86: Set up frame pointer in kprobe trampoline -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Richard Weinberger saw an unwinder warning when running bcc's opensnoop: - - WARNING: kernel stack frame pointer at ffff99ef4076bea0 in opensnoop:2008 has bad value 0000000000000008 - unwind stack type:0 next_sp: (null) mask:0x2 graph_idx:0 - ... - ffff99ef4076be88: ffff99ef4076bea0 (0xffff99ef4076bea0) - ffff99ef4076be90: ffffffffac442721 (optimized_callback +0x81/0x90) - ... - -A lockdep stack trace was initiated from inside a kprobe handler, when -the unwinder noticed a bad frame pointer on the stack. 
The bad frame -pointer is related to the fact that the kprobe optprobe trampoline -doesn't save the frame pointer before calling into optimized_callback(). - -Reported-and-tested-by: Richard Weinberger -Signed-off-by: Josh Poimboeuf -Acked-by: Masami Hiramatsu -Cc: Ananth N Mavinakayanahalli -Cc: Anil S Keshavamurthy -Cc: David S . Miller -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/7aef2f8ecd75c2f505ef9b80490412262cf4a44c.1507038547.git.jpoimboe@redhat.com -Signed-off-by: Ingo Molnar -(cherry picked from commit ee213fc72fd67d0988525af501534f4cb924d1e9) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 0f7d5518c91335584b16c7bed1c54c10b78ea76a) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/kprobes/common.h | 13 +++++++++++-- - 1 file changed, 11 insertions(+), 2 deletions(-) - -diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h -index db2182d63ed0..3fc0f9a794cb 100644 ---- a/arch/x86/kernel/kprobes/common.h -+++ b/arch/x86/kernel/kprobes/common.h -@@ -3,6 +3,15 @@ - - /* Kprobes and Optprobes common header */ - -+#include -+ -+#ifdef CONFIG_FRAME_POINTER -+# define SAVE_RBP_STRING " push %" _ASM_BP "\n" \ -+ " mov %" _ASM_SP ", %" _ASM_BP "\n" -+#else -+# define SAVE_RBP_STRING " push %" _ASM_BP "\n" -+#endif -+ - #ifdef CONFIG_X86_64 - #define SAVE_REGS_STRING \ - /* Skip cs, ip, orig_ax. 
*/ \ -@@ -17,7 +26,7 @@ - " pushq %r10\n" \ - " pushq %r11\n" \ - " pushq %rbx\n" \ -- " pushq %rbp\n" \ -+ SAVE_RBP_STRING \ - " pushq %r12\n" \ - " pushq %r13\n" \ - " pushq %r14\n" \ -@@ -48,7 +57,7 @@ - " pushl %es\n" \ - " pushl %ds\n" \ - " pushl %eax\n" \ -- " pushl %ebp\n" \ -+ SAVE_RBP_STRING \ - " pushl %edi\n" \ - " pushl %esi\n" \ - " pushl %edx\n" \ --- -2.14.2 - diff --git a/patches/kernel/0050-kprobes-x86-Set-up-frame-pointer-in-kprobe-trampolin.patch b/patches/kernel/0050-kprobes-x86-Set-up-frame-pointer-in-kprobe-trampolin.patch new file mode 100644 index 0000000..4d673b0 --- /dev/null +++ b/patches/kernel/0050-kprobes-x86-Set-up-frame-pointer-in-kprobe-trampolin.patch @@ -0,0 +1,85 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf +Date: Tue, 3 Oct 2017 08:51:43 -0500 +Subject: [PATCH] kprobes/x86: Set up frame pointer in kprobe trampoline +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Richard Weinberger saw an unwinder warning when running bcc's opensnoop: + + WARNING: kernel stack frame pointer at ffff99ef4076bea0 in opensnoop:2008 has bad value 0000000000000008 + unwind stack type:0 next_sp: (null) mask:0x2 graph_idx:0 + ... + ffff99ef4076be88: ffff99ef4076bea0 (0xffff99ef4076bea0) + ffff99ef4076be90: ffffffffac442721 (optimized_callback +0x81/0x90) + ... + +A lockdep stack trace was initiated from inside a kprobe handler, when +the unwinder noticed a bad frame pointer on the stack. The bad frame +pointer is related to the fact that the kprobe optprobe trampoline +doesn't save the frame pointer before calling into optimized_callback(). + +Reported-and-tested-by: Richard Weinberger +Signed-off-by: Josh Poimboeuf +Acked-by: Masami Hiramatsu +Cc: Ananth N Mavinakayanahalli +Cc: Anil S Keshavamurthy +Cc: David S . 
Miller +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/7aef2f8ecd75c2f505ef9b80490412262cf4a44c.1507038547.git.jpoimboe@redhat.com +Signed-off-by: Ingo Molnar +(cherry picked from commit ee213fc72fd67d0988525af501534f4cb924d1e9) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 0f7d5518c91335584b16c7bed1c54c10b78ea76a) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/kprobes/common.h | 13 +++++++++++-- + 1 file changed, 11 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h +index db2182d63ed0..3fc0f9a794cb 100644 +--- a/arch/x86/kernel/kprobes/common.h ++++ b/arch/x86/kernel/kprobes/common.h +@@ -3,6 +3,15 @@ + + /* Kprobes and Optprobes common header */ + ++#include ++ ++#ifdef CONFIG_FRAME_POINTER ++# define SAVE_RBP_STRING " push %" _ASM_BP "\n" \ ++ " mov %" _ASM_SP ", %" _ASM_BP "\n" ++#else ++# define SAVE_RBP_STRING " push %" _ASM_BP "\n" ++#endif ++ + #ifdef CONFIG_X86_64 + #define SAVE_REGS_STRING \ + /* Skip cs, ip, orig_ax. 
*/ \ +@@ -17,7 +26,7 @@ + " pushq %r10\n" \ + " pushq %r11\n" \ + " pushq %rbx\n" \ +- " pushq %rbp\n" \ ++ SAVE_RBP_STRING \ + " pushq %r12\n" \ + " pushq %r13\n" \ + " pushq %r14\n" \ +@@ -48,7 +57,7 @@ + " pushl %es\n" \ + " pushl %ds\n" \ + " pushl %eax\n" \ +- " pushl %ebp\n" \ ++ SAVE_RBP_STRING \ + " pushl %edi\n" \ + " pushl %esi\n" \ + " pushl %edx\n" \ +-- +2.14.2 + diff --git a/patches/kernel/0050-x86-tracing-Introduce-a-static-key-for-exception-tra.patch b/patches/kernel/0050-x86-tracing-Introduce-a-static-key-for-exception-tra.patch deleted file mode 100644 index efbc800..0000000 --- a/patches/kernel/0050-x86-tracing-Introduce-a-static-key-for-exception-tra.patch +++ /dev/null @@ -1,139 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Mon, 28 Aug 2017 08:47:21 +0200 -Subject: [PATCH] x86/tracing: Introduce a static key for exception tracing -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Switching the IDT just for avoiding tracepoints creates a completely -impenetrable macro/inline/ifdef mess. - -There is no point in avoiding tracepoints for most of the traps/exceptions. -For the more expensive tracepoints, like pagefaults, this can be handled with -an explicit static key. - -Preparatory patch to remove the tracing IDT. 
- -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Borislav Petkov -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Steven Rostedt -Link: http://lkml.kernel.org/r/20170828064956.593094539@linutronix.de -Signed-off-by: Ingo Molnar -(cherry picked from commit 2feb1b316d48004d905278c02a55902cab0be8be) -Signed-off-by: Andy Whitcroft -(cherry picked from commit 15e0ff2a63fdd93f8881e2ebba5c048c5b601e57) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit d58a56e851c339d8d9d311dc9b4fad6abbf8bf19) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/trace/common.h | 15 +++++++++++++++ - arch/x86/include/asm/trace/exceptions.h | 4 +--- - arch/x86/include/asm/trace/irq_vectors.h | 4 +--- - arch/x86/kernel/tracepoint.c | 9 ++++++++- - 4 files changed, 25 insertions(+), 7 deletions(-) - create mode 100644 arch/x86/include/asm/trace/common.h - -diff --git a/arch/x86/include/asm/trace/common.h b/arch/x86/include/asm/trace/common.h -new file mode 100644 -index 000000000000..b1eb7b18ee8a ---- /dev/null -+++ b/arch/x86/include/asm/trace/common.h -@@ -0,0 +1,15 @@ -+#ifndef _ASM_TRACE_COMMON_H -+#define _ASM_TRACE_COMMON_H -+ -+extern int trace_irq_vector_regfunc(void); -+extern void trace_irq_vector_unregfunc(void); -+ -+#ifdef CONFIG_TRACING -+DECLARE_STATIC_KEY_FALSE(trace_irqvectors_key); -+#define trace_irqvectors_enabled() \ -+ static_branch_unlikely(&trace_irqvectors_key) -+#else -+static inline bool trace_irqvectors_enabled(void) { return false; } -+#endif -+ -+#endif -diff --git a/arch/x86/include/asm/trace/exceptions.h b/arch/x86/include/asm/trace/exceptions.h -index 2422b14c50a7..960a5b50ac3b 100644 ---- a/arch/x86/include/asm/trace/exceptions.h -+++ b/arch/x86/include/asm/trace/exceptions.h -@@ -5,9 +5,7 @@ - #define _TRACE_PAGE_FAULT_H - - #include -- --extern int trace_irq_vector_regfunc(void); --extern void trace_irq_vector_unregfunc(void); -+#include - - DECLARE_EVENT_CLASS(x86_exceptions, - -diff --git 
a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h -index 32dd6a9e343c..7825b4426e7e 100644 ---- a/arch/x86/include/asm/trace/irq_vectors.h -+++ b/arch/x86/include/asm/trace/irq_vectors.h -@@ -5,9 +5,7 @@ - #define _TRACE_IRQ_VECTORS_H - - #include -- --extern int trace_irq_vector_regfunc(void); --extern void trace_irq_vector_unregfunc(void); -+#include - - DECLARE_EVENT_CLASS(x86_irq_vector, - -diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c -index 15515132bf0d..dd4aa04bb95c 100644 ---- a/arch/x86/kernel/tracepoint.c -+++ b/arch/x86/kernel/tracepoint.c -@@ -4,9 +4,11 @@ - * Copyright (C) 2013 Seiji Aguchi - * - */ -+#include -+#include -+ - #include - #include --#include - - atomic_t trace_idt_ctr = ATOMIC_INIT(0); - struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1, -@@ -15,6 +17,7 @@ struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1, - /* No need to be aligned, but done to keep all IDTs defined the same way. */ - gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss; - -+DEFINE_STATIC_KEY_FALSE(trace_irqvectors_key); - static int trace_irq_vector_refcount; - static DEFINE_MUTEX(irq_vector_mutex); - -@@ -36,6 +39,8 @@ static void switch_idt(void *arg) - - int trace_irq_vector_regfunc(void) - { -+ static_branch_inc(&trace_irqvectors_key); -+ - mutex_lock(&irq_vector_mutex); - if (!trace_irq_vector_refcount) { - set_trace_idt_ctr(1); -@@ -49,6 +54,8 @@ int trace_irq_vector_regfunc(void) - - void trace_irq_vector_unregfunc(void) - { -+ static_branch_dec(&trace_irqvectors_key); -+ - mutex_lock(&irq_vector_mutex); - trace_irq_vector_refcount--; - if (!trace_irq_vector_refcount) { --- -2.14.2 - diff --git a/patches/kernel/0051-x86-boot-Add-early-cmdline-parsing-for-options-with-.patch b/patches/kernel/0051-x86-boot-Add-early-cmdline-parsing-for-options-with-.patch deleted file mode 100644 index 0872493..0000000 --- a/patches/kernel/0051-x86-boot-Add-early-cmdline-parsing-for-options-with-.patch 
+++ /dev/null @@ -1,189 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Tom Lendacky -Date: Mon, 17 Jul 2017 16:10:33 -0500 -Subject: [PATCH] x86/boot: Add early cmdline parsing for options with - arguments -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Add a cmdline_find_option() function to look for cmdline options that -take arguments. The argument is returned in a supplied buffer and the -argument length (regardless of whether it fits in the supplied buffer) -is returned, with -1 indicating not found. - -Signed-off-by: Tom Lendacky -Reviewed-by: Thomas Gleixner -Cc: Alexander Potapenko -Cc: Andrey Ryabinin -Cc: Andy Lutomirski -Cc: Arnd Bergmann -Cc: Borislav Petkov -Cc: Brijesh Singh -Cc: Dave Young -Cc: Dmitry Vyukov -Cc: Jonathan Corbet -Cc: Konrad Rzeszutek Wilk -Cc: Larry Woodman -Cc: Linus Torvalds -Cc: Matt Fleming -Cc: Michael S. Tsirkin -Cc: Paolo Bonzini -Cc: Peter Zijlstra -Cc: Radim Krčmář -Cc: Rik van Riel -Cc: Toshimitsu Kani -Cc: kasan-dev@googlegroups.com -Cc: kvm@vger.kernel.org -Cc: linux-arch@vger.kernel.org -Cc: linux-doc@vger.kernel.org -Cc: linux-efi@vger.kernel.org -Cc: linux-mm@kvack.org -Link: http://lkml.kernel.org/r/36b5f97492a9745dce27682305f990fc20e5cf8a.1500319216.git.thomas.lendacky@amd.com -Signed-off-by: Ingo Molnar -(cherry picked from commit e505371dd83963caae1a37ead9524e8d997341be) -Signed-off-by: Andy Whitcroft -(cherry picked from commit 37569cd003aa69a57d5666530436c2d973a57b26) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit b9f03418aa9b8ecbb1c7f32ac2bfe68fd21de4f5) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/cmdline.h | 2 + - arch/x86/lib/cmdline.c | 105 +++++++++++++++++++++++++++++++++++++++++ - 2 files changed, 107 insertions(+) - -diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h -index e01f7f7ccb0c..84ae170bc3d0 
100644 ---- a/arch/x86/include/asm/cmdline.h -+++ b/arch/x86/include/asm/cmdline.h -@@ -2,5 +2,7 @@ - #define _ASM_X86_CMDLINE_H - - int cmdline_find_option_bool(const char *cmdline_ptr, const char *option); -+int cmdline_find_option(const char *cmdline_ptr, const char *option, -+ char *buffer, int bufsize); - - #endif /* _ASM_X86_CMDLINE_H */ -diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c -index 5cc78bf57232..3261abb21ef4 100644 ---- a/arch/x86/lib/cmdline.c -+++ b/arch/x86/lib/cmdline.c -@@ -104,7 +104,112 @@ __cmdline_find_option_bool(const char *cmdline, int max_cmdline_size, - return 0; /* Buffer overrun */ - } - -+/* -+ * Find a non-boolean option (i.e. option=argument). In accordance with -+ * standard Linux practice, if this option is repeated, this returns the -+ * last instance on the command line. -+ * -+ * @cmdline: the cmdline string -+ * @max_cmdline_size: the maximum size of cmdline -+ * @option: option string to look for -+ * @buffer: memory buffer to return the option argument -+ * @bufsize: size of the supplied memory buffer -+ * -+ * Returns the length of the argument (regardless of if it was -+ * truncated to fit in the buffer), or -1 on not found. 
-+ */ -+static int -+__cmdline_find_option(const char *cmdline, int max_cmdline_size, -+ const char *option, char *buffer, int bufsize) -+{ -+ char c; -+ int pos = 0, len = -1; -+ const char *opptr = NULL; -+ char *bufptr = buffer; -+ enum { -+ st_wordstart = 0, /* Start of word/after whitespace */ -+ st_wordcmp, /* Comparing this word */ -+ st_wordskip, /* Miscompare, skip */ -+ st_bufcpy, /* Copying this to buffer */ -+ } state = st_wordstart; -+ -+ if (!cmdline) -+ return -1; /* No command line */ -+ -+ /* -+ * This 'pos' check ensures we do not overrun -+ * a non-NULL-terminated 'cmdline' -+ */ -+ while (pos++ < max_cmdline_size) { -+ c = *(char *)cmdline++; -+ if (!c) -+ break; -+ -+ switch (state) { -+ case st_wordstart: -+ if (myisspace(c)) -+ break; -+ -+ state = st_wordcmp; -+ opptr = option; -+ /* fall through */ -+ -+ case st_wordcmp: -+ if ((c == '=') && !*opptr) { -+ /* -+ * We matched all the way to the end of the -+ * option we were looking for, prepare to -+ * copy the argument. -+ */ -+ len = 0; -+ bufptr = buffer; -+ state = st_bufcpy; -+ break; -+ } else if (c == *opptr++) { -+ /* -+ * We are currently matching, so continue -+ * to the next character on the cmdline. -+ */ -+ break; -+ } -+ state = st_wordskip; -+ /* fall through */ -+ -+ case st_wordskip: -+ if (myisspace(c)) -+ state = st_wordstart; -+ break; -+ -+ case st_bufcpy: -+ if (myisspace(c)) { -+ state = st_wordstart; -+ } else { -+ /* -+ * Increment len, but don't overrun the -+ * supplied buffer and leave room for the -+ * NULL terminator. 
-+ */ -+ if (++len < bufsize) -+ *bufptr++ = c; -+ } -+ break; -+ } -+ } -+ -+ if (bufsize) -+ *bufptr = '\0'; -+ -+ return len; -+} -+ - int cmdline_find_option_bool(const char *cmdline, const char *option) - { - return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option); - } -+ -+int cmdline_find_option(const char *cmdline, const char *option, char *buffer, -+ int bufsize) -+{ -+ return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option, -+ buffer, bufsize); -+} --- -2.14.2 - diff --git a/patches/kernel/0051-x86-tracing-Introduce-a-static-key-for-exception-tra.patch b/patches/kernel/0051-x86-tracing-Introduce-a-static-key-for-exception-tra.patch new file mode 100644 index 0000000..efbc800 --- /dev/null +++ b/patches/kernel/0051-x86-tracing-Introduce-a-static-key-for-exception-tra.patch @@ -0,0 +1,139 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Mon, 28 Aug 2017 08:47:21 +0200 +Subject: [PATCH] x86/tracing: Introduce a static key for exception tracing +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Switching the IDT just for avoiding tracepoints creates a completely +impenetrable macro/inline/ifdef mess. + +There is no point in avoiding tracepoints for most of the traps/exceptions. +For the more expensive tracepoints, like pagefaults, this can be handled with +an explicit static key. + +Preparatory patch to remove the tracing IDT. 
+ +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Borislav Petkov +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Steven Rostedt +Link: http://lkml.kernel.org/r/20170828064956.593094539@linutronix.de +Signed-off-by: Ingo Molnar +(cherry picked from commit 2feb1b316d48004d905278c02a55902cab0be8be) +Signed-off-by: Andy Whitcroft +(cherry picked from commit 15e0ff2a63fdd93f8881e2ebba5c048c5b601e57) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit d58a56e851c339d8d9d311dc9b4fad6abbf8bf19) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/trace/common.h | 15 +++++++++++++++ + arch/x86/include/asm/trace/exceptions.h | 4 +--- + arch/x86/include/asm/trace/irq_vectors.h | 4 +--- + arch/x86/kernel/tracepoint.c | 9 ++++++++- + 4 files changed, 25 insertions(+), 7 deletions(-) + create mode 100644 arch/x86/include/asm/trace/common.h + +diff --git a/arch/x86/include/asm/trace/common.h b/arch/x86/include/asm/trace/common.h +new file mode 100644 +index 000000000000..b1eb7b18ee8a +--- /dev/null ++++ b/arch/x86/include/asm/trace/common.h +@@ -0,0 +1,15 @@ ++#ifndef _ASM_TRACE_COMMON_H ++#define _ASM_TRACE_COMMON_H ++ ++extern int trace_irq_vector_regfunc(void); ++extern void trace_irq_vector_unregfunc(void); ++ ++#ifdef CONFIG_TRACING ++DECLARE_STATIC_KEY_FALSE(trace_irqvectors_key); ++#define trace_irqvectors_enabled() \ ++ static_branch_unlikely(&trace_irqvectors_key) ++#else ++static inline bool trace_irqvectors_enabled(void) { return false; } ++#endif ++ ++#endif +diff --git a/arch/x86/include/asm/trace/exceptions.h b/arch/x86/include/asm/trace/exceptions.h +index 2422b14c50a7..960a5b50ac3b 100644 +--- a/arch/x86/include/asm/trace/exceptions.h ++++ b/arch/x86/include/asm/trace/exceptions.h +@@ -5,9 +5,7 @@ + #define _TRACE_PAGE_FAULT_H + + #include +- +-extern int trace_irq_vector_regfunc(void); +-extern void trace_irq_vector_unregfunc(void); ++#include + + DECLARE_EVENT_CLASS(x86_exceptions, + +diff --git 
a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h +index 32dd6a9e343c..7825b4426e7e 100644 +--- a/arch/x86/include/asm/trace/irq_vectors.h ++++ b/arch/x86/include/asm/trace/irq_vectors.h +@@ -5,9 +5,7 @@ + #define _TRACE_IRQ_VECTORS_H + + #include +- +-extern int trace_irq_vector_regfunc(void); +-extern void trace_irq_vector_unregfunc(void); ++#include + + DECLARE_EVENT_CLASS(x86_irq_vector, + +diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c +index 15515132bf0d..dd4aa04bb95c 100644 +--- a/arch/x86/kernel/tracepoint.c ++++ b/arch/x86/kernel/tracepoint.c +@@ -4,9 +4,11 @@ + * Copyright (C) 2013 Seiji Aguchi + * + */ ++#include ++#include ++ + #include + #include +-#include + + atomic_t trace_idt_ctr = ATOMIC_INIT(0); + struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1, +@@ -15,6 +17,7 @@ struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1, + /* No need to be aligned, but done to keep all IDTs defined the same way. */ + gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss; + ++DEFINE_STATIC_KEY_FALSE(trace_irqvectors_key); + static int trace_irq_vector_refcount; + static DEFINE_MUTEX(irq_vector_mutex); + +@@ -36,6 +39,8 @@ static void switch_idt(void *arg) + + int trace_irq_vector_regfunc(void) + { ++ static_branch_inc(&trace_irqvectors_key); ++ + mutex_lock(&irq_vector_mutex); + if (!trace_irq_vector_refcount) { + set_trace_idt_ctr(1); +@@ -49,6 +54,8 @@ int trace_irq_vector_regfunc(void) + + void trace_irq_vector_unregfunc(void) + { ++ static_branch_dec(&trace_irqvectors_key); ++ + mutex_lock(&irq_vector_mutex); + trace_irq_vector_refcount--; + if (!trace_irq_vector_refcount) { +-- +2.14.2 + diff --git a/patches/kernel/0052-mm-x86-mm-Fix-performance-regression-in-get_user_pag.patch b/patches/kernel/0052-mm-x86-mm-Fix-performance-regression-in-get_user_pag.patch deleted file mode 100644 index bc60e7a..0000000 --- a/patches/kernel/0052-mm-x86-mm-Fix-performance-regression-in-get_user_pag.patch 
+++ /dev/null @@ -1,192 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: "Kirill A. Shutemov" -Date: Sat, 9 Sep 2017 00:56:03 +0300 -Subject: [PATCH] mm, x86/mm: Fix performance regression in - get_user_pages_fast() -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -The 0-day test bot found a performance regression that was tracked down to -switching x86 to the generic get_user_pages_fast() implementation: - - http://lkml.kernel.org/r/20170710024020.GA26389@yexl-desktop - -The regression was caused by the fact that we now use local_irq_save() + -local_irq_restore() in get_user_pages_fast() to disable interrupts. -In x86 implementation local_irq_disable() + local_irq_enable() was used. - -The fix is to make get_user_pages_fast() use local_irq_disable(), -leaving local_irq_save() for __get_user_pages_fast() that can be called -with interrupts disabled. - -Numbers for pinning a gigabyte of memory, one page a time, 20 repeats: - - Before: Average: 14.91 ms, stddev: 0.45 ms - After: Average: 10.76 ms, stddev: 0.18 ms - -Signed-off-by: Kirill A. 
Shutemov -Cc: Andrew Morton -Cc: Huang Ying -Cc: Jonathan Corbet -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Cc: Thorsten Leemhuis -Cc: linux-mm@kvack.org -Fixes: e585513b76f7 ("x86/mm/gup: Switch GUP to the generic get_user_page_fast() implementation") -Link: http://lkml.kernel.org/r/20170908215603.9189-3-kirill.shutemov@linux.intel.com -Signed-off-by: Ingo Molnar -(cherry picked from commit 5b65c4677a57a1d4414212f9995aa0e46a21ff80) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 5241f4b2c68284612e34910305f3234e4a64701b) -Signed-off-by: Fabian Grünbichler ---- - mm/gup.c | 97 ++++++++++++++++++++++++++++++++++++++-------------------------- - 1 file changed, 58 insertions(+), 39 deletions(-) - -diff --git a/mm/gup.c b/mm/gup.c -index 23f01c40c88f..4a789f1c6a27 100644 ---- a/mm/gup.c -+++ b/mm/gup.c -@@ -1618,6 +1618,47 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, - return 1; - } - -+static void gup_pgd_range(unsigned long addr, unsigned long end, -+ int write, struct page **pages, int *nr) -+{ -+ unsigned long next; -+ pgd_t *pgdp; -+ -+ pgdp = pgd_offset(current->mm, addr); -+ do { -+ pgd_t pgd = READ_ONCE(*pgdp); -+ -+ next = pgd_addr_end(addr, end); -+ if (pgd_none(pgd)) -+ return; -+ if (unlikely(pgd_huge(pgd))) { -+ if (!gup_huge_pgd(pgd, pgdp, addr, next, write, -+ pages, nr)) -+ return; -+ } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) { -+ if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr, -+ PGDIR_SHIFT, next, write, pages, nr)) -+ return; -+ } else if (!gup_p4d_range(pgd, addr, next, write, pages, nr)) -+ return; -+ } while (pgdp++, addr = next, addr != end); -+} -+ -+#ifndef gup_fast_permitted -+/* -+ * Check if it's allowed to use __get_user_pages_fast() for the range, or -+ * we need to fall back to the slow version: -+ */ -+bool gup_fast_permitted(unsigned long start, int nr_pages, int write) -+{ -+ unsigned long len, end; -+ -+ len = (unsigned 
long) nr_pages << PAGE_SHIFT; -+ end = start + len; -+ return end >= start; -+} -+#endif -+ - /* - * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to - * the regular GUP. It will only return non-negative values. -@@ -1625,10 +1666,8 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, - int __get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) - { -- struct mm_struct *mm = current->mm; - unsigned long addr, len, end; -- unsigned long next, flags; -- pgd_t *pgdp; -+ unsigned long flags; - int nr = 0; - - start &= PAGE_MASK; -@@ -1652,45 +1691,15 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, - * block IPIs that come from THPs splitting. - */ - -- local_irq_save(flags); -- pgdp = pgd_offset(mm, addr); -- do { -- pgd_t pgd = READ_ONCE(*pgdp); -- -- next = pgd_addr_end(addr, end); -- if (pgd_none(pgd)) -- break; -- if (unlikely(pgd_huge(pgd))) { -- if (!gup_huge_pgd(pgd, pgdp, addr, next, write, -- pages, &nr)) -- break; -- } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) { -- if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr, -- PGDIR_SHIFT, next, write, pages, &nr)) -- break; -- } else if (!gup_p4d_range(pgd, addr, next, write, pages, &nr)) -- break; -- } while (pgdp++, addr = next, addr != end); -- local_irq_restore(flags); -+ if (gup_fast_permitted(start, nr_pages, write)) { -+ local_irq_save(flags); -+ gup_pgd_range(addr, end, write, pages, &nr); -+ local_irq_restore(flags); -+ } - - return nr; - } - --#ifndef gup_fast_permitted --/* -- * Check if it's allowed to use __get_user_pages_fast() for the range, or -- * we need to fall back to the slow version: -- */ --bool gup_fast_permitted(unsigned long start, int nr_pages, int write) --{ -- unsigned long len, end; -- -- len = (unsigned long) nr_pages << PAGE_SHIFT; -- end = start + len; -- return end >= start; --} --#endif -- - /** - * get_user_pages_fast() - pin user pages in memory - * @start: 
starting user address -@@ -1710,12 +1719,22 @@ bool gup_fast_permitted(unsigned long start, int nr_pages, int write) - int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) - { -+ unsigned long addr, len, end; - int nr = 0, ret = 0; - - start &= PAGE_MASK; -+ addr = start; -+ len = (unsigned long) nr_pages << PAGE_SHIFT; -+ end = start + len; -+ -+ if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, -+ (void __user *)start, len))) -+ return 0; - - if (gup_fast_permitted(start, nr_pages, write)) { -- nr = __get_user_pages_fast(start, nr_pages, write, pages); -+ local_irq_disable(); -+ gup_pgd_range(addr, end, write, pages, &nr); -+ local_irq_enable(); - ret = nr; - } - --- -2.14.2 - diff --git a/patches/kernel/0052-x86-boot-Add-early-cmdline-parsing-for-options-with-.patch b/patches/kernel/0052-x86-boot-Add-early-cmdline-parsing-for-options-with-.patch new file mode 100644 index 0000000..0872493 --- /dev/null +++ b/patches/kernel/0052-x86-boot-Add-early-cmdline-parsing-for-options-with-.patch @@ -0,0 +1,189 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Tom Lendacky +Date: Mon, 17 Jul 2017 16:10:33 -0500 +Subject: [PATCH] x86/boot: Add early cmdline parsing for options with + arguments +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Add a cmdline_find_option() function to look for cmdline options that +take arguments. The argument is returned in a supplied buffer and the +argument length (regardless of whether it fits in the supplied buffer) +is returned, with -1 indicating not found. + +Signed-off-by: Tom Lendacky +Reviewed-by: Thomas Gleixner +Cc: Alexander Potapenko +Cc: Andrey Ryabinin +Cc: Andy Lutomirski +Cc: Arnd Bergmann +Cc: Borislav Petkov +Cc: Brijesh Singh +Cc: Dave Young +Cc: Dmitry Vyukov +Cc: Jonathan Corbet +Cc: Konrad Rzeszutek Wilk +Cc: Larry Woodman +Cc: Linus Torvalds +Cc: Matt Fleming +Cc: Michael S. 
Tsirkin +Cc: Paolo Bonzini +Cc: Peter Zijlstra +Cc: Radim Krčmář +Cc: Rik van Riel +Cc: Toshimitsu Kani +Cc: kasan-dev@googlegroups.com +Cc: kvm@vger.kernel.org +Cc: linux-arch@vger.kernel.org +Cc: linux-doc@vger.kernel.org +Cc: linux-efi@vger.kernel.org +Cc: linux-mm@kvack.org +Link: http://lkml.kernel.org/r/36b5f97492a9745dce27682305f990fc20e5cf8a.1500319216.git.thomas.lendacky@amd.com +Signed-off-by: Ingo Molnar +(cherry picked from commit e505371dd83963caae1a37ead9524e8d997341be) +Signed-off-by: Andy Whitcroft +(cherry picked from commit 37569cd003aa69a57d5666530436c2d973a57b26) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit b9f03418aa9b8ecbb1c7f32ac2bfe68fd21de4f5) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/cmdline.h | 2 + + arch/x86/lib/cmdline.c | 105 +++++++++++++++++++++++++++++++++++++++++ + 2 files changed, 107 insertions(+) + +diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h +index e01f7f7ccb0c..84ae170bc3d0 100644 +--- a/arch/x86/include/asm/cmdline.h ++++ b/arch/x86/include/asm/cmdline.h +@@ -2,5 +2,7 @@ + #define _ASM_X86_CMDLINE_H + + int cmdline_find_option_bool(const char *cmdline_ptr, const char *option); ++int cmdline_find_option(const char *cmdline_ptr, const char *option, ++ char *buffer, int bufsize); + + #endif /* _ASM_X86_CMDLINE_H */ +diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c +index 5cc78bf57232..3261abb21ef4 100644 +--- a/arch/x86/lib/cmdline.c ++++ b/arch/x86/lib/cmdline.c +@@ -104,7 +104,112 @@ __cmdline_find_option_bool(const char *cmdline, int max_cmdline_size, + return 0; /* Buffer overrun */ + } + ++/* ++ * Find a non-boolean option (i.e. option=argument). In accordance with ++ * standard Linux practice, if this option is repeated, this returns the ++ * last instance on the command line. 
++ * ++ * @cmdline: the cmdline string ++ * @max_cmdline_size: the maximum size of cmdline ++ * @option: option string to look for ++ * @buffer: memory buffer to return the option argument ++ * @bufsize: size of the supplied memory buffer ++ * ++ * Returns the length of the argument (regardless of if it was ++ * truncated to fit in the buffer), or -1 on not found. ++ */ ++static int ++__cmdline_find_option(const char *cmdline, int max_cmdline_size, ++ const char *option, char *buffer, int bufsize) ++{ ++ char c; ++ int pos = 0, len = -1; ++ const char *opptr = NULL; ++ char *bufptr = buffer; ++ enum { ++ st_wordstart = 0, /* Start of word/after whitespace */ ++ st_wordcmp, /* Comparing this word */ ++ st_wordskip, /* Miscompare, skip */ ++ st_bufcpy, /* Copying this to buffer */ ++ } state = st_wordstart; ++ ++ if (!cmdline) ++ return -1; /* No command line */ ++ ++ /* ++ * This 'pos' check ensures we do not overrun ++ * a non-NULL-terminated 'cmdline' ++ */ ++ while (pos++ < max_cmdline_size) { ++ c = *(char *)cmdline++; ++ if (!c) ++ break; ++ ++ switch (state) { ++ case st_wordstart: ++ if (myisspace(c)) ++ break; ++ ++ state = st_wordcmp; ++ opptr = option; ++ /* fall through */ ++ ++ case st_wordcmp: ++ if ((c == '=') && !*opptr) { ++ /* ++ * We matched all the way to the end of the ++ * option we were looking for, prepare to ++ * copy the argument. ++ */ ++ len = 0; ++ bufptr = buffer; ++ state = st_bufcpy; ++ break; ++ } else if (c == *opptr++) { ++ /* ++ * We are currently matching, so continue ++ * to the next character on the cmdline. ++ */ ++ break; ++ } ++ state = st_wordskip; ++ /* fall through */ ++ ++ case st_wordskip: ++ if (myisspace(c)) ++ state = st_wordstart; ++ break; ++ ++ case st_bufcpy: ++ if (myisspace(c)) { ++ state = st_wordstart; ++ } else { ++ /* ++ * Increment len, but don't overrun the ++ * supplied buffer and leave room for the ++ * NULL terminator. 
++ */ ++ if (++len < bufsize) ++ *bufptr++ = c; ++ } ++ break; ++ } ++ } ++ ++ if (bufsize) ++ *bufptr = '\0'; ++ ++ return len; ++} ++ + int cmdline_find_option_bool(const char *cmdline, const char *option) + { + return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option); + } ++ ++int cmdline_find_option(const char *cmdline, const char *option, char *buffer, ++ int bufsize) ++{ ++ return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option, ++ buffer, bufsize); ++} +-- +2.14.2 + diff --git a/patches/kernel/0053-mm-x86-mm-Fix-performance-regression-in-get_user_pag.patch b/patches/kernel/0053-mm-x86-mm-Fix-performance-regression-in-get_user_pag.patch new file mode 100644 index 0000000..bc60e7a --- /dev/null +++ b/patches/kernel/0053-mm-x86-mm-Fix-performance-regression-in-get_user_pag.patch @@ -0,0 +1,192 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: "Kirill A. Shutemov" +Date: Sat, 9 Sep 2017 00:56:03 +0300 +Subject: [PATCH] mm, x86/mm: Fix performance regression in + get_user_pages_fast() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +The 0-day test bot found a performance regression that was tracked down to +switching x86 to the generic get_user_pages_fast() implementation: + + http://lkml.kernel.org/r/20170710024020.GA26389@yexl-desktop + +The regression was caused by the fact that we now use local_irq_save() + +local_irq_restore() in get_user_pages_fast() to disable interrupts. +In x86 implementation local_irq_disable() + local_irq_enable() was used. + +The fix is to make get_user_pages_fast() use local_irq_disable(), +leaving local_irq_save() for __get_user_pages_fast() that can be called +with interrupts disabled. + +Numbers for pinning a gigabyte of memory, one page a time, 20 repeats: + + Before: Average: 14.91 ms, stddev: 0.45 ms + After: Average: 10.76 ms, stddev: 0.18 ms + +Signed-off-by: Kirill A. 
Shutemov +Cc: Andrew Morton +Cc: Huang Ying +Cc: Jonathan Corbet +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: Thorsten Leemhuis +Cc: linux-mm@kvack.org +Fixes: e585513b76f7 ("x86/mm/gup: Switch GUP to the generic get_user_page_fast() implementation") +Link: http://lkml.kernel.org/r/20170908215603.9189-3-kirill.shutemov@linux.intel.com +Signed-off-by: Ingo Molnar +(cherry picked from commit 5b65c4677a57a1d4414212f9995aa0e46a21ff80) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 5241f4b2c68284612e34910305f3234e4a64701b) +Signed-off-by: Fabian Grünbichler +--- + mm/gup.c | 97 ++++++++++++++++++++++++++++++++++++++-------------------------- + 1 file changed, 58 insertions(+), 39 deletions(-) + +diff --git a/mm/gup.c b/mm/gup.c +index 23f01c40c88f..4a789f1c6a27 100644 +--- a/mm/gup.c ++++ b/mm/gup.c +@@ -1618,6 +1618,47 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, + return 1; + } + ++static void gup_pgd_range(unsigned long addr, unsigned long end, ++ int write, struct page **pages, int *nr) ++{ ++ unsigned long next; ++ pgd_t *pgdp; ++ ++ pgdp = pgd_offset(current->mm, addr); ++ do { ++ pgd_t pgd = READ_ONCE(*pgdp); ++ ++ next = pgd_addr_end(addr, end); ++ if (pgd_none(pgd)) ++ return; ++ if (unlikely(pgd_huge(pgd))) { ++ if (!gup_huge_pgd(pgd, pgdp, addr, next, write, ++ pages, nr)) ++ return; ++ } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) { ++ if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr, ++ PGDIR_SHIFT, next, write, pages, nr)) ++ return; ++ } else if (!gup_p4d_range(pgd, addr, next, write, pages, nr)) ++ return; ++ } while (pgdp++, addr = next, addr != end); ++} ++ ++#ifndef gup_fast_permitted ++/* ++ * Check if it's allowed to use __get_user_pages_fast() for the range, or ++ * we need to fall back to the slow version: ++ */ ++bool gup_fast_permitted(unsigned long start, int nr_pages, int write) ++{ ++ unsigned long len, end; ++ ++ len = (unsigned 
long) nr_pages << PAGE_SHIFT; ++ end = start + len; ++ return end >= start; ++} ++#endif ++ + /* + * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to + * the regular GUP. It will only return non-negative values. +@@ -1625,10 +1666,8 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, + int __get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages) + { +- struct mm_struct *mm = current->mm; + unsigned long addr, len, end; +- unsigned long next, flags; +- pgd_t *pgdp; ++ unsigned long flags; + int nr = 0; + + start &= PAGE_MASK; +@@ -1652,45 +1691,15 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, + * block IPIs that come from THPs splitting. + */ + +- local_irq_save(flags); +- pgdp = pgd_offset(mm, addr); +- do { +- pgd_t pgd = READ_ONCE(*pgdp); +- +- next = pgd_addr_end(addr, end); +- if (pgd_none(pgd)) +- break; +- if (unlikely(pgd_huge(pgd))) { +- if (!gup_huge_pgd(pgd, pgdp, addr, next, write, +- pages, &nr)) +- break; +- } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) { +- if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr, +- PGDIR_SHIFT, next, write, pages, &nr)) +- break; +- } else if (!gup_p4d_range(pgd, addr, next, write, pages, &nr)) +- break; +- } while (pgdp++, addr = next, addr != end); +- local_irq_restore(flags); ++ if (gup_fast_permitted(start, nr_pages, write)) { ++ local_irq_save(flags); ++ gup_pgd_range(addr, end, write, pages, &nr); ++ local_irq_restore(flags); ++ } + + return nr; + } + +-#ifndef gup_fast_permitted +-/* +- * Check if it's allowed to use __get_user_pages_fast() for the range, or +- * we need to fall back to the slow version: +- */ +-bool gup_fast_permitted(unsigned long start, int nr_pages, int write) +-{ +- unsigned long len, end; +- +- len = (unsigned long) nr_pages << PAGE_SHIFT; +- end = start + len; +- return end >= start; +-} +-#endif +- + /** + * get_user_pages_fast() - pin user pages in memory + * @start: 
starting user address +@@ -1710,12 +1719,22 @@ bool gup_fast_permitted(unsigned long start, int nr_pages, int write) + int get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages) + { ++ unsigned long addr, len, end; + int nr = 0, ret = 0; + + start &= PAGE_MASK; ++ addr = start; ++ len = (unsigned long) nr_pages << PAGE_SHIFT; ++ end = start + len; ++ ++ if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, ++ (void __user *)start, len))) ++ return 0; + + if (gup_fast_permitted(start, nr_pages, write)) { +- nr = __get_user_pages_fast(start, nr_pages, write, pages); ++ local_irq_disable(); ++ gup_pgd_range(addr, end, write, pages, &nr); ++ local_irq_enable(); + ret = nr; + } + +-- +2.14.2 + diff --git a/patches/kernel/0053-x86-asm-Remove-unnecessary-n-t-in-front-of-CC_SET-fr.patch b/patches/kernel/0053-x86-asm-Remove-unnecessary-n-t-in-front-of-CC_SET-fr.patch deleted file mode 100644 index 182565e..0000000 --- a/patches/kernel/0053-x86-asm-Remove-unnecessary-n-t-in-front-of-CC_SET-fr.patch +++ /dev/null @@ -1,149 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Uros Bizjak -Date: Wed, 6 Sep 2017 17:18:08 +0200 -Subject: [PATCH] x86/asm: Remove unnecessary \n\t in front of CC_SET() from - asm templates -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -There is no need for \n\t in front of CC_SET(), as the macro already includes these two. 
- -Signed-off-by: Uros Bizjak -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/20170906151808.5634-1-ubizjak@gmail.com -Signed-off-by: Ingo Molnar -(backported from commit 3c52b5c64326d9dcfee4e10611c53ec1b1b20675) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 1c3f29ec5586e3aecfde2c6f83b8786e1aecd9ac) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/archrandom.h | 8 ++++---- - arch/x86/include/asm/bitops.h | 10 +++++----- - arch/x86/include/asm/percpu.h | 2 +- - arch/x86/include/asm/rmwcc.h | 2 +- - 4 files changed, 11 insertions(+), 11 deletions(-) - -diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h -index 5b0579abb398..3ac991d81e74 100644 ---- a/arch/x86/include/asm/archrandom.h -+++ b/arch/x86/include/asm/archrandom.h -@@ -45,7 +45,7 @@ static inline bool rdrand_long(unsigned long *v) - bool ok; - unsigned int retry = RDRAND_RETRY_LOOPS; - do { -- asm volatile(RDRAND_LONG "\n\t" -+ asm volatile(RDRAND_LONG - CC_SET(c) - : CC_OUT(c) (ok), "=a" (*v)); - if (ok) -@@ -59,7 +59,7 @@ static inline bool rdrand_int(unsigned int *v) - bool ok; - unsigned int retry = RDRAND_RETRY_LOOPS; - do { -- asm volatile(RDRAND_INT "\n\t" -+ asm volatile(RDRAND_INT - CC_SET(c) - : CC_OUT(c) (ok), "=a" (*v)); - if (ok) -@@ -71,7 +71,7 @@ static inline bool rdrand_int(unsigned int *v) - static inline bool rdseed_long(unsigned long *v) - { - bool ok; -- asm volatile(RDSEED_LONG "\n\t" -+ asm volatile(RDSEED_LONG - CC_SET(c) - : CC_OUT(c) (ok), "=a" (*v)); - return ok; -@@ -80,7 +80,7 @@ static inline bool rdseed_long(unsigned long *v) - static inline bool rdseed_int(unsigned int *v) - { - bool ok; -- asm volatile(RDSEED_INT "\n\t" -+ asm volatile(RDSEED_INT - CC_SET(c) - : CC_OUT(c) (ok), "=a" (*v)); - return ok; -diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h -index 854022772c5b..8cee8db6dffb 100644 ---- 
a/arch/x86/include/asm/bitops.h -+++ b/arch/x86/include/asm/bitops.h -@@ -142,7 +142,7 @@ static __always_inline void __clear_bit(long nr, volatile unsigned long *addr) - static __always_inline bool clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr) - { - bool negative; -- asm volatile(LOCK_PREFIX "andb %2,%1\n\t" -+ asm volatile(LOCK_PREFIX "andb %2,%1" - CC_SET(s) - : CC_OUT(s) (negative), ADDR - : "ir" ((char) ~(1 << nr)) : "memory"); -@@ -245,7 +245,7 @@ static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long * - { - bool oldbit; - -- asm("bts %2,%1\n\t" -+ asm("bts %2,%1" - CC_SET(c) - : CC_OUT(c) (oldbit), ADDR - : "Ir" (nr)); -@@ -285,7 +285,7 @@ static __always_inline bool __test_and_clear_bit(long nr, volatile unsigned long - { - bool oldbit; - -- asm volatile("btr %2,%1\n\t" -+ asm volatile("btr %2,%1" - CC_SET(c) - : CC_OUT(c) (oldbit), ADDR - : "Ir" (nr)); -@@ -297,7 +297,7 @@ static __always_inline bool __test_and_change_bit(long nr, volatile unsigned lon - { - bool oldbit; - -- asm volatile("btc %2,%1\n\t" -+ asm volatile("btc %2,%1" - CC_SET(c) - : CC_OUT(c) (oldbit), ADDR - : "Ir" (nr) : "memory"); -@@ -328,7 +328,7 @@ static __always_inline bool variable_test_bit(long nr, volatile const unsigned l - { - bool oldbit; - -- asm volatile("bt %2,%1\n\t" -+ asm volatile("bt %2,%1" - CC_SET(c) - : CC_OUT(c) (oldbit) - : "m" (*(unsigned long *)addr), "Ir" (nr)); -diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h -index 9fa03604b2b3..b21a475fd7ed 100644 ---- a/arch/x86/include/asm/percpu.h -+++ b/arch/x86/include/asm/percpu.h -@@ -525,7 +525,7 @@ static inline bool x86_this_cpu_variable_test_bit(int nr, - { - bool oldbit; - -- asm volatile("bt "__percpu_arg(2)",%1\n\t" -+ asm volatile("bt "__percpu_arg(2)",%1" - CC_SET(c) - : CC_OUT(c) (oldbit) - : "m" (*(unsigned long __percpu *)addr), "Ir" (nr)); -diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h -index 
661dd305694a..dd7ba5aa8dca 100644 ---- a/arch/x86/include/asm/rmwcc.h -+++ b/arch/x86/include/asm/rmwcc.h -@@ -28,7 +28,7 @@ cc_label: \ - #define __GEN_RMWcc(fullop, var, cc, ...) \ - do { \ - bool c; \ -- asm volatile (fullop ";" CC_SET(cc) \ -+ asm volatile (fullop CC_SET(cc) \ - : "+m" (var), CC_OUT(cc) (c) \ - : __VA_ARGS__ : "memory"); \ - return c; \ --- -2.14.2 - diff --git a/patches/kernel/0054-objtool-Don-t-report-end-of-section-error-after-an-e.patch b/patches/kernel/0054-objtool-Don-t-report-end-of-section-error-after-an-e.patch deleted file mode 100644 index 67c73f2..0000000 --- a/patches/kernel/0054-objtool-Don-t-report-end-of-section-error-after-an-e.patch +++ /dev/null @@ -1,58 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Josh Poimboeuf -Date: Mon, 18 Sep 2017 21:43:30 -0500 -Subject: [PATCH] objtool: Don't report end of section error after an empty - unwind hint -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -If asm code specifies an UNWIND_HINT_EMPTY hint, don't warn if the -section ends unexpectedly. This can happen with the xen-head.S code -because the hypercall_page is "text" but it's all zeros. 
- -Signed-off-by: Josh Poimboeuf -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Jiri Slaby -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/ddafe199dd8797e40e3c2777373347eba1d65572.1505764066.git.jpoimboe@redhat.com -Signed-off-by: Ingo Molnar -(cherry picked from commit 00d96180dc38ef872ac471c2d3e14b067cbd895d) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 9d22f903bba24f2ac86de8a81dc1788f9957aca8) -Signed-off-by: Fabian Grünbichler ---- - tools/objtool/check.c | 7 +++++-- - 1 file changed, 5 insertions(+), 2 deletions(-) - -diff --git a/tools/objtool/check.c b/tools/objtool/check.c -index 368275de5f23..0a86fd0ac082 100644 ---- a/tools/objtool/check.c -+++ b/tools/objtool/check.c -@@ -1652,11 +1652,14 @@ static int validate_branch(struct objtool_file *file, struct instruction *first, - if (insn->dead_end) - return 0; - -- insn = next_insn; -- if (!insn) { -+ if (!next_insn) { -+ if (state.cfa.base == CFI_UNDEFINED) -+ return 0; - WARN("%s: unexpected end of section", sec->name); - return 1; - } -+ -+ insn = next_insn; - } - - return 0; --- -2.14.2 - diff --git a/patches/kernel/0054-x86-asm-Remove-unnecessary-n-t-in-front-of-CC_SET-fr.patch b/patches/kernel/0054-x86-asm-Remove-unnecessary-n-t-in-front-of-CC_SET-fr.patch new file mode 100644 index 0000000..182565e --- /dev/null +++ b/patches/kernel/0054-x86-asm-Remove-unnecessary-n-t-in-front-of-CC_SET-fr.patch @@ -0,0 +1,149 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Uros Bizjak +Date: Wed, 6 Sep 2017 17:18:08 +0200 +Subject: [PATCH] x86/asm: Remove unnecessary \n\t in front of CC_SET() from + asm templates +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +There is no need for \n\t in front of CC_SET(), as the macro already includes these two. 
+ +Signed-off-by: Uros Bizjak +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/20170906151808.5634-1-ubizjak@gmail.com +Signed-off-by: Ingo Molnar +(backported from commit 3c52b5c64326d9dcfee4e10611c53ec1b1b20675) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 1c3f29ec5586e3aecfde2c6f83b8786e1aecd9ac) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/archrandom.h | 8 ++++---- + arch/x86/include/asm/bitops.h | 10 +++++----- + arch/x86/include/asm/percpu.h | 2 +- + arch/x86/include/asm/rmwcc.h | 2 +- + 4 files changed, 11 insertions(+), 11 deletions(-) + +diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h +index 5b0579abb398..3ac991d81e74 100644 +--- a/arch/x86/include/asm/archrandom.h ++++ b/arch/x86/include/asm/archrandom.h +@@ -45,7 +45,7 @@ static inline bool rdrand_long(unsigned long *v) + bool ok; + unsigned int retry = RDRAND_RETRY_LOOPS; + do { +- asm volatile(RDRAND_LONG "\n\t" ++ asm volatile(RDRAND_LONG + CC_SET(c) + : CC_OUT(c) (ok), "=a" (*v)); + if (ok) +@@ -59,7 +59,7 @@ static inline bool rdrand_int(unsigned int *v) + bool ok; + unsigned int retry = RDRAND_RETRY_LOOPS; + do { +- asm volatile(RDRAND_INT "\n\t" ++ asm volatile(RDRAND_INT + CC_SET(c) + : CC_OUT(c) (ok), "=a" (*v)); + if (ok) +@@ -71,7 +71,7 @@ static inline bool rdrand_int(unsigned int *v) + static inline bool rdseed_long(unsigned long *v) + { + bool ok; +- asm volatile(RDSEED_LONG "\n\t" ++ asm volatile(RDSEED_LONG + CC_SET(c) + : CC_OUT(c) (ok), "=a" (*v)); + return ok; +@@ -80,7 +80,7 @@ static inline bool rdseed_long(unsigned long *v) + static inline bool rdseed_int(unsigned int *v) + { + bool ok; +- asm volatile(RDSEED_INT "\n\t" ++ asm volatile(RDSEED_INT + CC_SET(c) + : CC_OUT(c) (ok), "=a" (*v)); + return ok; +diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h +index 854022772c5b..8cee8db6dffb 100644 +--- 
a/arch/x86/include/asm/bitops.h ++++ b/arch/x86/include/asm/bitops.h +@@ -142,7 +142,7 @@ static __always_inline void __clear_bit(long nr, volatile unsigned long *addr) + static __always_inline bool clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr) + { + bool negative; +- asm volatile(LOCK_PREFIX "andb %2,%1\n\t" ++ asm volatile(LOCK_PREFIX "andb %2,%1" + CC_SET(s) + : CC_OUT(s) (negative), ADDR + : "ir" ((char) ~(1 << nr)) : "memory"); +@@ -245,7 +245,7 @@ static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long * + { + bool oldbit; + +- asm("bts %2,%1\n\t" ++ asm("bts %2,%1" + CC_SET(c) + : CC_OUT(c) (oldbit), ADDR + : "Ir" (nr)); +@@ -285,7 +285,7 @@ static __always_inline bool __test_and_clear_bit(long nr, volatile unsigned long + { + bool oldbit; + +- asm volatile("btr %2,%1\n\t" ++ asm volatile("btr %2,%1" + CC_SET(c) + : CC_OUT(c) (oldbit), ADDR + : "Ir" (nr)); +@@ -297,7 +297,7 @@ static __always_inline bool __test_and_change_bit(long nr, volatile unsigned lon + { + bool oldbit; + +- asm volatile("btc %2,%1\n\t" ++ asm volatile("btc %2,%1" + CC_SET(c) + : CC_OUT(c) (oldbit), ADDR + : "Ir" (nr) : "memory"); +@@ -328,7 +328,7 @@ static __always_inline bool variable_test_bit(long nr, volatile const unsigned l + { + bool oldbit; + +- asm volatile("bt %2,%1\n\t" ++ asm volatile("bt %2,%1" + CC_SET(c) + : CC_OUT(c) (oldbit) + : "m" (*(unsigned long *)addr), "Ir" (nr)); +diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h +index 9fa03604b2b3..b21a475fd7ed 100644 +--- a/arch/x86/include/asm/percpu.h ++++ b/arch/x86/include/asm/percpu.h +@@ -525,7 +525,7 @@ static inline bool x86_this_cpu_variable_test_bit(int nr, + { + bool oldbit; + +- asm volatile("bt "__percpu_arg(2)",%1\n\t" ++ asm volatile("bt "__percpu_arg(2)",%1" + CC_SET(c) + : CC_OUT(c) (oldbit) + : "m" (*(unsigned long __percpu *)addr), "Ir" (nr)); +diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h +index 
661dd305694a..dd7ba5aa8dca 100644 +--- a/arch/x86/include/asm/rmwcc.h ++++ b/arch/x86/include/asm/rmwcc.h +@@ -28,7 +28,7 @@ cc_label: \ + #define __GEN_RMWcc(fullop, var, cc, ...) \ + do { \ + bool c; \ +- asm volatile (fullop ";" CC_SET(cc) \ ++ asm volatile (fullop CC_SET(cc) \ + : "+m" (var), CC_OUT(cc) (c) \ + : __VA_ARGS__ : "memory"); \ + return c; \ +-- +2.14.2 + diff --git a/patches/kernel/0055-objtool-Don-t-report-end-of-section-error-after-an-e.patch b/patches/kernel/0055-objtool-Don-t-report-end-of-section-error-after-an-e.patch new file mode 100644 index 0000000..67c73f2 --- /dev/null +++ b/patches/kernel/0055-objtool-Don-t-report-end-of-section-error-after-an-e.patch @@ -0,0 +1,58 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf +Date: Mon, 18 Sep 2017 21:43:30 -0500 +Subject: [PATCH] objtool: Don't report end of section error after an empty + unwind hint +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +If asm code specifies an UNWIND_HINT_EMPTY hint, don't warn if the +section ends unexpectedly. This can happen with the xen-head.S code +because the hypercall_page is "text" but it's all zeros. 
+ +Signed-off-by: Josh Poimboeuf +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Jiri Slaby +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/ddafe199dd8797e40e3c2777373347eba1d65572.1505764066.git.jpoimboe@redhat.com +Signed-off-by: Ingo Molnar +(cherry picked from commit 00d96180dc38ef872ac471c2d3e14b067cbd895d) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 9d22f903bba24f2ac86de8a81dc1788f9957aca8) +Signed-off-by: Fabian Grünbichler +--- + tools/objtool/check.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/tools/objtool/check.c b/tools/objtool/check.c +index 368275de5f23..0a86fd0ac082 100644 +--- a/tools/objtool/check.c ++++ b/tools/objtool/check.c +@@ -1652,11 +1652,14 @@ static int validate_branch(struct objtool_file *file, struct instruction *first, + if (insn->dead_end) + return 0; + +- insn = next_insn; +- if (!insn) { ++ if (!next_insn) { ++ if (state.cfa.base == CFI_UNDEFINED) ++ return 0; + WARN("%s: unexpected end of section", sec->name); + return 1; + } ++ ++ insn = next_insn; + } + + return 0; +-- +2.14.2 + diff --git a/patches/kernel/0055-x86-head-Remove-confusing-comment.patch b/patches/kernel/0055-x86-head-Remove-confusing-comment.patch deleted file mode 100644 index 82fe715..0000000 --- a/patches/kernel/0055-x86-head-Remove-confusing-comment.patch +++ /dev/null @@ -1,54 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Josh Poimboeuf -Date: Mon, 18 Sep 2017 21:43:31 -0500 -Subject: [PATCH] x86/head: Remove confusing comment -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -This comment is actively wrong and confusing. It refers to the -registers' stack offsets after the pt_regs has been constructed on the -stack, but this code is *before* that. 
- -At this point the stack just has the standard iret frame, for which no -comment should be needed. - -Signed-off-by: Josh Poimboeuf -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Jiri Slaby -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/a3c267b770fc56c9b86df9c11c552848248aace2.1505764066.git.jpoimboe@redhat.com -Signed-off-by: Ingo Molnar -(cherry picked from commit 17270717e80de33a884ad328fea5f407d87f6d6a) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 49187e0108184688304260a75d29b789f36f3a2b) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/head_64.S | 4 ---- - 1 file changed, 4 deletions(-) - -diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S -index 6225550883df..627c798b2f15 100644 ---- a/arch/x86/kernel/head_64.S -+++ b/arch/x86/kernel/head_64.S -@@ -254,10 +254,6 @@ bad_address: - - __INIT - ENTRY(early_idt_handler_array) -- # 104(%rsp) %rflags -- # 96(%rsp) %cs -- # 88(%rsp) %rip -- # 80(%rsp) error code - i = 0 - .rept NUM_EXCEPTION_VECTORS - .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 --- -2.14.2 - diff --git a/patches/kernel/0056-x86-head-Remove-confusing-comment.patch b/patches/kernel/0056-x86-head-Remove-confusing-comment.patch new file mode 100644 index 0000000..82fe715 --- /dev/null +++ b/patches/kernel/0056-x86-head-Remove-confusing-comment.patch @@ -0,0 +1,54 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf +Date: Mon, 18 Sep 2017 21:43:31 -0500 +Subject: [PATCH] x86/head: Remove confusing comment +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +This comment is actively wrong and confusing. It refers to the +registers' stack offsets after the pt_regs has been constructed on the +stack, but this code is *before* that. 
+ +At this point the stack just has the standard iret frame, for which no +comment should be needed. + +Signed-off-by: Josh Poimboeuf +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Jiri Slaby +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/a3c267b770fc56c9b86df9c11c552848248aace2.1505764066.git.jpoimboe@redhat.com +Signed-off-by: Ingo Molnar +(cherry picked from commit 17270717e80de33a884ad328fea5f407d87f6d6a) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 49187e0108184688304260a75d29b789f36f3a2b) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/head_64.S | 4 ---- + 1 file changed, 4 deletions(-) + +diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S +index 6225550883df..627c798b2f15 100644 +--- a/arch/x86/kernel/head_64.S ++++ b/arch/x86/kernel/head_64.S +@@ -254,10 +254,6 @@ bad_address: + + __INIT + ENTRY(early_idt_handler_array) +- # 104(%rsp) %rflags +- # 96(%rsp) %cs +- # 88(%rsp) %rip +- # 80(%rsp) error code + i = 0 + .rept NUM_EXCEPTION_VECTORS + .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 +-- +2.14.2 + diff --git a/patches/kernel/0056-x86-head-Remove-unused-bad_address-code.patch b/patches/kernel/0056-x86-head-Remove-unused-bad_address-code.patch deleted file mode 100644 index 62a5ad6..0000000 --- a/patches/kernel/0056-x86-head-Remove-unused-bad_address-code.patch +++ /dev/null @@ -1,48 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Josh Poimboeuf -Date: Mon, 18 Sep 2017 21:43:32 -0500 -Subject: [PATCH] x86/head: Remove unused 'bad_address' code -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -It's no longer possible for this code to be executed, so remove it. 
- -Signed-off-by: Josh Poimboeuf -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Jiri Slaby -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/32a46fe92d2083700599b36872b26e7dfd7b7965.1505764066.git.jpoimboe@redhat.com -Signed-off-by: Ingo Molnar -(cherry picked from commit a8b88e84d124bc92c4808e72b8b8c0e0bb538630) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit d790ff35a3a49ef0942a3484f024551433fd2ddf) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/head_64.S | 3 --- - 1 file changed, 3 deletions(-) - -diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S -index 627c798b2f15..37d9905d38d6 100644 ---- a/arch/x86/kernel/head_64.S -+++ b/arch/x86/kernel/head_64.S -@@ -249,9 +249,6 @@ ENDPROC(start_cpu0) - .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS - __FINITDATA - --bad_address: -- jmp bad_address -- - __INIT - ENTRY(early_idt_handler_array) - i = 0 --- -2.14.2 - diff --git a/patches/kernel/0057-x86-head-Fix-head-ELF-function-annotations.patch b/patches/kernel/0057-x86-head-Fix-head-ELF-function-annotations.patch deleted file mode 100644 index abf72c8..0000000 --- a/patches/kernel/0057-x86-head-Fix-head-ELF-function-annotations.patch +++ /dev/null @@ -1,66 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Josh Poimboeuf -Date: Mon, 18 Sep 2017 21:43:33 -0500 -Subject: [PATCH] x86/head: Fix head ELF function annotations -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -These functions aren't callable C-type functions, so don't annotate them -as such. 
- -Signed-off-by: Josh Poimboeuf -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Jiri Slaby -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/36eb182738c28514f8bf95e403d89b6413a88883.1505764066.git.jpoimboe@redhat.com -Signed-off-by: Ingo Molnar -(cherry picked from commit 015a2ea5478680fc5216d56b7ff306f2a74efaf9) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 707517a56928fed1c03eefdb4e00fa57dfddc4fd) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/head_64.S | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S -index 37d9905d38d6..45b18b1a6417 100644 ---- a/arch/x86/kernel/head_64.S -+++ b/arch/x86/kernel/head_64.S -@@ -218,7 +218,7 @@ ENTRY(secondary_startup_64) - pushq %rax # target address in negative space - lretq - .Lafter_lret: --ENDPROC(secondary_startup_64) -+END(secondary_startup_64) - - #include "verify_cpu.S" - -@@ -261,7 +261,7 @@ ENTRY(early_idt_handler_array) - i = i + 1 - .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc - .endr --ENDPROC(early_idt_handler_array) -+END(early_idt_handler_array) - - early_idt_handler_common: - /* -@@ -304,7 +304,7 @@ early_idt_handler_common: - 20: - decl early_recursion_flag(%rip) - jmp restore_regs_and_iret --ENDPROC(early_idt_handler_common) -+END(early_idt_handler_common) - - __INITDATA - --- -2.14.2 - diff --git a/patches/kernel/0057-x86-head-Remove-unused-bad_address-code.patch b/patches/kernel/0057-x86-head-Remove-unused-bad_address-code.patch new file mode 100644 index 0000000..62a5ad6 --- /dev/null +++ b/patches/kernel/0057-x86-head-Remove-unused-bad_address-code.patch @@ -0,0 +1,48 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf +Date: Mon, 18 Sep 2017 21:43:32 -0500 +Subject: [PATCH] x86/head: Remove unused 'bad_address' code +MIME-Version: 
1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +It's no longer possible for this code to be executed, so remove it. + +Signed-off-by: Josh Poimboeuf +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Jiri Slaby +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/32a46fe92d2083700599b36872b26e7dfd7b7965.1505764066.git.jpoimboe@redhat.com +Signed-off-by: Ingo Molnar +(cherry picked from commit a8b88e84d124bc92c4808e72b8b8c0e0bb538630) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit d790ff35a3a49ef0942a3484f024551433fd2ddf) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/head_64.S | 3 --- + 1 file changed, 3 deletions(-) + +diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S +index 627c798b2f15..37d9905d38d6 100644 +--- a/arch/x86/kernel/head_64.S ++++ b/arch/x86/kernel/head_64.S +@@ -249,9 +249,6 @@ ENDPROC(start_cpu0) + .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS + __FINITDATA + +-bad_address: +- jmp bad_address +- + __INIT + ENTRY(early_idt_handler_array) + i = 0 +-- +2.14.2 + diff --git a/patches/kernel/0058-x86-boot-Annotate-verify_cpu-as-a-callable-function.patch b/patches/kernel/0058-x86-boot-Annotate-verify_cpu-as-a-callable-function.patch deleted file mode 100644 index 23456b4..0000000 --- a/patches/kernel/0058-x86-boot-Annotate-verify_cpu-as-a-callable-function.patch +++ /dev/null @@ -1,52 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Josh Poimboeuf -Date: Mon, 18 Sep 2017 21:43:34 -0500 -Subject: [PATCH] x86/boot: Annotate verify_cpu() as a callable function -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -verify_cpu() is a callable function. Annotate it as such. 
- -Signed-off-by: Josh Poimboeuf -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Jiri Slaby -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/293024b8a080832075312f38c07ccc970fc70292.1505764066.git.jpoimboe@redhat.com -Signed-off-by: Ingo Molnar -(cherry picked from commit e93db75a0054b23a874a12c63376753544f3fe9e) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 48a432c46026f864e194cdf9a8133e7c9109274e) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/verify_cpu.S | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S -index 014ea59aa153..3d3c2f71f617 100644 ---- a/arch/x86/kernel/verify_cpu.S -+++ b/arch/x86/kernel/verify_cpu.S -@@ -33,7 +33,7 @@ - #include - #include - --verify_cpu: -+ENTRY(verify_cpu) - pushf # Save caller passed flags - push $0 # Kill any dangerous flags - popf -@@ -139,3 +139,4 @@ verify_cpu: - popf # Restore caller passed flags - xorl %eax, %eax - ret -+ENDPROC(verify_cpu) --- -2.14.2 - diff --git a/patches/kernel/0058-x86-head-Fix-head-ELF-function-annotations.patch b/patches/kernel/0058-x86-head-Fix-head-ELF-function-annotations.patch new file mode 100644 index 0000000..abf72c8 --- /dev/null +++ b/patches/kernel/0058-x86-head-Fix-head-ELF-function-annotations.patch @@ -0,0 +1,66 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf +Date: Mon, 18 Sep 2017 21:43:33 -0500 +Subject: [PATCH] x86/head: Fix head ELF function annotations +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +These functions aren't callable C-type functions, so don't annotate them +as such. 
+ +Signed-off-by: Josh Poimboeuf +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Jiri Slaby +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/36eb182738c28514f8bf95e403d89b6413a88883.1505764066.git.jpoimboe@redhat.com +Signed-off-by: Ingo Molnar +(cherry picked from commit 015a2ea5478680fc5216d56b7ff306f2a74efaf9) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 707517a56928fed1c03eefdb4e00fa57dfddc4fd) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/head_64.S | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S +index 37d9905d38d6..45b18b1a6417 100644 +--- a/arch/x86/kernel/head_64.S ++++ b/arch/x86/kernel/head_64.S +@@ -218,7 +218,7 @@ ENTRY(secondary_startup_64) + pushq %rax # target address in negative space + lretq + .Lafter_lret: +-ENDPROC(secondary_startup_64) ++END(secondary_startup_64) + + #include "verify_cpu.S" + +@@ -261,7 +261,7 @@ ENTRY(early_idt_handler_array) + i = i + 1 + .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc + .endr +-ENDPROC(early_idt_handler_array) ++END(early_idt_handler_array) + + early_idt_handler_common: + /* +@@ -304,7 +304,7 @@ early_idt_handler_common: + 20: + decl early_recursion_flag(%rip) + jmp restore_regs_and_iret +-ENDPROC(early_idt_handler_common) ++END(early_idt_handler_common) + + __INITDATA + +-- +2.14.2 + diff --git a/patches/kernel/0059-x86-boot-Annotate-verify_cpu-as-a-callable-function.patch b/patches/kernel/0059-x86-boot-Annotate-verify_cpu-as-a-callable-function.patch new file mode 100644 index 0000000..23456b4 --- /dev/null +++ b/patches/kernel/0059-x86-boot-Annotate-verify_cpu-as-a-callable-function.patch @@ -0,0 +1,52 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf +Date: Mon, 18 Sep 2017 21:43:34 -0500 +Subject: [PATCH] x86/boot: Annotate 
verify_cpu() as a callable function +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +verify_cpu() is a callable function. Annotate it as such. + +Signed-off-by: Josh Poimboeuf +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Jiri Slaby +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/293024b8a080832075312f38c07ccc970fc70292.1505764066.git.jpoimboe@redhat.com +Signed-off-by: Ingo Molnar +(cherry picked from commit e93db75a0054b23a874a12c63376753544f3fe9e) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 48a432c46026f864e194cdf9a8133e7c9109274e) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/verify_cpu.S | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S +index 014ea59aa153..3d3c2f71f617 100644 +--- a/arch/x86/kernel/verify_cpu.S ++++ b/arch/x86/kernel/verify_cpu.S +@@ -33,7 +33,7 @@ + #include + #include + +-verify_cpu: ++ENTRY(verify_cpu) + pushf # Save caller passed flags + push $0 # Kill any dangerous flags + popf +@@ -139,3 +139,4 @@ verify_cpu: + popf # Restore caller passed flags + xorl %eax, %eax + ret ++ENDPROC(verify_cpu) +-- +2.14.2 + diff --git a/patches/kernel/0059-x86-xen-Fix-xen-head-ELF-annotations.patch b/patches/kernel/0059-x86-xen-Fix-xen-head-ELF-annotations.patch deleted file mode 100644 index d261ae4..0000000 --- a/patches/kernel/0059-x86-xen-Fix-xen-head-ELF-annotations.patch +++ /dev/null @@ -1,56 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Josh Poimboeuf -Date: Mon, 18 Sep 2017 21:43:35 -0500 -Subject: [PATCH] x86/xen: Fix xen head ELF annotations -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Mark the ends of the startup_xen and hypercall_page code sections. 
- -Signed-off-by: Josh Poimboeuf -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Jiri Slaby -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/3a80a394d30af43d9cefa1a29628c45ed8420c97.1505764066.git.jpoimboe@redhat.com -Signed-off-by: Ingo Molnar -(cherry picked from commit 2582d3df95c76d3b686453baf90b64d57e87d1e8) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit b9410861f1436c1e38958a9b85009ad252aad9f5) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/xen/xen-head.S | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S -index 72a8e6adebe6..2f0cff2cc265 100644 ---- a/arch/x86/xen/xen-head.S -+++ b/arch/x86/xen/xen-head.S -@@ -33,7 +33,7 @@ ENTRY(startup_xen) - mov $init_thread_union+THREAD_SIZE, %_ASM_SP - - jmp xen_start_kernel -- -+END(startup_xen) - __FINIT - #endif - -@@ -47,7 +47,7 @@ ENTRY(hypercall_page) - .type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32 - #include - #undef HYPERCALL -- -+END(hypercall_page) - .popsection - - ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") --- -2.14.2 - diff --git a/patches/kernel/0060-x86-xen-Add-unwind-hint-annotations.patch b/patches/kernel/0060-x86-xen-Add-unwind-hint-annotations.patch deleted file mode 100644 index ce10c49..0000000 --- a/patches/kernel/0060-x86-xen-Add-unwind-hint-annotations.patch +++ /dev/null @@ -1,70 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Josh Poimboeuf -Date: Mon, 18 Sep 2017 21:43:36 -0500 -Subject: [PATCH] x86/xen: Add unwind hint annotations -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Add unwind hint annotations to the xen head code so the ORC unwinder can -read head_64.o. 
- -hypercall_page needs empty annotations at 32-byte intervals to match the -'xen_hypercall_*' ELF functions at those locations. - -Signed-off-by: Josh Poimboeuf -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Jiri Slaby -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/70ed2eb516fe9266be766d953f93c2571bca88cc.1505764066.git.jpoimboe@redhat.com -Signed-off-by: Ingo Molnar -(cherry picked from commit abbe1cac6214d81d2f4e149aba64a8760703144e) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 9f099a90cb39eaff9b3187e8a6d8151c8af53db1) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/xen/xen-head.S | 7 ++++++- - 1 file changed, 6 insertions(+), 1 deletion(-) - -diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S -index 2f0cff2cc265..ad189ab2c329 100644 ---- a/arch/x86/xen/xen-head.S -+++ b/arch/x86/xen/xen-head.S -@@ -9,6 +9,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -19,6 +20,7 @@ - #ifdef CONFIG_XEN_PV - __INIT - ENTRY(startup_xen) -+ UNWIND_HINT_EMPTY - cld - - /* Clear .bss */ -@@ -40,7 +42,10 @@ END(startup_xen) - .pushsection .text - .balign PAGE_SIZE - ENTRY(hypercall_page) -- .skip PAGE_SIZE -+ .rept (PAGE_SIZE / 32) -+ UNWIND_HINT_EMPTY -+ .skip 32 -+ .endr - - #define HYPERCALL(n) \ - .equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \ --- -2.14.2 - diff --git a/patches/kernel/0060-x86-xen-Fix-xen-head-ELF-annotations.patch b/patches/kernel/0060-x86-xen-Fix-xen-head-ELF-annotations.patch new file mode 100644 index 0000000..d261ae4 --- /dev/null +++ b/patches/kernel/0060-x86-xen-Fix-xen-head-ELF-annotations.patch @@ -0,0 +1,56 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf +Date: Mon, 18 Sep 2017 21:43:35 -0500 +Subject: [PATCH] x86/xen: Fix xen head ELF annotations +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 
+Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Mark the ends of the startup_xen and hypercall_page code sections. + +Signed-off-by: Josh Poimboeuf +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Jiri Slaby +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/3a80a394d30af43d9cefa1a29628c45ed8420c97.1505764066.git.jpoimboe@redhat.com +Signed-off-by: Ingo Molnar +(cherry picked from commit 2582d3df95c76d3b686453baf90b64d57e87d1e8) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit b9410861f1436c1e38958a9b85009ad252aad9f5) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/xen/xen-head.S | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S +index 72a8e6adebe6..2f0cff2cc265 100644 +--- a/arch/x86/xen/xen-head.S ++++ b/arch/x86/xen/xen-head.S +@@ -33,7 +33,7 @@ ENTRY(startup_xen) + mov $init_thread_union+THREAD_SIZE, %_ASM_SP + + jmp xen_start_kernel +- ++END(startup_xen) + __FINIT + #endif + +@@ -47,7 +47,7 @@ ENTRY(hypercall_page) + .type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32 + #include + #undef HYPERCALL +- ++END(hypercall_page) + .popsection + + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") +-- +2.14.2 + diff --git a/patches/kernel/0061-x86-head-Add-unwind-hint-annotations.patch b/patches/kernel/0061-x86-head-Add-unwind-hint-annotations.patch deleted file mode 100644 index 9579011..0000000 --- a/patches/kernel/0061-x86-head-Add-unwind-hint-annotations.patch +++ /dev/null @@ -1,134 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Josh Poimboeuf -Date: Mon, 18 Sep 2017 21:43:37 -0500 -Subject: [PATCH] x86/head: Add unwind hint annotations -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Jiri Slaby reported an ORC issue when unwinding from an idle task. 
The -stack was: - - ffffffff811083c2 do_idle+0x142/0x1e0 - ffffffff8110861d cpu_startup_entry+0x5d/0x60 - ffffffff82715f58 start_kernel+0x3ff/0x407 - ffffffff827153e8 x86_64_start_kernel+0x14e/0x15d - ffffffff810001bf secondary_startup_64+0x9f/0xa0 - -The ORC unwinder errored out at secondary_startup_64 because the head -code isn't annotated yet so there wasn't a corresponding ORC entry. - -Fix that and any other head-related unwinding issues by adding unwind -hints to the head code. - -Reported-by: Jiri Slaby -Tested-by: Jiri Slaby -Signed-off-by: Josh Poimboeuf -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/78ef000a2f68f545d6eef44ee912edceaad82ccf.1505764066.git.jpoimboe@redhat.com -Signed-off-by: Ingo Molnar -(cherry picked from commit 2704fbb672d0d9a19414907fda7949283dcef6a1) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit b63a868e404e64172afefea553c6a40963a151db) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/Makefile | 1 - - arch/x86/kernel/head_64.S | 14 ++++++++++++-- - 2 files changed, 12 insertions(+), 3 deletions(-) - -diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile -index 287eac7d207f..e2315aecc441 100644 ---- a/arch/x86/kernel/Makefile -+++ b/arch/x86/kernel/Makefile -@@ -26,7 +26,6 @@ KASAN_SANITIZE_dumpstack.o := n - KASAN_SANITIZE_dumpstack_$(BITS).o := n - KASAN_SANITIZE_stacktrace.o := n - --OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y - OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y - OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y - OBJECT_FILES_NON_STANDARD_test_nx.o := y -diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S -index 45b18b1a6417..d081bc7a027d 100644 ---- a/arch/x86/kernel/head_64.S -+++ b/arch/x86/kernel/head_64.S -@@ -49,6 +49,7 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map) - .code64 - .globl startup_64 - 
startup_64: -+ UNWIND_HINT_EMPTY - /* - * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, - * and someone has loaded an identity mapped page table -@@ -81,6 +82,7 @@ startup_64: - movq $(early_top_pgt - __START_KERNEL_map), %rax - jmp 1f - ENTRY(secondary_startup_64) -+ UNWIND_HINT_EMPTY - /* - * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, - * and someone has loaded a mapped page table. -@@ -116,6 +118,7 @@ ENTRY(secondary_startup_64) - movq $1f, %rax - jmp *%rax - 1: -+ UNWIND_HINT_EMPTY - - /* Check if nx is implemented */ - movl $0x80000001, %eax -@@ -230,6 +233,7 @@ END(secondary_startup_64) - */ - ENTRY(start_cpu0) - movq initial_stack(%rip), %rsp -+ UNWIND_HINT_EMPTY - jmp .Ljump_to_C_code - ENDPROC(start_cpu0) - #endif -@@ -254,13 +258,18 @@ ENTRY(early_idt_handler_array) - i = 0 - .rept NUM_EXCEPTION_VECTORS - .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 -- pushq $0 # Dummy error code, to make stack frame uniform -+ UNWIND_HINT_IRET_REGS -+ pushq $0 # Dummy error code, to make stack frame uniform -+ .else -+ UNWIND_HINT_IRET_REGS offset=8 - .endif - pushq $i # 72(%rsp) Vector number - jmp early_idt_handler_common -+ UNWIND_HINT_IRET_REGS - i = i + 1 - .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc - .endr -+ UNWIND_HINT_IRET_REGS offset=16 - END(early_idt_handler_array) - - early_idt_handler_common: -@@ -289,6 +298,7 @@ early_idt_handler_common: - pushq %r13 /* pt_regs->r13 */ - pushq %r14 /* pt_regs->r14 */ - pushq %r15 /* pt_regs->r15 */ -+ UNWIND_HINT_REGS - - cmpq $14,%rsi /* Page fault? 
*/ - jnz 10f -@@ -411,7 +421,7 @@ ENTRY(phys_base) - EXPORT_SYMBOL(phys_base) - - #include "../../x86/xen/xen-head.S" -- -+ - __PAGE_ALIGNED_BSS - NEXT_PAGE(empty_zero_page) - .skip PAGE_SIZE --- -2.14.2 - diff --git a/patches/kernel/0061-x86-xen-Add-unwind-hint-annotations.patch b/patches/kernel/0061-x86-xen-Add-unwind-hint-annotations.patch new file mode 100644 index 0000000..ce10c49 --- /dev/null +++ b/patches/kernel/0061-x86-xen-Add-unwind-hint-annotations.patch @@ -0,0 +1,70 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf +Date: Mon, 18 Sep 2017 21:43:36 -0500 +Subject: [PATCH] x86/xen: Add unwind hint annotations +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Add unwind hint annotations to the xen head code so the ORC unwinder can +read head_64.o. + +hypercall_page needs empty annotations at 32-byte intervals to match the +'xen_hypercall_*' ELF functions at those locations. 
+ +Signed-off-by: Josh Poimboeuf +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Jiri Slaby +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/70ed2eb516fe9266be766d953f93c2571bca88cc.1505764066.git.jpoimboe@redhat.com +Signed-off-by: Ingo Molnar +(cherry picked from commit abbe1cac6214d81d2f4e149aba64a8760703144e) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 9f099a90cb39eaff9b3187e8a6d8151c8af53db1) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/xen/xen-head.S | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S +index 2f0cff2cc265..ad189ab2c329 100644 +--- a/arch/x86/xen/xen-head.S ++++ b/arch/x86/xen/xen-head.S +@@ -9,6 +9,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -19,6 +20,7 @@ + #ifdef CONFIG_XEN_PV + __INIT + ENTRY(startup_xen) ++ UNWIND_HINT_EMPTY + cld + + /* Clear .bss */ +@@ -40,7 +42,10 @@ END(startup_xen) + .pushsection .text + .balign PAGE_SIZE + ENTRY(hypercall_page) +- .skip PAGE_SIZE ++ .rept (PAGE_SIZE / 32) ++ UNWIND_HINT_EMPTY ++ .skip 32 ++ .endr + + #define HYPERCALL(n) \ + .equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \ +-- +2.14.2 + diff --git a/patches/kernel/0062-ACPI-APEI-adjust-a-local-variable-type-in-ghes_iorem.patch b/patches/kernel/0062-ACPI-APEI-adjust-a-local-variable-type-in-ghes_iorem.patch deleted file mode 100644 index 78d3cb4..0000000 --- a/patches/kernel/0062-ACPI-APEI-adjust-a-local-variable-type-in-ghes_iorem.patch +++ /dev/null @@ -1,43 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Jan Beulich -Date: Mon, 25 Sep 2017 02:06:19 -0600 -Subject: [PATCH] ACPI / APEI: adjust a local variable type in - ghes_ioremap_pfn_irq() -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Match up 
with what 7edda0886b ("acpi: apei: handle SEA notification -type for ARMv8") did for ghes_ioremap_pfn_nmi(). - -Signed-off-by: Jan Beulich -Reviewed-by: Borislav Petkov -Signed-off-by: Rafael J. Wysocki -(cherry picked from commit 095f613c6b386a1704b73a549e9ba66c1d5381ae) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 0a5c092882b0ead111dc3a6bbaa870665b54d796) -Signed-off-by: Fabian Grünbichler ---- - drivers/acpi/apei/ghes.c | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c -index d661d452b238..3628078ee351 100644 ---- a/drivers/acpi/apei/ghes.c -+++ b/drivers/acpi/apei/ghes.c -@@ -174,7 +174,8 @@ static void __iomem *ghes_ioremap_pfn_nmi(u64 pfn) - - static void __iomem *ghes_ioremap_pfn_irq(u64 pfn) - { -- unsigned long vaddr, paddr; -+ unsigned long vaddr; -+ phys_addr_t paddr; - pgprot_t prot; - - vaddr = (unsigned long)GHES_IOREMAP_IRQ_PAGE(ghes_ioremap_area->addr); --- -2.14.2 - diff --git a/patches/kernel/0062-x86-head-Add-unwind-hint-annotations.patch b/patches/kernel/0062-x86-head-Add-unwind-hint-annotations.patch new file mode 100644 index 0000000..9579011 --- /dev/null +++ b/patches/kernel/0062-x86-head-Add-unwind-hint-annotations.patch @@ -0,0 +1,134 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf +Date: Mon, 18 Sep 2017 21:43:37 -0500 +Subject: [PATCH] x86/head: Add unwind hint annotations +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Jiri Slaby reported an ORC issue when unwinding from an idle task. 
The +stack was: + + ffffffff811083c2 do_idle+0x142/0x1e0 + ffffffff8110861d cpu_startup_entry+0x5d/0x60 + ffffffff82715f58 start_kernel+0x3ff/0x407 + ffffffff827153e8 x86_64_start_kernel+0x14e/0x15d + ffffffff810001bf secondary_startup_64+0x9f/0xa0 + +The ORC unwinder errored out at secondary_startup_64 because the head +code isn't annotated yet so there wasn't a corresponding ORC entry. + +Fix that and any other head-related unwinding issues by adding unwind +hints to the head code. + +Reported-by: Jiri Slaby +Tested-by: Jiri Slaby +Signed-off-by: Josh Poimboeuf +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/78ef000a2f68f545d6eef44ee912edceaad82ccf.1505764066.git.jpoimboe@redhat.com +Signed-off-by: Ingo Molnar +(cherry picked from commit 2704fbb672d0d9a19414907fda7949283dcef6a1) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit b63a868e404e64172afefea553c6a40963a151db) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/Makefile | 1 - + arch/x86/kernel/head_64.S | 14 ++++++++++++-- + 2 files changed, 12 insertions(+), 3 deletions(-) + +diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile +index 287eac7d207f..e2315aecc441 100644 +--- a/arch/x86/kernel/Makefile ++++ b/arch/x86/kernel/Makefile +@@ -26,7 +26,6 @@ KASAN_SANITIZE_dumpstack.o := n + KASAN_SANITIZE_dumpstack_$(BITS).o := n + KASAN_SANITIZE_stacktrace.o := n + +-OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y + OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y + OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y + OBJECT_FILES_NON_STANDARD_test_nx.o := y +diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S +index 45b18b1a6417..d081bc7a027d 100644 +--- a/arch/x86/kernel/head_64.S ++++ b/arch/x86/kernel/head_64.S +@@ -49,6 +49,7 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map) + .code64 + .globl startup_64 + 
startup_64: ++ UNWIND_HINT_EMPTY + /* + * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, + * and someone has loaded an identity mapped page table +@@ -81,6 +82,7 @@ startup_64: + movq $(early_top_pgt - __START_KERNEL_map), %rax + jmp 1f + ENTRY(secondary_startup_64) ++ UNWIND_HINT_EMPTY + /* + * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, + * and someone has loaded a mapped page table. +@@ -116,6 +118,7 @@ ENTRY(secondary_startup_64) + movq $1f, %rax + jmp *%rax + 1: ++ UNWIND_HINT_EMPTY + + /* Check if nx is implemented */ + movl $0x80000001, %eax +@@ -230,6 +233,7 @@ END(secondary_startup_64) + */ + ENTRY(start_cpu0) + movq initial_stack(%rip), %rsp ++ UNWIND_HINT_EMPTY + jmp .Ljump_to_C_code + ENDPROC(start_cpu0) + #endif +@@ -254,13 +258,18 @@ ENTRY(early_idt_handler_array) + i = 0 + .rept NUM_EXCEPTION_VECTORS + .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 +- pushq $0 # Dummy error code, to make stack frame uniform ++ UNWIND_HINT_IRET_REGS ++ pushq $0 # Dummy error code, to make stack frame uniform ++ .else ++ UNWIND_HINT_IRET_REGS offset=8 + .endif + pushq $i # 72(%rsp) Vector number + jmp early_idt_handler_common ++ UNWIND_HINT_IRET_REGS + i = i + 1 + .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc + .endr ++ UNWIND_HINT_IRET_REGS offset=16 + END(early_idt_handler_array) + + early_idt_handler_common: +@@ -289,6 +298,7 @@ early_idt_handler_common: + pushq %r13 /* pt_regs->r13 */ + pushq %r14 /* pt_regs->r14 */ + pushq %r15 /* pt_regs->r15 */ ++ UNWIND_HINT_REGS + + cmpq $14,%rsi /* Page fault? 
*/ + jnz 10f +@@ -411,7 +421,7 @@ ENTRY(phys_base) + EXPORT_SYMBOL(phys_base) + + #include "../../x86/xen/xen-head.S" +- ++ + __PAGE_ALIGNED_BSS + NEXT_PAGE(empty_zero_page) + .skip PAGE_SIZE +-- +2.14.2 + diff --git a/patches/kernel/0063-ACPI-APEI-adjust-a-local-variable-type-in-ghes_iorem.patch b/patches/kernel/0063-ACPI-APEI-adjust-a-local-variable-type-in-ghes_iorem.patch new file mode 100644 index 0000000..78d3cb4 --- /dev/null +++ b/patches/kernel/0063-ACPI-APEI-adjust-a-local-variable-type-in-ghes_iorem.patch @@ -0,0 +1,43 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Jan Beulich +Date: Mon, 25 Sep 2017 02:06:19 -0600 +Subject: [PATCH] ACPI / APEI: adjust a local variable type in + ghes_ioremap_pfn_irq() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Match up with what 7edda0886b ("acpi: apei: handle SEA notification +type for ARMv8") did for ghes_ioremap_pfn_nmi(). + +Signed-off-by: Jan Beulich +Reviewed-by: Borislav Petkov +Signed-off-by: Rafael J. 
Wysocki +(cherry picked from commit 095f613c6b386a1704b73a549e9ba66c1d5381ae) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 0a5c092882b0ead111dc3a6bbaa870665b54d796) +Signed-off-by: Fabian Grünbichler +--- + drivers/acpi/apei/ghes.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c +index d661d452b238..3628078ee351 100644 +--- a/drivers/acpi/apei/ghes.c ++++ b/drivers/acpi/apei/ghes.c +@@ -174,7 +174,8 @@ static void __iomem *ghes_ioremap_pfn_nmi(u64 pfn) + + static void __iomem *ghes_ioremap_pfn_irq(u64 pfn) + { +- unsigned long vaddr, paddr; ++ unsigned long vaddr; ++ phys_addr_t paddr; + pgprot_t prot; + + vaddr = (unsigned long)GHES_IOREMAP_IRQ_PAGE(ghes_ioremap_area->addr); +-- +2.14.2 + diff --git a/patches/kernel/0063-x86-unwinder-Make-CONFIG_UNWINDER_ORC-y-the-default-.patch b/patches/kernel/0063-x86-unwinder-Make-CONFIG_UNWINDER_ORC-y-the-default-.patch deleted file mode 100644 index a04c95d..0000000 --- a/patches/kernel/0063-x86-unwinder-Make-CONFIG_UNWINDER_ORC-y-the-default-.patch +++ /dev/null @@ -1,44 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Ingo Molnar -Date: Thu, 12 Oct 2017 09:24:30 +0200 -Subject: [PATCH] x86/unwinder: Make CONFIG_UNWINDER_ORC=y the default in the - 64-bit defconfig -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Increase testing coverage by turning on the primary x86 unwinder for -the 64-bit defconfig. 
- -Cc: Josh Poimboeuf -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Cc: linux-kernel@vger.kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit 1e4078f0bba46ad61b69548abe6a6faf63b89380) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit ebcba768c005dce435721f6c998e3afdf5534666) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/configs/x86_64_defconfig | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig -index 4a4b16e56d35..eb65c248708d 100644 ---- a/arch/x86/configs/x86_64_defconfig -+++ b/arch/x86/configs/x86_64_defconfig -@@ -299,6 +299,7 @@ CONFIG_DEBUG_STACKOVERFLOW=y - # CONFIG_DEBUG_RODATA_TEST is not set - CONFIG_DEBUG_BOOT_PARAMS=y - CONFIG_OPTIMIZE_INLINING=y -+CONFIG_ORC_UNWINDER=y - CONFIG_SECURITY=y - CONFIG_SECURITY_NETWORK=y - CONFIG_SECURITY_SELINUX=y --- -2.14.2 - diff --git a/patches/kernel/0064-x86-fpu-debug-Remove-unused-x86_fpu_state-and-x86_fp.patch b/patches/kernel/0064-x86-fpu-debug-Remove-unused-x86_fpu_state-and-x86_fp.patch deleted file mode 100644 index be4f5ca..0000000 --- a/patches/kernel/0064-x86-fpu-debug-Remove-unused-x86_fpu_state-and-x86_fp.patch +++ /dev/null @@ -1,66 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: "Steven Rostedt (VMware)" -Date: Thu, 12 Oct 2017 18:06:19 -0400 -Subject: [PATCH] x86/fpu/debug: Remove unused 'x86_fpu_state' and - 'x86_fpu_deactivate_state' tracepoints -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Commit: - - d1898b733619 ("x86/fpu: Add tracepoints to dump FPU state at key points") - -... added the 'x86_fpu_state' and 'x86_fpu_deactivate_state' trace points, -but never used them. Today they are still not used. As they take up -and waste memory, remove them. 
- -Signed-off-by: Steven Rostedt (VMware) -Cc: Dave Hansen -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/20171012180619.670b68b6@gandalf.local.home -Signed-off-by: Ingo Molnar -(cherry picked from commit 127a1bea40f7f2a36bc7207ea4d51bb6b4e936fa) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit c7c367ddb6ffb6af2cfee287960e97c4aefc6548) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/trace/fpu.h | 10 ---------- - 1 file changed, 10 deletions(-) - -diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h -index 342e59789fcd..fed7d9ecae60 100644 ---- a/arch/x86/include/asm/trace/fpu.h -+++ b/arch/x86/include/asm/trace/fpu.h -@@ -36,11 +36,6 @@ DECLARE_EVENT_CLASS(x86_fpu, - ) - ); - --DEFINE_EVENT(x86_fpu, x86_fpu_state, -- TP_PROTO(struct fpu *fpu), -- TP_ARGS(fpu) --); -- - DEFINE_EVENT(x86_fpu, x86_fpu_before_save, - TP_PROTO(struct fpu *fpu), - TP_ARGS(fpu) -@@ -76,11 +71,6 @@ DEFINE_EVENT(x86_fpu, x86_fpu_activate_state, - TP_ARGS(fpu) - ); - --DEFINE_EVENT(x86_fpu, x86_fpu_deactivate_state, -- TP_PROTO(struct fpu *fpu), -- TP_ARGS(fpu) --); -- - DEFINE_EVENT(x86_fpu, x86_fpu_init_state, - TP_PROTO(struct fpu *fpu), - TP_ARGS(fpu) --- -2.14.2 - diff --git a/patches/kernel/0064-x86-unwinder-Make-CONFIG_UNWINDER_ORC-y-the-default-.patch b/patches/kernel/0064-x86-unwinder-Make-CONFIG_UNWINDER_ORC-y-the-default-.patch new file mode 100644 index 0000000..a04c95d --- /dev/null +++ b/patches/kernel/0064-x86-unwinder-Make-CONFIG_UNWINDER_ORC-y-the-default-.patch @@ -0,0 +1,44 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Ingo Molnar +Date: Thu, 12 Oct 2017 09:24:30 +0200 +Subject: [PATCH] x86/unwinder: Make CONFIG_UNWINDER_ORC=y the default in the + 64-bit defconfig +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Increase testing 
coverage by turning on the primary x86 unwinder for +the 64-bit defconfig. + +Cc: Josh Poimboeuf +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit 1e4078f0bba46ad61b69548abe6a6faf63b89380) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit ebcba768c005dce435721f6c998e3afdf5534666) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/configs/x86_64_defconfig | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig +index 4a4b16e56d35..eb65c248708d 100644 +--- a/arch/x86/configs/x86_64_defconfig ++++ b/arch/x86/configs/x86_64_defconfig +@@ -299,6 +299,7 @@ CONFIG_DEBUG_STACKOVERFLOW=y + # CONFIG_DEBUG_RODATA_TEST is not set + CONFIG_DEBUG_BOOT_PARAMS=y + CONFIG_OPTIMIZE_INLINING=y ++CONFIG_ORC_UNWINDER=y + CONFIG_SECURITY=y + CONFIG_SECURITY_NETWORK=y + CONFIG_SECURITY_SELINUX=y +-- +2.14.2 + diff --git a/patches/kernel/0065-x86-fpu-debug-Remove-unused-x86_fpu_state-and-x86_fp.patch b/patches/kernel/0065-x86-fpu-debug-Remove-unused-x86_fpu_state-and-x86_fp.patch new file mode 100644 index 0000000..be4f5ca --- /dev/null +++ b/patches/kernel/0065-x86-fpu-debug-Remove-unused-x86_fpu_state-and-x86_fp.patch @@ -0,0 +1,66 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: "Steven Rostedt (VMware)" +Date: Thu, 12 Oct 2017 18:06:19 -0400 +Subject: [PATCH] x86/fpu/debug: Remove unused 'x86_fpu_state' and + 'x86_fpu_deactivate_state' tracepoints +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Commit: + + d1898b733619 ("x86/fpu: Add tracepoints to dump FPU state at key points") + +... added the 'x86_fpu_state' and 'x86_fpu_deactivate_state' trace points, +but never used them. Today they are still not used. As they take up +and waste memory, remove them. 
+ +Signed-off-by: Steven Rostedt (VMware) +Cc: Dave Hansen +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/20171012180619.670b68b6@gandalf.local.home +Signed-off-by: Ingo Molnar +(cherry picked from commit 127a1bea40f7f2a36bc7207ea4d51bb6b4e936fa) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit c7c367ddb6ffb6af2cfee287960e97c4aefc6548) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/trace/fpu.h | 10 ---------- + 1 file changed, 10 deletions(-) + +diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h +index 342e59789fcd..fed7d9ecae60 100644 +--- a/arch/x86/include/asm/trace/fpu.h ++++ b/arch/x86/include/asm/trace/fpu.h +@@ -36,11 +36,6 @@ DECLARE_EVENT_CLASS(x86_fpu, + ) + ); + +-DEFINE_EVENT(x86_fpu, x86_fpu_state, +- TP_PROTO(struct fpu *fpu), +- TP_ARGS(fpu) +-); +- + DEFINE_EVENT(x86_fpu, x86_fpu_before_save, + TP_PROTO(struct fpu *fpu), + TP_ARGS(fpu) +@@ -76,11 +71,6 @@ DEFINE_EVENT(x86_fpu, x86_fpu_activate_state, + TP_ARGS(fpu) + ); + +-DEFINE_EVENT(x86_fpu, x86_fpu_deactivate_state, +- TP_PROTO(struct fpu *fpu), +- TP_ARGS(fpu) +-); +- + DEFINE_EVENT(x86_fpu, x86_fpu_init_state, + TP_PROTO(struct fpu *fpu), + TP_ARGS(fpu) +-- +2.14.2 + diff --git a/patches/kernel/0065-x86-unwind-Rename-unwinder-config-options-to-CONFIG_.patch b/patches/kernel/0065-x86-unwind-Rename-unwinder-config-options-to-CONFIG_.patch deleted file mode 100644 index 9689db5..0000000 --- a/patches/kernel/0065-x86-unwind-Rename-unwinder-config-options-to-CONFIG_.patch +++ /dev/null @@ -1,273 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Josh Poimboeuf -Date: Fri, 13 Oct 2017 15:02:00 -0500 -Subject: [PATCH] x86/unwind: Rename unwinder config options to - 'CONFIG_UNWINDER_*' -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Rename the unwinder config 
options from: - - CONFIG_ORC_UNWINDER - CONFIG_FRAME_POINTER_UNWINDER - CONFIG_GUESS_UNWINDER - -to: - - CONFIG_UNWINDER_ORC - CONFIG_UNWINDER_FRAME_POINTER - CONFIG_UNWINDER_GUESS - -... in order to give them a more logical config namespace. - -Suggested-by: Ingo Molnar -Signed-off-by: Josh Poimboeuf -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/73972fc7e2762e91912c6b9584582703d6f1b8cc.1507924831.git.jpoimboe@redhat.com -Signed-off-by: Ingo Molnar -(cherry picked from commit 11af847446ed0d131cf24d16a7ef3d5ea7a49554) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 27ab2a240a797b073ce63385b1d5db06e44fc3ae) -Signed-off-by: Fabian Grünbichler ---- - Documentation/x86/orc-unwinder.txt | 2 +- - Makefile | 4 ++-- - arch/x86/kernel/Makefile | 6 +++--- - scripts/Makefile.build | 2 +- - arch/x86/include/asm/module.h | 2 +- - arch/x86/include/asm/unwind.h | 8 ++++---- - include/asm-generic/vmlinux.lds.h | 2 +- - arch/x86/Kconfig | 2 +- - arch/x86/Kconfig.debug | 10 +++++----- - arch/x86/configs/tiny.config | 4 ++-- - arch/x86/configs/x86_64_defconfig | 2 +- - lib/Kconfig.debug | 2 +- - 12 files changed, 23 insertions(+), 23 deletions(-) - -diff --git a/Documentation/x86/orc-unwinder.txt b/Documentation/x86/orc-unwinder.txt -index af0c9a4c65a6..cd4b29be29af 100644 ---- a/Documentation/x86/orc-unwinder.txt -+++ b/Documentation/x86/orc-unwinder.txt -@@ -4,7 +4,7 @@ ORC unwinder - Overview - -------- - --The kernel CONFIG_ORC_UNWINDER option enables the ORC unwinder, which is -+The kernel CONFIG_UNWINDER_ORC option enables the ORC unwinder, which is - similar in concept to a DWARF unwinder. The difference is that the - format of the ORC data is much simpler than DWARF, which in turn allows - the ORC unwinder to be much simpler and faster. 
-diff --git a/Makefile b/Makefile -index 490ce18685ea..b740e3dc9ff8 100644 ---- a/Makefile -+++ b/Makefile -@@ -965,8 +965,8 @@ ifdef CONFIG_STACK_VALIDATION - ifeq ($(has_libelf),1) - objtool_target := tools/objtool FORCE - else -- ifdef CONFIG_ORC_UNWINDER -- $(error "Cannot generate ORC metadata for CONFIG_ORC_UNWINDER=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel") -+ ifdef CONFIG_UNWINDER_ORC -+ $(error "Cannot generate ORC metadata for CONFIG_UNWINDER_ORC=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel") - else - $(warning "Cannot use CONFIG_STACK_VALIDATION=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel") - endif -diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile -index e2315aecc441..5bf0d5a473b4 100644 ---- a/arch/x86/kernel/Makefile -+++ b/arch/x86/kernel/Makefile -@@ -125,9 +125,9 @@ obj-$(CONFIG_PERF_EVENTS) += perf_regs.o - obj-$(CONFIG_TRACING) += tracepoint.o - obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o - --obj-$(CONFIG_ORC_UNWINDER) += unwind_orc.o --obj-$(CONFIG_FRAME_POINTER_UNWINDER) += unwind_frame.o --obj-$(CONFIG_GUESS_UNWINDER) += unwind_guess.o -+obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o -+obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o -+obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o - - ### - # 64 bit specific files -diff --git a/scripts/Makefile.build b/scripts/Makefile.build -index ab2c8ef43cdb..436005392047 100644 ---- a/scripts/Makefile.build -+++ b/scripts/Makefile.build -@@ -258,7 +258,7 @@ ifneq ($(SKIP_STACK_VALIDATION),1) - - __objtool_obj := $(objtree)/tools/objtool/objtool - --objtool_args = $(if $(CONFIG_ORC_UNWINDER),orc generate,check) -+objtool_args = $(if $(CONFIG_UNWINDER_ORC),orc generate,check) - - ifndef CONFIG_FRAME_POINTER - objtool_args += --no-fp -diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h -index 9eb7c718aaf8..9f05a1002aa9 100644 ---- a/arch/x86/include/asm/module.h -+++ b/arch/x86/include/asm/module.h -@@ 
-5,7 +5,7 @@ - #include - - struct mod_arch_specific { --#ifdef CONFIG_ORC_UNWINDER -+#ifdef CONFIG_UNWINDER_ORC - unsigned int num_orcs; - int *orc_unwind_ip; - struct orc_entry *orc_unwind; -diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h -index e9f793e2df7a..35d67dc7b69f 100644 ---- a/arch/x86/include/asm/unwind.h -+++ b/arch/x86/include/asm/unwind.h -@@ -12,11 +12,11 @@ struct unwind_state { - struct task_struct *task; - int graph_idx; - bool error; --#if defined(CONFIG_ORC_UNWINDER) -+#if defined(CONFIG_UNWINDER_ORC) - bool signal, full_regs; - unsigned long sp, bp, ip; - struct pt_regs *regs; --#elif defined(CONFIG_FRAME_POINTER_UNWINDER) -+#elif defined(CONFIG_UNWINDER_FRAME_POINTER) - bool got_irq; - unsigned long *bp, *orig_sp, ip; - struct pt_regs *regs; -@@ -50,7 +50,7 @@ void unwind_start(struct unwind_state *state, struct task_struct *task, - __unwind_start(state, task, regs, first_frame); - } - --#if defined(CONFIG_ORC_UNWINDER) || defined(CONFIG_FRAME_POINTER_UNWINDER) -+#if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER) - static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) - { - if (unwind_done(state)) -@@ -65,7 +65,7 @@ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) - } - #endif - --#ifdef CONFIG_ORC_UNWINDER -+#ifdef CONFIG_UNWINDER_ORC - void unwind_init(void); - void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, - void *orc, size_t orc_size); -diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h -index 9fdb54a95976..e71e42432360 100644 ---- a/include/asm-generic/vmlinux.lds.h -+++ b/include/asm-generic/vmlinux.lds.h -@@ -686,7 +686,7 @@ - #define BUG_TABLE - #endif - --#ifdef CONFIG_ORC_UNWINDER -+#ifdef CONFIG_UNWINDER_ORC - #define ORC_UNWIND_TABLE \ - . 
= ALIGN(4); \ - .orc_unwind_ip : AT(ADDR(.orc_unwind_ip) - LOAD_OFFSET) { \ -diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig -index 3a0b8cb57caf..bf9f03740c30 100644 ---- a/arch/x86/Kconfig -+++ b/arch/x86/Kconfig -@@ -168,7 +168,7 @@ config X86 - select HAVE_PERF_REGS - select HAVE_PERF_USER_STACK_DUMP - select HAVE_REGS_AND_STACK_ACCESS_API -- select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER_UNWINDER && STACK_VALIDATION -+ select HAVE_RELIABLE_STACKTRACE if X86_64 && UNWINDER_FRAME_POINTER && STACK_VALIDATION - select HAVE_STACK_VALIDATION if X86_64 - select HAVE_SYSCALL_TRACEPOINTS - select HAVE_UNSTABLE_SCHED_CLOCK -diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug -index c441b5d65ec8..5435a943f894 100644 ---- a/arch/x86/Kconfig.debug -+++ b/arch/x86/Kconfig.debug -@@ -358,13 +358,13 @@ config PUNIT_ATOM_DEBUG - - choice - prompt "Choose kernel unwinder" -- default FRAME_POINTER_UNWINDER -+ default UNWINDER_FRAME_POINTER - ---help--- - This determines which method will be used for unwinding kernel stack - traces for panics, oopses, bugs, warnings, perf, /proc//stack, - livepatch, lockdep, and more. - --config FRAME_POINTER_UNWINDER -+config UNWINDER_FRAME_POINTER - bool "Frame pointer unwinder" - select FRAME_POINTER - ---help--- -@@ -379,7 +379,7 @@ config FRAME_POINTER_UNWINDER - consistency model, as this is currently the only way to get a - reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE). - --config ORC_UNWINDER -+config UNWINDER_ORC - bool "ORC unwinder" - depends on X86_64 - select STACK_VALIDATION -@@ -396,7 +396,7 @@ config ORC_UNWINDER - Enabling this option will increase the kernel's runtime memory usage - by roughly 2-4MB, depending on your kernel config. 
- --config GUESS_UNWINDER -+config UNWINDER_GUESS - bool "Guess unwinder" - depends on EXPERT - ---help--- -@@ -411,7 +411,7 @@ config GUESS_UNWINDER - endchoice - - config FRAME_POINTER -- depends on !ORC_UNWINDER && !GUESS_UNWINDER -+ depends on !UNWINDER_ORC && !UNWINDER_GUESS - bool - - endmenu -diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config -index 550cd5012b73..66c9e2aab16c 100644 ---- a/arch/x86/configs/tiny.config -+++ b/arch/x86/configs/tiny.config -@@ -1,5 +1,5 @@ - CONFIG_NOHIGHMEM=y - # CONFIG_HIGHMEM4G is not set - # CONFIG_HIGHMEM64G is not set --CONFIG_GUESS_UNWINDER=y --# CONFIG_FRAME_POINTER_UNWINDER is not set -+CONFIG_UNWINDER_GUESS=y -+# CONFIG_UNWINDER_FRAME_POINTER is not set -diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig -index eb65c248708d..e32fc1f274d8 100644 ---- a/arch/x86/configs/x86_64_defconfig -+++ b/arch/x86/configs/x86_64_defconfig -@@ -299,7 +299,7 @@ CONFIG_DEBUG_STACKOVERFLOW=y - # CONFIG_DEBUG_RODATA_TEST is not set - CONFIG_DEBUG_BOOT_PARAMS=y - CONFIG_OPTIMIZE_INLINING=y --CONFIG_ORC_UNWINDER=y -+CONFIG_UNWINDER_ORC=y - CONFIG_SECURITY=y - CONFIG_SECURITY_NETWORK=y - CONFIG_SECURITY_SELINUX=y -diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug -index 0b4d1b3880b0..4f6ca5f60f7e 100644 ---- a/lib/Kconfig.debug -+++ b/lib/Kconfig.debug -@@ -375,7 +375,7 @@ config STACK_VALIDATION - that runtime stack traces are more reliable. - - This is also a prerequisite for generation of ORC unwind data, which -- is needed for CONFIG_ORC_UNWINDER. -+ is needed for CONFIG_UNWINDER_ORC. - - For more information, see - tools/objtool/Documentation/stack-validation.txt. 
--- -2.14.2 - diff --git a/patches/kernel/0066-x86-unwind-Make-CONFIG_UNWINDER_ORC-y-the-default-in.patch b/patches/kernel/0066-x86-unwind-Make-CONFIG_UNWINDER_ORC-y-the-default-in.patch deleted file mode 100644 index 3735815..0000000 --- a/patches/kernel/0066-x86-unwind-Make-CONFIG_UNWINDER_ORC-y-the-default-in.patch +++ /dev/null @@ -1,90 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Josh Poimboeuf -Date: Fri, 13 Oct 2017 15:02:01 -0500 -Subject: [PATCH] x86/unwind: Make CONFIG_UNWINDER_ORC=y the default in kconfig - for 64-bit -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -The ORC unwinder has been stable in testing so far. Give it much wider -testing by making it the default in kconfig for x86_64. It's not yet -supported for 32-bit, so leave frame pointers as the default there. - -Suggested-by: Ingo Molnar -Signed-off-by: Josh Poimboeuf -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/9b1237bbe7244ed9cdf8db2dcb1253e37e1c341e.1507924831.git.jpoimboe@redhat.com -Signed-off-by: Ingo Molnar -(cherry picked from commit fc72ae40e30327aa24eb88a24b9c7058f938bd36) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit aff8d5169f46ae6ac0eb26a5ba745aaf9afa0704) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/Kconfig.debug | 33 +++++++++++++++++---------------- - 1 file changed, 17 insertions(+), 16 deletions(-) - -diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug -index 5435a943f894..7d88e9878a75 100644 ---- a/arch/x86/Kconfig.debug -+++ b/arch/x86/Kconfig.debug -@@ -358,27 +358,13 @@ config PUNIT_ATOM_DEBUG - - choice - prompt "Choose kernel unwinder" -- default UNWINDER_FRAME_POINTER -+ default UNWINDER_ORC if X86_64 -+ default UNWINDER_FRAME_POINTER if X86_32 - ---help--- - This determines which method will be used for unwinding kernel stack - traces for panics, 
oopses, bugs, warnings, perf, /proc//stack, - livepatch, lockdep, and more. - --config UNWINDER_FRAME_POINTER -- bool "Frame pointer unwinder" -- select FRAME_POINTER -- ---help--- -- This option enables the frame pointer unwinder for unwinding kernel -- stack traces. -- -- The unwinder itself is fast and it uses less RAM than the ORC -- unwinder, but the kernel text size will grow by ~3% and the kernel's -- overall performance will degrade by roughly 5-10%. -- -- This option is recommended if you want to use the livepatch -- consistency model, as this is currently the only way to get a -- reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE). -- - config UNWINDER_ORC - bool "ORC unwinder" - depends on X86_64 -@@ -396,6 +382,21 @@ config UNWINDER_ORC - Enabling this option will increase the kernel's runtime memory usage - by roughly 2-4MB, depending on your kernel config. - -+config UNWINDER_FRAME_POINTER -+ bool "Frame pointer unwinder" -+ select FRAME_POINTER -+ ---help--- -+ This option enables the frame pointer unwinder for unwinding kernel -+ stack traces. -+ -+ The unwinder itself is fast and it uses less RAM than the ORC -+ unwinder, but the kernel text size will grow by ~3% and the kernel's -+ overall performance will degrade by roughly 5-10%. -+ -+ This option is recommended if you want to use the livepatch -+ consistency model, as this is currently the only way to get a -+ reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE). 
-+ - config UNWINDER_GUESS - bool "Guess unwinder" - depends on EXPERT --- -2.14.2 - diff --git a/patches/kernel/0066-x86-unwind-Rename-unwinder-config-options-to-CONFIG_.patch b/patches/kernel/0066-x86-unwind-Rename-unwinder-config-options-to-CONFIG_.patch new file mode 100644 index 0000000..9689db5 --- /dev/null +++ b/patches/kernel/0066-x86-unwind-Rename-unwinder-config-options-to-CONFIG_.patch @@ -0,0 +1,273 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf +Date: Fri, 13 Oct 2017 15:02:00 -0500 +Subject: [PATCH] x86/unwind: Rename unwinder config options to + 'CONFIG_UNWINDER_*' +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Rename the unwinder config options from: + + CONFIG_ORC_UNWINDER + CONFIG_FRAME_POINTER_UNWINDER + CONFIG_GUESS_UNWINDER + +to: + + CONFIG_UNWINDER_ORC + CONFIG_UNWINDER_FRAME_POINTER + CONFIG_UNWINDER_GUESS + +... in order to give them a more logical config namespace. 
+ +Suggested-by: Ingo Molnar +Signed-off-by: Josh Poimboeuf +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/73972fc7e2762e91912c6b9584582703d6f1b8cc.1507924831.git.jpoimboe@redhat.com +Signed-off-by: Ingo Molnar +(cherry picked from commit 11af847446ed0d131cf24d16a7ef3d5ea7a49554) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 27ab2a240a797b073ce63385b1d5db06e44fc3ae) +Signed-off-by: Fabian Grünbichler +--- + Documentation/x86/orc-unwinder.txt | 2 +- + Makefile | 4 ++-- + arch/x86/kernel/Makefile | 6 +++--- + scripts/Makefile.build | 2 +- + arch/x86/include/asm/module.h | 2 +- + arch/x86/include/asm/unwind.h | 8 ++++---- + include/asm-generic/vmlinux.lds.h | 2 +- + arch/x86/Kconfig | 2 +- + arch/x86/Kconfig.debug | 10 +++++----- + arch/x86/configs/tiny.config | 4 ++-- + arch/x86/configs/x86_64_defconfig | 2 +- + lib/Kconfig.debug | 2 +- + 12 files changed, 23 insertions(+), 23 deletions(-) + +diff --git a/Documentation/x86/orc-unwinder.txt b/Documentation/x86/orc-unwinder.txt +index af0c9a4c65a6..cd4b29be29af 100644 +--- a/Documentation/x86/orc-unwinder.txt ++++ b/Documentation/x86/orc-unwinder.txt +@@ -4,7 +4,7 @@ ORC unwinder + Overview + -------- + +-The kernel CONFIG_ORC_UNWINDER option enables the ORC unwinder, which is ++The kernel CONFIG_UNWINDER_ORC option enables the ORC unwinder, which is + similar in concept to a DWARF unwinder. The difference is that the + format of the ORC data is much simpler than DWARF, which in turn allows + the ORC unwinder to be much simpler and faster. 
+diff --git a/Makefile b/Makefile +index 490ce18685ea..b740e3dc9ff8 100644 +--- a/Makefile ++++ b/Makefile +@@ -965,8 +965,8 @@ ifdef CONFIG_STACK_VALIDATION + ifeq ($(has_libelf),1) + objtool_target := tools/objtool FORCE + else +- ifdef CONFIG_ORC_UNWINDER +- $(error "Cannot generate ORC metadata for CONFIG_ORC_UNWINDER=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel") ++ ifdef CONFIG_UNWINDER_ORC ++ $(error "Cannot generate ORC metadata for CONFIG_UNWINDER_ORC=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel") + else + $(warning "Cannot use CONFIG_STACK_VALIDATION=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel") + endif +diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile +index e2315aecc441..5bf0d5a473b4 100644 +--- a/arch/x86/kernel/Makefile ++++ b/arch/x86/kernel/Makefile +@@ -125,9 +125,9 @@ obj-$(CONFIG_PERF_EVENTS) += perf_regs.o + obj-$(CONFIG_TRACING) += tracepoint.o + obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o + +-obj-$(CONFIG_ORC_UNWINDER) += unwind_orc.o +-obj-$(CONFIG_FRAME_POINTER_UNWINDER) += unwind_frame.o +-obj-$(CONFIG_GUESS_UNWINDER) += unwind_guess.o ++obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o ++obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o ++obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o + + ### + # 64 bit specific files +diff --git a/scripts/Makefile.build b/scripts/Makefile.build +index ab2c8ef43cdb..436005392047 100644 +--- a/scripts/Makefile.build ++++ b/scripts/Makefile.build +@@ -258,7 +258,7 @@ ifneq ($(SKIP_STACK_VALIDATION),1) + + __objtool_obj := $(objtree)/tools/objtool/objtool + +-objtool_args = $(if $(CONFIG_ORC_UNWINDER),orc generate,check) ++objtool_args = $(if $(CONFIG_UNWINDER_ORC),orc generate,check) + + ifndef CONFIG_FRAME_POINTER + objtool_args += --no-fp +diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h +index 9eb7c718aaf8..9f05a1002aa9 100644 +--- a/arch/x86/include/asm/module.h ++++ b/arch/x86/include/asm/module.h +@@ 
-5,7 +5,7 @@ + #include + + struct mod_arch_specific { +-#ifdef CONFIG_ORC_UNWINDER ++#ifdef CONFIG_UNWINDER_ORC + unsigned int num_orcs; + int *orc_unwind_ip; + struct orc_entry *orc_unwind; +diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h +index e9f793e2df7a..35d67dc7b69f 100644 +--- a/arch/x86/include/asm/unwind.h ++++ b/arch/x86/include/asm/unwind.h +@@ -12,11 +12,11 @@ struct unwind_state { + struct task_struct *task; + int graph_idx; + bool error; +-#if defined(CONFIG_ORC_UNWINDER) ++#if defined(CONFIG_UNWINDER_ORC) + bool signal, full_regs; + unsigned long sp, bp, ip; + struct pt_regs *regs; +-#elif defined(CONFIG_FRAME_POINTER_UNWINDER) ++#elif defined(CONFIG_UNWINDER_FRAME_POINTER) + bool got_irq; + unsigned long *bp, *orig_sp, ip; + struct pt_regs *regs; +@@ -50,7 +50,7 @@ void unwind_start(struct unwind_state *state, struct task_struct *task, + __unwind_start(state, task, regs, first_frame); + } + +-#if defined(CONFIG_ORC_UNWINDER) || defined(CONFIG_FRAME_POINTER_UNWINDER) ++#if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER) + static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) + { + if (unwind_done(state)) +@@ -65,7 +65,7 @@ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) + } + #endif + +-#ifdef CONFIG_ORC_UNWINDER ++#ifdef CONFIG_UNWINDER_ORC + void unwind_init(void); + void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, + void *orc, size_t orc_size); +diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h +index 9fdb54a95976..e71e42432360 100644 +--- a/include/asm-generic/vmlinux.lds.h ++++ b/include/asm-generic/vmlinux.lds.h +@@ -686,7 +686,7 @@ + #define BUG_TABLE + #endif + +-#ifdef CONFIG_ORC_UNWINDER ++#ifdef CONFIG_UNWINDER_ORC + #define ORC_UNWIND_TABLE \ + . 
= ALIGN(4); \ + .orc_unwind_ip : AT(ADDR(.orc_unwind_ip) - LOAD_OFFSET) { \ +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 3a0b8cb57caf..bf9f03740c30 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -168,7 +168,7 @@ config X86 + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP + select HAVE_REGS_AND_STACK_ACCESS_API +- select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER_UNWINDER && STACK_VALIDATION ++ select HAVE_RELIABLE_STACKTRACE if X86_64 && UNWINDER_FRAME_POINTER && STACK_VALIDATION + select HAVE_STACK_VALIDATION if X86_64 + select HAVE_SYSCALL_TRACEPOINTS + select HAVE_UNSTABLE_SCHED_CLOCK +diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug +index c441b5d65ec8..5435a943f894 100644 +--- a/arch/x86/Kconfig.debug ++++ b/arch/x86/Kconfig.debug +@@ -358,13 +358,13 @@ config PUNIT_ATOM_DEBUG + + choice + prompt "Choose kernel unwinder" +- default FRAME_POINTER_UNWINDER ++ default UNWINDER_FRAME_POINTER + ---help--- + This determines which method will be used for unwinding kernel stack + traces for panics, oopses, bugs, warnings, perf, /proc//stack, + livepatch, lockdep, and more. + +-config FRAME_POINTER_UNWINDER ++config UNWINDER_FRAME_POINTER + bool "Frame pointer unwinder" + select FRAME_POINTER + ---help--- +@@ -379,7 +379,7 @@ config FRAME_POINTER_UNWINDER + consistency model, as this is currently the only way to get a + reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE). + +-config ORC_UNWINDER ++config UNWINDER_ORC + bool "ORC unwinder" + depends on X86_64 + select STACK_VALIDATION +@@ -396,7 +396,7 @@ config ORC_UNWINDER + Enabling this option will increase the kernel's runtime memory usage + by roughly 2-4MB, depending on your kernel config. 
+ +-config GUESS_UNWINDER ++config UNWINDER_GUESS + bool "Guess unwinder" + depends on EXPERT + ---help--- +@@ -411,7 +411,7 @@ config GUESS_UNWINDER + endchoice + + config FRAME_POINTER +- depends on !ORC_UNWINDER && !GUESS_UNWINDER ++ depends on !UNWINDER_ORC && !UNWINDER_GUESS + bool + + endmenu +diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config +index 550cd5012b73..66c9e2aab16c 100644 +--- a/arch/x86/configs/tiny.config ++++ b/arch/x86/configs/tiny.config +@@ -1,5 +1,5 @@ + CONFIG_NOHIGHMEM=y + # CONFIG_HIGHMEM4G is not set + # CONFIG_HIGHMEM64G is not set +-CONFIG_GUESS_UNWINDER=y +-# CONFIG_FRAME_POINTER_UNWINDER is not set ++CONFIG_UNWINDER_GUESS=y ++# CONFIG_UNWINDER_FRAME_POINTER is not set +diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig +index eb65c248708d..e32fc1f274d8 100644 +--- a/arch/x86/configs/x86_64_defconfig ++++ b/arch/x86/configs/x86_64_defconfig +@@ -299,7 +299,7 @@ CONFIG_DEBUG_STACKOVERFLOW=y + # CONFIG_DEBUG_RODATA_TEST is not set + CONFIG_DEBUG_BOOT_PARAMS=y + CONFIG_OPTIMIZE_INLINING=y +-CONFIG_ORC_UNWINDER=y ++CONFIG_UNWINDER_ORC=y + CONFIG_SECURITY=y + CONFIG_SECURITY_NETWORK=y + CONFIG_SECURITY_SELINUX=y +diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug +index 0b4d1b3880b0..4f6ca5f60f7e 100644 +--- a/lib/Kconfig.debug ++++ b/lib/Kconfig.debug +@@ -375,7 +375,7 @@ config STACK_VALIDATION + that runtime stack traces are more reliable. + + This is also a prerequisite for generation of ORC unwind data, which +- is needed for CONFIG_ORC_UNWINDER. ++ is needed for CONFIG_UNWINDER_ORC. + + For more information, see + tools/objtool/Documentation/stack-validation.txt. 
+-- +2.14.2 + diff --git a/patches/kernel/0067-bitops-Add-clear-set_bit32-to-linux-bitops.h.patch b/patches/kernel/0067-bitops-Add-clear-set_bit32-to-linux-bitops.h.patch deleted file mode 100644 index 302f3fb..0000000 --- a/patches/kernel/0067-bitops-Add-clear-set_bit32-to-linux-bitops.h.patch +++ /dev/null @@ -1,69 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andi Kleen -Date: Fri, 13 Oct 2017 14:56:41 -0700 -Subject: [PATCH] bitops: Add clear/set_bit32() to linux/bitops.h -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Add two simple wrappers around set_bit/clear_bit() that accept -the common case of an u32 array. This avoids writing -casts in all callers. - -Signed-off-by: Andi Kleen -Reviewed-by: Thomas Gleixner -Cc: Linus Torvalds -Cc: Peter Zijlstra -Link: http://lkml.kernel.org/r/20171013215645.23166-2-andi@firstfloor.org -Signed-off-by: Ingo Molnar -(cherry picked from commit cbe96375025e14fc76f9ed42ee5225120d7210f8) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 06d31c11519ca0e8f9b7cab857f442ef44dfc1b2) -Signed-off-by: Fabian Grünbichler ---- - include/linux/bitops.h | 26 ++++++++++++++++++++++++++ - 1 file changed, 26 insertions(+) - -diff --git a/include/linux/bitops.h b/include/linux/bitops.h -index a83c822c35c2..eb257a96db6d 100644 ---- a/include/linux/bitops.h -+++ b/include/linux/bitops.h -@@ -226,6 +226,32 @@ static inline unsigned long __ffs64(u64 word) - return __ffs((unsigned long)word); - } - -+/* -+ * clear_bit32 - Clear a bit in memory for u32 array -+ * @nr: Bit to clear -+ * @addr: u32 * address of bitmap -+ * -+ * Same as clear_bit, but avoids needing casts for u32 arrays. 
-+ */ -+ -+static __always_inline void clear_bit32(long nr, volatile u32 *addr) -+{ -+ clear_bit(nr, (volatile unsigned long *)addr); -+} -+ -+/* -+ * set_bit32 - Set a bit in memory for u32 array -+ * @nr: Bit to clear -+ * @addr: u32 * address of bitmap -+ * -+ * Same as set_bit, but avoids needing casts for u32 arrays. -+ */ -+ -+static __always_inline void set_bit32(long nr, volatile u32 *addr) -+{ -+ set_bit(nr, (volatile unsigned long *)addr); -+} -+ - #ifdef __KERNEL__ - - #ifndef set_mask_bits --- -2.14.2 - diff --git a/patches/kernel/0067-x86-unwind-Make-CONFIG_UNWINDER_ORC-y-the-default-in.patch b/patches/kernel/0067-x86-unwind-Make-CONFIG_UNWINDER_ORC-y-the-default-in.patch new file mode 100644 index 0000000..3735815 --- /dev/null +++ b/patches/kernel/0067-x86-unwind-Make-CONFIG_UNWINDER_ORC-y-the-default-in.patch @@ -0,0 +1,90 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf +Date: Fri, 13 Oct 2017 15:02:01 -0500 +Subject: [PATCH] x86/unwind: Make CONFIG_UNWINDER_ORC=y the default in kconfig + for 64-bit +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +The ORC unwinder has been stable in testing so far. Give it much wider +testing by making it the default in kconfig for x86_64. It's not yet +supported for 32-bit, so leave frame pointers as the default there. 
+ +Suggested-by: Ingo Molnar +Signed-off-by: Josh Poimboeuf +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/9b1237bbe7244ed9cdf8db2dcb1253e37e1c341e.1507924831.git.jpoimboe@redhat.com +Signed-off-by: Ingo Molnar +(cherry picked from commit fc72ae40e30327aa24eb88a24b9c7058f938bd36) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit aff8d5169f46ae6ac0eb26a5ba745aaf9afa0704) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/Kconfig.debug | 33 +++++++++++++++++---------------- + 1 file changed, 17 insertions(+), 16 deletions(-) + +diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug +index 5435a943f894..7d88e9878a75 100644 +--- a/arch/x86/Kconfig.debug ++++ b/arch/x86/Kconfig.debug +@@ -358,27 +358,13 @@ config PUNIT_ATOM_DEBUG + + choice + prompt "Choose kernel unwinder" +- default UNWINDER_FRAME_POINTER ++ default UNWINDER_ORC if X86_64 ++ default UNWINDER_FRAME_POINTER if X86_32 + ---help--- + This determines which method will be used for unwinding kernel stack + traces for panics, oopses, bugs, warnings, perf, /proc//stack, + livepatch, lockdep, and more. + +-config UNWINDER_FRAME_POINTER +- bool "Frame pointer unwinder" +- select FRAME_POINTER +- ---help--- +- This option enables the frame pointer unwinder for unwinding kernel +- stack traces. +- +- The unwinder itself is fast and it uses less RAM than the ORC +- unwinder, but the kernel text size will grow by ~3% and the kernel's +- overall performance will degrade by roughly 5-10%. +- +- This option is recommended if you want to use the livepatch +- consistency model, as this is currently the only way to get a +- reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE). +- + config UNWINDER_ORC + bool "ORC unwinder" + depends on X86_64 +@@ -396,6 +382,21 @@ config UNWINDER_ORC + Enabling this option will increase the kernel's runtime memory usage + by roughly 2-4MB, depending on your kernel config. 
+ ++config UNWINDER_FRAME_POINTER ++ bool "Frame pointer unwinder" ++ select FRAME_POINTER ++ ---help--- ++ This option enables the frame pointer unwinder for unwinding kernel ++ stack traces. ++ ++ The unwinder itself is fast and it uses less RAM than the ORC ++ unwinder, but the kernel text size will grow by ~3% and the kernel's ++ overall performance will degrade by roughly 5-10%. ++ ++ This option is recommended if you want to use the livepatch ++ consistency model, as this is currently the only way to get a ++ reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE). ++ + config UNWINDER_GUESS + bool "Guess unwinder" + depends on EXPERT +-- +2.14.2 + diff --git a/patches/kernel/0068-bitops-Add-clear-set_bit32-to-linux-bitops.h.patch b/patches/kernel/0068-bitops-Add-clear-set_bit32-to-linux-bitops.h.patch new file mode 100644 index 0000000..302f3fb --- /dev/null +++ b/patches/kernel/0068-bitops-Add-clear-set_bit32-to-linux-bitops.h.patch @@ -0,0 +1,69 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andi Kleen +Date: Fri, 13 Oct 2017 14:56:41 -0700 +Subject: [PATCH] bitops: Add clear/set_bit32() to linux/bitops.h +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Add two simple wrappers around set_bit/clear_bit() that accept +the common case of an u32 array. This avoids writing +casts in all callers. 
+ +Signed-off-by: Andi Kleen +Reviewed-by: Thomas Gleixner +Cc: Linus Torvalds +Cc: Peter Zijlstra +Link: http://lkml.kernel.org/r/20171013215645.23166-2-andi@firstfloor.org +Signed-off-by: Ingo Molnar +(cherry picked from commit cbe96375025e14fc76f9ed42ee5225120d7210f8) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 06d31c11519ca0e8f9b7cab857f442ef44dfc1b2) +Signed-off-by: Fabian Grünbichler +--- + include/linux/bitops.h | 26 ++++++++++++++++++++++++++ + 1 file changed, 26 insertions(+) + +diff --git a/include/linux/bitops.h b/include/linux/bitops.h +index a83c822c35c2..eb257a96db6d 100644 +--- a/include/linux/bitops.h ++++ b/include/linux/bitops.h +@@ -226,6 +226,32 @@ static inline unsigned long __ffs64(u64 word) + return __ffs((unsigned long)word); + } + ++/* ++ * clear_bit32 - Clear a bit in memory for u32 array ++ * @nr: Bit to clear ++ * @addr: u32 * address of bitmap ++ * ++ * Same as clear_bit, but avoids needing casts for u32 arrays. ++ */ ++ ++static __always_inline void clear_bit32(long nr, volatile u32 *addr) ++{ ++ clear_bit(nr, (volatile unsigned long *)addr); ++} ++ ++/* ++ * set_bit32 - Set a bit in memory for u32 array ++ * @nr: Bit to clear ++ * @addr: u32 * address of bitmap ++ * ++ * Same as set_bit, but avoids needing casts for u32 arrays. 
++ */ ++ ++static __always_inline void set_bit32(long nr, volatile u32 *addr) ++{ ++ set_bit(nr, (volatile unsigned long *)addr); ++} ++ + #ifdef __KERNEL__ + + #ifndef set_mask_bits +-- +2.14.2 + diff --git a/patches/kernel/0068-x86-cpuid-Add-generic-table-for-CPUID-dependencies.patch b/patches/kernel/0068-x86-cpuid-Add-generic-table-for-CPUID-dependencies.patch deleted file mode 100644 index 4c4b7ba..0000000 --- a/patches/kernel/0068-x86-cpuid-Add-generic-table-for-CPUID-dependencies.patch +++ /dev/null @@ -1,221 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andi Kleen -Date: Fri, 13 Oct 2017 14:56:42 -0700 -Subject: [PATCH] x86/cpuid: Add generic table for CPUID dependencies -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Some CPUID features depend on other features. Currently it's -possible to to clear dependent features, but not clear the base features, -which can cause various interesting problems. - -This patch implements a generic table to describe dependencies -between CPUID features, to be used by all code that clears -CPUID. - -Some subsystems (like XSAVE) had an own implementation of this, -but it's better to do it all in a single place for everyone. - -Then clear_cpu_cap and setup_clear_cpu_cap always look up -this table and clear all dependencies too. - -This is intended to be a practical table: only for features -that make sense to clear. If someone for example clears FPU, -or other features that are essentially part of the required -base feature set, not much is going to work. Handling -that is right now out of scope. We're only handling -features which can be usefully cleared. 
- -Signed-off-by: Andi Kleen -Reviewed-by: Thomas Gleixner -Cc: Jonathan McDowell -Cc: Linus Torvalds -Cc: Peter Zijlstra -Link: http://lkml.kernel.org/r/20171013215645.23166-3-andi@firstfloor.org -Signed-off-by: Ingo Molnar -(cherry picked from commit 0b00de857a648dafe7020878c7a27cf776f5edf4) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 35672522f2fc9a2e116ed1766f190bc08ef5582a) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/cpu/Makefile | 1 + - arch/x86/include/asm/cpufeature.h | 9 ++- - arch/x86/include/asm/cpufeatures.h | 5 ++ - arch/x86/kernel/cpu/cpuid-deps.c | 113 +++++++++++++++++++++++++++++++++++++ - 4 files changed, 123 insertions(+), 5 deletions(-) - create mode 100644 arch/x86/kernel/cpu/cpuid-deps.c - -diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile -index e17942c131c8..de260fae1017 100644 ---- a/arch/x86/kernel/cpu/Makefile -+++ b/arch/x86/kernel/cpu/Makefile -@@ -22,6 +22,7 @@ obj-y += rdrand.o - obj-y += match.o - obj-y += bugs.o - obj-$(CONFIG_CPU_FREQ) += aperfmperf.o -+obj-y += cpuid-deps.o - - obj-$(CONFIG_PROC_FS) += proc.o - obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o -diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h -index d59c15c3defd..225fd8374fae 100644 ---- a/arch/x86/include/asm/cpufeature.h -+++ b/arch/x86/include/asm/cpufeature.h -@@ -125,11 +125,10 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; - #define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit) - - #define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability)) --#define clear_cpu_cap(c, bit) clear_bit(bit, (unsigned long *)((c)->x86_capability)) --#define setup_clear_cpu_cap(bit) do { \ -- clear_cpu_cap(&boot_cpu_data, bit); \ -- set_bit(bit, (unsigned long *)cpu_caps_cleared); \ --} while (0) -+ -+extern void setup_clear_cpu_cap(unsigned int bit); -+extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int 
bit); -+ - #define setup_force_cpu_cap(bit) do { \ - set_cpu_cap(&boot_cpu_data, bit); \ - set_bit(bit, (unsigned long *)cpu_caps_set); \ -diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h -index 5a28e8e55e36..f4e145c4b06f 100644 ---- a/arch/x86/include/asm/cpufeatures.h -+++ b/arch/x86/include/asm/cpufeatures.h -@@ -21,6 +21,11 @@ - * this feature bit is not displayed in /proc/cpuinfo at all. - */ - -+/* -+ * When adding new features here that depend on other features, -+ * please update the table in kernel/cpu/cpuid-deps.c -+ */ -+ - /* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */ - #define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ - #define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ -diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c -new file mode 100644 -index 000000000000..e48eb7313120 ---- /dev/null -+++ b/arch/x86/kernel/cpu/cpuid-deps.c -@@ -0,0 +1,113 @@ -+/* Declare dependencies between CPUIDs */ -+#include -+#include -+#include -+#include -+ -+struct cpuid_dep { -+ unsigned int feature; -+ unsigned int depends; -+}; -+ -+/* -+ * Table of CPUID features that depend on others. -+ * -+ * This only includes dependencies that can be usefully disabled, not -+ * features part of the base set (like FPU). -+ * -+ * Note this all is not __init / __initdata because it can be -+ * called from cpu hotplug. It shouldn't do anything in this case, -+ * but it's difficult to tell that to the init reference checker. 
-+ */ -+const static struct cpuid_dep cpuid_deps[] = { -+ { X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE }, -+ { X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE }, -+ { X86_FEATURE_XSAVES, X86_FEATURE_XSAVE }, -+ { X86_FEATURE_AVX, X86_FEATURE_XSAVE }, -+ { X86_FEATURE_PKU, X86_FEATURE_XSAVE }, -+ { X86_FEATURE_MPX, X86_FEATURE_XSAVE }, -+ { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE }, -+ { X86_FEATURE_FXSR_OPT, X86_FEATURE_FXSR }, -+ { X86_FEATURE_XMM, X86_FEATURE_FXSR }, -+ { X86_FEATURE_XMM2, X86_FEATURE_XMM }, -+ { X86_FEATURE_XMM3, X86_FEATURE_XMM2 }, -+ { X86_FEATURE_XMM4_1, X86_FEATURE_XMM2 }, -+ { X86_FEATURE_XMM4_2, X86_FEATURE_XMM2 }, -+ { X86_FEATURE_XMM3, X86_FEATURE_XMM2 }, -+ { X86_FEATURE_PCLMULQDQ, X86_FEATURE_XMM2 }, -+ { X86_FEATURE_SSSE3, X86_FEATURE_XMM2, }, -+ { X86_FEATURE_F16C, X86_FEATURE_XMM2, }, -+ { X86_FEATURE_AES, X86_FEATURE_XMM2 }, -+ { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 }, -+ { X86_FEATURE_FMA, X86_FEATURE_AVX }, -+ { X86_FEATURE_AVX2, X86_FEATURE_AVX, }, -+ { X86_FEATURE_AVX512F, X86_FEATURE_AVX, }, -+ { X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F }, -+ { X86_FEATURE_AVX512PF, X86_FEATURE_AVX512F }, -+ { X86_FEATURE_AVX512ER, X86_FEATURE_AVX512F }, -+ { X86_FEATURE_AVX512CD, X86_FEATURE_AVX512F }, -+ { X86_FEATURE_AVX512DQ, X86_FEATURE_AVX512F }, -+ { X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F }, -+ { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F }, -+ { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F }, -+ { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F }, -+ { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F }, -+ { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F }, -+ {} -+}; -+ -+static inline void __clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit) -+{ -+ clear_bit32(bit, c->x86_capability); -+} -+ -+static inline void __setup_clear_cpu_cap(unsigned int bit) -+{ -+ clear_cpu_cap(&boot_cpu_data, bit); -+ set_bit32(bit, cpu_caps_cleared); -+} -+ -+static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature) -+{ -+ if (!c) -+ 
__setup_clear_cpu_cap(feature); -+ else -+ __clear_cpu_cap(c, feature); -+} -+ -+static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature) -+{ -+ bool changed; -+ DECLARE_BITMAP(disable, NCAPINTS * sizeof(u32) * 8); -+ const struct cpuid_dep *d; -+ -+ clear_feature(c, feature); -+ -+ /* Collect all features to disable, handling dependencies */ -+ memset(disable, 0, sizeof(disable)); -+ __set_bit(feature, disable); -+ -+ /* Loop until we get a stable state. */ -+ do { -+ changed = false; -+ for (d = cpuid_deps; d->feature; d++) { -+ if (!test_bit(d->depends, disable)) -+ continue; -+ if (__test_and_set_bit(d->feature, disable)) -+ continue; -+ -+ changed = true; -+ clear_feature(c, d->feature); -+ } -+ } while (changed); -+} -+ -+void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature) -+{ -+ do_clear_cpu_cap(c, feature); -+} -+ -+void setup_clear_cpu_cap(unsigned int feature) -+{ -+ do_clear_cpu_cap(NULL, feature); -+} --- -2.14.2 - diff --git a/patches/kernel/0069-x86-cpuid-Add-generic-table-for-CPUID-dependencies.patch b/patches/kernel/0069-x86-cpuid-Add-generic-table-for-CPUID-dependencies.patch new file mode 100644 index 0000000..4c4b7ba --- /dev/null +++ b/patches/kernel/0069-x86-cpuid-Add-generic-table-for-CPUID-dependencies.patch @@ -0,0 +1,221 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andi Kleen +Date: Fri, 13 Oct 2017 14:56:42 -0700 +Subject: [PATCH] x86/cpuid: Add generic table for CPUID dependencies +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Some CPUID features depend on other features. Currently it's +possible to to clear dependent features, but not clear the base features, +which can cause various interesting problems. + +This patch implements a generic table to describe dependencies +between CPUID features, to be used by all code that clears +CPUID. 
+ +Some subsystems (like XSAVE) had an own implementation of this, +but it's better to do it all in a single place for everyone. + +Then clear_cpu_cap and setup_clear_cpu_cap always look up +this table and clear all dependencies too. + +This is intended to be a practical table: only for features +that make sense to clear. If someone for example clears FPU, +or other features that are essentially part of the required +base feature set, not much is going to work. Handling +that is right now out of scope. We're only handling +features which can be usefully cleared. + +Signed-off-by: Andi Kleen +Reviewed-by: Thomas Gleixner +Cc: Jonathan McDowell +Cc: Linus Torvalds +Cc: Peter Zijlstra +Link: http://lkml.kernel.org/r/20171013215645.23166-3-andi@firstfloor.org +Signed-off-by: Ingo Molnar +(cherry picked from commit 0b00de857a648dafe7020878c7a27cf776f5edf4) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 35672522f2fc9a2e116ed1766f190bc08ef5582a) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/cpu/Makefile | 1 + + arch/x86/include/asm/cpufeature.h | 9 ++- + arch/x86/include/asm/cpufeatures.h | 5 ++ + arch/x86/kernel/cpu/cpuid-deps.c | 113 +++++++++++++++++++++++++++++++++++++ + 4 files changed, 123 insertions(+), 5 deletions(-) + create mode 100644 arch/x86/kernel/cpu/cpuid-deps.c + +diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile +index e17942c131c8..de260fae1017 100644 +--- a/arch/x86/kernel/cpu/Makefile ++++ b/arch/x86/kernel/cpu/Makefile +@@ -22,6 +22,7 @@ obj-y += rdrand.o + obj-y += match.o + obj-y += bugs.o + obj-$(CONFIG_CPU_FREQ) += aperfmperf.o ++obj-y += cpuid-deps.o + + obj-$(CONFIG_PROC_FS) += proc.o + obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o +diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h +index d59c15c3defd..225fd8374fae 100644 +--- a/arch/x86/include/asm/cpufeature.h ++++ b/arch/x86/include/asm/cpufeature.h +@@ 
-125,11 +125,10 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; + #define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit) + + #define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability)) +-#define clear_cpu_cap(c, bit) clear_bit(bit, (unsigned long *)((c)->x86_capability)) +-#define setup_clear_cpu_cap(bit) do { \ +- clear_cpu_cap(&boot_cpu_data, bit); \ +- set_bit(bit, (unsigned long *)cpu_caps_cleared); \ +-} while (0) ++ ++extern void setup_clear_cpu_cap(unsigned int bit); ++extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); ++ + #define setup_force_cpu_cap(bit) do { \ + set_cpu_cap(&boot_cpu_data, bit); \ + set_bit(bit, (unsigned long *)cpu_caps_set); \ +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index 5a28e8e55e36..f4e145c4b06f 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -21,6 +21,11 @@ + * this feature bit is not displayed in /proc/cpuinfo at all. + */ + ++/* ++ * When adding new features here that depend on other features, ++ * please update the table in kernel/cpu/cpuid-deps.c ++ */ ++ + /* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */ + #define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ + #define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ +diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c +new file mode 100644 +index 000000000000..e48eb7313120 +--- /dev/null ++++ b/arch/x86/kernel/cpu/cpuid-deps.c +@@ -0,0 +1,113 @@ ++/* Declare dependencies between CPUIDs */ ++#include ++#include ++#include ++#include ++ ++struct cpuid_dep { ++ unsigned int feature; ++ unsigned int depends; ++}; ++ ++/* ++ * Table of CPUID features that depend on others. ++ * ++ * This only includes dependencies that can be usefully disabled, not ++ * features part of the base set (like FPU). 
++ * ++ * Note this all is not __init / __initdata because it can be ++ * called from cpu hotplug. It shouldn't do anything in this case, ++ * but it's difficult to tell that to the init reference checker. ++ */ ++const static struct cpuid_dep cpuid_deps[] = { ++ { X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE }, ++ { X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE }, ++ { X86_FEATURE_XSAVES, X86_FEATURE_XSAVE }, ++ { X86_FEATURE_AVX, X86_FEATURE_XSAVE }, ++ { X86_FEATURE_PKU, X86_FEATURE_XSAVE }, ++ { X86_FEATURE_MPX, X86_FEATURE_XSAVE }, ++ { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE }, ++ { X86_FEATURE_FXSR_OPT, X86_FEATURE_FXSR }, ++ { X86_FEATURE_XMM, X86_FEATURE_FXSR }, ++ { X86_FEATURE_XMM2, X86_FEATURE_XMM }, ++ { X86_FEATURE_XMM3, X86_FEATURE_XMM2 }, ++ { X86_FEATURE_XMM4_1, X86_FEATURE_XMM2 }, ++ { X86_FEATURE_XMM4_2, X86_FEATURE_XMM2 }, ++ { X86_FEATURE_XMM3, X86_FEATURE_XMM2 }, ++ { X86_FEATURE_PCLMULQDQ, X86_FEATURE_XMM2 }, ++ { X86_FEATURE_SSSE3, X86_FEATURE_XMM2, }, ++ { X86_FEATURE_F16C, X86_FEATURE_XMM2, }, ++ { X86_FEATURE_AES, X86_FEATURE_XMM2 }, ++ { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 }, ++ { X86_FEATURE_FMA, X86_FEATURE_AVX }, ++ { X86_FEATURE_AVX2, X86_FEATURE_AVX, }, ++ { X86_FEATURE_AVX512F, X86_FEATURE_AVX, }, ++ { X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F }, ++ { X86_FEATURE_AVX512PF, X86_FEATURE_AVX512F }, ++ { X86_FEATURE_AVX512ER, X86_FEATURE_AVX512F }, ++ { X86_FEATURE_AVX512CD, X86_FEATURE_AVX512F }, ++ { X86_FEATURE_AVX512DQ, X86_FEATURE_AVX512F }, ++ { X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F }, ++ { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F }, ++ { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F }, ++ { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F }, ++ { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F }, ++ { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F }, ++ {} ++}; ++ ++static inline void __clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit) ++{ ++ clear_bit32(bit, c->x86_capability); ++} ++ ++static inline void 
__setup_clear_cpu_cap(unsigned int bit) ++{ ++ clear_cpu_cap(&boot_cpu_data, bit); ++ set_bit32(bit, cpu_caps_cleared); ++} ++ ++static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature) ++{ ++ if (!c) ++ __setup_clear_cpu_cap(feature); ++ else ++ __clear_cpu_cap(c, feature); ++} ++ ++static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature) ++{ ++ bool changed; ++ DECLARE_BITMAP(disable, NCAPINTS * sizeof(u32) * 8); ++ const struct cpuid_dep *d; ++ ++ clear_feature(c, feature); ++ ++ /* Collect all features to disable, handling dependencies */ ++ memset(disable, 0, sizeof(disable)); ++ __set_bit(feature, disable); ++ ++ /* Loop until we get a stable state. */ ++ do { ++ changed = false; ++ for (d = cpuid_deps; d->feature; d++) { ++ if (!test_bit(d->depends, disable)) ++ continue; ++ if (__test_and_set_bit(d->feature, disable)) ++ continue; ++ ++ changed = true; ++ clear_feature(c, d->feature); ++ } ++ } while (changed); ++} ++ ++void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature) ++{ ++ do_clear_cpu_cap(c, feature); ++} ++ ++void setup_clear_cpu_cap(unsigned int feature) ++{ ++ do_clear_cpu_cap(NULL, feature); ++} +-- +2.14.2 + diff --git a/patches/kernel/0069-x86-fpu-Parse-clearcpuid-as-early-XSAVE-argument.patch b/patches/kernel/0069-x86-fpu-Parse-clearcpuid-as-early-XSAVE-argument.patch deleted file mode 100644 index b4c6d58..0000000 --- a/patches/kernel/0069-x86-fpu-Parse-clearcpuid-as-early-XSAVE-argument.patch +++ /dev/null @@ -1,97 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andi Kleen -Date: Fri, 13 Oct 2017 14:56:43 -0700 -Subject: [PATCH] x86/fpu: Parse clearcpuid= as early XSAVE argument -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -With a followon patch we want to make clearcpuid affect the XSAVE -configuration. But xsave is currently initialized before arguments -are parsed. 
Move the clearcpuid= parsing into the special -early xsave argument parsing code. - -Since clearcpuid= contains a = we need to keep the old __setup -around as a dummy, otherwise it would end up as a environment -variable in init's environment. - -Signed-off-by: Andi Kleen -Reviewed-by: Thomas Gleixner -Cc: Linus Torvalds -Cc: Peter Zijlstra -Link: http://lkml.kernel.org/r/20171013215645.23166-4-andi@firstfloor.org -Signed-off-by: Ingo Molnar -(cherry picked from commit 0c2a3913d6f50503f7c59d83a6219e39508cc898) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 27deb452eb0d27c406f3817ab057201aa8767abe) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/cpu/common.c | 16 +++++++--------- - arch/x86/kernel/fpu/init.c | 11 +++++++++++ - 2 files changed, 18 insertions(+), 9 deletions(-) - -diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c -index 4be7b209a3d6..ef7b1ba56363 100644 ---- a/arch/x86/kernel/cpu/common.c -+++ b/arch/x86/kernel/cpu/common.c -@@ -1293,18 +1293,16 @@ void print_cpu_info(struct cpuinfo_x86 *c) - pr_cont(")\n"); - } - --static __init int setup_disablecpuid(char *arg) -+/* -+ * clearcpuid= was already parsed in fpu__init_parse_early_param. -+ * But we need to keep a dummy __setup around otherwise it would -+ * show up as an environment variable for init. 
-+ */ -+static __init int setup_clearcpuid(char *arg) - { -- int bit; -- -- if (get_option(&arg, &bit) && bit >= 0 && bit < NCAPINTS * 32) -- setup_clear_cpu_cap(bit); -- else -- return 0; -- - return 1; - } --__setup("clearcpuid=", setup_disablecpuid); -+__setup("clearcpuid=", setup_clearcpuid); - - #ifdef CONFIG_X86_64 - struct desc_ptr idt_descr __ro_after_init = { -diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c -index d5d44c452624..07f0ab877f49 100644 ---- a/arch/x86/kernel/fpu/init.c -+++ b/arch/x86/kernel/fpu/init.c -@@ -249,6 +249,10 @@ static void __init fpu__init_system_ctx_switch(void) - */ - static void __init fpu__init_parse_early_param(void) - { -+ char arg[32]; -+ char *argptr = arg; -+ int bit; -+ - if (cmdline_find_option_bool(boot_command_line, "no387")) - setup_clear_cpu_cap(X86_FEATURE_FPU); - -@@ -266,6 +270,13 @@ static void __init fpu__init_parse_early_param(void) - - if (cmdline_find_option_bool(boot_command_line, "noxsaves")) - setup_clear_cpu_cap(X86_FEATURE_XSAVES); -+ -+ if (cmdline_find_option(boot_command_line, "clearcpuid", arg, -+ sizeof(arg)) && -+ get_option(&argptr, &bit) && -+ bit >= 0 && -+ bit < NCAPINTS * 32) -+ setup_clear_cpu_cap(bit); - } - - /* --- -2.14.2 - diff --git a/patches/kernel/0070-x86-fpu-Make-XSAVE-check-the-base-CPUID-features-bef.patch b/patches/kernel/0070-x86-fpu-Make-XSAVE-check-the-base-CPUID-features-bef.patch deleted file mode 100644 index 0c563c6..0000000 --- a/patches/kernel/0070-x86-fpu-Make-XSAVE-check-the-base-CPUID-features-bef.patch +++ /dev/null @@ -1,90 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andi Kleen -Date: Fri, 13 Oct 2017 14:56:44 -0700 -Subject: [PATCH] x86/fpu: Make XSAVE check the base CPUID features before - enabling -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Before enabling XSAVE, not only check the XSAVE specific CPUID bits, -but also the base CPUID 
features of the respective XSAVE feature. -This allows to disable individual XSAVE states using the existing -clearcpuid= option, which can be useful for performance testing -and debugging, and also in general avoids inconsistencies. - -Signed-off-by: Andi Kleen -Reviewed-by: Thomas Gleixner -Cc: Linus Torvalds -Cc: Peter Zijlstra -Link: http://lkml.kernel.org/r/20171013215645.23166-5-andi@firstfloor.org -Signed-off-by: Ingo Molnar -(cherry picked from commit ccb18db2ab9d923df07e7495123fe5fb02329713) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 2efda26f9ee0eeb9919772e90ca30dbe59008dc8) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/fpu/xstate.c | 23 +++++++++++++++++++++++ - 1 file changed, 23 insertions(+) - -diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c -index c24ac1efb12d..3abe85b08234 100644 ---- a/arch/x86/kernel/fpu/xstate.c -+++ b/arch/x86/kernel/fpu/xstate.c -@@ -15,6 +15,7 @@ - #include - - #include -+#include - - /* - * Although we spell it out in here, the Processor Trace -@@ -36,6 +37,19 @@ static const char *xfeature_names[] = - "unknown xstate feature" , - }; - -+static short xsave_cpuid_features[] __initdata = { -+ X86_FEATURE_FPU, -+ X86_FEATURE_XMM, -+ X86_FEATURE_AVX, -+ X86_FEATURE_MPX, -+ X86_FEATURE_MPX, -+ X86_FEATURE_AVX512F, -+ X86_FEATURE_AVX512F, -+ X86_FEATURE_AVX512F, -+ X86_FEATURE_INTEL_PT, -+ X86_FEATURE_PKU, -+}; -+ - /* - * Mask of xstate features supported by the CPU and the kernel: - */ -@@ -702,6 +716,7 @@ void __init fpu__init_system_xstate(void) - unsigned int eax, ebx, ecx, edx; - static int on_boot_cpu __initdata = 1; - int err; -+ int i; - - WARN_ON_FPU(!on_boot_cpu); - on_boot_cpu = 0; -@@ -735,6 +750,14 @@ void __init fpu__init_system_xstate(void) - goto out_disable; - } - -+ /* -+ * Clear XSAVE features that are disabled in the normal CPUID. 
-+ */ -+ for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) { -+ if (!boot_cpu_has(xsave_cpuid_features[i])) -+ xfeatures_mask &= ~BIT(i); -+ } -+ - xfeatures_mask &= fpu__get_supported_xfeatures_mask(); - - /* Enable xstate instructions to be able to continue with initialization: */ --- -2.14.2 - diff --git a/patches/kernel/0070-x86-fpu-Parse-clearcpuid-as-early-XSAVE-argument.patch b/patches/kernel/0070-x86-fpu-Parse-clearcpuid-as-early-XSAVE-argument.patch new file mode 100644 index 0000000..b4c6d58 --- /dev/null +++ b/patches/kernel/0070-x86-fpu-Parse-clearcpuid-as-early-XSAVE-argument.patch @@ -0,0 +1,97 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andi Kleen +Date: Fri, 13 Oct 2017 14:56:43 -0700 +Subject: [PATCH] x86/fpu: Parse clearcpuid= as early XSAVE argument +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +With a followon patch we want to make clearcpuid affect the XSAVE +configuration. But xsave is currently initialized before arguments +are parsed. Move the clearcpuid= parsing into the special +early xsave argument parsing code. + +Since clearcpuid= contains a = we need to keep the old __setup +around as a dummy, otherwise it would end up as a environment +variable in init's environment. 
+ +Signed-off-by: Andi Kleen +Reviewed-by: Thomas Gleixner +Cc: Linus Torvalds +Cc: Peter Zijlstra +Link: http://lkml.kernel.org/r/20171013215645.23166-4-andi@firstfloor.org +Signed-off-by: Ingo Molnar +(cherry picked from commit 0c2a3913d6f50503f7c59d83a6219e39508cc898) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 27deb452eb0d27c406f3817ab057201aa8767abe) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/cpu/common.c | 16 +++++++--------- + arch/x86/kernel/fpu/init.c | 11 +++++++++++ + 2 files changed, 18 insertions(+), 9 deletions(-) + +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 4be7b209a3d6..ef7b1ba56363 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -1293,18 +1293,16 @@ void print_cpu_info(struct cpuinfo_x86 *c) + pr_cont(")\n"); + } + +-static __init int setup_disablecpuid(char *arg) ++/* ++ * clearcpuid= was already parsed in fpu__init_parse_early_param. ++ * But we need to keep a dummy __setup around otherwise it would ++ * show up as an environment variable for init. 
++ */ ++static __init int setup_clearcpuid(char *arg) + { +- int bit; +- +- if (get_option(&arg, &bit) && bit >= 0 && bit < NCAPINTS * 32) +- setup_clear_cpu_cap(bit); +- else +- return 0; +- + return 1; + } +-__setup("clearcpuid=", setup_disablecpuid); ++__setup("clearcpuid=", setup_clearcpuid); + + #ifdef CONFIG_X86_64 + struct desc_ptr idt_descr __ro_after_init = { +diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c +index d5d44c452624..07f0ab877f49 100644 +--- a/arch/x86/kernel/fpu/init.c ++++ b/arch/x86/kernel/fpu/init.c +@@ -249,6 +249,10 @@ static void __init fpu__init_system_ctx_switch(void) + */ + static void __init fpu__init_parse_early_param(void) + { ++ char arg[32]; ++ char *argptr = arg; ++ int bit; ++ + if (cmdline_find_option_bool(boot_command_line, "no387")) + setup_clear_cpu_cap(X86_FEATURE_FPU); + +@@ -266,6 +270,13 @@ static void __init fpu__init_parse_early_param(void) + + if (cmdline_find_option_bool(boot_command_line, "noxsaves")) + setup_clear_cpu_cap(X86_FEATURE_XSAVES); ++ ++ if (cmdline_find_option(boot_command_line, "clearcpuid", arg, ++ sizeof(arg)) && ++ get_option(&argptr, &bit) && ++ bit >= 0 && ++ bit < NCAPINTS * 32) ++ setup_clear_cpu_cap(bit); + } + + /* +-- +2.14.2 + diff --git a/patches/kernel/0071-x86-fpu-Make-XSAVE-check-the-base-CPUID-features-bef.patch b/patches/kernel/0071-x86-fpu-Make-XSAVE-check-the-base-CPUID-features-bef.patch new file mode 100644 index 0000000..0c563c6 --- /dev/null +++ b/patches/kernel/0071-x86-fpu-Make-XSAVE-check-the-base-CPUID-features-bef.patch @@ -0,0 +1,90 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andi Kleen +Date: Fri, 13 Oct 2017 14:56:44 -0700 +Subject: [PATCH] x86/fpu: Make XSAVE check the base CPUID features before + enabling +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Before enabling XSAVE, not only check the XSAVE specific CPUID bits, +but also the base CPUID 
features of the respective XSAVE feature. +This allows to disable individual XSAVE states using the existing +clearcpuid= option, which can be useful for performance testing +and debugging, and also in general avoids inconsistencies. + +Signed-off-by: Andi Kleen +Reviewed-by: Thomas Gleixner +Cc: Linus Torvalds +Cc: Peter Zijlstra +Link: http://lkml.kernel.org/r/20171013215645.23166-5-andi@firstfloor.org +Signed-off-by: Ingo Molnar +(cherry picked from commit ccb18db2ab9d923df07e7495123fe5fb02329713) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 2efda26f9ee0eeb9919772e90ca30dbe59008dc8) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/fpu/xstate.c | 23 +++++++++++++++++++++++ + 1 file changed, 23 insertions(+) + +diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c +index c24ac1efb12d..3abe85b08234 100644 +--- a/arch/x86/kernel/fpu/xstate.c ++++ b/arch/x86/kernel/fpu/xstate.c +@@ -15,6 +15,7 @@ + #include + + #include ++#include + + /* + * Although we spell it out in here, the Processor Trace +@@ -36,6 +37,19 @@ static const char *xfeature_names[] = + "unknown xstate feature" , + }; + ++static short xsave_cpuid_features[] __initdata = { ++ X86_FEATURE_FPU, ++ X86_FEATURE_XMM, ++ X86_FEATURE_AVX, ++ X86_FEATURE_MPX, ++ X86_FEATURE_MPX, ++ X86_FEATURE_AVX512F, ++ X86_FEATURE_AVX512F, ++ X86_FEATURE_AVX512F, ++ X86_FEATURE_INTEL_PT, ++ X86_FEATURE_PKU, ++}; ++ + /* + * Mask of xstate features supported by the CPU and the kernel: + */ +@@ -702,6 +716,7 @@ void __init fpu__init_system_xstate(void) + unsigned int eax, ebx, ecx, edx; + static int on_boot_cpu __initdata = 1; + int err; ++ int i; + + WARN_ON_FPU(!on_boot_cpu); + on_boot_cpu = 0; +@@ -735,6 +750,14 @@ void __init fpu__init_system_xstate(void) + goto out_disable; + } + ++ /* ++ * Clear XSAVE features that are disabled in the normal CPUID. 
++ */ ++ for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) { ++ if (!boot_cpu_has(xsave_cpuid_features[i])) ++ xfeatures_mask &= ~BIT(i); ++ } ++ + xfeatures_mask &= fpu__get_supported_xfeatures_mask(); + + /* Enable xstate instructions to be able to continue with initialization: */ +-- +2.14.2 + diff --git a/patches/kernel/0071-x86-fpu-Remove-the-explicit-clearing-of-XSAVE-depend.patch b/patches/kernel/0071-x86-fpu-Remove-the-explicit-clearing-of-XSAVE-depend.patch deleted file mode 100644 index 91e271b..0000000 --- a/patches/kernel/0071-x86-fpu-Remove-the-explicit-clearing-of-XSAVE-depend.patch +++ /dev/null @@ -1,70 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andi Kleen -Date: Fri, 13 Oct 2017 14:56:45 -0700 -Subject: [PATCH] x86/fpu: Remove the explicit clearing of XSAVE dependent - features -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Clearing a CPU feature with setup_clear_cpu_cap() clears all features -which depend on it. Expressing feature dependencies in one place is -easier to maintain than keeping functions like -fpu__xstate_clear_all_cpu_caps() up to date. - -The features which depend on XSAVE have their dependency expressed in the -dependency table, so its sufficient to clear X86_FEATURE_XSAVE. - -Remove the explicit clearing of XSAVE dependent features. 
- -Signed-off-by: Andi Kleen -Reviewed-by: Thomas Gleixner -Cc: Linus Torvalds -Cc: Peter Zijlstra -Link: http://lkml.kernel.org/r/20171013215645.23166-6-andi@firstfloor.org -Signed-off-by: Ingo Molnar -(cherry picked from commit 73e3a7d2a7c3be29a5a22b85026f6cfa5664267f) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit af445f9ba8bb30b47ccb5247b8f5ba28c9f2be3e) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/fpu/xstate.c | 20 -------------------- - 1 file changed, 20 deletions(-) - -diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c -index 3abe85b08234..fd6882c42246 100644 ---- a/arch/x86/kernel/fpu/xstate.c -+++ b/arch/x86/kernel/fpu/xstate.c -@@ -73,26 +73,6 @@ unsigned int fpu_user_xstate_size; - void fpu__xstate_clear_all_cpu_caps(void) - { - setup_clear_cpu_cap(X86_FEATURE_XSAVE); -- setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); -- setup_clear_cpu_cap(X86_FEATURE_XSAVEC); -- setup_clear_cpu_cap(X86_FEATURE_XSAVES); -- setup_clear_cpu_cap(X86_FEATURE_AVX); -- setup_clear_cpu_cap(X86_FEATURE_AVX2); -- setup_clear_cpu_cap(X86_FEATURE_AVX512F); -- setup_clear_cpu_cap(X86_FEATURE_AVX512IFMA); -- setup_clear_cpu_cap(X86_FEATURE_AVX512PF); -- setup_clear_cpu_cap(X86_FEATURE_AVX512ER); -- setup_clear_cpu_cap(X86_FEATURE_AVX512CD); -- setup_clear_cpu_cap(X86_FEATURE_AVX512DQ); -- setup_clear_cpu_cap(X86_FEATURE_AVX512BW); -- setup_clear_cpu_cap(X86_FEATURE_AVX512VL); -- setup_clear_cpu_cap(X86_FEATURE_MPX); -- setup_clear_cpu_cap(X86_FEATURE_XGETBV1); -- setup_clear_cpu_cap(X86_FEATURE_AVX512VBMI); -- setup_clear_cpu_cap(X86_FEATURE_PKU); -- setup_clear_cpu_cap(X86_FEATURE_AVX512_4VNNIW); -- setup_clear_cpu_cap(X86_FEATURE_AVX512_4FMAPS); -- setup_clear_cpu_cap(X86_FEATURE_AVX512_VPOPCNTDQ); - } - - /* --- -2.14.2 - diff --git a/patches/kernel/0072-x86-fpu-Remove-the-explicit-clearing-of-XSAVE-depend.patch b/patches/kernel/0072-x86-fpu-Remove-the-explicit-clearing-of-XSAVE-depend.patch new 
file mode 100644 index 0000000..91e271b --- /dev/null +++ b/patches/kernel/0072-x86-fpu-Remove-the-explicit-clearing-of-XSAVE-depend.patch @@ -0,0 +1,70 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andi Kleen +Date: Fri, 13 Oct 2017 14:56:45 -0700 +Subject: [PATCH] x86/fpu: Remove the explicit clearing of XSAVE dependent + features +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Clearing a CPU feature with setup_clear_cpu_cap() clears all features +which depend on it. Expressing feature dependencies in one place is +easier to maintain than keeping functions like +fpu__xstate_clear_all_cpu_caps() up to date. + +The features which depend on XSAVE have their dependency expressed in the +dependency table, so its sufficient to clear X86_FEATURE_XSAVE. + +Remove the explicit clearing of XSAVE dependent features. + +Signed-off-by: Andi Kleen +Reviewed-by: Thomas Gleixner +Cc: Linus Torvalds +Cc: Peter Zijlstra +Link: http://lkml.kernel.org/r/20171013215645.23166-6-andi@firstfloor.org +Signed-off-by: Ingo Molnar +(cherry picked from commit 73e3a7d2a7c3be29a5a22b85026f6cfa5664267f) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit af445f9ba8bb30b47ccb5247b8f5ba28c9f2be3e) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/fpu/xstate.c | 20 -------------------- + 1 file changed, 20 deletions(-) + +diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c +index 3abe85b08234..fd6882c42246 100644 +--- a/arch/x86/kernel/fpu/xstate.c ++++ b/arch/x86/kernel/fpu/xstate.c +@@ -73,26 +73,6 @@ unsigned int fpu_user_xstate_size; + void fpu__xstate_clear_all_cpu_caps(void) + { + setup_clear_cpu_cap(X86_FEATURE_XSAVE); +- setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); +- setup_clear_cpu_cap(X86_FEATURE_XSAVEC); +- setup_clear_cpu_cap(X86_FEATURE_XSAVES); +- setup_clear_cpu_cap(X86_FEATURE_AVX); +- 
setup_clear_cpu_cap(X86_FEATURE_AVX2); +- setup_clear_cpu_cap(X86_FEATURE_AVX512F); +- setup_clear_cpu_cap(X86_FEATURE_AVX512IFMA); +- setup_clear_cpu_cap(X86_FEATURE_AVX512PF); +- setup_clear_cpu_cap(X86_FEATURE_AVX512ER); +- setup_clear_cpu_cap(X86_FEATURE_AVX512CD); +- setup_clear_cpu_cap(X86_FEATURE_AVX512DQ); +- setup_clear_cpu_cap(X86_FEATURE_AVX512BW); +- setup_clear_cpu_cap(X86_FEATURE_AVX512VL); +- setup_clear_cpu_cap(X86_FEATURE_MPX); +- setup_clear_cpu_cap(X86_FEATURE_XGETBV1); +- setup_clear_cpu_cap(X86_FEATURE_AVX512VBMI); +- setup_clear_cpu_cap(X86_FEATURE_PKU); +- setup_clear_cpu_cap(X86_FEATURE_AVX512_4VNNIW); +- setup_clear_cpu_cap(X86_FEATURE_AVX512_4FMAPS); +- setup_clear_cpu_cap(X86_FEATURE_AVX512_VPOPCNTDQ); + } + + /* +-- +2.14.2 + diff --git a/patches/kernel/0072-x86-platform-UV-Convert-timers-to-use-timer_setup.patch b/patches/kernel/0072-x86-platform-UV-Convert-timers-to-use-timer_setup.patch deleted file mode 100644 index 02e2fb8..0000000 --- a/patches/kernel/0072-x86-platform-UV-Convert-timers-to-use-timer_setup.patch +++ /dev/null @@ -1,57 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Kees Cook -Date: Mon, 16 Oct 2017 16:22:31 -0700 -Subject: [PATCH] x86/platform/UV: Convert timers to use timer_setup() -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -In preparation for unconditionally passing the struct timer_list pointer to -all timer callbacks, switch to using the new timer_setup() and from_timer() -to pass the timer pointer explicitly. 
- -Signed-off-by: Kees Cook -Signed-off-by: Thomas Gleixner -Cc: Dimitri Sivanich -Cc: Russ Anderson -Cc: Mike Travis -Link: https://lkml.kernel.org/r/20171016232231.GA100493@beast - -(cherry picked from commit 376f3bcebdc999cc737d9052109cc33b573b3a8b) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 869cbd2b31024e70d574527b8c6851bf2ebbe483) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/apic/x2apic_uv_x.c | 5 ++--- - 1 file changed, 2 insertions(+), 3 deletions(-) - -diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c -index 0d57bb9079c9..c0b694810ff4 100644 ---- a/arch/x86/kernel/apic/x2apic_uv_x.c -+++ b/arch/x86/kernel/apic/x2apic_uv_x.c -@@ -920,9 +920,8 @@ static __init void uv_rtc_init(void) - /* - * percpu heartbeat timer - */ --static void uv_heartbeat(unsigned long ignored) -+static void uv_heartbeat(struct timer_list *timer) - { -- struct timer_list *timer = &uv_scir_info->timer; - unsigned char bits = uv_scir_info->state; - - /* Flip heartbeat bit: */ -@@ -947,7 +946,7 @@ static int uv_heartbeat_enable(unsigned int cpu) - struct timer_list *timer = &uv_cpu_scir_info(cpu)->timer; - - uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); -- setup_pinned_timer(timer, uv_heartbeat, cpu); -+ timer_setup(timer, uv_heartbeat, TIMER_PINNED); - timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; - add_timer_on(timer, cpu); - uv_cpu_scir_info(cpu)->enabled = 1; --- -2.14.2 - diff --git a/patches/kernel/0073-objtool-Print-top-level-commands-on-incorrect-usage.patch b/patches/kernel/0073-objtool-Print-top-level-commands-on-incorrect-usage.patch deleted file mode 100644 index 5c596ce..0000000 --- a/patches/kernel/0073-objtool-Print-top-level-commands-on-incorrect-usage.patch +++ /dev/null @@ -1,73 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Kamalesh Babulal -Date: Sat, 14 Oct 2017 20:17:54 +0530 -Subject: [PATCH] objtool: 
Print top level commands on incorrect usage -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Print top-level objtool commands, along with the error on incorrect -command line usage. Objtool command line parser exit's with code 129, -for incorrect usage. Convert the cmd_usage() exit code also, to maintain -consistency across objtool. - -After the patch: - - $ ./objtool -j - - Unknown option: -j - - usage: objtool COMMAND [ARGS] - - Commands: - check Perform stack metadata validation on an object file - orc Generate in-place ORC unwind tables for an object file - - $ echo $? - 129 - -Signed-off-by: Kamalesh Babulal -Acked-by: Josh Poimboeuf -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/1507992474-16142-1-git-send-email-kamalesh@linux.vnet.ibm.com -Signed-off-by: Ingo Molnar -(cherry picked from commit 6a93bb7e4a7d6670677d5b0eb980936eb9cc5d2e) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit cd75c9c55a5f288e1d3f20c48c5c4c2caf3966e8) -Signed-off-by: Fabian Grünbichler ---- - tools/objtool/objtool.c | 6 ++---- - 1 file changed, 2 insertions(+), 4 deletions(-) - -diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c -index 31e0f9143840..07f329919828 100644 ---- a/tools/objtool/objtool.c -+++ b/tools/objtool/objtool.c -@@ -70,7 +70,7 @@ static void cmd_usage(void) - - printf("\n"); - -- exit(1); -+ exit(129); - } - - static void handle_options(int *argc, const char ***argv) -@@ -86,9 +86,7 @@ static void handle_options(int *argc, const char ***argv) - break; - } else { - fprintf(stderr, "Unknown option: %s\n", cmd); -- fprintf(stderr, "\n Usage: %s\n", -- objtool_usage_string); -- exit(1); -+ cmd_usage(); - } - - (*argv)++; --- -2.14.2 - diff --git a/patches/kernel/0073-x86-platform-UV-Convert-timers-to-use-timer_setup.patch b/patches/kernel/0073-x86-platform-UV-Convert-timers-to-use-timer_setup.patch 
new file mode 100644 index 0000000..02e2fb8 --- /dev/null +++ b/patches/kernel/0073-x86-platform-UV-Convert-timers-to-use-timer_setup.patch @@ -0,0 +1,57 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Kees Cook +Date: Mon, 16 Oct 2017 16:22:31 -0700 +Subject: [PATCH] x86/platform/UV: Convert timers to use timer_setup() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +In preparation for unconditionally passing the struct timer_list pointer to +all timer callbacks, switch to using the new timer_setup() and from_timer() +to pass the timer pointer explicitly. + +Signed-off-by: Kees Cook +Signed-off-by: Thomas Gleixner +Cc: Dimitri Sivanich +Cc: Russ Anderson +Cc: Mike Travis +Link: https://lkml.kernel.org/r/20171016232231.GA100493@beast + +(cherry picked from commit 376f3bcebdc999cc737d9052109cc33b573b3a8b) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 869cbd2b31024e70d574527b8c6851bf2ebbe483) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/apic/x2apic_uv_x.c | 5 ++--- + 1 file changed, 2 insertions(+), 3 deletions(-) + +diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c +index 0d57bb9079c9..c0b694810ff4 100644 +--- a/arch/x86/kernel/apic/x2apic_uv_x.c ++++ b/arch/x86/kernel/apic/x2apic_uv_x.c +@@ -920,9 +920,8 @@ static __init void uv_rtc_init(void) + /* + * percpu heartbeat timer + */ +-static void uv_heartbeat(unsigned long ignored) ++static void uv_heartbeat(struct timer_list *timer) + { +- struct timer_list *timer = &uv_scir_info->timer; + unsigned char bits = uv_scir_info->state; + + /* Flip heartbeat bit: */ +@@ -947,7 +946,7 @@ static int uv_heartbeat_enable(unsigned int cpu) + struct timer_list *timer = &uv_cpu_scir_info(cpu)->timer; + + uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); +- setup_pinned_timer(timer, uv_heartbeat, cpu); ++ 
timer_setup(timer, uv_heartbeat, TIMER_PINNED); + timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; + add_timer_on(timer, cpu); + uv_cpu_scir_info(cpu)->enabled = 1; +-- +2.14.2 + diff --git a/patches/kernel/0074-objtool-Print-top-level-commands-on-incorrect-usage.patch b/patches/kernel/0074-objtool-Print-top-level-commands-on-incorrect-usage.patch new file mode 100644 index 0000000..5c596ce --- /dev/null +++ b/patches/kernel/0074-objtool-Print-top-level-commands-on-incorrect-usage.patch @@ -0,0 +1,73 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Kamalesh Babulal +Date: Sat, 14 Oct 2017 20:17:54 +0530 +Subject: [PATCH] objtool: Print top level commands on incorrect usage +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Print top-level objtool commands, along with the error on incorrect +command line usage. Objtool command line parser exit's with code 129, +for incorrect usage. Convert the cmd_usage() exit code also, to maintain +consistency across objtool. + +After the patch: + + $ ./objtool -j + + Unknown option: -j + + usage: objtool COMMAND [ARGS] + + Commands: + check Perform stack metadata validation on an object file + orc Generate in-place ORC unwind tables for an object file + + $ echo $? 
+ 129 + +Signed-off-by: Kamalesh Babulal +Acked-by: Josh Poimboeuf +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/1507992474-16142-1-git-send-email-kamalesh@linux.vnet.ibm.com +Signed-off-by: Ingo Molnar +(cherry picked from commit 6a93bb7e4a7d6670677d5b0eb980936eb9cc5d2e) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit cd75c9c55a5f288e1d3f20c48c5c4c2caf3966e8) +Signed-off-by: Fabian Grünbichler +--- + tools/objtool/objtool.c | 6 ++---- + 1 file changed, 2 insertions(+), 4 deletions(-) + +diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c +index 31e0f9143840..07f329919828 100644 +--- a/tools/objtool/objtool.c ++++ b/tools/objtool/objtool.c +@@ -70,7 +70,7 @@ static void cmd_usage(void) + + printf("\n"); + +- exit(1); ++ exit(129); + } + + static void handle_options(int *argc, const char ***argv) +@@ -86,9 +86,7 @@ static void handle_options(int *argc, const char ***argv) + break; + } else { + fprintf(stderr, "Unknown option: %s\n", cmd); +- fprintf(stderr, "\n Usage: %s\n", +- objtool_usage_string); +- exit(1); ++ cmd_usage(); + } + + (*argv)++; +-- +2.14.2 + diff --git a/patches/kernel/0074-x86-cpuid-Prevent-out-of-bound-access-in-do_clear_cp.patch b/patches/kernel/0074-x86-cpuid-Prevent-out-of-bound-access-in-do_clear_cp.patch deleted file mode 100644 index aad4047..0000000 --- a/patches/kernel/0074-x86-cpuid-Prevent-out-of-bound-access-in-do_clear_cp.patch +++ /dev/null @@ -1,64 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Wed, 18 Oct 2017 19:39:35 +0200 -Subject: [PATCH] x86/cpuid: Prevent out of bound access in do_clear_cpu_cap() -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -do_clear_cpu_cap() allocates a bitmap to keep track of disabled feature -dependencies. That bitmap is sized NCAPINTS * BITS_PER_INIT. 
The possible -'features' which can be handed in are larger than this, because after the -capabilities the bug 'feature' bits occupy another 32bit. Not really -obvious... - -So clearing any of the misfeature bits, as 32bit does for the F00F bug, -accesses that bitmap out of bounds thereby corrupting the stack. - -Size the bitmap proper and add a sanity check to catch accidental out of -bound access. - -Fixes: 0b00de857a64 ("x86/cpuid: Add generic table for CPUID dependencies") -Reported-by: kernel test robot -Signed-off-by: Thomas Gleixner -Cc: Andi Kleen -Cc: Borislav Petkov -Link: https://lkml.kernel.org/r/20171018022023.GA12058@yexl-desktop -(cherry picked from commit 57b8b1a1856adaa849d02d547411a553a531022b) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 4b3a90bd20b35a97fd9ca6f6a71131f4417782e4) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/cpu/cpuid-deps.c | 10 ++++++++-- - 1 file changed, 8 insertions(+), 2 deletions(-) - -diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c -index e48eb7313120..c1d49842a411 100644 ---- a/arch/x86/kernel/cpu/cpuid-deps.c -+++ b/arch/x86/kernel/cpu/cpuid-deps.c -@@ -75,11 +75,17 @@ static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature) - __clear_cpu_cap(c, feature); - } - -+/* Take the capabilities and the BUG bits into account */ -+#define MAX_FEATURE_BITS ((NCAPINTS + NBUGINTS) * sizeof(u32) * 8) -+ - static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature) - { -- bool changed; -- DECLARE_BITMAP(disable, NCAPINTS * sizeof(u32) * 8); -+ DECLARE_BITMAP(disable, MAX_FEATURE_BITS); - const struct cpuid_dep *d; -+ bool changed; -+ -+ if (WARN_ON(feature >= MAX_FEATURE_BITS)) -+ return; - - clear_feature(c, feature); - --- -2.14.2 - diff --git a/patches/kernel/0075-x86-cpuid-Prevent-out-of-bound-access-in-do_clear_cp.patch 
b/patches/kernel/0075-x86-cpuid-Prevent-out-of-bound-access-in-do_clear_cp.patch new file mode 100644 index 0000000..aad4047 --- /dev/null +++ b/patches/kernel/0075-x86-cpuid-Prevent-out-of-bound-access-in-do_clear_cp.patch @@ -0,0 +1,64 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Wed, 18 Oct 2017 19:39:35 +0200 +Subject: [PATCH] x86/cpuid: Prevent out of bound access in do_clear_cpu_cap() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +do_clear_cpu_cap() allocates a bitmap to keep track of disabled feature +dependencies. That bitmap is sized NCAPINTS * BITS_PER_INIT. The possible +'features' which can be handed in are larger than this, because after the +capabilities the bug 'feature' bits occupy another 32bit. Not really +obvious... + +So clearing any of the misfeature bits, as 32bit does for the F00F bug, +accesses that bitmap out of bounds thereby corrupting the stack. + +Size the bitmap proper and add a sanity check to catch accidental out of +bound access. 
+ +Fixes: 0b00de857a64 ("x86/cpuid: Add generic table for CPUID dependencies") +Reported-by: kernel test robot +Signed-off-by: Thomas Gleixner +Cc: Andi Kleen +Cc: Borislav Petkov +Link: https://lkml.kernel.org/r/20171018022023.GA12058@yexl-desktop +(cherry picked from commit 57b8b1a1856adaa849d02d547411a553a531022b) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 4b3a90bd20b35a97fd9ca6f6a71131f4417782e4) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/cpu/cpuid-deps.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c +index e48eb7313120..c1d49842a411 100644 +--- a/arch/x86/kernel/cpu/cpuid-deps.c ++++ b/arch/x86/kernel/cpu/cpuid-deps.c +@@ -75,11 +75,17 @@ static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature) + __clear_cpu_cap(c, feature); + } + ++/* Take the capabilities and the BUG bits into account */ ++#define MAX_FEATURE_BITS ((NCAPINTS + NBUGINTS) * sizeof(u32) * 8) ++ + static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature) + { +- bool changed; +- DECLARE_BITMAP(disable, NCAPINTS * sizeof(u32) * 8); ++ DECLARE_BITMAP(disable, MAX_FEATURE_BITS); + const struct cpuid_dep *d; ++ bool changed; ++ ++ if (WARN_ON(feature >= MAX_FEATURE_BITS)) ++ return; + + clear_feature(c, feature); + +-- +2.14.2 + diff --git a/patches/kernel/0075-x86-entry-Use-SYSCALL_DEFINE-macros-for-sys_modify_l.patch b/patches/kernel/0075-x86-entry-Use-SYSCALL_DEFINE-macros-for-sys_modify_l.patch deleted file mode 100644 index af37b4e..0000000 --- a/patches/kernel/0075-x86-entry-Use-SYSCALL_DEFINE-macros-for-sys_modify_l.patch +++ /dev/null @@ -1,124 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Dave Hansen -Date: Wed, 18 Oct 2017 10:21:07 -0700 -Subject: [PATCH] x86/entry: Use SYSCALL_DEFINE() macros for sys_modify_ldt() -MIME-Version: 1.0 
-Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -We do not have tracepoints for sys_modify_ldt() because we define -it directly instead of using the normal SYSCALL_DEFINEx() macros. - -However, there is a reason sys_modify_ldt() does not use the macros: -it has an 'int' return type instead of 'unsigned long'. This is -a bug, but it's a bug cemented in the ABI. - -What does this mean? If we return -EINVAL from a function that -returns 'int', we have 0x00000000ffffffea in %rax. But, if we -return -EINVAL from a function returning 'unsigned long', we end -up with 0xffffffffffffffea in %rax, which is wrong. - -To work around this and maintain the 'int' behavior while using -the SYSCALL_DEFINEx() macros, so we add a cast to 'unsigned int' -in both implementations of sys_modify_ldt(). - -Signed-off-by: Dave Hansen -Reviewed-by: Andy Lutomirski -Reviewed-by: Brian Gerst -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/20171018172107.1A79C532@viggo.jf.intel.com -Signed-off-by: Ingo Molnar -(cherry picked from commit da20ab35180780e4a6eadc804544f1fa967f3567) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit d865f635f4b2c3307e79de9be5c49ea8bd4c43a6) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/syscalls.h | 2 +- - arch/x86/kernel/ldt.c | 16 +++++++++++++--- - arch/x86/um/ldt.c | 7 +++++-- - 3 files changed, 19 insertions(+), 6 deletions(-) - -diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h -index 91dfcafe27a6..bad25bb80679 100644 ---- a/arch/x86/include/asm/syscalls.h -+++ b/arch/x86/include/asm/syscalls.h -@@ -21,7 +21,7 @@ asmlinkage long sys_ioperm(unsigned long, unsigned long, int); - asmlinkage long sys_iopl(unsigned int); - - /* kernel/ldt.c */ --asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); -+asmlinkage long sys_modify_ldt(int, void __user *, unsigned long); - - 
/* kernel/signal.c */ - asmlinkage long sys_rt_sigreturn(void); -diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c -index f0e64db18ac8..0402d44deb4d 100644 ---- a/arch/x86/kernel/ldt.c -+++ b/arch/x86/kernel/ldt.c -@@ -12,6 +12,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -294,8 +295,8 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) - return error; - } - --asmlinkage int sys_modify_ldt(int func, void __user *ptr, -- unsigned long bytecount) -+SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr , -+ unsigned long , bytecount) - { - int ret = -ENOSYS; - -@@ -313,5 +314,14 @@ asmlinkage int sys_modify_ldt(int func, void __user *ptr, - ret = write_ldt(ptr, bytecount, 0); - break; - } -- return ret; -+ /* -+ * The SYSCALL_DEFINE() macros give us an 'unsigned long' -+ * return type, but tht ABI for sys_modify_ldt() expects -+ * 'int'. This cast gives us an int-sized value in %rax -+ * for the return code. The 'unsigned' is necessary so -+ * the compiler does not try to sign-extend the negative -+ * return codes into the high half of the register when -+ * taking the value from int->long. 
-+ */ -+ return (unsigned int)ret; - } -diff --git a/arch/x86/um/ldt.c b/arch/x86/um/ldt.c -index 836a1eb5df43..3ee234b6234d 100644 ---- a/arch/x86/um/ldt.c -+++ b/arch/x86/um/ldt.c -@@ -6,6 +6,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -369,7 +370,9 @@ void free_ldt(struct mm_context *mm) - mm->arch.ldt.entry_count = 0; - } - --int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) -+SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr , -+ unsigned long , bytecount) - { -- return do_modify_ldt_skas(func, ptr, bytecount); -+ /* See non-um modify_ldt() for why we do this cast */ -+ return (unsigned int)do_modify_ldt_skas(func, ptr, bytecount); - } --- -2.14.2 - diff --git a/patches/kernel/0076-mm-sparsemem-Allocate-mem_section-at-runtime-for-CON.patch b/patches/kernel/0076-mm-sparsemem-Allocate-mem_section-at-runtime-for-CON.patch deleted file mode 100644 index d0045b1..0000000 --- a/patches/kernel/0076-mm-sparsemem-Allocate-mem_section-at-runtime-for-CON.patch +++ /dev/null @@ -1,141 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: "Kirill A. Shutemov" -Date: Fri, 29 Sep 2017 17:08:16 +0300 -Subject: [PATCH] mm/sparsemem: Allocate mem_section at runtime for - CONFIG_SPARSEMEM_EXTREME=y -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Size of the mem_section[] array depends on the size of the physical address space. - -In preparation for boot-time switching between paging modes on x86-64 -we need to make the allocation of mem_section[] dynamic, because otherwise -we waste a lot of RAM: with CONFIG_NODE_SHIFT=10, mem_section[] size is 32kB -for 4-level paging and 2MB for 5-level paging mode. - -The patch allocates the array on the first call to sparse_memory_present_with_active_regions(). - -Signed-off-by: Kirill A. 
Shutemov -Cc: Andrew Morton -Cc: Andy Lutomirski -Cc: Borislav Petkov -Cc: Cyrill Gorcunov -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Cc: linux-mm@kvack.org -Link: http://lkml.kernel.org/r/20170929140821.37654-2-kirill.shutemov@linux.intel.com -Signed-off-by: Ingo Molnar -(cherry picked from commit 83e3c48729d9ebb7af5a31a504f3fd6aff0348c4) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit c70f71e01a0ae5d884abae0424618abe90b82011) -Signed-off-by: Fabian Grünbichler ---- - include/linux/mmzone.h | 6 +++++- - mm/page_alloc.c | 10 ++++++++++ - mm/sparse.c | 17 +++++++++++------ - 3 files changed, 26 insertions(+), 7 deletions(-) - -diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h -index fc14b8b3f6ce..9c6c001a8c6c 100644 ---- a/include/linux/mmzone.h -+++ b/include/linux/mmzone.h -@@ -1137,13 +1137,17 @@ struct mem_section { - #define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1) - - #ifdef CONFIG_SPARSEMEM_EXTREME --extern struct mem_section *mem_section[NR_SECTION_ROOTS]; -+extern struct mem_section **mem_section; - #else - extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]; - #endif - - static inline struct mem_section *__nr_to_section(unsigned long nr) - { -+#ifdef CONFIG_SPARSEMEM_EXTREME -+ if (!mem_section) -+ return NULL; -+#endif - if (!mem_section[SECTION_NR_TO_ROOT(nr)]) - return NULL; - return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; -diff --git a/mm/page_alloc.c b/mm/page_alloc.c -index 1423da8dd16f..66eb23ab658d 100644 ---- a/mm/page_alloc.c -+++ b/mm/page_alloc.c -@@ -5707,6 +5707,16 @@ void __init sparse_memory_present_with_active_regions(int nid) - unsigned long start_pfn, end_pfn; - int i, this_nid; - -+#ifdef CONFIG_SPARSEMEM_EXTREME -+ if (!mem_section) { -+ unsigned long size, align; -+ -+ size = sizeof(struct mem_section) * NR_SECTION_ROOTS; -+ align = 1 << (INTERNODE_CACHE_SHIFT); -+ mem_section = memblock_virt_alloc(size, 
align); -+ } -+#endif -+ - for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) - memory_present(this_nid, start_pfn, end_pfn); - } -diff --git a/mm/sparse.c b/mm/sparse.c -index cdce7a7bb3f3..308a0789d1bb 100644 ---- a/mm/sparse.c -+++ b/mm/sparse.c -@@ -22,8 +22,7 @@ - * 1) mem_section - memory sections, mem_map's for valid memory - */ - #ifdef CONFIG_SPARSEMEM_EXTREME --struct mem_section *mem_section[NR_SECTION_ROOTS] -- ____cacheline_internodealigned_in_smp; -+struct mem_section **mem_section; - #else - struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT] - ____cacheline_internodealigned_in_smp; -@@ -104,7 +103,7 @@ static inline int sparse_index_init(unsigned long section_nr, int nid) - int __section_nr(struct mem_section* ms) - { - unsigned long root_nr; -- struct mem_section* root; -+ struct mem_section *root = NULL; - - for (root_nr = 0; root_nr < NR_SECTION_ROOTS; root_nr++) { - root = __nr_to_section(root_nr * SECTIONS_PER_ROOT); -@@ -115,7 +114,7 @@ int __section_nr(struct mem_section* ms) - break; - } - -- VM_BUG_ON(root_nr == NR_SECTION_ROOTS); -+ VM_BUG_ON(!root); - - return (root_nr * SECTIONS_PER_ROOT) + (ms - root); - } -@@ -333,11 +332,17 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, - static void __init check_usemap_section_nr(int nid, unsigned long *usemap) - { - unsigned long usemap_snr, pgdat_snr; -- static unsigned long old_usemap_snr = NR_MEM_SECTIONS; -- static unsigned long old_pgdat_snr = NR_MEM_SECTIONS; -+ static unsigned long old_usemap_snr; -+ static unsigned long old_pgdat_snr; - struct pglist_data *pgdat = NODE_DATA(nid); - int usemap_nid; - -+ /* First call */ -+ if (!old_usemap_snr) { -+ old_usemap_snr = NR_MEM_SECTIONS; -+ old_pgdat_snr = NR_MEM_SECTIONS; -+ } -+ - usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT); - pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); - if (usemap_snr == pgdat_snr) --- -2.14.2 - diff --git 
a/patches/kernel/0076-x86-entry-Use-SYSCALL_DEFINE-macros-for-sys_modify_l.patch b/patches/kernel/0076-x86-entry-Use-SYSCALL_DEFINE-macros-for-sys_modify_l.patch new file mode 100644 index 0000000..af37b4e --- /dev/null +++ b/patches/kernel/0076-x86-entry-Use-SYSCALL_DEFINE-macros-for-sys_modify_l.patch @@ -0,0 +1,124 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Dave Hansen +Date: Wed, 18 Oct 2017 10:21:07 -0700 +Subject: [PATCH] x86/entry: Use SYSCALL_DEFINE() macros for sys_modify_ldt() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +We do not have tracepoints for sys_modify_ldt() because we define +it directly instead of using the normal SYSCALL_DEFINEx() macros. + +However, there is a reason sys_modify_ldt() does not use the macros: +it has an 'int' return type instead of 'unsigned long'. This is +a bug, but it's a bug cemented in the ABI. + +What does this mean? If we return -EINVAL from a function that +returns 'int', we have 0x00000000ffffffea in %rax. But, if we +return -EINVAL from a function returning 'unsigned long', we end +up with 0xffffffffffffffea in %rax, which is wrong. + +To work around this and maintain the 'int' behavior while using +the SYSCALL_DEFINEx() macros, so we add a cast to 'unsigned int' +in both implementations of sys_modify_ldt(). 
+ +Signed-off-by: Dave Hansen +Reviewed-by: Andy Lutomirski +Reviewed-by: Brian Gerst +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/20171018172107.1A79C532@viggo.jf.intel.com +Signed-off-by: Ingo Molnar +(cherry picked from commit da20ab35180780e4a6eadc804544f1fa967f3567) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit d865f635f4b2c3307e79de9be5c49ea8bd4c43a6) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/syscalls.h | 2 +- + arch/x86/kernel/ldt.c | 16 +++++++++++++--- + arch/x86/um/ldt.c | 7 +++++-- + 3 files changed, 19 insertions(+), 6 deletions(-) + +diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h +index 91dfcafe27a6..bad25bb80679 100644 +--- a/arch/x86/include/asm/syscalls.h ++++ b/arch/x86/include/asm/syscalls.h +@@ -21,7 +21,7 @@ asmlinkage long sys_ioperm(unsigned long, unsigned long, int); + asmlinkage long sys_iopl(unsigned int); + + /* kernel/ldt.c */ +-asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); ++asmlinkage long sys_modify_ldt(int, void __user *, unsigned long); + + /* kernel/signal.c */ + asmlinkage long sys_rt_sigreturn(void); +diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c +index f0e64db18ac8..0402d44deb4d 100644 +--- a/arch/x86/kernel/ldt.c ++++ b/arch/x86/kernel/ldt.c +@@ -12,6 +12,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -294,8 +295,8 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) + return error; + } + +-asmlinkage int sys_modify_ldt(int func, void __user *ptr, +- unsigned long bytecount) ++SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr , ++ unsigned long , bytecount) + { + int ret = -ENOSYS; + +@@ -313,5 +314,14 @@ asmlinkage int sys_modify_ldt(int func, void __user *ptr, + ret = write_ldt(ptr, bytecount, 0); + break; + } +- return ret; ++ /* ++ * The SYSCALL_DEFINE() macros 
give us an 'unsigned long' ++ * return type, but tht ABI for sys_modify_ldt() expects ++ * 'int'. This cast gives us an int-sized value in %rax ++ * for the return code. The 'unsigned' is necessary so ++ * the compiler does not try to sign-extend the negative ++ * return codes into the high half of the register when ++ * taking the value from int->long. ++ */ ++ return (unsigned int)ret; + } +diff --git a/arch/x86/um/ldt.c b/arch/x86/um/ldt.c +index 836a1eb5df43..3ee234b6234d 100644 +--- a/arch/x86/um/ldt.c ++++ b/arch/x86/um/ldt.c +@@ -6,6 +6,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -369,7 +370,9 @@ void free_ldt(struct mm_context *mm) + mm->arch.ldt.entry_count = 0; + } + +-int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) ++SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr , ++ unsigned long , bytecount) + { +- return do_modify_ldt_skas(func, ptr, bytecount); ++ /* See non-um modify_ldt() for why we do this cast */ ++ return (unsigned int)do_modify_ldt_skas(func, ptr, bytecount); + } +-- +2.14.2 + diff --git a/patches/kernel/0077-mm-sparsemem-Allocate-mem_section-at-runtime-for-CON.patch b/patches/kernel/0077-mm-sparsemem-Allocate-mem_section-at-runtime-for-CON.patch new file mode 100644 index 0000000..d0045b1 --- /dev/null +++ b/patches/kernel/0077-mm-sparsemem-Allocate-mem_section-at-runtime-for-CON.patch @@ -0,0 +1,141 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: "Kirill A. Shutemov" +Date: Fri, 29 Sep 2017 17:08:16 +0300 +Subject: [PATCH] mm/sparsemem: Allocate mem_section at runtime for + CONFIG_SPARSEMEM_EXTREME=y +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Size of the mem_section[] array depends on the size of the physical address space. 
+ +In preparation for boot-time switching between paging modes on x86-64 +we need to make the allocation of mem_section[] dynamic, because otherwise +we waste a lot of RAM: with CONFIG_NODE_SHIFT=10, mem_section[] size is 32kB +for 4-level paging and 2MB for 5-level paging mode. + +The patch allocates the array on the first call to sparse_memory_present_with_active_regions(). + +Signed-off-by: Kirill A. Shutemov +Cc: Andrew Morton +Cc: Andy Lutomirski +Cc: Borislav Petkov +Cc: Cyrill Gorcunov +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: linux-mm@kvack.org +Link: http://lkml.kernel.org/r/20170929140821.37654-2-kirill.shutemov@linux.intel.com +Signed-off-by: Ingo Molnar +(cherry picked from commit 83e3c48729d9ebb7af5a31a504f3fd6aff0348c4) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit c70f71e01a0ae5d884abae0424618abe90b82011) +Signed-off-by: Fabian Grünbichler +--- + include/linux/mmzone.h | 6 +++++- + mm/page_alloc.c | 10 ++++++++++ + mm/sparse.c | 17 +++++++++++------ + 3 files changed, 26 insertions(+), 7 deletions(-) + +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index fc14b8b3f6ce..9c6c001a8c6c 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -1137,13 +1137,17 @@ struct mem_section { + #define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1) + + #ifdef CONFIG_SPARSEMEM_EXTREME +-extern struct mem_section *mem_section[NR_SECTION_ROOTS]; ++extern struct mem_section **mem_section; + #else + extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]; + #endif + + static inline struct mem_section *__nr_to_section(unsigned long nr) + { ++#ifdef CONFIG_SPARSEMEM_EXTREME ++ if (!mem_section) ++ return NULL; ++#endif + if (!mem_section[SECTION_NR_TO_ROOT(nr)]) + return NULL; + return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 1423da8dd16f..66eb23ab658d 100644 +--- 
a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -5707,6 +5707,16 @@ void __init sparse_memory_present_with_active_regions(int nid) + unsigned long start_pfn, end_pfn; + int i, this_nid; + ++#ifdef CONFIG_SPARSEMEM_EXTREME ++ if (!mem_section) { ++ unsigned long size, align; ++ ++ size = sizeof(struct mem_section) * NR_SECTION_ROOTS; ++ align = 1 << (INTERNODE_CACHE_SHIFT); ++ mem_section = memblock_virt_alloc(size, align); ++ } ++#endif ++ + for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) + memory_present(this_nid, start_pfn, end_pfn); + } +diff --git a/mm/sparse.c b/mm/sparse.c +index cdce7a7bb3f3..308a0789d1bb 100644 +--- a/mm/sparse.c ++++ b/mm/sparse.c +@@ -22,8 +22,7 @@ + * 1) mem_section - memory sections, mem_map's for valid memory + */ + #ifdef CONFIG_SPARSEMEM_EXTREME +-struct mem_section *mem_section[NR_SECTION_ROOTS] +- ____cacheline_internodealigned_in_smp; ++struct mem_section **mem_section; + #else + struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT] + ____cacheline_internodealigned_in_smp; +@@ -104,7 +103,7 @@ static inline int sparse_index_init(unsigned long section_nr, int nid) + int __section_nr(struct mem_section* ms) + { + unsigned long root_nr; +- struct mem_section* root; ++ struct mem_section *root = NULL; + + for (root_nr = 0; root_nr < NR_SECTION_ROOTS; root_nr++) { + root = __nr_to_section(root_nr * SECTIONS_PER_ROOT); +@@ -115,7 +114,7 @@ int __section_nr(struct mem_section* ms) + break; + } + +- VM_BUG_ON(root_nr == NR_SECTION_ROOTS); ++ VM_BUG_ON(!root); + + return (root_nr * SECTIONS_PER_ROOT) + (ms - root); + } +@@ -333,11 +332,17 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, + static void __init check_usemap_section_nr(int nid, unsigned long *usemap) + { + unsigned long usemap_snr, pgdat_snr; +- static unsigned long old_usemap_snr = NR_MEM_SECTIONS; +- static unsigned long old_pgdat_snr = NR_MEM_SECTIONS; ++ static unsigned long old_usemap_snr; ++ static unsigned long 
old_pgdat_snr; + struct pglist_data *pgdat = NODE_DATA(nid); + int usemap_nid; + ++ /* First call */ ++ if (!old_usemap_snr) { ++ old_usemap_snr = NR_MEM_SECTIONS; ++ old_pgdat_snr = NR_MEM_SECTIONS; ++ } ++ + usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT); + pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); + if (usemap_snr == pgdat_snr) +-- +2.14.2 + diff --git a/patches/kernel/0077-x86-kasan-Use-the-same-shadow-offset-for-4-and-5-lev.patch b/patches/kernel/0077-x86-kasan-Use-the-same-shadow-offset-for-4-and-5-lev.patch deleted file mode 100644 index ddf50de..0000000 --- a/patches/kernel/0077-x86-kasan-Use-the-same-shadow-offset-for-4-and-5-lev.patch +++ /dev/null @@ -1,244 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andrey Ryabinin -Date: Fri, 29 Sep 2017 17:08:18 +0300 -Subject: [PATCH] x86/kasan: Use the same shadow offset for 4- and 5-level - paging -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -We are going to support boot-time switching between 4- and 5-level -paging. For KASAN it means we cannot have different KASAN_SHADOW_OFFSET -for different paging modes: the constant is passed to gcc to generate -code and cannot be changed at runtime. - -This patch changes KASAN code to use 0xdffffc0000000000 as shadow offset -for both 4- and 5-level paging. - -For 5-level paging it means that shadow memory region is not aligned to -PGD boundary anymore and we have to handle unaligned parts of the region -properly. - -In addition, we have to exclude paravirt code from KASAN instrumentation -as we now use set_pgd() before KASAN is fully ready. - -[kirill.shutemov@linux.intel.com: clenaup, changelog message] -Signed-off-by: Andrey Ryabinin -Signed-off-by: Kirill A. 
Shutemov -Cc: Andrew Morton -Cc: Andy Lutomirski -Cc: Borislav Petkov -Cc: Cyrill Gorcunov -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Cc: linux-mm@kvack.org -Link: http://lkml.kernel.org/r/20170929140821.37654-4-kirill.shutemov@linux.intel.com -Signed-off-by: Ingo Molnar -(cherry picked from commit 12a8cc7fcf54a8575f094be1e99032ec38aa045c) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 2ce428150e002623aa0ed2a1ab840fde5f860f32) -Signed-off-by: Fabian Grünbichler ---- - Documentation/x86/x86_64/mm.txt | 2 +- - arch/x86/kernel/Makefile | 3 +- - arch/x86/mm/kasan_init_64.c | 101 +++++++++++++++++++++++++++++++--------- - arch/x86/Kconfig | 1 - - 4 files changed, 83 insertions(+), 24 deletions(-) - -diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt -index b0798e281aa6..3448e675b462 100644 ---- a/Documentation/x86/x86_64/mm.txt -+++ b/Documentation/x86/x86_64/mm.txt -@@ -34,7 +34,7 @@ ff92000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space - ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole - ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB) - ... unused hole ... --ffd8000000000000 - fff7ffffffffffff (=53 bits) kasan shadow memory (8PB) -+ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB) - ... unused hole ... - ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks - ... unused hole ... 
-diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile -index 5bf0d5a473b4..aa059806201d 100644 ---- a/arch/x86/kernel/Makefile -+++ b/arch/x86/kernel/Makefile -@@ -24,7 +24,8 @@ endif - KASAN_SANITIZE_head$(BITS).o := n - KASAN_SANITIZE_dumpstack.o := n - KASAN_SANITIZE_dumpstack_$(BITS).o := n --KASAN_SANITIZE_stacktrace.o := n -+KASAN_SANITIZE_stacktrace.o := n -+KASAN_SANITIZE_paravirt.o := n - - OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y - OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y -diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c -index 02c9d7553409..464089f33e80 100644 ---- a/arch/x86/mm/kasan_init_64.c -+++ b/arch/x86/mm/kasan_init_64.c -@@ -15,6 +15,8 @@ - extern pgd_t early_top_pgt[PTRS_PER_PGD]; - extern struct range pfn_mapped[E820_MAX_ENTRIES]; - -+static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); -+ - static int __init map_range(struct range *range) - { - unsigned long start; -@@ -30,8 +32,10 @@ static void __init clear_pgds(unsigned long start, - unsigned long end) - { - pgd_t *pgd; -+ /* See comment in kasan_init() */ -+ unsigned long pgd_end = end & PGDIR_MASK; - -- for (; start < end; start += PGDIR_SIZE) { -+ for (; start < pgd_end; start += PGDIR_SIZE) { - pgd = pgd_offset_k(start); - /* - * With folded p4d, pgd_clear() is nop, use p4d_clear() -@@ -42,29 +46,61 @@ static void __init clear_pgds(unsigned long start, - else - pgd_clear(pgd); - } -+ -+ pgd = pgd_offset_k(start); -+ for (; start < end; start += P4D_SIZE) -+ p4d_clear(p4d_offset(pgd, start)); -+} -+ -+static inline p4d_t *early_p4d_offset(pgd_t *pgd, unsigned long addr) -+{ -+ unsigned long p4d; -+ -+ if (!IS_ENABLED(CONFIG_X86_5LEVEL)) -+ return (p4d_t *)pgd; -+ -+ p4d = __pa_nodebug(pgd_val(*pgd)) & PTE_PFN_MASK; -+ p4d += __START_KERNEL_map - phys_base; -+ return (p4d_t *)p4d + p4d_index(addr); -+} -+ -+static void __init kasan_early_p4d_populate(pgd_t *pgd, -+ unsigned long addr, -+ unsigned long end) -+{ 
-+ pgd_t pgd_entry; -+ p4d_t *p4d, p4d_entry; -+ unsigned long next; -+ -+ if (pgd_none(*pgd)) { -+ pgd_entry = __pgd(_KERNPG_TABLE | __pa_nodebug(kasan_zero_p4d)); -+ set_pgd(pgd, pgd_entry); -+ } -+ -+ p4d = early_p4d_offset(pgd, addr); -+ do { -+ next = p4d_addr_end(addr, end); -+ -+ if (!p4d_none(*p4d)) -+ continue; -+ -+ p4d_entry = __p4d(_KERNPG_TABLE | __pa_nodebug(kasan_zero_pud)); -+ set_p4d(p4d, p4d_entry); -+ } while (p4d++, addr = next, addr != end && p4d_none(*p4d)); - } - - static void __init kasan_map_early_shadow(pgd_t *pgd) - { -- int i; -- unsigned long start = KASAN_SHADOW_START; -+ /* See comment in kasan_init() */ -+ unsigned long addr = KASAN_SHADOW_START & PGDIR_MASK; - unsigned long end = KASAN_SHADOW_END; -+ unsigned long next; - -- for (i = pgd_index(start); start < end; i++) { -- switch (CONFIG_PGTABLE_LEVELS) { -- case 4: -- pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud) | -- _KERNPG_TABLE); -- break; -- case 5: -- pgd[i] = __pgd(__pa_nodebug(kasan_zero_p4d) | -- _KERNPG_TABLE); -- break; -- default: -- BUILD_BUG(); -- } -- start += PGDIR_SIZE; -- } -+ pgd += pgd_index(addr); -+ do { -+ next = pgd_addr_end(addr, end); -+ kasan_early_p4d_populate(pgd, addr, next); -+ } while (pgd++, addr = next, addr != end); - } - - #ifdef CONFIG_KASAN_INLINE -@@ -101,7 +137,7 @@ void __init kasan_early_init(void) - for (i = 0; i < PTRS_PER_PUD; i++) - kasan_zero_pud[i] = __pud(pud_val); - -- for (i = 0; CONFIG_PGTABLE_LEVELS >= 5 && i < PTRS_PER_P4D; i++) -+ for (i = 0; IS_ENABLED(CONFIG_X86_5LEVEL) && i < PTRS_PER_P4D; i++) - kasan_zero_p4d[i] = __p4d(p4d_val); - - kasan_map_early_shadow(early_top_pgt); -@@ -117,12 +153,35 @@ void __init kasan_init(void) - #endif - - memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt)); -+ -+ /* -+ * We use the same shadow offset for 4- and 5-level paging to -+ * facilitate boot-time switching between paging modes. 
-+ * As result in 5-level paging mode KASAN_SHADOW_START and -+ * KASAN_SHADOW_END are not aligned to PGD boundary. -+ * -+ * KASAN_SHADOW_START doesn't share PGD with anything else. -+ * We claim whole PGD entry to make things easier. -+ * -+ * KASAN_SHADOW_END lands in the last PGD entry and it collides with -+ * bunch of things like kernel code, modules, EFI mapping, etc. -+ * We need to take extra steps to not overwrite them. -+ */ -+ if (IS_ENABLED(CONFIG_X86_5LEVEL)) { -+ void *ptr; -+ -+ ptr = (void *)pgd_page_vaddr(*pgd_offset_k(KASAN_SHADOW_END)); -+ memcpy(tmp_p4d_table, (void *)ptr, sizeof(tmp_p4d_table)); -+ set_pgd(&early_top_pgt[pgd_index(KASAN_SHADOW_END)], -+ __pgd(__pa(tmp_p4d_table) | _KERNPG_TABLE)); -+ } -+ - load_cr3(early_top_pgt); - __flush_tlb_all(); - -- clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); -+ clear_pgds(KASAN_SHADOW_START & PGDIR_MASK, KASAN_SHADOW_END); - -- kasan_populate_zero_shadow((void *)KASAN_SHADOW_START, -+ kasan_populate_zero_shadow((void *)(KASAN_SHADOW_START & PGDIR_MASK), - kasan_mem_to_shadow((void *)PAGE_OFFSET)); - - for (i = 0; i < E820_MAX_ENTRIES; i++) { -diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig -index bf9f03740c30..67d07802ae95 100644 ---- a/arch/x86/Kconfig -+++ b/arch/x86/Kconfig -@@ -300,7 +300,6 @@ config ARCH_SUPPORTS_DEBUG_PAGEALLOC - config KASAN_SHADOW_OFFSET - hex - depends on KASAN -- default 0xdff8000000000000 if X86_5LEVEL - default 0xdffffc0000000000 - - config HAVE_INTEL_TXT --- -2.14.2 - diff --git a/patches/kernel/0078-x86-kasan-Use-the-same-shadow-offset-for-4-and-5-lev.patch b/patches/kernel/0078-x86-kasan-Use-the-same-shadow-offset-for-4-and-5-lev.patch new file mode 100644 index 0000000..ddf50de --- /dev/null +++ b/patches/kernel/0078-x86-kasan-Use-the-same-shadow-offset-for-4-and-5-lev.patch @@ -0,0 +1,244 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andrey Ryabinin +Date: Fri, 29 Sep 2017 17:08:18 +0300 +Subject: [PATCH] x86/kasan: Use the 
same shadow offset for 4- and 5-level + paging +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +We are going to support boot-time switching between 4- and 5-level +paging. For KASAN it means we cannot have different KASAN_SHADOW_OFFSET +for different paging modes: the constant is passed to gcc to generate +code and cannot be changed at runtime. + +This patch changes KASAN code to use 0xdffffc0000000000 as shadow offset +for both 4- and 5-level paging. + +For 5-level paging it means that shadow memory region is not aligned to +PGD boundary anymore and we have to handle unaligned parts of the region +properly. + +In addition, we have to exclude paravirt code from KASAN instrumentation +as we now use set_pgd() before KASAN is fully ready. + +[kirill.shutemov@linux.intel.com: clenaup, changelog message] +Signed-off-by: Andrey Ryabinin +Signed-off-by: Kirill A. Shutemov +Cc: Andrew Morton +Cc: Andy Lutomirski +Cc: Borislav Petkov +Cc: Cyrill Gorcunov +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: linux-mm@kvack.org +Link: http://lkml.kernel.org/r/20170929140821.37654-4-kirill.shutemov@linux.intel.com +Signed-off-by: Ingo Molnar +(cherry picked from commit 12a8cc7fcf54a8575f094be1e99032ec38aa045c) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 2ce428150e002623aa0ed2a1ab840fde5f860f32) +Signed-off-by: Fabian Grünbichler +--- + Documentation/x86/x86_64/mm.txt | 2 +- + arch/x86/kernel/Makefile | 3 +- + arch/x86/mm/kasan_init_64.c | 101 +++++++++++++++++++++++++++++++--------- + arch/x86/Kconfig | 1 - + 4 files changed, 83 insertions(+), 24 deletions(-) + +diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt +index b0798e281aa6..3448e675b462 100644 +--- a/Documentation/x86/x86_64/mm.txt ++++ b/Documentation/x86/x86_64/mm.txt +@@ -34,7 +34,7 @@ ff92000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space + 
ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole + ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB) + ... unused hole ... +-ffd8000000000000 - fff7ffffffffffff (=53 bits) kasan shadow memory (8PB) ++ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB) + ... unused hole ... + ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks + ... unused hole ... +diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile +index 5bf0d5a473b4..aa059806201d 100644 +--- a/arch/x86/kernel/Makefile ++++ b/arch/x86/kernel/Makefile +@@ -24,7 +24,8 @@ endif + KASAN_SANITIZE_head$(BITS).o := n + KASAN_SANITIZE_dumpstack.o := n + KASAN_SANITIZE_dumpstack_$(BITS).o := n +-KASAN_SANITIZE_stacktrace.o := n ++KASAN_SANITIZE_stacktrace.o := n ++KASAN_SANITIZE_paravirt.o := n + + OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y + OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y +diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c +index 02c9d7553409..464089f33e80 100644 +--- a/arch/x86/mm/kasan_init_64.c ++++ b/arch/x86/mm/kasan_init_64.c +@@ -15,6 +15,8 @@ + extern pgd_t early_top_pgt[PTRS_PER_PGD]; + extern struct range pfn_mapped[E820_MAX_ENTRIES]; + ++static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); ++ + static int __init map_range(struct range *range) + { + unsigned long start; +@@ -30,8 +32,10 @@ static void __init clear_pgds(unsigned long start, + unsigned long end) + { + pgd_t *pgd; ++ /* See comment in kasan_init() */ ++ unsigned long pgd_end = end & PGDIR_MASK; + +- for (; start < end; start += PGDIR_SIZE) { ++ for (; start < pgd_end; start += PGDIR_SIZE) { + pgd = pgd_offset_k(start); + /* + * With folded p4d, pgd_clear() is nop, use p4d_clear() +@@ -42,29 +46,61 @@ static void __init clear_pgds(unsigned long start, + else + pgd_clear(pgd); + } ++ ++ pgd = pgd_offset_k(start); ++ for (; start < end; start += P4D_SIZE) ++ p4d_clear(p4d_offset(pgd, start)); ++} ++ 
++static inline p4d_t *early_p4d_offset(pgd_t *pgd, unsigned long addr) ++{ ++ unsigned long p4d; ++ ++ if (!IS_ENABLED(CONFIG_X86_5LEVEL)) ++ return (p4d_t *)pgd; ++ ++ p4d = __pa_nodebug(pgd_val(*pgd)) & PTE_PFN_MASK; ++ p4d += __START_KERNEL_map - phys_base; ++ return (p4d_t *)p4d + p4d_index(addr); ++} ++ ++static void __init kasan_early_p4d_populate(pgd_t *pgd, ++ unsigned long addr, ++ unsigned long end) ++{ ++ pgd_t pgd_entry; ++ p4d_t *p4d, p4d_entry; ++ unsigned long next; ++ ++ if (pgd_none(*pgd)) { ++ pgd_entry = __pgd(_KERNPG_TABLE | __pa_nodebug(kasan_zero_p4d)); ++ set_pgd(pgd, pgd_entry); ++ } ++ ++ p4d = early_p4d_offset(pgd, addr); ++ do { ++ next = p4d_addr_end(addr, end); ++ ++ if (!p4d_none(*p4d)) ++ continue; ++ ++ p4d_entry = __p4d(_KERNPG_TABLE | __pa_nodebug(kasan_zero_pud)); ++ set_p4d(p4d, p4d_entry); ++ } while (p4d++, addr = next, addr != end && p4d_none(*p4d)); + } + + static void __init kasan_map_early_shadow(pgd_t *pgd) + { +- int i; +- unsigned long start = KASAN_SHADOW_START; ++ /* See comment in kasan_init() */ ++ unsigned long addr = KASAN_SHADOW_START & PGDIR_MASK; + unsigned long end = KASAN_SHADOW_END; ++ unsigned long next; + +- for (i = pgd_index(start); start < end; i++) { +- switch (CONFIG_PGTABLE_LEVELS) { +- case 4: +- pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud) | +- _KERNPG_TABLE); +- break; +- case 5: +- pgd[i] = __pgd(__pa_nodebug(kasan_zero_p4d) | +- _KERNPG_TABLE); +- break; +- default: +- BUILD_BUG(); +- } +- start += PGDIR_SIZE; +- } ++ pgd += pgd_index(addr); ++ do { ++ next = pgd_addr_end(addr, end); ++ kasan_early_p4d_populate(pgd, addr, next); ++ } while (pgd++, addr = next, addr != end); + } + + #ifdef CONFIG_KASAN_INLINE +@@ -101,7 +137,7 @@ void __init kasan_early_init(void) + for (i = 0; i < PTRS_PER_PUD; i++) + kasan_zero_pud[i] = __pud(pud_val); + +- for (i = 0; CONFIG_PGTABLE_LEVELS >= 5 && i < PTRS_PER_P4D; i++) ++ for (i = 0; IS_ENABLED(CONFIG_X86_5LEVEL) && i < PTRS_PER_P4D; i++) + kasan_zero_p4d[i] = 
__p4d(p4d_val); + + kasan_map_early_shadow(early_top_pgt); +@@ -117,12 +153,35 @@ void __init kasan_init(void) + #endif + + memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt)); ++ ++ /* ++ * We use the same shadow offset for 4- and 5-level paging to ++ * facilitate boot-time switching between paging modes. ++ * As result in 5-level paging mode KASAN_SHADOW_START and ++ * KASAN_SHADOW_END are not aligned to PGD boundary. ++ * ++ * KASAN_SHADOW_START doesn't share PGD with anything else. ++ * We claim whole PGD entry to make things easier. ++ * ++ * KASAN_SHADOW_END lands in the last PGD entry and it collides with ++ * bunch of things like kernel code, modules, EFI mapping, etc. ++ * We need to take extra steps to not overwrite them. ++ */ ++ if (IS_ENABLED(CONFIG_X86_5LEVEL)) { ++ void *ptr; ++ ++ ptr = (void *)pgd_page_vaddr(*pgd_offset_k(KASAN_SHADOW_END)); ++ memcpy(tmp_p4d_table, (void *)ptr, sizeof(tmp_p4d_table)); ++ set_pgd(&early_top_pgt[pgd_index(KASAN_SHADOW_END)], ++ __pgd(__pa(tmp_p4d_table) | _KERNPG_TABLE)); ++ } ++ + load_cr3(early_top_pgt); + __flush_tlb_all(); + +- clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); ++ clear_pgds(KASAN_SHADOW_START & PGDIR_MASK, KASAN_SHADOW_END); + +- kasan_populate_zero_shadow((void *)KASAN_SHADOW_START, ++ kasan_populate_zero_shadow((void *)(KASAN_SHADOW_START & PGDIR_MASK), + kasan_mem_to_shadow((void *)PAGE_OFFSET)); + + for (i = 0; i < E820_MAX_ENTRIES; i++) { +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index bf9f03740c30..67d07802ae95 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -300,7 +300,6 @@ config ARCH_SUPPORTS_DEBUG_PAGEALLOC + config KASAN_SHADOW_OFFSET + hex + depends on KASAN +- default 0xdff8000000000000 if X86_5LEVEL + default 0xdffffc0000000000 + + config HAVE_INTEL_TXT +-- +2.14.2 + diff --git a/patches/kernel/0078-x86-xen-Provide-pre-built-page-tables-only-for-CONFI.patch b/patches/kernel/0078-x86-xen-Provide-pre-built-page-tables-only-for-CONFI.patch deleted file mode 
100644 index e7675b6..0000000 --- a/patches/kernel/0078-x86-xen-Provide-pre-built-page-tables-only-for-CONFI.patch +++ /dev/null @@ -1,80 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: "Kirill A. Shutemov" -Date: Fri, 29 Sep 2017 17:08:19 +0300 -Subject: [PATCH] x86/xen: Provide pre-built page tables only for - CONFIG_XEN_PV=y and CONFIG_XEN_PVH=y -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Looks like we only need pre-built page tables in the CONFIG_XEN_PV=y and -CONFIG_XEN_PVH=y cases. - -Let's not provide them for other configurations. - -Signed-off-by: Kirill A. Shutemov -Reviewed-by: Juergen Gross -Cc: Andrew Morton -Cc: Andy Lutomirski -Cc: Borislav Petkov -Cc: Cyrill Gorcunov -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Cc: linux-mm@kvack.org -Link: http://lkml.kernel.org/r/20170929140821.37654-5-kirill.shutemov@linux.intel.com -Signed-off-by: Ingo Molnar -(cherry picked from commit 4375c29985f155d7eb2346615d84e62d1b673682) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit a883ee7f3c1dc64a8c946543ac598399353d1b03) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/head_64.S | 11 ++++++----- - 1 file changed, 6 insertions(+), 5 deletions(-) - -diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S -index d081bc7a027d..12daaa0b187f 100644 ---- a/arch/x86/kernel/head_64.S -+++ b/arch/x86/kernel/head_64.S -@@ -37,11 +37,12 @@ - * - */ - --#define p4d_index(x) (((x) >> P4D_SHIFT) & (PTRS_PER_P4D-1)) - #define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) - -+#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH) - PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE) - PGD_START_KERNEL = pgd_index(__START_KERNEL_map) -+#endif - L3_START_KERNEL = pud_index(__START_KERNEL_map) - - .text -@@ -348,10 +349,7 @@ NEXT_PAGE(early_dynamic_pgts) - - .data - --#ifndef CONFIG_XEN 
--NEXT_PAGE(init_top_pgt) -- .fill 512,8,0 --#else -+#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH) - NEXT_PAGE(init_top_pgt) - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .org init_top_pgt + PGD_PAGE_OFFSET*8, 0 -@@ -368,6 +366,9 @@ NEXT_PAGE(level2_ident_pgt) - * Don't set NX because code runs from these pages. - */ - PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) -+#else -+NEXT_PAGE(init_top_pgt) -+ .fill 512,8,0 - #endif - - #ifdef CONFIG_X86_5LEVEL --- -2.14.2 - diff --git a/patches/kernel/0079-x86-xen-Drop-5-level-paging-support-code-from-the-XE.patch b/patches/kernel/0079-x86-xen-Drop-5-level-paging-support-code-from-the-XE.patch deleted file mode 100644 index 7073e79..0000000 --- a/patches/kernel/0079-x86-xen-Drop-5-level-paging-support-code-from-the-XE.patch +++ /dev/null @@ -1,316 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: "Kirill A. Shutemov" -Date: Fri, 29 Sep 2017 17:08:20 +0300 -Subject: [PATCH] x86/xen: Drop 5-level paging support code from the XEN_PV - code -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -It was decided 5-level paging is not going to be supported in XEN_PV. - -Let's drop the dead code from the XEN_PV code. - -Tested-by: Juergen Gross -Signed-off-by: Kirill A. 
Shutemov -Reviewed-by: Juergen Gross -Cc: Andrew Morton -Cc: Andy Lutomirski -Cc: Borislav Petkov -Cc: Cyrill Gorcunov -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Cc: linux-mm@kvack.org -Link: http://lkml.kernel.org/r/20170929140821.37654-6-kirill.shutemov@linux.intel.com -Signed-off-by: Ingo Molnar -(cherry picked from commit 773dd2fca581b0a80e5a33332cc8ee67e5a79cba) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 3fd0b7ef0094fd8bb3c8172d9b137ebe0d81ecbc) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/xen/mmu_pv.c | 159 +++++++++++++++++++------------------------------- - 1 file changed, 60 insertions(+), 99 deletions(-) - -diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c -index ba76f3ce997f..45bb2d462e44 100644 ---- a/arch/x86/xen/mmu_pv.c -+++ b/arch/x86/xen/mmu_pv.c -@@ -469,7 +469,7 @@ __visible pmd_t xen_make_pmd(pmdval_t pmd) - } - PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); - --#if CONFIG_PGTABLE_LEVELS == 4 -+#ifdef CONFIG_X86_64 - __visible pudval_t xen_pud_val(pud_t pud) - { - return pte_mfn_to_pfn(pud.pud); -@@ -558,7 +558,7 @@ static void xen_set_p4d(p4d_t *ptr, p4d_t val) - - xen_mc_issue(PARAVIRT_LAZY_MMU); - } --#endif /* CONFIG_PGTABLE_LEVELS == 4 */ -+#endif /* CONFIG_X86_64 */ - - static int xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd, - int (*func)(struct mm_struct *mm, struct page *, enum pt_level), -@@ -600,21 +600,17 @@ static int xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d, - int (*func)(struct mm_struct *mm, struct page *, enum pt_level), - bool last, unsigned long limit) - { -- int i, nr, flush = 0; -+ int flush = 0; -+ pud_t *pud; - -- nr = last ? 
p4d_index(limit) + 1 : PTRS_PER_P4D; -- for (i = 0; i < nr; i++) { -- pud_t *pud; - -- if (p4d_none(p4d[i])) -- continue; -+ if (p4d_none(*p4d)) -+ return flush; - -- pud = pud_offset(&p4d[i], 0); -- if (PTRS_PER_PUD > 1) -- flush |= (*func)(mm, virt_to_page(pud), PT_PUD); -- flush |= xen_pud_walk(mm, pud, func, -- last && i == nr - 1, limit); -- } -+ pud = pud_offset(p4d, 0); -+ if (PTRS_PER_PUD > 1) -+ flush |= (*func)(mm, virt_to_page(pud), PT_PUD); -+ flush |= xen_pud_walk(mm, pud, func, last, limit); - return flush; - } - -@@ -664,8 +660,6 @@ static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd, - continue; - - p4d = p4d_offset(&pgd[i], 0); -- if (PTRS_PER_P4D > 1) -- flush |= (*func)(mm, virt_to_page(p4d), PT_P4D); - flush |= xen_p4d_walk(mm, p4d, func, i == nr - 1, limit); - } - -@@ -1196,22 +1190,14 @@ static void __init xen_cleanmfnmap(unsigned long vaddr) - { - pgd_t *pgd; - p4d_t *p4d; -- unsigned int i; - bool unpin; - - unpin = (vaddr == 2 * PGDIR_SIZE); - vaddr &= PMD_MASK; - pgd = pgd_offset_k(vaddr); - p4d = p4d_offset(pgd, 0); -- for (i = 0; i < PTRS_PER_P4D; i++) { -- if (p4d_none(p4d[i])) -- continue; -- xen_cleanmfnmap_p4d(p4d + i, unpin); -- } -- if (IS_ENABLED(CONFIG_X86_5LEVEL)) { -- set_pgd(pgd, __pgd(0)); -- xen_cleanmfnmap_free_pgtbl(p4d, unpin); -- } -+ if (!p4d_none(*p4d)) -+ xen_cleanmfnmap_p4d(p4d, unpin); - } - - static void __init xen_pagetable_p2m_free(void) -@@ -1717,7 +1703,7 @@ static void xen_release_pmd(unsigned long pfn) - xen_release_ptpage(pfn, PT_PMD); - } - --#if CONFIG_PGTABLE_LEVELS >= 4 -+#ifdef CONFIG_X86_64 - static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) - { - xen_alloc_ptpage(mm, pfn, PT_PUD); -@@ -2054,13 +2040,12 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) - */ - void __init xen_relocate_p2m(void) - { -- phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys, p4d_phys; -+ phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys; - unsigned long p2m_pfn, 
p2m_pfn_end, n_frames, pfn, pfn_end; -- int n_pte, n_pt, n_pmd, n_pud, n_p4d, idx_pte, idx_pt, idx_pmd, idx_pud, idx_p4d; -+ int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud; - pte_t *pt; - pmd_t *pmd; - pud_t *pud; -- p4d_t *p4d = NULL; - pgd_t *pgd; - unsigned long *new_p2m; - int save_pud; -@@ -2070,11 +2055,7 @@ void __init xen_relocate_p2m(void) - n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT; - n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT; - n_pud = roundup(size, P4D_SIZE) >> P4D_SHIFT; -- if (PTRS_PER_P4D > 1) -- n_p4d = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT; -- else -- n_p4d = 0; -- n_frames = n_pte + n_pt + n_pmd + n_pud + n_p4d; -+ n_frames = n_pte + n_pt + n_pmd + n_pud; - - new_area = xen_find_free_area(PFN_PHYS(n_frames)); - if (!new_area) { -@@ -2090,76 +2071,56 @@ void __init xen_relocate_p2m(void) - * To avoid any possible virtual address collision, just use - * 2 * PUD_SIZE for the new area. - */ -- p4d_phys = new_area; -- pud_phys = p4d_phys + PFN_PHYS(n_p4d); -+ pud_phys = new_area; - pmd_phys = pud_phys + PFN_PHYS(n_pud); - pt_phys = pmd_phys + PFN_PHYS(n_pmd); - p2m_pfn = PFN_DOWN(pt_phys) + n_pt; - - pgd = __va(read_cr3_pa()); - new_p2m = (unsigned long *)(2 * PGDIR_SIZE); -- idx_p4d = 0; - save_pud = n_pud; -- do { -- if (n_p4d > 0) { -- p4d = early_memremap(p4d_phys, PAGE_SIZE); -- clear_page(p4d); -- n_pud = min(save_pud, PTRS_PER_P4D); -- } -- for (idx_pud = 0; idx_pud < n_pud; idx_pud++) { -- pud = early_memremap(pud_phys, PAGE_SIZE); -- clear_page(pud); -- for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD); -- idx_pmd++) { -- pmd = early_memremap(pmd_phys, PAGE_SIZE); -- clear_page(pmd); -- for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD); -- idx_pt++) { -- pt = early_memremap(pt_phys, PAGE_SIZE); -- clear_page(pt); -- for (idx_pte = 0; -- idx_pte < min(n_pte, PTRS_PER_PTE); -- idx_pte++) { -- set_pte(pt + idx_pte, -- pfn_pte(p2m_pfn, PAGE_KERNEL)); -- p2m_pfn++; -- } -- n_pte -= PTRS_PER_PTE; -- early_memunmap(pt, 
PAGE_SIZE); -- make_lowmem_page_readonly(__va(pt_phys)); -- pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, -- PFN_DOWN(pt_phys)); -- set_pmd(pmd + idx_pt, -- __pmd(_PAGE_TABLE | pt_phys)); -- pt_phys += PAGE_SIZE; -+ for (idx_pud = 0; idx_pud < n_pud; idx_pud++) { -+ pud = early_memremap(pud_phys, PAGE_SIZE); -+ clear_page(pud); -+ for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD); -+ idx_pmd++) { -+ pmd = early_memremap(pmd_phys, PAGE_SIZE); -+ clear_page(pmd); -+ for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD); -+ idx_pt++) { -+ pt = early_memremap(pt_phys, PAGE_SIZE); -+ clear_page(pt); -+ for (idx_pte = 0; -+ idx_pte < min(n_pte, PTRS_PER_PTE); -+ idx_pte++) { -+ set_pte(pt + idx_pte, -+ pfn_pte(p2m_pfn, PAGE_KERNEL)); -+ p2m_pfn++; - } -- n_pt -= PTRS_PER_PMD; -- early_memunmap(pmd, PAGE_SIZE); -- make_lowmem_page_readonly(__va(pmd_phys)); -- pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE, -- PFN_DOWN(pmd_phys)); -- set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys)); -- pmd_phys += PAGE_SIZE; -+ n_pte -= PTRS_PER_PTE; -+ early_memunmap(pt, PAGE_SIZE); -+ make_lowmem_page_readonly(__va(pt_phys)); -+ pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, -+ PFN_DOWN(pt_phys)); -+ set_pmd(pmd + idx_pt, -+ __pmd(_PAGE_TABLE | pt_phys)); -+ pt_phys += PAGE_SIZE; - } -- n_pmd -= PTRS_PER_PUD; -- early_memunmap(pud, PAGE_SIZE); -- make_lowmem_page_readonly(__va(pud_phys)); -- pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys)); -- if (n_p4d > 0) -- set_p4d(p4d + idx_pud, __p4d(_PAGE_TABLE | pud_phys)); -- else -- set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys)); -- pud_phys += PAGE_SIZE; -- } -- if (n_p4d > 0) { -- save_pud -= PTRS_PER_P4D; -- early_memunmap(p4d, PAGE_SIZE); -- make_lowmem_page_readonly(__va(p4d_phys)); -- pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(p4d_phys)); -- set_pgd(pgd + 2 + idx_p4d, __pgd(_PAGE_TABLE | p4d_phys)); -- p4d_phys += PAGE_SIZE; -+ n_pt -= PTRS_PER_PMD; -+ early_memunmap(pmd, PAGE_SIZE); -+ make_lowmem_page_readonly(__va(pmd_phys)); -+ 
pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE, -+ PFN_DOWN(pmd_phys)); -+ set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys)); -+ pmd_phys += PAGE_SIZE; - } -- } while (++idx_p4d < n_p4d); -+ n_pmd -= PTRS_PER_PUD; -+ early_memunmap(pud, PAGE_SIZE); -+ make_lowmem_page_readonly(__va(pud_phys)); -+ pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys)); -+ set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys)); -+ pud_phys += PAGE_SIZE; -+ } - - /* Now copy the old p2m info to the new area. */ - memcpy(new_p2m, xen_p2m_addr, size); -@@ -2386,7 +2347,7 @@ static void __init xen_post_allocator_init(void) - pv_mmu_ops.set_pte = xen_set_pte; - pv_mmu_ops.set_pmd = xen_set_pmd; - pv_mmu_ops.set_pud = xen_set_pud; --#if CONFIG_PGTABLE_LEVELS >= 4 -+#ifdef CONFIG_X86_64 - pv_mmu_ops.set_p4d = xen_set_p4d; - #endif - -@@ -2396,7 +2357,7 @@ static void __init xen_post_allocator_init(void) - pv_mmu_ops.alloc_pmd = xen_alloc_pmd; - pv_mmu_ops.release_pte = xen_release_pte; - pv_mmu_ops.release_pmd = xen_release_pmd; --#if CONFIG_PGTABLE_LEVELS >= 4 -+#ifdef CONFIG_X86_64 - pv_mmu_ops.alloc_pud = xen_alloc_pud; - pv_mmu_ops.release_pud = xen_release_pud; - #endif -@@ -2460,14 +2421,14 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { - .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), - .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), - --#if CONFIG_PGTABLE_LEVELS >= 4 -+#ifdef CONFIG_X86_64 - .pud_val = PV_CALLEE_SAVE(xen_pud_val), - .make_pud = PV_CALLEE_SAVE(xen_make_pud), - .set_p4d = xen_set_p4d_hyper, - - .alloc_pud = xen_alloc_pmd_init, - .release_pud = xen_release_pmd_init, --#endif /* CONFIG_PGTABLE_LEVELS == 4 */ -+#endif /* CONFIG_X86_64 */ - - .activate_mm = xen_activate_mm, - .dup_mmap = xen_dup_mmap, --- -2.14.2 - diff --git a/patches/kernel/0079-x86-xen-Provide-pre-built-page-tables-only-for-CONFI.patch b/patches/kernel/0079-x86-xen-Provide-pre-built-page-tables-only-for-CONFI.patch new file mode 100644 index 0000000..e7675b6 --- /dev/null +++ 
b/patches/kernel/0079-x86-xen-Provide-pre-built-page-tables-only-for-CONFI.patch @@ -0,0 +1,80 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: "Kirill A. Shutemov" +Date: Fri, 29 Sep 2017 17:08:19 +0300 +Subject: [PATCH] x86/xen: Provide pre-built page tables only for + CONFIG_XEN_PV=y and CONFIG_XEN_PVH=y +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Looks like we only need pre-built page tables in the CONFIG_XEN_PV=y and +CONFIG_XEN_PVH=y cases. + +Let's not provide them for other configurations. + +Signed-off-by: Kirill A. Shutemov +Reviewed-by: Juergen Gross +Cc: Andrew Morton +Cc: Andy Lutomirski +Cc: Borislav Petkov +Cc: Cyrill Gorcunov +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: linux-mm@kvack.org +Link: http://lkml.kernel.org/r/20170929140821.37654-5-kirill.shutemov@linux.intel.com +Signed-off-by: Ingo Molnar +(cherry picked from commit 4375c29985f155d7eb2346615d84e62d1b673682) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit a883ee7f3c1dc64a8c946543ac598399353d1b03) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/head_64.S | 11 ++++++----- + 1 file changed, 6 insertions(+), 5 deletions(-) + +diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S +index d081bc7a027d..12daaa0b187f 100644 +--- a/arch/x86/kernel/head_64.S ++++ b/arch/x86/kernel/head_64.S +@@ -37,11 +37,12 @@ + * + */ + +-#define p4d_index(x) (((x) >> P4D_SHIFT) & (PTRS_PER_P4D-1)) + #define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) + ++#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH) + PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE) + PGD_START_KERNEL = pgd_index(__START_KERNEL_map) ++#endif + L3_START_KERNEL = pud_index(__START_KERNEL_map) + + .text +@@ -348,10 +349,7 @@ NEXT_PAGE(early_dynamic_pgts) + + .data + +-#ifndef CONFIG_XEN +-NEXT_PAGE(init_top_pgt) +- .fill 512,8,0 +-#else 
++#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH) + NEXT_PAGE(init_top_pgt) + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .org init_top_pgt + PGD_PAGE_OFFSET*8, 0 +@@ -368,6 +366,9 @@ NEXT_PAGE(level2_ident_pgt) + * Don't set NX because code runs from these pages. + */ + PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) ++#else ++NEXT_PAGE(init_top_pgt) ++ .fill 512,8,0 + #endif + + #ifdef CONFIG_X86_5LEVEL +-- +2.14.2 + diff --git a/patches/kernel/0080-ACPI-APEI-remove-the-unused-dead-code-for-SEA-NMI-no.patch b/patches/kernel/0080-ACPI-APEI-remove-the-unused-dead-code-for-SEA-NMI-no.patch deleted file mode 100644 index 209308b..0000000 --- a/patches/kernel/0080-ACPI-APEI-remove-the-unused-dead-code-for-SEA-NMI-no.patch +++ /dev/null @@ -1,88 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Dongjiu Geng -Date: Tue, 17 Oct 2017 16:02:20 +0800 -Subject: [PATCH] ACPI / APEI: remove the unused dead-code for SEA/NMI - notification type -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -For the SEA notification, the two functions ghes_sea_add() and -ghes_sea_remove() are only called when CONFIG_ACPI_APEI_SEA -is defined. If not, it will return errors in the ghes_probe() -and not continue. If the probe is failed, the ghes_sea_remove() -also has no chance to be called. Hence, remove the unnecessary -handling when CONFIG_ACPI_APEI_SEA is not defined. - -For the NMI notification, it has the same issue as SEA notification, -so also remove the unused dead-code for it. - -Signed-off-by: Dongjiu Geng -Tested-by: Tyler Baicar -Reviewed-by: Borislav Petkov -Signed-off-by: Rafael J. 
Wysocki -(cherry picked from commit c49870e89f4d2c21c76ebe90568246bb0f3572b7) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 55f73c32ba6438e8886f348722d2b25aef129d40) -Signed-off-by: Fabian Grünbichler ---- - drivers/acpi/apei/ghes.c | 33 +++++---------------------------- - 1 file changed, 5 insertions(+), 28 deletions(-) - -diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c -index 3628078ee351..4827176f838d 100644 ---- a/drivers/acpi/apei/ghes.c -+++ b/drivers/acpi/apei/ghes.c -@@ -850,17 +850,8 @@ static void ghes_sea_remove(struct ghes *ghes) - synchronize_rcu(); - } - #else /* CONFIG_ACPI_APEI_SEA */ --static inline void ghes_sea_add(struct ghes *ghes) --{ -- pr_err(GHES_PFX "ID: %d, trying to add SEA notification which is not supported\n", -- ghes->generic->header.source_id); --} -- --static inline void ghes_sea_remove(struct ghes *ghes) --{ -- pr_err(GHES_PFX "ID: %d, trying to remove SEA notification which is not supported\n", -- ghes->generic->header.source_id); --} -+static inline void ghes_sea_add(struct ghes *ghes) { } -+static inline void ghes_sea_remove(struct ghes *ghes) { } - #endif /* CONFIG_ACPI_APEI_SEA */ - - #ifdef CONFIG_HAVE_ACPI_APEI_NMI -@@ -1062,23 +1053,9 @@ static void ghes_nmi_init_cxt(void) - init_irq_work(&ghes_proc_irq_work, ghes_proc_in_irq); - } - #else /* CONFIG_HAVE_ACPI_APEI_NMI */ --static inline void ghes_nmi_add(struct ghes *ghes) --{ -- pr_err(GHES_PFX "ID: %d, trying to add NMI notification which is not supported!\n", -- ghes->generic->header.source_id); -- BUG(); --} -- --static inline void ghes_nmi_remove(struct ghes *ghes) --{ -- pr_err(GHES_PFX "ID: %d, trying to remove NMI notification which is not supported!\n", -- ghes->generic->header.source_id); -- BUG(); --} -- --static inline void ghes_nmi_init_cxt(void) --{ --} -+static inline void ghes_nmi_add(struct ghes *ghes) { } -+static inline void ghes_nmi_remove(struct ghes *ghes) { } -+static inline void 
ghes_nmi_init_cxt(void) { } - #endif /* CONFIG_HAVE_ACPI_APEI_NMI */ - - static int ghes_probe(struct platform_device *ghes_dev) --- -2.14.2 - diff --git a/patches/kernel/0080-x86-xen-Drop-5-level-paging-support-code-from-the-XE.patch b/patches/kernel/0080-x86-xen-Drop-5-level-paging-support-code-from-the-XE.patch new file mode 100644 index 0000000..7073e79 --- /dev/null +++ b/patches/kernel/0080-x86-xen-Drop-5-level-paging-support-code-from-the-XE.patch @@ -0,0 +1,316 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: "Kirill A. Shutemov" +Date: Fri, 29 Sep 2017 17:08:20 +0300 +Subject: [PATCH] x86/xen: Drop 5-level paging support code from the XEN_PV + code +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +It was decided 5-level paging is not going to be supported in XEN_PV. + +Let's drop the dead code from the XEN_PV code. + +Tested-by: Juergen Gross +Signed-off-by: Kirill A. Shutemov +Reviewed-by: Juergen Gross +Cc: Andrew Morton +Cc: Andy Lutomirski +Cc: Borislav Petkov +Cc: Cyrill Gorcunov +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: linux-mm@kvack.org +Link: http://lkml.kernel.org/r/20170929140821.37654-6-kirill.shutemov@linux.intel.com +Signed-off-by: Ingo Molnar +(cherry picked from commit 773dd2fca581b0a80e5a33332cc8ee67e5a79cba) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 3fd0b7ef0094fd8bb3c8172d9b137ebe0d81ecbc) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/xen/mmu_pv.c | 159 +++++++++++++++++++------------------------------- + 1 file changed, 60 insertions(+), 99 deletions(-) + +diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c +index ba76f3ce997f..45bb2d462e44 100644 +--- a/arch/x86/xen/mmu_pv.c ++++ b/arch/x86/xen/mmu_pv.c +@@ -469,7 +469,7 @@ __visible pmd_t xen_make_pmd(pmdval_t pmd) + } + PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); + +-#if CONFIG_PGTABLE_LEVELS == 4 
++#ifdef CONFIG_X86_64 + __visible pudval_t xen_pud_val(pud_t pud) + { + return pte_mfn_to_pfn(pud.pud); +@@ -558,7 +558,7 @@ static void xen_set_p4d(p4d_t *ptr, p4d_t val) + + xen_mc_issue(PARAVIRT_LAZY_MMU); + } +-#endif /* CONFIG_PGTABLE_LEVELS == 4 */ ++#endif /* CONFIG_X86_64 */ + + static int xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd, + int (*func)(struct mm_struct *mm, struct page *, enum pt_level), +@@ -600,21 +600,17 @@ static int xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d, + int (*func)(struct mm_struct *mm, struct page *, enum pt_level), + bool last, unsigned long limit) + { +- int i, nr, flush = 0; ++ int flush = 0; ++ pud_t *pud; + +- nr = last ? p4d_index(limit) + 1 : PTRS_PER_P4D; +- for (i = 0; i < nr; i++) { +- pud_t *pud; + +- if (p4d_none(p4d[i])) +- continue; ++ if (p4d_none(*p4d)) ++ return flush; + +- pud = pud_offset(&p4d[i], 0); +- if (PTRS_PER_PUD > 1) +- flush |= (*func)(mm, virt_to_page(pud), PT_PUD); +- flush |= xen_pud_walk(mm, pud, func, +- last && i == nr - 1, limit); +- } ++ pud = pud_offset(p4d, 0); ++ if (PTRS_PER_PUD > 1) ++ flush |= (*func)(mm, virt_to_page(pud), PT_PUD); ++ flush |= xen_pud_walk(mm, pud, func, last, limit); + return flush; + } + +@@ -664,8 +660,6 @@ static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd, + continue; + + p4d = p4d_offset(&pgd[i], 0); +- if (PTRS_PER_P4D > 1) +- flush |= (*func)(mm, virt_to_page(p4d), PT_P4D); + flush |= xen_p4d_walk(mm, p4d, func, i == nr - 1, limit); + } + +@@ -1196,22 +1190,14 @@ static void __init xen_cleanmfnmap(unsigned long vaddr) + { + pgd_t *pgd; + p4d_t *p4d; +- unsigned int i; + bool unpin; + + unpin = (vaddr == 2 * PGDIR_SIZE); + vaddr &= PMD_MASK; + pgd = pgd_offset_k(vaddr); + p4d = p4d_offset(pgd, 0); +- for (i = 0; i < PTRS_PER_P4D; i++) { +- if (p4d_none(p4d[i])) +- continue; +- xen_cleanmfnmap_p4d(p4d + i, unpin); +- } +- if (IS_ENABLED(CONFIG_X86_5LEVEL)) { +- set_pgd(pgd, __pgd(0)); +- xen_cleanmfnmap_free_pgtbl(p4d, unpin); +- } ++ if 
(!p4d_none(*p4d)) ++ xen_cleanmfnmap_p4d(p4d, unpin); + } + + static void __init xen_pagetable_p2m_free(void) +@@ -1717,7 +1703,7 @@ static void xen_release_pmd(unsigned long pfn) + xen_release_ptpage(pfn, PT_PMD); + } + +-#if CONFIG_PGTABLE_LEVELS >= 4 ++#ifdef CONFIG_X86_64 + static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) + { + xen_alloc_ptpage(mm, pfn, PT_PUD); +@@ -2054,13 +2040,12 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) + */ + void __init xen_relocate_p2m(void) + { +- phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys, p4d_phys; ++ phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys; + unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end; +- int n_pte, n_pt, n_pmd, n_pud, n_p4d, idx_pte, idx_pt, idx_pmd, idx_pud, idx_p4d; ++ int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud; + pte_t *pt; + pmd_t *pmd; + pud_t *pud; +- p4d_t *p4d = NULL; + pgd_t *pgd; + unsigned long *new_p2m; + int save_pud; +@@ -2070,11 +2055,7 @@ void __init xen_relocate_p2m(void) + n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT; + n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT; + n_pud = roundup(size, P4D_SIZE) >> P4D_SHIFT; +- if (PTRS_PER_P4D > 1) +- n_p4d = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT; +- else +- n_p4d = 0; +- n_frames = n_pte + n_pt + n_pmd + n_pud + n_p4d; ++ n_frames = n_pte + n_pt + n_pmd + n_pud; + + new_area = xen_find_free_area(PFN_PHYS(n_frames)); + if (!new_area) { +@@ -2090,76 +2071,56 @@ void __init xen_relocate_p2m(void) + * To avoid any possible virtual address collision, just use + * 2 * PUD_SIZE for the new area. 
+ */ +- p4d_phys = new_area; +- pud_phys = p4d_phys + PFN_PHYS(n_p4d); ++ pud_phys = new_area; + pmd_phys = pud_phys + PFN_PHYS(n_pud); + pt_phys = pmd_phys + PFN_PHYS(n_pmd); + p2m_pfn = PFN_DOWN(pt_phys) + n_pt; + + pgd = __va(read_cr3_pa()); + new_p2m = (unsigned long *)(2 * PGDIR_SIZE); +- idx_p4d = 0; + save_pud = n_pud; +- do { +- if (n_p4d > 0) { +- p4d = early_memremap(p4d_phys, PAGE_SIZE); +- clear_page(p4d); +- n_pud = min(save_pud, PTRS_PER_P4D); +- } +- for (idx_pud = 0; idx_pud < n_pud; idx_pud++) { +- pud = early_memremap(pud_phys, PAGE_SIZE); +- clear_page(pud); +- for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD); +- idx_pmd++) { +- pmd = early_memremap(pmd_phys, PAGE_SIZE); +- clear_page(pmd); +- for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD); +- idx_pt++) { +- pt = early_memremap(pt_phys, PAGE_SIZE); +- clear_page(pt); +- for (idx_pte = 0; +- idx_pte < min(n_pte, PTRS_PER_PTE); +- idx_pte++) { +- set_pte(pt + idx_pte, +- pfn_pte(p2m_pfn, PAGE_KERNEL)); +- p2m_pfn++; +- } +- n_pte -= PTRS_PER_PTE; +- early_memunmap(pt, PAGE_SIZE); +- make_lowmem_page_readonly(__va(pt_phys)); +- pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, +- PFN_DOWN(pt_phys)); +- set_pmd(pmd + idx_pt, +- __pmd(_PAGE_TABLE | pt_phys)); +- pt_phys += PAGE_SIZE; ++ for (idx_pud = 0; idx_pud < n_pud; idx_pud++) { ++ pud = early_memremap(pud_phys, PAGE_SIZE); ++ clear_page(pud); ++ for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD); ++ idx_pmd++) { ++ pmd = early_memremap(pmd_phys, PAGE_SIZE); ++ clear_page(pmd); ++ for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD); ++ idx_pt++) { ++ pt = early_memremap(pt_phys, PAGE_SIZE); ++ clear_page(pt); ++ for (idx_pte = 0; ++ idx_pte < min(n_pte, PTRS_PER_PTE); ++ idx_pte++) { ++ set_pte(pt + idx_pte, ++ pfn_pte(p2m_pfn, PAGE_KERNEL)); ++ p2m_pfn++; + } +- n_pt -= PTRS_PER_PMD; +- early_memunmap(pmd, PAGE_SIZE); +- make_lowmem_page_readonly(__va(pmd_phys)); +- pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE, +- PFN_DOWN(pmd_phys)); +- set_pud(pud + 
idx_pmd, __pud(_PAGE_TABLE | pmd_phys)); +- pmd_phys += PAGE_SIZE; ++ n_pte -= PTRS_PER_PTE; ++ early_memunmap(pt, PAGE_SIZE); ++ make_lowmem_page_readonly(__va(pt_phys)); ++ pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, ++ PFN_DOWN(pt_phys)); ++ set_pmd(pmd + idx_pt, ++ __pmd(_PAGE_TABLE | pt_phys)); ++ pt_phys += PAGE_SIZE; + } +- n_pmd -= PTRS_PER_PUD; +- early_memunmap(pud, PAGE_SIZE); +- make_lowmem_page_readonly(__va(pud_phys)); +- pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys)); +- if (n_p4d > 0) +- set_p4d(p4d + idx_pud, __p4d(_PAGE_TABLE | pud_phys)); +- else +- set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys)); +- pud_phys += PAGE_SIZE; +- } +- if (n_p4d > 0) { +- save_pud -= PTRS_PER_P4D; +- early_memunmap(p4d, PAGE_SIZE); +- make_lowmem_page_readonly(__va(p4d_phys)); +- pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(p4d_phys)); +- set_pgd(pgd + 2 + idx_p4d, __pgd(_PAGE_TABLE | p4d_phys)); +- p4d_phys += PAGE_SIZE; ++ n_pt -= PTRS_PER_PMD; ++ early_memunmap(pmd, PAGE_SIZE); ++ make_lowmem_page_readonly(__va(pmd_phys)); ++ pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE, ++ PFN_DOWN(pmd_phys)); ++ set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys)); ++ pmd_phys += PAGE_SIZE; + } +- } while (++idx_p4d < n_p4d); ++ n_pmd -= PTRS_PER_PUD; ++ early_memunmap(pud, PAGE_SIZE); ++ make_lowmem_page_readonly(__va(pud_phys)); ++ pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys)); ++ set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys)); ++ pud_phys += PAGE_SIZE; ++ } + + /* Now copy the old p2m info to the new area. 
*/ + memcpy(new_p2m, xen_p2m_addr, size); +@@ -2386,7 +2347,7 @@ static void __init xen_post_allocator_init(void) + pv_mmu_ops.set_pte = xen_set_pte; + pv_mmu_ops.set_pmd = xen_set_pmd; + pv_mmu_ops.set_pud = xen_set_pud; +-#if CONFIG_PGTABLE_LEVELS >= 4 ++#ifdef CONFIG_X86_64 + pv_mmu_ops.set_p4d = xen_set_p4d; + #endif + +@@ -2396,7 +2357,7 @@ static void __init xen_post_allocator_init(void) + pv_mmu_ops.alloc_pmd = xen_alloc_pmd; + pv_mmu_ops.release_pte = xen_release_pte; + pv_mmu_ops.release_pmd = xen_release_pmd; +-#if CONFIG_PGTABLE_LEVELS >= 4 ++#ifdef CONFIG_X86_64 + pv_mmu_ops.alloc_pud = xen_alloc_pud; + pv_mmu_ops.release_pud = xen_release_pud; + #endif +@@ -2460,14 +2421,14 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { + .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), + .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), + +-#if CONFIG_PGTABLE_LEVELS >= 4 ++#ifdef CONFIG_X86_64 + .pud_val = PV_CALLEE_SAVE(xen_pud_val), + .make_pud = PV_CALLEE_SAVE(xen_make_pud), + .set_p4d = xen_set_p4d_hyper, + + .alloc_pud = xen_alloc_pmd_init, + .release_pud = xen_release_pmd_init, +-#endif /* CONFIG_PGTABLE_LEVELS == 4 */ ++#endif /* CONFIG_X86_64 */ + + .activate_mm = xen_activate_mm, + .dup_mmap = xen_dup_mmap, +-- +2.14.2 + diff --git a/patches/kernel/0081-ACPI-APEI-remove-the-unused-dead-code-for-SEA-NMI-no.patch b/patches/kernel/0081-ACPI-APEI-remove-the-unused-dead-code-for-SEA-NMI-no.patch new file mode 100644 index 0000000..209308b --- /dev/null +++ b/patches/kernel/0081-ACPI-APEI-remove-the-unused-dead-code-for-SEA-NMI-no.patch @@ -0,0 +1,88 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Dongjiu Geng +Date: Tue, 17 Oct 2017 16:02:20 +0800 +Subject: [PATCH] ACPI / APEI: remove the unused dead-code for SEA/NMI + notification type +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +For the SEA notification, the two functions ghes_sea_add() and +ghes_sea_remove() are 
only called when CONFIG_ACPI_APEI_SEA +is defined. If not, it will return errors in the ghes_probe() +and not continue. If the probe is failed, the ghes_sea_remove() +also has no chance to be called. Hence, remove the unnecessary +handling when CONFIG_ACPI_APEI_SEA is not defined. + +For the NMI notification, it has the same issue as SEA notification, +so also remove the unused dead-code for it. + +Signed-off-by: Dongjiu Geng +Tested-by: Tyler Baicar +Reviewed-by: Borislav Petkov +Signed-off-by: Rafael J. Wysocki +(cherry picked from commit c49870e89f4d2c21c76ebe90568246bb0f3572b7) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 55f73c32ba6438e8886f348722d2b25aef129d40) +Signed-off-by: Fabian Grünbichler +--- + drivers/acpi/apei/ghes.c | 33 +++++---------------------------- + 1 file changed, 5 insertions(+), 28 deletions(-) + +diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c +index 3628078ee351..4827176f838d 100644 +--- a/drivers/acpi/apei/ghes.c ++++ b/drivers/acpi/apei/ghes.c +@@ -850,17 +850,8 @@ static void ghes_sea_remove(struct ghes *ghes) + synchronize_rcu(); + } + #else /* CONFIG_ACPI_APEI_SEA */ +-static inline void ghes_sea_add(struct ghes *ghes) +-{ +- pr_err(GHES_PFX "ID: %d, trying to add SEA notification which is not supported\n", +- ghes->generic->header.source_id); +-} +- +-static inline void ghes_sea_remove(struct ghes *ghes) +-{ +- pr_err(GHES_PFX "ID: %d, trying to remove SEA notification which is not supported\n", +- ghes->generic->header.source_id); +-} ++static inline void ghes_sea_add(struct ghes *ghes) { } ++static inline void ghes_sea_remove(struct ghes *ghes) { } + #endif /* CONFIG_ACPI_APEI_SEA */ + + #ifdef CONFIG_HAVE_ACPI_APEI_NMI +@@ -1062,23 +1053,9 @@ static void ghes_nmi_init_cxt(void) + init_irq_work(&ghes_proc_irq_work, ghes_proc_in_irq); + } + #else /* CONFIG_HAVE_ACPI_APEI_NMI */ +-static inline void ghes_nmi_add(struct ghes *ghes) +-{ +- pr_err(GHES_PFX "ID: 
%d, trying to add NMI notification which is not supported!\n", +- ghes->generic->header.source_id); +- BUG(); +-} +- +-static inline void ghes_nmi_remove(struct ghes *ghes) +-{ +- pr_err(GHES_PFX "ID: %d, trying to remove NMI notification which is not supported!\n", +- ghes->generic->header.source_id); +- BUG(); +-} +- +-static inline void ghes_nmi_init_cxt(void) +-{ +-} ++static inline void ghes_nmi_add(struct ghes *ghes) { } ++static inline void ghes_nmi_remove(struct ghes *ghes) { } ++static inline void ghes_nmi_init_cxt(void) { } + #endif /* CONFIG_HAVE_ACPI_APEI_NMI */ + + static int ghes_probe(struct platform_device *ghes_dev) +-- +2.14.2 + diff --git a/patches/kernel/0081-x86-asm-Don-t-use-the-confusing-.ifeq-directive.patch b/patches/kernel/0081-x86-asm-Don-t-use-the-confusing-.ifeq-directive.patch deleted file mode 100644 index dfe7a2f..0000000 --- a/patches/kernel/0081-x86-asm-Don-t-use-the-confusing-.ifeq-directive.patch +++ /dev/null @@ -1,78 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Josh Poimboeuf -Date: Fri, 20 Oct 2017 11:21:35 -0500 -Subject: [PATCH] x86/asm: Don't use the confusing '.ifeq' directive -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -I find the '.ifeq <condition>' directive to be confusing. Reading it -quickly seems to suggest its opposite meaning, or that it's missing an -argument. - -Improve readability by replacing all of its x86 uses with -'.if <condition> == 0'.
- -Signed-off-by: Josh Poimboeuf -Cc: Andrei Vagin -Cc: Andy Lutomirski -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/757da028e802c7e98d23fbab8d234b1063e161cf.1508516398.git.jpoimboe@redhat.com -Signed-off-by: Ingo Molnar -(cherry picked from commit 82c62fa0c49aa305104013cee4468772799bb391) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 981dedac1061fb47d0b04e07f6752be195d7e41a) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/entry/entry_64.S | 2 +- - arch/x86/kernel/head_32.S | 2 +- - arch/x86/kernel/head_64.S | 2 +- - 3 files changed, 3 insertions(+), 3 deletions(-) - -diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S -index 2e4fc6425f47..34adfe0221d2 100644 ---- a/arch/x86/entry/entry_64.S -+++ b/arch/x86/entry/entry_64.S -@@ -830,7 +830,7 @@ ENTRY(\sym) - - ASM_CLAC - -- .ifeq \has_error_code -+ .if \has_error_code == 0 - pushq $-1 /* ORIG_RAX: no syscall to restart */ - .endif - -diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S -index 1f85ee8f9439..337a65377baf 100644 ---- a/arch/x86/kernel/head_32.S -+++ b/arch/x86/kernel/head_32.S -@@ -435,7 +435,7 @@ ENTRY(early_idt_handler_array) - # 24(%rsp) error code - i = 0 - .rept NUM_EXCEPTION_VECTORS -- .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 -+ .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0 - pushl $0 # Dummy error code, to make stack frame uniform - .endif - pushl $i # 20(%esp) Vector number -diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S -index 12daaa0b187f..a2d8541b1da4 100644 ---- a/arch/x86/kernel/head_64.S -+++ b/arch/x86/kernel/head_64.S -@@ -258,7 +258,7 @@ ENDPROC(start_cpu0) - ENTRY(early_idt_handler_array) - i = 0 - .rept NUM_EXCEPTION_VECTORS -- .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 -+ .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0 - UNWIND_HINT_IRET_REGS - pushq $0 # Dummy error code, to make stack frame uniform - .else --- -2.14.2 - diff --git 
a/patches/kernel/0082-x86-asm-Don-t-use-the-confusing-.ifeq-directive.patch b/patches/kernel/0082-x86-asm-Don-t-use-the-confusing-.ifeq-directive.patch new file mode 100644 index 0000000..dfe7a2f --- /dev/null +++ b/patches/kernel/0082-x86-asm-Don-t-use-the-confusing-.ifeq-directive.patch @@ -0,0 +1,78 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf +Date: Fri, 20 Oct 2017 11:21:35 -0500 +Subject: [PATCH] x86/asm: Don't use the confusing '.ifeq' directive +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +I find the '.ifeq <condition>' directive to be confusing. Reading it +quickly seems to suggest its opposite meaning, or that it's missing an +argument. + +Improve readability by replacing all of its x86 uses with +'.if <condition> == 0'. + +Signed-off-by: Josh Poimboeuf +Cc: Andrei Vagin +Cc: Andy Lutomirski +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/757da028e802c7e98d23fbab8d234b1063e161cf.1508516398.git.jpoimboe@redhat.com +Signed-off-by: Ingo Molnar +(cherry picked from commit 82c62fa0c49aa305104013cee4468772799bb391) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 981dedac1061fb47d0b04e07f6752be195d7e41a) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/entry/entry_64.S | 2 +- + arch/x86/kernel/head_32.S | 2 +- + arch/x86/kernel/head_64.S | 2 +- + 3 files changed, 3 insertions(+), 3 deletions(-) + +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index 2e4fc6425f47..34adfe0221d2 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -830,7 +830,7 @@ ENTRY(\sym) + + ASM_CLAC + +- .ifeq \has_error_code ++ .if \has_error_code == 0 + pushq $-1 /* ORIG_RAX: no syscall to restart */ + .endif + +diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S +index 1f85ee8f9439..337a65377baf 100644 +--- a/arch/x86/kernel/head_32.S
++++ b/arch/x86/kernel/head_32.S +@@ -435,7 +435,7 @@ ENTRY(early_idt_handler_array) + # 24(%rsp) error code + i = 0 + .rept NUM_EXCEPTION_VECTORS +- .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 ++ .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0 + pushl $0 # Dummy error code, to make stack frame uniform + .endif + pushl $i # 20(%esp) Vector number +diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S +index 12daaa0b187f..a2d8541b1da4 100644 +--- a/arch/x86/kernel/head_64.S ++++ b/arch/x86/kernel/head_64.S +@@ -258,7 +258,7 @@ ENDPROC(start_cpu0) + ENTRY(early_idt_handler_array) + i = 0 + .rept NUM_EXCEPTION_VECTORS +- .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 ++ .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0 + UNWIND_HINT_IRET_REGS + pushq $0 # Dummy error code, to make stack frame uniform + .else +-- +2.14.2 + diff --git a/patches/kernel/0082-x86-build-Beautify-build-log-of-syscall-headers.patch b/patches/kernel/0082-x86-build-Beautify-build-log-of-syscall-headers.patch deleted file mode 100644 index 8bd14b6..0000000 --- a/patches/kernel/0082-x86-build-Beautify-build-log-of-syscall-headers.patch +++ /dev/null @@ -1,62 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Masahiro Yamada -Date: Fri, 27 Oct 2017 13:11:10 +0900 -Subject: [PATCH] x86/build: Beautify build log of syscall headers -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -This makes the build log look nicer. 
- -Before: - SYSTBL arch/x86/entry/syscalls/../../include/generated/asm/syscalls_32.h - SYSHDR arch/x86/entry/syscalls/../../include/generated/asm/unistd_32_ia32.h - SYSHDR arch/x86/entry/syscalls/../../include/generated/asm/unistd_64_x32.h - SYSTBL arch/x86/entry/syscalls/../../include/generated/asm/syscalls_64.h - SYSHDR arch/x86/entry/syscalls/../../include/generated/uapi/asm/unistd_32.h - SYSHDR arch/x86/entry/syscalls/../../include/generated/uapi/asm/unistd_64.h - SYSHDR arch/x86/entry/syscalls/../../include/generated/uapi/asm/unistd_x32.h - -After: - SYSTBL arch/x86/include/generated/asm/syscalls_32.h - SYSHDR arch/x86/include/generated/asm/unistd_32_ia32.h - SYSHDR arch/x86/include/generated/asm/unistd_64_x32.h - SYSTBL arch/x86/include/generated/asm/syscalls_64.h - SYSHDR arch/x86/include/generated/uapi/asm/unistd_32.h - SYSHDR arch/x86/include/generated/uapi/asm/unistd_64.h - SYSHDR arch/x86/include/generated/uapi/asm/unistd_x32.h - -Signed-off-by: Masahiro Yamada -Acked-by: Thomas Gleixner -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: "H. 
Peter Anvin" -Cc: linux-kbuild@vger.kernel.org -Link: http://lkml.kernel.org/r/1509077470-2735-1-git-send-email-yamada.masahiro@socionext.com -Signed-off-by: Ingo Molnar -(cherry picked from commit af8e947079a7dab0480b5d6db6b093fd04b86fc9) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit d945957924e9b1a469516b4029fd384138c2cb69) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/entry/syscalls/Makefile | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/arch/x86/entry/syscalls/Makefile b/arch/x86/entry/syscalls/Makefile -index 57aa59fd140c..e34c7a931994 100644 ---- a/arch/x86/entry/syscalls/Makefile -+++ b/arch/x86/entry/syscalls/Makefile -@@ -1,5 +1,5 @@ --out := $(obj)/../../include/generated/asm --uapi := $(obj)/../../include/generated/uapi/asm -+out := arch/$(SRCARCH)/include/generated/asm -+uapi := arch/$(SRCARCH)/include/generated/uapi/asm - - # Create output directory if not already present - _dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') \ --- -2.14.2 - diff --git a/patches/kernel/0083-x86-build-Beautify-build-log-of-syscall-headers.patch b/patches/kernel/0083-x86-build-Beautify-build-log-of-syscall-headers.patch new file mode 100644 index 0000000..8bd14b6 --- /dev/null +++ b/patches/kernel/0083-x86-build-Beautify-build-log-of-syscall-headers.patch @@ -0,0 +1,62 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Masahiro Yamada +Date: Fri, 27 Oct 2017 13:11:10 +0900 +Subject: [PATCH] x86/build: Beautify build log of syscall headers +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +This makes the build log look nicer. 
+ +Before: + SYSTBL arch/x86/entry/syscalls/../../include/generated/asm/syscalls_32.h + SYSHDR arch/x86/entry/syscalls/../../include/generated/asm/unistd_32_ia32.h + SYSHDR arch/x86/entry/syscalls/../../include/generated/asm/unistd_64_x32.h + SYSTBL arch/x86/entry/syscalls/../../include/generated/asm/syscalls_64.h + SYSHDR arch/x86/entry/syscalls/../../include/generated/uapi/asm/unistd_32.h + SYSHDR arch/x86/entry/syscalls/../../include/generated/uapi/asm/unistd_64.h + SYSHDR arch/x86/entry/syscalls/../../include/generated/uapi/asm/unistd_x32.h + +After: + SYSTBL arch/x86/include/generated/asm/syscalls_32.h + SYSHDR arch/x86/include/generated/asm/unistd_32_ia32.h + SYSHDR arch/x86/include/generated/asm/unistd_64_x32.h + SYSTBL arch/x86/include/generated/asm/syscalls_64.h + SYSHDR arch/x86/include/generated/uapi/asm/unistd_32.h + SYSHDR arch/x86/include/generated/uapi/asm/unistd_64.h + SYSHDR arch/x86/include/generated/uapi/asm/unistd_x32.h + +Signed-off-by: Masahiro Yamada +Acked-by: Thomas Gleixner +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: "H. 
Peter Anvin" +Cc: linux-kbuild@vger.kernel.org +Link: http://lkml.kernel.org/r/1509077470-2735-1-git-send-email-yamada.masahiro@socionext.com +Signed-off-by: Ingo Molnar +(cherry picked from commit af8e947079a7dab0480b5d6db6b093fd04b86fc9) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit d945957924e9b1a469516b4029fd384138c2cb69) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/entry/syscalls/Makefile | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/entry/syscalls/Makefile b/arch/x86/entry/syscalls/Makefile +index 57aa59fd140c..e34c7a931994 100644 +--- a/arch/x86/entry/syscalls/Makefile ++++ b/arch/x86/entry/syscalls/Makefile +@@ -1,5 +1,5 @@ +-out := $(obj)/../../include/generated/asm +-uapi := $(obj)/../../include/generated/uapi/asm ++out := arch/$(SRCARCH)/include/generated/asm ++uapi := arch/$(SRCARCH)/include/generated/uapi/asm + + # Create output directory if not already present + _dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') \ +-- +2.14.2 + diff --git a/patches/kernel/0083-x86-mm-64-Rename-the-register_page_bootmem_memmap-si.patch b/patches/kernel/0083-x86-mm-64-Rename-the-register_page_bootmem_memmap-si.patch deleted file mode 100644 index 3b8f212..0000000 --- a/patches/kernel/0083-x86-mm-64-Rename-the-register_page_bootmem_memmap-si.patch +++ /dev/null @@ -1,90 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Baoquan He -Date: Sat, 28 Oct 2017 09:30:38 +0800 -Subject: [PATCH] x86/mm/64: Rename the register_page_bootmem_memmap() 'size' - parameter to 'nr_pages' -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -register_page_bootmem_memmap()'s 3rd 'size' parameter is named -in a somewhat misleading fashion - rename it to 'nr_pages' which -makes the units of it much clearer. 
- -Meanwhile rename the existing local variable 'nr_pages' to -'nr_pmd_pages', a more expressive name, to avoid conflict with -new function parameter 'nr_pages'. - -(Also clean up the unnecessary parentheses in which get_order() is called.) - -Signed-off-by: Baoquan He -Acked-by: Thomas Gleixner -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: akpm@linux-foundation.org -Link: http://lkml.kernel.org/r/1509154238-23250-1-git-send-email-bhe@redhat.com -Signed-off-by: Ingo Molnar -(cherry picked from commit 15670bfe19905b1dcbb63137f40d718b59d84479) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit d73ad1d31ef8a44c6e5977c5123cbaa6d02e2035) -Signed-off-by: Fabian Grünbichler ---- - include/linux/mm.h | 2 +- - arch/x86/mm/init_64.c | 10 +++++----- - 2 files changed, 6 insertions(+), 6 deletions(-) - -diff --git a/include/linux/mm.h b/include/linux/mm.h -index 07630442bbf2..97f6ca707010 100644 ---- a/include/linux/mm.h -+++ b/include/linux/mm.h -@@ -2475,7 +2475,7 @@ void vmemmap_populate_print_last(void); - void vmemmap_free(unsigned long start, unsigned long end); - #endif - void register_page_bootmem_memmap(unsigned long section_nr, struct page *map, -- unsigned long size); -+ unsigned long nr_pages); - - enum mf_flags { - MF_COUNT_INCREASED = 1 << 0, -diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c -index 136422d7d539..902983c8ea8c 100644 ---- a/arch/x86/mm/init_64.c -+++ b/arch/x86/mm/init_64.c -@@ -1418,16 +1418,16 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) - - #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE) - void register_page_bootmem_memmap(unsigned long section_nr, -- struct page *start_page, unsigned long size) -+ struct page *start_page, unsigned long nr_pages) - { - unsigned long addr = (unsigned long)start_page; -- unsigned long end = (unsigned long)(start_page + size); -+ unsigned long end = (unsigned long)(start_page + 
nr_pages); - unsigned long next; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; -- unsigned int nr_pages; -+ unsigned int nr_pmd_pages; - struct page *page; - - for (; addr < end; addr = next) { -@@ -1474,9 +1474,9 @@ void register_page_bootmem_memmap(unsigned long section_nr, - if (pmd_none(*pmd)) - continue; - -- nr_pages = 1 << (get_order(PMD_SIZE)); -+ nr_pmd_pages = 1 << get_order(PMD_SIZE); - page = pmd_page(*pmd); -- while (nr_pages--) -+ while (nr_pmd_pages--) - get_page_bootmem(section_nr, page++, - SECTION_INFO); - } --- -2.14.2 - diff --git a/patches/kernel/0084-x86-cpufeatures-Enable-new-SSE-AVX-AVX512-CPU-featur.patch b/patches/kernel/0084-x86-cpufeatures-Enable-new-SSE-AVX-AVX512-CPU-featur.patch deleted file mode 100644 index bf5c981..0000000 --- a/patches/kernel/0084-x86-cpufeatures-Enable-new-SSE-AVX-AVX512-CPU-featur.patch +++ /dev/null @@ -1,86 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Gayatri Kammela -Date: Mon, 30 Oct 2017 18:20:29 -0700 -Subject: [PATCH] x86/cpufeatures: Enable new SSE/AVX/AVX512 CPU features -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Add a few new SSE/AVX/AVX512 instruction groups/features for enumeration -in /proc/cpuinfo: AVX512_VBMI2, GFNI, VAES, VPCLMULQDQ, AVX512_VNNI, -AVX512_BITALG. - - CPUID.(EAX=7,ECX=0):ECX[bit 6] AVX512_VBMI2 - CPUID.(EAX=7,ECX=0):ECX[bit 8] GFNI - CPUID.(EAX=7,ECX=0):ECX[bit 9] VAES - CPUID.(EAX=7,ECX=0):ECX[bit 10] VPCLMULQDQ - CPUID.(EAX=7,ECX=0):ECX[bit 11] AVX512_VNNI - CPUID.(EAX=7,ECX=0):ECX[bit 12] AVX512_BITALG - -Detailed information of CPUID bits for these features can be found -in the Intel Architecture Instruction Set Extensions and Future Features -Programming Interface document (refer to Table 1-1. and Table 1-2.). 
-A copy of this document is available at -https://bugzilla.kernel.org/show_bug.cgi?id=197239 - -Signed-off-by: Gayatri Kammela -Acked-by: Thomas Gleixner -Cc: Andi Kleen -Cc: Fenghua Yu -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Ravi Shankar -Cc: Ricardo Neri -Cc: Yang Zhong -Cc: bp@alien8.de -Link: http://lkml.kernel.org/r/1509412829-23380-1-git-send-email-gayatri.kammela@intel.com -Signed-off-by: Ingo Molnar -(cherry picked from commit c128dbfa0f879f8ce7b79054037889b0b2240728) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit b29eb29c5aca4708d66fa977db40c779366636a2) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/cpufeatures.h | 6 ++++++ - arch/x86/kernel/cpu/cpuid-deps.c | 6 ++++++ - 2 files changed, 12 insertions(+) - -diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h -index f4e145c4b06f..c465bd6613ed 100644 ---- a/arch/x86/include/asm/cpufeatures.h -+++ b/arch/x86/include/asm/cpufeatures.h -@@ -297,6 +297,12 @@ - #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ - #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ - #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ -+#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ -+#define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */ -+#define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */ -+#define X86_FEATURE_VPCLMULQDQ (16*32+ 10) /* Carry-Less Multiplication Double Quadword */ -+#define X86_FEATURE_AVX512_VNNI (16*32+ 11) /* Vector Neural Network Instructions */ -+#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB */ - #define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ - #define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ - #define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */ -diff 
--git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c -index c1d49842a411..c21f22d836ad 100644 ---- a/arch/x86/kernel/cpu/cpuid-deps.c -+++ b/arch/x86/kernel/cpu/cpuid-deps.c -@@ -50,6 +50,12 @@ const static struct cpuid_dep cpuid_deps[] = { - { X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F }, - { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F }, - { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F }, -+ { X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL }, -+ { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL }, -+ { X86_FEATURE_VAES, X86_FEATURE_AVX512VL }, -+ { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL }, -+ { X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL }, -+ { X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL }, - { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F }, - { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F }, - { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F }, --- -2.14.2 - diff --git a/patches/kernel/0084-x86-mm-64-Rename-the-register_page_bootmem_memmap-si.patch b/patches/kernel/0084-x86-mm-64-Rename-the-register_page_bootmem_memmap-si.patch new file mode 100644 index 0000000..3b8f212 --- /dev/null +++ b/patches/kernel/0084-x86-mm-64-Rename-the-register_page_bootmem_memmap-si.patch @@ -0,0 +1,90 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Baoquan He +Date: Sat, 28 Oct 2017 09:30:38 +0800 +Subject: [PATCH] x86/mm/64: Rename the register_page_bootmem_memmap() 'size' + parameter to 'nr_pages' +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +register_page_bootmem_memmap()'s 3rd 'size' parameter is named +in a somewhat misleading fashion - rename it to 'nr_pages' which +makes the units of it much clearer. + +Meanwhile rename the existing local variable 'nr_pages' to +'nr_pmd_pages', a more expressive name, to avoid conflict with +new function parameter 'nr_pages'. + +(Also clean up the unnecessary parentheses in which get_order() is called.) 
+ +Signed-off-by: Baoquan He +Acked-by: Thomas Gleixner +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: akpm@linux-foundation.org +Link: http://lkml.kernel.org/r/1509154238-23250-1-git-send-email-bhe@redhat.com +Signed-off-by: Ingo Molnar +(cherry picked from commit 15670bfe19905b1dcbb63137f40d718b59d84479) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit d73ad1d31ef8a44c6e5977c5123cbaa6d02e2035) +Signed-off-by: Fabian Grünbichler +--- + include/linux/mm.h | 2 +- + arch/x86/mm/init_64.c | 10 +++++----- + 2 files changed, 6 insertions(+), 6 deletions(-) + +diff --git a/include/linux/mm.h b/include/linux/mm.h +index 07630442bbf2..97f6ca707010 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -2475,7 +2475,7 @@ void vmemmap_populate_print_last(void); + void vmemmap_free(unsigned long start, unsigned long end); + #endif + void register_page_bootmem_memmap(unsigned long section_nr, struct page *map, +- unsigned long size); ++ unsigned long nr_pages); + + enum mf_flags { + MF_COUNT_INCREASED = 1 << 0, +diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c +index 136422d7d539..902983c8ea8c 100644 +--- a/arch/x86/mm/init_64.c ++++ b/arch/x86/mm/init_64.c +@@ -1418,16 +1418,16 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) + + #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE) + void register_page_bootmem_memmap(unsigned long section_nr, +- struct page *start_page, unsigned long size) ++ struct page *start_page, unsigned long nr_pages) + { + unsigned long addr = (unsigned long)start_page; +- unsigned long end = (unsigned long)(start_page + size); ++ unsigned long end = (unsigned long)(start_page + nr_pages); + unsigned long next; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; +- unsigned int nr_pages; ++ unsigned int nr_pmd_pages; + struct page *page; + + for (; addr < end; addr = next) { +@@ -1474,9 +1474,9 @@ void 
register_page_bootmem_memmap(unsigned long section_nr, + if (pmd_none(*pmd)) + continue; + +- nr_pages = 1 << (get_order(PMD_SIZE)); ++ nr_pmd_pages = 1 << get_order(PMD_SIZE); + page = pmd_page(*pmd); +- while (nr_pages--) ++ while (nr_pmd_pages--) + get_page_bootmem(section_nr, page++, + SECTION_INFO); + } +-- +2.14.2 + diff --git a/patches/kernel/0085-x86-cpufeatures-Enable-new-SSE-AVX-AVX512-CPU-featur.patch b/patches/kernel/0085-x86-cpufeatures-Enable-new-SSE-AVX-AVX512-CPU-featur.patch new file mode 100644 index 0000000..bf5c981 --- /dev/null +++ b/patches/kernel/0085-x86-cpufeatures-Enable-new-SSE-AVX-AVX512-CPU-featur.patch @@ -0,0 +1,86 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Gayatri Kammela +Date: Mon, 30 Oct 2017 18:20:29 -0700 +Subject: [PATCH] x86/cpufeatures: Enable new SSE/AVX/AVX512 CPU features +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Add a few new SSE/AVX/AVX512 instruction groups/features for enumeration +in /proc/cpuinfo: AVX512_VBMI2, GFNI, VAES, VPCLMULQDQ, AVX512_VNNI, +AVX512_BITALG. + + CPUID.(EAX=7,ECX=0):ECX[bit 6] AVX512_VBMI2 + CPUID.(EAX=7,ECX=0):ECX[bit 8] GFNI + CPUID.(EAX=7,ECX=0):ECX[bit 9] VAES + CPUID.(EAX=7,ECX=0):ECX[bit 10] VPCLMULQDQ + CPUID.(EAX=7,ECX=0):ECX[bit 11] AVX512_VNNI + CPUID.(EAX=7,ECX=0):ECX[bit 12] AVX512_BITALG + +Detailed information of CPUID bits for these features can be found +in the Intel Architecture Instruction Set Extensions and Future Features +Programming Interface document (refer to Table 1-1. and Table 1-2.). 
+A copy of this document is available at +https://bugzilla.kernel.org/show_bug.cgi?id=197239 + +Signed-off-by: Gayatri Kammela +Acked-by: Thomas Gleixner +Cc: Andi Kleen +Cc: Fenghua Yu +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Ravi Shankar +Cc: Ricardo Neri +Cc: Yang Zhong +Cc: bp@alien8.de +Link: http://lkml.kernel.org/r/1509412829-23380-1-git-send-email-gayatri.kammela@intel.com +Signed-off-by: Ingo Molnar +(cherry picked from commit c128dbfa0f879f8ce7b79054037889b0b2240728) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit b29eb29c5aca4708d66fa977db40c779366636a2) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/cpufeatures.h | 6 ++++++ + arch/x86/kernel/cpu/cpuid-deps.c | 6 ++++++ + 2 files changed, 12 insertions(+) + +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index f4e145c4b06f..c465bd6613ed 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -297,6 +297,12 @@ + #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ + #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ + #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ ++#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ ++#define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */ ++#define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */ ++#define X86_FEATURE_VPCLMULQDQ (16*32+ 10) /* Carry-Less Multiplication Double Quadword */ ++#define X86_FEATURE_AVX512_VNNI (16*32+ 11) /* Vector Neural Network Instructions */ ++#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB */ + #define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ + #define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ + #define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */ +diff 
--git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c +index c1d49842a411..c21f22d836ad 100644 +--- a/arch/x86/kernel/cpu/cpuid-deps.c ++++ b/arch/x86/kernel/cpu/cpuid-deps.c +@@ -50,6 +50,12 @@ const static struct cpuid_dep cpuid_deps[] = { + { X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F }, ++ { X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL }, ++ { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL }, ++ { X86_FEATURE_VAES, X86_FEATURE_AVX512VL }, ++ { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL }, ++ { X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL }, ++ { X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F }, +-- +2.14.2 + diff --git a/patches/kernel/0085-x86-mm-Relocate-page-fault-error-codes-to-traps.h.patch b/patches/kernel/0085-x86-mm-Relocate-page-fault-error-codes-to-traps.h.patch deleted file mode 100644 index fad29e3..0000000 --- a/patches/kernel/0085-x86-mm-Relocate-page-fault-error-codes-to-traps.h.patch +++ /dev/null @@ -1,363 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Ricardo Neri -Date: Fri, 27 Oct 2017 13:25:28 -0700 -Subject: [PATCH] x86/mm: Relocate page fault error codes to traps.h -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Up to this point, only fault.c used the definitions of the page fault error -codes. Thus, it made sense to keep them within such file. Other portions of -code might be interested in those definitions too. For instance, the User- -Mode Instruction Prevention emulation code will use such definitions to -emulate a page fault when it is unable to successfully copy the results -of the emulated instructions to user space. 
- -While relocating the error code enumeration, the prefix X86_ is used to -make it consistent with the rest of the definitions in traps.h. Of course, -code using the enumeration had to be updated as well. No functional changes -were performed. - -Signed-off-by: Ricardo Neri -Signed-off-by: Thomas Gleixner -Reviewed-by: Borislav Petkov -Reviewed-by: Andy Lutomirski -Cc: "Michael S. Tsirkin" -Cc: Peter Zijlstra -Cc: Dave Hansen -Cc: ricardo.neri@intel.com -Cc: Paul Gortmaker -Cc: Huang Rui -Cc: Shuah Khan -Cc: Jonathan Corbet -Cc: Jiri Slaby -Cc: "Ravi V. Shankar" -Cc: Chris Metcalf -Cc: Brian Gerst -Cc: Josh Poimboeuf -Cc: Chen Yucong -Cc: Vlastimil Babka -Cc: Masami Hiramatsu -Cc: Paolo Bonzini -Cc: Andrew Morton -Cc: "Kirill A. Shutemov" -Link: https://lkml.kernel.org/r/1509135945-13762-2-git-send-email-ricardo.neri-calderon@linux.intel.com - -(cherry picked from commit 1067f030994c69ca1fba8c607437c8895dcf8509) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit a85a07ab9111e3c78797c20b60a664dbd5db4981) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/traps.h | 18 +++++++++ - arch/x86/mm/fault.c | 88 +++++++++++++++++--------------------------- - 2 files changed, 52 insertions(+), 54 deletions(-) - -diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h -index feb89dbe359d..8e5bf86f87e5 100644 ---- a/arch/x86/include/asm/traps.h -+++ b/arch/x86/include/asm/traps.h -@@ -162,4 +162,22 @@ enum { - X86_TRAP_IRET = 32, /* 32, IRET Exception */ - }; - -+/* -+ * Page fault error code bits: -+ * -+ * bit 0 == 0: no page found 1: protection fault -+ * bit 1 == 0: read access 1: write access -+ * bit 2 == 0: kernel-mode access 1: user-mode access -+ * bit 3 == 1: use of reserved bit detected -+ * bit 4 == 1: fault was an instruction fetch -+ * bit 5 == 1: protection keys block access -+ */ -+enum x86_pf_error_code { -+ X86_PF_PROT = 1 << 0, -+ X86_PF_WRITE = 1 << 1, -+ X86_PF_USER = 1 << 2, -+ 
X86_PF_RSVD = 1 << 3, -+ X86_PF_INSTR = 1 << 4, -+ X86_PF_PK = 1 << 5, -+}; - #endif /* _ASM_X86_TRAPS_H */ -diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c -index 4ee9eb916826..d3a57e7ad311 100644 ---- a/arch/x86/mm/fault.c -+++ b/arch/x86/mm/fault.c -@@ -28,26 +28,6 @@ - #define CREATE_TRACE_POINTS - #include - --/* -- * Page fault error code bits: -- * -- * bit 0 == 0: no page found 1: protection fault -- * bit 1 == 0: read access 1: write access -- * bit 2 == 0: kernel-mode access 1: user-mode access -- * bit 3 == 1: use of reserved bit detected -- * bit 4 == 1: fault was an instruction fetch -- * bit 5 == 1: protection keys block access -- */ --enum x86_pf_error_code { -- -- PF_PROT = 1 << 0, -- PF_WRITE = 1 << 1, -- PF_USER = 1 << 2, -- PF_RSVD = 1 << 3, -- PF_INSTR = 1 << 4, -- PF_PK = 1 << 5, --}; -- - /* - * Returns 0 if mmiotrace is disabled, or if the fault is not - * handled by mmiotrace: -@@ -149,7 +129,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) - * If it was a exec (instruction fetch) fault on NX page, then - * do not ignore the fault: - */ -- if (error_code & PF_INSTR) -+ if (error_code & X86_PF_INSTR) - return 0; - - instr = (void *)convert_ip_to_linear(current, regs); -@@ -179,7 +159,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) - * siginfo so userspace can discover which protection key was set - * on the PTE. - * -- * If we get here, we know that the hardware signaled a PF_PK -+ * If we get here, we know that the hardware signaled a X86_PF_PK - * fault and that there was a VMA once we got in the fault - * handler. It does *not* guarantee that the VMA we find here - * was the one that we faulted on. -@@ -204,7 +184,7 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info, u32 *pkey) - /* - * force_sig_info_fault() is called from a number of - * contexts, some of which have a VMA and some of which -- * do not. 
The PF_PK handing happens after we have a -+ * do not. The X86_PF_PK handing happens after we have a - * valid VMA, so we should never reach this without a - * valid VMA. - */ -@@ -693,7 +673,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, - if (!oops_may_print()) - return; - -- if (error_code & PF_INSTR) { -+ if (error_code & X86_PF_INSTR) { - unsigned int level; - pgd_t *pgd; - pte_t *pte; -@@ -775,7 +755,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, - */ - if (current->thread.sig_on_uaccess_err && signal) { - tsk->thread.trap_nr = X86_TRAP_PF; -- tsk->thread.error_code = error_code | PF_USER; -+ tsk->thread.error_code = error_code | X86_PF_USER; - tsk->thread.cr2 = address; - - /* XXX: hwpoison faults will set the wrong code. */ -@@ -894,7 +874,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, - struct task_struct *tsk = current; - - /* User mode accesses just cause a SIGSEGV */ -- if (error_code & PF_USER) { -+ if (error_code & X86_PF_USER) { - /* - * It's possible to have interrupts off here: - */ -@@ -915,7 +895,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, - * Instruction fetch faults in the vsyscall page might need - * emulation. - */ -- if (unlikely((error_code & PF_INSTR) && -+ if (unlikely((error_code & X86_PF_INSTR) && - ((address & ~0xfff) == VSYSCALL_ADDR))) { - if (emulate_vsyscall(regs, address)) - return; -@@ -928,7 +908,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, - * are always protection faults. 
- */ - if (address >= TASK_SIZE_MAX) -- error_code |= PF_PROT; -+ error_code |= X86_PF_PROT; - - if (likely(show_unhandled_signals)) - show_signal_msg(regs, error_code, address, tsk); -@@ -989,11 +969,11 @@ static inline bool bad_area_access_from_pkeys(unsigned long error_code, - - if (!boot_cpu_has(X86_FEATURE_OSPKE)) - return false; -- if (error_code & PF_PK) -+ if (error_code & X86_PF_PK) - return true; - /* this checks permission keys on the VMA: */ -- if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE), -- (error_code & PF_INSTR), foreign)) -+ if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE), -+ (error_code & X86_PF_INSTR), foreign)) - return true; - return false; - } -@@ -1021,7 +1001,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, - int code = BUS_ADRERR; - - /* Kernel mode? Handle exceptions or die: */ -- if (!(error_code & PF_USER)) { -+ if (!(error_code & X86_PF_USER)) { - no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); - return; - } -@@ -1049,14 +1029,14 @@ static noinline void - mm_fault_error(struct pt_regs *regs, unsigned long error_code, - unsigned long address, u32 *pkey, unsigned int fault) - { -- if (fatal_signal_pending(current) && !(error_code & PF_USER)) { -+ if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) { - no_context(regs, error_code, address, 0, 0); - return; - } - - if (fault & VM_FAULT_OOM) { - /* Kernel mode? 
Handle exceptions or die: */ -- if (!(error_code & PF_USER)) { -+ if (!(error_code & X86_PF_USER)) { - no_context(regs, error_code, address, - SIGSEGV, SEGV_MAPERR); - return; -@@ -1081,16 +1061,16 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, - - static int spurious_fault_check(unsigned long error_code, pte_t *pte) - { -- if ((error_code & PF_WRITE) && !pte_write(*pte)) -+ if ((error_code & X86_PF_WRITE) && !pte_write(*pte)) - return 0; - -- if ((error_code & PF_INSTR) && !pte_exec(*pte)) -+ if ((error_code & X86_PF_INSTR) && !pte_exec(*pte)) - return 0; - /* - * Note: We do not do lazy flushing on protection key -- * changes, so no spurious fault will ever set PF_PK. -+ * changes, so no spurious fault will ever set X86_PF_PK. - */ -- if ((error_code & PF_PK)) -+ if ((error_code & X86_PF_PK)) - return 1; - - return 1; -@@ -1136,8 +1116,8 @@ spurious_fault(unsigned long error_code, unsigned long address) - * change, so user accesses are not expected to cause spurious - * faults. - */ -- if (error_code != (PF_WRITE | PF_PROT) -- && error_code != (PF_INSTR | PF_PROT)) -+ if (error_code != (X86_PF_WRITE | X86_PF_PROT) && -+ error_code != (X86_PF_INSTR | X86_PF_PROT)) - return 0; - - pgd = init_mm.pgd + pgd_index(address); -@@ -1197,19 +1177,19 @@ access_error(unsigned long error_code, struct vm_area_struct *vma) - * always an unconditional error and can never result in - * a follow-up action to resolve the fault, like a COW. - */ -- if (error_code & PF_PK) -+ if (error_code & X86_PF_PK) - return 1; - - /* - * Make sure to check the VMA so that we do not perform -- * faults just to hit a PF_PK as soon as we fill in a -+ * faults just to hit a X86_PF_PK as soon as we fill in a - * page. 
- */ -- if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE), -- (error_code & PF_INSTR), foreign)) -+ if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE), -+ (error_code & X86_PF_INSTR), foreign)) - return 1; - -- if (error_code & PF_WRITE) { -+ if (error_code & X86_PF_WRITE) { - /* write, present and write, not present: */ - if (unlikely(!(vma->vm_flags & VM_WRITE))) - return 1; -@@ -1217,7 +1197,7 @@ access_error(unsigned long error_code, struct vm_area_struct *vma) - } - - /* read, present: */ -- if (unlikely(error_code & PF_PROT)) -+ if (unlikely(error_code & X86_PF_PROT)) - return 1; - - /* read, not present: */ -@@ -1240,7 +1220,7 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs) - if (!static_cpu_has(X86_FEATURE_SMAP)) - return false; - -- if (error_code & PF_USER) -+ if (error_code & X86_PF_USER) - return false; - - if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC)) -@@ -1293,7 +1273,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, - * protection error (error_code & 9) == 0. 
- */ - if (unlikely(fault_in_kernel_space(address))) { -- if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) { -+ if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { - if (vmalloc_fault(address) >= 0) - return; - -@@ -1321,7 +1301,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, - if (unlikely(kprobes_fault(regs))) - return; - -- if (unlikely(error_code & PF_RSVD)) -+ if (unlikely(error_code & X86_PF_RSVD)) - pgtable_bad(regs, error_code, address); - - if (unlikely(smap_violation(error_code, regs))) { -@@ -1347,7 +1327,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, - */ - if (user_mode(regs)) { - local_irq_enable(); -- error_code |= PF_USER; -+ error_code |= X86_PF_USER; - flags |= FAULT_FLAG_USER; - } else { - if (regs->flags & X86_EFLAGS_IF) -@@ -1356,9 +1336,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, - - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); - -- if (error_code & PF_WRITE) -+ if (error_code & X86_PF_WRITE) - flags |= FAULT_FLAG_WRITE; -- if (error_code & PF_INSTR) -+ if (error_code & X86_PF_INSTR) - flags |= FAULT_FLAG_INSTRUCTION; - - /* -@@ -1378,7 +1358,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, - * space check, thus avoiding the deadlock: - */ - if (unlikely(!down_read_trylock(&mm->mmap_sem))) { -- if ((error_code & PF_USER) == 0 && -+ if (!(error_code & X86_PF_USER) && - !search_exception_tables(regs->ip)) { - bad_area_nosemaphore(regs, error_code, address, NULL); - return; -@@ -1405,7 +1385,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, - bad_area(regs, error_code, address); - return; - } -- if (error_code & PF_USER) { -+ if (error_code & X86_PF_USER) { - /* - * Accessing the stack below %sp is always a bug. 
- * The large cushion allows instructions like enter --- -2.14.2 - diff --git a/patches/kernel/0086-x86-boot-Relocate-definition-of-the-initial-state-of.patch b/patches/kernel/0086-x86-boot-Relocate-definition-of-the-initial-state-of.patch deleted file mode 100644 index 936d6b0..0000000 --- a/patches/kernel/0086-x86-boot-Relocate-definition-of-the-initial-state-of.patch +++ /dev/null @@ -1,103 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Ricardo Neri -Date: Fri, 27 Oct 2017 13:25:29 -0700 -Subject: [PATCH] x86/boot: Relocate definition of the initial state of CR0 -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Both head_32.S and head_64.S utilize the same value to initialize the -control register CR0. Also, other parts of the kernel might want to access -this initial definition (e.g., emulation code for User-Mode Instruction -Prevention uses this state to provide a sane dummy value for CR0 when -emulating the smsw instruction). Thus, relocate this definition to a -header file from which it can be conveniently accessed. - -Suggested-by: Borislav Petkov -Signed-off-by: Ricardo Neri -Signed-off-by: Thomas Gleixner -Reviewed-by: Borislav Petkov -Reviewed-by: Andy Lutomirski -Cc: "Michael S. Tsirkin" -Cc: Peter Zijlstra -Cc: Dave Hansen -Cc: ricardo.neri@intel.com -Cc: linux-mm@kvack.org -Cc: Paul Gortmaker -Cc: Huang Rui -Cc: Shuah Khan -Cc: linux-arch@vger.kernel.org -Cc: Jonathan Corbet -Cc: Jiri Slaby -Cc: "Ravi V. 
Shankar" -Cc: Denys Vlasenko -Cc: Chris Metcalf -Cc: Brian Gerst -Cc: Josh Poimboeuf -Cc: Chen Yucong -Cc: Vlastimil Babka -Cc: Dave Hansen -Cc: Andy Lutomirski -Cc: Masami Hiramatsu -Cc: Paolo Bonzini -Cc: Andrew Morton -Cc: Linus Torvalds -Link: https://lkml.kernel.org/r/1509135945-13762-3-git-send-email-ricardo.neri-calderon@linux.intel.com - -(cherry picked from commit b0ce5b8c95c83a7b98c679b117e3d6ae6f97154b) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 27c31a88c22edab269abe17c0ac7db0351d26c5f) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/uapi/asm/processor-flags.h | 3 +++ - arch/x86/kernel/head_32.S | 3 --- - arch/x86/kernel/head_64.S | 3 --- - 3 files changed, 3 insertions(+), 6 deletions(-) - -diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h -index 185f3d10c194..39946d0a1d41 100644 ---- a/arch/x86/include/uapi/asm/processor-flags.h -+++ b/arch/x86/include/uapi/asm/processor-flags.h -@@ -151,5 +151,8 @@ - #define CX86_ARR_BASE 0xc4 - #define CX86_RCR_BASE 0xdc - -+#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ -+ X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ -+ X86_CR0_PG) - - #endif /* _UAPI_ASM_X86_PROCESSOR_FLAGS_H */ -diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S -index 337a65377baf..7bbcdb1ea31a 100644 ---- a/arch/x86/kernel/head_32.S -+++ b/arch/x86/kernel/head_32.S -@@ -213,9 +213,6 @@ ENTRY(startup_32_smp) - #endif - - .Ldefault_entry: --#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ -- X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ -- X86_CR0_PG) - movl $(CR0_STATE & ~X86_CR0_PG),%eax - movl %eax,%cr0 - -diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S -index a2d8541b1da4..4117c1e0b3d2 100644 ---- a/arch/x86/kernel/head_64.S -+++ b/arch/x86/kernel/head_64.S -@@ -137,9 +137,6 @@ ENTRY(secondary_startup_64) - 1: wrmsr /* Make changes effective */ - - /* Setup cr0 */ --#define 
CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ -- X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ -- X86_CR0_PG) - movl $CR0_STATE, %eax - /* Make changes effective */ - movq %rax, %cr0 --- -2.14.2 - diff --git a/patches/kernel/0086-x86-mm-Relocate-page-fault-error-codes-to-traps.h.patch b/patches/kernel/0086-x86-mm-Relocate-page-fault-error-codes-to-traps.h.patch new file mode 100644 index 0000000..fad29e3 --- /dev/null +++ b/patches/kernel/0086-x86-mm-Relocate-page-fault-error-codes-to-traps.h.patch @@ -0,0 +1,363 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Fri, 27 Oct 2017 13:25:28 -0700 +Subject: [PATCH] x86/mm: Relocate page fault error codes to traps.h +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Up to this point, only fault.c used the definitions of the page fault error +codes. Thus, it made sense to keep them within such file. Other portions of +code might be interested in those definitions too. For instance, the User- +Mode Instruction Prevention emulation code will use such definitions to +emulate a page fault when it is unable to successfully copy the results +of the emulated instructions to user space. + +While relocating the error code enumeration, the prefix X86_ is used to +make it consistent with the rest of the definitions in traps.h. Of course, +code using the enumeration had to be updated as well. No functional changes +were performed. + +Signed-off-by: Ricardo Neri +Signed-off-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Reviewed-by: Andy Lutomirski +Cc: "Michael S. Tsirkin" +Cc: Peter Zijlstra +Cc: Dave Hansen +Cc: ricardo.neri@intel.com +Cc: Paul Gortmaker +Cc: Huang Rui +Cc: Shuah Khan +Cc: Jonathan Corbet +Cc: Jiri Slaby +Cc: "Ravi V. Shankar" +Cc: Chris Metcalf +Cc: Brian Gerst +Cc: Josh Poimboeuf +Cc: Chen Yucong +Cc: Vlastimil Babka +Cc: Masami Hiramatsu +Cc: Paolo Bonzini +Cc: Andrew Morton +Cc: "Kirill A. 
Shutemov" +Link: https://lkml.kernel.org/r/1509135945-13762-2-git-send-email-ricardo.neri-calderon@linux.intel.com + +(cherry picked from commit 1067f030994c69ca1fba8c607437c8895dcf8509) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit a85a07ab9111e3c78797c20b60a664dbd5db4981) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/traps.h | 18 +++++++++ + arch/x86/mm/fault.c | 88 +++++++++++++++++--------------------------- + 2 files changed, 52 insertions(+), 54 deletions(-) + +diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h +index feb89dbe359d..8e5bf86f87e5 100644 +--- a/arch/x86/include/asm/traps.h ++++ b/arch/x86/include/asm/traps.h +@@ -162,4 +162,22 @@ enum { + X86_TRAP_IRET = 32, /* 32, IRET Exception */ + }; + ++/* ++ * Page fault error code bits: ++ * ++ * bit 0 == 0: no page found 1: protection fault ++ * bit 1 == 0: read access 1: write access ++ * bit 2 == 0: kernel-mode access 1: user-mode access ++ * bit 3 == 1: use of reserved bit detected ++ * bit 4 == 1: fault was an instruction fetch ++ * bit 5 == 1: protection keys block access ++ */ ++enum x86_pf_error_code { ++ X86_PF_PROT = 1 << 0, ++ X86_PF_WRITE = 1 << 1, ++ X86_PF_USER = 1 << 2, ++ X86_PF_RSVD = 1 << 3, ++ X86_PF_INSTR = 1 << 4, ++ X86_PF_PK = 1 << 5, ++}; + #endif /* _ASM_X86_TRAPS_H */ +diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c +index 4ee9eb916826..d3a57e7ad311 100644 +--- a/arch/x86/mm/fault.c ++++ b/arch/x86/mm/fault.c +@@ -28,26 +28,6 @@ + #define CREATE_TRACE_POINTS + #include + +-/* +- * Page fault error code bits: +- * +- * bit 0 == 0: no page found 1: protection fault +- * bit 1 == 0: read access 1: write access +- * bit 2 == 0: kernel-mode access 1: user-mode access +- * bit 3 == 1: use of reserved bit detected +- * bit 4 == 1: fault was an instruction fetch +- * bit 5 == 1: protection keys block access +- */ +-enum x86_pf_error_code { +- +- PF_PROT = 1 << 0, +- PF_WRITE = 1 << 1, +- 
PF_USER = 1 << 2, +- PF_RSVD = 1 << 3, +- PF_INSTR = 1 << 4, +- PF_PK = 1 << 5, +-}; +- + /* + * Returns 0 if mmiotrace is disabled, or if the fault is not + * handled by mmiotrace: +@@ -149,7 +129,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) + * If it was a exec (instruction fetch) fault on NX page, then + * do not ignore the fault: + */ +- if (error_code & PF_INSTR) ++ if (error_code & X86_PF_INSTR) + return 0; + + instr = (void *)convert_ip_to_linear(current, regs); +@@ -179,7 +159,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) + * siginfo so userspace can discover which protection key was set + * on the PTE. + * +- * If we get here, we know that the hardware signaled a PF_PK ++ * If we get here, we know that the hardware signaled a X86_PF_PK + * fault and that there was a VMA once we got in the fault + * handler. It does *not* guarantee that the VMA we find here + * was the one that we faulted on. +@@ -204,7 +184,7 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info, u32 *pkey) + /* + * force_sig_info_fault() is called from a number of + * contexts, some of which have a VMA and some of which +- * do not. The PF_PK handing happens after we have a ++ * do not. The X86_PF_PK handing happens after we have a + * valid VMA, so we should never reach this without a + * valid VMA. + */ +@@ -693,7 +673,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, + if (!oops_may_print()) + return; + +- if (error_code & PF_INSTR) { ++ if (error_code & X86_PF_INSTR) { + unsigned int level; + pgd_t *pgd; + pte_t *pte; +@@ -775,7 +755,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, + */ + if (current->thread.sig_on_uaccess_err && signal) { + tsk->thread.trap_nr = X86_TRAP_PF; +- tsk->thread.error_code = error_code | PF_USER; ++ tsk->thread.error_code = error_code | X86_PF_USER; + tsk->thread.cr2 = address; + + /* XXX: hwpoison faults will set the wrong code. 
*/ +@@ -894,7 +874,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, + struct task_struct *tsk = current; + + /* User mode accesses just cause a SIGSEGV */ +- if (error_code & PF_USER) { ++ if (error_code & X86_PF_USER) { + /* + * It's possible to have interrupts off here: + */ +@@ -915,7 +895,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, + * Instruction fetch faults in the vsyscall page might need + * emulation. + */ +- if (unlikely((error_code & PF_INSTR) && ++ if (unlikely((error_code & X86_PF_INSTR) && + ((address & ~0xfff) == VSYSCALL_ADDR))) { + if (emulate_vsyscall(regs, address)) + return; +@@ -928,7 +908,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, + * are always protection faults. + */ + if (address >= TASK_SIZE_MAX) +- error_code |= PF_PROT; ++ error_code |= X86_PF_PROT; + + if (likely(show_unhandled_signals)) + show_signal_msg(regs, error_code, address, tsk); +@@ -989,11 +969,11 @@ static inline bool bad_area_access_from_pkeys(unsigned long error_code, + + if (!boot_cpu_has(X86_FEATURE_OSPKE)) + return false; +- if (error_code & PF_PK) ++ if (error_code & X86_PF_PK) + return true; + /* this checks permission keys on the VMA: */ +- if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE), +- (error_code & PF_INSTR), foreign)) ++ if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE), ++ (error_code & X86_PF_INSTR), foreign)) + return true; + return false; + } +@@ -1021,7 +1001,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, + int code = BUS_ADRERR; + + /* Kernel mode? 
Handle exceptions or die: */ +- if (!(error_code & PF_USER)) { ++ if (!(error_code & X86_PF_USER)) { + no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); + return; + } +@@ -1049,14 +1029,14 @@ static noinline void + mm_fault_error(struct pt_regs *regs, unsigned long error_code, + unsigned long address, u32 *pkey, unsigned int fault) + { +- if (fatal_signal_pending(current) && !(error_code & PF_USER)) { ++ if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) { + no_context(regs, error_code, address, 0, 0); + return; + } + + if (fault & VM_FAULT_OOM) { + /* Kernel mode? Handle exceptions or die: */ +- if (!(error_code & PF_USER)) { ++ if (!(error_code & X86_PF_USER)) { + no_context(regs, error_code, address, + SIGSEGV, SEGV_MAPERR); + return; +@@ -1081,16 +1061,16 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, + + static int spurious_fault_check(unsigned long error_code, pte_t *pte) + { +- if ((error_code & PF_WRITE) && !pte_write(*pte)) ++ if ((error_code & X86_PF_WRITE) && !pte_write(*pte)) + return 0; + +- if ((error_code & PF_INSTR) && !pte_exec(*pte)) ++ if ((error_code & X86_PF_INSTR) && !pte_exec(*pte)) + return 0; + /* + * Note: We do not do lazy flushing on protection key +- * changes, so no spurious fault will ever set PF_PK. ++ * changes, so no spurious fault will ever set X86_PF_PK. + */ +- if ((error_code & PF_PK)) ++ if ((error_code & X86_PF_PK)) + return 1; + + return 1; +@@ -1136,8 +1116,8 @@ spurious_fault(unsigned long error_code, unsigned long address) + * change, so user accesses are not expected to cause spurious + * faults. 
+ */ +- if (error_code != (PF_WRITE | PF_PROT) +- && error_code != (PF_INSTR | PF_PROT)) ++ if (error_code != (X86_PF_WRITE | X86_PF_PROT) && ++ error_code != (X86_PF_INSTR | X86_PF_PROT)) + return 0; + + pgd = init_mm.pgd + pgd_index(address); +@@ -1197,19 +1177,19 @@ access_error(unsigned long error_code, struct vm_area_struct *vma) + * always an unconditional error and can never result in + * a follow-up action to resolve the fault, like a COW. + */ +- if (error_code & PF_PK) ++ if (error_code & X86_PF_PK) + return 1; + + /* + * Make sure to check the VMA so that we do not perform +- * faults just to hit a PF_PK as soon as we fill in a ++ * faults just to hit a X86_PF_PK as soon as we fill in a + * page. + */ +- if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE), +- (error_code & PF_INSTR), foreign)) ++ if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE), ++ (error_code & X86_PF_INSTR), foreign)) + return 1; + +- if (error_code & PF_WRITE) { ++ if (error_code & X86_PF_WRITE) { + /* write, present and write, not present: */ + if (unlikely(!(vma->vm_flags & VM_WRITE))) + return 1; +@@ -1217,7 +1197,7 @@ access_error(unsigned long error_code, struct vm_area_struct *vma) + } + + /* read, present: */ +- if (unlikely(error_code & PF_PROT)) ++ if (unlikely(error_code & X86_PF_PROT)) + return 1; + + /* read, not present: */ +@@ -1240,7 +1220,7 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs) + if (!static_cpu_has(X86_FEATURE_SMAP)) + return false; + +- if (error_code & PF_USER) ++ if (error_code & X86_PF_USER) + return false; + + if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC)) +@@ -1293,7 +1273,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, + * protection error (error_code & 9) == 0. 
+ */ + if (unlikely(fault_in_kernel_space(address))) { +- if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) { ++ if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { + if (vmalloc_fault(address) >= 0) + return; + +@@ -1321,7 +1301,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, + if (unlikely(kprobes_fault(regs))) + return; + +- if (unlikely(error_code & PF_RSVD)) ++ if (unlikely(error_code & X86_PF_RSVD)) + pgtable_bad(regs, error_code, address); + + if (unlikely(smap_violation(error_code, regs))) { +@@ -1347,7 +1327,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, + */ + if (user_mode(regs)) { + local_irq_enable(); +- error_code |= PF_USER; ++ error_code |= X86_PF_USER; + flags |= FAULT_FLAG_USER; + } else { + if (regs->flags & X86_EFLAGS_IF) +@@ -1356,9 +1336,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + +- if (error_code & PF_WRITE) ++ if (error_code & X86_PF_WRITE) + flags |= FAULT_FLAG_WRITE; +- if (error_code & PF_INSTR) ++ if (error_code & X86_PF_INSTR) + flags |= FAULT_FLAG_INSTRUCTION; + + /* +@@ -1378,7 +1358,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, + * space check, thus avoiding the deadlock: + */ + if (unlikely(!down_read_trylock(&mm->mmap_sem))) { +- if ((error_code & PF_USER) == 0 && ++ if (!(error_code & X86_PF_USER) && + !search_exception_tables(regs->ip)) { + bad_area_nosemaphore(regs, error_code, address, NULL); + return; +@@ -1405,7 +1385,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, + bad_area(regs, error_code, address); + return; + } +- if (error_code & PF_USER) { ++ if (error_code & X86_PF_USER) { + /* + * Accessing the stack below %sp is always a bug. 
+ * The large cushion allows instructions like enter +-- +2.14.2 + diff --git a/patches/kernel/0087-ptrace-x86-Make-user_64bit_mode-available-to-32-bit-.patch b/patches/kernel/0087-ptrace-x86-Make-user_64bit_mode-available-to-32-bit-.patch deleted file mode 100644 index 65e6b7c..0000000 --- a/patches/kernel/0087-ptrace-x86-Make-user_64bit_mode-available-to-32-bit-.patch +++ /dev/null @@ -1,92 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Ricardo Neri -Date: Fri, 27 Oct 2017 13:25:30 -0700 -Subject: [PATCH] ptrace,x86: Make user_64bit_mode() available to 32-bit builds -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -In its current form, user_64bit_mode() can only be used when CONFIG_X86_64 -is selected. This implies that code built with CONFIG_X86_64=n cannot use -it. If a piece of code needs to be built for both CONFIG_X86_64=y and -CONFIG_X86_64=n and wants to use this function, it needs to wrap it in -an #ifdef/#endif; potentially, in multiple places. - -This can be easily avoided with a single #ifdef/#endif pair within -user_64bit_mode() itself. - -Suggested-by: Borislav Petkov -Signed-off-by: Ricardo Neri -Signed-off-by: Thomas Gleixner -Reviewed-by: Borislav Petkov -Cc: "Michael S. Tsirkin" -Cc: Peter Zijlstra -Cc: Dave Hansen -Cc: ricardo.neri@intel.com -Cc: Adrian Hunter -Cc: Paul Gortmaker -Cc: Huang Rui -Cc: Qiaowei Ren -Cc: Shuah Khan -Cc: Kees Cook -Cc: Jonathan Corbet -Cc: Jiri Slaby -Cc: Dmitry Vyukov -Cc: "Ravi V. 
Shankar" -Cc: Chris Metcalf -Cc: Brian Gerst -Cc: Arnaldo Carvalho de Melo -Cc: Andy Lutomirski -Cc: Colin Ian King -Cc: Chen Yucong -Cc: Adam Buchbinder -Cc: Vlastimil Babka -Cc: Lorenzo Stoakes -Cc: Masami Hiramatsu -Cc: Paolo Bonzini -Cc: Andrew Morton -Cc: Thomas Garnier -Link: https://lkml.kernel.org/r/1509135945-13762-4-git-send-email-ricardo.neri-calderon@linux.intel.com - -(cherry picked from commit e27c310af5c05cf876d9cad006928076c27f54d4) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 20ddf08f867d3d96788299cd2fb7676590d64250) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/ptrace.h | 6 +++++- - 1 file changed, 5 insertions(+), 1 deletion(-) - -diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h -index 2b5d686ea9f3..ea78a8438a8a 100644 ---- a/arch/x86/include/asm/ptrace.h -+++ b/arch/x86/include/asm/ptrace.h -@@ -115,9 +115,9 @@ static inline int v8086_mode(struct pt_regs *regs) - #endif - } - --#ifdef CONFIG_X86_64 - static inline bool user_64bit_mode(struct pt_regs *regs) - { -+#ifdef CONFIG_X86_64 - #ifndef CONFIG_PARAVIRT - /* - * On non-paravirt systems, this is the only long mode CPL 3 -@@ -128,8 +128,12 @@ static inline bool user_64bit_mode(struct pt_regs *regs) - /* Headers are too twisted for this to go in paravirt.h. 
*/ - return regs->cs == __USER_CS || regs->cs == pv_info.extra_user_64bit_cs; - #endif -+#else /* !CONFIG_X86_64 */ -+ return false; -+#endif - } - -+#ifdef CONFIG_X86_64 - #define current_user_stack_pointer() current_pt_regs()->sp - #define compat_user_stack_pointer() current_pt_regs()->sp - #endif --- -2.14.2 - diff --git a/patches/kernel/0087-x86-boot-Relocate-definition-of-the-initial-state-of.patch b/patches/kernel/0087-x86-boot-Relocate-definition-of-the-initial-state-of.patch new file mode 100644 index 0000000..936d6b0 --- /dev/null +++ b/patches/kernel/0087-x86-boot-Relocate-definition-of-the-initial-state-of.patch @@ -0,0 +1,103 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Fri, 27 Oct 2017 13:25:29 -0700 +Subject: [PATCH] x86/boot: Relocate definition of the initial state of CR0 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Both head_32.S and head_64.S utilize the same value to initialize the +control register CR0. Also, other parts of the kernel might want to access +this initial definition (e.g., emulation code for User-Mode Instruction +Prevention uses this state to provide a sane dummy value for CR0 when +emulating the smsw instruction). Thus, relocate this definition to a +header file from which it can be conveniently accessed. + +Suggested-by: Borislav Petkov +Signed-off-by: Ricardo Neri +Signed-off-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Reviewed-by: Andy Lutomirski +Cc: "Michael S. Tsirkin" +Cc: Peter Zijlstra +Cc: Dave Hansen +Cc: ricardo.neri@intel.com +Cc: linux-mm@kvack.org +Cc: Paul Gortmaker +Cc: Huang Rui +Cc: Shuah Khan +Cc: linux-arch@vger.kernel.org +Cc: Jonathan Corbet +Cc: Jiri Slaby +Cc: "Ravi V. 
Shankar" +Cc: Denys Vlasenko +Cc: Chris Metcalf +Cc: Brian Gerst +Cc: Josh Poimboeuf +Cc: Chen Yucong +Cc: Vlastimil Babka +Cc: Dave Hansen +Cc: Andy Lutomirski +Cc: Masami Hiramatsu +Cc: Paolo Bonzini +Cc: Andrew Morton +Cc: Linus Torvalds +Link: https://lkml.kernel.org/r/1509135945-13762-3-git-send-email-ricardo.neri-calderon@linux.intel.com + +(cherry picked from commit b0ce5b8c95c83a7b98c679b117e3d6ae6f97154b) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 27c31a88c22edab269abe17c0ac7db0351d26c5f) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/uapi/asm/processor-flags.h | 3 +++ + arch/x86/kernel/head_32.S | 3 --- + arch/x86/kernel/head_64.S | 3 --- + 3 files changed, 3 insertions(+), 6 deletions(-) + +diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h +index 185f3d10c194..39946d0a1d41 100644 +--- a/arch/x86/include/uapi/asm/processor-flags.h ++++ b/arch/x86/include/uapi/asm/processor-flags.h +@@ -151,5 +151,8 @@ + #define CX86_ARR_BASE 0xc4 + #define CX86_RCR_BASE 0xdc + ++#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ ++ X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ ++ X86_CR0_PG) + + #endif /* _UAPI_ASM_X86_PROCESSOR_FLAGS_H */ +diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S +index 337a65377baf..7bbcdb1ea31a 100644 +--- a/arch/x86/kernel/head_32.S ++++ b/arch/x86/kernel/head_32.S +@@ -213,9 +213,6 @@ ENTRY(startup_32_smp) + #endif + + .Ldefault_entry: +-#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ +- X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ +- X86_CR0_PG) + movl $(CR0_STATE & ~X86_CR0_PG),%eax + movl %eax,%cr0 + +diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S +index a2d8541b1da4..4117c1e0b3d2 100644 +--- a/arch/x86/kernel/head_64.S ++++ b/arch/x86/kernel/head_64.S +@@ -137,9 +137,6 @@ ENTRY(secondary_startup_64) + 1: wrmsr /* Make changes effective */ + + /* Setup cr0 */ +-#define 
CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ +- X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ +- X86_CR0_PG) + movl $CR0_STATE, %eax + /* Make changes effective */ + movq %rax, %cr0 +-- +2.14.2 + diff --git a/patches/kernel/0088-ptrace-x86-Make-user_64bit_mode-available-to-32-bit-.patch b/patches/kernel/0088-ptrace-x86-Make-user_64bit_mode-available-to-32-bit-.patch new file mode 100644 index 0000000..65e6b7c --- /dev/null +++ b/patches/kernel/0088-ptrace-x86-Make-user_64bit_mode-available-to-32-bit-.patch @@ -0,0 +1,92 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Fri, 27 Oct 2017 13:25:30 -0700 +Subject: [PATCH] ptrace,x86: Make user_64bit_mode() available to 32-bit builds +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +In its current form, user_64bit_mode() can only be used when CONFIG_X86_64 +is selected. This implies that code built with CONFIG_X86_64=n cannot use +it. If a piece of code needs to be built for both CONFIG_X86_64=y and +CONFIG_X86_64=n and wants to use this function, it needs to wrap it in +an #ifdef/#endif; potentially, in multiple places. + +This can be easily avoided with a single #ifdef/#endif pair within +user_64bit_mode() itself. + +Suggested-by: Borislav Petkov +Signed-off-by: Ricardo Neri +Signed-off-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Cc: "Michael S. Tsirkin" +Cc: Peter Zijlstra +Cc: Dave Hansen +Cc: ricardo.neri@intel.com +Cc: Adrian Hunter +Cc: Paul Gortmaker +Cc: Huang Rui +Cc: Qiaowei Ren +Cc: Shuah Khan +Cc: Kees Cook +Cc: Jonathan Corbet +Cc: Jiri Slaby +Cc: Dmitry Vyukov +Cc: "Ravi V. 
Shankar" +Cc: Chris Metcalf +Cc: Brian Gerst +Cc: Arnaldo Carvalho de Melo +Cc: Andy Lutomirski +Cc: Colin Ian King +Cc: Chen Yucong +Cc: Adam Buchbinder +Cc: Vlastimil Babka +Cc: Lorenzo Stoakes +Cc: Masami Hiramatsu +Cc: Paolo Bonzini +Cc: Andrew Morton +Cc: Thomas Garnier +Link: https://lkml.kernel.org/r/1509135945-13762-4-git-send-email-ricardo.neri-calderon@linux.intel.com + +(cherry picked from commit e27c310af5c05cf876d9cad006928076c27f54d4) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 20ddf08f867d3d96788299cd2fb7676590d64250) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/ptrace.h | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h +index 2b5d686ea9f3..ea78a8438a8a 100644 +--- a/arch/x86/include/asm/ptrace.h ++++ b/arch/x86/include/asm/ptrace.h +@@ -115,9 +115,9 @@ static inline int v8086_mode(struct pt_regs *regs) + #endif + } + +-#ifdef CONFIG_X86_64 + static inline bool user_64bit_mode(struct pt_regs *regs) + { ++#ifdef CONFIG_X86_64 + #ifndef CONFIG_PARAVIRT + /* + * On non-paravirt systems, this is the only long mode CPL 3 +@@ -128,8 +128,12 @@ static inline bool user_64bit_mode(struct pt_regs *regs) + /* Headers are too twisted for this to go in paravirt.h. 
*/ + return regs->cs == __USER_CS || regs->cs == pv_info.extra_user_64bit_cs; + #endif ++#else /* !CONFIG_X86_64 */ ++ return false; ++#endif + } + ++#ifdef CONFIG_X86_64 + #define current_user_stack_pointer() current_pt_regs()->sp + #define compat_user_stack_pointer() current_pt_regs()->sp + #endif +-- +2.14.2 + diff --git a/patches/kernel/0088-x86-entry-64-Remove-the-restore_c_regs_and_iret-labe.patch b/patches/kernel/0088-x86-entry-64-Remove-the-restore_c_regs_and_iret-labe.patch deleted file mode 100644 index 436f7da..0000000 --- a/patches/kernel/0088-x86-entry-64-Remove-the-restore_c_regs_and_iret-labe.patch +++ /dev/null @@ -1,74 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Thu, 2 Nov 2017 00:58:58 -0700 -Subject: [PATCH] x86/entry/64: Remove the restore_c_regs_and_iret label -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -The only user was the 64-bit opportunistic SYSRET failure path, and -that path didn't really need it. This change makes the -opportunistic SYSRET code a bit more straightforward and gets rid of -the label. 
- -Signed-off-by: Andy Lutomirski -Reviewed-by: Borislav Petkov -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/be3006a7ad3326e3458cf1cc55d416252cbe1986.1509609304.git.luto@kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit 9da78ba6b47b46428cfdfc0851511ab29c869798) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 629c8b858cbe72e88e7f44a8f10e1b434ab80721) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/entry/entry_64.S | 5 ++--- - 1 file changed, 2 insertions(+), 3 deletions(-) - -diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S -index 34adfe0221d2..fac354ddf056 100644 ---- a/arch/x86/entry/entry_64.S -+++ b/arch/x86/entry/entry_64.S -@@ -245,7 +245,6 @@ entry_SYSCALL64_slow_path: - call do_syscall_64 /* returns with IRQs disabled */ - - return_from_SYSCALL_64: -- RESTORE_EXTRA_REGS - TRACE_IRQS_IRETQ /* we're about to change IF */ - - /* -@@ -314,6 +313,7 @@ return_from_SYSCALL_64: - */ - syscall_return_via_sysret: - /* rcx and r11 are already restored (see code above) */ -+ RESTORE_EXTRA_REGS - RESTORE_C_REGS_EXCEPT_RCX_R11 - movq RSP(%rsp), %rsp - UNWIND_HINT_EMPTY -@@ -321,7 +321,7 @@ syscall_return_via_sysret: - - opportunistic_sysret_failed: - SWAPGS -- jmp restore_c_regs_and_iret -+ jmp restore_regs_and_iret - END(entry_SYSCALL_64) - - ENTRY(stub_ptregs_64) -@@ -638,7 +638,6 @@ retint_kernel: - */ - GLOBAL(restore_regs_and_iret) - RESTORE_EXTRA_REGS --restore_c_regs_and_iret: - RESTORE_C_REGS - REMOVE_PT_GPREGS_FROM_STACK 8 - INTERRUPT_RETURN --- -2.14.2 - diff --git a/patches/kernel/0089-x86-entry-64-Remove-the-restore_c_regs_and_iret-labe.patch b/patches/kernel/0089-x86-entry-64-Remove-the-restore_c_regs_and_iret-labe.patch new file mode 100644 index 0000000..436f7da --- /dev/null +++ b/patches/kernel/0089-x86-entry-64-Remove-the-restore_c_regs_and_iret-labe.patch @@ -0,0 
+1,74 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Thu, 2 Nov 2017 00:58:58 -0700 +Subject: [PATCH] x86/entry/64: Remove the restore_c_regs_and_iret label +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +The only user was the 64-bit opportunistic SYSRET failure path, and +that path didn't really need it. This change makes the +opportunistic SYSRET code a bit more straightforward and gets rid of +the label. + +Signed-off-by: Andy Lutomirski +Reviewed-by: Borislav Petkov +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/be3006a7ad3326e3458cf1cc55d416252cbe1986.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit 9da78ba6b47b46428cfdfc0851511ab29c869798) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 629c8b858cbe72e88e7f44a8f10e1b434ab80721) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/entry/entry_64.S | 5 ++--- + 1 file changed, 2 insertions(+), 3 deletions(-) + +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index 34adfe0221d2..fac354ddf056 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -245,7 +245,6 @@ entry_SYSCALL64_slow_path: + call do_syscall_64 /* returns with IRQs disabled */ + + return_from_SYSCALL_64: +- RESTORE_EXTRA_REGS + TRACE_IRQS_IRETQ /* we're about to change IF */ + + /* +@@ -314,6 +313,7 @@ return_from_SYSCALL_64: + */ + syscall_return_via_sysret: + /* rcx and r11 are already restored (see code above) */ ++ RESTORE_EXTRA_REGS + RESTORE_C_REGS_EXCEPT_RCX_R11 + movq RSP(%rsp), %rsp + UNWIND_HINT_EMPTY +@@ -321,7 +321,7 @@ syscall_return_via_sysret: + + opportunistic_sysret_failed: + SWAPGS +- jmp restore_c_regs_and_iret ++ jmp restore_regs_and_iret + END(entry_SYSCALL_64) + + 
ENTRY(stub_ptregs_64) +@@ -638,7 +638,6 @@ retint_kernel: + */ + GLOBAL(restore_regs_and_iret) + RESTORE_EXTRA_REGS +-restore_c_regs_and_iret: + RESTORE_C_REGS + REMOVE_PT_GPREGS_FROM_STACK 8 + INTERRUPT_RETURN +-- +2.14.2 + diff --git a/patches/kernel/0089-x86-entry-64-Split-the-IRET-to-user-and-IRET-to-kern.patch b/patches/kernel/0089-x86-entry-64-Split-the-IRET-to-user-and-IRET-to-kern.patch deleted file mode 100644 index 960c7be..0000000 --- a/patches/kernel/0089-x86-entry-64-Split-the-IRET-to-user-and-IRET-to-kern.patch +++ /dev/null @@ -1,134 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Thu, 2 Nov 2017 00:58:59 -0700 -Subject: [PATCH] x86/entry/64: Split the IRET-to-user and IRET-to-kernel paths -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -These code paths will diverge soon. - -Signed-off-by: Andy Lutomirski -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/dccf8c7b3750199b4b30383c812d4e2931811509.1509609304.git.luto@kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit 26c4ef9c49d8a0341f6d97ce2cfdd55d1236ed29) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 64adfba0aeb668304d171c383ac80b22158ec128) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/entry/entry_64.S | 34 +++++++++++++++++++++++++--------- - arch/x86/entry/entry_64_compat.S | 2 +- - arch/x86/kernel/head_64.S | 2 +- - 3 files changed, 27 insertions(+), 11 deletions(-) - -diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S -index fac354ddf056..e546441fbec3 100644 ---- a/arch/x86/entry/entry_64.S -+++ b/arch/x86/entry/entry_64.S -@@ -321,7 +321,7 @@ syscall_return_via_sysret: - - opportunistic_sysret_failed: - SWAPGS -- jmp restore_regs_and_iret -+ jmp restore_regs_and_return_to_usermode - 
END(entry_SYSCALL_64) - - ENTRY(stub_ptregs_64) -@@ -423,7 +423,7 @@ ENTRY(ret_from_fork) - call syscall_return_slowpath /* returns with IRQs disabled */ - TRACE_IRQS_ON /* user mode is traced as IRQS on */ - SWAPGS -- jmp restore_regs_and_iret -+ jmp restore_regs_and_return_to_usermode - - 1: - /* kernel thread */ -@@ -612,7 +612,20 @@ GLOBAL(retint_user) - call prepare_exit_to_usermode - TRACE_IRQS_IRETQ - SWAPGS -- jmp restore_regs_and_iret -+ -+GLOBAL(restore_regs_and_return_to_usermode) -+#ifdef CONFIG_DEBUG_ENTRY -+ /* Assert that pt_regs indicates user mode. */ -+ testl $3, CS(%rsp) -+ jnz 1f -+ ud2 -+1: -+#endif -+ RESTORE_EXTRA_REGS -+ RESTORE_C_REGS -+ REMOVE_PT_GPREGS_FROM_STACK 8 -+ INTERRUPT_RETURN -+ - - /* Returning to kernel space */ - retint_kernel: -@@ -632,11 +645,14 @@ retint_kernel: - */ - TRACE_IRQS_IRETQ - --/* -- * At this label, code paths which return to kernel and to user, -- * which come from interrupts/exception and from syscalls, merge. -- */ --GLOBAL(restore_regs_and_iret) -+GLOBAL(restore_regs_and_return_to_kernel) -+#ifdef CONFIG_DEBUG_ENTRY -+ /* Assert that pt_regs indicates kernel mode. */ -+ testl $3, CS(%rsp) -+ jz 1f -+ ud2 -+1: -+#endif - RESTORE_EXTRA_REGS - RESTORE_C_REGS - REMOVE_PT_GPREGS_FROM_STACK 8 -@@ -1340,7 +1356,7 @@ ENTRY(nmi) - * work, because we don't want to enable interrupts. - */ - SWAPGS -- jmp restore_regs_and_iret -+ jmp restore_regs_and_return_to_usermode - - .Lnmi_from_kernel: - /* -diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S -index d8468ba24be0..2b3a88feaa2b 100644 ---- a/arch/x86/entry/entry_64_compat.S -+++ b/arch/x86/entry/entry_64_compat.S -@@ -337,7 +337,7 @@ ENTRY(entry_INT80_compat) - /* Go back to user mode. 
*/ - TRACE_IRQS_ON - SWAPGS -- jmp restore_regs_and_iret -+ jmp restore_regs_and_return_to_usermode - END(entry_INT80_compat) - - ALIGN -diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S -index 4117c1e0b3d2..e785734980ad 100644 ---- a/arch/x86/kernel/head_64.S -+++ b/arch/x86/kernel/head_64.S -@@ -311,7 +311,7 @@ early_idt_handler_common: - - 20: - decl early_recursion_flag(%rip) -- jmp restore_regs_and_iret -+ jmp restore_regs_and_return_to_kernel - END(early_idt_handler_common) - - __INITDATA --- -2.14.2 - diff --git a/patches/kernel/0090-x86-entry-64-Move-SWAPGS-into-the-common-IRET-to-use.patch b/patches/kernel/0090-x86-entry-64-Move-SWAPGS-into-the-common-IRET-to-use.patch deleted file mode 100644 index 81edf0f..0000000 --- a/patches/kernel/0090-x86-entry-64-Move-SWAPGS-into-the-common-IRET-to-use.patch +++ /dev/null @@ -1,156 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Thu, 2 Nov 2017 00:59:00 -0700 -Subject: [PATCH] x86/entry/64: Move SWAPGS into the common IRET-to-usermode - path -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -All of the code paths that ended up doing IRET to usermode did -SWAPGS immediately beforehand. Move the SWAPGS into the common -code. 
- -Signed-off-by: Andy Lutomirski -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/27fd6f45b7cd640de38fb9066fd0349bcd11f8e1.1509609304.git.luto@kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit 8a055d7f411d41755ce30db5bb65b154777c4b78) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 62a85594f9be3baeb2495089f1c2980bc497d03b) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/entry/entry_64.S | 32 ++++++++++++++------------------ - arch/x86/entry/entry_64_compat.S | 3 +-- - 2 files changed, 15 insertions(+), 20 deletions(-) - -diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S -index e546441fbec3..7c8258e3ad2d 100644 ---- a/arch/x86/entry/entry_64.S -+++ b/arch/x86/entry/entry_64.S -@@ -249,12 +249,14 @@ return_from_SYSCALL_64: - - /* - * Try to use SYSRET instead of IRET if we're returning to -- * a completely clean 64-bit userspace context. -+ * a completely clean 64-bit userspace context. If we're not, -+ * go to the slow exit path. 
- */ - movq RCX(%rsp), %rcx - movq RIP(%rsp), %r11 -- cmpq %rcx, %r11 /* RCX == RIP */ -- jne opportunistic_sysret_failed -+ -+ cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */ -+ jne swapgs_restore_regs_and_return_to_usermode - - /* - * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP -@@ -272,14 +274,14 @@ return_from_SYSCALL_64: - - /* If this changed %rcx, it was not canonical */ - cmpq %rcx, %r11 -- jne opportunistic_sysret_failed -+ jne swapgs_restore_regs_and_return_to_usermode - - cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */ -- jne opportunistic_sysret_failed -+ jne swapgs_restore_regs_and_return_to_usermode - - movq R11(%rsp), %r11 - cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */ -- jne opportunistic_sysret_failed -+ jne swapgs_restore_regs_and_return_to_usermode - - /* - * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot -@@ -300,12 +302,12 @@ return_from_SYSCALL_64: - * would never get past 'stuck_here'. - */ - testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 -- jnz opportunistic_sysret_failed -+ jnz swapgs_restore_regs_and_return_to_usermode - - /* nothing to check for RSP */ - - cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */ -- jne opportunistic_sysret_failed -+ jne swapgs_restore_regs_and_return_to_usermode - - /* - * We win! 
This label is here just for ease of understanding -@@ -318,10 +320,6 @@ syscall_return_via_sysret: - movq RSP(%rsp), %rsp - UNWIND_HINT_EMPTY - USERGS_SYSRET64 -- --opportunistic_sysret_failed: -- SWAPGS -- jmp restore_regs_and_return_to_usermode - END(entry_SYSCALL_64) - - ENTRY(stub_ptregs_64) -@@ -422,8 +420,7 @@ ENTRY(ret_from_fork) - movq %rsp, %rdi - call syscall_return_slowpath /* returns with IRQs disabled */ - TRACE_IRQS_ON /* user mode is traced as IRQS on */ -- SWAPGS -- jmp restore_regs_and_return_to_usermode -+ jmp swapgs_restore_regs_and_return_to_usermode - - 1: - /* kernel thread */ -@@ -611,9 +608,8 @@ GLOBAL(retint_user) - mov %rsp,%rdi - call prepare_exit_to_usermode - TRACE_IRQS_IRETQ -- SWAPGS - --GLOBAL(restore_regs_and_return_to_usermode) -+GLOBAL(swapgs_restore_regs_and_return_to_usermode) - #ifdef CONFIG_DEBUG_ENTRY - /* Assert that pt_regs indicates user mode. */ - testl $3, CS(%rsp) -@@ -621,6 +617,7 @@ GLOBAL(restore_regs_and_return_to_usermode) - ud2 - 1: - #endif -+ SWAPGS - RESTORE_EXTRA_REGS - RESTORE_C_REGS - REMOVE_PT_GPREGS_FROM_STACK 8 -@@ -1355,8 +1352,7 @@ ENTRY(nmi) - * Return back to user mode. We must *not* do the normal exit - * work, because we don't want to enable interrupts. - */ -- SWAPGS -- jmp restore_regs_and_return_to_usermode -+ jmp swapgs_restore_regs_and_return_to_usermode - - .Lnmi_from_kernel: - /* -diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S -index 2b3a88feaa2b..be745b7a3e3e 100644 ---- a/arch/x86/entry/entry_64_compat.S -+++ b/arch/x86/entry/entry_64_compat.S -@@ -336,8 +336,7 @@ ENTRY(entry_INT80_compat) - - /* Go back to user mode. 
*/ - TRACE_IRQS_ON -- SWAPGS -- jmp restore_regs_and_return_to_usermode -+ jmp swapgs_restore_regs_and_return_to_usermode - END(entry_INT80_compat) - - ALIGN --- -2.14.2 - diff --git a/patches/kernel/0090-x86-entry-64-Split-the-IRET-to-user-and-IRET-to-kern.patch b/patches/kernel/0090-x86-entry-64-Split-the-IRET-to-user-and-IRET-to-kern.patch new file mode 100644 index 0000000..960c7be --- /dev/null +++ b/patches/kernel/0090-x86-entry-64-Split-the-IRET-to-user-and-IRET-to-kern.patch @@ -0,0 +1,134 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Thu, 2 Nov 2017 00:58:59 -0700 +Subject: [PATCH] x86/entry/64: Split the IRET-to-user and IRET-to-kernel paths +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +These code paths will diverge soon. + +Signed-off-by: Andy Lutomirski +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/dccf8c7b3750199b4b30383c812d4e2931811509.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit 26c4ef9c49d8a0341f6d97ce2cfdd55d1236ed29) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 64adfba0aeb668304d171c383ac80b22158ec128) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/entry/entry_64.S | 34 +++++++++++++++++++++++++--------- + arch/x86/entry/entry_64_compat.S | 2 +- + arch/x86/kernel/head_64.S | 2 +- + 3 files changed, 27 insertions(+), 11 deletions(-) + +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index fac354ddf056..e546441fbec3 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -321,7 +321,7 @@ syscall_return_via_sysret: + + opportunistic_sysret_failed: + SWAPGS +- jmp restore_regs_and_iret ++ jmp restore_regs_and_return_to_usermode + END(entry_SYSCALL_64) + + ENTRY(stub_ptregs_64) +@@ 
-423,7 +423,7 @@ ENTRY(ret_from_fork) + call syscall_return_slowpath /* returns with IRQs disabled */ + TRACE_IRQS_ON /* user mode is traced as IRQS on */ + SWAPGS +- jmp restore_regs_and_iret ++ jmp restore_regs_and_return_to_usermode + + 1: + /* kernel thread */ +@@ -612,7 +612,20 @@ GLOBAL(retint_user) + call prepare_exit_to_usermode + TRACE_IRQS_IRETQ + SWAPGS +- jmp restore_regs_and_iret ++ ++GLOBAL(restore_regs_and_return_to_usermode) ++#ifdef CONFIG_DEBUG_ENTRY ++ /* Assert that pt_regs indicates user mode. */ ++ testl $3, CS(%rsp) ++ jnz 1f ++ ud2 ++1: ++#endif ++ RESTORE_EXTRA_REGS ++ RESTORE_C_REGS ++ REMOVE_PT_GPREGS_FROM_STACK 8 ++ INTERRUPT_RETURN ++ + + /* Returning to kernel space */ + retint_kernel: +@@ -632,11 +645,14 @@ retint_kernel: + */ + TRACE_IRQS_IRETQ + +-/* +- * At this label, code paths which return to kernel and to user, +- * which come from interrupts/exception and from syscalls, merge. +- */ +-GLOBAL(restore_regs_and_iret) ++GLOBAL(restore_regs_and_return_to_kernel) ++#ifdef CONFIG_DEBUG_ENTRY ++ /* Assert that pt_regs indicates kernel mode. */ ++ testl $3, CS(%rsp) ++ jz 1f ++ ud2 ++1: ++#endif + RESTORE_EXTRA_REGS + RESTORE_C_REGS + REMOVE_PT_GPREGS_FROM_STACK 8 +@@ -1340,7 +1356,7 @@ ENTRY(nmi) + * work, because we don't want to enable interrupts. + */ + SWAPGS +- jmp restore_regs_and_iret ++ jmp restore_regs_and_return_to_usermode + + .Lnmi_from_kernel: + /* +diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S +index d8468ba24be0..2b3a88feaa2b 100644 +--- a/arch/x86/entry/entry_64_compat.S ++++ b/arch/x86/entry/entry_64_compat.S +@@ -337,7 +337,7 @@ ENTRY(entry_INT80_compat) + /* Go back to user mode. 
*/ + TRACE_IRQS_ON + SWAPGS +- jmp restore_regs_and_iret ++ jmp restore_regs_and_return_to_usermode + END(entry_INT80_compat) + + ALIGN +diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S +index 4117c1e0b3d2..e785734980ad 100644 +--- a/arch/x86/kernel/head_64.S ++++ b/arch/x86/kernel/head_64.S +@@ -311,7 +311,7 @@ early_idt_handler_common: + + 20: + decl early_recursion_flag(%rip) +- jmp restore_regs_and_iret ++ jmp restore_regs_and_return_to_kernel + END(early_idt_handler_common) + + __INITDATA +-- +2.14.2 + diff --git a/patches/kernel/0091-x86-entry-64-Move-SWAPGS-into-the-common-IRET-to-use.patch b/patches/kernel/0091-x86-entry-64-Move-SWAPGS-into-the-common-IRET-to-use.patch new file mode 100644 index 0000000..81edf0f --- /dev/null +++ b/patches/kernel/0091-x86-entry-64-Move-SWAPGS-into-the-common-IRET-to-use.patch @@ -0,0 +1,156 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Thu, 2 Nov 2017 00:59:00 -0700 +Subject: [PATCH] x86/entry/64: Move SWAPGS into the common IRET-to-usermode + path +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +All of the code paths that ended up doing IRET to usermode did +SWAPGS immediately beforehand. Move the SWAPGS into the common +code. 
+ +Signed-off-by: Andy Lutomirski +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/27fd6f45b7cd640de38fb9066fd0349bcd11f8e1.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit 8a055d7f411d41755ce30db5bb65b154777c4b78) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 62a85594f9be3baeb2495089f1c2980bc497d03b) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/entry/entry_64.S | 32 ++++++++++++++------------------ + arch/x86/entry/entry_64_compat.S | 3 +-- + 2 files changed, 15 insertions(+), 20 deletions(-) + +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index e546441fbec3..7c8258e3ad2d 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -249,12 +249,14 @@ return_from_SYSCALL_64: + + /* + * Try to use SYSRET instead of IRET if we're returning to +- * a completely clean 64-bit userspace context. ++ * a completely clean 64-bit userspace context. If we're not, ++ * go to the slow exit path. 
+ */ + movq RCX(%rsp), %rcx + movq RIP(%rsp), %r11 +- cmpq %rcx, %r11 /* RCX == RIP */ +- jne opportunistic_sysret_failed ++ ++ cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */ ++ jne swapgs_restore_regs_and_return_to_usermode + + /* + * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP +@@ -272,14 +274,14 @@ return_from_SYSCALL_64: + + /* If this changed %rcx, it was not canonical */ + cmpq %rcx, %r11 +- jne opportunistic_sysret_failed ++ jne swapgs_restore_regs_and_return_to_usermode + + cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */ +- jne opportunistic_sysret_failed ++ jne swapgs_restore_regs_and_return_to_usermode + + movq R11(%rsp), %r11 + cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */ +- jne opportunistic_sysret_failed ++ jne swapgs_restore_regs_and_return_to_usermode + + /* + * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot +@@ -300,12 +302,12 @@ return_from_SYSCALL_64: + * would never get past 'stuck_here'. + */ + testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 +- jnz opportunistic_sysret_failed ++ jnz swapgs_restore_regs_and_return_to_usermode + + /* nothing to check for RSP */ + + cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */ +- jne opportunistic_sysret_failed ++ jne swapgs_restore_regs_and_return_to_usermode + + /* + * We win! 
This label is here just for ease of understanding +@@ -318,10 +320,6 @@ syscall_return_via_sysret: + movq RSP(%rsp), %rsp + UNWIND_HINT_EMPTY + USERGS_SYSRET64 +- +-opportunistic_sysret_failed: +- SWAPGS +- jmp restore_regs_and_return_to_usermode + END(entry_SYSCALL_64) + + ENTRY(stub_ptregs_64) +@@ -422,8 +420,7 @@ ENTRY(ret_from_fork) + movq %rsp, %rdi + call syscall_return_slowpath /* returns with IRQs disabled */ + TRACE_IRQS_ON /* user mode is traced as IRQS on */ +- SWAPGS +- jmp restore_regs_and_return_to_usermode ++ jmp swapgs_restore_regs_and_return_to_usermode + + 1: + /* kernel thread */ +@@ -611,9 +608,8 @@ GLOBAL(retint_user) + mov %rsp,%rdi + call prepare_exit_to_usermode + TRACE_IRQS_IRETQ +- SWAPGS + +-GLOBAL(restore_regs_and_return_to_usermode) ++GLOBAL(swapgs_restore_regs_and_return_to_usermode) + #ifdef CONFIG_DEBUG_ENTRY + /* Assert that pt_regs indicates user mode. */ + testl $3, CS(%rsp) +@@ -621,6 +617,7 @@ GLOBAL(restore_regs_and_return_to_usermode) + ud2 + 1: + #endif ++ SWAPGS + RESTORE_EXTRA_REGS + RESTORE_C_REGS + REMOVE_PT_GPREGS_FROM_STACK 8 +@@ -1355,8 +1352,7 @@ ENTRY(nmi) + * Return back to user mode. We must *not* do the normal exit + * work, because we don't want to enable interrupts. + */ +- SWAPGS +- jmp restore_regs_and_return_to_usermode ++ jmp swapgs_restore_regs_and_return_to_usermode + + .Lnmi_from_kernel: + /* +diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S +index 2b3a88feaa2b..be745b7a3e3e 100644 +--- a/arch/x86/entry/entry_64_compat.S ++++ b/arch/x86/entry/entry_64_compat.S +@@ -336,8 +336,7 @@ ENTRY(entry_INT80_compat) + + /* Go back to user mode. 
*/ + TRACE_IRQS_ON +- SWAPGS +- jmp restore_regs_and_return_to_usermode ++ jmp swapgs_restore_regs_and_return_to_usermode + END(entry_INT80_compat) + + ALIGN +-- +2.14.2 + diff --git a/patches/kernel/0091-x86-entry-64-Simplify-reg-restore-code-in-the-standa.patch b/patches/kernel/0091-x86-entry-64-Simplify-reg-restore-code-in-the-standa.patch deleted file mode 100644 index c413507..0000000 --- a/patches/kernel/0091-x86-entry-64-Simplify-reg-restore-code-in-the-standa.patch +++ /dev/null @@ -1,103 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Thu, 2 Nov 2017 00:59:01 -0700 -Subject: [PATCH] x86/entry/64: Simplify reg restore code in the standard IRET - paths -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -The old code restored all the registers with movq instead of pop. - -In theory, this was done because some CPUs have higher movq -throughput, but any gain there would be tiny and is almost certainly -outweighed by the higher text size. - -This saves 96 bytes of text. 
- -Signed-off-by: Andy Lutomirski -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/ad82520a207ccd851b04ba613f4f752b33ac05f7.1509609304.git.luto@kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit e872045bfd9c465a8555bab4b8567d56a4d2d3bb) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit f926575cd370de4052e89477582b349af5664a56) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/entry/calling.h | 21 +++++++++++++++++++++ - arch/x86/entry/entry_64.S | 12 ++++++------ - 2 files changed, 27 insertions(+), 6 deletions(-) - -diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h -index 640aafebdc00..0b9dd8123701 100644 ---- a/arch/x86/entry/calling.h -+++ b/arch/x86/entry/calling.h -@@ -151,6 +151,27 @@ For 32-bit we have the following conventions - kernel is built with - UNWIND_HINT_REGS offset=\offset extra=0 - .endm - -+ .macro POP_EXTRA_REGS -+ popq %r15 -+ popq %r14 -+ popq %r13 -+ popq %r12 -+ popq %rbp -+ popq %rbx -+ .endm -+ -+ .macro POP_C_REGS -+ popq %r11 -+ popq %r10 -+ popq %r9 -+ popq %r8 -+ popq %rax -+ popq %rcx -+ popq %rdx -+ popq %rsi -+ popq %rdi -+ .endm -+ - .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 - .if \rstor_r11 - movq 6*8(%rsp), %r11 -diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S -index 7c8258e3ad2d..a1a86e782a0e 100644 ---- a/arch/x86/entry/entry_64.S -+++ b/arch/x86/entry/entry_64.S -@@ -618,9 +618,9 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode) - 1: - #endif - SWAPGS -- RESTORE_EXTRA_REGS -- RESTORE_C_REGS -- REMOVE_PT_GPREGS_FROM_STACK 8 -+ POP_EXTRA_REGS -+ POP_C_REGS -+ addq $8, %rsp /* skip regs->orig_ax */ - INTERRUPT_RETURN - - -@@ -650,9 +650,9 @@ GLOBAL(restore_regs_and_return_to_kernel) - ud2 - 1: - #endif -- RESTORE_EXTRA_REGS -- RESTORE_C_REGS -- REMOVE_PT_GPREGS_FROM_STACK 8 
-+ POP_EXTRA_REGS -+ POP_C_REGS -+ addq $8, %rsp /* skip regs->orig_ax */ - INTERRUPT_RETURN - - ENTRY(native_iret) --- -2.14.2 - diff --git a/patches/kernel/0092-x86-entry-64-Shrink-paranoid_exit_restore-and-make-l.patch b/patches/kernel/0092-x86-entry-64-Shrink-paranoid_exit_restore-and-make-l.patch deleted file mode 100644 index dce86c4..0000000 --- a/patches/kernel/0092-x86-entry-64-Shrink-paranoid_exit_restore-and-make-l.patch +++ /dev/null @@ -1,70 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Thu, 2 Nov 2017 00:59:02 -0700 -Subject: [PATCH] x86/entry/64: Shrink paranoid_exit_restore and make labels - local -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -paranoid_exit_restore was a copy of restore_regs_and_return_to_kernel. -Merge them and make the paranoid_exit internal labels local. - -Keeping .Lparanoid_exit makes the code a bit shorter because it -allows a 2-byte jnz instead of a 5-byte jnz. - -Saves 96 bytes of text. - -( This is still a bit suboptimal in a non-CONFIG_TRACE_IRQFLAGS - kernel, but fixing that would make the code rather messy. 
) - -Signed-off-by: Andy Lutomirski -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/510d66a1895cda9473c84b1086f0bb974f22de6a.1509609304.git.luto@kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit e53178328c9b96fbdbc719e78c93b5687ee007c3) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit fb53fe10add935c3d0eb63199e43426eaf3b4299) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/entry/entry_64.S | 13 +++++-------- - 1 file changed, 5 insertions(+), 8 deletions(-) - -diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S -index a1a86e782a0e..6995f7e08aa1 100644 ---- a/arch/x86/entry/entry_64.S -+++ b/arch/x86/entry/entry_64.S -@@ -1136,17 +1136,14 @@ ENTRY(paranoid_exit) - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_OFF_DEBUG - testl %ebx, %ebx /* swapgs needed? */ -- jnz paranoid_exit_no_swapgs -+ jnz .Lparanoid_exit_no_swapgs - TRACE_IRQS_IRETQ - SWAPGS_UNSAFE_STACK -- jmp paranoid_exit_restore --paranoid_exit_no_swapgs: -+ jmp .Lparanoid_exit_restore -+.Lparanoid_exit_no_swapgs: - TRACE_IRQS_IRETQ_DEBUG --paranoid_exit_restore: -- RESTORE_EXTRA_REGS -- RESTORE_C_REGS -- REMOVE_PT_GPREGS_FROM_STACK 8 -- INTERRUPT_RETURN -+.Lparanoid_exit_restore: -+ jmp restore_regs_and_return_to_kernel - END(paranoid_exit) - - /* --- -2.14.2 - diff --git a/patches/kernel/0092-x86-entry-64-Simplify-reg-restore-code-in-the-standa.patch b/patches/kernel/0092-x86-entry-64-Simplify-reg-restore-code-in-the-standa.patch new file mode 100644 index 0000000..c413507 --- /dev/null +++ b/patches/kernel/0092-x86-entry-64-Simplify-reg-restore-code-in-the-standa.patch @@ -0,0 +1,103 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Thu, 2 Nov 2017 00:59:01 -0700 +Subject: [PATCH] x86/entry/64: Simplify reg restore code in the standard IRET + paths +MIME-Version: 1.0 
+Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +The old code restored all the registers with movq instead of pop. + +In theory, this was done because some CPUs have higher movq +throughput, but any gain there would be tiny and is almost certainly +outweighed by the higher text size. + +This saves 96 bytes of text. + +Signed-off-by: Andy Lutomirski +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/ad82520a207ccd851b04ba613f4f752b33ac05f7.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit e872045bfd9c465a8555bab4b8567d56a4d2d3bb) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit f926575cd370de4052e89477582b349af5664a56) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/entry/calling.h | 21 +++++++++++++++++++++ + arch/x86/entry/entry_64.S | 12 ++++++------ + 2 files changed, 27 insertions(+), 6 deletions(-) + +diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h +index 640aafebdc00..0b9dd8123701 100644 +--- a/arch/x86/entry/calling.h ++++ b/arch/x86/entry/calling.h +@@ -151,6 +151,27 @@ For 32-bit we have the following conventions - kernel is built with + UNWIND_HINT_REGS offset=\offset extra=0 + .endm + ++ .macro POP_EXTRA_REGS ++ popq %r15 ++ popq %r14 ++ popq %r13 ++ popq %r12 ++ popq %rbp ++ popq %rbx ++ .endm ++ ++ .macro POP_C_REGS ++ popq %r11 ++ popq %r10 ++ popq %r9 ++ popq %r8 ++ popq %rax ++ popq %rcx ++ popq %rdx ++ popq %rsi ++ popq %rdi ++ .endm ++ + .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 + .if \rstor_r11 + movq 6*8(%rsp), %r11 +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index 7c8258e3ad2d..a1a86e782a0e 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -618,9 +618,9 @@ 
GLOBAL(swapgs_restore_regs_and_return_to_usermode) + 1: + #endif + SWAPGS +- RESTORE_EXTRA_REGS +- RESTORE_C_REGS +- REMOVE_PT_GPREGS_FROM_STACK 8 ++ POP_EXTRA_REGS ++ POP_C_REGS ++ addq $8, %rsp /* skip regs->orig_ax */ + INTERRUPT_RETURN + + +@@ -650,9 +650,9 @@ GLOBAL(restore_regs_and_return_to_kernel) + ud2 + 1: + #endif +- RESTORE_EXTRA_REGS +- RESTORE_C_REGS +- REMOVE_PT_GPREGS_FROM_STACK 8 ++ POP_EXTRA_REGS ++ POP_C_REGS ++ addq $8, %rsp /* skip regs->orig_ax */ + INTERRUPT_RETURN + + ENTRY(native_iret) +-- +2.14.2 + diff --git a/patches/kernel/0093-x86-entry-64-Shrink-paranoid_exit_restore-and-make-l.patch b/patches/kernel/0093-x86-entry-64-Shrink-paranoid_exit_restore-and-make-l.patch new file mode 100644 index 0000000..dce86c4 --- /dev/null +++ b/patches/kernel/0093-x86-entry-64-Shrink-paranoid_exit_restore-and-make-l.patch @@ -0,0 +1,70 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Thu, 2 Nov 2017 00:59:02 -0700 +Subject: [PATCH] x86/entry/64: Shrink paranoid_exit_restore and make labels + local +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +paranoid_exit_restore was a copy of restore_regs_and_return_to_kernel. +Merge them and make the paranoid_exit internal labels local. + +Keeping .Lparanoid_exit makes the code a bit shorter because it +allows a 2-byte jnz instead of a 5-byte jnz. + +Saves 96 bytes of text. + +( This is still a bit suboptimal in a non-CONFIG_TRACE_IRQFLAGS + kernel, but fixing that would make the code rather messy. 
) + +Signed-off-by: Andy Lutomirski +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/510d66a1895cda9473c84b1086f0bb974f22de6a.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit e53178328c9b96fbdbc719e78c93b5687ee007c3) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit fb53fe10add935c3d0eb63199e43426eaf3b4299) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/entry/entry_64.S | 13 +++++-------- + 1 file changed, 5 insertions(+), 8 deletions(-) + +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index a1a86e782a0e..6995f7e08aa1 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -1136,17 +1136,14 @@ ENTRY(paranoid_exit) + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF_DEBUG + testl %ebx, %ebx /* swapgs needed? */ +- jnz paranoid_exit_no_swapgs ++ jnz .Lparanoid_exit_no_swapgs + TRACE_IRQS_IRETQ + SWAPGS_UNSAFE_STACK +- jmp paranoid_exit_restore +-paranoid_exit_no_swapgs: ++ jmp .Lparanoid_exit_restore ++.Lparanoid_exit_no_swapgs: + TRACE_IRQS_IRETQ_DEBUG +-paranoid_exit_restore: +- RESTORE_EXTRA_REGS +- RESTORE_C_REGS +- REMOVE_PT_GPREGS_FROM_STACK 8 +- INTERRUPT_RETURN ++.Lparanoid_exit_restore: ++ jmp restore_regs_and_return_to_kernel + END(paranoid_exit) + + /* +-- +2.14.2 + diff --git a/patches/kernel/0093-x86-entry-64-Use-pop-instead-of-movq-in-syscall_retu.patch b/patches/kernel/0093-x86-entry-64-Use-pop-instead-of-movq-in-syscall_retu.patch deleted file mode 100644 index 557e590..0000000 --- a/patches/kernel/0093-x86-entry-64-Use-pop-instead-of-movq-in-syscall_retu.patch +++ /dev/null @@ -1,61 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Thu, 2 Nov 2017 00:59:03 -0700 -Subject: [PATCH] x86/entry/64: Use pop instead of movq in - syscall_return_via_sysret -MIME-Version: 
1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Saves 64 bytes. - -Signed-off-by: Andy Lutomirski -Reviewed-by: Borislav Petkov -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/6609b7f74ab31c36604ad746e019ea8495aec76c.1509609304.git.luto@kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit 4fbb39108f972437c44e5ffa781b56635d496826) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 1e9a9d5ef9f65eeb26eb8f0974dd3e693894baf1) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/entry/entry_64.S | 14 +++++++++++--- - 1 file changed, 11 insertions(+), 3 deletions(-) - -diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S -index 6995f7e08aa1..33a416c7df2d 100644 ---- a/arch/x86/entry/entry_64.S -+++ b/arch/x86/entry/entry_64.S -@@ -315,10 +315,18 @@ return_from_SYSCALL_64: - */ - syscall_return_via_sysret: - /* rcx and r11 are already restored (see code above) */ -- RESTORE_EXTRA_REGS -- RESTORE_C_REGS_EXCEPT_RCX_R11 -- movq RSP(%rsp), %rsp - UNWIND_HINT_EMPTY -+ POP_EXTRA_REGS -+ popq %rsi /* skip r11 */ -+ popq %r10 -+ popq %r9 -+ popq %r8 -+ popq %rax -+ popq %rsi /* skip rcx */ -+ popq %rdx -+ popq %rsi -+ popq %rdi -+ movq RSP-ORIG_RAX(%rsp), %rsp - USERGS_SYSRET64 - END(entry_SYSCALL_64) - --- -2.14.2 - diff --git a/patches/kernel/0094-x86-entry-64-Merge-the-fast-and-slow-SYSRET-paths.patch b/patches/kernel/0094-x86-entry-64-Merge-the-fast-and-slow-SYSRET-paths.patch deleted file mode 100644 index 4fa0876..0000000 --- a/patches/kernel/0094-x86-entry-64-Merge-the-fast-and-slow-SYSRET-paths.patch +++ /dev/null @@ -1,60 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Thu, 2 Nov 2017 00:59:04 -0700 -Subject: [PATCH] x86/entry/64: Merge the fast and slow SYSRET paths -MIME-Version: 1.0 
-Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -They did almost the same thing. Remove a bunch of pointless -instructions (mostly hidden in macros) and reduce cognitive load by -merging them. - -Signed-off-by: Andy Lutomirski -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/1204e20233fcab9130a1ba80b3b1879b5db3fc1f.1509609304.git.luto@kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit a512210643da8082cb44181dba8b18e752bd68f0) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 7c4575d8bb2d01960ba9b9840fa22460e0179eca) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/entry/entry_64.S | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S -index 33a416c7df2d..87be1cd1fa88 100644 ---- a/arch/x86/entry/entry_64.S -+++ b/arch/x86/entry/entry_64.S -@@ -220,10 +220,9 @@ entry_SYSCALL_64_fastpath: - TRACE_IRQS_ON /* user mode is traced as IRQs on */ - movq RIP(%rsp), %rcx - movq EFLAGS(%rsp), %r11 -- RESTORE_C_REGS_EXCEPT_RCX_R11 -- movq RSP(%rsp), %rsp -+ addq $6*8, %rsp /* skip extra regs -- they were preserved */ - UNWIND_HINT_EMPTY -- USERGS_SYSRET64 -+ jmp .Lpop_c_regs_except_rcx_r11_and_sysret - - 1: - /* -@@ -317,6 +316,7 @@ syscall_return_via_sysret: - /* rcx and r11 are already restored (see code above) */ - UNWIND_HINT_EMPTY - POP_EXTRA_REGS -+.Lpop_c_regs_except_rcx_r11_and_sysret: - popq %rsi /* skip r11 */ - popq %r10 - popq %r9 --- -2.14.2 - diff --git a/patches/kernel/0094-x86-entry-64-Use-pop-instead-of-movq-in-syscall_retu.patch b/patches/kernel/0094-x86-entry-64-Use-pop-instead-of-movq-in-syscall_retu.patch new file mode 100644 index 0000000..557e590 --- /dev/null +++ b/patches/kernel/0094-x86-entry-64-Use-pop-instead-of-movq-in-syscall_retu.patch @@ -0,0 +1,61 @@ +From 
0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Thu, 2 Nov 2017 00:59:03 -0700 +Subject: [PATCH] x86/entry/64: Use pop instead of movq in + syscall_return_via_sysret +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Saves 64 bytes. + +Signed-off-by: Andy Lutomirski +Reviewed-by: Borislav Petkov +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/6609b7f74ab31c36604ad746e019ea8495aec76c.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit 4fbb39108f972437c44e5ffa781b56635d496826) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 1e9a9d5ef9f65eeb26eb8f0974dd3e693894baf1) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/entry/entry_64.S | 14 +++++++++++--- + 1 file changed, 11 insertions(+), 3 deletions(-) + +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index 6995f7e08aa1..33a416c7df2d 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -315,10 +315,18 @@ return_from_SYSCALL_64: + */ + syscall_return_via_sysret: + /* rcx and r11 are already restored (see code above) */ +- RESTORE_EXTRA_REGS +- RESTORE_C_REGS_EXCEPT_RCX_R11 +- movq RSP(%rsp), %rsp + UNWIND_HINT_EMPTY ++ POP_EXTRA_REGS ++ popq %rsi /* skip r11 */ ++ popq %r10 ++ popq %r9 ++ popq %r8 ++ popq %rax ++ popq %rsi /* skip rcx */ ++ popq %rdx ++ popq %rsi ++ popq %rdi ++ movq RSP-ORIG_RAX(%rsp), %rsp + USERGS_SYSRET64 + END(entry_SYSCALL_64) + +-- +2.14.2 + diff --git a/patches/kernel/0095-x86-entry-64-Merge-the-fast-and-slow-SYSRET-paths.patch b/patches/kernel/0095-x86-entry-64-Merge-the-fast-and-slow-SYSRET-paths.patch new file mode 100644 index 0000000..4fa0876 --- /dev/null +++ b/patches/kernel/0095-x86-entry-64-Merge-the-fast-and-slow-SYSRET-paths.patch @@ -0,0 +1,60 @@ 
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Thu, 2 Nov 2017 00:59:04 -0700 +Subject: [PATCH] x86/entry/64: Merge the fast and slow SYSRET paths +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +They did almost the same thing. Remove a bunch of pointless +instructions (mostly hidden in macros) and reduce cognitive load by +merging them. + +Signed-off-by: Andy Lutomirski +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/1204e20233fcab9130a1ba80b3b1879b5db3fc1f.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit a512210643da8082cb44181dba8b18e752bd68f0) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 7c4575d8bb2d01960ba9b9840fa22460e0179eca) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/entry/entry_64.S | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index 33a416c7df2d..87be1cd1fa88 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -220,10 +220,9 @@ entry_SYSCALL_64_fastpath: + TRACE_IRQS_ON /* user mode is traced as IRQs on */ + movq RIP(%rsp), %rcx + movq EFLAGS(%rsp), %r11 +- RESTORE_C_REGS_EXCEPT_RCX_R11 +- movq RSP(%rsp), %rsp ++ addq $6*8, %rsp /* skip extra regs -- they were preserved */ + UNWIND_HINT_EMPTY +- USERGS_SYSRET64 ++ jmp .Lpop_c_regs_except_rcx_r11_and_sysret + + 1: + /* +@@ -317,6 +316,7 @@ syscall_return_via_sysret: + /* rcx and r11 are already restored (see code above) */ + UNWIND_HINT_EMPTY + POP_EXTRA_REGS ++.Lpop_c_regs_except_rcx_r11_and_sysret: + popq %rsi /* skip r11 */ + popq %r10 + popq %r9 +-- +2.14.2 + diff --git a/patches/kernel/0095-x86-entry-64-Use-POP-instead-of-MOV-to-restore-regs-.patch 
b/patches/kernel/0095-x86-entry-64-Use-POP-instead-of-MOV-to-restore-regs-.patch deleted file mode 100644 index d93f334..0000000 --- a/patches/kernel/0095-x86-entry-64-Use-POP-instead-of-MOV-to-restore-regs-.patch +++ /dev/null @@ -1,57 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Thu, 2 Nov 2017 00:59:05 -0700 -Subject: [PATCH] x86/entry/64: Use POP instead of MOV to restore regs on NMI - return -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -This gets rid of the last user of the old RESTORE_..._REGS infrastructure. - -Signed-off-by: Andy Lutomirski -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/652a260f17a160789bc6a41d997f98249b73e2ab.1509609304.git.luto@kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit 471ee4832209e986029b9fabdaad57b1eecb856b) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 3c5771a43d8f00e53081871027fea891a091ff5e) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/entry/entry_64.S | 11 +++++++---- - 1 file changed, 7 insertions(+), 4 deletions(-) - -diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S -index 87be1cd1fa88..4eff3aca54ed 100644 ---- a/arch/x86/entry/entry_64.S -+++ b/arch/x86/entry/entry_64.S -@@ -1572,11 +1572,14 @@ end_repeat_nmi: - nmi_swapgs: - SWAPGS_UNSAFE_STACK - nmi_restore: -- RESTORE_EXTRA_REGS -- RESTORE_C_REGS -+ POP_EXTRA_REGS -+ POP_C_REGS - -- /* Point RSP at the "iret" frame. */ -- REMOVE_PT_GPREGS_FROM_STACK 6*8 -+ /* -+ * Skip orig_ax and the "outermost" frame to point RSP at the "iret" -+ * at the "iret" frame. -+ */ -+ addq $6*8, %rsp - - /* - * Clear "NMI executing". 
Set DF first so that we can easily --- -2.14.2 - diff --git a/patches/kernel/0096-x86-entry-64-Remove-the-RESTORE_._REGS-infrastructur.patch b/patches/kernel/0096-x86-entry-64-Remove-the-RESTORE_._REGS-infrastructur.patch deleted file mode 100644 index 0c1434a..0000000 --- a/patches/kernel/0096-x86-entry-64-Remove-the-RESTORE_._REGS-infrastructur.patch +++ /dev/null @@ -1,104 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Thu, 2 Nov 2017 00:59:06 -0700 -Subject: [PATCH] x86/entry/64: Remove the RESTORE_..._REGS infrastructure -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -All users of RESTORE_EXTRA_REGS, RESTORE_C_REGS and such, and -REMOVE_PT_GPREGS_FROM_STACK are gone. Delete the macros. - -Signed-off-by: Andy Lutomirski -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/c32672f6e47c561893316d48e06c7656b1039a36.1509609304.git.luto@kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit c39858de696f0cc160a544455e8403d663d577e9) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit d248c62028c5467cd5a5ce06d344e3fb330da3ec) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/entry/calling.h | 52 ------------------------------------------------ - 1 file changed, 52 deletions(-) - -diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h -index 0b9dd8123701..1895a685d3dd 100644 ---- a/arch/x86/entry/calling.h -+++ b/arch/x86/entry/calling.h -@@ -141,16 +141,6 @@ For 32-bit we have the following conventions - kernel is built with - UNWIND_HINT_REGS offset=\offset - .endm - -- .macro RESTORE_EXTRA_REGS offset=0 -- movq 0*8+\offset(%rsp), %r15 -- movq 1*8+\offset(%rsp), %r14 -- movq 2*8+\offset(%rsp), %r13 -- movq 3*8+\offset(%rsp), %r12 -- movq 4*8+\offset(%rsp), %rbp -- movq 
5*8+\offset(%rsp), %rbx -- UNWIND_HINT_REGS offset=\offset extra=0 -- .endm -- - .macro POP_EXTRA_REGS - popq %r15 - popq %r14 -@@ -172,48 +162,6 @@ For 32-bit we have the following conventions - kernel is built with - popq %rdi - .endm - -- .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 -- .if \rstor_r11 -- movq 6*8(%rsp), %r11 -- .endif -- .if \rstor_r8910 -- movq 7*8(%rsp), %r10 -- movq 8*8(%rsp), %r9 -- movq 9*8(%rsp), %r8 -- .endif -- .if \rstor_rax -- movq 10*8(%rsp), %rax -- .endif -- .if \rstor_rcx -- movq 11*8(%rsp), %rcx -- .endif -- .if \rstor_rdx -- movq 12*8(%rsp), %rdx -- .endif -- movq 13*8(%rsp), %rsi -- movq 14*8(%rsp), %rdi -- UNWIND_HINT_IRET_REGS offset=16*8 -- .endm -- .macro RESTORE_C_REGS -- RESTORE_C_REGS_HELPER 1,1,1,1,1 -- .endm -- .macro RESTORE_C_REGS_EXCEPT_RAX -- RESTORE_C_REGS_HELPER 0,1,1,1,1 -- .endm -- .macro RESTORE_C_REGS_EXCEPT_RCX -- RESTORE_C_REGS_HELPER 1,0,1,1,1 -- .endm -- .macro RESTORE_C_REGS_EXCEPT_R11 -- RESTORE_C_REGS_HELPER 1,1,0,1,1 -- .endm -- .macro RESTORE_C_REGS_EXCEPT_RCX_R11 -- RESTORE_C_REGS_HELPER 1,0,0,1,1 -- .endm -- -- .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0 -- subq $-(15*8+\addskip), %rsp -- .endm -- - .macro icebp - .byte 0xf1 - .endm --- -2.14.2 - diff --git a/patches/kernel/0096-x86-entry-64-Use-POP-instead-of-MOV-to-restore-regs-.patch b/patches/kernel/0096-x86-entry-64-Use-POP-instead-of-MOV-to-restore-regs-.patch new file mode 100644 index 0000000..d93f334 --- /dev/null +++ b/patches/kernel/0096-x86-entry-64-Use-POP-instead-of-MOV-to-restore-regs-.patch @@ -0,0 +1,57 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Thu, 2 Nov 2017 00:59:05 -0700 +Subject: [PATCH] x86/entry/64: Use POP instead of MOV to restore regs on NMI + return +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +This gets rid of the last user of the old 
RESTORE_..._REGS infrastructure. + +Signed-off-by: Andy Lutomirski +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/652a260f17a160789bc6a41d997f98249b73e2ab.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit 471ee4832209e986029b9fabdaad57b1eecb856b) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 3c5771a43d8f00e53081871027fea891a091ff5e) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/entry/entry_64.S | 11 +++++++---- + 1 file changed, 7 insertions(+), 4 deletions(-) + +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index 87be1cd1fa88..4eff3aca54ed 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -1572,11 +1572,14 @@ end_repeat_nmi: + nmi_swapgs: + SWAPGS_UNSAFE_STACK + nmi_restore: +- RESTORE_EXTRA_REGS +- RESTORE_C_REGS ++ POP_EXTRA_REGS ++ POP_C_REGS + +- /* Point RSP at the "iret" frame. */ +- REMOVE_PT_GPREGS_FROM_STACK 6*8 ++ /* ++ * Skip orig_ax and the "outermost" frame to point RSP at the "iret" ++ * at the "iret" frame. ++ */ ++ addq $6*8, %rsp + + /* + * Clear "NMI executing". 
Set DF first so that we can easily +-- +2.14.2 + diff --git a/patches/kernel/0097-x86-entry-64-Remove-the-RESTORE_._REGS-infrastructur.patch b/patches/kernel/0097-x86-entry-64-Remove-the-RESTORE_._REGS-infrastructur.patch new file mode 100644 index 0000000..0c1434a --- /dev/null +++ b/patches/kernel/0097-x86-entry-64-Remove-the-RESTORE_._REGS-infrastructur.patch @@ -0,0 +1,104 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Thu, 2 Nov 2017 00:59:06 -0700 +Subject: [PATCH] x86/entry/64: Remove the RESTORE_..._REGS infrastructure +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +All users of RESTORE_EXTRA_REGS, RESTORE_C_REGS and such, and +REMOVE_PT_GPREGS_FROM_STACK are gone. Delete the macros. + +Signed-off-by: Andy Lutomirski +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/c32672f6e47c561893316d48e06c7656b1039a36.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit c39858de696f0cc160a544455e8403d663d577e9) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit d248c62028c5467cd5a5ce06d344e3fb330da3ec) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/entry/calling.h | 52 ------------------------------------------------ + 1 file changed, 52 deletions(-) + +diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h +index 0b9dd8123701..1895a685d3dd 100644 +--- a/arch/x86/entry/calling.h ++++ b/arch/x86/entry/calling.h +@@ -141,16 +141,6 @@ For 32-bit we have the following conventions - kernel is built with + UNWIND_HINT_REGS offset=\offset + .endm + +- .macro RESTORE_EXTRA_REGS offset=0 +- movq 0*8+\offset(%rsp), %r15 +- movq 1*8+\offset(%rsp), %r14 +- movq 2*8+\offset(%rsp), %r13 +- movq 3*8+\offset(%rsp), %r12 +- movq 4*8+\offset(%rsp), %rbp +- movq 
5*8+\offset(%rsp), %rbx +- UNWIND_HINT_REGS offset=\offset extra=0 +- .endm +- + .macro POP_EXTRA_REGS + popq %r15 + popq %r14 +@@ -172,48 +162,6 @@ For 32-bit we have the following conventions - kernel is built with + popq %rdi + .endm + +- .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 +- .if \rstor_r11 +- movq 6*8(%rsp), %r11 +- .endif +- .if \rstor_r8910 +- movq 7*8(%rsp), %r10 +- movq 8*8(%rsp), %r9 +- movq 9*8(%rsp), %r8 +- .endif +- .if \rstor_rax +- movq 10*8(%rsp), %rax +- .endif +- .if \rstor_rcx +- movq 11*8(%rsp), %rcx +- .endif +- .if \rstor_rdx +- movq 12*8(%rsp), %rdx +- .endif +- movq 13*8(%rsp), %rsi +- movq 14*8(%rsp), %rdi +- UNWIND_HINT_IRET_REGS offset=16*8 +- .endm +- .macro RESTORE_C_REGS +- RESTORE_C_REGS_HELPER 1,1,1,1,1 +- .endm +- .macro RESTORE_C_REGS_EXCEPT_RAX +- RESTORE_C_REGS_HELPER 0,1,1,1,1 +- .endm +- .macro RESTORE_C_REGS_EXCEPT_RCX +- RESTORE_C_REGS_HELPER 1,0,1,1,1 +- .endm +- .macro RESTORE_C_REGS_EXCEPT_R11 +- RESTORE_C_REGS_HELPER 1,1,0,1,1 +- .endm +- .macro RESTORE_C_REGS_EXCEPT_RCX_R11 +- RESTORE_C_REGS_HELPER 1,0,0,1,1 +- .endm +- +- .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0 +- subq $-(15*8+\addskip), %rsp +- .endm +- + .macro icebp + .byte 0xf1 + .endm +-- +2.14.2 + diff --git a/patches/kernel/0097-xen-x86-entry-64-Add-xen-NMI-trap-entry.patch b/patches/kernel/0097-xen-x86-entry-64-Add-xen-NMI-trap-entry.patch deleted file mode 100644 index d0f7148..0000000 --- a/patches/kernel/0097-xen-x86-entry-64-Add-xen-NMI-trap-entry.patch +++ /dev/null @@ -1,105 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Juergen Gross -Date: Thu, 2 Nov 2017 00:59:07 -0700 -Subject: [PATCH] xen, x86/entry/64: Add xen NMI trap entry -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Instead of trying to execute any NMI via the bare metal's NMI trap -handler use a Xen specific one for PV domains, 
like we do for e.g. -debug traps. As in a PV domain the NMI is handled via the normal -kernel stack this is the correct thing to do. - -This will enable us to get rid of the very fragile and questionable -dependencies between the bare metal NMI handler and Xen assumptions -believed to be broken anyway. - -Signed-off-by: Juergen Gross -Signed-off-by: Andy Lutomirski -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/5baf5c0528d58402441550c5770b98e7961e7680.1509609304.git.luto@kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit 43e4111086a70c78bedb6ad990bee97f17b27a6e) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 20c970e03b42141abf6c45938ce6d4fdc3555921) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/traps.h | 2 +- - arch/x86/xen/enlighten_pv.c | 2 +- - arch/x86/entry/entry_64.S | 2 +- - arch/x86/xen/xen-asm_64.S | 2 +- - 4 files changed, 4 insertions(+), 4 deletions(-) - -diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h -index 8e5bf86f87e5..b052a7621ca1 100644 ---- a/arch/x86/include/asm/traps.h -+++ b/arch/x86/include/asm/traps.h -@@ -55,9 +55,9 @@ asmlinkage void simd_coprocessor_error(void); - - #if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) - asmlinkage void xen_divide_error(void); -+asmlinkage void xen_xennmi(void); - asmlinkage void xen_xendebug(void); - asmlinkage void xen_xenint3(void); --asmlinkage void xen_nmi(void); - asmlinkage void xen_overflow(void); - asmlinkage void xen_bounds(void); - asmlinkage void xen_invalid_op(void); -diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c -index 69b9deff7e5c..8da4eff19c2a 100644 ---- a/arch/x86/xen/enlighten_pv.c -+++ b/arch/x86/xen/enlighten_pv.c -@@ -600,7 +600,7 @@ static struct trap_array_entry trap_array[] = { - #ifdef CONFIG_X86_MCE - { machine_check, xen_machine_check, true }, - 
#endif -- { nmi, xen_nmi, true }, -+ { nmi, xen_xennmi, true }, - { overflow, xen_overflow, false }, - #ifdef CONFIG_IA32_EMULATION - { entry_INT80_compat, xen_entry_INT80_compat, false }, -diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S -index 4eff3aca54ed..5a6aba7cf3bd 100644 ---- a/arch/x86/entry/entry_64.S -+++ b/arch/x86/entry/entry_64.S -@@ -1091,6 +1091,7 @@ idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK - idtentry stack_segment do_stack_segment has_error_code=1 - - #ifdef CONFIG_XEN -+idtentry xennmi do_nmi has_error_code=0 - idtentry xendebug do_debug has_error_code=0 - idtentry xenint3 do_int3 has_error_code=0 - #endif -@@ -1253,7 +1254,6 @@ ENTRY(error_exit) - END(error_exit) - - /* Runs on exception stack */ --/* XXX: broken on Xen PV */ - ENTRY(nmi) - UNWIND_HINT_IRET_REGS - /* -diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S -index dae2cc33afb5..286ecc198562 100644 ---- a/arch/x86/xen/xen-asm_64.S -+++ b/arch/x86/xen/xen-asm_64.S -@@ -29,7 +29,7 @@ xen_pv_trap debug - xen_pv_trap xendebug - xen_pv_trap int3 - xen_pv_trap xenint3 --xen_pv_trap nmi -+xen_pv_trap xennmi - xen_pv_trap overflow - xen_pv_trap bounds - xen_pv_trap invalid_op --- -2.14.2 - diff --git a/patches/kernel/0098-x86-entry-64-De-Xen-ify-our-NMI-code.patch b/patches/kernel/0098-x86-entry-64-De-Xen-ify-our-NMI-code.patch deleted file mode 100644 index 0a2d534..0000000 --- a/patches/kernel/0098-x86-entry-64-De-Xen-ify-our-NMI-code.patch +++ /dev/null @@ -1,117 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Thu, 2 Nov 2017 00:59:08 -0700 -Subject: [PATCH] x86/entry/64: De-Xen-ify our NMI code -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Xen PV is fundamentally incompatible with our fancy NMI code: it -doesn't use IST at all, and Xen entries clobber two stack slots -below the hardware frame. 
- -Drop Xen PV support from our NMI code entirely. - -Signed-off-by: Andy Lutomirski -Reviewed-by: Borislav Petkov -Acked-by: Juergen Gross -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/bfbe711b5ae03f672f8848999a8eb2711efc7f98.1509609304.git.luto@kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit 929bacec21478a72c78e4f29f98fb799bd00105a) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit ffc372909c1701c4fdd2bde7861692573ef381a7) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/entry/entry_64.S | 30 ++++++++++++++++++------------ - 1 file changed, 18 insertions(+), 12 deletions(-) - -diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S -index 5a6aba7cf3bd..05501c781c20 100644 ---- a/arch/x86/entry/entry_64.S -+++ b/arch/x86/entry/entry_64.S -@@ -1253,9 +1253,13 @@ ENTRY(error_exit) - jmp retint_user - END(error_exit) - --/* Runs on exception stack */ -+/* -+ * Runs on exception stack. Xen PV does not go through this path at all, -+ * so we can use real assembly here. -+ */ - ENTRY(nmi) - UNWIND_HINT_IRET_REGS -+ - /* - * We allow breakpoints in NMIs. If a breakpoint occurs, then - * the iretq it performs will take us out of NMI context. -@@ -1313,7 +1317,7 @@ ENTRY(nmi) - * stacks lest we corrupt the "NMI executing" variable. - */ - -- SWAPGS_UNSAFE_STACK -+ swapgs - cld - movq %rsp, %rdx - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp -@@ -1478,7 +1482,7 @@ nested_nmi_out: - popq %rdx - - /* We are returning to kernel mode, so this cannot result in a fault. */ -- INTERRUPT_RETURN -+ iretq - - first_nmi: - /* Restore rdx. 
*/ -@@ -1509,7 +1513,7 @@ first_nmi: - pushfq /* RFLAGS */ - pushq $__KERNEL_CS /* CS */ - pushq $1f /* RIP */ -- INTERRUPT_RETURN /* continues at repeat_nmi below */ -+ iretq /* continues at repeat_nmi below */ - UNWIND_HINT_IRET_REGS - 1: - #endif -@@ -1584,20 +1588,22 @@ nmi_restore: - /* - * Clear "NMI executing". Set DF first so that we can easily - * distinguish the remaining code between here and IRET from -- * the SYSCALL entry and exit paths. On a native kernel, we -- * could just inspect RIP, but, on paravirt kernels, -- * INTERRUPT_RETURN can translate into a jump into a -- * hypercall page. -+ * the SYSCALL entry and exit paths. -+ * -+ * We arguably should just inspect RIP instead, but I (Andy) wrote -+ * this code when I had the misapprehension that Xen PV supported -+ * NMIs, and Xen PV would break that approach. - */ - std - movq $0, 5*8(%rsp) /* clear "NMI executing" */ - - /* -- * INTERRUPT_RETURN reads the "iret" frame and exits the NMI -- * stack in a single instruction. We are returning to kernel -- * mode, so this cannot result in a fault. -+ * iretq reads the "iret" frame and exits the NMI stack in a -+ * single instruction. We are returning to kernel mode, so this -+ * cannot result in a fault. Similarly, we don't need to worry -+ * about espfix64 on the way back to kernel mode. 
- */ -- INTERRUPT_RETURN -+ iretq - END(nmi) - - ENTRY(ignore_sysret) --- -2.14.2 - diff --git a/patches/kernel/0098-xen-x86-entry-64-Add-xen-NMI-trap-entry.patch b/patches/kernel/0098-xen-x86-entry-64-Add-xen-NMI-trap-entry.patch new file mode 100644 index 0000000..d0f7148 --- /dev/null +++ b/patches/kernel/0098-xen-x86-entry-64-Add-xen-NMI-trap-entry.patch @@ -0,0 +1,105 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Juergen Gross +Date: Thu, 2 Nov 2017 00:59:07 -0700 +Subject: [PATCH] xen, x86/entry/64: Add xen NMI trap entry +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Instead of trying to execute any NMI via the bare metal's NMI trap +handler use a Xen specific one for PV domains, like we do for e.g. +debug traps. As in a PV domain the NMI is handled via the normal +kernel stack this is the correct thing to do. + +This will enable us to get rid of the very fragile and questionable +dependencies between the bare metal NMI handler and Xen assumptions +believed to be broken anyway. 
+ +Signed-off-by: Juergen Gross +Signed-off-by: Andy Lutomirski +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/5baf5c0528d58402441550c5770b98e7961e7680.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit 43e4111086a70c78bedb6ad990bee97f17b27a6e) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 20c970e03b42141abf6c45938ce6d4fdc3555921) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/traps.h | 2 +- + arch/x86/xen/enlighten_pv.c | 2 +- + arch/x86/entry/entry_64.S | 2 +- + arch/x86/xen/xen-asm_64.S | 2 +- + 4 files changed, 4 insertions(+), 4 deletions(-) + +diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h +index 8e5bf86f87e5..b052a7621ca1 100644 +--- a/arch/x86/include/asm/traps.h ++++ b/arch/x86/include/asm/traps.h +@@ -55,9 +55,9 @@ asmlinkage void simd_coprocessor_error(void); + + #if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) + asmlinkage void xen_divide_error(void); ++asmlinkage void xen_xennmi(void); + asmlinkage void xen_xendebug(void); + asmlinkage void xen_xenint3(void); +-asmlinkage void xen_nmi(void); + asmlinkage void xen_overflow(void); + asmlinkage void xen_bounds(void); + asmlinkage void xen_invalid_op(void); +diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c +index 69b9deff7e5c..8da4eff19c2a 100644 +--- a/arch/x86/xen/enlighten_pv.c ++++ b/arch/x86/xen/enlighten_pv.c +@@ -600,7 +600,7 @@ static struct trap_array_entry trap_array[] = { + #ifdef CONFIG_X86_MCE + { machine_check, xen_machine_check, true }, + #endif +- { nmi, xen_nmi, true }, ++ { nmi, xen_xennmi, true }, + { overflow, xen_overflow, false }, + #ifdef CONFIG_IA32_EMULATION + { entry_INT80_compat, xen_entry_INT80_compat, false }, +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index 4eff3aca54ed..5a6aba7cf3bd 100644 +--- 
a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -1091,6 +1091,7 @@ idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK + idtentry stack_segment do_stack_segment has_error_code=1 + + #ifdef CONFIG_XEN ++idtentry xennmi do_nmi has_error_code=0 + idtentry xendebug do_debug has_error_code=0 + idtentry xenint3 do_int3 has_error_code=0 + #endif +@@ -1253,7 +1254,6 @@ ENTRY(error_exit) + END(error_exit) + + /* Runs on exception stack */ +-/* XXX: broken on Xen PV */ + ENTRY(nmi) + UNWIND_HINT_IRET_REGS + /* +diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S +index dae2cc33afb5..286ecc198562 100644 +--- a/arch/x86/xen/xen-asm_64.S ++++ b/arch/x86/xen/xen-asm_64.S +@@ -29,7 +29,7 @@ xen_pv_trap debug + xen_pv_trap xendebug + xen_pv_trap int3 + xen_pv_trap xenint3 +-xen_pv_trap nmi ++xen_pv_trap xennmi + xen_pv_trap overflow + xen_pv_trap bounds + xen_pv_trap invalid_op +-- +2.14.2 + diff --git a/patches/kernel/0099-x86-entry-32-Pull-the-MSR_IA32_SYSENTER_CS-update-co.patch b/patches/kernel/0099-x86-entry-32-Pull-the-MSR_IA32_SYSENTER_CS-update-co.patch deleted file mode 100644 index 05daea1..0000000 --- a/patches/kernel/0099-x86-entry-32-Pull-the-MSR_IA32_SYSENTER_CS-update-co.patch +++ /dev/null @@ -1,145 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Thu, 2 Nov 2017 00:59:09 -0700 -Subject: [PATCH] x86/entry/32: Pull the MSR_IA32_SYSENTER_CS update code out - of native_load_sp0() -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -This causes the MSR_IA32_SYSENTER_CS write to move out of the -paravirt callback. This shouldn't affect Xen PV: Xen already ignores -MSR_IA32_SYSENTER_ESP writes. In any event, Xen doesn't support -vm86() in a useful way. - -Note to any potential backporters: This patch won't break lguest, as -lguest didn't have any SYSENTER support at all. 
- -Signed-off-by: Andy Lutomirski -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/75cf09fe03ae778532d0ca6c65aa58e66bc2f90c.1509609304.git.luto@kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit bd7dc5a6afac719d8ce4092391eef2c7e83c2a75) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 779e32d0da9a547f3b11fbecac8287e458ba67f5) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/processor.h | 7 ------- - arch/x86/include/asm/switch_to.h | 12 ++++++++++++ - arch/x86/kernel/process_32.c | 4 +++- - arch/x86/kernel/process_64.c | 2 +- - arch/x86/kernel/vm86_32.c | 6 +++++- - 5 files changed, 21 insertions(+), 10 deletions(-) - -diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h -index 028245e1c42b..ee37fb86900a 100644 ---- a/arch/x86/include/asm/processor.h -+++ b/arch/x86/include/asm/processor.h -@@ -513,13 +513,6 @@ static inline void - native_load_sp0(struct tss_struct *tss, struct thread_struct *thread) - { - tss->x86_tss.sp0 = thread->sp0; --#ifdef CONFIG_X86_32 -- /* Only happens when SEP is enabled, no need to test "SEP"arately: */ -- if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) { -- tss->x86_tss.ss1 = thread->sysenter_cs; -- wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); -- } --#endif - } - - static inline void native_swapgs(void) -diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h -index fcc5cd387fd1..7ae8caffbada 100644 ---- a/arch/x86/include/asm/switch_to.h -+++ b/arch/x86/include/asm/switch_to.h -@@ -72,4 +72,16 @@ do { \ - ((last) = __switch_to_asm((prev), (next))); \ - } while (0) - -+#ifdef CONFIG_X86_32 -+static inline void refresh_sysenter_cs(struct thread_struct *thread) -+{ -+ /* Only happens when SEP is enabled, no need to test "SEP"arately: */ -+ if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == 
thread->sysenter_cs)) -+ return; -+ -+ this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs); -+ wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); -+} -+#endif -+ - #endif /* _ASM_X86_SWITCH_TO_H */ -diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c -index 22802162eeb9..2e42b66b8ca4 100644 ---- a/arch/x86/kernel/process_32.c -+++ b/arch/x86/kernel/process_32.c -@@ -284,9 +284,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) - - /* - * Reload esp0 and cpu_current_top_of_stack. This changes -- * current_thread_info(). -+ * current_thread_info(). Refresh the SYSENTER configuration in -+ * case prev or next is vm86. - */ - load_sp0(tss, next); -+ refresh_sysenter_cs(next); - this_cpu_write(cpu_current_top_of_stack, - (unsigned long)task_stack_page(next_p) + - THREAD_SIZE); -diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c -index 1e7701c4cd80..565daaa6f18d 100644 ---- a/arch/x86/kernel/process_64.c -+++ b/arch/x86/kernel/process_64.c -@@ -465,7 +465,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) - */ - this_cpu_write(current_task, next_p); - -- /* Reload esp0 and ss1. This changes current_thread_info(). */ -+ /* Reload sp0. 
*/ - load_sp0(tss, next); - - /* -diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c -index 7924a5356c8a..5bc1c3ab6287 100644 ---- a/arch/x86/kernel/vm86_32.c -+++ b/arch/x86/kernel/vm86_32.c -@@ -54,6 +54,7 @@ - #include - #include - #include -+#include - - /* - * Known problems: -@@ -149,6 +150,7 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) - tsk->thread.sp0 = vm86->saved_sp0; - tsk->thread.sysenter_cs = __KERNEL_CS; - load_sp0(tss, &tsk->thread); -+ refresh_sysenter_cs(&tsk->thread); - vm86->saved_sp0 = 0; - put_cpu(); - -@@ -368,8 +370,10 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) - /* make room for real-mode segments */ - tsk->thread.sp0 += 16; - -- if (static_cpu_has(X86_FEATURE_SEP)) -+ if (static_cpu_has(X86_FEATURE_SEP)) { - tsk->thread.sysenter_cs = 0; -+ refresh_sysenter_cs(&tsk->thread); -+ } - - load_sp0(tss, &tsk->thread); - put_cpu(); --- -2.14.2 - diff --git a/patches/kernel/0099-x86-entry-64-De-Xen-ify-our-NMI-code.patch b/patches/kernel/0099-x86-entry-64-De-Xen-ify-our-NMI-code.patch new file mode 100644 index 0000000..0a2d534 --- /dev/null +++ b/patches/kernel/0099-x86-entry-64-De-Xen-ify-our-NMI-code.patch @@ -0,0 +1,117 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Thu, 2 Nov 2017 00:59:08 -0700 +Subject: [PATCH] x86/entry/64: De-Xen-ify our NMI code +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Xen PV is fundamentally incompatible with our fancy NMI code: it +doesn't use IST at all, and Xen entries clobber two stack slots +below the hardware frame. + +Drop Xen PV support from our NMI code entirely. 
+ +Signed-off-by: Andy Lutomirski +Reviewed-by: Borislav Petkov +Acked-by: Juergen Gross +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/bfbe711b5ae03f672f8848999a8eb2711efc7f98.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit 929bacec21478a72c78e4f29f98fb799bd00105a) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit ffc372909c1701c4fdd2bde7861692573ef381a7) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/entry/entry_64.S | 30 ++++++++++++++++++------------ + 1 file changed, 18 insertions(+), 12 deletions(-) + +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index 5a6aba7cf3bd..05501c781c20 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -1253,9 +1253,13 @@ ENTRY(error_exit) + jmp retint_user + END(error_exit) + +-/* Runs on exception stack */ ++/* ++ * Runs on exception stack. Xen PV does not go through this path at all, ++ * so we can use real assembly here. ++ */ + ENTRY(nmi) + UNWIND_HINT_IRET_REGS ++ + /* + * We allow breakpoints in NMIs. If a breakpoint occurs, then + * the iretq it performs will take us out of NMI context. +@@ -1313,7 +1317,7 @@ ENTRY(nmi) + * stacks lest we corrupt the "NMI executing" variable. + */ + +- SWAPGS_UNSAFE_STACK ++ swapgs + cld + movq %rsp, %rdx + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp +@@ -1478,7 +1482,7 @@ nested_nmi_out: + popq %rdx + + /* We are returning to kernel mode, so this cannot result in a fault. */ +- INTERRUPT_RETURN ++ iretq + + first_nmi: + /* Restore rdx. 
*/ +@@ -1509,7 +1513,7 @@ first_nmi: + pushfq /* RFLAGS */ + pushq $__KERNEL_CS /* CS */ + pushq $1f /* RIP */ +- INTERRUPT_RETURN /* continues at repeat_nmi below */ ++ iretq /* continues at repeat_nmi below */ + UNWIND_HINT_IRET_REGS + 1: + #endif +@@ -1584,20 +1588,22 @@ nmi_restore: + /* + * Clear "NMI executing". Set DF first so that we can easily + * distinguish the remaining code between here and IRET from +- * the SYSCALL entry and exit paths. On a native kernel, we +- * could just inspect RIP, but, on paravirt kernels, +- * INTERRUPT_RETURN can translate into a jump into a +- * hypercall page. ++ * the SYSCALL entry and exit paths. ++ * ++ * We arguably should just inspect RIP instead, but I (Andy) wrote ++ * this code when I had the misapprehension that Xen PV supported ++ * NMIs, and Xen PV would break that approach. + */ + std + movq $0, 5*8(%rsp) /* clear "NMI executing" */ + + /* +- * INTERRUPT_RETURN reads the "iret" frame and exits the NMI +- * stack in a single instruction. We are returning to kernel +- * mode, so this cannot result in a fault. ++ * iretq reads the "iret" frame and exits the NMI stack in a ++ * single instruction. We are returning to kernel mode, so this ++ * cannot result in a fault. Similarly, we don't need to worry ++ * about espfix64 on the way back to kernel mode. 
+ */ +- INTERRUPT_RETURN ++ iretq + END(nmi) + + ENTRY(ignore_sysret) +-- +2.14.2 + diff --git a/patches/kernel/0100-x86-entry-32-Pull-the-MSR_IA32_SYSENTER_CS-update-co.patch b/patches/kernel/0100-x86-entry-32-Pull-the-MSR_IA32_SYSENTER_CS-update-co.patch new file mode 100644 index 0000000..05daea1 --- /dev/null +++ b/patches/kernel/0100-x86-entry-32-Pull-the-MSR_IA32_SYSENTER_CS-update-co.patch @@ -0,0 +1,145 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Thu, 2 Nov 2017 00:59:09 -0700 +Subject: [PATCH] x86/entry/32: Pull the MSR_IA32_SYSENTER_CS update code out + of native_load_sp0() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +This causes the MSR_IA32_SYSENTER_CS write to move out of the +paravirt callback. This shouldn't affect Xen PV: Xen already ignores +MSR_IA32_SYSENTER_ESP writes. In any event, Xen doesn't support +vm86() in a useful way. + +Note to any potential backporters: This patch won't break lguest, as +lguest didn't have any SYSENTER support at all. 
+ +Signed-off-by: Andy Lutomirski +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/75cf09fe03ae778532d0ca6c65aa58e66bc2f90c.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit bd7dc5a6afac719d8ce4092391eef2c7e83c2a75) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 779e32d0da9a547f3b11fbecac8287e458ba67f5) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/processor.h | 7 ------- + arch/x86/include/asm/switch_to.h | 12 ++++++++++++ + arch/x86/kernel/process_32.c | 4 +++- + arch/x86/kernel/process_64.c | 2 +- + arch/x86/kernel/vm86_32.c | 6 +++++- + 5 files changed, 21 insertions(+), 10 deletions(-) + +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index 028245e1c42b..ee37fb86900a 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -513,13 +513,6 @@ static inline void + native_load_sp0(struct tss_struct *tss, struct thread_struct *thread) + { + tss->x86_tss.sp0 = thread->sp0; +-#ifdef CONFIG_X86_32 +- /* Only happens when SEP is enabled, no need to test "SEP"arately: */ +- if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) { +- tss->x86_tss.ss1 = thread->sysenter_cs; +- wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); +- } +-#endif + } + + static inline void native_swapgs(void) +diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h +index fcc5cd387fd1..7ae8caffbada 100644 +--- a/arch/x86/include/asm/switch_to.h ++++ b/arch/x86/include/asm/switch_to.h +@@ -72,4 +72,16 @@ do { \ + ((last) = __switch_to_asm((prev), (next))); \ + } while (0) + ++#ifdef CONFIG_X86_32 ++static inline void refresh_sysenter_cs(struct thread_struct *thread) ++{ ++ /* Only happens when SEP is enabled, no need to test "SEP"arately: */ ++ if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == 
thread->sysenter_cs)) ++ return; ++ ++ this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs); ++ wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); ++} ++#endif ++ + #endif /* _ASM_X86_SWITCH_TO_H */ +diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c +index 22802162eeb9..2e42b66b8ca4 100644 +--- a/arch/x86/kernel/process_32.c ++++ b/arch/x86/kernel/process_32.c +@@ -284,9 +284,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) + + /* + * Reload esp0 and cpu_current_top_of_stack. This changes +- * current_thread_info(). ++ * current_thread_info(). Refresh the SYSENTER configuration in ++ * case prev or next is vm86. + */ + load_sp0(tss, next); ++ refresh_sysenter_cs(next); + this_cpu_write(cpu_current_top_of_stack, + (unsigned long)task_stack_page(next_p) + + THREAD_SIZE); +diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c +index 1e7701c4cd80..565daaa6f18d 100644 +--- a/arch/x86/kernel/process_64.c ++++ b/arch/x86/kernel/process_64.c +@@ -465,7 +465,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) + */ + this_cpu_write(current_task, next_p); + +- /* Reload esp0 and ss1. This changes current_thread_info(). */ ++ /* Reload sp0. 
*/ + load_sp0(tss, next); + + /* +diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c +index 7924a5356c8a..5bc1c3ab6287 100644 +--- a/arch/x86/kernel/vm86_32.c ++++ b/arch/x86/kernel/vm86_32.c +@@ -54,6 +54,7 @@ + #include + #include + #include ++#include + + /* + * Known problems: +@@ -149,6 +150,7 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) + tsk->thread.sp0 = vm86->saved_sp0; + tsk->thread.sysenter_cs = __KERNEL_CS; + load_sp0(tss, &tsk->thread); ++ refresh_sysenter_cs(&tsk->thread); + vm86->saved_sp0 = 0; + put_cpu(); + +@@ -368,8 +370,10 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) + /* make room for real-mode segments */ + tsk->thread.sp0 += 16; + +- if (static_cpu_has(X86_FEATURE_SEP)) ++ if (static_cpu_has(X86_FEATURE_SEP)) { + tsk->thread.sysenter_cs = 0; ++ refresh_sysenter_cs(&tsk->thread); ++ } + + load_sp0(tss, &tsk->thread); + put_cpu(); +-- +2.14.2 + diff --git a/patches/kernel/0100-x86-entry-64-Pass-SP0-directly-to-load_sp0.patch b/patches/kernel/0100-x86-entry-64-Pass-SP0-directly-to-load_sp0.patch deleted file mode 100644 index 4b94cba..0000000 --- a/patches/kernel/0100-x86-entry-64-Pass-SP0-directly-to-load_sp0.patch +++ /dev/null @@ -1,238 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Thu, 2 Nov 2017 00:59:10 -0700 -Subject: [PATCH] x86/entry/64: Pass SP0 directly to load_sp0() -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -load_sp0() had an odd signature: - - void load_sp0(struct tss_struct *tss, struct thread_struct *thread); - -Simplify it to: - - void load_sp0(unsigned long sp0); - -Also simplify a few get_cpu()/put_cpu() sequences to -preempt_disable()/preempt_enable(). 
- -Signed-off-by: Andy Lutomirski -Reviewed-by: Borislav Petkov -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/2655d8b42ed940aa384fe18ee1129bbbcf730a08.1509609304.git.luto@kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit da51da189a24bb9b7e2d5a123be096e51a4695a5) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 41f6a89b0be4d052a6af59df5e56102d4e4c79ef) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/paravirt.h | 5 ++--- - arch/x86/include/asm/paravirt_types.h | 2 +- - arch/x86/include/asm/processor.h | 9 ++++----- - arch/x86/kernel/cpu/common.c | 4 ++-- - arch/x86/kernel/process_32.c | 2 +- - arch/x86/kernel/process_64.c | 2 +- - arch/x86/kernel/vm86_32.c | 14 ++++++-------- - arch/x86/xen/enlighten_pv.c | 7 +++---- - 8 files changed, 20 insertions(+), 25 deletions(-) - -diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h -index 12deec722cf0..43d4f90edebc 100644 ---- a/arch/x86/include/asm/paravirt.h -+++ b/arch/x86/include/asm/paravirt.h -@@ -15,10 +15,9 @@ - #include - #include - --static inline void load_sp0(struct tss_struct *tss, -- struct thread_struct *thread) -+static inline void load_sp0(unsigned long sp0) - { -- PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread); -+ PVOP_VCALL1(pv_cpu_ops.load_sp0, sp0); - } - - /* The paravirtualized CPUID instruction. 
*/ -diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h -index 42873edd9f9d..e3953a1e2b57 100644 ---- a/arch/x86/include/asm/paravirt_types.h -+++ b/arch/x86/include/asm/paravirt_types.h -@@ -133,7 +133,7 @@ struct pv_cpu_ops { - void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries); - void (*free_ldt)(struct desc_struct *ldt, unsigned entries); - -- void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t); -+ void (*load_sp0)(unsigned long sp0); - - void (*set_iopl_mask)(unsigned mask); - -diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h -index ee37fb86900a..85ddfc1a9bb5 100644 ---- a/arch/x86/include/asm/processor.h -+++ b/arch/x86/include/asm/processor.h -@@ -510,9 +510,9 @@ static inline void native_set_iopl_mask(unsigned mask) - } - - static inline void --native_load_sp0(struct tss_struct *tss, struct thread_struct *thread) -+native_load_sp0(unsigned long sp0) - { -- tss->x86_tss.sp0 = thread->sp0; -+ this_cpu_write(cpu_tss.x86_tss.sp0, sp0); - } - - static inline void native_swapgs(void) -@@ -537,10 +537,9 @@ static inline unsigned long current_top_of_stack(void) - #else - #define __cpuid native_cpuid - --static inline void load_sp0(struct tss_struct *tss, -- struct thread_struct *thread) -+static inline void load_sp0(unsigned long sp0) - { -- native_load_sp0(tss, thread); -+ native_load_sp0(sp0); - } - - #define set_iopl_mask native_set_iopl_mask -diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c -index ef7b1ba56363..6562acbfc4e0 100644 ---- a/arch/x86/kernel/cpu/common.c -+++ b/arch/x86/kernel/cpu/common.c -@@ -1570,7 +1570,7 @@ void cpu_init(void) - BUG_ON(me->mm); - enter_lazy_tlb(&init_mm, me); - -- load_sp0(t, &current->thread); -+ load_sp0(current->thread.sp0); - set_tss_desc(cpu, t); - load_TR_desc(); - load_mm_ldt(&init_mm); -@@ -1624,7 +1624,7 @@ void cpu_init(void) - BUG_ON(curr->mm); - enter_lazy_tlb(&init_mm, curr); - -- load_sp0(t, thread); -+
load_sp0(thread->sp0); - set_tss_desc(cpu, t); - load_TR_desc(); - load_mm_ldt(&init_mm); -diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c -index 2e42b66b8ca4..48a3f240f565 100644 ---- a/arch/x86/kernel/process_32.c -+++ b/arch/x86/kernel/process_32.c -@@ -287,7 +287,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) - * current_thread_info(). Refresh the SYSENTER configuration in - * case prev or next is vm86. - */ -- load_sp0(tss, next); -+ load_sp0(next->sp0); - refresh_sysenter_cs(next); - this_cpu_write(cpu_current_top_of_stack, - (unsigned long)task_stack_page(next_p) + -diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c -index 565daaa6f18d..37b933628a8b 100644 ---- a/arch/x86/kernel/process_64.c -+++ b/arch/x86/kernel/process_64.c -@@ -466,7 +466,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) - this_cpu_write(current_task, next_p); - - /* Reload sp0. */ -- load_sp0(tss, next); -+ load_sp0(next->sp0); - - /* - * Now maybe reload the debug registers and handle I/O bitmaps -diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c -index 5bc1c3ab6287..0f1d92cd20ad 100644 ---- a/arch/x86/kernel/vm86_32.c -+++ b/arch/x86/kernel/vm86_32.c -@@ -94,7 +94,6 @@ - - void save_v86_state(struct kernel_vm86_regs *regs, int retval) - { -- struct tss_struct *tss; - struct task_struct *tsk = current; - struct vm86plus_struct __user *user; - struct vm86 *vm86 = current->thread.vm86; -@@ -146,13 +145,13 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) - do_exit(SIGSEGV); - } - -- tss = &per_cpu(cpu_tss, get_cpu()); -+ preempt_disable(); - tsk->thread.sp0 = vm86->saved_sp0; - tsk->thread.sysenter_cs = __KERNEL_CS; -- load_sp0(tss, &tsk->thread); -+ load_sp0(tsk->thread.sp0); - refresh_sysenter_cs(&tsk->thread); - vm86->saved_sp0 = 0; -- put_cpu(); -+ preempt_enable(); - - memcpy(&regs->pt, &vm86->regs32, sizeof(struct pt_regs)); - -@@ -238,7 +237,6 @@
SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg) - - static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) - { -- struct tss_struct *tss; - struct task_struct *tsk = current; - struct vm86 *vm86 = tsk->thread.vm86; - struct kernel_vm86_regs vm86regs; -@@ -366,8 +364,8 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) - vm86->saved_sp0 = tsk->thread.sp0; - lazy_save_gs(vm86->regs32.gs); - -- tss = &per_cpu(cpu_tss, get_cpu()); - /* make room for real-mode segments */ -+ preempt_disable(); - tsk->thread.sp0 += 16; - - if (static_cpu_has(X86_FEATURE_SEP)) { -@@ -375,8 +373,8 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) - refresh_sysenter_cs(&tsk->thread); - } - -- load_sp0(tss, &tsk->thread); -- put_cpu(); -+ load_sp0(tsk->thread.sp0); -+ preempt_enable(); - - if (vm86->flags & VM86_SCREEN_BITMAP) - mark_screen_rdonly(tsk->mm); -diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c -index 8da4eff19c2a..e7b213047724 100644 ---- a/arch/x86/xen/enlighten_pv.c -+++ b/arch/x86/xen/enlighten_pv.c -@@ -810,15 +810,14 @@ static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry, - } - } - --static void xen_load_sp0(struct tss_struct *tss, -- struct thread_struct *thread) -+static void xen_load_sp0(unsigned long sp0) - { - struct multicall_space mcs; - - mcs = xen_mc_entry(0); -- MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); -+ MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0); - xen_mc_issue(PARAVIRT_LAZY_CPU); -- tss->x86_tss.sp0 = thread->sp0; -+ this_cpu_write(cpu_tss.x86_tss.sp0, sp0); - } - - void xen_set_iopl_mask(unsigned mask) --- -2.14.2 - diff --git a/patches/kernel/0101-x86-entry-64-Pass-SP0-directly-to-load_sp0.patch b/patches/kernel/0101-x86-entry-64-Pass-SP0-directly-to-load_sp0.patch new file mode 100644 index 0000000..4b94cba --- /dev/null +++ b/patches/kernel/0101-x86-entry-64-Pass-SP0-directly-to-load_sp0.patch @@ -0,0 
+1,238 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Thu, 2 Nov 2017 00:59:10 -0700 +Subject: [PATCH] x86/entry/64: Pass SP0 directly to load_sp0() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +load_sp0() had an odd signature: + + void load_sp0(struct tss_struct *tss, struct thread_struct *thread); + +Simplify it to: + + void load_sp0(unsigned long sp0); + +Also simplify a few get_cpu()/put_cpu() sequences to +preempt_disable()/preempt_enable(). + +Signed-off-by: Andy Lutomirski +Reviewed-by: Borislav Petkov +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/2655d8b42ed940aa384fe18ee1129bbbcf730a08.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit da51da189a24bb9b7e2d5a123be096e51a4695a5) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 41f6a89b0be4d052a6af59df5e56102d4e4c79ef) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/paravirt.h | 5 ++--- + arch/x86/include/asm/paravirt_types.h | 2 +- + arch/x86/include/asm/processor.h | 9 ++++----- + arch/x86/kernel/cpu/common.c | 4 ++-- + arch/x86/kernel/process_32.c | 2 +- + arch/x86/kernel/process_64.c | 2 +- + arch/x86/kernel/vm86_32.c | 14 ++++++-------- + arch/x86/xen/enlighten_pv.c | 7 +++---- + 8 files changed, 20 insertions(+), 25 deletions(-) + +diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h +index 12deec722cf0..43d4f90edebc 100644 +--- a/arch/x86/include/asm/paravirt.h ++++ b/arch/x86/include/asm/paravirt.h +@@ -15,10 +15,9 @@ + #include + #include + +-static inline void load_sp0(struct tss_struct *tss, +- struct thread_struct *thread) ++static inline void load_sp0(unsigned long sp0) + { +- PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread); ++ 
PVOP_VCALL1(pv_cpu_ops.load_sp0, sp0); + } + + /* The paravirtualized CPUID instruction. */ +diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h +index 42873edd9f9d..e3953a1e2b57 100644 +--- a/arch/x86/include/asm/paravirt_types.h ++++ b/arch/x86/include/asm/paravirt_types.h +@@ -133,7 +133,7 @@ struct pv_cpu_ops { + void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries); + void (*free_ldt)(struct desc_struct *ldt, unsigned entries); + +- void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t); ++ void (*load_sp0)(unsigned long sp0); + + void (*set_iopl_mask)(unsigned mask); + +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index ee37fb86900a..85ddfc1a9bb5 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -510,9 +510,9 @@ static inline void native_set_iopl_mask(unsigned mask) + } + + static inline void +-native_load_sp0(struct tss_struct *tss, struct thread_struct *thread) ++native_load_sp0(unsigned long sp0) + { +- tss->x86_tss.sp0 = thread->sp0; ++ this_cpu_write(cpu_tss.x86_tss.sp0, sp0); + } + + static inline void native_swapgs(void) +@@ -537,10 +537,9 @@ static inline unsigned long current_top_of_stack(void) + #else + #define __cpuid native_cpuid + +-static inline void load_sp0(struct tss_struct *tss, +- struct thread_struct *thread) ++static inline void load_sp0(unsigned long sp0) + { +- native_load_sp0(tss, thread); ++ native_load_sp0(sp0); + } + + #define set_iopl_mask native_set_iopl_mask +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index ef7b1ba56363..6562acbfc4e0 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -1570,7 +1570,7 @@ void cpu_init(void) + BUG_ON(me->mm); + enter_lazy_tlb(&init_mm, me); + +- load_sp0(t, &current->thread); ++ load_sp0(current->thread.sp0); + set_tss_desc(cpu, t); + load_TR_desc(); + load_mm_ldt(&init_mm); +@@ -1624,7 +1624,7 @@ void
cpu_init(void) + BUG_ON(curr->mm); + enter_lazy_tlb(&init_mm, curr); + +- load_sp0(t, thread); ++ load_sp0(thread->sp0); + set_tss_desc(cpu, t); + load_TR_desc(); + load_mm_ldt(&init_mm); +diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c +index 2e42b66b8ca4..48a3f240f565 100644 +--- a/arch/x86/kernel/process_32.c ++++ b/arch/x86/kernel/process_32.c +@@ -287,7 +287,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) + * current_thread_info(). Refresh the SYSENTER configuration in + * case prev or next is vm86. + */ +- load_sp0(tss, next); ++ load_sp0(next->sp0); + refresh_sysenter_cs(next); + this_cpu_write(cpu_current_top_of_stack, + (unsigned long)task_stack_page(next_p) + +diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c +index 565daaa6f18d..37b933628a8b 100644 +--- a/arch/x86/kernel/process_64.c ++++ b/arch/x86/kernel/process_64.c +@@ -466,7 +466,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) + this_cpu_write(current_task, next_p); + + /* Reload sp0. 
*/ +- load_sp0(tss, next); ++ load_sp0(next->sp0); + + /* + * Now maybe reload the debug registers and handle I/O bitmaps +diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c +index 5bc1c3ab6287..0f1d92cd20ad 100644 +--- a/arch/x86/kernel/vm86_32.c ++++ b/arch/x86/kernel/vm86_32.c +@@ -94,7 +94,6 @@ + + void save_v86_state(struct kernel_vm86_regs *regs, int retval) + { +- struct tss_struct *tss; + struct task_struct *tsk = current; + struct vm86plus_struct __user *user; + struct vm86 *vm86 = current->thread.vm86; +@@ -146,13 +145,13 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) + do_exit(SIGSEGV); + } + +- tss = &per_cpu(cpu_tss, get_cpu()); ++ preempt_disable(); + tsk->thread.sp0 = vm86->saved_sp0; + tsk->thread.sysenter_cs = __KERNEL_CS; +- load_sp0(tss, &tsk->thread); ++ load_sp0(tsk->thread.sp0); + refresh_sysenter_cs(&tsk->thread); + vm86->saved_sp0 = 0; +- put_cpu(); ++ preempt_enable(); + + memcpy(&regs->pt, &vm86->regs32, sizeof(struct pt_regs)); + +@@ -238,7 +237,6 @@ SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg) + + static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) + { +- struct tss_struct *tss; + struct task_struct *tsk = current; + struct vm86 *vm86 = tsk->thread.vm86; + struct kernel_vm86_regs vm86regs; +@@ -366,8 +364,8 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) + vm86->saved_sp0 = tsk->thread.sp0; + lazy_save_gs(vm86->regs32.gs); + +- tss = &per_cpu(cpu_tss, get_cpu()); + /* make room for real-mode segments */ ++ preempt_disable(); + tsk->thread.sp0 += 16; + + if (static_cpu_has(X86_FEATURE_SEP)) { +@@ -375,8 +373,8 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) + refresh_sysenter_cs(&tsk->thread); + } + +- load_sp0(tss, &tsk->thread); +- put_cpu(); ++ load_sp0(tsk->thread.sp0); ++ preempt_enable(); + + if (vm86->flags & VM86_SCREEN_BITMAP) + mark_screen_rdonly(tsk->mm); +diff --git
a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c +index 8da4eff19c2a..e7b213047724 100644 +--- a/arch/x86/xen/enlighten_pv.c ++++ b/arch/x86/xen/enlighten_pv.c +@@ -810,15 +810,14 @@ static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry, + } + } + +-static void xen_load_sp0(struct tss_struct *tss, +- struct thread_struct *thread) ++static void xen_load_sp0(unsigned long sp0) + { + struct multicall_space mcs; + + mcs = xen_mc_entry(0); +- MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); ++ MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0); + xen_mc_issue(PARAVIRT_LAZY_CPU); +- tss->x86_tss.sp0 = thread->sp0; ++ this_cpu_write(cpu_tss.x86_tss.sp0, sp0); + } + + void xen_set_iopl_mask(unsigned mask) +-- +2.14.2 + diff --git a/patches/kernel/0101-x86-entry-Add-task_top_of_stack-to-find-the-top-of-a.patch b/patches/kernel/0101-x86-entry-Add-task_top_of_stack-to-find-the-top-of-a.patch deleted file mode 100644 index 1708df2..0000000 --- a/patches/kernel/0101-x86-entry-Add-task_top_of_stack-to-find-the-top-of-a.patch +++ /dev/null @@ -1,48 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Thu, 2 Nov 2017 00:59:11 -0700 -Subject: [PATCH] x86/entry: Add task_top_of_stack() to find the top of a - task's stack -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -This will let us get rid of a few places that hardcode accesses to -thread.sp0. 
- -Signed-off-by: Andy Lutomirski -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/b49b3f95a8ff858c40c9b0f5b32be0355324327d.1509609304.git.luto@kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit 3500130b84a3cdc5b6796eba1daf178944935efe) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit f1078e10e361afaeb22ee72c54d5ad397e19728d) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/processor.h | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h -index 85ddfc1a9bb5..f83fbf1b6dd9 100644 ---- a/arch/x86/include/asm/processor.h -+++ b/arch/x86/include/asm/processor.h -@@ -788,6 +788,8 @@ static inline void spin_lock_prefetch(const void *x) - #define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \ - TOP_OF_KERNEL_STACK_PADDING) - -+#define task_top_of_stack(task) ((unsigned long)(task_pt_regs(task) + 1)) -+ - #ifdef CONFIG_X86_32 - /* - * User space process size: 3GB (default). --- -2.14.2 - diff --git a/patches/kernel/0102-x86-entry-Add-task_top_of_stack-to-find-the-top-of-a.patch b/patches/kernel/0102-x86-entry-Add-task_top_of_stack-to-find-the-top-of-a.patch new file mode 100644 index 0000000..1708df2 --- /dev/null +++ b/patches/kernel/0102-x86-entry-Add-task_top_of_stack-to-find-the-top-of-a.patch @@ -0,0 +1,48 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Thu, 2 Nov 2017 00:59:11 -0700 +Subject: [PATCH] x86/entry: Add task_top_of_stack() to find the top of a + task's stack +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +This will let us get rid of a few places that hardcode accesses to +thread.sp0. 
+ +Signed-off-by: Andy Lutomirski +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/b49b3f95a8ff858c40c9b0f5b32be0355324327d.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit 3500130b84a3cdc5b6796eba1daf178944935efe) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit f1078e10e361afaeb22ee72c54d5ad397e19728d) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/processor.h | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index 85ddfc1a9bb5..f83fbf1b6dd9 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -788,6 +788,8 @@ static inline void spin_lock_prefetch(const void *x) + #define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \ + TOP_OF_KERNEL_STACK_PADDING) + ++#define task_top_of_stack(task) ((unsigned long)(task_pt_regs(task) + 1)) ++ + #ifdef CONFIG_X86_32 + /* + * User space process size: 3GB (default). +-- +2.14.2 + diff --git a/patches/kernel/0102-x86-xen-64-x86-entry-64-Clean-up-SP-code-in-cpu_init.patch b/patches/kernel/0102-x86-xen-64-x86-entry-64-Clean-up-SP-code-in-cpu_init.patch deleted file mode 100644 index 5c37994..0000000 --- a/patches/kernel/0102-x86-xen-64-x86-entry-64-Clean-up-SP-code-in-cpu_init.patch +++ /dev/null @@ -1,99 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Thu, 2 Nov 2017 00:59:12 -0700 -Subject: [PATCH] x86/xen/64, x86/entry/64: Clean up SP code in - cpu_initialize_context() -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -I'm removing thread_struct::sp0, and Xen's usage of it is slightly -dubious and unnecessary. Use appropriate helpers instead. 
- -While we're at at, reorder the code slightly to make it more obvious -what's going on. - -Signed-off-by: Andy Lutomirski -Reviewed-by: Juergen Gross -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/d5b9a3da2b47c68325bd2bbe8f82d9554dee0d0f.1509609304.git.luto@kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit f16b3da1dc936c0f8121741d0a1731bf242f2f56) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 27c60a1f6c49062151f67042458a523386cc3dc5) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/xen/smp_pv.c | 17 ++++++++++++++--- - 1 file changed, 14 insertions(+), 3 deletions(-) - -diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c -index 51471408fdd1..8c0e047d0b80 100644 ---- a/arch/x86/xen/smp_pv.c -+++ b/arch/x86/xen/smp_pv.c -@@ -13,6 +13,7 @@ - * single-threaded. - */ - #include -+#include - #include - #include - #include -@@ -293,12 +294,19 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) - #endif - memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); - -+ /* -+ * Bring up the CPU in cpu_bringup_and_idle() with the stack -+ * pointing just below where pt_regs would be if it were a normal -+ * kernel entry. 
-+ */ - ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; - ctxt->flags = VGCF_IN_KERNEL; - ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ - ctxt->user_regs.ds = __USER_DS; - ctxt->user_regs.es = __USER_DS; - ctxt->user_regs.ss = __KERNEL_DS; -+ ctxt->user_regs.cs = __KERNEL_CS; -+ ctxt->user_regs.esp = (unsigned long)task_pt_regs(idle); - - xen_copy_trap_info(ctxt->trap_ctxt); - -@@ -313,8 +321,13 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) - ctxt->gdt_frames[0] = gdt_mfn; - ctxt->gdt_ents = GDT_ENTRIES; - -+ /* -+ * Set SS:SP that Xen will use when entering guest kernel mode -+ * from guest user mode. Subsequent calls to load_sp0() can -+ * change this value. -+ */ - ctxt->kernel_ss = __KERNEL_DS; -- ctxt->kernel_sp = idle->thread.sp0; -+ ctxt->kernel_sp = task_top_of_stack(idle); - - #ifdef CONFIG_X86_32 - ctxt->event_callback_cs = __KERNEL_CS; -@@ -326,10 +339,8 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) - (unsigned long)xen_hypervisor_callback; - ctxt->failsafe_callback_eip = - (unsigned long)xen_failsafe_callback; -- ctxt->user_regs.cs = __KERNEL_CS; - per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); - -- ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); - ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir)); - if (HYPERVISOR_vcpu_op(VCPUOP_initialise, xen_vcpu_nr(cpu), ctxt)) - BUG(); --- -2.14.2 - diff --git a/patches/kernel/0103-x86-entry-64-Stop-initializing-TSS.sp0-at-boot.patch b/patches/kernel/0103-x86-entry-64-Stop-initializing-TSS.sp0-at-boot.patch deleted file mode 100644 index acf1fd2..0000000 --- a/patches/kernel/0103-x86-entry-64-Stop-initializing-TSS.sp0-at-boot.patch +++ /dev/null @@ -1,102 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Thu, 2 Nov 2017 00:59:13 -0700 -Subject: [PATCH] x86/entry/64: Stop initializing TSS.sp0 at boot -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 
-Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -In my quest to get rid of thread_struct::sp0, I want to clean up or -remove all of its readers. Two of them are in cpu_init() (32-bit and -64-bit), and they aren't needed. This is because we never enter -userspace at all on the threads that CPUs are initialized in. - -Poison the initial TSS.sp0 and stop initializing it on CPU init. - -The comment text mostly comes from Dave Hansen. Thanks! - -Signed-off-by: Andy Lutomirski -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/ee4a00540ad28c6cff475fbcc7769a4460acc861.1509609304.git.luto@kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit 20bb83443ea79087b5e5f8dab4e9d80bb9bf7acb) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 8c6b12e88bd87433087ea1f1cd5a9a4975e4623c) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/cpu/common.c | 13 ++++++++++--- - arch/x86/kernel/process.c | 8 +++++++- - 2 files changed, 17 insertions(+), 4 deletions(-) - -diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c -index 6562acbfc4e0..121fe3570d6f 100644 ---- a/arch/x86/kernel/cpu/common.c -+++ b/arch/x86/kernel/cpu/common.c -@@ -1570,9 +1570,13 @@ void cpu_init(void) - BUG_ON(me->mm); - enter_lazy_tlb(&init_mm, me); - -- load_sp0(current->thread.sp0); -+ /* -+ * Initialize the TSS. Don't bother initializing sp0, as the initial -+ * task never enters user mode. 
-+ */ - set_tss_desc(cpu, t); - load_TR_desc(); -+ - load_mm_ldt(&init_mm); - - clear_all_debug_regs(); -@@ -1594,7 +1598,6 @@ void cpu_init(void) - int cpu = smp_processor_id(); - struct task_struct *curr = current; - struct tss_struct *t = &per_cpu(cpu_tss, cpu); -- struct thread_struct *thread = &curr->thread; - - wait_for_master_cpu(cpu); - -@@ -1624,9 +1627,13 @@ void cpu_init(void) - BUG_ON(curr->mm); - enter_lazy_tlb(&init_mm, curr); - -- load_sp0(thread->sp0); -+ /* -+ * Initialize the TSS. Don't bother initializing sp0, as the initial -+ * task never enters user mode. -+ */ - set_tss_desc(cpu, t); - load_TR_desc(); -+ - load_mm_ldt(&init_mm); - - t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); -diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c -index 3ca198080ea9..ccf3a4f4ef68 100644 ---- a/arch/x86/kernel/process.c -+++ b/arch/x86/kernel/process.c -@@ -48,7 +48,13 @@ - */ - __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { - .x86_tss = { -- .sp0 = TOP_OF_INIT_STACK, -+ /* -+ * .sp0 is only used when entering ring 0 from a lower -+ * privilege level. Since the init task never runs anything -+ * but ring 0 code, there is no need for a valid value here. -+ * Poison it. 
-+ */ -+ .sp0 = (1UL << (BITS_PER_LONG-1)) + 1, - #ifdef CONFIG_X86_32 - .ss0 = __KERNEL_DS, - .ss1 = __KERNEL_CS, --- -2.14.2 - diff --git a/patches/kernel/0103-x86-xen-64-x86-entry-64-Clean-up-SP-code-in-cpu_init.patch b/patches/kernel/0103-x86-xen-64-x86-entry-64-Clean-up-SP-code-in-cpu_init.patch new file mode 100644 index 0000000..5c37994 --- /dev/null +++ b/patches/kernel/0103-x86-xen-64-x86-entry-64-Clean-up-SP-code-in-cpu_init.patch @@ -0,0 +1,99 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Thu, 2 Nov 2017 00:59:12 -0700 +Subject: [PATCH] x86/xen/64, x86/entry/64: Clean up SP code in + cpu_initialize_context() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +I'm removing thread_struct::sp0, and Xen's usage of it is slightly +dubious and unnecessary. Use appropriate helpers instead. + +While we're at at, reorder the code slightly to make it more obvious +what's going on. + +Signed-off-by: Andy Lutomirski +Reviewed-by: Juergen Gross +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/d5b9a3da2b47c68325bd2bbe8f82d9554dee0d0f.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit f16b3da1dc936c0f8121741d0a1731bf242f2f56) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 27c60a1f6c49062151f67042458a523386cc3dc5) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/xen/smp_pv.c | 17 ++++++++++++++--- + 1 file changed, 14 insertions(+), 3 deletions(-) + +diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c +index 51471408fdd1..8c0e047d0b80 100644 +--- a/arch/x86/xen/smp_pv.c ++++ b/arch/x86/xen/smp_pv.c +@@ -13,6 +13,7 @@ + * single-threaded. 
+ */ + #include ++#include + #include + #include + #include +@@ -293,12 +294,19 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) + #endif + memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); + ++ /* ++ * Bring up the CPU in cpu_bringup_and_idle() with the stack ++ * pointing just below where pt_regs would be if it were a normal ++ * kernel entry. ++ */ + ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; + ctxt->flags = VGCF_IN_KERNEL; + ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ + ctxt->user_regs.ds = __USER_DS; + ctxt->user_regs.es = __USER_DS; + ctxt->user_regs.ss = __KERNEL_DS; ++ ctxt->user_regs.cs = __KERNEL_CS; ++ ctxt->user_regs.esp = (unsigned long)task_pt_regs(idle); + + xen_copy_trap_info(ctxt->trap_ctxt); + +@@ -313,8 +321,13 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) + ctxt->gdt_frames[0] = gdt_mfn; + ctxt->gdt_ents = GDT_ENTRIES; + ++ /* ++ * Set SS:SP that Xen will use when entering guest kernel mode ++ * from guest user mode. Subsequent calls to load_sp0() can ++ * change this value. 
++ */ + ctxt->kernel_ss = __KERNEL_DS; +- ctxt->kernel_sp = idle->thread.sp0; ++ ctxt->kernel_sp = task_top_of_stack(idle); + + #ifdef CONFIG_X86_32 + ctxt->event_callback_cs = __KERNEL_CS; +@@ -326,10 +339,8 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) + (unsigned long)xen_hypervisor_callback; + ctxt->failsafe_callback_eip = + (unsigned long)xen_failsafe_callback; +- ctxt->user_regs.cs = __KERNEL_CS; + per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); + +- ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); + ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir)); + if (HYPERVISOR_vcpu_op(VCPUOP_initialise, xen_vcpu_nr(cpu), ctxt)) + BUG(); +-- +2.14.2 + diff --git a/patches/kernel/0104-x86-entry-64-Remove-all-remaining-direct-thread_stru.patch b/patches/kernel/0104-x86-entry-64-Remove-all-remaining-direct-thread_stru.patch deleted file mode 100644 index a9687e1..0000000 --- a/patches/kernel/0104-x86-entry-64-Remove-all-remaining-direct-thread_stru.patch +++ /dev/null @@ -1,103 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Thu, 2 Nov 2017 00:59:14 -0700 -Subject: [PATCH] x86/entry/64: Remove all remaining direct thread_struct::sp0 - reads -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -The only remaining readers in context switch code or vm86(), and -they all just want to update TSS.sp0 to match the current task. -Replace them all with a new helper update_sp0(). 
- -Signed-off-by: Andy Lutomirski -Reviewed-by: Borislav Petkov -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/2d231687f4ff288c9d9e98d7861b7df374246ac3.1509609304.git.luto@kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit 46f5a10a721ce8dce8cc8fe55279b49e1c6b3288) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit cc87284caa7d31d9d5a55c418eb5278cab6e2db1) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/switch_to.h | 6 ++++++ - arch/x86/kernel/process_32.c | 2 +- - arch/x86/kernel/process_64.c | 2 +- - arch/x86/kernel/vm86_32.c | 4 ++-- - 4 files changed, 10 insertions(+), 4 deletions(-) - -diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h -index 7ae8caffbada..54e64d909725 100644 ---- a/arch/x86/include/asm/switch_to.h -+++ b/arch/x86/include/asm/switch_to.h -@@ -84,4 +84,10 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread) - } - #endif - -+/* This is used when switching tasks or entering/exiting vm86 mode. */ -+static inline void update_sp0(struct task_struct *task) -+{ -+ load_sp0(task->thread.sp0); -+} -+ - #endif /* _ASM_X86_SWITCH_TO_H */ -diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c -index 48a3f240f565..c0d60420466c 100644 ---- a/arch/x86/kernel/process_32.c -+++ b/arch/x86/kernel/process_32.c -@@ -287,7 +287,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) - * current_thread_info(). Refresh the SYSENTER configuration in - * case prev or next is vm86. 
- */ -- load_sp0(next->sp0); -+ update_sp0(next_p); - refresh_sysenter_cs(next); - this_cpu_write(cpu_current_top_of_stack, - (unsigned long)task_stack_page(next_p) + -diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c -index 37b933628a8b..8a748e17bf6e 100644 ---- a/arch/x86/kernel/process_64.c -+++ b/arch/x86/kernel/process_64.c -@@ -466,7 +466,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) - this_cpu_write(current_task, next_p); - - /* Reload sp0. */ -- load_sp0(next->sp0); -+ update_sp0(next_p); - - /* - * Now maybe reload the debug registers and handle I/O bitmaps -diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c -index 0f1d92cd20ad..a7b44c75c642 100644 ---- a/arch/x86/kernel/vm86_32.c -+++ b/arch/x86/kernel/vm86_32.c -@@ -148,7 +148,7 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) - preempt_disable(); - tsk->thread.sp0 = vm86->saved_sp0; - tsk->thread.sysenter_cs = __KERNEL_CS; -- load_sp0(tsk->thread.sp0); -+ update_sp0(tsk); - refresh_sysenter_cs(&tsk->thread); - vm86->saved_sp0 = 0; - preempt_enable(); -@@ -373,7 +373,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) - refresh_sysenter_cs(&tsk->thread); - } - -- load_sp0(tsk->thread.sp0); -+ update_sp0(tsk); - preempt_enable(); - - if (vm86->flags & VM86_SCREEN_BITMAP) --- -2.14.2 - diff --git a/patches/kernel/0104-x86-entry-64-Stop-initializing-TSS.sp0-at-boot.patch b/patches/kernel/0104-x86-entry-64-Stop-initializing-TSS.sp0-at-boot.patch new file mode 100644 index 0000000..acf1fd2 --- /dev/null +++ b/patches/kernel/0104-x86-entry-64-Stop-initializing-TSS.sp0-at-boot.patch @@ -0,0 +1,102 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Thu, 2 Nov 2017 00:59:13 -0700 +Subject: [PATCH] x86/entry/64: Stop initializing TSS.sp0 at boot +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + 
+CVE-2017-5754 + +In my quest to get rid of thread_struct::sp0, I want to clean up or +remove all of its readers. Two of them are in cpu_init() (32-bit and +64-bit), and they aren't needed. This is because we never enter +userspace at all on the threads that CPUs are initialized in. + +Poison the initial TSS.sp0 and stop initializing it on CPU init. + +The comment text mostly comes from Dave Hansen. Thanks! + +Signed-off-by: Andy Lutomirski +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/ee4a00540ad28c6cff475fbcc7769a4460acc861.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit 20bb83443ea79087b5e5f8dab4e9d80bb9bf7acb) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 8c6b12e88bd87433087ea1f1cd5a9a4975e4623c) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/cpu/common.c | 13 ++++++++++--- + arch/x86/kernel/process.c | 8 +++++++- + 2 files changed, 17 insertions(+), 4 deletions(-) + +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 6562acbfc4e0..121fe3570d6f 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -1570,9 +1570,13 @@ void cpu_init(void) + BUG_ON(me->mm); + enter_lazy_tlb(&init_mm, me); + +- load_sp0(current->thread.sp0); ++ /* ++ * Initialize the TSS. Don't bother initializing sp0, as the initial ++ * task never enters user mode. 
++ */ + set_tss_desc(cpu, t); + load_TR_desc(); ++ + load_mm_ldt(&init_mm); + + clear_all_debug_regs(); +@@ -1594,7 +1598,6 @@ void cpu_init(void) + int cpu = smp_processor_id(); + struct task_struct *curr = current; + struct tss_struct *t = &per_cpu(cpu_tss, cpu); +- struct thread_struct *thread = &curr->thread; + + wait_for_master_cpu(cpu); + +@@ -1624,9 +1627,13 @@ void cpu_init(void) + BUG_ON(curr->mm); + enter_lazy_tlb(&init_mm, curr); + +- load_sp0(thread->sp0); ++ /* ++ * Initialize the TSS. Don't bother initializing sp0, as the initial ++ * task never enters user mode. ++ */ + set_tss_desc(cpu, t); + load_TR_desc(); ++ + load_mm_ldt(&init_mm); + + t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); +diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c +index 3ca198080ea9..ccf3a4f4ef68 100644 +--- a/arch/x86/kernel/process.c ++++ b/arch/x86/kernel/process.c +@@ -48,7 +48,13 @@ + */ + __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { + .x86_tss = { +- .sp0 = TOP_OF_INIT_STACK, ++ /* ++ * .sp0 is only used when entering ring 0 from a lower ++ * privilege level. Since the init task never runs anything ++ * but ring 0 code, there is no need for a valid value here. ++ * Poison it. 
++ */ ++ .sp0 = (1UL << (BITS_PER_LONG-1)) + 1, + #ifdef CONFIG_X86_32 + .ss0 = __KERNEL_DS, + .ss1 = __KERNEL_CS, +-- +2.14.2 + diff --git a/patches/kernel/0105-x86-entry-32-Fix-cpu_current_top_of_stack-initializa.patch b/patches/kernel/0105-x86-entry-32-Fix-cpu_current_top_of_stack-initializa.patch deleted file mode 100644 index 6e49d46..0000000 --- a/patches/kernel/0105-x86-entry-32-Fix-cpu_current_top_of_stack-initializa.patch +++ /dev/null @@ -1,51 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Thu, 2 Nov 2017 00:59:15 -0700 -Subject: [PATCH] x86/entry/32: Fix cpu_current_top_of_stack initialization at - boot -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -cpu_current_top_of_stack's initialization forgot about -TOP_OF_KERNEL_STACK_PADDING. This bug didn't matter because the -idle threads never enter user mode. - -Signed-off-by: Andy Lutomirski -Reviewed-by: Borislav Petkov -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/e5e370a7e6e4fddd1c4e4cf619765d96bb874b21.1509609304.git.luto@kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit cd493a6deb8b78eca280d05f7fa73fd69403ae29) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 258c98e7d4b8f1459772e656cd736c028a13add9) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/smpboot.c | 3 +-- - 1 file changed, 1 insertion(+), 2 deletions(-) - -diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c -index d05006f6c31c..8ea3b18cbdc1 100644 ---- a/arch/x86/kernel/smpboot.c -+++ b/arch/x86/kernel/smpboot.c -@@ -961,8 +961,7 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle) - #ifdef CONFIG_X86_32 - /* Stack for startup_32 can be just as for start_secondary onwards */ - irq_ctx_init(cpu); -- 
per_cpu(cpu_current_top_of_stack, cpu) = -- (unsigned long)task_stack_page(idle) + THREAD_SIZE; -+ per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle); - #else - initial_gs = per_cpu_offset(cpu); - #endif --- -2.14.2 - diff --git a/patches/kernel/0105-x86-entry-64-Remove-all-remaining-direct-thread_stru.patch b/patches/kernel/0105-x86-entry-64-Remove-all-remaining-direct-thread_stru.patch new file mode 100644 index 0000000..a9687e1 --- /dev/null +++ b/patches/kernel/0105-x86-entry-64-Remove-all-remaining-direct-thread_stru.patch @@ -0,0 +1,103 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Thu, 2 Nov 2017 00:59:14 -0700 +Subject: [PATCH] x86/entry/64: Remove all remaining direct thread_struct::sp0 + reads +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +The only remaining readers in context switch code or vm86(), and +they all just want to update TSS.sp0 to match the current task. +Replace them all with a new helper update_sp0(). 
+ +Signed-off-by: Andy Lutomirski +Reviewed-by: Borislav Petkov +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/2d231687f4ff288c9d9e98d7861b7df374246ac3.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit 46f5a10a721ce8dce8cc8fe55279b49e1c6b3288) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit cc87284caa7d31d9d5a55c418eb5278cab6e2db1) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/switch_to.h | 6 ++++++ + arch/x86/kernel/process_32.c | 2 +- + arch/x86/kernel/process_64.c | 2 +- + arch/x86/kernel/vm86_32.c | 4 ++-- + 4 files changed, 10 insertions(+), 4 deletions(-) + +diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h +index 7ae8caffbada..54e64d909725 100644 +--- a/arch/x86/include/asm/switch_to.h ++++ b/arch/x86/include/asm/switch_to.h +@@ -84,4 +84,10 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread) + } + #endif + ++/* This is used when switching tasks or entering/exiting vm86 mode. */ ++static inline void update_sp0(struct task_struct *task) ++{ ++ load_sp0(task->thread.sp0); ++} ++ + #endif /* _ASM_X86_SWITCH_TO_H */ +diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c +index 48a3f240f565..c0d60420466c 100644 +--- a/arch/x86/kernel/process_32.c ++++ b/arch/x86/kernel/process_32.c +@@ -287,7 +287,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) + * current_thread_info(). Refresh the SYSENTER configuration in + * case prev or next is vm86. 
+ */ +- load_sp0(next->sp0); ++ update_sp0(next_p); + refresh_sysenter_cs(next); + this_cpu_write(cpu_current_top_of_stack, + (unsigned long)task_stack_page(next_p) + +diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c +index 37b933628a8b..8a748e17bf6e 100644 +--- a/arch/x86/kernel/process_64.c ++++ b/arch/x86/kernel/process_64.c +@@ -466,7 +466,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) + this_cpu_write(current_task, next_p); + + /* Reload sp0. */ +- load_sp0(next->sp0); ++ update_sp0(next_p); + + /* + * Now maybe reload the debug registers and handle I/O bitmaps +diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c +index 0f1d92cd20ad..a7b44c75c642 100644 +--- a/arch/x86/kernel/vm86_32.c ++++ b/arch/x86/kernel/vm86_32.c +@@ -148,7 +148,7 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) + preempt_disable(); + tsk->thread.sp0 = vm86->saved_sp0; + tsk->thread.sysenter_cs = __KERNEL_CS; +- load_sp0(tsk->thread.sp0); ++ update_sp0(tsk); + refresh_sysenter_cs(&tsk->thread); + vm86->saved_sp0 = 0; + preempt_enable(); +@@ -373,7 +373,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) + refresh_sysenter_cs(&tsk->thread); + } + +- load_sp0(tsk->thread.sp0); ++ update_sp0(tsk); + preempt_enable(); + + if (vm86->flags & VM86_SCREEN_BITMAP) +-- +2.14.2 + diff --git a/patches/kernel/0106-x86-entry-32-Fix-cpu_current_top_of_stack-initializa.patch b/patches/kernel/0106-x86-entry-32-Fix-cpu_current_top_of_stack-initializa.patch new file mode 100644 index 0000000..6e49d46 --- /dev/null +++ b/patches/kernel/0106-x86-entry-32-Fix-cpu_current_top_of_stack-initializa.patch @@ -0,0 +1,51 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Thu, 2 Nov 2017 00:59:15 -0700 +Subject: [PATCH] x86/entry/32: Fix cpu_current_top_of_stack initialization at + boot +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 
+Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +cpu_current_top_of_stack's initialization forgot about +TOP_OF_KERNEL_STACK_PADDING. This bug didn't matter because the +idle threads never enter user mode. + +Signed-off-by: Andy Lutomirski +Reviewed-by: Borislav Petkov +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/e5e370a7e6e4fddd1c4e4cf619765d96bb874b21.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit cd493a6deb8b78eca280d05f7fa73fd69403ae29) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 258c98e7d4b8f1459772e656cd736c028a13add9) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/smpboot.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c +index d05006f6c31c..8ea3b18cbdc1 100644 +--- a/arch/x86/kernel/smpboot.c ++++ b/arch/x86/kernel/smpboot.c +@@ -961,8 +961,7 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle) + #ifdef CONFIG_X86_32 + /* Stack for startup_32 can be just as for start_secondary onwards */ + irq_ctx_init(cpu); +- per_cpu(cpu_current_top_of_stack, cpu) = +- (unsigned long)task_stack_page(idle) + THREAD_SIZE; ++ per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle); + #else + initial_gs = per_cpu_offset(cpu); + #endif +-- +2.14.2 + diff --git a/patches/kernel/0106-x86-entry-64-Remove-thread_struct-sp0.patch b/patches/kernel/0106-x86-entry-64-Remove-thread_struct-sp0.patch deleted file mode 100644 index 3f9fffb..0000000 --- a/patches/kernel/0106-x86-entry-64-Remove-thread_struct-sp0.patch +++ /dev/null @@ -1,154 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Thu, 2 Nov 2017 00:59:16 -0700 -Subject: [PATCH] x86/entry/64: Remove thread_struct::sp0 -MIME-Version: 1.0 -Content-Type: 
text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -On x86_64, we can easily calculate sp0 when needed instead of -storing it in thread_struct. - -On x86_32, a similar cleanup would be possible, but it would require -cleaning up the vm86 code first, and that can wait for a later -cleanup series. - -Signed-off-by: Andy Lutomirski -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/719cd9c66c548c4350d98a90f050aee8b17f8919.1509609304.git.luto@kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit d375cf1530595e33961a8844192cddab913650e3) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 4910af19c69a87e9432467f4d7cb78da5fbcc30a) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/compat.h | 1 + - arch/x86/include/asm/processor.h | 28 +++++++++------------------- - arch/x86/include/asm/switch_to.h | 6 ++++++ - arch/x86/kernel/process_64.c | 1 - - 4 files changed, 16 insertions(+), 20 deletions(-) - -diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h -index 5343c19814b3..948b6d8ec46f 100644 ---- a/arch/x86/include/asm/compat.h -+++ b/arch/x86/include/asm/compat.h -@@ -6,6 +6,7 @@ - */ - #include - #include -+#include - #include - #include - #include -diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h -index f83fbf1b6dd9..cec9a329c0f1 100644 ---- a/arch/x86/include/asm/processor.h -+++ b/arch/x86/include/asm/processor.h -@@ -423,7 +423,9 @@ typedef struct { - struct thread_struct { - /* Cached TLS descriptors: */ - struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; -+#ifdef CONFIG_X86_32 - unsigned long sp0; -+#endif - unsigned long sp; - #ifdef CONFIG_X86_32 - unsigned long sysenter_cs; -@@ -790,6 +792,13 @@ static inline void spin_lock_prefetch(const void *x) - - #define task_top_of_stack(task) ((unsigned 
long)(task_pt_regs(task) + 1)) - -+#define task_pt_regs(task) \ -+({ \ -+ unsigned long __ptr = (unsigned long)task_stack_page(task); \ -+ __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \ -+ ((struct pt_regs *)__ptr) - 1; \ -+}) -+ - #ifdef CONFIG_X86_32 - /* - * User space process size: 3GB (default). -@@ -807,23 +816,6 @@ static inline void spin_lock_prefetch(const void *x) - .addr_limit = KERNEL_DS, \ - } - --/* -- * TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack. -- * This is necessary to guarantee that the entire "struct pt_regs" -- * is accessible even if the CPU haven't stored the SS/ESP registers -- * on the stack (interrupt gate does not save these registers -- * when switching to the same priv ring). -- * Therefore beware: accessing the ss/esp fields of the -- * "struct pt_regs" is possible, but they may contain the -- * completely wrong values. -- */ --#define task_pt_regs(task) \ --({ \ -- unsigned long __ptr = (unsigned long)task_stack_page(task); \ -- __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \ -- ((struct pt_regs *)__ptr) - 1; \ --}) -- - #define KSTK_ESP(task) (task_pt_regs(task)->sp) - - #else -@@ -853,11 +845,9 @@ static inline void spin_lock_prefetch(const void *x) - #define STACK_TOP_MAX TASK_SIZE_MAX - - #define INIT_THREAD { \ -- .sp0 = TOP_OF_INIT_STACK, \ - .addr_limit = KERNEL_DS, \ - } - --#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) - extern unsigned long KSTK_ESP(struct task_struct *task); - - #endif /* CONFIG_X86_64 */ -diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h -index 54e64d909725..010cd6e4eafc 100644 ---- a/arch/x86/include/asm/switch_to.h -+++ b/arch/x86/include/asm/switch_to.h -@@ -1,6 +1,8 @@ - #ifndef _ASM_X86_SWITCH_TO_H - #define _ASM_X86_SWITCH_TO_H - -+#include -+ - struct task_struct; /* one of the stranger aspects of C forward declarations */ - - struct task_struct *__switch_to_asm(struct task_struct *prev, -@@ -87,7 +89,11 @@ 
static inline void refresh_sysenter_cs(struct thread_struct *thread) - /* This is used when switching tasks or entering/exiting vm86 mode. */ - static inline void update_sp0(struct task_struct *task) - { -+#ifdef CONFIG_X86_32 - load_sp0(task->thread.sp0); -+#else -+ load_sp0(task_top_of_stack(task)); -+#endif - } - - #endif /* _ASM_X86_SWITCH_TO_H */ -diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c -index 8a748e17bf6e..b08b9b6c40eb 100644 ---- a/arch/x86/kernel/process_64.c -+++ b/arch/x86/kernel/process_64.c -@@ -275,7 +275,6 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, - struct inactive_task_frame *frame; - struct task_struct *me = current; - -- p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE; - childregs = task_pt_regs(p); - fork_frame = container_of(childregs, struct fork_frame, regs); - frame = &fork_frame->frame; --- -2.14.2 - diff --git a/patches/kernel/0107-x86-entry-64-Remove-thread_struct-sp0.patch b/patches/kernel/0107-x86-entry-64-Remove-thread_struct-sp0.patch new file mode 100644 index 0000000..3f9fffb --- /dev/null +++ b/patches/kernel/0107-x86-entry-64-Remove-thread_struct-sp0.patch @@ -0,0 +1,154 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Thu, 2 Nov 2017 00:59:16 -0700 +Subject: [PATCH] x86/entry/64: Remove thread_struct::sp0 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +On x86_64, we can easily calculate sp0 when needed instead of +storing it in thread_struct. + +On x86_32, a similar cleanup would be possible, but it would require +cleaning up the vm86 code first, and that can wait for a later +cleanup series. 
+ +Signed-off-by: Andy Lutomirski +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/719cd9c66c548c4350d98a90f050aee8b17f8919.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit d375cf1530595e33961a8844192cddab913650e3) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 4910af19c69a87e9432467f4d7cb78da5fbcc30a) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/compat.h | 1 + + arch/x86/include/asm/processor.h | 28 +++++++++------------------- + arch/x86/include/asm/switch_to.h | 6 ++++++ + arch/x86/kernel/process_64.c | 1 - + 4 files changed, 16 insertions(+), 20 deletions(-) + +diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h +index 5343c19814b3..948b6d8ec46f 100644 +--- a/arch/x86/include/asm/compat.h ++++ b/arch/x86/include/asm/compat.h +@@ -6,6 +6,7 @@ + */ + #include + #include ++#include + #include + #include + #include +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index f83fbf1b6dd9..cec9a329c0f1 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -423,7 +423,9 @@ typedef struct { + struct thread_struct { + /* Cached TLS descriptors: */ + struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; ++#ifdef CONFIG_X86_32 + unsigned long sp0; ++#endif + unsigned long sp; + #ifdef CONFIG_X86_32 + unsigned long sysenter_cs; +@@ -790,6 +792,13 @@ static inline void spin_lock_prefetch(const void *x) + + #define task_top_of_stack(task) ((unsigned long)(task_pt_regs(task) + 1)) + ++#define task_pt_regs(task) \ ++({ \ ++ unsigned long __ptr = (unsigned long)task_stack_page(task); \ ++ __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \ ++ ((struct pt_regs *)__ptr) - 1; \ ++}) ++ + #ifdef CONFIG_X86_32 + /* + * User space process size: 3GB (default). 
+@@ -807,23 +816,6 @@ static inline void spin_lock_prefetch(const void *x) + .addr_limit = KERNEL_DS, \ + } + +-/* +- * TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack. +- * This is necessary to guarantee that the entire "struct pt_regs" +- * is accessible even if the CPU haven't stored the SS/ESP registers +- * on the stack (interrupt gate does not save these registers +- * when switching to the same priv ring). +- * Therefore beware: accessing the ss/esp fields of the +- * "struct pt_regs" is possible, but they may contain the +- * completely wrong values. +- */ +-#define task_pt_regs(task) \ +-({ \ +- unsigned long __ptr = (unsigned long)task_stack_page(task); \ +- __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \ +- ((struct pt_regs *)__ptr) - 1; \ +-}) +- + #define KSTK_ESP(task) (task_pt_regs(task)->sp) + + #else +@@ -853,11 +845,9 @@ static inline void spin_lock_prefetch(const void *x) + #define STACK_TOP_MAX TASK_SIZE_MAX + + #define INIT_THREAD { \ +- .sp0 = TOP_OF_INIT_STACK, \ + .addr_limit = KERNEL_DS, \ + } + +-#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) + extern unsigned long KSTK_ESP(struct task_struct *task); + + #endif /* CONFIG_X86_64 */ +diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h +index 54e64d909725..010cd6e4eafc 100644 +--- a/arch/x86/include/asm/switch_to.h ++++ b/arch/x86/include/asm/switch_to.h +@@ -1,6 +1,8 @@ + #ifndef _ASM_X86_SWITCH_TO_H + #define _ASM_X86_SWITCH_TO_H + ++#include ++ + struct task_struct; /* one of the stranger aspects of C forward declarations */ + + struct task_struct *__switch_to_asm(struct task_struct *prev, +@@ -87,7 +89,11 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread) + /* This is used when switching tasks or entering/exiting vm86 mode. 
*/ + static inline void update_sp0(struct task_struct *task) + { ++#ifdef CONFIG_X86_32 + load_sp0(task->thread.sp0); ++#else ++ load_sp0(task_top_of_stack(task)); ++#endif + } + + #endif /* _ASM_X86_SWITCH_TO_H */ +diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c +index 8a748e17bf6e..b08b9b6c40eb 100644 +--- a/arch/x86/kernel/process_64.c ++++ b/arch/x86/kernel/process_64.c +@@ -275,7 +275,6 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, + struct inactive_task_frame *frame; + struct task_struct *me = current; + +- p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE; + childregs = task_pt_regs(p); + fork_frame = container_of(childregs, struct fork_frame, regs); + frame = &fork_frame->frame; +-- +2.14.2 + diff --git a/patches/kernel/0107-x86-traps-Use-a-new-on_thread_stack-helper-to-clean-.patch b/patches/kernel/0107-x86-traps-Use-a-new-on_thread_stack-helper-to-clean-.patch deleted file mode 100644 index 4535109..0000000 --- a/patches/kernel/0107-x86-traps-Use-a-new-on_thread_stack-helper-to-clean-.patch +++ /dev/null @@ -1,118 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Thu, 2 Nov 2017 00:59:17 -0700 -Subject: [PATCH] x86/traps: Use a new on_thread_stack() helper to clean up an - assertion -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Let's keep the stack-related logic together rather than open-coding -a comparison in an assertion in the traps code. 
- -Signed-off-by: Andy Lutomirski -Reviewed-by: Borislav Petkov -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/856b15bee1f55017b8f79d3758b0d51c48a08cf8.1509609304.git.luto@kernel.org -Signed-off-by: Ingo Molnar -(backported from commit 3383642c2f9d4f5b4fa37436db4a109a1a10018c) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 243de7bd3434c50fb07dd0fc84c462236cfcba3e) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/processor.h | 8 ++++++++ - arch/x86/include/asm/thread_info.h | 22 +++++++++++----------- - arch/x86/kernel/traps.c | 3 +-- - 3 files changed, 20 insertions(+), 13 deletions(-) - -diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h -index cec9a329c0f1..79739e5f939a 100644 ---- a/arch/x86/include/asm/processor.h -+++ b/arch/x86/include/asm/processor.h -@@ -159,6 +159,8 @@ enum cpuid_regs_idx { - extern struct cpuinfo_x86 boot_cpu_data; - extern struct cpuinfo_x86 new_cpu_data; - -+#include -+ - extern struct tss_struct doublefault_tss; - extern __u32 cpu_caps_cleared[NCAPINTS]; - extern __u32 cpu_caps_set[NCAPINTS]; -@@ -534,6 +536,12 @@ static inline unsigned long current_top_of_stack(void) - #endif - } - -+static inline bool on_thread_stack(void) -+{ -+ return (unsigned long)(current_top_of_stack() - -+ current_stack_pointer()) < THREAD_SIZE; -+} -+ - #ifdef CONFIG_PARAVIRT - #include - #else -diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h -index e00e1bd6e7b3..ec8ef3bbb7dc 100644 ---- a/arch/x86/include/asm/thread_info.h -+++ b/arch/x86/include/asm/thread_info.h -@@ -48,6 +48,17 @@ - * - this struct shares the supervisor stack pages - */ - #ifndef __ASSEMBLY__ -+static inline unsigned long current_stack_pointer(void) -+{ -+ unsigned long sp; -+#ifdef CONFIG_X86_64 -+ asm("mov %%rsp,%0" : "=g" (sp)); -+#else -+ asm("mov 
%%esp,%0" : "=g" (sp)); -+#endif -+ return sp; -+} -+ - struct task_struct; - #include - #include -@@ -155,17 +166,6 @@ struct thread_info { - */ - #ifndef __ASSEMBLY__ - --static inline unsigned long current_stack_pointer(void) --{ -- unsigned long sp; --#ifdef CONFIG_X86_64 -- asm("mov %%rsp,%0" : "=g" (sp)); --#else -- asm("mov %%esp,%0" : "=g" (sp)); --#endif -- return sp; --} -- - /* - * Walks up the stack frames to make sure that the specified object is - * entirely contained by a single stack frame. -diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c -index b2157d4a5338..3a46cab2696e 100644 ---- a/arch/x86/kernel/traps.c -+++ b/arch/x86/kernel/traps.c -@@ -153,8 +153,7 @@ void ist_begin_non_atomic(struct pt_regs *regs) - * will catch asm bugs and any attempt to use ist_preempt_enable - * from double_fault. - */ -- BUG_ON((unsigned long)(current_top_of_stack() - -- current_stack_pointer()) >= THREAD_SIZE); -+ BUG_ON(!on_thread_stack()); - - preempt_enable_no_resched(); - } --- -2.14.2 - diff --git a/patches/kernel/0108-x86-entry-64-Shorten-TEST-instructions.patch b/patches/kernel/0108-x86-entry-64-Shorten-TEST-instructions.patch deleted file mode 100644 index 59a5157..0000000 --- a/patches/kernel/0108-x86-entry-64-Shorten-TEST-instructions.patch +++ /dev/null @@ -1,57 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Borislav Petkov -Date: Thu, 2 Nov 2017 13:09:26 +0100 -Subject: [PATCH] x86/entry/64: Shorten TEST instructions -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Convert TESTL to TESTB and save 3 bytes per callsite. - -No functionality change. 
- -Signed-off-by: Borislav Petkov -Cc: Andy Lutomirski -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/20171102120926.4srwerqrr7g72e2k@pd.tnic -Signed-off-by: Ingo Molnar -(cherry picked from commit 1e4c4f610f774df6088d7c065b2dd4d22adba698) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 2b5cfca36261d4ce45ebfdf2602d65201fa3c780) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/entry/entry_64.S | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S -index 05501c781c20..2491b3b25b9a 100644 ---- a/arch/x86/entry/entry_64.S -+++ b/arch/x86/entry/entry_64.S -@@ -620,7 +620,7 @@ GLOBAL(retint_user) - GLOBAL(swapgs_restore_regs_and_return_to_usermode) - #ifdef CONFIG_DEBUG_ENTRY - /* Assert that pt_regs indicates user mode. */ -- testl $3, CS(%rsp) -+ testb $3, CS(%rsp) - jnz 1f - ud2 - 1: -@@ -653,7 +653,7 @@ retint_kernel: - GLOBAL(restore_regs_and_return_to_kernel) - #ifdef CONFIG_DEBUG_ENTRY - /* Assert that pt_regs indicates kernel mode. 
*/ -- testl $3, CS(%rsp) -+ testb $3, CS(%rsp) - jz 1f - ud2 - 1: --- -2.14.2 - diff --git a/patches/kernel/0108-x86-traps-Use-a-new-on_thread_stack-helper-to-clean-.patch b/patches/kernel/0108-x86-traps-Use-a-new-on_thread_stack-helper-to-clean-.patch new file mode 100644 index 0000000..4535109 --- /dev/null +++ b/patches/kernel/0108-x86-traps-Use-a-new-on_thread_stack-helper-to-clean-.patch @@ -0,0 +1,118 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Thu, 2 Nov 2017 00:59:17 -0700 +Subject: [PATCH] x86/traps: Use a new on_thread_stack() helper to clean up an + assertion +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Let's keep the stack-related logic together rather than open-coding +a comparison in an assertion in the traps code. + +Signed-off-by: Andy Lutomirski +Reviewed-by: Borislav Petkov +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/856b15bee1f55017b8f79d3758b0d51c48a08cf8.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar +(backported from commit 3383642c2f9d4f5b4fa37436db4a109a1a10018c) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 243de7bd3434c50fb07dd0fc84c462236cfcba3e) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/processor.h | 8 ++++++++ + arch/x86/include/asm/thread_info.h | 22 +++++++++++----------- + arch/x86/kernel/traps.c | 3 +-- + 3 files changed, 20 insertions(+), 13 deletions(-) + +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index cec9a329c0f1..79739e5f939a 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -159,6 +159,8 @@ enum cpuid_regs_idx { + extern struct cpuinfo_x86 boot_cpu_data; + extern struct cpuinfo_x86 new_cpu_data; + ++#include ++ + extern struct 
tss_struct doublefault_tss; + extern __u32 cpu_caps_cleared[NCAPINTS]; + extern __u32 cpu_caps_set[NCAPINTS]; +@@ -534,6 +536,12 @@ static inline unsigned long current_top_of_stack(void) + #endif + } + ++static inline bool on_thread_stack(void) ++{ ++ return (unsigned long)(current_top_of_stack() - ++ current_stack_pointer()) < THREAD_SIZE; ++} ++ + #ifdef CONFIG_PARAVIRT + #include + #else +diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h +index e00e1bd6e7b3..ec8ef3bbb7dc 100644 +--- a/arch/x86/include/asm/thread_info.h ++++ b/arch/x86/include/asm/thread_info.h +@@ -48,6 +48,17 @@ + * - this struct shares the supervisor stack pages + */ + #ifndef __ASSEMBLY__ ++static inline unsigned long current_stack_pointer(void) ++{ ++ unsigned long sp; ++#ifdef CONFIG_X86_64 ++ asm("mov %%rsp,%0" : "=g" (sp)); ++#else ++ asm("mov %%esp,%0" : "=g" (sp)); ++#endif ++ return sp; ++} ++ + struct task_struct; + #include + #include +@@ -155,17 +166,6 @@ struct thread_info { + */ + #ifndef __ASSEMBLY__ + +-static inline unsigned long current_stack_pointer(void) +-{ +- unsigned long sp; +-#ifdef CONFIG_X86_64 +- asm("mov %%rsp,%0" : "=g" (sp)); +-#else +- asm("mov %%esp,%0" : "=g" (sp)); +-#endif +- return sp; +-} +- + /* + * Walks up the stack frames to make sure that the specified object is + * entirely contained by a single stack frame. +diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c +index b2157d4a5338..3a46cab2696e 100644 +--- a/arch/x86/kernel/traps.c ++++ b/arch/x86/kernel/traps.c +@@ -153,8 +153,7 @@ void ist_begin_non_atomic(struct pt_regs *regs) + * will catch asm bugs and any attempt to use ist_preempt_enable + * from double_fault. 
+ */ +- BUG_ON((unsigned long)(current_top_of_stack() - +- current_stack_pointer()) >= THREAD_SIZE); ++ BUG_ON(!on_thread_stack()); + + preempt_enable_no_resched(); + } +-- +2.14.2 + diff --git a/patches/kernel/0109-x86-cpuid-Replace-set-clear_bit32.patch b/patches/kernel/0109-x86-cpuid-Replace-set-clear_bit32.patch deleted file mode 100644 index 7e56665..0000000 --- a/patches/kernel/0109-x86-cpuid-Replace-set-clear_bit32.patch +++ /dev/null @@ -1,71 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Thu, 2 Nov 2017 13:22:35 +0100 -Subject: [PATCH] x86/cpuid: Replace set/clear_bit32() -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Peter pointed out that the set/clear_bit32() variants are broken in various -aspects. - -Replace them with open coded set/clear_bit() and type cast -cpu_info::x86_capability as it's done in all other places throughout x86. - -Fixes: 0b00de857a64 ("x86/cpuid: Add generic table for CPUID dependencies") -Reported-by: Peter Ziljstra -Signed-off-by: Thomas Gleixner -Cc: Andi Kleen -(cherry picked from commit 06dd688ddda5819025e014b79aea9af6ab475fa2) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 3e511952bc3ff9b233d418b0a75a8331deb08171) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/cpu/cpuid-deps.c | 26 +++++++++++--------------- - 1 file changed, 11 insertions(+), 15 deletions(-) - -diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c -index c21f22d836ad..904b0a3c4e53 100644 ---- a/arch/x86/kernel/cpu/cpuid-deps.c -+++ b/arch/x86/kernel/cpu/cpuid-deps.c -@@ -62,23 +62,19 @@ const static struct cpuid_dep cpuid_deps[] = { - {} - }; - --static inline void __clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit) --{ -- clear_bit32(bit, c->x86_capability); --} -- --static inline void __setup_clear_cpu_cap(unsigned int bit) --{ -- 
clear_cpu_cap(&boot_cpu_data, bit); -- set_bit32(bit, cpu_caps_cleared); --} -- - static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature) - { -- if (!c) -- __setup_clear_cpu_cap(feature); -- else -- __clear_cpu_cap(c, feature); -+ /* -+ * Note: This could use the non atomic __*_bit() variants, but the -+ * rest of the cpufeature code uses atomics as well, so keep it for -+ * consistency. Cleanup all of it separately. -+ */ -+ if (!c) { -+ clear_cpu_cap(&boot_cpu_data, feature); -+ set_bit(feature, (unsigned long *)cpu_caps_cleared); -+ } else { -+ clear_bit(feature, (unsigned long *)c->x86_capability); -+ } - } - - /* Take the capabilities and the BUG bits into account */ --- -2.14.2 - diff --git a/patches/kernel/0109-x86-entry-64-Shorten-TEST-instructions.patch b/patches/kernel/0109-x86-entry-64-Shorten-TEST-instructions.patch new file mode 100644 index 0000000..59a5157 --- /dev/null +++ b/patches/kernel/0109-x86-entry-64-Shorten-TEST-instructions.patch @@ -0,0 +1,57 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Borislav Petkov +Date: Thu, 2 Nov 2017 13:09:26 +0100 +Subject: [PATCH] x86/entry/64: Shorten TEST instructions +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Convert TESTL to TESTB and save 3 bytes per callsite. + +No functionality change. 
+ +Signed-off-by: Borislav Petkov +Cc: Andy Lutomirski +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/20171102120926.4srwerqrr7g72e2k@pd.tnic +Signed-off-by: Ingo Molnar +(cherry picked from commit 1e4c4f610f774df6088d7c065b2dd4d22adba698) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 2b5cfca36261d4ce45ebfdf2602d65201fa3c780) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/entry/entry_64.S | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index 05501c781c20..2491b3b25b9a 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -620,7 +620,7 @@ GLOBAL(retint_user) + GLOBAL(swapgs_restore_regs_and_return_to_usermode) + #ifdef CONFIG_DEBUG_ENTRY + /* Assert that pt_regs indicates user mode. */ +- testl $3, CS(%rsp) ++ testb $3, CS(%rsp) + jnz 1f + ud2 + 1: +@@ -653,7 +653,7 @@ retint_kernel: + GLOBAL(restore_regs_and_return_to_kernel) + #ifdef CONFIG_DEBUG_ENTRY + /* Assert that pt_regs indicates kernel mode. 
*/ +- testl $3, CS(%rsp) ++ testb $3, CS(%rsp) + jz 1f + ud2 + 1: +-- +2.14.2 + diff --git a/patches/kernel/0110-bitops-Revert-cbe96375025e-bitops-Add-clear-set_bit3.patch b/patches/kernel/0110-bitops-Revert-cbe96375025e-bitops-Add-clear-set_bit3.patch deleted file mode 100644 index e5d86e8..0000000 --- a/patches/kernel/0110-bitops-Revert-cbe96375025e-bitops-Add-clear-set_bit3.patch +++ /dev/null @@ -1,67 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Thu, 2 Nov 2017 13:30:03 +0100 -Subject: [PATCH] bitops: Revert cbe96375025e ("bitops: Add clear/set_bit32() - to linux/bitops.h") -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -These ops are not endian safe and may break on architectures which have -aligment requirements. - -Reverts: cbe96375025e ("bitops: Add clear/set_bit32() to linux/bitops.h") -Reported-by: Peter Zijlstra -Signed-off-by: Thomas Gleixner -Cc: Andi Kleen -(cherry picked from commit 1943dc07b45e347c52c1bfdd4a37e04a86e399aa) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit adb64d8c852206281ea6ee6590ae35076a219409) -Signed-off-by: Fabian Grünbichler ---- - include/linux/bitops.h | 26 -------------------------- - 1 file changed, 26 deletions(-) - -diff --git a/include/linux/bitops.h b/include/linux/bitops.h -index eb257a96db6d..a83c822c35c2 100644 ---- a/include/linux/bitops.h -+++ b/include/linux/bitops.h -@@ -226,32 +226,6 @@ static inline unsigned long __ffs64(u64 word) - return __ffs((unsigned long)word); - } - --/* -- * clear_bit32 - Clear a bit in memory for u32 array -- * @nr: Bit to clear -- * @addr: u32 * address of bitmap -- * -- * Same as clear_bit, but avoids needing casts for u32 arrays. 
-- */ -- --static __always_inline void clear_bit32(long nr, volatile u32 *addr) --{ -- clear_bit(nr, (volatile unsigned long *)addr); --} -- --/* -- * set_bit32 - Set a bit in memory for u32 array -- * @nr: Bit to clear -- * @addr: u32 * address of bitmap -- * -- * Same as set_bit, but avoids needing casts for u32 arrays. -- */ -- --static __always_inline void set_bit32(long nr, volatile u32 *addr) --{ -- set_bit(nr, (volatile unsigned long *)addr); --} -- - #ifdef __KERNEL__ - - #ifndef set_mask_bits --- -2.14.2 - diff --git a/patches/kernel/0110-x86-cpuid-Replace-set-clear_bit32.patch b/patches/kernel/0110-x86-cpuid-Replace-set-clear_bit32.patch new file mode 100644 index 0000000..7e56665 --- /dev/null +++ b/patches/kernel/0110-x86-cpuid-Replace-set-clear_bit32.patch @@ -0,0 +1,71 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Thu, 2 Nov 2017 13:22:35 +0100 +Subject: [PATCH] x86/cpuid: Replace set/clear_bit32() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Peter pointed out that the set/clear_bit32() variants are broken in various +aspects. + +Replace them with open coded set/clear_bit() and type cast +cpu_info::x86_capability as it's done in all other places throughout x86. 
+ +Fixes: 0b00de857a64 ("x86/cpuid: Add generic table for CPUID dependencies") +Reported-by: Peter Ziljstra +Signed-off-by: Thomas Gleixner +Cc: Andi Kleen +(cherry picked from commit 06dd688ddda5819025e014b79aea9af6ab475fa2) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 3e511952bc3ff9b233d418b0a75a8331deb08171) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/cpu/cpuid-deps.c | 26 +++++++++++--------------- + 1 file changed, 11 insertions(+), 15 deletions(-) + +diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c +index c21f22d836ad..904b0a3c4e53 100644 +--- a/arch/x86/kernel/cpu/cpuid-deps.c ++++ b/arch/x86/kernel/cpu/cpuid-deps.c +@@ -62,23 +62,19 @@ const static struct cpuid_dep cpuid_deps[] = { + {} + }; + +-static inline void __clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit) +-{ +- clear_bit32(bit, c->x86_capability); +-} +- +-static inline void __setup_clear_cpu_cap(unsigned int bit) +-{ +- clear_cpu_cap(&boot_cpu_data, bit); +- set_bit32(bit, cpu_caps_cleared); +-} +- + static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature) + { +- if (!c) +- __setup_clear_cpu_cap(feature); +- else +- __clear_cpu_cap(c, feature); ++ /* ++ * Note: This could use the non atomic __*_bit() variants, but the ++ * rest of the cpufeature code uses atomics as well, so keep it for ++ * consistency. Cleanup all of it separately. 
++ */ ++ if (!c) { ++ clear_cpu_cap(&boot_cpu_data, feature); ++ set_bit(feature, (unsigned long *)cpu_caps_cleared); ++ } else { ++ clear_bit(feature, (unsigned long *)c->x86_capability); ++ } + } + + /* Take the capabilities and the BUG bits into account */ +-- +2.14.2 + diff --git a/patches/kernel/0111-bitops-Revert-cbe96375025e-bitops-Add-clear-set_bit3.patch b/patches/kernel/0111-bitops-Revert-cbe96375025e-bitops-Add-clear-set_bit3.patch new file mode 100644 index 0000000..e5d86e8 --- /dev/null +++ b/patches/kernel/0111-bitops-Revert-cbe96375025e-bitops-Add-clear-set_bit3.patch @@ -0,0 +1,67 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Thu, 2 Nov 2017 13:30:03 +0100 +Subject: [PATCH] bitops: Revert cbe96375025e ("bitops: Add clear/set_bit32() + to linux/bitops.h") +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +These ops are not endian safe and may break on architectures which have +aligment requirements. 
+ +Reverts: cbe96375025e ("bitops: Add clear/set_bit32() to linux/bitops.h") +Reported-by: Peter Zijlstra +Signed-off-by: Thomas Gleixner +Cc: Andi Kleen +(cherry picked from commit 1943dc07b45e347c52c1bfdd4a37e04a86e399aa) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit adb64d8c852206281ea6ee6590ae35076a219409) +Signed-off-by: Fabian Grünbichler +--- + include/linux/bitops.h | 26 -------------------------- + 1 file changed, 26 deletions(-) + +diff --git a/include/linux/bitops.h b/include/linux/bitops.h +index eb257a96db6d..a83c822c35c2 100644 +--- a/include/linux/bitops.h ++++ b/include/linux/bitops.h +@@ -226,32 +226,6 @@ static inline unsigned long __ffs64(u64 word) + return __ffs((unsigned long)word); + } + +-/* +- * clear_bit32 - Clear a bit in memory for u32 array +- * @nr: Bit to clear +- * @addr: u32 * address of bitmap +- * +- * Same as clear_bit, but avoids needing casts for u32 arrays. +- */ +- +-static __always_inline void clear_bit32(long nr, volatile u32 *addr) +-{ +- clear_bit(nr, (volatile unsigned long *)addr); +-} +- +-/* +- * set_bit32 - Set a bit in memory for u32 array +- * @nr: Bit to clear +- * @addr: u32 * address of bitmap +- * +- * Same as set_bit, but avoids needing casts for u32 arrays. 
+- */ +- +-static __always_inline void set_bit32(long nr, volatile u32 *addr) +-{ +- set_bit(nr, (volatile unsigned long *)addr); +-} +- + #ifdef __KERNEL__ + + #ifndef set_mask_bits +-- +2.14.2 + diff --git a/patches/kernel/0111-x86-mm-Define-_PAGE_TABLE-using-_KERNPG_TABLE.patch b/patches/kernel/0111-x86-mm-Define-_PAGE_TABLE-using-_KERNPG_TABLE.patch deleted file mode 100644 index f92c75e..0000000 --- a/patches/kernel/0111-x86-mm-Define-_PAGE_TABLE-using-_KERNPG_TABLE.patch +++ /dev/null @@ -1,48 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Borislav Petkov -Date: Fri, 3 Nov 2017 11:20:28 +0100 -Subject: [PATCH] x86/mm: Define _PAGE_TABLE using _KERNPG_TABLE -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -... so that the difference is obvious. - -No functionality change. - -Signed-off-by: Borislav Petkov -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/20171103102028.20284-1-bp@alien8.de -Signed-off-by: Ingo Molnar -(backported from commit c7da092a1f243bfd1bfb4124f538e69e941882da) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 8c69b0c03cd24576ac69c36ede00afae76bab464) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/pgtable_types.h | 3 +-- - 1 file changed, 1 insertion(+), 2 deletions(-) - -diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h -index bf9638e1ee42..01f6dc938ccb 100644 ---- a/arch/x86/include/asm/pgtable_types.h -+++ b/arch/x86/include/asm/pgtable_types.h -@@ -121,10 +121,9 @@ - - #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) - --#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ -- _PAGE_ACCESSED | _PAGE_DIRTY) - #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ - _PAGE_DIRTY) -+#define _PAGE_TABLE (_KERNPG_TABLE | _PAGE_USER) - - /* - * Set of bits 
not changed in pte_modify. The pte's --- -2.14.2 - diff --git a/patches/kernel/0112-x86-cpufeatures-Re-tabulate-the-X86_FEATURE-definiti.patch b/patches/kernel/0112-x86-cpufeatures-Re-tabulate-the-X86_FEATURE-definiti.patch deleted file mode 100644 index 3883aa5..0000000 --- a/patches/kernel/0112-x86-cpufeatures-Re-tabulate-the-X86_FEATURE-definiti.patch +++ /dev/null @@ -1,623 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Ingo Molnar -Date: Tue, 31 Oct 2017 13:17:22 +0100 -Subject: [PATCH] x86/cpufeatures: Re-tabulate the X86_FEATURE definitions -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Over the years asm/cpufeatures.h has become somewhat of a mess: the original -tabulation style was too narrow, while x86 feature names also kept growing -in length, creating frequent field width overflows. - -Re-tabulate it to make it wider and easier to read/modify. Also harmonize -the tabulation of the other defines in this file to match it. 
- -Cc: Andrew Morton -Cc: Andy Lutomirski -Cc: Andy Lutomirski -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Denys Vlasenko -Cc: Josh Poimboeuf -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/20171031121723.28524-3-mingo@kernel.org -Signed-off-by: Ingo Molnar -(backported from commit acbc845ffefd9fb70466182cd8555a26189462b2) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit df7c6e7b62274889a028357a579acfb2215c3f98) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/cpufeatures.h | 506 +++++++++++++++++++------------------ - 1 file changed, 254 insertions(+), 252 deletions(-) - -diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h -index c465bd6613ed..a021b0756af6 100644 ---- a/arch/x86/include/asm/cpufeatures.h -+++ b/arch/x86/include/asm/cpufeatures.h -@@ -12,8 +12,8 @@ - /* - * Defines x86 CPU feature bits - */ --#define NCAPINTS 18 /* N 32-bit words worth of info */ --#define NBUGINTS 1 /* N 32-bit bug flags */ -+#define NCAPINTS 18 /* N 32-bit words worth of info */ -+#define NBUGINTS 1 /* N 32-bit bug flags */ - - /* - * Note: If the comment begins with a quoted string, that string is used -@@ -27,163 +27,163 @@ - */ - - /* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */ --#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ --#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ --#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */ --#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */ --#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */ --#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */ --#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */ --#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */ --#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */ --#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */ --#define 
X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */ --#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */ --#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */ --#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */ --#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions */ -+#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ -+#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ -+#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */ -+#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */ -+#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */ -+#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */ -+#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */ -+#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */ -+#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */ -+#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */ -+#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */ -+#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */ -+#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */ -+#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */ -+#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions */ - /* (plus FCMOVcc, FCOMI with FPU) */ --#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */ --#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */ --#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */ --#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */ --#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */ --#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */ --#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */ --#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ --#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */ --#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */ --#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */ 
--#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */ --#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */ --#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */ --#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */ -+#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */ -+#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */ -+#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */ -+#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */ -+#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */ -+#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */ -+#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */ -+#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ -+#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */ -+#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */ -+#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */ -+#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */ -+#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */ -+#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */ -+#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */ - - /* AMD-defined CPU features, CPUID level 0x80000001, word 1 */ - /* Don't duplicate feature flags which are redundant with Intel! */ --#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */ --#define X86_FEATURE_MP ( 1*32+19) /* MP Capable. */ --#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */ --#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */ --#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */ --#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */ --#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */ --#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64) */ --#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow! extensions */ --#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow! 
*/ -+#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */ -+#define X86_FEATURE_MP ( 1*32+19) /* MP Capable. */ -+#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */ -+#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */ -+#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */ -+#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */ -+#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */ -+#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64) */ -+#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow! extensions */ -+#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow! */ - - /* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */ --#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */ --#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */ --#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */ -+#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */ -+#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */ -+#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */ - - /* Other features, Linux-defined mapping, word 3 */ - /* This range is used for feature bits which conflict or are synthesized */ --#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */ --#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ --#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ --#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ -+#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */ -+#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ -+#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ -+#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ - /* cpu types for specific tunings: */ --#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ --#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */ --#define X86_FEATURE_P3 ( 3*32+ 6) /* 
"" P3 */ --#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ --#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ --#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */ --#define X86_FEATURE_ART ( 3*32+10) /* Platform has always running timer (ART) */ --#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ --#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ --#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ --#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */ --#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */ --#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */ --#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */ --#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */ --#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ --#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ --#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ --#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */ --#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ --#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ --#define X86_FEATURE_CPUID ( 3*32+25) /* CPU has CPUID instruction itself */ --#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */ --#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */ --#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ --#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ --#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */ -+#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ -+#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */ -+#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ -+#define X86_FEATURE_P4 ( 
3*32+ 7) /* "" P4 */ -+#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ -+#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */ -+#define X86_FEATURE_ART ( 3*32+10) /* Platform has always running timer (ART) */ -+#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ -+#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ -+#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ -+#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */ -+#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */ -+#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */ -+#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */ -+#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */ -+#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ -+#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ -+#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ -+#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */ -+#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ -+#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ -+#define X86_FEATURE_CPUID ( 3*32+25) /* CPU has CPUID instruction itself */ -+#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */ -+#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */ -+#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ -+#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ -+#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */ - - /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ --#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ --#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */ --#define 
X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */ --#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" Monitor/Mwait support */ --#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */ --#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */ --#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer mode */ --#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */ --#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */ --#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */ --#define X86_FEATURE_CID ( 4*32+10) /* Context ID */ --#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */ --#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */ --#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */ --#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */ --#define X86_FEATURE_PDCM ( 4*32+15) /* Performance Capabilities */ --#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */ --#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */ --#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */ --#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */ --#define X86_FEATURE_X2APIC ( 4*32+21) /* x2APIC */ --#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */ --#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */ -+#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ -+#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */ -+#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */ -+#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" Monitor/Mwait support */ -+#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL Qual. 
Debug Store */ -+#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */ -+#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer mode */ -+#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */ -+#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */ -+#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */ -+#define X86_FEATURE_CID ( 4*32+10) /* Context ID */ -+#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */ -+#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */ -+#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */ -+#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */ -+#define X86_FEATURE_PDCM ( 4*32+15) /* Performance Capabilities */ -+#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */ -+#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */ -+#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */ -+#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */ -+#define X86_FEATURE_X2APIC ( 4*32+21) /* x2APIC */ -+#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */ -+#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */ - #define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* Tsc deadline timer */ --#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */ --#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ --#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE enabled in the OS */ --#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */ --#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit fp conversions */ --#define X86_FEATURE_RDRAND ( 4*32+30) /* The RDRAND instruction */ --#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */ -+#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */ -+#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ -+#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE enabled in the OS */ -+#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */ -+#define X86_FEATURE_F16C 
( 4*32+29) /* 16-bit fp conversions */ -+#define X86_FEATURE_RDRAND ( 4*32+30) /* The RDRAND instruction */ -+#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */ - - /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ --#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */ --#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */ --#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */ --#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */ --#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */ --#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */ --#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */ --#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */ --#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */ --#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */ -+#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */ -+#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */ -+#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */ -+#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */ -+#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */ -+#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */ -+#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */ -+#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */ -+#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */ -+#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */ - - /* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */ --#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */ --#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */ --#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure virtual machine */ --#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */ 
--#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */ --#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */ --#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */ --#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */ --#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */ --#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */ --#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */ --#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */ --#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */ --#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */ --#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */ --#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */ --#define X86_FEATURE_TCE ( 6*32+17) /* translation cache extension */ --#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */ --#define X86_FEATURE_TBM ( 6*32+21) /* trailing bit manipulations */ --#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */ --#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */ --#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ --#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ --#define X86_FEATURE_PTSC ( 6*32+27) /* performance time-stamp counter */ --#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */ --#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ -+#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */ -+#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */ -+#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure virtual machine */ -+#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */ -+#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */ -+#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */ 
-+#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */ -+#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */ -+#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */ -+#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */ -+#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */ -+#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */ -+#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */ -+#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */ -+#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */ -+#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */ -+#define X86_FEATURE_TCE ( 6*32+17) /* translation cache extension */ -+#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */ -+#define X86_FEATURE_TBM ( 6*32+21) /* trailing bit manipulations */ -+#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */ -+#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */ -+#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ -+#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ -+#define X86_FEATURE_PTSC ( 6*32+27) /* performance time-stamp counter */ -+#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* Last Level Cache performance counter extensions */ -+#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ - - /* - * Auxiliary flags: Linux defined - For features scattered in various -@@ -191,150 +191,152 @@ - * - * Reuse free bits when adding new feature flags! 
- */ --#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT */ --#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */ --#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ --#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ --#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */ --#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */ --#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */ -+#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT */ -+#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */ -+#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ -+#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ -+#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */ -+#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */ -+#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */ - --#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ --#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ -+#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ -+#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ -+#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ - --#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ --#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ --#define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ --#define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ -+#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ -+#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ -+#define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ -+#define 
X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ - --#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ -+#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ - - /* Virtualization flags: Linux defined, word 8 */ --#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ --#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ --#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */ --#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */ --#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */ -+#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ -+#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ -+#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */ -+#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */ -+#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */ - --#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */ --#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ -+#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */ -+#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ - - - /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ --#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ --#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */ --#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ --#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */ --#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ --#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */ --#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */ --#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */ --#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context 
ID */ --#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ --#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ --#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ --#define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */ --#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ --#define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */ --#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ --#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ --#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ --#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */ --#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ --#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ --#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ --#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ --#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ --#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */ --#define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */ --#define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */ -+#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ -+#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */ -+#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ -+#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */ -+#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ -+#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */ -+#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */ -+#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */ 
-+#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ -+#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ -+#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ -+#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ -+#define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */ -+#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ -+#define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */ -+#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ -+#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ -+#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ -+#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */ -+#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ -+#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ -+#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ -+#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ -+#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ -+#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */ -+#define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */ -+#define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */ - - /* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */ --#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */ --#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */ --#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */ --#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */ -+#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */ -+#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */ -+#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */ -+#define X86_FEATURE_XSAVES 
(10*32+ 3) /* XSAVES/XRSTORS */ - - /* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */ --#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */ -+#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */ - - /* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */ --#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */ --#define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */ --#define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */ -+#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */ -+#define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */ -+#define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */ - - /* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ --#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ --#define X86_FEATURE_IRPERF (13*32+1) /* Instructions Retired Count */ -+#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ -+#define X86_FEATURE_IRPERF (13*32+1) /* Instructions Retired Count */ - - /* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */ --#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ --#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */ --#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */ --#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */ --#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */ --#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */ --#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */ --#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */ --#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. 
Preference */ --#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ -+#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ -+#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */ -+#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */ -+#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */ -+#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */ -+#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */ -+#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */ -+#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */ -+#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */ -+#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ - - /* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */ --#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ --#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */ --#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */ --#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */ --#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */ --#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */ --#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */ --#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */ --#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ --#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ --#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ --#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ -+#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ -+#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */ -+#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */ 
-+#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */ -+#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */ -+#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */ -+#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */ -+#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */ -+#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ -+#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ -+#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ -+#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ -+#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ - - /* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */ --#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ --#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ --#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ --#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ --#define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */ --#define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */ --#define X86_FEATURE_VPCLMULQDQ (16*32+ 10) /* Carry-Less Multiplication Double Quadword */ --#define X86_FEATURE_AVX512_VNNI (16*32+ 11) /* Vector Neural Network Instructions */ --#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB */ --#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ --#define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ --#define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */ -+#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ -+#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ -+#define X86_FEATURE_OSPKE (16*32+ 4) /* OS 
Protection Keys Enable */ -+#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ -+#define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */ -+#define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */ -+#define X86_FEATURE_VPCLMULQDQ (16*32+ 10) /* Carry-Less Multiplication Double Quadword */ -+#define X86_FEATURE_AVX512_VNNI (16*32+ 11) /* Vector Neural Network Instructions */ -+#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB */ -+#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ -+#define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ -+#define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */ - - /* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */ --#define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */ --#define X86_FEATURE_SUCCOR (17*32+1) /* Uncorrectable error containment and recovery */ --#define X86_FEATURE_SMCA (17*32+3) /* Scalable MCA */ -+#define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */ -+#define X86_FEATURE_SUCCOR (17*32+1) /* Uncorrectable error containment and recovery */ -+#define X86_FEATURE_SMCA (17*32+3) /* Scalable MCA */ - - /* - * BUG word(s) - */ --#define X86_BUG(x) (NCAPINTS*32 + (x)) -+#define X86_BUG(x) (NCAPINTS*32 + (x)) - --#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */ --#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */ --#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */ --#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */ --#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */ --#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */ --#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ --#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ --#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up 
SS attrs */ -+#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */ -+#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */ -+#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */ -+#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */ -+#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */ -+#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */ -+#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ -+#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ -+#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ - #ifdef CONFIG_X86_32 - /* - * 64-bit kernels don't use X86_BUG_ESPFIX. Make the define conditional - * to avoid confusion. - */ --#define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */ -+#define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */ - #endif --#define X86_BUG_NULL_SEG X86_BUG(10) /* Nulling a selector preserves the base */ --#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ --#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ --#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ -+#define X86_BUG_NULL_SEG X86_BUG(10) /* Nulling a selector preserves the base */ -+#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ -+#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ -+#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ - #endif /* _ASM_X86_CPUFEATURES_H */ --- -2.14.2 - diff --git a/patches/kernel/0112-x86-mm-Define-_PAGE_TABLE-using-_KERNPG_TABLE.patch b/patches/kernel/0112-x86-mm-Define-_PAGE_TABLE-using-_KERNPG_TABLE.patch new file mode 100644 index 0000000..f92c75e --- /dev/null +++ b/patches/kernel/0112-x86-mm-Define-_PAGE_TABLE-using-_KERNPG_TABLE.patch @@ -0,0 +1,48 @@ +From 
0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Borislav Petkov +Date: Fri, 3 Nov 2017 11:20:28 +0100 +Subject: [PATCH] x86/mm: Define _PAGE_TABLE using _KERNPG_TABLE +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +... so that the difference is obvious. + +No functionality change. + +Signed-off-by: Borislav Petkov +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/20171103102028.20284-1-bp@alien8.de +Signed-off-by: Ingo Molnar +(backported from commit c7da092a1f243bfd1bfb4124f538e69e941882da) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 8c69b0c03cd24576ac69c36ede00afae76bab464) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/pgtable_types.h | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h +index bf9638e1ee42..01f6dc938ccb 100644 +--- a/arch/x86/include/asm/pgtable_types.h ++++ b/arch/x86/include/asm/pgtable_types.h +@@ -121,10 +121,9 @@ + + #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) + +-#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ +- _PAGE_ACCESSED | _PAGE_DIRTY) + #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ + _PAGE_DIRTY) ++#define _PAGE_TABLE (_KERNPG_TABLE | _PAGE_USER) + + /* + * Set of bits not changed in pte_modify. 
The pte's +-- +2.14.2 + diff --git a/patches/kernel/0113-x86-cpufeatures-Fix-various-details-in-the-feature-d.patch b/patches/kernel/0113-x86-cpufeatures-Fix-various-details-in-the-feature-d.patch deleted file mode 100644 index 0b12f37..0000000 --- a/patches/kernel/0113-x86-cpufeatures-Fix-various-details-in-the-feature-d.patch +++ /dev/null @@ -1,369 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Ingo Molnar -Date: Tue, 31 Oct 2017 13:17:23 +0100 -Subject: [PATCH] x86/cpufeatures: Fix various details in the feature - definitions -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Kept this commit separate from the re-tabulation changes, to make -the changes easier to review: - - - add better explanation for entries with no explanation - - fix/enhance the text of some of the entries - - fix the vertical alignment of some of the feature number definitions - - fix inconsistent capitalization - - ... and lots of other small details - -i.e. make it all more of a coherent unit, instead of a patchwork of years of additions. 
- -Cc: Andrew Morton -Cc: Andy Lutomirski -Cc: Andy Lutomirski -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Denys Vlasenko -Cc: Josh Poimboeuf -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/20171031121723.28524-4-mingo@kernel.org -Signed-off-by: Ingo Molnar -(backported from commit f3a624e901c633593156f7b00ca743a6204a29bc) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 256c600cf0edb23ea5f2d70e7da091c909f5ace6) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/cpufeatures.h | 149 ++++++++++++++++++------------------- - 1 file changed, 74 insertions(+), 75 deletions(-) - -diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h -index a021b0756af6..6db782ed9cdb 100644 ---- a/arch/x86/include/asm/cpufeatures.h -+++ b/arch/x86/include/asm/cpufeatures.h -@@ -19,14 +19,12 @@ - * Note: If the comment begins with a quoted string, that string is used - * in /proc/cpuinfo instead of the macro name. If the string is "", - * this feature bit is not displayed in /proc/cpuinfo at all. -- */ -- --/* -+ * - * When adding new features here that depend on other features, -- * please update the table in kernel/cpu/cpuid-deps.c -+ * please update the table in kernel/cpu/cpuid-deps.c as well. 
- */ - --/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */ -+/* Intel-defined CPU features, CPUID level 0x00000001 (EDX), word 0 */ - #define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ - #define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ - #define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */ -@@ -41,8 +39,7 @@ - #define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */ - #define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */ - #define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */ --#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions */ -- /* (plus FCMOVcc, FCOMI with FPU) */ -+#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions (plus FCMOVcc, FCOMI with FPU) */ - #define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */ - #define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */ - #define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */ -@@ -62,15 +59,15 @@ - /* AMD-defined CPU features, CPUID level 0x80000001, word 1 */ - /* Don't duplicate feature flags which are redundant with Intel! */ - #define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */ --#define X86_FEATURE_MP ( 1*32+19) /* MP Capable. */ -+#define X86_FEATURE_MP ( 1*32+19) /* MP Capable */ - #define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */ - #define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */ - #define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */ - #define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */ - #define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */ --#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64) */ --#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow! extensions */ --#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow! 
*/ -+#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64, 64-bit support) */ -+#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow extensions */ -+#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow */ - - /* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */ - #define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */ -@@ -83,66 +80,67 @@ - #define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ - #define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ - #define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ --/* cpu types for specific tunings: */ -+ -+/* CPU types for specific tunings: */ - #define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ - #define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */ - #define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ - #define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ - #define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ --#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */ --#define X86_FEATURE_ART ( 3*32+10) /* Platform has always running timer (ART) */ -+#define X86_FEATURE_UP ( 3*32+ 9) /* SMP kernel running on UP */ -+#define X86_FEATURE_ART ( 3*32+10) /* Always running timer (ART) */ - #define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ - #define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ - #define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ --#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */ --#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */ --#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */ --#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */ --#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */ -+#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */ -+#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 
userspace */ -+#define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ -+#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" MFENCE synchronizes RDTSC */ -+#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */ - #define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ - #define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ - #define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ --#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */ -+#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* CPU topology enum extensions */ - #define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ - #define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ - #define X86_FEATURE_CPUID ( 3*32+25) /* CPU has CPUID instruction itself */ --#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */ --#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */ --#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ -+#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* Extended APICID (8 bits) */ -+#define X86_FEATURE_AMD_DCM ( 3*32+27) /* AMD multi-node processor */ -+#define X86_FEATURE_APERFMPERF ( 3*32+28) /* P-State hardware coordination feedback capability (APERF/MPERF MSRs) */ - #define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ - #define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */ - --/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ -+/* Intel-defined CPU features, CPUID level 0x00000001 (ECX), word 4 */ - #define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ - #define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */ - #define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */ --#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" Monitor/Mwait support */ --#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL Qual. 
Debug Store */ -+#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" MONITOR/MWAIT support */ -+#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL-qualified (filtered) Debug Store */ - #define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */ --#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer mode */ -+#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer Mode eXtensions */ - #define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */ - #define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */ - #define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */ - #define X86_FEATURE_CID ( 4*32+10) /* Context ID */ - #define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */ - #define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */ --#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */ -+#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B instruction */ - #define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */ --#define X86_FEATURE_PDCM ( 4*32+15) /* Performance Capabilities */ -+#define X86_FEATURE_PDCM ( 4*32+15) /* Perf/Debug Capabilities MSR */ - #define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */ - #define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */ - #define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */ - #define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */ --#define X86_FEATURE_X2APIC ( 4*32+21) /* x2APIC */ -+#define X86_FEATURE_X2APIC ( 4*32+21) /* X2APIC */ - #define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */ - #define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */ --#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* Tsc deadline timer */ -+#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* TSC deadline timer */ - #define X86_FEATURE_AES ( 4*32+25) /* AES instructions */ --#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ --#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE enabled in the OS */ -+#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV 
instructions */ -+#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE instruction enabled in the OS */ - #define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */ --#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit fp conversions */ --#define X86_FEATURE_RDRAND ( 4*32+30) /* The RDRAND instruction */ -+#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit FP conversions */ -+#define X86_FEATURE_RDRAND ( 4*32+30) /* RDRAND instruction */ - #define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */ - - /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ -@@ -157,10 +155,10 @@ - #define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */ - #define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */ - --/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */ -+/* More extended AMD flags: CPUID level 0x80000001, ECX, word 6 */ - #define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */ - #define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */ --#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure virtual machine */ -+#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure Virtual Machine */ - #define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */ - #define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */ - #define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */ -@@ -174,16 +172,16 @@ - #define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */ - #define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */ - #define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */ --#define X86_FEATURE_TCE ( 6*32+17) /* translation cache extension */ -+#define X86_FEATURE_TCE ( 6*32+17) /* Translation Cache Extension */ - #define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */ --#define X86_FEATURE_TBM ( 6*32+21) /* trailing bit manipulations */ --#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */ --#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* 
core performance counter extensions */ -+#define X86_FEATURE_TBM ( 6*32+21) /* Trailing Bit Manipulations */ -+#define X86_FEATURE_TOPOEXT ( 6*32+22) /* Topology extensions CPUID leafs */ -+#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* Core performance counter extensions */ - #define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ --#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ --#define X86_FEATURE_PTSC ( 6*32+27) /* performance time-stamp counter */ -+#define X86_FEATURE_BPEXT ( 6*32+26) /* Data breakpoint extension */ -+#define X86_FEATURE_PTSC ( 6*32+27) /* Performance time-stamp counter */ - #define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* Last Level Cache performance counter extensions */ --#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ -+#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX instructions) */ - - /* - * Auxiliary flags: Linux defined - For features scattered in various -@@ -191,7 +189,7 @@ - * - * Reuse free bits when adding new feature flags! 
- */ --#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT */ -+#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT instructions */ - #define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */ - #define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ - #define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ -@@ -205,8 +203,8 @@ - - #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ - #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ --#define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ --#define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ -+#define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */ -+#define X86_FEATURE_AVX512_4FMAPS ( 7*32+17) /* AVX-512 Multiply Accumulation Single precision */ - - #define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ - -@@ -217,19 +215,19 @@ - #define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */ - #define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */ - --#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */ -+#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer VMMCALL to VMCALL */ - #define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ - - --/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ --#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ --#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */ -+/* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ -+#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ -+#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3B */ - #define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ - #define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock 
Elision */ - #define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ - #define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */ - #define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */ --#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */ -+#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB instructions */ - #define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ - #define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ - #define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ -@@ -237,8 +235,8 @@ - #define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */ - #define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ - #define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */ --#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ --#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ -+#define X86_FEATURE_RDSEED ( 9*32+18) /* RDSEED instruction */ -+#define X86_FEATURE_ADX ( 9*32+19) /* ADCX and ADOX instructions */ - #define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ - #define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */ - #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ -@@ -250,25 +248,25 @@ - #define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */ - #define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */ - --/* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */ --#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */ --#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */ --#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */ --#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */ -+/* Extended state features, CPUID level 0x0000000d:1 (EAX), word 10 */ 
-+#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT instruction */ -+#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC instruction */ -+#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 instruction */ -+#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS instructions */ - --/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */ -+/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (EDX), word 11 */ - #define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */ - --/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */ --#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */ -+/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (EDX), word 12 */ -+#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring */ - #define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */ - #define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */ - --/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ --#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ --#define X86_FEATURE_IRPERF (13*32+1) /* Instructions Retired Count */ -+/* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ -+#define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ -+#define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */ - --/* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */ -+/* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ - #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ - #define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */ - #define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */ -@@ -280,7 +278,7 @@ - #define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. 
Preference */ - #define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ - --/* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */ -+/* AMD SVM Feature Identification, CPUID level 0x8000000a (EDX), word 15 */ - #define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ - #define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */ - #define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */ -@@ -295,24 +293,24 @@ - #define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ - #define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ - --/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */ -+/* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */ - #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ - #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ - #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ - #define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ - #define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */ - #define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */ --#define X86_FEATURE_VPCLMULQDQ (16*32+ 10) /* Carry-Less Multiplication Double Quadword */ --#define X86_FEATURE_AVX512_VNNI (16*32+ 11) /* Vector Neural Network Instructions */ --#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB */ -+#define X86_FEATURE_VPCLMULQDQ (16*32+10) /* Carry-Less Multiplication Double Quadword */ -+#define X86_FEATURE_AVX512_VNNI (16*32+11) /* Vector Neural Network Instructions */ -+#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */ - #define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ - #define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ - #define X86_FEATURE_RDPID (16*32+22) /* 
RDPID instruction */ - --/* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */ --#define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */ --#define X86_FEATURE_SUCCOR (17*32+1) /* Uncorrectable error containment and recovery */ --#define X86_FEATURE_SMCA (17*32+3) /* Scalable MCA */ -+/* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */ -+#define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* MCA overflow recovery support */ -+#define X86_FEATURE_SUCCOR (17*32+ 1) /* Uncorrectable error containment and recovery */ -+#define X86_FEATURE_SMCA (17*32+ 3) /* Scalable MCA */ - - /* - * BUG word(s) -@@ -339,4 +337,5 @@ - #define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ - #define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ - #define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ -+ - #endif /* _ASM_X86_CPUFEATURES_H */ --- -2.14.2 - diff --git a/patches/kernel/0113-x86-cpufeatures-Re-tabulate-the-X86_FEATURE-definiti.patch b/patches/kernel/0113-x86-cpufeatures-Re-tabulate-the-X86_FEATURE-definiti.patch new file mode 100644 index 0000000..3883aa5 --- /dev/null +++ b/patches/kernel/0113-x86-cpufeatures-Re-tabulate-the-X86_FEATURE-definiti.patch @@ -0,0 +1,623 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Ingo Molnar +Date: Tue, 31 Oct 2017 13:17:22 +0100 +Subject: [PATCH] x86/cpufeatures: Re-tabulate the X86_FEATURE definitions +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Over the years asm/cpufeatures.h has become somewhat of a mess: the original +tabulation style was too narrow, while x86 feature names also kept growing +in length, creating frequent field width overflows. + +Re-tabulate it to make it wider and easier to read/modify. Also harmonize +the tabulation of the other defines in this file to match it. 
+ +Cc: Andrew Morton +Cc: Andy Lutomirski +Cc: Andy Lutomirski +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Denys Vlasenko +Cc: Josh Poimboeuf +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/20171031121723.28524-3-mingo@kernel.org +Signed-off-by: Ingo Molnar +(backported from commit acbc845ffefd9fb70466182cd8555a26189462b2) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit df7c6e7b62274889a028357a579acfb2215c3f98) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/cpufeatures.h | 506 +++++++++++++++++++------------------ + 1 file changed, 254 insertions(+), 252 deletions(-) + +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index c465bd6613ed..a021b0756af6 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -12,8 +12,8 @@ + /* + * Defines x86 CPU feature bits + */ +-#define NCAPINTS 18 /* N 32-bit words worth of info */ +-#define NBUGINTS 1 /* N 32-bit bug flags */ ++#define NCAPINTS 18 /* N 32-bit words worth of info */ ++#define NBUGINTS 1 /* N 32-bit bug flags */ + + /* + * Note: If the comment begins with a quoted string, that string is used +@@ -27,163 +27,163 @@ + */ + + /* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */ +-#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ +-#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ +-#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */ +-#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */ +-#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */ +-#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */ +-#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */ +-#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */ +-#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */ +-#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */ +-#define 
X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */ +-#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */ +-#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */ +-#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */ +-#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions */ ++#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ ++#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ ++#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */ ++#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */ ++#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */ ++#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */ ++#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */ ++#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */ ++#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */ ++#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */ ++#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */ ++#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */ ++#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */ ++#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */ ++#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions */ + /* (plus FCMOVcc, FCOMI with FPU) */ +-#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */ +-#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */ +-#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */ +-#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */ +-#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */ +-#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */ +-#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */ +-#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ +-#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */ +-#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */ +-#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */ 
+-#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */ +-#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */ +-#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */ +-#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */ ++#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */ ++#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */ ++#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */ ++#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */ ++#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */ ++#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */ ++#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */ ++#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ ++#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */ ++#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */ ++#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */ ++#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */ ++#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */ ++#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */ ++#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */ + + /* AMD-defined CPU features, CPUID level 0x80000001, word 1 */ + /* Don't duplicate feature flags which are redundant with Intel! */ +-#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */ +-#define X86_FEATURE_MP ( 1*32+19) /* MP Capable. */ +-#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */ +-#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */ +-#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */ +-#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */ +-#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */ +-#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64) */ +-#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow! extensions */ +-#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow! 
*/ ++#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */ ++#define X86_FEATURE_MP ( 1*32+19) /* MP Capable. */ ++#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */ ++#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */ ++#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */ ++#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */ ++#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */ ++#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64) */ ++#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow! extensions */ ++#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow! */ + + /* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */ +-#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */ +-#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */ +-#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */ ++#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */ ++#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */ ++#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */ + + /* Other features, Linux-defined mapping, word 3 */ + /* This range is used for feature bits which conflict or are synthesized */ +-#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */ +-#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ +-#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ +-#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ ++#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */ ++#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ ++#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ ++#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ + /* cpu types for specific tunings: */ +-#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ +-#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */ +-#define X86_FEATURE_P3 ( 3*32+ 6) /* 
"" P3 */ +-#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ +-#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ +-#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */ +-#define X86_FEATURE_ART ( 3*32+10) /* Platform has always running timer (ART) */ +-#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ +-#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ +-#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ +-#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */ +-#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */ +-#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */ +-#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */ +-#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */ +-#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ +-#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ +-#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ +-#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */ +-#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ +-#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ +-#define X86_FEATURE_CPUID ( 3*32+25) /* CPU has CPUID instruction itself */ +-#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */ +-#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */ +-#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ +-#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ +-#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */ ++#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ ++#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */ ++#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ ++#define X86_FEATURE_P4 ( 
3*32+ 7) /* "" P4 */ ++#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ ++#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */ ++#define X86_FEATURE_ART ( 3*32+10) /* Platform has always running timer (ART) */ ++#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ ++#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ ++#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ ++#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */ ++#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */ ++#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */ ++#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */ ++#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */ ++#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ ++#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ ++#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ ++#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */ ++#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ ++#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ ++#define X86_FEATURE_CPUID ( 3*32+25) /* CPU has CPUID instruction itself */ ++#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */ ++#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */ ++#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ ++#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ ++#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */ + + /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ +-#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ +-#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */ +-#define 
X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */ +-#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" Monitor/Mwait support */ +-#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */ +-#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */ +-#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer mode */ +-#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */ +-#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */ +-#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */ +-#define X86_FEATURE_CID ( 4*32+10) /* Context ID */ +-#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */ +-#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */ +-#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */ +-#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */ +-#define X86_FEATURE_PDCM ( 4*32+15) /* Performance Capabilities */ +-#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */ +-#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */ +-#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */ +-#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */ +-#define X86_FEATURE_X2APIC ( 4*32+21) /* x2APIC */ +-#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */ +-#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */ ++#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ ++#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */ ++#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */ ++#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" Monitor/Mwait support */ ++#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL Qual. 
Debug Store */ ++#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */ ++#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer mode */ ++#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */ ++#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */ ++#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */ ++#define X86_FEATURE_CID ( 4*32+10) /* Context ID */ ++#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */ ++#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */ ++#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */ ++#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */ ++#define X86_FEATURE_PDCM ( 4*32+15) /* Performance Capabilities */ ++#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */ ++#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */ ++#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */ ++#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */ ++#define X86_FEATURE_X2APIC ( 4*32+21) /* x2APIC */ ++#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */ ++#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */ + #define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* Tsc deadline timer */ +-#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */ +-#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ +-#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE enabled in the OS */ +-#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */ +-#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit fp conversions */ +-#define X86_FEATURE_RDRAND ( 4*32+30) /* The RDRAND instruction */ +-#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */ ++#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */ ++#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ ++#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE enabled in the OS */ ++#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */ ++#define X86_FEATURE_F16C 
( 4*32+29) /* 16-bit fp conversions */ ++#define X86_FEATURE_RDRAND ( 4*32+30) /* The RDRAND instruction */ ++#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */ + + /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ +-#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */ +-#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */ +-#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */ +-#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */ +-#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */ +-#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */ +-#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */ +-#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */ +-#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */ +-#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */ ++#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */ ++#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */ ++#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */ ++#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */ ++#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */ ++#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */ ++#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */ ++#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */ ++#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */ ++#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */ + + /* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */ +-#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */ +-#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */ +-#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure virtual machine */ +-#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */ 
+-#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */ +-#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */ +-#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */ +-#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */ +-#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */ +-#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */ +-#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */ +-#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */ +-#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */ +-#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */ +-#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */ +-#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */ +-#define X86_FEATURE_TCE ( 6*32+17) /* translation cache extension */ +-#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */ +-#define X86_FEATURE_TBM ( 6*32+21) /* trailing bit manipulations */ +-#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */ +-#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */ +-#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ +-#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ +-#define X86_FEATURE_PTSC ( 6*32+27) /* performance time-stamp counter */ +-#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */ +-#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ ++#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */ ++#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */ ++#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure virtual machine */ ++#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */ ++#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */ ++#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */ 
++#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */ ++#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */ ++#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */ ++#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */ ++#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */ ++#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */ ++#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */ ++#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */ ++#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */ ++#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */ ++#define X86_FEATURE_TCE ( 6*32+17) /* translation cache extension */ ++#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */ ++#define X86_FEATURE_TBM ( 6*32+21) /* trailing bit manipulations */ ++#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */ ++#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */ ++#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ ++#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ ++#define X86_FEATURE_PTSC ( 6*32+27) /* performance time-stamp counter */ ++#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* Last Level Cache performance counter extensions */ ++#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ + + /* + * Auxiliary flags: Linux defined - For features scattered in various +@@ -191,150 +191,152 @@ + * + * Reuse free bits when adding new feature flags! 
+ */ +-#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT */ +-#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */ +-#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ +-#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ +-#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */ +-#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */ +-#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */ ++#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT */ ++#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */ ++#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ ++#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ ++#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */ ++#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */ ++#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */ + +-#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ +-#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ ++#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ ++#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ ++#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ + +-#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ +-#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ +-#define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ +-#define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ ++#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ ++#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ ++#define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ ++#define 
X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ + +-#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ ++#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ + + /* Virtualization flags: Linux defined, word 8 */ +-#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ +-#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ +-#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */ +-#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */ +-#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */ ++#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ ++#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ ++#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */ ++#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */ ++#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */ + +-#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */ +-#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ ++#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */ ++#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ + + + /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ +-#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ +-#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */ +-#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ +-#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */ +-#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ +-#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */ +-#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */ +-#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */ +-#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context 
ID */ +-#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ +-#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ +-#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ +-#define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */ +-#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ +-#define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */ +-#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ +-#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ +-#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ +-#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */ +-#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ +-#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ +-#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ +-#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ +-#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ +-#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */ +-#define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */ +-#define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */ ++#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ ++#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */ ++#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ ++#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */ ++#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ ++#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */ ++#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */ ++#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */ 
++#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ ++#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ ++#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ ++#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ ++#define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */ ++#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ ++#define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */ ++#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ ++#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ ++#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ ++#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */ ++#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ ++#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ ++#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ ++#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ ++#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ ++#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */ ++#define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */ ++#define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */ + + /* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */ +-#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */ +-#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */ +-#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */ +-#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */ ++#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */ ++#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */ ++#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */ ++#define X86_FEATURE_XSAVES 
(10*32+ 3) /* XSAVES/XRSTORS */ + + /* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */ +-#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */ ++#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */ + + /* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */ +-#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */ +-#define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */ +-#define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */ ++#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */ ++#define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */ ++#define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */ + + /* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ +-#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ +-#define X86_FEATURE_IRPERF (13*32+1) /* Instructions Retired Count */ ++#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ ++#define X86_FEATURE_IRPERF (13*32+1) /* Instructions Retired Count */ + + /* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */ +-#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ +-#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */ +-#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */ +-#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */ +-#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */ +-#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */ +-#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */ +-#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */ +-#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. 
Preference */ +-#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ ++#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ ++#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */ ++#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */ ++#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */ ++#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */ ++#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */ ++#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */ ++#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */ ++#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */ ++#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ + + /* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */ +-#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ +-#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */ +-#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */ +-#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */ +-#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */ +-#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */ +-#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */ +-#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */ +-#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ +-#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ +-#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ +-#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ ++#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ ++#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */ ++#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */ 
++#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */ ++#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */ ++#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */ ++#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */ ++#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */ ++#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ ++#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ ++#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ ++#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ ++#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ + + /* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */ +-#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ +-#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ +-#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ +-#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ +-#define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */ +-#define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */ +-#define X86_FEATURE_VPCLMULQDQ (16*32+ 10) /* Carry-Less Multiplication Double Quadword */ +-#define X86_FEATURE_AVX512_VNNI (16*32+ 11) /* Vector Neural Network Instructions */ +-#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB */ +-#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ +-#define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ +-#define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */ ++#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ ++#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ ++#define X86_FEATURE_OSPKE (16*32+ 4) /* OS 
Protection Keys Enable */ ++#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ ++#define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */ ++#define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */ ++#define X86_FEATURE_VPCLMULQDQ (16*32+ 10) /* Carry-Less Multiplication Double Quadword */ ++#define X86_FEATURE_AVX512_VNNI (16*32+ 11) /* Vector Neural Network Instructions */ ++#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB */ ++#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ ++#define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ ++#define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */ + + /* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */ +-#define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */ +-#define X86_FEATURE_SUCCOR (17*32+1) /* Uncorrectable error containment and recovery */ +-#define X86_FEATURE_SMCA (17*32+3) /* Scalable MCA */ ++#define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */ ++#define X86_FEATURE_SUCCOR (17*32+1) /* Uncorrectable error containment and recovery */ ++#define X86_FEATURE_SMCA (17*32+3) /* Scalable MCA */ + + /* + * BUG word(s) + */ +-#define X86_BUG(x) (NCAPINTS*32 + (x)) ++#define X86_BUG(x) (NCAPINTS*32 + (x)) + +-#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */ +-#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */ +-#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */ +-#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */ +-#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */ +-#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */ +-#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ +-#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ +-#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up 
SS attrs */ ++#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */ ++#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */ ++#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */ ++#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */ ++#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */ ++#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */ ++#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ ++#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ ++#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ + #ifdef CONFIG_X86_32 + /* + * 64-bit kernels don't use X86_BUG_ESPFIX. Make the define conditional + * to avoid confusion. + */ +-#define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */ ++#define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */ + #endif +-#define X86_BUG_NULL_SEG X86_BUG(10) /* Nulling a selector preserves the base */ +-#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ +-#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ +-#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ ++#define X86_BUG_NULL_SEG X86_BUG(10) /* Nulling a selector preserves the base */ ++#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ ++#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ ++#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ + #endif /* _ASM_X86_CPUFEATURES_H */ +-- +2.14.2 + diff --git a/patches/kernel/0114-selftests-x86-protection_keys-Fix-syscall-NR-redefin.patch b/patches/kernel/0114-selftests-x86-protection_keys-Fix-syscall-NR-redefin.patch deleted file mode 100644 index 810e63a..0000000 --- a/patches/kernel/0114-selftests-x86-protection_keys-Fix-syscall-NR-redefin.patch +++ /dev/null @@ -1,77 +0,0 @@ -From 
0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Sat, 4 Nov 2017 04:19:48 -0700 -Subject: [PATCH] selftests/x86/protection_keys: Fix syscall NR redefinition - warnings -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -On new enough glibc, the pkey syscalls numbers are available. Check -first before defining them to avoid warnings like: - -protection_keys.c:198:0: warning: "SYS_pkey_alloc" redefined - -Signed-off-by: Andy Lutomirski -Cc: Borislav Petkov -Cc: Dave Hansen -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Cc: stable@vger.kernel.org -Link: http://lkml.kernel.org/r/1fbef53a9e6befb7165ff855fc1a7d4788a191d6.1509794321.git.luto@kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit 693cb5580fdb026922363aa103add64b3ecd572e) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 64c8ec4beb84ca8b0ff3250a8b6044d06be6315b) -Signed-off-by: Fabian Grünbichler ---- - tools/testing/selftests/x86/protection_keys.c | 24 ++++++++++++++++++------ - 1 file changed, 18 insertions(+), 6 deletions(-) - -diff --git a/tools/testing/selftests/x86/protection_keys.c b/tools/testing/selftests/x86/protection_keys.c -index 3237bc010e1c..3c54d5c40952 100644 ---- a/tools/testing/selftests/x86/protection_keys.c -+++ b/tools/testing/selftests/x86/protection_keys.c -@@ -188,17 +188,29 @@ void lots_o_noops_around_write(int *write_to_me) - #define u64 uint64_t - - #ifdef __i386__ --#define SYS_mprotect_key 380 --#define SYS_pkey_alloc 381 --#define SYS_pkey_free 382 -+ -+#ifndef SYS_mprotect_key -+# define SYS_mprotect_key 380 -+#endif -+#ifndef SYS_pkey_alloc -+# define SYS_pkey_alloc 381 -+# define SYS_pkey_free 382 -+#endif - #define REG_IP_IDX REG_EIP - #define si_pkey_offset 0x14 -+ - #else --#define SYS_mprotect_key 329 --#define SYS_pkey_alloc 330 --#define SYS_pkey_free 331 -+ -+#ifndef SYS_mprotect_key 
-+# define SYS_mprotect_key 329 -+#endif -+#ifndef SYS_pkey_alloc -+# define SYS_pkey_alloc 330 -+# define SYS_pkey_free 331 -+#endif - #define REG_IP_IDX REG_RIP - #define si_pkey_offset 0x20 -+ - #endif - - void dump_mem(void *dumpme, int len_bytes) --- -2.14.2 - diff --git a/patches/kernel/0114-x86-cpufeatures-Fix-various-details-in-the-feature-d.patch b/patches/kernel/0114-x86-cpufeatures-Fix-various-details-in-the-feature-d.patch new file mode 100644 index 0000000..0b12f37 --- /dev/null +++ b/patches/kernel/0114-x86-cpufeatures-Fix-various-details-in-the-feature-d.patch @@ -0,0 +1,369 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Ingo Molnar +Date: Tue, 31 Oct 2017 13:17:23 +0100 +Subject: [PATCH] x86/cpufeatures: Fix various details in the feature + definitions +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Kept this commit separate from the re-tabulation changes, to make +the changes easier to review: + + - add better explanation for entries with no explanation + - fix/enhance the text of some of the entries + - fix the vertical alignment of some of the feature number definitions + - fix inconsistent capitalization + - ... and lots of other small details + +i.e. make it all more of a coherent unit, instead of a patchwork of years of additions. 
+ +Cc: Andrew Morton +Cc: Andy Lutomirski +Cc: Andy Lutomirski +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Denys Vlasenko +Cc: Josh Poimboeuf +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/20171031121723.28524-4-mingo@kernel.org +Signed-off-by: Ingo Molnar +(backported from commit f3a624e901c633593156f7b00ca743a6204a29bc) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 256c600cf0edb23ea5f2d70e7da091c909f5ace6) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/cpufeatures.h | 149 ++++++++++++++++++------------------- + 1 file changed, 74 insertions(+), 75 deletions(-) + +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index a021b0756af6..6db782ed9cdb 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -19,14 +19,12 @@ + * Note: If the comment begins with a quoted string, that string is used + * in /proc/cpuinfo instead of the macro name. If the string is "", + * this feature bit is not displayed in /proc/cpuinfo at all. +- */ +- +-/* ++ * + * When adding new features here that depend on other features, +- * please update the table in kernel/cpu/cpuid-deps.c ++ * please update the table in kernel/cpu/cpuid-deps.c as well. 
+ */ + +-/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */ ++/* Intel-defined CPU features, CPUID level 0x00000001 (EDX), word 0 */ + #define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ + #define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ + #define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */ +@@ -41,8 +39,7 @@ + #define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */ + #define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */ + #define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */ +-#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions */ +- /* (plus FCMOVcc, FCOMI with FPU) */ ++#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions (plus FCMOVcc, FCOMI with FPU) */ + #define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */ + #define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */ + #define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */ +@@ -62,15 +59,15 @@ + /* AMD-defined CPU features, CPUID level 0x80000001, word 1 */ + /* Don't duplicate feature flags which are redundant with Intel! */ + #define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */ +-#define X86_FEATURE_MP ( 1*32+19) /* MP Capable. */ ++#define X86_FEATURE_MP ( 1*32+19) /* MP Capable */ + #define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */ + #define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */ + #define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */ + #define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */ + #define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */ +-#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64) */ +-#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow! extensions */ +-#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow! 
*/ ++#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64, 64-bit support) */ ++#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow extensions */ ++#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow */ + + /* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */ + #define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */ +@@ -83,66 +80,67 @@ + #define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ + #define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ + #define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ +-/* cpu types for specific tunings: */ ++ ++/* CPU types for specific tunings: */ + #define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ + #define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */ + #define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ + #define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ + #define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ +-#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */ +-#define X86_FEATURE_ART ( 3*32+10) /* Platform has always running timer (ART) */ ++#define X86_FEATURE_UP ( 3*32+ 9) /* SMP kernel running on UP */ ++#define X86_FEATURE_ART ( 3*32+10) /* Always running timer (ART) */ + #define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ + #define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ + #define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ +-#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */ +-#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */ +-#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */ +-#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */ +-#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */ ++#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */ ++#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 
userspace */ ++#define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ ++#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" MFENCE synchronizes RDTSC */ ++#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */ + #define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ + #define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ + #define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ +-#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */ ++#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* CPU topology enum extensions */ + #define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ + #define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ + #define X86_FEATURE_CPUID ( 3*32+25) /* CPU has CPUID instruction itself */ +-#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */ +-#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */ +-#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ ++#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* Extended APICID (8 bits) */ ++#define X86_FEATURE_AMD_DCM ( 3*32+27) /* AMD multi-node processor */ ++#define X86_FEATURE_APERFMPERF ( 3*32+28) /* P-State hardware coordination feedback capability (APERF/MPERF MSRs) */ + #define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ + #define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */ + +-/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ ++/* Intel-defined CPU features, CPUID level 0x00000001 (ECX), word 4 */ + #define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ + #define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */ + #define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */ +-#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" Monitor/Mwait support */ +-#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL Qual. 
Debug Store */ ++#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" MONITOR/MWAIT support */ ++#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL-qualified (filtered) Debug Store */ + #define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */ +-#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer mode */ ++#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer Mode eXtensions */ + #define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */ + #define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */ + #define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */ + #define X86_FEATURE_CID ( 4*32+10) /* Context ID */ + #define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */ + #define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */ +-#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */ ++#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B instruction */ + #define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */ +-#define X86_FEATURE_PDCM ( 4*32+15) /* Performance Capabilities */ ++#define X86_FEATURE_PDCM ( 4*32+15) /* Perf/Debug Capabilities MSR */ + #define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */ + #define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */ + #define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */ + #define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */ +-#define X86_FEATURE_X2APIC ( 4*32+21) /* x2APIC */ ++#define X86_FEATURE_X2APIC ( 4*32+21) /* X2APIC */ + #define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */ + #define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */ +-#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* Tsc deadline timer */ ++#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* TSC deadline timer */ + #define X86_FEATURE_AES ( 4*32+25) /* AES instructions */ +-#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ +-#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE enabled in the OS */ ++#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV 
instructions */ ++#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE instruction enabled in the OS */ + #define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */ +-#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit fp conversions */ +-#define X86_FEATURE_RDRAND ( 4*32+30) /* The RDRAND instruction */ ++#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit FP conversions */ ++#define X86_FEATURE_RDRAND ( 4*32+30) /* RDRAND instruction */ + #define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */ + + /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ +@@ -157,10 +155,10 @@ + #define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */ + #define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */ + +-/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */ ++/* More extended AMD flags: CPUID level 0x80000001, ECX, word 6 */ + #define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */ + #define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */ +-#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure virtual machine */ ++#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure Virtual Machine */ + #define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */ + #define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */ + #define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */ +@@ -174,16 +172,16 @@ + #define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */ + #define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */ + #define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */ +-#define X86_FEATURE_TCE ( 6*32+17) /* translation cache extension */ ++#define X86_FEATURE_TCE ( 6*32+17) /* Translation Cache Extension */ + #define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */ +-#define X86_FEATURE_TBM ( 6*32+21) /* trailing bit manipulations */ +-#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */ +-#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* 
core performance counter extensions */ ++#define X86_FEATURE_TBM ( 6*32+21) /* Trailing Bit Manipulations */ ++#define X86_FEATURE_TOPOEXT ( 6*32+22) /* Topology extensions CPUID leafs */ ++#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* Core performance counter extensions */ + #define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ +-#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ +-#define X86_FEATURE_PTSC ( 6*32+27) /* performance time-stamp counter */ ++#define X86_FEATURE_BPEXT ( 6*32+26) /* Data breakpoint extension */ ++#define X86_FEATURE_PTSC ( 6*32+27) /* Performance time-stamp counter */ + #define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* Last Level Cache performance counter extensions */ +-#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ ++#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX instructions) */ + + /* + * Auxiliary flags: Linux defined - For features scattered in various +@@ -191,7 +189,7 @@ + * + * Reuse free bits when adding new feature flags! 
+ */ +-#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT */ ++#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT instructions */ + #define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */ + #define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ + #define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ +@@ -205,8 +203,8 @@ + + #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ + #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ +-#define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ +-#define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ ++#define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */ ++#define X86_FEATURE_AVX512_4FMAPS ( 7*32+17) /* AVX-512 Multiply Accumulation Single precision */ + + #define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ + +@@ -217,19 +215,19 @@ + #define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */ + #define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */ + +-#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */ ++#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer VMMCALL to VMCALL */ + #define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ + + +-/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ +-#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ +-#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */ ++/* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ ++#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ ++#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3B */ + #define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ + #define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock 
Elision */ + #define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ + #define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */ + #define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */ +-#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */ ++#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB instructions */ + #define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ + #define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ + #define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ +@@ -237,8 +235,8 @@ + #define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */ + #define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ + #define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */ +-#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ +-#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ ++#define X86_FEATURE_RDSEED ( 9*32+18) /* RDSEED instruction */ ++#define X86_FEATURE_ADX ( 9*32+19) /* ADCX and ADOX instructions */ + #define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ + #define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */ + #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ +@@ -250,25 +248,25 @@ + #define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */ + #define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */ + +-/* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */ +-#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */ +-#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */ +-#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */ +-#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */ ++/* Extended state features, CPUID level 0x0000000d:1 (EAX), word 10 */ 
++#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT instruction */ ++#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC instruction */ ++#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 instruction */ ++#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS instructions */ + +-/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */ ++/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (EDX), word 11 */ + #define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */ + +-/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */ +-#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */ ++/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (EDX), word 12 */ ++#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring */ + #define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */ + #define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */ + +-/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ +-#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ +-#define X86_FEATURE_IRPERF (13*32+1) /* Instructions Retired Count */ ++/* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ ++#define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ ++#define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */ + +-/* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */ ++/* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ + #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ + #define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */ + #define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */ +@@ -280,7 +278,7 @@ + #define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. 
Preference */ + #define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ + +-/* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */ ++/* AMD SVM Feature Identification, CPUID level 0x8000000a (EDX), word 15 */ + #define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ + #define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */ + #define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */ +@@ -295,24 +293,24 @@ + #define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ + #define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ + +-/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */ ++/* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */ + #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ + #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ + #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ + #define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ + #define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */ + #define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */ +-#define X86_FEATURE_VPCLMULQDQ (16*32+ 10) /* Carry-Less Multiplication Double Quadword */ +-#define X86_FEATURE_AVX512_VNNI (16*32+ 11) /* Vector Neural Network Instructions */ +-#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB */ ++#define X86_FEATURE_VPCLMULQDQ (16*32+10) /* Carry-Less Multiplication Double Quadword */ ++#define X86_FEATURE_AVX512_VNNI (16*32+11) /* Vector Neural Network Instructions */ ++#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */ + #define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ + #define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ + #define X86_FEATURE_RDPID (16*32+22) /* 
RDPID instruction */ + +-/* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */ +-#define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */ +-#define X86_FEATURE_SUCCOR (17*32+1) /* Uncorrectable error containment and recovery */ +-#define X86_FEATURE_SMCA (17*32+3) /* Scalable MCA */ ++/* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */ ++#define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* MCA overflow recovery support */ ++#define X86_FEATURE_SUCCOR (17*32+ 1) /* Uncorrectable error containment and recovery */ ++#define X86_FEATURE_SMCA (17*32+ 3) /* Scalable MCA */ + + /* + * BUG word(s) +@@ -339,4 +337,5 @@ + #define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ + #define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ + #define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ ++ + #endif /* _ASM_X86_CPUFEATURES_H */ +-- +2.14.2 + diff --git a/patches/kernel/0115-selftests-x86-ldt_gdt-Robustify-against-set_thread_a.patch b/patches/kernel/0115-selftests-x86-ldt_gdt-Robustify-against-set_thread_a.patch deleted file mode 100644 index f3cb8f9..0000000 --- a/patches/kernel/0115-selftests-x86-ldt_gdt-Robustify-against-set_thread_a.patch +++ /dev/null @@ -1,64 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Sat, 4 Nov 2017 04:19:49 -0700 -Subject: [PATCH] selftests/x86/ldt_gdt: Robustify against set_thread_area() - and LAR oddities -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Bits 19:16 of LAR's result are undefined, and some upcoming -improvements to the test case seem to trigger this. Mask off those -bits to avoid spurious failures. - -commit 5b781c7e317f ("x86/tls: Forcibly set the accessed bit in TLS -segments") adds a valid case in which LAR's output doesn't quite -agree with set_thread_area()'s input. 
This isn't triggered in the -test as is, but it will be if we start calling set_thread_area() -with the accessed bit clear. Work around this discrepency. - -I've added a Fixes tag so that -stable can pick this up if neccesary. - -Signed-off-by: Andy Lutomirski -Cc: Borislav Petkov -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Fixes: 5b781c7e317f ("x86/tls: Forcibly set the accessed bit in TLS segments") -Link: http://lkml.kernel.org/r/b82f3f89c034b53580970ac865139fd8863f44e2.1509794321.git.luto@kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit d60ad744c9741586010d4bea286f09a063a90fbd) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit d4c2ffcf3efe0d9610919fd48f5a1a5e38c28c07) -Signed-off-by: Fabian Grünbichler ---- - tools/testing/selftests/x86/ldt_gdt.c | 10 +++++++++- - 1 file changed, 9 insertions(+), 1 deletion(-) - -diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c -index b9a22f18566a..b2c54f4673f2 100644 ---- a/tools/testing/selftests/x86/ldt_gdt.c -+++ b/tools/testing/selftests/x86/ldt_gdt.c -@@ -114,7 +114,15 @@ static void check_valid_segment(uint16_t index, int ldt, - return; - } - -- if (ar != expected_ar) { -+ /* The SDM says "bits 19:16 are undefined". Thanks. */ -+ ar &= ~0xF0000; -+ -+ /* -+ * NB: Different Linux versions do different things with the -+ * accessed bit in set_thread_area(). -+ */ -+ if (ar != expected_ar && -+ (ldt || ar != (expected_ar | AR_ACCESSED))) { - printf("[FAIL]\t%s entry %hu has AR 0x%08X but expected 0x%08X\n", - (ldt ? 
"LDT" : "GDT"), index, ar, expected_ar); - nerrs++; --- -2.14.2 - diff --git a/patches/kernel/0115-selftests-x86-protection_keys-Fix-syscall-NR-redefin.patch b/patches/kernel/0115-selftests-x86-protection_keys-Fix-syscall-NR-redefin.patch new file mode 100644 index 0000000..810e63a --- /dev/null +++ b/patches/kernel/0115-selftests-x86-protection_keys-Fix-syscall-NR-redefin.patch @@ -0,0 +1,77 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Sat, 4 Nov 2017 04:19:48 -0700 +Subject: [PATCH] selftests/x86/protection_keys: Fix syscall NR redefinition + warnings +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +On new enough glibc, the pkey syscalls numbers are available. Check +first before defining them to avoid warnings like: + +protection_keys.c:198:0: warning: "SYS_pkey_alloc" redefined + +Signed-off-by: Andy Lutomirski +Cc: Borislav Petkov +Cc: Dave Hansen +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: stable@vger.kernel.org +Link: http://lkml.kernel.org/r/1fbef53a9e6befb7165ff855fc1a7d4788a191d6.1509794321.git.luto@kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit 693cb5580fdb026922363aa103add64b3ecd572e) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 64c8ec4beb84ca8b0ff3250a8b6044d06be6315b) +Signed-off-by: Fabian Grünbichler +--- + tools/testing/selftests/x86/protection_keys.c | 24 ++++++++++++++++++------ + 1 file changed, 18 insertions(+), 6 deletions(-) + +diff --git a/tools/testing/selftests/x86/protection_keys.c b/tools/testing/selftests/x86/protection_keys.c +index 3237bc010e1c..3c54d5c40952 100644 +--- a/tools/testing/selftests/x86/protection_keys.c ++++ b/tools/testing/selftests/x86/protection_keys.c +@@ -188,17 +188,29 @@ void lots_o_noops_around_write(int *write_to_me) + #define u64 uint64_t + + #ifdef __i386__ +-#define SYS_mprotect_key 380 
+-#define SYS_pkey_alloc 381 +-#define SYS_pkey_free 382 ++ ++#ifndef SYS_mprotect_key ++# define SYS_mprotect_key 380 ++#endif ++#ifndef SYS_pkey_alloc ++# define SYS_pkey_alloc 381 ++# define SYS_pkey_free 382 ++#endif + #define REG_IP_IDX REG_EIP + #define si_pkey_offset 0x14 ++ + #else +-#define SYS_mprotect_key 329 +-#define SYS_pkey_alloc 330 +-#define SYS_pkey_free 331 ++ ++#ifndef SYS_mprotect_key ++# define SYS_mprotect_key 329 ++#endif ++#ifndef SYS_pkey_alloc ++# define SYS_pkey_alloc 330 ++# define SYS_pkey_free 331 ++#endif + #define REG_IP_IDX REG_RIP + #define si_pkey_offset 0x20 ++ + #endif + + void dump_mem(void *dumpme, int len_bytes) +-- +2.14.2 + diff --git a/patches/kernel/0116-selftests-x86-ldt_gdt-Add-infrastructure-to-test-set.patch b/patches/kernel/0116-selftests-x86-ldt_gdt-Add-infrastructure-to-test-set.patch deleted file mode 100644 index 62ea6e9..0000000 --- a/patches/kernel/0116-selftests-x86-ldt_gdt-Add-infrastructure-to-test-set.patch +++ /dev/null @@ -1,114 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Sat, 4 Nov 2017 04:19:50 -0700 -Subject: [PATCH] selftests/x86/ldt_gdt: Add infrastructure to test - set_thread_area() -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Much of the test design could apply to set_thread_area() (i.e. GDT), -not just modify_ldt(). Add set_thread_area() to the -install_valid_mode() helper. 
- -Signed-off-by: Andy Lutomirski -Cc: Borislav Petkov -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/02c23f8fba5547007f741dc24c3926e5284ede02.1509794321.git.luto@kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit d744dcad39094c9187075e274d1cdef79c57c8b5) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit d6ae7ac5849304e520538a6ce3111f372f809596) -Signed-off-by: Fabian Grünbichler ---- - tools/testing/selftests/x86/ldt_gdt.c | 53 ++++++++++++++++++++++++----------- - 1 file changed, 37 insertions(+), 16 deletions(-) - -diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c -index b2c54f4673f2..337f217d0ae9 100644 ---- a/tools/testing/selftests/x86/ldt_gdt.c -+++ b/tools/testing/selftests/x86/ldt_gdt.c -@@ -136,30 +136,51 @@ static void check_valid_segment(uint16_t index, int ldt, - } - } - --static bool install_valid_mode(const struct user_desc *desc, uint32_t ar, -- bool oldmode) -+static bool install_valid_mode(const struct user_desc *d, uint32_t ar, -+ bool oldmode, bool ldt) - { -- int ret = syscall(SYS_modify_ldt, oldmode ? 1 : 0x11, -- desc, sizeof(*desc)); -- if (ret < -1) -- errno = -ret; -+ struct user_desc desc = *d; -+ int ret; -+ -+ if (!ldt) { -+#ifndef __i386__ -+ /* No point testing set_thread_area in a 64-bit build */ -+ return false; -+#endif -+ if (!gdt_entry_num) -+ return false; -+ desc.entry_number = gdt_entry_num; -+ -+ ret = syscall(SYS_set_thread_area, &desc); -+ } else { -+ ret = syscall(SYS_modify_ldt, oldmode ? 
1 : 0x11, -+ &desc, sizeof(desc)); -+ -+ if (ret < -1) -+ errno = -ret; -+ -+ if (ret != 0 && errno == ENOSYS) { -+ printf("[OK]\tmodify_ldt returned -ENOSYS\n"); -+ return false; -+ } -+ } -+ - if (ret == 0) { -- uint32_t limit = desc->limit; -- if (desc->limit_in_pages) -+ uint32_t limit = desc.limit; -+ if (desc.limit_in_pages) - limit = (limit << 12) + 4095; -- check_valid_segment(desc->entry_number, 1, ar, limit, true); -+ check_valid_segment(desc.entry_number, ldt, ar, limit, true); - return true; -- } else if (errno == ENOSYS) { -- printf("[OK]\tmodify_ldt returned -ENOSYS\n"); -- return false; - } else { -- if (desc->seg_32bit) { -- printf("[FAIL]\tUnexpected modify_ldt failure %d\n", -+ if (desc.seg_32bit) { -+ printf("[FAIL]\tUnexpected %s failure %d\n", -+ ldt ? "modify_ldt" : "set_thread_area", - errno); - nerrs++; - return false; - } else { -- printf("[OK]\tmodify_ldt rejected 16 bit segment\n"); -+ printf("[OK]\t%s rejected 16 bit segment\n", -+ ldt ? "modify_ldt" : "set_thread_area"); - return false; - } - } -@@ -167,7 +188,7 @@ static bool install_valid_mode(const struct user_desc *desc, uint32_t ar, - - static bool install_valid(const struct user_desc *desc, uint32_t ar) - { -- return install_valid_mode(desc, ar, false); -+ return install_valid_mode(desc, ar, false, true); - } - - static void install_invalid(const struct user_desc *desc, bool oldmode) --- -2.14.2 - diff --git a/patches/kernel/0116-selftests-x86-ldt_gdt-Robustify-against-set_thread_a.patch b/patches/kernel/0116-selftests-x86-ldt_gdt-Robustify-against-set_thread_a.patch new file mode 100644 index 0000000..f3cb8f9 --- /dev/null +++ b/patches/kernel/0116-selftests-x86-ldt_gdt-Robustify-against-set_thread_a.patch @@ -0,0 +1,64 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Sat, 4 Nov 2017 04:19:49 -0700 +Subject: [PATCH] selftests/x86/ldt_gdt: Robustify against set_thread_area() + and LAR oddities +MIME-Version: 1.0 
+Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Bits 19:16 of LAR's result are undefined, and some upcoming +improvements to the test case seem to trigger this. Mask off those +bits to avoid spurious failures. + +commit 5b781c7e317f ("x86/tls: Forcibly set the accessed bit in TLS +segments") adds a valid case in which LAR's output doesn't quite +agree with set_thread_area()'s input. This isn't triggered in the +test as is, but it will be if we start calling set_thread_area() +with the accessed bit clear. Work around this discrepancy. + +I've added a Fixes tag so that -stable can pick this up if necessary. + +Signed-off-by: Andy Lutomirski +Cc: Borislav Petkov +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Fixes: 5b781c7e317f ("x86/tls: Forcibly set the accessed bit in TLS segments") +Link: http://lkml.kernel.org/r/b82f3f89c034b53580970ac865139fd8863f44e2.1509794321.git.luto@kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit d60ad744c9741586010d4bea286f09a063a90fbd) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit d4c2ffcf3efe0d9610919fd48f5a1a5e38c28c07) +Signed-off-by: Fabian Grünbichler +--- + tools/testing/selftests/x86/ldt_gdt.c | 10 +++++++++- + 1 file changed, 9 insertions(+), 1 deletion(-) + +diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c +index b9a22f18566a..b2c54f4673f2 100644 +--- a/tools/testing/selftests/x86/ldt_gdt.c ++++ b/tools/testing/selftests/x86/ldt_gdt.c +@@ -114,7 +114,15 @@ static void check_valid_segment(uint16_t index, int ldt, + return; + } + +- if (ar != expected_ar) { ++ /* The SDM says "bits 19:16 are undefined". Thanks. */ ++ ar &= ~0xF0000; ++ ++ /* ++ * NB: Different Linux versions do different things with the ++ * accessed bit in set_thread_area(). 
++ */ ++ if (ar != expected_ar && ++ (ldt || ar != (expected_ar | AR_ACCESSED))) { + printf("[FAIL]\t%s entry %hu has AR 0x%08X but expected 0x%08X\n", + (ldt ? "LDT" : "GDT"), index, ar, expected_ar); + nerrs++; +-- +2.14.2 + diff --git a/patches/kernel/0117-selftests-x86-ldt_gdt-Add-infrastructure-to-test-set.patch b/patches/kernel/0117-selftests-x86-ldt_gdt-Add-infrastructure-to-test-set.patch new file mode 100644 index 0000000..62ea6e9 --- /dev/null +++ b/patches/kernel/0117-selftests-x86-ldt_gdt-Add-infrastructure-to-test-set.patch @@ -0,0 +1,114 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Sat, 4 Nov 2017 04:19:50 -0700 +Subject: [PATCH] selftests/x86/ldt_gdt: Add infrastructure to test + set_thread_area() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Much of the test design could apply to set_thread_area() (i.e. GDT), +not just modify_ldt(). Add set_thread_area() to the +install_valid_mode() helper. 
+ +Signed-off-by: Andy Lutomirski +Cc: Borislav Petkov +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/02c23f8fba5547007f741dc24c3926e5284ede02.1509794321.git.luto@kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit d744dcad39094c9187075e274d1cdef79c57c8b5) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit d6ae7ac5849304e520538a6ce3111f372f809596) +Signed-off-by: Fabian Grünbichler +--- + tools/testing/selftests/x86/ldt_gdt.c | 53 ++++++++++++++++++++++++----------- + 1 file changed, 37 insertions(+), 16 deletions(-) + +diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c +index b2c54f4673f2..337f217d0ae9 100644 +--- a/tools/testing/selftests/x86/ldt_gdt.c ++++ b/tools/testing/selftests/x86/ldt_gdt.c +@@ -136,30 +136,51 @@ static void check_valid_segment(uint16_t index, int ldt, + } + } + +-static bool install_valid_mode(const struct user_desc *desc, uint32_t ar, +- bool oldmode) ++static bool install_valid_mode(const struct user_desc *d, uint32_t ar, ++ bool oldmode, bool ldt) + { +- int ret = syscall(SYS_modify_ldt, oldmode ? 1 : 0x11, +- desc, sizeof(*desc)); +- if (ret < -1) +- errno = -ret; ++ struct user_desc desc = *d; ++ int ret; ++ ++ if (!ldt) { ++#ifndef __i386__ ++ /* No point testing set_thread_area in a 64-bit build */ ++ return false; ++#endif ++ if (!gdt_entry_num) ++ return false; ++ desc.entry_number = gdt_entry_num; ++ ++ ret = syscall(SYS_set_thread_area, &desc); ++ } else { ++ ret = syscall(SYS_modify_ldt, oldmode ? 
1 : 0x11, ++ &desc, sizeof(desc)); ++ ++ if (ret < -1) ++ errno = -ret; ++ ++ if (ret != 0 && errno == ENOSYS) { ++ printf("[OK]\tmodify_ldt returned -ENOSYS\n"); ++ return false; ++ } ++ } ++ + if (ret == 0) { +- uint32_t limit = desc->limit; +- if (desc->limit_in_pages) ++ uint32_t limit = desc.limit; ++ if (desc.limit_in_pages) + limit = (limit << 12) + 4095; +- check_valid_segment(desc->entry_number, 1, ar, limit, true); ++ check_valid_segment(desc.entry_number, ldt, ar, limit, true); + return true; +- } else if (errno == ENOSYS) { +- printf("[OK]\tmodify_ldt returned -ENOSYS\n"); +- return false; + } else { +- if (desc->seg_32bit) { +- printf("[FAIL]\tUnexpected modify_ldt failure %d\n", ++ if (desc.seg_32bit) { ++ printf("[FAIL]\tUnexpected %s failure %d\n", ++ ldt ? "modify_ldt" : "set_thread_area", + errno); + nerrs++; + return false; + } else { +- printf("[OK]\tmodify_ldt rejected 16 bit segment\n"); ++ printf("[OK]\t%s rejected 16 bit segment\n", ++ ldt ? "modify_ldt" : "set_thread_area"); + return false; + } + } +@@ -167,7 +188,7 @@ static bool install_valid_mode(const struct user_desc *desc, uint32_t ar, + + static bool install_valid(const struct user_desc *desc, uint32_t ar) + { +- return install_valid_mode(desc, ar, false); ++ return install_valid_mode(desc, ar, false, true); + } + + static void install_invalid(const struct user_desc *desc, bool oldmode) +-- +2.14.2 + diff --git a/patches/kernel/0117-selftests-x86-ldt_gdt-Run-most-existing-LDT-test-cas.patch b/patches/kernel/0117-selftests-x86-ldt_gdt-Run-most-existing-LDT-test-cas.patch deleted file mode 100644 index a1d34a3..0000000 --- a/patches/kernel/0117-selftests-x86-ldt_gdt-Run-most-existing-LDT-test-cas.patch +++ /dev/null @@ -1,54 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Sat, 4 Nov 2017 04:19:51 -0700 -Subject: [PATCH] selftests/x86/ldt_gdt: Run most existing LDT test cases - against the GDT as well -MIME-Version: 1.0 
-Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Now that the main test infrastructure supports the GDT, run tests -that will pass the kernel's GDT permission tests against the GDT. - -Signed-off-by: Andy Lutomirski -Cc: Borislav Petkov -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/686a1eda63414da38fcecc2412db8dba1ae40581.1509794321.git.luto@kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit adedf2893c192dd09b1cc2f2dcfdd7cad99ec49d) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit f05c092307d8479094d83d4337d66e6e86e730a9) -Signed-off-by: Fabian Grünbichler ---- - tools/testing/selftests/x86/ldt_gdt.c | 10 +++++++++- - 1 file changed, 9 insertions(+), 1 deletion(-) - -diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c -index 337f217d0ae9..05d0d6f49c2c 100644 ---- a/tools/testing/selftests/x86/ldt_gdt.c -+++ b/tools/testing/selftests/x86/ldt_gdt.c -@@ -188,7 +188,15 @@ static bool install_valid_mode(const struct user_desc *d, uint32_t ar, - - static bool install_valid(const struct user_desc *desc, uint32_t ar) - { -- return install_valid_mode(desc, ar, false, true); -+ bool ret = install_valid_mode(desc, ar, false, true); -+ -+ if (desc->contents <= 1 && desc->seg_32bit && -+ !desc->seg_not_present) { -+ /* Should work in the GDT, too. 
*/ -+ install_valid_mode(desc, ar, false, false); -+ } -+ -+ return ret; - } - - static void install_invalid(const struct user_desc *desc, bool oldmode) --- -2.14.2 - diff --git a/patches/kernel/0118-selftests-x86-ldt_gdt-Run-most-existing-LDT-test-cas.patch b/patches/kernel/0118-selftests-x86-ldt_gdt-Run-most-existing-LDT-test-cas.patch new file mode 100644 index 0000000..a1d34a3 --- /dev/null +++ b/patches/kernel/0118-selftests-x86-ldt_gdt-Run-most-existing-LDT-test-cas.patch @@ -0,0 +1,54 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Sat, 4 Nov 2017 04:19:51 -0700 +Subject: [PATCH] selftests/x86/ldt_gdt: Run most existing LDT test cases + against the GDT as well +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Now that the main test infrastructure supports the GDT, run tests +that will pass the kernel's GDT permission tests against the GDT. + +Signed-off-by: Andy Lutomirski +Cc: Borislav Petkov +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/686a1eda63414da38fcecc2412db8dba1ae40581.1509794321.git.luto@kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit adedf2893c192dd09b1cc2f2dcfdd7cad99ec49d) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit f05c092307d8479094d83d4337d66e6e86e730a9) +Signed-off-by: Fabian Grünbichler +--- + tools/testing/selftests/x86/ldt_gdt.c | 10 +++++++++- + 1 file changed, 9 insertions(+), 1 deletion(-) + +diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c +index 337f217d0ae9..05d0d6f49c2c 100644 +--- a/tools/testing/selftests/x86/ldt_gdt.c ++++ b/tools/testing/selftests/x86/ldt_gdt.c +@@ -188,7 +188,15 @@ static bool install_valid_mode(const struct user_desc *d, uint32_t ar, + + static bool install_valid(const struct user_desc *desc, uint32_t ar) + { +- return 
install_valid_mode(desc, ar, false, true); ++ bool ret = install_valid_mode(desc, ar, false, true); ++ ++ if (desc->contents <= 1 && desc->seg_32bit && ++ !desc->seg_not_present) { ++ /* Should work in the GDT, too. */ ++ install_valid_mode(desc, ar, false, false); ++ } ++ ++ return ret; + } + + static void install_invalid(const struct user_desc *desc, bool oldmode) +-- +2.14.2 + diff --git a/patches/kernel/0118-selftests-x86-ldt_get-Add-a-few-additional-tests-for.patch b/patches/kernel/0118-selftests-x86-ldt_get-Add-a-few-additional-tests-for.patch deleted file mode 100644 index ebb340b..0000000 --- a/patches/kernel/0118-selftests-x86-ldt_get-Add-a-few-additional-tests-for.patch +++ /dev/null @@ -1,66 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Sat, 4 Nov 2017 04:19:52 -0700 -Subject: [PATCH] selftests/x86/ldt_get: Add a few additional tests for limits -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -We weren't testing the .limit and .limit_in_pages fields very well. -Add more tests. - -This addition seems to trigger the "bits 16:19 are undefined" issue -that was fixed in an earlier patch. I think that, at least on my -CPU, the high nibble of the limit ends in LAR bits 16:19. 
- -Signed-off-by: Andy Lutomirski -Cc: Borislav Petkov -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/5601c15ea9b3113d288953fd2838b18bedf6bc67.1509794321.git.luto@kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit fec8f5ae1715a01c72ad52cb2ecd8aacaf142302) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit a1cdabf77d7ed9ba02697ad8beb04adf46a7c7b8) -Signed-off-by: Fabian Grünbichler ---- - tools/testing/selftests/x86/ldt_gdt.c | 17 ++++++++++++++++- - 1 file changed, 16 insertions(+), 1 deletion(-) - -diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c -index 05d0d6f49c2c..8e290c9b2c3f 100644 ---- a/tools/testing/selftests/x86/ldt_gdt.c -+++ b/tools/testing/selftests/x86/ldt_gdt.c -@@ -403,9 +403,24 @@ static void do_simple_tests(void) - install_invalid(&desc, false); - - desc.seg_not_present = 0; -- desc.read_exec_only = 0; - desc.seg_32bit = 1; -+ desc.read_exec_only = 0; -+ desc.limit = 0xfffff; -+ - install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S | AR_P | AR_DB); -+ -+ desc.limit_in_pages = 1; -+ -+ install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S | AR_P | AR_DB | AR_G); -+ desc.read_exec_only = 1; -+ install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | AR_S | AR_P | AR_DB | AR_G); -+ desc.contents = 1; -+ desc.read_exec_only = 0; -+ install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA_EXPDOWN | AR_S | AR_P | AR_DB | AR_G); -+ desc.read_exec_only = 1; -+ install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA_EXPDOWN | AR_S | AR_P | AR_DB | AR_G); -+ -+ desc.limit = 0; - install_invalid(&desc, true); - } - --- -2.14.2 - diff --git a/patches/kernel/0119-ACPI-APEI-Replace-ioremap_page_range-with-fixmap.patch b/patches/kernel/0119-ACPI-APEI-Replace-ioremap_page_range-with-fixmap.patch deleted file mode 100644 index cb047d9..0000000 --- a/patches/kernel/0119-ACPI-APEI-Replace-ioremap_page_range-with-fixmap.patch +++ /dev/null @@ 
-1,190 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: James Morse -Date: Mon, 6 Nov 2017 18:44:24 +0000 -Subject: [PATCH] ACPI / APEI: Replace ioremap_page_range() with fixmap -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Replace ghes_io{re,un}map_pfn_{nmi,irq}()s use of ioremap_page_range() -with __set_fixmap() as ioremap_page_range() may sleep to allocate a new -level of page-table, even if its passed an existing final-address to -use in the mapping. - -The GHES driver can only be enabled for architectures that select -HAVE_ACPI_APEI: Add fixmap entries to both x86 and arm64. - -clear_fixmap() does the TLB invalidation in __set_fixmap() for arm64 -and __set_pte_vaddr() for x86. In each case its the same as the -respective arch_apei_flush_tlb_one(). - -Reported-by: Fengguang Wu -Suggested-by: Linus Torvalds -Signed-off-by: James Morse -Reviewed-by: Borislav Petkov -Tested-by: Tyler Baicar -Tested-by: Toshi Kani -[ For the arm64 bits: ] -Acked-by: Will Deacon -[ For the x86 bits: ] -Acked-by: Ingo Molnar -Signed-off-by: Rafael J. 
Wysocki -Cc: All applicable -(cherry picked from commit 4f89fa286f6729312e227e7c2d764e8e7b9d340e) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit eda363b23c1601f733cb1d7d66d1a4975c4c5d09) -Signed-off-by: Fabian Grünbichler ---- - arch/arm64/include/asm/fixmap.h | 7 +++++++ - arch/x86/include/asm/fixmap.h | 6 ++++++ - drivers/acpi/apei/ghes.c | 44 +++++++++++++---------------------------- - 3 files changed, 27 insertions(+), 30 deletions(-) - -diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h -index caf86be815ba..4052ec39e8db 100644 ---- a/arch/arm64/include/asm/fixmap.h -+++ b/arch/arm64/include/asm/fixmap.h -@@ -51,6 +51,13 @@ enum fixed_addresses { - - FIX_EARLYCON_MEM_BASE, - FIX_TEXT_POKE0, -+ -+#ifdef CONFIG_ACPI_APEI_GHES -+ /* Used for GHES mapping from assorted contexts */ -+ FIX_APEI_GHES_IRQ, -+ FIX_APEI_GHES_NMI, -+#endif /* CONFIG_ACPI_APEI_GHES */ -+ - __end_of_permanent_fixed_addresses, - - /* -diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h -index b65155cc3760..81c2b11f50a6 100644 ---- a/arch/x86/include/asm/fixmap.h -+++ b/arch/x86/include/asm/fixmap.h -@@ -104,6 +104,12 @@ enum fixed_addresses { - FIX_GDT_REMAP_BEGIN, - FIX_GDT_REMAP_END = FIX_GDT_REMAP_BEGIN + NR_CPUS - 1, - -+#ifdef CONFIG_ACPI_APEI_GHES -+ /* Used for GHES mapping from assorted contexts */ -+ FIX_APEI_GHES_IRQ, -+ FIX_APEI_GHES_NMI, -+#endif -+ - __end_of_permanent_fixed_addresses, - - /* -diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c -index 4827176f838d..f9f106e62e74 100644 ---- a/drivers/acpi/apei/ghes.c -+++ b/drivers/acpi/apei/ghes.c -@@ -51,6 +51,7 @@ - #include - #include - #include -+#include - #include - #include - -@@ -112,7 +113,7 @@ static DEFINE_MUTEX(ghes_list_mutex); - * Because the memory area used to transfer hardware error information - * from BIOS to Linux can be determined only in NMI, IRQ or timer - * handler, but general ioremap can 
not be used in atomic context, so -- * a special version of atomic ioremap is implemented for that. -+ * the fixmap is used instead. - */ - - /* -@@ -126,8 +127,8 @@ static DEFINE_MUTEX(ghes_list_mutex); - /* virtual memory area for atomic ioremap */ - static struct vm_struct *ghes_ioremap_area; - /* -- * These 2 spinlock is used to prevent atomic ioremap virtual memory -- * area from being mapped simultaneously. -+ * These 2 spinlocks are used to prevent the fixmap entries from being used -+ * simultaneously. - */ - static DEFINE_RAW_SPINLOCK(ghes_ioremap_lock_nmi); - static DEFINE_SPINLOCK(ghes_ioremap_lock_irq); -@@ -159,53 +160,36 @@ static void ghes_ioremap_exit(void) - - static void __iomem *ghes_ioremap_pfn_nmi(u64 pfn) - { -- unsigned long vaddr; - phys_addr_t paddr; - pgprot_t prot; - -- vaddr = (unsigned long)GHES_IOREMAP_NMI_PAGE(ghes_ioremap_area->addr); -- - paddr = pfn << PAGE_SHIFT; - prot = arch_apei_get_mem_attribute(paddr); -- ioremap_page_range(vaddr, vaddr + PAGE_SIZE, paddr, prot); -+ __set_fixmap(FIX_APEI_GHES_NMI, paddr, prot); - -- return (void __iomem *)vaddr; -+ return (void __iomem *) fix_to_virt(FIX_APEI_GHES_NMI); - } - - static void __iomem *ghes_ioremap_pfn_irq(u64 pfn) - { -- unsigned long vaddr; - phys_addr_t paddr; - pgprot_t prot; - -- vaddr = (unsigned long)GHES_IOREMAP_IRQ_PAGE(ghes_ioremap_area->addr); -- - paddr = pfn << PAGE_SHIFT; - prot = arch_apei_get_mem_attribute(paddr); -+ __set_fixmap(FIX_APEI_GHES_IRQ, paddr, prot); - -- ioremap_page_range(vaddr, vaddr + PAGE_SIZE, paddr, prot); -- -- return (void __iomem *)vaddr; -+ return (void __iomem *) fix_to_virt(FIX_APEI_GHES_IRQ); - } - --static void ghes_iounmap_nmi(void __iomem *vaddr_ptr) -+static void ghes_iounmap_nmi(void) - { -- unsigned long vaddr = (unsigned long __force)vaddr_ptr; -- void *base = ghes_ioremap_area->addr; -- -- BUG_ON(vaddr != (unsigned long)GHES_IOREMAP_NMI_PAGE(base)); -- unmap_kernel_range_noflush(vaddr, PAGE_SIZE); -- 
arch_apei_flush_tlb_one(vaddr); -+ clear_fixmap(FIX_APEI_GHES_NMI); - } - --static void ghes_iounmap_irq(void __iomem *vaddr_ptr) -+static void ghes_iounmap_irq(void) - { -- unsigned long vaddr = (unsigned long __force)vaddr_ptr; -- void *base = ghes_ioremap_area->addr; -- -- BUG_ON(vaddr != (unsigned long)GHES_IOREMAP_IRQ_PAGE(base)); -- unmap_kernel_range_noflush(vaddr, PAGE_SIZE); -- arch_apei_flush_tlb_one(vaddr); -+ clear_fixmap(FIX_APEI_GHES_IRQ); - } - - static int ghes_estatus_pool_init(void) -@@ -361,10 +345,10 @@ static void ghes_copy_tofrom_phys(void *buffer, u64 paddr, u32 len, - paddr += trunk; - buffer += trunk; - if (in_nmi) { -- ghes_iounmap_nmi(vaddr); -+ ghes_iounmap_nmi(); - raw_spin_unlock(&ghes_ioremap_lock_nmi); - } else { -- ghes_iounmap_irq(vaddr); -+ ghes_iounmap_irq(); - spin_unlock_irqrestore(&ghes_ioremap_lock_irq, flags); - } - } --- -2.14.2 - diff --git a/patches/kernel/0119-selftests-x86-ldt_get-Add-a-few-additional-tests-for.patch b/patches/kernel/0119-selftests-x86-ldt_get-Add-a-few-additional-tests-for.patch new file mode 100644 index 0000000..ebb340b --- /dev/null +++ b/patches/kernel/0119-selftests-x86-ldt_get-Add-a-few-additional-tests-for.patch @@ -0,0 +1,66 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Sat, 4 Nov 2017 04:19:52 -0700 +Subject: [PATCH] selftests/x86/ldt_get: Add a few additional tests for limits +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +We weren't testing the .limit and .limit_in_pages fields very well. +Add more tests. + +This addition seems to trigger the "bits 16:19 are undefined" issue +that was fixed in an earlier patch. I think that, at least on my +CPU, the high nibble of the limit ends in LAR bits 16:19. 
+ +Signed-off-by: Andy Lutomirski +Cc: Borislav Petkov +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/5601c15ea9b3113d288953fd2838b18bedf6bc67.1509794321.git.luto@kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit fec8f5ae1715a01c72ad52cb2ecd8aacaf142302) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit a1cdabf77d7ed9ba02697ad8beb04adf46a7c7b8) +Signed-off-by: Fabian Grünbichler +--- + tools/testing/selftests/x86/ldt_gdt.c | 17 ++++++++++++++++- + 1 file changed, 16 insertions(+), 1 deletion(-) + +diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c +index 05d0d6f49c2c..8e290c9b2c3f 100644 +--- a/tools/testing/selftests/x86/ldt_gdt.c ++++ b/tools/testing/selftests/x86/ldt_gdt.c +@@ -403,9 +403,24 @@ static void do_simple_tests(void) + install_invalid(&desc, false); + + desc.seg_not_present = 0; +- desc.read_exec_only = 0; + desc.seg_32bit = 1; ++ desc.read_exec_only = 0; ++ desc.limit = 0xfffff; ++ + install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S | AR_P | AR_DB); ++ ++ desc.limit_in_pages = 1; ++ ++ install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S | AR_P | AR_DB | AR_G); ++ desc.read_exec_only = 1; ++ install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | AR_S | AR_P | AR_DB | AR_G); ++ desc.contents = 1; ++ desc.read_exec_only = 0; ++ install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA_EXPDOWN | AR_S | AR_P | AR_DB | AR_G); ++ desc.read_exec_only = 1; ++ install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA_EXPDOWN | AR_S | AR_P | AR_DB | AR_G); ++ ++ desc.limit = 0; + install_invalid(&desc, true); + } + +-- +2.14.2 + diff --git a/patches/kernel/0120-ACPI-APEI-Replace-ioremap_page_range-with-fixmap.patch b/patches/kernel/0120-ACPI-APEI-Replace-ioremap_page_range-with-fixmap.patch new file mode 100644 index 0000000..cb047d9 --- /dev/null +++ b/patches/kernel/0120-ACPI-APEI-Replace-ioremap_page_range-with-fixmap.patch @@ -0,0 
+1,190 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: James Morse +Date: Mon, 6 Nov 2017 18:44:24 +0000 +Subject: [PATCH] ACPI / APEI: Replace ioremap_page_range() with fixmap +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Replace ghes_io{re,un}map_pfn_{nmi,irq}()s use of ioremap_page_range() +with __set_fixmap() as ioremap_page_range() may sleep to allocate a new +level of page-table, even if its passed an existing final-address to +use in the mapping. + +The GHES driver can only be enabled for architectures that select +HAVE_ACPI_APEI: Add fixmap entries to both x86 and arm64. + +clear_fixmap() does the TLB invalidation in __set_fixmap() for arm64 +and __set_pte_vaddr() for x86. In each case its the same as the +respective arch_apei_flush_tlb_one(). + +Reported-by: Fengguang Wu +Suggested-by: Linus Torvalds +Signed-off-by: James Morse +Reviewed-by: Borislav Petkov +Tested-by: Tyler Baicar +Tested-by: Toshi Kani +[ For the arm64 bits: ] +Acked-by: Will Deacon +[ For the x86 bits: ] +Acked-by: Ingo Molnar +Signed-off-by: Rafael J. 
Wysocki +Cc: All applicable +(cherry picked from commit 4f89fa286f6729312e227e7c2d764e8e7b9d340e) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit eda363b23c1601f733cb1d7d66d1a4975c4c5d09) +Signed-off-by: Fabian Grünbichler +--- + arch/arm64/include/asm/fixmap.h | 7 +++++++ + arch/x86/include/asm/fixmap.h | 6 ++++++ + drivers/acpi/apei/ghes.c | 44 +++++++++++++---------------------------- + 3 files changed, 27 insertions(+), 30 deletions(-) + +diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h +index caf86be815ba..4052ec39e8db 100644 +--- a/arch/arm64/include/asm/fixmap.h ++++ b/arch/arm64/include/asm/fixmap.h +@@ -51,6 +51,13 @@ enum fixed_addresses { + + FIX_EARLYCON_MEM_BASE, + FIX_TEXT_POKE0, ++ ++#ifdef CONFIG_ACPI_APEI_GHES ++ /* Used for GHES mapping from assorted contexts */ ++ FIX_APEI_GHES_IRQ, ++ FIX_APEI_GHES_NMI, ++#endif /* CONFIG_ACPI_APEI_GHES */ ++ + __end_of_permanent_fixed_addresses, + + /* +diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h +index b65155cc3760..81c2b11f50a6 100644 +--- a/arch/x86/include/asm/fixmap.h ++++ b/arch/x86/include/asm/fixmap.h +@@ -104,6 +104,12 @@ enum fixed_addresses { + FIX_GDT_REMAP_BEGIN, + FIX_GDT_REMAP_END = FIX_GDT_REMAP_BEGIN + NR_CPUS - 1, + ++#ifdef CONFIG_ACPI_APEI_GHES ++ /* Used for GHES mapping from assorted contexts */ ++ FIX_APEI_GHES_IRQ, ++ FIX_APEI_GHES_NMI, ++#endif ++ + __end_of_permanent_fixed_addresses, + + /* +diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c +index 4827176f838d..f9f106e62e74 100644 +--- a/drivers/acpi/apei/ghes.c ++++ b/drivers/acpi/apei/ghes.c +@@ -51,6 +51,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -112,7 +113,7 @@ static DEFINE_MUTEX(ghes_list_mutex); + * Because the memory area used to transfer hardware error information + * from BIOS to Linux can be determined only in NMI, IRQ or timer + * handler, but general ioremap can 
not be used in atomic context, so +- * a special version of atomic ioremap is implemented for that. ++ * the fixmap is used instead. + */ + + /* +@@ -126,8 +127,8 @@ static DEFINE_MUTEX(ghes_list_mutex); + /* virtual memory area for atomic ioremap */ + static struct vm_struct *ghes_ioremap_area; + /* +- * These 2 spinlock is used to prevent atomic ioremap virtual memory +- * area from being mapped simultaneously. ++ * These 2 spinlocks are used to prevent the fixmap entries from being used ++ * simultaneously. + */ + static DEFINE_RAW_SPINLOCK(ghes_ioremap_lock_nmi); + static DEFINE_SPINLOCK(ghes_ioremap_lock_irq); +@@ -159,53 +160,36 @@ static void ghes_ioremap_exit(void) + + static void __iomem *ghes_ioremap_pfn_nmi(u64 pfn) + { +- unsigned long vaddr; + phys_addr_t paddr; + pgprot_t prot; + +- vaddr = (unsigned long)GHES_IOREMAP_NMI_PAGE(ghes_ioremap_area->addr); +- + paddr = pfn << PAGE_SHIFT; + prot = arch_apei_get_mem_attribute(paddr); +- ioremap_page_range(vaddr, vaddr + PAGE_SIZE, paddr, prot); ++ __set_fixmap(FIX_APEI_GHES_NMI, paddr, prot); + +- return (void __iomem *)vaddr; ++ return (void __iomem *) fix_to_virt(FIX_APEI_GHES_NMI); + } + + static void __iomem *ghes_ioremap_pfn_irq(u64 pfn) + { +- unsigned long vaddr; + phys_addr_t paddr; + pgprot_t prot; + +- vaddr = (unsigned long)GHES_IOREMAP_IRQ_PAGE(ghes_ioremap_area->addr); +- + paddr = pfn << PAGE_SHIFT; + prot = arch_apei_get_mem_attribute(paddr); ++ __set_fixmap(FIX_APEI_GHES_IRQ, paddr, prot); + +- ioremap_page_range(vaddr, vaddr + PAGE_SIZE, paddr, prot); +- +- return (void __iomem *)vaddr; ++ return (void __iomem *) fix_to_virt(FIX_APEI_GHES_IRQ); + } + +-static void ghes_iounmap_nmi(void __iomem *vaddr_ptr) ++static void ghes_iounmap_nmi(void) + { +- unsigned long vaddr = (unsigned long __force)vaddr_ptr; +- void *base = ghes_ioremap_area->addr; +- +- BUG_ON(vaddr != (unsigned long)GHES_IOREMAP_NMI_PAGE(base)); +- unmap_kernel_range_noflush(vaddr, PAGE_SIZE); +- 
arch_apei_flush_tlb_one(vaddr); ++ clear_fixmap(FIX_APEI_GHES_NMI); + } + +-static void ghes_iounmap_irq(void __iomem *vaddr_ptr) ++static void ghes_iounmap_irq(void) + { +- unsigned long vaddr = (unsigned long __force)vaddr_ptr; +- void *base = ghes_ioremap_area->addr; +- +- BUG_ON(vaddr != (unsigned long)GHES_IOREMAP_IRQ_PAGE(base)); +- unmap_kernel_range_noflush(vaddr, PAGE_SIZE); +- arch_apei_flush_tlb_one(vaddr); ++ clear_fixmap(FIX_APEI_GHES_IRQ); + } + + static int ghes_estatus_pool_init(void) +@@ -361,10 +345,10 @@ static void ghes_copy_tofrom_phys(void *buffer, u64 paddr, u32 len, + paddr += trunk; + buffer += trunk; + if (in_nmi) { +- ghes_iounmap_nmi(vaddr); ++ ghes_iounmap_nmi(); + raw_spin_unlock(&ghes_ioremap_lock_nmi); + } else { +- ghes_iounmap_irq(vaddr); ++ ghes_iounmap_irq(); + spin_unlock_irqrestore(&ghes_ioremap_lock_irq, flags); + } + } +-- +2.14.2 + diff --git a/patches/kernel/0120-x86-virt-x86-platform-Merge-struct-x86_hyper-into-st.patch b/patches/kernel/0120-x86-virt-x86-platform-Merge-struct-x86_hyper-into-st.patch deleted file mode 100644 index 7fa6116..0000000 --- a/patches/kernel/0120-x86-virt-x86-platform-Merge-struct-x86_hyper-into-st.patch +++ /dev/null @@ -1,406 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Juergen Gross -Date: Thu, 9 Nov 2017 14:27:35 +0100 -Subject: [PATCH] x86/virt, x86/platform: Merge 'struct x86_hyper' into 'struct - x86_platform' and 'struct x86_init' -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Instead of x86_hyper being either NULL on bare metal or a pointer to a -struct hypervisor_x86 in case of the kernel running as a guest merge -the struct into x86_platform and x86_init. - -This will remove the need for wrappers making it hard to find out what -is being called. With dummy functions added for all callbacks testing -for a NULL function pointer can be removed, too. 
- -Suggested-by: Ingo Molnar -Signed-off-by: Juergen Gross -Acked-by: Thomas Gleixner -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: akataria@vmware.com -Cc: boris.ostrovsky@oracle.com -Cc: devel@linuxdriverproject.org -Cc: haiyangz@microsoft.com -Cc: kvm@vger.kernel.org -Cc: kys@microsoft.com -Cc: pbonzini@redhat.com -Cc: rkrcmar@redhat.com -Cc: rusty@rustcorp.com.au -Cc: sthemmin@microsoft.com -Cc: virtualization@lists.linux-foundation.org -Cc: xen-devel@lists.xenproject.org -Link: http://lkml.kernel.org/r/20171109132739.23465-2-jgross@suse.com -Signed-off-by: Ingo Molnar -(cherry picked from commit f72e38e8ec8869ac0ba5a75d7d2f897d98a1454e) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 2d0b017b38623bca666acbcb5ab251315845fa55) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/hypervisor.h | 25 ++++-------------- - arch/x86/include/asm/x86_init.h | 24 +++++++++++++++++ - include/linux/hypervisor.h | 8 ++++-- - arch/x86/kernel/apic/apic.c | 2 +- - arch/x86/kernel/cpu/hypervisor.c | 54 +++++++++++++++++++-------------------- - arch/x86/kernel/cpu/mshyperv.c | 2 +- - arch/x86/kernel/cpu/vmware.c | 4 +-- - arch/x86/kernel/kvm.c | 2 +- - arch/x86/kernel/x86_init.c | 9 +++++++ - arch/x86/mm/init.c | 2 +- - arch/x86/xen/enlighten_hvm.c | 8 +++--- - arch/x86/xen/enlighten_pv.c | 2 +- - 12 files changed, 81 insertions(+), 61 deletions(-) - -diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h -index 0ead9dbb9130..0eca7239a7aa 100644 ---- a/arch/x86/include/asm/hypervisor.h -+++ b/arch/x86/include/asm/hypervisor.h -@@ -23,6 +23,7 @@ - #ifdef CONFIG_HYPERVISOR_GUEST - - #include -+#include - #include - - /* -@@ -35,17 +36,11 @@ struct hypervisor_x86 { - /* Detection routine */ - uint32_t (*detect)(void); - -- /* Platform setup (run once per boot) */ -- void (*init_platform)(void); -+ /* init time callbacks */ -+ struct x86_hyper_init init; - -- /* X2APIC detection (run once per 
boot) */ -- bool (*x2apic_available)(void); -- -- /* pin current vcpu to specified physical cpu (run rarely) */ -- void (*pin_vcpu)(int); -- -- /* called during init_mem_mapping() to setup early mappings. */ -- void (*init_mem_mapping)(void); -+ /* runtime callbacks */ -+ struct x86_hyper_runtime runtime; - }; - - extern const struct hypervisor_x86 *x86_hyper; -@@ -58,17 +53,7 @@ extern const struct hypervisor_x86 x86_hyper_xen_hvm; - extern const struct hypervisor_x86 x86_hyper_kvm; - - extern void init_hypervisor_platform(void); --extern bool hypervisor_x2apic_available(void); --extern void hypervisor_pin_vcpu(int cpu); -- --static inline void hypervisor_init_mem_mapping(void) --{ -- if (x86_hyper && x86_hyper->init_mem_mapping) -- x86_hyper->init_mem_mapping(); --} - #else - static inline void init_hypervisor_platform(void) { } --static inline bool hypervisor_x2apic_available(void) { return false; } --static inline void hypervisor_init_mem_mapping(void) { } - #endif /* CONFIG_HYPERVISOR_GUEST */ - #endif /* _ASM_X86_HYPERVISOR_H */ -diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h -index 7ba7e90a9ad6..4d95e5a13c0b 100644 ---- a/arch/x86/include/asm/x86_init.h -+++ b/arch/x86/include/asm/x86_init.h -@@ -113,6 +113,18 @@ struct x86_init_pci { - void (*fixup_irqs)(void); - }; - -+/** -+ * struct x86_hyper_init - x86 hypervisor init functions -+ * @init_platform: platform setup -+ * @x2apic_available: X2APIC detection -+ * @init_mem_mapping: setup early mappings during init_mem_mapping() -+ */ -+struct x86_hyper_init { -+ void (*init_platform)(void); -+ bool (*x2apic_available)(void); -+ void (*init_mem_mapping)(void); -+}; -+ - /** - * struct x86_init_ops - functions for platform specific setup - * -@@ -126,6 +138,7 @@ struct x86_init_ops { - struct x86_init_timers timers; - struct x86_init_iommu iommu; - struct x86_init_pci pci; -+ struct x86_hyper_init hyper; - }; - - /** -@@ -198,6 +211,15 @@ struct x86_legacy_features { - struct 
x86_legacy_devices devices; - }; - -+/** -+ * struct x86_hyper_runtime - x86 hypervisor specific runtime callbacks -+ * -+ * @pin_vcpu: pin current vcpu to specified physical cpu (run rarely) -+ */ -+struct x86_hyper_runtime { -+ void (*pin_vcpu)(int cpu); -+}; -+ - /** - * struct x86_platform_ops - platform specific runtime functions - * @calibrate_cpu: calibrate CPU -@@ -217,6 +239,7 @@ struct x86_legacy_features { - * possible in x86_early_init_platform_quirks() by - * only using the current x86_hardware_subarch - * semantics. -+ * @hyper: x86 hypervisor specific runtime callbacks - */ - struct x86_platform_ops { - unsigned long (*calibrate_cpu)(void); -@@ -232,6 +255,7 @@ struct x86_platform_ops { - void (*apic_post_init)(void); - struct x86_legacy_features legacy; - void (*set_legacy_features)(void); -+ struct x86_hyper_runtime hyper; - }; - - struct pci_dev; -diff --git a/include/linux/hypervisor.h b/include/linux/hypervisor.h -index 3fa5ef2b3759..35e170ca87a8 100644 ---- a/include/linux/hypervisor.h -+++ b/include/linux/hypervisor.h -@@ -6,8 +6,12 @@ - * Juergen Gross - */ - --#ifdef CONFIG_HYPERVISOR_GUEST --#include -+#ifdef CONFIG_X86 -+#include -+static inline void hypervisor_pin_vcpu(int cpu) -+{ -+ x86_platform.hyper.pin_vcpu(cpu); -+} - #else - static inline void hypervisor_pin_vcpu(int cpu) - { -diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c -index 4a7f962b53ff..bb63c1350524 100644 ---- a/arch/x86/kernel/apic/apic.c -+++ b/arch/x86/kernel/apic/apic.c -@@ -1666,7 +1666,7 @@ static __init void try_to_enable_x2apic(int remap_mode) - * under KVM - */ - if (max_physical_apicid > 255 || -- !hypervisor_x2apic_available()) { -+ !x86_init.hyper.x2apic_available()) { - pr_info("x2apic: IRQ remapping doesn't support X2APIC mode\n"); - x2apic_disable(); - return; -diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c -index 4fa90006ac68..22226c1bf092 100644 ---- a/arch/x86/kernel/cpu/hypervisor.c -+++ 
b/arch/x86/kernel/cpu/hypervisor.c -@@ -44,51 +44,49 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = - const struct hypervisor_x86 *x86_hyper; - EXPORT_SYMBOL(x86_hyper); - --static inline void __init -+static inline const struct hypervisor_x86 * __init - detect_hypervisor_vendor(void) - { -- const struct hypervisor_x86 *h, * const *p; -+ const struct hypervisor_x86 *h = NULL, * const *p; - uint32_t pri, max_pri = 0; - - for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) { -- h = *p; -- pri = h->detect(); -- if (pri != 0 && pri > max_pri) { -+ pri = (*p)->detect(); -+ if (pri > max_pri) { - max_pri = pri; -- x86_hyper = h; -+ h = *p; - } - } - -- if (max_pri) -- pr_info("Hypervisor detected: %s\n", x86_hyper->name); -+ if (h) -+ pr_info("Hypervisor detected: %s\n", h->name); -+ -+ return h; - } - --void __init init_hypervisor_platform(void) -+static void __init copy_array(const void *src, void *target, unsigned int size) - { -+ unsigned int i, n = size / sizeof(void *); -+ const void * const *from = (const void * const *)src; -+ const void **to = (const void **)target; - -- detect_hypervisor_vendor(); -- -- if (!x86_hyper) -- return; -- -- if (x86_hyper->init_platform) -- x86_hyper->init_platform(); -+ for (i = 0; i < n; i++) -+ if (from[i]) -+ to[i] = from[i]; - } - --bool __init hypervisor_x2apic_available(void) -+void __init init_hypervisor_platform(void) - { -- return x86_hyper && -- x86_hyper->x2apic_available && -- x86_hyper->x2apic_available(); --} -+ const struct hypervisor_x86 *h; - --void hypervisor_pin_vcpu(int cpu) --{ -- if (!x86_hyper) -+ h = detect_hypervisor_vendor(); -+ -+ if (!h) - return; - -- if (x86_hyper->pin_vcpu) -- x86_hyper->pin_vcpu(cpu); -- else -- WARN_ONCE(1, "vcpu pinning requested but not supported!\n"); -+ copy_array(&h->init, &x86_init.hyper, sizeof(h->init)); -+ copy_array(&h->runtime, &x86_platform.hyper, sizeof(h->runtime)); -+ -+ x86_hyper = h; -+ x86_init.hyper.init_platform(); - } 
-diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c -index 70e717fccdd6..9707e431da27 100644 ---- a/arch/x86/kernel/cpu/mshyperv.c -+++ b/arch/x86/kernel/cpu/mshyperv.c -@@ -255,6 +255,6 @@ static void __init ms_hyperv_init_platform(void) - const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { - .name = "Microsoft HyperV", - .detect = ms_hyperv_platform, -- .init_platform = ms_hyperv_init_platform, -+ .init.init_platform = ms_hyperv_init_platform, - }; - EXPORT_SYMBOL(x86_hyper_ms_hyperv); -diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c -index 40ed26852ebd..4804c1d063c8 100644 ---- a/arch/x86/kernel/cpu/vmware.c -+++ b/arch/x86/kernel/cpu/vmware.c -@@ -208,7 +208,7 @@ static bool __init vmware_legacy_x2apic_available(void) - const __refconst struct hypervisor_x86 x86_hyper_vmware = { - .name = "VMware", - .detect = vmware_platform, -- .init_platform = vmware_platform_setup, -- .x2apic_available = vmware_legacy_x2apic_available, -+ .init.init_platform = vmware_platform_setup, -+ .init.x2apic_available = vmware_legacy_x2apic_available, - }; - EXPORT_SYMBOL(x86_hyper_vmware); -diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c -index 9e3798b00e40..54e373bfeab9 100644 ---- a/arch/x86/kernel/kvm.c -+++ b/arch/x86/kernel/kvm.c -@@ -547,7 +547,7 @@ static uint32_t __init kvm_detect(void) - const struct hypervisor_x86 x86_hyper_kvm __refconst = { - .name = "KVM", - .detect = kvm_detect, -- .x2apic_available = kvm_para_available, -+ .init.x2apic_available = kvm_para_available, - }; - EXPORT_SYMBOL_GPL(x86_hyper_kvm); - -diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c -index a088b2c47f73..5b2d10c1973a 100644 ---- a/arch/x86/kernel/x86_init.c -+++ b/arch/x86/kernel/x86_init.c -@@ -28,6 +28,8 @@ void x86_init_noop(void) { } - void __init x86_init_uint_noop(unsigned int unused) { } - int __init iommu_init_noop(void) { return 0; } - void iommu_shutdown_noop(void) { } -+bool __init 
bool_x86_init_noop(void) { return false; } -+void x86_op_int_noop(int cpu) { } - - /* - * The platform setup functions are preset with the default functions -@@ -81,6 +83,12 @@ struct x86_init_ops x86_init __initdata = { - .init_irq = x86_default_pci_init_irq, - .fixup_irqs = x86_default_pci_fixup_irqs, - }, -+ -+ .hyper = { -+ .init_platform = x86_init_noop, -+ .x2apic_available = bool_x86_init_noop, -+ .init_mem_mapping = x86_init_noop, -+ }, - }; - - struct x86_cpuinit_ops x86_cpuinit = { -@@ -101,6 +109,7 @@ struct x86_platform_ops x86_platform __ro_after_init = { - .get_nmi_reason = default_get_nmi_reason, - .save_sched_clock_state = tsc_save_sched_clock_state, - .restore_sched_clock_state = tsc_restore_sched_clock_state, -+ .hyper.pin_vcpu = x86_op_int_noop, - }; - - EXPORT_SYMBOL_GPL(x86_platform); -diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c -index af5c1ed21d43..a22c2b95e513 100644 ---- a/arch/x86/mm/init.c -+++ b/arch/x86/mm/init.c -@@ -671,7 +671,7 @@ void __init init_mem_mapping(void) - load_cr3(swapper_pg_dir); - __flush_tlb_all(); - -- hypervisor_init_mem_mapping(); -+ x86_init.hyper.init_mem_mapping(); - - early_memtest(0, max_pfn_mapped << PAGE_SHIFT); - } -diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c -index de503c225ae1..7b1622089f96 100644 ---- a/arch/x86/xen/enlighten_hvm.c -+++ b/arch/x86/xen/enlighten_hvm.c -@@ -229,9 +229,9 @@ static uint32_t __init xen_platform_hvm(void) - const struct hypervisor_x86 x86_hyper_xen_hvm = { - .name = "Xen HVM", - .detect = xen_platform_hvm, -- .init_platform = xen_hvm_guest_init, -- .pin_vcpu = xen_pin_vcpu, -- .x2apic_available = xen_x2apic_para_available, -- .init_mem_mapping = xen_hvm_init_mem_mapping, -+ .init.init_platform = xen_hvm_guest_init, -+ .init.x2apic_available = xen_x2apic_para_available, -+ .init.init_mem_mapping = xen_hvm_init_mem_mapping, -+ .runtime.pin_vcpu = xen_pin_vcpu, - }; - EXPORT_SYMBOL(x86_hyper_xen_hvm); -diff --git a/arch/x86/xen/enlighten_pv.c 
b/arch/x86/xen/enlighten_pv.c -index e7b213047724..4110fc9e5ee9 100644 ---- a/arch/x86/xen/enlighten_pv.c -+++ b/arch/x86/xen/enlighten_pv.c -@@ -1461,6 +1461,6 @@ static uint32_t __init xen_platform_pv(void) - const struct hypervisor_x86 x86_hyper_xen_pv = { - .name = "Xen PV", - .detect = xen_platform_pv, -- .pin_vcpu = xen_pin_vcpu, -+ .runtime.pin_vcpu = xen_pin_vcpu, - }; - EXPORT_SYMBOL(x86_hyper_xen_pv); --- -2.14.2 - diff --git a/patches/kernel/0121-x86-virt-Add-enum-for-hypervisors-to-replace-x86_hyp.patch b/patches/kernel/0121-x86-virt-Add-enum-for-hypervisors-to-replace-x86_hyp.patch deleted file mode 100644 index 02762ba..0000000 --- a/patches/kernel/0121-x86-virt-Add-enum-for-hypervisors-to-replace-x86_hyp.patch +++ /dev/null @@ -1,301 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Juergen Gross -Date: Thu, 9 Nov 2017 14:27:36 +0100 -Subject: [PATCH] x86/virt: Add enum for hypervisors to replace x86_hyper -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -The x86_hyper pointer is only used for checking whether a virtual -device is supporting the hypervisor the system is running on. - -Use an enum for that purpose instead and drop the x86_hyper pointer. 
- -Signed-off-by: Juergen Gross -Acked-by: Thomas Gleixner -Acked-by: Xavier Deguillard -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: akataria@vmware.com -Cc: arnd@arndb.de -Cc: boris.ostrovsky@oracle.com -Cc: devel@linuxdriverproject.org -Cc: dmitry.torokhov@gmail.com -Cc: gregkh@linuxfoundation.org -Cc: haiyangz@microsoft.com -Cc: kvm@vger.kernel.org -Cc: kys@microsoft.com -Cc: linux-graphics-maintainer@vmware.com -Cc: linux-input@vger.kernel.org -Cc: moltmann@vmware.com -Cc: pbonzini@redhat.com -Cc: pv-drivers@vmware.com -Cc: rkrcmar@redhat.com -Cc: sthemmin@microsoft.com -Cc: virtualization@lists.linux-foundation.org -Cc: xen-devel@lists.xenproject.org -Link: http://lkml.kernel.org/r/20171109132739.23465-3-jgross@suse.com -Signed-off-by: Ingo Molnar -(backported from commit 03b2a320b19f1424e9ac9c21696be9c60b6d0d93) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit c24b0a226fadfe1abe78fa568ff84fea6ecd7ca5) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/hypervisor.h | 23 ++++++++++++++--------- - arch/x86/hyperv/hv_init.c | 2 +- - arch/x86/kernel/cpu/hypervisor.c | 12 +++++++++--- - arch/x86/kernel/cpu/mshyperv.c | 6 +++--- - arch/x86/kernel/cpu/vmware.c | 4 ++-- - arch/x86/kernel/kvm.c | 4 ++-- - arch/x86/xen/enlighten_hvm.c | 4 ++-- - arch/x86/xen/enlighten_pv.c | 4 ++-- - drivers/hv/vmbus_drv.c | 2 +- - drivers/input/mouse/vmmouse.c | 10 ++++------ - drivers/misc/vmw_balloon.c | 2 +- - 11 files changed, 41 insertions(+), 32 deletions(-) - -diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h -index 0eca7239a7aa..1b0a5abcd8ae 100644 ---- a/arch/x86/include/asm/hypervisor.h -+++ b/arch/x86/include/asm/hypervisor.h -@@ -29,6 +29,16 @@ - /* - * x86 hypervisor information - */ -+ -+enum x86_hypervisor_type { -+ X86_HYPER_NATIVE = 0, -+ X86_HYPER_VMWARE, -+ X86_HYPER_MS_HYPERV, -+ X86_HYPER_XEN_PV, -+ X86_HYPER_XEN_HVM, -+ X86_HYPER_KVM, -+}; -+ - struct hypervisor_x86 { 
- /* Hypervisor name */ - const char *name; -@@ -36,6 +46,9 @@ struct hypervisor_x86 { - /* Detection routine */ - uint32_t (*detect)(void); - -+ /* Hypervisor type */ -+ enum x86_hypervisor_type type; -+ - /* init time callbacks */ - struct x86_hyper_init init; - -@@ -43,15 +56,7 @@ struct hypervisor_x86 { - struct x86_hyper_runtime runtime; - }; - --extern const struct hypervisor_x86 *x86_hyper; -- --/* Recognized hypervisors */ --extern const struct hypervisor_x86 x86_hyper_vmware; --extern const struct hypervisor_x86 x86_hyper_ms_hyperv; --extern const struct hypervisor_x86 x86_hyper_xen_pv; --extern const struct hypervisor_x86 x86_hyper_xen_hvm; --extern const struct hypervisor_x86 x86_hyper_kvm; -- -+extern enum x86_hypervisor_type x86_hyper_type; - extern void init_hypervisor_platform(void); - #else - static inline void init_hypervisor_platform(void) { } -diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c -index ec7c9661743f..32fa894139d5 100644 ---- a/arch/x86/hyperv/hv_init.c -+++ b/arch/x86/hyperv/hv_init.c -@@ -99,7 +99,7 @@ void hyperv_init(void) - u64 guest_id; - union hv_x64_msr_hypercall_contents hypercall_msr; - -- if (x86_hyper != &x86_hyper_ms_hyperv) -+ if (x86_hyper_type != X86_HYPER_MS_HYPERV) - return; - - /* -diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c -index 22226c1bf092..bea8d3e24f50 100644 ---- a/arch/x86/kernel/cpu/hypervisor.c -+++ b/arch/x86/kernel/cpu/hypervisor.c -@@ -26,6 +26,12 @@ - #include - #include - -+extern const struct hypervisor_x86 x86_hyper_vmware; -+extern const struct hypervisor_x86 x86_hyper_ms_hyperv; -+extern const struct hypervisor_x86 x86_hyper_xen_pv; -+extern const struct hypervisor_x86 x86_hyper_xen_hvm; -+extern const struct hypervisor_x86 x86_hyper_kvm; -+ - static const __initconst struct hypervisor_x86 * const hypervisors[] = - { - #ifdef CONFIG_XEN_PV -@@ -41,8 +47,8 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = - #endif - }; - 
--const struct hypervisor_x86 *x86_hyper; --EXPORT_SYMBOL(x86_hyper); -+enum x86_hypervisor_type x86_hyper_type; -+EXPORT_SYMBOL(x86_hyper_type); - - static inline const struct hypervisor_x86 * __init - detect_hypervisor_vendor(void) -@@ -87,6 +93,6 @@ void __init init_hypervisor_platform(void) - copy_array(&h->init, &x86_init.hyper, sizeof(h->init)); - copy_array(&h->runtime, &x86_platform.hyper, sizeof(h->runtime)); - -- x86_hyper = h; -+ x86_hyper_type = h->type; - x86_init.hyper.init_platform(); - } -diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c -index 9707e431da27..3672f1192119 100644 ---- a/arch/x86/kernel/cpu/mshyperv.c -+++ b/arch/x86/kernel/cpu/mshyperv.c -@@ -252,9 +252,9 @@ static void __init ms_hyperv_init_platform(void) - #endif - } - --const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { -- .name = "Microsoft HyperV", -+const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = { -+ .name = "Microsoft Hyper-V", - .detect = ms_hyperv_platform, -+ .type = X86_HYPER_MS_HYPERV, - .init.init_platform = ms_hyperv_init_platform, - }; --EXPORT_SYMBOL(x86_hyper_ms_hyperv); -diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c -index 4804c1d063c8..8e005329648b 100644 ---- a/arch/x86/kernel/cpu/vmware.c -+++ b/arch/x86/kernel/cpu/vmware.c -@@ -205,10 +205,10 @@ static bool __init vmware_legacy_x2apic_available(void) - (eax & (1 << VMWARE_PORT_CMD_LEGACY_X2APIC)) != 0; - } - --const __refconst struct hypervisor_x86 x86_hyper_vmware = { -+const __initconst struct hypervisor_x86 x86_hyper_vmware = { - .name = "VMware", - .detect = vmware_platform, -+ .type = X86_HYPER_VMWARE, - .init.init_platform = vmware_platform_setup, - .init.x2apic_available = vmware_legacy_x2apic_available, - }; --EXPORT_SYMBOL(x86_hyper_vmware); -diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c -index 54e373bfeab9..b65a51a24647 100644 ---- a/arch/x86/kernel/kvm.c -+++ b/arch/x86/kernel/kvm.c -@@ -544,12 +544,12 @@ 
static uint32_t __init kvm_detect(void) - return kvm_cpuid_base(); - } - --const struct hypervisor_x86 x86_hyper_kvm __refconst = { -+const __initconst struct hypervisor_x86 x86_hyper_kvm = { - .name = "KVM", - .detect = kvm_detect, -+ .type = X86_HYPER_KVM, - .init.x2apic_available = kvm_para_available, - }; --EXPORT_SYMBOL_GPL(x86_hyper_kvm); - - static __init int activate_jump_labels(void) - { -diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c -index 7b1622089f96..754d5391d9fa 100644 ---- a/arch/x86/xen/enlighten_hvm.c -+++ b/arch/x86/xen/enlighten_hvm.c -@@ -226,12 +226,12 @@ static uint32_t __init xen_platform_hvm(void) - return xen_cpuid_base(); - } - --const struct hypervisor_x86 x86_hyper_xen_hvm = { -+const __initconst struct hypervisor_x86 x86_hyper_xen_hvm = { - .name = "Xen HVM", - .detect = xen_platform_hvm, -+ .type = X86_HYPER_XEN_HVM, - .init.init_platform = xen_hvm_guest_init, - .init.x2apic_available = xen_x2apic_para_available, - .init.init_mem_mapping = xen_hvm_init_mem_mapping, - .runtime.pin_vcpu = xen_pin_vcpu, - }; --EXPORT_SYMBOL(x86_hyper_xen_hvm); -diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c -index 4110fc9e5ee9..63c81154083b 100644 ---- a/arch/x86/xen/enlighten_pv.c -+++ b/arch/x86/xen/enlighten_pv.c -@@ -1458,9 +1458,9 @@ static uint32_t __init xen_platform_pv(void) - return 0; - } - --const struct hypervisor_x86 x86_hyper_xen_pv = { -+const __initconst struct hypervisor_x86 x86_hyper_xen_pv = { - .name = "Xen PV", - .detect = xen_platform_pv, -+ .type = X86_HYPER_XEN_PV, - .runtime.pin_vcpu = xen_pin_vcpu, - }; --EXPORT_SYMBOL(x86_hyper_xen_pv); -diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c -index 5ad627044dd1..8aca7613e482 100644 ---- a/drivers/hv/vmbus_drv.c -+++ b/drivers/hv/vmbus_drv.c -@@ -1551,7 +1551,7 @@ static int __init hv_acpi_init(void) - { - int ret, t; - -- if (x86_hyper != &x86_hyper_ms_hyperv) -+ if (x86_hyper_type != X86_HYPER_MS_HYPERV) - return -ENODEV; 
- - init_completion(&probe_event); -diff --git a/drivers/input/mouse/vmmouse.c b/drivers/input/mouse/vmmouse.c -index 0f586780ceb4..1ae5c1ef3f5b 100644 ---- a/drivers/input/mouse/vmmouse.c -+++ b/drivers/input/mouse/vmmouse.c -@@ -316,11 +316,9 @@ static int vmmouse_enable(struct psmouse *psmouse) - /* - * Array of supported hypervisors. - */ --static const struct hypervisor_x86 *vmmouse_supported_hypervisors[] = { -- &x86_hyper_vmware, --#ifdef CONFIG_KVM_GUEST -- &x86_hyper_kvm, --#endif -+static enum x86_hypervisor_type vmmouse_supported_hypervisors[] = { -+ X86_HYPER_VMWARE, -+ X86_HYPER_KVM, - }; - - /** -@@ -331,7 +329,7 @@ static bool vmmouse_check_hypervisor(void) - int i; - - for (i = 0; i < ARRAY_SIZE(vmmouse_supported_hypervisors); i++) -- if (vmmouse_supported_hypervisors[i] == x86_hyper) -+ if (vmmouse_supported_hypervisors[i] == x86_hyper_type) - return true; - - return false; -diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c -index 1e688bfec567..9047c0a529b2 100644 ---- a/drivers/misc/vmw_balloon.c -+++ b/drivers/misc/vmw_balloon.c -@@ -1271,7 +1271,7 @@ static int __init vmballoon_init(void) - * Check if we are running on VMware's hypervisor and bail out - * if we are not. 
- */ -- if (x86_hyper != &x86_hyper_vmware) -+ if (x86_hyper_type != X86_HYPER_VMWARE) - return -ENODEV; - - for (is_2m_pages = 0; is_2m_pages < VMW_BALLOON_NUM_PAGE_SIZES; --- -2.14.2 - diff --git a/patches/kernel/0121-x86-virt-x86-platform-Merge-struct-x86_hyper-into-st.patch b/patches/kernel/0121-x86-virt-x86-platform-Merge-struct-x86_hyper-into-st.patch new file mode 100644 index 0000000..7fa6116 --- /dev/null +++ b/patches/kernel/0121-x86-virt-x86-platform-Merge-struct-x86_hyper-into-st.patch @@ -0,0 +1,406 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Juergen Gross +Date: Thu, 9 Nov 2017 14:27:35 +0100 +Subject: [PATCH] x86/virt, x86/platform: Merge 'struct x86_hyper' into 'struct + x86_platform' and 'struct x86_init' +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Instead of x86_hyper being either NULL on bare metal or a pointer to a +struct hypervisor_x86 in case of the kernel running as a guest merge +the struct into x86_platform and x86_init. + +This will remove the need for wrappers making it hard to find out what +is being called. With dummy functions added for all callbacks testing +for a NULL function pointer can be removed, too. 
+ +Suggested-by: Ingo Molnar +Signed-off-by: Juergen Gross +Acked-by: Thomas Gleixner +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: akataria@vmware.com +Cc: boris.ostrovsky@oracle.com +Cc: devel@linuxdriverproject.org +Cc: haiyangz@microsoft.com +Cc: kvm@vger.kernel.org +Cc: kys@microsoft.com +Cc: pbonzini@redhat.com +Cc: rkrcmar@redhat.com +Cc: rusty@rustcorp.com.au +Cc: sthemmin@microsoft.com +Cc: virtualization@lists.linux-foundation.org +Cc: xen-devel@lists.xenproject.org +Link: http://lkml.kernel.org/r/20171109132739.23465-2-jgross@suse.com +Signed-off-by: Ingo Molnar +(cherry picked from commit f72e38e8ec8869ac0ba5a75d7d2f897d98a1454e) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 2d0b017b38623bca666acbcb5ab251315845fa55) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/hypervisor.h | 25 ++++-------------- + arch/x86/include/asm/x86_init.h | 24 +++++++++++++++++ + include/linux/hypervisor.h | 8 ++++-- + arch/x86/kernel/apic/apic.c | 2 +- + arch/x86/kernel/cpu/hypervisor.c | 54 +++++++++++++++++++-------------------- + arch/x86/kernel/cpu/mshyperv.c | 2 +- + arch/x86/kernel/cpu/vmware.c | 4 +-- + arch/x86/kernel/kvm.c | 2 +- + arch/x86/kernel/x86_init.c | 9 +++++++ + arch/x86/mm/init.c | 2 +- + arch/x86/xen/enlighten_hvm.c | 8 +++--- + arch/x86/xen/enlighten_pv.c | 2 +- + 12 files changed, 81 insertions(+), 61 deletions(-) + +diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h +index 0ead9dbb9130..0eca7239a7aa 100644 +--- a/arch/x86/include/asm/hypervisor.h ++++ b/arch/x86/include/asm/hypervisor.h +@@ -23,6 +23,7 @@ + #ifdef CONFIG_HYPERVISOR_GUEST + + #include ++#include + #include + + /* +@@ -35,17 +36,11 @@ struct hypervisor_x86 { + /* Detection routine */ + uint32_t (*detect)(void); + +- /* Platform setup (run once per boot) */ +- void (*init_platform)(void); ++ /* init time callbacks */ ++ struct x86_hyper_init init; + +- /* X2APIC detection (run once per 
boot) */ +- bool (*x2apic_available)(void); +- +- /* pin current vcpu to specified physical cpu (run rarely) */ +- void (*pin_vcpu)(int); +- +- /* called during init_mem_mapping() to setup early mappings. */ +- void (*init_mem_mapping)(void); ++ /* runtime callbacks */ ++ struct x86_hyper_runtime runtime; + }; + + extern const struct hypervisor_x86 *x86_hyper; +@@ -58,17 +53,7 @@ extern const struct hypervisor_x86 x86_hyper_xen_hvm; + extern const struct hypervisor_x86 x86_hyper_kvm; + + extern void init_hypervisor_platform(void); +-extern bool hypervisor_x2apic_available(void); +-extern void hypervisor_pin_vcpu(int cpu); +- +-static inline void hypervisor_init_mem_mapping(void) +-{ +- if (x86_hyper && x86_hyper->init_mem_mapping) +- x86_hyper->init_mem_mapping(); +-} + #else + static inline void init_hypervisor_platform(void) { } +-static inline bool hypervisor_x2apic_available(void) { return false; } +-static inline void hypervisor_init_mem_mapping(void) { } + #endif /* CONFIG_HYPERVISOR_GUEST */ + #endif /* _ASM_X86_HYPERVISOR_H */ +diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h +index 7ba7e90a9ad6..4d95e5a13c0b 100644 +--- a/arch/x86/include/asm/x86_init.h ++++ b/arch/x86/include/asm/x86_init.h +@@ -113,6 +113,18 @@ struct x86_init_pci { + void (*fixup_irqs)(void); + }; + ++/** ++ * struct x86_hyper_init - x86 hypervisor init functions ++ * @init_platform: platform setup ++ * @x2apic_available: X2APIC detection ++ * @init_mem_mapping: setup early mappings during init_mem_mapping() ++ */ ++struct x86_hyper_init { ++ void (*init_platform)(void); ++ bool (*x2apic_available)(void); ++ void (*init_mem_mapping)(void); ++}; ++ + /** + * struct x86_init_ops - functions for platform specific setup + * +@@ -126,6 +138,7 @@ struct x86_init_ops { + struct x86_init_timers timers; + struct x86_init_iommu iommu; + struct x86_init_pci pci; ++ struct x86_hyper_init hyper; + }; + + /** +@@ -198,6 +211,15 @@ struct x86_legacy_features { + struct 
x86_legacy_devices devices; + }; + ++/** ++ * struct x86_hyper_runtime - x86 hypervisor specific runtime callbacks ++ * ++ * @pin_vcpu: pin current vcpu to specified physical cpu (run rarely) ++ */ ++struct x86_hyper_runtime { ++ void (*pin_vcpu)(int cpu); ++}; ++ + /** + * struct x86_platform_ops - platform specific runtime functions + * @calibrate_cpu: calibrate CPU +@@ -217,6 +239,7 @@ struct x86_legacy_features { + * possible in x86_early_init_platform_quirks() by + * only using the current x86_hardware_subarch + * semantics. ++ * @hyper: x86 hypervisor specific runtime callbacks + */ + struct x86_platform_ops { + unsigned long (*calibrate_cpu)(void); +@@ -232,6 +255,7 @@ struct x86_platform_ops { + void (*apic_post_init)(void); + struct x86_legacy_features legacy; + void (*set_legacy_features)(void); ++ struct x86_hyper_runtime hyper; + }; + + struct pci_dev; +diff --git a/include/linux/hypervisor.h b/include/linux/hypervisor.h +index 3fa5ef2b3759..35e170ca87a8 100644 +--- a/include/linux/hypervisor.h ++++ b/include/linux/hypervisor.h +@@ -6,8 +6,12 @@ + * Juergen Gross + */ + +-#ifdef CONFIG_HYPERVISOR_GUEST +-#include ++#ifdef CONFIG_X86 ++#include ++static inline void hypervisor_pin_vcpu(int cpu) ++{ ++ x86_platform.hyper.pin_vcpu(cpu); ++} + #else + static inline void hypervisor_pin_vcpu(int cpu) + { +diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c +index 4a7f962b53ff..bb63c1350524 100644 +--- a/arch/x86/kernel/apic/apic.c ++++ b/arch/x86/kernel/apic/apic.c +@@ -1666,7 +1666,7 @@ static __init void try_to_enable_x2apic(int remap_mode) + * under KVM + */ + if (max_physical_apicid > 255 || +- !hypervisor_x2apic_available()) { ++ !x86_init.hyper.x2apic_available()) { + pr_info("x2apic: IRQ remapping doesn't support X2APIC mode\n"); + x2apic_disable(); + return; +diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c +index 4fa90006ac68..22226c1bf092 100644 +--- a/arch/x86/kernel/cpu/hypervisor.c ++++ 
b/arch/x86/kernel/cpu/hypervisor.c +@@ -44,51 +44,49 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = + const struct hypervisor_x86 *x86_hyper; + EXPORT_SYMBOL(x86_hyper); + +-static inline void __init ++static inline const struct hypervisor_x86 * __init + detect_hypervisor_vendor(void) + { +- const struct hypervisor_x86 *h, * const *p; ++ const struct hypervisor_x86 *h = NULL, * const *p; + uint32_t pri, max_pri = 0; + + for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) { +- h = *p; +- pri = h->detect(); +- if (pri != 0 && pri > max_pri) { ++ pri = (*p)->detect(); ++ if (pri > max_pri) { + max_pri = pri; +- x86_hyper = h; ++ h = *p; + } + } + +- if (max_pri) +- pr_info("Hypervisor detected: %s\n", x86_hyper->name); ++ if (h) ++ pr_info("Hypervisor detected: %s\n", h->name); ++ ++ return h; + } + +-void __init init_hypervisor_platform(void) ++static void __init copy_array(const void *src, void *target, unsigned int size) + { ++ unsigned int i, n = size / sizeof(void *); ++ const void * const *from = (const void * const *)src; ++ const void **to = (const void **)target; + +- detect_hypervisor_vendor(); +- +- if (!x86_hyper) +- return; +- +- if (x86_hyper->init_platform) +- x86_hyper->init_platform(); ++ for (i = 0; i < n; i++) ++ if (from[i]) ++ to[i] = from[i]; + } + +-bool __init hypervisor_x2apic_available(void) ++void __init init_hypervisor_platform(void) + { +- return x86_hyper && +- x86_hyper->x2apic_available && +- x86_hyper->x2apic_available(); +-} ++ const struct hypervisor_x86 *h; + +-void hypervisor_pin_vcpu(int cpu) +-{ +- if (!x86_hyper) ++ h = detect_hypervisor_vendor(); ++ ++ if (!h) + return; + +- if (x86_hyper->pin_vcpu) +- x86_hyper->pin_vcpu(cpu); +- else +- WARN_ONCE(1, "vcpu pinning requested but not supported!\n"); ++ copy_array(&h->init, &x86_init.hyper, sizeof(h->init)); ++ copy_array(&h->runtime, &x86_platform.hyper, sizeof(h->runtime)); ++ ++ x86_hyper = h; ++ x86_init.hyper.init_platform(); + } 
+diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c +index 70e717fccdd6..9707e431da27 100644 +--- a/arch/x86/kernel/cpu/mshyperv.c ++++ b/arch/x86/kernel/cpu/mshyperv.c +@@ -255,6 +255,6 @@ static void __init ms_hyperv_init_platform(void) + const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { + .name = "Microsoft HyperV", + .detect = ms_hyperv_platform, +- .init_platform = ms_hyperv_init_platform, ++ .init.init_platform = ms_hyperv_init_platform, + }; + EXPORT_SYMBOL(x86_hyper_ms_hyperv); +diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c +index 40ed26852ebd..4804c1d063c8 100644 +--- a/arch/x86/kernel/cpu/vmware.c ++++ b/arch/x86/kernel/cpu/vmware.c +@@ -208,7 +208,7 @@ static bool __init vmware_legacy_x2apic_available(void) + const __refconst struct hypervisor_x86 x86_hyper_vmware = { + .name = "VMware", + .detect = vmware_platform, +- .init_platform = vmware_platform_setup, +- .x2apic_available = vmware_legacy_x2apic_available, ++ .init.init_platform = vmware_platform_setup, ++ .init.x2apic_available = vmware_legacy_x2apic_available, + }; + EXPORT_SYMBOL(x86_hyper_vmware); +diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c +index 9e3798b00e40..54e373bfeab9 100644 +--- a/arch/x86/kernel/kvm.c ++++ b/arch/x86/kernel/kvm.c +@@ -547,7 +547,7 @@ static uint32_t __init kvm_detect(void) + const struct hypervisor_x86 x86_hyper_kvm __refconst = { + .name = "KVM", + .detect = kvm_detect, +- .x2apic_available = kvm_para_available, ++ .init.x2apic_available = kvm_para_available, + }; + EXPORT_SYMBOL_GPL(x86_hyper_kvm); + +diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c +index a088b2c47f73..5b2d10c1973a 100644 +--- a/arch/x86/kernel/x86_init.c ++++ b/arch/x86/kernel/x86_init.c +@@ -28,6 +28,8 @@ void x86_init_noop(void) { } + void __init x86_init_uint_noop(unsigned int unused) { } + int __init iommu_init_noop(void) { return 0; } + void iommu_shutdown_noop(void) { } ++bool __init 
bool_x86_init_noop(void) { return false; } ++void x86_op_int_noop(int cpu) { } + + /* + * The platform setup functions are preset with the default functions +@@ -81,6 +83,12 @@ struct x86_init_ops x86_init __initdata = { + .init_irq = x86_default_pci_init_irq, + .fixup_irqs = x86_default_pci_fixup_irqs, + }, ++ ++ .hyper = { ++ .init_platform = x86_init_noop, ++ .x2apic_available = bool_x86_init_noop, ++ .init_mem_mapping = x86_init_noop, ++ }, + }; + + struct x86_cpuinit_ops x86_cpuinit = { +@@ -101,6 +109,7 @@ struct x86_platform_ops x86_platform __ro_after_init = { + .get_nmi_reason = default_get_nmi_reason, + .save_sched_clock_state = tsc_save_sched_clock_state, + .restore_sched_clock_state = tsc_restore_sched_clock_state, ++ .hyper.pin_vcpu = x86_op_int_noop, + }; + + EXPORT_SYMBOL_GPL(x86_platform); +diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c +index af5c1ed21d43..a22c2b95e513 100644 +--- a/arch/x86/mm/init.c ++++ b/arch/x86/mm/init.c +@@ -671,7 +671,7 @@ void __init init_mem_mapping(void) + load_cr3(swapper_pg_dir); + __flush_tlb_all(); + +- hypervisor_init_mem_mapping(); ++ x86_init.hyper.init_mem_mapping(); + + early_memtest(0, max_pfn_mapped << PAGE_SHIFT); + } +diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c +index de503c225ae1..7b1622089f96 100644 +--- a/arch/x86/xen/enlighten_hvm.c ++++ b/arch/x86/xen/enlighten_hvm.c +@@ -229,9 +229,9 @@ static uint32_t __init xen_platform_hvm(void) + const struct hypervisor_x86 x86_hyper_xen_hvm = { + .name = "Xen HVM", + .detect = xen_platform_hvm, +- .init_platform = xen_hvm_guest_init, +- .pin_vcpu = xen_pin_vcpu, +- .x2apic_available = xen_x2apic_para_available, +- .init_mem_mapping = xen_hvm_init_mem_mapping, ++ .init.init_platform = xen_hvm_guest_init, ++ .init.x2apic_available = xen_x2apic_para_available, ++ .init.init_mem_mapping = xen_hvm_init_mem_mapping, ++ .runtime.pin_vcpu = xen_pin_vcpu, + }; + EXPORT_SYMBOL(x86_hyper_xen_hvm); +diff --git a/arch/x86/xen/enlighten_pv.c 
b/arch/x86/xen/enlighten_pv.c +index e7b213047724..4110fc9e5ee9 100644 +--- a/arch/x86/xen/enlighten_pv.c ++++ b/arch/x86/xen/enlighten_pv.c +@@ -1461,6 +1461,6 @@ static uint32_t __init xen_platform_pv(void) + const struct hypervisor_x86 x86_hyper_xen_pv = { + .name = "Xen PV", + .detect = xen_platform_pv, +- .pin_vcpu = xen_pin_vcpu, ++ .runtime.pin_vcpu = xen_pin_vcpu, + }; + EXPORT_SYMBOL(x86_hyper_xen_pv); +-- +2.14.2 + diff --git a/patches/kernel/0122-drivers-misc-intel-pti-Rename-the-header-file-to-fre.patch b/patches/kernel/0122-drivers-misc-intel-pti-Rename-the-header-file-to-fre.patch deleted file mode 100644 index 5274bdb..0000000 --- a/patches/kernel/0122-drivers-misc-intel-pti-Rename-the-header-file-to-fre.patch +++ /dev/null @@ -1,73 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Ingo Molnar -Date: Tue, 5 Dec 2017 14:14:47 +0100 -Subject: [PATCH] drivers/misc/intel/pti: Rename the header file to free up the - namespace -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -We'd like to use the 'PTI' acronym for 'Page Table Isolation' - free up the -namespace by renaming the <linux/pti.h> driver header to <linux/intel-pti.h>. - -(Also standardize the header guard name while at it.) 
- -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Cc: J Freyensee -Cc: Greg Kroah-Hartman -Cc: linux-kernel@vger.kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit 1784f9144b143a1e8b19fe94083b040aa559182b) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit fc05996d0900765640d56179acd2f5d052ad33e2) -Signed-off-by: Fabian Grünbichler ---- - include/linux/{pti.h => intel-pti.h} | 6 +++--- - drivers/misc/pti.c | 2 +- - 2 files changed, 4 insertions(+), 4 deletions(-) - rename include/linux/{pti.h => intel-pti.h} (94%) - -diff --git a/include/linux/pti.h b/include/linux/intel-pti.h -similarity index 94% -rename from include/linux/pti.h -rename to include/linux/intel-pti.h -index b3ea01a3197e..2710d72de3c9 100644 ---- a/include/linux/pti.h -+++ b/include/linux/intel-pti.h -@@ -22,8 +22,8 @@ - * interface to write out it's contents for debugging a mobile system. - */ - --#ifndef PTI_H_ --#define PTI_H_ -+#ifndef LINUX_INTEL_PTI_H_ -+#define LINUX_INTEL_PTI_H_ - - /* offset for last dword of any PTI message. 
Part of MIPI P1149.7 */ - #define PTI_LASTDWORD_DTS 0x30 -@@ -40,4 +40,4 @@ struct pti_masterchannel *pti_request_masterchannel(u8 type, - const char *thread_name); - void pti_release_masterchannel(struct pti_masterchannel *mc); - --#endif /*PTI_H_*/ -+#endif /* LINUX_INTEL_PTI_H_ */ -diff --git a/drivers/misc/pti.c b/drivers/misc/pti.c -index eda38cbe8530..41f2a9f6851d 100644 ---- a/drivers/misc/pti.c -+++ b/drivers/misc/pti.c -@@ -32,7 +32,7 @@ - #include - #include - #include --#include -+#include - #include - #include - --- -2.14.2 - diff --git a/patches/kernel/0122-x86-virt-Add-enum-for-hypervisors-to-replace-x86_hyp.patch b/patches/kernel/0122-x86-virt-Add-enum-for-hypervisors-to-replace-x86_hyp.patch new file mode 100644 index 0000000..02762ba --- /dev/null +++ b/patches/kernel/0122-x86-virt-Add-enum-for-hypervisors-to-replace-x86_hyp.patch @@ -0,0 +1,301 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Juergen Gross +Date: Thu, 9 Nov 2017 14:27:36 +0100 +Subject: [PATCH] x86/virt: Add enum for hypervisors to replace x86_hyper +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +The x86_hyper pointer is only used for checking whether a virtual +device is supporting the hypervisor the system is running on. + +Use an enum for that purpose instead and drop the x86_hyper pointer. 
+ +Signed-off-by: Juergen Gross +Acked-by: Thomas Gleixner +Acked-by: Xavier Deguillard +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: akataria@vmware.com +Cc: arnd@arndb.de +Cc: boris.ostrovsky@oracle.com +Cc: devel@linuxdriverproject.org +Cc: dmitry.torokhov@gmail.com +Cc: gregkh@linuxfoundation.org +Cc: haiyangz@microsoft.com +Cc: kvm@vger.kernel.org +Cc: kys@microsoft.com +Cc: linux-graphics-maintainer@vmware.com +Cc: linux-input@vger.kernel.org +Cc: moltmann@vmware.com +Cc: pbonzini@redhat.com +Cc: pv-drivers@vmware.com +Cc: rkrcmar@redhat.com +Cc: sthemmin@microsoft.com +Cc: virtualization@lists.linux-foundation.org +Cc: xen-devel@lists.xenproject.org +Link: http://lkml.kernel.org/r/20171109132739.23465-3-jgross@suse.com +Signed-off-by: Ingo Molnar +(backported from commit 03b2a320b19f1424e9ac9c21696be9c60b6d0d93) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit c24b0a226fadfe1abe78fa568ff84fea6ecd7ca5) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/hypervisor.h | 23 ++++++++++++++--------- + arch/x86/hyperv/hv_init.c | 2 +- + arch/x86/kernel/cpu/hypervisor.c | 12 +++++++++--- + arch/x86/kernel/cpu/mshyperv.c | 6 +++--- + arch/x86/kernel/cpu/vmware.c | 4 ++-- + arch/x86/kernel/kvm.c | 4 ++-- + arch/x86/xen/enlighten_hvm.c | 4 ++-- + arch/x86/xen/enlighten_pv.c | 4 ++-- + drivers/hv/vmbus_drv.c | 2 +- + drivers/input/mouse/vmmouse.c | 10 ++++------ + drivers/misc/vmw_balloon.c | 2 +- + 11 files changed, 41 insertions(+), 32 deletions(-) + +diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h +index 0eca7239a7aa..1b0a5abcd8ae 100644 +--- a/arch/x86/include/asm/hypervisor.h ++++ b/arch/x86/include/asm/hypervisor.h +@@ -29,6 +29,16 @@ + /* + * x86 hypervisor information + */ ++ ++enum x86_hypervisor_type { ++ X86_HYPER_NATIVE = 0, ++ X86_HYPER_VMWARE, ++ X86_HYPER_MS_HYPERV, ++ X86_HYPER_XEN_PV, ++ X86_HYPER_XEN_HVM, ++ X86_HYPER_KVM, ++}; ++ + struct hypervisor_x86 { 
+ /* Hypervisor name */ + const char *name; +@@ -36,6 +46,9 @@ struct hypervisor_x86 { + /* Detection routine */ + uint32_t (*detect)(void); + ++ /* Hypervisor type */ ++ enum x86_hypervisor_type type; ++ + /* init time callbacks */ + struct x86_hyper_init init; + +@@ -43,15 +56,7 @@ struct hypervisor_x86 { + struct x86_hyper_runtime runtime; + }; + +-extern const struct hypervisor_x86 *x86_hyper; +- +-/* Recognized hypervisors */ +-extern const struct hypervisor_x86 x86_hyper_vmware; +-extern const struct hypervisor_x86 x86_hyper_ms_hyperv; +-extern const struct hypervisor_x86 x86_hyper_xen_pv; +-extern const struct hypervisor_x86 x86_hyper_xen_hvm; +-extern const struct hypervisor_x86 x86_hyper_kvm; +- ++extern enum x86_hypervisor_type x86_hyper_type; + extern void init_hypervisor_platform(void); + #else + static inline void init_hypervisor_platform(void) { } +diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c +index ec7c9661743f..32fa894139d5 100644 +--- a/arch/x86/hyperv/hv_init.c ++++ b/arch/x86/hyperv/hv_init.c +@@ -99,7 +99,7 @@ void hyperv_init(void) + u64 guest_id; + union hv_x64_msr_hypercall_contents hypercall_msr; + +- if (x86_hyper != &x86_hyper_ms_hyperv) ++ if (x86_hyper_type != X86_HYPER_MS_HYPERV) + return; + + /* +diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c +index 22226c1bf092..bea8d3e24f50 100644 +--- a/arch/x86/kernel/cpu/hypervisor.c ++++ b/arch/x86/kernel/cpu/hypervisor.c +@@ -26,6 +26,12 @@ + #include + #include + ++extern const struct hypervisor_x86 x86_hyper_vmware; ++extern const struct hypervisor_x86 x86_hyper_ms_hyperv; ++extern const struct hypervisor_x86 x86_hyper_xen_pv; ++extern const struct hypervisor_x86 x86_hyper_xen_hvm; ++extern const struct hypervisor_x86 x86_hyper_kvm; ++ + static const __initconst struct hypervisor_x86 * const hypervisors[] = + { + #ifdef CONFIG_XEN_PV +@@ -41,8 +47,8 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = + #endif + }; + 
+-const struct hypervisor_x86 *x86_hyper; +-EXPORT_SYMBOL(x86_hyper); ++enum x86_hypervisor_type x86_hyper_type; ++EXPORT_SYMBOL(x86_hyper_type); + + static inline const struct hypervisor_x86 * __init + detect_hypervisor_vendor(void) +@@ -87,6 +93,6 @@ void __init init_hypervisor_platform(void) + copy_array(&h->init, &x86_init.hyper, sizeof(h->init)); + copy_array(&h->runtime, &x86_platform.hyper, sizeof(h->runtime)); + +- x86_hyper = h; ++ x86_hyper_type = h->type; + x86_init.hyper.init_platform(); + } +diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c +index 9707e431da27..3672f1192119 100644 +--- a/arch/x86/kernel/cpu/mshyperv.c ++++ b/arch/x86/kernel/cpu/mshyperv.c +@@ -252,9 +252,9 @@ static void __init ms_hyperv_init_platform(void) + #endif + } + +-const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { +- .name = "Microsoft HyperV", ++const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = { ++ .name = "Microsoft Hyper-V", + .detect = ms_hyperv_platform, ++ .type = X86_HYPER_MS_HYPERV, + .init.init_platform = ms_hyperv_init_platform, + }; +-EXPORT_SYMBOL(x86_hyper_ms_hyperv); +diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c +index 4804c1d063c8..8e005329648b 100644 +--- a/arch/x86/kernel/cpu/vmware.c ++++ b/arch/x86/kernel/cpu/vmware.c +@@ -205,10 +205,10 @@ static bool __init vmware_legacy_x2apic_available(void) + (eax & (1 << VMWARE_PORT_CMD_LEGACY_X2APIC)) != 0; + } + +-const __refconst struct hypervisor_x86 x86_hyper_vmware = { ++const __initconst struct hypervisor_x86 x86_hyper_vmware = { + .name = "VMware", + .detect = vmware_platform, ++ .type = X86_HYPER_VMWARE, + .init.init_platform = vmware_platform_setup, + .init.x2apic_available = vmware_legacy_x2apic_available, + }; +-EXPORT_SYMBOL(x86_hyper_vmware); +diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c +index 54e373bfeab9..b65a51a24647 100644 +--- a/arch/x86/kernel/kvm.c ++++ b/arch/x86/kernel/kvm.c +@@ -544,12 +544,12 @@ 
static uint32_t __init kvm_detect(void) + return kvm_cpuid_base(); + } + +-const struct hypervisor_x86 x86_hyper_kvm __refconst = { ++const __initconst struct hypervisor_x86 x86_hyper_kvm = { + .name = "KVM", + .detect = kvm_detect, ++ .type = X86_HYPER_KVM, + .init.x2apic_available = kvm_para_available, + }; +-EXPORT_SYMBOL_GPL(x86_hyper_kvm); + + static __init int activate_jump_labels(void) + { +diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c +index 7b1622089f96..754d5391d9fa 100644 +--- a/arch/x86/xen/enlighten_hvm.c ++++ b/arch/x86/xen/enlighten_hvm.c +@@ -226,12 +226,12 @@ static uint32_t __init xen_platform_hvm(void) + return xen_cpuid_base(); + } + +-const struct hypervisor_x86 x86_hyper_xen_hvm = { ++const __initconst struct hypervisor_x86 x86_hyper_xen_hvm = { + .name = "Xen HVM", + .detect = xen_platform_hvm, ++ .type = X86_HYPER_XEN_HVM, + .init.init_platform = xen_hvm_guest_init, + .init.x2apic_available = xen_x2apic_para_available, + .init.init_mem_mapping = xen_hvm_init_mem_mapping, + .runtime.pin_vcpu = xen_pin_vcpu, + }; +-EXPORT_SYMBOL(x86_hyper_xen_hvm); +diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c +index 4110fc9e5ee9..63c81154083b 100644 +--- a/arch/x86/xen/enlighten_pv.c ++++ b/arch/x86/xen/enlighten_pv.c +@@ -1458,9 +1458,9 @@ static uint32_t __init xen_platform_pv(void) + return 0; + } + +-const struct hypervisor_x86 x86_hyper_xen_pv = { ++const __initconst struct hypervisor_x86 x86_hyper_xen_pv = { + .name = "Xen PV", + .detect = xen_platform_pv, ++ .type = X86_HYPER_XEN_PV, + .runtime.pin_vcpu = xen_pin_vcpu, + }; +-EXPORT_SYMBOL(x86_hyper_xen_pv); +diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c +index 5ad627044dd1..8aca7613e482 100644 +--- a/drivers/hv/vmbus_drv.c ++++ b/drivers/hv/vmbus_drv.c +@@ -1551,7 +1551,7 @@ static int __init hv_acpi_init(void) + { + int ret, t; + +- if (x86_hyper != &x86_hyper_ms_hyperv) ++ if (x86_hyper_type != X86_HYPER_MS_HYPERV) + return -ENODEV; 
+ + init_completion(&probe_event); +diff --git a/drivers/input/mouse/vmmouse.c b/drivers/input/mouse/vmmouse.c +index 0f586780ceb4..1ae5c1ef3f5b 100644 +--- a/drivers/input/mouse/vmmouse.c ++++ b/drivers/input/mouse/vmmouse.c +@@ -316,11 +316,9 @@ static int vmmouse_enable(struct psmouse *psmouse) + /* + * Array of supported hypervisors. + */ +-static const struct hypervisor_x86 *vmmouse_supported_hypervisors[] = { +- &x86_hyper_vmware, +-#ifdef CONFIG_KVM_GUEST +- &x86_hyper_kvm, +-#endif ++static enum x86_hypervisor_type vmmouse_supported_hypervisors[] = { ++ X86_HYPER_VMWARE, ++ X86_HYPER_KVM, + }; + + /** +@@ -331,7 +329,7 @@ static bool vmmouse_check_hypervisor(void) + int i; + + for (i = 0; i < ARRAY_SIZE(vmmouse_supported_hypervisors); i++) +- if (vmmouse_supported_hypervisors[i] == x86_hyper) ++ if (vmmouse_supported_hypervisors[i] == x86_hyper_type) + return true; + + return false; +diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c +index 1e688bfec567..9047c0a529b2 100644 +--- a/drivers/misc/vmw_balloon.c ++++ b/drivers/misc/vmw_balloon.c +@@ -1271,7 +1271,7 @@ static int __init vmballoon_init(void) + * Check if we are running on VMware's hypervisor and bail out + * if we are not. 
+ */ +- if (x86_hyper != &x86_hyper_vmware) ++ if (x86_hyper_type != X86_HYPER_VMWARE) + return -ENODEV; + + for (is_2m_pages = 0; is_2m_pages < VMW_BALLOON_NUM_PAGE_SIZES; +-- +2.14.2 + diff --git a/patches/kernel/0123-drivers-misc-intel-pti-Rename-the-header-file-to-fre.patch b/patches/kernel/0123-drivers-misc-intel-pti-Rename-the-header-file-to-fre.patch new file mode 100644 index 0000000..5274bdb --- /dev/null +++ b/patches/kernel/0123-drivers-misc-intel-pti-Rename-the-header-file-to-fre.patch @@ -0,0 +1,73 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Ingo Molnar +Date: Tue, 5 Dec 2017 14:14:47 +0100 +Subject: [PATCH] drivers/misc/intel/pti: Rename the header file to free up the + namespace +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +We'd like to use the 'PTI' acronym for 'Page Table Isolation' - free up the +namespace by renaming the <linux/pti.h> driver header to <linux/intel-pti.h>. + +(Also standardize the header guard name while at it.) + +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: J Freyensee +Cc: Greg Kroah-Hartman +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit 1784f9144b143a1e8b19fe94083b040aa559182b) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit fc05996d0900765640d56179acd2f5d052ad33e2) +Signed-off-by: Fabian Grünbichler +--- + include/linux/{pti.h => intel-pti.h} | 6 +++--- + drivers/misc/pti.c | 2 +- + 2 files changed, 4 insertions(+), 4 deletions(-) + rename include/linux/{pti.h => intel-pti.h} (94%) + +diff --git a/include/linux/pti.h b/include/linux/intel-pti.h +similarity index 94% +rename from include/linux/pti.h +rename to include/linux/intel-pti.h +index b3ea01a3197e..2710d72de3c9 100644 +--- a/include/linux/pti.h ++++ b/include/linux/intel-pti.h +@@ -22,8 +22,8 @@ + * interface to write out it's contents for debugging a mobile system. 
+ */ + +-#ifndef PTI_H_ +-#define PTI_H_ ++#ifndef LINUX_INTEL_PTI_H_ ++#define LINUX_INTEL_PTI_H_ + + /* offset for last dword of any PTI message. Part of MIPI P1149.7 */ + #define PTI_LASTDWORD_DTS 0x30 +@@ -40,4 +40,4 @@ struct pti_masterchannel *pti_request_masterchannel(u8 type, + const char *thread_name); + void pti_release_masterchannel(struct pti_masterchannel *mc); + +-#endif /*PTI_H_*/ ++#endif /* LINUX_INTEL_PTI_H_ */ +diff --git a/drivers/misc/pti.c b/drivers/misc/pti.c +index eda38cbe8530..41f2a9f6851d 100644 +--- a/drivers/misc/pti.c ++++ b/drivers/misc/pti.c +@@ -32,7 +32,7 @@ + #include + #include + #include +-#include ++#include + #include + #include + +-- +2.14.2 + diff --git a/patches/kernel/0123-x86-cpufeature-Add-User-Mode-Instruction-Prevention-.patch b/patches/kernel/0123-x86-cpufeature-Add-User-Mode-Instruction-Prevention-.patch deleted file mode 100644 index 1a0d9a2..0000000 --- a/patches/kernel/0123-x86-cpufeature-Add-User-Mode-Instruction-Prevention-.patch +++ /dev/null @@ -1,88 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Ricardo Neri -Date: Sun, 5 Nov 2017 18:27:51 -0800 -Subject: [PATCH] x86/cpufeature: Add User-Mode Instruction Prevention - definitions -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -[ Note, this is a Git cherry-pick of the following commit: (limited to the cpufeatures.h file) - - 3522c2a6a4f3 ("x86/cpufeature: Add User-Mode Instruction Prevention definitions") - - ... for easier x86 PTI code testing and back-porting. ] - -User-Mode Instruction Prevention is a security feature present in new -Intel processors that, when set, prevents the execution of a subset of -instructions if such instructions are executed in user mode (CPL > 0). -Attempting to execute such instructions causes a general protection -exception. 
- -The subset of instructions comprises: - - * SGDT - Store Global Descriptor Table - * SIDT - Store Interrupt Descriptor Table - * SLDT - Store Local Descriptor Table - * SMSW - Store Machine Status Word - * STR - Store Task Register - -This feature is also added to the list of disabled-features to allow -a cleaner handling of build-time configuration. - -Signed-off-by: Ricardo Neri -Reviewed-by: Thomas Gleixner -Reviewed-by: Borislav Petkov -Cc: Andrew Morton -Cc: Andy Lutomirski -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Chen Yucong -Cc: Chris Metcalf -Cc: Dave Hansen -Cc: Denys Vlasenko -Cc: Fenghua Yu -Cc: H. Peter Anvin -Cc: Huang Rui -Cc: Jiri Slaby -Cc: Jonathan Corbet -Cc: Josh Poimboeuf -Cc: Linus Torvalds -Cc: Masami Hiramatsu -Cc: Michael S. Tsirkin -Cc: Paolo Bonzini -Cc: Paul Gortmaker -Cc: Peter Zijlstra -Cc: Ravi V. Shankar -Cc: Shuah Khan -Cc: Tony Luck -Cc: Vlastimil Babka -Cc: ricardo.neri@intel.com -Link: http://lkml.kernel.org/r/1509935277-22138-7-git-send-email-ricardo.neri-calderon@linux.intel.com -Signed-off-by: Ingo Molnar -(cherry picked from commit a8b4db562e7283a1520f9e9730297ecaab7622ea) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 6193ddb9de38665ba45f7f17dd9713baec3673ca) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/cpufeatures.h | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h -index 6db782ed9cdb..0ea630bb3e74 100644 ---- a/arch/x86/include/asm/cpufeatures.h -+++ b/arch/x86/include/asm/cpufeatures.h -@@ -295,6 +295,7 @@ - - /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */ - #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ -+#define X86_FEATURE_UMIP (16*32+ 2) /* User Mode Instruction Protection */ - #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ - #define X86_FEATURE_OSPKE (16*32+ 4) /* OS 
Protection Keys Enable */ - #define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ --- -2.14.2 - diff --git a/patches/kernel/0124-x86-Make-X86_BUG_FXSAVE_LEAK-detectable-in-CPUID-on-.patch b/patches/kernel/0124-x86-Make-X86_BUG_FXSAVE_LEAK-detectable-in-CPUID-on-.patch deleted file mode 100644 index 342f3ef..0000000 --- a/patches/kernel/0124-x86-Make-X86_BUG_FXSAVE_LEAK-detectable-in-CPUID-on-.patch +++ /dev/null @@ -1,73 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Rudolf Marek -Date: Tue, 28 Nov 2017 22:01:06 +0100 -Subject: [PATCH] x86: Make X86_BUG_FXSAVE_LEAK detectable in CPUID on AMD -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -[ Note, this is a Git cherry-pick of the following commit: - - 2b67799bdf25 ("x86: Make X86_BUG_FXSAVE_LEAK detectable in CPUID on AMD") - - ... for easier x86 PTI code testing and back-porting. ] - -The latest AMD AMD64 Architecture Programmer's Manual -adds a CPUID feature XSaveErPtr (CPUID_Fn80000008_EBX[2]). - -If this feature is set, the FXSAVE, XSAVE, FXSAVEOPT, XSAVEC, XSAVES -/ FXRSTOR, XRSTOR, XRSTORS always save/restore error pointers, -thus making the X86_BUG_FXSAVE_LEAK workaround obsolete on such CPUs. 
- -Signed-Off-By: Rudolf Marek -Signed-off-by: Thomas Gleixner -Reviewed-by: Borislav Petkov -Tested-by: Borislav Petkov -Cc: Andy Lutomirski -Link: https://lkml.kernel.org/r/bdcebe90-62c5-1f05-083c-eba7f08b2540@assembler.cz -Signed-off-by: Ingo Molnar -(cherry picked from commit f2dbad36c55e5d3a91dccbde6e8cae345fe5632f) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 281b622113c66ba2de9b7725e1d232ea3c282114) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/cpufeatures.h | 1 + - arch/x86/kernel/cpu/amd.c | 7 +++++-- - 2 files changed, 6 insertions(+), 2 deletions(-) - -diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h -index 0ea630bb3e74..d57a174ec97c 100644 ---- a/arch/x86/include/asm/cpufeatures.h -+++ b/arch/x86/include/asm/cpufeatures.h -@@ -265,6 +265,7 @@ - /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ - #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ - #define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */ -+#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */ - - /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ - #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ -diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c -index 3b9e220621f8..2a5328cc03a6 100644 ---- a/arch/x86/kernel/cpu/amd.c -+++ b/arch/x86/kernel/cpu/amd.c -@@ -760,8 +760,11 @@ static void init_amd(struct cpuinfo_x86 *c) - case 0x15: init_amd_bd(c); break; - } - -- /* Enable workaround for FXSAVE leak */ -- if (c->x86 >= 6) -+ /* -+ * Enable workaround for FXSAVE leak on CPUs -+ * without a XSaveErPtr feature -+ */ -+ if ((c->x86 >= 6) && (!cpu_has(c, X86_FEATURE_XSAVEERPTR))) - set_cpu_bug(c, X86_BUG_FXSAVE_LEAK); - - cpu_detect_cache_sizes(c); --- -2.14.2 - diff --git 
a/patches/kernel/0124-x86-cpufeature-Add-User-Mode-Instruction-Prevention-.patch b/patches/kernel/0124-x86-cpufeature-Add-User-Mode-Instruction-Prevention-.patch new file mode 100644 index 0000000..1a0d9a2 --- /dev/null +++ b/patches/kernel/0124-x86-cpufeature-Add-User-Mode-Instruction-Prevention-.patch @@ -0,0 +1,88 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri +Date: Sun, 5 Nov 2017 18:27:51 -0800 +Subject: [PATCH] x86/cpufeature: Add User-Mode Instruction Prevention + definitions +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +[ Note, this is a Git cherry-pick of the following commit: (limited to the cpufeatures.h file) + + 3522c2a6a4f3 ("x86/cpufeature: Add User-Mode Instruction Prevention definitions") + + ... for easier x86 PTI code testing and back-porting. ] + +User-Mode Instruction Prevention is a security feature present in new +Intel processors that, when set, prevents the execution of a subset of +instructions if such instructions are executed in user mode (CPL > 0). +Attempting to execute such instructions causes a general protection +exception. + +The subset of instructions comprises: + + * SGDT - Store Global Descriptor Table + * SIDT - Store Interrupt Descriptor Table + * SLDT - Store Local Descriptor Table + * SMSW - Store Machine Status Word + * STR - Store Task Register + +This feature is also added to the list of disabled-features to allow +a cleaner handling of build-time configuration. + +Signed-off-by: Ricardo Neri +Reviewed-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Cc: Andrew Morton +Cc: Andy Lutomirski +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Chen Yucong +Cc: Chris Metcalf +Cc: Dave Hansen +Cc: Denys Vlasenko +Cc: Fenghua Yu +Cc: H. Peter Anvin +Cc: Huang Rui +Cc: Jiri Slaby +Cc: Jonathan Corbet +Cc: Josh Poimboeuf +Cc: Linus Torvalds +Cc: Masami Hiramatsu +Cc: Michael S. 
Tsirkin +Cc: Paolo Bonzini +Cc: Paul Gortmaker +Cc: Peter Zijlstra +Cc: Ravi V. Shankar +Cc: Shuah Khan +Cc: Tony Luck +Cc: Vlastimil Babka +Cc: ricardo.neri@intel.com +Link: http://lkml.kernel.org/r/1509935277-22138-7-git-send-email-ricardo.neri-calderon@linux.intel.com +Signed-off-by: Ingo Molnar +(cherry picked from commit a8b4db562e7283a1520f9e9730297ecaab7622ea) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 6193ddb9de38665ba45f7f17dd9713baec3673ca) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/cpufeatures.h | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index 6db782ed9cdb..0ea630bb3e74 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -295,6 +295,7 @@ + + /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */ + #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ ++#define X86_FEATURE_UMIP (16*32+ 2) /* User Mode Instruction Protection */ + #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ + #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ + #define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ +-- +2.14.2 + diff --git a/patches/kernel/0125-perf-x86-Enable-free-running-PEBS-for-REGS_USER-INTR.patch b/patches/kernel/0125-perf-x86-Enable-free-running-PEBS-for-REGS_USER-INTR.patch deleted file mode 100644 index 169282e..0000000 --- a/patches/kernel/0125-perf-x86-Enable-free-running-PEBS-for-REGS_USER-INTR.patch +++ /dev/null @@ -1,109 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andi Kleen -Date: Thu, 31 Aug 2017 14:46:30 -0700 -Subject: [PATCH] perf/x86: Enable free running PEBS for REGS_USER/INTR -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 
-Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -[ Note, this is a Git cherry-pick of the following commit: - - a47ba4d77e12 ("perf/x86: Enable free running PEBS for REGS_USER/INTR") - - ... for easier x86 PTI code testing and back-porting. ] - -Currently free running PEBS is disabled when user or interrupt -registers are requested. Most of the registers are actually -available in the PEBS record and can be supported. - -So we just need to check for the supported registers and then -allow it: it is all except for the segment register. - -For user registers this only works when the counter is limited -to ring 3 only, so this also needs to be checked. - -Signed-off-by: Andi Kleen -Signed-off-by: Peter Zijlstra (Intel) -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/20170831214630.21892-1-andi@firstfloor.org -Signed-off-by: Ingo Molnar -(backported from commit 2fe1bc1f501d55e5925b4035bcd85781adc76c63) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 06c6715f5b78b9976e72467b6bba510e243e5aad) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/events/perf_event.h | 24 +++++++++++++++++++++++- - arch/x86/events/intel/core.c | 4 ++++ - 2 files changed, 27 insertions(+), 1 deletion(-) - -diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h -index 0f7dad8bd358..590eaf7c2c3e 100644 ---- a/arch/x86/events/perf_event.h -+++ b/arch/x86/events/perf_event.h -@@ -85,13 +85,15 @@ struct amd_nb { - * Flags PEBS can handle without an PMI. - * - * TID can only be handled by flushing at context switch. -+ * REGS_USER can be handled for events limited to ring 3. 
- * - */ - #define PEBS_FREERUNNING_FLAGS \ - (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \ - PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \ - PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \ -- PERF_SAMPLE_TRANSACTION) -+ PERF_SAMPLE_TRANSACTION | \ -+ PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER) - - /* - * A debug store configuration. -@@ -110,6 +112,26 @@ struct debug_store { - u64 pebs_event_reset[MAX_PEBS_EVENTS]; - }; - -+#define PEBS_REGS \ -+ (PERF_REG_X86_AX | \ -+ PERF_REG_X86_BX | \ -+ PERF_REG_X86_CX | \ -+ PERF_REG_X86_DX | \ -+ PERF_REG_X86_DI | \ -+ PERF_REG_X86_SI | \ -+ PERF_REG_X86_SP | \ -+ PERF_REG_X86_BP | \ -+ PERF_REG_X86_IP | \ -+ PERF_REG_X86_FLAGS | \ -+ PERF_REG_X86_R8 | \ -+ PERF_REG_X86_R9 | \ -+ PERF_REG_X86_R10 | \ -+ PERF_REG_X86_R11 | \ -+ PERF_REG_X86_R12 | \ -+ PERF_REG_X86_R13 | \ -+ PERF_REG_X86_R14 | \ -+ PERF_REG_X86_R15) -+ - /* - * Per register state. - */ -diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c -index 6f342001ec6a..7f3afbf928bb 100644 ---- a/arch/x86/events/intel/core.c -+++ b/arch/x86/events/intel/core.c -@@ -2958,6 +2958,10 @@ static unsigned long intel_pmu_free_running_flags(struct perf_event *event) - - if (event->attr.use_clockid) - flags &= ~PERF_SAMPLE_TIME; -+ if (!event->attr.exclude_kernel) -+ flags &= ~PERF_SAMPLE_REGS_USER; -+ if (event->attr.sample_regs_user & ~PEBS_REGS) -+ flags &= ~(PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR); - return flags; - } - --- -2.14.2 - diff --git a/patches/kernel/0125-x86-Make-X86_BUG_FXSAVE_LEAK-detectable-in-CPUID-on-.patch b/patches/kernel/0125-x86-Make-X86_BUG_FXSAVE_LEAK-detectable-in-CPUID-on-.patch new file mode 100644 index 0000000..342f3ef --- /dev/null +++ b/patches/kernel/0125-x86-Make-X86_BUG_FXSAVE_LEAK-detectable-in-CPUID-on-.patch @@ -0,0 +1,73 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Rudolf Marek +Date: Tue, 28 Nov 2017 22:01:06 +0100 +Subject: [PATCH] x86: Make 
X86_BUG_FXSAVE_LEAK detectable in CPUID on AMD +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +[ Note, this is a Git cherry-pick of the following commit: + + 2b67799bdf25 ("x86: Make X86_BUG_FXSAVE_LEAK detectable in CPUID on AMD") + + ... for easier x86 PTI code testing and back-porting. ] + +The latest AMD AMD64 Architecture Programmer's Manual +adds a CPUID feature XSaveErPtr (CPUID_Fn80000008_EBX[2]). + +If this feature is set, the FXSAVE, XSAVE, FXSAVEOPT, XSAVEC, XSAVES +/ FXRSTOR, XRSTOR, XRSTORS always save/restore error pointers, +thus making the X86_BUG_FXSAVE_LEAK workaround obsolete on such CPUs. + +Signed-Off-By: Rudolf Marek +Signed-off-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Tested-by: Borislav Petkov +Cc: Andy Lutomirski +Link: https://lkml.kernel.org/r/bdcebe90-62c5-1f05-083c-eba7f08b2540@assembler.cz +Signed-off-by: Ingo Molnar +(cherry picked from commit f2dbad36c55e5d3a91dccbde6e8cae345fe5632f) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 281b622113c66ba2de9b7725e1d232ea3c282114) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/cpufeatures.h | 1 + + arch/x86/kernel/cpu/amd.c | 7 +++++-- + 2 files changed, 6 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index 0ea630bb3e74..d57a174ec97c 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -265,6 +265,7 @@ + /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ + #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ + #define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */ ++#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */ + + /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ + #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal 
Sensor */ +diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c +index 3b9e220621f8..2a5328cc03a6 100644 +--- a/arch/x86/kernel/cpu/amd.c ++++ b/arch/x86/kernel/cpu/amd.c +@@ -760,8 +760,11 @@ static void init_amd(struct cpuinfo_x86 *c) + case 0x15: init_amd_bd(c); break; + } + +- /* Enable workaround for FXSAVE leak */ +- if (c->x86 >= 6) ++ /* ++ * Enable workaround for FXSAVE leak on CPUs ++ * without a XSaveErPtr feature ++ */ ++ if ((c->x86 >= 6) && (!cpu_has(c, X86_FEATURE_XSAVEERPTR))) + set_cpu_bug(c, X86_BUG_FXSAVE_LEAK); + + cpu_detect_cache_sizes(c); +-- +2.14.2 + diff --git a/patches/kernel/0126-bpf-fix-build-issues-on-um-due-to-mising-bpf_perf_ev.patch b/patches/kernel/0126-bpf-fix-build-issues-on-um-due-to-mising-bpf_perf_ev.patch deleted file mode 100644 index 95f4ce8..0000000 --- a/patches/kernel/0126-bpf-fix-build-issues-on-um-due-to-mising-bpf_perf_ev.patch +++ /dev/null @@ -1,68 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Daniel Borkmann -Date: Tue, 12 Dec 2017 02:25:31 +0100 -Subject: [PATCH] bpf: fix build issues on um due to mising bpf_perf_event.h -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -[ Note, this is a Git cherry-pick of the following commit: - - a23f06f06dbe ("bpf: fix build issues on um due to mising bpf_perf_event.h") - - ... for easier x86 PTI code testing and back-porting. ] - -Since c895f6f703ad ("bpf: correct broken uapi for -BPF_PROG_TYPE_PERF_EVENT program type") um (uml) won't build -on i386 or x86_64: - - [...] - CC init/main.o - In file included from ../include/linux/perf_event.h:18:0, - from ../include/linux/trace_events.h:10, - from ../include/trace/syscall.h:7, - from ../include/linux/syscalls.h:82, - from ../init/main.c:20: - ../include/uapi/linux/bpf_perf_event.h:11:32: fatal error: - asm/bpf_perf_event.h: No such file or directory #include - - [...] 
- -Lets add missing bpf_perf_event.h also to um arch. This seems -to be the only one still missing. - -Fixes: c895f6f703ad ("bpf: correct broken uapi for BPF_PROG_TYPE_PERF_EVENT program type") -Reported-by: Randy Dunlap -Suggested-by: Richard Weinberger -Signed-off-by: Daniel Borkmann -Tested-by: Randy Dunlap -Cc: Hendrik Brueckner -Cc: Richard Weinberger -Acked-by: Alexei Starovoitov -Acked-by: Richard Weinberger -Signed-off-by: Alexei Starovoitov -Signed-off-by: Ingo Molnar -(cherry picked from commit ab95477e7cb35557ecfc837687007b646bab9a9f) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 1883b099261ebece3016b50fa403ffde90027a04) -Signed-off-by: Fabian Grünbichler ---- - arch/um/include/asm/Kbuild | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild -index 50a32c33d729..73c57f614c9e 100644 ---- a/arch/um/include/asm/Kbuild -+++ b/arch/um/include/asm/Kbuild -@@ -1,4 +1,5 @@ - generic-y += barrier.h -+generic-y += bpf_perf_event.h - generic-y += bug.h - generic-y += clkdev.h - generic-y += current.h --- -2.14.2 - diff --git a/patches/kernel/0126-perf-x86-Enable-free-running-PEBS-for-REGS_USER-INTR.patch b/patches/kernel/0126-perf-x86-Enable-free-running-PEBS-for-REGS_USER-INTR.patch new file mode 100644 index 0000000..169282e --- /dev/null +++ b/patches/kernel/0126-perf-x86-Enable-free-running-PEBS-for-REGS_USER-INTR.patch @@ -0,0 +1,109 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andi Kleen +Date: Thu, 31 Aug 2017 14:46:30 -0700 +Subject: [PATCH] perf/x86: Enable free running PEBS for REGS_USER/INTR +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +[ Note, this is a Git cherry-pick of the following commit: + + a47ba4d77e12 ("perf/x86: Enable free running PEBS for REGS_USER/INTR") + + ... for easier x86 PTI code testing and back-porting. 
] + +Currently free running PEBS is disabled when user or interrupt +registers are requested. Most of the registers are actually +available in the PEBS record and can be supported. + +So we just need to check for the supported registers and then +allow it: it is all except for the segment register. + +For user registers this only works when the counter is limited +to ring 3 only, so this also needs to be checked. + +Signed-off-by: Andi Kleen +Signed-off-by: Peter Zijlstra (Intel) +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/20170831214630.21892-1-andi@firstfloor.org +Signed-off-by: Ingo Molnar +(backported from commit 2fe1bc1f501d55e5925b4035bcd85781adc76c63) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 06c6715f5b78b9976e72467b6bba510e243e5aad) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/events/perf_event.h | 24 +++++++++++++++++++++++- + arch/x86/events/intel/core.c | 4 ++++ + 2 files changed, 27 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h +index 0f7dad8bd358..590eaf7c2c3e 100644 +--- a/arch/x86/events/perf_event.h ++++ b/arch/x86/events/perf_event.h +@@ -85,13 +85,15 @@ struct amd_nb { + * Flags PEBS can handle without an PMI. + * + * TID can only be handled by flushing at context switch. ++ * REGS_USER can be handled for events limited to ring 3. + * + */ + #define PEBS_FREERUNNING_FLAGS \ + (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \ + PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \ + PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \ +- PERF_SAMPLE_TRANSACTION) ++ PERF_SAMPLE_TRANSACTION | \ ++ PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER) + + /* + * A debug store configuration. 
+@@ -110,6 +112,26 @@ struct debug_store { + u64 pebs_event_reset[MAX_PEBS_EVENTS]; + }; + ++#define PEBS_REGS \ ++ (PERF_REG_X86_AX | \ ++ PERF_REG_X86_BX | \ ++ PERF_REG_X86_CX | \ ++ PERF_REG_X86_DX | \ ++ PERF_REG_X86_DI | \ ++ PERF_REG_X86_SI | \ ++ PERF_REG_X86_SP | \ ++ PERF_REG_X86_BP | \ ++ PERF_REG_X86_IP | \ ++ PERF_REG_X86_FLAGS | \ ++ PERF_REG_X86_R8 | \ ++ PERF_REG_X86_R9 | \ ++ PERF_REG_X86_R10 | \ ++ PERF_REG_X86_R11 | \ ++ PERF_REG_X86_R12 | \ ++ PERF_REG_X86_R13 | \ ++ PERF_REG_X86_R14 | \ ++ PERF_REG_X86_R15) ++ + /* + * Per register state. + */ +diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c +index 6f342001ec6a..7f3afbf928bb 100644 +--- a/arch/x86/events/intel/core.c ++++ b/arch/x86/events/intel/core.c +@@ -2958,6 +2958,10 @@ static unsigned long intel_pmu_free_running_flags(struct perf_event *event) + + if (event->attr.use_clockid) + flags &= ~PERF_SAMPLE_TIME; ++ if (!event->attr.exclude_kernel) ++ flags &= ~PERF_SAMPLE_REGS_USER; ++ if (event->attr.sample_regs_user & ~PEBS_REGS) ++ flags &= ~(PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR); + return flags; + } + +-- +2.14.2 + diff --git a/patches/kernel/0127-bpf-fix-build-issues-on-um-due-to-mising-bpf_perf_ev.patch b/patches/kernel/0127-bpf-fix-build-issues-on-um-due-to-mising-bpf_perf_ev.patch new file mode 100644 index 0000000..95f4ce8 --- /dev/null +++ b/patches/kernel/0127-bpf-fix-build-issues-on-um-due-to-mising-bpf_perf_ev.patch @@ -0,0 +1,68 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Daniel Borkmann +Date: Tue, 12 Dec 2017 02:25:31 +0100 +Subject: [PATCH] bpf: fix build issues on um due to mising bpf_perf_event.h +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +[ Note, this is a Git cherry-pick of the following commit: + + a23f06f06dbe ("bpf: fix build issues on um due to mising bpf_perf_event.h") + + ... for easier x86 PTI code testing and back-porting. 
] + +Since c895f6f703ad ("bpf: correct broken uapi for +BPF_PROG_TYPE_PERF_EVENT program type") um (uml) won't build +on i386 or x86_64: + + [...] + CC init/main.o + In file included from ../include/linux/perf_event.h:18:0, + from ../include/linux/trace_events.h:10, + from ../include/trace/syscall.h:7, + from ../include/linux/syscalls.h:82, + from ../init/main.c:20: + ../include/uapi/linux/bpf_perf_event.h:11:32: fatal error: + asm/bpf_perf_event.h: No such file or directory #include + + [...] + +Lets add missing bpf_perf_event.h also to um arch. This seems +to be the only one still missing. + +Fixes: c895f6f703ad ("bpf: correct broken uapi for BPF_PROG_TYPE_PERF_EVENT program type") +Reported-by: Randy Dunlap +Suggested-by: Richard Weinberger +Signed-off-by: Daniel Borkmann +Tested-by: Randy Dunlap +Cc: Hendrik Brueckner +Cc: Richard Weinberger +Acked-by: Alexei Starovoitov +Acked-by: Richard Weinberger +Signed-off-by: Alexei Starovoitov +Signed-off-by: Ingo Molnar +(cherry picked from commit ab95477e7cb35557ecfc837687007b646bab9a9f) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 1883b099261ebece3016b50fa403ffde90027a04) +Signed-off-by: Fabian Grünbichler +--- + arch/um/include/asm/Kbuild | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild +index 50a32c33d729..73c57f614c9e 100644 +--- a/arch/um/include/asm/Kbuild ++++ b/arch/um/include/asm/Kbuild +@@ -1,4 +1,5 @@ + generic-y += barrier.h ++generic-y += bpf_perf_event.h + generic-y += bug.h + generic-y += clkdev.h + generic-y += current.h +-- +2.14.2 + diff --git a/patches/kernel/0127-locking-barriers-Add-implicit-smp_read_barrier_depen.patch b/patches/kernel/0127-locking-barriers-Add-implicit-smp_read_barrier_depen.patch deleted file mode 100644 index 03874b6..0000000 --- a/patches/kernel/0127-locking-barriers-Add-implicit-smp_read_barrier_depen.patch +++ /dev/null @@ -1,53 +0,0 @@ -From 
0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Will Deacon -Date: Tue, 24 Oct 2017 11:22:47 +0100 -Subject: [PATCH] locking/barriers: Add implicit smp_read_barrier_depends() to - READ_ONCE() -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -[ Note, this is a Git cherry-pick of the following commit: - - 76ebbe78f739 ("locking/barriers: Add implicit smp_read_barrier_depends() to READ_ONCE()") - - ... for easier x86 PTI code testing and back-porting. ] - -In preparation for the removal of lockless_dereference(), which is the -same as READ_ONCE() on all architectures other than Alpha, add an -implicit smp_read_barrier_depends() to READ_ONCE() so that it can be -used to head dependency chains on all architectures. - -Signed-off-by: Will Deacon -Cc: Linus Torvalds -Cc: Paul E. McKenney -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/1508840570-22169-3-git-send-email-will.deacon@arm.com -Signed-off-by: Ingo Molnar -(cherry picked from commit c2bc66082e1048c7573d72e62f597bdc5ce13fea) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 6ef3d843f8f1a8b72ca83d4b1d457c2896278ccd) -Signed-off-by: Fabian Grünbichler ---- - include/linux/compiler.h | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/include/linux/compiler.h b/include/linux/compiler.h -index 043b60de041e..8af8814ebe7a 100644 ---- a/include/linux/compiler.h -+++ b/include/linux/compiler.h -@@ -314,6 +314,7 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s - __read_once_size(&(x), __u.__c, sizeof(x)); \ - else \ - __read_once_size_nocheck(&(x), __u.__c, sizeof(x)); \ -+ smp_read_barrier_depends(); /* Enforce dependency ordering from x */ \ - __u.__val; \ - }) - #define READ_ONCE(x) __READ_ONCE(x, 1) --- -2.14.2 - diff --git a/patches/kernel/0128-locking-barriers-Add-implicit-smp_read_barrier_depen.patch 
b/patches/kernel/0128-locking-barriers-Add-implicit-smp_read_barrier_depen.patch new file mode 100644 index 0000000..03874b6 --- /dev/null +++ b/patches/kernel/0128-locking-barriers-Add-implicit-smp_read_barrier_depen.patch @@ -0,0 +1,53 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Will Deacon +Date: Tue, 24 Oct 2017 11:22:47 +0100 +Subject: [PATCH] locking/barriers: Add implicit smp_read_barrier_depends() to + READ_ONCE() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +[ Note, this is a Git cherry-pick of the following commit: + + 76ebbe78f739 ("locking/barriers: Add implicit smp_read_barrier_depends() to READ_ONCE()") + + ... for easier x86 PTI code testing and back-porting. ] + +In preparation for the removal of lockless_dereference(), which is the +same as READ_ONCE() on all architectures other than Alpha, add an +implicit smp_read_barrier_depends() to READ_ONCE() so that it can be +used to head dependency chains on all architectures. + +Signed-off-by: Will Deacon +Cc: Linus Torvalds +Cc: Paul E. 
McKenney +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/1508840570-22169-3-git-send-email-will.deacon@arm.com +Signed-off-by: Ingo Molnar +(cherry picked from commit c2bc66082e1048c7573d72e62f597bdc5ce13fea) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 6ef3d843f8f1a8b72ca83d4b1d457c2896278ccd) +Signed-off-by: Fabian Grünbichler +--- + include/linux/compiler.h | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/include/linux/compiler.h b/include/linux/compiler.h +index 043b60de041e..8af8814ebe7a 100644 +--- a/include/linux/compiler.h ++++ b/include/linux/compiler.h +@@ -314,6 +314,7 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s + __read_once_size(&(x), __u.__c, sizeof(x)); \ + else \ + __read_once_size_nocheck(&(x), __u.__c, sizeof(x)); \ ++ smp_read_barrier_depends(); /* Enforce dependency ordering from x */ \ + __u.__val; \ + }) + #define READ_ONCE(x) __READ_ONCE(x, 1) +-- +2.14.2 + diff --git a/patches/kernel/0128-locking-barriers-Convert-users-of-lockless_dereferen.patch b/patches/kernel/0128-locking-barriers-Convert-users-of-lockless_dereferen.patch deleted file mode 100644 index e519140..0000000 --- a/patches/kernel/0128-locking-barriers-Convert-users-of-lockless_dereferen.patch +++ /dev/null @@ -1,324 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Will Deacon -Date: Tue, 24 Oct 2017 11:22:48 +0100 -Subject: [PATCH] locking/barriers: Convert users of lockless_dereference() to - READ_ONCE() -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -[ Note, this is a Git cherry-pick of the following commit: - - 506458efaf15 ("locking/barriers: Convert users of lockless_dereference() to READ_ONCE()") - - ... for easier x86 PTI code testing and back-porting. 
] - -READ_ONCE() now has an implicit smp_read_barrier_depends() call, so it -can be used instead of lockless_dereference() without any change in -semantics. - -Signed-off-by: Will Deacon -Cc: Linus Torvalds -Cc: Paul E. McKenney -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/1508840570-22169-4-git-send-email-will.deacon@arm.com -Signed-off-by: Ingo Molnar -(cherry picked from commit 3382290ed2d5e275429cef510ab21889d3ccd164) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 7252704bfd83e951d00ec75526ed2bf64a7f6ee1) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/mmu_context.h | 4 ++-- - fs/overlayfs/ovl_entry.h | 2 +- - include/linux/rculist.h | 4 ++-- - include/linux/rcupdate.h | 4 ++-- - mm/slab.h | 2 +- - arch/x86/events/core.c | 2 +- - arch/x86/kernel/ldt.c | 2 +- - drivers/md/dm-mpath.c | 20 ++++++++++---------- - fs/dcache.c | 4 ++-- - fs/overlayfs/readdir.c | 2 +- - kernel/events/core.c | 4 ++-- - kernel/seccomp.c | 2 +- - kernel/task_work.c | 2 +- - 13 files changed, 27 insertions(+), 27 deletions(-) - -diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h -index 3c856a15b98e..efc530642f7d 100644 ---- a/arch/x86/include/asm/mmu_context.h -+++ b/arch/x86/include/asm/mmu_context.h -@@ -72,8 +72,8 @@ static inline void load_mm_ldt(struct mm_struct *mm) - #ifdef CONFIG_MODIFY_LDT_SYSCALL - struct ldt_struct *ldt; - -- /* lockless_dereference synchronizes with smp_store_release */ -- ldt = lockless_dereference(mm->context.ldt); -+ /* READ_ONCE synchronizes with smp_store_release */ -+ ldt = READ_ONCE(mm->context.ldt); - - /* - * Any change to mm->context.ldt is followed by an IPI to all -diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h -index 25d9b5adcd42..36b49bd09264 100644 ---- a/fs/overlayfs/ovl_entry.h -+++ b/fs/overlayfs/ovl_entry.h -@@ -77,5 +77,5 @@ static inline struct ovl_inode *OVL_I(struct inode *inode) - - static 
inline struct dentry *ovl_upperdentry_dereference(struct ovl_inode *oi) - { -- return lockless_dereference(oi->__upperdentry); -+ return READ_ONCE(oi->__upperdentry); - } -diff --git a/include/linux/rculist.h b/include/linux/rculist.h -index b1fd8bf85fdc..3a2bb7d8ed4d 100644 ---- a/include/linux/rculist.h -+++ b/include/linux/rculist.h -@@ -274,7 +274,7 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, - * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). - */ - #define list_entry_rcu(ptr, type, member) \ -- container_of(lockless_dereference(ptr), type, member) -+ container_of(READ_ONCE(ptr), type, member) - - /** - * Where are list_empty_rcu() and list_first_entry_rcu()? -@@ -367,7 +367,7 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, - * example is when items are added to the list, but never deleted. - */ - #define list_entry_lockless(ptr, type, member) \ -- container_of((typeof(ptr))lockless_dereference(ptr), type, member) -+ container_of((typeof(ptr))READ_ONCE(ptr), type, member) - - /** - * list_for_each_entry_lockless - iterate over rcu list of given type -diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h -index f816fc72b51e..ae494eb7b401 100644 ---- a/include/linux/rcupdate.h -+++ b/include/linux/rcupdate.h -@@ -341,7 +341,7 @@ static inline void rcu_preempt_sleep_check(void) { } - #define __rcu_dereference_check(p, c, space) \ - ({ \ - /* Dependency order vs. p above. */ \ -- typeof(*p) *________p1 = (typeof(*p) *__force)lockless_dereference(p); \ -+ typeof(*p) *________p1 = (typeof(*p) *__force)READ_ONCE(p); \ - RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_check() usage"); \ - rcu_dereference_sparse(p, space); \ - ((typeof(*p) __force __kernel *)(________p1)); \ -@@ -355,7 +355,7 @@ static inline void rcu_preempt_sleep_check(void) { } - #define rcu_dereference_raw(p) \ - ({ \ - /* Dependency order vs. p above. 
*/ \ -- typeof(p) ________p1 = lockless_dereference(p); \ -+ typeof(p) ________p1 = READ_ONCE(p); \ - ((typeof(*p) __force __kernel *)(________p1)); \ - }) - -diff --git a/mm/slab.h b/mm/slab.h -index 6885e1192ec5..494cccef822a 100644 ---- a/mm/slab.h -+++ b/mm/slab.h -@@ -257,7 +257,7 @@ cache_from_memcg_idx(struct kmem_cache *s, int idx) - * memcg_caches issues a write barrier to match this (see - * memcg_create_kmem_cache()). - */ -- cachep = lockless_dereference(arr->entries[idx]); -+ cachep = READ_ONCE(arr->entries[idx]); - rcu_read_unlock(); - - return cachep; -diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c -index 939050169d12..18685de61288 100644 ---- a/arch/x86/events/core.c -+++ b/arch/x86/events/core.c -@@ -2336,7 +2336,7 @@ static unsigned long get_segment_base(unsigned int segment) - struct ldt_struct *ldt; - - /* IRQs are off, so this synchronizes with smp_store_release */ -- ldt = lockless_dereference(current->active_mm->context.ldt); -+ ldt = READ_ONCE(current->active_mm->context.ldt); - if (!ldt || idx >= ldt->nr_entries) - return 0; - -diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c -index 0402d44deb4d..b8be2413cb74 100644 ---- a/arch/x86/kernel/ldt.c -+++ b/arch/x86/kernel/ldt.c -@@ -102,7 +102,7 @@ static void finalize_ldt_struct(struct ldt_struct *ldt) - static void install_ldt(struct mm_struct *current_mm, - struct ldt_struct *ldt) - { -- /* Synchronizes with lockless_dereference in load_mm_ldt. */ -+ /* Synchronizes with READ_ONCE in load_mm_ldt. */ - smp_store_release(&current_mm->context.ldt, ldt); - - /* Activate the LDT for all CPUs using current_mm.
*/ -diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c -index d24e4b05f5da..731b7ffc7e37 100644 ---- a/drivers/md/dm-mpath.c -+++ b/drivers/md/dm-mpath.c -@@ -366,7 +366,7 @@ static struct pgpath *choose_path_in_pg(struct multipath *m, - - pgpath = path_to_pgpath(path); - -- if (unlikely(lockless_dereference(m->current_pg) != pg)) { -+ if (unlikely(READ_ONCE(m->current_pg) != pg)) { - /* Only update current_pgpath if pg changed */ - spin_lock_irqsave(&m->lock, flags); - m->current_pgpath = pgpath; -@@ -390,7 +390,7 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) - } - - /* Were we instructed to switch PG? */ -- if (lockless_dereference(m->next_pg)) { -+ if (READ_ONCE(m->next_pg)) { - spin_lock_irqsave(&m->lock, flags); - pg = m->next_pg; - if (!pg) { -@@ -406,7 +406,7 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) - - /* Don't change PG until it has no remaining paths */ - check_current_pg: -- pg = lockless_dereference(m->current_pg); -+ pg = READ_ONCE(m->current_pg); - if (pg) { - pgpath = choose_path_in_pg(m, pg, nr_bytes); - if (!IS_ERR_OR_NULL(pgpath)) -@@ -473,7 +473,7 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, - struct request *clone; - - /* Do we need to select a new pgpath? */ -- pgpath = lockless_dereference(m->current_pgpath); -+ pgpath = READ_ONCE(m->current_pgpath); - if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) - pgpath = choose_pgpath(m, nr_bytes); - -@@ -535,7 +535,7 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m - bool queue_io; - - /* Do we need to select a new pgpath? 
*/ -- pgpath = lockless_dereference(m->current_pgpath); -+ pgpath = READ_ONCE(m->current_pgpath); - queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags); - if (!pgpath || !queue_io) - pgpath = choose_pgpath(m, nr_bytes); -@@ -1799,7 +1799,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti, - struct pgpath *current_pgpath; - int r; - -- current_pgpath = lockless_dereference(m->current_pgpath); -+ current_pgpath = READ_ONCE(m->current_pgpath); - if (!current_pgpath) - current_pgpath = choose_pgpath(m, 0); - -@@ -1821,7 +1821,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti, - } - - if (r == -ENOTCONN) { -- if (!lockless_dereference(m->current_pg)) { -+ if (!READ_ONCE(m->current_pg)) { - /* Path status changed, redo selection */ - (void) choose_pgpath(m, 0); - } -@@ -1890,9 +1890,9 @@ static int multipath_busy(struct dm_target *ti) - return (m->queue_mode != DM_TYPE_MQ_REQUEST_BASED); - - /* Guess which priority_group will be used at next mapping time */ -- pg = lockless_dereference(m->current_pg); -- next_pg = lockless_dereference(m->next_pg); -- if (unlikely(!lockless_dereference(m->current_pgpath) && next_pg)) -+ pg = READ_ONCE(m->current_pg); -+ next_pg = READ_ONCE(m->next_pg); -+ if (unlikely(!READ_ONCE(m->current_pgpath) && next_pg)) - pg = next_pg; - - if (!pg) { -diff --git a/fs/dcache.c b/fs/dcache.c -index 3203470c59c2..ccc2bcdcfdfb 100644 ---- a/fs/dcache.c -+++ b/fs/dcache.c -@@ -231,7 +231,7 @@ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *c - { - /* - * Be careful about RCU walk racing with rename: -- * use 'lockless_dereference' to fetch the name pointer. -+ * use 'READ_ONCE' to fetch the name pointer. - * - * NOTE! Even if a rename will mean that the length - * was not loaded atomically, we don't care. 
The -@@ -245,7 +245,7 @@ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *c - * early because the data cannot match (there can - * be no NUL in the ct/tcount data) - */ -- const unsigned char *cs = lockless_dereference(dentry->d_name.name); -+ const unsigned char *cs = READ_ONCE(dentry->d_name.name); - - return dentry_string_cmp(cs, ct, tcount); - } -diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c -index 3ff960372cb9..7920a3f62c19 100644 ---- a/fs/overlayfs/readdir.c -+++ b/fs/overlayfs/readdir.c -@@ -440,7 +440,7 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, - if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) { - struct inode *inode = file_inode(file); - -- realfile = lockless_dereference(od->upperfile); -+ realfile = READ_ONCE(od->upperfile); - if (!realfile) { - struct path upperpath; - -diff --git a/kernel/events/core.c b/kernel/events/core.c -index 5d4398d1fa19..9f51738bf32e 100644 ---- a/kernel/events/core.c -+++ b/kernel/events/core.c -@@ -4221,7 +4221,7 @@ static void perf_remove_from_owner(struct perf_event *event) - * indeed free this event, otherwise we need to serialize on - * owner->perf_event_mutex. - */ -- owner = lockless_dereference(event->owner); -+ owner = READ_ONCE(event->owner); - if (owner) { - /* - * Since delayed_put_task_struct() also drops the last -@@ -4318,7 +4318,7 @@ int perf_event_release_kernel(struct perf_event *event) - * Cannot change, child events are not migrated, see the - * comment with perf_event_ctx_lock_nested(). - */ -- ctx = lockless_dereference(child->ctx); -+ ctx = READ_ONCE(child->ctx); - /* - * Since child_mutex nests inside ctx::mutex, we must jump - * through hoops. We start by grabbing a reference on the ctx. 
-diff --git a/kernel/seccomp.c b/kernel/seccomp.c -index 34aced9ff3ff..3fd2c4b23697 100644 ---- a/kernel/seccomp.c -+++ b/kernel/seccomp.c -@@ -188,7 +188,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, - u32 ret = SECCOMP_RET_ALLOW; - /* Make sure cross-thread synced filter points somewhere sane. */ - struct seccomp_filter *f = -- lockless_dereference(current->seccomp.filter); -+ READ_ONCE(current->seccomp.filter); - - /* Ensure unexpected behavior doesn't result in failing open. */ - if (unlikely(WARN_ON(f == NULL))) -diff --git a/kernel/task_work.c b/kernel/task_work.c -index e056d5429783..0371093a2331 100644 ---- a/kernel/task_work.c -+++ b/kernel/task_work.c -@@ -67,7 +67,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func) - * we raced with task_work_run(), *pprev == NULL/exited. - */ - raw_spin_lock_irqsave(&task->pi_lock, flags); -- while ((work = lockless_dereference(*pprev))) { -+ while ((work = READ_ONCE(*pprev))) { - if (work->func != func) - pprev = &work->next; - else if (cmpxchg(pprev, work, work->next) == work) --- -2.14.2 - diff --git a/patches/kernel/0129-locking-barriers-Convert-users-of-lockless_dereferen.patch b/patches/kernel/0129-locking-barriers-Convert-users-of-lockless_dereferen.patch new file mode 100644 index 0000000..e519140 --- /dev/null +++ b/patches/kernel/0129-locking-barriers-Convert-users-of-lockless_dereferen.patch @@ -0,0 +1,324 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Will Deacon +Date: Tue, 24 Oct 2017 11:22:48 +0100 +Subject: [PATCH] locking/barriers: Convert users of lockless_dereference() to + READ_ONCE() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +[ Note, this is a Git cherry-pick of the following commit: + + 506458efaf15 ("locking/barriers: Convert users of lockless_dereference() to READ_ONCE()") + + ... for easier x86 PTI code testing and back-porting. 
] + +READ_ONCE() now has an implicit smp_read_barrier_depends() call, so it +can be used instead of lockless_dereference() without any change in +semantics. + +Signed-off-by: Will Deacon +Cc: Linus Torvalds +Cc: Paul E. McKenney +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/1508840570-22169-4-git-send-email-will.deacon@arm.com +Signed-off-by: Ingo Molnar +(cherry picked from commit 3382290ed2d5e275429cef510ab21889d3ccd164) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 7252704bfd83e951d00ec75526ed2bf64a7f6ee1) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/mmu_context.h | 4 ++-- + fs/overlayfs/ovl_entry.h | 2 +- + include/linux/rculist.h | 4 ++-- + include/linux/rcupdate.h | 4 ++-- + mm/slab.h | 2 +- + arch/x86/events/core.c | 2 +- + arch/x86/kernel/ldt.c | 2 +- + drivers/md/dm-mpath.c | 20 ++++++++++---------- + fs/dcache.c | 4 ++-- + fs/overlayfs/readdir.c | 2 +- + kernel/events/core.c | 4 ++-- + kernel/seccomp.c | 2 +- + kernel/task_work.c | 2 +- + 13 files changed, 27 insertions(+), 27 deletions(-) + +diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h +index 3c856a15b98e..efc530642f7d 100644 +--- a/arch/x86/include/asm/mmu_context.h ++++ b/arch/x86/include/asm/mmu_context.h +@@ -72,8 +72,8 @@ static inline void load_mm_ldt(struct mm_struct *mm) + #ifdef CONFIG_MODIFY_LDT_SYSCALL + struct ldt_struct *ldt; + +- /* lockless_dereference synchronizes with smp_store_release */ +- ldt = lockless_dereference(mm->context.ldt); ++ /* READ_ONCE synchronizes with smp_store_release */ ++ ldt = READ_ONCE(mm->context.ldt); + + /* + * Any change to mm->context.ldt is followed by an IPI to all +diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h +index 25d9b5adcd42..36b49bd09264 100644 +--- a/fs/overlayfs/ovl_entry.h ++++ b/fs/overlayfs/ovl_entry.h +@@ -77,5 +77,5 @@ static inline struct ovl_inode *OVL_I(struct inode *inode) + + static 
inline struct dentry *ovl_upperdentry_dereference(struct ovl_inode *oi) + { +- return lockless_dereference(oi->__upperdentry); ++ return READ_ONCE(oi->__upperdentry); + } +diff --git a/include/linux/rculist.h b/include/linux/rculist.h +index b1fd8bf85fdc..3a2bb7d8ed4d 100644 +--- a/include/linux/rculist.h ++++ b/include/linux/rculist.h +@@ -274,7 +274,7 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, + * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). + */ + #define list_entry_rcu(ptr, type, member) \ +- container_of(lockless_dereference(ptr), type, member) ++ container_of(READ_ONCE(ptr), type, member) + + /** + * Where are list_empty_rcu() and list_first_entry_rcu()? +@@ -367,7 +367,7 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, + * example is when items are added to the list, but never deleted. + */ + #define list_entry_lockless(ptr, type, member) \ +- container_of((typeof(ptr))lockless_dereference(ptr), type, member) ++ container_of((typeof(ptr))READ_ONCE(ptr), type, member) + + /** + * list_for_each_entry_lockless - iterate over rcu list of given type +diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h +index f816fc72b51e..ae494eb7b401 100644 +--- a/include/linux/rcupdate.h ++++ b/include/linux/rcupdate.h +@@ -341,7 +341,7 @@ static inline void rcu_preempt_sleep_check(void) { } + #define __rcu_dereference_check(p, c, space) \ + ({ \ + /* Dependency order vs. p above. */ \ +- typeof(*p) *________p1 = (typeof(*p) *__force)lockless_dereference(p); \ ++ typeof(*p) *________p1 = (typeof(*p) *__force)READ_ONCE(p); \ + RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_check() usage"); \ + rcu_dereference_sparse(p, space); \ + ((typeof(*p) __force __kernel *)(________p1)); \ +@@ -355,7 +355,7 @@ static inline void rcu_preempt_sleep_check(void) { } + #define rcu_dereference_raw(p) \ + ({ \ + /* Dependency order vs. p above. 
*/ \ +- typeof(p) ________p1 = lockless_dereference(p); \ ++ typeof(p) ________p1 = READ_ONCE(p); \ + ((typeof(*p) __force __kernel *)(________p1)); \ + }) + +diff --git a/mm/slab.h b/mm/slab.h +index 6885e1192ec5..494cccef822a 100644 +--- a/mm/slab.h ++++ b/mm/slab.h +@@ -257,7 +257,7 @@ cache_from_memcg_idx(struct kmem_cache *s, int idx) + * memcg_caches issues a write barrier to match this (see + * memcg_create_kmem_cache()). + */ +- cachep = lockless_dereference(arr->entries[idx]); ++ cachep = READ_ONCE(arr->entries[idx]); + rcu_read_unlock(); + + return cachep; +diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c +index 939050169d12..18685de61288 100644 +--- a/arch/x86/events/core.c ++++ b/arch/x86/events/core.c +@@ -2336,7 +2336,7 @@ static unsigned long get_segment_base(unsigned int segment) + struct ldt_struct *ldt; + + /* IRQs are off, so this synchronizes with smp_store_release */ +- ldt = lockless_dereference(current->active_mm->context.ldt); ++ ldt = READ_ONCE(current->active_mm->context.ldt); + if (!ldt || idx >= ldt->nr_entries) + return 0; + +diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c +index 0402d44deb4d..b8be2413cb74 100644 +--- a/arch/x86/kernel/ldt.c ++++ b/arch/x86/kernel/ldt.c +@@ -102,7 +102,7 @@ static void finalize_ldt_struct(struct ldt_struct *ldt) + static void install_ldt(struct mm_struct *current_mm, + struct ldt_struct *ldt) + { +- /* Synchronizes with lockless_dereference in load_mm_ldt. */ ++ /* Synchronizes with READ_ONCE in load_mm_ldt. */ + smp_store_release(&current_mm->context.ldt, ldt); + + /* Activate the LDT for all CPUs using current_mm.
*/ +diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c +index d24e4b05f5da..731b7ffc7e37 100644 +--- a/drivers/md/dm-mpath.c ++++ b/drivers/md/dm-mpath.c +@@ -366,7 +366,7 @@ static struct pgpath *choose_path_in_pg(struct multipath *m, + + pgpath = path_to_pgpath(path); + +- if (unlikely(lockless_dereference(m->current_pg) != pg)) { ++ if (unlikely(READ_ONCE(m->current_pg) != pg)) { + /* Only update current_pgpath if pg changed */ + spin_lock_irqsave(&m->lock, flags); + m->current_pgpath = pgpath; +@@ -390,7 +390,7 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) + } + + /* Were we instructed to switch PG? */ +- if (lockless_dereference(m->next_pg)) { ++ if (READ_ONCE(m->next_pg)) { + spin_lock_irqsave(&m->lock, flags); + pg = m->next_pg; + if (!pg) { +@@ -406,7 +406,7 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) + + /* Don't change PG until it has no remaining paths */ + check_current_pg: +- pg = lockless_dereference(m->current_pg); ++ pg = READ_ONCE(m->current_pg); + if (pg) { + pgpath = choose_path_in_pg(m, pg, nr_bytes); + if (!IS_ERR_OR_NULL(pgpath)) +@@ -473,7 +473,7 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, + struct request *clone; + + /* Do we need to select a new pgpath? */ +- pgpath = lockless_dereference(m->current_pgpath); ++ pgpath = READ_ONCE(m->current_pgpath); + if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) + pgpath = choose_pgpath(m, nr_bytes); + +@@ -535,7 +535,7 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m + bool queue_io; + + /* Do we need to select a new pgpath? 
*/ +- pgpath = lockless_dereference(m->current_pgpath); ++ pgpath = READ_ONCE(m->current_pgpath); + queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags); + if (!pgpath || !queue_io) + pgpath = choose_pgpath(m, nr_bytes); +@@ -1799,7 +1799,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti, + struct pgpath *current_pgpath; + int r; + +- current_pgpath = lockless_dereference(m->current_pgpath); ++ current_pgpath = READ_ONCE(m->current_pgpath); + if (!current_pgpath) + current_pgpath = choose_pgpath(m, 0); + +@@ -1821,7 +1821,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti, + } + + if (r == -ENOTCONN) { +- if (!lockless_dereference(m->current_pg)) { ++ if (!READ_ONCE(m->current_pg)) { + /* Path status changed, redo selection */ + (void) choose_pgpath(m, 0); + } +@@ -1890,9 +1890,9 @@ static int multipath_busy(struct dm_target *ti) + return (m->queue_mode != DM_TYPE_MQ_REQUEST_BASED); + + /* Guess which priority_group will be used at next mapping time */ +- pg = lockless_dereference(m->current_pg); +- next_pg = lockless_dereference(m->next_pg); +- if (unlikely(!lockless_dereference(m->current_pgpath) && next_pg)) ++ pg = READ_ONCE(m->current_pg); ++ next_pg = READ_ONCE(m->next_pg); ++ if (unlikely(!READ_ONCE(m->current_pgpath) && next_pg)) + pg = next_pg; + + if (!pg) { +diff --git a/fs/dcache.c b/fs/dcache.c +index 3203470c59c2..ccc2bcdcfdfb 100644 +--- a/fs/dcache.c ++++ b/fs/dcache.c +@@ -231,7 +231,7 @@ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *c + { + /* + * Be careful about RCU walk racing with rename: +- * use 'lockless_dereference' to fetch the name pointer. ++ * use 'READ_ONCE' to fetch the name pointer. + * + * NOTE! Even if a rename will mean that the length + * was not loaded atomically, we don't care. 
The +@@ -245,7 +245,7 @@ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *c + * early because the data cannot match (there can + * be no NUL in the ct/tcount data) + */ +- const unsigned char *cs = lockless_dereference(dentry->d_name.name); ++ const unsigned char *cs = READ_ONCE(dentry->d_name.name); + + return dentry_string_cmp(cs, ct, tcount); + } +diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c +index 3ff960372cb9..7920a3f62c19 100644 +--- a/fs/overlayfs/readdir.c ++++ b/fs/overlayfs/readdir.c +@@ -440,7 +440,7 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, + if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) { + struct inode *inode = file_inode(file); + +- realfile = lockless_dereference(od->upperfile); ++ realfile = READ_ONCE(od->upperfile); + if (!realfile) { + struct path upperpath; + +diff --git a/kernel/events/core.c b/kernel/events/core.c +index 5d4398d1fa19..9f51738bf32e 100644 +--- a/kernel/events/core.c ++++ b/kernel/events/core.c +@@ -4221,7 +4221,7 @@ static void perf_remove_from_owner(struct perf_event *event) + * indeed free this event, otherwise we need to serialize on + * owner->perf_event_mutex. + */ +- owner = lockless_dereference(event->owner); ++ owner = READ_ONCE(event->owner); + if (owner) { + /* + * Since delayed_put_task_struct() also drops the last +@@ -4318,7 +4318,7 @@ int perf_event_release_kernel(struct perf_event *event) + * Cannot change, child events are not migrated, see the + * comment with perf_event_ctx_lock_nested(). + */ +- ctx = lockless_dereference(child->ctx); ++ ctx = READ_ONCE(child->ctx); + /* + * Since child_mutex nests inside ctx::mutex, we must jump + * through hoops. We start by grabbing a reference on the ctx. 
+diff --git a/kernel/seccomp.c b/kernel/seccomp.c +index 34aced9ff3ff..3fd2c4b23697 100644 +--- a/kernel/seccomp.c ++++ b/kernel/seccomp.c +@@ -188,7 +188,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, + u32 ret = SECCOMP_RET_ALLOW; + /* Make sure cross-thread synced filter points somewhere sane. */ + struct seccomp_filter *f = +- lockless_dereference(current->seccomp.filter); ++ READ_ONCE(current->seccomp.filter); + + /* Ensure unexpected behavior doesn't result in failing open. */ + if (unlikely(WARN_ON(f == NULL))) +diff --git a/kernel/task_work.c b/kernel/task_work.c +index e056d5429783..0371093a2331 100644 +--- a/kernel/task_work.c ++++ b/kernel/task_work.c +@@ -67,7 +67,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func) + * we raced with task_work_run(), *pprev == NULL/exited. + */ + raw_spin_lock_irqsave(&task->pi_lock, flags); +- while ((work = lockless_dereference(*pprev))) { ++ while ((work = READ_ONCE(*pprev))) { + if (work->func != func) + pprev = &work->next; + else if (cmpxchg(pprev, work, work->next) == work) +-- +2.14.2 + diff --git a/patches/kernel/0129-x86-mm-kasan-Don-t-use-vmemmap_populate-to-initializ.patch b/patches/kernel/0129-x86-mm-kasan-Don-t-use-vmemmap_populate-to-initializ.patch deleted file mode 100644 index 74c149b..0000000 --- a/patches/kernel/0129-x86-mm-kasan-Don-t-use-vmemmap_populate-to-initializ.patch +++ /dev/null @@ -1,266 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andrey Ryabinin -Date: Wed, 15 Nov 2017 17:36:35 -0800 -Subject: [PATCH] x86/mm/kasan: Don't use vmemmap_populate() to initialize - shadow -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -[ Note, this is a Git cherry-pick of the following commit: - - d17a1d97dc20: ("x86/mm/kasan: don't use vmemmap_populate() to initialize shadow") - - ... for easier x86 PTI code testing and back-porting. 
] - -The KASAN shadow is currently mapped using vmemmap_populate() since that -provides a semi-convenient way to map pages into init_top_pgt. However, -since that no longer zeroes the mapped pages, it is not suitable for -KASAN, which requires zeroed shadow memory. - -Add kasan_populate_shadow() interface and use it instead of -vmemmap_populate(). Besides, this allows us to take advantage of -gigantic pages and use them to populate the shadow, which should save us -some memory wasted on page tables and reduce TLB pressure. - -Link: http://lkml.kernel.org/r/20171103185147.2688-2-pasha.tatashin@oracle.com -Signed-off-by: Andrey Ryabinin -Signed-off-by: Pavel Tatashin -Cc: Andy Lutomirski -Cc: Steven Sistare -Cc: Daniel Jordan -Cc: Bob Picco -Cc: Michal Hocko -Cc: Alexander Potapenko -Cc: Ard Biesheuvel -Cc: Catalin Marinas -Cc: Christian Borntraeger -Cc: David S. Miller -Cc: Dmitry Vyukov -Cc: Heiko Carstens -Cc: "H. Peter Anvin" -Cc: Ingo Molnar -Cc: Mark Rutland -Cc: Matthew Wilcox -Cc: Mel Gorman -Cc: Michal Hocko -Cc: Sam Ravnborg -Cc: Thomas Gleixner -Cc: Will Deacon -Signed-off-by: Andrew Morton -Signed-off-by: Linus Torvalds -Signed-off-by: Ingo Molnar -(cherry picked from commit 2aeb07365bcd489620f71390a7d2031cd4dfb83e) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit f60ab0015a57d9fbf659b212d504682f069b0590) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/mm/kasan_init_64.c | 143 +++++++++++++++++++++++++++++++++++++++++--- - arch/x86/Kconfig | 2 +- - 2 files changed, 137 insertions(+), 8 deletions(-) - -diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c -index 464089f33e80..3d7341986e13 100644 ---- a/arch/x86/mm/kasan_init_64.c -+++ b/arch/x86/mm/kasan_init_64.c -@@ -3,12 +3,14 @@ - #include - #include - #include -+#include - #include - #include - #include - #include - - #include -+#include - #include - #include - -@@ -17,7 +19,134 @@ extern struct range pfn_mapped[E820_MAX_ENTRIES]; 
- - static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); - --static int __init map_range(struct range *range) -+static __init void *early_alloc(size_t size, int nid) -+{ -+ return memblock_virt_alloc_try_nid_nopanic(size, size, -+ __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid); -+} -+ -+static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr, -+ unsigned long end, int nid) -+{ -+ pte_t *pte; -+ -+ if (pmd_none(*pmd)) { -+ void *p; -+ -+ if (boot_cpu_has(X86_FEATURE_PSE) && -+ ((end - addr) == PMD_SIZE) && -+ IS_ALIGNED(addr, PMD_SIZE)) { -+ p = early_alloc(PMD_SIZE, nid); -+ if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL)) -+ return; -+ else if (p) -+ memblock_free(__pa(p), PMD_SIZE); -+ } -+ -+ p = early_alloc(PAGE_SIZE, nid); -+ pmd_populate_kernel(&init_mm, pmd, p); -+ } -+ -+ pte = pte_offset_kernel(pmd, addr); -+ do { -+ pte_t entry; -+ void *p; -+ -+ if (!pte_none(*pte)) -+ continue; -+ -+ p = early_alloc(PAGE_SIZE, nid); -+ entry = pfn_pte(PFN_DOWN(__pa(p)), PAGE_KERNEL); -+ set_pte_at(&init_mm, addr, pte, entry); -+ } while (pte++, addr += PAGE_SIZE, addr != end); -+} -+ -+static void __init kasan_populate_pud(pud_t *pud, unsigned long addr, -+ unsigned long end, int nid) -+{ -+ pmd_t *pmd; -+ unsigned long next; -+ -+ if (pud_none(*pud)) { -+ void *p; -+ -+ if (boot_cpu_has(X86_FEATURE_GBPAGES) && -+ ((end - addr) == PUD_SIZE) && -+ IS_ALIGNED(addr, PUD_SIZE)) { -+ p = early_alloc(PUD_SIZE, nid); -+ if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL)) -+ return; -+ else if (p) -+ memblock_free(__pa(p), PUD_SIZE); -+ } -+ -+ p = early_alloc(PAGE_SIZE, nid); -+ pud_populate(&init_mm, pud, p); -+ } -+ -+ pmd = pmd_offset(pud, addr); -+ do { -+ next = pmd_addr_end(addr, end); -+ if (!pmd_large(*pmd)) -+ kasan_populate_pmd(pmd, addr, next, nid); -+ } while (pmd++, addr = next, addr != end); -+} -+ -+static void __init kasan_populate_p4d(p4d_t *p4d, unsigned long addr, -+ unsigned long end, int nid) -+{ -+ pud_t *pud; -+ 
unsigned long next; -+ -+ if (p4d_none(*p4d)) { -+ void *p = early_alloc(PAGE_SIZE, nid); -+ -+ p4d_populate(&init_mm, p4d, p); -+ } -+ -+ pud = pud_offset(p4d, addr); -+ do { -+ next = pud_addr_end(addr, end); -+ if (!pud_large(*pud)) -+ kasan_populate_pud(pud, addr, next, nid); -+ } while (pud++, addr = next, addr != end); -+} -+ -+static void __init kasan_populate_pgd(pgd_t *pgd, unsigned long addr, -+ unsigned long end, int nid) -+{ -+ void *p; -+ p4d_t *p4d; -+ unsigned long next; -+ -+ if (pgd_none(*pgd)) { -+ p = early_alloc(PAGE_SIZE, nid); -+ pgd_populate(&init_mm, pgd, p); -+ } -+ -+ p4d = p4d_offset(pgd, addr); -+ do { -+ next = p4d_addr_end(addr, end); -+ kasan_populate_p4d(p4d, addr, next, nid); -+ } while (p4d++, addr = next, addr != end); -+} -+ -+static void __init kasan_populate_shadow(unsigned long addr, unsigned long end, -+ int nid) -+{ -+ pgd_t *pgd; -+ unsigned long next; -+ -+ addr = addr & PAGE_MASK; -+ end = round_up(end, PAGE_SIZE); -+ pgd = pgd_offset_k(addr); -+ do { -+ next = pgd_addr_end(addr, end); -+ kasan_populate_pgd(pgd, addr, next, nid); -+ } while (pgd++, addr = next, addr != end); -+} -+ -+static void __init map_range(struct range *range) - { - unsigned long start; - unsigned long end; -@@ -25,7 +154,7 @@ static int __init map_range(struct range *range) - start = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->start)); - end = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->end)); - -- return vmemmap_populate(start, end, NUMA_NO_NODE); -+ kasan_populate_shadow(start, end, early_pfn_to_nid(range->start)); - } - - static void __init clear_pgds(unsigned long start, -@@ -188,16 +317,16 @@ void __init kasan_init(void) - if (pfn_mapped[i].end == 0) - break; - -- if (map_range(&pfn_mapped[i])) -- panic("kasan: unable to allocate shadow!"); -+ map_range(&pfn_mapped[i]); - } -+ - kasan_populate_zero_shadow( - kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), - kasan_mem_to_shadow((void *)__START_KERNEL_map)); - -- 
vmemmap_populate((unsigned long)kasan_mem_to_shadow(_stext), -- (unsigned long)kasan_mem_to_shadow(_end), -- NUMA_NO_NODE); -+ kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext), -+ (unsigned long)kasan_mem_to_shadow(_end), -+ early_pfn_to_nid(__pa(_stext))); - - kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), - (void *)KASAN_SHADOW_END); -diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig -index 67d07802ae95..8b5499bb24bb 100644 ---- a/arch/x86/Kconfig -+++ b/arch/x86/Kconfig -@@ -106,7 +106,7 @@ config X86 - select HAVE_ARCH_AUDITSYSCALL - select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE - select HAVE_ARCH_JUMP_LABEL -- select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP -+ select HAVE_ARCH_KASAN if X86_64 - select HAVE_ARCH_KGDB - select HAVE_ARCH_KMEMCHECK - select HAVE_ARCH_MMAP_RND_BITS if MMU --- -2.14.2 - diff --git a/patches/kernel/0130-mm-sparsemem-Fix-ARM64-boot-crash-when-CONFIG_SPARSE.patch b/patches/kernel/0130-mm-sparsemem-Fix-ARM64-boot-crash-when-CONFIG_SPARSE.patch deleted file mode 100644 index eea6854..0000000 --- a/patches/kernel/0130-mm-sparsemem-Fix-ARM64-boot-crash-when-CONFIG_SPARSE.patch +++ /dev/null @@ -1,88 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: "Kirill A. Shutemov" -Date: Tue, 7 Nov 2017 11:33:37 +0300 -Subject: [PATCH] mm/sparsemem: Fix ARM64 boot crash when - CONFIG_SPARSEMEM_EXTREME=y -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Since commit: - - 83e3c48729d9 ("mm/sparsemem: Allocate mem_section at runtime for CONFIG_SPARSEMEM_EXTREME=y") - -we allocate the mem_section array dynamically in sparse_memory_present_with_active_regions(), -but some architectures, like arm64, don't call the routine to initialize sparsemem. - -Let's move the initialization into memory_present() it should cover all -architectures. 
- -Reported-and-tested-by: Sudeep Holla -Tested-by: Bjorn Andersson -Signed-off-by: Kirill A. Shutemov -Acked-by: Will Deacon -Cc: Andrew Morton -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Cc: linux-mm@kvack.org -Fixes: 83e3c48729d9 ("mm/sparsemem: Allocate mem_section at runtime for CONFIG_SPARSEMEM_EXTREME=y") -Link: http://lkml.kernel.org/r/20171107083337.89952-1-kirill.shutemov@linux.intel.com -Signed-off-by: Ingo Molnar -(cherry picked from commit 629a359bdb0e0652a8227b4ff3125431995fec6e) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit fbc3acbf37de68310eb5bbc7f4d1977e7b90100e) -Signed-off-by: Fabian Grünbichler ---- - mm/page_alloc.c | 10 ---------- - mm/sparse.c | 10 ++++++++++ - 2 files changed, 10 insertions(+), 10 deletions(-) - -diff --git a/mm/page_alloc.c b/mm/page_alloc.c -index 66eb23ab658d..1423da8dd16f 100644 ---- a/mm/page_alloc.c -+++ b/mm/page_alloc.c -@@ -5707,16 +5707,6 @@ void __init sparse_memory_present_with_active_regions(int nid) - unsigned long start_pfn, end_pfn; - int i, this_nid; - --#ifdef CONFIG_SPARSEMEM_EXTREME -- if (!mem_section) { -- unsigned long size, align; -- -- size = sizeof(struct mem_section) * NR_SECTION_ROOTS; -- align = 1 << (INTERNODE_CACHE_SHIFT); -- mem_section = memblock_virt_alloc(size, align); -- } --#endif -- - for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) - memory_present(this_nid, start_pfn, end_pfn); - } -diff --git a/mm/sparse.c b/mm/sparse.c -index 308a0789d1bb..9c48e4fe8ce0 100644 ---- a/mm/sparse.c -+++ b/mm/sparse.c -@@ -210,6 +210,16 @@ void __init memory_present(int nid, unsigned long start, unsigned long end) - { - unsigned long pfn; - -+#ifdef CONFIG_SPARSEMEM_EXTREME -+ if (unlikely(!mem_section)) { -+ unsigned long size, align; -+ -+ size = sizeof(struct mem_section) * NR_SECTION_ROOTS; -+ align = 1 << (INTERNODE_CACHE_SHIFT); -+ mem_section = memblock_virt_alloc(size, align); -+ } -+#endif -+ - start &= 
PAGE_SECTION_MASK; - mminit_validate_memmodel_limits(&start, &end); - for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { --- -2.14.2 - diff --git a/patches/kernel/0130-x86-mm-kasan-Don-t-use-vmemmap_populate-to-initializ.patch b/patches/kernel/0130-x86-mm-kasan-Don-t-use-vmemmap_populate-to-initializ.patch new file mode 100644 index 0000000..74c149b --- /dev/null +++ b/patches/kernel/0130-x86-mm-kasan-Don-t-use-vmemmap_populate-to-initializ.patch @@ -0,0 +1,266 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andrey Ryabinin +Date: Wed, 15 Nov 2017 17:36:35 -0800 +Subject: [PATCH] x86/mm/kasan: Don't use vmemmap_populate() to initialize + shadow +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +[ Note, this is a Git cherry-pick of the following commit: + + d17a1d97dc20: ("x86/mm/kasan: don't use vmemmap_populate() to initialize shadow") + + ... for easier x86 PTI code testing and back-porting. ] + +The KASAN shadow is currently mapped using vmemmap_populate() since that +provides a semi-convenient way to map pages into init_top_pgt. However, +since that no longer zeroes the mapped pages, it is not suitable for +KASAN, which requires zeroed shadow memory. + +Add kasan_populate_shadow() interface and use it instead of +vmemmap_populate(). Besides, this allows us to take advantage of +gigantic pages and use them to populate the shadow, which should save us +some memory wasted on page tables and reduce TLB pressure. + +Link: http://lkml.kernel.org/r/20171103185147.2688-2-pasha.tatashin@oracle.com +Signed-off-by: Andrey Ryabinin +Signed-off-by: Pavel Tatashin +Cc: Andy Lutomirski +Cc: Steven Sistare +Cc: Daniel Jordan +Cc: Bob Picco +Cc: Michal Hocko +Cc: Alexander Potapenko +Cc: Ard Biesheuvel +Cc: Catalin Marinas +Cc: Christian Borntraeger +Cc: David S. Miller +Cc: Dmitry Vyukov +Cc: Heiko Carstens +Cc: "H. 
Peter Anvin" +Cc: Ingo Molnar +Cc: Mark Rutland +Cc: Matthew Wilcox +Cc: Mel Gorman +Cc: Michal Hocko +Cc: Sam Ravnborg +Cc: Thomas Gleixner +Cc: Will Deacon +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Ingo Molnar +(cherry picked from commit 2aeb07365bcd489620f71390a7d2031cd4dfb83e) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit f60ab0015a57d9fbf659b212d504682f069b0590) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/mm/kasan_init_64.c | 143 +++++++++++++++++++++++++++++++++++++++++--- + arch/x86/Kconfig | 2 +- + 2 files changed, 137 insertions(+), 8 deletions(-) + +diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c +index 464089f33e80..3d7341986e13 100644 +--- a/arch/x86/mm/kasan_init_64.c ++++ b/arch/x86/mm/kasan_init_64.c +@@ -3,12 +3,14 @@ + #include + #include + #include ++#include + #include + #include + #include + #include + + #include ++#include + #include + #include + +@@ -17,7 +19,134 @@ extern struct range pfn_mapped[E820_MAX_ENTRIES]; + + static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); + +-static int __init map_range(struct range *range) ++static __init void *early_alloc(size_t size, int nid) ++{ ++ return memblock_virt_alloc_try_nid_nopanic(size, size, ++ __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid); ++} ++ ++static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr, ++ unsigned long end, int nid) ++{ ++ pte_t *pte; ++ ++ if (pmd_none(*pmd)) { ++ void *p; ++ ++ if (boot_cpu_has(X86_FEATURE_PSE) && ++ ((end - addr) == PMD_SIZE) && ++ IS_ALIGNED(addr, PMD_SIZE)) { ++ p = early_alloc(PMD_SIZE, nid); ++ if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL)) ++ return; ++ else if (p) ++ memblock_free(__pa(p), PMD_SIZE); ++ } ++ ++ p = early_alloc(PAGE_SIZE, nid); ++ pmd_populate_kernel(&init_mm, pmd, p); ++ } ++ ++ pte = pte_offset_kernel(pmd, addr); ++ do { ++ pte_t entry; ++ void *p; ++ ++ if 
(!pte_none(*pte)) ++ continue; ++ ++ p = early_alloc(PAGE_SIZE, nid); ++ entry = pfn_pte(PFN_DOWN(__pa(p)), PAGE_KERNEL); ++ set_pte_at(&init_mm, addr, pte, entry); ++ } while (pte++, addr += PAGE_SIZE, addr != end); ++} ++ ++static void __init kasan_populate_pud(pud_t *pud, unsigned long addr, ++ unsigned long end, int nid) ++{ ++ pmd_t *pmd; ++ unsigned long next; ++ ++ if (pud_none(*pud)) { ++ void *p; ++ ++ if (boot_cpu_has(X86_FEATURE_GBPAGES) && ++ ((end - addr) == PUD_SIZE) && ++ IS_ALIGNED(addr, PUD_SIZE)) { ++ p = early_alloc(PUD_SIZE, nid); ++ if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL)) ++ return; ++ else if (p) ++ memblock_free(__pa(p), PUD_SIZE); ++ } ++ ++ p = early_alloc(PAGE_SIZE, nid); ++ pud_populate(&init_mm, pud, p); ++ } ++ ++ pmd = pmd_offset(pud, addr); ++ do { ++ next = pmd_addr_end(addr, end); ++ if (!pmd_large(*pmd)) ++ kasan_populate_pmd(pmd, addr, next, nid); ++ } while (pmd++, addr = next, addr != end); ++} ++ ++static void __init kasan_populate_p4d(p4d_t *p4d, unsigned long addr, ++ unsigned long end, int nid) ++{ ++ pud_t *pud; ++ unsigned long next; ++ ++ if (p4d_none(*p4d)) { ++ void *p = early_alloc(PAGE_SIZE, nid); ++ ++ p4d_populate(&init_mm, p4d, p); ++ } ++ ++ pud = pud_offset(p4d, addr); ++ do { ++ next = pud_addr_end(addr, end); ++ if (!pud_large(*pud)) ++ kasan_populate_pud(pud, addr, next, nid); ++ } while (pud++, addr = next, addr != end); ++} ++ ++static void __init kasan_populate_pgd(pgd_t *pgd, unsigned long addr, ++ unsigned long end, int nid) ++{ ++ void *p; ++ p4d_t *p4d; ++ unsigned long next; ++ ++ if (pgd_none(*pgd)) { ++ p = early_alloc(PAGE_SIZE, nid); ++ pgd_populate(&init_mm, pgd, p); ++ } ++ ++ p4d = p4d_offset(pgd, addr); ++ do { ++ next = p4d_addr_end(addr, end); ++ kasan_populate_p4d(p4d, addr, next, nid); ++ } while (p4d++, addr = next, addr != end); ++} ++ ++static void __init kasan_populate_shadow(unsigned long addr, unsigned long end, ++ int nid) ++{ ++ pgd_t *pgd; ++ unsigned long next; ++ ++ 
addr = addr & PAGE_MASK; ++ end = round_up(end, PAGE_SIZE); ++ pgd = pgd_offset_k(addr); ++ do { ++ next = pgd_addr_end(addr, end); ++ kasan_populate_pgd(pgd, addr, next, nid); ++ } while (pgd++, addr = next, addr != end); ++} ++ ++static void __init map_range(struct range *range) + { + unsigned long start; + unsigned long end; +@@ -25,7 +154,7 @@ static int __init map_range(struct range *range) + start = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->start)); + end = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->end)); + +- return vmemmap_populate(start, end, NUMA_NO_NODE); ++ kasan_populate_shadow(start, end, early_pfn_to_nid(range->start)); + } + + static void __init clear_pgds(unsigned long start, +@@ -188,16 +317,16 @@ void __init kasan_init(void) + if (pfn_mapped[i].end == 0) + break; + +- if (map_range(&pfn_mapped[i])) +- panic("kasan: unable to allocate shadow!"); ++ map_range(&pfn_mapped[i]); + } ++ + kasan_populate_zero_shadow( + kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), + kasan_mem_to_shadow((void *)__START_KERNEL_map)); + +- vmemmap_populate((unsigned long)kasan_mem_to_shadow(_stext), +- (unsigned long)kasan_mem_to_shadow(_end), +- NUMA_NO_NODE); ++ kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext), ++ (unsigned long)kasan_mem_to_shadow(_end), ++ early_pfn_to_nid(__pa(_stext))); + + kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), + (void *)KASAN_SHADOW_END); +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 67d07802ae95..8b5499bb24bb 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -106,7 +106,7 @@ config X86 + select HAVE_ARCH_AUDITSYSCALL + select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE + select HAVE_ARCH_JUMP_LABEL +- select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP ++ select HAVE_ARCH_KASAN if X86_64 + select HAVE_ARCH_KGDB + select HAVE_ARCH_KMEMCHECK + select HAVE_ARCH_MMAP_RND_BITS if MMU +-- +2.14.2 + diff --git 
a/patches/kernel/0131-mm-sparsemem-Fix-ARM64-boot-crash-when-CONFIG_SPARSE.patch b/patches/kernel/0131-mm-sparsemem-Fix-ARM64-boot-crash-when-CONFIG_SPARSE.patch new file mode 100644 index 0000000..eea6854 --- /dev/null +++ b/patches/kernel/0131-mm-sparsemem-Fix-ARM64-boot-crash-when-CONFIG_SPARSE.patch @@ -0,0 +1,88 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: "Kirill A. Shutemov" +Date: Tue, 7 Nov 2017 11:33:37 +0300 +Subject: [PATCH] mm/sparsemem: Fix ARM64 boot crash when + CONFIG_SPARSEMEM_EXTREME=y +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Since commit: + + 83e3c48729d9 ("mm/sparsemem: Allocate mem_section at runtime for CONFIG_SPARSEMEM_EXTREME=y") + +we allocate the mem_section array dynamically in sparse_memory_present_with_active_regions(), +but some architectures, like arm64, don't call the routine to initialize sparsemem. + +Let's move the initialization into memory_present() it should cover all +architectures. + +Reported-and-tested-by: Sudeep Holla +Tested-by: Bjorn Andersson +Signed-off-by: Kirill A. 
Shutemov +Acked-by: Will Deacon +Cc: Andrew Morton +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: linux-mm@kvack.org +Fixes: 83e3c48729d9 ("mm/sparsemem: Allocate mem_section at runtime for CONFIG_SPARSEMEM_EXTREME=y") +Link: http://lkml.kernel.org/r/20171107083337.89952-1-kirill.shutemov@linux.intel.com +Signed-off-by: Ingo Molnar +(cherry picked from commit 629a359bdb0e0652a8227b4ff3125431995fec6e) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit fbc3acbf37de68310eb5bbc7f4d1977e7b90100e) +Signed-off-by: Fabian Grünbichler +--- + mm/page_alloc.c | 10 ---------- + mm/sparse.c | 10 ++++++++++ + 2 files changed, 10 insertions(+), 10 deletions(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 66eb23ab658d..1423da8dd16f 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -5707,16 +5707,6 @@ void __init sparse_memory_present_with_active_regions(int nid) + unsigned long start_pfn, end_pfn; + int i, this_nid; + +-#ifdef CONFIG_SPARSEMEM_EXTREME +- if (!mem_section) { +- unsigned long size, align; +- +- size = sizeof(struct mem_section) * NR_SECTION_ROOTS; +- align = 1 << (INTERNODE_CACHE_SHIFT); +- mem_section = memblock_virt_alloc(size, align); +- } +-#endif +- + for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) + memory_present(this_nid, start_pfn, end_pfn); + } +diff --git a/mm/sparse.c b/mm/sparse.c +index 308a0789d1bb..9c48e4fe8ce0 100644 +--- a/mm/sparse.c ++++ b/mm/sparse.c +@@ -210,6 +210,16 @@ void __init memory_present(int nid, unsigned long start, unsigned long end) + { + unsigned long pfn; + ++#ifdef CONFIG_SPARSEMEM_EXTREME ++ if (unlikely(!mem_section)) { ++ unsigned long size, align; ++ ++ size = sizeof(struct mem_section) * NR_SECTION_ROOTS; ++ align = 1 << (INTERNODE_CACHE_SHIFT); ++ mem_section = memblock_virt_alloc(size, align); ++ } ++#endif ++ + start &= PAGE_SECTION_MASK; + mminit_validate_memmodel_limits(&start, &end); + for (pfn = start; pfn < end; 
pfn += PAGES_PER_SECTION) { +-- +2.14.2 + diff --git a/patches/kernel/0131-objtool-Move-synced-files-to-their-original-relative.patch b/patches/kernel/0131-objtool-Move-synced-files-to-their-original-relative.patch deleted file mode 100644 index 5c1bc25..0000000 --- a/patches/kernel/0131-objtool-Move-synced-files-to-their-original-relative.patch +++ /dev/null @@ -1,245 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Josh Poimboeuf -Date: Mon, 6 Nov 2017 07:21:50 -0600 -Subject: [PATCH] objtool: Move synced files to their original relative - locations -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -This will enable more straightforward comparisons, and it also makes the -files 100% identical. - -Suggested-by: Ingo Molnar -Signed-off-by: Josh Poimboeuf -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/407b2aaa317741f48fcf821592c0e96ab3be1890.1509974346.git.jpoimboe@redhat.com -Signed-off-by: Ingo Molnar -(backported from commit b90671a530137f42325b89c0848ca58d865c1710) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 3673cdbc4be1671fad71a4968a9f55357d9d356c) -Signed-off-by: Fabian Grünbichler ---- - .../arch/x86/{insn => lib}/x86-opcode-map.txt | 0 - tools/objtool/Makefile | 22 ++++++++++++---------- - .../objtool/arch/x86/{insn => include/asm}/inat.h | 2 +- - .../arch/x86/{insn => include/asm}/inat_types.h | 0 - .../objtool/arch/x86/{insn => include/asm}/insn.h | 2 +- - .../objtool/{ => arch/x86/include/asm}/orc_types.h | 0 - tools/objtool/orc.h | 2 +- - tools/objtool/arch/x86/decode.c | 6 +++--- - tools/objtool/arch/x86/{insn => lib}/inat.c | 2 +- - tools/objtool/arch/x86/{insn => lib}/insn.c | 4 ++-- - tools/objtool/.gitignore | 2 +- - tools/objtool/arch/x86/Build | 10 +++++----- - .../arch/x86/{insn => tools}/gen-insn-attr-x86.awk | 0 - 13 files changed, 27 
insertions(+), 25 deletions(-) - rename tools/objtool/arch/x86/{insn => lib}/x86-opcode-map.txt (100%) - rename tools/objtool/arch/x86/{insn => include/asm}/inat.h (99%) - rename tools/objtool/arch/x86/{insn => include/asm}/inat_types.h (100%) - rename tools/objtool/arch/x86/{insn => include/asm}/insn.h (99%) - rename tools/objtool/{ => arch/x86/include/asm}/orc_types.h (100%) - rename tools/objtool/arch/x86/{insn => lib}/inat.c (99%) - rename tools/objtool/arch/x86/{insn => lib}/insn.c (99%) - rename tools/objtool/arch/x86/{insn => tools}/gen-insn-attr-x86.awk (100%) - -diff --git a/tools/objtool/arch/x86/insn/x86-opcode-map.txt b/tools/objtool/arch/x86/lib/x86-opcode-map.txt -similarity index 100% -rename from tools/objtool/arch/x86/insn/x86-opcode-map.txt -rename to tools/objtool/arch/x86/lib/x86-opcode-map.txt -diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile -index 3a6425fefc43..f95f48e445c3 100644 ---- a/tools/objtool/Makefile -+++ b/tools/objtool/Makefile -@@ -24,7 +24,9 @@ OBJTOOL_IN := $(OBJTOOL)-in.o - - all: $(OBJTOOL) - --INCLUDES := -I$(srctree)/tools/include -I$(srctree)/tools/arch/$(HOSTARCH)/include/uapi -+INCLUDES := -I$(srctree)/tools/include \ -+ -I$(srctree)/tools/arch/$(HOSTARCH)/include/uapi \ -+ -I$(srctree)/tools/objtool/arch/$(HOSTARCH)/include - CFLAGS += -Wall -Werror $(EXTRA_WARNINGS) -Wno-switch-default -Wno-switch-enum -fomit-frame-pointer -O2 -g $(INCLUDES) - LDFLAGS += -lelf $(LIBSUBCMD) - -@@ -44,16 +46,16 @@ $(OBJTOOL_IN): fixdep FORCE - $(OBJTOOL): $(LIBSUBCMD) $(OBJTOOL_IN) - @(diff -I 2>&1 | grep -q 'option requires an argument' && \ - test -d ../../kernel -a -d ../../tools -a -d ../objtool && (( \ -- diff -I'^#include' arch/x86/insn/insn.c ../../arch/x86/lib/insn.c >/dev/null && \ -- diff -I'^#include' arch/x86/insn/inat.c ../../arch/x86/lib/inat.c >/dev/null && \ -- diff arch/x86/insn/x86-opcode-map.txt ../../arch/x86/lib/x86-opcode-map.txt >/dev/null && \ -- diff arch/x86/insn/gen-insn-attr-x86.awk 
../../arch/x86/tools/gen-insn-attr-x86.awk >/dev/null && \ -- diff -I'^#include' arch/x86/insn/insn.h ../../arch/x86/include/asm/insn.h >/dev/null && \ -- diff -I'^#include' arch/x86/insn/inat.h ../../arch/x86/include/asm/inat.h >/dev/null && \ -- diff -I'^#include' arch/x86/insn/inat_types.h ../../arch/x86/include/asm/inat_types.h >/dev/null) \ -+ diff arch/x86/lib/insn.c ../../arch/x86/lib/insn.c >/dev/null && \ -+ diff arch/x86/lib/inat.c ../../arch/x86/lib/inat.c >/dev/null && \ -+ diff arch/x86/lib/x86-opcode-map.txt ../../arch/x86/lib/x86-opcode-map.txt >/dev/null && \ -+ diff arch/x86/tools/gen-insn-attr-x86.awk ../../arch/x86/tools/gen-insn-attr-x86.awk >/dev/null && \ -+ diff arch/x86/include/asm/insn.h ../../arch/x86/include/asm/insn.h >/dev/null && \ -+ diff arch/x86/include/asm/inat.h ../../arch/x86/include/asm/inat.h >/dev/null && \ -+ diff arch/x86/include/asm/inat_types.h ../../arch/x86/include/asm/inat_types.h >/dev/null) \ - || echo "warning: objtool: x86 instruction decoder differs from kernel" >&2 )) || true - @(test -d ../../kernel -a -d ../../tools -a -d ../objtool && (( \ -- diff ../../arch/x86/include/asm/orc_types.h orc_types.h >/dev/null) \ -+ diff ../../arch/x86/include/asm/orc_types.h arch/x86/include/asm/orc_types.h >/dev/null) \ - || echo "warning: objtool: orc_types.h differs from kernel" >&2 )) || true - $(QUIET_LINK)$(CC) $(OBJTOOL_IN) $(LDFLAGS) -o $@ - -@@ -64,7 +66,7 @@ $(LIBSUBCMD): fixdep FORCE - clean: - $(call QUIET_CLEAN, objtool) $(RM) $(OBJTOOL) - $(Q)find $(OUTPUT) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete -- $(Q)$(RM) $(OUTPUT)arch/x86/insn/inat-tables.c $(OUTPUT)fixdep -+ $(Q)$(RM) $(OUTPUT)arch/x86/lib/inat-tables.c $(OUTPUT)fixdep - - FORCE: - -diff --git a/tools/objtool/arch/x86/insn/inat.h b/tools/objtool/arch/x86/include/asm/inat.h -similarity index 99% -rename from tools/objtool/arch/x86/insn/inat.h -rename to tools/objtool/arch/x86/include/asm/inat.h -index 125ecd2a300d..02aff0867211 
100644 ---- a/tools/objtool/arch/x86/insn/inat.h -+++ b/tools/objtool/arch/x86/include/asm/inat.h -@@ -20,7 +20,7 @@ - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - */ --#include "inat_types.h" -+#include - - /* - * Internal bits. Don't use bitmasks directly, because these bits are -diff --git a/tools/objtool/arch/x86/insn/inat_types.h b/tools/objtool/arch/x86/include/asm/inat_types.h -similarity index 100% -rename from tools/objtool/arch/x86/insn/inat_types.h -rename to tools/objtool/arch/x86/include/asm/inat_types.h -diff --git a/tools/objtool/arch/x86/insn/insn.h b/tools/objtool/arch/x86/include/asm/insn.h -similarity index 99% -rename from tools/objtool/arch/x86/insn/insn.h -rename to tools/objtool/arch/x86/include/asm/insn.h -index e23578c7b1be..b3e32b010ab1 100644 ---- a/tools/objtool/arch/x86/insn/insn.h -+++ b/tools/objtool/arch/x86/include/asm/insn.h -@@ -21,7 +21,7 @@ - */ - - /* insn_attr_t is defined in inat.h */ --#include "inat.h" -+#include - - struct insn_field { - union { -diff --git a/tools/objtool/orc_types.h b/tools/objtool/arch/x86/include/asm/orc_types.h -similarity index 100% -rename from tools/objtool/orc_types.h -rename to tools/objtool/arch/x86/include/asm/orc_types.h -diff --git a/tools/objtool/orc.h b/tools/objtool/orc.h -index a4139e386ef3..b0e92a6d0903 100644 ---- a/tools/objtool/orc.h -+++ b/tools/objtool/orc.h -@@ -18,7 +18,7 @@ - #ifndef _ORC_H - #define _ORC_H - --#include "orc_types.h" -+#include - - struct objtool_file; - -diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c -index 4559a21a8de2..92f57996d66c 100644 ---- a/tools/objtool/arch/x86/decode.c -+++ b/tools/objtool/arch/x86/decode.c -@@ -19,9 +19,9 @@ - #include - - #define unlikely(cond) (cond) --#include "insn/insn.h" --#include "insn/inat.c" --#include "insn/insn.c" -+#include -+#include "lib/inat.c" -+#include "lib/insn.c" - - #include "../../elf.h" - #include "../../arch.h" -diff --git 
a/tools/objtool/arch/x86/insn/inat.c b/tools/objtool/arch/x86/lib/inat.c -similarity index 99% -rename from tools/objtool/arch/x86/insn/inat.c -rename to tools/objtool/arch/x86/lib/inat.c -index e4bf28e6f4c7..c1f01a8e9f65 100644 ---- a/tools/objtool/arch/x86/insn/inat.c -+++ b/tools/objtool/arch/x86/lib/inat.c -@@ -18,7 +18,7 @@ - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - */ --#include "insn.h" -+#include - - /* Attribute tables are generated from opcode map */ - #include "inat-tables.c" -diff --git a/tools/objtool/arch/x86/insn/insn.c b/tools/objtool/arch/x86/lib/insn.c -similarity index 99% -rename from tools/objtool/arch/x86/insn/insn.c -rename to tools/objtool/arch/x86/lib/insn.c -index ca983e2bea8b..1088eb8f3a5f 100644 ---- a/tools/objtool/arch/x86/insn/insn.c -+++ b/tools/objtool/arch/x86/lib/insn.c -@@ -23,8 +23,8 @@ - #else - #include - #endif --#include "inat.h" --#include "insn.h" -+#include -+#include - - /* Verify next sizeof(t) bytes can be on the same instruction */ - #define validate_next(t, insn, n) \ -diff --git a/tools/objtool/.gitignore b/tools/objtool/.gitignore -index d3102c865a95..914cff12899b 100644 ---- a/tools/objtool/.gitignore -+++ b/tools/objtool/.gitignore -@@ -1,3 +1,3 @@ --arch/x86/insn/inat-tables.c -+arch/x86/lib/inat-tables.c - objtool - fixdep -diff --git a/tools/objtool/arch/x86/Build b/tools/objtool/arch/x86/Build -index debbdb0b5c43..b998412c017d 100644 ---- a/tools/objtool/arch/x86/Build -+++ b/tools/objtool/arch/x86/Build -@@ -1,12 +1,12 @@ - objtool-y += decode.o - --inat_tables_script = arch/x86/insn/gen-insn-attr-x86.awk --inat_tables_maps = arch/x86/insn/x86-opcode-map.txt -+inat_tables_script = arch/x86/tools/gen-insn-attr-x86.awk -+inat_tables_maps = arch/x86/lib/x86-opcode-map.txt - --$(OUTPUT)arch/x86/insn/inat-tables.c: $(inat_tables_script) $(inat_tables_maps) -+$(OUTPUT)arch/x86/lib/inat-tables.c: $(inat_tables_script) $(inat_tables_maps) - $(call rule_mkdir) - $(Q)$(call 
echo-cmd,gen)$(AWK) -f $(inat_tables_script) $(inat_tables_maps) > $@ - --$(OUTPUT)arch/x86/decode.o: $(OUTPUT)arch/x86/insn/inat-tables.c -+$(OUTPUT)arch/x86/decode.o: $(OUTPUT)arch/x86/lib/inat-tables.c - --CFLAGS_decode.o += -I$(OUTPUT)arch/x86/insn -+CFLAGS_decode.o += -I$(OUTPUT)arch/x86/lib -diff --git a/tools/objtool/arch/x86/insn/gen-insn-attr-x86.awk b/tools/objtool/arch/x86/tools/gen-insn-attr-x86.awk -similarity index 100% -rename from tools/objtool/arch/x86/insn/gen-insn-attr-x86.awk -rename to tools/objtool/arch/x86/tools/gen-insn-attr-x86.awk --- -2.14.2 - diff --git a/patches/kernel/0132-objtool-Move-kernel-headers-code-sync-check-to-a-scr.patch b/patches/kernel/0132-objtool-Move-kernel-headers-code-sync-check-to-a-scr.patch deleted file mode 100644 index 5ebc193..0000000 --- a/patches/kernel/0132-objtool-Move-kernel-headers-code-sync-check-to-a-scr.patch +++ /dev/null @@ -1,99 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Josh Poimboeuf -Date: Mon, 6 Nov 2017 07:21:51 -0600 -Subject: [PATCH] objtool: Move kernel headers/code sync check to a script -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Replace the nasty diff checks in the objtool Makefile with a clean bash -script, and make the warnings more specific. - -Heavily inspired by tools/perf/check-headers.sh. 
- -Suggested-by: Ingo Molnar -Signed-off-by: Josh Poimboeuf -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/ab015f15ccd8c0c6008493c3c6ee3d495eaf2927.1509974346.git.jpoimboe@redhat.com -Signed-off-by: Ingo Molnar -(cherry picked from commit a89ec413c623eb2870180bcad678046bf7bc8465) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 4e72ce95a057e744b8d580239e2d8afa51118d82) -Signed-off-by: Fabian Grünbichler ---- - tools/objtool/Makefile | 16 +--------------- - tools/objtool/sync-check.sh | 29 +++++++++++++++++++++++++++++ - 2 files changed, 30 insertions(+), 15 deletions(-) - create mode 100755 tools/objtool/sync-check.sh - -diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile -index f95f48e445c3..90b0241f3ccc 100644 ---- a/tools/objtool/Makefile -+++ b/tools/objtool/Makefile -@@ -41,22 +41,8 @@ include $(srctree)/tools/build/Makefile.include - $(OBJTOOL_IN): fixdep FORCE - @$(MAKE) $(build)=objtool - --# Busybox's diff doesn't have -I, avoid warning in that case --# - $(OBJTOOL): $(LIBSUBCMD) $(OBJTOOL_IN) -- @(diff -I 2>&1 | grep -q 'option requires an argument' && \ -- test -d ../../kernel -a -d ../../tools -a -d ../objtool && (( \ -- diff arch/x86/lib/insn.c ../../arch/x86/lib/insn.c >/dev/null && \ -- diff arch/x86/lib/inat.c ../../arch/x86/lib/inat.c >/dev/null && \ -- diff arch/x86/lib/x86-opcode-map.txt ../../arch/x86/lib/x86-opcode-map.txt >/dev/null && \ -- diff arch/x86/tools/gen-insn-attr-x86.awk ../../arch/x86/tools/gen-insn-attr-x86.awk >/dev/null && \ -- diff arch/x86/include/asm/insn.h ../../arch/x86/include/asm/insn.h >/dev/null && \ -- diff arch/x86/include/asm/inat.h ../../arch/x86/include/asm/inat.h >/dev/null && \ -- diff arch/x86/include/asm/inat_types.h ../../arch/x86/include/asm/inat_types.h >/dev/null) \ -- || echo "warning: objtool: x86 instruction decoder differs from kernel" >&2 )) || true -- @(test -d ../../kernel -a -d ../../tools -a 
-d ../objtool && (( \ -- diff ../../arch/x86/include/asm/orc_types.h arch/x86/include/asm/orc_types.h >/dev/null) \ -- || echo "warning: objtool: orc_types.h differs from kernel" >&2 )) || true -+ @./sync-check.sh - $(QUIET_LINK)$(CC) $(OBJTOOL_IN) $(LDFLAGS) -o $@ - - -diff --git a/tools/objtool/sync-check.sh b/tools/objtool/sync-check.sh -new file mode 100755 -index 000000000000..1470e74e9d66 ---- /dev/null -+++ b/tools/objtool/sync-check.sh -@@ -0,0 +1,29 @@ -+#!/bin/sh -+# SPDX-License-Identifier: GPL-2.0 -+ -+FILES=' -+arch/x86/lib/insn.c -+arch/x86/lib/inat.c -+arch/x86/lib/x86-opcode-map.txt -+arch/x86/tools/gen-insn-attr-x86.awk -+arch/x86/include/asm/insn.h -+arch/x86/include/asm/inat.h -+arch/x86/include/asm/inat_types.h -+arch/x86/include/asm/orc_types.h -+' -+ -+check() -+{ -+ local file=$1 -+ -+ diff $file ../../$file > /dev/null || -+ echo "Warning: synced file at 'tools/objtool/$file' differs from latest kernel version at '$file'" -+} -+ -+if [ ! -d ../../kernel ] || [ ! -d ../../tools ] || [ ! -d ../objtool ]; then -+ exit 0 -+fi -+ -+for i in $FILES; do -+ check $i -+done --- -2.14.2 - diff --git a/patches/kernel/0132-objtool-Move-synced-files-to-their-original-relative.patch b/patches/kernel/0132-objtool-Move-synced-files-to-their-original-relative.patch new file mode 100644 index 0000000..5c1bc25 --- /dev/null +++ b/patches/kernel/0132-objtool-Move-synced-files-to-their-original-relative.patch @@ -0,0 +1,245 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf +Date: Mon, 6 Nov 2017 07:21:50 -0600 +Subject: [PATCH] objtool: Move synced files to their original relative + locations +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +This will enable more straightforward comparisons, and it also makes the +files 100% identical. 
+ +Suggested-by: Ingo Molnar +Signed-off-by: Josh Poimboeuf +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/407b2aaa317741f48fcf821592c0e96ab3be1890.1509974346.git.jpoimboe@redhat.com +Signed-off-by: Ingo Molnar +(backported from commit b90671a530137f42325b89c0848ca58d865c1710) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 3673cdbc4be1671fad71a4968a9f55357d9d356c) +Signed-off-by: Fabian Grünbichler +--- + .../arch/x86/{insn => lib}/x86-opcode-map.txt | 0 + tools/objtool/Makefile | 22 ++++++++++++---------- + .../objtool/arch/x86/{insn => include/asm}/inat.h | 2 +- + .../arch/x86/{insn => include/asm}/inat_types.h | 0 + .../objtool/arch/x86/{insn => include/asm}/insn.h | 2 +- + .../objtool/{ => arch/x86/include/asm}/orc_types.h | 0 + tools/objtool/orc.h | 2 +- + tools/objtool/arch/x86/decode.c | 6 +++--- + tools/objtool/arch/x86/{insn => lib}/inat.c | 2 +- + tools/objtool/arch/x86/{insn => lib}/insn.c | 4 ++-- + tools/objtool/.gitignore | 2 +- + tools/objtool/arch/x86/Build | 10 +++++----- + .../arch/x86/{insn => tools}/gen-insn-attr-x86.awk | 0 + 13 files changed, 27 insertions(+), 25 deletions(-) + rename tools/objtool/arch/x86/{insn => lib}/x86-opcode-map.txt (100%) + rename tools/objtool/arch/x86/{insn => include/asm}/inat.h (99%) + rename tools/objtool/arch/x86/{insn => include/asm}/inat_types.h (100%) + rename tools/objtool/arch/x86/{insn => include/asm}/insn.h (99%) + rename tools/objtool/{ => arch/x86/include/asm}/orc_types.h (100%) + rename tools/objtool/arch/x86/{insn => lib}/inat.c (99%) + rename tools/objtool/arch/x86/{insn => lib}/insn.c (99%) + rename tools/objtool/arch/x86/{insn => tools}/gen-insn-attr-x86.awk (100%) + +diff --git a/tools/objtool/arch/x86/insn/x86-opcode-map.txt b/tools/objtool/arch/x86/lib/x86-opcode-map.txt +similarity index 100% +rename from tools/objtool/arch/x86/insn/x86-opcode-map.txt +rename to 
tools/objtool/arch/x86/lib/x86-opcode-map.txt +diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile +index 3a6425fefc43..f95f48e445c3 100644 +--- a/tools/objtool/Makefile ++++ b/tools/objtool/Makefile +@@ -24,7 +24,9 @@ OBJTOOL_IN := $(OBJTOOL)-in.o + + all: $(OBJTOOL) + +-INCLUDES := -I$(srctree)/tools/include -I$(srctree)/tools/arch/$(HOSTARCH)/include/uapi ++INCLUDES := -I$(srctree)/tools/include \ ++ -I$(srctree)/tools/arch/$(HOSTARCH)/include/uapi \ ++ -I$(srctree)/tools/objtool/arch/$(HOSTARCH)/include + CFLAGS += -Wall -Werror $(EXTRA_WARNINGS) -Wno-switch-default -Wno-switch-enum -fomit-frame-pointer -O2 -g $(INCLUDES) + LDFLAGS += -lelf $(LIBSUBCMD) + +@@ -44,16 +46,16 @@ $(OBJTOOL_IN): fixdep FORCE + $(OBJTOOL): $(LIBSUBCMD) $(OBJTOOL_IN) + @(diff -I 2>&1 | grep -q 'option requires an argument' && \ + test -d ../../kernel -a -d ../../tools -a -d ../objtool && (( \ +- diff -I'^#include' arch/x86/insn/insn.c ../../arch/x86/lib/insn.c >/dev/null && \ +- diff -I'^#include' arch/x86/insn/inat.c ../../arch/x86/lib/inat.c >/dev/null && \ +- diff arch/x86/insn/x86-opcode-map.txt ../../arch/x86/lib/x86-opcode-map.txt >/dev/null && \ +- diff arch/x86/insn/gen-insn-attr-x86.awk ../../arch/x86/tools/gen-insn-attr-x86.awk >/dev/null && \ +- diff -I'^#include' arch/x86/insn/insn.h ../../arch/x86/include/asm/insn.h >/dev/null && \ +- diff -I'^#include' arch/x86/insn/inat.h ../../arch/x86/include/asm/inat.h >/dev/null && \ +- diff -I'^#include' arch/x86/insn/inat_types.h ../../arch/x86/include/asm/inat_types.h >/dev/null) \ ++ diff arch/x86/lib/insn.c ../../arch/x86/lib/insn.c >/dev/null && \ ++ diff arch/x86/lib/inat.c ../../arch/x86/lib/inat.c >/dev/null && \ ++ diff arch/x86/lib/x86-opcode-map.txt ../../arch/x86/lib/x86-opcode-map.txt >/dev/null && \ ++ diff arch/x86/tools/gen-insn-attr-x86.awk ../../arch/x86/tools/gen-insn-attr-x86.awk >/dev/null && \ ++ diff arch/x86/include/asm/insn.h ../../arch/x86/include/asm/insn.h >/dev/null && \ ++ diff 
arch/x86/include/asm/inat.h ../../arch/x86/include/asm/inat.h >/dev/null && \ ++ diff arch/x86/include/asm/inat_types.h ../../arch/x86/include/asm/inat_types.h >/dev/null) \ + || echo "warning: objtool: x86 instruction decoder differs from kernel" >&2 )) || true + @(test -d ../../kernel -a -d ../../tools -a -d ../objtool && (( \ +- diff ../../arch/x86/include/asm/orc_types.h orc_types.h >/dev/null) \ ++ diff ../../arch/x86/include/asm/orc_types.h arch/x86/include/asm/orc_types.h >/dev/null) \ + || echo "warning: objtool: orc_types.h differs from kernel" >&2 )) || true + $(QUIET_LINK)$(CC) $(OBJTOOL_IN) $(LDFLAGS) -o $@ + +@@ -64,7 +66,7 @@ $(LIBSUBCMD): fixdep FORCE + clean: + $(call QUIET_CLEAN, objtool) $(RM) $(OBJTOOL) + $(Q)find $(OUTPUT) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete +- $(Q)$(RM) $(OUTPUT)arch/x86/insn/inat-tables.c $(OUTPUT)fixdep ++ $(Q)$(RM) $(OUTPUT)arch/x86/lib/inat-tables.c $(OUTPUT)fixdep + + FORCE: + +diff --git a/tools/objtool/arch/x86/insn/inat.h b/tools/objtool/arch/x86/include/asm/inat.h +similarity index 99% +rename from tools/objtool/arch/x86/insn/inat.h +rename to tools/objtool/arch/x86/include/asm/inat.h +index 125ecd2a300d..02aff0867211 100644 +--- a/tools/objtool/arch/x86/insn/inat.h ++++ b/tools/objtool/arch/x86/include/asm/inat.h +@@ -20,7 +20,7 @@ + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + */ +-#include "inat_types.h" ++#include + + /* + * Internal bits. 
Don't use bitmasks directly, because these bits are +diff --git a/tools/objtool/arch/x86/insn/inat_types.h b/tools/objtool/arch/x86/include/asm/inat_types.h +similarity index 100% +rename from tools/objtool/arch/x86/insn/inat_types.h +rename to tools/objtool/arch/x86/include/asm/inat_types.h +diff --git a/tools/objtool/arch/x86/insn/insn.h b/tools/objtool/arch/x86/include/asm/insn.h +similarity index 99% +rename from tools/objtool/arch/x86/insn/insn.h +rename to tools/objtool/arch/x86/include/asm/insn.h +index e23578c7b1be..b3e32b010ab1 100644 +--- a/tools/objtool/arch/x86/insn/insn.h ++++ b/tools/objtool/arch/x86/include/asm/insn.h +@@ -21,7 +21,7 @@ + */ + + /* insn_attr_t is defined in inat.h */ +-#include "inat.h" ++#include + + struct insn_field { + union { +diff --git a/tools/objtool/orc_types.h b/tools/objtool/arch/x86/include/asm/orc_types.h +similarity index 100% +rename from tools/objtool/orc_types.h +rename to tools/objtool/arch/x86/include/asm/orc_types.h +diff --git a/tools/objtool/orc.h b/tools/objtool/orc.h +index a4139e386ef3..b0e92a6d0903 100644 +--- a/tools/objtool/orc.h ++++ b/tools/objtool/orc.h +@@ -18,7 +18,7 @@ + #ifndef _ORC_H + #define _ORC_H + +-#include "orc_types.h" ++#include + + struct objtool_file; + +diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c +index 4559a21a8de2..92f57996d66c 100644 +--- a/tools/objtool/arch/x86/decode.c ++++ b/tools/objtool/arch/x86/decode.c +@@ -19,9 +19,9 @@ + #include + + #define unlikely(cond) (cond) +-#include "insn/insn.h" +-#include "insn/inat.c" +-#include "insn/insn.c" ++#include ++#include "lib/inat.c" ++#include "lib/insn.c" + + #include "../../elf.h" + #include "../../arch.h" +diff --git a/tools/objtool/arch/x86/insn/inat.c b/tools/objtool/arch/x86/lib/inat.c +similarity index 99% +rename from tools/objtool/arch/x86/insn/inat.c +rename to tools/objtool/arch/x86/lib/inat.c +index e4bf28e6f4c7..c1f01a8e9f65 100644 +--- a/tools/objtool/arch/x86/insn/inat.c ++++ 
b/tools/objtool/arch/x86/lib/inat.c +@@ -18,7 +18,7 @@ + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + */ +-#include "insn.h" ++#include + + /* Attribute tables are generated from opcode map */ + #include "inat-tables.c" +diff --git a/tools/objtool/arch/x86/insn/insn.c b/tools/objtool/arch/x86/lib/insn.c +similarity index 99% +rename from tools/objtool/arch/x86/insn/insn.c +rename to tools/objtool/arch/x86/lib/insn.c +index ca983e2bea8b..1088eb8f3a5f 100644 +--- a/tools/objtool/arch/x86/insn/insn.c ++++ b/tools/objtool/arch/x86/lib/insn.c +@@ -23,8 +23,8 @@ + #else + #include + #endif +-#include "inat.h" +-#include "insn.h" ++#include ++#include + + /* Verify next sizeof(t) bytes can be on the same instruction */ + #define validate_next(t, insn, n) \ +diff --git a/tools/objtool/.gitignore b/tools/objtool/.gitignore +index d3102c865a95..914cff12899b 100644 +--- a/tools/objtool/.gitignore ++++ b/tools/objtool/.gitignore +@@ -1,3 +1,3 @@ +-arch/x86/insn/inat-tables.c ++arch/x86/lib/inat-tables.c + objtool + fixdep +diff --git a/tools/objtool/arch/x86/Build b/tools/objtool/arch/x86/Build +index debbdb0b5c43..b998412c017d 100644 +--- a/tools/objtool/arch/x86/Build ++++ b/tools/objtool/arch/x86/Build +@@ -1,12 +1,12 @@ + objtool-y += decode.o + +-inat_tables_script = arch/x86/insn/gen-insn-attr-x86.awk +-inat_tables_maps = arch/x86/insn/x86-opcode-map.txt ++inat_tables_script = arch/x86/tools/gen-insn-attr-x86.awk ++inat_tables_maps = arch/x86/lib/x86-opcode-map.txt + +-$(OUTPUT)arch/x86/insn/inat-tables.c: $(inat_tables_script) $(inat_tables_maps) ++$(OUTPUT)arch/x86/lib/inat-tables.c: $(inat_tables_script) $(inat_tables_maps) + $(call rule_mkdir) + $(Q)$(call echo-cmd,gen)$(AWK) -f $(inat_tables_script) $(inat_tables_maps) > $@ + +-$(OUTPUT)arch/x86/decode.o: $(OUTPUT)arch/x86/insn/inat-tables.c ++$(OUTPUT)arch/x86/decode.o: $(OUTPUT)arch/x86/lib/inat-tables.c + +-CFLAGS_decode.o += -I$(OUTPUT)arch/x86/insn ++CFLAGS_decode.o += 
-I$(OUTPUT)arch/x86/lib +diff --git a/tools/objtool/arch/x86/insn/gen-insn-attr-x86.awk b/tools/objtool/arch/x86/tools/gen-insn-attr-x86.awk +similarity index 100% +rename from tools/objtool/arch/x86/insn/gen-insn-attr-x86.awk +rename to tools/objtool/arch/x86/tools/gen-insn-attr-x86.awk +-- +2.14.2 + diff --git a/patches/kernel/0133-objtool-Fix-cross-build.patch b/patches/kernel/0133-objtool-Fix-cross-build.patch deleted file mode 100644 index 7e8ae7c..0000000 --- a/patches/kernel/0133-objtool-Fix-cross-build.patch +++ /dev/null @@ -1,56 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Josh Poimboeuf -Date: Tue, 7 Nov 2017 21:01:52 -0600 -Subject: [PATCH] objtool: Fix cross-build -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Stephen Rothwell reported this cross-compilation build failure: - -| In file included from orc_dump.c:19:0: -| orc.h:21:10: fatal error: asm/orc_types.h: No such file or directory -| ... - -Caused by: - - 6a77cff819ae ("objtool: Move synced files to their original relative locations") - -Use the proper arch header files location, not the host-arch location. 
- -Bisected-by: Stephen Rothwell -Reported-by: Stephen Rothwell -Signed-off-by: Josh Poimboeuf -Cc: Linus Torvalds -Cc: Linux-Next Mailing List -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/20171108030152.bd76eahiwjwjt3kp@treble -Signed-off-by: Ingo Molnar -(backported from commit 26bda786fb129698d96c9bc6d243f7a3cd3fc668) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit fbe7b2a70b2949ec3ba359c04fb60d8f31f74e04) -Signed-off-by: Fabian Grünbichler ---- - tools/objtool/Makefile | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile -index 90b0241f3ccc..847e99aa54ea 100644 ---- a/tools/objtool/Makefile -+++ b/tools/objtool/Makefile -@@ -26,7 +26,7 @@ all: $(OBJTOOL) - - INCLUDES := -I$(srctree)/tools/include \ - -I$(srctree)/tools/arch/$(HOSTARCH)/include/uapi \ -- -I$(srctree)/tools/objtool/arch/$(HOSTARCH)/include -+ -I$(srctree)/tools/objtool/arch/$(ARCH)/include - CFLAGS += -Wall -Werror $(EXTRA_WARNINGS) -Wno-switch-default -Wno-switch-enum -fomit-frame-pointer -O2 -g $(INCLUDES) - LDFLAGS += -lelf $(LIBSUBCMD) - --- -2.14.2 - diff --git a/patches/kernel/0133-objtool-Move-kernel-headers-code-sync-check-to-a-scr.patch b/patches/kernel/0133-objtool-Move-kernel-headers-code-sync-check-to-a-scr.patch new file mode 100644 index 0000000..5ebc193 --- /dev/null +++ b/patches/kernel/0133-objtool-Move-kernel-headers-code-sync-check-to-a-scr.patch @@ -0,0 +1,99 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf +Date: Mon, 6 Nov 2017 07:21:51 -0600 +Subject: [PATCH] objtool: Move kernel headers/code sync check to a script +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Replace the nasty diff checks in the objtool Makefile with a clean bash +script, and make the warnings more specific. 
+ +Heavily inspired by tools/perf/check-headers.sh. + +Suggested-by: Ingo Molnar +Signed-off-by: Josh Poimboeuf +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/ab015f15ccd8c0c6008493c3c6ee3d495eaf2927.1509974346.git.jpoimboe@redhat.com +Signed-off-by: Ingo Molnar +(cherry picked from commit a89ec413c623eb2870180bcad678046bf7bc8465) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 4e72ce95a057e744b8d580239e2d8afa51118d82) +Signed-off-by: Fabian Grünbichler +--- + tools/objtool/Makefile | 16 +--------------- + tools/objtool/sync-check.sh | 29 +++++++++++++++++++++++++++++ + 2 files changed, 30 insertions(+), 15 deletions(-) + create mode 100755 tools/objtool/sync-check.sh + +diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile +index f95f48e445c3..90b0241f3ccc 100644 +--- a/tools/objtool/Makefile ++++ b/tools/objtool/Makefile +@@ -41,22 +41,8 @@ include $(srctree)/tools/build/Makefile.include + $(OBJTOOL_IN): fixdep FORCE + @$(MAKE) $(build)=objtool + +-# Busybox's diff doesn't have -I, avoid warning in that case +-# + $(OBJTOOL): $(LIBSUBCMD) $(OBJTOOL_IN) +- @(diff -I 2>&1 | grep -q 'option requires an argument' && \ +- test -d ../../kernel -a -d ../../tools -a -d ../objtool && (( \ +- diff arch/x86/lib/insn.c ../../arch/x86/lib/insn.c >/dev/null && \ +- diff arch/x86/lib/inat.c ../../arch/x86/lib/inat.c >/dev/null && \ +- diff arch/x86/lib/x86-opcode-map.txt ../../arch/x86/lib/x86-opcode-map.txt >/dev/null && \ +- diff arch/x86/tools/gen-insn-attr-x86.awk ../../arch/x86/tools/gen-insn-attr-x86.awk >/dev/null && \ +- diff arch/x86/include/asm/insn.h ../../arch/x86/include/asm/insn.h >/dev/null && \ +- diff arch/x86/include/asm/inat.h ../../arch/x86/include/asm/inat.h >/dev/null && \ +- diff arch/x86/include/asm/inat_types.h ../../arch/x86/include/asm/inat_types.h >/dev/null) \ +- || echo "warning: objtool: x86 instruction decoder differs from kernel" >&2 )) || 
true +- @(test -d ../../kernel -a -d ../../tools -a -d ../objtool && (( \ +- diff ../../arch/x86/include/asm/orc_types.h arch/x86/include/asm/orc_types.h >/dev/null) \ +- || echo "warning: objtool: orc_types.h differs from kernel" >&2 )) || true ++ @./sync-check.sh + $(QUIET_LINK)$(CC) $(OBJTOOL_IN) $(LDFLAGS) -o $@ + + +diff --git a/tools/objtool/sync-check.sh b/tools/objtool/sync-check.sh +new file mode 100755 +index 000000000000..1470e74e9d66 +--- /dev/null ++++ b/tools/objtool/sync-check.sh +@@ -0,0 +1,29 @@ ++#!/bin/sh ++# SPDX-License-Identifier: GPL-2.0 ++ ++FILES=' ++arch/x86/lib/insn.c ++arch/x86/lib/inat.c ++arch/x86/lib/x86-opcode-map.txt ++arch/x86/tools/gen-insn-attr-x86.awk ++arch/x86/include/asm/insn.h ++arch/x86/include/asm/inat.h ++arch/x86/include/asm/inat_types.h ++arch/x86/include/asm/orc_types.h ++' ++ ++check() ++{ ++ local file=$1 ++ ++ diff $file ../../$file > /dev/null || ++ echo "Warning: synced file at 'tools/objtool/$file' differs from latest kernel version at '$file'" ++} ++ ++if [ ! -d ../../kernel ] || [ ! -d ../../tools ] || [ ! -d ../objtool ]; then ++ exit 0 ++fi ++ ++for i in $FILES; do ++ check $i ++done +-- +2.14.2 + diff --git a/patches/kernel/0134-objtool-Fix-cross-build.patch b/patches/kernel/0134-objtool-Fix-cross-build.patch new file mode 100644 index 0000000..7e8ae7c --- /dev/null +++ b/patches/kernel/0134-objtool-Fix-cross-build.patch @@ -0,0 +1,56 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf +Date: Tue, 7 Nov 2017 21:01:52 -0600 +Subject: [PATCH] objtool: Fix cross-build +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Stephen Rothwell reported this cross-compilation build failure: + +| In file included from orc_dump.c:19:0: +| orc.h:21:10: fatal error: asm/orc_types.h: No such file or directory +| ... 
+ +Caused by: + + 6a77cff819ae ("objtool: Move synced files to their original relative locations") + +Use the proper arch header files location, not the host-arch location. + +Bisected-by: Stephen Rothwell +Reported-by: Stephen Rothwell +Signed-off-by: Josh Poimboeuf +Cc: Linus Torvalds +Cc: Linux-Next Mailing List +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/20171108030152.bd76eahiwjwjt3kp@treble +Signed-off-by: Ingo Molnar +(backported from commit 26bda786fb129698d96c9bc6d243f7a3cd3fc668) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit fbe7b2a70b2949ec3ba359c04fb60d8f31f74e04) +Signed-off-by: Fabian Grünbichler +--- + tools/objtool/Makefile | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile +index 90b0241f3ccc..847e99aa54ea 100644 +--- a/tools/objtool/Makefile ++++ b/tools/objtool/Makefile +@@ -26,7 +26,7 @@ all: $(OBJTOOL) + + INCLUDES := -I$(srctree)/tools/include \ + -I$(srctree)/tools/arch/$(HOSTARCH)/include/uapi \ +- -I$(srctree)/tools/objtool/arch/$(HOSTARCH)/include ++ -I$(srctree)/tools/objtool/arch/$(ARCH)/include + CFLAGS += -Wall -Werror $(EXTRA_WARNINGS) -Wno-switch-default -Wno-switch-enum -fomit-frame-pointer -O2 -g $(INCLUDES) + LDFLAGS += -lelf $(LIBSUBCMD) + +-- +2.14.2 + diff --git a/patches/kernel/0134-tools-headers-Sync-objtool-UAPI-header.patch b/patches/kernel/0134-tools-headers-Sync-objtool-UAPI-header.patch deleted file mode 100644 index 29ce403..0000000 --- a/patches/kernel/0134-tools-headers-Sync-objtool-UAPI-header.patch +++ /dev/null @@ -1,57 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Ingo Molnar -Date: Tue, 14 Nov 2017 07:24:22 +0100 -Subject: [PATCH] tools/headers: Sync objtool UAPI header -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -objtool grew this new warning: - - Warning: 
synced file at 'tools/objtool/arch/x86/include/asm/inat.h' differs from latest kernel version at 'arch/x86/include/asm/inat.h' - -which upstream header grew new INAT_SEG_* definitions. - -Sync up the tooling version of the header. - -Reported-by: Linus Torvalds -Cc: Josh Poimboeuf -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Cc: linux-kernel@vger.kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit 1ca1d1e5618960574fb01507dbab07e5337049a1) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 3dd05d51e337e9d780fb0e7c46d7216a79380d7b) -Signed-off-by: Fabian Grünbichler ---- - tools/objtool/arch/x86/include/asm/inat.h | 10 ++++++++++ - 1 file changed, 10 insertions(+) - -diff --git a/tools/objtool/arch/x86/include/asm/inat.h b/tools/objtool/arch/x86/include/asm/inat.h -index 02aff0867211..1c78580e58be 100644 ---- a/tools/objtool/arch/x86/include/asm/inat.h -+++ b/tools/objtool/arch/x86/include/asm/inat.h -@@ -97,6 +97,16 @@ - #define INAT_MAKE_GROUP(grp) ((grp << INAT_GRP_OFFS) | INAT_MODRM) - #define INAT_MAKE_IMM(imm) (imm << INAT_IMM_OFFS) - -+/* Identifiers for segment registers */ -+#define INAT_SEG_REG_IGNORE 0 -+#define INAT_SEG_REG_DEFAULT 1 -+#define INAT_SEG_REG_CS 2 -+#define INAT_SEG_REG_SS 3 -+#define INAT_SEG_REG_DS 4 -+#define INAT_SEG_REG_ES 5 -+#define INAT_SEG_REG_FS 6 -+#define INAT_SEG_REG_GS 7 -+ - /* Attribute search APIs */ - extern insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode); - extern int inat_get_last_prefix_id(insn_byte_t last_pfx); --- -2.14.2 - diff --git a/patches/kernel/0135-objtool-Fix-64-bit-build-on-32-bit-host.patch b/patches/kernel/0135-objtool-Fix-64-bit-build-on-32-bit-host.patch deleted file mode 100644 index 2a4d103..0000000 --- a/patches/kernel/0135-objtool-Fix-64-bit-build-on-32-bit-host.patch +++ /dev/null @@ -1,103 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Mikulas Patocka -Date: Sat, 2 Dec 2017 16:17:44 -0600 
-Subject: [PATCH] objtool: Fix 64-bit build on 32-bit host -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -The new ORC unwinder breaks the build of a 64-bit kernel on a 32-bit -host. Building the kernel on a i386 or x32 host fails with: - - orc_dump.c: In function 'orc_dump': - orc_dump.c:105:26: error: passing argument 2 of 'elf_getshdrnum' from incompatible pointer type [-Werror=incompatible-pointer-types] - if (elf_getshdrnum(elf, &nr_sections)) { - ^ - In file included from /usr/local/include/gelf.h:32:0, - from elf.h:22, - from warn.h:26, - from orc_dump.c:20: - /usr/local/include/libelf.h:304:12: note: expected 'size_t * {aka unsigned int *}' but argument is of type 'long unsigned int *' - extern int elf_getshdrnum (Elf *__elf, size_t *__dst); - ^~~~~~~~~~~~~~ - orc_dump.c:190:17: error: format '%lx' expects argument of type 'long unsigned int', but argument 3 has type 'Elf64_Sxword {aka long long int}' [-Werror=format=] - printf("%s+%lx:", name, rela.r_addend); - ~~^ ~~~~~~~~~~~~~ - %llx - -Fix the build failure. - -Another problem is that if the user specifies HOSTCC or HOSTLD -variables, they are ignored in the objtool makefile. Change the -Makefile to respect these variables. 
- -Signed-off-by: Mikulas Patocka -Signed-off-by: Josh Poimboeuf -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Sven Joachim -Cc: Thomas Gleixner -Fixes: 627fce14809b ("objtool: Add ORC unwind table generation") -Link: http://lkml.kernel.org/r/19f0e64d8e07e30a7b307cd010eb780c404fe08d.1512252895.git.jpoimboe@redhat.com -Signed-off-by: Ingo Molnar -(cherry picked from commit 0db897fb081b66c26a338e5481f317c71df779c9) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 979c9a5cacd1d40d08c1c24ed5c5810cf7f3069c) -Signed-off-by: Fabian Grünbichler ---- - tools/objtool/Makefile | 8 +++++--- - tools/objtool/orc_dump.c | 7 ++++--- - 2 files changed, 9 insertions(+), 6 deletions(-) - -diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile -index 847e99aa54ea..5c71bae01064 100644 ---- a/tools/objtool/Makefile -+++ b/tools/objtool/Makefile -@@ -6,9 +6,11 @@ ARCH := x86 - endif - - # always use the host compiler --CC = gcc --LD = ld --AR = ar -+HOSTCC ?= gcc -+HOSTLD ?= ld -+CC = $(HOSTCC) -+LD = $(HOSTLD) -+AR = ar - - ifeq ($(srctree),) - srctree := $(patsubst %/,%,$(dir $(CURDIR))) -diff --git a/tools/objtool/orc_dump.c b/tools/objtool/orc_dump.c -index 36c5bf6a2675..c3343820916a 100644 ---- a/tools/objtool/orc_dump.c -+++ b/tools/objtool/orc_dump.c -@@ -76,7 +76,8 @@ int orc_dump(const char *_objname) - int fd, nr_entries, i, *orc_ip = NULL, orc_size = 0; - struct orc_entry *orc = NULL; - char *name; -- unsigned long nr_sections, orc_ip_addr = 0; -+ size_t nr_sections; -+ Elf64_Addr orc_ip_addr = 0; - size_t shstrtab_idx; - Elf *elf; - Elf_Scn *scn; -@@ -187,10 +188,10 @@ int orc_dump(const char *_objname) - return -1; - } - -- printf("%s+%lx:", name, rela.r_addend); -+ printf("%s+%llx:", name, (unsigned long long)rela.r_addend); - - } else { -- printf("%lx:", orc_ip_addr + (i * sizeof(int)) + orc_ip[i]); -+ printf("%llx:", (unsigned long long)(orc_ip_addr + (i * sizeof(int)) + orc_ip[i])); - } - - --- -2.14.2 - diff --git 
a/patches/kernel/0135-tools-headers-Sync-objtool-UAPI-header.patch b/patches/kernel/0135-tools-headers-Sync-objtool-UAPI-header.patch new file mode 100644 index 0000000..29ce403 --- /dev/null +++ b/patches/kernel/0135-tools-headers-Sync-objtool-UAPI-header.patch @@ -0,0 +1,57 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Ingo Molnar +Date: Tue, 14 Nov 2017 07:24:22 +0100 +Subject: [PATCH] tools/headers: Sync objtool UAPI header +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +objtool grew this new warning: + + Warning: synced file at 'tools/objtool/arch/x86/include/asm/inat.h' differs from latest kernel version at 'arch/x86/include/asm/inat.h' + +which upstream header grew new INAT_SEG_* definitions. + +Sync up the tooling version of the header. + +Reported-by: Linus Torvalds +Cc: Josh Poimboeuf +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit 1ca1d1e5618960574fb01507dbab07e5337049a1) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 3dd05d51e337e9d780fb0e7c46d7216a79380d7b) +Signed-off-by: Fabian Grünbichler +--- + tools/objtool/arch/x86/include/asm/inat.h | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/tools/objtool/arch/x86/include/asm/inat.h b/tools/objtool/arch/x86/include/asm/inat.h +index 02aff0867211..1c78580e58be 100644 +--- a/tools/objtool/arch/x86/include/asm/inat.h ++++ b/tools/objtool/arch/x86/include/asm/inat.h +@@ -97,6 +97,16 @@ + #define INAT_MAKE_GROUP(grp) ((grp << INAT_GRP_OFFS) | INAT_MODRM) + #define INAT_MAKE_IMM(imm) (imm << INAT_IMM_OFFS) + ++/* Identifiers for segment registers */ ++#define INAT_SEG_REG_IGNORE 0 ++#define INAT_SEG_REG_DEFAULT 1 ++#define INAT_SEG_REG_CS 2 ++#define INAT_SEG_REG_SS 3 ++#define INAT_SEG_REG_DS 4 ++#define INAT_SEG_REG_ES 5 ++#define INAT_SEG_REG_FS 6 ++#define 
INAT_SEG_REG_GS 7 ++ + /* Attribute search APIs */ + extern insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode); + extern int inat_get_last_prefix_id(insn_byte_t last_pfx); +-- +2.14.2 + diff --git a/patches/kernel/0136-objtool-Fix-64-bit-build-on-32-bit-host.patch b/patches/kernel/0136-objtool-Fix-64-bit-build-on-32-bit-host.patch new file mode 100644 index 0000000..2a4d103 --- /dev/null +++ b/patches/kernel/0136-objtool-Fix-64-bit-build-on-32-bit-host.patch @@ -0,0 +1,103 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Mikulas Patocka +Date: Sat, 2 Dec 2017 16:17:44 -0600 +Subject: [PATCH] objtool: Fix 64-bit build on 32-bit host +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +The new ORC unwinder breaks the build of a 64-bit kernel on a 32-bit +host. Building the kernel on a i386 or x32 host fails with: + + orc_dump.c: In function 'orc_dump': + orc_dump.c:105:26: error: passing argument 2 of 'elf_getshdrnum' from incompatible pointer type [-Werror=incompatible-pointer-types] + if (elf_getshdrnum(elf, &nr_sections)) { + ^ + In file included from /usr/local/include/gelf.h:32:0, + from elf.h:22, + from warn.h:26, + from orc_dump.c:20: + /usr/local/include/libelf.h:304:12: note: expected 'size_t * {aka unsigned int *}' but argument is of type 'long unsigned int *' + extern int elf_getshdrnum (Elf *__elf, size_t *__dst); + ^~~~~~~~~~~~~~ + orc_dump.c:190:17: error: format '%lx' expects argument of type 'long unsigned int', but argument 3 has type 'Elf64_Sxword {aka long long int}' [-Werror=format=] + printf("%s+%lx:", name, rela.r_addend); + ~~^ ~~~~~~~~~~~~~ + %llx + +Fix the build failure. + +Another problem is that if the user specifies HOSTCC or HOSTLD +variables, they are ignored in the objtool makefile. Change the +Makefile to respect these variables. 
+ +Signed-off-by: Mikulas Patocka +Signed-off-by: Josh Poimboeuf +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Sven Joachim +Cc: Thomas Gleixner +Fixes: 627fce14809b ("objtool: Add ORC unwind table generation") +Link: http://lkml.kernel.org/r/19f0e64d8e07e30a7b307cd010eb780c404fe08d.1512252895.git.jpoimboe@redhat.com +Signed-off-by: Ingo Molnar +(cherry picked from commit 0db897fb081b66c26a338e5481f317c71df779c9) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 979c9a5cacd1d40d08c1c24ed5c5810cf7f3069c) +Signed-off-by: Fabian Grünbichler +--- + tools/objtool/Makefile | 8 +++++--- + tools/objtool/orc_dump.c | 7 ++++--- + 2 files changed, 9 insertions(+), 6 deletions(-) + +diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile +index 847e99aa54ea..5c71bae01064 100644 +--- a/tools/objtool/Makefile ++++ b/tools/objtool/Makefile +@@ -6,9 +6,11 @@ ARCH := x86 + endif + + # always use the host compiler +-CC = gcc +-LD = ld +-AR = ar ++HOSTCC ?= gcc ++HOSTLD ?= ld ++CC = $(HOSTCC) ++LD = $(HOSTLD) ++AR = ar + + ifeq ($(srctree),) + srctree := $(patsubst %/,%,$(dir $(CURDIR))) +diff --git a/tools/objtool/orc_dump.c b/tools/objtool/orc_dump.c +index 36c5bf6a2675..c3343820916a 100644 +--- a/tools/objtool/orc_dump.c ++++ b/tools/objtool/orc_dump.c +@@ -76,7 +76,8 @@ int orc_dump(const char *_objname) + int fd, nr_entries, i, *orc_ip = NULL, orc_size = 0; + struct orc_entry *orc = NULL; + char *name; +- unsigned long nr_sections, orc_ip_addr = 0; ++ size_t nr_sections; ++ Elf64_Addr orc_ip_addr = 0; + size_t shstrtab_idx; + Elf *elf; + Elf_Scn *scn; +@@ -187,10 +188,10 @@ int orc_dump(const char *_objname) + return -1; + } + +- printf("%s+%lx:", name, rela.r_addend); ++ printf("%s+%llx:", name, (unsigned long long)rela.r_addend); + + } else { +- printf("%lx:", orc_ip_addr + (i * sizeof(int)) + orc_ip[i]); ++ printf("%llx:", (unsigned long long)(orc_ip_addr + (i * sizeof(int)) + orc_ip[i])); + } + + +-- +2.14.2 + diff --git 
a/patches/kernel/0136-x86-decoder-Fix-and-update-the-opcodes-map.patch b/patches/kernel/0136-x86-decoder-Fix-and-update-the-opcodes-map.patch deleted file mode 100644 index ec75390..0000000 --- a/patches/kernel/0136-x86-decoder-Fix-and-update-the-opcodes-map.patch +++ /dev/null @@ -1,171 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Randy Dunlap -Date: Mon, 11 Dec 2017 10:38:36 -0800 -Subject: [PATCH] x86/decoder: Fix and update the opcodes map -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Update x86-opcode-map.txt based on the October 2017 Intel SDM publication. -Fix INVPID to INVVPID. -Add UD0 and UD1 instruction opcodes. - -Also sync the objtool and perf tooling copies of this file. - -Signed-off-by: Randy Dunlap -Acked-by: Masami Hiramatsu -Cc: Josh Poimboeuf -Cc: Linus Torvalds -Cc: Masami Hiramatsu -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Link: http://lkml.kernel.org/r/aac062d7-c0f6-96e3-5c92-ed299e2bd3da@infradead.org -Signed-off-by: Ingo Molnar -(cherry picked from commit f5395545058cd388da5d99bda3dedd2a2fe56dbc) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit f88b977dde8156d6c4514114baa0eed05dd48e41) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/lib/x86-opcode-map.txt | 13 +++++++++++-- - tools/objtool/arch/x86/lib/x86-opcode-map.txt | 15 ++++++++++++--- - tools/perf/util/intel-pt-decoder/x86-opcode-map.txt | 15 ++++++++++++--- - 3 files changed, 35 insertions(+), 8 deletions(-) - -diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt -index 12e377184ee4..aa2270dc9e87 100644 ---- a/arch/x86/lib/x86-opcode-map.txt -+++ b/arch/x86/lib/x86-opcode-map.txt -@@ -607,7 +607,7 @@ fb: psubq Pq,Qq | vpsubq Vx,Hx,Wx (66),(v1) - fc: paddb Pq,Qq | vpaddb Vx,Hx,Wx (66),(v1) - fd: paddw Pq,Qq | vpaddw Vx,Hx,Wx (66),(v1) - fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1) --ff: -+ff: UD0 - 
EndTable - - Table: 3-byte opcode 1 (0x0f 0x38) -@@ -717,7 +717,7 @@ AVXcode: 2 - 7e: vpermt2d/q Vx,Hx,Wx (66),(ev) - 7f: vpermt2ps/d Vx,Hx,Wx (66),(ev) - 80: INVEPT Gy,Mdq (66) --81: INVPID Gy,Mdq (66) -+81: INVVPID Gy,Mdq (66) - 82: INVPCID Gy,Mdq (66) - 83: vpmultishiftqb Vx,Hx,Wx (66),(ev) - 88: vexpandps/d Vpd,Wpd (66),(ev) -@@ -970,6 +970,15 @@ GrpTable: Grp9 - EndTable - - GrpTable: Grp10 -+# all are UD1 -+0: UD1 -+1: UD1 -+2: UD1 -+3: UD1 -+4: UD1 -+5: UD1 -+6: UD1 -+7: UD1 - EndTable - - # Grp11A and Grp11B are expressed as Grp11 in Intel SDM -diff --git a/tools/objtool/arch/x86/lib/x86-opcode-map.txt b/tools/objtool/arch/x86/lib/x86-opcode-map.txt -index 12e377184ee4..e0b85930dd77 100644 ---- a/tools/objtool/arch/x86/lib/x86-opcode-map.txt -+++ b/tools/objtool/arch/x86/lib/x86-opcode-map.txt -@@ -607,7 +607,7 @@ fb: psubq Pq,Qq | vpsubq Vx,Hx,Wx (66),(v1) - fc: paddb Pq,Qq | vpaddb Vx,Hx,Wx (66),(v1) - fd: paddw Pq,Qq | vpaddw Vx,Hx,Wx (66),(v1) - fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1) --ff: -+ff: UD0 - EndTable - - Table: 3-byte opcode 1 (0x0f 0x38) -@@ -717,7 +717,7 @@ AVXcode: 2 - 7e: vpermt2d/q Vx,Hx,Wx (66),(ev) - 7f: vpermt2ps/d Vx,Hx,Wx (66),(ev) - 80: INVEPT Gy,Mdq (66) --81: INVPID Gy,Mdq (66) -+81: INVVPID Gy,Mdq (66) - 82: INVPCID Gy,Mdq (66) - 83: vpmultishiftqb Vx,Hx,Wx (66),(ev) - 88: vexpandps/d Vpd,Wpd (66),(ev) -@@ -896,7 +896,7 @@ EndTable - - GrpTable: Grp3_1 - 0: TEST Eb,Ib --1: -+1: TEST Eb,Ib - 2: NOT Eb - 3: NEG Eb - 4: MUL AL,Eb -@@ -970,6 +970,15 @@ GrpTable: Grp9 - EndTable - - GrpTable: Grp10 -+# all are UD1 -+0: UD1 -+1: UD1 -+2: UD1 -+3: UD1 -+4: UD1 -+5: UD1 -+6: UD1 -+7: UD1 - EndTable - - # Grp11A and Grp11B are expressed as Grp11 in Intel SDM -diff --git a/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt b/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt -index 12e377184ee4..e0b85930dd77 100644 ---- a/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt -+++ b/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt 
-@@ -607,7 +607,7 @@ fb: psubq Pq,Qq | vpsubq Vx,Hx,Wx (66),(v1) - fc: paddb Pq,Qq | vpaddb Vx,Hx,Wx (66),(v1) - fd: paddw Pq,Qq | vpaddw Vx,Hx,Wx (66),(v1) - fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1) --ff: -+ff: UD0 - EndTable - - Table: 3-byte opcode 1 (0x0f 0x38) -@@ -717,7 +717,7 @@ AVXcode: 2 - 7e: vpermt2d/q Vx,Hx,Wx (66),(ev) - 7f: vpermt2ps/d Vx,Hx,Wx (66),(ev) - 80: INVEPT Gy,Mdq (66) --81: INVPID Gy,Mdq (66) -+81: INVVPID Gy,Mdq (66) - 82: INVPCID Gy,Mdq (66) - 83: vpmultishiftqb Vx,Hx,Wx (66),(ev) - 88: vexpandps/d Vpd,Wpd (66),(ev) -@@ -896,7 +896,7 @@ EndTable - - GrpTable: Grp3_1 - 0: TEST Eb,Ib --1: -+1: TEST Eb,Ib - 2: NOT Eb - 3: NEG Eb - 4: MUL AL,Eb -@@ -970,6 +970,15 @@ GrpTable: Grp9 - EndTable - - GrpTable: Grp10 -+# all are UD1 -+0: UD1 -+1: UD1 -+2: UD1 -+3: UD1 -+4: UD1 -+5: UD1 -+6: UD1 -+7: UD1 - EndTable - - # Grp11A and Grp11B are expressed as Grp11 in Intel SDM --- -2.14.2 - diff --git a/patches/kernel/0137-x86-decoder-Add-new-TEST-instruction-pattern.patch b/patches/kernel/0137-x86-decoder-Add-new-TEST-instruction-pattern.patch deleted file mode 100644 index d459a14..0000000 --- a/patches/kernel/0137-x86-decoder-Add-new-TEST-instruction-pattern.patch +++ /dev/null @@ -1,68 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Masami Hiramatsu -Date: Fri, 24 Nov 2017 13:56:30 +0900 -Subject: [PATCH] x86/decoder: Add new TEST instruction pattern -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -The kbuild test robot reported this build warning: - - Warning: arch/x86/tools/test_get_len found difference at :ffffffff8103dd2c - - Warning: ffffffff8103dd82: f6 09 d8 testb $0xd8,(%rcx) - Warning: objdump says 3 bytes, but insn_get_length() says 2 - Warning: decoded and checked 1569014 instructions with 1 warnings - -This sequence seems to be a new instruction not in the opcode map in the Intel SDM. 
- -The instruction sequence is "F6 09 d8", means Group3(F6), MOD(00)REG(001)RM(001), and 0xd8. -Intel SDM vol2 A.4 Table A-6 said the table index in the group is "Encoding of Bits 5,4,3 of -the ModR/M Byte (bits 2,1,0 in parenthesis)" - -In that table, opcodes listed by the index REG bits as: - - 000 001 010 011 100 101 110 111 - TEST Ib/Iz,(undefined),NOT,NEG,MUL AL/rAX,IMUL AL/rAX,DIV AL/rAX,IDIV AL/rAX - -So, it seems TEST Ib is assigned to 001. - -Add the new pattern. - -Reported-by: kbuild test robot -Signed-off-by: Masami Hiramatsu -Cc: Greg Kroah-Hartman -Cc: -Cc: H. Peter Anvin -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Cc: linux-kernel@vger.kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit 2cf68f74af0a6cf808ad03f0d528c72b03c89cc7) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 8896d68f8ff2a97b91279221ddaba73664c5161d) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/lib/x86-opcode-map.txt | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt -index aa2270dc9e87..e0b85930dd77 100644 ---- a/arch/x86/lib/x86-opcode-map.txt -+++ b/arch/x86/lib/x86-opcode-map.txt -@@ -896,7 +896,7 @@ EndTable - - GrpTable: Grp3_1 - 0: TEST Eb,Ib --1: -+1: TEST Eb,Ib - 2: NOT Eb - 3: NEG Eb - 4: MUL AL,Eb --- -2.14.2 - diff --git a/patches/kernel/0137-x86-decoder-Fix-and-update-the-opcodes-map.patch b/patches/kernel/0137-x86-decoder-Fix-and-update-the-opcodes-map.patch new file mode 100644 index 0000000..ec75390 --- /dev/null +++ b/patches/kernel/0137-x86-decoder-Fix-and-update-the-opcodes-map.patch @@ -0,0 +1,171 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Randy Dunlap +Date: Mon, 11 Dec 2017 10:38:36 -0800 +Subject: [PATCH] x86/decoder: Fix and update the opcodes map +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + 
+CVE-2017-5754 + +Update x86-opcode-map.txt based on the October 2017 Intel SDM publication. +Fix INVPID to INVVPID. +Add UD0 and UD1 instruction opcodes. + +Also sync the objtool and perf tooling copies of this file. + +Signed-off-by: Randy Dunlap +Acked-by: Masami Hiramatsu +Cc: Josh Poimboeuf +Cc: Linus Torvalds +Cc: Masami Hiramatsu +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/aac062d7-c0f6-96e3-5c92-ed299e2bd3da@infradead.org +Signed-off-by: Ingo Molnar +(cherry picked from commit f5395545058cd388da5d99bda3dedd2a2fe56dbc) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit f88b977dde8156d6c4514114baa0eed05dd48e41) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/lib/x86-opcode-map.txt | 13 +++++++++++-- + tools/objtool/arch/x86/lib/x86-opcode-map.txt | 15 ++++++++++++--- + tools/perf/util/intel-pt-decoder/x86-opcode-map.txt | 15 ++++++++++++--- + 3 files changed, 35 insertions(+), 8 deletions(-) + +diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt +index 12e377184ee4..aa2270dc9e87 100644 +--- a/arch/x86/lib/x86-opcode-map.txt ++++ b/arch/x86/lib/x86-opcode-map.txt +@@ -607,7 +607,7 @@ fb: psubq Pq,Qq | vpsubq Vx,Hx,Wx (66),(v1) + fc: paddb Pq,Qq | vpaddb Vx,Hx,Wx (66),(v1) + fd: paddw Pq,Qq | vpaddw Vx,Hx,Wx (66),(v1) + fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1) +-ff: ++ff: UD0 + EndTable + + Table: 3-byte opcode 1 (0x0f 0x38) +@@ -717,7 +717,7 @@ AVXcode: 2 + 7e: vpermt2d/q Vx,Hx,Wx (66),(ev) + 7f: vpermt2ps/d Vx,Hx,Wx (66),(ev) + 80: INVEPT Gy,Mdq (66) +-81: INVPID Gy,Mdq (66) ++81: INVVPID Gy,Mdq (66) + 82: INVPCID Gy,Mdq (66) + 83: vpmultishiftqb Vx,Hx,Wx (66),(ev) + 88: vexpandps/d Vpd,Wpd (66),(ev) +@@ -970,6 +970,15 @@ GrpTable: Grp9 + EndTable + + GrpTable: Grp10 ++# all are UD1 ++0: UD1 ++1: UD1 ++2: UD1 ++3: UD1 ++4: UD1 ++5: UD1 ++6: UD1 ++7: UD1 + EndTable + + # Grp11A and Grp11B are expressed as Grp11 in Intel SDM +diff --git 
a/tools/objtool/arch/x86/lib/x86-opcode-map.txt b/tools/objtool/arch/x86/lib/x86-opcode-map.txt +index 12e377184ee4..e0b85930dd77 100644 +--- a/tools/objtool/arch/x86/lib/x86-opcode-map.txt ++++ b/tools/objtool/arch/x86/lib/x86-opcode-map.txt +@@ -607,7 +607,7 @@ fb: psubq Pq,Qq | vpsubq Vx,Hx,Wx (66),(v1) + fc: paddb Pq,Qq | vpaddb Vx,Hx,Wx (66),(v1) + fd: paddw Pq,Qq | vpaddw Vx,Hx,Wx (66),(v1) + fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1) +-ff: ++ff: UD0 + EndTable + + Table: 3-byte opcode 1 (0x0f 0x38) +@@ -717,7 +717,7 @@ AVXcode: 2 + 7e: vpermt2d/q Vx,Hx,Wx (66),(ev) + 7f: vpermt2ps/d Vx,Hx,Wx (66),(ev) + 80: INVEPT Gy,Mdq (66) +-81: INVPID Gy,Mdq (66) ++81: INVVPID Gy,Mdq (66) + 82: INVPCID Gy,Mdq (66) + 83: vpmultishiftqb Vx,Hx,Wx (66),(ev) + 88: vexpandps/d Vpd,Wpd (66),(ev) +@@ -896,7 +896,7 @@ EndTable + + GrpTable: Grp3_1 + 0: TEST Eb,Ib +-1: ++1: TEST Eb,Ib + 2: NOT Eb + 3: NEG Eb + 4: MUL AL,Eb +@@ -970,6 +970,15 @@ GrpTable: Grp9 + EndTable + + GrpTable: Grp10 ++# all are UD1 ++0: UD1 ++1: UD1 ++2: UD1 ++3: UD1 ++4: UD1 ++5: UD1 ++6: UD1 ++7: UD1 + EndTable + + # Grp11A and Grp11B are expressed as Grp11 in Intel SDM +diff --git a/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt b/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt +index 12e377184ee4..e0b85930dd77 100644 +--- a/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt ++++ b/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt +@@ -607,7 +607,7 @@ fb: psubq Pq,Qq | vpsubq Vx,Hx,Wx (66),(v1) + fc: paddb Pq,Qq | vpaddb Vx,Hx,Wx (66),(v1) + fd: paddw Pq,Qq | vpaddw Vx,Hx,Wx (66),(v1) + fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1) +-ff: ++ff: UD0 + EndTable + + Table: 3-byte opcode 1 (0x0f 0x38) +@@ -717,7 +717,7 @@ AVXcode: 2 + 7e: vpermt2d/q Vx,Hx,Wx (66),(ev) + 7f: vpermt2ps/d Vx,Hx,Wx (66),(ev) + 80: INVEPT Gy,Mdq (66) +-81: INVPID Gy,Mdq (66) ++81: INVVPID Gy,Mdq (66) + 82: INVPCID Gy,Mdq (66) + 83: vpmultishiftqb Vx,Hx,Wx (66),(ev) + 88: vexpandps/d Vpd,Wpd (66),(ev) +@@ -896,7 +896,7 
@@ EndTable + + GrpTable: Grp3_1 + 0: TEST Eb,Ib +-1: ++1: TEST Eb,Ib + 2: NOT Eb + 3: NEG Eb + 4: MUL AL,Eb +@@ -970,6 +970,15 @@ GrpTable: Grp9 + EndTable + + GrpTable: Grp10 ++# all are UD1 ++0: UD1 ++1: UD1 ++2: UD1 ++3: UD1 ++4: UD1 ++5: UD1 ++6: UD1 ++7: UD1 + EndTable + + # Grp11A and Grp11B are expressed as Grp11 in Intel SDM +-- +2.14.2 + diff --git a/patches/kernel/0138-x86-decoder-Add-new-TEST-instruction-pattern.patch b/patches/kernel/0138-x86-decoder-Add-new-TEST-instruction-pattern.patch new file mode 100644 index 0000000..d459a14 --- /dev/null +++ b/patches/kernel/0138-x86-decoder-Add-new-TEST-instruction-pattern.patch @@ -0,0 +1,68 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Masami Hiramatsu +Date: Fri, 24 Nov 2017 13:56:30 +0900 +Subject: [PATCH] x86/decoder: Add new TEST instruction pattern +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +The kbuild test robot reported this build warning: + + Warning: arch/x86/tools/test_get_len found difference at :ffffffff8103dd2c + + Warning: ffffffff8103dd82: f6 09 d8 testb $0xd8,(%rcx) + Warning: objdump says 3 bytes, but insn_get_length() says 2 + Warning: decoded and checked 1569014 instructions with 1 warnings + +This sequence seems to be a new instruction not in the opcode map in the Intel SDM. + +The instruction sequence is "F6 09 d8", means Group3(F6), MOD(00)REG(001)RM(001), and 0xd8. +Intel SDM vol2 A.4 Table A-6 said the table index in the group is "Encoding of Bits 5,4,3 of +the ModR/M Byte (bits 2,1,0 in parenthesis)" + +In that table, opcodes listed by the index REG bits as: + + 000 001 010 011 100 101 110 111 + TEST Ib/Iz,(undefined),NOT,NEG,MUL AL/rAX,IMUL AL/rAX,DIV AL/rAX,IDIV AL/rAX + +So, it seems TEST Ib is assigned to 001. + +Add the new pattern. + +Reported-by: kbuild test robot +Signed-off-by: Masami Hiramatsu +Cc: Greg Kroah-Hartman +Cc: +Cc: H. 
Peter Anvin +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit 2cf68f74af0a6cf808ad03f0d528c72b03c89cc7) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 8896d68f8ff2a97b91279221ddaba73664c5161d) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/lib/x86-opcode-map.txt | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt +index aa2270dc9e87..e0b85930dd77 100644 +--- a/arch/x86/lib/x86-opcode-map.txt ++++ b/arch/x86/lib/x86-opcode-map.txt +@@ -896,7 +896,7 @@ EndTable + + GrpTable: Grp3_1 + 0: TEST Eb,Ib +-1: ++1: TEST Eb,Ib + 2: NOT Eb + 3: NEG Eb + 4: MUL AL,Eb +-- +2.14.2 + diff --git a/patches/kernel/0138-x86-insn-eval-Add-utility-functions-to-get-segment-s.patch b/patches/kernel/0138-x86-insn-eval-Add-utility-functions-to-get-segment-s.patch deleted file mode 100644 index 87e821e..0000000 --- a/patches/kernel/0138-x86-insn-eval-Add-utility-functions-to-get-segment-s.patch +++ /dev/null @@ -1,119 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Ingo Molnar -Date: Sat, 23 Dec 2017 13:14:25 +0100 -Subject: [PATCH] x86/insn-eval: Add utility functions to get segment selector -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -When computing a linear address and segmentation is used, we need to know -the base address of the segment involved in the computation. In most of -the cases, the segment base address will be zero as in USER_DS/USER32_DS. -However, it may be possible that a user space program defines its own -segments via a local descriptor table. In such a case, the segment base -address may not be zero. Thus, the segment base address is needed to -calculate correctly the linear address. 
- -If running in protected mode, the segment selector to be used when -computing a linear address is determined by either any of segment override -prefixes in the instruction or inferred from the registers involved in the -computation of the effective address; in that order. Also, there are cases -when the segment override prefixes shall be ignored (i.e., code segments -are always selected by the CS segment register; string instructions always -use the ES segment register when using rDI register as operand). In long -mode, segment registers are ignored, except for FS and GS. In these two -cases, base addresses are obtained from the respective MSRs. - -For clarity, this process can be split into four steps (and an equal -number of functions): determine if segment prefixes overrides can be used; -parse the segment override prefixes, and use them if found; if not found -or cannot be used, use the default segment registers associated with the -operand registers. Once the segment register to use has been identified, -read its value to obtain the segment selector. - -The method to obtain the segment selector depends on several factors. In -32-bit builds, segment selectors are saved into a pt_regs structure -when switching to kernel mode. The same is also true for virtual-8086 -mode. In 64-bit builds, segmentation is mostly ignored, except when -running a program in 32-bit legacy mode. In this case, CS and SS can be -obtained from pt_regs. DS, ES, FS and GS can be read directly from -the respective segment registers. - -In order to identify the segment registers, a new set of #defines is -introduced. It also includes two special identifiers. One of them -indicates when the default segment register associated with instruction -operands shall be used. Another one indicates that the contents of the -segment register shall be ignored; this identifier is used when in long -mode. 
- -Improvements-by: Borislav Petkov -Signed-off-by: Ricardo Neri -Signed-off-by: Thomas Gleixner -Reviewed-by: Borislav Petkov -Cc: "Michael S. Tsirkin" -Cc: Peter Zijlstra -Cc: Dave Hansen -Cc: ricardo.neri@intel.com -Cc: Adrian Hunter -Cc: Paul Gortmaker -Cc: Huang Rui -Cc: Qiaowei Ren -Cc: Shuah Khan -Cc: Kees Cook -Cc: Jonathan Corbet -Cc: Jiri Slaby -Cc: Dmitry Vyukov -Cc: "Ravi V. Shankar" -Cc: Chris Metcalf -Cc: Brian Gerst -Cc: Arnaldo Carvalho de Melo -Cc: Andy Lutomirski -Cc: Colin Ian King -Cc: Chen Yucong -Cc: Adam Buchbinder -Cc: Vlastimil Babka -Cc: Lorenzo Stoakes -Cc: Masami Hiramatsu -Cc: Paolo Bonzini -Cc: Andrew Morton -Cc: Thomas Garnier -Link: https://lkml.kernel.org/r/1509135945-13762-14-git-send-email-ricardo.neri-calderon@linux.intel.com -Signed-off-by: Ingo Molnar - -(Partially cherry picked from commit 32d0b95300db03c2b23b2ea2c94769a4a138e79d) - -(cherry picked from commit ca2c18cb10c8beb56dfe21321abdddc724cec4de) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit abd7780592a3687eacc0a295d4d2959bb11ff75f) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/inat.h | 10 ++++++++++ - 1 file changed, 10 insertions(+) - -diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h -index 02aff0867211..1c78580e58be 100644 ---- a/arch/x86/include/asm/inat.h -+++ b/arch/x86/include/asm/inat.h -@@ -97,6 +97,16 @@ - #define INAT_MAKE_GROUP(grp) ((grp << INAT_GRP_OFFS) | INAT_MODRM) - #define INAT_MAKE_IMM(imm) (imm << INAT_IMM_OFFS) - -+/* Identifiers for segment registers */ -+#define INAT_SEG_REG_IGNORE 0 -+#define INAT_SEG_REG_DEFAULT 1 -+#define INAT_SEG_REG_CS 2 -+#define INAT_SEG_REG_SS 3 -+#define INAT_SEG_REG_DS 4 -+#define INAT_SEG_REG_ES 5 -+#define INAT_SEG_REG_FS 6 -+#define INAT_SEG_REG_GS 7 -+ - /* Attribute search APIs */ - extern insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode); - extern int inat_get_last_prefix_id(insn_byte_t last_pfx); --- -2.14.2 - 
diff --git a/patches/kernel/0139-x86-entry-64-paravirt-Use-paravirt-safe-macro-to-acc.patch b/patches/kernel/0139-x86-entry-64-paravirt-Use-paravirt-safe-macro-to-acc.patch deleted file mode 100644 index 07d50ac..0000000 --- a/patches/kernel/0139-x86-entry-64-paravirt-Use-paravirt-safe-macro-to-acc.patch +++ /dev/null @@ -1,129 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Boris Ostrovsky -Date: Mon, 4 Dec 2017 15:07:07 +0100 -Subject: [PATCH] x86/entry/64/paravirt: Use paravirt-safe macro to access - eflags -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Commit 1d3e53e8624a ("x86/entry/64: Refactor IRQ stacks and make them -NMI-safe") added DEBUG_ENTRY_ASSERT_IRQS_OFF macro that acceses eflags -using 'pushfq' instruction when testing for IF bit. On PV Xen guests -looking at IF flag directly will always see it set, resulting in 'ud2'. - -Introduce SAVE_FLAGS() macro that will use appropriate save_fl pv op when -running paravirt. - -Signed-off-by: Boris Ostrovsky -Signed-off-by: Thomas Gleixner -Reviewed-by: Juergen Gross -Cc: Andy Lutomirski -Cc: Borislav Petkov -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Rik van Riel -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Cc: xen-devel@lists.xenproject.org -Link: https://lkml.kernel.org/r/20171204150604.899457242@linutronix.de -Signed-off-by: Ingo Molnar -(cherry picked from commit e17f8234538d1ff708673f287a42457c4dee720d) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 9f4a274842938ce8d55565ced4f45e7ad4a5da90) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/irqflags.h | 3 +++ - arch/x86/include/asm/paravirt.h | 9 +++++++++ - arch/x86/kernel/asm-offsets_64.c | 3 +++ - arch/x86/entry/entry_64.S | 7 ++++--- - 4 files changed, 19 insertions(+), 3 deletions(-) - -diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h -index ac7692dcfa2e..d937781e1047 100644 ---- a/arch/x86/include/asm/irqflags.h -+++ b/arch/x86/include/asm/irqflags.h -@@ -141,6 +141,9 @@ static inline notrace unsigned long arch_local_irq_save(void) - swapgs; \ - sysretl - -+#ifdef CONFIG_DEBUG_ENTRY -+#define SAVE_FLAGS(x) pushfq; popq %rax -+#endif - #else - #define INTERRUPT_RETURN iret - #define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit -diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h -index 43d4f90edebc..52dcd2361a78 100644 ---- a/arch/x86/include/asm/paravirt.h -+++ b/arch/x86/include/asm/paravirt.h -@@ -926,6 +926,15 @@ extern void default_banner(void); - PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \ - CLBR_NONE, \ - jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64)) -+ -+#ifdef CONFIG_DEBUG_ENTRY -+#define SAVE_FLAGS(clobbers) \ -+ PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_save_fl), clobbers, \ -+ PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \ -+ call PARA_INDIRECT(pv_irq_ops+PV_IRQ_save_fl); \ -+ PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) -+#endif -+ - #endif /* 
CONFIG_X86_32 */ - - #endif /* __ASSEMBLY__ */ -diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c -index cf42206926af..c21a5315b38e 100644 ---- a/arch/x86/kernel/asm-offsets_64.c -+++ b/arch/x86/kernel/asm-offsets_64.c -@@ -22,6 +22,9 @@ int main(void) - #ifdef CONFIG_PARAVIRT - OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); - OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); -+#ifdef CONFIG_DEBUG_ENTRY -+ OFFSET(PV_IRQ_save_fl, pv_irq_ops, save_fl); -+#endif - BLANK(); - #endif - -diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S -index 2491b3b25b9a..6c73e96daf78 100644 ---- a/arch/x86/entry/entry_64.S -+++ b/arch/x86/entry/entry_64.S -@@ -461,12 +461,13 @@ END(irq_entries_start) - - .macro DEBUG_ENTRY_ASSERT_IRQS_OFF - #ifdef CONFIG_DEBUG_ENTRY -- pushfq -- testl $X86_EFLAGS_IF, (%rsp) -+ pushq %rax -+ SAVE_FLAGS(CLBR_RAX) -+ testl $X86_EFLAGS_IF, %eax - jz .Lokay_\@ - ud2 - .Lokay_\@: -- addq $8, %rsp -+ popq %rax - #endif - .endm - --- -2.14.2 - diff --git a/patches/kernel/0139-x86-insn-eval-Add-utility-functions-to-get-segment-s.patch b/patches/kernel/0139-x86-insn-eval-Add-utility-functions-to-get-segment-s.patch new file mode 100644 index 0000000..87e821e --- /dev/null +++ b/patches/kernel/0139-x86-insn-eval-Add-utility-functions-to-get-segment-s.patch @@ -0,0 +1,119 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Ingo Molnar +Date: Sat, 23 Dec 2017 13:14:25 +0100 +Subject: [PATCH] x86/insn-eval: Add utility functions to get segment selector +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +When computing a linear address and segmentation is used, we need to know +the base address of the segment involved in the computation. In most of +the cases, the segment base address will be zero as in USER_DS/USER32_DS. 
+However, it may be possible that a user space program defines its own +segments via a local descriptor table. In such a case, the segment base +address may not be zero. Thus, the segment base address is needed to +calculate correctly the linear address. + +If running in protected mode, the segment selector to be used when +computing a linear address is determined by either any of segment override +prefixes in the instruction or inferred from the registers involved in the +computation of the effective address; in that order. Also, there are cases +when the segment override prefixes shall be ignored (i.e., code segments +are always selected by the CS segment register; string instructions always +use the ES segment register when using rDI register as operand). In long +mode, segment registers are ignored, except for FS and GS. In these two +cases, base addresses are obtained from the respective MSRs. + +For clarity, this process can be split into four steps (and an equal +number of functions): determine if segment prefixes overrides can be used; +parse the segment override prefixes, and use them if found; if not found +or cannot be used, use the default segment registers associated with the +operand registers. Once the segment register to use has been identified, +read its value to obtain the segment selector. + +The method to obtain the segment selector depends on several factors. In +32-bit builds, segment selectors are saved into a pt_regs structure +when switching to kernel mode. The same is also true for virtual-8086 +mode. In 64-bit builds, segmentation is mostly ignored, except when +running a program in 32-bit legacy mode. In this case, CS and SS can be +obtained from pt_regs. DS, ES, FS and GS can be read directly from +the respective segment registers. + +In order to identify the segment registers, a new set of #defines is +introduced. It also includes two special identifiers. 
One of them +indicates when the default segment register associated with instruction +operands shall be used. Another one indicates that the contents of the +segment register shall be ignored; this identifier is used when in long +mode. + +Improvements-by: Borislav Petkov +Signed-off-by: Ricardo Neri +Signed-off-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Cc: "Michael S. Tsirkin" +Cc: Peter Zijlstra +Cc: Dave Hansen +Cc: ricardo.neri@intel.com +Cc: Adrian Hunter +Cc: Paul Gortmaker +Cc: Huang Rui +Cc: Qiaowei Ren +Cc: Shuah Khan +Cc: Kees Cook +Cc: Jonathan Corbet +Cc: Jiri Slaby +Cc: Dmitry Vyukov +Cc: "Ravi V. Shankar" +Cc: Chris Metcalf +Cc: Brian Gerst +Cc: Arnaldo Carvalho de Melo +Cc: Andy Lutomirski +Cc: Colin Ian King +Cc: Chen Yucong +Cc: Adam Buchbinder +Cc: Vlastimil Babka +Cc: Lorenzo Stoakes +Cc: Masami Hiramatsu +Cc: Paolo Bonzini +Cc: Andrew Morton +Cc: Thomas Garnier +Link: https://lkml.kernel.org/r/1509135945-13762-14-git-send-email-ricardo.neri-calderon@linux.intel.com +Signed-off-by: Ingo Molnar + +(Partially cherry picked from commit 32d0b95300db03c2b23b2ea2c94769a4a138e79d) + +(cherry picked from commit ca2c18cb10c8beb56dfe21321abdddc724cec4de) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit abd7780592a3687eacc0a295d4d2959bb11ff75f) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/inat.h | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h +index 02aff0867211..1c78580e58be 100644 +--- a/arch/x86/include/asm/inat.h ++++ b/arch/x86/include/asm/inat.h +@@ -97,6 +97,16 @@ + #define INAT_MAKE_GROUP(grp) ((grp << INAT_GRP_OFFS) | INAT_MODRM) + #define INAT_MAKE_IMM(imm) (imm << INAT_IMM_OFFS) + ++/* Identifiers for segment registers */ ++#define INAT_SEG_REG_IGNORE 0 ++#define INAT_SEG_REG_DEFAULT 1 ++#define INAT_SEG_REG_CS 2 ++#define INAT_SEG_REG_SS 3 ++#define INAT_SEG_REG_DS 4 ++#define INAT_SEG_REG_ES 
5 ++#define INAT_SEG_REG_FS 6 ++#define INAT_SEG_REG_GS 7 ++ + /* Attribute search APIs */ + extern insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode); + extern int inat_get_last_prefix_id(insn_byte_t last_pfx); +-- +2.14.2 + diff --git a/patches/kernel/0140-x86-entry-64-paravirt-Use-paravirt-safe-macro-to-acc.patch b/patches/kernel/0140-x86-entry-64-paravirt-Use-paravirt-safe-macro-to-acc.patch new file mode 100644 index 0000000..07d50ac --- /dev/null +++ b/patches/kernel/0140-x86-entry-64-paravirt-Use-paravirt-safe-macro-to-acc.patch @@ -0,0 +1,129 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Boris Ostrovsky +Date: Mon, 4 Dec 2017 15:07:07 +0100 +Subject: [PATCH] x86/entry/64/paravirt: Use paravirt-safe macro to access + eflags +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Commit 1d3e53e8624a ("x86/entry/64: Refactor IRQ stacks and make them +NMI-safe") added DEBUG_ENTRY_ASSERT_IRQS_OFF macro that acceses eflags +using 'pushfq' instruction when testing for IF bit. On PV Xen guests +looking at IF flag directly will always see it set, resulting in 'ud2'. + +Introduce SAVE_FLAGS() macro that will use appropriate save_fl pv op when +running paravirt. + +Signed-off-by: Boris Ostrovsky +Signed-off-by: Thomas Gleixner +Reviewed-by: Juergen Gross +Cc: Andy Lutomirski +Cc: Borislav Petkov +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Rik van Riel +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Cc: xen-devel@lists.xenproject.org +Link: https://lkml.kernel.org/r/20171204150604.899457242@linutronix.de +Signed-off-by: Ingo Molnar +(cherry picked from commit e17f8234538d1ff708673f287a42457c4dee720d) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 9f4a274842938ce8d55565ced4f45e7ad4a5da90) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/irqflags.h | 3 +++ + arch/x86/include/asm/paravirt.h | 9 +++++++++ + arch/x86/kernel/asm-offsets_64.c | 3 +++ + arch/x86/entry/entry_64.S | 7 ++++--- + 4 files changed, 19 insertions(+), 3 deletions(-) + +diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h +index ac7692dcfa2e..d937781e1047 100644 +--- a/arch/x86/include/asm/irqflags.h ++++ b/arch/x86/include/asm/irqflags.h +@@ -141,6 +141,9 @@ static inline notrace unsigned long arch_local_irq_save(void) + swapgs; \ + sysretl + ++#ifdef CONFIG_DEBUG_ENTRY ++#define SAVE_FLAGS(x) pushfq; popq %rax ++#endif + #else + #define INTERRUPT_RETURN iret + #define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit +diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h +index 43d4f90edebc..52dcd2361a78 100644 +--- a/arch/x86/include/asm/paravirt.h ++++ b/arch/x86/include/asm/paravirt.h +@@ -926,6 +926,15 @@ extern void default_banner(void); + PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \ + CLBR_NONE, \ + jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64)) ++ ++#ifdef CONFIG_DEBUG_ENTRY ++#define SAVE_FLAGS(clobbers) \ ++ PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_save_fl), clobbers, \ ++ PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \ ++ call PARA_INDIRECT(pv_irq_ops+PV_IRQ_save_fl); \ ++ PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) ++#endif ++ + #endif /* 
CONFIG_X86_32 */ + + #endif /* __ASSEMBLY__ */ +diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c +index cf42206926af..c21a5315b38e 100644 +--- a/arch/x86/kernel/asm-offsets_64.c ++++ b/arch/x86/kernel/asm-offsets_64.c +@@ -22,6 +22,9 @@ int main(void) + #ifdef CONFIG_PARAVIRT + OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); + OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); ++#ifdef CONFIG_DEBUG_ENTRY ++ OFFSET(PV_IRQ_save_fl, pv_irq_ops, save_fl); ++#endif + BLANK(); + #endif + +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index 2491b3b25b9a..6c73e96daf78 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -461,12 +461,13 @@ END(irq_entries_start) + + .macro DEBUG_ENTRY_ASSERT_IRQS_OFF + #ifdef CONFIG_DEBUG_ENTRY +- pushfq +- testl $X86_EFLAGS_IF, (%rsp) ++ pushq %rax ++ SAVE_FLAGS(CLBR_RAX) ++ testl $X86_EFLAGS_IF, %eax + jz .Lokay_\@ + ud2 + .Lokay_\@: +- addq $8, %rsp ++ popq %rax + #endif + .endm + +-- +2.14.2 + diff --git a/patches/kernel/0140-x86-unwinder-orc-Dont-bail-on-stack-overflow.patch b/patches/kernel/0140-x86-unwinder-orc-Dont-bail-on-stack-overflow.patch deleted file mode 100644 index b7267e8..0000000 --- a/patches/kernel/0140-x86-unwinder-orc-Dont-bail-on-stack-overflow.patch +++ /dev/null @@ -1,91 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Mon, 4 Dec 2017 15:07:08 +0100 -Subject: [PATCH] x86/unwinder/orc: Dont bail on stack overflow -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -If the stack overflows into a guard page and the ORC unwinder should work -well: by construction, there can't be any meaningful data in the guard page -because no writes to the guard page will have succeeded. 
- -But there is a bug that prevents unwinding from working correctly: if the -starting register state has RSP pointing into a stack guard page, the ORC -unwinder bails out immediately. - -Instead of bailing out immediately check whether the next page up is a -valid check page and if so analyze that. As a result the ORC unwinder will -start the unwind. - -Tested by intentionally overflowing the task stack. The result is an -accurate call trace instead of a trace consisting purely of '?' entries. - -There are a few other bugs that are triggered if the unwinder encounters a -stack overflow after the first step, but they are outside the scope of this -fix. - -Signed-off-by: Andy Lutomirski -Signed-off-by: Thomas Gleixner -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Rik van Riel -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Link: https://lkml.kernel.org/r/20171204150604.991389777@linutronix.de -Signed-off-by: Ingo Molnar -(cherry picked from commit d3a09104018cf2ad5973dfa8a9c138ef9f5015a3) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit e5c3115ac69cddd384d6f7abc4a0ef030b247498) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/unwind_orc.c | 14 ++++++++++++-- - 1 file changed, 12 insertions(+), 2 deletions(-) - -diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c -index 570b70d3f604..cea85bfe93f7 100644 ---- a/arch/x86/kernel/unwind_orc.c -+++ b/arch/x86/kernel/unwind_orc.c -@@ -552,8 +552,18 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, - } - - if (get_stack_info((unsigned long *)state->sp, state->task, -- &state->stack_info, 
&state->stack_mask)) -- return; -+ &state->stack_info, &state->stack_mask)) { -+ /* -+ * We weren't on a valid stack. It's possible that -+ * we overflowed a valid stack into a guard page. -+ * See if the next page up is valid so that we can -+ * generate some kind of backtrace if this happens. -+ */ -+ void *next_page = (void *)PAGE_ALIGN((unsigned long)state->sp); -+ if (get_stack_info(next_page, state->task, &state->stack_info, -+ &state->stack_mask)) -+ return; -+ } - - /* - * The caller can provide the address of the first frame directly --- -2.14.2 - diff --git a/patches/kernel/0141-x86-unwinder-Handle-stack-overflows-more-gracefully.patch b/patches/kernel/0141-x86-unwinder-Handle-stack-overflows-more-gracefully.patch deleted file mode 100644 index 90e76d7..0000000 --- a/patches/kernel/0141-x86-unwinder-Handle-stack-overflows-more-gracefully.patch +++ /dev/null @@ -1,336 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Josh Poimboeuf -Date: Mon, 4 Dec 2017 15:07:09 +0100 -Subject: [PATCH] x86/unwinder: Handle stack overflows more gracefully -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -There are at least two unwinder bugs hindering the debugging of -stack-overflow crashes: - -- It doesn't deal gracefully with the case where the stack overflows and - the stack pointer itself isn't on a valid stack but the - to-be-dereferenced data *is*. - -- The ORC oops dump code doesn't know how to print partial pt_regs, for the - case where if we get an interrupt/exception in *early* entry code - before the full pt_regs have been saved. - -Fix both issues. 
- -http://lkml.kernel.org/r/20171126024031.uxi4numpbjm5rlbr@treble - -Signed-off-by: Josh Poimboeuf -Signed-off-by: Thomas Gleixner -Reviewed-by: Borislav Petkov -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. Peter Anvin -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Rik van Riel -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Link: https://lkml.kernel.org/r/20171204150605.071425003@linutronix.de -Signed-off-by: Ingo Molnar -(backported from commit b02fcf9ba1211097754b286043cd87a8b4907e75) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 9e51f396b068c3e8495cd130113e2f73b2b1f6b0) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/kdebug.h | 1 + - arch/x86/include/asm/unwind.h | 7 ++++ - arch/x86/kernel/dumpstack.c | 29 ++++++++++++++-- - arch/x86/kernel/process_64.c | 12 +++---- - arch/x86/kernel/unwind_orc.c | 78 +++++++++++++++---------------------------- - 5 files changed, 67 insertions(+), 60 deletions(-) - -diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h -index 29a594a3b82a..2a7769dd8fa2 100644 ---- a/arch/x86/include/asm/kdebug.h -+++ b/arch/x86/include/asm/kdebug.h -@@ -25,6 +25,7 @@ extern void die(const char *, struct pt_regs *,long); - extern int __must_check __die(const char *, struct pt_regs *, long); - extern void show_stack_regs(struct pt_regs *regs); - extern void __show_regs(struct pt_regs *regs, int all); -+extern void show_iret_regs(struct pt_regs *regs); - extern unsigned long oops_begin(void); - extern void oops_end(unsigned long, struct pt_regs *, int signr); - -diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h -index 35d67dc7b69f..38fa6154e382 100644 ---- a/arch/x86/include/asm/unwind.h -+++ 
b/arch/x86/include/asm/unwind.h -@@ -6,6 +6,9 @@ - #include - #include - -+#define IRET_FRAME_OFFSET (offsetof(struct pt_regs, ip)) -+#define IRET_FRAME_SIZE (sizeof(struct pt_regs) - IRET_FRAME_OFFSET) -+ - struct unwind_state { - struct stack_info stack_info; - unsigned long stack_mask; -@@ -51,6 +54,10 @@ void unwind_start(struct unwind_state *state, struct task_struct *task, - } - - #if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER) -+/* -+ * WARNING: The entire pt_regs may not be safe to dereference. In some cases, -+ * only the iret frame registers are accessible. Use with caution! -+ */ - static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) - { - if (unwind_done(state)) -diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c -index dbce3cca94cb..695cdce5dfc8 100644 ---- a/arch/x86/kernel/dumpstack.c -+++ b/arch/x86/kernel/dumpstack.c -@@ -50,6 +50,28 @@ static void printk_stack_address(unsigned long address, int reliable, - printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address); - } - -+void show_iret_regs(struct pt_regs *regs) -+{ -+ printk(KERN_DEFAULT "RIP: %04x:%pS\n", (int)regs->cs, (void *)regs->ip); -+ printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss, -+ regs->sp, regs->flags); -+} -+ -+static void show_regs_safe(struct stack_info *info, struct pt_regs *regs) -+{ -+ if (on_stack(info, regs, sizeof(*regs))) -+ __show_regs(regs, 0); -+ else if (on_stack(info, (void *)regs + IRET_FRAME_OFFSET, -+ IRET_FRAME_SIZE)) { -+ /* -+ * When an interrupt or exception occurs in entry code, the -+ * full pt_regs might not have been saved yet. In that case -+ * just print the iret frame. 
-+ */ -+ show_iret_regs(regs); -+ } -+} -+ - void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, char *log_lvl) - { -@@ -94,6 +116,9 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - if (stack_name) - printk("%s <%s>\n", log_lvl, stack_name); - -+ if (regs) -+ show_regs_safe(&stack_info, regs); -+ - /* - * Scan the stack, printing any text addresses we find. At the - * same time, follow proper stack frames with the unwinder. -@@ -116,7 +141,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - - /* - * Don't print regs->ip again if it was already printed -- * by __show_regs() below. -+ * by show_regs_safe() below. - */ - if (regs && stack == &regs->ip) { - unwind_next_frame(&state); -@@ -154,7 +179,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - /* if the frame has entry regs, print them */ - regs = unwind_get_entry_regs(&state); - if (regs) -- __show_regs(regs, 0); -+ show_regs_safe(&stack_info, regs); - } - - if (stack_name) -diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c -index b08b9b6c40eb..01b119bebb68 100644 ---- a/arch/x86/kernel/process_64.c -+++ b/arch/x86/kernel/process_64.c -@@ -69,10 +69,8 @@ void __show_regs(struct pt_regs *regs, int all) - unsigned int fsindex, gsindex; - unsigned int ds, cs, es; - -- printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs & 0xffff, -- (void *)regs->ip); -- printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss, -- regs->sp, regs->flags); -+ show_iret_regs(regs); -+ - if (regs->orig_ax != -1) - pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax); - else -@@ -89,6 +87,9 @@ void __show_regs(struct pt_regs *regs, int all) - printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n", - regs->r13, regs->r14, regs->r15); - -+ if (!all) -+ return; -+ - asm("movl %%ds,%0" : "=r" (ds)); - asm("movl %%cs,%0" : "=r" (cs)); - asm("movl %%es,%0" : "=r" (es)); -@@ -99,9 +100,6 @@ void 
__show_regs(struct pt_regs *regs, int all) - rdmsrl(MSR_GS_BASE, gs); - rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); - -- if (!all) -- return; -- - cr0 = read_cr0(); - cr2 = read_cr2(); - cr3 = __read_cr3(); -diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c -index cea85bfe93f7..702f15f6b5be 100644 ---- a/arch/x86/kernel/unwind_orc.c -+++ b/arch/x86/kernel/unwind_orc.c -@@ -253,22 +253,15 @@ unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) - return NULL; - } - --static bool stack_access_ok(struct unwind_state *state, unsigned long addr, -+static bool stack_access_ok(struct unwind_state *state, unsigned long _addr, - size_t len) - { - struct stack_info *info = &state->stack_info; -+ void *addr = (void *)_addr; - -- /* -- * If the address isn't on the current stack, switch to the next one. -- * -- * We may have to traverse multiple stacks to deal with the possibility -- * that info->next_sp could point to an empty stack and the address -- * could be on a subsequent stack. -- */ -- while (!on_stack(info, (void *)addr, len)) -- if (get_stack_info(info->next_sp, state->task, info, -- &state->stack_mask)) -- return false; -+ if (!on_stack(info, addr, len) && -+ (get_stack_info(addr, state->task, info, &state->stack_mask))) -+ return false; - - return true; - } -@@ -283,42 +276,32 @@ static bool deref_stack_reg(struct unwind_state *state, unsigned long addr, - return true; - } - --#define REGS_SIZE (sizeof(struct pt_regs)) --#define SP_OFFSET (offsetof(struct pt_regs, sp)) --#define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip)) --#define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip)) -- - static bool deref_stack_regs(struct unwind_state *state, unsigned long addr, -- unsigned long *ip, unsigned long *sp, bool full) -+ unsigned long *ip, unsigned long *sp) - { -- size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE; -- size_t sp_offset = full ? 
SP_OFFSET : IRET_SP_OFFSET; -- struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE); -- -- if (IS_ENABLED(CONFIG_X86_64)) { -- if (!stack_access_ok(state, addr, regs_size)) -- return false; -- -- *ip = regs->ip; -- *sp = regs->sp; -+ struct pt_regs *regs = (struct pt_regs *)addr; - -- return true; -- } -+ /* x86-32 support will be more complicated due to the &regs->sp hack */ -+ BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_32)); - -- if (!stack_access_ok(state, addr, sp_offset)) -+ if (!stack_access_ok(state, addr, sizeof(struct pt_regs))) - return false; - - *ip = regs->ip; -+ *sp = regs->sp; -+ return true; -+} - -- if (user_mode(regs)) { -- if (!stack_access_ok(state, addr + sp_offset, -- REGS_SIZE - SP_OFFSET)) -- return false; -+static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr, -+ unsigned long *ip, unsigned long *sp) -+{ -+ struct pt_regs *regs = (void *)addr - IRET_FRAME_OFFSET; - -- *sp = regs->sp; -- } else -- *sp = (unsigned long)&regs->sp; -+ if (!stack_access_ok(state, addr, IRET_FRAME_SIZE)) -+ return false; - -+ *ip = regs->ip; -+ *sp = regs->sp; - return true; - } - -@@ -327,7 +310,6 @@ bool unwind_next_frame(struct unwind_state *state) - unsigned long ip_p, sp, orig_ip, prev_sp = state->sp; - enum stack_type prev_type = state->stack_info.type; - struct orc_entry *orc; -- struct pt_regs *ptregs; - bool indirect = false; - - if (unwind_done(state)) -@@ -435,8 +417,8 @@ bool unwind_next_frame(struct unwind_state *state) - break; - - case ORC_TYPE_REGS: -- if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) { -- orc_warn("can't dereference registers at %p for ip %p\n", -+ if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) { -+ orc_warn("can't dereference registers at %p for ip %pB\n", - (void *)sp, (void *)orig_ip); - goto done; - } -@@ -447,20 +429,14 @@ bool unwind_next_frame(struct unwind_state *state) - break; - - case ORC_TYPE_REGS_IRET: -- if (!deref_stack_regs(state, sp, &state->ip,
&state->sp, false)) { -- orc_warn("can't dereference iret registers at %p for ip %p\n", -+ if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) { -+ orc_warn("can't dereference iret registers at %p for ip %pB\n", - (void *)sp, (void *)orig_ip); - goto done; - } - -- ptregs = container_of((void *)sp, struct pt_regs, ip); -- if ((unsigned long)ptregs >= prev_sp && -- on_stack(&state->stack_info, ptregs, REGS_SIZE)) { -- state->regs = ptregs; -- state->full_regs = false; -- } else -- state->regs = NULL; -- -+ state->regs = (void *)sp - IRET_FRAME_OFFSET; -+ state->full_regs = false; - state->signal = true; - break; - --- -2.14.2 - diff --git a/patches/kernel/0141-x86-unwinder-orc-Dont-bail-on-stack-overflow.patch b/patches/kernel/0141-x86-unwinder-orc-Dont-bail-on-stack-overflow.patch new file mode 100644 index 0000000..b7267e8 --- /dev/null +++ b/patches/kernel/0141-x86-unwinder-orc-Dont-bail-on-stack-overflow.patch @@ -0,0 +1,91 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Mon, 4 Dec 2017 15:07:08 +0100 +Subject: [PATCH] x86/unwinder/orc: Dont bail on stack overflow +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +If the stack overflows into a guard page and the ORC unwinder should work +well: by construction, there can't be any meaningful data in the guard page +because no writes to the guard page will have succeeded. + +But there is a bug that prevents unwinding from working correctly: if the +starting register state has RSP pointing into a stack guard page, the ORC +unwinder bails out immediately. + +Instead of bailing out immediately check whether the next page up is a +valid check page and if so analyze that. As a result the ORC unwinder will +start the unwind. + +Tested by intentionally overflowing the task stack. The result is an +accurate call trace instead of a trace consisting purely of '?' entries. 
+ +There are a few other bugs that are triggered if the unwinder encounters a +stack overflow after the first step, but they are outside the scope of this +fix. + +Signed-off-by: Andy Lutomirski +Signed-off-by: Thomas Gleixner +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Rik van Riel +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150604.991389777@linutronix.de +Signed-off-by: Ingo Molnar +(cherry picked from commit d3a09104018cf2ad5973dfa8a9c138ef9f5015a3) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit e5c3115ac69cddd384d6f7abc4a0ef030b247498) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/unwind_orc.c | 14 ++++++++++++-- + 1 file changed, 12 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c +index 570b70d3f604..cea85bfe93f7 100644 +--- a/arch/x86/kernel/unwind_orc.c ++++ b/arch/x86/kernel/unwind_orc.c +@@ -552,8 +552,18 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, + } + + if (get_stack_info((unsigned long *)state->sp, state->task, +- &state->stack_info, &state->stack_mask)) +- return; ++ &state->stack_info, &state->stack_mask)) { ++ /* ++ * We weren't on a valid stack. It's possible that ++ * we overflowed a valid stack into a guard page. ++ * See if the next page up is valid so that we can ++ * generate some kind of backtrace if this happens. 
++ */ ++ void *next_page = (void *)PAGE_ALIGN((unsigned long)state->sp); ++ if (get_stack_info(next_page, state->task, &state->stack_info, ++ &state->stack_mask)) ++ return; ++ } + + /* + * The caller can provide the address of the first frame directly +-- +2.14.2 + diff --git a/patches/kernel/0142-x86-irq-Remove-an-old-outdated-comment-about-context.patch b/patches/kernel/0142-x86-irq-Remove-an-old-outdated-comment-about-context.patch deleted file mode 100644 index 5e9cc97..0000000 --- a/patches/kernel/0142-x86-irq-Remove-an-old-outdated-comment-about-context.patch +++ /dev/null @@ -1,75 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Mon, 4 Dec 2017 15:07:10 +0100 -Subject: [PATCH] x86/irq: Remove an old outdated comment about context - tracking races -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -That race has been fixed and code cleaned up for a while now. - -Signed-off-by: Andy Lutomirski -Signed-off-by: Thomas Gleixner -Reviewed-by: Borislav Petkov -Reviewed-by: Thomas Gleixner -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Rik van Riel -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Link: https://lkml.kernel.org/r/20171204150605.150551639@linutronix.de -Signed-off-by: Ingo Molnar -(cherry picked from commit 6669a692605547892a026445e460bf233958bd7f) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 7344db7580965d6f9994b6d7b1a74206d7635565) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/irq.c | 12 ------------ - 1 file changed, 12 deletions(-) - -diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c -index 4ed0aba8dbc8..a84142a910f3 100644 ---- a/arch/x86/kernel/irq.c -+++ b/arch/x86/kernel/irq.c -@@ -222,18 +222,6 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) - /* high bit used in ret_from_ code */ - unsigned vector = ~regs->orig_ax; - -- /* -- * NB: Unlike exception entries, IRQ entries do not reliably -- * handle context tracking in the low-level entry code. This is -- * because syscall entries execute briefly with IRQs on before -- * updating context tracking state, so we can take an IRQ from -- * kernel mode with CONTEXT_USER. The low-level entry code only -- * updates the context if we came from user mode, so we won't -- * switch to CONTEXT_KERNEL. We'll fix that once the syscall -- * code is cleaned up enough that we can cleanly defer enabling -- * IRQs. -- */ -- - entering_irq(); - - /* entering_irq() tells RCU that we're not quiescent. Check it. 
*/ --- -2.14.2 - diff --git a/patches/kernel/0142-x86-unwinder-Handle-stack-overflows-more-gracefully.patch b/patches/kernel/0142-x86-unwinder-Handle-stack-overflows-more-gracefully.patch new file mode 100644 index 0000000..90e76d7 --- /dev/null +++ b/patches/kernel/0142-x86-unwinder-Handle-stack-overflows-more-gracefully.patch @@ -0,0 +1,336 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf +Date: Mon, 4 Dec 2017 15:07:09 +0100 +Subject: [PATCH] x86/unwinder: Handle stack overflows more gracefully +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +There are at least two unwinder bugs hindering the debugging of +stack-overflow crashes: + +- It doesn't deal gracefully with the case where the stack overflows and + the stack pointer itself isn't on a valid stack but the + to-be-dereferenced data *is*. + +- The ORC oops dump code doesn't know how to print partial pt_regs, for the + case where if we get an interrupt/exception in *early* entry code + before the full pt_regs have been saved. + +Fix both issues. + +http://lkml.kernel.org/r/20171126024031.uxi4numpbjm5rlbr@treble + +Signed-off-by: Josh Poimboeuf +Signed-off-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. 
Peter Anvin +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Rik van Riel +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150605.071425003@linutronix.de +Signed-off-by: Ingo Molnar +(backported from commit b02fcf9ba1211097754b286043cd87a8b4907e75) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 9e51f396b068c3e8495cd130113e2f73b2b1f6b0) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/kdebug.h | 1 + + arch/x86/include/asm/unwind.h | 7 ++++ + arch/x86/kernel/dumpstack.c | 29 ++++++++++++++-- + arch/x86/kernel/process_64.c | 12 +++---- + arch/x86/kernel/unwind_orc.c | 78 +++++++++++++++---------------------------- + 5 files changed, 67 insertions(+), 60 deletions(-) + +diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h +index 29a594a3b82a..2a7769dd8fa2 100644 +--- a/arch/x86/include/asm/kdebug.h ++++ b/arch/x86/include/asm/kdebug.h +@@ -25,6 +25,7 @@ extern void die(const char *, struct pt_regs *,long); + extern int __must_check __die(const char *, struct pt_regs *, long); + extern void show_stack_regs(struct pt_regs *regs); + extern void __show_regs(struct pt_regs *regs, int all); ++extern void show_iret_regs(struct pt_regs *regs); + extern unsigned long oops_begin(void); + extern void oops_end(unsigned long, struct pt_regs *, int signr); + +diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h +index 35d67dc7b69f..38fa6154e382 100644 +--- a/arch/x86/include/asm/unwind.h ++++ b/arch/x86/include/asm/unwind.h +@@ -6,6 +6,9 @@ + #include + #include + ++#define IRET_FRAME_OFFSET (offsetof(struct pt_regs, ip)) ++#define IRET_FRAME_SIZE (sizeof(struct pt_regs) - IRET_FRAME_OFFSET) ++ + struct unwind_state { + struct stack_info stack_info; + unsigned long stack_mask; +@@ -51,6 +54,10 @@ void unwind_start(struct unwind_state *state, 
struct task_struct *task, + } + + #if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER) ++/* ++ * WARNING: The entire pt_regs may not be safe to dereference. In some cases, ++ * only the iret frame registers are accessible. Use with caution! ++ */ + static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) + { + if (unwind_done(state)) +diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c +index dbce3cca94cb..695cdce5dfc8 100644 +--- a/arch/x86/kernel/dumpstack.c ++++ b/arch/x86/kernel/dumpstack.c +@@ -50,6 +50,28 @@ static void printk_stack_address(unsigned long address, int reliable, + printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address); + } + ++void show_iret_regs(struct pt_regs *regs) ++{ ++ printk(KERN_DEFAULT "RIP: %04x:%pS\n", (int)regs->cs, (void *)regs->ip); ++ printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss, ++ regs->sp, regs->flags); ++} ++ ++static void show_regs_safe(struct stack_info *info, struct pt_regs *regs) ++{ ++ if (on_stack(info, regs, sizeof(*regs))) ++ __show_regs(regs, 0); ++ else if (on_stack(info, (void *)regs + IRET_FRAME_OFFSET, ++ IRET_FRAME_SIZE)) { ++ /* ++ * When an interrupt or exception occurs in entry code, the ++ * full pt_regs might not have been saved yet. In that case ++ * just print the iret frame. ++ */ ++ show_iret_regs(regs); ++ } ++} ++ + void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, char *log_lvl) + { +@@ -94,6 +116,9 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + if (stack_name) + printk("%s <%s>\n", log_lvl, stack_name); + ++ if (regs) ++ show_regs_safe(&stack_info, regs); ++ + /* + * Scan the stack, printing any text addresses we find. At the + * same time, follow proper stack frames with the unwinder. 
+@@ -116,7 +141,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + + /* + * Don't print regs->ip again if it was already printed +- * by __show_regs() below. ++ * by show_regs_safe() below. + */ + if (regs && stack == &regs->ip) { + unwind_next_frame(&state); +@@ -154,7 +179,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + /* if the frame has entry regs, print them */ + regs = unwind_get_entry_regs(&state); + if (regs) +- __show_regs(regs, 0); ++ show_regs_safe(&stack_info, regs); + } + + if (stack_name) +diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c +index b08b9b6c40eb..01b119bebb68 100644 +--- a/arch/x86/kernel/process_64.c ++++ b/arch/x86/kernel/process_64.c +@@ -69,10 +69,8 @@ void __show_regs(struct pt_regs *regs, int all) + unsigned int fsindex, gsindex; + unsigned int ds, cs, es; + +- printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs & 0xffff, +- (void *)regs->ip); +- printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss, +- regs->sp, regs->flags); ++ show_iret_regs(regs); ++ + if (regs->orig_ax != -1) + pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax); + else +@@ -89,6 +87,9 @@ void __show_regs(struct pt_regs *regs, int all) + printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n", + regs->r13, regs->r14, regs->r15); + ++ if (!all) ++ return; ++ + asm("movl %%ds,%0" : "=r" (ds)); + asm("movl %%cs,%0" : "=r" (cs)); + asm("movl %%es,%0" : "=r" (es)); +@@ -99,9 +100,6 @@ void __show_regs(struct pt_regs *regs, int all) + rdmsrl(MSR_GS_BASE, gs); + rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); + +- if (!all) +- return; +- + cr0 = read_cr0(); + cr2 = read_cr2(); + cr3 = __read_cr3(); +diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c +index cea85bfe93f7..702f15f6b5be 100644 +--- a/arch/x86/kernel/unwind_orc.c ++++ b/arch/x86/kernel/unwind_orc.c +@@ -253,22 +253,15 @@ unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) + return NULL; + }
+ +-static bool stack_access_ok(struct unwind_state *state, unsigned long addr, ++static bool stack_access_ok(struct unwind_state *state, unsigned long _addr, + size_t len) + { + struct stack_info *info = &state->stack_info; ++ void *addr = (void *)_addr; + +- /* +- * If the address isn't on the current stack, switch to the next one. +- * +- * We may have to traverse multiple stacks to deal with the possibility +- * that info->next_sp could point to an empty stack and the address +- * could be on a subsequent stack. +- */ +- while (!on_stack(info, (void *)addr, len)) +- if (get_stack_info(info->next_sp, state->task, info, +- &state->stack_mask)) +- return false; ++ if (!on_stack(info, addr, len) && ++ (get_stack_info(addr, state->task, info, &state->stack_mask))) ++ return false; + + return true; + } +@@ -283,42 +276,32 @@ static bool deref_stack_reg(struct unwind_state *state, unsigned long addr, + return true; + } + +-#define REGS_SIZE (sizeof(struct pt_regs)) +-#define SP_OFFSET (offsetof(struct pt_regs, sp)) +-#define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip)) +-#define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip)) +- + static bool deref_stack_regs(struct unwind_state *state, unsigned long addr, +- unsigned long *ip, unsigned long *sp, bool full) ++ unsigned long *ip, unsigned long *sp) + { +- size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE; +- size_t sp_offset = full ? 
SP_OFFSET : IRET_SP_OFFSET; +- struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE); +- +- if (IS_ENABLED(CONFIG_X86_64)) { +- if (!stack_access_ok(state, addr, regs_size)) +- return false; +- +- *ip = regs->ip; +- *sp = regs->sp; ++ struct pt_regs *regs = (struct pt_regs *)addr; + +- return true; +- } ++ /* x86-32 support will be more complicated due to the &regs->sp hack */ ++ BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_32)); + +- if (!stack_access_ok(state, addr, sp_offset)) ++ if (!stack_access_ok(state, addr, sizeof(struct pt_regs))) + return false; + + *ip = regs->ip; ++ *sp = regs->sp; ++ return true; ++} + +- if (user_mode(regs)) { +- if (!stack_access_ok(state, addr + sp_offset, +- REGS_SIZE - SP_OFFSET)) +- return false; ++static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr, ++ unsigned long *ip, unsigned long *sp) ++{ ++ struct pt_regs *regs = (void *)addr - IRET_FRAME_OFFSET; + +- *sp = regs->sp; +- } else +- *sp = (unsigned long)&regs->sp; ++ if (!stack_access_ok(state, addr, IRET_FRAME_SIZE)) ++ return false; + ++ *ip = regs->ip; ++ *sp = regs->sp; + return true; + } + +@@ -327,7 +310,6 @@ bool unwind_next_frame(struct unwind_state *state) + unsigned long ip_p, sp, orig_ip, prev_sp = state->sp; + enum stack_type prev_type = state->stack_info.type; + struct orc_entry *orc; +- struct pt_regs *ptregs; + bool indirect = false; + + if (unwind_done(state)) +@@ -435,8 +417,8 @@ bool unwind_next_frame(struct unwind_state *state) + break; + + case ORC_TYPE_REGS: +- if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) { +- orc_warn("can't dereference registers at %p for ip %p\n", ++ if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) { ++ orc_warn("can't dereference registers at %p for ip %pB\n", + (void *)sp, (void *)orig_ip); + goto done; + } +@@ -447,20 +429,14 @@ bool unwind_next_frame(struct unwind_state *state) + break; + + case ORC_TYPE_REGS_IRET: +- if (!deref_stack_regs(state, sp, &state->ip,
&state->sp, false)) { +- orc_warn("can't dereference iret registers at %p for ip %p\n", ++ if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) { ++ orc_warn("can't dereference iret registers at %p for ip %pB\n", + (void *)sp, (void *)orig_ip); + goto done; + } + +- ptregs = container_of((void *)sp, struct pt_regs, ip); +- if ((unsigned long)ptregs >= prev_sp && +- on_stack(&state->stack_info, ptregs, REGS_SIZE)) { +- state->regs = ptregs; +- state->full_regs = false; +- } else +- state->regs = NULL; +- ++ state->regs = (void *)sp - IRET_FRAME_OFFSET; ++ state->full_regs = false; + state->signal = true; + break; + +-- +2.14.2 + diff --git a/patches/kernel/0143-x86-irq-64-Print-the-offending-IP-in-the-stack-overf.patch b/patches/kernel/0143-x86-irq-64-Print-the-offending-IP-in-the-stack-overf.patch deleted file mode 100644 index 1231f6f..0000000 --- a/patches/kernel/0143-x86-irq-64-Print-the-offending-IP-in-the-stack-overf.patch +++ /dev/null @@ -1,70 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Mon, 4 Dec 2017 15:07:11 +0100 -Subject: [PATCH] x86/irq/64: Print the offending IP in the stack overflow - warning -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -In case something goes wrong with unwind (not unlikely in case of -overflow), print the offending IP where we detected the overflow. - -Signed-off-by: Andy Lutomirski -Signed-off-by: Thomas Gleixner -Reviewed-by: Borislav Petkov -Reviewed-by: Thomas Gleixner -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Rik van Riel -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Link: https://lkml.kernel.org/r/20171204150605.231677119@linutronix.de -Signed-off-by: Ingo Molnar -(cherry picked from commit 4f3789e792296e21405f708cf3cb409d7c7d5683) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit aa820446b0d31df0870b176257b40baadaf4444c) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/irq_64.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c -index 3be74fbdeff2..feca14980e32 100644 ---- a/arch/x86/kernel/irq_64.c -+++ b/arch/x86/kernel/irq_64.c -@@ -56,10 +56,10 @@ static inline void stack_overflow_check(struct pt_regs *regs) - if (regs->sp >= estack_top && regs->sp <= estack_bottom) - return; - -- WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n", -+ WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n", - current->comm, curbase, regs->sp, - irq_stack_top, irq_stack_bottom, -- estack_top, estack_bottom); -+ estack_top, estack_bottom, (void *)regs->ip); - - if (sysctl_panic_on_stackoverflow) - panic("low stack detected by irq handler - check messages\n"); --- -2.14.2 - diff --git a/patches/kernel/0143-x86-irq-Remove-an-old-outdated-comment-about-context.patch b/patches/kernel/0143-x86-irq-Remove-an-old-outdated-comment-about-context.patch new file mode 100644 index 0000000..5e9cc97 --- /dev/null +++ b/patches/kernel/0143-x86-irq-Remove-an-old-outdated-comment-about-context.patch @@ -0,0 +1,75 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Mon, 4 
Dec 2017 15:07:10 +0100 +Subject: [PATCH] x86/irq: Remove an old outdated comment about context + tracking races +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +That race has been fixed and code cleaned up for a while now. + +Signed-off-by: Andy Lutomirski +Signed-off-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Reviewed-by: Thomas Gleixner +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Rik van Riel +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150605.150551639@linutronix.de +Signed-off-by: Ingo Molnar +(cherry picked from commit 6669a692605547892a026445e460bf233958bd7f) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 7344db7580965d6f9994b6d7b1a74206d7635565) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/irq.c | 12 ------------ + 1 file changed, 12 deletions(-) + +diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c +index 4ed0aba8dbc8..a84142a910f3 100644 +--- a/arch/x86/kernel/irq.c ++++ b/arch/x86/kernel/irq.c +@@ -222,18 +222,6 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) + /* high bit used in ret_from_ code */ + unsigned vector = ~regs->orig_ax; + +- /* +- * NB: Unlike exception entries, IRQ entries do not reliably +- * handle context tracking in the low-level entry code. This is +- * because syscall entries execute briefly with IRQs on before +- * updating context tracking state, so we can take an IRQ from +- * kernel mode with CONTEXT_USER. 
The low-level entry code only +- * updates the context if we came from user mode, so we won't +- * switch to CONTEXT_KERNEL. We'll fix that once the syscall +- * code is cleaned up enough that we can cleanly defer enabling +- * IRQs. +- */ +- + entering_irq(); + + /* entering_irq() tells RCU that we're not quiescent. Check it. */ +-- +2.14.2 + diff --git a/patches/kernel/0144-x86-entry-64-Allocate-and-enable-the-SYSENTER-stack.patch b/patches/kernel/0144-x86-entry-64-Allocate-and-enable-the-SYSENTER-stack.patch deleted file mode 100644 index 8a0b80f..0000000 --- a/patches/kernel/0144-x86-entry-64-Allocate-and-enable-the-SYSENTER-stack.patch +++ /dev/null @@ -1,182 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Mon, 4 Dec 2017 15:07:12 +0100 -Subject: [PATCH] x86/entry/64: Allocate and enable the SYSENTER stack -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -This will simplify future changes that want scratch variables early in -the SYSENTER handler -- they'll be able to spill registers to the -stack. It also lets us get rid of a SWAPGS_UNSAFE_STACK user. - -This does not depend on CONFIG_IA32_EMULATION=y because we'll want the -stack space even without IA32 emulation. - -As far as I can tell, the reason that this wasn't done from day 1 is -that we use IST for #DB and #BP, which is IMO rather nasty and causes -a lot more problems than it solves. But, since #DB uses IST, we don't -actually need a real stack for SYSENTER (because SYSENTER with TF set -will invoke #DB on the IST stack rather than the SYSENTER stack). - -I want to remove IST usage from these vectors some day, and this patch -is a prerequisite for that as well. 
- -Signed-off-by: Andy Lutomirski -Signed-off-by: Thomas Gleixner -Reviewed-by: Thomas Gleixner -Reviewed-by: Borislav Petkov -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Rik van Riel -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Link: https://lkml.kernel.org/r/20171204150605.312726423@linutronix.de -Signed-off-by: Ingo Molnar -(cherry picked from commit 1a79797b58cddfa948420a7553241c79c013e3ca) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 8e621515fa8d1649b031f34b9d498dcd865db1c3) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/processor.h | 3 --- - arch/x86/kernel/asm-offsets.c | 5 +++++ - arch/x86/kernel/asm-offsets_32.c | 5 ----- - arch/x86/kernel/cpu/common.c | 4 +++- - arch/x86/kernel/process.c | 2 -- - arch/x86/kernel/traps.c | 3 +-- - arch/x86/entry/entry_64_compat.S | 2 +- - 7 files changed, 10 insertions(+), 14 deletions(-) - -diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h -index 79739e5f939a..5225917f9760 100644 ---- a/arch/x86/include/asm/processor.h -+++ b/arch/x86/include/asm/processor.h -@@ -333,14 +333,11 @@ struct tss_struct { - */ - unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; - --#ifdef CONFIG_X86_32 - /* - * Space for the temporary SYSENTER stack. 
- */ - unsigned long SYSENTER_stack_canary; - unsigned long SYSENTER_stack[64]; --#endif -- - } ____cacheline_aligned; - - DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss); -diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c -index de827d6ac8c2..031bd35bd911 100644 ---- a/arch/x86/kernel/asm-offsets.c -+++ b/arch/x86/kernel/asm-offsets.c -@@ -92,4 +92,9 @@ void common(void) { - - BLANK(); - DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); -+ -+ /* Offset from cpu_tss to SYSENTER_stack */ -+ OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack); -+ /* Size of SYSENTER_stack */ -+ DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack)); - } -diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c -index 880aa093268d..d09b161a3bd0 100644 ---- a/arch/x86/kernel/asm-offsets_32.c -+++ b/arch/x86/kernel/asm-offsets_32.c -@@ -52,11 +52,6 @@ void foo(void) - DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - - offsetofend(struct tss_struct, SYSENTER_stack)); - -- /* Offset from cpu_tss to SYSENTER_stack */ -- OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack); -- /* Size of SYSENTER_stack */ -- DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack)); -- - #ifdef CONFIG_CC_STACKPROTECTOR - BLANK(); - OFFSET(stack_canary_offset, stack_canary, canary); -diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c -index 121fe3570d6f..aa97e4cd3a33 100644 ---- a/arch/x86/kernel/cpu/common.c -+++ b/arch/x86/kernel/cpu/common.c -@@ -1362,7 +1362,9 @@ void syscall_init(void) - * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). 
- */ - wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); -- wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); -+ wrmsrl_safe(MSR_IA32_SYSENTER_ESP, -+ (unsigned long)this_cpu_ptr(&cpu_tss) + -+ offsetofend(struct tss_struct, SYSENTER_stack)); - wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); - #else - wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); -diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c -index ccf3a4f4ef68..aa86e810fb54 100644 ---- a/arch/x86/kernel/process.c -+++ b/arch/x86/kernel/process.c -@@ -70,9 +70,7 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { - */ - .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, - #endif --#ifdef CONFIG_X86_32 - .SYSENTER_stack_canary = STACK_END_MAGIC, --#endif - }; - EXPORT_PER_CPU_SYMBOL(cpu_tss); - -diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c -index 3a46cab2696e..7b1d0df624cf 100644 ---- a/arch/x86/kernel/traps.c -+++ b/arch/x86/kernel/traps.c -@@ -806,14 +806,13 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) - debug_stack_usage_dec(); - - exit: --#if defined(CONFIG_X86_32) - /* - * This is the most likely code path that involves non-trivial use - * of the SYSENTER stack. Check that we haven't overrun it. - */ - WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC, - "Overran or corrupted SYSENTER stack\n"); --#endif -+ - ist_exit(regs); - } - NOKPROBE_SYMBOL(do_debug); -diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S -index be745b7a3e3e..1f76b66518ee 100644 ---- a/arch/x86/entry/entry_64_compat.S -+++ b/arch/x86/entry/entry_64_compat.S -@@ -47,7 +47,7 @@ - */ - ENTRY(entry_SYSENTER_compat) - /* Interrupts are off on entry. 
*/ -- SWAPGS_UNSAFE_STACK -+ SWAPGS - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - - /* --- -2.14.2 - diff --git a/patches/kernel/0144-x86-irq-64-Print-the-offending-IP-in-the-stack-overf.patch b/patches/kernel/0144-x86-irq-64-Print-the-offending-IP-in-the-stack-overf.patch new file mode 100644 index 0000000..1231f6f --- /dev/null +++ b/patches/kernel/0144-x86-irq-64-Print-the-offending-IP-in-the-stack-overf.patch @@ -0,0 +1,70 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Mon, 4 Dec 2017 15:07:11 +0100 +Subject: [PATCH] x86/irq/64: Print the offending IP in the stack overflow + warning +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +In case something goes wrong with unwind (not unlikely in case of +overflow), print the offending IP where we detected the overflow. + +Signed-off-by: Andy Lutomirski +Signed-off-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Reviewed-by: Thomas Gleixner +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Rik van Riel +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150605.231677119@linutronix.de +Signed-off-by: Ingo Molnar +(cherry picked from commit 4f3789e792296e21405f708cf3cb409d7c7d5683) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit aa820446b0d31df0870b176257b40baadaf4444c) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/irq_64.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c +index 3be74fbdeff2..feca14980e32 100644 +--- a/arch/x86/kernel/irq_64.c ++++ b/arch/x86/kernel/irq_64.c +@@ -56,10 +56,10 @@ static inline void stack_overflow_check(struct pt_regs *regs) + if (regs->sp >= estack_top && regs->sp <= estack_bottom) + return; + +- WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n", ++ WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n", + current->comm, curbase, regs->sp, + irq_stack_top, irq_stack_bottom, +- estack_top, estack_bottom); ++ estack_top, estack_bottom, (void *)regs->ip); + + if (sysctl_panic_on_stackoverflow) + panic("low stack detected by irq handler - check messages\n"); +-- +2.14.2 + diff --git a/patches/kernel/0145-x86-dumpstack-Add-get_stack_info-support-for-the-SYS.patch b/patches/kernel/0145-x86-dumpstack-Add-get_stack_info-support-for-the-SYS.patch deleted file mode 100644 index fcf8851..0000000 --- a/patches/kernel/0145-x86-dumpstack-Add-get_stack_info-support-for-the-SYS.patch +++ /dev/null @@ -1,184 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: 
Mon, 4 Dec 2017 15:07:13 +0100 -Subject: [PATCH] x86/dumpstack: Add get_stack_info() support for the SYSENTER - stack -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -get_stack_info() doesn't currently know about the SYSENTER stack, so -unwinding will fail if we entered the kernel on the SYSENTER stack -and haven't fully switched off. Teach get_stack_info() about the -SYSENTER stack. - -With future patches applied that run part of the entry code on the -SYSENTER stack and introduce an intentional BUG(), I would get: - - PANIC: double fault, error_code: 0x0 - ... - RIP: 0010:do_error_trap+0x33/0x1c0 - ... - Call Trace: - Code: ... - -With this patch, I get: - - PANIC: double fault, error_code: 0x0 - ... - Call Trace: - - ? async_page_fault+0x36/0x60 - ? invalid_op+0x22/0x40 - ? async_page_fault+0x36/0x60 - ? sync_regs+0x3c/0x40 - ? sync_regs+0x2e/0x40 - ? error_entry+0x6c/0xd0 - ? async_page_fault+0x36/0x60 - - Code: ... - -which is a lot more informative. - -Signed-off-by: Andy Lutomirski -Signed-off-by: Thomas Gleixner -Reviewed-by: Borislav Petkov -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Rik van Riel -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Link: https://lkml.kernel.org/r/20171204150605.392711508@linutronix.de -Signed-off-by: Ingo Molnar -(cherry picked from commit 33a2f1a6c4d7c0a02d1c006fb0379cc5ca3b96bb) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 72e90cc5463cf882c5f9508817029d85b317f2b5) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/stacktrace.h | 3 +++ - arch/x86/kernel/dumpstack.c | 19 +++++++++++++++++++ - arch/x86/kernel/dumpstack_32.c | 6 ++++++ - arch/x86/kernel/dumpstack_64.c | 6 ++++++ - 4 files changed, 34 insertions(+) - -diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h -index 2e41c50ddf47..95f999576131 100644 ---- a/arch/x86/include/asm/stacktrace.h -+++ b/arch/x86/include/asm/stacktrace.h -@@ -15,6 +15,7 @@ enum stack_type { - STACK_TYPE_TASK, - STACK_TYPE_IRQ, - STACK_TYPE_SOFTIRQ, -+ STACK_TYPE_SYSENTER, - STACK_TYPE_EXCEPTION, - STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1, - }; -@@ -27,6 +28,8 @@ struct stack_info { - bool in_task_stack(unsigned long *stack, struct task_struct *task, - struct stack_info *info); - -+bool in_sysenter_stack(unsigned long *stack, struct stack_info *info); -+ - int get_stack_info(unsigned long *stack, struct task_struct *task, - struct stack_info *info, unsigned long *visit_mask); - -diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c -index 695cdce5dfc8..c211cbdff709 100644 ---- a/arch/x86/kernel/dumpstack.c -+++ b/arch/x86/kernel/dumpstack.c -@@ -43,6 +43,25 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task, - return true; - } - -+bool in_sysenter_stack(unsigned long *stack, struct stack_info *info) -+{ -+ struct tss_struct *tss = this_cpu_ptr(&cpu_tss); 
-+ -+ /* Treat the canary as part of the stack for unwinding purposes. */ -+ void *begin = &tss->SYSENTER_stack_canary; -+ void *end = (void *)&tss->SYSENTER_stack + sizeof(tss->SYSENTER_stack); -+ -+ if ((void *)stack < begin || (void *)stack >= end) -+ return false; -+ -+ info->type = STACK_TYPE_SYSENTER; -+ info->begin = begin; -+ info->end = end; -+ info->next_sp = NULL; -+ -+ return true; -+} -+ - static void printk_stack_address(unsigned long address, int reliable, - char *log_lvl) - { -diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c -index e5f0b40e66d2..3160bf2d100e 100644 ---- a/arch/x86/kernel/dumpstack_32.c -+++ b/arch/x86/kernel/dumpstack_32.c -@@ -25,6 +25,9 @@ const char *stack_type_name(enum stack_type type) - if (type == STACK_TYPE_SOFTIRQ) - return "SOFTIRQ"; - -+ if (type == STACK_TYPE_SYSENTER) -+ return "SYSENTER"; -+ - return NULL; - } - -@@ -92,6 +95,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, - if (task != current) - goto unknown; - -+ if (in_sysenter_stack(stack, info)) -+ goto recursion_check; -+ - if (in_hardirq_stack(stack, info)) - goto recursion_check; - -diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c -index 3e1471d57487..f5107b659f86 100644 ---- a/arch/x86/kernel/dumpstack_64.c -+++ b/arch/x86/kernel/dumpstack_64.c -@@ -36,6 +36,9 @@ const char *stack_type_name(enum stack_type type) - if (type == STACK_TYPE_IRQ) - return "IRQ"; - -+ if (type == STACK_TYPE_SYSENTER) -+ return "SYSENTER"; -+ - if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST) - return exception_stack_names[type - STACK_TYPE_EXCEPTION]; - -@@ -114,6 +117,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, - if (in_irq_stack(stack, info)) - goto recursion_check; - -+ if (in_sysenter_stack(stack, info)) -+ goto recursion_check; -+ - goto unknown; - - recursion_check: --- -2.14.2 - diff --git 
a/patches/kernel/0145-x86-entry-64-Allocate-and-enable-the-SYSENTER-stack.patch b/patches/kernel/0145-x86-entry-64-Allocate-and-enable-the-SYSENTER-stack.patch new file mode 100644 index 0000000..8a0b80f --- /dev/null +++ b/patches/kernel/0145-x86-entry-64-Allocate-and-enable-the-SYSENTER-stack.patch @@ -0,0 +1,182 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Mon, 4 Dec 2017 15:07:12 +0100 +Subject: [PATCH] x86/entry/64: Allocate and enable the SYSENTER stack +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +This will simplify future changes that want scratch variables early in +the SYSENTER handler -- they'll be able to spill registers to the +stack. It also lets us get rid of a SWAPGS_UNSAFE_STACK user. + +This does not depend on CONFIG_IA32_EMULATION=y because we'll want the +stack space even without IA32 emulation. + +As far as I can tell, the reason that this wasn't done from day 1 is +that we use IST for #DB and #BP, which is IMO rather nasty and causes +a lot more problems than it solves. But, since #DB uses IST, we don't +actually need a real stack for SYSENTER (because SYSENTER with TF set +will invoke #DB on the IST stack rather than the SYSENTER stack). + +I want to remove IST usage from these vectors some day, and this patch +is a prerequisite for that as well. + +Signed-off-by: Andy Lutomirski +Signed-off-by: Thomas Gleixner +Reviewed-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Rik van Riel +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150605.312726423@linutronix.de +Signed-off-by: Ingo Molnar +(cherry picked from commit 1a79797b58cddfa948420a7553241c79c013e3ca) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 8e621515fa8d1649b031f34b9d498dcd865db1c3) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/processor.h | 3 --- + arch/x86/kernel/asm-offsets.c | 5 +++++ + arch/x86/kernel/asm-offsets_32.c | 5 ----- + arch/x86/kernel/cpu/common.c | 4 +++- + arch/x86/kernel/process.c | 2 -- + arch/x86/kernel/traps.c | 3 +-- + arch/x86/entry/entry_64_compat.S | 2 +- + 7 files changed, 10 insertions(+), 14 deletions(-) + +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index 79739e5f939a..5225917f9760 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -333,14 +333,11 @@ struct tss_struct { + */ + unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; + +-#ifdef CONFIG_X86_32 + /* + * Space for the temporary SYSENTER stack. 
+ */ + unsigned long SYSENTER_stack_canary; + unsigned long SYSENTER_stack[64]; +-#endif +- + } ____cacheline_aligned; + + DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss); +diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c +index de827d6ac8c2..031bd35bd911 100644 +--- a/arch/x86/kernel/asm-offsets.c ++++ b/arch/x86/kernel/asm-offsets.c +@@ -92,4 +92,9 @@ void common(void) { + + BLANK(); + DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); ++ ++ /* Offset from cpu_tss to SYSENTER_stack */ ++ OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack); ++ /* Size of SYSENTER_stack */ ++ DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack)); + } +diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c +index 880aa093268d..d09b161a3bd0 100644 +--- a/arch/x86/kernel/asm-offsets_32.c ++++ b/arch/x86/kernel/asm-offsets_32.c +@@ -52,11 +52,6 @@ void foo(void) + DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - + offsetofend(struct tss_struct, SYSENTER_stack)); + +- /* Offset from cpu_tss to SYSENTER_stack */ +- OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack); +- /* Size of SYSENTER_stack */ +- DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack)); +- + #ifdef CONFIG_CC_STACKPROTECTOR + BLANK(); + OFFSET(stack_canary_offset, stack_canary, canary); +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 121fe3570d6f..aa97e4cd3a33 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -1362,7 +1362,9 @@ void syscall_init(void) + * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). 
+ */ + wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); +- wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); ++ wrmsrl_safe(MSR_IA32_SYSENTER_ESP, ++ (unsigned long)this_cpu_ptr(&cpu_tss) + ++ offsetofend(struct tss_struct, SYSENTER_stack)); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); + #else + wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); +diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c +index ccf3a4f4ef68..aa86e810fb54 100644 +--- a/arch/x86/kernel/process.c ++++ b/arch/x86/kernel/process.c +@@ -70,9 +70,7 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { + */ + .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, + #endif +-#ifdef CONFIG_X86_32 + .SYSENTER_stack_canary = STACK_END_MAGIC, +-#endif + }; + EXPORT_PER_CPU_SYMBOL(cpu_tss); + +diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c +index 3a46cab2696e..7b1d0df624cf 100644 +--- a/arch/x86/kernel/traps.c ++++ b/arch/x86/kernel/traps.c +@@ -806,14 +806,13 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) + debug_stack_usage_dec(); + + exit: +-#if defined(CONFIG_X86_32) + /* + * This is the most likely code path that involves non-trivial use + * of the SYSENTER stack. Check that we haven't overrun it. + */ + WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC, + "Overran or corrupted SYSENTER stack\n"); +-#endif ++ + ist_exit(regs); + } + NOKPROBE_SYMBOL(do_debug); +diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S +index be745b7a3e3e..1f76b66518ee 100644 +--- a/arch/x86/entry/entry_64_compat.S ++++ b/arch/x86/entry/entry_64_compat.S +@@ -47,7 +47,7 @@ + */ + ENTRY(entry_SYSENTER_compat) + /* Interrupts are off on entry. 
*/ +- SWAPGS_UNSAFE_STACK ++ SWAPGS + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + + /* +-- +2.14.2 + diff --git a/patches/kernel/0146-x86-dumpstack-Add-get_stack_info-support-for-the-SYS.patch b/patches/kernel/0146-x86-dumpstack-Add-get_stack_info-support-for-the-SYS.patch new file mode 100644 index 0000000..fcf8851 --- /dev/null +++ b/patches/kernel/0146-x86-dumpstack-Add-get_stack_info-support-for-the-SYS.patch @@ -0,0 +1,184 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Mon, 4 Dec 2017 15:07:13 +0100 +Subject: [PATCH] x86/dumpstack: Add get_stack_info() support for the SYSENTER + stack +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +get_stack_info() doesn't currently know about the SYSENTER stack, so +unwinding will fail if we entered the kernel on the SYSENTER stack +and haven't fully switched off. Teach get_stack_info() about the +SYSENTER stack. + +With future patches applied that run part of the entry code on the +SYSENTER stack and introduce an intentional BUG(), I would get: + + PANIC: double fault, error_code: 0x0 + ... + RIP: 0010:do_error_trap+0x33/0x1c0 + ... + Call Trace: + Code: ... + +With this patch, I get: + + PANIC: double fault, error_code: 0x0 + ... + Call Trace: + + ? async_page_fault+0x36/0x60 + ? invalid_op+0x22/0x40 + ? async_page_fault+0x36/0x60 + ? sync_regs+0x3c/0x40 + ? sync_regs+0x2e/0x40 + ? error_entry+0x6c/0xd0 + ? async_page_fault+0x36/0x60 + + Code: ... + +which is a lot more informative. + +Signed-off-by: Andy Lutomirski +Signed-off-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Rik van Riel +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150605.392711508@linutronix.de +Signed-off-by: Ingo Molnar +(cherry picked from commit 33a2f1a6c4d7c0a02d1c006fb0379cc5ca3b96bb) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 72e90cc5463cf882c5f9508817029d85b317f2b5) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/stacktrace.h | 3 +++ + arch/x86/kernel/dumpstack.c | 19 +++++++++++++++++++ + arch/x86/kernel/dumpstack_32.c | 6 ++++++ + arch/x86/kernel/dumpstack_64.c | 6 ++++++ + 4 files changed, 34 insertions(+) + +diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h +index 2e41c50ddf47..95f999576131 100644 +--- a/arch/x86/include/asm/stacktrace.h ++++ b/arch/x86/include/asm/stacktrace.h +@@ -15,6 +15,7 @@ enum stack_type { + STACK_TYPE_TASK, + STACK_TYPE_IRQ, + STACK_TYPE_SOFTIRQ, ++ STACK_TYPE_SYSENTER, + STACK_TYPE_EXCEPTION, + STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1, + }; +@@ -27,6 +28,8 @@ struct stack_info { + bool in_task_stack(unsigned long *stack, struct task_struct *task, + struct stack_info *info); + ++bool in_sysenter_stack(unsigned long *stack, struct stack_info *info); ++ + int get_stack_info(unsigned long *stack, struct task_struct *task, + struct stack_info *info, unsigned long *visit_mask); + +diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c +index 695cdce5dfc8..c211cbdff709 100644 +--- a/arch/x86/kernel/dumpstack.c ++++ b/arch/x86/kernel/dumpstack.c +@@ -43,6 +43,25 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task, + return true; + } + ++bool in_sysenter_stack(unsigned long *stack, struct stack_info *info) ++{ ++ struct tss_struct *tss = this_cpu_ptr(&cpu_tss); 
++ ++ /* Treat the canary as part of the stack for unwinding purposes. */ ++ void *begin = &tss->SYSENTER_stack_canary; ++ void *end = (void *)&tss->SYSENTER_stack + sizeof(tss->SYSENTER_stack); ++ ++ if ((void *)stack < begin || (void *)stack >= end) ++ return false; ++ ++ info->type = STACK_TYPE_SYSENTER; ++ info->begin = begin; ++ info->end = end; ++ info->next_sp = NULL; ++ ++ return true; ++} ++ + static void printk_stack_address(unsigned long address, int reliable, + char *log_lvl) + { +diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c +index e5f0b40e66d2..3160bf2d100e 100644 +--- a/arch/x86/kernel/dumpstack_32.c ++++ b/arch/x86/kernel/dumpstack_32.c +@@ -25,6 +25,9 @@ const char *stack_type_name(enum stack_type type) + if (type == STACK_TYPE_SOFTIRQ) + return "SOFTIRQ"; + ++ if (type == STACK_TYPE_SYSENTER) ++ return "SYSENTER"; ++ + return NULL; + } + +@@ -92,6 +95,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, + if (task != current) + goto unknown; + ++ if (in_sysenter_stack(stack, info)) ++ goto recursion_check; ++ + if (in_hardirq_stack(stack, info)) + goto recursion_check; + +diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c +index 3e1471d57487..f5107b659f86 100644 +--- a/arch/x86/kernel/dumpstack_64.c ++++ b/arch/x86/kernel/dumpstack_64.c +@@ -36,6 +36,9 @@ const char *stack_type_name(enum stack_type type) + if (type == STACK_TYPE_IRQ) + return "IRQ"; + ++ if (type == STACK_TYPE_SYSENTER) ++ return "SYSENTER"; ++ + if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST) + return exception_stack_names[type - STACK_TYPE_EXCEPTION]; + +@@ -114,6 +117,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, + if (in_irq_stack(stack, info)) + goto recursion_check; + ++ if (in_sysenter_stack(stack, info)) ++ goto recursion_check; ++ + goto unknown; + + recursion_check: +-- +2.14.2 + diff --git 
a/patches/kernel/0146-x86-entry-gdt-Put-per-CPU-GDT-remaps-in-ascending-or.patch b/patches/kernel/0146-x86-entry-gdt-Put-per-CPU-GDT-remaps-in-ascending-or.patch deleted file mode 100644 index 91f4090..0000000 --- a/patches/kernel/0146-x86-entry-gdt-Put-per-CPU-GDT-remaps-in-ascending-or.patch +++ /dev/null @@ -1,70 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Mon, 4 Dec 2017 15:07:14 +0100 -Subject: [PATCH] x86/entry/gdt: Put per-CPU GDT remaps in ascending order -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -We currently have CPU 0's GDT at the top of the GDT range and -higher-numbered CPUs at lower addresses. This happens because the -fixmap is upside down (index 0 is the top of the fixmap). - -Flip it so that GDTs are in ascending order by virtual address. -This will simplify a future patch that will generalize the GDT -remap to contain multiple pages. - -Signed-off-by: Andy Lutomirski -Signed-off-by: Thomas Gleixner -Reviewed-by: Borislav Petkov -Reviewed-by: Thomas Gleixner -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Rik van Riel -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Link: https://lkml.kernel.org/r/20171204150605.471561421@linutronix.de -Signed-off-by: Ingo Molnar -(cherry picked from commit aaeed3aeb39c1ba69f0a49baec8cb728121d0a91) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 9c37967fad2d6a525df53e0a40edcd652e5abaae) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/desc.h | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h -index f995e5a09136..22ee0a93b4f7 100644 ---- a/arch/x86/include/asm/desc.h -+++ b/arch/x86/include/asm/desc.h -@@ -61,7 +61,7 @@ static inline struct desc_struct *get_current_gdt_rw(void) - /* Get the fixmap index for a specific processor */ - static inline unsigned int get_cpu_gdt_ro_index(int cpu) - { -- return FIX_GDT_REMAP_BEGIN + cpu; -+ return FIX_GDT_REMAP_END - cpu; - } - - /* Provide the fixmap address of the remapped GDT */ --- -2.14.2 - diff --git a/patches/kernel/0147-x86-entry-gdt-Put-per-CPU-GDT-remaps-in-ascending-or.patch b/patches/kernel/0147-x86-entry-gdt-Put-per-CPU-GDT-remaps-in-ascending-or.patch new file mode 100644 index 0000000..91f4090 --- /dev/null +++ b/patches/kernel/0147-x86-entry-gdt-Put-per-CPU-GDT-remaps-in-ascending-or.patch @@ -0,0 +1,70 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Mon, 4 Dec 2017 15:07:14 +0100 +Subject: [PATCH] x86/entry/gdt: Put per-CPU GDT remaps in ascending order +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +We currently have CPU 0's GDT at the top of the GDT range and +higher-numbered CPUs at lower addresses. 
This happens because the +fixmap is upside down (index 0 is the top of the fixmap). + +Flip it so that GDTs are in ascending order by virtual address. +This will simplify a future patch that will generalize the GDT +remap to contain multiple pages. + +Signed-off-by: Andy Lutomirski +Signed-off-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Reviewed-by: Thomas Gleixner +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Rik van Riel +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150605.471561421@linutronix.de +Signed-off-by: Ingo Molnar +(cherry picked from commit aaeed3aeb39c1ba69f0a49baec8cb728121d0a91) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 9c37967fad2d6a525df53e0a40edcd652e5abaae) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/desc.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h +index f995e5a09136..22ee0a93b4f7 100644 +--- a/arch/x86/include/asm/desc.h ++++ b/arch/x86/include/asm/desc.h +@@ -61,7 +61,7 @@ static inline struct desc_struct *get_current_gdt_rw(void) + /* Get the fixmap index for a specific processor */ + static inline unsigned int get_cpu_gdt_ro_index(int cpu) + { +- return FIX_GDT_REMAP_BEGIN + cpu; ++ return FIX_GDT_REMAP_END - cpu; + } + + /* Provide the fixmap address of the remapped GDT */ +-- +2.14.2 + diff --git a/patches/kernel/0147-x86-mm-fixmap-Generalize-the-GDT-fixmap-mechanism-in.patch b/patches/kernel/0147-x86-mm-fixmap-Generalize-the-GDT-fixmap-mechanism-in.patch deleted file mode 100644 index 7c6edcb..0000000 --- 
a/patches/kernel/0147-x86-mm-fixmap-Generalize-the-GDT-fixmap-mechanism-in.patch +++ /dev/null @@ -1,206 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Mon, 4 Dec 2017 15:07:15 +0100 -Subject: [PATCH] x86/mm/fixmap: Generalize the GDT fixmap mechanism, introduce - struct cpu_entry_area -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Currently, the GDT is an ad-hoc array of pages, one per CPU, in the -fixmap. Generalize it to be an array of a new 'struct cpu_entry_area' -so that we can cleanly add new things to it. - -Signed-off-by: Andy Lutomirski -Signed-off-by: Thomas Gleixner -Reviewed-by: Thomas Gleixner -Reviewed-by: Borislav Petkov -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Rik van Riel -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Link: https://lkml.kernel.org/r/20171204150605.563271721@linutronix.de -Signed-off-by: Ingo Molnar -(cherry picked from commit ef8813ab280507972bb57e4b1b502811ad4411e9) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit b17894f1ac91491ce29946ed946a129620b7f7ac) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/desc.h | 9 +-------- - arch/x86/include/asm/fixmap.h | 37 +++++++++++++++++++++++++++++++++++-- - arch/x86/kernel/cpu/common.c | 14 +++++++------- - arch/x86/xen/mmu_pv.c | 2 +- - 4 files changed, 44 insertions(+), 18 deletions(-) - -diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h -index 22ee0a93b4f7..81c9b1e8cae9 100644 ---- a/arch/x86/include/asm/desc.h -+++ b/arch/x86/include/asm/desc.h -@@ 
-58,17 +58,10 @@ static inline struct desc_struct *get_current_gdt_rw(void) - return this_cpu_ptr(&gdt_page)->gdt; - } - --/* Get the fixmap index for a specific processor */ --static inline unsigned int get_cpu_gdt_ro_index(int cpu) --{ -- return FIX_GDT_REMAP_END - cpu; --} -- - /* Provide the fixmap address of the remapped GDT */ - static inline struct desc_struct *get_cpu_gdt_ro(int cpu) - { -- unsigned int idx = get_cpu_gdt_ro_index(cpu); -- return (struct desc_struct *)__fix_to_virt(idx); -+ return (struct desc_struct *)&get_cpu_entry_area(cpu)->gdt; - } - - /* Provide the current read-only GDT */ -diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h -index 81c2b11f50a6..8c6ed66fe957 100644 ---- a/arch/x86/include/asm/fixmap.h -+++ b/arch/x86/include/asm/fixmap.h -@@ -44,6 +44,19 @@ extern unsigned long __FIXADDR_TOP; - PAGE_SIZE) - #endif - -+/* -+ * cpu_entry_area is a percpu region in the fixmap that contains things -+ * needed by the CPU and early entry/exit code. Real types aren't used -+ * for all fields here to avoid circular header dependencies. -+ * -+ * Every field is a virtual alias of some other allocated backing store. -+ * There is no direct allocation of a struct cpu_entry_area. -+ */ -+struct cpu_entry_area { -+ char gdt[PAGE_SIZE]; -+}; -+ -+#define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE) - - /* - * Here we define all the compile-time 'special' virtual -@@ -101,8 +114,8 @@ enum fixed_addresses { - FIX_LNW_VRTC, - #endif - /* Fixmap entries to remap the GDTs, one per processor. 
*/ -- FIX_GDT_REMAP_BEGIN, -- FIX_GDT_REMAP_END = FIX_GDT_REMAP_BEGIN + NR_CPUS - 1, -+ FIX_CPU_ENTRY_AREA_TOP, -+ FIX_CPU_ENTRY_AREA_BOTTOM = FIX_CPU_ENTRY_AREA_TOP + (CPU_ENTRY_AREA_PAGES * NR_CPUS) - 1, - - #ifdef CONFIG_ACPI_APEI_GHES - /* Used for GHES mapping from assorted contexts */ -@@ -171,5 +184,25 @@ static inline void __set_fixmap(enum fixed_addresses idx, - void __early_set_fixmap(enum fixed_addresses idx, - phys_addr_t phys, pgprot_t flags); - -+static inline unsigned int __get_cpu_entry_area_page_index(int cpu, int page) -+{ -+ BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0); -+ -+ return FIX_CPU_ENTRY_AREA_BOTTOM - cpu*CPU_ENTRY_AREA_PAGES - page; -+} -+ -+#define __get_cpu_entry_area_offset_index(cpu, offset) ({ \ -+ BUILD_BUG_ON(offset % PAGE_SIZE != 0); \ -+ __get_cpu_entry_area_page_index(cpu, offset / PAGE_SIZE); \ -+ }) -+ -+#define get_cpu_entry_area_index(cpu, field) \ -+ __get_cpu_entry_area_offset_index((cpu), offsetof(struct cpu_entry_area, field)) -+ -+static inline struct cpu_entry_area *get_cpu_entry_area(int cpu) -+{ -+ return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0)); -+} -+ - #endif /* !__ASSEMBLY__ */ - #endif /* _ASM_X86_FIXMAP_H */ -diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c -index aa97e4cd3a33..ffee73ec1af1 100644 ---- a/arch/x86/kernel/cpu/common.c -+++ b/arch/x86/kernel/cpu/common.c -@@ -466,12 +466,12 @@ void load_percpu_segment(int cpu) - load_stack_canary_segment(); - } - --/* Setup the fixmap mapping only once per-processor */ --static inline void setup_fixmap_gdt(int cpu) -+/* Setup the fixmap mappings only once per-processor */ -+static inline void setup_cpu_entry_area(int cpu) - { - #ifdef CONFIG_X86_64 - /* On 64-bit systems, we use a read-only fixmap GDT. 
*/ -- pgprot_t prot = PAGE_KERNEL_RO; -+ pgprot_t gdt_prot = PAGE_KERNEL_RO; - #else - /* - * On native 32-bit systems, the GDT cannot be read-only because -@@ -482,11 +482,11 @@ static inline void setup_fixmap_gdt(int cpu) - * On Xen PV, the GDT must be read-only because the hypervisor requires - * it. - */ -- pgprot_t prot = boot_cpu_has(X86_FEATURE_XENPV) ? -+ pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ? - PAGE_KERNEL_RO : PAGE_KERNEL; - #endif - -- __set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), prot); -+ __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot); - } - - /* Load the original GDT from the per-cpu structure */ -@@ -1589,7 +1589,7 @@ void cpu_init(void) - if (is_uv_system()) - uv_cpu_init(); - -- setup_fixmap_gdt(cpu); -+ setup_cpu_entry_area(cpu); - load_fixmap_gdt(cpu); - } - -@@ -1650,7 +1650,7 @@ void cpu_init(void) - - fpu__init_cpu(); - -- setup_fixmap_gdt(cpu); -+ setup_cpu_entry_area(cpu); - load_fixmap_gdt(cpu); - } - #endif -diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c -index 45bb2d462e44..53e65f605bdd 100644 ---- a/arch/x86/xen/mmu_pv.c -+++ b/arch/x86/xen/mmu_pv.c -@@ -2297,7 +2297,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) - #endif - case FIX_TEXT_POKE0: - case FIX_TEXT_POKE1: -- case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END: -+ case FIX_CPU_ENTRY_AREA_TOP ... 
FIX_CPU_ENTRY_AREA_BOTTOM: - /* All local page mappings */ - pte = pfn_pte(phys, prot); - break; --- -2.14.2 - diff --git a/patches/kernel/0148-x86-kasan-64-Teach-KASAN-about-the-cpu_entry_area.patch b/patches/kernel/0148-x86-kasan-64-Teach-KASAN-about-the-cpu_entry_area.patch deleted file mode 100644 index d7de3f3..0000000 --- a/patches/kernel/0148-x86-kasan-64-Teach-KASAN-about-the-cpu_entry_area.patch +++ /dev/null @@ -1,91 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Mon, 4 Dec 2017 15:07:16 +0100 -Subject: [PATCH] x86/kasan/64: Teach KASAN about the cpu_entry_area -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -The cpu_entry_area will contain stacks. Make sure that KASAN has -appropriate shadow mappings for them. - -Signed-off-by: Andy Lutomirski -Signed-off-by: Andrey Ryabinin -Signed-off-by: Thomas Gleixner -Cc: Alexander Potapenko -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Dmitry Vyukov -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Rik van Riel -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: kasan-dev@googlegroups.com -Cc: keescook@google.com -Link: https://lkml.kernel.org/r/20171204150605.642806442@linutronix.de -Signed-off-by: Ingo Molnar -(cherry picked from commit 21506525fb8ddb0342f2a2370812d47f6a1f3833) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 17833d4cfca7e4284f68fb9f3804a91f2541a83a) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/mm/kasan_init_64.c | 18 +++++++++++++++++- - 1 file changed, 17 insertions(+), 1 deletion(-) - -diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c -index 3d7341986e13..d8836e45bc07 100644 ---- a/arch/x86/mm/kasan_init_64.c -+++ b/arch/x86/mm/kasan_init_64.c -@@ -276,6 +276,7 @@ void __init kasan_early_init(void) - void __init kasan_init(void) - { - int i; -+ void *shadow_cpu_entry_begin, *shadow_cpu_entry_end; - - #ifdef CONFIG_KASAN_INLINE - register_die_notifier(&kasan_die_notifier); -@@ -328,8 +329,23 @@ void __init kasan_init(void) - (unsigned long)kasan_mem_to_shadow(_end), - early_pfn_to_nid(__pa(_stext))); - -+ shadow_cpu_entry_begin = (void *)__fix_to_virt(FIX_CPU_ENTRY_AREA_BOTTOM); -+ shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin); -+ shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin, -+ PAGE_SIZE); -+ -+ shadow_cpu_entry_end = (void *)(__fix_to_virt(FIX_CPU_ENTRY_AREA_TOP) + PAGE_SIZE); -+ shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end); -+ shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end, -+ PAGE_SIZE); -+ - kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), -- (void *)KASAN_SHADOW_END); -+ shadow_cpu_entry_begin); -+ -+ kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin, -+ (unsigned 
long)shadow_cpu_entry_end, 0); -+ -+ kasan_populate_zero_shadow(shadow_cpu_entry_end, (void *)KASAN_SHADOW_END); - - load_cr3(init_top_pgt); - __flush_tlb_all(); --- -2.14.2 - diff --git a/patches/kernel/0148-x86-mm-fixmap-Generalize-the-GDT-fixmap-mechanism-in.patch b/patches/kernel/0148-x86-mm-fixmap-Generalize-the-GDT-fixmap-mechanism-in.patch new file mode 100644 index 0000000..7c6edcb --- /dev/null +++ b/patches/kernel/0148-x86-mm-fixmap-Generalize-the-GDT-fixmap-mechanism-in.patch @@ -0,0 +1,206 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Mon, 4 Dec 2017 15:07:15 +0100 +Subject: [PATCH] x86/mm/fixmap: Generalize the GDT fixmap mechanism, introduce + struct cpu_entry_area +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Currently, the GDT is an ad-hoc array of pages, one per CPU, in the +fixmap. Generalize it to be an array of a new 'struct cpu_entry_area' +so that we can cleanly add new things to it. + +Signed-off-by: Andy Lutomirski +Signed-off-by: Thomas Gleixner +Reviewed-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Rik van Riel +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150605.563271721@linutronix.de +Signed-off-by: Ingo Molnar +(cherry picked from commit ef8813ab280507972bb57e4b1b502811ad4411e9) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit b17894f1ac91491ce29946ed946a129620b7f7ac) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/desc.h | 9 +-------- + arch/x86/include/asm/fixmap.h | 37 +++++++++++++++++++++++++++++++++++-- + arch/x86/kernel/cpu/common.c | 14 +++++++------- + arch/x86/xen/mmu_pv.c | 2 +- + 4 files changed, 44 insertions(+), 18 deletions(-) + +diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h +index 22ee0a93b4f7..81c9b1e8cae9 100644 +--- a/arch/x86/include/asm/desc.h ++++ b/arch/x86/include/asm/desc.h +@@ -58,17 +58,10 @@ static inline struct desc_struct *get_current_gdt_rw(void) + return this_cpu_ptr(&gdt_page)->gdt; + } + +-/* Get the fixmap index for a specific processor */ +-static inline unsigned int get_cpu_gdt_ro_index(int cpu) +-{ +- return FIX_GDT_REMAP_END - cpu; +-} +- + /* Provide the fixmap address of the remapped GDT */ + static inline struct desc_struct *get_cpu_gdt_ro(int cpu) + { +- unsigned int idx = get_cpu_gdt_ro_index(cpu); +- return (struct desc_struct *)__fix_to_virt(idx); ++ return (struct desc_struct *)&get_cpu_entry_area(cpu)->gdt; + } + + /* Provide the current read-only GDT */ +diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h +index 81c2b11f50a6..8c6ed66fe957 100644 +--- a/arch/x86/include/asm/fixmap.h ++++ b/arch/x86/include/asm/fixmap.h +@@ -44,6 +44,19 @@ extern unsigned long __FIXADDR_TOP; + PAGE_SIZE) + #endif + ++/* ++ * cpu_entry_area is a percpu region in the fixmap that contains things ++ * needed 
by the CPU and early entry/exit code. Real types aren't used ++ * for all fields here to avoid circular header dependencies. ++ * ++ * Every field is a virtual alias of some other allocated backing store. ++ * There is no direct allocation of a struct cpu_entry_area. ++ */ ++struct cpu_entry_area { ++ char gdt[PAGE_SIZE]; ++}; ++ ++#define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE) + + /* + * Here we define all the compile-time 'special' virtual +@@ -101,8 +114,8 @@ enum fixed_addresses { + FIX_LNW_VRTC, + #endif + /* Fixmap entries to remap the GDTs, one per processor. */ +- FIX_GDT_REMAP_BEGIN, +- FIX_GDT_REMAP_END = FIX_GDT_REMAP_BEGIN + NR_CPUS - 1, ++ FIX_CPU_ENTRY_AREA_TOP, ++ FIX_CPU_ENTRY_AREA_BOTTOM = FIX_CPU_ENTRY_AREA_TOP + (CPU_ENTRY_AREA_PAGES * NR_CPUS) - 1, + + #ifdef CONFIG_ACPI_APEI_GHES + /* Used for GHES mapping from assorted contexts */ +@@ -171,5 +184,25 @@ static inline void __set_fixmap(enum fixed_addresses idx, + void __early_set_fixmap(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t flags); + ++static inline unsigned int __get_cpu_entry_area_page_index(int cpu, int page) ++{ ++ BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0); ++ ++ return FIX_CPU_ENTRY_AREA_BOTTOM - cpu*CPU_ENTRY_AREA_PAGES - page; ++} ++ ++#define __get_cpu_entry_area_offset_index(cpu, offset) ({ \ ++ BUILD_BUG_ON(offset % PAGE_SIZE != 0); \ ++ __get_cpu_entry_area_page_index(cpu, offset / PAGE_SIZE); \ ++ }) ++ ++#define get_cpu_entry_area_index(cpu, field) \ ++ __get_cpu_entry_area_offset_index((cpu), offsetof(struct cpu_entry_area, field)) ++ ++static inline struct cpu_entry_area *get_cpu_entry_area(int cpu) ++{ ++ return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0)); ++} ++ + #endif /* !__ASSEMBLY__ */ + #endif /* _ASM_X86_FIXMAP_H */ +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index aa97e4cd3a33..ffee73ec1af1 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ 
b/arch/x86/kernel/cpu/common.c +@@ -466,12 +466,12 @@ void load_percpu_segment(int cpu) + load_stack_canary_segment(); + } + +-/* Setup the fixmap mapping only once per-processor */ +-static inline void setup_fixmap_gdt(int cpu) ++/* Setup the fixmap mappings only once per-processor */ ++static inline void setup_cpu_entry_area(int cpu) + { + #ifdef CONFIG_X86_64 + /* On 64-bit systems, we use a read-only fixmap GDT. */ +- pgprot_t prot = PAGE_KERNEL_RO; ++ pgprot_t gdt_prot = PAGE_KERNEL_RO; + #else + /* + * On native 32-bit systems, the GDT cannot be read-only because +@@ -482,11 +482,11 @@ static inline void setup_fixmap_gdt(int cpu) + * On Xen PV, the GDT must be read-only because the hypervisor requires + * it. + */ +- pgprot_t prot = boot_cpu_has(X86_FEATURE_XENPV) ? ++ pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ? + PAGE_KERNEL_RO : PAGE_KERNEL; + #endif + +- __set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), prot); ++ __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot); + } + + /* Load the original GDT from the per-cpu structure */ +@@ -1589,7 +1589,7 @@ void cpu_init(void) + if (is_uv_system()) + uv_cpu_init(); + +- setup_fixmap_gdt(cpu); ++ setup_cpu_entry_area(cpu); + load_fixmap_gdt(cpu); + } + +@@ -1650,7 +1650,7 @@ void cpu_init(void) + + fpu__init_cpu(); + +- setup_fixmap_gdt(cpu); ++ setup_cpu_entry_area(cpu); + load_fixmap_gdt(cpu); + } + #endif +diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c +index 45bb2d462e44..53e65f605bdd 100644 +--- a/arch/x86/xen/mmu_pv.c ++++ b/arch/x86/xen/mmu_pv.c +@@ -2297,7 +2297,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) + #endif + case FIX_TEXT_POKE0: + case FIX_TEXT_POKE1: +- case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END: ++ case FIX_CPU_ENTRY_AREA_TOP ... 
FIX_CPU_ENTRY_AREA_BOTTOM: + /* All local page mappings */ + pte = pfn_pte(phys, prot); + break; +-- +2.14.2 + diff --git a/patches/kernel/0149-x86-entry-Fix-assumptions-that-the-HW-TSS-is-at-the-.patch b/patches/kernel/0149-x86-entry-Fix-assumptions-that-the-HW-TSS-is-at-the-.patch deleted file mode 100644 index 0c46932..0000000 --- a/patches/kernel/0149-x86-entry-Fix-assumptions-that-the-HW-TSS-is-at-the-.patch +++ /dev/null @@ -1,227 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Mon, 4 Dec 2017 15:07:17 +0100 -Subject: [PATCH] x86/entry: Fix assumptions that the HW TSS is at the - beginning of cpu_tss -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -A future patch will move SYSENTER_stack to the beginning of cpu_tss -to help detect overflow. Before this can happen, fix several code -paths that hardcode assumptions about the old layout. - -Signed-off-by: Andy Lutomirski -Signed-off-by: Thomas Gleixner -Reviewed-by: Borislav Petkov -Reviewed-by: Dave Hansen -Reviewed-by: Thomas Gleixner -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Rik van Riel -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Link: https://lkml.kernel.org/r/20171204150605.722425540@linutronix.de -Signed-off-by: Ingo Molnar -(backported from commit 7fb983b4dd569e08564134a850dfd4eb1c63d9b8) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 7123a5de72dc59dea18ce8886e7db726f7259caf) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/desc.h | 2 +- - arch/x86/include/asm/processor.h | 9 +++++++-- - arch/x86/kernel/cpu/common.c | 8 ++++---- - arch/x86/kernel/doublefault.c | 36 +++++++++++++++++------------------- - arch/x86/kvm/vmx.c | 2 +- - arch/x86/power/cpu.c | 13 +++++++------ - 6 files changed, 37 insertions(+), 33 deletions(-) - -diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h -index 81c9b1e8cae9..b817fe247506 100644 ---- a/arch/x86/include/asm/desc.h -+++ b/arch/x86/include/asm/desc.h -@@ -190,7 +190,7 @@ static inline void set_tssldt_descriptor(void *d, unsigned long addr, - #endif - } - --static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr) -+static inline void __set_tss_desc(unsigned cpu, unsigned int entry, struct x86_hw_tss *addr) - { - struct desc_struct *d = get_cpu_gdt_rw(cpu); - tss_desc tss; -diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h -index 5225917f9760..78123abdb046 100644 ---- a/arch/x86/include/asm/processor.h -+++ b/arch/x86/include/asm/processor.h -@@ -161,7 +161,7 @@ extern struct cpuinfo_x86 new_cpu_data; - - #include - --extern struct tss_struct doublefault_tss; -+extern struct x86_hw_tss doublefault_tss; - extern __u32 cpu_caps_cleared[NCAPINTS]; - extern __u32 cpu_caps_set[NCAPINTS]; - -@@ -246,6 +246,11 @@ static inline void load_cr3(pgd_t *pgdir) - write_cr3(__pa(pgdir)); - } - -+/* -+ * 
Note that while the legacy 'TSS' name comes from 'Task State Segment', -+ * on modern x86 CPUs the TSS also holds information important to 64-bit mode, -+ * unrelated to the task-switch mechanism: -+ */ - #ifdef CONFIG_X86_32 - /* This is the TSS defined by the hardware. */ - struct x86_hw_tss { -@@ -316,7 +321,7 @@ struct x86_hw_tss { - #define IO_BITMAP_BITS 65536 - #define IO_BITMAP_BYTES (IO_BITMAP_BITS/8) - #define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long)) --#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap) -+#define IO_BITMAP_OFFSET (offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss)) - #define INVALID_IO_BITMAP_OFFSET 0x8000 - - struct tss_struct { -diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c -index ffee73ec1af1..e526d82b546c 100644 ---- a/arch/x86/kernel/cpu/common.c -+++ b/arch/x86/kernel/cpu/common.c -@@ -1558,7 +1558,7 @@ void cpu_init(void) - } - } - -- t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); -+ t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; - - /* - * <= is required because the CPU will access up to -@@ -1576,7 +1576,7 @@ void cpu_init(void) - * Initialize the TSS. Don't bother initializing sp0, as the initial - * task never enters user mode. - */ -- set_tss_desc(cpu, t); -+ set_tss_desc(cpu, &t->x86_tss); - load_TR_desc(); - - load_mm_ldt(&init_mm); -@@ -1633,12 +1633,12 @@ void cpu_init(void) - * Initialize the TSS. Don't bother initializing sp0, as the initial - * task never enters user mode. 
- */ -- set_tss_desc(cpu, t); -+ set_tss_desc(cpu, &t->x86_tss); - load_TR_desc(); - - load_mm_ldt(&init_mm); - -- t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); -+ t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; - - #ifdef CONFIG_DOUBLEFAULT - /* Set up doublefault TSS pointer in the GDT */ -diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c -index f9c324e08d85..a9fe79d49d39 100644 ---- a/arch/x86/kernel/doublefault.c -+++ b/arch/x86/kernel/doublefault.c -@@ -49,25 +49,23 @@ static void doublefault_fn(void) - cpu_relax(); - } - --struct tss_struct doublefault_tss __cacheline_aligned = { -- .x86_tss = { -- .sp0 = STACK_START, -- .ss0 = __KERNEL_DS, -- .ldt = 0, -- .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, -- -- .ip = (unsigned long) doublefault_fn, -- /* 0x2 bit is always set */ -- .flags = X86_EFLAGS_SF | 0x2, -- .sp = STACK_START, -- .es = __USER_DS, -- .cs = __KERNEL_CS, -- .ss = __KERNEL_DS, -- .ds = __USER_DS, -- .fs = __KERNEL_PERCPU, -- -- .__cr3 = __pa_nodebug(swapper_pg_dir), -- } -+struct x86_hw_tss doublefault_tss __cacheline_aligned = { -+ .sp0 = STACK_START, -+ .ss0 = __KERNEL_DS, -+ .ldt = 0, -+ .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, -+ -+ .ip = (unsigned long) doublefault_fn, -+ /* 0x2 bit is always set */ -+ .flags = X86_EFLAGS_SF | 0x2, -+ .sp = STACK_START, -+ .es = __USER_DS, -+ .cs = __KERNEL_CS, -+ .ss = __KERNEL_DS, -+ .ds = __USER_DS, -+ .fs = __KERNEL_PERCPU, -+ -+ .__cr3 = __pa_nodebug(swapper_pg_dir), - }; - - /* dummy for do_double_fault() call */ -diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c -index dd4996a96c71..a7c5a47beab7 100644 ---- a/arch/x86/kvm/vmx.c -+++ b/arch/x86/kvm/vmx.c -@@ -2280,7 +2280,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) - * processors. See 22.2.4. 
- */ - vmcs_writel(HOST_TR_BASE, -- (unsigned long)this_cpu_ptr(&cpu_tss)); -+ (unsigned long)this_cpu_ptr(&cpu_tss.x86_tss)); - vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ - - /* -diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c -index 78459a6d455a..48cd87fc7222 100644 ---- a/arch/x86/power/cpu.c -+++ b/arch/x86/power/cpu.c -@@ -165,12 +165,13 @@ static void fix_processor_context(void) - struct desc_struct *desc = get_cpu_gdt_rw(cpu); - tss_desc tss; - #endif -- set_tss_desc(cpu, t); /* -- * This just modifies memory; should not be -- * necessary. But... This is necessary, because -- * 386 hardware has concept of busy TSS or some -- * similar stupidity. -- */ -+ -+ /* -+ * This just modifies memory; should not be necessary. But... This is -+ * necessary, because 386 hardware has concept of busy TSS or some -+ * similar stupidity. -+ */ -+ set_tss_desc(cpu, &t->x86_tss); - - #ifdef CONFIG_X86_64 - memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc)); --- -2.14.2 - diff --git a/patches/kernel/0149-x86-kasan-64-Teach-KASAN-about-the-cpu_entry_area.patch b/patches/kernel/0149-x86-kasan-64-Teach-KASAN-about-the-cpu_entry_area.patch new file mode 100644 index 0000000..d7de3f3 --- /dev/null +++ b/patches/kernel/0149-x86-kasan-64-Teach-KASAN-about-the-cpu_entry_area.patch @@ -0,0 +1,91 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Mon, 4 Dec 2017 15:07:16 +0100 +Subject: [PATCH] x86/kasan/64: Teach KASAN about the cpu_entry_area +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +The cpu_entry_area will contain stacks. Make sure that KASAN has +appropriate shadow mappings for them. 
+ +Signed-off-by: Andy Lutomirski +Signed-off-by: Andrey Ryabinin +Signed-off-by: Thomas Gleixner +Cc: Alexander Potapenko +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Dmitry Vyukov +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Rik van Riel +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: kasan-dev@googlegroups.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150605.642806442@linutronix.de +Signed-off-by: Ingo Molnar +(cherry picked from commit 21506525fb8ddb0342f2a2370812d47f6a1f3833) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 17833d4cfca7e4284f68fb9f3804a91f2541a83a) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/mm/kasan_init_64.c | 18 +++++++++++++++++- + 1 file changed, 17 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c +index 3d7341986e13..d8836e45bc07 100644 +--- a/arch/x86/mm/kasan_init_64.c ++++ b/arch/x86/mm/kasan_init_64.c +@@ -276,6 +276,7 @@ void __init kasan_early_init(void) + void __init kasan_init(void) + { + int i; ++ void *shadow_cpu_entry_begin, *shadow_cpu_entry_end; + + #ifdef CONFIG_KASAN_INLINE + register_die_notifier(&kasan_die_notifier); +@@ -328,8 +329,23 @@ void __init kasan_init(void) + (unsigned long)kasan_mem_to_shadow(_end), + early_pfn_to_nid(__pa(_stext))); + ++ shadow_cpu_entry_begin = (void *)__fix_to_virt(FIX_CPU_ENTRY_AREA_BOTTOM); ++ shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin); ++ shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin, ++ PAGE_SIZE); ++ ++ shadow_cpu_entry_end = (void *)(__fix_to_virt(FIX_CPU_ENTRY_AREA_TOP) + PAGE_SIZE); ++ shadow_cpu_entry_end = 
kasan_mem_to_shadow(shadow_cpu_entry_end); ++ shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end, ++ PAGE_SIZE); ++ + kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), +- (void *)KASAN_SHADOW_END); ++ shadow_cpu_entry_begin); ++ ++ kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin, ++ (unsigned long)shadow_cpu_entry_end, 0); ++ ++ kasan_populate_zero_shadow(shadow_cpu_entry_end, (void *)KASAN_SHADOW_END); + + load_cr3(init_top_pgt); + __flush_tlb_all(); +-- +2.14.2 + diff --git a/patches/kernel/0150-x86-dumpstack-Handle-stack-overflow-on-all-stacks.patch b/patches/kernel/0150-x86-dumpstack-Handle-stack-overflow-on-all-stacks.patch deleted file mode 100644 index acd41d1..0000000 --- a/patches/kernel/0150-x86-dumpstack-Handle-stack-overflow-on-all-stacks.patch +++ /dev/null @@ -1,96 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Mon, 4 Dec 2017 15:07:18 +0100 -Subject: [PATCH] x86/dumpstack: Handle stack overflow on all stacks -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -We currently special-case stack overflow on the task stack. We're -going to start putting special stacks in the fixmap with a custom -layout, so they'll have guard pages, too. Teach the unwinder to be -able to unwind an overflow of any of the stacks. - -Signed-off-by: Andy Lutomirski -Signed-off-by: Thomas Gleixner -Reviewed-by: Borislav Petkov -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Rik van Riel -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Link: https://lkml.kernel.org/r/20171204150605.802057305@linutronix.de -Signed-off-by: Ingo Molnar -(cherry picked from commit 6e60e583426c2f8751c22c2dfe5c207083b4483a) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 1ab51120b9a5baaa46979e4ab8ff28916c9cb846) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/dumpstack.c | 24 ++++++++++++++---------- - 1 file changed, 14 insertions(+), 10 deletions(-) - -diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c -index c211cbdff709..0f4b931e1a02 100644 ---- a/arch/x86/kernel/dumpstack.c -+++ b/arch/x86/kernel/dumpstack.c -@@ -112,24 +112,28 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - * - task stack - * - interrupt stack - * - HW exception stacks (double fault, nmi, debug, mce) -+ * - SYSENTER stack - * -- * x86-32 can have up to three stacks: -+ * x86-32 can have up to four stacks: - * - task stack - * - softirq stack - * - hardirq stack -+ * - SYSENTER stack - */ - for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { - const char *stack_name; - -- /* -- * If we overflowed the task stack into a guard page, jump back -- * to the bottom of the usable stack. -- */ -- if (task_stack_page(task) - (void *)stack < PAGE_SIZE) -- stack = task_stack_page(task); -- -- if (get_stack_info(stack, task, &stack_info, &visit_mask)) -- break; -+ if (get_stack_info(stack, task, &stack_info, &visit_mask)) { -+ /* -+ * We weren't on a valid stack. It's possible that -+ * we overflowed a valid stack into a guard page. -+ * See if the next page up is valid so that we can -+ * generate some kind of backtrace if this happens. 
-+ */ -+ stack = (unsigned long *)PAGE_ALIGN((unsigned long)stack); -+ if (get_stack_info(stack, task, &stack_info, &visit_mask)) -+ break; -+ } - - stack_name = stack_type_name(stack_info.type); - if (stack_name) --- -2.14.2 - diff --git a/patches/kernel/0150-x86-entry-Fix-assumptions-that-the-HW-TSS-is-at-the-.patch b/patches/kernel/0150-x86-entry-Fix-assumptions-that-the-HW-TSS-is-at-the-.patch new file mode 100644 index 0000000..0c46932 --- /dev/null +++ b/patches/kernel/0150-x86-entry-Fix-assumptions-that-the-HW-TSS-is-at-the-.patch @@ -0,0 +1,227 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Mon, 4 Dec 2017 15:07:17 +0100 +Subject: [PATCH] x86/entry: Fix assumptions that the HW TSS is at the + beginning of cpu_tss +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +A future patch will move SYSENTER_stack to the beginning of cpu_tss +to help detect overflow. Before this can happen, fix several code +paths that hardcode assumptions about the old layout. + +Signed-off-by: Andy Lutomirski +Signed-off-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Reviewed-by: Dave Hansen +Reviewed-by: Thomas Gleixner +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Rik van Riel +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150605.722425540@linutronix.de +Signed-off-by: Ingo Molnar +(backported from commit 7fb983b4dd569e08564134a850dfd4eb1c63d9b8) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 7123a5de72dc59dea18ce8886e7db726f7259caf) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/desc.h | 2 +- + arch/x86/include/asm/processor.h | 9 +++++++-- + arch/x86/kernel/cpu/common.c | 8 ++++---- + arch/x86/kernel/doublefault.c | 36 +++++++++++++++++------------------- + arch/x86/kvm/vmx.c | 2 +- + arch/x86/power/cpu.c | 13 +++++++------ + 6 files changed, 37 insertions(+), 33 deletions(-) + +diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h +index 81c9b1e8cae9..b817fe247506 100644 +--- a/arch/x86/include/asm/desc.h ++++ b/arch/x86/include/asm/desc.h +@@ -190,7 +190,7 @@ static inline void set_tssldt_descriptor(void *d, unsigned long addr, + #endif + } + +-static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr) ++static inline void __set_tss_desc(unsigned cpu, unsigned int entry, struct x86_hw_tss *addr) + { + struct desc_struct *d = get_cpu_gdt_rw(cpu); + tss_desc tss; +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index 5225917f9760..78123abdb046 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -161,7 +161,7 @@ extern struct cpuinfo_x86 new_cpu_data; + + #include + +-extern struct tss_struct doublefault_tss; ++extern struct x86_hw_tss doublefault_tss; + extern __u32 cpu_caps_cleared[NCAPINTS]; + extern __u32 cpu_caps_set[NCAPINTS]; + +@@ -246,6 +246,11 @@ static inline void load_cr3(pgd_t *pgdir) + write_cr3(__pa(pgdir)); + } + ++/* ++ * 
Note that while the legacy 'TSS' name comes from 'Task State Segment', ++ * on modern x86 CPUs the TSS also holds information important to 64-bit mode, ++ * unrelated to the task-switch mechanism: ++ */ + #ifdef CONFIG_X86_32 + /* This is the TSS defined by the hardware. */ + struct x86_hw_tss { +@@ -316,7 +321,7 @@ struct x86_hw_tss { + #define IO_BITMAP_BITS 65536 + #define IO_BITMAP_BYTES (IO_BITMAP_BITS/8) + #define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long)) +-#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap) ++#define IO_BITMAP_OFFSET (offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss)) + #define INVALID_IO_BITMAP_OFFSET 0x8000 + + struct tss_struct { +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index ffee73ec1af1..e526d82b546c 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -1558,7 +1558,7 @@ void cpu_init(void) + } + } + +- t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); ++ t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; + + /* + * <= is required because the CPU will access up to +@@ -1576,7 +1576,7 @@ void cpu_init(void) + * Initialize the TSS. Don't bother initializing sp0, as the initial + * task never enters user mode. + */ +- set_tss_desc(cpu, t); ++ set_tss_desc(cpu, &t->x86_tss); + load_TR_desc(); + + load_mm_ldt(&init_mm); +@@ -1633,12 +1633,12 @@ void cpu_init(void) + * Initialize the TSS. Don't bother initializing sp0, as the initial + * task never enters user mode. 
+ */ +- set_tss_desc(cpu, t); ++ set_tss_desc(cpu, &t->x86_tss); + load_TR_desc(); + + load_mm_ldt(&init_mm); + +- t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); ++ t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; + + #ifdef CONFIG_DOUBLEFAULT + /* Set up doublefault TSS pointer in the GDT */ +diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c +index f9c324e08d85..a9fe79d49d39 100644 +--- a/arch/x86/kernel/doublefault.c ++++ b/arch/x86/kernel/doublefault.c +@@ -49,25 +49,23 @@ static void doublefault_fn(void) + cpu_relax(); + } + +-struct tss_struct doublefault_tss __cacheline_aligned = { +- .x86_tss = { +- .sp0 = STACK_START, +- .ss0 = __KERNEL_DS, +- .ldt = 0, +- .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, +- +- .ip = (unsigned long) doublefault_fn, +- /* 0x2 bit is always set */ +- .flags = X86_EFLAGS_SF | 0x2, +- .sp = STACK_START, +- .es = __USER_DS, +- .cs = __KERNEL_CS, +- .ss = __KERNEL_DS, +- .ds = __USER_DS, +- .fs = __KERNEL_PERCPU, +- +- .__cr3 = __pa_nodebug(swapper_pg_dir), +- } ++struct x86_hw_tss doublefault_tss __cacheline_aligned = { ++ .sp0 = STACK_START, ++ .ss0 = __KERNEL_DS, ++ .ldt = 0, ++ .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, ++ ++ .ip = (unsigned long) doublefault_fn, ++ /* 0x2 bit is always set */ ++ .flags = X86_EFLAGS_SF | 0x2, ++ .sp = STACK_START, ++ .es = __USER_DS, ++ .cs = __KERNEL_CS, ++ .ss = __KERNEL_DS, ++ .ds = __USER_DS, ++ .fs = __KERNEL_PERCPU, ++ ++ .__cr3 = __pa_nodebug(swapper_pg_dir), + }; + + /* dummy for do_double_fault() call */ +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c +index dd4996a96c71..a7c5a47beab7 100644 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -2280,7 +2280,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) + * processors. See 22.2.4. 
+ */ + vmcs_writel(HOST_TR_BASE, +- (unsigned long)this_cpu_ptr(&cpu_tss)); ++ (unsigned long)this_cpu_ptr(&cpu_tss.x86_tss)); + vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ + + /* +diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c +index 78459a6d455a..48cd87fc7222 100644 +--- a/arch/x86/power/cpu.c ++++ b/arch/x86/power/cpu.c +@@ -165,12 +165,13 @@ static void fix_processor_context(void) + struct desc_struct *desc = get_cpu_gdt_rw(cpu); + tss_desc tss; + #endif +- set_tss_desc(cpu, t); /* +- * This just modifies memory; should not be +- * necessary. But... This is necessary, because +- * 386 hardware has concept of busy TSS or some +- * similar stupidity. +- */ ++ ++ /* ++ * This just modifies memory; should not be necessary. But... This is ++ * necessary, because 386 hardware has concept of busy TSS or some ++ * similar stupidity. ++ */ ++ set_tss_desc(cpu, &t->x86_tss); + + #ifdef CONFIG_X86_64 + memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc)); +-- +2.14.2 + diff --git a/patches/kernel/0151-x86-dumpstack-Handle-stack-overflow-on-all-stacks.patch b/patches/kernel/0151-x86-dumpstack-Handle-stack-overflow-on-all-stacks.patch new file mode 100644 index 0000000..acd41d1 --- /dev/null +++ b/patches/kernel/0151-x86-dumpstack-Handle-stack-overflow-on-all-stacks.patch @@ -0,0 +1,96 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Mon, 4 Dec 2017 15:07:18 +0100 +Subject: [PATCH] x86/dumpstack: Handle stack overflow on all stacks +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +We currently special-case stack overflow on the task stack. We're +going to start putting special stacks in the fixmap with a custom +layout, so they'll have guard pages, too. Teach the unwinder to be +able to unwind an overflow of any of the stacks. 
+ +Signed-off-by: Andy Lutomirski +Signed-off-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Rik van Riel +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150605.802057305@linutronix.de +Signed-off-by: Ingo Molnar +(cherry picked from commit 6e60e583426c2f8751c22c2dfe5c207083b4483a) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 1ab51120b9a5baaa46979e4ab8ff28916c9cb846) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/dumpstack.c | 24 ++++++++++++++---------- + 1 file changed, 14 insertions(+), 10 deletions(-) + +diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c +index c211cbdff709..0f4b931e1a02 100644 +--- a/arch/x86/kernel/dumpstack.c ++++ b/arch/x86/kernel/dumpstack.c +@@ -112,24 +112,28 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + * - task stack + * - interrupt stack + * - HW exception stacks (double fault, nmi, debug, mce) ++ * - SYSENTER stack + * +- * x86-32 can have up to three stacks: ++ * x86-32 can have up to four stacks: + * - task stack + * - softirq stack + * - hardirq stack ++ * - SYSENTER stack + */ + for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { + const char *stack_name; + +- /* +- * If we overflowed the task stack into a guard page, jump back +- * to the bottom of the usable stack. 
+- */ +- if (task_stack_page(task) - (void *)stack < PAGE_SIZE) +- stack = task_stack_page(task); +- +- if (get_stack_info(stack, task, &stack_info, &visit_mask)) +- break; ++ if (get_stack_info(stack, task, &stack_info, &visit_mask)) { ++ /* ++ * We weren't on a valid stack. It's possible that ++ * we overflowed a valid stack into a guard page. ++ * See if the next page up is valid so that we can ++ * generate some kind of backtrace if this happens. ++ */ ++ stack = (unsigned long *)PAGE_ALIGN((unsigned long)stack); ++ if (get_stack_info(stack, task, &stack_info, &visit_mask)) ++ break; ++ } + + stack_name = stack_type_name(stack_info.type); + if (stack_name) +-- +2.14.2 + diff --git a/patches/kernel/0151-x86-entry-Move-SYSENTER_stack-to-the-beginning-of-st.patch b/patches/kernel/0151-x86-entry-Move-SYSENTER_stack-to-the-beginning-of-st.patch deleted file mode 100644 index 84662ec..0000000 --- a/patches/kernel/0151-x86-entry-Move-SYSENTER_stack-to-the-beginning-of-st.patch +++ /dev/null @@ -1,130 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Mon, 4 Dec 2017 15:07:19 +0100 -Subject: [PATCH] x86/entry: Move SYSENTER_stack to the beginning of struct - tss_struct -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -SYSENTER_stack should have reliable overflow detection, which -means that it needs to be at the bottom of a page, not the top. -Move it to the beginning of struct tss_struct and page-align it. - -Also add an assertion to make sure that the fixed hardware TSS -doesn't cross a page boundary. - -Signed-off-by: Andy Lutomirski -Signed-off-by: Thomas Gleixner -Reviewed-by: Thomas Gleixner -Reviewed-by: Borislav Petkov -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Rik van Riel -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Link: https://lkml.kernel.org/r/20171204150605.881827433@linutronix.de -Signed-off-by: Ingo Molnar -(cherry picked from commit 1a935bc3d4ea61556461a9e92a68ca3556232efd) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 57d6cfd9e7d015aabbed6d0b50e7d2525b3c86c2) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/processor.h | 21 ++++++++++++--------- - arch/x86/kernel/cpu/common.c | 21 +++++++++++++++++++++ - 2 files changed, 33 insertions(+), 9 deletions(-) - -diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h -index 78123abdb046..55885465c3a7 100644 ---- a/arch/x86/include/asm/processor.h -+++ b/arch/x86/include/asm/processor.h -@@ -326,7 +326,16 @@ struct x86_hw_tss { - - struct tss_struct { - /* -- * The hardware state: -+ * Space for the temporary SYSENTER stack, used for SYSENTER -+ * and the entry trampoline as well. -+ */ -+ unsigned long SYSENTER_stack_canary; -+ unsigned long SYSENTER_stack[64]; -+ -+ /* -+ * The fixed hardware portion. This must not cross a page boundary -+ * at risk of violating the SDM's advice and potentially triggering -+ * errata. - */ - struct x86_hw_tss x86_tss; - -@@ -337,15 +346,9 @@ struct tss_struct { - * be within the limit. - */ - unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; -+} __aligned(PAGE_SIZE); - -- /* -- * Space for the temporary SYSENTER stack. 
-- */ -- unsigned long SYSENTER_stack_canary; -- unsigned long SYSENTER_stack[64]; --} ____cacheline_aligned; -- --DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss); -+DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss); - - /* - * sizeof(unsigned long) coming from an extra "long" at the end -diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c -index e526d82b546c..e61eff11f562 100644 ---- a/arch/x86/kernel/cpu/common.c -+++ b/arch/x86/kernel/cpu/common.c -@@ -487,6 +487,27 @@ static inline void setup_cpu_entry_area(int cpu) - #endif - - __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot); -+ -+ /* -+ * The Intel SDM says (Volume 3, 7.2.1): -+ * -+ * Avoid placing a page boundary in the part of the TSS that the -+ * processor reads during a task switch (the first 104 bytes). The -+ * processor may not correctly perform address translations if a -+ * boundary occurs in this area. During a task switch, the processor -+ * reads and writes into the first 104 bytes of each TSS (using -+ * contiguous physical addresses beginning with the physical address -+ * of the first byte of the TSS). So, after TSS access begins, if -+ * part of the 104 bytes is not physically contiguous, the processor -+ * will access incorrect information without generating a page-fault -+ * exception. -+ * -+ * There are also a lot of errata involving the TSS spanning a page -+ * boundary. Assert that we're not doing that. 
-+ */ -+ BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ -+ offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); -+ - } - - /* Load the original GDT from the per-cpu structure */ --- -2.14.2 - diff --git a/patches/kernel/0152-x86-entry-Move-SYSENTER_stack-to-the-beginning-of-st.patch b/patches/kernel/0152-x86-entry-Move-SYSENTER_stack-to-the-beginning-of-st.patch new file mode 100644 index 0000000..84662ec --- /dev/null +++ b/patches/kernel/0152-x86-entry-Move-SYSENTER_stack-to-the-beginning-of-st.patch @@ -0,0 +1,130 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Mon, 4 Dec 2017 15:07:19 +0100 +Subject: [PATCH] x86/entry: Move SYSENTER_stack to the beginning of struct + tss_struct +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +SYSENTER_stack should have reliable overflow detection, which +means that it needs to be at the bottom of a page, not the top. +Move it to the beginning of struct tss_struct and page-align it. + +Also add an assertion to make sure that the fixed hardware TSS +doesn't cross a page boundary. + +Signed-off-by: Andy Lutomirski +Signed-off-by: Thomas Gleixner +Reviewed-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Rik van Riel +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150605.881827433@linutronix.de +Signed-off-by: Ingo Molnar +(cherry picked from commit 1a935bc3d4ea61556461a9e92a68ca3556232efd) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 57d6cfd9e7d015aabbed6d0b50e7d2525b3c86c2) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/processor.h | 21 ++++++++++++--------- + arch/x86/kernel/cpu/common.c | 21 +++++++++++++++++++++ + 2 files changed, 33 insertions(+), 9 deletions(-) + +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index 78123abdb046..55885465c3a7 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -326,7 +326,16 @@ struct x86_hw_tss { + + struct tss_struct { + /* +- * The hardware state: ++ * Space for the temporary SYSENTER stack, used for SYSENTER ++ * and the entry trampoline as well. ++ */ ++ unsigned long SYSENTER_stack_canary; ++ unsigned long SYSENTER_stack[64]; ++ ++ /* ++ * The fixed hardware portion. This must not cross a page boundary ++ * at risk of violating the SDM's advice and potentially triggering ++ * errata. + */ + struct x86_hw_tss x86_tss; + +@@ -337,15 +346,9 @@ struct tss_struct { + * be within the limit. + */ + unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; ++} __aligned(PAGE_SIZE); + +- /* +- * Space for the temporary SYSENTER stack. 
+- */ +- unsigned long SYSENTER_stack_canary; +- unsigned long SYSENTER_stack[64]; +-} ____cacheline_aligned; +- +-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss); ++DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss); + + /* + * sizeof(unsigned long) coming from an extra "long" at the end +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index e526d82b546c..e61eff11f562 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -487,6 +487,27 @@ static inline void setup_cpu_entry_area(int cpu) + #endif + + __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot); ++ ++ /* ++ * The Intel SDM says (Volume 3, 7.2.1): ++ * ++ * Avoid placing a page boundary in the part of the TSS that the ++ * processor reads during a task switch (the first 104 bytes). The ++ * processor may not correctly perform address translations if a ++ * boundary occurs in this area. During a task switch, the processor ++ * reads and writes into the first 104 bytes of each TSS (using ++ * contiguous physical addresses beginning with the physical address ++ * of the first byte of the TSS). So, after TSS access begins, if ++ * part of the 104 bytes is not physically contiguous, the processor ++ * will access incorrect information without generating a page-fault ++ * exception. ++ * ++ * There are also a lot of errata involving the TSS spanning a page ++ * boundary. Assert that we're not doing that. 
++ */ ++ BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ ++ offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); ++ + } + + /* Load the original GDT from the per-cpu structure */ +-- +2.14.2 + diff --git a/patches/kernel/0152-x86-entry-Remap-the-TSS-into-the-CPU-entry-area.patch b/patches/kernel/0152-x86-entry-Remap-the-TSS-into-the-CPU-entry-area.patch deleted file mode 100644 index b42d76d..0000000 --- a/patches/kernel/0152-x86-entry-Remap-the-TSS-into-the-CPU-entry-area.patch +++ /dev/null @@ -1,286 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Mon, 4 Dec 2017 15:07:20 +0100 -Subject: [PATCH] x86/entry: Remap the TSS into the CPU entry area -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -This has a secondary purpose: it puts the entry stack into a region -with a well-controlled layout. A subsequent patch will take -advantage of this to streamline the SYSCALL entry code to be able to -find it more easily. - -Signed-off-by: Andy Lutomirski -Signed-off-by: Thomas Gleixner -Reviewed-by: Thomas Gleixner -Reviewed-by: Borislav Petkov -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Rik van Riel -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Link: https://lkml.kernel.org/r/20171204150605.962042855@linutronix.de -Signed-off-by: Ingo Molnar -(cherry picked from commit 72f5e08dbba2d01aa90b592cf76c378ea233b00b) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 475b37e78defbc4cb91d54e2bcf18aa75611bb3a) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/fixmap.h | 7 +++++++ - arch/x86/kernel/asm-offsets.c | 3 +++ - arch/x86/kernel/cpu/common.c | 41 +++++++++++++++++++++++++++++++++++------ - arch/x86/kernel/dumpstack.c | 3 ++- - arch/x86/kvm/vmx.c | 2 +- - arch/x86/power/cpu.c | 11 ++++++----- - arch/x86/entry/entry_32.S | 6 ++++-- - 7 files changed, 58 insertions(+), 15 deletions(-) - -diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h -index 8c6ed66fe957..c92fc30e6def 100644 ---- a/arch/x86/include/asm/fixmap.h -+++ b/arch/x86/include/asm/fixmap.h -@@ -54,6 +54,13 @@ extern unsigned long __FIXADDR_TOP; - */ - struct cpu_entry_area { - char gdt[PAGE_SIZE]; -+ -+ /* -+ * The GDT is just below cpu_tss and thus serves (on x86_64) as a -+ * a read-only guard page for the SYSENTER stack at the bottom -+ * of the TSS region. 
-+ */ -+ struct tss_struct tss; - }; - - #define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE) -diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c -index 031bd35bd911..f765c3253ec3 100644 ---- a/arch/x86/kernel/asm-offsets.c -+++ b/arch/x86/kernel/asm-offsets.c -@@ -97,4 +97,7 @@ void common(void) { - OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack); - /* Size of SYSENTER_stack */ - DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack)); -+ -+ /* Layout info for cpu_entry_area */ -+ OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); - } -diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c -index e61eff11f562..4a38de4c6ede 100644 ---- a/arch/x86/kernel/cpu/common.c -+++ b/arch/x86/kernel/cpu/common.c -@@ -466,6 +466,22 @@ void load_percpu_segment(int cpu) - load_stack_canary_segment(); - } - -+static void set_percpu_fixmap_pages(int fixmap_index, void *ptr, -+ int pages, pgprot_t prot) -+{ -+ int i; -+ -+ for (i = 0; i < pages; i++) { -+ __set_fixmap(fixmap_index - i, -+ per_cpu_ptr_to_phys(ptr + i * PAGE_SIZE), prot); -+ } -+} -+ -+#ifdef CONFIG_X86_32 -+/* The 32-bit entry code needs to find cpu_entry_area. 
*/ -+DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); -+#endif -+ - /* Setup the fixmap mappings only once per-processor */ - static inline void setup_cpu_entry_area(int cpu) - { -@@ -507,7 +523,15 @@ static inline void setup_cpu_entry_area(int cpu) - */ - BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ - offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); -+ BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); -+ set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss), -+ &per_cpu(cpu_tss, cpu), -+ sizeof(struct tss_struct) / PAGE_SIZE, -+ PAGE_KERNEL); - -+#ifdef CONFIG_X86_32 -+ this_cpu_write(cpu_entry_area, get_cpu_entry_area(cpu)); -+#endif - } - - /* Load the original GDT from the per-cpu structure */ -@@ -1249,7 +1273,8 @@ void enable_sep_cpu(void) - wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); - - wrmsr(MSR_IA32_SYSENTER_ESP, -- (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack), -+ (unsigned long)&get_cpu_entry_area(cpu)->tss + -+ offsetofend(struct tss_struct, SYSENTER_stack), - 0); - - wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); -@@ -1371,6 +1396,8 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks - /* May not be marked __init: used by software suspend */ - void syscall_init(void) - { -+ int cpu = smp_processor_id(); -+ - wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); - wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); - -@@ -1384,7 +1411,7 @@ void syscall_init(void) - */ - wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, -- (unsigned long)this_cpu_ptr(&cpu_tss) + -+ (unsigned long)&get_cpu_entry_area(cpu)->tss + - offsetofend(struct tss_struct, SYSENTER_stack)); - wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); - #else -@@ -1593,11 +1620,13 @@ void cpu_init(void) - BUG_ON(me->mm); - enter_lazy_tlb(&init_mm, me); - -+ setup_cpu_entry_area(cpu); -+ - /* - * Initialize the TSS. 
Don't bother initializing sp0, as the initial - * task never enters user mode. - */ -- set_tss_desc(cpu, &t->x86_tss); -+ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); - load_TR_desc(); - - load_mm_ldt(&init_mm); -@@ -1610,7 +1639,6 @@ void cpu_init(void) - if (is_uv_system()) - uv_cpu_init(); - -- setup_cpu_entry_area(cpu); - load_fixmap_gdt(cpu); - } - -@@ -1650,11 +1678,13 @@ void cpu_init(void) - BUG_ON(curr->mm); - enter_lazy_tlb(&init_mm, curr); - -+ setup_cpu_entry_area(cpu); -+ - /* - * Initialize the TSS. Don't bother initializing sp0, as the initial - * task never enters user mode. - */ -- set_tss_desc(cpu, &t->x86_tss); -+ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); - load_TR_desc(); - - load_mm_ldt(&init_mm); -@@ -1671,7 +1701,6 @@ void cpu_init(void) - - fpu__init_cpu(); - -- setup_cpu_entry_area(cpu); - load_fixmap_gdt(cpu); - } - #endif -diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c -index 0f4b931e1a02..c1f503673f1e 100644 ---- a/arch/x86/kernel/dumpstack.c -+++ b/arch/x86/kernel/dumpstack.c -@@ -45,7 +45,8 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task, - - bool in_sysenter_stack(unsigned long *stack, struct stack_info *info) - { -- struct tss_struct *tss = this_cpu_ptr(&cpu_tss); -+ int cpu = smp_processor_id(); -+ struct tss_struct *tss = &get_cpu_entry_area(cpu)->tss; - - /* Treat the canary as part of the stack for unwinding purposes. */ - void *begin = &tss->SYSENTER_stack_canary; -diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c -index a7c5a47beab7..d61986a36575 100644 ---- a/arch/x86/kvm/vmx.c -+++ b/arch/x86/kvm/vmx.c -@@ -2280,7 +2280,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) - * processors. See 22.2.4. 
- */ - vmcs_writel(HOST_TR_BASE, -- (unsigned long)this_cpu_ptr(&cpu_tss.x86_tss)); -+ (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss); - vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ - - /* -diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c -index 48cd87fc7222..2a717e023c9f 100644 ---- a/arch/x86/power/cpu.c -+++ b/arch/x86/power/cpu.c -@@ -160,18 +160,19 @@ static void do_fpu_end(void) - static void fix_processor_context(void) - { - int cpu = smp_processor_id(); -- struct tss_struct *t = &per_cpu(cpu_tss, cpu); - #ifdef CONFIG_X86_64 - struct desc_struct *desc = get_cpu_gdt_rw(cpu); - tss_desc tss; - #endif - - /* -- * This just modifies memory; should not be necessary. But... This is -- * necessary, because 386 hardware has concept of busy TSS or some -- * similar stupidity. -+ * We need to reload TR, which requires that we change the -+ * GDT entry to indicate "available" first. -+ * -+ * XXX: This could probably all be replaced by a call to -+ * force_reload_TR(). - */ -- set_tss_desc(cpu, &t->x86_tss); -+ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); - - #ifdef CONFIG_X86_64 - memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc)); -diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S -index 0092da1c056f..41e0e103f090 100644 ---- a/arch/x86/entry/entry_32.S -+++ b/arch/x86/entry/entry_32.S -@@ -948,7 +948,8 @@ ENTRY(debug) - movl %esp, %eax # pt_regs pointer - - /* Are we currently on the SYSENTER stack? */ -- PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx) -+ movl PER_CPU_VAR(cpu_entry_area), %ecx -+ addl $CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx - subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ - cmpl $SIZEOF_SYSENTER_stack, %ecx - jb .Ldebug_from_sysenter_stack -@@ -991,7 +992,8 @@ ENTRY(nmi) - movl %esp, %eax # pt_regs pointer - - /* Are we currently on the SYSENTER stack? 
*/ -- PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx) -+ movl PER_CPU_VAR(cpu_entry_area), %ecx -+ addl $CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx - subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ - cmpl $SIZEOF_SYSENTER_stack, %ecx - jb .Lnmi_from_sysenter_stack --- -2.14.2 - diff --git a/patches/kernel/0153-x86-entry-64-Separate-cpu_current_top_of_stack-from-.patch b/patches/kernel/0153-x86-entry-64-Separate-cpu_current_top_of_stack-from-.patch deleted file mode 100644 index c1d06d2..0000000 --- a/patches/kernel/0153-x86-entry-64-Separate-cpu_current_top_of_stack-from-.patch +++ /dev/null @@ -1,161 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Mon, 4 Dec 2017 15:07:21 +0100 -Subject: [PATCH] x86/entry/64: Separate cpu_current_top_of_stack from TSS.sp0 -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -On 64-bit kernels, we used to assume that TSS.sp0 was the current -top of stack. With the addition of an entry trampoline, this will -no longer be the case. Store the current top of stack in TSS.sp1, -which is otherwise unused but shares the same cacheline. - -Signed-off-by: Andy Lutomirski -Signed-off-by: Thomas Gleixner -Reviewed-by: Thomas Gleixner -Reviewed-by: Borislav Petkov -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Rik van Riel -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Link: https://lkml.kernel.org/r/20171204150606.050864668@linutronix.de -Signed-off-by: Ingo Molnar -(cherry picked from commit 9aaefe7b59ae00605256a7d6bd1c1456432495fc) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 281be4ff07f7c67dc2a9c75ab24a7b9ff25544ae) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/processor.h | 18 +++++++++++++----- - arch/x86/include/asm/thread_info.h | 2 +- - arch/x86/kernel/asm-offsets_64.c | 1 + - arch/x86/kernel/process.c | 10 ++++++++++ - arch/x86/kernel/process_64.c | 1 + - 5 files changed, 26 insertions(+), 6 deletions(-) - -diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h -index 55885465c3a7..1bfe4bad797a 100644 ---- a/arch/x86/include/asm/processor.h -+++ b/arch/x86/include/asm/processor.h -@@ -303,7 +303,13 @@ struct x86_hw_tss { - struct x86_hw_tss { - u32 reserved1; - u64 sp0; -+ -+ /* -+ * We store cpu_current_top_of_stack in sp1 so it's always accessible. -+ * Linux does not use ring 1, so sp1 is not otherwise needed. -+ */ - u64 sp1; -+ - u64 sp2; - u64 reserved2; - u64 ist[7]; -@@ -362,6 +368,8 @@ DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss); - - #ifdef CONFIG_X86_32 - DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); -+#else -+#define cpu_current_top_of_stack cpu_tss.x86_tss.sp1 - #endif - - /* -@@ -533,12 +541,12 @@ static inline void native_swapgs(void) - - static inline unsigned long current_top_of_stack(void) - { --#ifdef CONFIG_X86_64 -- return this_cpu_read_stable(cpu_tss.x86_tss.sp0); --#else -- /* sp0 on x86_32 is special in and around vm86 mode. 
*/ -+ /* -+ * We can't read directly from tss.sp0: sp0 on x86_32 is special in -+ * and around vm86 mode and sp0 on x86_64 is special because of the -+ * entry trampoline. -+ */ - return this_cpu_read_stable(cpu_current_top_of_stack); --#endif - } - - static inline bool on_thread_stack(void) -diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h -index ec8ef3bbb7dc..760dd8a73927 100644 ---- a/arch/x86/include/asm/thread_info.h -+++ b/arch/x86/include/asm/thread_info.h -@@ -214,7 +214,7 @@ static inline int arch_within_stack_frames(const void * const stack, - #else /* !__ASSEMBLY__ */ - - #ifdef CONFIG_X86_64 --# define cpu_current_top_of_stack (cpu_tss + TSS_sp0) -+# define cpu_current_top_of_stack (cpu_tss + TSS_sp1) - #endif - - #endif -diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c -index c21a5315b38e..048f68ff3396 100644 ---- a/arch/x86/kernel/asm-offsets_64.c -+++ b/arch/x86/kernel/asm-offsets_64.c -@@ -65,6 +65,7 @@ int main(void) - - OFFSET(TSS_ist, tss_struct, x86_tss.ist); - OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); -+ OFFSET(TSS_sp1, tss_struct, x86_tss.sp1); - BLANK(); - - #ifdef CONFIG_CC_STACKPROTECTOR -diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c -index aa86e810fb54..407fc37a8718 100644 ---- a/arch/x86/kernel/process.c -+++ b/arch/x86/kernel/process.c -@@ -55,6 +55,16 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { - * Poison it. - */ - .sp0 = (1UL << (BITS_PER_LONG-1)) + 1, -+ -+#ifdef CONFIG_X86_64 -+ /* -+ * .sp1 is cpu_current_top_of_stack. The init task never -+ * runs user code, but cpu_current_top_of_stack should still -+ * be well defined before the first context switch. 
-+ */ -+ .sp1 = TOP_OF_INIT_STACK, -+#endif -+ - #ifdef CONFIG_X86_32 - .ss0 = __KERNEL_DS, - .ss1 = __KERNEL_CS, -diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c -index 01b119bebb68..157f81816915 100644 ---- a/arch/x86/kernel/process_64.c -+++ b/arch/x86/kernel/process_64.c -@@ -461,6 +461,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) - * Switch the PDA and FPU contexts. - */ - this_cpu_write(current_task, next_p); -+ this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); - - /* Reload sp0. */ - update_sp0(next_p); --- -2.14.2 - diff --git a/patches/kernel/0153-x86-entry-Remap-the-TSS-into-the-CPU-entry-area.patch b/patches/kernel/0153-x86-entry-Remap-the-TSS-into-the-CPU-entry-area.patch new file mode 100644 index 0000000..b42d76d --- /dev/null +++ b/patches/kernel/0153-x86-entry-Remap-the-TSS-into-the-CPU-entry-area.patch @@ -0,0 +1,286 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Mon, 4 Dec 2017 15:07:20 +0100 +Subject: [PATCH] x86/entry: Remap the TSS into the CPU entry area +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +This has a secondary purpose: it puts the entry stack into a region +with a well-controlled layout. A subsequent patch will take +advantage of this to streamline the SYSCALL entry code to be able to +find it more easily. + +Signed-off-by: Andy Lutomirski +Signed-off-by: Thomas Gleixner +Reviewed-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Rik van Riel +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150605.962042855@linutronix.de +Signed-off-by: Ingo Molnar +(cherry picked from commit 72f5e08dbba2d01aa90b592cf76c378ea233b00b) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 475b37e78defbc4cb91d54e2bcf18aa75611bb3a) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/fixmap.h | 7 +++++++ + arch/x86/kernel/asm-offsets.c | 3 +++ + arch/x86/kernel/cpu/common.c | 41 +++++++++++++++++++++++++++++++++++------ + arch/x86/kernel/dumpstack.c | 3 ++- + arch/x86/kvm/vmx.c | 2 +- + arch/x86/power/cpu.c | 11 ++++++----- + arch/x86/entry/entry_32.S | 6 ++++-- + 7 files changed, 58 insertions(+), 15 deletions(-) + +diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h +index 8c6ed66fe957..c92fc30e6def 100644 +--- a/arch/x86/include/asm/fixmap.h ++++ b/arch/x86/include/asm/fixmap.h +@@ -54,6 +54,13 @@ extern unsigned long __FIXADDR_TOP; + */ + struct cpu_entry_area { + char gdt[PAGE_SIZE]; ++ ++ /* ++ * The GDT is just below cpu_tss and thus serves (on x86_64) as a ++ * a read-only guard page for the SYSENTER stack at the bottom ++ * of the TSS region. 
++ */ ++ struct tss_struct tss; + }; + + #define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE) +diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c +index 031bd35bd911..f765c3253ec3 100644 +--- a/arch/x86/kernel/asm-offsets.c ++++ b/arch/x86/kernel/asm-offsets.c +@@ -97,4 +97,7 @@ void common(void) { + OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack); + /* Size of SYSENTER_stack */ + DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack)); ++ ++ /* Layout info for cpu_entry_area */ ++ OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); + } +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index e61eff11f562..4a38de4c6ede 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -466,6 +466,22 @@ void load_percpu_segment(int cpu) + load_stack_canary_segment(); + } + ++static void set_percpu_fixmap_pages(int fixmap_index, void *ptr, ++ int pages, pgprot_t prot) ++{ ++ int i; ++ ++ for (i = 0; i < pages; i++) { ++ __set_fixmap(fixmap_index - i, ++ per_cpu_ptr_to_phys(ptr + i * PAGE_SIZE), prot); ++ } ++} ++ ++#ifdef CONFIG_X86_32 ++/* The 32-bit entry code needs to find cpu_entry_area. 
*/ ++DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); ++#endif ++ + /* Setup the fixmap mappings only once per-processor */ + static inline void setup_cpu_entry_area(int cpu) + { +@@ -507,7 +523,15 @@ static inline void setup_cpu_entry_area(int cpu) + */ + BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ + offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); ++ BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); ++ set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss), ++ &per_cpu(cpu_tss, cpu), ++ sizeof(struct tss_struct) / PAGE_SIZE, ++ PAGE_KERNEL); + ++#ifdef CONFIG_X86_32 ++ this_cpu_write(cpu_entry_area, get_cpu_entry_area(cpu)); ++#endif + } + + /* Load the original GDT from the per-cpu structure */ +@@ -1249,7 +1273,8 @@ void enable_sep_cpu(void) + wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); + + wrmsr(MSR_IA32_SYSENTER_ESP, +- (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack), ++ (unsigned long)&get_cpu_entry_area(cpu)->tss + ++ offsetofend(struct tss_struct, SYSENTER_stack), + 0); + + wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); +@@ -1371,6 +1396,8 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks + /* May not be marked __init: used by software suspend */ + void syscall_init(void) + { ++ int cpu = smp_processor_id(); ++ + wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); + wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); + +@@ -1384,7 +1411,7 @@ void syscall_init(void) + */ + wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, +- (unsigned long)this_cpu_ptr(&cpu_tss) + ++ (unsigned long)&get_cpu_entry_area(cpu)->tss + + offsetofend(struct tss_struct, SYSENTER_stack)); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); + #else +@@ -1593,11 +1620,13 @@ void cpu_init(void) + BUG_ON(me->mm); + enter_lazy_tlb(&init_mm, me); + ++ setup_cpu_entry_area(cpu); ++ + /* + * Initialize the TSS. 
Don't bother initializing sp0, as the initial + * task never enters user mode. + */ +- set_tss_desc(cpu, &t->x86_tss); ++ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); + load_TR_desc(); + + load_mm_ldt(&init_mm); +@@ -1610,7 +1639,6 @@ void cpu_init(void) + if (is_uv_system()) + uv_cpu_init(); + +- setup_cpu_entry_area(cpu); + load_fixmap_gdt(cpu); + } + +@@ -1650,11 +1678,13 @@ void cpu_init(void) + BUG_ON(curr->mm); + enter_lazy_tlb(&init_mm, curr); + ++ setup_cpu_entry_area(cpu); ++ + /* + * Initialize the TSS. Don't bother initializing sp0, as the initial + * task never enters user mode. + */ +- set_tss_desc(cpu, &t->x86_tss); ++ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); + load_TR_desc(); + + load_mm_ldt(&init_mm); +@@ -1671,7 +1701,6 @@ void cpu_init(void) + + fpu__init_cpu(); + +- setup_cpu_entry_area(cpu); + load_fixmap_gdt(cpu); + } + #endif +diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c +index 0f4b931e1a02..c1f503673f1e 100644 +--- a/arch/x86/kernel/dumpstack.c ++++ b/arch/x86/kernel/dumpstack.c +@@ -45,7 +45,8 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task, + + bool in_sysenter_stack(unsigned long *stack, struct stack_info *info) + { +- struct tss_struct *tss = this_cpu_ptr(&cpu_tss); ++ int cpu = smp_processor_id(); ++ struct tss_struct *tss = &get_cpu_entry_area(cpu)->tss; + + /* Treat the canary as part of the stack for unwinding purposes. */ + void *begin = &tss->SYSENTER_stack_canary; +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c +index a7c5a47beab7..d61986a36575 100644 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -2280,7 +2280,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) + * processors. See 22.2.4. 
+ */ + vmcs_writel(HOST_TR_BASE, +- (unsigned long)this_cpu_ptr(&cpu_tss.x86_tss)); ++ (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss); + vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ + + /* +diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c +index 48cd87fc7222..2a717e023c9f 100644 +--- a/arch/x86/power/cpu.c ++++ b/arch/x86/power/cpu.c +@@ -160,18 +160,19 @@ static void do_fpu_end(void) + static void fix_processor_context(void) + { + int cpu = smp_processor_id(); +- struct tss_struct *t = &per_cpu(cpu_tss, cpu); + #ifdef CONFIG_X86_64 + struct desc_struct *desc = get_cpu_gdt_rw(cpu); + tss_desc tss; + #endif + + /* +- * This just modifies memory; should not be necessary. But... This is +- * necessary, because 386 hardware has concept of busy TSS or some +- * similar stupidity. ++ * We need to reload TR, which requires that we change the ++ * GDT entry to indicate "available" first. ++ * ++ * XXX: This could probably all be replaced by a call to ++ * force_reload_TR(). + */ +- set_tss_desc(cpu, &t->x86_tss); ++ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); + + #ifdef CONFIG_X86_64 + memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc)); +diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S +index 0092da1c056f..41e0e103f090 100644 +--- a/arch/x86/entry/entry_32.S ++++ b/arch/x86/entry/entry_32.S +@@ -948,7 +948,8 @@ ENTRY(debug) + movl %esp, %eax # pt_regs pointer + + /* Are we currently on the SYSENTER stack? */ +- PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx) ++ movl PER_CPU_VAR(cpu_entry_area), %ecx ++ addl $CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx + subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ + cmpl $SIZEOF_SYSENTER_stack, %ecx + jb .Ldebug_from_sysenter_stack +@@ -991,7 +992,8 @@ ENTRY(nmi) + movl %esp, %eax # pt_regs pointer + + /* Are we currently on the SYSENTER stack? 
*/ +- PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx) ++ movl PER_CPU_VAR(cpu_entry_area), %ecx ++ addl $CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx + subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ + cmpl $SIZEOF_SYSENTER_stack, %ecx + jb .Lnmi_from_sysenter_stack +-- +2.14.2 + diff --git a/patches/kernel/0154-x86-entry-64-Separate-cpu_current_top_of_stack-from-.patch b/patches/kernel/0154-x86-entry-64-Separate-cpu_current_top_of_stack-from-.patch new file mode 100644 index 0000000..c1d06d2 --- /dev/null +++ b/patches/kernel/0154-x86-entry-64-Separate-cpu_current_top_of_stack-from-.patch @@ -0,0 +1,161 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Mon, 4 Dec 2017 15:07:21 +0100 +Subject: [PATCH] x86/entry/64: Separate cpu_current_top_of_stack from TSS.sp0 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +On 64-bit kernels, we used to assume that TSS.sp0 was the current +top of stack. With the addition of an entry trampoline, this will +no longer be the case. Store the current top of stack in TSS.sp1, +which is otherwise unused but shares the same cacheline. + +Signed-off-by: Andy Lutomirski +Signed-off-by: Thomas Gleixner +Reviewed-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Rik van Riel +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150606.050864668@linutronix.de +Signed-off-by: Ingo Molnar +(cherry picked from commit 9aaefe7b59ae00605256a7d6bd1c1456432495fc) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 281be4ff07f7c67dc2a9c75ab24a7b9ff25544ae) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/processor.h | 18 +++++++++++++----- + arch/x86/include/asm/thread_info.h | 2 +- + arch/x86/kernel/asm-offsets_64.c | 1 + + arch/x86/kernel/process.c | 10 ++++++++++ + arch/x86/kernel/process_64.c | 1 + + 5 files changed, 26 insertions(+), 6 deletions(-) + +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index 55885465c3a7..1bfe4bad797a 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -303,7 +303,13 @@ struct x86_hw_tss { + struct x86_hw_tss { + u32 reserved1; + u64 sp0; ++ ++ /* ++ * We store cpu_current_top_of_stack in sp1 so it's always accessible. ++ * Linux does not use ring 1, so sp1 is not otherwise needed. ++ */ + u64 sp1; ++ + u64 sp2; + u64 reserved2; + u64 ist[7]; +@@ -362,6 +368,8 @@ DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss); + + #ifdef CONFIG_X86_32 + DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); ++#else ++#define cpu_current_top_of_stack cpu_tss.x86_tss.sp1 + #endif + + /* +@@ -533,12 +541,12 @@ static inline void native_swapgs(void) + + static inline unsigned long current_top_of_stack(void) + { +-#ifdef CONFIG_X86_64 +- return this_cpu_read_stable(cpu_tss.x86_tss.sp0); +-#else +- /* sp0 on x86_32 is special in and around vm86 mode. 
*/ ++ /* ++ * We can't read directly from tss.sp0: sp0 on x86_32 is special in ++ * and around vm86 mode and sp0 on x86_64 is special because of the ++ * entry trampoline. ++ */ + return this_cpu_read_stable(cpu_current_top_of_stack); +-#endif + } + + static inline bool on_thread_stack(void) +diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h +index ec8ef3bbb7dc..760dd8a73927 100644 +--- a/arch/x86/include/asm/thread_info.h ++++ b/arch/x86/include/asm/thread_info.h +@@ -214,7 +214,7 @@ static inline int arch_within_stack_frames(const void * const stack, + #else /* !__ASSEMBLY__ */ + + #ifdef CONFIG_X86_64 +-# define cpu_current_top_of_stack (cpu_tss + TSS_sp0) ++# define cpu_current_top_of_stack (cpu_tss + TSS_sp1) + #endif + + #endif +diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c +index c21a5315b38e..048f68ff3396 100644 +--- a/arch/x86/kernel/asm-offsets_64.c ++++ b/arch/x86/kernel/asm-offsets_64.c +@@ -65,6 +65,7 @@ int main(void) + + OFFSET(TSS_ist, tss_struct, x86_tss.ist); + OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); ++ OFFSET(TSS_sp1, tss_struct, x86_tss.sp1); + BLANK(); + + #ifdef CONFIG_CC_STACKPROTECTOR +diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c +index aa86e810fb54..407fc37a8718 100644 +--- a/arch/x86/kernel/process.c ++++ b/arch/x86/kernel/process.c +@@ -55,6 +55,16 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { + * Poison it. + */ + .sp0 = (1UL << (BITS_PER_LONG-1)) + 1, ++ ++#ifdef CONFIG_X86_64 ++ /* ++ * .sp1 is cpu_current_top_of_stack. The init task never ++ * runs user code, but cpu_current_top_of_stack should still ++ * be well defined before the first context switch. 
++ */ ++ .sp1 = TOP_OF_INIT_STACK, ++#endif ++ + #ifdef CONFIG_X86_32 + .ss0 = __KERNEL_DS, + .ss1 = __KERNEL_CS, +diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c +index 01b119bebb68..157f81816915 100644 +--- a/arch/x86/kernel/process_64.c ++++ b/arch/x86/kernel/process_64.c +@@ -461,6 +461,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) + * Switch the PDA and FPU contexts. + */ + this_cpu_write(current_task, next_p); ++ this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); + + /* Reload sp0. */ + update_sp0(next_p); +-- +2.14.2 + diff --git a/patches/kernel/0154-x86-espfix-64-Stop-assuming-that-pt_regs-is-on-the-e.patch b/patches/kernel/0154-x86-espfix-64-Stop-assuming-that-pt_regs-is-on-the-e.patch deleted file mode 100644 index 8499516..0000000 --- a/patches/kernel/0154-x86-espfix-64-Stop-assuming-that-pt_regs-is-on-the-e.patch +++ /dev/null @@ -1,124 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Mon, 4 Dec 2017 15:07:22 +0100 -Subject: [PATCH] x86/espfix/64: Stop assuming that pt_regs is on the entry - stack -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -When we start using an entry trampoline, a #GP from userspace will -be delivered on the entry stack, not on the task stack. Fix the -espfix64 #DF fixup to set up #GP according to TSS.SP0, rather than -assuming that pt_regs + 1 == SP0. This won't change anything -without an entry stack, but it will make the code continue to work -when an entry stack is added. - -While we're at it, improve the comments to explain what's actually -going on. 
- -Signed-off-by: Andy Lutomirski -Signed-off-by: Thomas Gleixner -Reviewed-by: Thomas Gleixner -Reviewed-by: Borislav Petkov -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Rik van Riel -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Link: https://lkml.kernel.org/r/20171204150606.130778051@linutronix.de -Signed-off-by: Ingo Molnar -(cherry picked from commit 6d9256f0a89eaff97fca6006100bcaea8d1d8bdb) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit f5d8df279d00c22e4c338a5891a874a59947e5f5) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/traps.c | 37 ++++++++++++++++++++++++++++--------- - 1 file changed, 28 insertions(+), 9 deletions(-) - -diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c -index 7b1d0df624cf..b69db1ee8733 100644 ---- a/arch/x86/kernel/traps.c -+++ b/arch/x86/kernel/traps.c -@@ -360,9 +360,15 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) - - /* - * If IRET takes a non-IST fault on the espfix64 stack, then we -- * end up promoting it to a doublefault. In that case, modify -- * the stack to make it look like we just entered the #GP -- * handler from user space, similar to bad_iret. -+ * end up promoting it to a doublefault. In that case, take -+ * advantage of the fact that we're not using the normal (TSS.sp0) -+ * stack right now. We can write a fake #GP(0) frame at TSS.sp0 -+ * and then modify our own IRET frame so that, when we return, -+ * we land directly at the #GP(0) vector with the stack already -+ * set up according to its expectations. 
-+ * -+ * The net result is that our #GP handler will think that we -+ * entered from usermode with the bad user context. - * - * No need for ist_enter here because we don't use RCU. - */ -@@ -370,13 +376,26 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) - regs->cs == __KERNEL_CS && - regs->ip == (unsigned long)native_irq_return_iret) - { -- struct pt_regs *normal_regs = task_pt_regs(current); -+ struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss.x86_tss.sp0) - 1; -+ -+ /* -+ * regs->sp points to the failing IRET frame on the -+ * ESPFIX64 stack. Copy it to the entry stack. This fills -+ * in gpregs->ss through gpregs->ip. -+ * -+ */ -+ memmove(&gpregs->ip, (void *)regs->sp, 5*8); -+ gpregs->orig_ax = 0; /* Missing (lost) #GP error code */ - -- /* Fake a #GP(0) from userspace. */ -- memmove(&normal_regs->ip, (void *)regs->sp, 5*8); -- normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */ -+ /* -+ * Adjust our frame so that we return straight to the #GP -+ * vector with the expected RSP value. This is safe because -+ * we won't enable interupts or schedule before we invoke -+ * general_protection, so nothing will clobber the stack -+ * frame we just set up. -+ */ - regs->ip = (unsigned long)general_protection; -- regs->sp = (unsigned long)&normal_regs->orig_ax; -+ regs->sp = (unsigned long)&gpregs->orig_ax; - - return; - } -@@ -401,7 +420,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) - * - * Processors update CR2 whenever a page fault is detected. If a - * second page fault occurs while an earlier page fault is being -- * deliv- ered, the faulting linear address of the second fault will -+ * delivered, the faulting linear address of the second fault will - * overwrite the contents of CR2 (replacing the previous - * address). 
These updates to CR2 occur even if the page fault - * results in a double fault or occurs during the delivery of a --- -2.14.2 - diff --git a/patches/kernel/0155-x86-entry-64-Use-a-per-CPU-trampoline-stack-for-IDT-.patch b/patches/kernel/0155-x86-entry-64-Use-a-per-CPU-trampoline-stack-for-IDT-.patch deleted file mode 100644 index bfea36c..0000000 --- a/patches/kernel/0155-x86-entry-64-Use-a-per-CPU-trampoline-stack-for-IDT-.patch +++ /dev/null @@ -1,295 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Mon, 4 Dec 2017 15:07:23 +0100 -Subject: [PATCH] x86/entry/64: Use a per-CPU trampoline stack for IDT entries -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Historically, IDT entries from usermode have always gone directly -to the running task's kernel stack. Rearrange it so that we enter on -a per-CPU trampoline stack and then manually switch to the task's stack. -This touches a couple of extra cachelines, but it gives us a chance -to run some code before we touch the kernel stack. - -The asm isn't exactly beautiful, but I think that fully refactoring -it can wait. - -Signed-off-by: Andy Lutomirski -Signed-off-by: Thomas Gleixner -Reviewed-by: Borislav Petkov -Reviewed-by: Thomas Gleixner -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Rik van Riel -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Link: https://lkml.kernel.org/r/20171204150606.225330557@linutronix.de -Signed-off-by: Ingo Molnar -(cherry picked from commit 7f2590a110b837af5679d08fc25c6227c5a8c497) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit bfb2d0ede023853fb8c24d3dae8974cb2f7117c3) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/switch_to.h | 4 ++- - arch/x86/include/asm/traps.h | 1 - - arch/x86/kernel/cpu/common.c | 6 ++-- - arch/x86/kernel/traps.c | 21 +++++++------ - arch/x86/entry/entry_64.S | 67 ++++++++++++++++++++++++++++++---------- - arch/x86/entry/entry_64_compat.S | 5 ++- - 6 files changed, 72 insertions(+), 32 deletions(-) - -diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h -index 010cd6e4eafc..ca2fc84ad278 100644 ---- a/arch/x86/include/asm/switch_to.h -+++ b/arch/x86/include/asm/switch_to.h -@@ -89,10 +89,12 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread) - /* This is used when switching tasks or entering/exiting vm86 mode. 
*/ - static inline void update_sp0(struct task_struct *task) - { -+ /* On x86_64, sp0 always points to the entry trampoline stack, which is constant: */ - #ifdef CONFIG_X86_32 - load_sp0(task->thread.sp0); - #else -- load_sp0(task_top_of_stack(task)); -+ if (static_cpu_has(X86_FEATURE_XENPV)) -+ load_sp0(task_top_of_stack(task)); - #endif - } - -diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h -index b052a7621ca1..c3b652672d6f 100644 ---- a/arch/x86/include/asm/traps.h -+++ b/arch/x86/include/asm/traps.h -@@ -92,7 +92,6 @@ dotraplinkage void do_segment_not_present(struct pt_regs *, long); - dotraplinkage void do_stack_segment(struct pt_regs *, long); - #ifdef CONFIG_X86_64 - dotraplinkage void do_double_fault(struct pt_regs *, long); --asmlinkage struct pt_regs *sync_regs(struct pt_regs *); - #endif - dotraplinkage void do_general_protection(struct pt_regs *, long); - dotraplinkage void do_page_fault(struct pt_regs *, unsigned long); -diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c -index 4a38de4c6ede..404e4b75db6e 100644 ---- a/arch/x86/kernel/cpu/common.c -+++ b/arch/x86/kernel/cpu/common.c -@@ -1623,11 +1623,13 @@ void cpu_init(void) - setup_cpu_entry_area(cpu); - - /* -- * Initialize the TSS. Don't bother initializing sp0, as the initial -- * task never enters user mode. -+ * Initialize the TSS. sp0 points to the entry trampoline stack -+ * regardless of what task is running. 
- */ - set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); - load_TR_desc(); -+ load_sp0((unsigned long)&get_cpu_entry_area(cpu)->tss + -+ offsetofend(struct tss_struct, SYSENTER_stack)); - - load_mm_ldt(&init_mm); - -diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c -index b69db1ee8733..d9debdafe7a6 100644 ---- a/arch/x86/kernel/traps.c -+++ b/arch/x86/kernel/traps.c -@@ -631,14 +631,15 @@ NOKPROBE_SYMBOL(do_int3); - - #ifdef CONFIG_X86_64 - /* -- * Help handler running on IST stack to switch off the IST stack if the -- * interrupted code was in user mode. The actual stack switch is done in -- * entry_64.S -+ * Help handler running on a per-cpu (IST or entry trampoline) stack -+ * to switch to the normal thread stack if the interrupted code was in -+ * user mode. The actual stack switch is done in entry_64.S - */ - asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs) - { -- struct pt_regs *regs = task_pt_regs(current); -- *regs = *eregs; -+ struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1; -+ if (regs != eregs) -+ *regs = *eregs; - return regs; - } - NOKPROBE_SYMBOL(sync_regs); -@@ -654,13 +655,13 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) - /* - * This is called from entry_64.S early in handling a fault - * caused by a bad iret to user mode. To handle the fault -- * correctly, we want move our stack frame to task_pt_regs -- * and we want to pretend that the exception came from the -- * iret target. -+ * correctly, we want to move our stack frame to where it would -+ * be had we entered directly on the entry stack (rather than -+ * just below the IRET frame) and we want to pretend that the -+ * exception came from the IRET target. - */ - struct bad_iret_stack *new_stack = -- container_of(task_pt_regs(current), -- struct bad_iret_stack, regs); -+ (struct bad_iret_stack *)this_cpu_read(cpu_tss.x86_tss.sp0) - 1; - - /* Copy the IRET target to the new stack. 
*/ - memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8); -diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S -index 6c73e96daf78..f70fedc58bac 100644 ---- a/arch/x86/entry/entry_64.S -+++ b/arch/x86/entry/entry_64.S -@@ -559,6 +559,13 @@ END(irq_entries_start) - /* 0(%rsp): ~(interrupt number) */ - .macro interrupt func - cld -+ -+ testb $3, CS-ORIG_RAX(%rsp) -+ jz 1f -+ SWAPGS -+ call switch_to_thread_stack -+1: -+ - ALLOC_PT_GPREGS_ON_STACK - SAVE_C_REGS - SAVE_EXTRA_REGS -@@ -568,12 +575,8 @@ END(irq_entries_start) - jz 1f - - /* -- * IRQ from user mode. Switch to kernel gsbase and inform context -- * tracking that we're in kernel mode. -- */ -- SWAPGS -- -- /* -+ * IRQ from user mode. -+ * - * We need to tell lockdep that IRQs are off. We can't do this until - * we fix gsbase, and we should do it before enter_from_user_mode - * (which can take locks). Since TRACE_IRQS_OFF idempotent, -@@ -840,6 +843,32 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt - */ - #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8) - -+/* -+ * Switch to the thread stack. This is called with the IRET frame and -+ * orig_ax on the stack. (That is, RDI..R12 are not on the stack and -+ * space has not been allocated for them.) 
-+ */ -+ENTRY(switch_to_thread_stack) -+ UNWIND_HINT_FUNC -+ -+ pushq %rdi -+ movq %rsp, %rdi -+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp -+ UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI -+ -+ pushq 7*8(%rdi) /* regs->ss */ -+ pushq 6*8(%rdi) /* regs->rsp */ -+ pushq 5*8(%rdi) /* regs->eflags */ -+ pushq 4*8(%rdi) /* regs->cs */ -+ pushq 3*8(%rdi) /* regs->ip */ -+ pushq 2*8(%rdi) /* regs->orig_ax */ -+ pushq 8(%rdi) /* return address */ -+ UNWIND_HINT_FUNC -+ -+ movq (%rdi), %rdi -+ ret -+END(switch_to_thread_stack) -+ - .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 - ENTRY(\sym) - UNWIND_HINT_IRET_REGS offset=\has_error_code*8 -@@ -857,11 +886,12 @@ ENTRY(\sym) - - ALLOC_PT_GPREGS_ON_STACK - -- .if \paranoid -- .if \paranoid == 1 -+ .if \paranoid < 2 - testb $3, CS(%rsp) /* If coming from userspace, switch stacks */ -- jnz 1f -+ jnz .Lfrom_usermode_switch_stack_\@ - .endif -+ -+ .if \paranoid - call paranoid_entry - .else - call error_entry -@@ -903,20 +933,15 @@ ENTRY(\sym) - jmp error_exit - .endif - -- .if \paranoid == 1 -+ .if \paranoid < 2 - /* -- * Paranoid entry from userspace. Switch stacks and treat it -+ * Entry from userspace. Switch stacks and treat it - * as a normal entry. This means that paranoid handlers - * run in real process context if user_mode(regs). - */ --1: -+.Lfrom_usermode_switch_stack_\@: - call error_entry - -- -- movq %rsp, %rdi /* pt_regs pointer */ -- call sync_regs -- movq %rax, %rsp /* switch stack */ -- - movq %rsp, %rdi /* pt_regs pointer */ - - .if \has_error_code -@@ -1177,6 +1202,14 @@ ENTRY(error_entry) - SWAPGS - - .Lerror_entry_from_usermode_after_swapgs: -+ /* Put us onto the real thread stack. */ -+ popq %r12 /* save return addr in %12 */ -+ movq %rsp, %rdi /* arg0 = pt_regs pointer */ -+ call sync_regs -+ movq %rax, %rsp /* switch stack */ -+ ENCODE_FRAME_POINTER -+ pushq %r12 -+ - /* - * We need to tell lockdep that IRQs are off. 
We can't do this until - * we fix gsbase, and we should do it before enter_from_user_mode -diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S -index 1f76b66518ee..2270601b6218 100644 ---- a/arch/x86/entry/entry_64_compat.S -+++ b/arch/x86/entry/entry_64_compat.S -@@ -305,8 +305,11 @@ ENTRY(entry_INT80_compat) - */ - movl %eax, %eax - -- /* Construct struct pt_regs on stack (iret frame is already on stack) */ - pushq %rax /* pt_regs->orig_ax */ -+ -+ /* switch to thread stack expects orig_ax to be pushed */ -+ call switch_to_thread_stack -+ - pushq %rdi /* pt_regs->di */ - pushq %rsi /* pt_regs->si */ - pushq %rdx /* pt_regs->dx */ --- -2.14.2 - diff --git a/patches/kernel/0155-x86-espfix-64-Stop-assuming-that-pt_regs-is-on-the-e.patch b/patches/kernel/0155-x86-espfix-64-Stop-assuming-that-pt_regs-is-on-the-e.patch new file mode 100644 index 0000000..8499516 --- /dev/null +++ b/patches/kernel/0155-x86-espfix-64-Stop-assuming-that-pt_regs-is-on-the-e.patch @@ -0,0 +1,124 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Mon, 4 Dec 2017 15:07:22 +0100 +Subject: [PATCH] x86/espfix/64: Stop assuming that pt_regs is on the entry + stack +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +When we start using an entry trampoline, a #GP from userspace will +be delivered on the entry stack, not on the task stack. Fix the +espfix64 #DF fixup to set up #GP according to TSS.SP0, rather than +assuming that pt_regs + 1 == SP0. This won't change anything +without an entry stack, but it will make the code continue to work +when an entry stack is added. + +While we're at it, improve the comments to explain what's actually +going on. 
+ +Signed-off-by: Andy Lutomirski +Signed-off-by: Thomas Gleixner +Reviewed-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Rik van Riel +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150606.130778051@linutronix.de +Signed-off-by: Ingo Molnar +(cherry picked from commit 6d9256f0a89eaff97fca6006100bcaea8d1d8bdb) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit f5d8df279d00c22e4c338a5891a874a59947e5f5) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/traps.c | 37 ++++++++++++++++++++++++++++--------- + 1 file changed, 28 insertions(+), 9 deletions(-) + +diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c +index 7b1d0df624cf..b69db1ee8733 100644 +--- a/arch/x86/kernel/traps.c ++++ b/arch/x86/kernel/traps.c +@@ -360,9 +360,15 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) + + /* + * If IRET takes a non-IST fault on the espfix64 stack, then we +- * end up promoting it to a doublefault. In that case, modify +- * the stack to make it look like we just entered the #GP +- * handler from user space, similar to bad_iret. ++ * end up promoting it to a doublefault. In that case, take ++ * advantage of the fact that we're not using the normal (TSS.sp0) ++ * stack right now. We can write a fake #GP(0) frame at TSS.sp0 ++ * and then modify our own IRET frame so that, when we return, ++ * we land directly at the #GP(0) vector with the stack already ++ * set up according to its expectations. 
++ * ++ * The net result is that our #GP handler will think that we ++ * entered from usermode with the bad user context. + * + * No need for ist_enter here because we don't use RCU. + */ +@@ -370,13 +376,26 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) + regs->cs == __KERNEL_CS && + regs->ip == (unsigned long)native_irq_return_iret) + { +- struct pt_regs *normal_regs = task_pt_regs(current); ++ struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss.x86_tss.sp0) - 1; ++ ++ /* ++ * regs->sp points to the failing IRET frame on the ++ * ESPFIX64 stack. Copy it to the entry stack. This fills ++ * in gpregs->ss through gpregs->ip. ++ * ++ */ ++ memmove(&gpregs->ip, (void *)regs->sp, 5*8); ++ gpregs->orig_ax = 0; /* Missing (lost) #GP error code */ + +- /* Fake a #GP(0) from userspace. */ +- memmove(&normal_regs->ip, (void *)regs->sp, 5*8); +- normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */ ++ /* ++ * Adjust our frame so that we return straight to the #GP ++ * vector with the expected RSP value. This is safe because ++ * we won't enable interupts or schedule before we invoke ++ * general_protection, so nothing will clobber the stack ++ * frame we just set up. ++ */ + regs->ip = (unsigned long)general_protection; +- regs->sp = (unsigned long)&normal_regs->orig_ax; ++ regs->sp = (unsigned long)&gpregs->orig_ax; + + return; + } +@@ -401,7 +420,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) + * + * Processors update CR2 whenever a page fault is detected. If a + * second page fault occurs while an earlier page fault is being +- * deliv- ered, the faulting linear address of the second fault will ++ * delivered, the faulting linear address of the second fault will + * overwrite the contents of CR2 (replacing the previous + * address). 
These updates to CR2 occur even if the page fault + * results in a double fault or occurs during the delivery of a +-- +2.14.2 + diff --git a/patches/kernel/0156-x86-entry-64-Return-to-userspace-from-the-trampoline.patch b/patches/kernel/0156-x86-entry-64-Return-to-userspace-from-the-trampoline.patch deleted file mode 100644 index 20025ac..0000000 --- a/patches/kernel/0156-x86-entry-64-Return-to-userspace-from-the-trampoline.patch +++ /dev/null @@ -1,133 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Mon, 4 Dec 2017 15:07:24 +0100 -Subject: [PATCH] x86/entry/64: Return to userspace from the trampoline stack -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -By itself, this is useless. It gives us the ability to run some final code -before exit that cannnot run on the kernel stack. This could include a CR3 -switch a la PAGE_TABLE_ISOLATION or some kernel stack erasing, for -example. (Or even weird things like *changing* which kernel stack gets -used as an ASLR-strengthening mechanism.) - -The SYSRET32 path is not covered yet. It could be in the future or -we could just ignore it and force the slow path if needed. - -Signed-off-by: Andy Lutomirski -Signed-off-by: Thomas Gleixner -Reviewed-by: Thomas Gleixner -Reviewed-by: Borislav Petkov -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Rik van Riel -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Link: https://lkml.kernel.org/r/20171204150606.306546484@linutronix.de -Signed-off-by: Ingo Molnar -(cherry picked from commit 3e3b9293d392c577b62e24e4bc9982320438e749) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 40eb58584f732a2fefb5959e79e408bedeaaa43c) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/entry/entry_64.S | 55 +++++++++++++++++++++++++++++++++++++++++++---- - 1 file changed, 51 insertions(+), 4 deletions(-) - -diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S -index f70fedc58bac..4abe5b806d2a 100644 ---- a/arch/x86/entry/entry_64.S -+++ b/arch/x86/entry/entry_64.S -@@ -325,8 +325,24 @@ syscall_return_via_sysret: - popq %rsi /* skip rcx */ - popq %rdx - popq %rsi -+ -+ /* -+ * Now all regs are restored except RSP and RDI. -+ * Save old stack pointer and switch to trampoline stack. -+ */ -+ movq %rsp, %rdi -+ movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp -+ -+ pushq RSP-RDI(%rdi) /* RSP */ -+ pushq (%rdi) /* RDI */ -+ -+ /* -+ * We are on the trampoline stack. All regs except RDI are live. -+ * We can do future final exit work right here. -+ */ -+ - popq %rdi -- movq RSP-ORIG_RAX(%rsp), %rsp -+ popq %rsp - USERGS_SYSRET64 - END(entry_SYSCALL_64) - -@@ -629,10 +645,41 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode) - ud2 - 1: - #endif -- SWAPGS - POP_EXTRA_REGS -- POP_C_REGS -- addq $8, %rsp /* skip regs->orig_ax */ -+ popq %r11 -+ popq %r10 -+ popq %r9 -+ popq %r8 -+ popq %rax -+ popq %rcx -+ popq %rdx -+ popq %rsi -+ -+ /* -+ * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS. -+ * Save old stack pointer and switch to trampoline stack. 
-+ */ -+ movq %rsp, %rdi -+ movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp -+ -+ /* Copy the IRET frame to the trampoline stack. */ -+ pushq 6*8(%rdi) /* SS */ -+ pushq 5*8(%rdi) /* RSP */ -+ pushq 4*8(%rdi) /* EFLAGS */ -+ pushq 3*8(%rdi) /* CS */ -+ pushq 2*8(%rdi) /* RIP */ -+ -+ /* Push user RDI on the trampoline stack. */ -+ pushq (%rdi) -+ -+ /* -+ * We are on the trampoline stack. All regs except RDI are live. -+ * We can do future final exit work right here. -+ */ -+ -+ /* Restore RDI. */ -+ popq %rdi -+ SWAPGS - INTERRUPT_RETURN - - --- -2.14.2 - diff --git a/patches/kernel/0156-x86-entry-64-Use-a-per-CPU-trampoline-stack-for-IDT-.patch b/patches/kernel/0156-x86-entry-64-Use-a-per-CPU-trampoline-stack-for-IDT-.patch new file mode 100644 index 0000000..bfea36c --- /dev/null +++ b/patches/kernel/0156-x86-entry-64-Use-a-per-CPU-trampoline-stack-for-IDT-.patch @@ -0,0 +1,295 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Mon, 4 Dec 2017 15:07:23 +0100 +Subject: [PATCH] x86/entry/64: Use a per-CPU trampoline stack for IDT entries +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Historically, IDT entries from usermode have always gone directly +to the running task's kernel stack. Rearrange it so that we enter on +a per-CPU trampoline stack and then manually switch to the task's stack. +This touches a couple of extra cachelines, but it gives us a chance +to run some code before we touch the kernel stack. + +The asm isn't exactly beautiful, but I think that fully refactoring +it can wait. + +Signed-off-by: Andy Lutomirski +Signed-off-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Reviewed-by: Thomas Gleixner +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Rik van Riel +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150606.225330557@linutronix.de +Signed-off-by: Ingo Molnar +(cherry picked from commit 7f2590a110b837af5679d08fc25c6227c5a8c497) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit bfb2d0ede023853fb8c24d3dae8974cb2f7117c3) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/switch_to.h | 4 ++- + arch/x86/include/asm/traps.h | 1 - + arch/x86/kernel/cpu/common.c | 6 ++-- + arch/x86/kernel/traps.c | 21 +++++++------ + arch/x86/entry/entry_64.S | 67 ++++++++++++++++++++++++++++++---------- + arch/x86/entry/entry_64_compat.S | 5 ++- + 6 files changed, 72 insertions(+), 32 deletions(-) + +diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h +index 010cd6e4eafc..ca2fc84ad278 100644 +--- a/arch/x86/include/asm/switch_to.h ++++ b/arch/x86/include/asm/switch_to.h +@@ -89,10 +89,12 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread) + /* This is used when switching tasks or entering/exiting vm86 mode. 
*/ + static inline void update_sp0(struct task_struct *task) + { ++ /* On x86_64, sp0 always points to the entry trampoline stack, which is constant: */ + #ifdef CONFIG_X86_32 + load_sp0(task->thread.sp0); + #else +- load_sp0(task_top_of_stack(task)); ++ if (static_cpu_has(X86_FEATURE_XENPV)) ++ load_sp0(task_top_of_stack(task)); + #endif + } + +diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h +index b052a7621ca1..c3b652672d6f 100644 +--- a/arch/x86/include/asm/traps.h ++++ b/arch/x86/include/asm/traps.h +@@ -92,7 +92,6 @@ dotraplinkage void do_segment_not_present(struct pt_regs *, long); + dotraplinkage void do_stack_segment(struct pt_regs *, long); + #ifdef CONFIG_X86_64 + dotraplinkage void do_double_fault(struct pt_regs *, long); +-asmlinkage struct pt_regs *sync_regs(struct pt_regs *); + #endif + dotraplinkage void do_general_protection(struct pt_regs *, long); + dotraplinkage void do_page_fault(struct pt_regs *, unsigned long); +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 4a38de4c6ede..404e4b75db6e 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -1623,11 +1623,13 @@ void cpu_init(void) + setup_cpu_entry_area(cpu); + + /* +- * Initialize the TSS. Don't bother initializing sp0, as the initial +- * task never enters user mode. ++ * Initialize the TSS. sp0 points to the entry trampoline stack ++ * regardless of what task is running. 
+ */ + set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); + load_TR_desc(); ++ load_sp0((unsigned long)&get_cpu_entry_area(cpu)->tss + ++ offsetofend(struct tss_struct, SYSENTER_stack)); + + load_mm_ldt(&init_mm); + +diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c +index b69db1ee8733..d9debdafe7a6 100644 +--- a/arch/x86/kernel/traps.c ++++ b/arch/x86/kernel/traps.c +@@ -631,14 +631,15 @@ NOKPROBE_SYMBOL(do_int3); + + #ifdef CONFIG_X86_64 + /* +- * Help handler running on IST stack to switch off the IST stack if the +- * interrupted code was in user mode. The actual stack switch is done in +- * entry_64.S ++ * Help handler running on a per-cpu (IST or entry trampoline) stack ++ * to switch to the normal thread stack if the interrupted code was in ++ * user mode. The actual stack switch is done in entry_64.S + */ + asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs) + { +- struct pt_regs *regs = task_pt_regs(current); +- *regs = *eregs; ++ struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1; ++ if (regs != eregs) ++ *regs = *eregs; + return regs; + } + NOKPROBE_SYMBOL(sync_regs); +@@ -654,13 +655,13 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) + /* + * This is called from entry_64.S early in handling a fault + * caused by a bad iret to user mode. To handle the fault +- * correctly, we want move our stack frame to task_pt_regs +- * and we want to pretend that the exception came from the +- * iret target. ++ * correctly, we want to move our stack frame to where it would ++ * be had we entered directly on the entry stack (rather than ++ * just below the IRET frame) and we want to pretend that the ++ * exception came from the IRET target. + */ + struct bad_iret_stack *new_stack = +- container_of(task_pt_regs(current), +- struct bad_iret_stack, regs); ++ (struct bad_iret_stack *)this_cpu_read(cpu_tss.x86_tss.sp0) - 1; + + /* Copy the IRET target to the new stack. 
*/ + memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8); +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index 6c73e96daf78..f70fedc58bac 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -559,6 +559,13 @@ END(irq_entries_start) + /* 0(%rsp): ~(interrupt number) */ + .macro interrupt func + cld ++ ++ testb $3, CS-ORIG_RAX(%rsp) ++ jz 1f ++ SWAPGS ++ call switch_to_thread_stack ++1: ++ + ALLOC_PT_GPREGS_ON_STACK + SAVE_C_REGS + SAVE_EXTRA_REGS +@@ -568,12 +575,8 @@ END(irq_entries_start) + jz 1f + + /* +- * IRQ from user mode. Switch to kernel gsbase and inform context +- * tracking that we're in kernel mode. +- */ +- SWAPGS +- +- /* ++ * IRQ from user mode. ++ * + * We need to tell lockdep that IRQs are off. We can't do this until + * we fix gsbase, and we should do it before enter_from_user_mode + * (which can take locks). Since TRACE_IRQS_OFF idempotent, +@@ -840,6 +843,32 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt + */ + #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8) + ++/* ++ * Switch to the thread stack. This is called with the IRET frame and ++ * orig_ax on the stack. (That is, RDI..R12 are not on the stack and ++ * space has not been allocated for them.) 
++ */ ++ENTRY(switch_to_thread_stack) ++ UNWIND_HINT_FUNC ++ ++ pushq %rdi ++ movq %rsp, %rdi ++ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp ++ UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI ++ ++ pushq 7*8(%rdi) /* regs->ss */ ++ pushq 6*8(%rdi) /* regs->rsp */ ++ pushq 5*8(%rdi) /* regs->eflags */ ++ pushq 4*8(%rdi) /* regs->cs */ ++ pushq 3*8(%rdi) /* regs->ip */ ++ pushq 2*8(%rdi) /* regs->orig_ax */ ++ pushq 8(%rdi) /* return address */ ++ UNWIND_HINT_FUNC ++ ++ movq (%rdi), %rdi ++ ret ++END(switch_to_thread_stack) ++ + .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 + ENTRY(\sym) + UNWIND_HINT_IRET_REGS offset=\has_error_code*8 +@@ -857,11 +886,12 @@ ENTRY(\sym) + + ALLOC_PT_GPREGS_ON_STACK + +- .if \paranoid +- .if \paranoid == 1 ++ .if \paranoid < 2 + testb $3, CS(%rsp) /* If coming from userspace, switch stacks */ +- jnz 1f ++ jnz .Lfrom_usermode_switch_stack_\@ + .endif ++ ++ .if \paranoid + call paranoid_entry + .else + call error_entry +@@ -903,20 +933,15 @@ ENTRY(\sym) + jmp error_exit + .endif + +- .if \paranoid == 1 ++ .if \paranoid < 2 + /* +- * Paranoid entry from userspace. Switch stacks and treat it ++ * Entry from userspace. Switch stacks and treat it + * as a normal entry. This means that paranoid handlers + * run in real process context if user_mode(regs). + */ +-1: ++.Lfrom_usermode_switch_stack_\@: + call error_entry + +- +- movq %rsp, %rdi /* pt_regs pointer */ +- call sync_regs +- movq %rax, %rsp /* switch stack */ +- + movq %rsp, %rdi /* pt_regs pointer */ + + .if \has_error_code +@@ -1177,6 +1202,14 @@ ENTRY(error_entry) + SWAPGS + + .Lerror_entry_from_usermode_after_swapgs: ++ /* Put us onto the real thread stack. */ ++ popq %r12 /* save return addr in %12 */ ++ movq %rsp, %rdi /* arg0 = pt_regs pointer */ ++ call sync_regs ++ movq %rax, %rsp /* switch stack */ ++ ENCODE_FRAME_POINTER ++ pushq %r12 ++ + /* + * We need to tell lockdep that IRQs are off. 
We can't do this until + * we fix gsbase, and we should do it before enter_from_user_mode +diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S +index 1f76b66518ee..2270601b6218 100644 +--- a/arch/x86/entry/entry_64_compat.S ++++ b/arch/x86/entry/entry_64_compat.S +@@ -305,8 +305,11 @@ ENTRY(entry_INT80_compat) + */ + movl %eax, %eax + +- /* Construct struct pt_regs on stack (iret frame is already on stack) */ + pushq %rax /* pt_regs->orig_ax */ ++ ++ /* switch to thread stack expects orig_ax to be pushed */ ++ call switch_to_thread_stack ++ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ +-- +2.14.2 + diff --git a/patches/kernel/0157-x86-entry-64-Create-a-per-CPU-SYSCALL-entry-trampoli.patch b/patches/kernel/0157-x86-entry-64-Create-a-per-CPU-SYSCALL-entry-trampoli.patch deleted file mode 100644 index 4319f10..0000000 --- a/patches/kernel/0157-x86-entry-64-Create-a-per-CPU-SYSCALL-entry-trampoli.patch +++ /dev/null @@ -1,241 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Mon, 4 Dec 2017 15:07:25 +0100 -Subject: [PATCH] x86/entry/64: Create a per-CPU SYSCALL entry trampoline -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Handling SYSCALL is tricky: the SYSCALL handler is entered with every -single register (except FLAGS), including RSP, live. It somehow needs -to set RSP to point to a valid stack, which means it needs to save the -user RSP somewhere and find its own stack pointer. The canonical way -to do this is with SWAPGS, which lets us access percpu data using the -%gs prefix. - -With PAGE_TABLE_ISOLATION-like pagetable switching, this is -problematic. Without a scratch register, switching CR3 is impossible, so -%gs-based percpu memory would need to be mapped in the user pagetables. -Doing that without information leaks is difficult or impossible. 
- -Instead, use a different sneaky trick. Map a copy of the first part -of the SYSCALL asm at a different address for each CPU. Now RIP -varies depending on the CPU, so we can use RIP-relative memory access -to access percpu memory. By putting the relevant information (one -scratch slot and the stack address) at a constant offset relative to -RIP, we can make SYSCALL work without relying on %gs. - -A nice thing about this approach is that we can easily switch it on -and off if we want pagetable switching to be configurable. - -The compat variant of SYSCALL doesn't have this problem in the first -place -- there are plenty of scratch registers, since we don't care -about preserving r8-r15. This patch therefore doesn't touch SYSCALL32 -at all. - -This patch actually seems to be a small speedup. With this patch, -SYSCALL touches an extra cache line and an extra virtual page, but -the pipeline no longer stalls waiting for SWAPGS. It seems that, at -least in a tight loop, the latter outweights the former. - -Thanks to David Laight for an optimization tip. - -Signed-off-by: Andy Lutomirski -Signed-off-by: Thomas Gleixner -Reviewed-by: Thomas Gleixner -Reviewed-by: Borislav Petkov -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Rik van Riel -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Link: https://lkml.kernel.org/r/20171204150606.403607157@linutronix.de -Signed-off-by: Ingo Molnar -(cherry picked from commit 3386bc8aed825e9f1f65ce38df4b109b2019b71a) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 9fec5954d068a19bbf134da7af66db94699b03a3) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/fixmap.h | 2 ++ - arch/x86/kernel/asm-offsets.c | 1 + - arch/x86/kernel/cpu/common.c | 15 ++++++++++- - arch/x86/entry/entry_64.S | 58 +++++++++++++++++++++++++++++++++++++++++++ - arch/x86/kernel/vmlinux.lds.S | 9 +++++++ - 5 files changed, 84 insertions(+), 1 deletion(-) - -diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h -index c92fc30e6def..189d12d8afe0 100644 ---- a/arch/x86/include/asm/fixmap.h -+++ b/arch/x86/include/asm/fixmap.h -@@ -61,6 +61,8 @@ struct cpu_entry_area { - * of the TSS region. 
- */ - struct tss_struct tss; -+ -+ char entry_trampoline[PAGE_SIZE]; - }; - - #define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE) -diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c -index f765c3253ec3..822be00c85ff 100644 ---- a/arch/x86/kernel/asm-offsets.c -+++ b/arch/x86/kernel/asm-offsets.c -@@ -100,4 +100,5 @@ void common(void) { - - /* Layout info for cpu_entry_area */ - OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); -+ OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); - } -diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c -index 404e4b75db6e..c2b2ee73b8a1 100644 ---- a/arch/x86/kernel/cpu/common.c -+++ b/arch/x86/kernel/cpu/common.c -@@ -486,6 +486,8 @@ DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); - static inline void setup_cpu_entry_area(int cpu) - { - #ifdef CONFIG_X86_64 -+ extern char _entry_trampoline[]; -+ - /* On 64-bit systems, we use a read-only fixmap GDT. */ - pgprot_t gdt_prot = PAGE_KERNEL_RO; - #else -@@ -532,6 +534,11 @@ static inline void setup_cpu_entry_area(int cpu) - #ifdef CONFIG_X86_32 - this_cpu_write(cpu_entry_area, get_cpu_entry_area(cpu)); - #endif -+ -+#ifdef CONFIG_X86_64 -+ __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline), -+ __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); -+#endif - } - - /* Load the original GDT from the per-cpu structure */ -@@ -1396,10 +1403,16 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks - /* May not be marked __init: used by software suspend */ - void syscall_init(void) - { -+ extern char _entry_trampoline[]; -+ extern char entry_SYSCALL_64_trampoline[]; -+ - int cpu = smp_processor_id(); -+ unsigned long SYSCALL64_entry_trampoline = -+ (unsigned long)get_cpu_entry_area(cpu)->entry_trampoline + -+ (entry_SYSCALL_64_trampoline - _entry_trampoline); - - wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); -- wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); -+ 
wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline); - - #ifdef CONFIG_IA32_EMULATION - wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat); -diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S -index 4abe5b806d2a..dc100a7052ee 100644 ---- a/arch/x86/entry/entry_64.S -+++ b/arch/x86/entry/entry_64.S -@@ -135,6 +135,64 @@ END(native_usergs_sysret64) - * with them due to bugs in both AMD and Intel CPUs. - */ - -+ .pushsection .entry_trampoline, "ax" -+ -+/* -+ * The code in here gets remapped into cpu_entry_area's trampoline. This means -+ * that the assembler and linker have the wrong idea as to where this code -+ * lives (and, in fact, it's mapped more than once, so it's not even at a -+ * fixed address). So we can't reference any symbols outside the entry -+ * trampoline and expect it to work. -+ * -+ * Instead, we carefully abuse %rip-relative addressing. -+ * _entry_trampoline(%rip) refers to the start of the remapped) entry -+ * trampoline. We can thus find cpu_entry_area with this macro: -+ */ -+ -+#define CPU_ENTRY_AREA \ -+ _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) -+ -+/* The top word of the SYSENTER stack is hot and is usable as scratch space. */ -+#define RSP_SCRATCH CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + \ -+ SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA -+ -+ENTRY(entry_SYSCALL_64_trampoline) -+ UNWIND_HINT_EMPTY -+ swapgs -+ -+ /* Stash the user RSP. */ -+ movq %rsp, RSP_SCRATCH -+ -+ /* Load the top of the task stack into RSP */ -+ movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp -+ -+ /* Start building the simulated IRET frame. */ -+ pushq $__USER_DS /* pt_regs->ss */ -+ pushq RSP_SCRATCH /* pt_regs->sp */ -+ pushq %r11 /* pt_regs->flags */ -+ pushq $__USER_CS /* pt_regs->cs */ -+ pushq %rcx /* pt_regs->ip */ -+ -+ /* -+ * x86 lacks a near absolute jump, and we can't jump to the real -+ * entry text with a relative jump. 
We could push the target -address and then use retq, but this destroys the pipeline on -many CPUs (wasting over 20 cycles on Sandy Bridge). Instead, -spill RDI and restore it in a second-stage trampoline. -+ */ -+ pushq %rdi -+ movq $entry_SYSCALL_64_stage2, %rdi -+ jmp *%rdi -+END(entry_SYSCALL_64_trampoline) -+ -+ .popsection -+ -+ENTRY(entry_SYSCALL_64_stage2) -+ UNWIND_HINT_EMPTY -+ popq %rdi -+ jmp entry_SYSCALL_64_after_hwframe -+END(entry_SYSCALL_64_stage2) -+ - ENTRY(entry_SYSCALL_64) - UNWIND_HINT_EMPTY - /* -diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S -index f05f00acac89..423aa36f0150 100644 ---- a/arch/x86/kernel/vmlinux.lds.S -+++ b/arch/x86/kernel/vmlinux.lds.S -@@ -106,6 +106,15 @@ SECTIONS - SOFTIRQENTRY_TEXT - *(.fixup) - *(.gnu.warning) -+ -+#ifdef CONFIG_X86_64 -+ . = ALIGN(PAGE_SIZE); -+ _entry_trampoline = .; -+ *(.entry_trampoline) -+ . = ALIGN(PAGE_SIZE); -+ ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big"); -+#endif -+ - /* End of text section */ - _etext = .; - } :text = 0x9090 --- -2.14.2 - diff --git a/patches/kernel/0157-x86-entry-64-Return-to-userspace-from-the-trampoline.patch b/patches/kernel/0157-x86-entry-64-Return-to-userspace-from-the-trampoline.patch new file mode 100644 index 0000000..20025ac --- /dev/null +++ b/patches/kernel/0157-x86-entry-64-Return-to-userspace-from-the-trampoline.patch @@ -0,0 +1,133 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Mon, 4 Dec 2017 15:07:24 +0100 +Subject: [PATCH] x86/entry/64: Return to userspace from the trampoline stack +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +By itself, this is useless. It gives us the ability to run some final code +before exit that cannot run on the kernel stack. This could include a CR3 +switch a la PAGE_TABLE_ISOLATION or some kernel stack erasing, for +example.
(Or even weird things like *changing* which kernel stack gets +used as an ASLR-strengthening mechanism.) + +The SYSRET32 path is not covered yet. It could be in the future or +we could just ignore it and force the slow path if needed. + +Signed-off-by: Andy Lutomirski +Signed-off-by: Thomas Gleixner +Reviewed-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Rik van Riel +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150606.306546484@linutronix.de +Signed-off-by: Ingo Molnar +(cherry picked from commit 3e3b9293d392c577b62e24e4bc9982320438e749) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 40eb58584f732a2fefb5959e79e408bedeaaa43c) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/entry/entry_64.S | 55 +++++++++++++++++++++++++++++++++++++++++++---- + 1 file changed, 51 insertions(+), 4 deletions(-) + +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index f70fedc58bac..4abe5b806d2a 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -325,8 +325,24 @@ syscall_return_via_sysret: + popq %rsi /* skip rcx */ + popq %rdx + popq %rsi ++ ++ /* ++ * Now all regs are restored except RSP and RDI. ++ * Save old stack pointer and switch to trampoline stack. ++ */ ++ movq %rsp, %rdi ++ movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp ++ ++ pushq RSP-RDI(%rdi) /* RSP */ ++ pushq (%rdi) /* RDI */ ++ ++ /* ++ * We are on the trampoline stack. All regs except RDI are live. ++ * We can do future final exit work right here. 
++ */ ++ + popq %rdi +- movq RSP-ORIG_RAX(%rsp), %rsp ++ popq %rsp + USERGS_SYSRET64 + END(entry_SYSCALL_64) + +@@ -629,10 +645,41 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode) + ud2 + 1: + #endif +- SWAPGS + POP_EXTRA_REGS +- POP_C_REGS +- addq $8, %rsp /* skip regs->orig_ax */ ++ popq %r11 ++ popq %r10 ++ popq %r9 ++ popq %r8 ++ popq %rax ++ popq %rcx ++ popq %rdx ++ popq %rsi ++ ++ /* ++ * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS. ++ * Save old stack pointer and switch to trampoline stack. ++ */ ++ movq %rsp, %rdi ++ movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp ++ ++ /* Copy the IRET frame to the trampoline stack. */ ++ pushq 6*8(%rdi) /* SS */ ++ pushq 5*8(%rdi) /* RSP */ ++ pushq 4*8(%rdi) /* EFLAGS */ ++ pushq 3*8(%rdi) /* CS */ ++ pushq 2*8(%rdi) /* RIP */ ++ ++ /* Push user RDI on the trampoline stack. */ ++ pushq (%rdi) ++ ++ /* ++ * We are on the trampoline stack. All regs except RDI are live. ++ * We can do future final exit work right here. ++ */ ++ ++ /* Restore RDI. */ ++ popq %rdi ++ SWAPGS + INTERRUPT_RETURN + + +-- +2.14.2 + diff --git a/patches/kernel/0158-x86-entry-64-Create-a-per-CPU-SYSCALL-entry-trampoli.patch b/patches/kernel/0158-x86-entry-64-Create-a-per-CPU-SYSCALL-entry-trampoli.patch new file mode 100644 index 0000000..4319f10 --- /dev/null +++ b/patches/kernel/0158-x86-entry-64-Create-a-per-CPU-SYSCALL-entry-trampoli.patch @@ -0,0 +1,241 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Mon, 4 Dec 2017 15:07:25 +0100 +Subject: [PATCH] x86/entry/64: Create a per-CPU SYSCALL entry trampoline +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Handling SYSCALL is tricky: the SYSCALL handler is entered with every +single register (except FLAGS), including RSP, live. 
It somehow needs +to set RSP to point to a valid stack, which means it needs to save the +user RSP somewhere and find its own stack pointer. The canonical way +to do this is with SWAPGS, which lets us access percpu data using the +%gs prefix. + +With PAGE_TABLE_ISOLATION-like pagetable switching, this is +problematic. Without a scratch register, switching CR3 is impossible, so +%gs-based percpu memory would need to be mapped in the user pagetables. +Doing that without information leaks is difficult or impossible. + +Instead, use a different sneaky trick. Map a copy of the first part +of the SYSCALL asm at a different address for each CPU. Now RIP +varies depending on the CPU, so we can use RIP-relative memory access +to access percpu memory. By putting the relevant information (one +scratch slot and the stack address) at a constant offset relative to +RIP, we can make SYSCALL work without relying on %gs. + +A nice thing about this approach is that we can easily switch it on +and off if we want pagetable switching to be configurable. + +The compat variant of SYSCALL doesn't have this problem in the first +place -- there are plenty of scratch registers, since we don't care +about preserving r8-r15. This patch therefore doesn't touch SYSCALL32 +at all. + +This patch actually seems to be a small speedup. With this patch, +SYSCALL touches an extra cache line and an extra virtual page, but +the pipeline no longer stalls waiting for SWAPGS. It seems that, at +least in a tight loop, the latter outweighs the former. + +Thanks to David Laight for an optimization tip. + +Signed-off-by: Andy Lutomirski +Signed-off-by: Thomas Gleixner +Reviewed-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H.
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Rik van Riel +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150606.403607157@linutronix.de +Signed-off-by: Ingo Molnar +(cherry picked from commit 3386bc8aed825e9f1f65ce38df4b109b2019b71a) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 9fec5954d068a19bbf134da7af66db94699b03a3) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/fixmap.h | 2 ++ + arch/x86/kernel/asm-offsets.c | 1 + + arch/x86/kernel/cpu/common.c | 15 ++++++++++- + arch/x86/entry/entry_64.S | 58 +++++++++++++++++++++++++++++++++++++++++++ + arch/x86/kernel/vmlinux.lds.S | 9 +++++++ + 5 files changed, 84 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h +index c92fc30e6def..189d12d8afe0 100644 +--- a/arch/x86/include/asm/fixmap.h ++++ b/arch/x86/include/asm/fixmap.h +@@ -61,6 +61,8 @@ struct cpu_entry_area { + * of the TSS region. 
+ */ + struct tss_struct tss; ++ ++ char entry_trampoline[PAGE_SIZE]; + }; + + #define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE) +diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c +index f765c3253ec3..822be00c85ff 100644 +--- a/arch/x86/kernel/asm-offsets.c ++++ b/arch/x86/kernel/asm-offsets.c +@@ -100,4 +100,5 @@ void common(void) { + + /* Layout info for cpu_entry_area */ + OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); ++ OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); + } +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 404e4b75db6e..c2b2ee73b8a1 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -486,6 +486,8 @@ DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); + static inline void setup_cpu_entry_area(int cpu) + { + #ifdef CONFIG_X86_64 ++ extern char _entry_trampoline[]; ++ + /* On 64-bit systems, we use a read-only fixmap GDT. */ + pgprot_t gdt_prot = PAGE_KERNEL_RO; + #else +@@ -532,6 +534,11 @@ static inline void setup_cpu_entry_area(int cpu) + #ifdef CONFIG_X86_32 + this_cpu_write(cpu_entry_area, get_cpu_entry_area(cpu)); + #endif ++ ++#ifdef CONFIG_X86_64 ++ __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline), ++ __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); ++#endif + } + + /* Load the original GDT from the per-cpu structure */ +@@ -1396,10 +1403,16 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks + /* May not be marked __init: used by software suspend */ + void syscall_init(void) + { ++ extern char _entry_trampoline[]; ++ extern char entry_SYSCALL_64_trampoline[]; ++ + int cpu = smp_processor_id(); ++ unsigned long SYSCALL64_entry_trampoline = ++ (unsigned long)get_cpu_entry_area(cpu)->entry_trampoline + ++ (entry_SYSCALL_64_trampoline - _entry_trampoline); + + wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); +- wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); ++ 
wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline); + + #ifdef CONFIG_IA32_EMULATION + wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat); +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index 4abe5b806d2a..dc100a7052ee 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -135,6 +135,64 @@ END(native_usergs_sysret64) + * with them due to bugs in both AMD and Intel CPUs. + */ + ++ .pushsection .entry_trampoline, "ax" ++ ++/* ++ * The code in here gets remapped into cpu_entry_area's trampoline. This means ++ * that the assembler and linker have the wrong idea as to where this code ++ * lives (and, in fact, it's mapped more than once, so it's not even at a ++ * fixed address). So we can't reference any symbols outside the entry ++ * trampoline and expect it to work. ++ * ++ * Instead, we carefully abuse %rip-relative addressing. ++ * _entry_trampoline(%rip) refers to the start of the remapped) entry ++ * trampoline. We can thus find cpu_entry_area with this macro: ++ */ ++ ++#define CPU_ENTRY_AREA \ ++ _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) ++ ++/* The top word of the SYSENTER stack is hot and is usable as scratch space. */ ++#define RSP_SCRATCH CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + \ ++ SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA ++ ++ENTRY(entry_SYSCALL_64_trampoline) ++ UNWIND_HINT_EMPTY ++ swapgs ++ ++ /* Stash the user RSP. */ ++ movq %rsp, RSP_SCRATCH ++ ++ /* Load the top of the task stack into RSP */ ++ movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp ++ ++ /* Start building the simulated IRET frame. */ ++ pushq $__USER_DS /* pt_regs->ss */ ++ pushq RSP_SCRATCH /* pt_regs->sp */ ++ pushq %r11 /* pt_regs->flags */ ++ pushq $__USER_CS /* pt_regs->cs */ ++ pushq %rcx /* pt_regs->ip */ ++ ++ /* ++ * x86 lacks a near absolute jump, and we can't jump to the real ++ * entry text with a relative jump. 
We could push the target ++ * address and then use retq, but this destroys the pipeline on ++ * many CPUs (wasting over 20 cycles on Sandy Bridge). Instead, ++ * spill RDI and restore it in a second-stage trampoline. ++ */ ++ pushq %rdi ++ movq $entry_SYSCALL_64_stage2, %rdi ++ jmp *%rdi ++END(entry_SYSCALL_64_trampoline) ++ ++ .popsection ++ ++ENTRY(entry_SYSCALL_64_stage2) ++ UNWIND_HINT_EMPTY ++ popq %rdi ++ jmp entry_SYSCALL_64_after_hwframe ++END(entry_SYSCALL_64_stage2) ++ + ENTRY(entry_SYSCALL_64) + UNWIND_HINT_EMPTY + /* +diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S +index f05f00acac89..423aa36f0150 100644 +--- a/arch/x86/kernel/vmlinux.lds.S ++++ b/arch/x86/kernel/vmlinux.lds.S +@@ -106,6 +106,15 @@ SECTIONS + SOFTIRQENTRY_TEXT + *(.fixup) + *(.gnu.warning) ++ ++#ifdef CONFIG_X86_64 ++ . = ALIGN(PAGE_SIZE); ++ _entry_trampoline = .; ++ *(.entry_trampoline) ++ . = ALIGN(PAGE_SIZE); ++ ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big"); ++#endif ++ + /* End of text section */ + _etext = .; + } :text = 0x9090 +-- +2.14.2 + diff --git a/patches/kernel/0158-x86-entry-64-Move-the-IST-stacks-into-struct-cpu_ent.patch b/patches/kernel/0158-x86-entry-64-Move-the-IST-stacks-into-struct-cpu_ent.patch deleted file mode 100644 index 762ca88..0000000 --- a/patches/kernel/0158-x86-entry-64-Move-the-IST-stacks-into-struct-cpu_ent.patch +++ /dev/null @@ -1,234 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Mon, 4 Dec 2017 15:07:26 +0100 -Subject: [PATCH] x86/entry/64: Move the IST stacks into struct cpu_entry_area -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -The IST stacks are needed when an IST exception occurs and are accessed -before any kernel code at all runs. Move them into struct cpu_entry_area. 
- -The IST stacks are unlike the rest of cpu_entry_area: they're used even for -entries from kernel mode. This means that they should be set up before we -load the final IDT. Move cpu_entry_area setup to trap_init() for the boot -CPU and set it up for all possible CPUs at once in native_smp_prepare_cpus(). - -Signed-off-by: Andy Lutomirski -Signed-off-by: Thomas Gleixner -Reviewed-by: Thomas Gleixner -Reviewed-by: Borislav Petkov -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Rik van Riel -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Link: https://lkml.kernel.org/r/20171204150606.480598743@linutronix.de -Signed-off-by: Ingo Molnar -(backported from commit 40e7f949e0d9a33968ebde5d67f7e3a47c97742a) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 88e7277709f2e7c023e66ff9ae158aeff4cf7c8f) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/fixmap.h | 12 +++++++ - arch/x86/kernel/cpu/common.c | 74 ++++++++++++++++++++++++------------------- - arch/x86/kernel/traps.c | 3 ++ - 3 files changed, 57 insertions(+), 32 deletions(-) - -diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h -index 189d12d8afe0..953aed54cb5e 100644 ---- a/arch/x86/include/asm/fixmap.h -+++ b/arch/x86/include/asm/fixmap.h -@@ -63,10 +63,22 @@ struct cpu_entry_area { - struct tss_struct tss; - - char entry_trampoline[PAGE_SIZE]; -+ -+#ifdef CONFIG_X86_64 -+ /* -+ * Exception stacks used for IST entries. -+ * -+ * In the future, this should have a separate slot for each stack -+ * with guard pages between them. 
-+ */ -+ char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]; -+#endif - }; - - #define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE) - -+extern void setup_cpu_entry_areas(void); -+ - /* - * Here we define all the compile-time 'special' virtual - * addresses. The point is to have a constant address at -diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c -index c2b2ee73b8a1..f487766855d3 100644 ---- a/arch/x86/kernel/cpu/common.c -+++ b/arch/x86/kernel/cpu/common.c -@@ -466,24 +466,36 @@ void load_percpu_segment(int cpu) - load_stack_canary_segment(); - } - --static void set_percpu_fixmap_pages(int fixmap_index, void *ptr, -- int pages, pgprot_t prot) --{ -- int i; -- -- for (i = 0; i < pages; i++) { -- __set_fixmap(fixmap_index - i, -- per_cpu_ptr_to_phys(ptr + i * PAGE_SIZE), prot); -- } --} -- - #ifdef CONFIG_X86_32 - /* The 32-bit entry code needs to find cpu_entry_area. */ - DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); - #endif - -+#ifdef CONFIG_X86_64 -+/* -+ * Special IST stacks which the CPU switches to when it calls -+ * an IST-marked descriptor entry. Up to 7 stacks (hardware -+ * limit), all of them are 4K, except the debug stack which -+ * is 8K. -+ */ -+static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { -+ [0 ... 
N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, -+ [DEBUG_STACK - 1] = DEBUG_STKSZ -+}; -+ -+static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks -+ [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); -+#endif -+ -+static void __init -+set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot) -+{ -+ for ( ; pages; pages--, idx--, ptr += PAGE_SIZE) -+ __set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot); -+} -+ - /* Setup the fixmap mappings only once per-processor */ --static inline void setup_cpu_entry_area(int cpu) -+static void __init setup_cpu_entry_area(int cpu) - { - #ifdef CONFIG_X86_64 - extern char _entry_trampoline[]; -@@ -532,15 +544,31 @@ static inline void setup_cpu_entry_area(int cpu) - PAGE_KERNEL); - - #ifdef CONFIG_X86_32 -- this_cpu_write(cpu_entry_area, get_cpu_entry_area(cpu)); -+ per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); - #endif - - #ifdef CONFIG_X86_64 -+ BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); -+ BUILD_BUG_ON(sizeof(exception_stacks) != -+ sizeof(((struct cpu_entry_area *)0)->exception_stacks)); -+ set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks), -+ &per_cpu(exception_stacks, cpu), -+ sizeof(exception_stacks) / PAGE_SIZE, -+ PAGE_KERNEL); -+ - __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline), - __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); - #endif - } - -+void __init setup_cpu_entry_areas(void) -+{ -+ unsigned int cpu; -+ -+ for_each_possible_cpu(cpu) -+ setup_cpu_entry_area(cpu); -+} -+ - /* Load the original GDT from the per-cpu structure */ - void load_direct_gdt(int cpu) - { -@@ -1386,20 +1414,6 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; - DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; - EXPORT_PER_CPU_SYMBOL(__preempt_count); - --/* -- * Special IST stacks which the CPU switches to when it calls -- * an IST-marked descriptor entry. 
Up to 7 stacks (hardware -- * limit), all of them are 4K, except the debug stack which -- * is 8K. -- */ --static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { -- [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, -- [DEBUG_STACK - 1] = DEBUG_STKSZ --}; -- --static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks -- [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); -- - /* May not be marked __init: used by software suspend */ - void syscall_init(void) - { -@@ -1608,7 +1622,7 @@ void cpu_init(void) - * set up and load the per-CPU TSS - */ - if (!oist->ist[0]) { -- char *estacks = per_cpu(exception_stacks, cpu); -+ char *estacks = get_cpu_entry_area(cpu)->exception_stacks; - - for (v = 0; v < N_EXCEPTION_STACKS; v++) { - estacks += exception_stack_sizes[v]; -@@ -1633,8 +1647,6 @@ void cpu_init(void) - BUG_ON(me->mm); - enter_lazy_tlb(&init_mm, me); - -- setup_cpu_entry_area(cpu); -- - /* - * Initialize the TSS. sp0 points to the entry trampoline stack - * regardless of what task is running. -@@ -1693,8 +1705,6 @@ void cpu_init(void) - BUG_ON(curr->mm); - enter_lazy_tlb(&init_mm, curr); - -- setup_cpu_entry_area(cpu); -- - /* - * Initialize the TSS. Don't bother initializing sp0, as the initial - * task never enters user mode. 
-diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c -index d9debdafe7a6..fd4d47e8672e 100644 ---- a/arch/x86/kernel/traps.c -+++ b/arch/x86/kernel/traps.c -@@ -992,6 +992,9 @@ void __init trap_init(void) - { - int i; - -+ /* Init cpu_entry_area before IST entries are set up */ -+ setup_cpu_entry_areas(); -+ - #ifdef CONFIG_EISA - void __iomem *p = early_ioremap(0x0FFFD9, 4); - --- -2.14.2 - diff --git a/patches/kernel/0159-x86-entry-64-Move-the-IST-stacks-into-struct-cpu_ent.patch b/patches/kernel/0159-x86-entry-64-Move-the-IST-stacks-into-struct-cpu_ent.patch new file mode 100644 index 0000000..762ca88 --- /dev/null +++ b/patches/kernel/0159-x86-entry-64-Move-the-IST-stacks-into-struct-cpu_ent.patch @@ -0,0 +1,234 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Mon, 4 Dec 2017 15:07:26 +0100 +Subject: [PATCH] x86/entry/64: Move the IST stacks into struct cpu_entry_area +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +The IST stacks are needed when an IST exception occurs and are accessed +before any kernel code at all runs. Move them into struct cpu_entry_area. + +The IST stacks are unlike the rest of cpu_entry_area: they're used even for +entries from kernel mode. This means that they should be set up before we +load the final IDT. Move cpu_entry_area setup to trap_init() for the boot +CPU and set it up for all possible CPUs at once in native_smp_prepare_cpus(). + +Signed-off-by: Andy Lutomirski +Signed-off-by: Thomas Gleixner +Reviewed-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Rik van Riel +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150606.480598743@linutronix.de +Signed-off-by: Ingo Molnar +(backported from commit 40e7f949e0d9a33968ebde5d67f7e3a47c97742a) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 88e7277709f2e7c023e66ff9ae158aeff4cf7c8f) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/fixmap.h | 12 +++++++ + arch/x86/kernel/cpu/common.c | 74 ++++++++++++++++++++++++------------------- + arch/x86/kernel/traps.c | 3 ++ + 3 files changed, 57 insertions(+), 32 deletions(-) + +diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h +index 189d12d8afe0..953aed54cb5e 100644 +--- a/arch/x86/include/asm/fixmap.h ++++ b/arch/x86/include/asm/fixmap.h +@@ -63,10 +63,22 @@ struct cpu_entry_area { + struct tss_struct tss; + + char entry_trampoline[PAGE_SIZE]; ++ ++#ifdef CONFIG_X86_64 ++ /* ++ * Exception stacks used for IST entries. ++ * ++ * In the future, this should have a separate slot for each stack ++ * with guard pages between them. ++ */ ++ char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]; ++#endif + }; + + #define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE) + ++extern void setup_cpu_entry_areas(void); ++ + /* + * Here we define all the compile-time 'special' virtual + * addresses. 
The point is to have a constant address at +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index c2b2ee73b8a1..f487766855d3 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -466,24 +466,36 @@ void load_percpu_segment(int cpu) + load_stack_canary_segment(); + } + +-static void set_percpu_fixmap_pages(int fixmap_index, void *ptr, +- int pages, pgprot_t prot) +-{ +- int i; +- +- for (i = 0; i < pages; i++) { +- __set_fixmap(fixmap_index - i, +- per_cpu_ptr_to_phys(ptr + i * PAGE_SIZE), prot); +- } +-} +- + #ifdef CONFIG_X86_32 + /* The 32-bit entry code needs to find cpu_entry_area. */ + DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); + #endif + ++#ifdef CONFIG_X86_64 ++/* ++ * Special IST stacks which the CPU switches to when it calls ++ * an IST-marked descriptor entry. Up to 7 stacks (hardware ++ * limit), all of them are 4K, except the debug stack which ++ * is 8K. ++ */ ++static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { ++ [0 ... 
N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, ++ [DEBUG_STACK - 1] = DEBUG_STKSZ ++}; ++ ++static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks ++ [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); ++#endif ++ ++static void __init ++set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot) ++{ ++ for ( ; pages; pages--, idx--, ptr += PAGE_SIZE) ++ __set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot); ++} ++ + /* Setup the fixmap mappings only once per-processor */ +-static inline void setup_cpu_entry_area(int cpu) ++static void __init setup_cpu_entry_area(int cpu) + { + #ifdef CONFIG_X86_64 + extern char _entry_trampoline[]; +@@ -532,15 +544,31 @@ static inline void setup_cpu_entry_area(int cpu) + PAGE_KERNEL); + + #ifdef CONFIG_X86_32 +- this_cpu_write(cpu_entry_area, get_cpu_entry_area(cpu)); ++ per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); + #endif + + #ifdef CONFIG_X86_64 ++ BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); ++ BUILD_BUG_ON(sizeof(exception_stacks) != ++ sizeof(((struct cpu_entry_area *)0)->exception_stacks)); ++ set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks), ++ &per_cpu(exception_stacks, cpu), ++ sizeof(exception_stacks) / PAGE_SIZE, ++ PAGE_KERNEL); ++ + __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline), + __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); + #endif + } + ++void __init setup_cpu_entry_areas(void) ++{ ++ unsigned int cpu; ++ ++ for_each_possible_cpu(cpu) ++ setup_cpu_entry_area(cpu); ++} ++ + /* Load the original GDT from the per-cpu structure */ + void load_direct_gdt(int cpu) + { +@@ -1386,20 +1414,6 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; + DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; + EXPORT_PER_CPU_SYMBOL(__preempt_count); + +-/* +- * Special IST stacks which the CPU switches to when it calls +- * an IST-marked descriptor entry. 
Up to 7 stacks (hardware +- * limit), all of them are 4K, except the debug stack which +- * is 8K. +- */ +-static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { +- [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, +- [DEBUG_STACK - 1] = DEBUG_STKSZ +-}; +- +-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks +- [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); +- + /* May not be marked __init: used by software suspend */ + void syscall_init(void) + { +@@ -1608,7 +1622,7 @@ void cpu_init(void) + * set up and load the per-CPU TSS + */ + if (!oist->ist[0]) { +- char *estacks = per_cpu(exception_stacks, cpu); ++ char *estacks = get_cpu_entry_area(cpu)->exception_stacks; + + for (v = 0; v < N_EXCEPTION_STACKS; v++) { + estacks += exception_stack_sizes[v]; +@@ -1633,8 +1647,6 @@ void cpu_init(void) + BUG_ON(me->mm); + enter_lazy_tlb(&init_mm, me); + +- setup_cpu_entry_area(cpu); +- + /* + * Initialize the TSS. sp0 points to the entry trampoline stack + * regardless of what task is running. +@@ -1693,8 +1705,6 @@ void cpu_init(void) + BUG_ON(curr->mm); + enter_lazy_tlb(&init_mm, curr); + +- setup_cpu_entry_area(cpu); +- + /* + * Initialize the TSS. Don't bother initializing sp0, as the initial + * task never enters user mode. 
+diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c +index d9debdafe7a6..fd4d47e8672e 100644 +--- a/arch/x86/kernel/traps.c ++++ b/arch/x86/kernel/traps.c +@@ -992,6 +992,9 @@ void __init trap_init(void) + { + int i; + ++ /* Init cpu_entry_area before IST entries are set up */ ++ setup_cpu_entry_areas(); ++ + #ifdef CONFIG_EISA + void __iomem *p = early_ioremap(0x0FFFD9, 4); + +-- +2.14.2 + diff --git a/patches/kernel/0159-x86-entry-64-Remove-the-SYSENTER-stack-canary.patch b/patches/kernel/0159-x86-entry-64-Remove-the-SYSENTER-stack-canary.patch deleted file mode 100644 index 19f11f8..0000000 --- a/patches/kernel/0159-x86-entry-64-Remove-the-SYSENTER-stack-canary.patch +++ /dev/null @@ -1,111 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Mon, 4 Dec 2017 15:07:27 +0100 -Subject: [PATCH] x86/entry/64: Remove the SYSENTER stack canary -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Now that the SYSENTER stack has a guard page, there's no need for a canary -to detect overflow after the fact. - -Signed-off-by: Andy Lutomirski -Signed-off-by: Thomas Gleixner -Reviewed-by: Thomas Gleixner -Reviewed-by: Borislav Petkov -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Rik van Riel -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Link: https://lkml.kernel.org/r/20171204150606.572577316@linutronix.de -Signed-off-by: Ingo Molnar -(cherry picked from commit 7fbbd5cbebf118a9e09f5453f686656a167c3d1c) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 8158adf795cb48be67891feacacc36d7a247afdf) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/processor.h | 1 - - arch/x86/kernel/dumpstack.c | 3 +-- - arch/x86/kernel/process.c | 1 - - arch/x86/kernel/traps.c | 7 ------- - 4 files changed, 1 insertion(+), 11 deletions(-) - -diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h -index 1bfe4bad797a..4737d378d7b5 100644 ---- a/arch/x86/include/asm/processor.h -+++ b/arch/x86/include/asm/processor.h -@@ -335,7 +335,6 @@ struct tss_struct { - * Space for the temporary SYSENTER stack, used for SYSENTER - * and the entry trampoline as well. - */ -- unsigned long SYSENTER_stack_canary; - unsigned long SYSENTER_stack[64]; - - /* -diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c -index c1f503673f1e..c32c6cce9dcc 100644 ---- a/arch/x86/kernel/dumpstack.c -+++ b/arch/x86/kernel/dumpstack.c -@@ -48,8 +48,7 @@ bool in_sysenter_stack(unsigned long *stack, struct stack_info *info) - int cpu = smp_processor_id(); - struct tss_struct *tss = &get_cpu_entry_area(cpu)->tss; - -- /* Treat the canary as part of the stack for unwinding purposes. 
*/ -- void *begin = &tss->SYSENTER_stack_canary; -+ void *begin = &tss->SYSENTER_stack; - void *end = (void *)&tss->SYSENTER_stack + sizeof(tss->SYSENTER_stack); - - if ((void *)stack < begin || (void *)stack >= end) -diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c -index 407fc37a8718..ec758390d24e 100644 ---- a/arch/x86/kernel/process.c -+++ b/arch/x86/kernel/process.c -@@ -80,7 +80,6 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { - */ - .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, - #endif -- .SYSENTER_stack_canary = STACK_END_MAGIC, - }; - EXPORT_PER_CPU_SYMBOL(cpu_tss); - -diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c -index fd4d47e8672e..2818c83892b3 100644 ---- a/arch/x86/kernel/traps.c -+++ b/arch/x86/kernel/traps.c -@@ -826,13 +826,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) - debug_stack_usage_dec(); - - exit: -- /* -- * This is the most likely code path that involves non-trivial use -- * of the SYSENTER stack. Check that we haven't overrun it. -- */ -- WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC, -- "Overran or corrupted SYSENTER stack\n"); -- - ist_exit(regs); - } - NOKPROBE_SYMBOL(do_debug); --- -2.14.2 - diff --git a/patches/kernel/0160-x86-entry-64-Remove-the-SYSENTER-stack-canary.patch b/patches/kernel/0160-x86-entry-64-Remove-the-SYSENTER-stack-canary.patch new file mode 100644 index 0000000..19f11f8 --- /dev/null +++ b/patches/kernel/0160-x86-entry-64-Remove-the-SYSENTER-stack-canary.patch @@ -0,0 +1,111 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Mon, 4 Dec 2017 15:07:27 +0100 +Subject: [PATCH] x86/entry/64: Remove the SYSENTER stack canary +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Now that the SYSENTER stack has a guard page, there's no need for a canary +to detect overflow after the fact. 
+ +Signed-off-by: Andy Lutomirski +Signed-off-by: Thomas Gleixner +Reviewed-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Rik van Riel +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150606.572577316@linutronix.de +Signed-off-by: Ingo Molnar +(cherry picked from commit 7fbbd5cbebf118a9e09f5453f686656a167c3d1c) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 8158adf795cb48be67891feacacc36d7a247afdf) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/processor.h | 1 - + arch/x86/kernel/dumpstack.c | 3 +-- + arch/x86/kernel/process.c | 1 - + arch/x86/kernel/traps.c | 7 ------- + 4 files changed, 1 insertion(+), 11 deletions(-) + +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index 1bfe4bad797a..4737d378d7b5 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -335,7 +335,6 @@ struct tss_struct { + * Space for the temporary SYSENTER stack, used for SYSENTER + * and the entry trampoline as well. + */ +- unsigned long SYSENTER_stack_canary; + unsigned long SYSENTER_stack[64]; + + /* +diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c +index c1f503673f1e..c32c6cce9dcc 100644 +--- a/arch/x86/kernel/dumpstack.c ++++ b/arch/x86/kernel/dumpstack.c +@@ -48,8 +48,7 @@ bool in_sysenter_stack(unsigned long *stack, struct stack_info *info) + int cpu = smp_processor_id(); + struct tss_struct *tss = &get_cpu_entry_area(cpu)->tss; + +- /* Treat the canary as part of the stack for unwinding purposes. 
*/ +- void *begin = &tss->SYSENTER_stack_canary; ++ void *begin = &tss->SYSENTER_stack; + void *end = (void *)&tss->SYSENTER_stack + sizeof(tss->SYSENTER_stack); + + if ((void *)stack < begin || (void *)stack >= end) +diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c +index 407fc37a8718..ec758390d24e 100644 +--- a/arch/x86/kernel/process.c ++++ b/arch/x86/kernel/process.c +@@ -80,7 +80,6 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { + */ + .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, + #endif +- .SYSENTER_stack_canary = STACK_END_MAGIC, + }; + EXPORT_PER_CPU_SYMBOL(cpu_tss); + +diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c +index fd4d47e8672e..2818c83892b3 100644 +--- a/arch/x86/kernel/traps.c ++++ b/arch/x86/kernel/traps.c +@@ -826,13 +826,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) + debug_stack_usage_dec(); + + exit: +- /* +- * This is the most likely code path that involves non-trivial use +- * of the SYSENTER stack. Check that we haven't overrun it. +- */ +- WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC, +- "Overran or corrupted SYSENTER stack\n"); +- + ist_exit(regs); + } + NOKPROBE_SYMBOL(do_debug); +-- +2.14.2 + diff --git a/patches/kernel/0160-x86-entry-Clean-up-the-SYSENTER_stack-code.patch b/patches/kernel/0160-x86-entry-Clean-up-the-SYSENTER_stack-code.patch deleted file mode 100644 index e8b5e85..0000000 --- a/patches/kernel/0160-x86-entry-Clean-up-the-SYSENTER_stack-code.patch +++ /dev/null @@ -1,205 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Mon, 4 Dec 2017 15:07:28 +0100 -Subject: [PATCH] x86/entry: Clean up the SYSENTER_stack code -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -The existing code was a mess, mainly because C arrays are nasty. 
Turn -SYSENTER_stack into a struct, add a helper to find it, and do all the -obvious cleanups this enables. - -Signed-off-by: Andy Lutomirski -Signed-off-by: Thomas Gleixner -Reviewed-by: Thomas Gleixner -Reviewed-by: Borislav Petkov -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Rik van Riel -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Link: https://lkml.kernel.org/r/20171204150606.653244723@linutronix.de -Signed-off-by: Ingo Molnar -(cherry picked from commit 0f9a48100fba3f189724ae88a450c2261bf91c80) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit a308af33c794110c52427ad11d3a6d35ffc14b76) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/fixmap.h | 5 +++++ - arch/x86/include/asm/processor.h | 6 +++++- - arch/x86/kernel/asm-offsets.c | 6 ++---- - arch/x86/kernel/cpu/common.c | 14 +++----------- - arch/x86/kernel/dumpstack.c | 7 +++---- - arch/x86/entry/entry_32.S | 4 ++-- - arch/x86/entry/entry_64.S | 2 +- - 7 files changed, 21 insertions(+), 23 deletions(-) - -diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h -index 953aed54cb5e..56aaffbbffd6 100644 ---- a/arch/x86/include/asm/fixmap.h -+++ b/arch/x86/include/asm/fixmap.h -@@ -225,5 +225,10 @@ static inline struct cpu_entry_area *get_cpu_entry_area(int cpu) - return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0)); - } - -+static inline struct SYSENTER_stack *cpu_SYSENTER_stack(int cpu) -+{ -+ return &get_cpu_entry_area(cpu)->tss.SYSENTER_stack; -+} -+ - #endif /* !__ASSEMBLY__ */ - #endif /* _ASM_X86_FIXMAP_H */ -diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h 
-index 4737d378d7b5..2d489a414a86 100644 ---- a/arch/x86/include/asm/processor.h -+++ b/arch/x86/include/asm/processor.h -@@ -330,12 +330,16 @@ struct x86_hw_tss { - #define IO_BITMAP_OFFSET (offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss)) - #define INVALID_IO_BITMAP_OFFSET 0x8000 - -+struct SYSENTER_stack { -+ unsigned long words[64]; -+}; -+ - struct tss_struct { - /* - * Space for the temporary SYSENTER stack, used for SYSENTER - * and the entry trampoline as well. - */ -- unsigned long SYSENTER_stack[64]; -+ struct SYSENTER_stack SYSENTER_stack; - - /* - * The fixed hardware portion. This must not cross a page boundary -diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c -index 822be00c85ff..00ea20bfa857 100644 ---- a/arch/x86/kernel/asm-offsets.c -+++ b/arch/x86/kernel/asm-offsets.c -@@ -93,10 +93,8 @@ void common(void) { - BLANK(); - DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); - -- /* Offset from cpu_tss to SYSENTER_stack */ -- OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack); -- /* Size of SYSENTER_stack */ -- DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack)); -+ OFFSET(TSS_STRUCT_SYSENTER_stack, tss_struct, SYSENTER_stack); -+ DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack)); - - /* Layout info for cpu_entry_area */ - OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); -diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c -index f487766855d3..f9541c48c290 100644 ---- a/arch/x86/kernel/cpu/common.c -+++ b/arch/x86/kernel/cpu/common.c -@@ -1306,12 +1306,7 @@ void enable_sep_cpu(void) - - tss->x86_tss.ss1 = __KERNEL_CS; - wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); -- -- wrmsr(MSR_IA32_SYSENTER_ESP, -- (unsigned long)&get_cpu_entry_area(cpu)->tss + -- offsetofend(struct tss_struct, SYSENTER_stack), -- 0); -- -+ wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1), 0); - wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned 
long)entry_SYSENTER_32, 0); - - put_cpu(); -@@ -1437,9 +1432,7 @@ void syscall_init(void) - * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). - */ - wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); -- wrmsrl_safe(MSR_IA32_SYSENTER_ESP, -- (unsigned long)&get_cpu_entry_area(cpu)->tss + -- offsetofend(struct tss_struct, SYSENTER_stack)); -+ wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1)); - wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); - #else - wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); -@@ -1653,8 +1646,7 @@ void cpu_init(void) - */ - set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); - load_TR_desc(); -- load_sp0((unsigned long)&get_cpu_entry_area(cpu)->tss + -- offsetofend(struct tss_struct, SYSENTER_stack)); -+ load_sp0((unsigned long)(cpu_SYSENTER_stack(cpu) + 1)); - - load_mm_ldt(&init_mm); - -diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c -index c32c6cce9dcc..b005e5ef6738 100644 ---- a/arch/x86/kernel/dumpstack.c -+++ b/arch/x86/kernel/dumpstack.c -@@ -45,11 +45,10 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task, - - bool in_sysenter_stack(unsigned long *stack, struct stack_info *info) - { -- int cpu = smp_processor_id(); -- struct tss_struct *tss = &get_cpu_entry_area(cpu)->tss; -+ struct SYSENTER_stack *ss = cpu_SYSENTER_stack(smp_processor_id()); - -- void *begin = &tss->SYSENTER_stack; -- void *end = (void *)&tss->SYSENTER_stack + sizeof(tss->SYSENTER_stack); -+ void *begin = ss; -+ void *end = ss + 1; - - if ((void *)stack < begin || (void *)stack >= end) - return false; -diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S -index 41e0e103f090..04abcd3f8e2d 100644 ---- a/arch/x86/entry/entry_32.S -+++ b/arch/x86/entry/entry_32.S -@@ -949,7 +949,7 @@ ENTRY(debug) - - /* Are we currently on the SYSENTER stack? 
*/ - movl PER_CPU_VAR(cpu_entry_area), %ecx -- addl $CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx -+ addl $CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx - subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ - cmpl $SIZEOF_SYSENTER_stack, %ecx - jb .Ldebug_from_sysenter_stack -@@ -993,7 +993,7 @@ ENTRY(nmi) - - /* Are we currently on the SYSENTER stack? */ - movl PER_CPU_VAR(cpu_entry_area), %ecx -- addl $CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx -+ addl $CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx - subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ - cmpl $SIZEOF_SYSENTER_stack, %ecx - jb .Lnmi_from_sysenter_stack -diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S -index dc100a7052ee..7a5e9edcdaf4 100644 ---- a/arch/x86/entry/entry_64.S -+++ b/arch/x86/entry/entry_64.S -@@ -153,7 +153,7 @@ END(native_usergs_sysret64) - _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) - - /* The top word of the SYSENTER stack is hot and is usable as scratch space. 
*/ --#define RSP_SCRATCH CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + \ -+#define RSP_SCRATCH CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + \ - SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA - - ENTRY(entry_SYSCALL_64_trampoline) --- -2.14.2 - diff --git a/patches/kernel/0161-x86-entry-64-Make-cpu_entry_area.tss-read-only.patch b/patches/kernel/0161-x86-entry-64-Make-cpu_entry_area.tss-read-only.patch deleted file mode 100644 index 42ae5cd..0000000 --- a/patches/kernel/0161-x86-entry-64-Make-cpu_entry_area.tss-read-only.patch +++ /dev/null @@ -1,492 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Mon, 4 Dec 2017 15:07:29 +0100 -Subject: [PATCH] x86/entry/64: Make cpu_entry_area.tss read-only -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -The TSS is a fairly juicy target for exploits, and, now that the TSS -is in the cpu_entry_area, it's no longer protected by kASLR. Make it -read-only on x86_64. - -On x86_32, it can't be RO because it's written by the CPU during task -switches, and we use a task gate for double faults. I'd also be -nervous about errata if we tried to make it RO even on configurations -without double fault handling. - -[ tglx: AMD confirmed that there is no problem on 64-bit with TSS RO. So - it's probably safe to assume that it's a non issue, though Intel - might have been creative in that area. Still waiting for - confirmation. ] - -Signed-off-by: Andy Lutomirski -Signed-off-by: Thomas Gleixner -Reviewed-by: Borislav Petkov -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Kees Cook -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Rik van Riel -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Link: https://lkml.kernel.org/r/20171204150606.733700132@linutronix.de -Signed-off-by: Ingo Molnar -(backported from commit c482feefe1aeb150156248ba0fd3e029bc886605) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 785be108f90cd62eab2da17490714085ef752538) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/fixmap.h | 13 +++++++++---- - arch/x86/include/asm/processor.h | 17 ++++++++--------- - arch/x86/include/asm/switch_to.h | 4 ++-- - arch/x86/include/asm/thread_info.h | 2 +- - arch/x86/kernel/asm-offsets.c | 5 ++--- - arch/x86/kernel/asm-offsets_32.c | 4 ++-- - arch/x86/kernel/cpu/common.c | 29 +++++++++++++++++++---------- - arch/x86/kernel/ioport.c | 2 +- - arch/x86/kernel/process.c | 6 +++--- - arch/x86/kernel/process_32.c | 2 +- - arch/x86/kernel/process_64.c | 2 +- - arch/x86/kernel/traps.c | 4 ++-- - arch/x86/lib/delay.c | 4 ++-- - arch/x86/xen/enlighten_pv.c | 2 +- - arch/x86/entry/entry_32.S | 4 ++-- - arch/x86/entry/entry_64.S | 8 ++++---- - 16 files changed, 60 insertions(+), 48 deletions(-) - -diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h -index 56aaffbbffd6..5dc269ff4085 100644 ---- a/arch/x86/include/asm/fixmap.h -+++ b/arch/x86/include/asm/fixmap.h -@@ -56,9 +56,14 @@ struct cpu_entry_area { - char gdt[PAGE_SIZE]; - - /* -- * The GDT is just below cpu_tss and thus serves (on x86_64) as a -- * a read-only guard page for the SYSENTER stack at the bottom -- * of the TSS region. -+ * The GDT is just below SYSENTER_stack and thus serves (on x86_64) as -+ * a a read-only guard page. -+ */ -+ struct SYSENTER_stack_page SYSENTER_stack_page; -+ -+ /* -+ * On x86_64, the TSS is mapped RO. 
On x86_32, it's mapped RW because -+ * we need task switches to work, and task switches write to the TSS. - */ - struct tss_struct tss; - -@@ -227,7 +232,7 @@ static inline struct cpu_entry_area *get_cpu_entry_area(int cpu) - - static inline struct SYSENTER_stack *cpu_SYSENTER_stack(int cpu) - { -- return &get_cpu_entry_area(cpu)->tss.SYSENTER_stack; -+ return &get_cpu_entry_area(cpu)->SYSENTER_stack_page.stack; - } - - #endif /* !__ASSEMBLY__ */ -diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h -index 2d489a414a86..bccec7ed1676 100644 ---- a/arch/x86/include/asm/processor.h -+++ b/arch/x86/include/asm/processor.h -@@ -334,13 +334,11 @@ struct SYSENTER_stack { - unsigned long words[64]; - }; - --struct tss_struct { -- /* -- * Space for the temporary SYSENTER stack, used for SYSENTER -- * and the entry trampoline as well. -- */ -- struct SYSENTER_stack SYSENTER_stack; -+struct SYSENTER_stack_page { -+ struct SYSENTER_stack stack; -+} __aligned(PAGE_SIZE); - -+struct tss_struct { - /* - * The fixed hardware portion. This must not cross a page boundary - * at risk of violating the SDM's advice and potentially triggering -@@ -357,7 +355,7 @@ struct tss_struct { - unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; - } __aligned(PAGE_SIZE); - --DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss); -+DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw); - - /* - * sizeof(unsigned long) coming from an extra "long" at the end -@@ -372,7 +370,8 @@ DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss); - #ifdef CONFIG_X86_32 - DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); - #else --#define cpu_current_top_of_stack cpu_tss.x86_tss.sp1 -+/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. 
*/ -+#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1 - #endif - - /* -@@ -532,7 +531,7 @@ static inline void native_set_iopl_mask(unsigned mask) - static inline void - native_load_sp0(unsigned long sp0) - { -- this_cpu_write(cpu_tss.x86_tss.sp0, sp0); -+ this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0); - } - - static inline void native_swapgs(void) -diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h -index ca2fc84ad278..cfb6dfe4c457 100644 ---- a/arch/x86/include/asm/switch_to.h -+++ b/arch/x86/include/asm/switch_to.h -@@ -78,10 +78,10 @@ do { \ - static inline void refresh_sysenter_cs(struct thread_struct *thread) - { - /* Only happens when SEP is enabled, no need to test "SEP"arately: */ -- if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == thread->sysenter_cs)) -+ if (unlikely(this_cpu_read(cpu_tss_rw.x86_tss.ss1) == thread->sysenter_cs)) - return; - -- this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs); -+ this_cpu_write(cpu_tss_rw.x86_tss.ss1, thread->sysenter_cs); - wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); - } - #endif -diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h -index 760dd8a73927..6275b391ac61 100644 ---- a/arch/x86/include/asm/thread_info.h -+++ b/arch/x86/include/asm/thread_info.h -@@ -214,7 +214,7 @@ static inline int arch_within_stack_frames(const void * const stack, - #else /* !__ASSEMBLY__ */ - - #ifdef CONFIG_X86_64 --# define cpu_current_top_of_stack (cpu_tss + TSS_sp1) -+# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1) - #endif - - #endif -diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c -index 00ea20bfa857..40c3fab107ac 100644 ---- a/arch/x86/kernel/asm-offsets.c -+++ b/arch/x86/kernel/asm-offsets.c -@@ -93,10 +93,9 @@ void common(void) { - BLANK(); - DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); - -- OFFSET(TSS_STRUCT_SYSENTER_stack, tss_struct, SYSENTER_stack); -- DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct 
SYSENTER_stack)); -- - /* Layout info for cpu_entry_area */ - OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); - OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); -+ OFFSET(CPU_ENTRY_AREA_SYSENTER_stack, cpu_entry_area, SYSENTER_stack_page); -+ DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack)); - } -diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c -index d09b161a3bd0..c4f23da7a0f0 100644 ---- a/arch/x86/kernel/asm-offsets_32.c -+++ b/arch/x86/kernel/asm-offsets_32.c -@@ -49,8 +49,8 @@ void foo(void) - BLANK(); - - /* Offset from the sysenter stack to tss.sp0 */ -- DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - -- offsetofend(struct tss_struct, SYSENTER_stack)); -+ DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) - -+ offsetofend(struct cpu_entry_area, SYSENTER_stack_page.stack)); - - #ifdef CONFIG_CC_STACKPROTECTOR - BLANK(); -diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c -index f9541c48c290..7992e5a8076c 100644 ---- a/arch/x86/kernel/cpu/common.c -+++ b/arch/x86/kernel/cpu/common.c -@@ -487,6 +487,9 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks - [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); - #endif - -+static DEFINE_PER_CPU_PAGE_ALIGNED(struct SYSENTER_stack_page, -+ SYSENTER_stack_storage); -+ - static void __init - set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot) - { -@@ -500,23 +503,29 @@ static void __init setup_cpu_entry_area(int cpu) - #ifdef CONFIG_X86_64 - extern char _entry_trampoline[]; - -- /* On 64-bit systems, we use a read-only fixmap GDT. */ -+ /* On 64-bit systems, we use a read-only fixmap GDT and TSS. 
*/ - pgprot_t gdt_prot = PAGE_KERNEL_RO; -+ pgprot_t tss_prot = PAGE_KERNEL_RO; - #else - /* - * On native 32-bit systems, the GDT cannot be read-only because - * our double fault handler uses a task gate, and entering through -- * a task gate needs to change an available TSS to busy. If the GDT -- * is read-only, that will triple fault. -+ * a task gate needs to change an available TSS to busy. If the -+ * GDT is read-only, that will triple fault. The TSS cannot be -+ * read-only because the CPU writes to it on task switches. - * -- * On Xen PV, the GDT must be read-only because the hypervisor requires -- * it. -+ * On Xen PV, the GDT must be read-only because the hypervisor -+ * requires it. - */ - pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ? - PAGE_KERNEL_RO : PAGE_KERNEL; -+ pgprot_t tss_prot = PAGE_KERNEL; - #endif - - __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot); -+ set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, SYSENTER_stack_page), -+ per_cpu_ptr(&SYSENTER_stack_storage, cpu), 1, -+ PAGE_KERNEL); - - /* - * The Intel SDM says (Volume 3, 7.2.1): -@@ -539,9 +548,9 @@ static void __init setup_cpu_entry_area(int cpu) - offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); - BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); - set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss), -- &per_cpu(cpu_tss, cpu), -+ &per_cpu(cpu_tss_rw, cpu), - sizeof(struct tss_struct) / PAGE_SIZE, -- PAGE_KERNEL); -+ tss_prot); - - #ifdef CONFIG_X86_32 - per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); -@@ -1297,7 +1306,7 @@ void enable_sep_cpu(void) - return; - - cpu = get_cpu(); -- tss = &per_cpu(cpu_tss, cpu); -+ tss = &per_cpu(cpu_tss_rw, cpu); - - /* - * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field -- -@@ -1576,7 +1585,7 @@ void cpu_init(void) - if (cpu) - load_ucode_ap(); - -- t = &per_cpu(cpu_tss, cpu); -+ t = &per_cpu(cpu_tss_rw, cpu); - oist = &per_cpu(orig_ist, cpu); - - #ifdef 
CONFIG_NUMA -@@ -1667,7 +1676,7 @@ void cpu_init(void) - { - int cpu = smp_processor_id(); - struct task_struct *curr = current; -- struct tss_struct *t = &per_cpu(cpu_tss, cpu); -+ struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu); - - wait_for_master_cpu(cpu); - -diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c -index 4a613fed94b6..d13777d49d8b 100644 ---- a/arch/x86/kernel/ioport.c -+++ b/arch/x86/kernel/ioport.c -@@ -66,7 +66,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) - * because the ->io_bitmap_max value must match the bitmap - * contents: - */ -- tss = &per_cpu(cpu_tss, get_cpu()); -+ tss = &per_cpu(cpu_tss_rw, get_cpu()); - - if (turn_on) - bitmap_clear(t->io_bitmap_ptr, from, num); -diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c -index ec758390d24e..3688a7b9d055 100644 ---- a/arch/x86/kernel/process.c -+++ b/arch/x86/kernel/process.c -@@ -46,7 +46,7 @@ - * section. Since TSS's are completely CPU-local, we want them - * on exact cacheline boundaries, to eliminate cacheline ping-pong. - */ --__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { -+__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss_rw) = { - .x86_tss = { - /* - * .sp0 is only used when entering ring 0 from a lower -@@ -81,7 +81,7 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { - .io_bitmap = { [0 ... 
IO_BITMAP_LONGS] = ~0 }, - #endif - }; --EXPORT_PER_CPU_SYMBOL(cpu_tss); -+EXPORT_PER_CPU_SYMBOL(cpu_tss_rw); - - DEFINE_PER_CPU(bool, __tss_limit_invalid); - EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid); -@@ -110,7 +110,7 @@ void exit_thread(struct task_struct *tsk) - struct fpu *fpu = &t->fpu; - - if (bp) { -- struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu()); -+ struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu()); - - t->io_bitmap_ptr = NULL; - clear_thread_flag(TIF_IO_BITMAP); -diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c -index c0d60420466c..784ff9147172 100644 ---- a/arch/x86/kernel/process_32.c -+++ b/arch/x86/kernel/process_32.c -@@ -234,7 +234,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) - struct fpu *prev_fpu = &prev->fpu; - struct fpu *next_fpu = &next->fpu; - int cpu = smp_processor_id(); -- struct tss_struct *tss = &per_cpu(cpu_tss, cpu); -+ struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu); - - /* never put a printk in __switch_to... 
printk() calls wake_up*() indirectly */ - -diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c -index 157f81816915..c75466232016 100644 ---- a/arch/x86/kernel/process_64.c -+++ b/arch/x86/kernel/process_64.c -@@ -399,7 +399,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) - struct fpu *prev_fpu = &prev->fpu; - struct fpu *next_fpu = &next->fpu; - int cpu = smp_processor_id(); -- struct tss_struct *tss = &per_cpu(cpu_tss, cpu); -+ struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu); - - WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && - this_cpu_read(irq_count) != -1); -diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c -index 2818c83892b3..14b462eefa17 100644 ---- a/arch/x86/kernel/traps.c -+++ b/arch/x86/kernel/traps.c -@@ -376,7 +376,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) - regs->cs == __KERNEL_CS && - regs->ip == (unsigned long)native_irq_return_iret) - { -- struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss.x86_tss.sp0) - 1; -+ struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; - - /* - * regs->sp points to the failing IRET frame on the -@@ -661,7 +661,7 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) - * exception came from the IRET target. - */ - struct bad_iret_stack *new_stack = -- (struct bad_iret_stack *)this_cpu_read(cpu_tss.x86_tss.sp0) - 1; -+ (struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; - - /* Copy the IRET target to the new stack. 
*/ - memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8); -diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c -index 29df077cb089..cf2ac227c2ac 100644 ---- a/arch/x86/lib/delay.c -+++ b/arch/x86/lib/delay.c -@@ -106,10 +106,10 @@ static void delay_mwaitx(unsigned long __loops) - delay = min_t(u64, MWAITX_MAX_LOOPS, loops); - - /* -- * Use cpu_tss as a cacheline-aligned, seldomly -+ * Use cpu_tss_rw as a cacheline-aligned, seldomly - * accessed per-cpu variable as the monitor target. - */ -- __monitorx(raw_cpu_ptr(&cpu_tss), 0, 0); -+ __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0); - - /* - * AMD, like Intel, supports the EAX hint and EAX=0xf -diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c -index 63c81154083b..3b76cf85e306 100644 ---- a/arch/x86/xen/enlighten_pv.c -+++ b/arch/x86/xen/enlighten_pv.c -@@ -817,7 +817,7 @@ static void xen_load_sp0(unsigned long sp0) - mcs = xen_mc_entry(0); - MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0); - xen_mc_issue(PARAVIRT_LAZY_CPU); -- this_cpu_write(cpu_tss.x86_tss.sp0, sp0); -+ this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0); - } - - void xen_set_iopl_mask(unsigned mask) -diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S -index 04abcd3f8e2d..3ef7800007f8 100644 ---- a/arch/x86/entry/entry_32.S -+++ b/arch/x86/entry/entry_32.S -@@ -949,7 +949,7 @@ ENTRY(debug) - - /* Are we currently on the SYSENTER stack? */ - movl PER_CPU_VAR(cpu_entry_area), %ecx -- addl $CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx -+ addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx - subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ - cmpl $SIZEOF_SYSENTER_stack, %ecx - jb .Ldebug_from_sysenter_stack -@@ -993,7 +993,7 @@ ENTRY(nmi) - - /* Are we currently on the SYSENTER stack? 
*/ - movl PER_CPU_VAR(cpu_entry_area), %ecx -- addl $CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx -+ addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx - subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ - cmpl $SIZEOF_SYSENTER_stack, %ecx - jb .Lnmi_from_sysenter_stack -diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S -index 7a5e9edcdaf4..157860b3569f 100644 ---- a/arch/x86/entry/entry_64.S -+++ b/arch/x86/entry/entry_64.S -@@ -153,7 +153,7 @@ END(native_usergs_sysret64) - _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) - - /* The top word of the SYSENTER stack is hot and is usable as scratch space. */ --#define RSP_SCRATCH CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + \ -+#define RSP_SCRATCH CPU_ENTRY_AREA_SYSENTER_stack + \ - SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA - - ENTRY(entry_SYSCALL_64_trampoline) -@@ -389,7 +389,7 @@ syscall_return_via_sysret: - * Save old stack pointer and switch to trampoline stack. - */ - movq %rsp, %rdi -- movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp -+ movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp - - pushq RSP-RDI(%rdi) /* RSP */ - pushq (%rdi) /* RDI */ -@@ -718,7 +718,7 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode) - * Save old stack pointer and switch to trampoline stack. - */ - movq %rsp, %rdi -- movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp -+ movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp - - /* Copy the IRET frame to the trampoline stack. */ - pushq 6*8(%rdi) /* SS */ -@@ -946,7 +946,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt - /* - * Exception entry points. - */ --#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8) -+#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8) - - /* - * Switch to the thread stack. 
This is called with the IRET frame and --- -2.14.2 - diff --git a/patches/kernel/0161-x86-entry-Clean-up-the-SYSENTER_stack-code.patch b/patches/kernel/0161-x86-entry-Clean-up-the-SYSENTER_stack-code.patch new file mode 100644 index 0000000..e8b5e85 --- /dev/null +++ b/patches/kernel/0161-x86-entry-Clean-up-the-SYSENTER_stack-code.patch @@ -0,0 +1,205 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Mon, 4 Dec 2017 15:07:28 +0100 +Subject: [PATCH] x86/entry: Clean up the SYSENTER_stack code +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +The existing code was a mess, mainly because C arrays are nasty. Turn +SYSENTER_stack into a struct, add a helper to find it, and do all the +obvious cleanups this enables. + +Signed-off-by: Andy Lutomirski +Signed-off-by: Thomas Gleixner +Reviewed-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Rik van Riel +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150606.653244723@linutronix.de +Signed-off-by: Ingo Molnar +(cherry picked from commit 0f9a48100fba3f189724ae88a450c2261bf91c80) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit a308af33c794110c52427ad11d3a6d35ffc14b76) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/fixmap.h | 5 +++++ + arch/x86/include/asm/processor.h | 6 +++++- + arch/x86/kernel/asm-offsets.c | 6 ++---- + arch/x86/kernel/cpu/common.c | 14 +++----------- + arch/x86/kernel/dumpstack.c | 7 +++---- + arch/x86/entry/entry_32.S | 4 ++-- + arch/x86/entry/entry_64.S | 2 +- + 7 files changed, 21 insertions(+), 23 deletions(-) + +diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h +index 953aed54cb5e..56aaffbbffd6 100644 +--- a/arch/x86/include/asm/fixmap.h ++++ b/arch/x86/include/asm/fixmap.h +@@ -225,5 +225,10 @@ static inline struct cpu_entry_area *get_cpu_entry_area(int cpu) + return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0)); + } + ++static inline struct SYSENTER_stack *cpu_SYSENTER_stack(int cpu) ++{ ++ return &get_cpu_entry_area(cpu)->tss.SYSENTER_stack; ++} ++ + #endif /* !__ASSEMBLY__ */ + #endif /* _ASM_X86_FIXMAP_H */ +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index 4737d378d7b5..2d489a414a86 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -330,12 +330,16 @@ struct x86_hw_tss { + #define IO_BITMAP_OFFSET (offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss)) + #define INVALID_IO_BITMAP_OFFSET 0x8000 + ++struct SYSENTER_stack { ++ unsigned long words[64]; ++}; ++ + struct tss_struct { + /* + * 
Space for the temporary SYSENTER stack, used for SYSENTER + * and the entry trampoline as well. + */ +- unsigned long SYSENTER_stack[64]; ++ struct SYSENTER_stack SYSENTER_stack; + + /* + * The fixed hardware portion. This must not cross a page boundary +diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c +index 822be00c85ff..00ea20bfa857 100644 +--- a/arch/x86/kernel/asm-offsets.c ++++ b/arch/x86/kernel/asm-offsets.c +@@ -93,10 +93,8 @@ void common(void) { + BLANK(); + DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); + +- /* Offset from cpu_tss to SYSENTER_stack */ +- OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack); +- /* Size of SYSENTER_stack */ +- DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack)); ++ OFFSET(TSS_STRUCT_SYSENTER_stack, tss_struct, SYSENTER_stack); ++ DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack)); + + /* Layout info for cpu_entry_area */ + OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index f487766855d3..f9541c48c290 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -1306,12 +1306,7 @@ void enable_sep_cpu(void) + + tss->x86_tss.ss1 = __KERNEL_CS; + wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); +- +- wrmsr(MSR_IA32_SYSENTER_ESP, +- (unsigned long)&get_cpu_entry_area(cpu)->tss + +- offsetofend(struct tss_struct, SYSENTER_stack), +- 0); +- ++ wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1), 0); + wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); + + put_cpu(); +@@ -1437,9 +1432,7 @@ void syscall_init(void) + * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). 
+ */ + wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); +- wrmsrl_safe(MSR_IA32_SYSENTER_ESP, +- (unsigned long)&get_cpu_entry_area(cpu)->tss + +- offsetofend(struct tss_struct, SYSENTER_stack)); ++ wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1)); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); + #else + wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); +@@ -1653,8 +1646,7 @@ void cpu_init(void) + */ + set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); + load_TR_desc(); +- load_sp0((unsigned long)&get_cpu_entry_area(cpu)->tss + +- offsetofend(struct tss_struct, SYSENTER_stack)); ++ load_sp0((unsigned long)(cpu_SYSENTER_stack(cpu) + 1)); + + load_mm_ldt(&init_mm); + +diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c +index c32c6cce9dcc..b005e5ef6738 100644 +--- a/arch/x86/kernel/dumpstack.c ++++ b/arch/x86/kernel/dumpstack.c +@@ -45,11 +45,10 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task, + + bool in_sysenter_stack(unsigned long *stack, struct stack_info *info) + { +- int cpu = smp_processor_id(); +- struct tss_struct *tss = &get_cpu_entry_area(cpu)->tss; ++ struct SYSENTER_stack *ss = cpu_SYSENTER_stack(smp_processor_id()); + +- void *begin = &tss->SYSENTER_stack; +- void *end = (void *)&tss->SYSENTER_stack + sizeof(tss->SYSENTER_stack); ++ void *begin = ss; ++ void *end = ss + 1; + + if ((void *)stack < begin || (void *)stack >= end) + return false; +diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S +index 41e0e103f090..04abcd3f8e2d 100644 +--- a/arch/x86/entry/entry_32.S ++++ b/arch/x86/entry/entry_32.S +@@ -949,7 +949,7 @@ ENTRY(debug) + + /* Are we currently on the SYSENTER stack? 
*/ + movl PER_CPU_VAR(cpu_entry_area), %ecx +- addl $CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx ++ addl $CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx + subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ + cmpl $SIZEOF_SYSENTER_stack, %ecx + jb .Ldebug_from_sysenter_stack +@@ -993,7 +993,7 @@ ENTRY(nmi) + + /* Are we currently on the SYSENTER stack? */ + movl PER_CPU_VAR(cpu_entry_area), %ecx +- addl $CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx ++ addl $CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx + subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ + cmpl $SIZEOF_SYSENTER_stack, %ecx + jb .Lnmi_from_sysenter_stack +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index dc100a7052ee..7a5e9edcdaf4 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -153,7 +153,7 @@ END(native_usergs_sysret64) + _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) + + /* The top word of the SYSENTER stack is hot and is usable as scratch space. 
*/ +-#define RSP_SCRATCH CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + \ ++#define RSP_SCRATCH CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + \ + SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA + + ENTRY(entry_SYSCALL_64_trampoline) +-- +2.14.2 + diff --git a/patches/kernel/0162-x86-entry-64-Make-cpu_entry_area.tss-read-only.patch b/patches/kernel/0162-x86-entry-64-Make-cpu_entry_area.tss-read-only.patch new file mode 100644 index 0000000..42ae5cd --- /dev/null +++ b/patches/kernel/0162-x86-entry-64-Make-cpu_entry_area.tss-read-only.patch @@ -0,0 +1,492 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Mon, 4 Dec 2017 15:07:29 +0100 +Subject: [PATCH] x86/entry/64: Make cpu_entry_area.tss read-only +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +The TSS is a fairly juicy target for exploits, and, now that the TSS +is in the cpu_entry_area, it's no longer protected by kASLR. Make it +read-only on x86_64. + +On x86_32, it can't be RO because it's written by the CPU during task +switches, and we use a task gate for double faults. I'd also be +nervous about errata if we tried to make it RO even on configurations +without double fault handling. + +[ tglx: AMD confirmed that there is no problem on 64-bit with TSS RO. So + it's probably safe to assume that it's a non issue, though Intel + might have been creative in that area. Still waiting for + confirmation. ] + +Signed-off-by: Andy Lutomirski +Signed-off-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Kees Cook +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Rik van Riel +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150606.733700132@linutronix.de +Signed-off-by: Ingo Molnar +(backported from commit c482feefe1aeb150156248ba0fd3e029bc886605) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 785be108f90cd62eab2da17490714085ef752538) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/fixmap.h | 13 +++++++++---- + arch/x86/include/asm/processor.h | 17 ++++++++--------- + arch/x86/include/asm/switch_to.h | 4 ++-- + arch/x86/include/asm/thread_info.h | 2 +- + arch/x86/kernel/asm-offsets.c | 5 ++--- + arch/x86/kernel/asm-offsets_32.c | 4 ++-- + arch/x86/kernel/cpu/common.c | 29 +++++++++++++++++++---------- + arch/x86/kernel/ioport.c | 2 +- + arch/x86/kernel/process.c | 6 +++--- + arch/x86/kernel/process_32.c | 2 +- + arch/x86/kernel/process_64.c | 2 +- + arch/x86/kernel/traps.c | 4 ++-- + arch/x86/lib/delay.c | 4 ++-- + arch/x86/xen/enlighten_pv.c | 2 +- + arch/x86/entry/entry_32.S | 4 ++-- + arch/x86/entry/entry_64.S | 8 ++++---- + 16 files changed, 60 insertions(+), 48 deletions(-) + +diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h +index 56aaffbbffd6..5dc269ff4085 100644 +--- a/arch/x86/include/asm/fixmap.h ++++ b/arch/x86/include/asm/fixmap.h +@@ -56,9 +56,14 @@ struct cpu_entry_area { + char gdt[PAGE_SIZE]; + + /* +- * The GDT is just below cpu_tss and thus serves (on x86_64) as a +- * a read-only guard page for the SYSENTER stack at the bottom +- * of the TSS region. ++ * The GDT is just below SYSENTER_stack and thus serves (on x86_64) as ++ * a a read-only guard page. ++ */ ++ struct SYSENTER_stack_page SYSENTER_stack_page; ++ ++ /* ++ * On x86_64, the TSS is mapped RO. 
On x86_32, it's mapped RW because ++ * we need task switches to work, and task switches write to the TSS. + */ + struct tss_struct tss; + +@@ -227,7 +232,7 @@ static inline struct cpu_entry_area *get_cpu_entry_area(int cpu) + + static inline struct SYSENTER_stack *cpu_SYSENTER_stack(int cpu) + { +- return &get_cpu_entry_area(cpu)->tss.SYSENTER_stack; ++ return &get_cpu_entry_area(cpu)->SYSENTER_stack_page.stack; + } + + #endif /* !__ASSEMBLY__ */ +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index 2d489a414a86..bccec7ed1676 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -334,13 +334,11 @@ struct SYSENTER_stack { + unsigned long words[64]; + }; + +-struct tss_struct { +- /* +- * Space for the temporary SYSENTER stack, used for SYSENTER +- * and the entry trampoline as well. +- */ +- struct SYSENTER_stack SYSENTER_stack; ++struct SYSENTER_stack_page { ++ struct SYSENTER_stack stack; ++} __aligned(PAGE_SIZE); + ++struct tss_struct { + /* + * The fixed hardware portion. This must not cross a page boundary + * at risk of violating the SDM's advice and potentially triggering +@@ -357,7 +355,7 @@ struct tss_struct { + unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; + } __aligned(PAGE_SIZE); + +-DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss); ++DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw); + + /* + * sizeof(unsigned long) coming from an extra "long" at the end +@@ -372,7 +370,8 @@ DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss); + #ifdef CONFIG_X86_32 + DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); + #else +-#define cpu_current_top_of_stack cpu_tss.x86_tss.sp1 ++/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. 
*/ ++#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1 + #endif + + /* +@@ -532,7 +531,7 @@ static inline void native_set_iopl_mask(unsigned mask) + static inline void + native_load_sp0(unsigned long sp0) + { +- this_cpu_write(cpu_tss.x86_tss.sp0, sp0); ++ this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0); + } + + static inline void native_swapgs(void) +diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h +index ca2fc84ad278..cfb6dfe4c457 100644 +--- a/arch/x86/include/asm/switch_to.h ++++ b/arch/x86/include/asm/switch_to.h +@@ -78,10 +78,10 @@ do { \ + static inline void refresh_sysenter_cs(struct thread_struct *thread) + { + /* Only happens when SEP is enabled, no need to test "SEP"arately: */ +- if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == thread->sysenter_cs)) ++ if (unlikely(this_cpu_read(cpu_tss_rw.x86_tss.ss1) == thread->sysenter_cs)) + return; + +- this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs); ++ this_cpu_write(cpu_tss_rw.x86_tss.ss1, thread->sysenter_cs); + wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); + } + #endif +diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h +index 760dd8a73927..6275b391ac61 100644 +--- a/arch/x86/include/asm/thread_info.h ++++ b/arch/x86/include/asm/thread_info.h +@@ -214,7 +214,7 @@ static inline int arch_within_stack_frames(const void * const stack, + #else /* !__ASSEMBLY__ */ + + #ifdef CONFIG_X86_64 +-# define cpu_current_top_of_stack (cpu_tss + TSS_sp1) ++# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1) + #endif + + #endif +diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c +index 00ea20bfa857..40c3fab107ac 100644 +--- a/arch/x86/kernel/asm-offsets.c ++++ b/arch/x86/kernel/asm-offsets.c +@@ -93,10 +93,9 @@ void common(void) { + BLANK(); + DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); + +- OFFSET(TSS_STRUCT_SYSENTER_stack, tss_struct, SYSENTER_stack); +- DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct 
SYSENTER_stack)); +- + /* Layout info for cpu_entry_area */ + OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); + OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); ++ OFFSET(CPU_ENTRY_AREA_SYSENTER_stack, cpu_entry_area, SYSENTER_stack_page); ++ DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack)); + } +diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c +index d09b161a3bd0..c4f23da7a0f0 100644 +--- a/arch/x86/kernel/asm-offsets_32.c ++++ b/arch/x86/kernel/asm-offsets_32.c +@@ -49,8 +49,8 @@ void foo(void) + BLANK(); + + /* Offset from the sysenter stack to tss.sp0 */ +- DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - +- offsetofend(struct tss_struct, SYSENTER_stack)); ++ DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) - ++ offsetofend(struct cpu_entry_area, SYSENTER_stack_page.stack)); + + #ifdef CONFIG_CC_STACKPROTECTOR + BLANK(); +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index f9541c48c290..7992e5a8076c 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -487,6 +487,9 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks + [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); + #endif + ++static DEFINE_PER_CPU_PAGE_ALIGNED(struct SYSENTER_stack_page, ++ SYSENTER_stack_storage); ++ + static void __init + set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot) + { +@@ -500,23 +503,29 @@ static void __init setup_cpu_entry_area(int cpu) + #ifdef CONFIG_X86_64 + extern char _entry_trampoline[]; + +- /* On 64-bit systems, we use a read-only fixmap GDT. */ ++ /* On 64-bit systems, we use a read-only fixmap GDT and TSS. 
*/ + pgprot_t gdt_prot = PAGE_KERNEL_RO; ++ pgprot_t tss_prot = PAGE_KERNEL_RO; + #else + /* + * On native 32-bit systems, the GDT cannot be read-only because + * our double fault handler uses a task gate, and entering through +- * a task gate needs to change an available TSS to busy. If the GDT +- * is read-only, that will triple fault. ++ * a task gate needs to change an available TSS to busy. If the ++ * GDT is read-only, that will triple fault. The TSS cannot be ++ * read-only because the CPU writes to it on task switches. + * +- * On Xen PV, the GDT must be read-only because the hypervisor requires +- * it. ++ * On Xen PV, the GDT must be read-only because the hypervisor ++ * requires it. + */ + pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ? + PAGE_KERNEL_RO : PAGE_KERNEL; ++ pgprot_t tss_prot = PAGE_KERNEL; + #endif + + __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot); ++ set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, SYSENTER_stack_page), ++ per_cpu_ptr(&SYSENTER_stack_storage, cpu), 1, ++ PAGE_KERNEL); + + /* + * The Intel SDM says (Volume 3, 7.2.1): +@@ -539,9 +548,9 @@ static void __init setup_cpu_entry_area(int cpu) + offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); + BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); + set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss), +- &per_cpu(cpu_tss, cpu), ++ &per_cpu(cpu_tss_rw, cpu), + sizeof(struct tss_struct) / PAGE_SIZE, +- PAGE_KERNEL); ++ tss_prot); + + #ifdef CONFIG_X86_32 + per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); +@@ -1297,7 +1306,7 @@ void enable_sep_cpu(void) + return; + + cpu = get_cpu(); +- tss = &per_cpu(cpu_tss, cpu); ++ tss = &per_cpu(cpu_tss_rw, cpu); + + /* + * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field -- +@@ -1576,7 +1585,7 @@ void cpu_init(void) + if (cpu) + load_ucode_ap(); + +- t = &per_cpu(cpu_tss, cpu); ++ t = &per_cpu(cpu_tss_rw, cpu); + oist = &per_cpu(orig_ist, cpu); + + #ifdef 
CONFIG_NUMA +@@ -1667,7 +1676,7 @@ void cpu_init(void) + { + int cpu = smp_processor_id(); + struct task_struct *curr = current; +- struct tss_struct *t = &per_cpu(cpu_tss, cpu); ++ struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu); + + wait_for_master_cpu(cpu); + +diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c +index 4a613fed94b6..d13777d49d8b 100644 +--- a/arch/x86/kernel/ioport.c ++++ b/arch/x86/kernel/ioport.c +@@ -66,7 +66,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) + * because the ->io_bitmap_max value must match the bitmap + * contents: + */ +- tss = &per_cpu(cpu_tss, get_cpu()); ++ tss = &per_cpu(cpu_tss_rw, get_cpu()); + + if (turn_on) + bitmap_clear(t->io_bitmap_ptr, from, num); +diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c +index ec758390d24e..3688a7b9d055 100644 +--- a/arch/x86/kernel/process.c ++++ b/arch/x86/kernel/process.c +@@ -46,7 +46,7 @@ + * section. Since TSS's are completely CPU-local, we want them + * on exact cacheline boundaries, to eliminate cacheline ping-pong. + */ +-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { ++__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss_rw) = { + .x86_tss = { + /* + * .sp0 is only used when entering ring 0 from a lower +@@ -81,7 +81,7 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { + .io_bitmap = { [0 ... 
IO_BITMAP_LONGS] = ~0 }, + #endif + }; +-EXPORT_PER_CPU_SYMBOL(cpu_tss); ++EXPORT_PER_CPU_SYMBOL(cpu_tss_rw); + + DEFINE_PER_CPU(bool, __tss_limit_invalid); + EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid); +@@ -110,7 +110,7 @@ void exit_thread(struct task_struct *tsk) + struct fpu *fpu = &t->fpu; + + if (bp) { +- struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu()); ++ struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu()); + + t->io_bitmap_ptr = NULL; + clear_thread_flag(TIF_IO_BITMAP); +diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c +index c0d60420466c..784ff9147172 100644 +--- a/arch/x86/kernel/process_32.c ++++ b/arch/x86/kernel/process_32.c +@@ -234,7 +234,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) + struct fpu *prev_fpu = &prev->fpu; + struct fpu *next_fpu = &next->fpu; + int cpu = smp_processor_id(); +- struct tss_struct *tss = &per_cpu(cpu_tss, cpu); ++ struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu); + + /* never put a printk in __switch_to... 
printk() calls wake_up*() indirectly */ + +diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c +index 157f81816915..c75466232016 100644 +--- a/arch/x86/kernel/process_64.c ++++ b/arch/x86/kernel/process_64.c +@@ -399,7 +399,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) + struct fpu *prev_fpu = &prev->fpu; + struct fpu *next_fpu = &next->fpu; + int cpu = smp_processor_id(); +- struct tss_struct *tss = &per_cpu(cpu_tss, cpu); ++ struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu); + + WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && + this_cpu_read(irq_count) != -1); +diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c +index 2818c83892b3..14b462eefa17 100644 +--- a/arch/x86/kernel/traps.c ++++ b/arch/x86/kernel/traps.c +@@ -376,7 +376,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) + regs->cs == __KERNEL_CS && + regs->ip == (unsigned long)native_irq_return_iret) + { +- struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss.x86_tss.sp0) - 1; ++ struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; + + /* + * regs->sp points to the failing IRET frame on the +@@ -661,7 +661,7 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) + * exception came from the IRET target. + */ + struct bad_iret_stack *new_stack = +- (struct bad_iret_stack *)this_cpu_read(cpu_tss.x86_tss.sp0) - 1; ++ (struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; + + /* Copy the IRET target to the new stack. 
*/ + memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8); +diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c +index 29df077cb089..cf2ac227c2ac 100644 +--- a/arch/x86/lib/delay.c ++++ b/arch/x86/lib/delay.c +@@ -106,10 +106,10 @@ static void delay_mwaitx(unsigned long __loops) + delay = min_t(u64, MWAITX_MAX_LOOPS, loops); + + /* +- * Use cpu_tss as a cacheline-aligned, seldomly ++ * Use cpu_tss_rw as a cacheline-aligned, seldomly + * accessed per-cpu variable as the monitor target. + */ +- __monitorx(raw_cpu_ptr(&cpu_tss), 0, 0); ++ __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0); + + /* + * AMD, like Intel, supports the EAX hint and EAX=0xf +diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c +index 63c81154083b..3b76cf85e306 100644 +--- a/arch/x86/xen/enlighten_pv.c ++++ b/arch/x86/xen/enlighten_pv.c +@@ -817,7 +817,7 @@ static void xen_load_sp0(unsigned long sp0) + mcs = xen_mc_entry(0); + MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0); + xen_mc_issue(PARAVIRT_LAZY_CPU); +- this_cpu_write(cpu_tss.x86_tss.sp0, sp0); ++ this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0); + } + + void xen_set_iopl_mask(unsigned mask) +diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S +index 04abcd3f8e2d..3ef7800007f8 100644 +--- a/arch/x86/entry/entry_32.S ++++ b/arch/x86/entry/entry_32.S +@@ -949,7 +949,7 @@ ENTRY(debug) + + /* Are we currently on the SYSENTER stack? */ + movl PER_CPU_VAR(cpu_entry_area), %ecx +- addl $CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx ++ addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx + subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ + cmpl $SIZEOF_SYSENTER_stack, %ecx + jb .Ldebug_from_sysenter_stack +@@ -993,7 +993,7 @@ ENTRY(nmi) + + /* Are we currently on the SYSENTER stack? 
*/ + movl PER_CPU_VAR(cpu_entry_area), %ecx +- addl $CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx ++ addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx + subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ + cmpl $SIZEOF_SYSENTER_stack, %ecx + jb .Lnmi_from_sysenter_stack +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index 7a5e9edcdaf4..157860b3569f 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -153,7 +153,7 @@ END(native_usergs_sysret64) + _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) + + /* The top word of the SYSENTER stack is hot and is usable as scratch space. */ +-#define RSP_SCRATCH CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + \ ++#define RSP_SCRATCH CPU_ENTRY_AREA_SYSENTER_stack + \ + SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA + + ENTRY(entry_SYSCALL_64_trampoline) +@@ -389,7 +389,7 @@ syscall_return_via_sysret: + * Save old stack pointer and switch to trampoline stack. + */ + movq %rsp, %rdi +- movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp ++ movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp + + pushq RSP-RDI(%rdi) /* RSP */ + pushq (%rdi) /* RDI */ +@@ -718,7 +718,7 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode) + * Save old stack pointer and switch to trampoline stack. + */ + movq %rsp, %rdi +- movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp ++ movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp + + /* Copy the IRET frame to the trampoline stack. */ + pushq 6*8(%rdi) /* SS */ +@@ -946,7 +946,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt + /* + * Exception entry points. + */ +-#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8) ++#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8) + + /* + * Switch to the thread stack. 
This is called with the IRET frame and +-- +2.14.2 + diff --git a/patches/kernel/0162-x86-paravirt-Dont-patch-flush_tlb_single.patch b/patches/kernel/0162-x86-paravirt-Dont-patch-flush_tlb_single.patch deleted file mode 100644 index 638e7f0..0000000 --- a/patches/kernel/0162-x86-paravirt-Dont-patch-flush_tlb_single.patch +++ /dev/null @@ -1,77 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Mon, 4 Dec 2017 15:07:30 +0100 -Subject: [PATCH] x86/paravirt: Dont patch flush_tlb_single -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -native_flush_tlb_single() will be changed with the upcoming -PAGE_TABLE_ISOLATION feature. This requires to have more code in -there than INVLPG. - -Remove the paravirt patching for it. - -Signed-off-by: Thomas Gleixner -Reviewed-by: Josh Poimboeuf -Reviewed-by: Juergen Gross -Acked-by: Peter Zijlstra -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Linus Torvalds -Cc: Rik van Riel -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Cc: linux-mm@kvack.org -Cc: michael.schwarz@iaik.tugraz.at -Cc: moritz.lipp@iaik.tugraz.at -Cc: richard.fellner@student.tugraz.at -Link: https://lkml.kernel.org/r/20171204150606.828111617@linutronix.de -Signed-off-by: Ingo Molnar -(cherry picked from commit a035795499ca1c2bd1928808d1a156eda1420383) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 435d79a109b8c04d76a6cdb32b9b49a262f75e61) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/paravirt_patch_64.c | 2 -- - 1 file changed, 2 deletions(-) - -diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c -index 11aaf1eaa0e4..c354833342bd 100644 ---- a/arch/x86/kernel/paravirt_patch_64.c -+++ b/arch/x86/kernel/paravirt_patch_64.c -@@ -9,7 +9,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax"); - DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); - DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); - DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); --DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)"); - DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); - - DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq"); -@@ -59,7 +58,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, - PATCH_SITE(pv_mmu_ops, read_cr2); - PATCH_SITE(pv_mmu_ops, read_cr3); - PATCH_SITE(pv_mmu_ops, write_cr3); -- PATCH_SITE(pv_mmu_ops, flush_tlb_single); - PATCH_SITE(pv_cpu_ops, wbinvd); - #if defined(CONFIG_PARAVIRT_SPINLOCKS) - case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): --- -2.14.2 - diff --git a/patches/kernel/0163-x86-paravirt-Dont-patch-flush_tlb_single.patch b/patches/kernel/0163-x86-paravirt-Dont-patch-flush_tlb_single.patch new file mode 100644 index 0000000..638e7f0 --- /dev/null +++ 
b/patches/kernel/0163-x86-paravirt-Dont-patch-flush_tlb_single.patch @@ -0,0 +1,77 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Mon, 4 Dec 2017 15:07:30 +0100 +Subject: [PATCH] x86/paravirt: Dont patch flush_tlb_single +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +native_flush_tlb_single() will be changed with the upcoming +PAGE_TABLE_ISOLATION feature. This requires to have more code in +there than INVLPG. + +Remove the paravirt patching for it. + +Signed-off-by: Thomas Gleixner +Reviewed-by: Josh Poimboeuf +Reviewed-by: Juergen Gross +Acked-by: Peter Zijlstra +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. Peter Anvin +Cc: Linus Torvalds +Cc: Rik van Riel +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Cc: linux-mm@kvack.org +Cc: michael.schwarz@iaik.tugraz.at +Cc: moritz.lipp@iaik.tugraz.at +Cc: richard.fellner@student.tugraz.at +Link: https://lkml.kernel.org/r/20171204150606.828111617@linutronix.de +Signed-off-by: Ingo Molnar +(cherry picked from commit a035795499ca1c2bd1928808d1a156eda1420383) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 435d79a109b8c04d76a6cdb32b9b49a262f75e61) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/paravirt_patch_64.c | 2 -- + 1 file changed, 2 deletions(-) + +diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c +index 11aaf1eaa0e4..c354833342bd 100644 +--- a/arch/x86/kernel/paravirt_patch_64.c ++++ b/arch/x86/kernel/paravirt_patch_64.c +@@ -9,7 +9,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax"); + DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); + 
DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); + DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); +-DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)"); + DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); + + DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq"); +@@ -59,7 +58,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, + PATCH_SITE(pv_mmu_ops, read_cr2); + PATCH_SITE(pv_mmu_ops, read_cr3); + PATCH_SITE(pv_mmu_ops, write_cr3); +- PATCH_SITE(pv_mmu_ops, flush_tlb_single); + PATCH_SITE(pv_cpu_ops, wbinvd); + #if defined(CONFIG_PARAVIRT_SPINLOCKS) + case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): +-- +2.14.2 + diff --git a/patches/kernel/0163-x86-paravirt-Provide-a-way-to-check-for-hypervisors.patch b/patches/kernel/0163-x86-paravirt-Provide-a-way-to-check-for-hypervisors.patch deleted file mode 100644 index d268b6e..0000000 --- a/patches/kernel/0163-x86-paravirt-Provide-a-way-to-check-for-hypervisors.patch +++ /dev/null @@ -1,105 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Mon, 4 Dec 2017 15:07:31 +0100 -Subject: [PATCH] x86/paravirt: Provide a way to check for hypervisors -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -There is no generic way to test whether a kernel is running on a specific -hypervisor. But that's required to prevent the upcoming user address space -separation feature in certain guest modes. - -Make the hypervisor type enum unconditionally available and provide a -helper function which allows to test for a specific type. - -Signed-off-by: Thomas Gleixner -Reviewed-by: Juergen Gross -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Rik van Riel -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Link: https://lkml.kernel.org/r/20171204150606.912938129@linutronix.de -Signed-off-by: Ingo Molnar -(cherry picked from commit 79cc74155218316b9a5d28577c7077b2adba8e58) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 9f637574068f1ffdaded1cd1f408917582594b36) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/hypervisor.h | 25 +++++++++++++++---------- - 1 file changed, 15 insertions(+), 10 deletions(-) - -diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h -index 1b0a5abcd8ae..96aa6b9884dc 100644 ---- a/arch/x86/include/asm/hypervisor.h -+++ b/arch/x86/include/asm/hypervisor.h -@@ -20,16 +20,7 @@ - #ifndef _ASM_X86_HYPERVISOR_H - #define _ASM_X86_HYPERVISOR_H - --#ifdef CONFIG_HYPERVISOR_GUEST -- --#include --#include --#include -- --/* -- * x86 hypervisor information -- */ -- -+/* x86 hypervisor types */ - enum x86_hypervisor_type { - X86_HYPER_NATIVE = 0, - X86_HYPER_VMWARE, -@@ -39,6 +30,12 @@ enum x86_hypervisor_type { - X86_HYPER_KVM, - }; - -+#ifdef CONFIG_HYPERVISOR_GUEST -+ -+#include -+#include -+#include -+ - struct hypervisor_x86 { - /* Hypervisor name */ - const char *name; -@@ -58,7 +55,15 @@ struct hypervisor_x86 { - - extern enum x86_hypervisor_type x86_hyper_type; - extern void init_hypervisor_platform(void); -+static inline bool hypervisor_is_type(enum x86_hypervisor_type type) -+{ -+ return x86_hyper_type == type; -+} - #else - static inline void init_hypervisor_platform(void) { } -+static inline bool hypervisor_is_type(enum x86_hypervisor_type type) -+{ -+ return type == X86_HYPER_NATIVE; -+} - #endif /* CONFIG_HYPERVISOR_GUEST */ - #endif /* _ASM_X86_HYPERVISOR_H */ --- -2.14.2 - diff --git 
a/patches/kernel/0164-x86-cpufeatures-Make-CPU-bugs-sticky.patch b/patches/kernel/0164-x86-cpufeatures-Make-CPU-bugs-sticky.patch deleted file mode 100644 index eaa7c6d..0000000 --- a/patches/kernel/0164-x86-cpufeatures-Make-CPU-bugs-sticky.patch +++ /dev/null @@ -1,108 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Mon, 4 Dec 2017 15:07:32 +0100 -Subject: [PATCH] x86/cpufeatures: Make CPU bugs sticky -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -There is currently no way to force CPU bug bits like CPU feature bits. That -makes it impossible to set a bug bit once at boot and have it stick for all -upcoming CPUs. - -Extend the force set/clear arrays to handle bug bits as well. - -Signed-off-by: Thomas Gleixner -Reviewed-by: Borislav Petkov -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Rik van Riel -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Link: https://lkml.kernel.org/r/20171204150606.992156574@linutronix.de -Signed-off-by: Ingo Molnar -(cherry picked from commit 6cbd2171e89b13377261d15e64384df60ecb530e) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit aab40a666a40cd015ca4a53231bed544fc679dcb) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/cpufeature.h | 2 ++ - arch/x86/include/asm/processor.h | 4 ++-- - arch/x86/kernel/cpu/common.c | 6 +++--- - 3 files changed, 7 insertions(+), 5 deletions(-) - -diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h -index 225fd8374fae..8b9915561ed1 100644 ---- a/arch/x86/include/asm/cpufeature.h -+++ b/arch/x86/include/asm/cpufeature.h -@@ -134,6 +134,8 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); - set_bit(bit, (unsigned long *)cpu_caps_set); \ - } while (0) - -+#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit) -+ - #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS) - /* - * Static testing of CPU features. Used the same as boot_cpu_has(). 
-diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h -index bccec7ed1676..59a317f8e0ec 100644 ---- a/arch/x86/include/asm/processor.h -+++ b/arch/x86/include/asm/processor.h -@@ -162,8 +162,8 @@ extern struct cpuinfo_x86 new_cpu_data; - #include - - extern struct x86_hw_tss doublefault_tss; --extern __u32 cpu_caps_cleared[NCAPINTS]; --extern __u32 cpu_caps_set[NCAPINTS]; -+extern __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS]; -+extern __u32 cpu_caps_set[NCAPINTS + NBUGINTS]; - - #ifdef CONFIG_SMP - DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); -diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c -index 7992e5a8076c..fcdba90e0890 100644 ---- a/arch/x86/kernel/cpu/common.c -+++ b/arch/x86/kernel/cpu/common.c -@@ -452,8 +452,8 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c) - return NULL; /* Not found */ - } - --__u32 cpu_caps_cleared[NCAPINTS]; --__u32 cpu_caps_set[NCAPINTS]; -+__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS]; -+__u32 cpu_caps_set[NCAPINTS + NBUGINTS]; - - void load_percpu_segment(int cpu) - { -@@ -812,7 +812,7 @@ static void apply_forced_caps(struct cpuinfo_x86 *c) - { - int i; - -- for (i = 0; i < NCAPINTS; i++) { -+ for (i = 0; i < NCAPINTS + NBUGINTS; i++) { - c->x86_capability[i] &= ~cpu_caps_cleared[i]; - c->x86_capability[i] |= cpu_caps_set[i]; - } --- -2.14.2 - diff --git a/patches/kernel/0164-x86-paravirt-Provide-a-way-to-check-for-hypervisors.patch b/patches/kernel/0164-x86-paravirt-Provide-a-way-to-check-for-hypervisors.patch new file mode 100644 index 0000000..d268b6e --- /dev/null +++ b/patches/kernel/0164-x86-paravirt-Provide-a-way-to-check-for-hypervisors.patch @@ -0,0 +1,105 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Mon, 4 Dec 2017 15:07:31 +0100 +Subject: [PATCH] x86/paravirt: Provide a way to check for hypervisors +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 
8bit + +CVE-2017-5754 + +There is no generic way to test whether a kernel is running on a specific +hypervisor. But that's required to prevent the upcoming user address space +separation feature in certain guest modes. + +Make the hypervisor type enum unconditionally available and provide a +helper function which allows to test for a specific type. + +Signed-off-by: Thomas Gleixner +Reviewed-by: Juergen Gross +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Rik van Riel +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150606.912938129@linutronix.de +Signed-off-by: Ingo Molnar +(cherry picked from commit 79cc74155218316b9a5d28577c7077b2adba8e58) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 9f637574068f1ffdaded1cd1f408917582594b36) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/hypervisor.h | 25 +++++++++++++++---------- + 1 file changed, 15 insertions(+), 10 deletions(-) + +diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h +index 1b0a5abcd8ae..96aa6b9884dc 100644 +--- a/arch/x86/include/asm/hypervisor.h ++++ b/arch/x86/include/asm/hypervisor.h +@@ -20,16 +20,7 @@ + #ifndef _ASM_X86_HYPERVISOR_H + #define _ASM_X86_HYPERVISOR_H + +-#ifdef CONFIG_HYPERVISOR_GUEST +- +-#include +-#include +-#include +- +-/* +- * x86 hypervisor information +- */ +- ++/* x86 hypervisor types */ + enum x86_hypervisor_type { + X86_HYPER_NATIVE = 0, + X86_HYPER_VMWARE, +@@ -39,6 +30,12 @@ enum x86_hypervisor_type { + X86_HYPER_KVM, + }; + ++#ifdef CONFIG_HYPERVISOR_GUEST ++ ++#include ++#include ++#include ++ + struct hypervisor_x86 { + 
/* Hypervisor name */ + const char *name; +@@ -58,7 +55,15 @@ struct hypervisor_x86 { + + extern enum x86_hypervisor_type x86_hyper_type; + extern void init_hypervisor_platform(void); ++static inline bool hypervisor_is_type(enum x86_hypervisor_type type) ++{ ++ return x86_hyper_type == type; ++} + #else + static inline void init_hypervisor_platform(void) { } ++static inline bool hypervisor_is_type(enum x86_hypervisor_type type) ++{ ++ return type == X86_HYPER_NATIVE; ++} + #endif /* CONFIG_HYPERVISOR_GUEST */ + #endif /* _ASM_X86_HYPERVISOR_H */ +-- +2.14.2 + diff --git a/patches/kernel/0165-x86-Kconfig-Limit-NR_CPUS-on-32-bit-to-a-sane-amount.patch b/patches/kernel/0165-x86-Kconfig-Limit-NR_CPUS-on-32-bit-to-a-sane-amount.patch deleted file mode 100644 index 69f21bc..0000000 --- a/patches/kernel/0165-x86-Kconfig-Limit-NR_CPUS-on-32-bit-to-a-sane-amount.patch +++ /dev/null @@ -1,56 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Wed, 20 Dec 2017 18:02:34 +0100 -Subject: [PATCH] x86/Kconfig: Limit NR_CPUS on 32-bit to a sane amount -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -The recent cpu_entry_area changes fail to compile on 32-bit when BIGSMP=y -and NR_CPUS=512, because the fixmap area becomes too big. - -Limit the number of CPUs with BIGSMP to 64, which is already way to big for -32-bit, but it's at least a working limitation. - -We performed a quick survey of 32-bit-only machines that might be affected -by this change negatively, but found none. - -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Borislav Petkov -Cc: Dave Hansen -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: linux-kernel@vger.kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit 7bbcbd3d1cdcbacd0f9f8dc4c98d550972f1ca30) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 8ea88ee6f0d058835bfb5685be1ec1beb51177c2) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/Kconfig | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig -index 8b5499bb24bb..51003e53e738 100644 ---- a/arch/x86/Kconfig -+++ b/arch/x86/Kconfig -@@ -923,7 +923,8 @@ config MAXSMP - config NR_CPUS - int "Maximum number of CPUs" if SMP && !MAXSMP - range 2 8 if SMP && X86_32 && !X86_BIGSMP -- range 2 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK -+ range 2 64 if SMP && X86_32 && X86_BIGSMP -+ range 2 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK && X86_64 - range 2 8192 if SMP && !MAXSMP && CPUMASK_OFFSTACK && X86_64 - default "1" if !SMP - default "8192" if MAXSMP --- -2.14.2 - diff --git a/patches/kernel/0165-x86-cpufeatures-Make-CPU-bugs-sticky.patch b/patches/kernel/0165-x86-cpufeatures-Make-CPU-bugs-sticky.patch new file mode 100644 index 0000000..eaa7c6d --- /dev/null +++ b/patches/kernel/0165-x86-cpufeatures-Make-CPU-bugs-sticky.patch @@ -0,0 +1,108 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Mon, 4 Dec 2017 15:07:32 +0100 +Subject: [PATCH] x86/cpufeatures: Make CPU bugs sticky +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +There is currently no way to force CPU bug bits like CPU feature bits. That +makes it impossible to set a bug bit once at boot and have it stick for all +upcoming CPUs. + +Extend the force set/clear arrays to handle bug bits as well. 
+ +Signed-off-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Rik van Riel +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150606.992156574@linutronix.de +Signed-off-by: Ingo Molnar +(cherry picked from commit 6cbd2171e89b13377261d15e64384df60ecb530e) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit aab40a666a40cd015ca4a53231bed544fc679dcb) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/cpufeature.h | 2 ++ + arch/x86/include/asm/processor.h | 4 ++-- + arch/x86/kernel/cpu/common.c | 6 +++--- + 3 files changed, 7 insertions(+), 5 deletions(-) + +diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h +index 225fd8374fae..8b9915561ed1 100644 +--- a/arch/x86/include/asm/cpufeature.h ++++ b/arch/x86/include/asm/cpufeature.h +@@ -134,6 +134,8 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); + set_bit(bit, (unsigned long *)cpu_caps_set); \ + } while (0) + ++#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit) ++ + #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS) + /* + * Static testing of CPU features. Used the same as boot_cpu_has(). 
+diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index bccec7ed1676..59a317f8e0ec 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -162,8 +162,8 @@ extern struct cpuinfo_x86 new_cpu_data; + #include + + extern struct x86_hw_tss doublefault_tss; +-extern __u32 cpu_caps_cleared[NCAPINTS]; +-extern __u32 cpu_caps_set[NCAPINTS]; ++extern __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS]; ++extern __u32 cpu_caps_set[NCAPINTS + NBUGINTS]; + + #ifdef CONFIG_SMP + DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 7992e5a8076c..fcdba90e0890 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -452,8 +452,8 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c) + return NULL; /* Not found */ + } + +-__u32 cpu_caps_cleared[NCAPINTS]; +-__u32 cpu_caps_set[NCAPINTS]; ++__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS]; ++__u32 cpu_caps_set[NCAPINTS + NBUGINTS]; + + void load_percpu_segment(int cpu) + { +@@ -812,7 +812,7 @@ static void apply_forced_caps(struct cpuinfo_x86 *c) + { + int i; + +- for (i = 0; i < NCAPINTS; i++) { ++ for (i = 0; i < NCAPINTS + NBUGINTS; i++) { + c->x86_capability[i] &= ~cpu_caps_cleared[i]; + c->x86_capability[i] |= cpu_caps_set[i]; + } +-- +2.14.2 + diff --git a/patches/kernel/0166-x86-Kconfig-Limit-NR_CPUS-on-32-bit-to-a-sane-amount.patch b/patches/kernel/0166-x86-Kconfig-Limit-NR_CPUS-on-32-bit-to-a-sane-amount.patch new file mode 100644 index 0000000..69f21bc --- /dev/null +++ b/patches/kernel/0166-x86-Kconfig-Limit-NR_CPUS-on-32-bit-to-a-sane-amount.patch @@ -0,0 +1,56 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Wed, 20 Dec 2017 18:02:34 +0100 +Subject: [PATCH] x86/Kconfig: Limit NR_CPUS on 32-bit to a sane amount +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 
+Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +The recent cpu_entry_area changes fail to compile on 32-bit when BIGSMP=y +and NR_CPUS=512, because the fixmap area becomes too big. + +Limit the number of CPUs with BIGSMP to 64, which is already way to big for +32-bit, but it's at least a working limitation. + +We performed a quick survey of 32-bit-only machines that might be affected +by this change negatively, but found none. + +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Borislav Petkov +Cc: Dave Hansen +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit 7bbcbd3d1cdcbacd0f9f8dc4c98d550972f1ca30) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 8ea88ee6f0d058835bfb5685be1ec1beb51177c2) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/Kconfig | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 8b5499bb24bb..51003e53e738 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -923,7 +923,8 @@ config MAXSMP + config NR_CPUS + int "Maximum number of CPUs" if SMP && !MAXSMP + range 2 8 if SMP && X86_32 && !X86_BIGSMP +- range 2 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK ++ range 2 64 if SMP && X86_32 && X86_BIGSMP ++ range 2 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK && X86_64 + range 2 8192 if SMP && !MAXSMP && CPUMASK_OFFSTACK && X86_64 + default "1" if !SMP + default "8192" if MAXSMP +-- +2.14.2 + diff --git a/patches/kernel/0166-x86-mm-dump_pagetables-Check-PAGE_PRESENT-for-real.patch b/patches/kernel/0166-x86-mm-dump_pagetables-Check-PAGE_PRESENT-for-real.patch deleted file mode 100644 index b8205ad..0000000 --- a/patches/kernel/0166-x86-mm-dump_pagetables-Check-PAGE_PRESENT-for-real.patch +++ /dev/null @@ -1,56 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 
-From: Thomas Gleixner -Date: Sat, 16 Dec 2017 01:14:39 +0100 -Subject: [PATCH] x86/mm/dump_pagetables: Check PAGE_PRESENT for real -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -The check for a present page in printk_prot(): - - if (!pgprot_val(prot)) { - /* Not present */ - -is bogus. If a PTE is set to PAGE_NONE then the pgprot_val is not zero and -the entry is decoded in bogus ways, e.g. as RX GLB. That is confusing when -analyzing mapping correctness. Check for the present bit to make an -informed decision. - -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Borislav Petkov -Cc: Dave Hansen -Cc: H. Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: linux-kernel@vger.kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit c05344947b37f7cda726e802457370bc6eac4d26) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit d902780eaea12f23b50be4ff00f8df6157c30e4a) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/mm/dump_pagetables.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c -index 0470826d2bdc..91aa41c5e0dd 100644 ---- a/arch/x86/mm/dump_pagetables.c -+++ b/arch/x86/mm/dump_pagetables.c -@@ -140,7 +140,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) - static const char * const level_name[] = - { "cr3", "pgd", "pud", "pmd", "pte" }; - -- if (!pgprot_val(prot)) { -+ if (!(pr & _PAGE_PRESENT)) { - /* Not present */ - pt_dump_cont_printf(m, dmsg, " "); - } else { --- -2.14.2 - diff --git a/patches/kernel/0167-x86-mm-dump_pagetables-Check-PAGE_PRESENT-for-real.patch b/patches/kernel/0167-x86-mm-dump_pagetables-Check-PAGE_PRESENT-for-real.patch new file mode 100644 index 0000000..b8205ad --- /dev/null +++ 
b/patches/kernel/0167-x86-mm-dump_pagetables-Check-PAGE_PRESENT-for-real.patch @@ -0,0 +1,56 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Sat, 16 Dec 2017 01:14:39 +0100 +Subject: [PATCH] x86/mm/dump_pagetables: Check PAGE_PRESENT for real +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +The check for a present page in printk_prot(): + + if (!pgprot_val(prot)) { + /* Not present */ + +is bogus. If a PTE is set to PAGE_NONE then the pgprot_val is not zero and +the entry is decoded in bogus ways, e.g. as RX GLB. That is confusing when +analyzing mapping correctness. Check for the present bit to make an +informed decision. + +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Borislav Petkov +Cc: Dave Hansen +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit c05344947b37f7cda726e802457370bc6eac4d26) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit d902780eaea12f23b50be4ff00f8df6157c30e4a) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/mm/dump_pagetables.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c +index 0470826d2bdc..91aa41c5e0dd 100644 +--- a/arch/x86/mm/dump_pagetables.c ++++ b/arch/x86/mm/dump_pagetables.c +@@ -140,7 +140,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) + static const char * const level_name[] = + { "cr3", "pgd", "pud", "pmd", "pte" }; + +- if (!pgprot_val(prot)) { ++ if (!(pr & _PAGE_PRESENT)) { + /* Not present */ + pt_dump_cont_printf(m, dmsg, " "); + } else { +-- +2.14.2 + diff --git a/patches/kernel/0167-x86-mm-dump_pagetables-Make-the-address-hints-correc.patch 
b/patches/kernel/0167-x86-mm-dump_pagetables-Make-the-address-hints-correc.patch deleted file mode 100644 index 6b6716b..0000000 --- a/patches/kernel/0167-x86-mm-dump_pagetables-Make-the-address-hints-correc.patch +++ /dev/null @@ -1,169 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Wed, 20 Dec 2017 18:07:42 +0100 -Subject: [PATCH] x86/mm/dump_pagetables: Make the address hints correct and - readable -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -The address hints are a trainwreck. The array entry numbers have to kept -magically in sync with the actual hints, which is doomed as some of the -array members are initialized at runtime via the entry numbers. - -Designated initializers have been around before this code was -implemented.... - -Use the entry numbers to populate the address hints array and add the -missing bits and pieces. Split 32 and 64 bit for readability sake. - -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Borislav Petkov -Cc: Dave Hansen -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: linux-kernel@vger.kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit 146122e24bdf208015d629babba673e28d090709) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 7f4d9163531183fbaa0df1d1b1ceecbade4e58dc) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/mm/dump_pagetables.c | 90 +++++++++++++++++++++++++------------------ - 1 file changed, 53 insertions(+), 37 deletions(-) - -diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c -index 91aa41c5e0dd..318a7c30e87e 100644 ---- a/arch/x86/mm/dump_pagetables.c -+++ b/arch/x86/mm/dump_pagetables.c -@@ -44,10 +44,12 @@ struct addr_marker { - unsigned long max_lines; - }; - --/* indices for address_markers; keep sync'd w/ address_markers below */ -+/* Address space markers hints */ -+ -+#ifdef CONFIG_X86_64 -+ - enum address_markers_idx { - USER_SPACE_NR = 0, --#ifdef CONFIG_X86_64 - KERNEL_SPACE_NR, - LOW_KERNEL_NR, - VMALLOC_START_NR, -@@ -56,56 +58,70 @@ enum address_markers_idx { - KASAN_SHADOW_START_NR, - KASAN_SHADOW_END_NR, - #endif --# ifdef CONFIG_X86_ESPFIX64 -+#ifdef CONFIG_X86_ESPFIX64 - ESPFIX_START_NR, --# endif -+#endif -+#ifdef CONFIG_EFI -+ EFI_END_NR, -+#endif - HIGH_KERNEL_NR, - MODULES_VADDR_NR, - MODULES_END_NR, --#else -+ FIXADDR_START_NR, -+ END_OF_SPACE_NR, -+}; -+ -+static struct addr_marker address_markers[] = { -+ [USER_SPACE_NR] = { 0, "User Space" }, -+ [KERNEL_SPACE_NR] = { (1UL << 63), "Kernel Space" }, -+ [LOW_KERNEL_NR] = { 0UL, "Low Kernel Mapping" }, -+ [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" }, -+ [VMEMMAP_START_NR] = { 0UL, "Vmemmap" }, -+#ifdef CONFIG_KASAN -+ [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" }, -+ [KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" }, -+#endif -+#ifdef CONFIG_X86_ESPFIX64 -+ [ESPFIX_START_NR] = { ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, 
-+#endif -+#ifdef CONFIG_EFI -+ [EFI_END_NR] = { EFI_VA_END, "EFI Runtime Services" }, -+#endif -+ [HIGH_KERNEL_NR] = { __START_KERNEL_map, "High Kernel Mapping" }, -+ [MODULES_VADDR_NR] = { MODULES_VADDR, "Modules" }, -+ [MODULES_END_NR] = { MODULES_END, "End Modules" }, -+ [FIXADDR_START_NR] = { FIXADDR_START, "Fixmap Area" }, -+ [END_OF_SPACE_NR] = { -1, NULL } -+}; -+ -+#else /* CONFIG_X86_64 */ -+ -+enum address_markers_idx { -+ USER_SPACE_NR = 0, - KERNEL_SPACE_NR, - VMALLOC_START_NR, - VMALLOC_END_NR, --# ifdef CONFIG_HIGHMEM -+#ifdef CONFIG_HIGHMEM - PKMAP_BASE_NR, --# endif -- FIXADDR_START_NR, - #endif -+ FIXADDR_START_NR, -+ END_OF_SPACE_NR, - }; - --/* Address space markers hints */ - static struct addr_marker address_markers[] = { -- { 0, "User Space" }, --#ifdef CONFIG_X86_64 -- { 0x8000000000000000UL, "Kernel Space" }, -- { 0/* PAGE_OFFSET */, "Low Kernel Mapping" }, -- { 0/* VMALLOC_START */, "vmalloc() Area" }, -- { 0/* VMEMMAP_START */, "Vmemmap" }, --#ifdef CONFIG_KASAN -- { KASAN_SHADOW_START, "KASAN shadow" }, -- { KASAN_SHADOW_END, "KASAN shadow end" }, -+ [USER_SPACE_NR] = { 0, "User Space" }, -+ [KERNEL_SPACE_NR] = { PAGE_OFFSET, "Kernel Mapping" }, -+ [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" }, -+ [VMALLOC_END_NR] = { 0UL, "vmalloc() End" }, -+#ifdef CONFIG_HIGHMEM -+ [PKMAP_BASE_NR] = { 0UL, "Persistent kmap() Area" }, - #endif --# ifdef CONFIG_X86_ESPFIX64 -- { ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, --# endif --# ifdef CONFIG_EFI -- { EFI_VA_END, "EFI Runtime Services" }, --# endif -- { __START_KERNEL_map, "High Kernel Mapping" }, -- { MODULES_VADDR, "Modules" }, -- { MODULES_END, "End Modules" }, --#else -- { PAGE_OFFSET, "Kernel Mapping" }, -- { 0/* VMALLOC_START */, "vmalloc() Area" }, -- { 0/*VMALLOC_END*/, "vmalloc() End" }, --# ifdef CONFIG_HIGHMEM -- { 0/*PKMAP_BASE*/, "Persistent kmap() Area" }, --# endif -- { 0/*FIXADDR_START*/, "Fixmap Area" }, --#endif -- { -1, NULL } /* End of list */ -+ [FIXADDR_START_NR] = { 0UL, "Fixmap 
area" }, -+ [END_OF_SPACE_NR] = { -1, NULL } - }; - -+#endif /* !CONFIG_X86_64 */ -+ - /* Multipliers for offsets within the PTEs */ - #define PTE_LEVEL_MULT (PAGE_SIZE) - #define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT) --- -2.14.2 - diff --git a/patches/kernel/0168-x86-mm-dump_pagetables-Make-the-address-hints-correc.patch b/patches/kernel/0168-x86-mm-dump_pagetables-Make-the-address-hints-correc.patch new file mode 100644 index 0000000..6b6716b --- /dev/null +++ b/patches/kernel/0168-x86-mm-dump_pagetables-Make-the-address-hints-correc.patch @@ -0,0 +1,169 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Wed, 20 Dec 2017 18:07:42 +0100 +Subject: [PATCH] x86/mm/dump_pagetables: Make the address hints correct and + readable +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +The address hints are a trainwreck. The array entry numbers have to kept +magically in sync with the actual hints, which is doomed as some of the +array members are initialized at runtime via the entry numbers. + +Designated initializers have been around before this code was +implemented.... + +Use the entry numbers to populate the address hints array and add the +missing bits and pieces. Split 32 and 64 bit for readability sake. + +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Borislav Petkov +Cc: Dave Hansen +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit 146122e24bdf208015d629babba673e28d090709) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 7f4d9163531183fbaa0df1d1b1ceecbade4e58dc) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/mm/dump_pagetables.c | 90 +++++++++++++++++++++++++------------------ + 1 file changed, 53 insertions(+), 37 deletions(-) + +diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c +index 91aa41c5e0dd..318a7c30e87e 100644 +--- a/arch/x86/mm/dump_pagetables.c ++++ b/arch/x86/mm/dump_pagetables.c +@@ -44,10 +44,12 @@ struct addr_marker { + unsigned long max_lines; + }; + +-/* indices for address_markers; keep sync'd w/ address_markers below */ ++/* Address space markers hints */ ++ ++#ifdef CONFIG_X86_64 ++ + enum address_markers_idx { + USER_SPACE_NR = 0, +-#ifdef CONFIG_X86_64 + KERNEL_SPACE_NR, + LOW_KERNEL_NR, + VMALLOC_START_NR, +@@ -56,56 +58,70 @@ enum address_markers_idx { + KASAN_SHADOW_START_NR, + KASAN_SHADOW_END_NR, + #endif +-# ifdef CONFIG_X86_ESPFIX64 ++#ifdef CONFIG_X86_ESPFIX64 + ESPFIX_START_NR, +-# endif ++#endif ++#ifdef CONFIG_EFI ++ EFI_END_NR, ++#endif + HIGH_KERNEL_NR, + MODULES_VADDR_NR, + MODULES_END_NR, +-#else ++ FIXADDR_START_NR, ++ END_OF_SPACE_NR, ++}; ++ ++static struct addr_marker address_markers[] = { ++ [USER_SPACE_NR] = { 0, "User Space" }, ++ [KERNEL_SPACE_NR] = { (1UL << 63), "Kernel Space" }, ++ [LOW_KERNEL_NR] = { 0UL, "Low Kernel Mapping" }, ++ [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" }, ++ [VMEMMAP_START_NR] = { 0UL, "Vmemmap" }, ++#ifdef CONFIG_KASAN ++ [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" }, ++ [KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" }, ++#endif ++#ifdef CONFIG_X86_ESPFIX64 ++ [ESPFIX_START_NR] = { ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, 
++#endif ++#ifdef CONFIG_EFI ++ [EFI_END_NR] = { EFI_VA_END, "EFI Runtime Services" }, ++#endif ++ [HIGH_KERNEL_NR] = { __START_KERNEL_map, "High Kernel Mapping" }, ++ [MODULES_VADDR_NR] = { MODULES_VADDR, "Modules" }, ++ [MODULES_END_NR] = { MODULES_END, "End Modules" }, ++ [FIXADDR_START_NR] = { FIXADDR_START, "Fixmap Area" }, ++ [END_OF_SPACE_NR] = { -1, NULL } ++}; ++ ++#else /* CONFIG_X86_64 */ ++ ++enum address_markers_idx { ++ USER_SPACE_NR = 0, + KERNEL_SPACE_NR, + VMALLOC_START_NR, + VMALLOC_END_NR, +-# ifdef CONFIG_HIGHMEM ++#ifdef CONFIG_HIGHMEM + PKMAP_BASE_NR, +-# endif +- FIXADDR_START_NR, + #endif ++ FIXADDR_START_NR, ++ END_OF_SPACE_NR, + }; + +-/* Address space markers hints */ + static struct addr_marker address_markers[] = { +- { 0, "User Space" }, +-#ifdef CONFIG_X86_64 +- { 0x8000000000000000UL, "Kernel Space" }, +- { 0/* PAGE_OFFSET */, "Low Kernel Mapping" }, +- { 0/* VMALLOC_START */, "vmalloc() Area" }, +- { 0/* VMEMMAP_START */, "Vmemmap" }, +-#ifdef CONFIG_KASAN +- { KASAN_SHADOW_START, "KASAN shadow" }, +- { KASAN_SHADOW_END, "KASAN shadow end" }, ++ [USER_SPACE_NR] = { 0, "User Space" }, ++ [KERNEL_SPACE_NR] = { PAGE_OFFSET, "Kernel Mapping" }, ++ [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" }, ++ [VMALLOC_END_NR] = { 0UL, "vmalloc() End" }, ++#ifdef CONFIG_HIGHMEM ++ [PKMAP_BASE_NR] = { 0UL, "Persistent kmap() Area" }, + #endif +-# ifdef CONFIG_X86_ESPFIX64 +- { ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, +-# endif +-# ifdef CONFIG_EFI +- { EFI_VA_END, "EFI Runtime Services" }, +-# endif +- { __START_KERNEL_map, "High Kernel Mapping" }, +- { MODULES_VADDR, "Modules" }, +- { MODULES_END, "End Modules" }, +-#else +- { PAGE_OFFSET, "Kernel Mapping" }, +- { 0/* VMALLOC_START */, "vmalloc() Area" }, +- { 0/*VMALLOC_END*/, "vmalloc() End" }, +-# ifdef CONFIG_HIGHMEM +- { 0/*PKMAP_BASE*/, "Persistent kmap() Area" }, +-# endif +- { 0/*FIXADDR_START*/, "Fixmap Area" }, +-#endif +- { -1, NULL } /* End of list */ ++ [FIXADDR_START_NR] = { 0UL, "Fixmap 
area" }, ++ [END_OF_SPACE_NR] = { -1, NULL } + }; + ++#endif /* !CONFIG_X86_64 */ ++ + /* Multipliers for offsets within the PTEs */ + #define PTE_LEVEL_MULT (PAGE_SIZE) + #define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT) +-- +2.14.2 + diff --git a/patches/kernel/0168-x86-vsyscall-64-Explicitly-set-_PAGE_USER-in-the-pag.patch b/patches/kernel/0168-x86-vsyscall-64-Explicitly-set-_PAGE_USER-in-the-pag.patch deleted file mode 100644 index dd32f28..0000000 --- a/patches/kernel/0168-x86-vsyscall-64-Explicitly-set-_PAGE_USER-in-the-pag.patch +++ /dev/null @@ -1,108 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Sun, 10 Dec 2017 22:47:19 -0800 -Subject: [PATCH] x86/vsyscall/64: Explicitly set _PAGE_USER in the pagetable - hierarchy -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -The kernel is very erratic as to which pagetables have _PAGE_USER set. The -vsyscall page gets lucky: it seems that all of the relevant pagetables are -among the apparently arbitrary ones that set _PAGE_USER. Rather than -relying on chance, just explicitly set _PAGE_USER. - -This will let us clean up pagetable setup to stop setting _PAGE_USER. The -added code can also be reused by pagetable isolation to manage the -_PAGE_USER bit in the usermode tables. - -[ tglx: Folded paravirt fix from Juergen Gross ] - -Signed-off-by: Andy Lutomirski -Signed-off-by: Thomas Gleixner -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: David Laight -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Kees Cook -Cc: Linus Torvalds -Cc: Peter Zijlstra -Signed-off-by: Ingo Molnar -(cherry picked from commit 49275fef986abfb8b476e4708aaecc07e7d3e087) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 445742d3632efea229c0b974f91e56a19cf31996) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/entry/vsyscall/vsyscall_64.c | 34 +++++++++++++++++++++++++++++++++- - 1 file changed, 33 insertions(+), 1 deletion(-) - -diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c -index ce1d7534fa53..91f3133cf5f1 100644 ---- a/arch/x86/entry/vsyscall/vsyscall_64.c -+++ b/arch/x86/entry/vsyscall/vsyscall_64.c -@@ -36,6 +36,7 @@ - #include - #include - #include -+#include - - #define CREATE_TRACE_POINTS - #include "vsyscall_trace.h" -@@ -328,16 +329,47 @@ int in_gate_area_no_mm(unsigned long addr) - return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR; - } - -+/* -+ * The VSYSCALL page is the only user-accessible page in the kernel address -+ * range. Normally, the kernel page tables can have _PAGE_USER clear, but -+ * the tables covering VSYSCALL_ADDR need _PAGE_USER set if vsyscalls -+ * are enabled. -+ * -+ * Some day we may create a "minimal" vsyscall mode in which we emulate -+ * vsyscalls but leave the page not present. If so, we skip calling -+ * this. 
-+ */ -+static void __init set_vsyscall_pgtable_user_bits(void) -+{ -+ pgd_t *pgd; -+ p4d_t *p4d; -+ pud_t *pud; -+ pmd_t *pmd; -+ -+ pgd = pgd_offset_k(VSYSCALL_ADDR); -+ set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER)); -+ p4d = p4d_offset(pgd, VSYSCALL_ADDR); -+#if CONFIG_PGTABLE_LEVELS >= 5 -+ p4d->p4d |= _PAGE_USER; -+#endif -+ pud = pud_offset(p4d, VSYSCALL_ADDR); -+ set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER)); -+ pmd = pmd_offset(pud, VSYSCALL_ADDR); -+ set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_USER)); -+} -+ - void __init map_vsyscall(void) - { - extern char __vsyscall_page; - unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); - -- if (vsyscall_mode != NONE) -+ if (vsyscall_mode != NONE) { - __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, - vsyscall_mode == NATIVE - ? PAGE_KERNEL_VSYSCALL - : PAGE_KERNEL_VVAR); -+ set_vsyscall_pgtable_user_bits(); -+ } - - BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != - (unsigned long)VSYSCALL_ADDR); --- -2.14.2 - diff --git a/patches/kernel/0169-x86-vsyscall-64-Explicitly-set-_PAGE_USER-in-the-pag.patch b/patches/kernel/0169-x86-vsyscall-64-Explicitly-set-_PAGE_USER-in-the-pag.patch new file mode 100644 index 0000000..dd32f28 --- /dev/null +++ b/patches/kernel/0169-x86-vsyscall-64-Explicitly-set-_PAGE_USER-in-the-pag.patch @@ -0,0 +1,108 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Sun, 10 Dec 2017 22:47:19 -0800 +Subject: [PATCH] x86/vsyscall/64: Explicitly set _PAGE_USER in the pagetable + hierarchy +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +The kernel is very erratic as to which pagetables have _PAGE_USER set. The +vsyscall page gets lucky: it seems that all of the relevant pagetables are +among the apparently arbitrary ones that set _PAGE_USER. Rather than +relying on chance, just explicitly set _PAGE_USER. 
+ +This will let us clean up pagetable setup to stop setting _PAGE_USER. The +added code can also be reused by pagetable isolation to manage the +_PAGE_USER bit in the usermode tables. + +[ tglx: Folded paravirt fix from Juergen Gross ] + +Signed-off-by: Andy Lutomirski +Signed-off-by: Thomas Gleixner +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: David Laight +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Kees Cook +Cc: Linus Torvalds +Cc: Peter Zijlstra +Signed-off-by: Ingo Molnar +(cherry picked from commit 49275fef986abfb8b476e4708aaecc07e7d3e087) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 445742d3632efea229c0b974f91e56a19cf31996) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/entry/vsyscall/vsyscall_64.c | 34 +++++++++++++++++++++++++++++++++- + 1 file changed, 33 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c +index ce1d7534fa53..91f3133cf5f1 100644 +--- a/arch/x86/entry/vsyscall/vsyscall_64.c ++++ b/arch/x86/entry/vsyscall/vsyscall_64.c +@@ -36,6 +36,7 @@ + #include + #include + #include ++#include + + #define CREATE_TRACE_POINTS + #include "vsyscall_trace.h" +@@ -328,16 +329,47 @@ int in_gate_area_no_mm(unsigned long addr) + return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR; + } + ++/* ++ * The VSYSCALL page is the only user-accessible page in the kernel address ++ * range. Normally, the kernel page tables can have _PAGE_USER clear, but ++ * the tables covering VSYSCALL_ADDR need _PAGE_USER set if vsyscalls ++ * are enabled. ++ * ++ * Some day we may create a "minimal" vsyscall mode in which we emulate ++ * vsyscalls but leave the page not present. If so, we skip calling ++ * this. 
++ */ ++static void __init set_vsyscall_pgtable_user_bits(void) ++{ ++ pgd_t *pgd; ++ p4d_t *p4d; ++ pud_t *pud; ++ pmd_t *pmd; ++ ++ pgd = pgd_offset_k(VSYSCALL_ADDR); ++ set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER)); ++ p4d = p4d_offset(pgd, VSYSCALL_ADDR); ++#if CONFIG_PGTABLE_LEVELS >= 5 ++ p4d->p4d |= _PAGE_USER; ++#endif ++ pud = pud_offset(p4d, VSYSCALL_ADDR); ++ set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER)); ++ pmd = pmd_offset(pud, VSYSCALL_ADDR); ++ set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_USER)); ++} ++ + void __init map_vsyscall(void) + { + extern char __vsyscall_page; + unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); + +- if (vsyscall_mode != NONE) ++ if (vsyscall_mode != NONE) { + __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, + vsyscall_mode == NATIVE + ? PAGE_KERNEL_VSYSCALL + : PAGE_KERNEL_VVAR); ++ set_vsyscall_pgtable_user_bits(); ++ } + + BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != + (unsigned long)VSYSCALL_ADDR); +-- +2.14.2 + diff --git a/patches/kernel/0169-x86-vsyscall-64-Warn-and-fail-vsyscall-emulation-in-.patch b/patches/kernel/0169-x86-vsyscall-64-Warn-and-fail-vsyscall-emulation-in-.patch deleted file mode 100644 index f1c7be9..0000000 --- a/patches/kernel/0169-x86-vsyscall-64-Warn-and-fail-vsyscall-emulation-in-.patch +++ /dev/null @@ -1,55 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Sun, 10 Dec 2017 22:47:20 -0800 -Subject: [PATCH] x86/vsyscall/64: Warn and fail vsyscall emulation in NATIVE - mode -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -If something goes wrong with pagetable setup, vsyscall=native will -accidentally fall back to emulation. Make it warn and fail so that we -notice. - -Signed-off-by: Andy Lutomirski -Signed-off-by: Thomas Gleixner -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: David Laight -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Kees Cook -Cc: Linus Torvalds -Cc: Peter Zijlstra -Signed-off-by: Ingo Molnar -(cherry picked from commit 4831b779403a836158917d59a7ca880483c67378) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit ba10c7488b12c3106d79c8b2ba3f4e79c7e40ee4) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/entry/vsyscall/vsyscall_64.c | 4 ++++ - 1 file changed, 4 insertions(+) - -diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c -index 91f3133cf5f1..5e56a4ced848 100644 ---- a/arch/x86/entry/vsyscall/vsyscall_64.c -+++ b/arch/x86/entry/vsyscall/vsyscall_64.c -@@ -138,6 +138,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) - - WARN_ON_ONCE(address != regs->ip); - -+ /* This should be unreachable in NATIVE mode. */ -+ if (WARN_ON(vsyscall_mode == NATIVE)) -+ return false; -+ - if (vsyscall_mode == NONE) { - warn_bad_vsyscall(KERN_INFO, regs, - "vsyscall attempted with vsyscall=none"); --- -2.14.2 - diff --git a/patches/kernel/0170-arch-mm-Allow-arch_dup_mmap-to-fail.patch b/patches/kernel/0170-arch-mm-Allow-arch_dup_mmap-to-fail.patch deleted file mode 100644 index d7db473..0000000 --- a/patches/kernel/0170-arch-mm-Allow-arch_dup_mmap-to-fail.patch +++ /dev/null @@ -1,155 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Thu, 14 Dec 2017 12:27:29 +0100 -Subject: [PATCH] arch, mm: Allow arch_dup_mmap() to fail -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -In order to sanitize the LDT initialization on x86 arch_dup_mmap() must be -allowed to fail. Fix up all instances. 
- -Signed-off-by: Thomas Gleixner -Signed-off-by: Peter Zijlstra (Intel) -Cc: Andy Lutomirski -Cc: Andy Lutomirsky -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: dan.j.williams@intel.com -Cc: hughd@google.com -Cc: keescook@google.com -Cc: kirill.shutemov@linux.intel.com -Cc: linux-mm@kvack.org -Signed-off-by: Ingo Molnar -(cherry picked from commit c10e83f598d08046dd1ebc8360d4bb12d802d51b) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit b812abb61437eda1f5718a95085d67902f813f2f) -Signed-off-by: Fabian Grünbichler ---- - arch/powerpc/include/asm/mmu_context.h | 5 +++-- - arch/um/include/asm/mmu_context.h | 3 ++- - arch/unicore32/include/asm/mmu_context.h | 5 +++-- - arch/x86/include/asm/mmu_context.h | 4 ++-- - include/asm-generic/mm_hooks.h | 5 +++-- - kernel/fork.c | 3 +-- - 6 files changed, 14 insertions(+), 11 deletions(-) - -diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h -index 35bec1c5bd5a..60afcc94e673 100644 ---- a/arch/powerpc/include/asm/mmu_context.h -+++ b/arch/powerpc/include/asm/mmu_context.h -@@ -185,9 +185,10 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, - #endif - } - --static inline void arch_dup_mmap(struct mm_struct *oldmm, -- struct mm_struct *mm) -+static inline int arch_dup_mmap(struct mm_struct *oldmm, -+ struct mm_struct *mm) - { -+ return 0; - } - - static inline void arch_exit_mmap(struct mm_struct *mm) -diff --git a/arch/um/include/asm/mmu_context.h b/arch/um/include/asm/mmu_context.h -index b668e351fd6c..fca34b2177e2 100644 ---- a/arch/um/include/asm/mmu_context.h -+++ b/arch/um/include/asm/mmu_context.h -@@ -15,9 +15,10 @@ extern void 
uml_setup_stubs(struct mm_struct *mm); - /* - * Needed since we do not use the asm-generic/mm_hooks.h: - */ --static inline void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) -+static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) - { - uml_setup_stubs(mm); -+ return 0; - } - extern void arch_exit_mmap(struct mm_struct *mm); - static inline void arch_unmap(struct mm_struct *mm, -diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h -index 59b06b48f27d..5c205a9cb5a6 100644 ---- a/arch/unicore32/include/asm/mmu_context.h -+++ b/arch/unicore32/include/asm/mmu_context.h -@@ -81,9 +81,10 @@ do { \ - } \ - } while (0) - --static inline void arch_dup_mmap(struct mm_struct *oldmm, -- struct mm_struct *mm) -+static inline int arch_dup_mmap(struct mm_struct *oldmm, -+ struct mm_struct *mm) - { -+ return 0; - } - - static inline void arch_unmap(struct mm_struct *mm, -diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h -index efc530642f7d..9be54d9c04c4 100644 ---- a/arch/x86/include/asm/mmu_context.h -+++ b/arch/x86/include/asm/mmu_context.h -@@ -175,10 +175,10 @@ do { \ - } while (0) - #endif - --static inline void arch_dup_mmap(struct mm_struct *oldmm, -- struct mm_struct *mm) -+static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) - { - paravirt_arch_dup_mmap(oldmm, mm); -+ return 0; - } - - static inline void arch_exit_mmap(struct mm_struct *mm) -diff --git a/include/asm-generic/mm_hooks.h b/include/asm-generic/mm_hooks.h -index 41e5b6784b97..7a2980f4e3e6 100644 ---- a/include/asm-generic/mm_hooks.h -+++ b/include/asm-generic/mm_hooks.h -@@ -6,9 +6,10 @@ - #ifndef _ASM_GENERIC_MM_HOOKS_H - #define _ASM_GENERIC_MM_HOOKS_H - --static inline void arch_dup_mmap(struct mm_struct *oldmm, -- struct mm_struct *mm) -+static inline int arch_dup_mmap(struct mm_struct *oldmm, -+ struct mm_struct *mm) - { -+ return 0; - } - - static inline void 
arch_exit_mmap(struct mm_struct *mm) -diff --git a/kernel/fork.c b/kernel/fork.c -index 8efc6b4466e3..1d907772b9d2 100644 ---- a/kernel/fork.c -+++ b/kernel/fork.c -@@ -712,8 +712,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, - goto out; - } - /* a new mm has just been created */ -- arch_dup_mmap(oldmm, mm); -- retval = 0; -+ retval = arch_dup_mmap(oldmm, mm); - out: - up_write(&mm->mmap_sem); - flush_tlb_mm(oldmm); --- -2.14.2 - diff --git a/patches/kernel/0170-x86-vsyscall-64-Warn-and-fail-vsyscall-emulation-in-.patch b/patches/kernel/0170-x86-vsyscall-64-Warn-and-fail-vsyscall-emulation-in-.patch new file mode 100644 index 0000000..f1c7be9 --- /dev/null +++ b/patches/kernel/0170-x86-vsyscall-64-Warn-and-fail-vsyscall-emulation-in-.patch @@ -0,0 +1,55 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Sun, 10 Dec 2017 22:47:20 -0800 +Subject: [PATCH] x86/vsyscall/64: Warn and fail vsyscall emulation in NATIVE + mode +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +If something goes wrong with pagetable setup, vsyscall=native will +accidentally fall back to emulation. Make it warn and fail so that we +notice. + +Signed-off-by: Andy Lutomirski +Signed-off-by: Thomas Gleixner +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: David Laight +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Kees Cook +Cc: Linus Torvalds +Cc: Peter Zijlstra +Signed-off-by: Ingo Molnar +(cherry picked from commit 4831b779403a836158917d59a7ca880483c67378) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit ba10c7488b12c3106d79c8b2ba3f4e79c7e40ee4) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/entry/vsyscall/vsyscall_64.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c +index 91f3133cf5f1..5e56a4ced848 100644 +--- a/arch/x86/entry/vsyscall/vsyscall_64.c ++++ b/arch/x86/entry/vsyscall/vsyscall_64.c +@@ -138,6 +138,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) + + WARN_ON_ONCE(address != regs->ip); + ++ /* This should be unreachable in NATIVE mode. */ ++ if (WARN_ON(vsyscall_mode == NATIVE)) ++ return false; ++ + if (vsyscall_mode == NONE) { + warn_bad_vsyscall(KERN_INFO, regs, + "vsyscall attempted with vsyscall=none"); +-- +2.14.2 + diff --git a/patches/kernel/0171-arch-mm-Allow-arch_dup_mmap-to-fail.patch b/patches/kernel/0171-arch-mm-Allow-arch_dup_mmap-to-fail.patch new file mode 100644 index 0000000..d7db473 --- /dev/null +++ b/patches/kernel/0171-arch-mm-Allow-arch_dup_mmap-to-fail.patch @@ -0,0 +1,155 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Thu, 14 Dec 2017 12:27:29 +0100 +Subject: [PATCH] arch, mm: Allow arch_dup_mmap() to fail +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +In order to sanitize the LDT initialization on x86 arch_dup_mmap() must be +allowed to fail. Fix up all instances. 
+ +Signed-off-by: Thomas Gleixner +Signed-off-by: Peter Zijlstra (Intel) +Cc: Andy Lutomirski +Cc: Andy Lutomirsky +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: dan.j.williams@intel.com +Cc: hughd@google.com +Cc: keescook@google.com +Cc: kirill.shutemov@linux.intel.com +Cc: linux-mm@kvack.org +Signed-off-by: Ingo Molnar +(cherry picked from commit c10e83f598d08046dd1ebc8360d4bb12d802d51b) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit b812abb61437eda1f5718a95085d67902f813f2f) +Signed-off-by: Fabian Grünbichler +--- + arch/powerpc/include/asm/mmu_context.h | 5 +++-- + arch/um/include/asm/mmu_context.h | 3 ++- + arch/unicore32/include/asm/mmu_context.h | 5 +++-- + arch/x86/include/asm/mmu_context.h | 4 ++-- + include/asm-generic/mm_hooks.h | 5 +++-- + kernel/fork.c | 3 +-- + 6 files changed, 14 insertions(+), 11 deletions(-) + +diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h +index 35bec1c5bd5a..60afcc94e673 100644 +--- a/arch/powerpc/include/asm/mmu_context.h ++++ b/arch/powerpc/include/asm/mmu_context.h +@@ -185,9 +185,10 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, + #endif + } + +-static inline void arch_dup_mmap(struct mm_struct *oldmm, +- struct mm_struct *mm) ++static inline int arch_dup_mmap(struct mm_struct *oldmm, ++ struct mm_struct *mm) + { ++ return 0; + } + + static inline void arch_exit_mmap(struct mm_struct *mm) +diff --git a/arch/um/include/asm/mmu_context.h b/arch/um/include/asm/mmu_context.h +index b668e351fd6c..fca34b2177e2 100644 +--- a/arch/um/include/asm/mmu_context.h ++++ b/arch/um/include/asm/mmu_context.h +@@ -15,9 +15,10 @@ extern void 
uml_setup_stubs(struct mm_struct *mm); + /* + * Needed since we do not use the asm-generic/mm_hooks.h: + */ +-static inline void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) ++static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) + { + uml_setup_stubs(mm); ++ return 0; + } + extern void arch_exit_mmap(struct mm_struct *mm); + static inline void arch_unmap(struct mm_struct *mm, +diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h +index 59b06b48f27d..5c205a9cb5a6 100644 +--- a/arch/unicore32/include/asm/mmu_context.h ++++ b/arch/unicore32/include/asm/mmu_context.h +@@ -81,9 +81,10 @@ do { \ + } \ + } while (0) + +-static inline void arch_dup_mmap(struct mm_struct *oldmm, +- struct mm_struct *mm) ++static inline int arch_dup_mmap(struct mm_struct *oldmm, ++ struct mm_struct *mm) + { ++ return 0; + } + + static inline void arch_unmap(struct mm_struct *mm, +diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h +index efc530642f7d..9be54d9c04c4 100644 +--- a/arch/x86/include/asm/mmu_context.h ++++ b/arch/x86/include/asm/mmu_context.h +@@ -175,10 +175,10 @@ do { \ + } while (0) + #endif + +-static inline void arch_dup_mmap(struct mm_struct *oldmm, +- struct mm_struct *mm) ++static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) + { + paravirt_arch_dup_mmap(oldmm, mm); ++ return 0; + } + + static inline void arch_exit_mmap(struct mm_struct *mm) +diff --git a/include/asm-generic/mm_hooks.h b/include/asm-generic/mm_hooks.h +index 41e5b6784b97..7a2980f4e3e6 100644 +--- a/include/asm-generic/mm_hooks.h ++++ b/include/asm-generic/mm_hooks.h +@@ -6,9 +6,10 @@ + #ifndef _ASM_GENERIC_MM_HOOKS_H + #define _ASM_GENERIC_MM_HOOKS_H + +-static inline void arch_dup_mmap(struct mm_struct *oldmm, +- struct mm_struct *mm) ++static inline int arch_dup_mmap(struct mm_struct *oldmm, ++ struct mm_struct *mm) + { ++ return 0; + } + + static inline void 
arch_exit_mmap(struct mm_struct *mm) +diff --git a/kernel/fork.c b/kernel/fork.c +index 8efc6b4466e3..1d907772b9d2 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -712,8 +712,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, + goto out; + } + /* a new mm has just been created */ +- arch_dup_mmap(oldmm, mm); +- retval = 0; ++ retval = arch_dup_mmap(oldmm, mm); + out: + up_write(&mm->mmap_sem); + flush_tlb_mm(oldmm); +-- +2.14.2 + diff --git a/patches/kernel/0171-x86-ldt-Rework-locking.patch b/patches/kernel/0171-x86-ldt-Rework-locking.patch deleted file mode 100644 index a8c17a4..0000000 --- a/patches/kernel/0171-x86-ldt-Rework-locking.patch +++ /dev/null @@ -1,199 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Peter Zijlstra -Date: Thu, 14 Dec 2017 12:27:30 +0100 -Subject: [PATCH] x86/ldt: Rework locking -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -The LDT is duplicated on fork() and on exec(), which is wrong as exec() -should start from a clean state, i.e. without LDT. To fix this the LDT -duplication code will be moved into arch_dup_mmap() which is only called -for fork(). - -This introduces a locking problem. arch_dup_mmap() holds mmap_sem of the -parent process, but the LDT duplication code needs to acquire -mm->context.lock to access the LDT data safely, which is the reverse lock -order of write_ldt() where mmap_sem nests into context.lock. - -Solve this by introducing a new rw semaphore which serializes the -read/write_ldt() syscall operations and use context.lock to protect the -actual installment of the LDT descriptor. - -So context.lock stabilizes mm->context.ldt and can nest inside of the new -semaphore or mmap_sem. 
- -Signed-off-by: Peter Zijlstra (Intel) -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Andy Lutomirsky -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: dan.j.williams@intel.com -Cc: hughd@google.com -Cc: keescook@google.com -Cc: kirill.shutemov@linux.intel.com -Cc: linux-mm@kvack.org -Signed-off-by: Ingo Molnar -(cherry picked from commit c2b3496bb30bd159e9de42e5c952e1f1f33c9a77) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit bf7ee649ccc71ef9acb713a00472886c19e78684) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/mmu.h | 4 +++- - arch/x86/include/asm/mmu_context.h | 2 ++ - arch/x86/kernel/ldt.c | 33 +++++++++++++++++++++------------ - 3 files changed, 26 insertions(+), 13 deletions(-) - -diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h -index bb8c597c2248..2d7e852b2dad 100644 ---- a/arch/x86/include/asm/mmu.h -+++ b/arch/x86/include/asm/mmu.h -@@ -2,6 +2,7 @@ - #define _ASM_X86_MMU_H - - #include -+#include - #include - #include - -@@ -26,7 +27,8 @@ typedef struct { - atomic64_t tlb_gen; - - #ifdef CONFIG_MODIFY_LDT_SYSCALL -- struct ldt_struct *ldt; -+ struct rw_semaphore ldt_usr_sem; -+ struct ldt_struct *ldt; - #endif - - #ifdef CONFIG_X86_64 -diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h -index 9be54d9c04c4..dd865c2acb9d 100644 ---- a/arch/x86/include/asm/mmu_context.h -+++ b/arch/x86/include/asm/mmu_context.h -@@ -131,6 +131,8 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk); - static inline int init_new_context(struct task_struct *tsk, - struct mm_struct *mm) - { -+ mutex_init(&mm->context.lock); -+ - mm->context.ctx_id = 
atomic64_inc_return(&last_mm_ctx_id); - atomic64_set(&mm->context.tlb_gen, 0); - -diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c -index b8be2413cb74..3e7208f0c350 100644 ---- a/arch/x86/kernel/ldt.c -+++ b/arch/x86/kernel/ldt.c -@@ -4,6 +4,11 @@ - * Copyright (C) 2002 Andi Kleen - * - * This handles calls from both 32bit and 64bit mode. -+ * -+ * Lock order: -+ * contex.ldt_usr_sem -+ * mmap_sem -+ * context.lock - */ - - #include -@@ -41,7 +46,7 @@ static void refresh_ldt_segments(void) - #endif - } - --/* context.lock is held for us, so we don't need any locking. */ -+/* context.lock is held by the task which issued the smp function call */ - static void flush_ldt(void *__mm) - { - struct mm_struct *mm = __mm; -@@ -98,15 +103,17 @@ static void finalize_ldt_struct(struct ldt_struct *ldt) - paravirt_alloc_ldt(ldt->entries, ldt->nr_entries); - } - --/* context.lock is held */ --static void install_ldt(struct mm_struct *current_mm, -- struct ldt_struct *ldt) -+static void install_ldt(struct mm_struct *mm, struct ldt_struct *ldt) - { -+ mutex_lock(&mm->context.lock); -+ - /* Synchronizes with READ_ONCE in load_mm_ldt. */ -- smp_store_release(&current_mm->context.ldt, ldt); -+ smp_store_release(&mm->context.ldt, ldt); - -- /* Activate the LDT for all CPUs using current_mm. */ -- on_each_cpu_mask(mm_cpumask(current_mm), flush_ldt, current_mm, true); -+ /* Activate the LDT for all CPUs using currents mm.
*/ -+ on_each_cpu_mask(mm_cpumask(mm), flush_ldt, mm, true); -+ -+ mutex_unlock(&mm->context.lock); - } - - static void free_ldt_struct(struct ldt_struct *ldt) -@@ -132,7 +139,8 @@ int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm) - struct mm_struct *old_mm; - int retval = 0; - -- mutex_init(&mm->context.lock); -+ init_rwsem(&mm->context.ldt_usr_sem); -+ - old_mm = current->mm; - if (!old_mm) { - mm->context.ldt = NULL; -@@ -179,7 +187,7 @@ static int read_ldt(void __user *ptr, unsigned long bytecount) - unsigned long entries_size; - int retval; - -- mutex_lock(&mm->context.lock); -+ down_read(&mm->context.ldt_usr_sem); - - if (!mm->context.ldt) { - retval = 0; -@@ -208,7 +216,7 @@ static int read_ldt(void __user *ptr, unsigned long bytecount) - retval = bytecount; - - out_unlock: -- mutex_unlock(&mm->context.lock); -+ up_read(&mm->context.ldt_usr_sem); - return retval; - } - -@@ -268,7 +276,8 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) - ldt.avl = 0; - } - -- mutex_lock(&mm->context.lock); -+ if (down_write_killable(&mm->context.ldt_usr_sem)) -+ return -EINTR; - - old_ldt = mm->context.ldt; - old_nr_entries = old_ldt ? 
old_ldt->nr_entries : 0; -@@ -290,7 +299,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) - error = 0; - - out_unlock: -- mutex_unlock(&mm->context.lock); -+ up_write(&mm->context.ldt_usr_sem); - out: - return error; - } --- -2.14.2 - diff --git a/patches/kernel/0172-x86-ldt-Prevent-LDT-inheritance-on-exec.patch b/patches/kernel/0172-x86-ldt-Prevent-LDT-inheritance-on-exec.patch deleted file mode 100644 index 4348215..0000000 --- a/patches/kernel/0172-x86-ldt-Prevent-LDT-inheritance-on-exec.patch +++ /dev/null @@ -1,177 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Thu, 14 Dec 2017 12:27:31 +0100 -Subject: [PATCH] x86/ldt: Prevent LDT inheritance on exec -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -The LDT is inherited across fork() or exec(), but that makes no sense -at all because exec() is supposed to start the process clean. - -The reason why this happens is that init_new_context_ldt() is called from -init_new_context() which obviously needs to be called for both fork() and -exec(). - -It would be surprising if anything relies on that behaviour, so it seems to -be safe to remove that misfeature. - -Split the context initialization into two parts. Clear the LDT pointer and -initialize the mutex from the general context init and move the LDT -duplication to arch_dup_mmap() which is only called on fork(). - -Signed-off-by: Thomas Gleixner -Signed-off-by: Peter Zijlstra -Cc: Andy Lutomirski -Cc: Andy Lutomirsky -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: dan.j.williams@intel.com -Cc: hughd@google.com -Cc: keescook@google.com -Cc: kirill.shutemov@linux.intel.com -Cc: linux-mm@kvack.org -Signed-off-by: Ingo Molnar -(cherry picked from commit a4828f81037f491b2cc986595e3a969a6eeb2fb5) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit f90d254204df4b336731f23bb5417226f51e8651) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/mmu_context.h | 21 ++++++++++++++------- - arch/x86/kernel/ldt.c | 18 +++++------------- - tools/testing/selftests/x86/ldt_gdt.c | 9 +++------ - 3 files changed, 22 insertions(+), 26 deletions(-) - -diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h -index dd865c2acb9d..47ec51a821e8 100644 ---- a/arch/x86/include/asm/mmu_context.h -+++ b/arch/x86/include/asm/mmu_context.h -@@ -56,11 +56,17 @@ struct ldt_struct { - /* - * Used for LDT copy/destruction. 
- */ --int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm); -+static inline void init_new_context_ldt(struct mm_struct *mm) -+{ -+ mm->context.ldt = NULL; -+ init_rwsem(&mm->context.ldt_usr_sem); -+} -+int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm); - void destroy_context_ldt(struct mm_struct *mm); - #else /* CONFIG_MODIFY_LDT_SYSCALL */ --static inline int init_new_context_ldt(struct task_struct *tsk, -- struct mm_struct *mm) -+static inline void init_new_context_ldt(struct mm_struct *mm) { } -+static inline int ldt_dup_context(struct mm_struct *oldmm, -+ struct mm_struct *mm) - { - return 0; - } -@@ -136,15 +142,16 @@ static inline int init_new_context(struct task_struct *tsk, - mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id); - atomic64_set(&mm->context.tlb_gen, 0); - -- #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS -+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS - if (cpu_feature_enabled(X86_FEATURE_OSPKE)) { - /* pkey 0 is the default and always allocated */ - mm->context.pkey_allocation_map = 0x1; - /* -1 means unallocated or invalid */ - mm->context.execute_only_pkey = -1; - } -- #endif -- return init_new_context_ldt(tsk, mm); -+#endif -+ init_new_context_ldt(mm); -+ return 0; - } - static inline void destroy_context(struct mm_struct *mm) - { -@@ -180,7 +187,7 @@ do { \ - static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) - { - paravirt_arch_dup_mmap(oldmm, mm); -- return 0; -+ return ldt_dup_context(oldmm, mm); - } - - static inline void arch_exit_mmap(struct mm_struct *mm) -diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c -index 3e7208f0c350..74a5aaf13f3c 100644 ---- a/arch/x86/kernel/ldt.c -+++ b/arch/x86/kernel/ldt.c -@@ -130,28 +130,20 @@ static void free_ldt_struct(struct ldt_struct *ldt) - } - - /* -- * we do not have to muck with descriptors here, that is -- * done in switch_mm() as needed. -+ * Called on fork from arch_dup_mmap(). 
Just copy the current LDT state, -+ * the new task is not running, so nothing can be installed. - */ --int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm) -+int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm) - { - struct ldt_struct *new_ldt; -- struct mm_struct *old_mm; - int retval = 0; - -- init_rwsem(&mm->context.ldt_usr_sem); -- -- old_mm = current->mm; -- if (!old_mm) { -- mm->context.ldt = NULL; -+ if (!old_mm) - return 0; -- } - - mutex_lock(&old_mm->context.lock); -- if (!old_mm->context.ldt) { -- mm->context.ldt = NULL; -+ if (!old_mm->context.ldt) - goto out_unlock; -- } - - new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries); - if (!new_ldt) { -diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c -index 8e290c9b2c3f..783e1a754b78 100644 ---- a/tools/testing/selftests/x86/ldt_gdt.c -+++ b/tools/testing/selftests/x86/ldt_gdt.c -@@ -626,13 +626,10 @@ static void do_multicpu_tests(void) - static int finish_exec_test(void) - { - /* -- * In a sensible world, this would be check_invalid_segment(0, 1); -- * For better or for worse, though, the LDT is inherited across exec. -- * We can probably change this safely, but for now we test it. -+ * Older kernel versions did inherit the LDT on exec() which is -+ * wrong because exec() starts from a clean state. - */ -- check_valid_segment(0, 1, -- AR_DPL3 | AR_TYPE_XRCODE | AR_S | AR_P | AR_DB, -- 42, true); -+ check_invalid_segment(0, 1); - - return nerrs ? 
1 : 0; - } --- -2.14.2 - diff --git a/patches/kernel/0172-x86-ldt-Rework-locking.patch b/patches/kernel/0172-x86-ldt-Rework-locking.patch new file mode 100644 index 0000000..a8c17a4 --- /dev/null +++ b/patches/kernel/0172-x86-ldt-Rework-locking.patch @@ -0,0 +1,199 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra +Date: Thu, 14 Dec 2017 12:27:30 +0100 +Subject: [PATCH] x86/ldt: Rework locking +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +The LDT is duplicated on fork() and on exec(), which is wrong as exec() +should start from a clean state, i.e. without LDT. To fix this the LDT +duplication code will be moved into arch_dup_mmap() which is only called +for fork(). + +This introduces a locking problem. arch_dup_mmap() holds mmap_sem of the +parent process, but the LDT duplication code needs to acquire +mm->context.lock to access the LDT data safely, which is the reverse lock +order of write_ldt() where mmap_sem nests into context.lock. + +Solve this by introducing a new rw semaphore which serializes the +read/write_ldt() syscall operations and use context.lock to protect the +actual installment of the LDT descriptor. + +So context.lock stabilizes mm->context.ldt and can nest inside of the new +semaphore or mmap_sem. + +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Andy Lutomirsky +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: dan.j.williams@intel.com +Cc: hughd@google.com +Cc: keescook@google.com +Cc: kirill.shutemov@linux.intel.com +Cc: linux-mm@kvack.org +Signed-off-by: Ingo Molnar +(cherry picked from commit c2b3496bb30bd159e9de42e5c952e1f1f33c9a77) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit bf7ee649ccc71ef9acb713a00472886c19e78684) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/mmu.h | 4 +++- + arch/x86/include/asm/mmu_context.h | 2 ++ + arch/x86/kernel/ldt.c | 33 +++++++++++++++++++++------------ + 3 files changed, 26 insertions(+), 13 deletions(-) + +diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h +index bb8c597c2248..2d7e852b2dad 100644 +--- a/arch/x86/include/asm/mmu.h ++++ b/arch/x86/include/asm/mmu.h +@@ -2,6 +2,7 @@ + #define _ASM_X86_MMU_H + + #include ++#include + #include + #include + +@@ -26,7 +27,8 @@ typedef struct { + atomic64_t tlb_gen; + + #ifdef CONFIG_MODIFY_LDT_SYSCALL +- struct ldt_struct *ldt; ++ struct rw_semaphore ldt_usr_sem; ++ struct ldt_struct *ldt; + #endif + + #ifdef CONFIG_X86_64 +diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h +index 9be54d9c04c4..dd865c2acb9d 100644 +--- a/arch/x86/include/asm/mmu_context.h ++++ b/arch/x86/include/asm/mmu_context.h +@@ -131,6 +131,8 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk); + static inline int init_new_context(struct task_struct *tsk, + struct mm_struct *mm) + { ++ mutex_init(&mm->context.lock); ++ + mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id); + atomic64_set(&mm->context.tlb_gen, 0); + +diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c +index b8be2413cb74..3e7208f0c350 100644 +--- a/arch/x86/kernel/ldt.c ++++ b/arch/x86/kernel/ldt.c +@@ -4,6 +4,11 @@ + * Copyright (C) 2002 Andi Kleen + * + * This 
handles calls from both 32bit and 64bit mode. ++ * ++ * Lock order: ++ * contex.ldt_usr_sem ++ * mmap_sem ++ * context.lock + */ + + #include +@@ -41,7 +46,7 @@ static void refresh_ldt_segments(void) + #endif + } + +-/* context.lock is held for us, so we don't need any locking. */ ++/* context.lock is held by the task which issued the smp function call */ + static void flush_ldt(void *__mm) + { + struct mm_struct *mm = __mm; +@@ -98,15 +103,17 @@ static void finalize_ldt_struct(struct ldt_struct *ldt) + paravirt_alloc_ldt(ldt->entries, ldt->nr_entries); + } + +-/* context.lock is held */ +-static void install_ldt(struct mm_struct *current_mm, +- struct ldt_struct *ldt) ++static void install_ldt(struct mm_struct *mm, struct ldt_struct *ldt) + { ++ mutex_lock(&mm->context.lock); ++ + /* Synchronizes with READ_ONCE in load_mm_ldt. */ +- smp_store_release(&current_mm->context.ldt, ldt); ++ smp_store_release(&mm->context.ldt, ldt); + +- /* Activate the LDT for all CPUs using current_mm. */ +- on_each_cpu_mask(mm_cpumask(current_mm), flush_ldt, current_mm, true); ++ /* Activate the LDT for all CPUs using currents mm.
*/ ++ on_each_cpu_mask(mm_cpumask(mm), flush_ldt, mm, true); ++ ++ mutex_unlock(&mm->context.lock); + } + + static void free_ldt_struct(struct ldt_struct *ldt) +@@ -132,7 +139,8 @@ int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm) + struct mm_struct *old_mm; + int retval = 0; + +- mutex_init(&mm->context.lock); ++ init_rwsem(&mm->context.ldt_usr_sem); ++ + old_mm = current->mm; + if (!old_mm) { + mm->context.ldt = NULL; +@@ -179,7 +187,7 @@ static int read_ldt(void __user *ptr, unsigned long bytecount) + unsigned long entries_size; + int retval; + +- mutex_lock(&mm->context.lock); ++ down_read(&mm->context.ldt_usr_sem); + + if (!mm->context.ldt) { + retval = 0; +@@ -208,7 +216,7 @@ static int read_ldt(void __user *ptr, unsigned long bytecount) + retval = bytecount; + + out_unlock: +- mutex_unlock(&mm->context.lock); ++ up_read(&mm->context.ldt_usr_sem); + return retval; + } + +@@ -268,7 +276,8 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) + ldt.avl = 0; + } + +- mutex_lock(&mm->context.lock); ++ if (down_write_killable(&mm->context.ldt_usr_sem)) ++ return -EINTR; + + old_ldt = mm->context.ldt; + old_nr_entries = old_ldt ? 
old_ldt->nr_entries : 0; +@@ -290,7 +299,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) + error = 0; + + out_unlock: +- mutex_unlock(&mm->context.lock); ++ up_write(&mm->context.ldt_usr_sem); + out: + return error; + } +-- +2.14.2 + diff --git a/patches/kernel/0173-x86-ldt-Prevent-LDT-inheritance-on-exec.patch b/patches/kernel/0173-x86-ldt-Prevent-LDT-inheritance-on-exec.patch new file mode 100644 index 0000000..4348215 --- /dev/null +++ b/patches/kernel/0173-x86-ldt-Prevent-LDT-inheritance-on-exec.patch @@ -0,0 +1,177 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Thu, 14 Dec 2017 12:27:31 +0100 +Subject: [PATCH] x86/ldt: Prevent LDT inheritance on exec +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +The LDT is inherited across fork() or exec(), but that makes no sense +at all because exec() is supposed to start the process clean. + +The reason why this happens is that init_new_context_ldt() is called from +init_new_context() which obviously needs to be called for both fork() and +exec(). + +It would be surprising if anything relies on that behaviour, so it seems to +be safe to remove that misfeature. + +Split the context initialization into two parts. Clear the LDT pointer and +initialize the mutex from the general context init and move the LDT +duplication to arch_dup_mmap() which is only called on fork(). + +Signed-off-by: Thomas Gleixner +Signed-off-by: Peter Zijlstra +Cc: Andy Lutomirski +Cc: Andy Lutomirsky +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: dan.j.williams@intel.com +Cc: hughd@google.com +Cc: keescook@google.com +Cc: kirill.shutemov@linux.intel.com +Cc: linux-mm@kvack.org +Signed-off-by: Ingo Molnar +(cherry picked from commit a4828f81037f491b2cc986595e3a969a6eeb2fb5) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit f90d254204df4b336731f23bb5417226f51e8651) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/mmu_context.h | 21 ++++++++++++++------- + arch/x86/kernel/ldt.c | 18 +++++------------- + tools/testing/selftests/x86/ldt_gdt.c | 9 +++------ + 3 files changed, 22 insertions(+), 26 deletions(-) + +diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h +index dd865c2acb9d..47ec51a821e8 100644 +--- a/arch/x86/include/asm/mmu_context.h ++++ b/arch/x86/include/asm/mmu_context.h +@@ -56,11 +56,17 @@ struct ldt_struct { + /* + * Used for LDT copy/destruction. 
+ */ +-int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm); ++static inline void init_new_context_ldt(struct mm_struct *mm) ++{ ++ mm->context.ldt = NULL; ++ init_rwsem(&mm->context.ldt_usr_sem); ++} ++int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm); + void destroy_context_ldt(struct mm_struct *mm); + #else /* CONFIG_MODIFY_LDT_SYSCALL */ +-static inline int init_new_context_ldt(struct task_struct *tsk, +- struct mm_struct *mm) ++static inline void init_new_context_ldt(struct mm_struct *mm) { } ++static inline int ldt_dup_context(struct mm_struct *oldmm, ++ struct mm_struct *mm) + { + return 0; + } +@@ -136,15 +142,16 @@ static inline int init_new_context(struct task_struct *tsk, + mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id); + atomic64_set(&mm->context.tlb_gen, 0); + +- #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS ++#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS + if (cpu_feature_enabled(X86_FEATURE_OSPKE)) { + /* pkey 0 is the default and always allocated */ + mm->context.pkey_allocation_map = 0x1; + /* -1 means unallocated or invalid */ + mm->context.execute_only_pkey = -1; + } +- #endif +- return init_new_context_ldt(tsk, mm); ++#endif ++ init_new_context_ldt(mm); ++ return 0; + } + static inline void destroy_context(struct mm_struct *mm) + { +@@ -180,7 +187,7 @@ do { \ + static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) + { + paravirt_arch_dup_mmap(oldmm, mm); +- return 0; ++ return ldt_dup_context(oldmm, mm); + } + + static inline void arch_exit_mmap(struct mm_struct *mm) +diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c +index 3e7208f0c350..74a5aaf13f3c 100644 +--- a/arch/x86/kernel/ldt.c ++++ b/arch/x86/kernel/ldt.c +@@ -130,28 +130,20 @@ static void free_ldt_struct(struct ldt_struct *ldt) + } + + /* +- * we do not have to muck with descriptors here, that is +- * done in switch_mm() as needed. ++ * Called on fork from arch_dup_mmap(). 
Just copy the current LDT state, ++ * the new task is not running, so nothing can be installed. + */ +-int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm) ++int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm) + { + struct ldt_struct *new_ldt; +- struct mm_struct *old_mm; + int retval = 0; + +- init_rwsem(&mm->context.ldt_usr_sem); +- +- old_mm = current->mm; +- if (!old_mm) { +- mm->context.ldt = NULL; ++ if (!old_mm) + return 0; +- } + + mutex_lock(&old_mm->context.lock); +- if (!old_mm->context.ldt) { +- mm->context.ldt = NULL; ++ if (!old_mm->context.ldt) + goto out_unlock; +- } + + new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries); + if (!new_ldt) { +diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c +index 8e290c9b2c3f..783e1a754b78 100644 +--- a/tools/testing/selftests/x86/ldt_gdt.c ++++ b/tools/testing/selftests/x86/ldt_gdt.c +@@ -626,13 +626,10 @@ static void do_multicpu_tests(void) + static int finish_exec_test(void) + { + /* +- * In a sensible world, this would be check_invalid_segment(0, 1); +- * For better or for worse, though, the LDT is inherited across exec. +- * We can probably change this safely, but for now we test it. ++ * Older kernel versions did inherit the LDT on exec() which is ++ * wrong because exec() starts from a clean state. + */ +- check_valid_segment(0, 1, +- AR_DPL3 | AR_TYPE_XRCODE | AR_S | AR_P | AR_DB, +- 42, true); ++ check_invalid_segment(0, 1); + + return nerrs ? 
1 : 0; + } +-- +2.14.2 + diff --git a/patches/kernel/0173-x86-mm-64-Improve-the-memory-map-documentation.patch b/patches/kernel/0173-x86-mm-64-Improve-the-memory-map-documentation.patch deleted file mode 100644 index 453153a..0000000 --- a/patches/kernel/0173-x86-mm-64-Improve-the-memory-map-documentation.patch +++ /dev/null @@ -1,70 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Tue, 12 Dec 2017 07:56:43 -0800 -Subject: [PATCH] x86/mm/64: Improve the memory map documentation -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -The old docs had the vsyscall range wrong and were missing the fixmap. -Fix both. - -There used to be 8 MB reserved for future vsyscalls, but that's long gone. - -Signed-off-by: Andy Lutomirski -Signed-off-by: Thomas Gleixner -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Dave Hansen -Cc: David Laight -Cc: H. Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Kees Cook -Cc: Kirill A. Shutemov -Cc: Linus Torvalds -Cc: Peter Zijlstra -Signed-off-by: Ingo Molnar -(cherry picked from commit 5a7ccf4754fb3660569a6de52ba7f7fc3dfaf280) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit d694898656126d8a04e86f681c8fe34ea57f1b85) -Signed-off-by: Fabian Grünbichler ---- - Documentation/x86/x86_64/mm.txt | 10 ++++++---- - 1 file changed, 6 insertions(+), 4 deletions(-) - -diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt -index 3448e675b462..83ca5a3b90ac 100644 ---- a/Documentation/x86/x86_64/mm.txt -+++ b/Documentation/x86/x86_64/mm.txt -@@ -19,8 +19,9 @@ ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks - ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space - ... unused hole ... 
- ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0 --ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space (variable) --ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls -+ffffffffa0000000 - [fixmap start] (~1526 MB) module mapping space (variable) -+[fixmap start] - ffffffffff5fffff kernel-internal fixmap range -+ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI - ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole - - Virtual memory map with 5 level page tables: -@@ -41,8 +42,9 @@ ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks - ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space - ... unused hole ... - ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0 --ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space --ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls -+ffffffffa0000000 - [fixmap start] (~1526 MB) module mapping space -+[fixmap start] - ffffffffff5fffff kernel-internal fixmap range -+ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI - ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole - - Architecture defines a 64-bit virtual address. 
Implementations can support --- -2.14.2 - diff --git a/patches/kernel/0174-x86-doc-Remove-obvious-weirdnesses-from-the-x86-MM-l.patch b/patches/kernel/0174-x86-doc-Remove-obvious-weirdnesses-from-the-x86-MM-l.patch deleted file mode 100644 index ddc6428..0000000 --- a/patches/kernel/0174-x86-doc-Remove-obvious-weirdnesses-from-the-x86-MM-l.patch +++ /dev/null @@ -1,85 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Peter Zijlstra -Date: Tue, 5 Dec 2017 13:34:54 +0100 -Subject: [PATCH] x86/doc: Remove obvious weirdnesses from the x86 MM layout - documentation -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Signed-off-by: Peter Zijlstra (Intel) -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Cc: linux-mm@kvack.org -Signed-off-by: Ingo Molnar -(cherry picked from commit e8ffe96e5933d417195268478479933d56213a3f) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit d9012133906878a404cf47acc168ff9e4b10e379) -Signed-off-by: Fabian Grünbichler ---- - Documentation/x86/x86_64/mm.txt | 12 +++--------- - 1 file changed, 3 insertions(+), 9 deletions(-) - -diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt -index 83ca5a3b90ac..63a41671d25b 100644 ---- a/Documentation/x86/x86_64/mm.txt -+++ b/Documentation/x86/x86_64/mm.txt -@@ -1,6 +1,4 @@ - -- -- - Virtual memory map with 4 level page tables: - - 0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm -@@ -49,8 +47,9 @@ ffffffffffe00000 - ffffffffffffffff (=2 
MB) unused hole - - Architecture defines a 64-bit virtual address. Implementations can support - less. Currently supported are 48- and 57-bit virtual addresses. Bits 63 --through to the most-significant implemented bit are set to either all ones --or all zero. This causes hole between user space and kernel addresses. -+through to the most-significant implemented bit are sign extended. -+This causes hole between user space and kernel addresses if you interpret them -+as unsigned. - - The direct mapping covers all memory in the system up to the highest - memory address (this means in some cases it can also include PCI memory -@@ -60,9 +59,6 @@ vmalloc space is lazily synchronized into the different PML4/PML5 pages of - the processes using the page fault handler, with init_top_pgt as - reference. - --Current X86-64 implementations support up to 46 bits of address space (64 TB), --which is our current limit. This expands into MBZ space in the page tables. -- - We map EFI runtime services in the 'efi_pgd' PGD in a 64Gb large virtual - memory window (this size is arbitrary, it can be raised later if needed). - The mappings are not part of any other kernel PGD and are only available -@@ -74,5 +70,3 @@ following fixmap section. - Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all - physical memory, vmalloc/ioremap space and virtual memory map are randomized. - Their order is preserved but their base will be offset early at boot time. 
-- ---Andi Kleen, Jul 2004 --- -2.14.2 - diff --git a/patches/kernel/0174-x86-mm-64-Improve-the-memory-map-documentation.patch b/patches/kernel/0174-x86-mm-64-Improve-the-memory-map-documentation.patch new file mode 100644 index 0000000..453153a --- /dev/null +++ b/patches/kernel/0174-x86-mm-64-Improve-the-memory-map-documentation.patch @@ -0,0 +1,70 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Tue, 12 Dec 2017 07:56:43 -0800 +Subject: [PATCH] x86/mm/64: Improve the memory map documentation +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +The old docs had the vsyscall range wrong and were missing the fixmap. +Fix both. + +There used to be 8 MB reserved for future vsyscalls, but that's long gone. + +Signed-off-by: Andy Lutomirski +Signed-off-by: Thomas Gleixner +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Dave Hansen +Cc: David Laight +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Kees Cook +Cc: Kirill A. Shutemov +Cc: Linus Torvalds +Cc: Peter Zijlstra +Signed-off-by: Ingo Molnar +(cherry picked from commit 5a7ccf4754fb3660569a6de52ba7f7fc3dfaf280) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit d694898656126d8a04e86f681c8fe34ea57f1b85) +Signed-off-by: Fabian Grünbichler +--- + Documentation/x86/x86_64/mm.txt | 10 ++++++---- + 1 file changed, 6 insertions(+), 4 deletions(-) + +diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt +index 3448e675b462..83ca5a3b90ac 100644 +--- a/Documentation/x86/x86_64/mm.txt ++++ b/Documentation/x86/x86_64/mm.txt +@@ -19,8 +19,9 @@ ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks + ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space + ... unused hole ... 
+ ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0 +-ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space (variable) +-ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls ++ffffffffa0000000 - [fixmap start] (~1526 MB) module mapping space (variable) ++[fixmap start] - ffffffffff5fffff kernel-internal fixmap range ++ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI + ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole + + Virtual memory map with 5 level page tables: +@@ -41,8 +42,9 @@ ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks + ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space + ... unused hole ... + ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0 +-ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space +-ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls ++ffffffffa0000000 - [fixmap start] (~1526 MB) module mapping space ++[fixmap start] - ffffffffff5fffff kernel-internal fixmap range ++ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI + ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole + + Architecture defines a 64-bit virtual address. 
Implementations can support +-- +2.14.2 + diff --git a/patches/kernel/0175-x86-doc-Remove-obvious-weirdnesses-from-the-x86-MM-l.patch b/patches/kernel/0175-x86-doc-Remove-obvious-weirdnesses-from-the-x86-MM-l.patch new file mode 100644 index 0000000..ddc6428 --- /dev/null +++ b/patches/kernel/0175-x86-doc-Remove-obvious-weirdnesses-from-the-x86-MM-l.patch @@ -0,0 +1,85 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra +Date: Tue, 5 Dec 2017 13:34:54 +0100 +Subject: [PATCH] x86/doc: Remove obvious weirdnesses from the x86 MM layout + documentation +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Cc: linux-mm@kvack.org +Signed-off-by: Ingo Molnar +(cherry picked from commit e8ffe96e5933d417195268478479933d56213a3f) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit d9012133906878a404cf47acc168ff9e4b10e379) +Signed-off-by: Fabian Grünbichler +--- + Documentation/x86/x86_64/mm.txt | 12 +++--------- + 1 file changed, 3 insertions(+), 9 deletions(-) + +diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt +index 83ca5a3b90ac..63a41671d25b 100644 +--- a/Documentation/x86/x86_64/mm.txt ++++ b/Documentation/x86/x86_64/mm.txt +@@ -1,6 +1,4 @@ + +- +- + Virtual memory map with 4 level page tables: + + 0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm +@@ -49,8 +47,9 @@ ffffffffffe00000 - ffffffffffffffff (=2 MB) 
unused hole + + Architecture defines a 64-bit virtual address. Implementations can support + less. Currently supported are 48- and 57-bit virtual addresses. Bits 63 +-through to the most-significant implemented bit are set to either all ones +-or all zero. This causes hole between user space and kernel addresses. ++through to the most-significant implemented bit are sign extended. ++This causes hole between user space and kernel addresses if you interpret them ++as unsigned. + + The direct mapping covers all memory in the system up to the highest + memory address (this means in some cases it can also include PCI memory +@@ -60,9 +59,6 @@ vmalloc space is lazily synchronized into the different PML4/PML5 pages of + the processes using the page fault handler, with init_top_pgt as + reference. + +-Current X86-64 implementations support up to 46 bits of address space (64 TB), +-which is our current limit. This expands into MBZ space in the page tables. +- + We map EFI runtime services in the 'efi_pgd' PGD in a 64Gb large virtual + memory window (this size is arbitrary, it can be raised later if needed). + The mappings are not part of any other kernel PGD and are only available +@@ -74,5 +70,3 @@ following fixmap section. + Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all + physical memory, vmalloc/ioremap space and virtual memory map are randomized. + Their order is preserved but their base will be offset early at boot time. 
+- +--Andi Kleen, Jul 2004 +-- +2.14.2 + diff --git a/patches/kernel/0175-x86-entry-Rename-SYSENTER_stack-to-CPU_ENTRY_AREA_en.patch b/patches/kernel/0175-x86-entry-Rename-SYSENTER_stack-to-CPU_ENTRY_AREA_en.patch deleted file mode 100644 index a48ffba..0000000 --- a/patches/kernel/0175-x86-entry-Rename-SYSENTER_stack-to-CPU_ENTRY_AREA_en.patch +++ /dev/null @@ -1,346 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Dave Hansen -Date: Mon, 4 Dec 2017 17:25:07 -0800 -Subject: [PATCH] x86/entry: Rename SYSENTER_stack to - CPU_ENTRY_AREA_entry_stack -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -If the kernel oopses while on the trampoline stack, it will print -"<SYSENTER>" even if SYSENTER is not involved. That is rather confusing. - -The "SYSENTER" stack is used for a lot more than SYSENTER now. Give it a -better string to display in stack dumps, and rename the kernel code to -match. - -Also move the 32-bit code over to the new naming even though it still uses -the entry stack only for SYSENTER. - -Signed-off-by: Dave Hansen -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Borislav Petkov -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Denys Vlasenko -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Signed-off-by: Ingo Molnar -(cherry picked from commit 4fe2d8b11a370af286287a2661de9d4e6c9a145a) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit e0437c473463f208c2b4952f0826e43ce1335a53) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/fixmap.h | 8 ++++---- - arch/x86/include/asm/processor.h | 6 +++--- - arch/x86/include/asm/stacktrace.h | 4 ++-- - arch/x86/kernel/asm-offsets.c | 4 ++-- - arch/x86/kernel/asm-offsets_32.c | 2 +- - arch/x86/kernel/cpu/common.c | 14 +++++++------- - arch/x86/kernel/dumpstack.c | 10 +++++----- - arch/x86/kernel/dumpstack_32.c | 6 +++--- - arch/x86/kernel/dumpstack_64.c | 12 +++++++++--- - arch/x86/entry/entry_32.S | 12 ++++++------ - arch/x86/entry/entry_64.S | 4 ++-- - 11 files changed, 44 insertions(+), 38 deletions(-) - -diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h -index 5dc269ff4085..a7fb137ad964 100644 ---- a/arch/x86/include/asm/fixmap.h -+++ b/arch/x86/include/asm/fixmap.h -@@ -56,10 +56,10 @@ struct cpu_entry_area { - char gdt[PAGE_SIZE]; - - /* -- * The GDT is just below SYSENTER_stack and thus serves (on x86_64) as -+ * The GDT is just below entry_stack and thus serves (on x86_64) as - * a a read-only guard page. - */ -- struct SYSENTER_stack_page SYSENTER_stack_page; -+ struct entry_stack_page entry_stack_page; - - /* - * On x86_64, the TSS is mapped RO. 
On x86_32, it's mapped RW because -@@ -230,9 +230,9 @@ static inline struct cpu_entry_area *get_cpu_entry_area(int cpu) - return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0)); - } - --static inline struct SYSENTER_stack *cpu_SYSENTER_stack(int cpu) -+static inline struct entry_stack *cpu_entry_stack(int cpu) - { -- return &get_cpu_entry_area(cpu)->SYSENTER_stack_page.stack; -+ return &get_cpu_entry_area(cpu)->entry_stack_page.stack; - } - - #endif /* !__ASSEMBLY__ */ -diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h -index 59a317f8e0ec..935d68609922 100644 ---- a/arch/x86/include/asm/processor.h -+++ b/arch/x86/include/asm/processor.h -@@ -330,12 +330,12 @@ struct x86_hw_tss { - #define IO_BITMAP_OFFSET (offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss)) - #define INVALID_IO_BITMAP_OFFSET 0x8000 - --struct SYSENTER_stack { -+struct entry_stack { - unsigned long words[64]; - }; - --struct SYSENTER_stack_page { -- struct SYSENTER_stack stack; -+struct entry_stack_page { -+ struct entry_stack stack; - } __aligned(PAGE_SIZE); - - struct tss_struct { -diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h -index 95f999576131..3b3cc5ba579a 100644 ---- a/arch/x86/include/asm/stacktrace.h -+++ b/arch/x86/include/asm/stacktrace.h -@@ -15,7 +15,7 @@ enum stack_type { - STACK_TYPE_TASK, - STACK_TYPE_IRQ, - STACK_TYPE_SOFTIRQ, -- STACK_TYPE_SYSENTER, -+ STACK_TYPE_ENTRY, - STACK_TYPE_EXCEPTION, - STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1, - }; -@@ -28,7 +28,7 @@ struct stack_info { - bool in_task_stack(unsigned long *stack, struct task_struct *task, - struct stack_info *info); - --bool in_sysenter_stack(unsigned long *stack, struct stack_info *info); -+bool in_entry_stack(unsigned long *stack, struct stack_info *info); - - int get_stack_info(unsigned long *stack, struct task_struct *task, - struct stack_info *info, unsigned 
long *visit_mask); -diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c -index 40c3fab107ac..25b4832e9c28 100644 ---- a/arch/x86/kernel/asm-offsets.c -+++ b/arch/x86/kernel/asm-offsets.c -@@ -96,6 +96,6 @@ void common(void) { - /* Layout info for cpu_entry_area */ - OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); - OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); -- OFFSET(CPU_ENTRY_AREA_SYSENTER_stack, cpu_entry_area, SYSENTER_stack_page); -- DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack)); -+ OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page); -+ DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack)); - } -diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c -index c4f23da7a0f0..4dba34cb777d 100644 ---- a/arch/x86/kernel/asm-offsets_32.c -+++ b/arch/x86/kernel/asm-offsets_32.c -@@ -50,7 +50,7 @@ void foo(void) - - /* Offset from the sysenter stack to tss.sp0 */ - DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) - -- offsetofend(struct cpu_entry_area, SYSENTER_stack_page.stack)); -+ offsetofend(struct cpu_entry_area, entry_stack_page.stack)); - - #ifdef CONFIG_CC_STACKPROTECTOR - BLANK(); -diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c -index fcdba90e0890..7a8a5d436566 100644 ---- a/arch/x86/kernel/cpu/common.c -+++ b/arch/x86/kernel/cpu/common.c -@@ -487,8 +487,8 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks - [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); - #endif - --static DEFINE_PER_CPU_PAGE_ALIGNED(struct SYSENTER_stack_page, -- SYSENTER_stack_storage); -+static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, -+ entry_stack_storage); - - static void __init - set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot) -@@ -523,8 +523,8 @@ static void __init setup_cpu_entry_area(int cpu) - #endif - - __set_fixmap(get_cpu_entry_area_index(cpu, gdt), 
get_cpu_gdt_paddr(cpu), gdt_prot); -- set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, SYSENTER_stack_page), -- per_cpu_ptr(&SYSENTER_stack_storage, cpu), 1, -+ set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, entry_stack_page), -+ per_cpu_ptr(&entry_stack_storage, cpu), 1, - PAGE_KERNEL); - - /* -@@ -1315,7 +1315,7 @@ void enable_sep_cpu(void) - - tss->x86_tss.ss1 = __KERNEL_CS; - wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); -- wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1), 0); -+ wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1), 0); - wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); - - put_cpu(); -@@ -1441,7 +1441,7 @@ void syscall_init(void) - * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). - */ - wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); -- wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1)); -+ wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1)); - wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); - #else - wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); -@@ -1655,7 +1655,7 @@ void cpu_init(void) - */ - set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); - load_TR_desc(); -- load_sp0((unsigned long)(cpu_SYSENTER_stack(cpu) + 1)); -+ load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1)); - - load_mm_ldt(&init_mm); - -diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c -index b005e5ef6738..55bf1c3b5319 100644 ---- a/arch/x86/kernel/dumpstack.c -+++ b/arch/x86/kernel/dumpstack.c -@@ -43,9 +43,9 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task, - return true; - } - --bool in_sysenter_stack(unsigned long *stack, struct stack_info *info) -+bool in_entry_stack(unsigned long *stack, struct stack_info *info) - { -- struct SYSENTER_stack *ss = cpu_SYSENTER_stack(smp_processor_id()); -+ struct entry_stack *ss = cpu_entry_stack(smp_processor_id()); - 
- void *begin = ss; - void *end = ss + 1; -@@ -53,7 +53,7 @@ bool in_sysenter_stack(unsigned long *stack, struct stack_info *info) - if ((void *)stack < begin || (void *)stack >= end) - return false; - -- info->type = STACK_TYPE_SYSENTER; -+ info->type = STACK_TYPE_ENTRY; - info->begin = begin; - info->end = end; - info->next_sp = NULL; -@@ -111,13 +111,13 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - * - task stack - * - interrupt stack - * - HW exception stacks (double fault, nmi, debug, mce) -- * - SYSENTER stack -+ * - entry stack - * - * x86-32 can have up to four stacks: - * - task stack - * - softirq stack - * - hardirq stack -- * - SYSENTER stack -+ * - entry stack - */ - for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { - const char *stack_name; -diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c -index 3160bf2d100e..4580ba0204f6 100644 ---- a/arch/x86/kernel/dumpstack_32.c -+++ b/arch/x86/kernel/dumpstack_32.c -@@ -25,8 +25,8 @@ const char *stack_type_name(enum stack_type type) - if (type == STACK_TYPE_SOFTIRQ) - return "SOFTIRQ"; - -- if (type == STACK_TYPE_SYSENTER) -- return "SYSENTER"; -+ if (type == STACK_TYPE_ENTRY) -+ return "ENTRY_TRAMPOLINE"; - - return NULL; - } -@@ -95,7 +95,7 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, - if (task != current) - goto unknown; - -- if (in_sysenter_stack(stack, info)) -+ if (in_entry_stack(stack, info)) - goto recursion_check; - - if (in_hardirq_stack(stack, info)) -diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c -index f5107b659f86..7d9c0e06afc2 100644 ---- a/arch/x86/kernel/dumpstack_64.c -+++ b/arch/x86/kernel/dumpstack_64.c -@@ -36,8 +36,14 @@ const char *stack_type_name(enum stack_type type) - if (type == STACK_TYPE_IRQ) - return "IRQ"; - -- if (type == STACK_TYPE_SYSENTER) -- return "SYSENTER"; -+ if (type == STACK_TYPE_ENTRY) { -+ /* -+ * On 64-bit, we have a generic 
entry stack that we -+ * use for all the kernel entry points, including -+ * SYSENTER. -+ */ -+ return "ENTRY_TRAMPOLINE"; -+ } - - if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST) - return exception_stack_names[type - STACK_TYPE_EXCEPTION]; -@@ -117,7 +123,7 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, - if (in_irq_stack(stack, info)) - goto recursion_check; - -- if (in_sysenter_stack(stack, info)) -+ if (in_entry_stack(stack, info)) - goto recursion_check; - - goto unknown; -diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S -index 3ef7800007f8..634c6a78885c 100644 ---- a/arch/x86/entry/entry_32.S -+++ b/arch/x86/entry/entry_32.S -@@ -949,9 +949,9 @@ ENTRY(debug) - - /* Are we currently on the SYSENTER stack? */ - movl PER_CPU_VAR(cpu_entry_area), %ecx -- addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx -- subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ -- cmpl $SIZEOF_SYSENTER_stack, %ecx -+ addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx -+ subl %eax, %ecx /* ecx = (end of entry_stack) - esp */ -+ cmpl $SIZEOF_entry_stack, %ecx - jb .Ldebug_from_sysenter_stack - - TRACE_IRQS_OFF -@@ -993,9 +993,9 @@ ENTRY(nmi) - - /* Are we currently on the SYSENTER stack? */ - movl PER_CPU_VAR(cpu_entry_area), %ecx -- addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx -- subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ -- cmpl $SIZEOF_SYSENTER_stack, %ecx -+ addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx -+ subl %eax, %ecx /* ecx = (end of entry_stack) - esp */ -+ cmpl $SIZEOF_entry_stack, %ecx - jb .Lnmi_from_sysenter_stack - - /* Not on SYSENTER stack. 
*/ -diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S -index 157860b3569f..03e052f02176 100644 ---- a/arch/x86/entry/entry_64.S -+++ b/arch/x86/entry/entry_64.S -@@ -153,8 +153,8 @@ END(native_usergs_sysret64) - _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) - - /* The top word of the SYSENTER stack is hot and is usable as scratch space. */ --#define RSP_SCRATCH CPU_ENTRY_AREA_SYSENTER_stack + \ -- SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA -+#define RSP_SCRATCH CPU_ENTRY_AREA_entry_stack + \ -+ SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA - - ENTRY(entry_SYSCALL_64_trampoline) - UNWIND_HINT_EMPTY --- -2.14.2 - diff --git a/patches/kernel/0176-x86-entry-Rename-SYSENTER_stack-to-CPU_ENTRY_AREA_en.patch b/patches/kernel/0176-x86-entry-Rename-SYSENTER_stack-to-CPU_ENTRY_AREA_en.patch new file mode 100644 index 0000000..a48ffba --- /dev/null +++ b/patches/kernel/0176-x86-entry-Rename-SYSENTER_stack-to-CPU_ENTRY_AREA_en.patch @@ -0,0 +1,346 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Dave Hansen +Date: Mon, 4 Dec 2017 17:25:07 -0800 +Subject: [PATCH] x86/entry: Rename SYSENTER_stack to + CPU_ENTRY_AREA_entry_stack +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +If the kernel oopses while on the trampoline stack, it will print +"<SYSENTER>" even if SYSENTER is not involved. That is rather confusing. + +The "SYSENTER" stack is used for a lot more than SYSENTER now. Give it a +better string to display in stack dumps, and rename the kernel code to +match. + +Also move the 32-bit code over to the new naming even though it still uses +the entry stack only for SYSENTER. + +Signed-off-by: Dave Hansen +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Borislav Petkov +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Denys Vlasenko +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Signed-off-by: Ingo Molnar +(cherry picked from commit 4fe2d8b11a370af286287a2661de9d4e6c9a145a) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit e0437c473463f208c2b4952f0826e43ce1335a53) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/fixmap.h | 8 ++++---- + arch/x86/include/asm/processor.h | 6 +++--- + arch/x86/include/asm/stacktrace.h | 4 ++-- + arch/x86/kernel/asm-offsets.c | 4 ++-- + arch/x86/kernel/asm-offsets_32.c | 2 +- + arch/x86/kernel/cpu/common.c | 14 +++++++------- + arch/x86/kernel/dumpstack.c | 10 +++++----- + arch/x86/kernel/dumpstack_32.c | 6 +++--- + arch/x86/kernel/dumpstack_64.c | 12 +++++++++--- + arch/x86/entry/entry_32.S | 12 ++++++------ + arch/x86/entry/entry_64.S | 4 ++-- + 11 files changed, 44 insertions(+), 38 deletions(-) + +diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h +index 5dc269ff4085..a7fb137ad964 100644 +--- a/arch/x86/include/asm/fixmap.h ++++ b/arch/x86/include/asm/fixmap.h +@@ -56,10 +56,10 @@ struct cpu_entry_area { + char gdt[PAGE_SIZE]; + + /* +- * The GDT is just below SYSENTER_stack and thus serves (on x86_64) as ++ * The GDT is just below entry_stack and thus serves (on x86_64) as + * a a read-only guard page. + */ +- struct SYSENTER_stack_page SYSENTER_stack_page; ++ struct entry_stack_page entry_stack_page; + + /* + * On x86_64, the TSS is mapped RO. 
On x86_32, it's mapped RW because +@@ -230,9 +230,9 @@ static inline struct cpu_entry_area *get_cpu_entry_area(int cpu) + return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0)); + } + +-static inline struct SYSENTER_stack *cpu_SYSENTER_stack(int cpu) ++static inline struct entry_stack *cpu_entry_stack(int cpu) + { +- return &get_cpu_entry_area(cpu)->SYSENTER_stack_page.stack; ++ return &get_cpu_entry_area(cpu)->entry_stack_page.stack; + } + + #endif /* !__ASSEMBLY__ */ +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index 59a317f8e0ec..935d68609922 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -330,12 +330,12 @@ struct x86_hw_tss { + #define IO_BITMAP_OFFSET (offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss)) + #define INVALID_IO_BITMAP_OFFSET 0x8000 + +-struct SYSENTER_stack { ++struct entry_stack { + unsigned long words[64]; + }; + +-struct SYSENTER_stack_page { +- struct SYSENTER_stack stack; ++struct entry_stack_page { ++ struct entry_stack stack; + } __aligned(PAGE_SIZE); + + struct tss_struct { +diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h +index 95f999576131..3b3cc5ba579a 100644 +--- a/arch/x86/include/asm/stacktrace.h ++++ b/arch/x86/include/asm/stacktrace.h +@@ -15,7 +15,7 @@ enum stack_type { + STACK_TYPE_TASK, + STACK_TYPE_IRQ, + STACK_TYPE_SOFTIRQ, +- STACK_TYPE_SYSENTER, ++ STACK_TYPE_ENTRY, + STACK_TYPE_EXCEPTION, + STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1, + }; +@@ -28,7 +28,7 @@ struct stack_info { + bool in_task_stack(unsigned long *stack, struct task_struct *task, + struct stack_info *info); + +-bool in_sysenter_stack(unsigned long *stack, struct stack_info *info); ++bool in_entry_stack(unsigned long *stack, struct stack_info *info); + + int get_stack_info(unsigned long *stack, struct task_struct *task, + struct stack_info *info, unsigned 
long *visit_mask); +diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c +index 40c3fab107ac..25b4832e9c28 100644 +--- a/arch/x86/kernel/asm-offsets.c ++++ b/arch/x86/kernel/asm-offsets.c +@@ -96,6 +96,6 @@ void common(void) { + /* Layout info for cpu_entry_area */ + OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); + OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); +- OFFSET(CPU_ENTRY_AREA_SYSENTER_stack, cpu_entry_area, SYSENTER_stack_page); +- DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack)); ++ OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page); ++ DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack)); + } +diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c +index c4f23da7a0f0..4dba34cb777d 100644 +--- a/arch/x86/kernel/asm-offsets_32.c ++++ b/arch/x86/kernel/asm-offsets_32.c +@@ -50,7 +50,7 @@ void foo(void) + + /* Offset from the sysenter stack to tss.sp0 */ + DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) - +- offsetofend(struct cpu_entry_area, SYSENTER_stack_page.stack)); ++ offsetofend(struct cpu_entry_area, entry_stack_page.stack)); + + #ifdef CONFIG_CC_STACKPROTECTOR + BLANK(); +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index fcdba90e0890..7a8a5d436566 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -487,8 +487,8 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks + [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); + #endif + +-static DEFINE_PER_CPU_PAGE_ALIGNED(struct SYSENTER_stack_page, +- SYSENTER_stack_storage); ++static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, ++ entry_stack_storage); + + static void __init + set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot) +@@ -523,8 +523,8 @@ static void __init setup_cpu_entry_area(int cpu) + #endif + + __set_fixmap(get_cpu_entry_area_index(cpu, gdt), 
get_cpu_gdt_paddr(cpu), gdt_prot); +- set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, SYSENTER_stack_page), +- per_cpu_ptr(&SYSENTER_stack_storage, cpu), 1, ++ set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, entry_stack_page), ++ per_cpu_ptr(&entry_stack_storage, cpu), 1, + PAGE_KERNEL); + + /* +@@ -1315,7 +1315,7 @@ void enable_sep_cpu(void) + + tss->x86_tss.ss1 = __KERNEL_CS; + wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); +- wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1), 0); ++ wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1), 0); + wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); + + put_cpu(); +@@ -1441,7 +1441,7 @@ void syscall_init(void) + * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). + */ + wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); +- wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1)); ++ wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1)); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); + #else + wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); +@@ -1655,7 +1655,7 @@ void cpu_init(void) + */ + set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); + load_TR_desc(); +- load_sp0((unsigned long)(cpu_SYSENTER_stack(cpu) + 1)); ++ load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1)); + + load_mm_ldt(&init_mm); + +diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c +index b005e5ef6738..55bf1c3b5319 100644 +--- a/arch/x86/kernel/dumpstack.c ++++ b/arch/x86/kernel/dumpstack.c +@@ -43,9 +43,9 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task, + return true; + } + +-bool in_sysenter_stack(unsigned long *stack, struct stack_info *info) ++bool in_entry_stack(unsigned long *stack, struct stack_info *info) + { +- struct SYSENTER_stack *ss = cpu_SYSENTER_stack(smp_processor_id()); ++ struct entry_stack *ss = cpu_entry_stack(smp_processor_id()); + 
+ void *begin = ss; + void *end = ss + 1; +@@ -53,7 +53,7 @@ bool in_sysenter_stack(unsigned long *stack, struct stack_info *info) + if ((void *)stack < begin || (void *)stack >= end) + return false; + +- info->type = STACK_TYPE_SYSENTER; ++ info->type = STACK_TYPE_ENTRY; + info->begin = begin; + info->end = end; + info->next_sp = NULL; +@@ -111,13 +111,13 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + * - task stack + * - interrupt stack + * - HW exception stacks (double fault, nmi, debug, mce) +- * - SYSENTER stack ++ * - entry stack + * + * x86-32 can have up to four stacks: + * - task stack + * - softirq stack + * - hardirq stack +- * - SYSENTER stack ++ * - entry stack + */ + for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { + const char *stack_name; +diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c +index 3160bf2d100e..4580ba0204f6 100644 +--- a/arch/x86/kernel/dumpstack_32.c ++++ b/arch/x86/kernel/dumpstack_32.c +@@ -25,8 +25,8 @@ const char *stack_type_name(enum stack_type type) + if (type == STACK_TYPE_SOFTIRQ) + return "SOFTIRQ"; + +- if (type == STACK_TYPE_SYSENTER) +- return "SYSENTER"; ++ if (type == STACK_TYPE_ENTRY) ++ return "ENTRY_TRAMPOLINE"; + + return NULL; + } +@@ -95,7 +95,7 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, + if (task != current) + goto unknown; + +- if (in_sysenter_stack(stack, info)) ++ if (in_entry_stack(stack, info)) + goto recursion_check; + + if (in_hardirq_stack(stack, info)) +diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c +index f5107b659f86..7d9c0e06afc2 100644 +--- a/arch/x86/kernel/dumpstack_64.c ++++ b/arch/x86/kernel/dumpstack_64.c +@@ -36,8 +36,14 @@ const char *stack_type_name(enum stack_type type) + if (type == STACK_TYPE_IRQ) + return "IRQ"; + +- if (type == STACK_TYPE_SYSENTER) +- return "SYSENTER"; ++ if (type == STACK_TYPE_ENTRY) { ++ /* ++ * On 64-bit, we have a generic 
entry stack that we ++ * use for all the kernel entry points, including ++ * SYSENTER. ++ */ ++ return "ENTRY_TRAMPOLINE"; ++ } + + if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST) + return exception_stack_names[type - STACK_TYPE_EXCEPTION]; +@@ -117,7 +123,7 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, + if (in_irq_stack(stack, info)) + goto recursion_check; + +- if (in_sysenter_stack(stack, info)) ++ if (in_entry_stack(stack, info)) + goto recursion_check; + + goto unknown; +diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S +index 3ef7800007f8..634c6a78885c 100644 +--- a/arch/x86/entry/entry_32.S ++++ b/arch/x86/entry/entry_32.S +@@ -949,9 +949,9 @@ ENTRY(debug) + + /* Are we currently on the SYSENTER stack? */ + movl PER_CPU_VAR(cpu_entry_area), %ecx +- addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx +- subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ +- cmpl $SIZEOF_SYSENTER_stack, %ecx ++ addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx ++ subl %eax, %ecx /* ecx = (end of entry_stack) - esp */ ++ cmpl $SIZEOF_entry_stack, %ecx + jb .Ldebug_from_sysenter_stack + + TRACE_IRQS_OFF +@@ -993,9 +993,9 @@ ENTRY(nmi) + + /* Are we currently on the SYSENTER stack? */ + movl PER_CPU_VAR(cpu_entry_area), %ecx +- addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx +- subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ +- cmpl $SIZEOF_SYSENTER_stack, %ecx ++ addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx ++ subl %eax, %ecx /* ecx = (end of entry_stack) - esp */ ++ cmpl $SIZEOF_entry_stack, %ecx + jb .Lnmi_from_sysenter_stack + + /* Not on SYSENTER stack. 
*/ +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index 157860b3569f..03e052f02176 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -153,8 +153,8 @@ END(native_usergs_sysret64) + _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) + + /* The top word of the SYSENTER stack is hot and is usable as scratch space. */ +-#define RSP_SCRATCH CPU_ENTRY_AREA_SYSENTER_stack + \ +- SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA ++#define RSP_SCRATCH CPU_ENTRY_AREA_entry_stack + \ ++ SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA + + ENTRY(entry_SYSCALL_64_trampoline) + UNWIND_HINT_EMPTY +-- +2.14.2 + diff --git a/patches/kernel/0176-x86-uv-Use-the-right-TLB-flush-API.patch b/patches/kernel/0176-x86-uv-Use-the-right-TLB-flush-API.patch deleted file mode 100644 index cffa705..0000000 --- a/patches/kernel/0176-x86-uv-Use-the-right-TLB-flush-API.patch +++ /dev/null @@ -1,64 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Peter Zijlstra -Date: Tue, 5 Dec 2017 13:34:50 +0100 -Subject: [PATCH] x86/uv: Use the right TLB-flush API -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Since uv_flush_tlb_others() implements flush_tlb_others() which is -about flushing user mappings, we should use __flush_tlb_single(), -which too is about flushing user mappings. - -Signed-off-by: Peter Zijlstra (Intel) -Signed-off-by: Thomas Gleixner -Acked-by: Andrew Banman -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Mike Travis -Cc: Peter Zijlstra -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Cc: linux-mm@kvack.org -Signed-off-by: Ingo Molnar -(cherry picked from commit 3e46e0f5ee3643a1239be9046c7ba6c66ca2b329) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 532216cdf02174dc08ca998b570c4699899fa355) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/platform/uv/tlb_uv.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c -index f44c0bc95aa2..8538a6723171 100644 ---- a/arch/x86/platform/uv/tlb_uv.c -+++ b/arch/x86/platform/uv/tlb_uv.c -@@ -299,7 +299,7 @@ static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp, - local_flush_tlb(); - stat->d_alltlb++; - } else { -- __flush_tlb_one(msg->address); -+ __flush_tlb_single(msg->address); - stat->d_onetlb++; - } - stat->d_requestee++; --- -2.14.2 - diff --git a/patches/kernel/0177-x86-microcode-Dont-abuse-the-TLB-flush-interface.patch b/patches/kernel/0177-x86-microcode-Dont-abuse-the-TLB-flush-interface.patch deleted file mode 100644 index 7873a60..0000000 --- a/patches/kernel/0177-x86-microcode-Dont-abuse-the-TLB-flush-interface.patch +++ /dev/null @@ -1,126 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Peter Zijlstra -Date: Tue, 5 Dec 2017 13:34:51 +0100 -Subject: [PATCH] x86/microcode: Dont abuse the TLB-flush interface -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Commit: - - ec400ddeff20 ("x86/microcode_intel_early.c: Early update ucode on Intel's CPU") - -... grubbed into tlbflush internals without coherent explanation. - -Since it says its a precaution and the SDM doesn't mention anything like -this, take it out back. 
- -Signed-off-by: Peter Zijlstra (Intel) -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: fenghua.yu@intel.com -Cc: hughd@google.com -Cc: keescook@google.com -Cc: linux-mm@kvack.org -Signed-off-by: Ingo Molnar -(cherry picked from commit 23cb7d46f371844c004784ad9552a57446f73e5a) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 0f3d96d1e5aa4d9538ab1a918fb49f2c57ebb6f5) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/tlbflush.h | 19 ++++++------------- - arch/x86/kernel/cpu/microcode/intel.c | 13 ------------- - 2 files changed, 6 insertions(+), 26 deletions(-) - -diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h -index 6533da3036c9..6d2688a6fda0 100644 ---- a/arch/x86/include/asm/tlbflush.h -+++ b/arch/x86/include/asm/tlbflush.h -@@ -234,20 +234,9 @@ static inline void __native_flush_tlb(void) - preempt_enable(); - } - --static inline void __native_flush_tlb_global_irq_disabled(void) --{ -- unsigned long cr4; -- -- cr4 = this_cpu_read(cpu_tlbstate.cr4); -- /* clear PGE */ -- native_write_cr4(cr4 & ~X86_CR4_PGE); -- /* write old PGE again and flush TLBs */ -- native_write_cr4(cr4); --} -- - static inline void __native_flush_tlb_global(void) - { -- unsigned long flags; -+ unsigned long cr4, flags; - - if (static_cpu_has(X86_FEATURE_INVPCID)) { - /* -@@ -265,7 +254,11 @@ static inline void __native_flush_tlb_global(void) - */ - raw_local_irq_save(flags); - -- __native_flush_tlb_global_irq_disabled(); -+ cr4 = this_cpu_read(cpu_tlbstate.cr4); -+ /* toggle PGE */ -+ native_write_cr4(cr4 ^ X86_CR4_PGE); -+ /* write old PGE again and flush TLBs */ -+ 
native_write_cr4(cr4); - - raw_local_irq_restore(flags); - } -diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c -index 636a5fcfdeb7..d9a8f69101aa 100644 ---- a/arch/x86/kernel/cpu/microcode/intel.c -+++ b/arch/x86/kernel/cpu/microcode/intel.c -@@ -564,15 +564,6 @@ static void print_ucode(struct ucode_cpu_info *uci) - } - #else - --/* -- * Flush global tlb. We only do this in x86_64 where paging has been enabled -- * already and PGE should be enabled as well. -- */ --static inline void flush_tlb_early(void) --{ -- __native_flush_tlb_global_irq_disabled(); --} -- - static inline void print_ucode(struct ucode_cpu_info *uci) - { - struct microcode_intel *mc; -@@ -601,10 +592,6 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) - if (rev != mc->hdr.rev) - return -1; - --#ifdef CONFIG_X86_64 -- /* Flush global tlb. This is precaution. */ -- flush_tlb_early(); --#endif - uci->cpu_sig.rev = rev; - - if (early) --- -2.14.2 - diff --git a/patches/kernel/0177-x86-uv-Use-the-right-TLB-flush-API.patch b/patches/kernel/0177-x86-uv-Use-the-right-TLB-flush-API.patch new file mode 100644 index 0000000..cffa705 --- /dev/null +++ b/patches/kernel/0177-x86-uv-Use-the-right-TLB-flush-API.patch @@ -0,0 +1,64 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra +Date: Tue, 5 Dec 2017 13:34:50 +0100 +Subject: [PATCH] x86/uv: Use the right TLB-flush API +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Since uv_flush_tlb_others() implements flush_tlb_others() which is +about flushing user mappings, we should use __flush_tlb_single(), +which too is about flushing user mappings. 
+ +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: Thomas Gleixner +Acked-by: Andrew Banman +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Mike Travis +Cc: Peter Zijlstra +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Cc: linux-mm@kvack.org +Signed-off-by: Ingo Molnar +(cherry picked from commit 3e46e0f5ee3643a1239be9046c7ba6c66ca2b329) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 532216cdf02174dc08ca998b570c4699899fa355) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/platform/uv/tlb_uv.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c +index f44c0bc95aa2..8538a6723171 100644 +--- a/arch/x86/platform/uv/tlb_uv.c ++++ b/arch/x86/platform/uv/tlb_uv.c +@@ -299,7 +299,7 @@ static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp, + local_flush_tlb(); + stat->d_alltlb++; + } else { +- __flush_tlb_one(msg->address); ++ __flush_tlb_single(msg->address); + stat->d_onetlb++; + } + stat->d_requestee++; +-- +2.14.2 + diff --git a/patches/kernel/0178-x86-microcode-Dont-abuse-the-TLB-flush-interface.patch b/patches/kernel/0178-x86-microcode-Dont-abuse-the-TLB-flush-interface.patch new file mode 100644 index 0000000..7873a60 --- /dev/null +++ b/patches/kernel/0178-x86-microcode-Dont-abuse-the-TLB-flush-interface.patch @@ -0,0 +1,126 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra +Date: Tue, 5 Dec 2017 13:34:51 +0100 +Subject: [PATCH] x86/microcode: Dont abuse the TLB-flush interface +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + 
+CVE-2017-5754 + +Commit: + + ec400ddeff20 ("x86/microcode_intel_early.c: Early update ucode on Intel's CPU") + +... grubbed into tlbflush internals without coherent explanation. + +Since it says its a precaution and the SDM doesn't mention anything like +this, take it out back. + +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: fenghua.yu@intel.com +Cc: hughd@google.com +Cc: keescook@google.com +Cc: linux-mm@kvack.org +Signed-off-by: Ingo Molnar +(cherry picked from commit 23cb7d46f371844c004784ad9552a57446f73e5a) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 0f3d96d1e5aa4d9538ab1a918fb49f2c57ebb6f5) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/tlbflush.h | 19 ++++++------------- + arch/x86/kernel/cpu/microcode/intel.c | 13 ------------- + 2 files changed, 6 insertions(+), 26 deletions(-) + +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index 6533da3036c9..6d2688a6fda0 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -234,20 +234,9 @@ static inline void __native_flush_tlb(void) + preempt_enable(); + } + +-static inline void __native_flush_tlb_global_irq_disabled(void) +-{ +- unsigned long cr4; +- +- cr4 = this_cpu_read(cpu_tlbstate.cr4); +- /* clear PGE */ +- native_write_cr4(cr4 & ~X86_CR4_PGE); +- /* write old PGE again and flush TLBs */ +- native_write_cr4(cr4); +-} +- + static inline void __native_flush_tlb_global(void) + { +- unsigned long flags; ++ unsigned long cr4, flags; + + if (static_cpu_has(X86_FEATURE_INVPCID)) { + /* +@@ -265,7 +254,11 @@ 
static inline void __native_flush_tlb_global(void) + */ + raw_local_irq_save(flags); + +- __native_flush_tlb_global_irq_disabled(); ++ cr4 = this_cpu_read(cpu_tlbstate.cr4); ++ /* toggle PGE */ ++ native_write_cr4(cr4 ^ X86_CR4_PGE); ++ /* write old PGE again and flush TLBs */ ++ native_write_cr4(cr4); + + raw_local_irq_restore(flags); + } +diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c +index 636a5fcfdeb7..d9a8f69101aa 100644 +--- a/arch/x86/kernel/cpu/microcode/intel.c ++++ b/arch/x86/kernel/cpu/microcode/intel.c +@@ -564,15 +564,6 @@ static void print_ucode(struct ucode_cpu_info *uci) + } + #else + +-/* +- * Flush global tlb. We only do this in x86_64 where paging has been enabled +- * already and PGE should be enabled as well. +- */ +-static inline void flush_tlb_early(void) +-{ +- __native_flush_tlb_global_irq_disabled(); +-} +- + static inline void print_ucode(struct ucode_cpu_info *uci) + { + struct microcode_intel *mc; +@@ -601,10 +592,6 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) + if (rev != mc->hdr.rev) + return -1; + +-#ifdef CONFIG_X86_64 +- /* Flush global tlb. This is precaution. */ +- flush_tlb_early(); +-#endif + uci->cpu_sig.rev = rev; + + if (early) +-- +2.14.2 + diff --git a/patches/kernel/0178-x86-mm-Use-__flush_tlb_one-for-kernel-memory.patch b/patches/kernel/0178-x86-mm-Use-__flush_tlb_one-for-kernel-memory.patch deleted file mode 100644 index 0474955..0000000 --- a/patches/kernel/0178-x86-mm-Use-__flush_tlb_one-for-kernel-memory.patch +++ /dev/null @@ -1,61 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Peter Zijlstra -Date: Tue, 5 Dec 2017 13:34:49 +0100 -Subject: [PATCH] x86/mm: Use __flush_tlb_one() for kernel memory -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -__flush_tlb_single() is for user mappings, __flush_tlb_one() for -kernel mappings. 
- -Signed-off-by: Peter Zijlstra (Intel) -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Cc: linux-mm@kvack.org -Signed-off-by: Ingo Molnar -(cherry picked from commit a501686b2923ce6f2ff2b1d0d50682c6411baf72) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 9d23f46143933cd29576b6aa2b1827f3f39b9cf8) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/mm/tlb.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c -index ed06f1593390..5b4342c5039c 100644 ---- a/arch/x86/mm/tlb.c -+++ b/arch/x86/mm/tlb.c -@@ -546,7 +546,7 @@ static void do_kernel_range_flush(void *info) - - /* flush range by one by one 'invlpg' */ - for (addr = f->start; addr < f->end; addr += PAGE_SIZE) -- __flush_tlb_single(addr); -+ __flush_tlb_one(addr); - } - - void flush_tlb_kernel_range(unsigned long start, unsigned long end) --- -2.14.2 - diff --git a/patches/kernel/0179-x86-mm-Remove-superfluous-barriers.patch b/patches/kernel/0179-x86-mm-Remove-superfluous-barriers.patch deleted file mode 100644 index d3017fc..0000000 --- a/patches/kernel/0179-x86-mm-Remove-superfluous-barriers.patch +++ /dev/null @@ -1,72 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Peter Zijlstra -Date: Tue, 5 Dec 2017 13:34:46 +0100 -Subject: [PATCH] x86/mm: Remove superfluous barriers -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -atomic64_inc_return() already implies smp_mb() before and after. 
- -Signed-off-by: Peter Zijlstra (Intel) -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Cc: linux-mm@kvack.org -Signed-off-by: Ingo Molnar -(cherry picked from commit b5fc6d943808b570bdfbec80f40c6b3855f1c48b) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 31a37930df33315a7006b46706f6babdb57db1f4) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/tlbflush.h | 8 +------- - 1 file changed, 1 insertion(+), 7 deletions(-) - -diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h -index 6d2688a6fda0..bc1460b4737b 100644 ---- a/arch/x86/include/asm/tlbflush.h -+++ b/arch/x86/include/asm/tlbflush.h -@@ -59,19 +59,13 @@ static inline void invpcid_flush_all_nonglobals(void) - - static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) - { -- u64 new_tlb_gen; -- - /* - * Bump the generation count. This also serves as a full barrier - * that synchronizes with switch_mm(): callers are required to order - * their read of mm_cpumask after their writes to the paging - * structures. 
- */ -- smp_mb__before_atomic(); -- new_tlb_gen = atomic64_inc_return(&mm->context.tlb_gen); -- smp_mb__after_atomic(); -- -- return new_tlb_gen; -+ return atomic64_inc_return(&mm->context.tlb_gen); - } - - #ifdef CONFIG_PARAVIRT --- -2.14.2 - diff --git a/patches/kernel/0179-x86-mm-Use-__flush_tlb_one-for-kernel-memory.patch b/patches/kernel/0179-x86-mm-Use-__flush_tlb_one-for-kernel-memory.patch new file mode 100644 index 0000000..0474955 --- /dev/null +++ b/patches/kernel/0179-x86-mm-Use-__flush_tlb_one-for-kernel-memory.patch @@ -0,0 +1,61 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra +Date: Tue, 5 Dec 2017 13:34:49 +0100 +Subject: [PATCH] x86/mm: Use __flush_tlb_one() for kernel memory +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +__flush_tlb_single() is for user mappings, __flush_tlb_one() for +kernel mappings. + +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Cc: linux-mm@kvack.org +Signed-off-by: Ingo Molnar +(cherry picked from commit a501686b2923ce6f2ff2b1d0d50682c6411baf72) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 9d23f46143933cd29576b6aa2b1827f3f39b9cf8) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/mm/tlb.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c +index ed06f1593390..5b4342c5039c 100644 +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -546,7 +546,7 @@ static void do_kernel_range_flush(void *info) + + /* flush range by one by one 'invlpg' */ + for (addr = f->start; addr < f->end; addr += PAGE_SIZE) +- __flush_tlb_single(addr); ++ __flush_tlb_one(addr); + } + + void flush_tlb_kernel_range(unsigned long start, unsigned long end) +-- +2.14.2 + diff --git a/patches/kernel/0180-x86-mm-Add-comments-to-clarify-which-TLB-flush-funct.patch b/patches/kernel/0180-x86-mm-Add-comments-to-clarify-which-TLB-flush-funct.patch deleted file mode 100644 index d750f16..0000000 --- a/patches/kernel/0180-x86-mm-Add-comments-to-clarify-which-TLB-flush-funct.patch +++ /dev/null @@ -1,113 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Peter Zijlstra -Date: Tue, 5 Dec 2017 13:34:52 +0100 -Subject: [PATCH] x86/mm: Add comments to clarify which TLB-flush functions are - supposed to flush what -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Per popular request.. 
- -Signed-off-by: Peter Zijlstra (Intel) -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Cc: linux-mm@kvack.org -Signed-off-by: Ingo Molnar -(backported from commit 3f67af51e56f291d7417d77c4f67cd774633c5e1) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 8394b666c2b3b1fc5279a897c96b196531923f3b) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/tlbflush.h | 24 ++++++++++++++++++++++-- - 1 file changed, 22 insertions(+), 2 deletions(-) - -diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h -index bc1460b4737b..ed5d483c4a1b 100644 ---- a/arch/x86/include/asm/tlbflush.h -+++ b/arch/x86/include/asm/tlbflush.h -@@ -216,6 +216,10 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) - cr4_set_bits(mask); - } - -+ -+/* -+ * flush the entire current user mapping -+ */ - static inline void __native_flush_tlb(void) - { - /* -@@ -228,6 +232,9 @@ static inline void __native_flush_tlb(void) - preempt_enable(); - } - -+/* -+ * flush everything -+ */ - static inline void __native_flush_tlb_global(void) - { - unsigned long cr4, flags; -@@ -257,17 +264,27 @@ static inline void __native_flush_tlb_global(void) - raw_local_irq_restore(flags); - } - -+/* -+ * flush one page in the user mapping -+ */ - static inline void __native_flush_tlb_single(unsigned long addr) - { - asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); - } - -+/* -+ * flush everything -+ */ - static inline void __flush_tlb_all(void) - { -- if (boot_cpu_has(X86_FEATURE_PGE)) -+ if (boot_cpu_has(X86_FEATURE_PGE)) { - __flush_tlb_global(); 
-- else -+ } else { -+ /* -+ * !PGE -> !PCID (setup_pcid()), thus every flush is total. -+ */ - __flush_tlb(); -+ } - - /* - * Note: if we somehow had PCID but not PGE, then this wouldn't work -- -@@ -278,6 +295,9 @@ static inline void __flush_tlb_all(void) - */ - } - -+/* -+ * flush one page in the kernel mapping -+ */ - static inline void __flush_tlb_one(unsigned long addr) - { - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); --- -2.14.2 - diff --git a/patches/kernel/0180-x86-mm-Remove-superfluous-barriers.patch b/patches/kernel/0180-x86-mm-Remove-superfluous-barriers.patch new file mode 100644 index 0000000..d3017fc --- /dev/null +++ b/patches/kernel/0180-x86-mm-Remove-superfluous-barriers.patch @@ -0,0 +1,72 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra +Date: Tue, 5 Dec 2017 13:34:46 +0100 +Subject: [PATCH] x86/mm: Remove superfluous barriers +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +atomic64_inc_return() already implies smp_mb() before and after. + +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Cc: linux-mm@kvack.org +Signed-off-by: Ingo Molnar +(cherry picked from commit b5fc6d943808b570bdfbec80f40c6b3855f1c48b) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 31a37930df33315a7006b46706f6babdb57db1f4) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/tlbflush.h | 8 +------- + 1 file changed, 1 insertion(+), 7 deletions(-) + +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index 6d2688a6fda0..bc1460b4737b 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -59,19 +59,13 @@ static inline void invpcid_flush_all_nonglobals(void) + + static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) + { +- u64 new_tlb_gen; +- + /* + * Bump the generation count. This also serves as a full barrier + * that synchronizes with switch_mm(): callers are required to order + * their read of mm_cpumask after their writes to the paging + * structures. 
+ */ +- smp_mb__before_atomic(); +- new_tlb_gen = atomic64_inc_return(&mm->context.tlb_gen); +- smp_mb__after_atomic(); +- +- return new_tlb_gen; ++ return atomic64_inc_return(&mm->context.tlb_gen); + } + + #ifdef CONFIG_PARAVIRT +-- +2.14.2 + diff --git a/patches/kernel/0181-x86-mm-Add-comments-to-clarify-which-TLB-flush-funct.patch b/patches/kernel/0181-x86-mm-Add-comments-to-clarify-which-TLB-flush-funct.patch new file mode 100644 index 0000000..d750f16 --- /dev/null +++ b/patches/kernel/0181-x86-mm-Add-comments-to-clarify-which-TLB-flush-funct.patch @@ -0,0 +1,113 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra +Date: Tue, 5 Dec 2017 13:34:52 +0100 +Subject: [PATCH] x86/mm: Add comments to clarify which TLB-flush functions are + supposed to flush what +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Per popular request.. + +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Cc: linux-mm@kvack.org +Signed-off-by: Ingo Molnar +(backported from commit 3f67af51e56f291d7417d77c4f67cd774633c5e1) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 8394b666c2b3b1fc5279a897c96b196531923f3b) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/tlbflush.h | 24 ++++++++++++++++++++++-- + 1 file changed, 22 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index bc1460b4737b..ed5d483c4a1b 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -216,6 +216,10 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) + cr4_set_bits(mask); + } + ++ ++/* ++ * flush the entire current user mapping ++ */ + static inline void __native_flush_tlb(void) + { + /* +@@ -228,6 +232,9 @@ static inline void __native_flush_tlb(void) + preempt_enable(); + } + ++/* ++ * flush everything ++ */ + static inline void __native_flush_tlb_global(void) + { + unsigned long cr4, flags; +@@ -257,17 +264,27 @@ static inline void __native_flush_tlb_global(void) + raw_local_irq_restore(flags); + } + ++/* ++ * flush one page in the user mapping ++ */ + static inline void __native_flush_tlb_single(unsigned long addr) + { + asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); + } + ++/* ++ * flush everything ++ */ + static inline void __flush_tlb_all(void) + { +- if (boot_cpu_has(X86_FEATURE_PGE)) ++ if (boot_cpu_has(X86_FEATURE_PGE)) { + __flush_tlb_global(); +- else ++ } else { ++ /* ++ * !PGE -> !PCID (setup_pcid()), thus every flush is total. 
++ */ + __flush_tlb(); ++ } + + /* + * Note: if we somehow had PCID but not PGE, then this wouldn't work -- +@@ -278,6 +295,9 @@ static inline void __flush_tlb_all(void) + */ + } + ++/* ++ * flush one page in the kernel mapping ++ */ + static inline void __flush_tlb_one(unsigned long addr) + { + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); +-- +2.14.2 + diff --git a/patches/kernel/0181-x86-mm-Move-the-CR3-construction-functions-to-tlbflu.patch b/patches/kernel/0181-x86-mm-Move-the-CR3-construction-functions-to-tlbflu.patch deleted file mode 100644 index 8927557..0000000 --- a/patches/kernel/0181-x86-mm-Move-the-CR3-construction-functions-to-tlbflu.patch +++ /dev/null @@ -1,179 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Dave Hansen -Date: Mon, 4 Dec 2017 15:07:54 +0100 -Subject: [PATCH] x86/mm: Move the CR3 construction functions to tlbflush.h -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -For flushing the TLB, the ASID which has been programmed into the hardware -must be known. That differs from what is in 'cpu_tlbstate'. - -Add functions to transform the 'cpu_tlbstate' values into to the one -programmed into the hardware (CR3). - -It's not easy to include mmu_context.h into tlbflush.h, so just move the -CR3 building over to tlbflush.h. - -Signed-off-by: Dave Hansen -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Cc: linux-mm@kvack.org -Signed-off-by: Ingo Molnar -(cherry picked from commit 50fb83a62cf472dc53ba23bd3f7bd6c1b2b3b53e) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit f741923acf51c1061c11b45a168f8864d37dc5cd) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/mmu_context.h | 29 +---------------------------- - arch/x86/include/asm/tlbflush.h | 26 ++++++++++++++++++++++++++ - arch/x86/mm/tlb.c | 8 ++++---- - 3 files changed, 31 insertions(+), 32 deletions(-) - -diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h -index 47ec51a821e8..89a01ad7e370 100644 ---- a/arch/x86/include/asm/mmu_context.h -+++ b/arch/x86/include/asm/mmu_context.h -@@ -289,33 +289,6 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, - return __pkru_allows_pkey(vma_pkey(vma), write); - } - --/* -- * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID -- * bits. This serves two purposes. It prevents a nasty situation in -- * which PCID-unaware code saves CR3, loads some other value (with PCID -- * == 0), and then restores CR3, thus corrupting the TLB for ASID 0 if -- * the saved ASID was nonzero. It also means that any bugs involving -- * loading a PCID-enabled CR3 with CR4.PCIDE off will trigger -- * deterministically. 
-- */ -- --static inline unsigned long build_cr3(struct mm_struct *mm, u16 asid) --{ -- if (static_cpu_has(X86_FEATURE_PCID)) { -- VM_WARN_ON_ONCE(asid > 4094); -- return __sme_pa(mm->pgd) | (asid + 1); -- } else { -- VM_WARN_ON_ONCE(asid != 0); -- return __sme_pa(mm->pgd); -- } --} -- --static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid) --{ -- VM_WARN_ON_ONCE(asid > 4094); -- return __sme_pa(mm->pgd) | (asid + 1) | CR3_NOFLUSH; --} -- - /* - * This can be used from process context to figure out what the value of - * CR3 is without needing to do a (slow) __read_cr3(). -@@ -325,7 +298,7 @@ static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid) - */ - static inline unsigned long __get_current_cr3_fast(void) - { -- unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm), -+ unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd, - this_cpu_read(cpu_tlbstate.loaded_mm_asid)); - - /* For now, be very restrictive about when this can be called. */ -diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h -index ed5d483c4a1b..3a421b164868 100644 ---- a/arch/x86/include/asm/tlbflush.h -+++ b/arch/x86/include/asm/tlbflush.h -@@ -68,6 +68,32 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) - return atomic64_inc_return(&mm->context.tlb_gen); - } - -+/* -+ * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID bits. -+ * This serves two purposes. It prevents a nasty situation in which -+ * PCID-unaware code saves CR3, loads some other value (with PCID == 0), -+ * and then restores CR3, thus corrupting the TLB for ASID 0 if the saved -+ * ASID was nonzero. It also means that any bugs involving loading a -+ * PCID-enabled CR3 with CR4.PCIDE off will trigger deterministically. 
-+ */ -+struct pgd_t; -+static inline unsigned long build_cr3(pgd_t *pgd, u16 asid) -+{ -+ if (static_cpu_has(X86_FEATURE_PCID)) { -+ VM_WARN_ON_ONCE(asid > 4094); -+ return __sme_pa(pgd) | (asid + 1); -+ } else { -+ VM_WARN_ON_ONCE(asid != 0); -+ return __sme_pa(pgd); -+ } -+} -+ -+static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid) -+{ -+ VM_WARN_ON_ONCE(asid > 4094); -+ return __sme_pa(pgd) | (asid + 1) | CR3_NOFLUSH; -+} -+ - #ifdef CONFIG_PARAVIRT - #include - #else -diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c -index 5b4342c5039c..87d4f961bcb4 100644 ---- a/arch/x86/mm/tlb.c -+++ b/arch/x86/mm/tlb.c -@@ -126,7 +126,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, - * does something like write_cr3(read_cr3_pa()). - */ - #ifdef CONFIG_DEBUG_VM -- if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) { -+ if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) { - /* - * If we were to BUG here, we'd be very likely to kill - * the system so hard that we don't see the call trace. -@@ -193,7 +193,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, - if (need_flush) { - this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); - this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); -- write_cr3(build_cr3(next, new_asid)); -+ write_cr3(build_cr3(next->pgd, new_asid)); - - /* - * NB: This gets called via leave_mm() in the idle path -@@ -206,7 +206,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); - } else { - /* The new ASID is already up to date. */ -- write_cr3(build_cr3_noflush(next, new_asid)); -+ write_cr3(build_cr3_noflush(next->pgd, new_asid)); - - /* See above wrt _rcuidle. 
*/ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); -@@ -283,7 +283,7 @@ void initialize_tlbstate_and_flush(void) - !(cr4_read_shadow() & X86_CR4_PCIDE)); - - /* Force ASID 0 and force a TLB flush. */ -- write_cr3(build_cr3(mm, 0)); -+ write_cr3(build_cr3(mm->pgd, 0)); - - /* Reinitialize tlbstate. */ - this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0); --- -2.14.2 - diff --git a/patches/kernel/0182-x86-mm-Move-the-CR3-construction-functions-to-tlbflu.patch b/patches/kernel/0182-x86-mm-Move-the-CR3-construction-functions-to-tlbflu.patch new file mode 100644 index 0000000..8927557 --- /dev/null +++ b/patches/kernel/0182-x86-mm-Move-the-CR3-construction-functions-to-tlbflu.patch @@ -0,0 +1,179 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Dave Hansen +Date: Mon, 4 Dec 2017 15:07:54 +0100 +Subject: [PATCH] x86/mm: Move the CR3 construction functions to tlbflush.h +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +For flushing the TLB, the ASID which has been programmed into the hardware +must be known. That differs from what is in 'cpu_tlbstate'. + +Add functions to transform the 'cpu_tlbstate' values into to the one +programmed into the hardware (CR3). + +It's not easy to include mmu_context.h into tlbflush.h, so just move the +CR3 building over to tlbflush.h. + +Signed-off-by: Dave Hansen +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Cc: linux-mm@kvack.org +Signed-off-by: Ingo Molnar +(cherry picked from commit 50fb83a62cf472dc53ba23bd3f7bd6c1b2b3b53e) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit f741923acf51c1061c11b45a168f8864d37dc5cd) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/mmu_context.h | 29 +---------------------------- + arch/x86/include/asm/tlbflush.h | 26 ++++++++++++++++++++++++++ + arch/x86/mm/tlb.c | 8 ++++---- + 3 files changed, 31 insertions(+), 32 deletions(-) + +diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h +index 47ec51a821e8..89a01ad7e370 100644 +--- a/arch/x86/include/asm/mmu_context.h ++++ b/arch/x86/include/asm/mmu_context.h +@@ -289,33 +289,6 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, + return __pkru_allows_pkey(vma_pkey(vma), write); + } + +-/* +- * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID +- * bits. This serves two purposes. It prevents a nasty situation in +- * which PCID-unaware code saves CR3, loads some other value (with PCID +- * == 0), and then restores CR3, thus corrupting the TLB for ASID 0 if +- * the saved ASID was nonzero. It also means that any bugs involving +- * loading a PCID-enabled CR3 with CR4.PCIDE off will trigger +- * deterministically. 
+- */ +- +-static inline unsigned long build_cr3(struct mm_struct *mm, u16 asid) +-{ +- if (static_cpu_has(X86_FEATURE_PCID)) { +- VM_WARN_ON_ONCE(asid > 4094); +- return __sme_pa(mm->pgd) | (asid + 1); +- } else { +- VM_WARN_ON_ONCE(asid != 0); +- return __sme_pa(mm->pgd); +- } +-} +- +-static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid) +-{ +- VM_WARN_ON_ONCE(asid > 4094); +- return __sme_pa(mm->pgd) | (asid + 1) | CR3_NOFLUSH; +-} +- + /* + * This can be used from process context to figure out what the value of + * CR3 is without needing to do a (slow) __read_cr3(). +@@ -325,7 +298,7 @@ static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid) + */ + static inline unsigned long __get_current_cr3_fast(void) + { +- unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm), ++ unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd, + this_cpu_read(cpu_tlbstate.loaded_mm_asid)); + + /* For now, be very restrictive about when this can be called. */ +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index ed5d483c4a1b..3a421b164868 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -68,6 +68,32 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) + return atomic64_inc_return(&mm->context.tlb_gen); + } + ++/* ++ * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID bits. ++ * This serves two purposes. It prevents a nasty situation in which ++ * PCID-unaware code saves CR3, loads some other value (with PCID == 0), ++ * and then restores CR3, thus corrupting the TLB for ASID 0 if the saved ++ * ASID was nonzero. It also means that any bugs involving loading a ++ * PCID-enabled CR3 with CR4.PCIDE off will trigger deterministically. 
++ */ ++struct pgd_t; ++static inline unsigned long build_cr3(pgd_t *pgd, u16 asid) ++{ ++ if (static_cpu_has(X86_FEATURE_PCID)) { ++ VM_WARN_ON_ONCE(asid > 4094); ++ return __sme_pa(pgd) | (asid + 1); ++ } else { ++ VM_WARN_ON_ONCE(asid != 0); ++ return __sme_pa(pgd); ++ } ++} ++ ++static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid) ++{ ++ VM_WARN_ON_ONCE(asid > 4094); ++ return __sme_pa(pgd) | (asid + 1) | CR3_NOFLUSH; ++} ++ + #ifdef CONFIG_PARAVIRT + #include + #else +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c +index 5b4342c5039c..87d4f961bcb4 100644 +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -126,7 +126,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + * does something like write_cr3(read_cr3_pa()). + */ + #ifdef CONFIG_DEBUG_VM +- if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) { ++ if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) { + /* + * If we were to BUG here, we'd be very likely to kill + * the system so hard that we don't see the call trace. +@@ -193,7 +193,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + if (need_flush) { + this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); +- write_cr3(build_cr3(next, new_asid)); ++ write_cr3(build_cr3(next->pgd, new_asid)); + + /* + * NB: This gets called via leave_mm() in the idle path +@@ -206,7 +206,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + } else { + /* The new ASID is already up to date. */ +- write_cr3(build_cr3_noflush(next, new_asid)); ++ write_cr3(build_cr3_noflush(next->pgd, new_asid)); + + /* See above wrt _rcuidle. 
*/ + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); +@@ -283,7 +283,7 @@ void initialize_tlbstate_and_flush(void) + !(cr4_read_shadow() & X86_CR4_PCIDE)); + + /* Force ASID 0 and force a TLB flush. */ +- write_cr3(build_cr3(mm, 0)); ++ write_cr3(build_cr3(mm->pgd, 0)); + + /* Reinitialize tlbstate. */ + this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0); +-- +2.14.2 + diff --git a/patches/kernel/0182-x86-mm-Remove-hard-coded-ASID-limit-checks.patch b/patches/kernel/0182-x86-mm-Remove-hard-coded-ASID-limit-checks.patch deleted file mode 100644 index e6c5079..0000000 --- a/patches/kernel/0182-x86-mm-Remove-hard-coded-ASID-limit-checks.patch +++ /dev/null @@ -1,96 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Dave Hansen -Date: Mon, 4 Dec 2017 15:07:55 +0100 -Subject: [PATCH] x86/mm: Remove hard-coded ASID limit checks -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -First, it's nice to remove the magic numbers. - -Second, PAGE_TABLE_ISOLATION is going to consume half of the available ASID -space. The space is currently unused, but add a comment to spell out this -new restriction. - -Signed-off-by: Dave Hansen -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Cc: linux-mm@kvack.org -Signed-off-by: Ingo Molnar -(cherry picked from commit cb0a9144a744e55207e24dcef812f05cd15a499a) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit fd5d001ae73ccd382d4270f53e27dcf61c4e4749) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/tlbflush.h | 20 ++++++++++++++++++-- - 1 file changed, 18 insertions(+), 2 deletions(-) - -diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h -index 3a421b164868..c1c10db4156c 100644 ---- a/arch/x86/include/asm/tlbflush.h -+++ b/arch/x86/include/asm/tlbflush.h -@@ -68,6 +68,22 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) - return atomic64_inc_return(&mm->context.tlb_gen); - } - -+/* There are 12 bits of space for ASIDS in CR3 */ -+#define CR3_HW_ASID_BITS 12 -+/* -+ * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for -+ * user/kernel switches -+ */ -+#define PTI_CONSUMED_ASID_BITS 0 -+ -+#define CR3_AVAIL_ASID_BITS (CR3_HW_ASID_BITS - PTI_CONSUMED_ASID_BITS) -+/* -+ * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account -+ * for them being zero-based. Another -1 is because ASID 0 is reserved for -+ * use by non-PCID-aware users. -+ */ -+#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_ASID_BITS) - 2) -+ - /* - * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID bits. - * This serves two purposes. 
It prevents a nasty situation in which -@@ -80,7 +96,7 @@ struct pgd_t; - static inline unsigned long build_cr3(pgd_t *pgd, u16 asid) - { - if (static_cpu_has(X86_FEATURE_PCID)) { -- VM_WARN_ON_ONCE(asid > 4094); -+ VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); - return __sme_pa(pgd) | (asid + 1); - } else { - VM_WARN_ON_ONCE(asid != 0); -@@ -90,7 +106,7 @@ static inline unsigned long build_cr3(pgd_t *pgd, u16 asid) - - static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid) - { -- VM_WARN_ON_ONCE(asid > 4094); -+ VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); - return __sme_pa(pgd) | (asid + 1) | CR3_NOFLUSH; - } - --- -2.14.2 - diff --git a/patches/kernel/0183-x86-mm-Put-MMU-to-hardware-ASID-translation-in-one-p.patch b/patches/kernel/0183-x86-mm-Put-MMU-to-hardware-ASID-translation-in-one-p.patch deleted file mode 100644 index 51ab520..0000000 --- a/patches/kernel/0183-x86-mm-Put-MMU-to-hardware-ASID-translation-in-one-p.patch +++ /dev/null @@ -1,109 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Dave Hansen -Date: Mon, 4 Dec 2017 15:07:56 +0100 -Subject: [PATCH] x86/mm: Put MMU to hardware ASID translation in one place -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -There are effectively two ASID types: - - 1. The one stored in the mmu_context that goes from 0..5 - 2. The one programmed into the hardware that goes from 1..6 - -This consolidates the locations where converting between the two (by doing -a +1) to a single place which gives us a nice place to comment. -PAGE_TABLE_ISOLATION will also need to, given an ASID, know which hardware -ASID to flush for the userspace mapping. - -Signed-off-by: Dave Hansen -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Cc: linux-mm@kvack.org -Signed-off-by: Ingo Molnar -(cherry picked from commit dd95f1a4b5ca904c78e6a097091eb21436478abb) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 6f3e88a8f41123ac339d28cfdda5da0e85bec550) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/tlbflush.h | 31 +++++++++++++++++++------------ - 1 file changed, 19 insertions(+), 12 deletions(-) - -diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h -index c1c10db4156c..ecd634f87e4e 100644 ---- a/arch/x86/include/asm/tlbflush.h -+++ b/arch/x86/include/asm/tlbflush.h -@@ -84,30 +84,37 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) - */ - #define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_ASID_BITS) - 2) - --/* -- * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID bits. -- * This serves two purposes. It prevents a nasty situation in which -- * PCID-unaware code saves CR3, loads some other value (with PCID == 0), -- * and then restores CR3, thus corrupting the TLB for ASID 0 if the saved -- * ASID was nonzero. It also means that any bugs involving loading a -- * PCID-enabled CR3 with CR4.PCIDE off will trigger deterministically. -- */ -+static inline u16 kern_pcid(u16 asid) -+{ -+ VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); -+ /* -+ * If PCID is on, ASID-aware code paths put the ASID+1 into the -+ * PCID bits. This serves two purposes. It prevents a nasty -+ * situation in which PCID-unaware code saves CR3, loads some other -+ * value (with PCID == 0), and then restores CR3, thus corrupting -+ * the TLB for ASID 0 if the saved ASID was nonzero. It also means -+ * that any bugs involving loading a PCID-enabled CR3 with -+ * CR4.PCIDE off will trigger deterministically. 
-+ */ -+ return asid + 1; -+} -+ - struct pgd_t; - static inline unsigned long build_cr3(pgd_t *pgd, u16 asid) - { - if (static_cpu_has(X86_FEATURE_PCID)) { -- VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); -- return __sme_pa(pgd) | (asid + 1); -+ return __pa(pgd) | kern_pcid(asid); - } else { - VM_WARN_ON_ONCE(asid != 0); -- return __sme_pa(pgd); -+ return __pa(pgd); - } - } - - static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid) - { - VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); -- return __sme_pa(pgd) | (asid + 1) | CR3_NOFLUSH; -+ VM_WARN_ON_ONCE(!this_cpu_has(X86_FEATURE_PCID)); -+ return __pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH; - } - - #ifdef CONFIG_PARAVIRT --- -2.14.2 - diff --git a/patches/kernel/0183-x86-mm-Remove-hard-coded-ASID-limit-checks.patch b/patches/kernel/0183-x86-mm-Remove-hard-coded-ASID-limit-checks.patch new file mode 100644 index 0000000..e6c5079 --- /dev/null +++ b/patches/kernel/0183-x86-mm-Remove-hard-coded-ASID-limit-checks.patch @@ -0,0 +1,96 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Dave Hansen +Date: Mon, 4 Dec 2017 15:07:55 +0100 +Subject: [PATCH] x86/mm: Remove hard-coded ASID limit checks +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +First, it's nice to remove the magic numbers. + +Second, PAGE_TABLE_ISOLATION is going to consume half of the available ASID +space. The space is currently unused, but add a comment to spell out this +new restriction. + +Signed-off-by: Dave Hansen +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Cc: linux-mm@kvack.org +Signed-off-by: Ingo Molnar +(cherry picked from commit cb0a9144a744e55207e24dcef812f05cd15a499a) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit fd5d001ae73ccd382d4270f53e27dcf61c4e4749) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/tlbflush.h | 20 ++++++++++++++++++-- + 1 file changed, 18 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index 3a421b164868..c1c10db4156c 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -68,6 +68,22 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) + return atomic64_inc_return(&mm->context.tlb_gen); + } + ++/* There are 12 bits of space for ASIDS in CR3 */ ++#define CR3_HW_ASID_BITS 12 ++/* ++ * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for ++ * user/kernel switches ++ */ ++#define PTI_CONSUMED_ASID_BITS 0 ++ ++#define CR3_AVAIL_ASID_BITS (CR3_HW_ASID_BITS - PTI_CONSUMED_ASID_BITS) ++/* ++ * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account ++ * for them being zero-based. Another -1 is because ASID 0 is reserved for ++ * use by non-PCID-aware users. ++ */ ++#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_ASID_BITS) - 2) ++ + /* + * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID bits. + * This serves two purposes. 
It prevents a nasty situation in which +@@ -80,7 +96,7 @@ struct pgd_t; + static inline unsigned long build_cr3(pgd_t *pgd, u16 asid) + { + if (static_cpu_has(X86_FEATURE_PCID)) { +- VM_WARN_ON_ONCE(asid > 4094); ++ VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); + return __sme_pa(pgd) | (asid + 1); + } else { + VM_WARN_ON_ONCE(asid != 0); +@@ -90,7 +106,7 @@ static inline unsigned long build_cr3(pgd_t *pgd, u16 asid) + + static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid) + { +- VM_WARN_ON_ONCE(asid > 4094); ++ VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); + return __sme_pa(pgd) | (asid + 1) | CR3_NOFLUSH; + } + +-- +2.14.2 + diff --git a/patches/kernel/0184-x86-mm-Create-asm-invpcid.h.patch b/patches/kernel/0184-x86-mm-Create-asm-invpcid.h.patch deleted file mode 100644 index 78cf317..0000000 --- a/patches/kernel/0184-x86-mm-Create-asm-invpcid.h.patch +++ /dev/null @@ -1,168 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Peter Zijlstra -Date: Tue, 5 Dec 2017 13:34:47 +0100 -Subject: [PATCH] x86/mm: Create asm/invpcid.h -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Unclutter tlbflush.h a little. - -Signed-off-by: Peter Zijlstra (Intel) -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Cc: linux-mm@kvack.org -Signed-off-by: Ingo Molnar -(cherry picked from commit 1a3b0caeb77edeac5ce5fa05e6a61c474c9a9745) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 5af02a8c43ce521f460891f6ba68af69428abe90) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/invpcid.h | 53 +++++++++++++++++++++++++++++++++++++++++ - arch/x86/include/asm/tlbflush.h | 49 +------------------------------------ - 2 files changed, 54 insertions(+), 48 deletions(-) - create mode 100644 arch/x86/include/asm/invpcid.h - -diff --git a/arch/x86/include/asm/invpcid.h b/arch/x86/include/asm/invpcid.h -new file mode 100644 -index 000000000000..989cfa86de85 ---- /dev/null -+++ b/arch/x86/include/asm/invpcid.h -@@ -0,0 +1,53 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _ASM_X86_INVPCID -+#define _ASM_X86_INVPCID -+ -+static inline void __invpcid(unsigned long pcid, unsigned long addr, -+ unsigned long type) -+{ -+ struct { u64 d[2]; } desc = { { pcid, addr } }; -+ -+ /* -+ * The memory clobber is because the whole point is to invalidate -+ * stale TLB entries and, especially if we're flushing global -+ * mappings, we don't want the compiler to reorder any subsequent -+ * memory accesses before the TLB flush. -+ * -+ * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and -+ * invpcid (%rcx), %rax in long mode. -+ */ -+ asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01" -+ : : "m" (desc), "a" (type), "c" (&desc) : "memory"); -+} -+ -+#define INVPCID_TYPE_INDIV_ADDR 0 -+#define INVPCID_TYPE_SINGLE_CTXT 1 -+#define INVPCID_TYPE_ALL_INCL_GLOBAL 2 -+#define INVPCID_TYPE_ALL_NON_GLOBAL 3 -+ -+/* Flush all mappings for a given pcid and addr, not including globals. 
*/ -+static inline void invpcid_flush_one(unsigned long pcid, -+ unsigned long addr) -+{ -+ __invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR); -+} -+ -+/* Flush all mappings for a given PCID, not including globals. */ -+static inline void invpcid_flush_single_context(unsigned long pcid) -+{ -+ __invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT); -+} -+ -+/* Flush all mappings, including globals, for all PCIDs. */ -+static inline void invpcid_flush_all(void) -+{ -+ __invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL); -+} -+ -+/* Flush all mappings for all PCIDs except globals. */ -+static inline void invpcid_flush_all_nonglobals(void) -+{ -+ __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); -+} -+ -+#endif /* _ASM_X86_INVPCID */ -diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h -index ecd634f87e4e..503f87c30c15 100644 ---- a/arch/x86/include/asm/tlbflush.h -+++ b/arch/x86/include/asm/tlbflush.h -@@ -8,54 +8,7 @@ - #include - #include - #include -- --static inline void __invpcid(unsigned long pcid, unsigned long addr, -- unsigned long type) --{ -- struct { u64 d[2]; } desc = { { pcid, addr } }; -- -- /* -- * The memory clobber is because the whole point is to invalidate -- * stale TLB entries and, especially if we're flushing global -- * mappings, we don't want the compiler to reorder any subsequent -- * memory accesses before the TLB flush. -- * -- * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and -- * invpcid (%rcx), %rax in long mode. -- */ -- asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01" -- : : "m" (desc), "a" (type), "c" (&desc) : "memory"); --} -- --#define INVPCID_TYPE_INDIV_ADDR 0 --#define INVPCID_TYPE_SINGLE_CTXT 1 --#define INVPCID_TYPE_ALL_INCL_GLOBAL 2 --#define INVPCID_TYPE_ALL_NON_GLOBAL 3 -- --/* Flush all mappings for a given pcid and addr, not including globals. 
*/ --static inline void invpcid_flush_one(unsigned long pcid, -- unsigned long addr) --{ -- __invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR); --} -- --/* Flush all mappings for a given PCID, not including globals. */ --static inline void invpcid_flush_single_context(unsigned long pcid) --{ -- __invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT); --} -- --/* Flush all mappings, including globals, for all PCIDs. */ --static inline void invpcid_flush_all(void) --{ -- __invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL); --} -- --/* Flush all mappings for all PCIDs except globals. */ --static inline void invpcid_flush_all_nonglobals(void) --{ -- __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); --} -+#include - - static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) - { --- -2.14.2 - diff --git a/patches/kernel/0184-x86-mm-Put-MMU-to-hardware-ASID-translation-in-one-p.patch b/patches/kernel/0184-x86-mm-Put-MMU-to-hardware-ASID-translation-in-one-p.patch new file mode 100644 index 0000000..51ab520 --- /dev/null +++ b/patches/kernel/0184-x86-mm-Put-MMU-to-hardware-ASID-translation-in-one-p.patch @@ -0,0 +1,109 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Dave Hansen +Date: Mon, 4 Dec 2017 15:07:56 +0100 +Subject: [PATCH] x86/mm: Put MMU to hardware ASID translation in one place +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +There are effectively two ASID types: + + 1. The one stored in the mmu_context that goes from 0..5 + 2. The one programmed into the hardware that goes from 1..6 + +This consolidates the locations where converting between the two (by doing +a +1) to a single place which gives us a nice place to comment. +PAGE_TABLE_ISOLATION will also need to, given an ASID, know which hardware +ASID to flush for the userspace mapping. 
+ +Signed-off-by: Dave Hansen +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Cc: linux-mm@kvack.org +Signed-off-by: Ingo Molnar +(cherry picked from commit dd95f1a4b5ca904c78e6a097091eb21436478abb) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 6f3e88a8f41123ac339d28cfdda5da0e85bec550) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/tlbflush.h | 31 +++++++++++++++++++------------ + 1 file changed, 19 insertions(+), 12 deletions(-) + +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index c1c10db4156c..ecd634f87e4e 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -84,30 +84,37 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) + */ + #define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_ASID_BITS) - 2) + +-/* +- * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID bits. +- * This serves two purposes. It prevents a nasty situation in which +- * PCID-unaware code saves CR3, loads some other value (with PCID == 0), +- * and then restores CR3, thus corrupting the TLB for ASID 0 if the saved +- * ASID was nonzero. It also means that any bugs involving loading a +- * PCID-enabled CR3 with CR4.PCIDE off will trigger deterministically. +- */ ++static inline u16 kern_pcid(u16 asid) ++{ ++ VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); ++ /* ++ * If PCID is on, ASID-aware code paths put the ASID+1 into the ++ * PCID bits. This serves two purposes. 
It prevents a nasty ++ * situation in which PCID-unaware code saves CR3, loads some other ++ * value (with PCID == 0), and then restores CR3, thus corrupting ++ * the TLB for ASID 0 if the saved ASID was nonzero. It also means ++ * that any bugs involving loading a PCID-enabled CR3 with ++ * CR4.PCIDE off will trigger deterministically. ++ */ ++ return asid + 1; ++} ++ + struct pgd_t; + static inline unsigned long build_cr3(pgd_t *pgd, u16 asid) + { + if (static_cpu_has(X86_FEATURE_PCID)) { +- VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); +- return __sme_pa(pgd) | (asid + 1); ++ return __pa(pgd) | kern_pcid(asid); + } else { + VM_WARN_ON_ONCE(asid != 0); +- return __sme_pa(pgd); ++ return __pa(pgd); + } + } + + static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid) + { + VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); +- return __sme_pa(pgd) | (asid + 1) | CR3_NOFLUSH; ++ VM_WARN_ON_ONCE(!this_cpu_has(X86_FEATURE_PCID)); ++ return __pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH; + } + + #ifdef CONFIG_PARAVIRT +-- +2.14.2 + diff --git a/patches/kernel/0185-x86-cpu_entry_area-Move-it-to-a-separate-unit.patch b/patches/kernel/0185-x86-cpu_entry_area-Move-it-to-a-separate-unit.patch deleted file mode 100644 index 84983a3..0000000 --- a/patches/kernel/0185-x86-cpu_entry_area-Move-it-to-a-separate-unit.patch +++ /dev/null @@ -1,400 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Wed, 20 Dec 2017 18:28:54 +0100 -Subject: [PATCH] x86/cpu_entry_area: Move it to a separate unit -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Separate the cpu_entry_area code out of cpu/common.c and the fixmap. - -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Borislav Petkov -Cc: Dave Hansen -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Signed-off-by: Ingo Molnar -(cherry picked from commit ed1bbc40a0d10e0c5c74fe7bdc6298295cf40255) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 0fa11d2cd3d67af676aa2762ade282ba6d09cbe5) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/mm/Makefile | 2 +- - arch/x86/include/asm/cpu_entry_area.h | 52 +++++++++++++++++ - arch/x86/include/asm/fixmap.h | 41 +------------- - arch/x86/kernel/cpu/common.c | 94 ------------------------------ - arch/x86/kernel/traps.c | 1 + - arch/x86/mm/cpu_entry_area.c | 104 ++++++++++++++++++++++++++++++++++ - 6 files changed, 159 insertions(+), 135 deletions(-) - create mode 100644 arch/x86/include/asm/cpu_entry_area.h - create mode 100644 arch/x86/mm/cpu_entry_area.c - -diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile -index 0fbdcb64f9f8..76f5399a8356 100644 ---- a/arch/x86/mm/Makefile -+++ b/arch/x86/mm/Makefile -@@ -2,7 +2,7 @@ - KCOV_INSTRUMENT_tlb.o := n - - obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ -- pat.o pgtable.o physaddr.o setup_nx.o tlb.o -+ pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o - - # Make sure __phys_addr has no stackprotector - nostackp := $(call cc-option, -fno-stack-protector) -diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h -new file mode 100644 -index 000000000000..5471826803af ---- /dev/null -+++ b/arch/x86/include/asm/cpu_entry_area.h -@@ -0,0 +1,52 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#ifndef _ASM_X86_CPU_ENTRY_AREA_H -+#define _ASM_X86_CPU_ENTRY_AREA_H -+ -+#include -+#include -+ -+/* -+ * cpu_entry_area is a percpu region that contains things needed by the CPU -+ * and early entry/exit code. Real types aren't used for all fields here -+ * to avoid circular header dependencies. 
-+ * -+ * Every field is a virtual alias of some other allocated backing store. -+ * There is no direct allocation of a struct cpu_entry_area. -+ */ -+struct cpu_entry_area { -+ char gdt[PAGE_SIZE]; -+ -+ /* -+ * The GDT is just below entry_stack and thus serves (on x86_64) as -+ * a a read-only guard page. -+ */ -+ struct entry_stack_page entry_stack_page; -+ -+ /* -+ * On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because -+ * we need task switches to work, and task switches write to the TSS. -+ */ -+ struct tss_struct tss; -+ -+ char entry_trampoline[PAGE_SIZE]; -+ -+#ifdef CONFIG_X86_64 -+ /* -+ * Exception stacks used for IST entries. -+ * -+ * In the future, this should have a separate slot for each stack -+ * with guard pages between them. -+ */ -+ char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]; -+#endif -+}; -+ -+#define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area)) -+#define CPU_ENTRY_AREA_PAGES (CPU_ENTRY_AREA_SIZE / PAGE_SIZE) -+ -+DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); -+ -+extern void setup_cpu_entry_areas(void); -+ -+#endif -diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h -index a7fb137ad964..1b2521473480 100644 ---- a/arch/x86/include/asm/fixmap.h -+++ b/arch/x86/include/asm/fixmap.h -@@ -25,6 +25,7 @@ - #else - #include - #endif -+#include - - /* - * We can't declare FIXADDR_TOP as variable for x86_64 because vsyscall -@@ -44,46 +45,6 @@ extern unsigned long __FIXADDR_TOP; - PAGE_SIZE) - #endif - --/* -- * cpu_entry_area is a percpu region in the fixmap that contains things -- * needed by the CPU and early entry/exit code. Real types aren't used -- * for all fields here to avoid circular header dependencies. -- * -- * Every field is a virtual alias of some other allocated backing store. -- * There is no direct allocation of a struct cpu_entry_area. 
-- */ --struct cpu_entry_area { -- char gdt[PAGE_SIZE]; -- -- /* -- * The GDT is just below entry_stack and thus serves (on x86_64) as -- * a a read-only guard page. -- */ -- struct entry_stack_page entry_stack_page; -- -- /* -- * On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because -- * we need task switches to work, and task switches write to the TSS. -- */ -- struct tss_struct tss; -- -- char entry_trampoline[PAGE_SIZE]; -- --#ifdef CONFIG_X86_64 -- /* -- * Exception stacks used for IST entries. -- * -- * In the future, this should have a separate slot for each stack -- * with guard pages between them. -- */ -- char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]; --#endif --}; -- --#define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE) -- --extern void setup_cpu_entry_areas(void); -- - /* - * Here we define all the compile-time 'special' virtual - * addresses. The point is to have a constant address at -diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c -index 7a8a5d436566..96171ce46d61 100644 ---- a/arch/x86/kernel/cpu/common.c -+++ b/arch/x86/kernel/cpu/common.c -@@ -482,102 +482,8 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { - [0 ... 
N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, - [DEBUG_STACK - 1] = DEBUG_STKSZ - }; -- --static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks -- [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); --#endif -- --static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, -- entry_stack_storage); -- --static void __init --set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot) --{ -- for ( ; pages; pages--, idx--, ptr += PAGE_SIZE) -- __set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot); --} -- --/* Setup the fixmap mappings only once per-processor */ --static void __init setup_cpu_entry_area(int cpu) --{ --#ifdef CONFIG_X86_64 -- extern char _entry_trampoline[]; -- -- /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */ -- pgprot_t gdt_prot = PAGE_KERNEL_RO; -- pgprot_t tss_prot = PAGE_KERNEL_RO; --#else -- /* -- * On native 32-bit systems, the GDT cannot be read-only because -- * our double fault handler uses a task gate, and entering through -- * a task gate needs to change an available TSS to busy. If the -- * GDT is read-only, that will triple fault. The TSS cannot be -- * read-only because the CPU writes to it on task switches. -- * -- * On Xen PV, the GDT must be read-only because the hypervisor -- * requires it. -- */ -- pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ? -- PAGE_KERNEL_RO : PAGE_KERNEL; -- pgprot_t tss_prot = PAGE_KERNEL; --#endif -- -- __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot); -- set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, entry_stack_page), -- per_cpu_ptr(&entry_stack_storage, cpu), 1, -- PAGE_KERNEL); -- -- /* -- * The Intel SDM says (Volume 3, 7.2.1): -- * -- * Avoid placing a page boundary in the part of the TSS that the -- * processor reads during a task switch (the first 104 bytes). The -- * processor may not correctly perform address translations if a -- * boundary occurs in this area. 
During a task switch, the processor -- * reads and writes into the first 104 bytes of each TSS (using -- * contiguous physical addresses beginning with the physical address -- * of the first byte of the TSS). So, after TSS access begins, if -- * part of the 104 bytes is not physically contiguous, the processor -- * will access incorrect information without generating a page-fault -- * exception. -- * -- * There are also a lot of errata involving the TSS spanning a page -- * boundary. Assert that we're not doing that. -- */ -- BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ -- offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); -- BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); -- set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss), -- &per_cpu(cpu_tss_rw, cpu), -- sizeof(struct tss_struct) / PAGE_SIZE, -- tss_prot); -- --#ifdef CONFIG_X86_32 -- per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); - #endif - --#ifdef CONFIG_X86_64 -- BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); -- BUILD_BUG_ON(sizeof(exception_stacks) != -- sizeof(((struct cpu_entry_area *)0)->exception_stacks)); -- set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks), -- &per_cpu(exception_stacks, cpu), -- sizeof(exception_stacks) / PAGE_SIZE, -- PAGE_KERNEL); -- -- __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline), -- __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); --#endif --} -- --void __init setup_cpu_entry_areas(void) --{ -- unsigned int cpu; -- -- for_each_possible_cpu(cpu) -- setup_cpu_entry_area(cpu); --} -- - /* Load the original GDT from the per-cpu structure */ - void load_direct_gdt(int cpu) - { -diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c -index 14b462eefa17..ef2d1b8a0516 100644 ---- a/arch/x86/kernel/traps.c -+++ b/arch/x86/kernel/traps.c -@@ -57,6 +57,7 @@ - #include - #include - #include -+#include - #include - #include - #include -diff --git a/arch/x86/mm/cpu_entry_area.c 
b/arch/x86/mm/cpu_entry_area.c -new file mode 100644 -index 000000000000..235ff9cfaaf4 ---- /dev/null -+++ b/arch/x86/mm/cpu_entry_area.c -@@ -0,0 +1,104 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include -+#include -+ -+#include -+#include -+#include -+#include -+ -+static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage); -+ -+#ifdef CONFIG_X86_64 -+static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks -+ [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); -+#endif -+ -+static void __init -+set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot) -+{ -+ for ( ; pages; pages--, idx--, ptr += PAGE_SIZE) -+ __set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot); -+} -+ -+/* Setup the fixmap mappings only once per-processor */ -+static void __init setup_cpu_entry_area(int cpu) -+{ -+#ifdef CONFIG_X86_64 -+ extern char _entry_trampoline[]; -+ -+ /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */ -+ pgprot_t gdt_prot = PAGE_KERNEL_RO; -+ pgprot_t tss_prot = PAGE_KERNEL_RO; -+#else -+ /* -+ * On native 32-bit systems, the GDT cannot be read-only because -+ * our double fault handler uses a task gate, and entering through -+ * a task gate needs to change an available TSS to busy. If the -+ * GDT is read-only, that will triple fault. The TSS cannot be -+ * read-only because the CPU writes to it on task switches. -+ * -+ * On Xen PV, the GDT must be read-only because the hypervisor -+ * requires it. -+ */ -+ pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ? 
-+ PAGE_KERNEL_RO : PAGE_KERNEL; -+ pgprot_t tss_prot = PAGE_KERNEL; -+#endif -+ -+ __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot); -+ set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, entry_stack_page), -+ per_cpu_ptr(&entry_stack_storage, cpu), 1, -+ PAGE_KERNEL); -+ -+ /* -+ * The Intel SDM says (Volume 3, 7.2.1): -+ * -+ * Avoid placing a page boundary in the part of the TSS that the -+ * processor reads during a task switch (the first 104 bytes). The -+ * processor may not correctly perform address translations if a -+ * boundary occurs in this area. During a task switch, the processor -+ * reads and writes into the first 104 bytes of each TSS (using -+ * contiguous physical addresses beginning with the physical address -+ * of the first byte of the TSS). So, after TSS access begins, if -+ * part of the 104 bytes is not physically contiguous, the processor -+ * will access incorrect information without generating a page-fault -+ * exception. -+ * -+ * There are also a lot of errata involving the TSS spanning a page -+ * boundary. Assert that we're not doing that. 
-+ */ -+ BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ -+ offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); -+ BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); -+ set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss), -+ &per_cpu(cpu_tss_rw, cpu), -+ sizeof(struct tss_struct) / PAGE_SIZE, -+ tss_prot); -+ -+#ifdef CONFIG_X86_32 -+ per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); -+#endif -+ -+#ifdef CONFIG_X86_64 -+ BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); -+ BUILD_BUG_ON(sizeof(exception_stacks) != -+ sizeof(((struct cpu_entry_area *)0)->exception_stacks)); -+ set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks), -+ &per_cpu(exception_stacks, cpu), -+ sizeof(exception_stacks) / PAGE_SIZE, -+ PAGE_KERNEL); -+ -+ __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline), -+ __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); -+#endif -+} -+ -+void __init setup_cpu_entry_areas(void) -+{ -+ unsigned int cpu; -+ -+ for_each_possible_cpu(cpu) -+ setup_cpu_entry_area(cpu); -+} --- -2.14.2 - diff --git a/patches/kernel/0185-x86-mm-Create-asm-invpcid.h.patch b/patches/kernel/0185-x86-mm-Create-asm-invpcid.h.patch new file mode 100644 index 0000000..78cf317 --- /dev/null +++ b/patches/kernel/0185-x86-mm-Create-asm-invpcid.h.patch @@ -0,0 +1,168 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra +Date: Tue, 5 Dec 2017 13:34:47 +0100 +Subject: [PATCH] x86/mm: Create asm/invpcid.h +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Unclutter tlbflush.h a little. + +Signed-off-by: Peter Zijlstra (Intel) +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Cc: linux-mm@kvack.org +Signed-off-by: Ingo Molnar +(cherry picked from commit 1a3b0caeb77edeac5ce5fa05e6a61c474c9a9745) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 5af02a8c43ce521f460891f6ba68af69428abe90) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/invpcid.h | 53 +++++++++++++++++++++++++++++++++++++++++ + arch/x86/include/asm/tlbflush.h | 49 +------------------------------------ + 2 files changed, 54 insertions(+), 48 deletions(-) + create mode 100644 arch/x86/include/asm/invpcid.h + +diff --git a/arch/x86/include/asm/invpcid.h b/arch/x86/include/asm/invpcid.h +new file mode 100644 +index 000000000000..989cfa86de85 +--- /dev/null ++++ b/arch/x86/include/asm/invpcid.h +@@ -0,0 +1,53 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _ASM_X86_INVPCID ++#define _ASM_X86_INVPCID ++ ++static inline void __invpcid(unsigned long pcid, unsigned long addr, ++ unsigned long type) ++{ ++ struct { u64 d[2]; } desc = { { pcid, addr } }; ++ ++ /* ++ * The memory clobber is because the whole point is to invalidate ++ * stale TLB entries and, especially if we're flushing global ++ * mappings, we don't want the compiler to reorder any subsequent ++ * memory accesses before the TLB flush. ++ * ++ * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and ++ * invpcid (%rcx), %rax in long mode. ++ */ ++ asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01" ++ : : "m" (desc), "a" (type), "c" (&desc) : "memory"); ++} ++ ++#define INVPCID_TYPE_INDIV_ADDR 0 ++#define INVPCID_TYPE_SINGLE_CTXT 1 ++#define INVPCID_TYPE_ALL_INCL_GLOBAL 2 ++#define INVPCID_TYPE_ALL_NON_GLOBAL 3 ++ ++/* Flush all mappings for a given pcid and addr, not including globals. 
*/ ++static inline void invpcid_flush_one(unsigned long pcid, ++ unsigned long addr) ++{ ++ __invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR); ++} ++ ++/* Flush all mappings for a given PCID, not including globals. */ ++static inline void invpcid_flush_single_context(unsigned long pcid) ++{ ++ __invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT); ++} ++ ++/* Flush all mappings, including globals, for all PCIDs. */ ++static inline void invpcid_flush_all(void) ++{ ++ __invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL); ++} ++ ++/* Flush all mappings for all PCIDs except globals. */ ++static inline void invpcid_flush_all_nonglobals(void) ++{ ++ __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); ++} ++ ++#endif /* _ASM_X86_INVPCID */ +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index ecd634f87e4e..503f87c30c15 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -8,54 +8,7 @@ + #include + #include + #include +- +-static inline void __invpcid(unsigned long pcid, unsigned long addr, +- unsigned long type) +-{ +- struct { u64 d[2]; } desc = { { pcid, addr } }; +- +- /* +- * The memory clobber is because the whole point is to invalidate +- * stale TLB entries and, especially if we're flushing global +- * mappings, we don't want the compiler to reorder any subsequent +- * memory accesses before the TLB flush. +- * +- * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and +- * invpcid (%rcx), %rax in long mode. +- */ +- asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01" +- : : "m" (desc), "a" (type), "c" (&desc) : "memory"); +-} +- +-#define INVPCID_TYPE_INDIV_ADDR 0 +-#define INVPCID_TYPE_SINGLE_CTXT 1 +-#define INVPCID_TYPE_ALL_INCL_GLOBAL 2 +-#define INVPCID_TYPE_ALL_NON_GLOBAL 3 +- +-/* Flush all mappings for a given pcid and addr, not including globals. 
*/ +-static inline void invpcid_flush_one(unsigned long pcid, +- unsigned long addr) +-{ +- __invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR); +-} +- +-/* Flush all mappings for a given PCID, not including globals. */ +-static inline void invpcid_flush_single_context(unsigned long pcid) +-{ +- __invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT); +-} +- +-/* Flush all mappings, including globals, for all PCIDs. */ +-static inline void invpcid_flush_all(void) +-{ +- __invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL); +-} +- +-/* Flush all mappings for all PCIDs except globals. */ +-static inline void invpcid_flush_all_nonglobals(void) +-{ +- __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); +-} ++#include + + static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) + { +-- +2.14.2 + diff --git a/patches/kernel/0186-x86-cpu_entry_area-Move-it-out-of-the-fixmap.patch b/patches/kernel/0186-x86-cpu_entry_area-Move-it-out-of-the-fixmap.patch deleted file mode 100644 index 726fe13..0000000 --- a/patches/kernel/0186-x86-cpu_entry_area-Move-it-out-of-the-fixmap.patch +++ /dev/null @@ -1,588 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Wed, 20 Dec 2017 18:51:31 +0100 -Subject: [PATCH] x86/cpu_entry_area: Move it out of the fixmap -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Put the cpu_entry_area into a separate P4D entry. The fixmap gets too big -and 0-day already hit a case where the fixmap PTEs were cleared by -cleanup_highmap(). - -Aside of that the fixmap API is a pain as it's all backwards. - -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Borislav Petkov -Cc: Dave Hansen -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: linux-kernel@vger.kernel.org -Signed-off-by: Ingo Molnar -(backported from commit 92a0f81d89571e3e8759366e050ee05cc545ef99) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit bda9eb328d9ce3757f22794f79da73dd5886c93a) -Signed-off-by: Fabian Grünbichler ---- - Documentation/x86/x86_64/mm.txt | 2 + - arch/x86/include/asm/cpu_entry_area.h | 18 ++++++++- - arch/x86/include/asm/desc.h | 2 + - arch/x86/include/asm/fixmap.h | 32 +--------------- - arch/x86/include/asm/pgtable_32_types.h | 15 ++++++-- - arch/x86/include/asm/pgtable_64_types.h | 47 +++++++++++++---------- - arch/x86/kernel/dumpstack.c | 1 + - arch/x86/kernel/traps.c | 5 ++- - arch/x86/mm/cpu_entry_area.c | 66 +++++++++++++++++++++++++-------- - arch/x86/mm/dump_pagetables.c | 6 ++- - arch/x86/mm/init_32.c | 6 +++ - arch/x86/mm/kasan_init_64.c | 30 ++++++++------- - arch/x86/mm/pgtable_32.c | 1 + - arch/x86/xen/mmu_pv.c | 2 - - 14 files changed, 145 insertions(+), 88 deletions(-) - -diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt -index 63a41671d25b..51101708a03a 100644 ---- a/Documentation/x86/x86_64/mm.txt -+++ b/Documentation/x86/x86_64/mm.txt -@@ -12,6 +12,7 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB) - ... unused hole ... - ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB) - ... unused hole ... -+fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping - ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks - ... unused hole ... - ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space -@@ -35,6 +36,7 @@ ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB) - ... unused hole ... - ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB) - ... unused hole ... 
-+fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping - ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks - ... unused hole ... - ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space -diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h -index 5471826803af..2fbc69a0916e 100644 ---- a/arch/x86/include/asm/cpu_entry_area.h -+++ b/arch/x86/include/asm/cpu_entry_area.h -@@ -43,10 +43,26 @@ struct cpu_entry_area { - }; - - #define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area)) --#define CPU_ENTRY_AREA_PAGES (CPU_ENTRY_AREA_SIZE / PAGE_SIZE) -+#define CPU_ENTRY_AREA_TOT_SIZE (CPU_ENTRY_AREA_SIZE * NR_CPUS) - - DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); - - extern void setup_cpu_entry_areas(void); -+extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags); -+ -+#define CPU_ENTRY_AREA_RO_IDT CPU_ENTRY_AREA_BASE -+#define CPU_ENTRY_AREA_PER_CPU (CPU_ENTRY_AREA_RO_IDT + PAGE_SIZE) -+ -+#define CPU_ENTRY_AREA_RO_IDT_VADDR ((void *)CPU_ENTRY_AREA_RO_IDT) -+ -+#define CPU_ENTRY_AREA_MAP_SIZE \ -+ (CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_TOT_SIZE - CPU_ENTRY_AREA_BASE) -+ -+extern struct cpu_entry_area *get_cpu_entry_area(int cpu); -+ -+static inline struct entry_stack *cpu_entry_stack(int cpu) -+{ -+ return &get_cpu_entry_area(cpu)->entry_stack_page.stack; -+} - - #endif -diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h -index b817fe247506..de40c514ba25 100644 ---- a/arch/x86/include/asm/desc.h -+++ b/arch/x86/include/asm/desc.h -@@ -5,6 +5,8 @@ - #include - #include - #include -+#include -+#include - - #include - #include -diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h -index 1b2521473480..a6ff9e1a6189 100644 ---- a/arch/x86/include/asm/fixmap.h -+++ b/arch/x86/include/asm/fixmap.h -@@ -25,7 +25,6 @@ - #else - #include - #endif --#include - - /* - * We can't declare FIXADDR_TOP as variable for x86_64 
because vsyscall -@@ -84,7 +83,6 @@ enum fixed_addresses { - FIX_IO_APIC_BASE_0, - FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1, - #endif -- FIX_RO_IDT, /* Virtual mapping for read-only IDT */ - #ifdef CONFIG_X86_32 - FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ - FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, -@@ -100,9 +98,6 @@ enum fixed_addresses { - #ifdef CONFIG_X86_INTEL_MID - FIX_LNW_VRTC, - #endif -- /* Fixmap entries to remap the GDTs, one per processor. */ -- FIX_CPU_ENTRY_AREA_TOP, -- FIX_CPU_ENTRY_AREA_BOTTOM = FIX_CPU_ENTRY_AREA_TOP + (CPU_ENTRY_AREA_PAGES * NR_CPUS) - 1, - - #ifdef CONFIG_ACPI_APEI_GHES - /* Used for GHES mapping from assorted contexts */ -@@ -143,7 +138,7 @@ enum fixed_addresses { - extern void reserve_top_address(unsigned long reserve); - - #define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT) --#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) -+#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) - - extern int fixmaps_set; - -@@ -171,30 +166,5 @@ static inline void __set_fixmap(enum fixed_addresses idx, - void __early_set_fixmap(enum fixed_addresses idx, - phys_addr_t phys, pgprot_t flags); - --static inline unsigned int __get_cpu_entry_area_page_index(int cpu, int page) --{ -- BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0); -- -- return FIX_CPU_ENTRY_AREA_BOTTOM - cpu*CPU_ENTRY_AREA_PAGES - page; --} -- --#define __get_cpu_entry_area_offset_index(cpu, offset) ({ \ -- BUILD_BUG_ON(offset % PAGE_SIZE != 0); \ -- __get_cpu_entry_area_page_index(cpu, offset / PAGE_SIZE); \ -- }) -- --#define get_cpu_entry_area_index(cpu, field) \ -- __get_cpu_entry_area_offset_index((cpu), offsetof(struct cpu_entry_area, field)) -- --static inline struct cpu_entry_area *get_cpu_entry_area(int cpu) --{ -- return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0)); --} -- --static inline struct entry_stack *cpu_entry_stack(int cpu) --{ -- return 
&get_cpu_entry_area(cpu)->entry_stack_page.stack; --} -- - #endif /* !__ASSEMBLY__ */ - #endif /* _ASM_X86_FIXMAP_H */ -diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h -index 9fb2f2bc8245..67b60e11b70d 100644 ---- a/arch/x86/include/asm/pgtable_32_types.h -+++ b/arch/x86/include/asm/pgtable_32_types.h -@@ -37,13 +37,22 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */ - #define LAST_PKMAP 1024 - #endif - --#define PKMAP_BASE ((FIXADDR_START - PAGE_SIZE * (LAST_PKMAP + 1)) \ -- & PMD_MASK) -+/* -+ * Define this here and validate with BUILD_BUG_ON() in pgtable_32.c -+ * to avoid include recursion hell -+ */ -+#define CPU_ENTRY_AREA_PAGES (NR_CPUS * 40) -+ -+#define CPU_ENTRY_AREA_BASE \ -+ ((FIXADDR_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) & PMD_MASK) -+ -+#define PKMAP_BASE \ -+ ((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK) - - #ifdef CONFIG_HIGHMEM - # define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE) - #else --# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE) -+# define VMALLOC_END (CPU_ENTRY_AREA_BASE - 2 * PAGE_SIZE) - #endif - - #define MODULES_VADDR VMALLOC_START -diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h -index 06470da156ba..42e2750da525 100644 ---- a/arch/x86/include/asm/pgtable_64_types.h -+++ b/arch/x86/include/asm/pgtable_64_types.h -@@ -75,32 +75,41 @@ typedef struct { pteval_t pte; } pte_t; - #define PGDIR_MASK (~(PGDIR_SIZE - 1)) - - /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. 
*/ --#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) -+#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) -+ - #ifdef CONFIG_X86_5LEVEL --#define VMALLOC_SIZE_TB _AC(16384, UL) --#define __VMALLOC_BASE _AC(0xff92000000000000, UL) --#define __VMEMMAP_BASE _AC(0xffd4000000000000, UL) -+# define VMALLOC_SIZE_TB _AC(16384, UL) -+# define __VMALLOC_BASE _AC(0xff92000000000000, UL) -+# define __VMEMMAP_BASE _AC(0xffd4000000000000, UL) - #else --#define VMALLOC_SIZE_TB _AC(32, UL) --#define __VMALLOC_BASE _AC(0xffffc90000000000, UL) --#define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) -+# define VMALLOC_SIZE_TB _AC(32, UL) -+# define __VMALLOC_BASE _AC(0xffffc90000000000, UL) -+# define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) - #endif -+ - #ifdef CONFIG_RANDOMIZE_MEMORY --#define VMALLOC_START vmalloc_base --#define VMEMMAP_START vmemmap_base -+# define VMALLOC_START vmalloc_base -+# define VMEMMAP_START vmemmap_base - #else --#define VMALLOC_START __VMALLOC_BASE --#define VMEMMAP_START __VMEMMAP_BASE -+# define VMALLOC_START __VMALLOC_BASE -+# define VMEMMAP_START __VMEMMAP_BASE - #endif /* CONFIG_RANDOMIZE_MEMORY */ --#define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL)) --#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) -+ -+#define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL)) -+ -+#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) - /* The module sections ends with the start of the fixmap */ --#define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1) --#define MODULES_LEN (MODULES_END - MODULES_VADDR) --#define ESPFIX_PGD_ENTRY _AC(-2, UL) --#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT) --#define EFI_VA_START ( -4 * (_AC(1, UL) << 30)) --#define EFI_VA_END (-68 * (_AC(1, UL) << 30)) -+#define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1) -+#define MODULES_LEN (MODULES_END - MODULES_VADDR) -+ -+#define ESPFIX_PGD_ENTRY _AC(-2, UL) -+#define ESPFIX_BASE_ADDR 
(ESPFIX_PGD_ENTRY << P4D_SHIFT) -+ -+#define CPU_ENTRY_AREA_PGD _AC(-3, UL) -+#define CPU_ENTRY_AREA_BASE (CPU_ENTRY_AREA_PGD << P4D_SHIFT) -+ -+#define EFI_VA_START ( -4 * (_AC(1, UL) << 30)) -+#define EFI_VA_END (-68 * (_AC(1, UL) << 30)) - - #define EARLY_DYNAMIC_PAGE_TABLES 64 - -diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c -index 55bf1c3b5319..2bdeb983b9d8 100644 ---- a/arch/x86/kernel/dumpstack.c -+++ b/arch/x86/kernel/dumpstack.c -@@ -18,6 +18,7 @@ - #include - #include - -+#include - #include - #include - -diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c -index ef2d1b8a0516..5808ccb59266 100644 ---- a/arch/x86/kernel/traps.c -+++ b/arch/x86/kernel/traps.c -@@ -1041,8 +1041,9 @@ void __init trap_init(void) - * "sidt" instruction will not leak the location of the kernel, and - * to defend the IDT against arbitrary memory write vulnerabilities. - * It will be reloaded in cpu_init() */ -- __set_fixmap(FIX_RO_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO); -- idt_descr.address = fix_to_virt(FIX_RO_IDT); -+ cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table), -+ PAGE_KERNEL_RO); -+ idt_descr.address = CPU_ENTRY_AREA_RO_IDT; - - /* - * Should be a barrier for any external CPU state: -diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c -index 235ff9cfaaf4..21e8b595cbb1 100644 ---- a/arch/x86/mm/cpu_entry_area.c -+++ b/arch/x86/mm/cpu_entry_area.c -@@ -15,11 +15,27 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks - [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); - #endif - -+struct cpu_entry_area *get_cpu_entry_area(int cpu) -+{ -+ unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE; -+ BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0); -+ -+ return (struct cpu_entry_area *) va; -+} -+EXPORT_SYMBOL(get_cpu_entry_area); -+ -+void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags) -+{ -+ unsigned long va = (unsigned long) cea_vaddr; -+ 
-+ set_pte_vaddr(va, pfn_pte(pa >> PAGE_SHIFT, flags)); -+} -+ - static void __init --set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot) -+cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot) - { -- for ( ; pages; pages--, idx--, ptr += PAGE_SIZE) -- __set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot); -+ for ( ; pages; pages--, cea_vaddr+= PAGE_SIZE, ptr += PAGE_SIZE) -+ cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot); - } - - /* Setup the fixmap mappings only once per-processor */ -@@ -47,10 +63,12 @@ static void __init setup_cpu_entry_area(int cpu) - pgprot_t tss_prot = PAGE_KERNEL; - #endif - -- __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot); -- set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, entry_stack_page), -- per_cpu_ptr(&entry_stack_storage, cpu), 1, -- PAGE_KERNEL); -+ cea_set_pte(&get_cpu_entry_area(cpu)->gdt, get_cpu_gdt_paddr(cpu), -+ gdt_prot); -+ -+ cea_map_percpu_pages(&get_cpu_entry_area(cpu)->entry_stack_page, -+ per_cpu_ptr(&entry_stack_storage, cpu), 1, -+ PAGE_KERNEL); - - /* - * The Intel SDM says (Volume 3, 7.2.1): -@@ -72,10 +90,9 @@ static void __init setup_cpu_entry_area(int cpu) - BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ - offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); - BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); -- set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss), -- &per_cpu(cpu_tss_rw, cpu), -- sizeof(struct tss_struct) / PAGE_SIZE, -- tss_prot); -+ cea_map_percpu_pages(&get_cpu_entry_area(cpu)->tss, -+ &per_cpu(cpu_tss_rw, cpu), -+ sizeof(struct tss_struct) / PAGE_SIZE, tss_prot); - - #ifdef CONFIG_X86_32 - per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); -@@ -85,20 +102,37 @@ static void __init setup_cpu_entry_area(int cpu) - BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); - BUILD_BUG_ON(sizeof(exception_stacks) != - sizeof(((struct cpu_entry_area *)0)->exception_stacks)); -- 
set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks), -- &per_cpu(exception_stacks, cpu), -- sizeof(exception_stacks) / PAGE_SIZE, -- PAGE_KERNEL); -+ cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks, -+ &per_cpu(exception_stacks, cpu), -+ sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL); - -- __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline), -+ cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline, - __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); - #endif - } - -+static __init void setup_cpu_entry_area_ptes(void) -+{ -+#ifdef CONFIG_X86_32 -+ unsigned long start, end; -+ -+ BUILD_BUG_ON(CPU_ENTRY_AREA_PAGES * PAGE_SIZE < CPU_ENTRY_AREA_MAP_SIZE); -+ BUG_ON(CPU_ENTRY_AREA_BASE & ~PMD_MASK); -+ -+ start = CPU_ENTRY_AREA_BASE; -+ end = start + CPU_ENTRY_AREA_MAP_SIZE; -+ -+ for (; start < end; start += PMD_SIZE) -+ populate_extra_pte(start); -+#endif -+} -+ - void __init setup_cpu_entry_areas(void) - { - unsigned int cpu; - -+ setup_cpu_entry_area_ptes(); -+ - for_each_possible_cpu(cpu) - setup_cpu_entry_area(cpu); - } -diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c -index 318a7c30e87e..3b7720404a9f 100644 ---- a/arch/x86/mm/dump_pagetables.c -+++ b/arch/x86/mm/dump_pagetables.c -@@ -58,6 +58,7 @@ enum address_markers_idx { - KASAN_SHADOW_START_NR, - KASAN_SHADOW_END_NR, - #endif -+ CPU_ENTRY_AREA_NR, - #ifdef CONFIG_X86_ESPFIX64 - ESPFIX_START_NR, - #endif -@@ -81,6 +82,7 @@ static struct addr_marker address_markers[] = { - [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" }, - [KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" }, - #endif -+ [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" }, - #ifdef CONFIG_X86_ESPFIX64 - [ESPFIX_START_NR] = { ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, - #endif -@@ -104,6 +106,7 @@ enum address_markers_idx { - #ifdef CONFIG_HIGHMEM - PKMAP_BASE_NR, - #endif -+ CPU_ENTRY_AREA_NR, - FIXADDR_START_NR, - END_OF_SPACE_NR, - 
}; -@@ -116,6 +119,7 @@ static struct addr_marker address_markers[] = { - #ifdef CONFIG_HIGHMEM - [PKMAP_BASE_NR] = { 0UL, "Persistent kmap() Area" }, - #endif -+ [CPU_ENTRY_AREA_NR] = { 0UL, "CPU entry area" }, - [FIXADDR_START_NR] = { 0UL, "Fixmap area" }, - [END_OF_SPACE_NR] = { -1, NULL } - }; -@@ -522,8 +526,8 @@ static int __init pt_dump_init(void) - address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE; - # endif - address_markers[FIXADDR_START_NR].start_address = FIXADDR_START; -+ address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE; - #endif -- - return 0; - } - __initcall(pt_dump_init); -diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c -index 8a64a6f2848d..135c9a7898c7 100644 ---- a/arch/x86/mm/init_32.c -+++ b/arch/x86/mm/init_32.c -@@ -50,6 +50,7 @@ - #include - #include - #include -+#include - #include - - #include "mm_internal.h" -@@ -766,6 +767,7 @@ void __init mem_init(void) - mem_init_print_info(NULL); - printk(KERN_INFO "virtual kernel memory layout:\n" - " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" -+ " cpu_entry : 0x%08lx - 0x%08lx (%4ld kB)\n" - #ifdef CONFIG_HIGHMEM - " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n" - #endif -@@ -777,6 +779,10 @@ void __init mem_init(void) - FIXADDR_START, FIXADDR_TOP, - (FIXADDR_TOP - FIXADDR_START) >> 10, - -+ CPU_ENTRY_AREA_BASE, -+ CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_MAP_SIZE, -+ CPU_ENTRY_AREA_MAP_SIZE >> 10, -+ - #ifdef CONFIG_HIGHMEM - PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, - (LAST_PKMAP*PAGE_SIZE) >> 10, -diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c -index d8836e45bc07..4cd556a30ee1 100644 ---- a/arch/x86/mm/kasan_init_64.c -+++ b/arch/x86/mm/kasan_init_64.c -@@ -13,6 +13,8 @@ - #include - #include - #include -+#include -+#include - - extern pgd_t early_top_pgt[PTRS_PER_PGD]; - extern struct range pfn_mapped[E820_MAX_ENTRIES]; -@@ -321,31 +323,33 @@ void __init kasan_init(void) - map_range(&pfn_mapped[i]); - } - -- kasan_populate_zero_shadow( -- 
kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), -- kasan_mem_to_shadow((void *)__START_KERNEL_map)); -- -- kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext), -- (unsigned long)kasan_mem_to_shadow(_end), -- early_pfn_to_nid(__pa(_stext))); -- -- shadow_cpu_entry_begin = (void *)__fix_to_virt(FIX_CPU_ENTRY_AREA_BOTTOM); -+ shadow_cpu_entry_begin = (void *)CPU_ENTRY_AREA_BASE; - shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin); - shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin, - PAGE_SIZE); - -- shadow_cpu_entry_end = (void *)(__fix_to_virt(FIX_CPU_ENTRY_AREA_TOP) + PAGE_SIZE); -+ shadow_cpu_entry_end = (void *)(CPU_ENTRY_AREA_BASE + -+ CPU_ENTRY_AREA_MAP_SIZE); - shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end); - shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end, - PAGE_SIZE); - -- kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), -- shadow_cpu_entry_begin); -+ kasan_populate_zero_shadow( -+ kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), -+ shadow_cpu_entry_begin); - - kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin, - (unsigned long)shadow_cpu_entry_end, 0); - -- kasan_populate_zero_shadow(shadow_cpu_entry_end, (void *)KASAN_SHADOW_END); -+ kasan_populate_zero_shadow(shadow_cpu_entry_end, -+ kasan_mem_to_shadow((void *)__START_KERNEL_map)); -+ -+ kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext), -+ (unsigned long)kasan_mem_to_shadow(_end), -+ early_pfn_to_nid(__pa(_stext))); -+ -+ kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), -+ (void *)KASAN_SHADOW_END); - - load_cr3(init_top_pgt); - __flush_tlb_all(); -diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c -index b9bd5b8b14fa..77909bae5943 100644 ---- a/arch/x86/mm/pgtable_32.c -+++ b/arch/x86/mm/pgtable_32.c -@@ -9,6 +9,7 @@ - #include - #include - -+#include - #include - #include - #include -diff --git 
a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c -index 53e65f605bdd..cd4b91b8d614 100644 ---- a/arch/x86/xen/mmu_pv.c -+++ b/arch/x86/xen/mmu_pv.c -@@ -2286,7 +2286,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) - - switch (idx) { - case FIX_BTMAP_END ... FIX_BTMAP_BEGIN: -- case FIX_RO_IDT: - #ifdef CONFIG_X86_32 - case FIX_WP_TEST: - # ifdef CONFIG_HIGHMEM -@@ -2297,7 +2296,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) - #endif - case FIX_TEXT_POKE0: - case FIX_TEXT_POKE1: -- case FIX_CPU_ENTRY_AREA_TOP ... FIX_CPU_ENTRY_AREA_BOTTOM: - /* All local page mappings */ - pte = pfn_pte(phys, prot); - break; --- -2.14.2 - diff --git a/patches/kernel/0186-x86-cpu_entry_area-Move-it-to-a-separate-unit.patch b/patches/kernel/0186-x86-cpu_entry_area-Move-it-to-a-separate-unit.patch new file mode 100644 index 0000000..84983a3 --- /dev/null +++ b/patches/kernel/0186-x86-cpu_entry_area-Move-it-to-a-separate-unit.patch @@ -0,0 +1,400 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Wed, 20 Dec 2017 18:28:54 +0100 +Subject: [PATCH] x86/cpu_entry_area: Move it to a separate unit +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Separate the cpu_entry_area code out of cpu/common.c and the fixmap. + +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Borislav Petkov +Cc: Dave Hansen +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Signed-off-by: Ingo Molnar +(cherry picked from commit ed1bbc40a0d10e0c5c74fe7bdc6298295cf40255) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 0fa11d2cd3d67af676aa2762ade282ba6d09cbe5) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/mm/Makefile | 2 +- + arch/x86/include/asm/cpu_entry_area.h | 52 +++++++++++++++++ + arch/x86/include/asm/fixmap.h | 41 +------------- + arch/x86/kernel/cpu/common.c | 94 ------------------------------ + arch/x86/kernel/traps.c | 1 + + arch/x86/mm/cpu_entry_area.c | 104 ++++++++++++++++++++++++++++++++++ + 6 files changed, 159 insertions(+), 135 deletions(-) + create mode 100644 arch/x86/include/asm/cpu_entry_area.h + create mode 100644 arch/x86/mm/cpu_entry_area.c + +diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile +index 0fbdcb64f9f8..76f5399a8356 100644 +--- a/arch/x86/mm/Makefile ++++ b/arch/x86/mm/Makefile +@@ -2,7 +2,7 @@ + KCOV_INSTRUMENT_tlb.o := n + + obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ +- pat.o pgtable.o physaddr.o setup_nx.o tlb.o ++ pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o + + # Make sure __phys_addr has no stackprotector + nostackp := $(call cc-option, -fno-stack-protector) +diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h +new file mode 100644 +index 000000000000..5471826803af +--- /dev/null ++++ b/arch/x86/include/asm/cpu_entry_area.h +@@ -0,0 +1,52 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#ifndef _ASM_X86_CPU_ENTRY_AREA_H ++#define _ASM_X86_CPU_ENTRY_AREA_H ++ ++#include ++#include ++ ++/* ++ * cpu_entry_area is a percpu region that contains things needed by the CPU ++ * and early entry/exit code. Real types aren't used for all fields here ++ * to avoid circular header dependencies. 
++ * ++ * Every field is a virtual alias of some other allocated backing store. ++ * There is no direct allocation of a struct cpu_entry_area. ++ */ ++struct cpu_entry_area { ++ char gdt[PAGE_SIZE]; ++ ++ /* ++ * The GDT is just below entry_stack and thus serves (on x86_64) as ++ * a a read-only guard page. ++ */ ++ struct entry_stack_page entry_stack_page; ++ ++ /* ++ * On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because ++ * we need task switches to work, and task switches write to the TSS. ++ */ ++ struct tss_struct tss; ++ ++ char entry_trampoline[PAGE_SIZE]; ++ ++#ifdef CONFIG_X86_64 ++ /* ++ * Exception stacks used for IST entries. ++ * ++ * In the future, this should have a separate slot for each stack ++ * with guard pages between them. ++ */ ++ char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]; ++#endif ++}; ++ ++#define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area)) ++#define CPU_ENTRY_AREA_PAGES (CPU_ENTRY_AREA_SIZE / PAGE_SIZE) ++ ++DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); ++ ++extern void setup_cpu_entry_areas(void); ++ ++#endif +diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h +index a7fb137ad964..1b2521473480 100644 +--- a/arch/x86/include/asm/fixmap.h ++++ b/arch/x86/include/asm/fixmap.h +@@ -25,6 +25,7 @@ + #else + #include + #endif ++#include + + /* + * We can't declare FIXADDR_TOP as variable for x86_64 because vsyscall +@@ -44,46 +45,6 @@ extern unsigned long __FIXADDR_TOP; + PAGE_SIZE) + #endif + +-/* +- * cpu_entry_area is a percpu region in the fixmap that contains things +- * needed by the CPU and early entry/exit code. Real types aren't used +- * for all fields here to avoid circular header dependencies. +- * +- * Every field is a virtual alias of some other allocated backing store. +- * There is no direct allocation of a struct cpu_entry_area. 
+- */ +-struct cpu_entry_area { +- char gdt[PAGE_SIZE]; +- +- /* +- * The GDT is just below entry_stack and thus serves (on x86_64) as +- * a a read-only guard page. +- */ +- struct entry_stack_page entry_stack_page; +- +- /* +- * On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because +- * we need task switches to work, and task switches write to the TSS. +- */ +- struct tss_struct tss; +- +- char entry_trampoline[PAGE_SIZE]; +- +-#ifdef CONFIG_X86_64 +- /* +- * Exception stacks used for IST entries. +- * +- * In the future, this should have a separate slot for each stack +- * with guard pages between them. +- */ +- char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]; +-#endif +-}; +- +-#define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE) +- +-extern void setup_cpu_entry_areas(void); +- + /* + * Here we define all the compile-time 'special' virtual + * addresses. The point is to have a constant address at +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 7a8a5d436566..96171ce46d61 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -482,102 +482,8 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { + [0 ... 
N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, + [DEBUG_STACK - 1] = DEBUG_STKSZ + }; +- +-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks +- [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); +-#endif +- +-static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, +- entry_stack_storage); +- +-static void __init +-set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot) +-{ +- for ( ; pages; pages--, idx--, ptr += PAGE_SIZE) +- __set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot); +-} +- +-/* Setup the fixmap mappings only once per-processor */ +-static void __init setup_cpu_entry_area(int cpu) +-{ +-#ifdef CONFIG_X86_64 +- extern char _entry_trampoline[]; +- +- /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */ +- pgprot_t gdt_prot = PAGE_KERNEL_RO; +- pgprot_t tss_prot = PAGE_KERNEL_RO; +-#else +- /* +- * On native 32-bit systems, the GDT cannot be read-only because +- * our double fault handler uses a task gate, and entering through +- * a task gate needs to change an available TSS to busy. If the +- * GDT is read-only, that will triple fault. The TSS cannot be +- * read-only because the CPU writes to it on task switches. +- * +- * On Xen PV, the GDT must be read-only because the hypervisor +- * requires it. +- */ +- pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ? +- PAGE_KERNEL_RO : PAGE_KERNEL; +- pgprot_t tss_prot = PAGE_KERNEL; +-#endif +- +- __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot); +- set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, entry_stack_page), +- per_cpu_ptr(&entry_stack_storage, cpu), 1, +- PAGE_KERNEL); +- +- /* +- * The Intel SDM says (Volume 3, 7.2.1): +- * +- * Avoid placing a page boundary in the part of the TSS that the +- * processor reads during a task switch (the first 104 bytes). The +- * processor may not correctly perform address translations if a +- * boundary occurs in this area. 
During a task switch, the processor +- * reads and writes into the first 104 bytes of each TSS (using +- * contiguous physical addresses beginning with the physical address +- * of the first byte of the TSS). So, after TSS access begins, if +- * part of the 104 bytes is not physically contiguous, the processor +- * will access incorrect information without generating a page-fault +- * exception. +- * +- * There are also a lot of errata involving the TSS spanning a page +- * boundary. Assert that we're not doing that. +- */ +- BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ +- offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); +- BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); +- set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss), +- &per_cpu(cpu_tss_rw, cpu), +- sizeof(struct tss_struct) / PAGE_SIZE, +- tss_prot); +- +-#ifdef CONFIG_X86_32 +- per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); + #endif + +-#ifdef CONFIG_X86_64 +- BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); +- BUILD_BUG_ON(sizeof(exception_stacks) != +- sizeof(((struct cpu_entry_area *)0)->exception_stacks)); +- set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks), +- &per_cpu(exception_stacks, cpu), +- sizeof(exception_stacks) / PAGE_SIZE, +- PAGE_KERNEL); +- +- __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline), +- __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); +-#endif +-} +- +-void __init setup_cpu_entry_areas(void) +-{ +- unsigned int cpu; +- +- for_each_possible_cpu(cpu) +- setup_cpu_entry_area(cpu); +-} +- + /* Load the original GDT from the per-cpu structure */ + void load_direct_gdt(int cpu) + { +diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c +index 14b462eefa17..ef2d1b8a0516 100644 +--- a/arch/x86/kernel/traps.c ++++ b/arch/x86/kernel/traps.c +@@ -57,6 +57,7 @@ + #include + #include + #include ++#include + #include + #include + #include +diff --git a/arch/x86/mm/cpu_entry_area.c 
b/arch/x86/mm/cpu_entry_area.c +new file mode 100644 +index 000000000000..235ff9cfaaf4 +--- /dev/null ++++ b/arch/x86/mm/cpu_entry_area.c +@@ -0,0 +1,104 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include ++#include ++ ++#include ++#include ++#include ++#include ++ ++static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage); ++ ++#ifdef CONFIG_X86_64 ++static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks ++ [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); ++#endif ++ ++static void __init ++set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot) ++{ ++ for ( ; pages; pages--, idx--, ptr += PAGE_SIZE) ++ __set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot); ++} ++ ++/* Setup the fixmap mappings only once per-processor */ ++static void __init setup_cpu_entry_area(int cpu) ++{ ++#ifdef CONFIG_X86_64 ++ extern char _entry_trampoline[]; ++ ++ /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */ ++ pgprot_t gdt_prot = PAGE_KERNEL_RO; ++ pgprot_t tss_prot = PAGE_KERNEL_RO; ++#else ++ /* ++ * On native 32-bit systems, the GDT cannot be read-only because ++ * our double fault handler uses a task gate, and entering through ++ * a task gate needs to change an available TSS to busy. If the ++ * GDT is read-only, that will triple fault. The TSS cannot be ++ * read-only because the CPU writes to it on task switches. ++ * ++ * On Xen PV, the GDT must be read-only because the hypervisor ++ * requires it. ++ */ ++ pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ? 
++ PAGE_KERNEL_RO : PAGE_KERNEL; ++ pgprot_t tss_prot = PAGE_KERNEL; ++#endif ++ ++ __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot); ++ set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, entry_stack_page), ++ per_cpu_ptr(&entry_stack_storage, cpu), 1, ++ PAGE_KERNEL); ++ ++ /* ++ * The Intel SDM says (Volume 3, 7.2.1): ++ * ++ * Avoid placing a page boundary in the part of the TSS that the ++ * processor reads during a task switch (the first 104 bytes). The ++ * processor may not correctly perform address translations if a ++ * boundary occurs in this area. During a task switch, the processor ++ * reads and writes into the first 104 bytes of each TSS (using ++ * contiguous physical addresses beginning with the physical address ++ * of the first byte of the TSS). So, after TSS access begins, if ++ * part of the 104 bytes is not physically contiguous, the processor ++ * will access incorrect information without generating a page-fault ++ * exception. ++ * ++ * There are also a lot of errata involving the TSS spanning a page ++ * boundary. Assert that we're not doing that. 
++ */ ++ BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ ++ offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); ++ BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); ++ set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss), ++ &per_cpu(cpu_tss_rw, cpu), ++ sizeof(struct tss_struct) / PAGE_SIZE, ++ tss_prot); ++ ++#ifdef CONFIG_X86_32 ++ per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); ++#endif ++ ++#ifdef CONFIG_X86_64 ++ BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); ++ BUILD_BUG_ON(sizeof(exception_stacks) != ++ sizeof(((struct cpu_entry_area *)0)->exception_stacks)); ++ set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks), ++ &per_cpu(exception_stacks, cpu), ++ sizeof(exception_stacks) / PAGE_SIZE, ++ PAGE_KERNEL); ++ ++ __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline), ++ __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); ++#endif ++} ++ ++void __init setup_cpu_entry_areas(void) ++{ ++ unsigned int cpu; ++ ++ for_each_possible_cpu(cpu) ++ setup_cpu_entry_area(cpu); ++} +-- +2.14.2 + diff --git a/patches/kernel/0187-init-Invoke-init_espfix_bsp-from-mm_init.patch b/patches/kernel/0187-init-Invoke-init_espfix_bsp-from-mm_init.patch deleted file mode 100644 index 78868d6..0000000 --- a/patches/kernel/0187-init-Invoke-init_espfix_bsp-from-mm_init.patch +++ /dev/null @@ -1,123 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Sun, 17 Dec 2017 10:56:29 +0100 -Subject: [PATCH] init: Invoke init_espfix_bsp() from mm_init() -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -init_espfix_bsp() needs to be invoked before the page table isolation -initialization. Move it into mm_init() which is the place where pti_init() -will be added. - -While at it get rid of the #ifdeffery and provide proper stub functions. 
- -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Borislav Petkov -Cc: Dave Hansen -Cc: H. Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Signed-off-by: Ingo Molnar -(cherry picked from commit 613e396bc0d4c7604fba23256644e78454c68cf6) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit a187e1a3cd87c860a8db188991d2d43fedd7225f) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/espfix.h | 7 ++++--- - include/asm-generic/pgtable.h | 5 +++++ - arch/x86/kernel/smpboot.c | 6 +----- - init/main.c | 6 ++---- - 4 files changed, 12 insertions(+), 12 deletions(-) - -diff --git a/arch/x86/include/asm/espfix.h b/arch/x86/include/asm/espfix.h -index ca3ce9ab9385..e7009ac975af 100644 ---- a/arch/x86/include/asm/espfix.h -+++ b/arch/x86/include/asm/espfix.h -@@ -1,7 +1,7 @@ - #ifndef _ASM_X86_ESPFIX_H - #define _ASM_X86_ESPFIX_H - --#ifdef CONFIG_X86_64 -+#ifdef CONFIG_X86_ESPFIX64 - - #include - -@@ -10,7 +10,8 @@ DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr); - - extern void init_espfix_bsp(void); - extern void init_espfix_ap(int cpu); -- --#endif /* CONFIG_X86_64 */ -+#else -+static inline void init_espfix_ap(int cpu) { } -+#endif - - #endif /* _ASM_X86_ESPFIX_H */ -diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h -index 7dfa767dc680..1bab3cfc0601 100644 ---- a/include/asm-generic/pgtable.h -+++ b/include/asm-generic/pgtable.h -@@ -956,6 +956,11 @@ static inline int pmd_clear_huge(pmd_t *pmd) - struct file; - int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, - unsigned long size, pgprot_t *vma_prot); -+ -+#ifndef CONFIG_X86_ESPFIX64 -+static inline void init_espfix_bsp(void) { } -+#endif -+ - #endif /* !__ASSEMBLY__ */ - - #ifndef io_remap_pfn_range -diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c -index 8ea3b18cbdc1..03d2ba2da3b0 100644 ---- a/arch/x86/kernel/smpboot.c -+++ 
b/arch/x86/kernel/smpboot.c -@@ -989,12 +989,8 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, - initial_code = (unsigned long)start_secondary; - initial_stack = idle->thread.sp; - -- /* -- * Enable the espfix hack for this CPU -- */ --#ifdef CONFIG_X86_ESPFIX64 -+ /* Enable the espfix hack for this CPU */ - init_espfix_ap(cpu); --#endif - - /* So we see what's up */ - announce_cpu(cpu, apicid); -diff --git a/init/main.c b/init/main.c -index 83d1004e3b97..de1c495da782 100644 ---- a/init/main.c -+++ b/init/main.c -@@ -504,6 +504,8 @@ static void __init mm_init(void) - pgtable_init(); - vmalloc_init(); - ioremap_huge_init(); -+ /* Should be run before the first non-init thread is created */ -+ init_espfix_bsp(); - } - - asmlinkage __visible void __init start_kernel(void) -@@ -664,10 +666,6 @@ asmlinkage __visible void __init start_kernel(void) - #ifdef CONFIG_X86 - if (efi_enabled(EFI_RUNTIME_SERVICES)) - efi_enter_virtual_mode(); --#endif --#ifdef CONFIG_X86_ESPFIX64 -- /* Should be run before the first non-init thread is created */ -- init_espfix_bsp(); - #endif - thread_stack_cache_init(); - cred_init(); --- -2.14.2 - diff --git a/patches/kernel/0187-x86-cpu_entry_area-Move-it-out-of-the-fixmap.patch b/patches/kernel/0187-x86-cpu_entry_area-Move-it-out-of-the-fixmap.patch new file mode 100644 index 0000000..726fe13 --- /dev/null +++ b/patches/kernel/0187-x86-cpu_entry_area-Move-it-out-of-the-fixmap.patch @@ -0,0 +1,588 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Wed, 20 Dec 2017 18:51:31 +0100 +Subject: [PATCH] x86/cpu_entry_area: Move it out of the fixmap +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Put the cpu_entry_area into a separate P4D entry. The fixmap gets too big +and 0-day already hit a case where the fixmap PTEs were cleared by +cleanup_highmap(). 
+ +Aside of that the fixmap API is a pain as it's all backwards. + +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Borislav Petkov +Cc: Dave Hansen +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ingo Molnar +(backported from commit 92a0f81d89571e3e8759366e050ee05cc545ef99) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit bda9eb328d9ce3757f22794f79da73dd5886c93a) +Signed-off-by: Fabian Grünbichler +--- + Documentation/x86/x86_64/mm.txt | 2 + + arch/x86/include/asm/cpu_entry_area.h | 18 ++++++++- + arch/x86/include/asm/desc.h | 2 + + arch/x86/include/asm/fixmap.h | 32 +--------------- + arch/x86/include/asm/pgtable_32_types.h | 15 ++++++-- + arch/x86/include/asm/pgtable_64_types.h | 47 +++++++++++++---------- + arch/x86/kernel/dumpstack.c | 1 + + arch/x86/kernel/traps.c | 5 ++- + arch/x86/mm/cpu_entry_area.c | 66 +++++++++++++++++++++++++-------- + arch/x86/mm/dump_pagetables.c | 6 ++- + arch/x86/mm/init_32.c | 6 +++ + arch/x86/mm/kasan_init_64.c | 30 ++++++++------- + arch/x86/mm/pgtable_32.c | 1 + + arch/x86/xen/mmu_pv.c | 2 - + 14 files changed, 145 insertions(+), 88 deletions(-) + +diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt +index 63a41671d25b..51101708a03a 100644 +--- a/Documentation/x86/x86_64/mm.txt ++++ b/Documentation/x86/x86_64/mm.txt +@@ -12,6 +12,7 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB) + ... unused hole ... + ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB) + ... unused hole ... ++fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping + ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks + ... unused hole ... 
+ ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space +@@ -35,6 +36,7 @@ ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB) + ... unused hole ... + ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB) + ... unused hole ... ++fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping + ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks + ... unused hole ... + ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space +diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h +index 5471826803af..2fbc69a0916e 100644 +--- a/arch/x86/include/asm/cpu_entry_area.h ++++ b/arch/x86/include/asm/cpu_entry_area.h +@@ -43,10 +43,26 @@ struct cpu_entry_area { + }; + + #define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area)) +-#define CPU_ENTRY_AREA_PAGES (CPU_ENTRY_AREA_SIZE / PAGE_SIZE) ++#define CPU_ENTRY_AREA_TOT_SIZE (CPU_ENTRY_AREA_SIZE * NR_CPUS) + + DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); + + extern void setup_cpu_entry_areas(void); ++extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags); ++ ++#define CPU_ENTRY_AREA_RO_IDT CPU_ENTRY_AREA_BASE ++#define CPU_ENTRY_AREA_PER_CPU (CPU_ENTRY_AREA_RO_IDT + PAGE_SIZE) ++ ++#define CPU_ENTRY_AREA_RO_IDT_VADDR ((void *)CPU_ENTRY_AREA_RO_IDT) ++ ++#define CPU_ENTRY_AREA_MAP_SIZE \ ++ (CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_TOT_SIZE - CPU_ENTRY_AREA_BASE) ++ ++extern struct cpu_entry_area *get_cpu_entry_area(int cpu); ++ ++static inline struct entry_stack *cpu_entry_stack(int cpu) ++{ ++ return &get_cpu_entry_area(cpu)->entry_stack_page.stack; ++} + + #endif +diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h +index b817fe247506..de40c514ba25 100644 +--- a/arch/x86/include/asm/desc.h ++++ b/arch/x86/include/asm/desc.h +@@ -5,6 +5,8 @@ + #include + #include + #include ++#include ++#include + + #include + #include +diff --git 
a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h +index 1b2521473480..a6ff9e1a6189 100644 +--- a/arch/x86/include/asm/fixmap.h ++++ b/arch/x86/include/asm/fixmap.h +@@ -25,7 +25,6 @@ + #else + #include + #endif +-#include + + /* + * We can't declare FIXADDR_TOP as variable for x86_64 because vsyscall +@@ -84,7 +83,6 @@ enum fixed_addresses { + FIX_IO_APIC_BASE_0, + FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1, + #endif +- FIX_RO_IDT, /* Virtual mapping for read-only IDT */ + #ifdef CONFIG_X86_32 + FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ + FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, +@@ -100,9 +98,6 @@ enum fixed_addresses { + #ifdef CONFIG_X86_INTEL_MID + FIX_LNW_VRTC, + #endif +- /* Fixmap entries to remap the GDTs, one per processor. */ +- FIX_CPU_ENTRY_AREA_TOP, +- FIX_CPU_ENTRY_AREA_BOTTOM = FIX_CPU_ENTRY_AREA_TOP + (CPU_ENTRY_AREA_PAGES * NR_CPUS) - 1, + + #ifdef CONFIG_ACPI_APEI_GHES + /* Used for GHES mapping from assorted contexts */ +@@ -143,7 +138,7 @@ enum fixed_addresses { + extern void reserve_top_address(unsigned long reserve); + + #define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT) +-#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) ++#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) + + extern int fixmaps_set; + +@@ -171,30 +166,5 @@ static inline void __set_fixmap(enum fixed_addresses idx, + void __early_set_fixmap(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t flags); + +-static inline unsigned int __get_cpu_entry_area_page_index(int cpu, int page) +-{ +- BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0); +- +- return FIX_CPU_ENTRY_AREA_BOTTOM - cpu*CPU_ENTRY_AREA_PAGES - page; +-} +- +-#define __get_cpu_entry_area_offset_index(cpu, offset) ({ \ +- BUILD_BUG_ON(offset % PAGE_SIZE != 0); \ +- __get_cpu_entry_area_page_index(cpu, offset / PAGE_SIZE); \ +- }) +- +-#define get_cpu_entry_area_index(cpu, field) \ +- 
__get_cpu_entry_area_offset_index((cpu), offsetof(struct cpu_entry_area, field)) +- +-static inline struct cpu_entry_area *get_cpu_entry_area(int cpu) +-{ +- return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0)); +-} +- +-static inline struct entry_stack *cpu_entry_stack(int cpu) +-{ +- return &get_cpu_entry_area(cpu)->entry_stack_page.stack; +-} +- + #endif /* !__ASSEMBLY__ */ + #endif /* _ASM_X86_FIXMAP_H */ +diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h +index 9fb2f2bc8245..67b60e11b70d 100644 +--- a/arch/x86/include/asm/pgtable_32_types.h ++++ b/arch/x86/include/asm/pgtable_32_types.h +@@ -37,13 +37,22 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */ + #define LAST_PKMAP 1024 + #endif + +-#define PKMAP_BASE ((FIXADDR_START - PAGE_SIZE * (LAST_PKMAP + 1)) \ +- & PMD_MASK) ++/* ++ * Define this here and validate with BUILD_BUG_ON() in pgtable_32.c ++ * to avoid include recursion hell ++ */ ++#define CPU_ENTRY_AREA_PAGES (NR_CPUS * 40) ++ ++#define CPU_ENTRY_AREA_BASE \ ++ ((FIXADDR_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) & PMD_MASK) ++ ++#define PKMAP_BASE \ ++ ((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK) + + #ifdef CONFIG_HIGHMEM + # define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE) + #else +-# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE) ++# define VMALLOC_END (CPU_ENTRY_AREA_BASE - 2 * PAGE_SIZE) + #endif + + #define MODULES_VADDR VMALLOC_START +diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h +index 06470da156ba..42e2750da525 100644 +--- a/arch/x86/include/asm/pgtable_64_types.h ++++ b/arch/x86/include/asm/pgtable_64_types.h +@@ -75,32 +75,41 @@ typedef struct { pteval_t pte; } pte_t; + #define PGDIR_MASK (~(PGDIR_SIZE - 1)) + + /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. 
*/ +-#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) ++#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) ++ + #ifdef CONFIG_X86_5LEVEL +-#define VMALLOC_SIZE_TB _AC(16384, UL) +-#define __VMALLOC_BASE _AC(0xff92000000000000, UL) +-#define __VMEMMAP_BASE _AC(0xffd4000000000000, UL) ++# define VMALLOC_SIZE_TB _AC(16384, UL) ++# define __VMALLOC_BASE _AC(0xff92000000000000, UL) ++# define __VMEMMAP_BASE _AC(0xffd4000000000000, UL) + #else +-#define VMALLOC_SIZE_TB _AC(32, UL) +-#define __VMALLOC_BASE _AC(0xffffc90000000000, UL) +-#define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) ++# define VMALLOC_SIZE_TB _AC(32, UL) ++# define __VMALLOC_BASE _AC(0xffffc90000000000, UL) ++# define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) + #endif ++ + #ifdef CONFIG_RANDOMIZE_MEMORY +-#define VMALLOC_START vmalloc_base +-#define VMEMMAP_START vmemmap_base ++# define VMALLOC_START vmalloc_base ++# define VMEMMAP_START vmemmap_base + #else +-#define VMALLOC_START __VMALLOC_BASE +-#define VMEMMAP_START __VMEMMAP_BASE ++# define VMALLOC_START __VMALLOC_BASE ++# define VMEMMAP_START __VMEMMAP_BASE + #endif /* CONFIG_RANDOMIZE_MEMORY */ +-#define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL)) +-#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) ++ ++#define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL)) ++ ++#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) + /* The module sections ends with the start of the fixmap */ +-#define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1) +-#define MODULES_LEN (MODULES_END - MODULES_VADDR) +-#define ESPFIX_PGD_ENTRY _AC(-2, UL) +-#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT) +-#define EFI_VA_START ( -4 * (_AC(1, UL) << 30)) +-#define EFI_VA_END (-68 * (_AC(1, UL) << 30)) ++#define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1) ++#define MODULES_LEN (MODULES_END - MODULES_VADDR) ++ ++#define ESPFIX_PGD_ENTRY _AC(-2, UL) ++#define ESPFIX_BASE_ADDR 
(ESPFIX_PGD_ENTRY << P4D_SHIFT) ++ ++#define CPU_ENTRY_AREA_PGD _AC(-3, UL) ++#define CPU_ENTRY_AREA_BASE (CPU_ENTRY_AREA_PGD << P4D_SHIFT) ++ ++#define EFI_VA_START ( -4 * (_AC(1, UL) << 30)) ++#define EFI_VA_END (-68 * (_AC(1, UL) << 30)) + + #define EARLY_DYNAMIC_PAGE_TABLES 64 + +diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c +index 55bf1c3b5319..2bdeb983b9d8 100644 +--- a/arch/x86/kernel/dumpstack.c ++++ b/arch/x86/kernel/dumpstack.c +@@ -18,6 +18,7 @@ + #include + #include + ++#include + #include + #include + +diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c +index ef2d1b8a0516..5808ccb59266 100644 +--- a/arch/x86/kernel/traps.c ++++ b/arch/x86/kernel/traps.c +@@ -1041,8 +1041,9 @@ void __init trap_init(void) + * "sidt" instruction will not leak the location of the kernel, and + * to defend the IDT against arbitrary memory write vulnerabilities. + * It will be reloaded in cpu_init() */ +- __set_fixmap(FIX_RO_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO); +- idt_descr.address = fix_to_virt(FIX_RO_IDT); ++ cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table), ++ PAGE_KERNEL_RO); ++ idt_descr.address = CPU_ENTRY_AREA_RO_IDT; + + /* + * Should be a barrier for any external CPU state: +diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c +index 235ff9cfaaf4..21e8b595cbb1 100644 +--- a/arch/x86/mm/cpu_entry_area.c ++++ b/arch/x86/mm/cpu_entry_area.c +@@ -15,11 +15,27 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks + [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); + #endif + ++struct cpu_entry_area *get_cpu_entry_area(int cpu) ++{ ++ unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE; ++ BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0); ++ ++ return (struct cpu_entry_area *) va; ++} ++EXPORT_SYMBOL(get_cpu_entry_area); ++ ++void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags) ++{ ++ unsigned long va = (unsigned long) cea_vaddr; ++ 
++ set_pte_vaddr(va, pfn_pte(pa >> PAGE_SHIFT, flags)); ++} ++ + static void __init +-set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot) ++cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot) + { +- for ( ; pages; pages--, idx--, ptr += PAGE_SIZE) +- __set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot); ++ for ( ; pages; pages--, cea_vaddr+= PAGE_SIZE, ptr += PAGE_SIZE) ++ cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot); + } + + /* Setup the fixmap mappings only once per-processor */ +@@ -47,10 +63,12 @@ static void __init setup_cpu_entry_area(int cpu) + pgprot_t tss_prot = PAGE_KERNEL; + #endif + +- __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot); +- set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, entry_stack_page), +- per_cpu_ptr(&entry_stack_storage, cpu), 1, +- PAGE_KERNEL); ++ cea_set_pte(&get_cpu_entry_area(cpu)->gdt, get_cpu_gdt_paddr(cpu), ++ gdt_prot); ++ ++ cea_map_percpu_pages(&get_cpu_entry_area(cpu)->entry_stack_page, ++ per_cpu_ptr(&entry_stack_storage, cpu), 1, ++ PAGE_KERNEL); + + /* + * The Intel SDM says (Volume 3, 7.2.1): +@@ -72,10 +90,9 @@ static void __init setup_cpu_entry_area(int cpu) + BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ + offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); + BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); +- set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss), +- &per_cpu(cpu_tss_rw, cpu), +- sizeof(struct tss_struct) / PAGE_SIZE, +- tss_prot); ++ cea_map_percpu_pages(&get_cpu_entry_area(cpu)->tss, ++ &per_cpu(cpu_tss_rw, cpu), ++ sizeof(struct tss_struct) / PAGE_SIZE, tss_prot); + + #ifdef CONFIG_X86_32 + per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); +@@ -85,20 +102,37 @@ static void __init setup_cpu_entry_area(int cpu) + BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); + BUILD_BUG_ON(sizeof(exception_stacks) != + sizeof(((struct cpu_entry_area *)0)->exception_stacks)); +- 
set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks), +- &per_cpu(exception_stacks, cpu), +- sizeof(exception_stacks) / PAGE_SIZE, +- PAGE_KERNEL); ++ cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks, ++ &per_cpu(exception_stacks, cpu), ++ sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL); + +- __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline), ++ cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline, + __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); + #endif + } + ++static __init void setup_cpu_entry_area_ptes(void) ++{ ++#ifdef CONFIG_X86_32 ++ unsigned long start, end; ++ ++ BUILD_BUG_ON(CPU_ENTRY_AREA_PAGES * PAGE_SIZE < CPU_ENTRY_AREA_MAP_SIZE); ++ BUG_ON(CPU_ENTRY_AREA_BASE & ~PMD_MASK); ++ ++ start = CPU_ENTRY_AREA_BASE; ++ end = start + CPU_ENTRY_AREA_MAP_SIZE; ++ ++ for (; start < end; start += PMD_SIZE) ++ populate_extra_pte(start); ++#endif ++} ++ + void __init setup_cpu_entry_areas(void) + { + unsigned int cpu; + ++ setup_cpu_entry_area_ptes(); ++ + for_each_possible_cpu(cpu) + setup_cpu_entry_area(cpu); + } +diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c +index 318a7c30e87e..3b7720404a9f 100644 +--- a/arch/x86/mm/dump_pagetables.c ++++ b/arch/x86/mm/dump_pagetables.c +@@ -58,6 +58,7 @@ enum address_markers_idx { + KASAN_SHADOW_START_NR, + KASAN_SHADOW_END_NR, + #endif ++ CPU_ENTRY_AREA_NR, + #ifdef CONFIG_X86_ESPFIX64 + ESPFIX_START_NR, + #endif +@@ -81,6 +82,7 @@ static struct addr_marker address_markers[] = { + [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" }, + [KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" }, + #endif ++ [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" }, + #ifdef CONFIG_X86_ESPFIX64 + [ESPFIX_START_NR] = { ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, + #endif +@@ -104,6 +106,7 @@ enum address_markers_idx { + #ifdef CONFIG_HIGHMEM + PKMAP_BASE_NR, + #endif ++ CPU_ENTRY_AREA_NR, + FIXADDR_START_NR, + END_OF_SPACE_NR, + 
}; +@@ -116,6 +119,7 @@ static struct addr_marker address_markers[] = { + #ifdef CONFIG_HIGHMEM + [PKMAP_BASE_NR] = { 0UL, "Persistent kmap() Area" }, + #endif ++ [CPU_ENTRY_AREA_NR] = { 0UL, "CPU entry area" }, + [FIXADDR_START_NR] = { 0UL, "Fixmap area" }, + [END_OF_SPACE_NR] = { -1, NULL } + }; +@@ -522,8 +526,8 @@ static int __init pt_dump_init(void) + address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE; + # endif + address_markers[FIXADDR_START_NR].start_address = FIXADDR_START; ++ address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE; + #endif +- + return 0; + } + __initcall(pt_dump_init); +diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c +index 8a64a6f2848d..135c9a7898c7 100644 +--- a/arch/x86/mm/init_32.c ++++ b/arch/x86/mm/init_32.c +@@ -50,6 +50,7 @@ + #include + #include + #include ++#include + #include + + #include "mm_internal.h" +@@ -766,6 +767,7 @@ void __init mem_init(void) + mem_init_print_info(NULL); + printk(KERN_INFO "virtual kernel memory layout:\n" + " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" ++ " cpu_entry : 0x%08lx - 0x%08lx (%4ld kB)\n" + #ifdef CONFIG_HIGHMEM + " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n" + #endif +@@ -777,6 +779,10 @@ void __init mem_init(void) + FIXADDR_START, FIXADDR_TOP, + (FIXADDR_TOP - FIXADDR_START) >> 10, + ++ CPU_ENTRY_AREA_BASE, ++ CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_MAP_SIZE, ++ CPU_ENTRY_AREA_MAP_SIZE >> 10, ++ + #ifdef CONFIG_HIGHMEM + PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, + (LAST_PKMAP*PAGE_SIZE) >> 10, +diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c +index d8836e45bc07..4cd556a30ee1 100644 +--- a/arch/x86/mm/kasan_init_64.c ++++ b/arch/x86/mm/kasan_init_64.c +@@ -13,6 +13,8 @@ + #include + #include + #include ++#include ++#include + + extern pgd_t early_top_pgt[PTRS_PER_PGD]; + extern struct range pfn_mapped[E820_MAX_ENTRIES]; +@@ -321,31 +323,33 @@ void __init kasan_init(void) + map_range(&pfn_mapped[i]); + } + +- kasan_populate_zero_shadow( +- 
kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), +- kasan_mem_to_shadow((void *)__START_KERNEL_map)); +- +- kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext), +- (unsigned long)kasan_mem_to_shadow(_end), +- early_pfn_to_nid(__pa(_stext))); +- +- shadow_cpu_entry_begin = (void *)__fix_to_virt(FIX_CPU_ENTRY_AREA_BOTTOM); ++ shadow_cpu_entry_begin = (void *)CPU_ENTRY_AREA_BASE; + shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin); + shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin, + PAGE_SIZE); + +- shadow_cpu_entry_end = (void *)(__fix_to_virt(FIX_CPU_ENTRY_AREA_TOP) + PAGE_SIZE); ++ shadow_cpu_entry_end = (void *)(CPU_ENTRY_AREA_BASE + ++ CPU_ENTRY_AREA_MAP_SIZE); + shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end); + shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end, + PAGE_SIZE); + +- kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), +- shadow_cpu_entry_begin); ++ kasan_populate_zero_shadow( ++ kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), ++ shadow_cpu_entry_begin); + + kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin, + (unsigned long)shadow_cpu_entry_end, 0); + +- kasan_populate_zero_shadow(shadow_cpu_entry_end, (void *)KASAN_SHADOW_END); ++ kasan_populate_zero_shadow(shadow_cpu_entry_end, ++ kasan_mem_to_shadow((void *)__START_KERNEL_map)); ++ ++ kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext), ++ (unsigned long)kasan_mem_to_shadow(_end), ++ early_pfn_to_nid(__pa(_stext))); ++ ++ kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), ++ (void *)KASAN_SHADOW_END); + + load_cr3(init_top_pgt); + __flush_tlb_all(); +diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c +index b9bd5b8b14fa..77909bae5943 100644 +--- a/arch/x86/mm/pgtable_32.c ++++ b/arch/x86/mm/pgtable_32.c +@@ -9,6 +9,7 @@ + #include + #include + ++#include + #include + #include + #include +diff --git 
a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c +index 53e65f605bdd..cd4b91b8d614 100644 +--- a/arch/x86/xen/mmu_pv.c ++++ b/arch/x86/xen/mmu_pv.c +@@ -2286,7 +2286,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) + + switch (idx) { + case FIX_BTMAP_END ... FIX_BTMAP_BEGIN: +- case FIX_RO_IDT: + #ifdef CONFIG_X86_32 + case FIX_WP_TEST: + # ifdef CONFIG_HIGHMEM +@@ -2297,7 +2296,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) + #endif + case FIX_TEXT_POKE0: + case FIX_TEXT_POKE1: +- case FIX_CPU_ENTRY_AREA_TOP ... FIX_CPU_ENTRY_AREA_BOTTOM: + /* All local page mappings */ + pte = pfn_pte(phys, prot); + break; +-- +2.14.2 + diff --git a/patches/kernel/0188-init-Invoke-init_espfix_bsp-from-mm_init.patch b/patches/kernel/0188-init-Invoke-init_espfix_bsp-from-mm_init.patch new file mode 100644 index 0000000..78868d6 --- /dev/null +++ b/patches/kernel/0188-init-Invoke-init_espfix_bsp-from-mm_init.patch @@ -0,0 +1,123 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Sun, 17 Dec 2017 10:56:29 +0100 +Subject: [PATCH] init: Invoke init_espfix_bsp() from mm_init() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +init_espfix_bsp() needs to be invoked before the page table isolation +initialization. Move it into mm_init() which is the place where pti_init() +will be added. + +While at it get rid of the #ifdeffery and provide proper stub functions. + +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Borislav Petkov +Cc: Dave Hansen +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Signed-off-by: Ingo Molnar +(cherry picked from commit 613e396bc0d4c7604fba23256644e78454c68cf6) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit a187e1a3cd87c860a8db188991d2d43fedd7225f) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/espfix.h | 7 ++++--- + include/asm-generic/pgtable.h | 5 +++++ + arch/x86/kernel/smpboot.c | 6 +----- + init/main.c | 6 ++---- + 4 files changed, 12 insertions(+), 12 deletions(-) + +diff --git a/arch/x86/include/asm/espfix.h b/arch/x86/include/asm/espfix.h +index ca3ce9ab9385..e7009ac975af 100644 +--- a/arch/x86/include/asm/espfix.h ++++ b/arch/x86/include/asm/espfix.h +@@ -1,7 +1,7 @@ + #ifndef _ASM_X86_ESPFIX_H + #define _ASM_X86_ESPFIX_H + +-#ifdef CONFIG_X86_64 ++#ifdef CONFIG_X86_ESPFIX64 + + #include + +@@ -10,7 +10,8 @@ DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr); + + extern void init_espfix_bsp(void); + extern void init_espfix_ap(int cpu); +- +-#endif /* CONFIG_X86_64 */ ++#else ++static inline void init_espfix_ap(int cpu) { } ++#endif + + #endif /* _ASM_X86_ESPFIX_H */ +diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h +index 7dfa767dc680..1bab3cfc0601 100644 +--- a/include/asm-generic/pgtable.h ++++ b/include/asm-generic/pgtable.h +@@ -956,6 +956,11 @@ static inline int pmd_clear_huge(pmd_t *pmd) + struct file; + int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t *vma_prot); ++ ++#ifndef CONFIG_X86_ESPFIX64 ++static inline void init_espfix_bsp(void) { } ++#endif ++ + #endif /* !__ASSEMBLY__ */ + + #ifndef io_remap_pfn_range +diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c +index 8ea3b18cbdc1..03d2ba2da3b0 100644 +--- a/arch/x86/kernel/smpboot.c ++++ b/arch/x86/kernel/smpboot.c +@@ -989,12 +989,8 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct 
*idle, + initial_code = (unsigned long)start_secondary; + initial_stack = idle->thread.sp; + +- /* +- * Enable the espfix hack for this CPU +- */ +-#ifdef CONFIG_X86_ESPFIX64 ++ /* Enable the espfix hack for this CPU */ + init_espfix_ap(cpu); +-#endif + + /* So we see what's up */ + announce_cpu(cpu, apicid); +diff --git a/init/main.c b/init/main.c +index 83d1004e3b97..de1c495da782 100644 +--- a/init/main.c ++++ b/init/main.c +@@ -504,6 +504,8 @@ static void __init mm_init(void) + pgtable_init(); + vmalloc_init(); + ioremap_huge_init(); ++ /* Should be run before the first non-init thread is created */ ++ init_espfix_bsp(); + } + + asmlinkage __visible void __init start_kernel(void) +@@ -664,10 +666,6 @@ asmlinkage __visible void __init start_kernel(void) + #ifdef CONFIG_X86 + if (efi_enabled(EFI_RUNTIME_SERVICES)) + efi_enter_virtual_mode(); +-#endif +-#ifdef CONFIG_X86_ESPFIX64 +- /* Should be run before the first non-init thread is created */ +- init_espfix_bsp(); + #endif + thread_stack_cache_init(); + cred_init(); +-- +2.14.2 + diff --git a/patches/kernel/0188-x86-cpu_entry_area-Prevent-wraparound-in-setup_cpu_e.patch b/patches/kernel/0188-x86-cpu_entry_area-Prevent-wraparound-in-setup_cpu_e.patch deleted file mode 100644 index 4f6e414..0000000 --- a/patches/kernel/0188-x86-cpu_entry_area-Prevent-wraparound-in-setup_cpu_e.patch +++ /dev/null @@ -1,49 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Sat, 23 Dec 2017 19:45:11 +0100 -Subject: [PATCH] x86/cpu_entry_area: Prevent wraparound in - setup_cpu_entry_area_ptes() on 32bit -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -The loop which populates the CPU entry area PMDs can wrap around on 32bit -machines when the number of CPUs is small. - -It worked wonderful for NR_CPUS=64 for whatever reason and the moron who -wrote that code did not bother to test it with !SMP. 
- -Check for the wraparound to fix it. - -Fixes: 92a0f81d8957 ("x86/cpu_entry_area: Move it out of the fixmap") -Reported-by: kernel test robot -Signed-off-by: Thomas "Feels stupid" Gleixner -Tested-by: Borislav Petkov -(cherry picked from commit f6c4fd506cb626e4346aa81688f255e593a7c5a0) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 8a21158932b93ed7e72d16683085d55a3a06125e) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/mm/cpu_entry_area.c | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c -index 21e8b595cbb1..fe814fd5e014 100644 ---- a/arch/x86/mm/cpu_entry_area.c -+++ b/arch/x86/mm/cpu_entry_area.c -@@ -122,7 +122,8 @@ static __init void setup_cpu_entry_area_ptes(void) - start = CPU_ENTRY_AREA_BASE; - end = start + CPU_ENTRY_AREA_MAP_SIZE; - -- for (; start < end; start += PMD_SIZE) -+ /* Careful here: start + PMD_SIZE might wrap around */ -+ for (; start < end && start >= CPU_ENTRY_AREA_BASE; start += PMD_SIZE) - populate_extra_pte(start); - #endif - } --- -2.14.2 - diff --git a/patches/kernel/0189-x86-cpu_entry_area-Prevent-wraparound-in-setup_cpu_e.patch b/patches/kernel/0189-x86-cpu_entry_area-Prevent-wraparound-in-setup_cpu_e.patch new file mode 100644 index 0000000..4f6e414 --- /dev/null +++ b/patches/kernel/0189-x86-cpu_entry_area-Prevent-wraparound-in-setup_cpu_e.patch @@ -0,0 +1,49 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Sat, 23 Dec 2017 19:45:11 +0100 +Subject: [PATCH] x86/cpu_entry_area: Prevent wraparound in + setup_cpu_entry_area_ptes() on 32bit +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +The loop which populates the CPU entry area PMDs can wrap around on 32bit +machines when the number of CPUs is small. 
+ +It worked wonderful for NR_CPUS=64 for whatever reason and the moron who +wrote that code did not bother to test it with !SMP. + +Check for the wraparound to fix it. + +Fixes: 92a0f81d8957 ("x86/cpu_entry_area: Move it out of the fixmap") +Reported-by: kernel test robot +Signed-off-by: Thomas "Feels stupid" Gleixner +Tested-by: Borislav Petkov +(cherry picked from commit f6c4fd506cb626e4346aa81688f255e593a7c5a0) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 8a21158932b93ed7e72d16683085d55a3a06125e) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/mm/cpu_entry_area.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c +index 21e8b595cbb1..fe814fd5e014 100644 +--- a/arch/x86/mm/cpu_entry_area.c ++++ b/arch/x86/mm/cpu_entry_area.c +@@ -122,7 +122,8 @@ static __init void setup_cpu_entry_area_ptes(void) + start = CPU_ENTRY_AREA_BASE; + end = start + CPU_ENTRY_AREA_MAP_SIZE; + +- for (; start < end; start += PMD_SIZE) ++ /* Careful here: start + PMD_SIZE might wrap around */ ++ for (; start < end && start >= CPU_ENTRY_AREA_BASE; start += PMD_SIZE) + populate_extra_pte(start); + #endif + } +-- +2.14.2 + diff --git a/patches/kernel/0189-x86-cpufeatures-Add-X86_BUG_CPU_INSECURE.patch b/patches/kernel/0189-x86-cpufeatures-Add-X86_BUG_CPU_INSECURE.patch deleted file mode 100644 index e0fbf55..0000000 --- a/patches/kernel/0189-x86-cpufeatures-Add-X86_BUG_CPU_INSECURE.patch +++ /dev/null @@ -1,120 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Mon, 4 Dec 2017 15:07:33 +0100 -Subject: [PATCH] x86/cpufeatures: Add X86_BUG_CPU_INSECURE -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Many x86 CPUs leak information to user space due to missing isolation of -user space and kernel space page tables. 
There are many well documented -ways to exploit that. - -The upcoming software migitation of isolating the user and kernel space -page tables needs a misfeature flag so code can be made runtime -conditional. - -Add the BUG bits which indicates that the CPU is affected and add a feature -bit which indicates that the software migitation is enabled. - -Assume for now that _ALL_ x86 CPUs are affected by this. Exceptions can be -made later. - -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Signed-off-by: Ingo Molnar -(cherry picked from commit a89f040fa34ec9cd682aed98b8f04e3c47d998bd) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 3b0dffb3557f6a1084a2b92ac0cc2d36b5e1f39f) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/cpufeatures.h | 3 ++- - arch/x86/include/asm/disabled-features.h | 8 +++++++- - arch/x86/kernel/cpu/common.c | 4 ++++ - 3 files changed, 13 insertions(+), 2 deletions(-) - -diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h -index d57a174ec97c..de4e91452de4 100644 ---- a/arch/x86/include/asm/cpufeatures.h -+++ b/arch/x86/include/asm/cpufeatures.h -@@ -200,7 +200,7 @@ - #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ - #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ - #define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ -- -+#define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ - #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ - #define X86_FEATURE_INTEL_PT ( 7*32+15) /* 
Intel Processor Trace */ - #define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */ -@@ -339,5 +339,6 @@ - #define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ - #define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ - #define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ -+#define X86_BUG_CPU_INSECURE X86_BUG(14) /* CPU is insecure and needs kernel page table isolation */ - - #endif /* _ASM_X86_CPUFEATURES_H */ -diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h -index 5dff775af7cd..db681152f024 100644 ---- a/arch/x86/include/asm/disabled-features.h -+++ b/arch/x86/include/asm/disabled-features.h -@@ -42,6 +42,12 @@ - # define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31)) - #endif - -+#ifdef CONFIG_PAGE_TABLE_ISOLATION -+# define DISABLE_PTI 0 -+#else -+# define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31)) -+#endif -+ - /* - * Make sure to add features to the correct mask - */ -@@ -52,7 +58,7 @@ - #define DISABLED_MASK4 0 - #define DISABLED_MASK5 0 - #define DISABLED_MASK6 0 --#define DISABLED_MASK7 0 -+#define DISABLED_MASK7 (DISABLE_PTI) - #define DISABLED_MASK8 0 - #define DISABLED_MASK9 (DISABLE_MPX) - #define DISABLED_MASK10 0 -diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c -index 96171ce46d61..623ba3635793 100644 ---- a/arch/x86/kernel/cpu/common.c -+++ b/arch/x86/kernel/cpu/common.c -@@ -898,6 +898,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) - } - - setup_force_cpu_cap(X86_FEATURE_ALWAYS); -+ -+ /* Assume for now that ALL x86 CPUs are insecure */ -+ setup_force_cpu_bug(X86_BUG_CPU_INSECURE); -+ - fpu__init_system(c); - } - --- -2.14.2 - diff --git a/patches/kernel/0190-x86-cpufeatures-Add-X86_BUG_CPU_INSECURE.patch b/patches/kernel/0190-x86-cpufeatures-Add-X86_BUG_CPU_INSECURE.patch new file mode 100644 index 0000000..e0fbf55 --- /dev/null +++ 
b/patches/kernel/0190-x86-cpufeatures-Add-X86_BUG_CPU_INSECURE.patch @@ -0,0 +1,120 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Mon, 4 Dec 2017 15:07:33 +0100 +Subject: [PATCH] x86/cpufeatures: Add X86_BUG_CPU_INSECURE +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Many x86 CPUs leak information to user space due to missing isolation of +user space and kernel space page tables. There are many well documented +ways to exploit that. + +The upcoming software migitation of isolating the user and kernel space +page tables needs a misfeature flag so code can be made runtime +conditional. + +Add the BUG bits which indicates that the CPU is affected and add a feature +bit which indicates that the software migitation is enabled. + +Assume for now that _ALL_ x86 CPUs are affected by this. Exceptions can be +made later. + +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Signed-off-by: Ingo Molnar +(cherry picked from commit a89f040fa34ec9cd682aed98b8f04e3c47d998bd) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 3b0dffb3557f6a1084a2b92ac0cc2d36b5e1f39f) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/cpufeatures.h | 3 ++- + arch/x86/include/asm/disabled-features.h | 8 +++++++- + arch/x86/kernel/cpu/common.c | 4 ++++ + 3 files changed, 13 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index d57a174ec97c..de4e91452de4 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -200,7 +200,7 @@ + #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ + #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ + #define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ +- ++#define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ + #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ + #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ + #define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */ +@@ -339,5 +339,6 @@ + #define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ + #define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ + #define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ ++#define X86_BUG_CPU_INSECURE X86_BUG(14) /* CPU is insecure and needs kernel page table isolation */ + + #endif /* _ASM_X86_CPUFEATURES_H */ +diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h +index 5dff775af7cd..db681152f024 
100644 +--- a/arch/x86/include/asm/disabled-features.h ++++ b/arch/x86/include/asm/disabled-features.h +@@ -42,6 +42,12 @@ + # define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31)) + #endif + ++#ifdef CONFIG_PAGE_TABLE_ISOLATION ++# define DISABLE_PTI 0 ++#else ++# define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31)) ++#endif ++ + /* + * Make sure to add features to the correct mask + */ +@@ -52,7 +58,7 @@ + #define DISABLED_MASK4 0 + #define DISABLED_MASK5 0 + #define DISABLED_MASK6 0 +-#define DISABLED_MASK7 0 ++#define DISABLED_MASK7 (DISABLE_PTI) + #define DISABLED_MASK8 0 + #define DISABLED_MASK9 (DISABLE_MPX) + #define DISABLED_MASK10 0 +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 96171ce46d61..623ba3635793 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -898,6 +898,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) + } + + setup_force_cpu_cap(X86_FEATURE_ALWAYS); ++ ++ /* Assume for now that ALL x86 CPUs are insecure */ ++ setup_force_cpu_bug(X86_BUG_CPU_INSECURE); ++ + fpu__init_system(c); + } + +-- +2.14.2 + diff --git a/patches/kernel/0190-x86-mm-pti-Disable-global-pages-if-PAGE_TABLE_ISOLAT.patch b/patches/kernel/0190-x86-mm-pti-Disable-global-pages-if-PAGE_TABLE_ISOLAT.patch deleted file mode 100644 index ad59431..0000000 --- a/patches/kernel/0190-x86-mm-pti-Disable-global-pages-if-PAGE_TABLE_ISOLAT.patch +++ /dev/null @@ -1,100 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Dave Hansen -Date: Mon, 4 Dec 2017 15:07:34 +0100 -Subject: [PATCH] x86/mm/pti: Disable global pages if PAGE_TABLE_ISOLATION=y -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Global pages stay in the TLB across context switches. Since all contexts -share the same kernel mapping, these mappings are marked as global pages -so kernel entries in the TLB are not flushed out on a context switch. 
- -But, even having these entries in the TLB opens up something that an -attacker can use, such as the double-page-fault attack: - - http://www.ieee-security.org/TC/SP2013/papers/4977a191.pdf - -That means that even when PAGE_TABLE_ISOLATION switches page tables -on return to user space the global pages would stay in the TLB cache. - -Disable global pages so that kernel TLB entries can be flushed before -returning to user space. This way, all accesses to kernel addresses from -userspace result in a TLB miss independent of the existence of a kernel -mapping. - -Suppress global pages via the __supported_pte_mask. The user space -mappings set PAGE_GLOBAL for the minimal kernel mappings which are -required for entry/exit. These mappings are set up manually so the -filtering does not take place. - -[ The __supported_pte_mask simplification was written by Thomas Gleixner. ] -Signed-off-by: Dave Hansen -Signed-off-by: Thomas Gleixner -Reviewed-by: Borislav Petkov -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Cc: linux-mm@kvack.org -Signed-off-by: Ingo Molnar -(cherry picked from commit c313ec66317d421fb5768d78c56abed2dc862264) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit ace78e99d765da1e59f6b151adac6c360c67af7d) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/mm/init.c | 12 +++++++++--- - 1 file changed, 9 insertions(+), 3 deletions(-) - -diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c -index a22c2b95e513..020223420308 100644 ---- a/arch/x86/mm/init.c -+++ b/arch/x86/mm/init.c -@@ -161,6 +161,12 @@ struct map_range { - - static int page_size_mask; - -+static void enable_global_pages(void) -+{ -+ if (!static_cpu_has(X86_FEATURE_PTI)) -+ __supported_pte_mask |= _PAGE_GLOBAL; -+} -+ - static void __init probe_page_size_mask(void) - { - /* -@@ -179,11 +185,11 @@ static void __init probe_page_size_mask(void) - cr4_set_bits_and_update_boot(X86_CR4_PSE); - - /* Enable PGE if available */ -+ __supported_pte_mask &= ~_PAGE_GLOBAL; - if (boot_cpu_has(X86_FEATURE_PGE)) { - cr4_set_bits_and_update_boot(X86_CR4_PGE); -- __supported_pte_mask |= _PAGE_GLOBAL; -- } else -- __supported_pte_mask &= ~_PAGE_GLOBAL; -+ enable_global_pages(); -+ } - - /* Enable 1 GB linear kernel mappings if available: */ - if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) { --- -2.14.2 - diff --git a/patches/kernel/0191-x86-mm-pti-Disable-global-pages-if-PAGE_TABLE_ISOLAT.patch b/patches/kernel/0191-x86-mm-pti-Disable-global-pages-if-PAGE_TABLE_ISOLAT.patch new file mode 100644 index 0000000..ad59431 --- /dev/null +++ b/patches/kernel/0191-x86-mm-pti-Disable-global-pages-if-PAGE_TABLE_ISOLAT.patch @@ -0,0 +1,100 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Dave Hansen +Date: Mon, 4 Dec 2017 
15:07:34 +0100 +Subject: [PATCH] x86/mm/pti: Disable global pages if PAGE_TABLE_ISOLATION=y +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Global pages stay in the TLB across context switches. Since all contexts +share the same kernel mapping, these mappings are marked as global pages +so kernel entries in the TLB are not flushed out on a context switch. + +But, even having these entries in the TLB opens up something that an +attacker can use, such as the double-page-fault attack: + + http://www.ieee-security.org/TC/SP2013/papers/4977a191.pdf + +That means that even when PAGE_TABLE_ISOLATION switches page tables +on return to user space the global pages would stay in the TLB cache. + +Disable global pages so that kernel TLB entries can be flushed before +returning to user space. This way, all accesses to kernel addresses from +userspace result in a TLB miss independent of the existence of a kernel +mapping. + +Suppress global pages via the __supported_pte_mask. The user space +mappings set PAGE_GLOBAL for the minimal kernel mappings which are +required for entry/exit. These mappings are set up manually so the +filtering does not take place. + +[ The __supported_pte_mask simplification was written by Thomas Gleixner. ] +Signed-off-by: Dave Hansen +Signed-off-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Cc: linux-mm@kvack.org +Signed-off-by: Ingo Molnar +(cherry picked from commit c313ec66317d421fb5768d78c56abed2dc862264) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit ace78e99d765da1e59f6b151adac6c360c67af7d) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/mm/init.c | 12 +++++++++--- + 1 file changed, 9 insertions(+), 3 deletions(-) + +diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c +index a22c2b95e513..020223420308 100644 +--- a/arch/x86/mm/init.c ++++ b/arch/x86/mm/init.c +@@ -161,6 +161,12 @@ struct map_range { + + static int page_size_mask; + ++static void enable_global_pages(void) ++{ ++ if (!static_cpu_has(X86_FEATURE_PTI)) ++ __supported_pte_mask |= _PAGE_GLOBAL; ++} ++ + static void __init probe_page_size_mask(void) + { + /* +@@ -179,11 +185,11 @@ static void __init probe_page_size_mask(void) + cr4_set_bits_and_update_boot(X86_CR4_PSE); + + /* Enable PGE if available */ ++ __supported_pte_mask &= ~_PAGE_GLOBAL; + if (boot_cpu_has(X86_FEATURE_PGE)) { + cr4_set_bits_and_update_boot(X86_CR4_PGE); +- __supported_pte_mask |= _PAGE_GLOBAL; +- } else +- __supported_pte_mask &= ~_PAGE_GLOBAL; ++ enable_global_pages(); ++ } + + /* Enable 1 GB linear kernel mappings if available: */ + if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) { +-- +2.14.2 + diff --git a/patches/kernel/0191-x86-mm-pti-Prepare-the-x86-entry-assembly-code-for-e.patch b/patches/kernel/0191-x86-mm-pti-Prepare-the-x86-entry-assembly-code-for-e.patch deleted file mode 100644 index c16486b..0000000 --- a/patches/kernel/0191-x86-mm-pti-Prepare-the-x86-entry-assembly-code-for-e.patch +++ /dev/null @@ -1,381 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Dave Hansen -Date: Mon, 4 Dec 
2017 15:07:35 +0100 -Subject: [PATCH] x86/mm/pti: Prepare the x86/entry assembly code for - entry/exit CR3 switching -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -PAGE_TABLE_ISOLATION needs to switch to a different CR3 value when it -enters the kernel and switch back when it exits. This essentially needs to -be done before leaving assembly code. - -This is extra challenging because the switching context is tricky: the -registers that can be clobbered can vary. It is also hard to store things -on the stack because there is an established ABI (ptregs) or the stack is -entirely unsafe to use. - -Establish a set of macros that allow changing to the user and kernel CR3 -values. - -Interactions with SWAPGS: - - Previous versions of the PAGE_TABLE_ISOLATION code relied on having - per-CPU scratch space to save/restore a register that can be used for the - CR3 MOV. The %GS register is used to index into our per-CPU space, so - SWAPGS *had* to be done before the CR3 switch. That scratch space is gone - now, but the semantic that SWAPGS must be done before the CR3 MOV is - retained. This is good to keep because it is not that hard to do and it - allows to do things like add per-CPU debugging information. - -What this does in the NMI code is worth pointing out. NMIs can interrupt -*any* context and they can also be nested with NMIs interrupting other -NMIs. The comments below ".Lnmi_from_kernel" explain the format of the -stack during this situation. Changing the format of this stack is hard. -Instead of storing the old CR3 value on the stack, this depends on the -*regular* register save/restore mechanism and then uses %r14 to keep CR3 -during the NMI. It is callee-saved and will not be clobbered by the C NMI -handlers that get called. 
- -[ PeterZ: ESPFIX optimization ] - -Based-on-code-from: Andy Lutomirski -Signed-off-by: Dave Hansen -Signed-off-by: Thomas Gleixner -Reviewed-by: Borislav Petkov -Reviewed-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Cc: linux-mm@kvack.org -Signed-off-by: Ingo Molnar -(cherry picked from commit 8a09317b895f073977346779df52f67c1056d81d) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 313dfb599cf7f8e53fc6f710d15bed60972dcd6f) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/entry/calling.h | 66 ++++++++++++++++++++++++++++++++++++++++ - arch/x86/entry/entry_64.S | 45 +++++++++++++++++++++++---- - arch/x86/entry/entry_64_compat.S | 24 ++++++++++++++- - 3 files changed, 128 insertions(+), 7 deletions(-) - -diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h -index 1895a685d3dd..dde6262be0a3 100644 ---- a/arch/x86/entry/calling.h -+++ b/arch/x86/entry/calling.h -@@ -1,5 +1,7 @@ - #include - #include -+#include -+#include - - /* - -@@ -186,6 +188,70 @@ For 32-bit we have the following conventions - kernel is built with - #endif - .endm - -+#ifdef CONFIG_PAGE_TABLE_ISOLATION -+ -+/* PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two halves: */ -+#define PTI_SWITCH_MASK (1< in kernel */ - SWAPGS - xorl %ebx, %ebx --1: ret -+ -+1: -+ SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14 -+ -+ ret - END(paranoid_entry) - - /* -@@ -1278,6 +1299,7 @@ ENTRY(paranoid_exit) - testl %ebx, %ebx /* swapgs needed? 
*/ - jnz .Lparanoid_exit_no_swapgs - TRACE_IRQS_IRETQ -+ RESTORE_CR3 save_reg=%r14 - SWAPGS_UNSAFE_STACK - jmp .Lparanoid_exit_restore - .Lparanoid_exit_no_swapgs: -@@ -1305,6 +1327,8 @@ ENTRY(error_entry) - * from user mode due to an IRET fault. - */ - SWAPGS -+ /* We have user CR3. Change to kernel CR3. */ -+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax - - .Lerror_entry_from_usermode_after_swapgs: - /* Put us onto the real thread stack. */ -@@ -1351,6 +1375,7 @@ ENTRY(error_entry) - * .Lgs_change's error handler with kernel gsbase. - */ - SWAPGS -+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax - jmp .Lerror_entry_done - - .Lbstep_iret: -@@ -1360,10 +1385,11 @@ ENTRY(error_entry) - - .Lerror_bad_iret: - /* -- * We came from an IRET to user mode, so we have user gsbase. -- * Switch to kernel gsbase: -+ * We came from an IRET to user mode, so we have user -+ * gsbase and CR3. Switch to kernel gsbase and CR3: - */ - SWAPGS -+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax - - /* - * Pretend that the exception came from user mode: set up pt_regs -@@ -1395,6 +1421,10 @@ END(error_exit) - /* - * Runs on exception stack. Xen PV does not go through this path at all, - * so we can use real assembly here. -+ * -+ * Registers: -+ * %r14: Used to save/restore the CR3 of the interrupted context -+ * when PAGE_TABLE_ISOLATION is in use. Do not clobber. - */ - ENTRY(nmi) - UNWIND_HINT_IRET_REGS -@@ -1458,6 +1488,7 @@ ENTRY(nmi) - - swapgs - cld -+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx - movq %rsp, %rdx - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - UNWIND_HINT_IRET_REGS base=%rdx offset=8 -@@ -1710,6 +1741,8 @@ end_repeat_nmi: - movq $-1, %rsi - call do_nmi - -+ RESTORE_CR3 save_reg=%r14 -+ - testl %ebx, %ebx /* swapgs needed? 
*/ - jnz nmi_restore - nmi_swapgs: -diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S -index 2270601b6218..43f856aeee67 100644 ---- a/arch/x86/entry/entry_64_compat.S -+++ b/arch/x86/entry/entry_64_compat.S -@@ -48,6 +48,10 @@ - ENTRY(entry_SYSENTER_compat) - /* Interrupts are off on entry. */ - SWAPGS -+ -+ /* We are about to clobber %rsp anyway, clobbering here is OK */ -+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp -+ - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - - /* -@@ -214,6 +218,12 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe) - pushq $0 /* pt_regs->r14 = 0 */ - pushq $0 /* pt_regs->r15 = 0 */ - -+ /* -+ * We just saved %rdi so it is safe to clobber. It is not -+ * preserved during the C calls inside TRACE_IRQS_OFF anyway. -+ */ -+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi -+ - /* - * User mode is traced as though IRQs are on, and SYSENTER - * turned them off. -@@ -255,10 +265,22 @@ sysret32_from_system_call: - * when the system call started, which is already known to user - * code. We zero R8-R10 to avoid info leaks. - */ -+ movq RSP-ORIG_RAX(%rsp), %rsp -+ -+ /* -+ * The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored -+ * on the process stack which is not mapped to userspace and -+ * not readable after we SWITCH_TO_USER_CR3. Delay the CR3 -+ * switch until after after the last reference to the process -+ * stack. -+ * -+ * %r8 is zeroed before the sysret, thus safe to clobber. 
-+ */ -+ SWITCH_TO_USER_CR3 scratch_reg=%r8 -+ - xorq %r8, %r8 - xorq %r9, %r9 - xorq %r10, %r10 -- movq RSP-ORIG_RAX(%rsp), %rsp - swapgs - sysretl - END(entry_SYSCALL_compat) --- -2.14.2 - diff --git a/patches/kernel/0192-x86-mm-pti-Add-infrastructure-for-page-table-isolati.patch b/patches/kernel/0192-x86-mm-pti-Add-infrastructure-for-page-table-isolati.patch deleted file mode 100644 index b84d5ac..0000000 --- a/patches/kernel/0192-x86-mm-pti-Add-infrastructure-for-page-table-isolati.patch +++ /dev/null @@ -1,311 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Mon, 4 Dec 2017 15:07:36 +0100 -Subject: [PATCH] x86/mm/pti: Add infrastructure for page table isolation -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Add the initial files for kernel page table isolation, with a minimal init -function and the boot time detection for this misfeature. - -Signed-off-by: Thomas Gleixner -Reviewed-by: Borislav Petkov -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Signed-off-by: Ingo Molnar -(backported from commit aa8c6248f8c75acfd610fe15d8cae23cf70d9d09) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 50da124a01ed7a59f9b2c9551f622c5a27d1caec) -Signed-off-by: Fabian Grünbichler ---- - Documentation/admin-guide/kernel-parameters.txt | 2 + - arch/x86/mm/Makefile | 7 ++- - arch/x86/entry/calling.h | 7 +++ - arch/x86/include/asm/pti.h | 14 +++++ - include/linux/pti.h | 11 ++++ - arch/x86/boot/compressed/pagetable.c | 3 + - arch/x86/mm/init.c | 2 + - arch/x86/mm/pti.c | 84 +++++++++++++++++++++++++ - init/main.c | 3 + - 9 files changed, 130 insertions(+), 3 deletions(-) - create mode 100644 arch/x86/include/asm/pti.h - create mode 100644 include/linux/pti.h - create mode 100644 arch/x86/mm/pti.c - -diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index 3510e255ef4c..e2a4608da5d2 100644 ---- a/Documentation/admin-guide/kernel-parameters.txt -+++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -2677,6 +2677,8 @@ - steal time is computed, but won't influence scheduler - behaviour - -+ nopti [X86-64] Disable kernel page table isolation -+ - nolapic [X86-32,APIC] Do not enable or use the local APIC. - - nolapic_timer [X86-32,APIC] Do not use the local APIC timer. 
-diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile -index 76f5399a8356..7aa68fc18abe 100644 ---- a/arch/x86/mm/Makefile -+++ b/arch/x86/mm/Makefile -@@ -35,7 +35,8 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology.o - obj-$(CONFIG_ACPI_NUMA) += srat.o - obj-$(CONFIG_NUMA_EMU) += numa_emulation.o - --obj-$(CONFIG_X86_INTEL_MPX) += mpx.o --obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o --obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o -+obj-$(CONFIG_X86_INTEL_MPX) += mpx.o -+obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o -+obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o -+obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o - -diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h -index dde6262be0a3..bb56f5346ae8 100644 ---- a/arch/x86/entry/calling.h -+++ b/arch/x86/entry/calling.h -@@ -204,18 +204,23 @@ For 32-bit we have the following conventions - kernel is built with - .endm - - .macro SWITCH_TO_KERNEL_CR3 scratch_reg:req -+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI - mov %cr3, \scratch_reg - ADJUST_KERNEL_CR3 \scratch_reg - mov \scratch_reg, %cr3 -+.Lend_\@: - .endm - - .macro SWITCH_TO_USER_CR3 scratch_reg:req -+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI - mov %cr3, \scratch_reg - ADJUST_USER_CR3 \scratch_reg - mov \scratch_reg, %cr3 -+.Lend_\@: - .endm - - .macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req -+ ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI - movq %cr3, \scratch_reg - movq \scratch_reg, \save_reg - /* -@@ -232,11 +237,13 @@ For 32-bit we have the following conventions - kernel is built with - .endm - - .macro RESTORE_CR3 save_reg:req -+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI - /* - * The CR3 write could be avoided when not changing its value, - * but would require a CR3 read *and* a scratch register. 
- */ - movq \save_reg, %cr3 -+.Lend_\@: - .endm - - #else /* CONFIG_PAGE_TABLE_ISOLATION=n: */ -diff --git a/arch/x86/include/asm/pti.h b/arch/x86/include/asm/pti.h -new file mode 100644 -index 000000000000..0b5ef05b2d2d ---- /dev/null -+++ b/arch/x86/include/asm/pti.h -@@ -0,0 +1,14 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#ifndef _ASM_X86_PTI_H -+#define _ASM_X86_PTI_H -+#ifndef __ASSEMBLY__ -+ -+#ifdef CONFIG_PAGE_TABLE_ISOLATION -+extern void pti_init(void); -+extern void pti_check_boottime_disable(void); -+#else -+static inline void pti_check_boottime_disable(void) { } -+#endif -+ -+#endif /* __ASSEMBLY__ */ -+#endif /* _ASM_X86_PTI_H */ -diff --git a/include/linux/pti.h b/include/linux/pti.h -new file mode 100644 -index 000000000000..0174883a935a ---- /dev/null -+++ b/include/linux/pti.h -@@ -0,0 +1,11 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#ifndef _INCLUDE_PTI_H -+#define _INCLUDE_PTI_H -+ -+#ifdef CONFIG_PAGE_TABLE_ISOLATION -+#include -+#else -+static inline void pti_init(void) { } -+#endif -+ -+#endif -diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c -index 28029be47fbb..21d8839cdaa7 100644 ---- a/arch/x86/boot/compressed/pagetable.c -+++ b/arch/x86/boot/compressed/pagetable.c -@@ -15,6 +15,9 @@ - #define __pa(x) ((unsigned long)(x)) - #define __va(x) ((void *)((unsigned long)(x))) - -+/* No PAGE_TABLE_ISOLATION support needed either: */ -+#undef CONFIG_PAGE_TABLE_ISOLATION -+ - #include "misc.h" - - /* These actually do the work of building the kernel identity maps. 
*/ -diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c -index 020223420308..af75069fb116 100644 ---- a/arch/x86/mm/init.c -+++ b/arch/x86/mm/init.c -@@ -20,6 +20,7 @@ - #include - #include - #include -+#include - - /* - * We need to define the tracepoints somewhere, and tlb.c -@@ -630,6 +631,7 @@ void __init init_mem_mapping(void) - { - unsigned long end; - -+ pti_check_boottime_disable(); - probe_page_size_mask(); - setup_pcid(); - -diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c -new file mode 100644 -index 000000000000..375f23a758bc ---- /dev/null -+++ b/arch/x86/mm/pti.c -@@ -0,0 +1,84 @@ -+/* -+ * Copyright(c) 2017 Intel Corporation. All rights reserved. -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of version 2 of the GNU General Public License as -+ * published by the Free Software Foundation. -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * General Public License for more details. 
-+ * -+ * This code is based in part on work published here: -+ * -+ * https://github.com/IAIK/KAISER -+ * -+ * The original work was written by and and signed off by for the Linux -+ * kernel by: -+ * -+ * Signed-off-by: Richard Fellner -+ * Signed-off-by: Moritz Lipp -+ * Signed-off-by: Daniel Gruss -+ * Signed-off-by: Michael Schwarz -+ * -+ * Major changes to the original code by: Dave Hansen -+ * Mostly rewritten by Thomas Gleixner and -+ * Andy Lutomirsky -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#undef pr_fmt -+#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt -+ -+static void __init pti_print_if_insecure(const char *reason) -+{ -+ if (boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) -+ pr_info("%s\n", reason); -+} -+ -+void __init pti_check_boottime_disable(void) -+{ -+ if (hypervisor_is_type(X86_HYPER_XEN_PV)) { -+ pti_print_if_insecure("disabled on XEN PV."); -+ return; -+ } -+ -+ if (cmdline_find_option_bool(boot_command_line, "nopti")) { -+ pti_print_if_insecure("disabled on command line."); -+ return; -+ } -+ -+ if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) -+ return; -+ -+ setup_force_cpu_cap(X86_FEATURE_PTI); -+} -+ -+/* -+ * Initialize kernel page table isolation -+ */ -+void __init pti_init(void) -+{ -+ if (!static_cpu_has(X86_FEATURE_PTI)) -+ return; -+ -+ pr_info("enabled\n"); -+} -diff --git a/init/main.c b/init/main.c -index de1c495da782..bb0896c24c08 100644 ---- a/init/main.c -+++ b/init/main.c -@@ -75,6 +75,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -506,6 +507,8 @@ static void __init mm_init(void) - ioremap_huge_init(); - /* Should be run before the first non-init thread is created */ - init_espfix_bsp(); -+ /* Should be run after espfix64 is set up. 
*/ -+ pti_init(); - } - - asmlinkage __visible void __init start_kernel(void) --- -2.14.2 - diff --git a/patches/kernel/0192-x86-mm-pti-Prepare-the-x86-entry-assembly-code-for-e.patch b/patches/kernel/0192-x86-mm-pti-Prepare-the-x86-entry-assembly-code-for-e.patch new file mode 100644 index 0000000..c16486b --- /dev/null +++ b/patches/kernel/0192-x86-mm-pti-Prepare-the-x86-entry-assembly-code-for-e.patch @@ -0,0 +1,381 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Dave Hansen +Date: Mon, 4 Dec 2017 15:07:35 +0100 +Subject: [PATCH] x86/mm/pti: Prepare the x86/entry assembly code for + entry/exit CR3 switching +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +PAGE_TABLE_ISOLATION needs to switch to a different CR3 value when it +enters the kernel and switch back when it exits. This essentially needs to +be done before leaving assembly code. + +This is extra challenging because the switching context is tricky: the +registers that can be clobbered can vary. It is also hard to store things +on the stack because there is an established ABI (ptregs) or the stack is +entirely unsafe to use. + +Establish a set of macros that allow changing to the user and kernel CR3 +values. + +Interactions with SWAPGS: + + Previous versions of the PAGE_TABLE_ISOLATION code relied on having + per-CPU scratch space to save/restore a register that can be used for the + CR3 MOV. The %GS register is used to index into our per-CPU space, so + SWAPGS *had* to be done before the CR3 switch. That scratch space is gone + now, but the semantic that SWAPGS must be done before the CR3 MOV is + retained. This is good to keep because it is not that hard to do and it + allows to do things like add per-CPU debugging information. + +What this does in the NMI code is worth pointing out. NMIs can interrupt +*any* context and they can also be nested with NMIs interrupting other +NMIs. 
The comments below ".Lnmi_from_kernel" explain the format of the +stack during this situation. Changing the format of this stack is hard. +Instead of storing the old CR3 value on the stack, this depends on the +*regular* register save/restore mechanism and then uses %r14 to keep CR3 +during the NMI. It is callee-saved and will not be clobbered by the C NMI +handlers that get called. + +[ PeterZ: ESPFIX optimization ] + +Based-on-code-from: Andy Lutomirski +Signed-off-by: Dave Hansen +Signed-off-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Reviewed-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Cc: linux-mm@kvack.org +Signed-off-by: Ingo Molnar +(cherry picked from commit 8a09317b895f073977346779df52f67c1056d81d) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 313dfb599cf7f8e53fc6f710d15bed60972dcd6f) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/entry/calling.h | 66 ++++++++++++++++++++++++++++++++++++++++ + arch/x86/entry/entry_64.S | 45 +++++++++++++++++++++++---- + arch/x86/entry/entry_64_compat.S | 24 ++++++++++++++- + 3 files changed, 128 insertions(+), 7 deletions(-) + +diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h +index 1895a685d3dd..dde6262be0a3 100644 +--- a/arch/x86/entry/calling.h ++++ b/arch/x86/entry/calling.h +@@ -1,5 +1,7 @@ + #include + #include ++#include ++#include + + /* + +@@ -186,6 +188,70 @@ For 32-bit we have the following conventions - kernel is built with + #endif + .endm + ++#ifdef CONFIG_PAGE_TABLE_ISOLATION ++ ++/* PAGE_TABLE_ISOLATION PGDs are 8k. 
Flip bit 12 to switch between the two halves: */ ++#define PTI_SWITCH_MASK (1< in kernel */ + SWAPGS + xorl %ebx, %ebx +-1: ret ++ ++1: ++ SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14 ++ ++ ret + END(paranoid_entry) + + /* +@@ -1278,6 +1299,7 @@ ENTRY(paranoid_exit) + testl %ebx, %ebx /* swapgs needed? */ + jnz .Lparanoid_exit_no_swapgs + TRACE_IRQS_IRETQ ++ RESTORE_CR3 save_reg=%r14 + SWAPGS_UNSAFE_STACK + jmp .Lparanoid_exit_restore + .Lparanoid_exit_no_swapgs: +@@ -1305,6 +1327,8 @@ ENTRY(error_entry) + * from user mode due to an IRET fault. + */ + SWAPGS ++ /* We have user CR3. Change to kernel CR3. */ ++ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax + + .Lerror_entry_from_usermode_after_swapgs: + /* Put us onto the real thread stack. */ +@@ -1351,6 +1375,7 @@ ENTRY(error_entry) + * .Lgs_change's error handler with kernel gsbase. + */ + SWAPGS ++ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax + jmp .Lerror_entry_done + + .Lbstep_iret: +@@ -1360,10 +1385,11 @@ ENTRY(error_entry) + + .Lerror_bad_iret: + /* +- * We came from an IRET to user mode, so we have user gsbase. +- * Switch to kernel gsbase: ++ * We came from an IRET to user mode, so we have user ++ * gsbase and CR3. Switch to kernel gsbase and CR3: + */ + SWAPGS ++ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax + + /* + * Pretend that the exception came from user mode: set up pt_regs +@@ -1395,6 +1421,10 @@ END(error_exit) + /* + * Runs on exception stack. Xen PV does not go through this path at all, + * so we can use real assembly here. ++ * ++ * Registers: ++ * %r14: Used to save/restore the CR3 of the interrupted context ++ * when PAGE_TABLE_ISOLATION is in use. Do not clobber. 
+ */ + ENTRY(nmi) + UNWIND_HINT_IRET_REGS +@@ -1458,6 +1488,7 @@ ENTRY(nmi) + + swapgs + cld ++ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx + movq %rsp, %rdx + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + UNWIND_HINT_IRET_REGS base=%rdx offset=8 +@@ -1710,6 +1741,8 @@ end_repeat_nmi: + movq $-1, %rsi + call do_nmi + ++ RESTORE_CR3 save_reg=%r14 ++ + testl %ebx, %ebx /* swapgs needed? */ + jnz nmi_restore + nmi_swapgs: +diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S +index 2270601b6218..43f856aeee67 100644 +--- a/arch/x86/entry/entry_64_compat.S ++++ b/arch/x86/entry/entry_64_compat.S +@@ -48,6 +48,10 @@ + ENTRY(entry_SYSENTER_compat) + /* Interrupts are off on entry. */ + SWAPGS ++ ++ /* We are about to clobber %rsp anyway, clobbering here is OK */ ++ SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp ++ + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + + /* +@@ -214,6 +218,12 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe) + pushq $0 /* pt_regs->r14 = 0 */ + pushq $0 /* pt_regs->r15 = 0 */ + ++ /* ++ * We just saved %rdi so it is safe to clobber. It is not ++ * preserved during the C calls inside TRACE_IRQS_OFF anyway. ++ */ ++ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi ++ + /* + * User mode is traced as though IRQs are on, and SYSENTER + * turned them off. +@@ -255,10 +265,22 @@ sysret32_from_system_call: + * when the system call started, which is already known to user + * code. We zero R8-R10 to avoid info leaks. + */ ++ movq RSP-ORIG_RAX(%rsp), %rsp ++ ++ /* ++ * The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored ++ * on the process stack which is not mapped to userspace and ++ * not readable after we SWITCH_TO_USER_CR3. Delay the CR3 ++ * switch until after after the last reference to the process ++ * stack. ++ * ++ * %r8 is zeroed before the sysret, thus safe to clobber. 
++ */ ++ SWITCH_TO_USER_CR3 scratch_reg=%r8 ++ + xorq %r8, %r8 + xorq %r9, %r9 + xorq %r10, %r10 +- movq RSP-ORIG_RAX(%rsp), %rsp + swapgs + sysretl + END(entry_SYSCALL_compat) +-- +2.14.2 + diff --git a/patches/kernel/0193-x86-mm-pti-Add-infrastructure-for-page-table-isolati.patch b/patches/kernel/0193-x86-mm-pti-Add-infrastructure-for-page-table-isolati.patch new file mode 100644 index 0000000..b84d5ac --- /dev/null +++ b/patches/kernel/0193-x86-mm-pti-Add-infrastructure-for-page-table-isolati.patch @@ -0,0 +1,311 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Mon, 4 Dec 2017 15:07:36 +0100 +Subject: [PATCH] x86/mm/pti: Add infrastructure for page table isolation +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Add the initial files for kernel page table isolation, with a minimal init +function and the boot time detection for this misfeature. + +Signed-off-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Signed-off-by: Ingo Molnar +(backported from commit aa8c6248f8c75acfd610fe15d8cae23cf70d9d09) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 50da124a01ed7a59f9b2c9551f622c5a27d1caec) +Signed-off-by: Fabian Grünbichler +--- + Documentation/admin-guide/kernel-parameters.txt | 2 + + arch/x86/mm/Makefile | 7 ++- + arch/x86/entry/calling.h | 7 +++ + arch/x86/include/asm/pti.h | 14 +++++ + include/linux/pti.h | 11 ++++ + arch/x86/boot/compressed/pagetable.c | 3 + + arch/x86/mm/init.c | 2 + + arch/x86/mm/pti.c | 84 +++++++++++++++++++++++++ + init/main.c | 3 + + 9 files changed, 130 insertions(+), 3 deletions(-) + create mode 100644 arch/x86/include/asm/pti.h + create mode 100644 include/linux/pti.h + create mode 100644 arch/x86/mm/pti.c + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index 3510e255ef4c..e2a4608da5d2 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -2677,6 +2677,8 @@ + steal time is computed, but won't influence scheduler + behaviour + ++ nopti [X86-64] Disable kernel page table isolation ++ + nolapic [X86-32,APIC] Do not enable or use the local APIC. + + nolapic_timer [X86-32,APIC] Do not use the local APIC timer. 
+diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile +index 76f5399a8356..7aa68fc18abe 100644 +--- a/arch/x86/mm/Makefile ++++ b/arch/x86/mm/Makefile +@@ -35,7 +35,8 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology.o + obj-$(CONFIG_ACPI_NUMA) += srat.o + obj-$(CONFIG_NUMA_EMU) += numa_emulation.o + +-obj-$(CONFIG_X86_INTEL_MPX) += mpx.o +-obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o +-obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o ++obj-$(CONFIG_X86_INTEL_MPX) += mpx.o ++obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o ++obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o ++obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o + +diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h +index dde6262be0a3..bb56f5346ae8 100644 +--- a/arch/x86/entry/calling.h ++++ b/arch/x86/entry/calling.h +@@ -204,18 +204,23 @@ For 32-bit we have the following conventions - kernel is built with + .endm + + .macro SWITCH_TO_KERNEL_CR3 scratch_reg:req ++ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI + mov %cr3, \scratch_reg + ADJUST_KERNEL_CR3 \scratch_reg + mov \scratch_reg, %cr3 ++.Lend_\@: + .endm + + .macro SWITCH_TO_USER_CR3 scratch_reg:req ++ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI + mov %cr3, \scratch_reg + ADJUST_USER_CR3 \scratch_reg + mov \scratch_reg, %cr3 ++.Lend_\@: + .endm + + .macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req ++ ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI + movq %cr3, \scratch_reg + movq \scratch_reg, \save_reg + /* +@@ -232,11 +237,13 @@ For 32-bit we have the following conventions - kernel is built with + .endm + + .macro RESTORE_CR3 save_reg:req ++ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI + /* + * The CR3 write could be avoided when not changing its value, + * but would require a CR3 read *and* a scratch register. 
+ */ + movq \save_reg, %cr3 ++.Lend_\@: + .endm + + #else /* CONFIG_PAGE_TABLE_ISOLATION=n: */ +diff --git a/arch/x86/include/asm/pti.h b/arch/x86/include/asm/pti.h +new file mode 100644 +index 000000000000..0b5ef05b2d2d +--- /dev/null ++++ b/arch/x86/include/asm/pti.h +@@ -0,0 +1,14 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifndef _ASM_X86_PTI_H ++#define _ASM_X86_PTI_H ++#ifndef __ASSEMBLY__ ++ ++#ifdef CONFIG_PAGE_TABLE_ISOLATION ++extern void pti_init(void); ++extern void pti_check_boottime_disable(void); ++#else ++static inline void pti_check_boottime_disable(void) { } ++#endif ++ ++#endif /* __ASSEMBLY__ */ ++#endif /* _ASM_X86_PTI_H */ +diff --git a/include/linux/pti.h b/include/linux/pti.h +new file mode 100644 +index 000000000000..0174883a935a +--- /dev/null ++++ b/include/linux/pti.h +@@ -0,0 +1,11 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifndef _INCLUDE_PTI_H ++#define _INCLUDE_PTI_H ++ ++#ifdef CONFIG_PAGE_TABLE_ISOLATION ++#include ++#else ++static inline void pti_init(void) { } ++#endif ++ ++#endif +diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c +index 28029be47fbb..21d8839cdaa7 100644 +--- a/arch/x86/boot/compressed/pagetable.c ++++ b/arch/x86/boot/compressed/pagetable.c +@@ -15,6 +15,9 @@ + #define __pa(x) ((unsigned long)(x)) + #define __va(x) ((void *)((unsigned long)(x))) + ++/* No PAGE_TABLE_ISOLATION support needed either: */ ++#undef CONFIG_PAGE_TABLE_ISOLATION ++ + #include "misc.h" + + /* These actually do the work of building the kernel identity maps. 
*/ +diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c +index 020223420308..af75069fb116 100644 +--- a/arch/x86/mm/init.c ++++ b/arch/x86/mm/init.c +@@ -20,6 +20,7 @@ + #include + #include + #include ++#include + + /* + * We need to define the tracepoints somewhere, and tlb.c +@@ -630,6 +631,7 @@ void __init init_mem_mapping(void) + { + unsigned long end; + ++ pti_check_boottime_disable(); + probe_page_size_mask(); + setup_pcid(); + +diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c +new file mode 100644 +index 000000000000..375f23a758bc +--- /dev/null ++++ b/arch/x86/mm/pti.c +@@ -0,0 +1,84 @@ ++/* ++ * Copyright(c) 2017 Intel Corporation. All rights reserved. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of version 2 of the GNU General Public License as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, but ++ * WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. 
++ * ++ * This code is based in part on work published here: ++ * ++ * https://github.com/IAIK/KAISER ++ * ++ * The original work was written by and and signed off by for the Linux ++ * kernel by: ++ * ++ * Signed-off-by: Richard Fellner ++ * Signed-off-by: Moritz Lipp ++ * Signed-off-by: Daniel Gruss ++ * Signed-off-by: Michael Schwarz ++ * ++ * Major changes to the original code by: Dave Hansen ++ * Mostly rewritten by Thomas Gleixner and ++ * Andy Lutomirsky ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#undef pr_fmt ++#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt ++ ++static void __init pti_print_if_insecure(const char *reason) ++{ ++ if (boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) ++ pr_info("%s\n", reason); ++} ++ ++void __init pti_check_boottime_disable(void) ++{ ++ if (hypervisor_is_type(X86_HYPER_XEN_PV)) { ++ pti_print_if_insecure("disabled on XEN PV."); ++ return; ++ } ++ ++ if (cmdline_find_option_bool(boot_command_line, "nopti")) { ++ pti_print_if_insecure("disabled on command line."); ++ return; ++ } ++ ++ if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) ++ return; ++ ++ setup_force_cpu_cap(X86_FEATURE_PTI); ++} ++ ++/* ++ * Initialize kernel page table isolation ++ */ ++void __init pti_init(void) ++{ ++ if (!static_cpu_has(X86_FEATURE_PTI)) ++ return; ++ ++ pr_info("enabled\n"); ++} +diff --git a/init/main.c b/init/main.c +index de1c495da782..bb0896c24c08 100644 +--- a/init/main.c ++++ b/init/main.c +@@ -75,6 +75,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -506,6 +507,8 @@ static void __init mm_init(void) + ioremap_huge_init(); + /* Should be run before the first non-init thread is created */ + init_espfix_bsp(); ++ /* Should be run after espfix64 is set up. 
*/ ++ pti_init(); + } + + asmlinkage __visible void __init start_kernel(void) +-- +2.14.2 + diff --git a/patches/kernel/0193-x86-pti-Add-the-pti-cmdline-option-and-documentation.patch b/patches/kernel/0193-x86-pti-Add-the-pti-cmdline-option-and-documentation.patch deleted file mode 100644 index e5b25e7..0000000 --- a/patches/kernel/0193-x86-pti-Add-the-pti-cmdline-option-and-documentation.patch +++ /dev/null @@ -1,122 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Borislav Petkov -Date: Tue, 12 Dec 2017 14:39:52 +0100 -Subject: [PATCH] x86/pti: Add the pti= cmdline option and documentation -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Keep the "nopti" optional for traditional reasons. - -[ tglx: Don't allow force on when running on XEN PV and made 'on' - printout conditional ] - -Requested-by: Linus Torvalds -Signed-off-by: Borislav Petkov -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Andy Lutomirsky -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Link: https://lkml.kernel.org/r/20171212133952.10177-1-bp@alien8.de -Signed-off-by: Ingo Molnar -(cherry picked from commit 41f4c20b57a4890ea7f56ff8717cc83fefb8d537) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 96d3670fa8f88989fb7c0be5172a1378143f3296) -Signed-off-by: Fabian Grünbichler ---- - Documentation/admin-guide/kernel-parameters.txt | 6 ++++++ - arch/x86/mm/pti.c | 26 ++++++++++++++++++++++++- - 2 files changed, 31 insertions(+), 1 deletion(-) - -diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index e2a4608da5d2..b4d2edf316db 100644 ---- a/Documentation/admin-guide/kernel-parameters.txt -+++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -3247,6 +3247,12 @@ - pt. [PARIDE] - See Documentation/blockdev/paride.txt. - -+ pti= [X86_64] -+ Control user/kernel address space isolation: -+ on - enable -+ off - disable -+ auto - default setting -+ - pty.legacy_count= - [KNL] Number of legacy pty's. Overwrites compiled-in - default number. 
-diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c -index 375f23a758bc..a13f6b109865 100644 ---- a/arch/x86/mm/pti.c -+++ b/arch/x86/mm/pti.c -@@ -54,21 +54,45 @@ static void __init pti_print_if_insecure(const char *reason) - pr_info("%s\n", reason); - } - -+static void __init pti_print_if_secure(const char *reason) -+{ -+ if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) -+ pr_info("%s\n", reason); -+} -+ - void __init pti_check_boottime_disable(void) - { -+ char arg[5]; -+ int ret; -+ - if (hypervisor_is_type(X86_HYPER_XEN_PV)) { - pti_print_if_insecure("disabled on XEN PV."); - return; - } - -+ ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg)); -+ if (ret > 0) { -+ if (ret == 3 && !strncmp(arg, "off", 3)) { -+ pti_print_if_insecure("disabled on command line."); -+ return; -+ } -+ if (ret == 2 && !strncmp(arg, "on", 2)) { -+ pti_print_if_secure("force enabled on command line."); -+ goto enable; -+ } -+ if (ret == 4 && !strncmp(arg, "auto", 4)) -+ goto autosel; -+ } -+ - if (cmdline_find_option_bool(boot_command_line, "nopti")) { - pti_print_if_insecure("disabled on command line."); - return; - } - -+autosel: - if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) - return; -- -+enable: - setup_force_cpu_cap(X86_FEATURE_PTI); - } - --- -2.14.2 - diff --git a/patches/kernel/0194-x86-mm-pti-Add-mapping-helper-functions.patch b/patches/kernel/0194-x86-mm-pti-Add-mapping-helper-functions.patch deleted file mode 100644 index ee78a97..0000000 --- a/patches/kernel/0194-x86-mm-pti-Add-mapping-helper-functions.patch +++ /dev/null @@ -1,235 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Dave Hansen -Date: Mon, 4 Dec 2017 15:07:37 +0100 -Subject: [PATCH] x86/mm/pti: Add mapping helper functions -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Add the pagetable helper functions do manage the separate user space page -tables. 
- -[ tglx: Split out from the big combo kaiser patch. Folded Andys - simplification and made it out of line as Boris suggested ] - -Signed-off-by: Dave Hansen -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Cc: linux-kernel@vger.kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit 61e9b3671007a5da8127955a1a3bda7e0d5f42e8) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit fb45c59197f3134db6e223bb4fec0529774c07e1) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/pgtable.h | 6 ++- - arch/x86/include/asm/pgtable_64.h | 92 +++++++++++++++++++++++++++++++++++++++ - arch/x86/mm/pti.c | 41 +++++++++++++++++ - 3 files changed, 138 insertions(+), 1 deletion(-) - -diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h -index bb8e9ea7deb4..abbb47c75467 100644 ---- a/arch/x86/include/asm/pgtable.h -+++ b/arch/x86/include/asm/pgtable.h -@@ -894,7 +894,11 @@ static inline int pgd_none(pgd_t pgd) - * pgd_offset() returns a (pgd_t *) - * pgd_index() is used get the offset into the pgd page's array of pgd_t's; - */ --#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address))) -+#define pgd_offset_pgd(pgd, address) (pgd + pgd_index((address))) -+/* -+ * a shortcut to get a pgd_t in a given mm -+ */ -+#define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address)) - /* - * a shortcut which implies the use of the kernel's pgd, instead - * of a process's -diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h -index 2160c1fee920..1ac15b03cf30 100644 ---- a/arch/x86/include/asm/pgtable_64.h -+++ 
b/arch/x86/include/asm/pgtable_64.h -@@ -130,9 +130,97 @@ static inline pud_t native_pudp_get_and_clear(pud_t *xp) - #endif - } - -+#ifdef CONFIG_PAGE_TABLE_ISOLATION -+/* -+ * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages -+ * (8k-aligned and 8k in size). The kernel one is at the beginning 4k and -+ * the user one is in the last 4k. To switch between them, you -+ * just need to flip the 12th bit in their addresses. -+ */ -+#define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT -+ -+/* -+ * This generates better code than the inline assembly in -+ * __set_bit(). -+ */ -+static inline void *ptr_set_bit(void *ptr, int bit) -+{ -+ unsigned long __ptr = (unsigned long)ptr; -+ -+ __ptr |= BIT(bit); -+ return (void *)__ptr; -+} -+static inline void *ptr_clear_bit(void *ptr, int bit) -+{ -+ unsigned long __ptr = (unsigned long)ptr; -+ -+ __ptr &= ~BIT(bit); -+ return (void *)__ptr; -+} -+ -+static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp) -+{ -+ return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT); -+} -+ -+static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp) -+{ -+ return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT); -+} -+ -+static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp) -+{ -+ return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); -+} -+ -+static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp) -+{ -+ return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); -+} -+#endif /* CONFIG_PAGE_TABLE_ISOLATION */ -+ -+/* -+ * Page table pages are page-aligned. The lower half of the top -+ * level is used for userspace and the top half for the kernel. -+ * -+ * Returns true for parts of the PGD that map userspace and -+ * false for the parts that map the kernel. 
-+ */ -+static inline bool pgdp_maps_userspace(void *__ptr) -+{ -+ unsigned long ptr = (unsigned long)__ptr; -+ -+ return (ptr & ~PAGE_MASK) < (PAGE_SIZE / 2); -+} -+ -+#ifdef CONFIG_PAGE_TABLE_ISOLATION -+pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd); -+ -+/* -+ * Take a PGD location (pgdp) and a pgd value that needs to be set there. -+ * Populates the user and returns the resulting PGD that must be set in -+ * the kernel copy of the page tables. -+ */ -+static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) -+{ -+ if (!static_cpu_has(X86_FEATURE_PTI)) -+ return pgd; -+ return __pti_set_user_pgd(pgdp, pgd); -+} -+#else -+static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) -+{ -+ return pgd; -+} -+#endif -+ - static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) - { -+#if defined(CONFIG_PAGE_TABLE_ISOLATION) && !defined(CONFIG_X86_5LEVEL) -+ p4dp->pgd = pti_set_user_pgd(&p4dp->pgd, p4d.pgd); -+#else - *p4dp = p4d; -+#endif - } - - static inline void native_p4d_clear(p4d_t *p4d) -@@ -146,7 +234,11 @@ static inline void native_p4d_clear(p4d_t *p4d) - - static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) - { -+#ifdef CONFIG_PAGE_TABLE_ISOLATION -+ *pgdp = pti_set_user_pgd(pgdp, pgd); -+#else - *pgdp = pgd; -+#endif - } - - static inline void native_pgd_clear(pgd_t *pgd) -diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c -index a13f6b109865..69a983365392 100644 ---- a/arch/x86/mm/pti.c -+++ b/arch/x86/mm/pti.c -@@ -96,6 +96,47 @@ void __init pti_check_boottime_disable(void) - setup_force_cpu_cap(X86_FEATURE_PTI); - } - -+pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) -+{ -+ /* -+ * Changes to the high (kernel) portion of the kernelmode page -+ * tables are not automatically propagated to the usermode tables. -+ * -+ * Users should keep in mind that, unlike the kernelmode tables, -+ * there is no vmalloc_fault equivalent for the usermode tables. 
-+ * Top-level entries added to init_mm's usermode pgd after boot -+ * will not be automatically propagated to other mms. -+ */ -+ if (!pgdp_maps_userspace(pgdp)) -+ return pgd; -+ -+ /* -+ * The user page tables get the full PGD, accessible from -+ * userspace: -+ */ -+ kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd; -+ -+ /* -+ * If this is normal user memory, make it NX in the kernel -+ * pagetables so that, if we somehow screw up and return to -+ * usermode with the kernel CR3 loaded, we'll get a page fault -+ * instead of allowing user code to execute with the wrong CR3. -+ * -+ * As exceptions, we don't set NX if: -+ * - _PAGE_USER is not set. This could be an executable -+ * EFI runtime mapping or something similar, and the kernel -+ * may execute from it -+ * - we don't have NX support -+ * - we're clearing the PGD (i.e. the new pgd is not present). -+ */ -+ if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) && -+ (__supported_pte_mask & _PAGE_NX)) -+ pgd.pgd |= _PAGE_NX; -+ -+ /* return the copy of the PGD we want the kernel to use: */ -+ return pgd; -+} -+ - /* - * Initialize kernel page table isolation - */ --- -2.14.2 - diff --git a/patches/kernel/0194-x86-pti-Add-the-pti-cmdline-option-and-documentation.patch b/patches/kernel/0194-x86-pti-Add-the-pti-cmdline-option-and-documentation.patch new file mode 100644 index 0000000..e5b25e7 --- /dev/null +++ b/patches/kernel/0194-x86-pti-Add-the-pti-cmdline-option-and-documentation.patch @@ -0,0 +1,122 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Borislav Petkov +Date: Tue, 12 Dec 2017 14:39:52 +0100 +Subject: [PATCH] x86/pti: Add the pti= cmdline option and documentation +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Keep the "nopti" optional for traditional reasons. 
+ +[ tglx: Don't allow force on when running on XEN PV and made 'on' + printout conditional ] + +Requested-by: Linus Torvalds +Signed-off-by: Borislav Petkov +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Andy Lutomirsky +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171212133952.10177-1-bp@alien8.de +Signed-off-by: Ingo Molnar +(cherry picked from commit 41f4c20b57a4890ea7f56ff8717cc83fefb8d537) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 96d3670fa8f88989fb7c0be5172a1378143f3296) +Signed-off-by: Fabian Grünbichler +--- + Documentation/admin-guide/kernel-parameters.txt | 6 ++++++ + arch/x86/mm/pti.c | 26 ++++++++++++++++++++++++- + 2 files changed, 31 insertions(+), 1 deletion(-) + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index e2a4608da5d2..b4d2edf316db 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -3247,6 +3247,12 @@ + pt. [PARIDE] + See Documentation/blockdev/paride.txt. + ++ pti= [X86_64] ++ Control user/kernel address space isolation: ++ on - enable ++ off - disable ++ auto - default setting ++ + pty.legacy_count= + [KNL] Number of legacy pty's. Overwrites compiled-in + default number. 
+diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c +index 375f23a758bc..a13f6b109865 100644 +--- a/arch/x86/mm/pti.c ++++ b/arch/x86/mm/pti.c +@@ -54,21 +54,45 @@ static void __init pti_print_if_insecure(const char *reason) + pr_info("%s\n", reason); + } + ++static void __init pti_print_if_secure(const char *reason) ++{ ++ if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) ++ pr_info("%s\n", reason); ++} ++ + void __init pti_check_boottime_disable(void) + { ++ char arg[5]; ++ int ret; ++ + if (hypervisor_is_type(X86_HYPER_XEN_PV)) { + pti_print_if_insecure("disabled on XEN PV."); + return; + } + ++ ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg)); ++ if (ret > 0) { ++ if (ret == 3 && !strncmp(arg, "off", 3)) { ++ pti_print_if_insecure("disabled on command line."); ++ return; ++ } ++ if (ret == 2 && !strncmp(arg, "on", 2)) { ++ pti_print_if_secure("force enabled on command line."); ++ goto enable; ++ } ++ if (ret == 4 && !strncmp(arg, "auto", 4)) ++ goto autosel; ++ } ++ + if (cmdline_find_option_bool(boot_command_line, "nopti")) { + pti_print_if_insecure("disabled on command line."); + return; + } + ++autosel: + if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) + return; +- ++enable: + setup_force_cpu_cap(X86_FEATURE_PTI); + } + +-- +2.14.2 + diff --git a/patches/kernel/0195-x86-mm-pti-Add-mapping-helper-functions.patch b/patches/kernel/0195-x86-mm-pti-Add-mapping-helper-functions.patch new file mode 100644 index 0000000..ee78a97 --- /dev/null +++ b/patches/kernel/0195-x86-mm-pti-Add-mapping-helper-functions.patch @@ -0,0 +1,235 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Dave Hansen +Date: Mon, 4 Dec 2017 15:07:37 +0100 +Subject: [PATCH] x86/mm/pti: Add mapping helper functions +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Add the pagetable helper functions do manage the separate user space page +tables. 
+ +[ tglx: Split out from the big combo kaiser patch. Folded Andys + simplification and made it out of line as Boris suggested ] + +Signed-off-by: Dave Hansen +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit 61e9b3671007a5da8127955a1a3bda7e0d5f42e8) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit fb45c59197f3134db6e223bb4fec0529774c07e1) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/pgtable.h | 6 ++- + arch/x86/include/asm/pgtable_64.h | 92 +++++++++++++++++++++++++++++++++++++++ + arch/x86/mm/pti.c | 41 +++++++++++++++++ + 3 files changed, 138 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h +index bb8e9ea7deb4..abbb47c75467 100644 +--- a/arch/x86/include/asm/pgtable.h ++++ b/arch/x86/include/asm/pgtable.h +@@ -894,7 +894,11 @@ static inline int pgd_none(pgd_t pgd) + * pgd_offset() returns a (pgd_t *) + * pgd_index() is used get the offset into the pgd page's array of pgd_t's; + */ +-#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address))) ++#define pgd_offset_pgd(pgd, address) (pgd + pgd_index((address))) ++/* ++ * a shortcut to get a pgd_t in a given mm ++ */ ++#define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address)) + /* + * a shortcut which implies the use of the kernel's pgd, instead + * of a process's +diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h +index 2160c1fee920..1ac15b03cf30 100644 +--- a/arch/x86/include/asm/pgtable_64.h ++++ 
b/arch/x86/include/asm/pgtable_64.h +@@ -130,9 +130,97 @@ static inline pud_t native_pudp_get_and_clear(pud_t *xp) + #endif + } + ++#ifdef CONFIG_PAGE_TABLE_ISOLATION ++/* ++ * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages ++ * (8k-aligned and 8k in size). The kernel one is at the beginning 4k and ++ * the user one is in the last 4k. To switch between them, you ++ * just need to flip the 12th bit in their addresses. ++ */ ++#define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT ++ ++/* ++ * This generates better code than the inline assembly in ++ * __set_bit(). ++ */ ++static inline void *ptr_set_bit(void *ptr, int bit) ++{ ++ unsigned long __ptr = (unsigned long)ptr; ++ ++ __ptr |= BIT(bit); ++ return (void *)__ptr; ++} ++static inline void *ptr_clear_bit(void *ptr, int bit) ++{ ++ unsigned long __ptr = (unsigned long)ptr; ++ ++ __ptr &= ~BIT(bit); ++ return (void *)__ptr; ++} ++ ++static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp) ++{ ++ return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT); ++} ++ ++static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp) ++{ ++ return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT); ++} ++ ++static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp) ++{ ++ return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); ++} ++ ++static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp) ++{ ++ return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); ++} ++#endif /* CONFIG_PAGE_TABLE_ISOLATION */ ++ ++/* ++ * Page table pages are page-aligned. The lower half of the top ++ * level is used for userspace and the top half for the kernel. ++ * ++ * Returns true for parts of the PGD that map userspace and ++ * false for the parts that map the kernel. 
++ */ ++static inline bool pgdp_maps_userspace(void *__ptr) ++{ ++ unsigned long ptr = (unsigned long)__ptr; ++ ++ return (ptr & ~PAGE_MASK) < (PAGE_SIZE / 2); ++} ++ ++#ifdef CONFIG_PAGE_TABLE_ISOLATION ++pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd); ++ ++/* ++ * Take a PGD location (pgdp) and a pgd value that needs to be set there. ++ * Populates the user and returns the resulting PGD that must be set in ++ * the kernel copy of the page tables. ++ */ ++static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) ++{ ++ if (!static_cpu_has(X86_FEATURE_PTI)) ++ return pgd; ++ return __pti_set_user_pgd(pgdp, pgd); ++} ++#else ++static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) ++{ ++ return pgd; ++} ++#endif ++ + static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) + { ++#if defined(CONFIG_PAGE_TABLE_ISOLATION) && !defined(CONFIG_X86_5LEVEL) ++ p4dp->pgd = pti_set_user_pgd(&p4dp->pgd, p4d.pgd); ++#else + *p4dp = p4d; ++#endif + } + + static inline void native_p4d_clear(p4d_t *p4d) +@@ -146,7 +234,11 @@ static inline void native_p4d_clear(p4d_t *p4d) + + static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) + { ++#ifdef CONFIG_PAGE_TABLE_ISOLATION ++ *pgdp = pti_set_user_pgd(pgdp, pgd); ++#else + *pgdp = pgd; ++#endif + } + + static inline void native_pgd_clear(pgd_t *pgd) +diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c +index a13f6b109865..69a983365392 100644 +--- a/arch/x86/mm/pti.c ++++ b/arch/x86/mm/pti.c +@@ -96,6 +96,47 @@ void __init pti_check_boottime_disable(void) + setup_force_cpu_cap(X86_FEATURE_PTI); + } + ++pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) ++{ ++ /* ++ * Changes to the high (kernel) portion of the kernelmode page ++ * tables are not automatically propagated to the usermode tables. ++ * ++ * Users should keep in mind that, unlike the kernelmode tables, ++ * there is no vmalloc_fault equivalent for the usermode tables. 
++ * Top-level entries added to init_mm's usermode pgd after boot ++ * will not be automatically propagated to other mms. ++ */ ++ if (!pgdp_maps_userspace(pgdp)) ++ return pgd; ++ ++ /* ++ * The user page tables get the full PGD, accessible from ++ * userspace: ++ */ ++ kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd; ++ ++ /* ++ * If this is normal user memory, make it NX in the kernel ++ * pagetables so that, if we somehow screw up and return to ++ * usermode with the kernel CR3 loaded, we'll get a page fault ++ * instead of allowing user code to execute with the wrong CR3. ++ * ++ * As exceptions, we don't set NX if: ++ * - _PAGE_USER is not set. This could be an executable ++ * EFI runtime mapping or something similar, and the kernel ++ * may execute from it ++ * - we don't have NX support ++ * - we're clearing the PGD (i.e. the new pgd is not present). ++ */ ++ if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) && ++ (__supported_pte_mask & _PAGE_NX)) ++ pgd.pgd |= _PAGE_NX; ++ ++ /* return the copy of the PGD we want the kernel to use: */ ++ return pgd; ++} ++ + /* + * Initialize kernel page table isolation + */ +-- +2.14.2 + diff --git a/patches/kernel/0195-x86-mm-pti-Allow-NX-poison-to-be-set-in-p4d-pgd.patch b/patches/kernel/0195-x86-mm-pti-Allow-NX-poison-to-be-set-in-p4d-pgd.patch deleted file mode 100644 index 7060437..0000000 --- a/patches/kernel/0195-x86-mm-pti-Allow-NX-poison-to-be-set-in-p4d-pgd.patch +++ /dev/null @@ -1,84 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Dave Hansen -Date: Mon, 4 Dec 2017 15:07:38 +0100 -Subject: [PATCH] x86/mm/pti: Allow NX poison to be set in p4d/pgd -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -With PAGE_TABLE_ISOLATION the user portion of the kernel page tables is -poisoned with the NX bit so if the entry code exits with the kernel page -tables selected in CR3, userspace crashes. 
- -But doing so trips the p4d/pgd_bad() checks. Make sure it does not do -that. - -Signed-off-by: Dave Hansen -Signed-off-by: Thomas Gleixner -Reviewed-by: Borislav Petkov -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Cc: linux-kernel@vger.kernel.org -Signed-off-by: Ingo Molnar -(cherry picked from commit 1c4de1ff4fe50453b968579ee86fac3da80dd783) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 889a8bd0e57e39e7ce337e87c55fa59c09644d4e) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/pgtable.h | 14 ++++++++++++-- - 1 file changed, 12 insertions(+), 2 deletions(-) - -diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h -index abbb47c75467..3ef8415b2358 100644 ---- a/arch/x86/include/asm/pgtable.h -+++ b/arch/x86/include/asm/pgtable.h -@@ -831,7 +831,12 @@ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) - - static inline int p4d_bad(p4d_t p4d) - { -- return (p4d_flags(p4d) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0; -+ unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER; -+ -+ if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) -+ ignore_flags |= _PAGE_NX; -+ -+ return (p4d_flags(p4d) & ~ignore_flags) != 0; - } - #endif /* CONFIG_PGTABLE_LEVELS > 3 */ - -@@ -865,7 +870,12 @@ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) - - static inline int pgd_bad(pgd_t pgd) - { -- return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE; -+ unsigned long ignore_flags = _PAGE_USER; -+ -+ if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) -+ ignore_flags |= _PAGE_NX; -+ -+ return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; - } - - static 
inline int pgd_none(pgd_t pgd) --- -2.14.2 - diff --git a/patches/kernel/0196-x86-mm-pti-Allocate-a-separate-user-PGD.patch b/patches/kernel/0196-x86-mm-pti-Allocate-a-separate-user-PGD.patch deleted file mode 100644 index f2a2ce0..0000000 --- a/patches/kernel/0196-x86-mm-pti-Allocate-a-separate-user-PGD.patch +++ /dev/null @@ -1,199 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Dave Hansen -Date: Mon, 4 Dec 2017 15:07:39 +0100 -Subject: [PATCH] x86/mm/pti: Allocate a separate user PGD -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Kernel page table isolation requires to have two PGDs. One for the kernel, -which contains the full kernel mapping plus the user space mapping and one -for user space which contains the user space mappings and the minimal set -of kernel mappings which are required by the architecture to be able to -transition from and to user space. - -Add the necessary preliminaries. - -[ tglx: Split out from the big kaiser dump. EFI fixup from Kirill ] - -Signed-off-by: Dave Hansen -Signed-off-by: Thomas Gleixner -Reviewed-by: Borislav Petkov -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Signed-off-by: Ingo Molnar -(backported from commit d9e9a6418065bb376e5de8d93ce346939b9a37a6) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 0bd4b34e330d8bedf90c0497dfcef2e2286c4367) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/pgalloc.h | 11 +++++++++++ - arch/x86/mm/pgtable.c | 5 +++-- - arch/x86/platform/efi/efi_64.c | 5 ++++- - arch/x86/kernel/head_64.S | 30 +++++++++++++++++++++++++++--- - 4 files changed, 45 insertions(+), 6 deletions(-) - -diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h -index b2d0cd8288aa..d65b0dee7448 100644 ---- a/arch/x86/include/asm/pgalloc.h -+++ b/arch/x86/include/asm/pgalloc.h -@@ -29,6 +29,17 @@ static inline void paravirt_release_p4d(unsigned long pfn) {} - */ - extern gfp_t __userpte_alloc_gfp; - -+#ifdef CONFIG_PAGE_TABLE_ISOLATION -+/* -+ * Instead of one PGD, we acquire two PGDs. Being order-1, it is -+ * both 8k in size and 8k-aligned. That lets us just flip bit 12 -+ * in a pointer to swap between the two 4k halves. -+ */ -+#define PGD_ALLOCATION_ORDER 1 -+#else -+#define PGD_ALLOCATION_ORDER 0 -+#endif -+ - /* - * Allocate and free page tables. 
- */ -diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c -index 942391b5b639..90d1d8f49cf6 100644 ---- a/arch/x86/mm/pgtable.c -+++ b/arch/x86/mm/pgtable.c -@@ -354,14 +354,15 @@ static inline void _pgd_free(pgd_t *pgd) - kmem_cache_free(pgd_cache, pgd); - } - #else -+ - static inline pgd_t *_pgd_alloc(void) - { -- return (pgd_t *)__get_free_page(PGALLOC_GFP); -+ return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER); - } - - static inline void _pgd_free(pgd_t *pgd) - { -- free_page((unsigned long)pgd); -+ free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); - } - #endif /* CONFIG_X86_PAE */ - -diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c -index 9bf72f5bfedb..b104224d3d6c 100644 ---- a/arch/x86/platform/efi/efi_64.c -+++ b/arch/x86/platform/efi/efi_64.c -@@ -194,6 +194,9 @@ static pgd_t *efi_pgd; - * because we want to avoid inserting EFI region mappings (EFI_VA_END - * to EFI_VA_START) into the standard kernel page tables. Everything - * else can be shared, see efi_sync_low_kernel_mappings(). -+ * -+ * We don't want the pgd on the pgd_list and cannot use pgd_alloc() for the -+ * allocation. - */ - int __init efi_alloc_page_tables(void) - { -@@ -206,7 +209,7 @@ int __init efi_alloc_page_tables(void) - return 0; - - gfp_mask = GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO; -- efi_pgd = (pgd_t *)__get_free_page(gfp_mask); -+ efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, PGD_ALLOCATION_ORDER); - if (!efi_pgd) - return -ENOMEM; - -diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S -index e785734980ad..eeaaaab54b2a 100644 ---- a/arch/x86/kernel/head_64.S -+++ b/arch/x86/kernel/head_64.S -@@ -324,6 +324,27 @@ GLOBAL(early_recursion_flag) - .balign PAGE_SIZE; \ - GLOBAL(name) - -+#ifdef CONFIG_PAGE_TABLE_ISOLATION -+/* -+ * Each PGD needs to be 8k long and 8k aligned. 
We do not -+ * ever go out to userspace with these, so we do not -+ * strictly *need* the second page, but this allows us to -+ * have a single set_pgd() implementation that does not -+ * need to worry about whether it has 4k or 8k to work -+ * with. -+ * -+ * This ensures PGDs are 8k long: -+ */ -+#define PTI_USER_PGD_FILL 512 -+/* This ensures they are 8k-aligned: */ -+#define NEXT_PGD_PAGE(name) \ -+ .balign 2 * PAGE_SIZE; \ -+GLOBAL(name) -+#else -+#define NEXT_PGD_PAGE(name) NEXT_PAGE(name) -+#define PTI_USER_PGD_FILL 0 -+#endif -+ - /* Automate the creation of 1 to 1 mapping pmd entries */ - #define PMDS(START, PERM, COUNT) \ - i = 0 ; \ -@@ -333,13 +354,14 @@ GLOBAL(name) - .endr - - __INITDATA --NEXT_PAGE(early_top_pgt) -+NEXT_PGD_PAGE(early_top_pgt) - .fill 511,8,0 - #ifdef CONFIG_X86_5LEVEL - .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE - #else - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE - #endif -+ .fill PTI_USER_PGD_FILL,8,0 - - NEXT_PAGE(early_dynamic_pgts) - .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 -@@ -347,13 +369,14 @@ NEXT_PAGE(early_dynamic_pgts) - .data - - #if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH) --NEXT_PAGE(init_top_pgt) -+NEXT_PGD_PAGE(init_top_pgt) - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .org init_top_pgt + PGD_PAGE_OFFSET*8, 0 - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .org init_top_pgt + PGD_START_KERNEL*8, 0 - /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE -+ .fill PTI_USER_PGD_FILL,8,0 - - NEXT_PAGE(level3_ident_pgt) - .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE -@@ -364,8 +387,9 @@ NEXT_PAGE(level2_ident_pgt) - */ - PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) - #else --NEXT_PAGE(init_top_pgt) -+NEXT_PGD_PAGE(init_top_pgt) - .fill 512,8,0 -+ .fill PTI_USER_PGD_FILL,8,0 - #endif - - #ifdef CONFIG_X86_5LEVEL --- -2.14.2 - diff --git 
a/patches/kernel/0196-x86-mm-pti-Allow-NX-poison-to-be-set-in-p4d-pgd.patch b/patches/kernel/0196-x86-mm-pti-Allow-NX-poison-to-be-set-in-p4d-pgd.patch new file mode 100644 index 0000000..7060437 --- /dev/null +++ b/patches/kernel/0196-x86-mm-pti-Allow-NX-poison-to-be-set-in-p4d-pgd.patch @@ -0,0 +1,84 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Dave Hansen +Date: Mon, 4 Dec 2017 15:07:38 +0100 +Subject: [PATCH] x86/mm/pti: Allow NX poison to be set in p4d/pgd +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +With PAGE_TABLE_ISOLATION the user portion of the kernel page tables is +poisoned with the NX bit so if the entry code exits with the kernel page +tables selected in CR3, userspace crashes. + +But doing so trips the p4d/pgd_bad() checks. Make sure it does not do +that. + +Signed-off-by: Dave Hansen +Signed-off-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ingo Molnar +(cherry picked from commit 1c4de1ff4fe50453b968579ee86fac3da80dd783) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 889a8bd0e57e39e7ce337e87c55fa59c09644d4e) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/pgtable.h | 14 ++++++++++++-- + 1 file changed, 12 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h +index abbb47c75467..3ef8415b2358 100644 +--- a/arch/x86/include/asm/pgtable.h ++++ b/arch/x86/include/asm/pgtable.h +@@ -831,7 +831,12 @@ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) + + static inline int p4d_bad(p4d_t p4d) + { +- return (p4d_flags(p4d) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0; ++ unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER; ++ ++ if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) ++ ignore_flags |= _PAGE_NX; ++ ++ return (p4d_flags(p4d) & ~ignore_flags) != 0; + } + #endif /* CONFIG_PGTABLE_LEVELS > 3 */ + +@@ -865,7 +870,12 @@ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) + + static inline int pgd_bad(pgd_t pgd) + { +- return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE; ++ unsigned long ignore_flags = _PAGE_USER; ++ ++ if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) ++ ignore_flags |= _PAGE_NX; ++ ++ return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; + } + + static inline int pgd_none(pgd_t pgd) +-- +2.14.2 + diff --git a/patches/kernel/0197-x86-mm-pti-Allocate-a-separate-user-PGD.patch b/patches/kernel/0197-x86-mm-pti-Allocate-a-separate-user-PGD.patch new file mode 100644 index 0000000..f2a2ce0 --- /dev/null +++ b/patches/kernel/0197-x86-mm-pti-Allocate-a-separate-user-PGD.patch @@ -0,0 +1,199 
@@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Dave Hansen +Date: Mon, 4 Dec 2017 15:07:39 +0100 +Subject: [PATCH] x86/mm/pti: Allocate a separate user PGD +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Kernel page table isolation requires to have two PGDs. One for the kernel, +which contains the full kernel mapping plus the user space mapping and one +for user space which contains the user space mappings and the minimal set +of kernel mappings which are required by the architecture to be able to +transition from and to user space. + +Add the necessary preliminaries. + +[ tglx: Split out from the big kaiser dump. EFI fixup from Kirill ] + +Signed-off-by: Dave Hansen +Signed-off-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Signed-off-by: Ingo Molnar +(backported from commit d9e9a6418065bb376e5de8d93ce346939b9a37a6) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 0bd4b34e330d8bedf90c0497dfcef2e2286c4367) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/pgalloc.h | 11 +++++++++++ + arch/x86/mm/pgtable.c | 5 +++-- + arch/x86/platform/efi/efi_64.c | 5 ++++- + arch/x86/kernel/head_64.S | 30 +++++++++++++++++++++++++++--- + 4 files changed, 45 insertions(+), 6 deletions(-) + +diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h +index b2d0cd8288aa..d65b0dee7448 100644 +--- a/arch/x86/include/asm/pgalloc.h ++++ b/arch/x86/include/asm/pgalloc.h +@@ -29,6 +29,17 @@ static inline void paravirt_release_p4d(unsigned 
long pfn) {} + */ + extern gfp_t __userpte_alloc_gfp; + ++#ifdef CONFIG_PAGE_TABLE_ISOLATION ++/* ++ * Instead of one PGD, we acquire two PGDs. Being order-1, it is ++ * both 8k in size and 8k-aligned. That lets us just flip bit 12 ++ * in a pointer to swap between the two 4k halves. ++ */ ++#define PGD_ALLOCATION_ORDER 1 ++#else ++#define PGD_ALLOCATION_ORDER 0 ++#endif ++ + /* + * Allocate and free page tables. + */ +diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c +index 942391b5b639..90d1d8f49cf6 100644 +--- a/arch/x86/mm/pgtable.c ++++ b/arch/x86/mm/pgtable.c +@@ -354,14 +354,15 @@ static inline void _pgd_free(pgd_t *pgd) + kmem_cache_free(pgd_cache, pgd); + } + #else ++ + static inline pgd_t *_pgd_alloc(void) + { +- return (pgd_t *)__get_free_page(PGALLOC_GFP); ++ return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER); + } + + static inline void _pgd_free(pgd_t *pgd) + { +- free_page((unsigned long)pgd); ++ free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); + } + #endif /* CONFIG_X86_PAE */ + +diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c +index 9bf72f5bfedb..b104224d3d6c 100644 +--- a/arch/x86/platform/efi/efi_64.c ++++ b/arch/x86/platform/efi/efi_64.c +@@ -194,6 +194,9 @@ static pgd_t *efi_pgd; + * because we want to avoid inserting EFI region mappings (EFI_VA_END + * to EFI_VA_START) into the standard kernel page tables. Everything + * else can be shared, see efi_sync_low_kernel_mappings(). ++ * ++ * We don't want the pgd on the pgd_list and cannot use pgd_alloc() for the ++ * allocation. 
+ */ + int __init efi_alloc_page_tables(void) + { +@@ -206,7 +209,7 @@ int __init efi_alloc_page_tables(void) + return 0; + + gfp_mask = GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO; +- efi_pgd = (pgd_t *)__get_free_page(gfp_mask); ++ efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, PGD_ALLOCATION_ORDER); + if (!efi_pgd) + return -ENOMEM; + +diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S +index e785734980ad..eeaaaab54b2a 100644 +--- a/arch/x86/kernel/head_64.S ++++ b/arch/x86/kernel/head_64.S +@@ -324,6 +324,27 @@ GLOBAL(early_recursion_flag) + .balign PAGE_SIZE; \ + GLOBAL(name) + ++#ifdef CONFIG_PAGE_TABLE_ISOLATION ++/* ++ * Each PGD needs to be 8k long and 8k aligned. We do not ++ * ever go out to userspace with these, so we do not ++ * strictly *need* the second page, but this allows us to ++ * have a single set_pgd() implementation that does not ++ * need to worry about whether it has 4k or 8k to work ++ * with. ++ * ++ * This ensures PGDs are 8k long: ++ */ ++#define PTI_USER_PGD_FILL 512 ++/* This ensures they are 8k-aligned: */ ++#define NEXT_PGD_PAGE(name) \ ++ .balign 2 * PAGE_SIZE; \ ++GLOBAL(name) ++#else ++#define NEXT_PGD_PAGE(name) NEXT_PAGE(name) ++#define PTI_USER_PGD_FILL 0 ++#endif ++ + /* Automate the creation of 1 to 1 mapping pmd entries */ + #define PMDS(START, PERM, COUNT) \ + i = 0 ; \ +@@ -333,13 +354,14 @@ GLOBAL(name) + .endr + + __INITDATA +-NEXT_PAGE(early_top_pgt) ++NEXT_PGD_PAGE(early_top_pgt) + .fill 511,8,0 + #ifdef CONFIG_X86_5LEVEL + .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + #else + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + #endif ++ .fill PTI_USER_PGD_FILL,8,0 + + NEXT_PAGE(early_dynamic_pgts) + .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 +@@ -347,13 +369,14 @@ NEXT_PAGE(early_dynamic_pgts) + .data + + #if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH) +-NEXT_PAGE(init_top_pgt) ++NEXT_PGD_PAGE(init_top_pgt) + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .org 
init_top_pgt + PGD_PAGE_OFFSET*8, 0 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .org init_top_pgt + PGD_START_KERNEL*8, 0 + /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE ++ .fill PTI_USER_PGD_FILL,8,0 + + NEXT_PAGE(level3_ident_pgt) + .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE +@@ -364,8 +387,9 @@ NEXT_PAGE(level2_ident_pgt) + */ + PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) + #else +-NEXT_PAGE(init_top_pgt) ++NEXT_PGD_PAGE(init_top_pgt) + .fill 512,8,0 ++ .fill PTI_USER_PGD_FILL,8,0 + #endif + + #ifdef CONFIG_X86_5LEVEL +-- +2.14.2 + diff --git a/patches/kernel/0197-x86-mm-pti-Populate-user-PGD.patch b/patches/kernel/0197-x86-mm-pti-Populate-user-PGD.patch deleted file mode 100644 index 09d7bc8..0000000 --- a/patches/kernel/0197-x86-mm-pti-Populate-user-PGD.patch +++ /dev/null @@ -1,70 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Dave Hansen -Date: Mon, 4 Dec 2017 15:07:40 +0100 -Subject: [PATCH] x86/mm/pti: Populate user PGD -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -In clone_pgd_range() copy the init user PGDs which cover the kernel half of -the address space, so a process has all the required kernel mappings -visible. - -[ tglx: Split out from the big kaiser dump and folded Andys simplification ] - -Signed-off-by: Dave Hansen -Signed-off-by: Thomas Gleixner -Reviewed-by: Borislav Petkov -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Signed-off-by: Ingo Molnar -(cherry picked from commit fc2fbc8512ed08d1de7720936fd7d2e4ce02c3a2) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 26c08c52162e1079cbb3e9ce8e1346a100ea7ccc) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/pgtable.h | 9 ++++++++- - 1 file changed, 8 insertions(+), 1 deletion(-) - -diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h -index 3ef8415b2358..25604b8a251a 100644 ---- a/arch/x86/include/asm/pgtable.h -+++ b/arch/x86/include/asm/pgtable.h -@@ -1104,7 +1104,14 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, - */ - static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) - { -- memcpy(dst, src, count * sizeof(pgd_t)); -+ memcpy(dst, src, count * sizeof(pgd_t)); -+#ifdef CONFIG_PAGE_TABLE_ISOLATION -+ if (!static_cpu_has(X86_FEATURE_PTI)) -+ return; -+ /* Clone the user space pgd as well */ -+ memcpy(kernel_to_user_pgdp(dst), kernel_to_user_pgdp(src), -+ count * sizeof(pgd_t)); -+#endif - } - - #define PTE_SHIFT ilog2(PTRS_PER_PTE) --- -2.14.2 - diff --git a/patches/kernel/0198-x86-mm-pti-Add-functions-to-clone-kernel-PMDs.patch b/patches/kernel/0198-x86-mm-pti-Add-functions-to-clone-kernel-PMDs.patch deleted file mode 100644 index dc510fd..0000000 --- a/patches/kernel/0198-x86-mm-pti-Add-functions-to-clone-kernel-PMDs.patch +++ /dev/null @@ -1,204 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Mon, 4 Dec 2017 15:07:42 +0100 -Subject: [PATCH] x86/mm/pti: Add functions to clone kernel PMDs -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Provide infrastructure to: - - - find a 
kernel PMD for a mapping which must be visible to user space for - the entry/exit code to work. - - - walk an address range and share the kernel PMD with it. - -This reuses a small part of the original KAISER patches to populate the -user space page table. - -[ tglx: Made it universally usable so it can be used for any kind of shared - mapping. Add a mechanism to clear specific bits in the user space - visible PMD entry. Folded Andys simplifactions ] - -Originally-by: Dave Hansen -Signed-off-by: Andy Lutomirski -Signed-off-by: Thomas Gleixner -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Signed-off-by: Ingo Molnar -(cherry picked from commit 03f4424f348e8be95eb1bbeba09461cd7b867828) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 262ab7e8665e88581d20ccaefa107340457224bb) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/mm/pti.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ - 1 file changed, 127 insertions(+) - -diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c -index 69a983365392..d58bcee470fc 100644 ---- a/arch/x86/mm/pti.c -+++ b/arch/x86/mm/pti.c -@@ -48,6 +48,11 @@ - #undef pr_fmt - #define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt - -+/* Backporting helper */ -+#ifndef __GFP_NOTRACK -+#define __GFP_NOTRACK 0 -+#endif -+ - static void __init pti_print_if_insecure(const char *reason) - { - if (boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) -@@ -137,6 +142,128 @@ pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) - return pgd; - } - -+/* -+ * Walk the user copy of the page tables (optionally) trying to allocate -+ * page table pages on the way down. 
-+ * -+ * Returns a pointer to a P4D on success, or NULL on failure. -+ */ -+static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) -+{ -+ pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address)); -+ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); -+ -+ if (address < PAGE_OFFSET) { -+ WARN_ONCE(1, "attempt to walk user address\n"); -+ return NULL; -+ } -+ -+ if (pgd_none(*pgd)) { -+ unsigned long new_p4d_page = __get_free_page(gfp); -+ if (!new_p4d_page) -+ return NULL; -+ -+ if (pgd_none(*pgd)) { -+ set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); -+ new_p4d_page = 0; -+ } -+ if (new_p4d_page) -+ free_page(new_p4d_page); -+ } -+ BUILD_BUG_ON(pgd_large(*pgd) != 0); -+ -+ return p4d_offset(pgd, address); -+} -+ -+/* -+ * Walk the user copy of the page tables (optionally) trying to allocate -+ * page table pages on the way down. -+ * -+ * Returns a pointer to a PMD on success, or NULL on failure. -+ */ -+static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) -+{ -+ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); -+ p4d_t *p4d = pti_user_pagetable_walk_p4d(address); -+ pud_t *pud; -+ -+ BUILD_BUG_ON(p4d_large(*p4d) != 0); -+ if (p4d_none(*p4d)) { -+ unsigned long new_pud_page = __get_free_page(gfp); -+ if (!new_pud_page) -+ return NULL; -+ -+ if (p4d_none(*p4d)) { -+ set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page))); -+ new_pud_page = 0; -+ } -+ if (new_pud_page) -+ free_page(new_pud_page); -+ } -+ -+ pud = pud_offset(p4d, address); -+ /* The user page tables do not use large mappings: */ -+ if (pud_large(*pud)) { -+ WARN_ON(1); -+ return NULL; -+ } -+ if (pud_none(*pud)) { -+ unsigned long new_pmd_page = __get_free_page(gfp); -+ if (!new_pmd_page) -+ return NULL; -+ -+ if (pud_none(*pud)) { -+ set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); -+ new_pmd_page = 0; -+ } -+ if (new_pmd_page) -+ free_page(new_pmd_page); -+ } -+ -+ return pmd_offset(pud, address); -+} -+ -+static void __init -+pti_clone_pmds(unsigned 
long start, unsigned long end, pmdval_t clear) -+{ -+ unsigned long addr; -+ -+ /* -+ * Clone the populated PMDs which cover start to end. These PMD areas -+ * can have holes. -+ */ -+ for (addr = start; addr < end; addr += PMD_SIZE) { -+ pmd_t *pmd, *target_pmd; -+ pgd_t *pgd; -+ p4d_t *p4d; -+ pud_t *pud; -+ -+ pgd = pgd_offset_k(addr); -+ if (WARN_ON(pgd_none(*pgd))) -+ return; -+ p4d = p4d_offset(pgd, addr); -+ if (WARN_ON(p4d_none(*p4d))) -+ return; -+ pud = pud_offset(p4d, addr); -+ if (pud_none(*pud)) -+ continue; -+ pmd = pmd_offset(pud, addr); -+ if (pmd_none(*pmd)) -+ continue; -+ -+ target_pmd = pti_user_pagetable_walk_pmd(addr); -+ if (WARN_ON(!target_pmd)) -+ return; -+ -+ /* -+ * Copy the PMD. That is, the kernelmode and usermode -+ * tables will share the last-level page tables of this -+ * address range -+ */ -+ *target_pmd = pmd_clear_flags(*pmd, clear); -+ } -+} -+ - /* - * Initialize kernel page table isolation - */ --- -2.14.2 - diff --git a/patches/kernel/0198-x86-mm-pti-Populate-user-PGD.patch b/patches/kernel/0198-x86-mm-pti-Populate-user-PGD.patch new file mode 100644 index 0000000..09d7bc8 --- /dev/null +++ b/patches/kernel/0198-x86-mm-pti-Populate-user-PGD.patch @@ -0,0 +1,70 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Dave Hansen +Date: Mon, 4 Dec 2017 15:07:40 +0100 +Subject: [PATCH] x86/mm/pti: Populate user PGD +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +In clone_pgd_range() copy the init user PGDs which cover the kernel half of +the address space, so a process has all the required kernel mappings +visible. 
+ +[ tglx: Split out from the big kaiser dump and folded Andys simplification ] + +Signed-off-by: Dave Hansen +Signed-off-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Signed-off-by: Ingo Molnar +(cherry picked from commit fc2fbc8512ed08d1de7720936fd7d2e4ce02c3a2) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 26c08c52162e1079cbb3e9ce8e1346a100ea7ccc) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/pgtable.h | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h +index 3ef8415b2358..25604b8a251a 100644 +--- a/arch/x86/include/asm/pgtable.h ++++ b/arch/x86/include/asm/pgtable.h +@@ -1104,7 +1104,14 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, + */ + static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) + { +- memcpy(dst, src, count * sizeof(pgd_t)); ++ memcpy(dst, src, count * sizeof(pgd_t)); ++#ifdef CONFIG_PAGE_TABLE_ISOLATION ++ if (!static_cpu_has(X86_FEATURE_PTI)) ++ return; ++ /* Clone the user space pgd as well */ ++ memcpy(kernel_to_user_pgdp(dst), kernel_to_user_pgdp(src), ++ count * sizeof(pgd_t)); ++#endif + } + + #define PTE_SHIFT ilog2(PTRS_PER_PTE) +-- +2.14.2 + diff --git a/patches/kernel/0199-x86-mm-pti-Add-functions-to-clone-kernel-PMDs.patch b/patches/kernel/0199-x86-mm-pti-Add-functions-to-clone-kernel-PMDs.patch new file mode 100644 index 0000000..dc510fd --- /dev/null +++ b/patches/kernel/0199-x86-mm-pti-Add-functions-to-clone-kernel-PMDs.patch @@ -0,0 +1,204 @@ +From 
0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Mon, 4 Dec 2017 15:07:42 +0100 +Subject: [PATCH] x86/mm/pti: Add functions to clone kernel PMDs +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Provide infrastructure to: + + - find a kernel PMD for a mapping which must be visible to user space for + the entry/exit code to work. + + - walk an address range and share the kernel PMD with it. + +This reuses a small part of the original KAISER patches to populate the +user space page table. + +[ tglx: Made it universally usable so it can be used for any kind of shared + mapping. Add a mechanism to clear specific bits in the user space + visible PMD entry. Folded Andys simplifactions ] + +Originally-by: Dave Hansen +Signed-off-by: Andy Lutomirski +Signed-off-by: Thomas Gleixner +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Signed-off-by: Ingo Molnar +(cherry picked from commit 03f4424f348e8be95eb1bbeba09461cd7b867828) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 262ab7e8665e88581d20ccaefa107340457224bb) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/mm/pti.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 127 insertions(+) + +diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c +index 69a983365392..d58bcee470fc 100644 +--- a/arch/x86/mm/pti.c ++++ b/arch/x86/mm/pti.c +@@ -48,6 +48,11 @@ + #undef pr_fmt + #define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt + ++/* Backporting helper */ ++#ifndef __GFP_NOTRACK ++#define __GFP_NOTRACK 0 ++#endif ++ + static void __init pti_print_if_insecure(const char *reason) + { + if (boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) +@@ -137,6 +142,128 @@ pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) + return pgd; + } + ++/* ++ * Walk the user copy of the page tables (optionally) trying to allocate ++ * page table pages on the way down. ++ * ++ * Returns a pointer to a P4D on success, or NULL on failure. 
++ */ ++static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) ++{ ++ pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address)); ++ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); ++ ++ if (address < PAGE_OFFSET) { ++ WARN_ONCE(1, "attempt to walk user address\n"); ++ return NULL; ++ } ++ ++ if (pgd_none(*pgd)) { ++ unsigned long new_p4d_page = __get_free_page(gfp); ++ if (!new_p4d_page) ++ return NULL; ++ ++ if (pgd_none(*pgd)) { ++ set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); ++ new_p4d_page = 0; ++ } ++ if (new_p4d_page) ++ free_page(new_p4d_page); ++ } ++ BUILD_BUG_ON(pgd_large(*pgd) != 0); ++ ++ return p4d_offset(pgd, address); ++} ++ ++/* ++ * Walk the user copy of the page tables (optionally) trying to allocate ++ * page table pages on the way down. ++ * ++ * Returns a pointer to a PMD on success, or NULL on failure. ++ */ ++static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) ++{ ++ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); ++ p4d_t *p4d = pti_user_pagetable_walk_p4d(address); ++ pud_t *pud; ++ ++ BUILD_BUG_ON(p4d_large(*p4d) != 0); ++ if (p4d_none(*p4d)) { ++ unsigned long new_pud_page = __get_free_page(gfp); ++ if (!new_pud_page) ++ return NULL; ++ ++ if (p4d_none(*p4d)) { ++ set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page))); ++ new_pud_page = 0; ++ } ++ if (new_pud_page) ++ free_page(new_pud_page); ++ } ++ ++ pud = pud_offset(p4d, address); ++ /* The user page tables do not use large mappings: */ ++ if (pud_large(*pud)) { ++ WARN_ON(1); ++ return NULL; ++ } ++ if (pud_none(*pud)) { ++ unsigned long new_pmd_page = __get_free_page(gfp); ++ if (!new_pmd_page) ++ return NULL; ++ ++ if (pud_none(*pud)) { ++ set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); ++ new_pmd_page = 0; ++ } ++ if (new_pmd_page) ++ free_page(new_pmd_page); ++ } ++ ++ return pmd_offset(pud, address); ++} ++ ++static void __init ++pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear) ++{ ++ unsigned long 
addr; ++ ++ /* ++ * Clone the populated PMDs which cover start to end. These PMD areas ++ * can have holes. ++ */ ++ for (addr = start; addr < end; addr += PMD_SIZE) { ++ pmd_t *pmd, *target_pmd; ++ pgd_t *pgd; ++ p4d_t *p4d; ++ pud_t *pud; ++ ++ pgd = pgd_offset_k(addr); ++ if (WARN_ON(pgd_none(*pgd))) ++ return; ++ p4d = p4d_offset(pgd, addr); ++ if (WARN_ON(p4d_none(*p4d))) ++ return; ++ pud = pud_offset(p4d, addr); ++ if (pud_none(*pud)) ++ continue; ++ pmd = pmd_offset(pud, addr); ++ if (pmd_none(*pmd)) ++ continue; ++ ++ target_pmd = pti_user_pagetable_walk_pmd(addr); ++ if (WARN_ON(!target_pmd)) ++ return; ++ ++ /* ++ * Copy the PMD. That is, the kernelmode and usermode ++ * tables will share the last-level page tables of this ++ * address range ++ */ ++ *target_pmd = pmd_clear_flags(*pmd, clear); ++ } ++} ++ + /* + * Initialize kernel page table isolation + */ +-- +2.14.2 + diff --git a/patches/kernel/0199-x86-mm-pti-Force-entry-through-trampoline-when-PTI-a.patch b/patches/kernel/0199-x86-mm-pti-Force-entry-through-trampoline-when-PTI-a.patch deleted file mode 100644 index c0d2a1c..0000000 --- a/patches/kernel/0199-x86-mm-pti-Force-entry-through-trampoline-when-PTI-a.patch +++ /dev/null @@ -1,63 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Mon, 4 Dec 2017 15:07:43 +0100 -Subject: [PATCH] x86/mm/pti: Force entry through trampoline when PTI active -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Force the entry through the trampoline only when PTI is active. Otherwise -go through the normal entry code. - -Signed-off-by: Thomas Gleixner -Reviewed-by: Borislav Petkov -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Signed-off-by: Ingo Molnar -(cherry picked from commit 8d4b067895791ab9fdb1aadfc505f64d71239dd2) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 9ae1ea4821648be179a96fe65b3ed4bd111a5c98) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/cpu/common.c | 5 ++++- - 1 file changed, 4 insertions(+), 1 deletion(-) - -diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c -index 623ba3635793..99f37d1636ff 100644 ---- a/arch/x86/kernel/cpu/common.c -+++ b/arch/x86/kernel/cpu/common.c -@@ -1340,7 +1340,10 @@ void syscall_init(void) - (entry_SYSCALL_64_trampoline - _entry_trampoline); - - wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); -- wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline); -+ if (static_cpu_has(X86_FEATURE_PTI)) -+ wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline); -+ else -+ wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); - - #ifdef CONFIG_IA32_EMULATION - wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat); --- -2.14.2 - diff --git a/patches/kernel/0200-x86-mm-pti-Force-entry-through-trampoline-when-PTI-a.patch b/patches/kernel/0200-x86-mm-pti-Force-entry-through-trampoline-when-PTI-a.patch new file mode 100644 index 0000000..c0d2a1c --- /dev/null +++ b/patches/kernel/0200-x86-mm-pti-Force-entry-through-trampoline-when-PTI-a.patch @@ -0,0 +1,63 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Mon, 4 Dec 2017 15:07:43 +0100 +Subject: [PATCH] x86/mm/pti: Force entry through trampoline when PTI active +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Force the entry through the trampoline only when PTI is active. 
Otherwise +go through the normal entry code. + +Signed-off-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Signed-off-by: Ingo Molnar +(cherry picked from commit 8d4b067895791ab9fdb1aadfc505f64d71239dd2) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 9ae1ea4821648be179a96fe65b3ed4bd111a5c98) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/cpu/common.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 623ba3635793..99f37d1636ff 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -1340,7 +1340,10 @@ void syscall_init(void) + (entry_SYSCALL_64_trampoline - _entry_trampoline); + + wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); +- wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline); ++ if (static_cpu_has(X86_FEATURE_PTI)) ++ wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline); ++ else ++ wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); + + #ifdef CONFIG_IA32_EMULATION + wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat); +-- +2.14.2 + diff --git a/patches/kernel/0200-x86-mm-pti-Share-cpu_entry_area-with-user-space-page.patch b/patches/kernel/0200-x86-mm-pti-Share-cpu_entry_area-with-user-space-page.patch deleted file mode 100644 index 2decc9c..0000000 --- a/patches/kernel/0200-x86-mm-pti-Share-cpu_entry_area-with-user-space-page.patch +++ /dev/null @@ -1,87 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Mon, 4 Dec 2017 15:07:45 +0100 
-Subject: [PATCH] x86/mm/pti: Share cpu_entry_area with user space page tables -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Share the cpu entry area so the user space and kernel space page tables -have the same P4D page. - -Signed-off-by: Andy Lutomirski -Signed-off-by: Thomas Gleixner -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Signed-off-by: Ingo Molnar -(cherry picked from commit f7cfbee91559ca7e3e961a00ffac921208a115ad) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 6e8142de3a6e84a82a421b66a74ba37976912282) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/mm/pti.c | 25 +++++++++++++++++++++++++ - 1 file changed, 25 insertions(+) - -diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c -index d58bcee470fc..59290356f19f 100644 ---- a/arch/x86/mm/pti.c -+++ b/arch/x86/mm/pti.c -@@ -264,6 +264,29 @@ pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear) - } - } - -+/* -+ * Clone a single p4d (i.e. a top-level entry on 4-level systems and a -+ * next-level entry on 5-level systems. -+ */ -+static void __init pti_clone_p4d(unsigned long addr) -+{ -+ p4d_t *kernel_p4d, *user_p4d; -+ pgd_t *kernel_pgd; -+ -+ user_p4d = pti_user_pagetable_walk_p4d(addr); -+ kernel_pgd = pgd_offset_k(addr); -+ kernel_p4d = p4d_offset(kernel_pgd, addr); -+ *user_p4d = *kernel_p4d; -+} -+ -+/* -+ * Clone the CPU_ENTRY_AREA into the user space visible page table. 
-+ */ -+static void __init pti_clone_user_shared(void) -+{ -+ pti_clone_p4d(CPU_ENTRY_AREA_BASE); -+} -+ - /* - * Initialize kernel page table isolation - */ -@@ -273,4 +296,6 @@ void __init pti_init(void) - return; - - pr_info("enabled\n"); -+ -+ pti_clone_user_shared(); - } --- -2.14.2 - diff --git a/patches/kernel/0201-x86-entry-Align-entry-text-section-to-PMD-boundary.patch b/patches/kernel/0201-x86-entry-Align-entry-text-section-to-PMD-boundary.patch deleted file mode 100644 index 4b7d5a4..0000000 --- a/patches/kernel/0201-x86-entry-Align-entry-text-section-to-PMD-boundary.patch +++ /dev/null @@ -1,79 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Mon, 4 Dec 2017 15:07:46 +0100 -Subject: [PATCH] x86/entry: Align entry text section to PMD boundary -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -The (irq)entry text must be visible in the user space page tables. To allow -simple PMD based sharing, make the entry text PMD aligned. - -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Signed-off-by: Ingo Molnar -(cherry picked from commit 2f7412ba9c6af5ab16bdbb4a3fdb1dcd2b4fd3c2) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 3cf72b14b56834882ebe731d5fa84d249c56a188) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/vmlinux.lds.S | 8 ++++++++ - 1 file changed, 8 insertions(+) - -diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S -index 423aa36f0150..f505d8dbdccf 100644 ---- a/arch/x86/kernel/vmlinux.lds.S -+++ b/arch/x86/kernel/vmlinux.lds.S -@@ -60,11 +60,17 @@ jiffies_64 = jiffies; - . = ALIGN(HPAGE_SIZE); \ - __end_rodata_hpage_align = .; - -+#define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE); -+#define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE); -+ - #else - - #define X64_ALIGN_RODATA_BEGIN - #define X64_ALIGN_RODATA_END - -+#define ALIGN_ENTRY_TEXT_BEGIN -+#define ALIGN_ENTRY_TEXT_END -+ - #endif - - PHDRS { -@@ -101,8 +107,10 @@ SECTIONS - CPUIDLE_TEXT - LOCK_TEXT - KPROBES_TEXT -+ ALIGN_ENTRY_TEXT_BEGIN - ENTRY_TEXT - IRQENTRY_TEXT -+ ALIGN_ENTRY_TEXT_END - SOFTIRQENTRY_TEXT - *(.fixup) - *(.gnu.warning) --- -2.14.2 - diff --git a/patches/kernel/0201-x86-mm-pti-Share-cpu_entry_area-with-user-space-page.patch b/patches/kernel/0201-x86-mm-pti-Share-cpu_entry_area-with-user-space-page.patch new file mode 100644 index 0000000..2decc9c --- /dev/null +++ b/patches/kernel/0201-x86-mm-pti-Share-cpu_entry_area-with-user-space-page.patch @@ -0,0 +1,87 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Mon, 4 Dec 2017 15:07:45 +0100 +Subject: [PATCH] x86/mm/pti: Share cpu_entry_area with user space page tables +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + 
+CVE-2017-5754 + +Share the cpu entry area so the user space and kernel space page tables +have the same P4D page. + +Signed-off-by: Andy Lutomirski +Signed-off-by: Thomas Gleixner +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Signed-off-by: Ingo Molnar +(cherry picked from commit f7cfbee91559ca7e3e961a00ffac921208a115ad) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 6e8142de3a6e84a82a421b66a74ba37976912282) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/mm/pti.c | 25 +++++++++++++++++++++++++ + 1 file changed, 25 insertions(+) + +diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c +index d58bcee470fc..59290356f19f 100644 +--- a/arch/x86/mm/pti.c ++++ b/arch/x86/mm/pti.c +@@ -264,6 +264,29 @@ pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear) + } + } + ++/* ++ * Clone a single p4d (i.e. a top-level entry on 4-level systems and a ++ * next-level entry on 5-level systems. ++ */ ++static void __init pti_clone_p4d(unsigned long addr) ++{ ++ p4d_t *kernel_p4d, *user_p4d; ++ pgd_t *kernel_pgd; ++ ++ user_p4d = pti_user_pagetable_walk_p4d(addr); ++ kernel_pgd = pgd_offset_k(addr); ++ kernel_p4d = p4d_offset(kernel_pgd, addr); ++ *user_p4d = *kernel_p4d; ++} ++ ++/* ++ * Clone the CPU_ENTRY_AREA into the user space visible page table. 
++ */ ++static void __init pti_clone_user_shared(void) ++{ ++ pti_clone_p4d(CPU_ENTRY_AREA_BASE); ++} ++ + /* + * Initialize kernel page table isolation + */ +@@ -273,4 +296,6 @@ void __init pti_init(void) + return; + + pr_info("enabled\n"); ++ ++ pti_clone_user_shared(); + } +-- +2.14.2 + diff --git a/patches/kernel/0202-x86-entry-Align-entry-text-section-to-PMD-boundary.patch b/patches/kernel/0202-x86-entry-Align-entry-text-section-to-PMD-boundary.patch new file mode 100644 index 0000000..4b7d5a4 --- /dev/null +++ b/patches/kernel/0202-x86-entry-Align-entry-text-section-to-PMD-boundary.patch @@ -0,0 +1,79 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Mon, 4 Dec 2017 15:07:46 +0100 +Subject: [PATCH] x86/entry: Align entry text section to PMD boundary +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +The (irq)entry text must be visible in the user space page tables. To allow +simple PMD based sharing, make the entry text PMD aligned. + +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Signed-off-by: Ingo Molnar +(cherry picked from commit 2f7412ba9c6af5ab16bdbb4a3fdb1dcd2b4fd3c2) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 3cf72b14b56834882ebe731d5fa84d249c56a188) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/vmlinux.lds.S | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S +index 423aa36f0150..f505d8dbdccf 100644 +--- a/arch/x86/kernel/vmlinux.lds.S ++++ b/arch/x86/kernel/vmlinux.lds.S +@@ -60,11 +60,17 @@ jiffies_64 = jiffies; + . = ALIGN(HPAGE_SIZE); \ + __end_rodata_hpage_align = .; + ++#define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE); ++#define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE); ++ + #else + + #define X64_ALIGN_RODATA_BEGIN + #define X64_ALIGN_RODATA_END + ++#define ALIGN_ENTRY_TEXT_BEGIN ++#define ALIGN_ENTRY_TEXT_END ++ + #endif + + PHDRS { +@@ -101,8 +107,10 @@ SECTIONS + CPUIDLE_TEXT + LOCK_TEXT + KPROBES_TEXT ++ ALIGN_ENTRY_TEXT_BEGIN + ENTRY_TEXT + IRQENTRY_TEXT ++ ALIGN_ENTRY_TEXT_END + SOFTIRQENTRY_TEXT + *(.fixup) + *(.gnu.warning) +-- +2.14.2 + diff --git a/patches/kernel/0202-x86-mm-pti-Share-entry-text-PMD.patch b/patches/kernel/0202-x86-mm-pti-Share-entry-text-PMD.patch deleted file mode 100644 index eb6ec0d..0000000 --- a/patches/kernel/0202-x86-mm-pti-Share-entry-text-PMD.patch +++ /dev/null @@ -1,74 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Mon, 4 Dec 2017 15:07:47 +0100 -Subject: [PATCH] x86/mm/pti: Share entry text PMD -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Share the entry text PMD of the kernel mapping with the user space 
-mapping. If large pages are enabled this is a single PMD entry and at the -point where it is copied into the user page table the RW bit has not been -cleared yet. Clear it right away so the user space visible map becomes RX. - -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Signed-off-by: Ingo Molnar -(cherry picked from commit 6dc72c3cbca0580642808d677181cad4c6433893) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit ee98d7446b4a7c12a57a38b1a5f51e3df0ac2cf3) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/mm/pti.c | 10 ++++++++++ - 1 file changed, 10 insertions(+) - -diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c -index 59290356f19f..0e78797650a7 100644 ---- a/arch/x86/mm/pti.c -+++ b/arch/x86/mm/pti.c -@@ -287,6 +287,15 @@ static void __init pti_clone_user_shared(void) - pti_clone_p4d(CPU_ENTRY_AREA_BASE); - } - -+/* -+ * Clone the populated PMDs of the entry and irqentry text and force it RO. 
-+ */ -+static void __init pti_clone_entry_text(void) -+{ -+ pti_clone_pmds((unsigned long) __entry_text_start, -+ (unsigned long) __irqentry_text_end, _PAGE_RW); -+} -+ - /* - * Initialize kernel page table isolation - */ -@@ -298,4 +307,5 @@ void __init pti_init(void) - pr_info("enabled\n"); - - pti_clone_user_shared(); -+ pti_clone_entry_text(); - } --- -2.14.2 - diff --git a/patches/kernel/0203-x86-mm-pti-Map-ESPFIX-into-user-space.patch b/patches/kernel/0203-x86-mm-pti-Map-ESPFIX-into-user-space.patch deleted file mode 100644 index 0db3a23..0000000 --- a/patches/kernel/0203-x86-mm-pti-Map-ESPFIX-into-user-space.patch +++ /dev/null @@ -1,64 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Fri, 15 Dec 2017 22:08:18 +0100 -Subject: [PATCH] x86/mm/pti: Map ESPFIX into user space -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Map the ESPFIX pages into user space when PTI is enabled. - -Signed-off-by: Andy Lutomirski -Signed-off-by: Thomas Gleixner -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: David Laight -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Kees Cook -Cc: Linus Torvalds -Cc: Peter Zijlstra -Signed-off-by: Ingo Molnar -(cherry picked from commit 4b6bbe95b87966ba08999574db65c93c5e925a36) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit f5103cc3035ae6d1816404696ee2eb06d53b6709) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/mm/pti.c | 11 +++++++++++ - 1 file changed, 11 insertions(+) - -diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c -index 0e78797650a7..b1c38ef9fbbb 100644 ---- a/arch/x86/mm/pti.c -+++ b/arch/x86/mm/pti.c -@@ -287,6 +287,16 @@ static void __init pti_clone_user_shared(void) - pti_clone_p4d(CPU_ENTRY_AREA_BASE); - } - -+/* -+ * Clone the ESPFIX P4D into the user space visinble page table -+ */ -+static void __init pti_setup_espfix64(void) -+{ -+#ifdef CONFIG_X86_ESPFIX64 -+ pti_clone_p4d(ESPFIX_BASE_ADDR); -+#endif -+} -+ - /* - * Clone the populated PMDs of the entry and irqentry text and force it RO. - */ -@@ -308,4 +318,5 @@ void __init pti_init(void) - - pti_clone_user_shared(); - pti_clone_entry_text(); -+ pti_setup_espfix64(); - } --- -2.14.2 - diff --git a/patches/kernel/0203-x86-mm-pti-Share-entry-text-PMD.patch b/patches/kernel/0203-x86-mm-pti-Share-entry-text-PMD.patch new file mode 100644 index 0000000..eb6ec0d --- /dev/null +++ b/patches/kernel/0203-x86-mm-pti-Share-entry-text-PMD.patch @@ -0,0 +1,74 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Mon, 4 Dec 2017 15:07:47 +0100 +Subject: [PATCH] x86/mm/pti: Share entry text PMD +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Share the entry text PMD of the kernel mapping with the user space +mapping. If large pages are enabled this is a single PMD entry and at the +point where it is copied into the user page table the RW bit has not been +cleared yet. 
Clear it right away so the user space visible map becomes RX. + +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Signed-off-by: Ingo Molnar +(cherry picked from commit 6dc72c3cbca0580642808d677181cad4c6433893) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit ee98d7446b4a7c12a57a38b1a5f51e3df0ac2cf3) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/mm/pti.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c +index 59290356f19f..0e78797650a7 100644 +--- a/arch/x86/mm/pti.c ++++ b/arch/x86/mm/pti.c +@@ -287,6 +287,15 @@ static void __init pti_clone_user_shared(void) + pti_clone_p4d(CPU_ENTRY_AREA_BASE); + } + ++/* ++ * Clone the populated PMDs of the entry and irqentry text and force it RO. 
++ */ ++static void __init pti_clone_entry_text(void) ++{ ++ pti_clone_pmds((unsigned long) __entry_text_start, ++ (unsigned long) __irqentry_text_end, _PAGE_RW); ++} ++ + /* + * Initialize kernel page table isolation + */ +@@ -298,4 +307,5 @@ void __init pti_init(void) + pr_info("enabled\n"); + + pti_clone_user_shared(); ++ pti_clone_entry_text(); + } +-- +2.14.2 + diff --git a/patches/kernel/0204-x86-cpu_entry_area-Add-debugstore-entries-to-cpu_ent.patch b/patches/kernel/0204-x86-cpu_entry_area-Add-debugstore-entries-to-cpu_ent.patch deleted file mode 100644 index c9ca293..0000000 --- a/patches/kernel/0204-x86-cpu_entry_area-Add-debugstore-entries-to-cpu_ent.patch +++ /dev/null @@ -1,244 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Mon, 4 Dec 2017 15:07:49 +0100 -Subject: [PATCH] x86/cpu_entry_area: Add debugstore entries to cpu_entry_area -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -The Intel PEBS/BTS debug store is a design trainwreck as it expects virtual -addresses which must be visible in any execution context. - -So it is required to make these mappings visible to user space when kernel -page table isolation is active. - -Provide enough room for the buffer mappings in the cpu_entry_area so the -buffers are available in the user space visible page tables. - -At the point where the kernel side entry area is populated there is no -buffer available yet, but the kernel PMD must be populated. To achieve this -set the entries for these buffers to non present. - -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Signed-off-by: Ingo Molnar -(cherry picked from commit 10043e02db7f8a4161f76434931051e7d797a5f6) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 4b9996f9c2d35d23a9fa2afe4f161402e6f28309) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/events/perf_event.h | 21 ++------------------ - arch/x86/include/asm/cpu_entry_area.h | 13 +++++++++++++ - arch/x86/include/asm/intel_ds.h | 36 +++++++++++++++++++++++++++++++++++ - arch/x86/events/intel/ds.c | 5 +++-- - arch/x86/mm/cpu_entry_area.c | 27 ++++++++++++++++++++++++++ - 5 files changed, 81 insertions(+), 21 deletions(-) - create mode 100644 arch/x86/include/asm/intel_ds.h - -diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h -index 590eaf7c2c3e..308bc14f58af 100644 ---- a/arch/x86/events/perf_event.h -+++ b/arch/x86/events/perf_event.h -@@ -14,6 +14,8 @@ - - #include - -+#include -+ - /* To enable MSR tracing please use the generic trace points. */ - - /* -@@ -77,8 +79,6 @@ struct amd_nb { - struct event_constraint event_constraints[X86_PMC_IDX_MAX]; - }; - --/* The maximal number of PEBS events: */ --#define MAX_PEBS_EVENTS 8 - #define PEBS_COUNTER_MASK ((1ULL << MAX_PEBS_EVENTS) - 1) - - /* -@@ -95,23 +95,6 @@ struct amd_nb { - PERF_SAMPLE_TRANSACTION | \ - PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER) - --/* -- * A debug store configuration. -- * -- * We only support architectures that use 64bit fields. 
-- */ --struct debug_store { -- u64 bts_buffer_base; -- u64 bts_index; -- u64 bts_absolute_maximum; -- u64 bts_interrupt_threshold; -- u64 pebs_buffer_base; -- u64 pebs_index; -- u64 pebs_absolute_maximum; -- u64 pebs_interrupt_threshold; -- u64 pebs_event_reset[MAX_PEBS_EVENTS]; --}; -- - #define PEBS_REGS \ - (PERF_REG_X86_AX | \ - PERF_REG_X86_BX | \ -diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h -index 2fbc69a0916e..4a7884b8dca5 100644 ---- a/arch/x86/include/asm/cpu_entry_area.h -+++ b/arch/x86/include/asm/cpu_entry_area.h -@@ -5,6 +5,7 @@ - - #include - #include -+#include - - /* - * cpu_entry_area is a percpu region that contains things needed by the CPU -@@ -40,6 +41,18 @@ struct cpu_entry_area { - */ - char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]; - #endif -+#ifdef CONFIG_CPU_SUP_INTEL -+ /* -+ * Per CPU debug store for Intel performance monitoring. Wastes a -+ * full page at the moment. -+ */ -+ struct debug_store cpu_debug_store; -+ /* -+ * The actual PEBS/BTS buffers must be mapped to user space -+ * Reserve enough fixmap PTEs. -+ */ -+ struct debug_store_buffers cpu_debug_buffers; -+#endif - }; - - #define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area)) -diff --git a/arch/x86/include/asm/intel_ds.h b/arch/x86/include/asm/intel_ds.h -new file mode 100644 -index 000000000000..62a9f4966b42 ---- /dev/null -+++ b/arch/x86/include/asm/intel_ds.h -@@ -0,0 +1,36 @@ -+#ifndef _ASM_INTEL_DS_H -+#define _ASM_INTEL_DS_H -+ -+#include -+ -+#define BTS_BUFFER_SIZE (PAGE_SIZE << 4) -+#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4) -+ -+/* The maximal number of PEBS events: */ -+#define MAX_PEBS_EVENTS 8 -+ -+/* -+ * A debug store configuration. -+ * -+ * We only support architectures that use 64bit fields. 
-+ */ -+struct debug_store { -+ u64 bts_buffer_base; -+ u64 bts_index; -+ u64 bts_absolute_maximum; -+ u64 bts_interrupt_threshold; -+ u64 pebs_buffer_base; -+ u64 pebs_index; -+ u64 pebs_absolute_maximum; -+ u64 pebs_interrupt_threshold; -+ u64 pebs_event_reset[MAX_PEBS_EVENTS]; -+} __aligned(PAGE_SIZE); -+ -+DECLARE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store); -+ -+struct debug_store_buffers { -+ char bts_buffer[BTS_BUFFER_SIZE]; -+ char pebs_buffer[PEBS_BUFFER_SIZE]; -+}; -+ -+#endif -diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c -index 98e36e0c791c..21a4ed789ec0 100644 ---- a/arch/x86/events/intel/ds.c -+++ b/arch/x86/events/intel/ds.c -@@ -7,11 +7,12 @@ - - #include "../perf_event.h" - -+/* Waste a full page so it can be mapped into the cpu_entry_area */ -+DEFINE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store); -+ - /* The size of a BTS record in bytes: */ - #define BTS_RECORD_SIZE 24 - --#define BTS_BUFFER_SIZE (PAGE_SIZE << 4) --#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4) - #define PEBS_FIXUP_SIZE PAGE_SIZE - - /* -diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c -index fe814fd5e014..b9283cc27622 100644 ---- a/arch/x86/mm/cpu_entry_area.c -+++ b/arch/x86/mm/cpu_entry_area.c -@@ -38,6 +38,32 @@ cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot) - cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot); - } - -+static void percpu_setup_debug_store(int cpu) -+{ -+#ifdef CONFIG_CPU_SUP_INTEL -+ int npages; -+ void *cea; -+ -+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) -+ return; -+ -+ cea = &get_cpu_entry_area(cpu)->cpu_debug_store; -+ npages = sizeof(struct debug_store) / PAGE_SIZE; -+ BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0); -+ cea_map_percpu_pages(cea, &per_cpu(cpu_debug_store, cpu), npages, -+ PAGE_KERNEL); -+ -+ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers; -+ /* -+ * Force the population of PMDs for not yet allocated per cpu -+ * memory 
like debug store buffers. -+ */ -+ npages = sizeof(struct debug_store_buffers) / PAGE_SIZE; -+ for (; npages; npages--, cea += PAGE_SIZE) -+ cea_set_pte(cea, 0, PAGE_NONE); -+#endif -+} -+ - /* Setup the fixmap mappings only once per-processor */ - static void __init setup_cpu_entry_area(int cpu) - { -@@ -109,6 +135,7 @@ static void __init setup_cpu_entry_area(int cpu) - cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline, - __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); - #endif -+ percpu_setup_debug_store(cpu); - } - - static __init void setup_cpu_entry_area_ptes(void) --- -2.14.2 - diff --git a/patches/kernel/0204-x86-mm-pti-Map-ESPFIX-into-user-space.patch b/patches/kernel/0204-x86-mm-pti-Map-ESPFIX-into-user-space.patch new file mode 100644 index 0000000..0db3a23 --- /dev/null +++ b/patches/kernel/0204-x86-mm-pti-Map-ESPFIX-into-user-space.patch @@ -0,0 +1,64 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Fri, 15 Dec 2017 22:08:18 +0100 +Subject: [PATCH] x86/mm/pti: Map ESPFIX into user space +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Map the ESPFIX pages into user space when PTI is enabled. + +Signed-off-by: Andy Lutomirski +Signed-off-by: Thomas Gleixner +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: David Laight +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Kees Cook +Cc: Linus Torvalds +Cc: Peter Zijlstra +Signed-off-by: Ingo Molnar +(cherry picked from commit 4b6bbe95b87966ba08999574db65c93c5e925a36) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit f5103cc3035ae6d1816404696ee2eb06d53b6709) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/mm/pti.c | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c +index 0e78797650a7..b1c38ef9fbbb 100644 +--- a/arch/x86/mm/pti.c ++++ b/arch/x86/mm/pti.c +@@ -287,6 +287,16 @@ static void __init pti_clone_user_shared(void) + pti_clone_p4d(CPU_ENTRY_AREA_BASE); + } + ++/* ++ * Clone the ESPFIX P4D into the user space visinble page table ++ */ ++static void __init pti_setup_espfix64(void) ++{ ++#ifdef CONFIG_X86_ESPFIX64 ++ pti_clone_p4d(ESPFIX_BASE_ADDR); ++#endif ++} ++ + /* + * Clone the populated PMDs of the entry and irqentry text and force it RO. + */ +@@ -308,4 +318,5 @@ void __init pti_init(void) + + pti_clone_user_shared(); + pti_clone_entry_text(); ++ pti_setup_espfix64(); + } +-- +2.14.2 + diff --git a/patches/kernel/0205-x86-cpu_entry_area-Add-debugstore-entries-to-cpu_ent.patch b/patches/kernel/0205-x86-cpu_entry_area-Add-debugstore-entries-to-cpu_ent.patch new file mode 100644 index 0000000..c9ca293 --- /dev/null +++ b/patches/kernel/0205-x86-cpu_entry_area-Add-debugstore-entries-to-cpu_ent.patch @@ -0,0 +1,244 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Mon, 4 Dec 2017 15:07:49 +0100 +Subject: [PATCH] x86/cpu_entry_area: Add debugstore entries to cpu_entry_area +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +The Intel PEBS/BTS debug store is a design trainwreck as it expects virtual +addresses which must be visible in any execution context. 
+ +So it is required to make these mappings visible to user space when kernel +page table isolation is active. + +Provide enough room for the buffer mappings in the cpu_entry_area so the +buffers are available in the user space visible page tables. + +At the point where the kernel side entry area is populated there is no +buffer available yet, but the kernel PMD must be populated. To achieve this +set the entries for these buffers to non present. + +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Signed-off-by: Ingo Molnar +(cherry picked from commit 10043e02db7f8a4161f76434931051e7d797a5f6) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 4b9996f9c2d35d23a9fa2afe4f161402e6f28309) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/events/perf_event.h | 21 ++------------------ + arch/x86/include/asm/cpu_entry_area.h | 13 +++++++++++++ + arch/x86/include/asm/intel_ds.h | 36 +++++++++++++++++++++++++++++++++++ + arch/x86/events/intel/ds.c | 5 +++-- + arch/x86/mm/cpu_entry_area.c | 27 ++++++++++++++++++++++++++ + 5 files changed, 81 insertions(+), 21 deletions(-) + create mode 100644 arch/x86/include/asm/intel_ds.h + +diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h +index 590eaf7c2c3e..308bc14f58af 100644 +--- a/arch/x86/events/perf_event.h ++++ b/arch/x86/events/perf_event.h +@@ -14,6 +14,8 @@ + + #include + ++#include ++ + /* To enable MSR tracing please use the generic trace points. 
*/ + + /* +@@ -77,8 +79,6 @@ struct amd_nb { + struct event_constraint event_constraints[X86_PMC_IDX_MAX]; + }; + +-/* The maximal number of PEBS events: */ +-#define MAX_PEBS_EVENTS 8 + #define PEBS_COUNTER_MASK ((1ULL << MAX_PEBS_EVENTS) - 1) + + /* +@@ -95,23 +95,6 @@ struct amd_nb { + PERF_SAMPLE_TRANSACTION | \ + PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER) + +-/* +- * A debug store configuration. +- * +- * We only support architectures that use 64bit fields. +- */ +-struct debug_store { +- u64 bts_buffer_base; +- u64 bts_index; +- u64 bts_absolute_maximum; +- u64 bts_interrupt_threshold; +- u64 pebs_buffer_base; +- u64 pebs_index; +- u64 pebs_absolute_maximum; +- u64 pebs_interrupt_threshold; +- u64 pebs_event_reset[MAX_PEBS_EVENTS]; +-}; +- + #define PEBS_REGS \ + (PERF_REG_X86_AX | \ + PERF_REG_X86_BX | \ +diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h +index 2fbc69a0916e..4a7884b8dca5 100644 +--- a/arch/x86/include/asm/cpu_entry_area.h ++++ b/arch/x86/include/asm/cpu_entry_area.h +@@ -5,6 +5,7 @@ + + #include + #include ++#include + + /* + * cpu_entry_area is a percpu region that contains things needed by the CPU +@@ -40,6 +41,18 @@ struct cpu_entry_area { + */ + char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]; + #endif ++#ifdef CONFIG_CPU_SUP_INTEL ++ /* ++ * Per CPU debug store for Intel performance monitoring. Wastes a ++ * full page at the moment. ++ */ ++ struct debug_store cpu_debug_store; ++ /* ++ * The actual PEBS/BTS buffers must be mapped to user space ++ * Reserve enough fixmap PTEs. 
++ */ ++ struct debug_store_buffers cpu_debug_buffers; ++#endif + }; + + #define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area)) +diff --git a/arch/x86/include/asm/intel_ds.h b/arch/x86/include/asm/intel_ds.h +new file mode 100644 +index 000000000000..62a9f4966b42 +--- /dev/null ++++ b/arch/x86/include/asm/intel_ds.h +@@ -0,0 +1,36 @@ ++#ifndef _ASM_INTEL_DS_H ++#define _ASM_INTEL_DS_H ++ ++#include ++ ++#define BTS_BUFFER_SIZE (PAGE_SIZE << 4) ++#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4) ++ ++/* The maximal number of PEBS events: */ ++#define MAX_PEBS_EVENTS 8 ++ ++/* ++ * A debug store configuration. ++ * ++ * We only support architectures that use 64bit fields. ++ */ ++struct debug_store { ++ u64 bts_buffer_base; ++ u64 bts_index; ++ u64 bts_absolute_maximum; ++ u64 bts_interrupt_threshold; ++ u64 pebs_buffer_base; ++ u64 pebs_index; ++ u64 pebs_absolute_maximum; ++ u64 pebs_interrupt_threshold; ++ u64 pebs_event_reset[MAX_PEBS_EVENTS]; ++} __aligned(PAGE_SIZE); ++ ++DECLARE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store); ++ ++struct debug_store_buffers { ++ char bts_buffer[BTS_BUFFER_SIZE]; ++ char pebs_buffer[PEBS_BUFFER_SIZE]; ++}; ++ ++#endif +diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c +index 98e36e0c791c..21a4ed789ec0 100644 +--- a/arch/x86/events/intel/ds.c ++++ b/arch/x86/events/intel/ds.c +@@ -7,11 +7,12 @@ + + #include "../perf_event.h" + ++/* Waste a full page so it can be mapped into the cpu_entry_area */ ++DEFINE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store); ++ + /* The size of a BTS record in bytes: */ + #define BTS_RECORD_SIZE 24 + +-#define BTS_BUFFER_SIZE (PAGE_SIZE << 4) +-#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4) + #define PEBS_FIXUP_SIZE PAGE_SIZE + + /* +diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c +index fe814fd5e014..b9283cc27622 100644 +--- a/arch/x86/mm/cpu_entry_area.c ++++ b/arch/x86/mm/cpu_entry_area.c +@@ -38,6 +38,32 @@ cea_map_percpu_pages(void 
*cea_vaddr, void *ptr, int pages, pgprot_t prot) + cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot); + } + ++static void percpu_setup_debug_store(int cpu) ++{ ++#ifdef CONFIG_CPU_SUP_INTEL ++ int npages; ++ void *cea; ++ ++ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) ++ return; ++ ++ cea = &get_cpu_entry_area(cpu)->cpu_debug_store; ++ npages = sizeof(struct debug_store) / PAGE_SIZE; ++ BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0); ++ cea_map_percpu_pages(cea, &per_cpu(cpu_debug_store, cpu), npages, ++ PAGE_KERNEL); ++ ++ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers; ++ /* ++ * Force the population of PMDs for not yet allocated per cpu ++ * memory like debug store buffers. ++ */ ++ npages = sizeof(struct debug_store_buffers) / PAGE_SIZE; ++ for (; npages; npages--, cea += PAGE_SIZE) ++ cea_set_pte(cea, 0, PAGE_NONE); ++#endif ++} ++ + /* Setup the fixmap mappings only once per-processor */ + static void __init setup_cpu_entry_area(int cpu) + { +@@ -109,6 +135,7 @@ static void __init setup_cpu_entry_area(int cpu) + cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline, + __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); + #endif ++ percpu_setup_debug_store(cpu); + } + + static __init void setup_cpu_entry_area_ptes(void) +-- +2.14.2 + diff --git a/patches/kernel/0205-x86-events-intel-ds-Map-debug-buffers-in-cpu_entry_a.patch b/patches/kernel/0205-x86-events-intel-ds-Map-debug-buffers-in-cpu_entry_a.patch deleted file mode 100644 index ef63988..0000000 --- a/patches/kernel/0205-x86-events-intel-ds-Map-debug-buffers-in-cpu_entry_a.patch +++ /dev/null @@ -1,280 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Hugh Dickins -Date: Mon, 4 Dec 2017 15:07:50 +0100 -Subject: [PATCH] x86/events/intel/ds: Map debug buffers in cpu_entry_area -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -The BTS and PEBS buffers both have their virtual addresses 
programmed into -the hardware. This means that any access to them is performed via the page -tables. The times that the hardware accesses these are entirely dependent -on how the performance monitoring hardware events are set up. In other -words, there is no way for the kernel to tell when the hardware might -access these buffers. - -To avoid perf crashes, place 'debug_store' allocate pages and map them into -the cpu_entry_area. - -The PEBS fixup buffer does not need this treatment. - -[ tglx: Got rid of the kaiser_add_mapping() complication ] - -Signed-off-by: Hugh Dickins -Signed-off-by: Dave Hansen -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: keescook@google.com -Signed-off-by: Ingo Molnar -(cherry picked from commit c1961a4631daef4aeabee8e368b1b13e8f173c91) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 569dedbb62e16e3268f006dcf745b8d27690ef91) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/events/perf_event.h | 2 + - arch/x86/events/intel/ds.c | 125 +++++++++++++++++++++++++++---------------- - 2 files changed, 82 insertions(+), 45 deletions(-) - -diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h -index 308bc14f58af..eb0876475f18 100644 ---- a/arch/x86/events/perf_event.h -+++ b/arch/x86/events/perf_event.h -@@ -199,6 +199,8 @@ struct cpu_hw_events { - * Intel DebugStore bits - */ - struct debug_store *ds; -+ void *ds_pebs_vaddr; -+ void *ds_bts_vaddr; - u64 pebs_enabled; - int n_pebs; - int n_large_pebs; -diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c -index 21a4ed789ec0..85df1f12c49e 100644 ---- a/arch/x86/events/intel/ds.c -+++ 
b/arch/x86/events/intel/ds.c -@@ -2,6 +2,7 @@ - #include - #include - -+#include - #include - #include - -@@ -279,17 +280,52 @@ void fini_debug_store_on_cpu(int cpu) - - static DEFINE_PER_CPU(void *, insn_buffer); - --static int alloc_pebs_buffer(int cpu) -+static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot) - { -- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; -+ phys_addr_t pa; -+ size_t msz = 0; -+ -+ pa = virt_to_phys(addr); -+ for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE) -+ cea_set_pte(cea, pa, prot); -+} -+ -+static void ds_clear_cea(void *cea, size_t size) -+{ -+ size_t msz = 0; -+ -+ for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE) -+ cea_set_pte(cea, 0, PAGE_NONE); -+} -+ -+static void *dsalloc_pages(size_t size, gfp_t flags, int cpu) -+{ -+ unsigned int order = get_order(size); - int node = cpu_to_node(cpu); -- int max; -- void *buffer, *ibuffer; -+ struct page *page; -+ -+ page = __alloc_pages_node(node, flags | __GFP_ZERO, order); -+ return page ? 
page_address(page) : NULL; -+} -+ -+static void dsfree_pages(const void *buffer, size_t size) -+{ -+ if (buffer) -+ free_pages((unsigned long)buffer, get_order(size)); -+} -+ -+static int alloc_pebs_buffer(int cpu) -+{ -+ struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); -+ struct debug_store *ds = hwev->ds; -+ size_t bsiz = x86_pmu.pebs_buffer_size; -+ int max, node = cpu_to_node(cpu); -+ void *buffer, *ibuffer, *cea; - - if (!x86_pmu.pebs) - return 0; - -- buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node); -+ buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu); - if (unlikely(!buffer)) - return -ENOMEM; - -@@ -300,25 +336,27 @@ static int alloc_pebs_buffer(int cpu) - if (x86_pmu.intel_cap.pebs_format < 2) { - ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node); - if (!ibuffer) { -- kfree(buffer); -+ dsfree_pages(buffer, bsiz); - return -ENOMEM; - } - per_cpu(insn_buffer, cpu) = ibuffer; - } -- -- max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size; -- -- ds->pebs_buffer_base = (u64)(unsigned long)buffer; -+ hwev->ds_pebs_vaddr = buffer; -+ /* Update the cpu entry area mapping */ -+ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer; -+ ds->pebs_buffer_base = (unsigned long) cea; -+ ds_update_cea(cea, buffer, bsiz, PAGE_KERNEL); - ds->pebs_index = ds->pebs_buffer_base; -- ds->pebs_absolute_maximum = ds->pebs_buffer_base + -- max * x86_pmu.pebs_record_size; -- -+ max = x86_pmu.pebs_record_size * (bsiz / x86_pmu.pebs_record_size); -+ ds->pebs_absolute_maximum = ds->pebs_buffer_base + max; - return 0; - } - - static void release_pebs_buffer(int cpu) - { -- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; -+ struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); -+ struct debug_store *ds = hwev->ds; -+ void *cea; - - if (!ds || !x86_pmu.pebs) - return; -@@ -326,73 +364,70 @@ static void release_pebs_buffer(int cpu) - kfree(per_cpu(insn_buffer, cpu)); - per_cpu(insn_buffer, cpu) = NULL; - -- kfree((void 
*)(unsigned long)ds->pebs_buffer_base); -+ /* Clear the fixmap */ -+ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer; -+ ds_clear_cea(cea, x86_pmu.pebs_buffer_size); - ds->pebs_buffer_base = 0; -+ dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size); -+ hwev->ds_pebs_vaddr = NULL; - } - - static int alloc_bts_buffer(int cpu) - { -- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; -- int node = cpu_to_node(cpu); -- int max, thresh; -- void *buffer; -+ struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); -+ struct debug_store *ds = hwev->ds; -+ void *buffer, *cea; -+ int max; - - if (!x86_pmu.bts) - return 0; - -- buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node); -+ buffer = dsalloc_pages(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, cpu); - if (unlikely(!buffer)) { - WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__); - return -ENOMEM; - } -- -- max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE; -- thresh = max / 16; -- -- ds->bts_buffer_base = (u64)(unsigned long)buffer; -+ hwev->ds_bts_vaddr = buffer; -+ /* Update the fixmap */ -+ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer; -+ ds->bts_buffer_base = (unsigned long) cea; -+ ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL); - ds->bts_index = ds->bts_buffer_base; -- ds->bts_absolute_maximum = ds->bts_buffer_base + -- max * BTS_RECORD_SIZE; -- ds->bts_interrupt_threshold = ds->bts_absolute_maximum - -- thresh * BTS_RECORD_SIZE; -- -+ max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE); -+ ds->bts_absolute_maximum = ds->bts_buffer_base + max; -+ ds->bts_interrupt_threshold = ds->bts_absolute_maximum - (max / 16); - return 0; - } - - static void release_bts_buffer(int cpu) - { -- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; -+ struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); -+ struct debug_store *ds = hwev->ds; -+ void *cea; - - if (!ds || !x86_pmu.bts) - return; - -- kfree((void *)(unsigned 
long)ds->bts_buffer_base); -+ /* Clear the fixmap */ -+ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer; -+ ds_clear_cea(cea, BTS_BUFFER_SIZE); - ds->bts_buffer_base = 0; -+ dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE); -+ hwev->ds_bts_vaddr = NULL; - } - - static int alloc_ds_buffer(int cpu) - { -- int node = cpu_to_node(cpu); -- struct debug_store *ds; -- -- ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node); -- if (unlikely(!ds)) -- return -ENOMEM; -+ struct debug_store *ds = &get_cpu_entry_area(cpu)->cpu_debug_store; - -+ memset(ds, 0, sizeof(*ds)); - per_cpu(cpu_hw_events, cpu).ds = ds; -- - return 0; - } - - static void release_ds_buffer(int cpu) - { -- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; -- -- if (!ds) -- return; -- - per_cpu(cpu_hw_events, cpu).ds = NULL; -- kfree(ds); - } - - void release_ds_buffers(void) --- -2.14.2 - diff --git a/patches/kernel/0206-x86-events-intel-ds-Map-debug-buffers-in-cpu_entry_a.patch b/patches/kernel/0206-x86-events-intel-ds-Map-debug-buffers-in-cpu_entry_a.patch new file mode 100644 index 0000000..ef63988 --- /dev/null +++ b/patches/kernel/0206-x86-events-intel-ds-Map-debug-buffers-in-cpu_entry_a.patch @@ -0,0 +1,280 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Hugh Dickins +Date: Mon, 4 Dec 2017 15:07:50 +0100 +Subject: [PATCH] x86/events/intel/ds: Map debug buffers in cpu_entry_area +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +The BTS and PEBS buffers both have their virtual addresses programmed into +the hardware. This means that any access to them is performed via the page +tables. The times that the hardware accesses these are entirely dependent +on how the performance monitoring hardware events are set up. In other +words, there is no way for the kernel to tell when the hardware might +access these buffers. 
+ +To avoid perf crashes, place 'debug_store' allocate pages and map them into +the cpu_entry_area. + +The PEBS fixup buffer does not need this treatment. + +[ tglx: Got rid of the kaiser_add_mapping() complication ] + +Signed-off-by: Hugh Dickins +Signed-off-by: Dave Hansen +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: keescook@google.com +Signed-off-by: Ingo Molnar +(cherry picked from commit c1961a4631daef4aeabee8e368b1b13e8f173c91) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 569dedbb62e16e3268f006dcf745b8d27690ef91) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/events/perf_event.h | 2 + + arch/x86/events/intel/ds.c | 125 +++++++++++++++++++++++++++---------------- + 2 files changed, 82 insertions(+), 45 deletions(-) + +diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h +index 308bc14f58af..eb0876475f18 100644 +--- a/arch/x86/events/perf_event.h ++++ b/arch/x86/events/perf_event.h +@@ -199,6 +199,8 @@ struct cpu_hw_events { + * Intel DebugStore bits + */ + struct debug_store *ds; ++ void *ds_pebs_vaddr; ++ void *ds_bts_vaddr; + u64 pebs_enabled; + int n_pebs; + int n_large_pebs; +diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c +index 21a4ed789ec0..85df1f12c49e 100644 +--- a/arch/x86/events/intel/ds.c ++++ b/arch/x86/events/intel/ds.c +@@ -2,6 +2,7 @@ + #include + #include + ++#include + #include + #include + +@@ -279,17 +280,52 @@ void fini_debug_store_on_cpu(int cpu) + + static DEFINE_PER_CPU(void *, insn_buffer); + +-static int alloc_pebs_buffer(int cpu) ++static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot) + { +- struct 
debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; ++ phys_addr_t pa; ++ size_t msz = 0; ++ ++ pa = virt_to_phys(addr); ++ for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE) ++ cea_set_pte(cea, pa, prot); ++} ++ ++static void ds_clear_cea(void *cea, size_t size) ++{ ++ size_t msz = 0; ++ ++ for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE) ++ cea_set_pte(cea, 0, PAGE_NONE); ++} ++ ++static void *dsalloc_pages(size_t size, gfp_t flags, int cpu) ++{ ++ unsigned int order = get_order(size); + int node = cpu_to_node(cpu); +- int max; +- void *buffer, *ibuffer; ++ struct page *page; ++ ++ page = __alloc_pages_node(node, flags | __GFP_ZERO, order); ++ return page ? page_address(page) : NULL; ++} ++ ++static void dsfree_pages(const void *buffer, size_t size) ++{ ++ if (buffer) ++ free_pages((unsigned long)buffer, get_order(size)); ++} ++ ++static int alloc_pebs_buffer(int cpu) ++{ ++ struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); ++ struct debug_store *ds = hwev->ds; ++ size_t bsiz = x86_pmu.pebs_buffer_size; ++ int max, node = cpu_to_node(cpu); ++ void *buffer, *ibuffer, *cea; + + if (!x86_pmu.pebs) + return 0; + +- buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node); ++ buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu); + if (unlikely(!buffer)) + return -ENOMEM; + +@@ -300,25 +336,27 @@ static int alloc_pebs_buffer(int cpu) + if (x86_pmu.intel_cap.pebs_format < 2) { + ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node); + if (!ibuffer) { +- kfree(buffer); ++ dsfree_pages(buffer, bsiz); + return -ENOMEM; + } + per_cpu(insn_buffer, cpu) = ibuffer; + } +- +- max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size; +- +- ds->pebs_buffer_base = (u64)(unsigned long)buffer; ++ hwev->ds_pebs_vaddr = buffer; ++ /* Update the cpu entry area mapping */ ++ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer; ++ ds->pebs_buffer_base = (unsigned long) cea; ++ ds_update_cea(cea, buffer, bsiz, PAGE_KERNEL); + 
ds->pebs_index = ds->pebs_buffer_base; +- ds->pebs_absolute_maximum = ds->pebs_buffer_base + +- max * x86_pmu.pebs_record_size; +- ++ max = x86_pmu.pebs_record_size * (bsiz / x86_pmu.pebs_record_size); ++ ds->pebs_absolute_maximum = ds->pebs_buffer_base + max; + return 0; + } + + static void release_pebs_buffer(int cpu) + { +- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; ++ struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); ++ struct debug_store *ds = hwev->ds; ++ void *cea; + + if (!ds || !x86_pmu.pebs) + return; +@@ -326,73 +364,70 @@ static void release_pebs_buffer(int cpu) + kfree(per_cpu(insn_buffer, cpu)); + per_cpu(insn_buffer, cpu) = NULL; + +- kfree((void *)(unsigned long)ds->pebs_buffer_base); ++ /* Clear the fixmap */ ++ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer; ++ ds_clear_cea(cea, x86_pmu.pebs_buffer_size); + ds->pebs_buffer_base = 0; ++ dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size); ++ hwev->ds_pebs_vaddr = NULL; + } + + static int alloc_bts_buffer(int cpu) + { +- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; +- int node = cpu_to_node(cpu); +- int max, thresh; +- void *buffer; ++ struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); ++ struct debug_store *ds = hwev->ds; ++ void *buffer, *cea; ++ int max; + + if (!x86_pmu.bts) + return 0; + +- buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node); ++ buffer = dsalloc_pages(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, cpu); + if (unlikely(!buffer)) { + WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__); + return -ENOMEM; + } +- +- max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE; +- thresh = max / 16; +- +- ds->bts_buffer_base = (u64)(unsigned long)buffer; ++ hwev->ds_bts_vaddr = buffer; ++ /* Update the fixmap */ ++ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer; ++ ds->bts_buffer_base = (unsigned long) cea; ++ ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL); + ds->bts_index = 
ds->bts_buffer_base; +- ds->bts_absolute_maximum = ds->bts_buffer_base + +- max * BTS_RECORD_SIZE; +- ds->bts_interrupt_threshold = ds->bts_absolute_maximum - +- thresh * BTS_RECORD_SIZE; +- ++ max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE); ++ ds->bts_absolute_maximum = ds->bts_buffer_base + max; ++ ds->bts_interrupt_threshold = ds->bts_absolute_maximum - (max / 16); + return 0; + } + + static void release_bts_buffer(int cpu) + { +- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; ++ struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); ++ struct debug_store *ds = hwev->ds; ++ void *cea; + + if (!ds || !x86_pmu.bts) + return; + +- kfree((void *)(unsigned long)ds->bts_buffer_base); ++ /* Clear the fixmap */ ++ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer; ++ ds_clear_cea(cea, BTS_BUFFER_SIZE); + ds->bts_buffer_base = 0; ++ dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE); ++ hwev->ds_bts_vaddr = NULL; + } + + static int alloc_ds_buffer(int cpu) + { +- int node = cpu_to_node(cpu); +- struct debug_store *ds; +- +- ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node); +- if (unlikely(!ds)) +- return -ENOMEM; ++ struct debug_store *ds = &get_cpu_entry_area(cpu)->cpu_debug_store; + ++ memset(ds, 0, sizeof(*ds)); + per_cpu(cpu_hw_events, cpu).ds = ds; +- + return 0; + } + + static void release_ds_buffer(int cpu) + { +- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; +- +- if (!ds) +- return; +- + per_cpu(cpu_hw_events, cpu).ds = NULL; +- kfree(ds); + } + + void release_ds_buffers(void) +-- +2.14.2 + diff --git a/patches/kernel/0206-x86-mm-64-Make-a-full-PGD-entry-size-hole-in-the-mem.patch b/patches/kernel/0206-x86-mm-64-Make-a-full-PGD-entry-size-hole-in-the-mem.patch deleted file mode 100644 index e408245..0000000 --- a/patches/kernel/0206-x86-mm-64-Make-a-full-PGD-entry-size-hole-in-the-mem.patch +++ /dev/null @@ -1,74 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy 
Lutomirski -Date: Tue, 12 Dec 2017 07:56:44 -0800 -Subject: [PATCH] x86/mm/64: Make a full PGD-entry size hole in the memory map -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Shrink vmalloc space from 16384TiB to 12800TiB to enlarge the hole starting -at 0xff90000000000000 to be a full PGD entry. - -A subsequent patch will use this hole for the pagetable isolation LDT -alias. - -Signed-off-by: Andy Lutomirski -Signed-off-by: Thomas Gleixner -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Dave Hansen -Cc: David Laight -Cc: H. Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Kees Cook -Cc: Kirill A. Shutemov -Cc: Linus Torvalds -Cc: Peter Zijlstra -Signed-off-by: Ingo Molnar -(cherry picked from commit 9f449772a3106bcdd4eb8fdeb281147b0e99fb30) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 29b1c137d449dfc8fdcb476158f236625691fd28) -Signed-off-by: Fabian Grünbichler ---- - Documentation/x86/x86_64/mm.txt | 4 ++-- - arch/x86/include/asm/pgtable_64_types.h | 4 ++-- - 2 files changed, 4 insertions(+), 4 deletions(-) - -diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt -index 51101708a03a..496a1dbf139d 100644 ---- a/Documentation/x86/x86_64/mm.txt -+++ b/Documentation/x86/x86_64/mm.txt -@@ -29,8 +29,8 @@ Virtual memory map with 5 level page tables: - hole caused by [56:63] sign extension - ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor - ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. 
memory --ff90000000000000 - ff91ffffffffffff (=49 bits) hole --ff92000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space -+ff90000000000000 - ff9fffffffffffff (=52 bits) hole -+ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB) - ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole - ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB) - ... unused hole ... -diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h -index 42e2750da525..5932dead34ee 100644 ---- a/arch/x86/include/asm/pgtable_64_types.h -+++ b/arch/x86/include/asm/pgtable_64_types.h -@@ -78,8 +78,8 @@ typedef struct { pteval_t pte; } pte_t; - #define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) - - #ifdef CONFIG_X86_5LEVEL --# define VMALLOC_SIZE_TB _AC(16384, UL) --# define __VMALLOC_BASE _AC(0xff92000000000000, UL) -+# define VMALLOC_SIZE_TB _AC(12800, UL) -+# define __VMALLOC_BASE _AC(0xffa0000000000000, UL) - # define __VMEMMAP_BASE _AC(0xffd4000000000000, UL) - #else - # define VMALLOC_SIZE_TB _AC(32, UL) --- -2.14.2 - diff --git a/patches/kernel/0207-x86-mm-64-Make-a-full-PGD-entry-size-hole-in-the-mem.patch b/patches/kernel/0207-x86-mm-64-Make-a-full-PGD-entry-size-hole-in-the-mem.patch new file mode 100644 index 0000000..e408245 --- /dev/null +++ b/patches/kernel/0207-x86-mm-64-Make-a-full-PGD-entry-size-hole-in-the-mem.patch @@ -0,0 +1,74 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Tue, 12 Dec 2017 07:56:44 -0800 +Subject: [PATCH] x86/mm/64: Make a full PGD-entry size hole in the memory map +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Shrink vmalloc space from 16384TiB to 12800TiB to enlarge the hole starting +at 0xff90000000000000 to be a full PGD entry. + +A subsequent patch will use this hole for the pagetable isolation LDT +alias. 
+ +Signed-off-by: Andy Lutomirski +Signed-off-by: Thomas Gleixner +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Dave Hansen +Cc: David Laight +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Kees Cook +Cc: Kirill A. Shutemov +Cc: Linus Torvalds +Cc: Peter Zijlstra +Signed-off-by: Ingo Molnar +(cherry picked from commit 9f449772a3106bcdd4eb8fdeb281147b0e99fb30) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 29b1c137d449dfc8fdcb476158f236625691fd28) +Signed-off-by: Fabian Grünbichler +--- + Documentation/x86/x86_64/mm.txt | 4 ++-- + arch/x86/include/asm/pgtable_64_types.h | 4 ++-- + 2 files changed, 4 insertions(+), 4 deletions(-) + +diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt +index 51101708a03a..496a1dbf139d 100644 +--- a/Documentation/x86/x86_64/mm.txt ++++ b/Documentation/x86/x86_64/mm.txt +@@ -29,8 +29,8 @@ Virtual memory map with 5 level page tables: + hole caused by [56:63] sign extension + ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor + ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory +-ff90000000000000 - ff91ffffffffffff (=49 bits) hole +-ff92000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space ++ff90000000000000 - ff9fffffffffffff (=52 bits) hole ++ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB) + ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole + ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB) + ... unused hole ... 
+diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h +index 42e2750da525..5932dead34ee 100644 +--- a/arch/x86/include/asm/pgtable_64_types.h ++++ b/arch/x86/include/asm/pgtable_64_types.h +@@ -78,8 +78,8 @@ typedef struct { pteval_t pte; } pte_t; + #define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) + + #ifdef CONFIG_X86_5LEVEL +-# define VMALLOC_SIZE_TB _AC(16384, UL) +-# define __VMALLOC_BASE _AC(0xff92000000000000, UL) ++# define VMALLOC_SIZE_TB _AC(12800, UL) ++# define __VMALLOC_BASE _AC(0xffa0000000000000, UL) + # define __VMEMMAP_BASE _AC(0xffd4000000000000, UL) + #else + # define VMALLOC_SIZE_TB _AC(32, UL) +-- +2.14.2 + diff --git a/patches/kernel/0207-x86-pti-Put-the-LDT-in-its-own-PGD-if-PTI-is-on.patch b/patches/kernel/0207-x86-pti-Put-the-LDT-in-its-own-PGD-if-PTI-is-on.patch deleted file mode 100644 index 74b842f..0000000 --- a/patches/kernel/0207-x86-pti-Put-the-LDT-in-its-own-PGD-if-PTI-is-on.patch +++ /dev/null @@ -1,466 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Tue, 12 Dec 2017 07:56:45 -0800 -Subject: [PATCH] x86/pti: Put the LDT in its own PGD if PTI is on -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -With PTI enabled, the LDT must be mapped in the usermode tables somewhere. -The LDT is per process, i.e. per mm. - -An earlier approach mapped the LDT on context switch into a fixmap area, -but that's a big overhead and exhausted the fixmap space when NR_CPUS got -big. - -Take advantage of the fact that there is an address space hole which -provides a completely unused pgd. Use this pgd to manage per-mm LDT -mappings. - -This has a down side: the LDT isn't (currently) randomized, and an attack -that can write the LDT is instant root due to call gates (thanks, AMD, for -leaving call gates in AMD64 but designing them wrong so they're only useful -for exploits). 
This can be mitigated by making the LDT read-only or -randomizing the mapping, either of which is straightforward on top of this -patch. - -This will significantly slow down LDT users, but that shouldn't matter for -important workloads -- the LDT is only used by DOSEMU(2), Wine, and very -old libc implementations. - -[ tglx: Cleaned it up. ] - -Signed-off-by: Andy Lutomirski -Signed-off-by: Thomas Gleixner -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: Dave Hansen -Cc: David Laight -Cc: H. Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Kees Cook -Cc: Kirill A. Shutemov -Cc: Linus Torvalds -Cc: Peter Zijlstra -Signed-off-by: Ingo Molnar -(cherry picked from commit f55f0501cbf65ec41cca5058513031b711730b1d) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit c250643846b45ea6782fb0cfcc15e8cd34744bc7) -Signed-off-by: Fabian Grünbichler ---- - Documentation/x86/x86_64/mm.txt | 3 +- - arch/x86/include/asm/mmu_context.h | 59 ++++++++++++-- - arch/x86/include/asm/pgtable_64_types.h | 4 + - arch/x86/include/asm/processor.h | 23 ++++-- - arch/x86/kernel/ldt.c | 139 +++++++++++++++++++++++++++++++- - arch/x86/mm/dump_pagetables.c | 9 +++ - 6 files changed, 220 insertions(+), 17 deletions(-) - -diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt -index 496a1dbf139d..ad41b3813f0a 100644 ---- a/Documentation/x86/x86_64/mm.txt -+++ b/Documentation/x86/x86_64/mm.txt -@@ -12,6 +12,7 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB) - ... unused hole ... - ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB) - ... unused hole ... -+fffffe0000000000 - fffffe7fffffffff (=39 bits) LDT remap for PTI - fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping - ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks - ... unused hole ...
-@@ -29,7 +30,7 @@ Virtual memory map with 5 level page tables: - hole caused by [56:63] sign extension - ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor - ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory --ff90000000000000 - ff9fffffffffffff (=52 bits) hole -+ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI - ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB) - ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole - ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB) -diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h -index 89a01ad7e370..9e3546e1c0f4 100644 ---- a/arch/x86/include/asm/mmu_context.h -+++ b/arch/x86/include/asm/mmu_context.h -@@ -49,10 +49,33 @@ struct ldt_struct { - * call gates. On native, we could merge the ldt_struct and LDT - * allocations, but it's not worth trying to optimize. - */ -- struct desc_struct *entries; -- unsigned int nr_entries; -+ struct desc_struct *entries; -+ unsigned int nr_entries; -+ -+ /* -+ * If PTI is in use, then the entries array is not mapped while we're -+ * in user mode. The whole array will be aliased at the address -+ * given by ldt_slot_va(slot). We use two slots so that we can allocate -+ * and map, and enable a new LDT without invalidating the mapping -+ * of an older, still-in-use LDT. -+ * -+ * slot will be -1 if this LDT doesn't have an alias mapping. -+ */ -+ int slot; - }; - -+/* This is a multiple of PAGE_SIZE. */ -+#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE) -+ -+static inline void *ldt_slot_va(int slot) -+{ -+#ifdef CONFIG_X86_64 -+ return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot); -+#else -+ BUG(); -+#endif -+} -+ - /* - * Used for LDT copy/destruction.
- */ -@@ -63,6 +86,7 @@ static inline void init_new_context_ldt(struct mm_struct *mm) - } - int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm); - void destroy_context_ldt(struct mm_struct *mm); -+void ldt_arch_exit_mmap(struct mm_struct *mm); - #else /* CONFIG_MODIFY_LDT_SYSCALL */ - static inline void init_new_context_ldt(struct mm_struct *mm) { } - static inline int ldt_dup_context(struct mm_struct *oldmm, -@@ -70,7 +94,8 @@ static inline int ldt_dup_context(struct mm_struct *oldmm, - { - return 0; - } --static inline void destroy_context_ldt(struct mm_struct *mm) {} -+static inline void destroy_context_ldt(struct mm_struct *mm) { } -+static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { } - #endif - - static inline void load_mm_ldt(struct mm_struct *mm) -@@ -95,10 +120,31 @@ static inline void load_mm_ldt(struct mm_struct *mm) - * that we can see. - */ - -- if (unlikely(ldt)) -- set_ldt(ldt->entries, ldt->nr_entries); -- else -+ if (unlikely(ldt)) { -+ if (static_cpu_has(X86_FEATURE_PTI)) { -+ if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) { -+ /* -+ * Whoops -- either the new LDT isn't mapped -+ * (if slot == -1) or is mapped into a bogus -+ * slot (if slot > 1). -+ */ -+ clear_LDT(); -+ return; -+ } -+ -+ /* -+ * If page table isolation is enabled, ldt->entries -+ * will not be mapped in the userspace pagetables. -+ * Tell the CPU to access the LDT through the alias -+ * at ldt_slot_va(ldt->slot). 
-+ */ -+ set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries); -+ } else { -+ set_ldt(ldt->entries, ldt->nr_entries); -+ } -+ } else { - clear_LDT(); -+ } - #else - clear_LDT(); - #endif -@@ -193,6 +239,7 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) - static inline void arch_exit_mmap(struct mm_struct *mm) - { - paravirt_arch_exit_mmap(mm); -+ ldt_arch_exit_mmap(mm); - } - - #ifdef CONFIG_X86_64 -diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h -index 5932dead34ee..e8a809ee0bb6 100644 ---- a/arch/x86/include/asm/pgtable_64_types.h -+++ b/arch/x86/include/asm/pgtable_64_types.h -@@ -81,10 +81,14 @@ typedef struct { pteval_t pte; } pte_t; - # define VMALLOC_SIZE_TB _AC(12800, UL) - # define __VMALLOC_BASE _AC(0xffa0000000000000, UL) - # define __VMEMMAP_BASE _AC(0xffd4000000000000, UL) -+# define LDT_PGD_ENTRY _AC(-112, UL) -+# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) - #else - # define VMALLOC_SIZE_TB _AC(32, UL) - # define __VMALLOC_BASE _AC(0xffffc90000000000, UL) - # define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) -+# define LDT_PGD_ENTRY _AC(-4, UL) -+# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) - #endif - - #ifdef CONFIG_RANDOMIZE_MEMORY -diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h -index 935d68609922..24503521c947 100644 ---- a/arch/x86/include/asm/processor.h -+++ b/arch/x86/include/asm/processor.h -@@ -843,13 +843,22 @@ static inline void spin_lock_prefetch(const void *x) - - #else - /* -- * User space process size. 47bits minus one guard page. The guard -- * page is necessary on Intel CPUs: if a SYSCALL instruction is at -- * the highest possible canonical userspace address, then that -- * syscall will enter the kernel with a non-canonical return -- * address, and SYSRET will explode dangerously. We avoid this -- * particular problem by preventing anything from being mapped -- * at the maximum canonical address. 
-+ * User space process size. This is the first address outside the user range. -+ * There are a few constraints that determine this: -+ * -+ * On Intel CPUs, if a SYSCALL instruction is at the highest canonical -+ * address, then that syscall will enter the kernel with a -+ * non-canonical return address, and SYSRET will explode dangerously. -+ * We avoid this particular problem by preventing anything executable -+ * from being mapped at the maximum canonical address. -+ * -+ * On AMD CPUs in the Ryzen family, there's a nasty bug in which the -+ * CPUs malfunction if they execute code from the highest canonical page. -+ * They'll speculate right off the end of the canonical space, and -+ * bad things happen. This is worked around in the same way as the -+ * Intel problem. -+ * -+ * With page table isolation enabled, we map the LDT in ... [stay tuned] - */ - #define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE) - -diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c -index 74a5aaf13f3c..eceaada581ff 100644 ---- a/arch/x86/kernel/ldt.c -+++ b/arch/x86/kernel/ldt.c -@@ -23,6 +23,7 @@ - #include - - #include -+#include - #include - #include - #include -@@ -50,13 +51,11 @@ static void refresh_ldt_segments(void) - static void flush_ldt(void *__mm) - { - struct mm_struct *mm = __mm; -- mm_context_t *pc; - - if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm) - return; - -- pc = &mm->context; -- set_ldt(pc->ldt->entries, pc->ldt->nr_entries); -+ load_mm_ldt(mm); - - refresh_ldt_segments(); - } -@@ -93,10 +92,121 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) - return NULL; - } - -+ /* The new LDT isn't aliased for PTI yet. */ -+ new_ldt->slot = -1; -+ - new_ldt->nr_entries = num_entries; - return new_ldt; - } - -+/* -+ * If PTI is enabled, this maps the LDT into the kernelmode and -+ * usermode tables for the given mm. -+ * -+ * There is no corresponding unmap function. 
Even if the LDT is freed, we -+ * leave the PTEs around until the slot is reused or the mm is destroyed. -+ * This is harmless: the LDT is always in ordinary memory, and no one will -+ * access the freed slot. -+ * -+ * If we wanted to unmap freed LDTs, we'd also need to do a flush to make -+ * it useful, and the flush would slow down modify_ldt(). -+ */ -+static int -+map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) -+{ -+#ifdef CONFIG_PAGE_TABLE_ISOLATION -+ bool is_vmalloc, had_top_level_entry; -+ unsigned long va; -+ spinlock_t *ptl; -+ pgd_t *pgd; -+ int i; -+ -+ if (!static_cpu_has(X86_FEATURE_PTI)) -+ return 0; -+ -+ /* -+ * Any given ldt_struct should have map_ldt_struct() called at most -+ * once. -+ */ -+ WARN_ON(ldt->slot != -1); -+ -+ /* -+ * Did we already have the top level entry allocated? We can't -+ * use pgd_none() for this because it doesn't do anything on -+ * 4-level page table kernels. -+ */ -+ pgd = pgd_offset(mm, LDT_BASE_ADDR); -+ had_top_level_entry = (pgd->pgd != 0); -+ -+ is_vmalloc = is_vmalloc_addr(ldt->entries); -+ -+ for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) { -+ unsigned long offset = i << PAGE_SHIFT; -+ const void *src = (char *)ldt->entries + offset; -+ unsigned long pfn; -+ pte_t pte, *ptep; -+ -+ va = (unsigned long)ldt_slot_va(slot) + offset; -+ pfn = is_vmalloc ? vmalloc_to_pfn(src) : -+ page_to_pfn(virt_to_page(src)); -+ /* -+ * Treat the PTI LDT range as a *userspace* range. -+ * get_locked_pte() will allocate all needed pagetables -+ * and account for them in this mm. -+ */ -+ ptep = get_locked_pte(mm, va, &ptl); -+ if (!ptep) -+ return -ENOMEM; -+ pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL)); -+ set_pte_at(mm, va, ptep, pte); -+ pte_unmap_unlock(ptep, ptl); -+ } -+ -+ if (mm->context.ldt) { -+ /* -+ * We already had an LDT. The top-level entry should already -+ * have been allocated and synchronized with the usermode -+ * tables.
-+ */ -+ WARN_ON(!had_top_level_entry); -+ if (static_cpu_has(X86_FEATURE_PTI)) -+ WARN_ON(!kernel_to_user_pgdp(pgd)->pgd); -+ } else { -+ /* -+ * This is the first time we're mapping an LDT for this process. -+ * Sync the pgd to the usermode tables. -+ */ -+ WARN_ON(had_top_level_entry); -+ if (static_cpu_has(X86_FEATURE_PTI)) { -+ WARN_ON(kernel_to_user_pgdp(pgd)->pgd); -+ set_pgd(kernel_to_user_pgdp(pgd), *pgd); -+ } -+ } -+ -+ va = (unsigned long)ldt_slot_va(slot); -+ flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0); -+ -+ ldt->slot = slot; -+#endif -+ return 0; -+} -+ -+static void free_ldt_pgtables(struct mm_struct *mm) -+{ -+#ifdef CONFIG_PAGE_TABLE_ISOLATION -+ struct mmu_gather tlb; -+ unsigned long start = LDT_BASE_ADDR; -+ unsigned long end = start + (1UL << PGDIR_SHIFT); -+ -+ if (!static_cpu_has(X86_FEATURE_PTI)) -+ return; -+ -+ tlb_gather_mmu(&tlb, mm, start, end); -+ free_pgd_range(&tlb, start, end, start, end); -+ tlb_finish_mmu(&tlb, start, end); -+#endif -+} -+ - /* After calling this, the LDT is immutable. 
*/ - static void finalize_ldt_struct(struct ldt_struct *ldt) - { -@@ -155,6 +265,12 @@ int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm) - new_ldt->nr_entries * LDT_ENTRY_SIZE); - finalize_ldt_struct(new_ldt); - -+ retval = map_ldt_struct(mm, new_ldt, 0); -+ if (retval) { -+ free_ldt_pgtables(mm); -+ free_ldt_struct(new_ldt); -+ goto out_unlock; -+ } - mm->context.ldt = new_ldt; - - out_unlock: -@@ -173,6 +289,11 @@ void destroy_context_ldt(struct mm_struct *mm) - mm->context.ldt = NULL; - } - -+void ldt_arch_exit_mmap(struct mm_struct *mm) -+{ -+ free_ldt_pgtables(mm); -+} -+ - static int read_ldt(void __user *ptr, unsigned long bytecount) - { - struct mm_struct *mm = current->mm; -@@ -286,6 +407,18 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) - new_ldt->entries[ldt_info.entry_number] = ldt; - finalize_ldt_struct(new_ldt); - -+ /* -+ * If we are using PTI, map the new LDT into the userspace pagetables. -+ * If there is already an LDT, use the other slot so that other CPUs -+ * will continue to use the old LDT until install_ldt() switches -+ * them over to the new LDT. -+ */ -+ error = map_ldt_struct(mm, new_ldt, old_ldt ? 
!old_ldt->slot : 0); -+ if (error) { -+ free_ldt_struct(old_ldt); -+ goto out_unlock; -+ } -+ - install_ldt(mm, new_ldt); - free_ldt_struct(old_ldt); - error = 0; -diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c -index 3b7720404a9f..eed93dd4cb4a 100644 ---- a/arch/x86/mm/dump_pagetables.c -+++ b/arch/x86/mm/dump_pagetables.c -@@ -52,11 +52,17 @@ enum address_markers_idx { - USER_SPACE_NR = 0, - KERNEL_SPACE_NR, - LOW_KERNEL_NR, -+#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL) -+ LDT_NR, -+#endif - VMALLOC_START_NR, - VMEMMAP_START_NR, - #ifdef CONFIG_KASAN - KASAN_SHADOW_START_NR, - KASAN_SHADOW_END_NR, -+#endif -+#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL) -+ LDT_NR, - #endif - CPU_ENTRY_AREA_NR, - #ifdef CONFIG_X86_ESPFIX64 -@@ -81,6 +87,9 @@ static struct addr_marker address_markers[] = { - #ifdef CONFIG_KASAN - [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" }, - [KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" }, -+#endif -+#ifdef CONFIG_MODIFY_LDT_SYSCALL -+ [LDT_NR] = { LDT_BASE_ADDR, "LDT remap" }, - #endif - [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" }, - #ifdef CONFIG_X86_ESPFIX64 --- -2.14.2 - diff --git a/patches/kernel/0208-x86-pti-Map-the-vsyscall-page-if-needed.patch b/patches/kernel/0208-x86-pti-Map-the-vsyscall-page-if-needed.patch deleted file mode 100644 index e6e0ecc..0000000 --- a/patches/kernel/0208-x86-pti-Map-the-vsyscall-page-if-needed.patch +++ /dev/null @@ -1,172 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Lutomirski -Date: Tue, 12 Dec 2017 07:56:42 -0800 -Subject: [PATCH] x86/pti: Map the vsyscall page if needed -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Make VSYSCALLs work fully in PTI mode by mapping them properly to the user -space visible page tables. 
- -[ tglx: Hide unused functions (Patch by Arnd Bergmann) ] - -Signed-off-by: Andy Lutomirski -Signed-off-by: Thomas Gleixner -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: David Laight -Cc: H. Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Kees Cook -Cc: Linus Torvalds -Cc: Peter Zijlstra -Signed-off-by: Ingo Molnar -(cherry picked from commit 85900ea51577e31b186e523c8f4e068c79ecc7d3) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 7a2ba0ea0a18cfc1f18c3f1389ef85f2a0d3227d) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/vsyscall.h | 1 + - arch/x86/entry/vsyscall/vsyscall_64.c | 6 ++-- - arch/x86/mm/pti.c | 65 +++++++++++++++++++++++++++++++++++ - 3 files changed, 69 insertions(+), 3 deletions(-) - -diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h -index 6ba66ee79710..0eaeb223d692 100644 ---- a/arch/x86/include/asm/vsyscall.h -+++ b/arch/x86/include/asm/vsyscall.h -@@ -6,6 +6,7 @@ - - #ifdef CONFIG_X86_VSYSCALL_EMULATION - extern void map_vsyscall(void); -+extern void set_vsyscall_pgtable_user_bits(pgd_t *root); - - /* - * Called on instruction fetch fault in vsyscall page. -diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c -index 5e56a4ced848..238b4bcd3c47 100644 ---- a/arch/x86/entry/vsyscall/vsyscall_64.c -+++ b/arch/x86/entry/vsyscall/vsyscall_64.c -@@ -343,14 +343,14 @@ int in_gate_area_no_mm(unsigned long addr) - * vsyscalls but leave the page not present. If so, we skip calling - * this. 
- */ --static void __init set_vsyscall_pgtable_user_bits(void) -+void __init set_vsyscall_pgtable_user_bits(pgd_t *root) - { - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - -- pgd = pgd_offset_k(VSYSCALL_ADDR); -+ pgd = pgd_offset_pgd(root, VSYSCALL_ADDR); - set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER)); - p4d = p4d_offset(pgd, VSYSCALL_ADDR); - #if CONFIG_PGTABLE_LEVELS >= 5 -@@ -372,7 +372,7 @@ void __init map_vsyscall(void) - vsyscall_mode == NATIVE - ? PAGE_KERNEL_VSYSCALL - : PAGE_KERNEL_VVAR); -- set_vsyscall_pgtable_user_bits(); -+ set_vsyscall_pgtable_user_bits(swapper_pg_dir); - } - - BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != -diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c -index b1c38ef9fbbb..bce8aea65606 100644 ---- a/arch/x86/mm/pti.c -+++ b/arch/x86/mm/pti.c -@@ -38,6 +38,7 @@ - - #include - #include -+#include - #include - #include - #include -@@ -223,6 +224,69 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) - return pmd_offset(pud, address); - } - -+#ifdef CONFIG_X86_VSYSCALL_EMULATION -+/* -+ * Walk the shadow copy of the page tables (optionally) trying to allocate -+ * page table pages on the way down. Does not support large pages. -+ * -+ * Note: this is only used when mapping *new* kernel data into the -+ * user/shadow page tables. It is never used for userspace data. -+ * -+ * Returns a pointer to a PTE on success, or NULL on failure. -+ */ -+static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address) -+{ -+ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); -+ pmd_t *pmd = pti_user_pagetable_walk_pmd(address); -+ pte_t *pte; -+ -+ /* We can't do anything sensible if we hit a large mapping. 
*/ -+ if (pmd_large(*pmd)) { -+ WARN_ON(1); -+ return NULL; -+ } -+ -+ if (pmd_none(*pmd)) { -+ unsigned long new_pte_page = __get_free_page(gfp); -+ if (!new_pte_page) -+ return NULL; -+ -+ if (pmd_none(*pmd)) { -+ set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); -+ new_pte_page = 0; -+ } -+ if (new_pte_page) -+ free_page(new_pte_page); -+ } -+ -+ pte = pte_offset_kernel(pmd, address); -+ if (pte_flags(*pte) & _PAGE_USER) { -+ WARN_ONCE(1, "attempt to walk to user pte\n"); -+ return NULL; -+ } -+ return pte; -+} -+ -+static void __init pti_setup_vsyscall(void) -+{ -+ pte_t *pte, *target_pte; -+ unsigned int level; -+ -+ pte = lookup_address(VSYSCALL_ADDR, &level); -+ if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte)) -+ return; -+ -+ target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR); -+ if (WARN_ON(!target_pte)) -+ return; -+ -+ *target_pte = *pte; -+ set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir)); -+} -+#else -+static void __init pti_setup_vsyscall(void) { } -+#endif -+ - static void __init - pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear) - { -@@ -319,4 +383,5 @@ void __init pti_init(void) - pti_clone_user_shared(); - pti_clone_entry_text(); - pti_setup_espfix64(); -+ pti_setup_vsyscall(); - } --- -2.14.2 - diff --git a/patches/kernel/0208-x86-pti-Put-the-LDT-in-its-own-PGD-if-PTI-is-on.patch b/patches/kernel/0208-x86-pti-Put-the-LDT-in-its-own-PGD-if-PTI-is-on.patch new file mode 100644 index 0000000..74b842f --- /dev/null +++ b/patches/kernel/0208-x86-pti-Put-the-LDT-in-its-own-PGD-if-PTI-is-on.patch @@ -0,0 +1,466 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Tue, 12 Dec 2017 07:56:45 -0800 +Subject: [PATCH] x86/pti: Put the LDT in its own PGD if PTI is on +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +With PTI enabled, the LDT must be mapped in the usermode tables 
somewhere. +The LDT is per process, i.e. per mm. + +An earlier approach mapped the LDT on context switch into a fixmap area, +but that's a big overhead and exhausted the fixmap space when NR_CPUS got +big. + +Take advantage of the fact that there is an address space hole which +provides a completely unused pgd. Use this pgd to manage per-mm LDT +mappings. + +This has a down side: the LDT isn't (currently) randomized, and an attack +that can write the LDT is instant root due to call gates (thanks, AMD, for +leaving call gates in AMD64 but designing them wrong so they're only useful +for exploits). This can be mitigated by making the LDT read-only or +randomizing the mapping, either of which is straightforward on top of this +patch. + +This will significantly slow down LDT users, but that shouldn't matter for +important workloads -- the LDT is only used by DOSEMU(2), Wine, and very +old libc implementations. + +[ tglx: Cleaned it up. ] + +Signed-off-by: Andy Lutomirski +Signed-off-by: Thomas Gleixner +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: Dave Hansen +Cc: David Laight +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Kees Cook +Cc: Kirill A.
Shutemov +Cc: Linus Torvalds +Cc: Peter Zijlstra +Signed-off-by: Ingo Molnar +(cherry picked from commit f55f0501cbf65ec41cca5058513031b711730b1d) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit c250643846b45ea6782fb0cfcc15e8cd34744bc7) +Signed-off-by: Fabian Grünbichler +--- + Documentation/x86/x86_64/mm.txt | 3 +- + arch/x86/include/asm/mmu_context.h | 59 ++++++++++++-- + arch/x86/include/asm/pgtable_64_types.h | 4 + + arch/x86/include/asm/processor.h | 23 ++++-- + arch/x86/kernel/ldt.c | 139 +++++++++++++++++++++++++++++++- + arch/x86/mm/dump_pagetables.c | 9 +++ + 6 files changed, 220 insertions(+), 17 deletions(-) + +diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt +index 496a1dbf139d..ad41b3813f0a 100644 +--- a/Documentation/x86/x86_64/mm.txt ++++ b/Documentation/x86/x86_64/mm.txt +@@ -12,6 +12,7 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB) + ... unused hole ... + ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB) + ... unused hole ... ++fffffe0000000000 - fffffe7fffffffff (=39 bits) LDT remap for PTI + fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping + ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks + ... unused hole ... +@@ -29,7 +30,7 @@ Virtual memory map with 5 level page tables: + hole caused by [56:63] sign extension + ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor + ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. 
memory +-ff90000000000000 - ff9fffffffffffff (=52 bits) hole ++ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI + ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB) + ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole + ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB) +diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h +index 89a01ad7e370..9e3546e1c0f4 100644 +--- a/arch/x86/include/asm/mmu_context.h ++++ b/arch/x86/include/asm/mmu_context.h +@@ -49,10 +49,33 @@ struct ldt_struct { + * call gates. On native, we could merge the ldt_struct and LDT + * allocations, but it's not worth trying to optimize. + */ +- struct desc_struct *entries; +- unsigned int nr_entries; ++ struct desc_struct *entries; ++ unsigned int nr_entries; ++ ++ /* ++ * If PTI is in use, then the entries array is not mapped while we're ++ * in user mode. The whole array will be aliased at the addressed ++ * given by ldt_slot_va(slot). We use two slots so that we can allocate ++ * and map, and enable a new LDT without invalidating the mapping ++ * of an older, still-in-use LDT. ++ * ++ * slot will be -1 if this LDT doesn't have an alias mapping. ++ */ ++ int slot; + }; + ++/* This is a multiple of PAGE_SIZE. */ ++#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE) ++ ++static inline void *ldt_slot_va(int slot) ++{ ++#ifdef CONFIG_X86_64 ++ return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot); ++#else ++ BUG(); ++#endif ++} ++ + /* + * Used for LDT copy/destruction. 
+ */ +@@ -63,6 +86,7 @@ static inline void init_new_context_ldt(struct mm_struct *mm) + } + int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm); + void destroy_context_ldt(struct mm_struct *mm); ++void ldt_arch_exit_mmap(struct mm_struct *mm); + #else /* CONFIG_MODIFY_LDT_SYSCALL */ + static inline void init_new_context_ldt(struct mm_struct *mm) { } + static inline int ldt_dup_context(struct mm_struct *oldmm, +@@ -70,7 +94,8 @@ static inline int ldt_dup_context(struct mm_struct *oldmm, + { + return 0; + } +-static inline void destroy_context_ldt(struct mm_struct *mm) {} ++static inline void destroy_context_ldt(struct mm_struct *mm) { } ++static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { } + #endif + + static inline void load_mm_ldt(struct mm_struct *mm) +@@ -95,10 +120,31 @@ static inline void load_mm_ldt(struct mm_struct *mm) + * that we can see. + */ + +- if (unlikely(ldt)) +- set_ldt(ldt->entries, ldt->nr_entries); +- else ++ if (unlikely(ldt)) { ++ if (static_cpu_has(X86_FEATURE_PTI)) { ++ if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) { ++ /* ++ * Whoops -- either the new LDT isn't mapped ++ * (if slot == -1) or is mapped into a bogus ++ * slot (if slot > 1). ++ */ ++ clear_LDT(); ++ return; ++ } ++ ++ /* ++ * If page table isolation is enabled, ldt->entries ++ * will not be mapped in the userspace pagetables. ++ * Tell the CPU to access the LDT through the alias ++ * at ldt_slot_va(ldt->slot). 
++ */ ++ set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries); ++ } else { ++ set_ldt(ldt->entries, ldt->nr_entries); ++ } ++ } else { + clear_LDT(); ++ } + #else + clear_LDT(); + #endif +@@ -193,6 +239,7 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) + static inline void arch_exit_mmap(struct mm_struct *mm) + { + paravirt_arch_exit_mmap(mm); ++ ldt_arch_exit_mmap(mm); + } + + #ifdef CONFIG_X86_64 +diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h +index 5932dead34ee..e8a809ee0bb6 100644 +--- a/arch/x86/include/asm/pgtable_64_types.h ++++ b/arch/x86/include/asm/pgtable_64_types.h +@@ -81,10 +81,14 @@ typedef struct { pteval_t pte; } pte_t; + # define VMALLOC_SIZE_TB _AC(12800, UL) + # define __VMALLOC_BASE _AC(0xffa0000000000000, UL) + # define __VMEMMAP_BASE _AC(0xffd4000000000000, UL) ++# define LDT_PGD_ENTRY _AC(-112, UL) ++# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) + #else + # define VMALLOC_SIZE_TB _AC(32, UL) + # define __VMALLOC_BASE _AC(0xffffc90000000000, UL) + # define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) ++# define LDT_PGD_ENTRY _AC(-4, UL) ++# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) + #endif + + #ifdef CONFIG_RANDOMIZE_MEMORY +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index 935d68609922..24503521c947 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -843,13 +843,22 @@ static inline void spin_lock_prefetch(const void *x) + + #else + /* +- * User space process size. 47bits minus one guard page. The guard +- * page is necessary on Intel CPUs: if a SYSCALL instruction is at +- * the highest possible canonical userspace address, then that +- * syscall will enter the kernel with a non-canonical return +- * address, and SYSRET will explode dangerously. We avoid this +- * particular problem by preventing anything from being mapped +- * at the maximum canonical address. 
++ * User space process size. This is the first address outside the user range. ++ * There are a few constraints that determine this: ++ * ++ * On Intel CPUs, if a SYSCALL instruction is at the highest canonical ++ * address, then that syscall will enter the kernel with a ++ * non-canonical return address, and SYSRET will explode dangerously. ++ * We avoid this particular problem by preventing anything executable ++ * from being mapped at the maximum canonical address. ++ * ++ * On AMD CPUs in the Ryzen family, there's a nasty bug in which the ++ * CPUs malfunction if they execute code from the highest canonical page. ++ * They'll speculate right off the end of the canonical space, and ++ * bad things happen. This is worked around in the same way as the ++ * Intel problem. ++ * ++ * With page table isolation enabled, we map the LDT in ... [stay tuned] + */ + #define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE) + +diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c +index 74a5aaf13f3c..eceaada581ff 100644 +--- a/arch/x86/kernel/ldt.c ++++ b/arch/x86/kernel/ldt.c +@@ -23,6 +23,7 @@ + #include + + #include ++#include + #include + #include + #include +@@ -50,13 +51,11 @@ static void refresh_ldt_segments(void) + static void flush_ldt(void *__mm) + { + struct mm_struct *mm = __mm; +- mm_context_t *pc; + + if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm) + return; + +- pc = &mm->context; +- set_ldt(pc->ldt->entries, pc->ldt->nr_entries); ++ load_mm_ldt(mm); + + refresh_ldt_segments(); + } +@@ -93,10 +92,121 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) + return NULL; + } + ++ /* The new LDT isn't aliased for PTI yet. */ ++ new_ldt->slot = -1; ++ + new_ldt->nr_entries = num_entries; + return new_ldt; + } + ++/* ++ * If PTI is enabled, this maps the LDT into the kernelmode and ++ * usermode tables for the given mm. ++ * ++ * There is no corresponding unmap function. 
Even if the LDT is freed, we ++ * leave the PTEs around until the slot is reused or the mm is destroyed. ++ * This is harmless: the LDT is always in ordinary memory, and no one will ++ * access the freed slot. ++ * ++ * If we wanted to unmap freed LDTs, we'd also need to do a flush to make ++ * it useful, and the flush would slow down modify_ldt(). ++ */ ++static int ++map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) ++{ ++#ifdef CONFIG_PAGE_TABLE_ISOLATION ++ bool is_vmalloc, had_top_level_entry; ++ unsigned long va; ++ spinlock_t *ptl; ++ pgd_t *pgd; ++ int i; ++ ++ if (!static_cpu_has(X86_FEATURE_PTI)) ++ return 0; ++ ++ /* ++ * Any given ldt_struct should have map_ldt_struct() called at most ++ * once. ++ */ ++ WARN_ON(ldt->slot != -1); ++ ++ /* ++ * Did we already have the top level entry allocated? We can't ++ * use pgd_none() for this because it doens't do anything on ++ * 4-level page table kernels. ++ */ ++ pgd = pgd_offset(mm, LDT_BASE_ADDR); ++ had_top_level_entry = (pgd->pgd != 0); ++ ++ is_vmalloc = is_vmalloc_addr(ldt->entries); ++ ++ for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) { ++ unsigned long offset = i << PAGE_SHIFT; ++ const void *src = (char *)ldt->entries + offset; ++ unsigned long pfn; ++ pte_t pte, *ptep; ++ ++ va = (unsigned long)ldt_slot_va(slot) + offset; ++ pfn = is_vmalloc ? vmalloc_to_pfn(src) : ++ page_to_pfn(virt_to_page(src)); ++ /* ++ * Treat the PTI LDT range as a *userspace* range. ++ * get_locked_pte() will allocate all needed pagetables ++ * and account for them in this mm. ++ */ ++ ptep = get_locked_pte(mm, va, &ptl); ++ if (!ptep) ++ return -ENOMEM; ++ pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL)); ++ set_pte_at(mm, va, ptep, pte); ++ pte_unmap_unlock(ptep, ptl); ++ } ++ ++ if (mm->context.ldt) { ++ /* ++ * We already had an LDT. The top-level entry should already ++ * have been allocated and synchronized with the usermode ++ * tables. 
++ */ ++ WARN_ON(!had_top_level_entry); ++ if (static_cpu_has(X86_FEATURE_PTI)) ++ WARN_ON(!kernel_to_user_pgdp(pgd)->pgd); ++ } else { ++ /* ++ * This is the first time we're mapping an LDT for this process. ++ * Sync the pgd to the usermode tables. ++ */ ++ WARN_ON(had_top_level_entry); ++ if (static_cpu_has(X86_FEATURE_PTI)) { ++ WARN_ON(kernel_to_user_pgdp(pgd)->pgd); ++ set_pgd(kernel_to_user_pgdp(pgd), *pgd); ++ } ++ } ++ ++ va = (unsigned long)ldt_slot_va(slot); ++ flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0); ++ ++ ldt->slot = slot; ++#endif ++ return 0; ++} ++ ++static void free_ldt_pgtables(struct mm_struct *mm) ++{ ++#ifdef CONFIG_PAGE_TABLE_ISOLATION ++ struct mmu_gather tlb; ++ unsigned long start = LDT_BASE_ADDR; ++ unsigned long end = start + (1UL << PGDIR_SHIFT); ++ ++ if (!static_cpu_has(X86_FEATURE_PTI)) ++ return; ++ ++ tlb_gather_mmu(&tlb, mm, start, end); ++ free_pgd_range(&tlb, start, end, start, end); ++ tlb_finish_mmu(&tlb, start, end); ++#endif ++} ++ + /* After calling this, the LDT is immutable. 
*/ + static void finalize_ldt_struct(struct ldt_struct *ldt) + { +@@ -155,6 +265,12 @@ int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm) + new_ldt->nr_entries * LDT_ENTRY_SIZE); + finalize_ldt_struct(new_ldt); + ++ retval = map_ldt_struct(mm, new_ldt, 0); ++ if (retval) { ++ free_ldt_pgtables(mm); ++ free_ldt_struct(new_ldt); ++ goto out_unlock; ++ } + mm->context.ldt = new_ldt; + + out_unlock: +@@ -173,6 +289,11 @@ void destroy_context_ldt(struct mm_struct *mm) + mm->context.ldt = NULL; + } + ++void ldt_arch_exit_mmap(struct mm_struct *mm) ++{ ++ free_ldt_pgtables(mm); ++} ++ + static int read_ldt(void __user *ptr, unsigned long bytecount) + { + struct mm_struct *mm = current->mm; +@@ -286,6 +407,18 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) + new_ldt->entries[ldt_info.entry_number] = ldt; + finalize_ldt_struct(new_ldt); + ++ /* ++ * If we are using PTI, map the new LDT into the userspace pagetables. ++ * If there is already an LDT, use the other slot so that other CPUs ++ * will continue to use the old LDT until install_ldt() switches ++ * them over to the new LDT. ++ */ ++ error = map_ldt_struct(mm, new_ldt, old_ldt ? 
!old_ldt->slot : 0); ++ if (error) { ++ free_ldt_struct(old_ldt); ++ goto out_unlock; ++ } ++ + install_ldt(mm, new_ldt); + free_ldt_struct(old_ldt); + error = 0; +diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c +index 3b7720404a9f..eed93dd4cb4a 100644 +--- a/arch/x86/mm/dump_pagetables.c ++++ b/arch/x86/mm/dump_pagetables.c +@@ -52,11 +52,17 @@ enum address_markers_idx { + USER_SPACE_NR = 0, + KERNEL_SPACE_NR, + LOW_KERNEL_NR, ++#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL) ++ LDT_NR, ++#endif + VMALLOC_START_NR, + VMEMMAP_START_NR, + #ifdef CONFIG_KASAN + KASAN_SHADOW_START_NR, + KASAN_SHADOW_END_NR, ++#endif ++#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL) ++ LDT_NR, + #endif + CPU_ENTRY_AREA_NR, + #ifdef CONFIG_X86_ESPFIX64 +@@ -81,6 +87,9 @@ static struct addr_marker address_markers[] = { + #ifdef CONFIG_KASAN + [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" }, + [KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" }, ++#endif ++#ifdef CONFIG_MODIFY_LDT_SYSCALL ++ [LDT_NR] = { LDT_BASE_ADDR, "LDT remap" }, + #endif + [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" }, + #ifdef CONFIG_X86_ESPFIX64 +-- +2.14.2 + diff --git a/patches/kernel/0209-x86-mm-Allow-flushing-for-future-ASID-switches.patch b/patches/kernel/0209-x86-mm-Allow-flushing-for-future-ASID-switches.patch deleted file mode 100644 index 2314bde..0000000 --- a/patches/kernel/0209-x86-mm-Allow-flushing-for-future-ASID-switches.patch +++ /dev/null @@ -1,192 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Dave Hansen -Date: Mon, 4 Dec 2017 15:07:57 +0100 -Subject: [PATCH] x86/mm: Allow flushing for future ASID switches -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -If changing the page tables in such a way that an invalidation of all -contexts (aka. 
PCIDs / ASIDs) is required, they can be actively invalidated -by: - - 1. INVPCID for each PCID (works for single pages too). - - 2. Load CR3 with each PCID without the NOFLUSH bit set - - 3. Load CR3 with the NOFLUSH bit set for each and do INVLPG for each address. - -But, none of these are really feasible since there are ~6 ASIDs (12 with -PAGE_TABLE_ISOLATION) at the time that invalidation is required. -Instead of actively invalidating them, invalidate the *current* context and -also mark the cpu_tlbstate _quickly_ to indicate future invalidation to be -required. - -At the next context-switch, look for this indicator -('invalidate_other' being set) invalidate all of the -cpu_tlbstate.ctxs[] entries. - -This ensures that any future context switches will do a full flush -of the TLB, picking up the previous changes. - -[ tglx: Folded more fixups from Peter ] - -Signed-off-by: Dave Hansen -Signed-off-by: Peter Zijlstra (Intel) -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Signed-off-by: Ingo Molnar -(cherry picked from commit 2ea907c4fe7b78e5840c1dc07800eae93248cad1) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit fbb7e6e9e7e7cedecc164d660d08563f88103b56) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/tlbflush.h | 37 +++++++++++++++++++++++++++++-------- - arch/x86/mm/tlb.c | 35 +++++++++++++++++++++++++++++++++++ - 2 files changed, 64 insertions(+), 8 deletions(-) - -diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h -index 503f87c30c15..3769ce182eac 100644 ---- a/arch/x86/include/asm/tlbflush.h -+++ b/arch/x86/include/asm/tlbflush.h -@@ -124,6 +124,17 @@ struct tlb_state { - */ - bool is_lazy; - -+ /* -+ * If set we changed the page tables in such a way that we -+ * needed an invalidation of all contexts (aka. PCIDs / ASIDs). -+ * This tells us to go invalidate all the non-loaded ctxs[] -+ * on the next context switch. -+ * -+ * The current ctx was kept up-to-date as it ran and does not -+ * need to be invalidated. -+ */ -+ bool invalidate_other; -+ - /* - * Access to this CR4 shadow and to H/W CR4 is protected by - * disabling interrupts when modifying either one. -@@ -201,6 +212,14 @@ static inline unsigned long cr4_read_shadow(void) - return this_cpu_read(cpu_tlbstate.cr4); - } - -+/* -+ * Mark all other ASIDs as invalid, preserves the current. -+ */ -+static inline void invalidate_other_asid(void) -+{ -+ this_cpu_write(cpu_tlbstate.invalidate_other, true); -+} -+ - /* - * Save some of cr4 feature set we're using (e.g. 
Pentium 4MB - * enable and PPro Global page enable), so that any CPU's that boot -@@ -287,14 +306,6 @@ static inline void __flush_tlb_all(void) - */ - __flush_tlb(); - } -- -- /* -- * Note: if we somehow had PCID but not PGE, then this wouldn't work -- -- * we'd end up flushing kernel translations for the current ASID but -- * we might fail to flush kernel translations for other cached ASIDs. -- * -- * To avoid this issue, we force PCID off if PGE is off. -- */ - } - - /* -@@ -304,6 +315,16 @@ static inline void __flush_tlb_one(unsigned long addr) - { - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); - __flush_tlb_single(addr); -+ -+ if (!static_cpu_has(X86_FEATURE_PTI)) -+ return; -+ -+ /* -+ * __flush_tlb_single() will have cleared the TLB entry for this ASID, -+ * but since kernel space is replicated across all, we must also -+ * invalidate all others. -+ */ -+ invalidate_other_asid(); - } - - #define TLB_FLUSH_ALL -1UL -diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c -index 87d4f961bcb4..ce87b69fb4e0 100644 ---- a/arch/x86/mm/tlb.c -+++ b/arch/x86/mm/tlb.c -@@ -28,6 +28,38 @@ - * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi - */ - -+/* -+ * We get here when we do something requiring a TLB invalidation -+ * but could not go invalidate all of the contexts. We do the -+ * necessary invalidation by clearing out the 'ctx_id' which -+ * forces a TLB flush when the context is loaded. -+ */ -+void clear_asid_other(void) -+{ -+ u16 asid; -+ -+ /* -+ * This is only expected to be set if we have disabled -+ * kernel _PAGE_GLOBAL pages. 
-+ */ -+ if (!static_cpu_has(X86_FEATURE_PTI)) { -+ WARN_ON_ONCE(1); -+ return; -+ } -+ -+ for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) { -+ /* Do not need to flush the current asid */ -+ if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid)) -+ continue; -+ /* -+ * Make sure the next time we go to switch to -+ * this asid, we do a flush: -+ */ -+ this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0); -+ } -+ this_cpu_write(cpu_tlbstate.invalidate_other, false); -+} -+ - atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); - - DEFINE_STATIC_KEY_TRUE(tlb_use_lazy_mode); -@@ -43,6 +75,9 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, - return; - } - -+ if (this_cpu_read(cpu_tlbstate.invalidate_other)) -+ clear_asid_other(); -+ - for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) { - if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) != - next->context.ctx_id) --- -2.14.2 - diff --git a/patches/kernel/0209-x86-pti-Map-the-vsyscall-page-if-needed.patch b/patches/kernel/0209-x86-pti-Map-the-vsyscall-page-if-needed.patch new file mode 100644 index 0000000..e6e0ecc --- /dev/null +++ b/patches/kernel/0209-x86-pti-Map-the-vsyscall-page-if-needed.patch @@ -0,0 +1,172 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Tue, 12 Dec 2017 07:56:42 -0800 +Subject: [PATCH] x86/pti: Map the vsyscall page if needed +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Make VSYSCALLs work fully in PTI mode by mapping them properly to the user +space visible page tables. + +[ tglx: Hide unused functions (Patch by Arnd Bergmann) ] + +Signed-off-by: Andy Lutomirski +Signed-off-by: Thomas Gleixner +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: David Laight +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Kees Cook +Cc: Linus Torvalds +Cc: Peter Zijlstra +Signed-off-by: Ingo Molnar +(cherry picked from commit 85900ea51577e31b186e523c8f4e068c79ecc7d3) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 7a2ba0ea0a18cfc1f18c3f1389ef85f2a0d3227d) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/vsyscall.h | 1 + + arch/x86/entry/vsyscall/vsyscall_64.c | 6 ++-- + arch/x86/mm/pti.c | 65 +++++++++++++++++++++++++++++++++++ + 3 files changed, 69 insertions(+), 3 deletions(-) + +diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h +index 6ba66ee79710..0eaeb223d692 100644 +--- a/arch/x86/include/asm/vsyscall.h ++++ b/arch/x86/include/asm/vsyscall.h +@@ -6,6 +6,7 @@ + + #ifdef CONFIG_X86_VSYSCALL_EMULATION + extern void map_vsyscall(void); ++extern void set_vsyscall_pgtable_user_bits(pgd_t *root); + + /* + * Called on instruction fetch fault in vsyscall page. +diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c +index 5e56a4ced848..238b4bcd3c47 100644 +--- a/arch/x86/entry/vsyscall/vsyscall_64.c ++++ b/arch/x86/entry/vsyscall/vsyscall_64.c +@@ -343,14 +343,14 @@ int in_gate_area_no_mm(unsigned long addr) + * vsyscalls but leave the page not present. If so, we skip calling + * this. + */ +-static void __init set_vsyscall_pgtable_user_bits(void) ++void __init set_vsyscall_pgtable_user_bits(pgd_t *root) + { + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + +- pgd = pgd_offset_k(VSYSCALL_ADDR); ++ pgd = pgd_offset_pgd(root, VSYSCALL_ADDR); + set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER)); + p4d = p4d_offset(pgd, VSYSCALL_ADDR); + #if CONFIG_PGTABLE_LEVELS >= 5 +@@ -372,7 +372,7 @@ void __init map_vsyscall(void) + vsyscall_mode == NATIVE + ? 
PAGE_KERNEL_VSYSCALL + : PAGE_KERNEL_VVAR); +- set_vsyscall_pgtable_user_bits(); ++ set_vsyscall_pgtable_user_bits(swapper_pg_dir); + } + + BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != +diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c +index b1c38ef9fbbb..bce8aea65606 100644 +--- a/arch/x86/mm/pti.c ++++ b/arch/x86/mm/pti.c +@@ -38,6 +38,7 @@ + + #include + #include ++#include + #include + #include + #include +@@ -223,6 +224,69 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) + return pmd_offset(pud, address); + } + ++#ifdef CONFIG_X86_VSYSCALL_EMULATION ++/* ++ * Walk the shadow copy of the page tables (optionally) trying to allocate ++ * page table pages on the way down. Does not support large pages. ++ * ++ * Note: this is only used when mapping *new* kernel data into the ++ * user/shadow page tables. It is never used for userspace data. ++ * ++ * Returns a pointer to a PTE on success, or NULL on failure. ++ */ ++static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address) ++{ ++ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); ++ pmd_t *pmd = pti_user_pagetable_walk_pmd(address); ++ pte_t *pte; ++ ++ /* We can't do anything sensible if we hit a large mapping. 
*/ ++ if (pmd_large(*pmd)) { ++ WARN_ON(1); ++ return NULL; ++ } ++ ++ if (pmd_none(*pmd)) { ++ unsigned long new_pte_page = __get_free_page(gfp); ++ if (!new_pte_page) ++ return NULL; ++ ++ if (pmd_none(*pmd)) { ++ set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); ++ new_pte_page = 0; ++ } ++ if (new_pte_page) ++ free_page(new_pte_page); ++ } ++ ++ pte = pte_offset_kernel(pmd, address); ++ if (pte_flags(*pte) & _PAGE_USER) { ++ WARN_ONCE(1, "attempt to walk to user pte\n"); ++ return NULL; ++ } ++ return pte; ++} ++ ++static void __init pti_setup_vsyscall(void) ++{ ++ pte_t *pte, *target_pte; ++ unsigned int level; ++ ++ pte = lookup_address(VSYSCALL_ADDR, &level); ++ if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte)) ++ return; ++ ++ target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR); ++ if (WARN_ON(!target_pte)) ++ return; ++ ++ *target_pte = *pte; ++ set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir)); ++} ++#else ++static void __init pti_setup_vsyscall(void) { } ++#endif ++ + static void __init + pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear) + { +@@ -319,4 +383,5 @@ void __init pti_init(void) + pti_clone_user_shared(); + pti_clone_entry_text(); + pti_setup_espfix64(); ++ pti_setup_vsyscall(); + } +-- +2.14.2 + diff --git a/patches/kernel/0210-x86-mm-Abstract-switching-CR3.patch b/patches/kernel/0210-x86-mm-Abstract-switching-CR3.patch deleted file mode 100644 index 081d23a..0000000 --- a/patches/kernel/0210-x86-mm-Abstract-switching-CR3.patch +++ /dev/null @@ -1,96 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Dave Hansen -Date: Mon, 4 Dec 2017 15:07:58 +0100 -Subject: [PATCH] x86/mm: Abstract switching CR3 -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -In preparation to adding additional PCID flushing, abstract the -loading of a new ASID into CR3. 
- -[ PeterZ: Split out from big combo patch ] - -Signed-off-by: Dave Hansen -Signed-off-by: Peter Zijlstra (Intel) -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Signed-off-by: Ingo Molnar -(cherry picked from commit 48e111982cda033fec832c6b0592c2acedd85d04) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 1e2affe2a79305b3a5f3ad65d3f61ad9d1f9e168) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/mm/tlb.c | 22 ++++++++++++++++++++-- - 1 file changed, 20 insertions(+), 2 deletions(-) - -diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c -index ce87b69fb4e0..353f2f4e1d96 100644 ---- a/arch/x86/mm/tlb.c -+++ b/arch/x86/mm/tlb.c -@@ -101,6 +101,24 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, - *need_flush = true; - } - -+static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush) -+{ -+ unsigned long new_mm_cr3; -+ -+ if (need_flush) { -+ new_mm_cr3 = build_cr3(pgdir, new_asid); -+ } else { -+ new_mm_cr3 = build_cr3_noflush(pgdir, new_asid); -+ } -+ -+ /* -+ * Caution: many callers of this function expect -+ * that load_cr3() is serializing and orders TLB -+ * fills with respect to the mm_cpumask writes. 
-+ */ -+ write_cr3(new_mm_cr3); -+} -+ - void leave_mm(int cpu) - { - struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); -@@ -228,7 +246,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, - if (need_flush) { - this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); - this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); -- write_cr3(build_cr3(next->pgd, new_asid)); -+ load_new_mm_cr3(next->pgd, new_asid, true); - - /* - * NB: This gets called via leave_mm() in the idle path -@@ -241,7 +259,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); - } else { - /* The new ASID is already up to date. */ -- write_cr3(build_cr3_noflush(next->pgd, new_asid)); -+ load_new_mm_cr3(next->pgd, new_asid, false); - - /* See above wrt _rcuidle. */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); --- -2.14.2 - diff --git a/patches/kernel/0210-x86-mm-Allow-flushing-for-future-ASID-switches.patch b/patches/kernel/0210-x86-mm-Allow-flushing-for-future-ASID-switches.patch new file mode 100644 index 0000000..2314bde --- /dev/null +++ b/patches/kernel/0210-x86-mm-Allow-flushing-for-future-ASID-switches.patch @@ -0,0 +1,192 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Dave Hansen +Date: Mon, 4 Dec 2017 15:07:57 +0100 +Subject: [PATCH] x86/mm: Allow flushing for future ASID switches +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +If changing the page tables in such a way that an invalidation of all +contexts (aka. PCIDs / ASIDs) is required, they can be actively invalidated +by: + + 1. INVPCID for each PCID (works for single pages too). + + 2. Load CR3 with each PCID without the NOFLUSH bit set + + 3. Load CR3 with the NOFLUSH bit set for each and do INVLPG for each address. 
+ +But, none of these are really feasible since there are ~6 ASIDs (12 with +PAGE_TABLE_ISOLATION) at the time that invalidation is required. +Instead of actively invalidating them, invalidate the *current* context and +also mark the cpu_tlbstate _quickly_ to indicate future invalidation to be +required. + +At the next context-switch, look for this indicator +('invalidate_other' being set) invalidate all of the +cpu_tlbstate.ctxs[] entries. + +This ensures that any future context switches will do a full flush +of the TLB, picking up the previous changes. + +[ tglx: Folded more fixups from Peter ] + +Signed-off-by: Dave Hansen +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Signed-off-by: Ingo Molnar +(cherry picked from commit 2ea907c4fe7b78e5840c1dc07800eae93248cad1) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit fbb7e6e9e7e7cedecc164d660d08563f88103b56) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/tlbflush.h | 37 +++++++++++++++++++++++++++++-------- + arch/x86/mm/tlb.c | 35 +++++++++++++++++++++++++++++++++++ + 2 files changed, 64 insertions(+), 8 deletions(-) + +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index 503f87c30c15..3769ce182eac 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -124,6 +124,17 @@ struct tlb_state { + */ + bool is_lazy; + ++ /* ++ * If set we changed the page tables in such a way that we ++ * needed an invalidation of all contexts (aka. PCIDs / ASIDs). 
++ * This tells us to go invalidate all the non-loaded ctxs[] ++ * on the next context switch. ++ * ++ * The current ctx was kept up-to-date as it ran and does not ++ * need to be invalidated. ++ */ ++ bool invalidate_other; ++ + /* + * Access to this CR4 shadow and to H/W CR4 is protected by + * disabling interrupts when modifying either one. +@@ -201,6 +212,14 @@ static inline unsigned long cr4_read_shadow(void) + return this_cpu_read(cpu_tlbstate.cr4); + } + ++/* ++ * Mark all other ASIDs as invalid, preserves the current. ++ */ ++static inline void invalidate_other_asid(void) ++{ ++ this_cpu_write(cpu_tlbstate.invalidate_other, true); ++} ++ + /* + * Save some of cr4 feature set we're using (e.g. Pentium 4MB + * enable and PPro Global page enable), so that any CPU's that boot +@@ -287,14 +306,6 @@ static inline void __flush_tlb_all(void) + */ + __flush_tlb(); + } +- +- /* +- * Note: if we somehow had PCID but not PGE, then this wouldn't work -- +- * we'd end up flushing kernel translations for the current ASID but +- * we might fail to flush kernel translations for other cached ASIDs. +- * +- * To avoid this issue, we force PCID off if PGE is off. +- */ + } + + /* +@@ -304,6 +315,16 @@ static inline void __flush_tlb_one(unsigned long addr) + { + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); + __flush_tlb_single(addr); ++ ++ if (!static_cpu_has(X86_FEATURE_PTI)) ++ return; ++ ++ /* ++ * __flush_tlb_single() will have cleared the TLB entry for this ASID, ++ * but since kernel space is replicated across all, we must also ++ * invalidate all others. ++ */ ++ invalidate_other_asid(); + } + + #define TLB_FLUSH_ALL -1UL +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c +index 87d4f961bcb4..ce87b69fb4e0 100644 +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -28,6 +28,38 @@ + * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi + */ + ++/* ++ * We get here when we do something requiring a TLB invalidation ++ * but could not go invalidate all of the contexts. 
We do the ++ * necessary invalidation by clearing out the 'ctx_id' which ++ * forces a TLB flush when the context is loaded. ++ */ ++void clear_asid_other(void) ++{ ++ u16 asid; ++ ++ /* ++ * This is only expected to be set if we have disabled ++ * kernel _PAGE_GLOBAL pages. ++ */ ++ if (!static_cpu_has(X86_FEATURE_PTI)) { ++ WARN_ON_ONCE(1); ++ return; ++ } ++ ++ for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) { ++ /* Do not need to flush the current asid */ ++ if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid)) ++ continue; ++ /* ++ * Make sure the next time we go to switch to ++ * this asid, we do a flush: ++ */ ++ this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0); ++ } ++ this_cpu_write(cpu_tlbstate.invalidate_other, false); ++} ++ + atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); + + DEFINE_STATIC_KEY_TRUE(tlb_use_lazy_mode); +@@ -43,6 +75,9 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, + return; + } + ++ if (this_cpu_read(cpu_tlbstate.invalidate_other)) ++ clear_asid_other(); ++ + for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) { + if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) != + next->context.ctx_id) +-- +2.14.2 + diff --git a/patches/kernel/0211-x86-mm-Abstract-switching-CR3.patch b/patches/kernel/0211-x86-mm-Abstract-switching-CR3.patch new file mode 100644 index 0000000..081d23a --- /dev/null +++ b/patches/kernel/0211-x86-mm-Abstract-switching-CR3.patch @@ -0,0 +1,96 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Dave Hansen +Date: Mon, 4 Dec 2017 15:07:58 +0100 +Subject: [PATCH] x86/mm: Abstract switching CR3 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +In preparation to adding additional PCID flushing, abstract the +loading of a new ASID into CR3. 
+ +[ PeterZ: Split out from big combo patch ] + +Signed-off-by: Dave Hansen +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Signed-off-by: Ingo Molnar +(cherry picked from commit 48e111982cda033fec832c6b0592c2acedd85d04) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 1e2affe2a79305b3a5f3ad65d3f61ad9d1f9e168) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/mm/tlb.c | 22 ++++++++++++++++++++-- + 1 file changed, 20 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c +index ce87b69fb4e0..353f2f4e1d96 100644 +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -101,6 +101,24 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, + *need_flush = true; + } + ++static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush) ++{ ++ unsigned long new_mm_cr3; ++ ++ if (need_flush) { ++ new_mm_cr3 = build_cr3(pgdir, new_asid); ++ } else { ++ new_mm_cr3 = build_cr3_noflush(pgdir, new_asid); ++ } ++ ++ /* ++ * Caution: many callers of this function expect ++ * that load_cr3() is serializing and orders TLB ++ * fills with respect to the mm_cpumask writes. 
++ */ ++ write_cr3(new_mm_cr3); ++} ++ + void leave_mm(int cpu) + { + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); +@@ -228,7 +246,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + if (need_flush) { + this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); +- write_cr3(build_cr3(next->pgd, new_asid)); ++ load_new_mm_cr3(next->pgd, new_asid, true); + + /* + * NB: This gets called via leave_mm() in the idle path +@@ -241,7 +259,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + } else { + /* The new ASID is already up to date. */ +- write_cr3(build_cr3_noflush(next->pgd, new_asid)); ++ load_new_mm_cr3(next->pgd, new_asid, false); + + /* See above wrt _rcuidle. */ + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); +-- +2.14.2 + diff --git a/patches/kernel/0211-x86-mm-Use-Fix-PCID-to-optimize-user-kernel-switches.patch b/patches/kernel/0211-x86-mm-Use-Fix-PCID-to-optimize-user-kernel-switches.patch deleted file mode 100644 index 5e21faf..0000000 --- a/patches/kernel/0211-x86-mm-Use-Fix-PCID-to-optimize-user-kernel-switches.patch +++ /dev/null @@ -1,497 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Peter Zijlstra -Date: Mon, 4 Dec 2017 15:07:59 +0100 -Subject: [PATCH] x86/mm: Use/Fix PCID to optimize user/kernel switches -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -We can use PCID to retain the TLBs across CR3 switches; including those now -part of the user/kernel switch. This increases performance of kernel -entry/exit at the cost of more expensive/complicated TLB flushing. - -Now that we have two address spaces, one for kernel and one for user space, -we need two PCIDs per mm. 
We use the top PCID bit to indicate a user PCID -(just like we use the PFN LSB for the PGD). Since we do TLB invalidation -from kernel space, the existing code will only invalidate the kernel PCID, -we augment that by marking the corresponding user PCID invalid, and upon -switching back to userspace, use a flushing CR3 write for the switch. - -In order to access the user_pcid_flush_mask we use PER_CPU storage, which -means the previously established SWAPGS vs CR3 ordering is now mandatory -and required. - -Having to do this memory access does require additional registers, most -sites have a functioning stack and we can spill one (RAX), sites without -functional stack need to otherwise provide the second scratch register. - -Note: PCID is generally available on Intel Sandybridge and later CPUs. -Note: Up until this point TLB flushing was broken in this series. - -Based-on-code-from: Dave Hansen -Signed-off-by: Peter Zijlstra (Intel) -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Signed-off-by: Ingo Molnar -(backported from commit 6fd166aae78c0ab738d49bda653cbd9e3b1491cf) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit ac7471365d49c0a91d4b63453eb848cc19f17589) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/entry/calling.h | 72 ++++++++++++++++++----- - arch/x86/include/asm/processor-flags.h | 5 ++ - arch/x86/include/asm/tlbflush.h | 91 +++++++++++++++++++++++++---- - arch/x86/include/uapi/asm/processor-flags.h | 7 ++- - arch/x86/kernel/asm-offsets.c | 4 ++ - arch/x86/mm/init.c | 2 +- - arch/x86/mm/tlb.c | 1 + - arch/x86/entry/entry_64.S | 9 +-- - arch/x86/entry/entry_64_compat.S | 4 +- - 9 files changed, 162 insertions(+), 33 deletions(-) - -diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h -index bb56f5346ae8..ce5fb309926d 100644 ---- a/arch/x86/entry/calling.h -+++ b/arch/x86/entry/calling.h -@@ -2,6 +2,9 @@ - #include - #include - #include -+#include -+#include -+#include - - /* - -@@ -190,17 +193,21 @@ For 32-bit we have the following conventions - kernel is built with - - #ifdef CONFIG_PAGE_TABLE_ISOLATION - --/* PAGE_TABLE_ISOLATION PGDs are 8k. 
Flip bit 12 to switch between the two halves: */ --#define PTI_SWITCH_MASK (1< - #include - #include -+#include -+#include - - static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) - { -@@ -23,24 +25,54 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) - - /* There are 12 bits of space for ASIDS in CR3 */ - #define CR3_HW_ASID_BITS 12 -+ - /* - * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for - * user/kernel switches - */ --#define PTI_CONSUMED_ASID_BITS 0 -+#ifdef CONFIG_PAGE_TABLE_ISOLATION -+# define PTI_CONSUMED_PCID_BITS 1 -+#else -+# define PTI_CONSUMED_PCID_BITS 0 -+#endif -+ -+#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS) - --#define CR3_AVAIL_ASID_BITS (CR3_HW_ASID_BITS - PTI_CONSUMED_ASID_BITS) - /* - * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account - * for them being zero-based. Another -1 is because ASID 0 is reserved for - * use by non-PCID-aware users. - */ --#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_ASID_BITS) - 2) -+#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2) -+ -+/* -+ * 6 because 6 should be plenty and struct tlb_state will fit in two cache -+ * lines. -+ */ -+#define TLB_NR_DYN_ASIDS 6 - - static inline u16 kern_pcid(u16 asid) - { - VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); -+ -+#ifdef CONFIG_PAGE_TABLE_ISOLATION -+ /* -+ * Make sure that the dynamic ASID space does not confict with the -+ * bit we are using to switch between user and kernel ASIDs. -+ */ -+ BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_SWITCH_BIT)); -+ - /* -+ * The ASID being passed in here should have respected the -+ * MAX_ASID_AVAILABLE and thus never have the switch bit set. 
-+ */ -+ VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_SWITCH_BIT)); -+#endif -+ /* -+ * The dynamically-assigned ASIDs that get passed in are small -+ * (mm == NULL then we borrow a mm which may change during a -- * task switch and therefore we must not be preempted while we write CR3 -- * back: -+ * If current->mm == NULL then we borrow a mm which may change -+ * during a task switch and therefore we must not be preempted -+ * while we write CR3 back: - */ - preempt_disable(); - native_write_cr3(__native_read_cr3()); -@@ -290,7 +350,14 @@ static inline void __native_flush_tlb_global(void) - */ - static inline void __native_flush_tlb_single(unsigned long addr) - { -+ u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); -+ - asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); -+ -+ if (!static_cpu_has(X86_FEATURE_PTI)) -+ return; -+ -+ invalidate_user_asid(loaded_mm_asid); - } - - /* -diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h -index 39946d0a1d41..69077da3dbf1 100644 ---- a/arch/x86/include/uapi/asm/processor-flags.h -+++ b/arch/x86/include/uapi/asm/processor-flags.h -@@ -77,7 +77,12 @@ - #define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT) - #define X86_CR3_PCD_BIT 4 /* Page Cache Disable */ - #define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT) --#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */ -+ -+#define X86_CR3_PCID_BITS 12 -+#define X86_CR3_PCID_MASK (_AC((1UL << X86_CR3_PCID_BITS) - 1, UL)) -+ -+#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */ -+#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT) - - /* - * Intel CPU features in CR4 -diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c -index 25b4832e9c28..87c3bafcef2c 100644 ---- a/arch/x86/kernel/asm-offsets.c -+++ b/arch/x86/kernel/asm-offsets.c -@@ -16,6 +16,7 @@ - #include - #include - #include -+#include - - #ifdef CONFIG_XEN - #include -@@ -93,6 +94,9 @@ void common(void) { - BLANK(); - 
DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); - -+ /* TLB state for the entry code */ -+ OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask); -+ - /* Layout info for cpu_entry_area */ - OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); - OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); -diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c -index af75069fb116..caeb8a7bf0a4 100644 ---- a/arch/x86/mm/init.c -+++ b/arch/x86/mm/init.c -@@ -855,7 +855,7 @@ void __init zone_sizes_init(void) - free_area_init_nodes(max_zone_pfns); - } - --DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { -+__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { - .loaded_mm = &init_mm, - .next_asid = 1, - .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ -diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c -index 353f2f4e1d96..06f3854d0a4f 100644 ---- a/arch/x86/mm/tlb.c -+++ b/arch/x86/mm/tlb.c -@@ -106,6 +106,7 @@ static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush) - unsigned long new_mm_cr3; - - if (need_flush) { -+ invalidate_user_asid(new_asid); - new_mm_cr3 = build_cr3(pgdir, new_asid); - } else { - new_mm_cr3 = build_cr3_noflush(pgdir, new_asid); -diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S -index 292ccc6ec48d..fb43f14ed299 100644 ---- a/arch/x86/entry/entry_64.S -+++ b/arch/x86/entry/entry_64.S -@@ -22,7 +22,6 @@ - #include - #include - #include --#include "calling.h" - #include - #include - #include -@@ -39,6 +38,8 @@ - #include - #include - -+#include "calling.h" -+ - .code64 - .section .entry.text, "ax" - -@@ -405,7 +406,7 @@ syscall_return_via_sysret: - * We are on the trampoline stack. All regs except RDI are live. - * We can do future final exit work right here. 
- */ -- SWITCH_TO_USER_CR3 scratch_reg=%rdi -+ SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi - - popq %rdi - popq %rsp -@@ -743,7 +744,7 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode) - * We can do future final exit work right here. - */ - -- SWITCH_TO_USER_CR3 scratch_reg=%rdi -+ SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi - - /* Restore RDI. */ - popq %rdi -@@ -856,7 +857,7 @@ native_irq_return_ldt: - */ - orq PER_CPU_VAR(espfix_stack), %rax - -- SWITCH_TO_USER_CR3 scratch_reg=%rdi /* to user CR3 */ -+ SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi - SWAPGS /* to user GS */ - popq %rdi /* Restore user RDI */ - -diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S -index 43f856aeee67..973527e34887 100644 ---- a/arch/x86/entry/entry_64_compat.S -+++ b/arch/x86/entry/entry_64_compat.S -@@ -274,9 +274,9 @@ sysret32_from_system_call: - * switch until after after the last reference to the process - * stack. - * -- * %r8 is zeroed before the sysret, thus safe to clobber. -+ * %r8/%r9 are zeroed before the sysret, thus safe to clobber. - */ -- SWITCH_TO_USER_CR3 scratch_reg=%r8 -+ SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9 - - xorq %r8, %r8 - xorq %r9, %r9 --- -2.14.2 - diff --git a/patches/kernel/0212-x86-mm-Optimize-RESTORE_CR3.patch b/patches/kernel/0212-x86-mm-Optimize-RESTORE_CR3.patch deleted file mode 100644 index 2f04370..0000000 --- a/patches/kernel/0212-x86-mm-Optimize-RESTORE_CR3.patch +++ /dev/null @@ -1,127 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Peter Zijlstra -Date: Mon, 4 Dec 2017 15:08:00 +0100 -Subject: [PATCH] x86/mm: Optimize RESTORE_CR3 -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Most NMI/paranoid exceptions will not in fact change pagetables and would -thus not require TLB flushing, however RESTORE_CR3 uses flushing CR3 -writes. 
- -Restores to kernel PCIDs can be NOFLUSH, because we explicitly flush the -kernel mappings and now that we track which user PCIDs need flushing we can -avoid those too when possible. - -This does mean RESTORE_CR3 needs an additional scratch_reg, luckily both -sites have plenty available. - -Signed-off-by: Peter Zijlstra (Intel) -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Signed-off-by: Ingo Molnar -(cherry picked from commit 21e94459110252d41b45c0c8ba50fd72a664d50c) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 6ebe6e2896841282357d43c09394b0ca47c41e4a) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/entry/calling.h | 30 ++++++++++++++++++++++++++++-- - arch/x86/entry/entry_64.S | 4 ++-- - 2 files changed, 30 insertions(+), 4 deletions(-) - -diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h -index ce5fb309926d..015e0a84bb99 100644 ---- a/arch/x86/entry/calling.h -+++ b/arch/x86/entry/calling.h -@@ -280,8 +280,34 @@ For 32-bit we have the following conventions - kernel is built with - .Ldone_\@: - .endm - --.macro RESTORE_CR3 save_reg:req -+.macro RESTORE_CR3 scratch_reg:req save_reg:req - ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI -+ -+ ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID -+ -+ /* -+ * KERNEL pages can always resume with NOFLUSH as we do -+ * explicit flushes. -+ */ -+ bt $X86_CR3_PTI_SWITCH_BIT, \save_reg -+ jnc .Lnoflush_\@ -+ -+ /* -+ * Check if there's a pending flush for the user ASID we're -+ * about to set. 
-+ */ -+ movq \save_reg, \scratch_reg -+ andq $(0x7FF), \scratch_reg -+ bt \scratch_reg, THIS_CPU_user_pcid_flush_mask -+ jnc .Lnoflush_\@ -+ -+ btr \scratch_reg, THIS_CPU_user_pcid_flush_mask -+ jmp .Lwrcr3_\@ -+ -+.Lnoflush_\@: -+ SET_NOFLUSH_BIT \save_reg -+ -+.Lwrcr3_\@: - /* - * The CR3 write could be avoided when not changing its value, - * but would require a CR3 read *and* a scratch register. -@@ -300,7 +326,7 @@ For 32-bit we have the following conventions - kernel is built with - .endm - .macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req - .endm --.macro RESTORE_CR3 save_reg:req -+.macro RESTORE_CR3 scratch_reg:req save_reg:req - .endm - - #endif -diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S -index fb43f14ed299..b48f2c78a9bf 100644 ---- a/arch/x86/entry/entry_64.S -+++ b/arch/x86/entry/entry_64.S -@@ -1300,7 +1300,7 @@ ENTRY(paranoid_exit) - testl %ebx, %ebx /* swapgs needed? */ - jnz .Lparanoid_exit_no_swapgs - TRACE_IRQS_IRETQ -- RESTORE_CR3 save_reg=%r14 -+ RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 - SWAPGS_UNSAFE_STACK - jmp .Lparanoid_exit_restore - .Lparanoid_exit_no_swapgs: -@@ -1742,7 +1742,7 @@ end_repeat_nmi: - movq $-1, %rsi - call do_nmi - -- RESTORE_CR3 save_reg=%r14 -+ RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 - - testl %ebx, %ebx /* swapgs needed? 
*/ - jnz nmi_restore --- -2.14.2 - diff --git a/patches/kernel/0212-x86-mm-Use-Fix-PCID-to-optimize-user-kernel-switches.patch b/patches/kernel/0212-x86-mm-Use-Fix-PCID-to-optimize-user-kernel-switches.patch new file mode 100644 index 0000000..5e21faf --- /dev/null +++ b/patches/kernel/0212-x86-mm-Use-Fix-PCID-to-optimize-user-kernel-switches.patch @@ -0,0 +1,497 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra +Date: Mon, 4 Dec 2017 15:07:59 +0100 +Subject: [PATCH] x86/mm: Use/Fix PCID to optimize user/kernel switches +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +We can use PCID to retain the TLBs across CR3 switches; including those now +part of the user/kernel switch. This increases performance of kernel +entry/exit at the cost of more expensive/complicated TLB flushing. + +Now that we have two address spaces, one for kernel and one for user space, +we need two PCIDs per mm. We use the top PCID bit to indicate a user PCID +(just like we use the PFN LSB for the PGD). Since we do TLB invalidation +from kernel space, the existing code will only invalidate the kernel PCID, +we augment that by marking the corresponding user PCID invalid, and upon +switching back to userspace, use a flushing CR3 write for the switch. + +In order to access the user_pcid_flush_mask we use PER_CPU storage, which +means the previously established SWAPGS vs CR3 ordering is now mandatory +and required. + +Having to do this memory access does require additional registers, most +sites have a functioning stack and we can spill one (RAX), sites without +functional stack need to otherwise provide the second scratch register. + +Note: PCID is generally available on Intel Sandybridge and later CPUs. +Note: Up until this point TLB flushing was broken in this series. 
+ +Based-on-code-from: Dave Hansen +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Signed-off-by: Ingo Molnar +(backported from commit 6fd166aae78c0ab738d49bda653cbd9e3b1491cf) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit ac7471365d49c0a91d4b63453eb848cc19f17589) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/entry/calling.h | 72 ++++++++++++++++++----- + arch/x86/include/asm/processor-flags.h | 5 ++ + arch/x86/include/asm/tlbflush.h | 91 +++++++++++++++++++++++++---- + arch/x86/include/uapi/asm/processor-flags.h | 7 ++- + arch/x86/kernel/asm-offsets.c | 4 ++ + arch/x86/mm/init.c | 2 +- + arch/x86/mm/tlb.c | 1 + + arch/x86/entry/entry_64.S | 9 +-- + arch/x86/entry/entry_64_compat.S | 4 +- + 9 files changed, 162 insertions(+), 33 deletions(-) + +diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h +index bb56f5346ae8..ce5fb309926d 100644 +--- a/arch/x86/entry/calling.h ++++ b/arch/x86/entry/calling.h +@@ -2,6 +2,9 @@ + #include + #include + #include ++#include ++#include ++#include + + /* + +@@ -190,17 +193,21 @@ For 32-bit we have the following conventions - kernel is built with + + #ifdef CONFIG_PAGE_TABLE_ISOLATION + +-/* PAGE_TABLE_ISOLATION PGDs are 8k. 
Flip bit 12 to switch between the two halves: */ +-#define PTI_SWITCH_MASK (1< + #include + #include ++#include ++#include + + static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) + { +@@ -23,24 +25,54 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) + + /* There are 12 bits of space for ASIDS in CR3 */ + #define CR3_HW_ASID_BITS 12 ++ + /* + * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for + * user/kernel switches + */ +-#define PTI_CONSUMED_ASID_BITS 0 ++#ifdef CONFIG_PAGE_TABLE_ISOLATION ++# define PTI_CONSUMED_PCID_BITS 1 ++#else ++# define PTI_CONSUMED_PCID_BITS 0 ++#endif ++ ++#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS) + +-#define CR3_AVAIL_ASID_BITS (CR3_HW_ASID_BITS - PTI_CONSUMED_ASID_BITS) + /* + * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account + * for them being zero-based. Another -1 is because ASID 0 is reserved for + * use by non-PCID-aware users. + */ +-#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_ASID_BITS) - 2) ++#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2) ++ ++/* ++ * 6 because 6 should be plenty and struct tlb_state will fit in two cache ++ * lines. ++ */ ++#define TLB_NR_DYN_ASIDS 6 + + static inline u16 kern_pcid(u16 asid) + { + VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); ++ ++#ifdef CONFIG_PAGE_TABLE_ISOLATION ++ /* ++ * Make sure that the dynamic ASID space does not confict with the ++ * bit we are using to switch between user and kernel ASIDs. ++ */ ++ BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_SWITCH_BIT)); ++ + /* ++ * The ASID being passed in here should have respected the ++ * MAX_ASID_AVAILABLE and thus never have the switch bit set. 
++ */ ++ VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_SWITCH_BIT)); ++#endif ++ /* ++ * The dynamically-assigned ASIDs that get passed in are small ++ * (mm == NULL then we borrow a mm which may change during a +- * task switch and therefore we must not be preempted while we write CR3 +- * back: ++ * If current->mm == NULL then we borrow a mm which may change ++ * during a task switch and therefore we must not be preempted ++ * while we write CR3 back: + */ + preempt_disable(); + native_write_cr3(__native_read_cr3()); +@@ -290,7 +350,14 @@ static inline void __native_flush_tlb_global(void) + */ + static inline void __native_flush_tlb_single(unsigned long addr) + { ++ u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); ++ + asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); ++ ++ if (!static_cpu_has(X86_FEATURE_PTI)) ++ return; ++ ++ invalidate_user_asid(loaded_mm_asid); + } + + /* +diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h +index 39946d0a1d41..69077da3dbf1 100644 +--- a/arch/x86/include/uapi/asm/processor-flags.h ++++ b/arch/x86/include/uapi/asm/processor-flags.h +@@ -77,7 +77,12 @@ + #define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT) + #define X86_CR3_PCD_BIT 4 /* Page Cache Disable */ + #define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT) +-#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */ ++ ++#define X86_CR3_PCID_BITS 12 ++#define X86_CR3_PCID_MASK (_AC((1UL << X86_CR3_PCID_BITS) - 1, UL)) ++ ++#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */ ++#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT) + + /* + * Intel CPU features in CR4 +diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c +index 25b4832e9c28..87c3bafcef2c 100644 +--- a/arch/x86/kernel/asm-offsets.c ++++ b/arch/x86/kernel/asm-offsets.c +@@ -16,6 +16,7 @@ + #include + #include + #include ++#include + + #ifdef CONFIG_XEN + #include +@@ -93,6 +94,9 @@ void common(void) { + BLANK(); + 
DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); + ++ /* TLB state for the entry code */ ++ OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask); ++ + /* Layout info for cpu_entry_area */ + OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); + OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); +diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c +index af75069fb116..caeb8a7bf0a4 100644 +--- a/arch/x86/mm/init.c ++++ b/arch/x86/mm/init.c +@@ -855,7 +855,7 @@ void __init zone_sizes_init(void) + free_area_init_nodes(max_zone_pfns); + } + +-DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { ++__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { + .loaded_mm = &init_mm, + .next_asid = 1, + .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c +index 353f2f4e1d96..06f3854d0a4f 100644 +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -106,6 +106,7 @@ static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush) + unsigned long new_mm_cr3; + + if (need_flush) { ++ invalidate_user_asid(new_asid); + new_mm_cr3 = build_cr3(pgdir, new_asid); + } else { + new_mm_cr3 = build_cr3_noflush(pgdir, new_asid); +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index 292ccc6ec48d..fb43f14ed299 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -22,7 +22,6 @@ + #include + #include + #include +-#include "calling.h" + #include + #include + #include +@@ -39,6 +38,8 @@ + #include + #include + ++#include "calling.h" ++ + .code64 + .section .entry.text, "ax" + +@@ -405,7 +406,7 @@ syscall_return_via_sysret: + * We are on the trampoline stack. All regs except RDI are live. + * We can do future final exit work right here. 
+ */ +- SWITCH_TO_USER_CR3 scratch_reg=%rdi ++ SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi + + popq %rdi + popq %rsp +@@ -743,7 +744,7 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode) + * We can do future final exit work right here. + */ + +- SWITCH_TO_USER_CR3 scratch_reg=%rdi ++ SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi + + /* Restore RDI. */ + popq %rdi +@@ -856,7 +857,7 @@ native_irq_return_ldt: + */ + orq PER_CPU_VAR(espfix_stack), %rax + +- SWITCH_TO_USER_CR3 scratch_reg=%rdi /* to user CR3 */ ++ SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi + SWAPGS /* to user GS */ + popq %rdi /* Restore user RDI */ + +diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S +index 43f856aeee67..973527e34887 100644 +--- a/arch/x86/entry/entry_64_compat.S ++++ b/arch/x86/entry/entry_64_compat.S +@@ -274,9 +274,9 @@ sysret32_from_system_call: + * switch until after after the last reference to the process + * stack. + * +- * %r8 is zeroed before the sysret, thus safe to clobber. ++ * %r8/%r9 are zeroed before the sysret, thus safe to clobber. + */ +- SWITCH_TO_USER_CR3 scratch_reg=%r8 ++ SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9 + + xorq %r8, %r8 + xorq %r9, %r9 +-- +2.14.2 + diff --git a/patches/kernel/0213-x86-mm-Optimize-RESTORE_CR3.patch b/patches/kernel/0213-x86-mm-Optimize-RESTORE_CR3.patch new file mode 100644 index 0000000..2f04370 --- /dev/null +++ b/patches/kernel/0213-x86-mm-Optimize-RESTORE_CR3.patch @@ -0,0 +1,127 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra +Date: Mon, 4 Dec 2017 15:08:00 +0100 +Subject: [PATCH] x86/mm: Optimize RESTORE_CR3 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Most NMI/paranoid exceptions will not in fact change pagetables and would +thus not require TLB flushing, however RESTORE_CR3 uses flushing CR3 +writes. 
+ +Restores to kernel PCIDs can be NOFLUSH, because we explicitly flush the +kernel mappings and now that we track which user PCIDs need flushing we can +avoid those too when possible. + +This does mean RESTORE_CR3 needs an additional scratch_reg, luckily both +sites have plenty available. + +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Signed-off-by: Ingo Molnar +(cherry picked from commit 21e94459110252d41b45c0c8ba50fd72a664d50c) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 6ebe6e2896841282357d43c09394b0ca47c41e4a) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/entry/calling.h | 30 ++++++++++++++++++++++++++++-- + arch/x86/entry/entry_64.S | 4 ++-- + 2 files changed, 30 insertions(+), 4 deletions(-) + +diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h +index ce5fb309926d..015e0a84bb99 100644 +--- a/arch/x86/entry/calling.h ++++ b/arch/x86/entry/calling.h +@@ -280,8 +280,34 @@ For 32-bit we have the following conventions - kernel is built with + .Ldone_\@: + .endm + +-.macro RESTORE_CR3 save_reg:req ++.macro RESTORE_CR3 scratch_reg:req save_reg:req + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI ++ ++ ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID ++ ++ /* ++ * KERNEL pages can always resume with NOFLUSH as we do ++ * explicit flushes. ++ */ ++ bt $X86_CR3_PTI_SWITCH_BIT, \save_reg ++ jnc .Lnoflush_\@ ++ ++ /* ++ * Check if there's a pending flush for the user ASID we're ++ * about to set. 
++ */ ++ movq \save_reg, \scratch_reg ++ andq $(0x7FF), \scratch_reg ++ bt \scratch_reg, THIS_CPU_user_pcid_flush_mask ++ jnc .Lnoflush_\@ ++ ++ btr \scratch_reg, THIS_CPU_user_pcid_flush_mask ++ jmp .Lwrcr3_\@ ++ ++.Lnoflush_\@: ++ SET_NOFLUSH_BIT \save_reg ++ ++.Lwrcr3_\@: + /* + * The CR3 write could be avoided when not changing its value, + * but would require a CR3 read *and* a scratch register. +@@ -300,7 +326,7 @@ For 32-bit we have the following conventions - kernel is built with + .endm + .macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req + .endm +-.macro RESTORE_CR3 save_reg:req ++.macro RESTORE_CR3 scratch_reg:req save_reg:req + .endm + + #endif +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index fb43f14ed299..b48f2c78a9bf 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -1300,7 +1300,7 @@ ENTRY(paranoid_exit) + testl %ebx, %ebx /* swapgs needed? */ + jnz .Lparanoid_exit_no_swapgs + TRACE_IRQS_IRETQ +- RESTORE_CR3 save_reg=%r14 ++ RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 + SWAPGS_UNSAFE_STACK + jmp .Lparanoid_exit_restore + .Lparanoid_exit_no_swapgs: +@@ -1742,7 +1742,7 @@ end_repeat_nmi: + movq $-1, %rsi + call do_nmi + +- RESTORE_CR3 save_reg=%r14 ++ RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 + + testl %ebx, %ebx /* swapgs needed? 
*/ + jnz nmi_restore +-- +2.14.2 + diff --git a/patches/kernel/0213-x86-mm-Use-INVPCID-for-__native_flush_tlb_single.patch b/patches/kernel/0213-x86-mm-Use-INVPCID-for-__native_flush_tlb_single.patch deleted file mode 100644 index 61216e3..0000000 --- a/patches/kernel/0213-x86-mm-Use-INVPCID-for-__native_flush_tlb_single.patch +++ /dev/null @@ -1,194 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Dave Hansen -Date: Mon, 4 Dec 2017 15:08:01 +0100 -Subject: [PATCH] x86/mm: Use INVPCID for __native_flush_tlb_single() -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -This uses INVPCID to shoot down individual lines of the user mapping -instead of marking the entire user map as invalid. This -could/might/possibly be faster. - -This for sure needs tlb_single_page_flush_ceiling to be redetermined; -esp. since INVPCID is _slow_. - -A detailed performance analysis is available here: - - https://lkml.kernel.org/r/3062e486-3539-8a1f-5724-16199420be71@intel.com - -[ Peterz: Split out from big combo patch ] - -Signed-off-by: Dave Hansen -Signed-off-by: Peter Zijlstra (Intel) -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Signed-off-by: Ingo Molnar -(cherry picked from commit 6cff64b86aaaa07f89f50498055a20e45754b0c1) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit e4986a4e89c0eb40f824a8505feefff3328ad4b2) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/cpufeatures.h | 1 + - arch/x86/include/asm/tlbflush.h | 23 +++++++++++++- - arch/x86/mm/init.c | 64 ++++++++++++++++++++++---------------- - 3 files changed, 60 insertions(+), 28 deletions(-) - -diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h -index de4e91452de4..9b0c283afcf0 100644 ---- a/arch/x86/include/asm/cpufeatures.h -+++ b/arch/x86/include/asm/cpufeatures.h -@@ -196,6 +196,7 @@ - #define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */ - #define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */ - #define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */ -+#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */ - - #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ - #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ -diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h -index 2b7b32c243f1..979e590648a5 100644 ---- a/arch/x86/include/asm/tlbflush.h -+++ b/arch/x86/include/asm/tlbflush.h -@@ -84,6 +84,18 @@ static inline u16 kern_pcid(u16 asid) - return asid + 1; - } - -+/* -+ * The user PCID is just the kernel one, plus the "switch bit". 
-+ */ -+static inline u16 user_pcid(u16 asid) -+{ -+ u16 ret = kern_pcid(asid); -+#ifdef CONFIG_PAGE_TABLE_ISOLATION -+ ret |= 1 << X86_CR3_PTI_SWITCH_BIT; -+#endif -+ return ret; -+} -+ - struct pgd_t; - static inline unsigned long build_cr3(pgd_t *pgd, u16 asid) - { -@@ -324,6 +336,8 @@ static inline void __native_flush_tlb_global(void) - /* - * Using INVPCID is considerably faster than a pair of writes - * to CR4 sandwiched inside an IRQ flag save/restore. -+ * -+ * Note, this works with CR4.PCIDE=0 or 1. - */ - invpcid_flush_all(); - return; -@@ -357,7 +371,14 @@ static inline void __native_flush_tlb_single(unsigned long addr) - if (!static_cpu_has(X86_FEATURE_PTI)) - return; - -- invalidate_user_asid(loaded_mm_asid); -+ /* -+ * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1. -+ * Just use invalidate_user_asid() in case we are called early. -+ */ -+ if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) -+ invalidate_user_asid(loaded_mm_asid); -+ else -+ invpcid_flush_one(user_pcid(loaded_mm_asid), addr); - } - - /* -diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c -index caeb8a7bf0a4..80259ad8c386 100644 ---- a/arch/x86/mm/init.c -+++ b/arch/x86/mm/init.c -@@ -203,34 +203,44 @@ static void __init probe_page_size_mask(void) - - static void setup_pcid(void) - { --#ifdef CONFIG_X86_64 -- if (boot_cpu_has(X86_FEATURE_PCID)) { -- if (boot_cpu_has(X86_FEATURE_PGE)) { -- /* -- * This can't be cr4_set_bits_and_update_boot() -- -- * the trampoline code can't handle CR4.PCIDE and -- * it wouldn't do any good anyway. Despite the name, -- * cr4_set_bits_and_update_boot() doesn't actually -- * cause the bits in question to remain set all the -- * way through the secondary boot asm. -- * -- * Instead, we brute-force it and set CR4.PCIDE -- * manually in start_secondary(). -- */ -- cr4_set_bits(X86_CR4_PCIDE); -- } else { -- /* -- * flush_tlb_all(), as currently implemented, won't -- * work if PCID is on but PGE is not. 
Since that -- * combination doesn't exist on real hardware, there's -- * no reason to try to fully support it, but it's -- * polite to avoid corrupting data if we're on -- * an improperly configured VM. -- */ -- setup_clear_cpu_cap(X86_FEATURE_PCID); -- } -+ if (!IS_ENABLED(CONFIG_X86_64)) -+ return; -+ -+ if (!boot_cpu_has(X86_FEATURE_PCID)) -+ return; -+ -+ if (boot_cpu_has(X86_FEATURE_PGE)) { -+ /* -+ * This can't be cr4_set_bits_and_update_boot() -- the -+ * trampoline code can't handle CR4.PCIDE and it wouldn't -+ * do any good anyway. Despite the name, -+ * cr4_set_bits_and_update_boot() doesn't actually cause -+ * the bits in question to remain set all the way through -+ * the secondary boot asm. -+ * -+ * Instead, we brute-force it and set CR4.PCIDE manually in -+ * start_secondary(). -+ */ -+ cr4_set_bits(X86_CR4_PCIDE); -+ -+ /* -+ * INVPCID's single-context modes (2/3) only work if we set -+ * X86_CR4_PCIDE, *and* we INVPCID support. It's unusable -+ * on systems that have X86_CR4_PCIDE clear, or that have -+ * no INVPCID support at all. -+ */ -+ if (boot_cpu_has(X86_FEATURE_INVPCID)) -+ setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE); -+ } else { -+ /* -+ * flush_tlb_all(), as currently implemented, won't work if -+ * PCID is on but PGE is not. Since that combination -+ * doesn't exist on real hardware, there's no reason to try -+ * to fully support it, but it's polite to avoid corrupting -+ * data if we're on an improperly configured VM. 
-+ */ -+ setup_clear_cpu_cap(X86_FEATURE_PCID); - } --#endif - } - - #ifdef CONFIG_X86_32 --- -2.14.2 - diff --git a/patches/kernel/0214-x86-mm-Clarify-the-whole-ASID-kernel-PCID-user-PCID-.patch b/patches/kernel/0214-x86-mm-Clarify-the-whole-ASID-kernel-PCID-user-PCID-.patch deleted file mode 100644 index 1aee978..0000000 --- a/patches/kernel/0214-x86-mm-Clarify-the-whole-ASID-kernel-PCID-user-PCID-.patch +++ /dev/null @@ -1,142 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Peter Zijlstra -Date: Tue, 5 Dec 2017 13:34:53 +0100 -Subject: [PATCH] x86/mm: Clarify the whole ASID/kernel PCID/user PCID naming -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Ideally we'd also use sparse to enforce this separation so it becomes much -more difficult to mess up. - -Signed-off-by: Peter Zijlstra (Intel) -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Cc: linux-mm@kvack.org -Signed-off-by: Ingo Molnar -(cherry picked from commit 0a126abd576ebc6403f063dbe20cf7416c9d9393) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 2ee6efc0f708e21cfd08471132ac2255fac54553) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/tlbflush.h | 55 ++++++++++++++++++++++++++++++++--------- - 1 file changed, 43 insertions(+), 12 deletions(-) - -diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h -index 979e590648a5..7a04a1f1ca11 100644 ---- a/arch/x86/include/asm/tlbflush.h -+++ b/arch/x86/include/asm/tlbflush.h -@@ -12,16 +12,33 @@ - #include - #include - --static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) --{ -- /* -- * Bump the generation count. This also serves as a full barrier -- * that synchronizes with switch_mm(): callers are required to order -- * their read of mm_cpumask after their writes to the paging -- * structures. -- */ -- return atomic64_inc_return(&mm->context.tlb_gen); --} -+/* -+ * The x86 feature is called PCID (Process Context IDentifier). It is similar -+ * to what is traditionally called ASID on the RISC processors. -+ * -+ * We don't use the traditional ASID implementation, where each process/mm gets -+ * its own ASID and flush/restart when we run out of ASID space. -+ * -+ * Instead we have a small per-cpu array of ASIDs and cache the last few mm's -+ * that came by on this CPU, allowing cheaper switch_mm between processes on -+ * this CPU. -+ * -+ * We end up with different spaces for different things. 
To avoid confusion we -+ * use different names for each of them: -+ * -+ * ASID - [0, TLB_NR_DYN_ASIDS-1] -+ * the canonical identifier for an mm -+ * -+ * kPCID - [1, TLB_NR_DYN_ASIDS] -+ * the value we write into the PCID part of CR3; corresponds to the -+ * ASID+1, because PCID 0 is special. -+ * -+ * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS] -+ * for KPTI each mm has two address spaces and thus needs two -+ * PCID values, but we can still do with a single ASID denomination -+ * for each mm. Corresponds to kPCID + 2048. -+ * -+ */ - - /* There are 12 bits of space for ASIDS in CR3 */ - #define CR3_HW_ASID_BITS 12 -@@ -40,7 +57,7 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) - - /* - * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account -- * for them being zero-based. Another -1 is because ASID 0 is reserved for -+ * for them being zero-based. Another -1 is because PCID 0 is reserved for - * use by non-PCID-aware users. - */ - #define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2) -@@ -51,6 +68,9 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) - */ - #define TLB_NR_DYN_ASIDS 6 - -+/* -+ * Given @asid, compute kPCID -+ */ - static inline u16 kern_pcid(u16 asid) - { - VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); -@@ -85,7 +105,7 @@ static inline u16 kern_pcid(u16 asid) - } - - /* -- * The user PCID is just the kernel one, plus the "switch bit". -+ * Given @asid, compute uPCID - */ - static inline u16 user_pcid(u16 asid) - { -@@ -473,6 +493,17 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) - void native_flush_tlb_others(const struct cpumask *cpumask, - const struct flush_tlb_info *info); - -+static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) -+{ -+ /* -+ * Bump the generation count. This also serves as a full barrier -+ * that synchronizes with switch_mm(): callers are required to order -+ * their read of mm_cpumask after their writes to the paging -+ * structures. 
-+ */ -+ return atomic64_inc_return(&mm->context.tlb_gen); -+} -+ - static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, - struct mm_struct *mm) - { --- -2.14.2 - diff --git a/patches/kernel/0214-x86-mm-Use-INVPCID-for-__native_flush_tlb_single.patch b/patches/kernel/0214-x86-mm-Use-INVPCID-for-__native_flush_tlb_single.patch new file mode 100644 index 0000000..61216e3 --- /dev/null +++ b/patches/kernel/0214-x86-mm-Use-INVPCID-for-__native_flush_tlb_single.patch @@ -0,0 +1,194 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Dave Hansen +Date: Mon, 4 Dec 2017 15:08:01 +0100 +Subject: [PATCH] x86/mm: Use INVPCID for __native_flush_tlb_single() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +This uses INVPCID to shoot down individual lines of the user mapping +instead of marking the entire user map as invalid. This +could/might/possibly be faster. + +This for sure needs tlb_single_page_flush_ceiling to be redetermined; +esp. since INVPCID is _slow_. + +A detailed performance analysis is available here: + + https://lkml.kernel.org/r/3062e486-3539-8a1f-5724-16199420be71@intel.com + +[ Peterz: Split out from big combo patch ] + +Signed-off-by: Dave Hansen +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Signed-off-by: Ingo Molnar +(cherry picked from commit 6cff64b86aaaa07f89f50498055a20e45754b0c1) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit e4986a4e89c0eb40f824a8505feefff3328ad4b2) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/cpufeatures.h | 1 + + arch/x86/include/asm/tlbflush.h | 23 +++++++++++++- + arch/x86/mm/init.c | 64 ++++++++++++++++++++++---------------- + 3 files changed, 60 insertions(+), 28 deletions(-) + +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index de4e91452de4..9b0c283afcf0 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -196,6 +196,7 @@ + #define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */ + #define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */ + #define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */ ++#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */ + + #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ + #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index 2b7b32c243f1..979e590648a5 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -84,6 +84,18 @@ static inline u16 kern_pcid(u16 asid) + return asid + 1; + } + ++/* ++ * The user PCID is just the kernel one, plus the "switch bit". 
++ */ ++static inline u16 user_pcid(u16 asid) ++{ ++ u16 ret = kern_pcid(asid); ++#ifdef CONFIG_PAGE_TABLE_ISOLATION ++ ret |= 1 << X86_CR3_PTI_SWITCH_BIT; ++#endif ++ return ret; ++} ++ + struct pgd_t; + static inline unsigned long build_cr3(pgd_t *pgd, u16 asid) + { +@@ -324,6 +336,8 @@ static inline void __native_flush_tlb_global(void) + /* + * Using INVPCID is considerably faster than a pair of writes + * to CR4 sandwiched inside an IRQ flag save/restore. ++ * ++ * Note, this works with CR4.PCIDE=0 or 1. + */ + invpcid_flush_all(); + return; +@@ -357,7 +371,14 @@ static inline void __native_flush_tlb_single(unsigned long addr) + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + +- invalidate_user_asid(loaded_mm_asid); ++ /* ++ * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1. ++ * Just use invalidate_user_asid() in case we are called early. ++ */ ++ if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) ++ invalidate_user_asid(loaded_mm_asid); ++ else ++ invpcid_flush_one(user_pcid(loaded_mm_asid), addr); + } + + /* +diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c +index caeb8a7bf0a4..80259ad8c386 100644 +--- a/arch/x86/mm/init.c ++++ b/arch/x86/mm/init.c +@@ -203,34 +203,44 @@ static void __init probe_page_size_mask(void) + + static void setup_pcid(void) + { +-#ifdef CONFIG_X86_64 +- if (boot_cpu_has(X86_FEATURE_PCID)) { +- if (boot_cpu_has(X86_FEATURE_PGE)) { +- /* +- * This can't be cr4_set_bits_and_update_boot() -- +- * the trampoline code can't handle CR4.PCIDE and +- * it wouldn't do any good anyway. Despite the name, +- * cr4_set_bits_and_update_boot() doesn't actually +- * cause the bits in question to remain set all the +- * way through the secondary boot asm. +- * +- * Instead, we brute-force it and set CR4.PCIDE +- * manually in start_secondary(). +- */ +- cr4_set_bits(X86_CR4_PCIDE); +- } else { +- /* +- * flush_tlb_all(), as currently implemented, won't +- * work if PCID is on but PGE is not. 
Since that +- * combination doesn't exist on real hardware, there's +- * no reason to try to fully support it, but it's +- * polite to avoid corrupting data if we're on +- * an improperly configured VM. +- */ +- setup_clear_cpu_cap(X86_FEATURE_PCID); +- } ++ if (!IS_ENABLED(CONFIG_X86_64)) ++ return; ++ ++ if (!boot_cpu_has(X86_FEATURE_PCID)) ++ return; ++ ++ if (boot_cpu_has(X86_FEATURE_PGE)) { ++ /* ++ * This can't be cr4_set_bits_and_update_boot() -- the ++ * trampoline code can't handle CR4.PCIDE and it wouldn't ++ * do any good anyway. Despite the name, ++ * cr4_set_bits_and_update_boot() doesn't actually cause ++ * the bits in question to remain set all the way through ++ * the secondary boot asm. ++ * ++ * Instead, we brute-force it and set CR4.PCIDE manually in ++ * start_secondary(). ++ */ ++ cr4_set_bits(X86_CR4_PCIDE); ++ ++ /* ++ * INVPCID's single-context modes (2/3) only work if we set ++ * X86_CR4_PCIDE, *and* we INVPCID support. It's unusable ++ * on systems that have X86_CR4_PCIDE clear, or that have ++ * no INVPCID support at all. ++ */ ++ if (boot_cpu_has(X86_FEATURE_INVPCID)) ++ setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE); ++ } else { ++ /* ++ * flush_tlb_all(), as currently implemented, won't work if ++ * PCID is on but PGE is not. Since that combination ++ * doesn't exist on real hardware, there's no reason to try ++ * to fully support it, but it's polite to avoid corrupting ++ * data if we're on an improperly configured VM. 
++ */ ++ setup_clear_cpu_cap(X86_FEATURE_PCID); + } +-#endif + } + + #ifdef CONFIG_X86_32 +-- +2.14.2 + diff --git a/patches/kernel/0215-x86-dumpstack-Indicate-in-Oops-whether-PTI-is-config.patch b/patches/kernel/0215-x86-dumpstack-Indicate-in-Oops-whether-PTI-is-config.patch deleted file mode 100644 index 171622c..0000000 --- a/patches/kernel/0215-x86-dumpstack-Indicate-in-Oops-whether-PTI-is-config.patch +++ /dev/null @@ -1,87 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Vlastimil Babka -Date: Tue, 19 Dec 2017 22:33:46 +0100 -Subject: [PATCH] x86/dumpstack: Indicate in Oops whether PTI is configured and - enabled -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -CONFIG_PAGE_TABLE_ISOLATION is relatively new and intrusive feature that may -still have some corner cases which could take some time to manifest and be -fixed. It would be useful to have Oops messages indicate whether it was -enabled for building the kernel, and whether it was disabled during boot. - -Example of fully enabled: - - Oops: 0001 [#1] SMP PTI - -Example of enabled during build, but disabled during boot: - - Oops: 0001 [#1] SMP NOPTI - -We can decide to remove this after the feature has been tested in the field -long enough. - -[ tglx: Made it use boot_cpu_has() as requested by Borislav ] - -Signed-off-by: Vlastimil Babka -Signed-off-by: Thomas Gleixner -Reviewed-by: Eduardo Valentin -Acked-by: Dave Hansen -Cc: Andy Lutomirski -Cc: Andy Lutomirsky -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: bpetkov@suse.de -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: jkosina@suse.cz -Cc: keescook@google.com -Signed-off-by: Ingo Molnar -(cherry picked from commit 5f26d76c3fd67c48806415ef8b1116c97beff8ba) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 7edb91fcc96589ad6b80446ec3835f83ffabb710) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/dumpstack.c | 6 ++++-- - 1 file changed, 4 insertions(+), 2 deletions(-) - -diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c -index 2bdeb983b9d8..19a936e9b259 100644 ---- a/arch/x86/kernel/dumpstack.c -+++ b/arch/x86/kernel/dumpstack.c -@@ -298,11 +298,13 @@ int __die(const char *str, struct pt_regs *regs, long err) - unsigned long sp; - #endif - printk(KERN_DEFAULT -- "%s: %04lx [#%d]%s%s%s%s\n", str, err & 0xffff, ++die_counter, -+ "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter, - IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", - IS_ENABLED(CONFIG_SMP) ? " SMP" : "", - debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", -- IS_ENABLED(CONFIG_KASAN) ? " KASAN" : ""); -+ IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "", -+ IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ? -+ (boot_cpu_has(X86_FEATURE_PTI) ? 
" PTI" : " NOPTI") : ""); - - if (notify_die(DIE_OOPS, str, regs, err, - current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP) --- -2.14.2 - diff --git a/patches/kernel/0215-x86-mm-Clarify-the-whole-ASID-kernel-PCID-user-PCID-.patch b/patches/kernel/0215-x86-mm-Clarify-the-whole-ASID-kernel-PCID-user-PCID-.patch new file mode 100644 index 0000000..1aee978 --- /dev/null +++ b/patches/kernel/0215-x86-mm-Clarify-the-whole-ASID-kernel-PCID-user-PCID-.patch @@ -0,0 +1,142 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra +Date: Tue, 5 Dec 2017 13:34:53 +0100 +Subject: [PATCH] x86/mm: Clarify the whole ASID/kernel PCID/user PCID naming +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Ideally we'd also use sparse to enforce this separation so it becomes much +more difficult to mess up. + +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Cc: linux-mm@kvack.org +Signed-off-by: Ingo Molnar +(cherry picked from commit 0a126abd576ebc6403f063dbe20cf7416c9d9393) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 2ee6efc0f708e21cfd08471132ac2255fac54553) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/tlbflush.h | 55 ++++++++++++++++++++++++++++++++--------- + 1 file changed, 43 insertions(+), 12 deletions(-) + +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index 979e590648a5..7a04a1f1ca11 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -12,16 +12,33 @@ + #include + #include + +-static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) +-{ +- /* +- * Bump the generation count. This also serves as a full barrier +- * that synchronizes with switch_mm(): callers are required to order +- * their read of mm_cpumask after their writes to the paging +- * structures. +- */ +- return atomic64_inc_return(&mm->context.tlb_gen); +-} ++/* ++ * The x86 feature is called PCID (Process Context IDentifier). It is similar ++ * to what is traditionally called ASID on the RISC processors. ++ * ++ * We don't use the traditional ASID implementation, where each process/mm gets ++ * its own ASID and flush/restart when we run out of ASID space. ++ * ++ * Instead we have a small per-cpu array of ASIDs and cache the last few mm's ++ * that came by on this CPU, allowing cheaper switch_mm between processes on ++ * this CPU. ++ * ++ * We end up with different spaces for different things. 
To avoid confusion we ++ * use different names for each of them: ++ * ++ * ASID - [0, TLB_NR_DYN_ASIDS-1] ++ * the canonical identifier for an mm ++ * ++ * kPCID - [1, TLB_NR_DYN_ASIDS] ++ * the value we write into the PCID part of CR3; corresponds to the ++ * ASID+1, because PCID 0 is special. ++ * ++ * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS] ++ * for KPTI each mm has two address spaces and thus needs two ++ * PCID values, but we can still do with a single ASID denomination ++ * for each mm. Corresponds to kPCID + 2048. ++ * ++ */ + + /* There are 12 bits of space for ASIDS in CR3 */ + #define CR3_HW_ASID_BITS 12 +@@ -40,7 +57,7 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) + + /* + * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account +- * for them being zero-based. Another -1 is because ASID 0 is reserved for ++ * for them being zero-based. Another -1 is because PCID 0 is reserved for + * use by non-PCID-aware users. + */ + #define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2) +@@ -51,6 +68,9 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) + */ + #define TLB_NR_DYN_ASIDS 6 + ++/* ++ * Given @asid, compute kPCID ++ */ + static inline u16 kern_pcid(u16 asid) + { + VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); +@@ -85,7 +105,7 @@ static inline u16 kern_pcid(u16 asid) + } + + /* +- * The user PCID is just the kernel one, plus the "switch bit". ++ * Given @asid, compute uPCID + */ + static inline u16 user_pcid(u16 asid) + { +@@ -473,6 +493,17 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) + void native_flush_tlb_others(const struct cpumask *cpumask, + const struct flush_tlb_info *info); + ++static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) ++{ ++ /* ++ * Bump the generation count. This also serves as a full barrier ++ * that synchronizes with switch_mm(): callers are required to order ++ * their read of mm_cpumask after their writes to the paging ++ * structures. 
++ */ ++ return atomic64_inc_return(&mm->context.tlb_gen); ++} ++ + static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, + struct mm_struct *mm) + { +-- +2.14.2 + diff --git a/patches/kernel/0216-x86-dumpstack-Indicate-in-Oops-whether-PTI-is-config.patch b/patches/kernel/0216-x86-dumpstack-Indicate-in-Oops-whether-PTI-is-config.patch new file mode 100644 index 0000000..171622c --- /dev/null +++ b/patches/kernel/0216-x86-dumpstack-Indicate-in-Oops-whether-PTI-is-config.patch @@ -0,0 +1,87 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Vlastimil Babka +Date: Tue, 19 Dec 2017 22:33:46 +0100 +Subject: [PATCH] x86/dumpstack: Indicate in Oops whether PTI is configured and + enabled +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +CONFIG_PAGE_TABLE_ISOLATION is relatively new and intrusive feature that may +still have some corner cases which could take some time to manifest and be +fixed. It would be useful to have Oops messages indicate whether it was +enabled for building the kernel, and whether it was disabled during boot. + +Example of fully enabled: + + Oops: 0001 [#1] SMP PTI + +Example of enabled during build, but disabled during boot: + + Oops: 0001 [#1] SMP NOPTI + +We can decide to remove this after the feature has been tested in the field +long enough. + +[ tglx: Made it use boot_cpu_has() as requested by Borislav ] + +Signed-off-by: Vlastimil Babka +Signed-off-by: Thomas Gleixner +Reviewed-by: Eduardo Valentin +Acked-by: Dave Hansen +Cc: Andy Lutomirski +Cc: Andy Lutomirsky +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Greg KH +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: bpetkov@suse.de +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: jkosina@suse.cz +Cc: keescook@google.com +Signed-off-by: Ingo Molnar +(cherry picked from commit 5f26d76c3fd67c48806415ef8b1116c97beff8ba) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 7edb91fcc96589ad6b80446ec3835f83ffabb710) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/dumpstack.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c +index 2bdeb983b9d8..19a936e9b259 100644 +--- a/arch/x86/kernel/dumpstack.c ++++ b/arch/x86/kernel/dumpstack.c +@@ -298,11 +298,13 @@ int __die(const char *str, struct pt_regs *regs, long err) + unsigned long sp; + #endif + printk(KERN_DEFAULT +- "%s: %04lx [#%d]%s%s%s%s\n", str, err & 0xffff, ++die_counter, ++ "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter, + IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", + IS_ENABLED(CONFIG_SMP) ? " SMP" : "", + debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", +- IS_ENABLED(CONFIG_KASAN) ? " KASAN" : ""); ++ IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "", ++ IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ? ++ (boot_cpu_has(X86_FEATURE_PTI) ? 
" PTI" : " NOPTI") : ""); + + if (notify_die(DIE_OOPS, str, regs, err, + current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP) +-- +2.14.2 + diff --git a/patches/kernel/0216-x86-mm-pti-Add-Kconfig.patch b/patches/kernel/0216-x86-mm-pti-Add-Kconfig.patch deleted file mode 100644 index 07ebcee..0000000 --- a/patches/kernel/0216-x86-mm-pti-Add-Kconfig.patch +++ /dev/null @@ -1,81 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Dave Hansen -Date: Mon, 4 Dec 2017 15:08:03 +0100 -Subject: [PATCH] x86/mm/pti: Add Kconfig -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Finally allow CONFIG_PAGE_TABLE_ISOLATION to be enabled. - -PARAVIRT generally requires that the kernel not manage its own page tables. -It also means that the hypervisor and kernel must agree wholeheartedly -about what format the page tables are in and what they contain. -PAGE_TABLE_ISOLATION, unfortunately, changes the rules and they -can not be used together. - -I've seen conflicting feedback from maintainers lately about whether they -want the Kconfig magic to go first or last in a patch series. It's going -last here because the partially-applied series leads to kernels that can -not boot in a bunch of cases. I did a run through the entire series with -CONFIG_PAGE_TABLE_ISOLATION=y to look for build errors, though. - -[ tglx: Removed SMP and !PARAVIRT dependencies as they not longer exist ] - -Signed-off-by: Dave Hansen -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Cc: linux-mm@kvack.org -Signed-off-by: Ingo Molnar -(cherry picked from commit 385ce0ea4c078517fa51c261882c4e72fba53005) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit ce12963b837e809f6ae048587d9377a298c1094d) -Signed-off-by: Fabian Grünbichler ---- - security/Kconfig | 10 ++++++++++ - 1 file changed, 10 insertions(+) - -diff --git a/security/Kconfig b/security/Kconfig -index 305b496ff6a3..91cb8f611a0d 100644 ---- a/security/Kconfig -+++ b/security/Kconfig -@@ -96,6 +96,16 @@ config SECURITY_NETWORK - implement socket and networking access controls. - If you are unsure how to answer this question, answer N. - -+config PAGE_TABLE_ISOLATION -+ bool "Remove the kernel mapping in user mode" -+ depends on X86_64 && !UML -+ help -+ This feature reduces the number of hardware side channels by -+ ensuring that the majority of kernel addresses are not mapped -+ into userspace. -+ -+ See Documentation/x86/pagetable-isolation.txt for more details. 
-+ - config SECURITY_INFINIBAND - bool "Infiniband Security Hooks" - depends on SECURITY && INFINIBAND --- -2.14.2 - diff --git a/patches/kernel/0217-x86-mm-dump_pagetables-Add-page-table-directory-to-t.patch b/patches/kernel/0217-x86-mm-dump_pagetables-Add-page-table-directory-to-t.patch deleted file mode 100644 index a80eab0..0000000 --- a/patches/kernel/0217-x86-mm-dump_pagetables-Add-page-table-directory-to-t.patch +++ /dev/null @@ -1,87 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Borislav Petkov -Date: Mon, 4 Dec 2017 15:08:04 +0100 -Subject: [PATCH] x86/mm/dump_pagetables: Add page table directory to the - debugfs VFS hierarchy -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -The upcoming support for dumping the kernel and the user space page tables -of the current process would create more random files in the top level -debugfs directory. - -Add a page table directory and move the existing file to it. - -Signed-off-by: Borislav Petkov -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Signed-off-by: Ingo Molnar -(cherry picked from commit 75298aa179d56cd64f54e58a19fffc8ab922b4c0) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit ae5c4af9bbefed4adc12075c28fb5889547c99cc) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/mm/debug_pagetables.c | 15 ++++++++++----- - 1 file changed, 10 insertions(+), 5 deletions(-) - -diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c -index bfcffdf6c577..d1449fb6dc7a 100644 ---- a/arch/x86/mm/debug_pagetables.c -+++ b/arch/x86/mm/debug_pagetables.c -@@ -22,21 +22,26 @@ static const struct file_operations ptdump_fops = { - .release = single_release, - }; - --static struct dentry *pe; -+static struct dentry *dir, *pe; - - static int __init pt_dump_debug_init(void) - { -- pe = debugfs_create_file("kernel_page_tables", S_IRUSR, NULL, NULL, -- &ptdump_fops); -- if (!pe) -+ dir = debugfs_create_dir("page_tables", NULL); -+ if (!dir) - return -ENOMEM; - -+ pe = debugfs_create_file("kernel", 0400, dir, NULL, &ptdump_fops); -+ if (!pe) -+ goto err; - return 0; -+err: -+ debugfs_remove_recursive(dir); -+ return -ENOMEM; - } - - static void __exit pt_dump_debug_exit(void) - { -- debugfs_remove_recursive(pe); -+ debugfs_remove_recursive(dir); - } - - module_init(pt_dump_debug_init); --- -2.14.2 - diff --git a/patches/kernel/0217-x86-mm-pti-Add-Kconfig.patch b/patches/kernel/0217-x86-mm-pti-Add-Kconfig.patch new file mode 100644 index 0000000..07ebcee --- /dev/null +++ b/patches/kernel/0217-x86-mm-pti-Add-Kconfig.patch @@ -0,0 +1,81 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Dave Hansen +Date: Mon, 4 Dec 2017 15:08:03 +0100 +Subject: [PATCH] x86/mm/pti: Add Kconfig +MIME-Version: 1.0 +Content-Type: 
text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Finally allow CONFIG_PAGE_TABLE_ISOLATION to be enabled. + +PARAVIRT generally requires that the kernel not manage its own page tables. +It also means that the hypervisor and kernel must agree wholeheartedly +about what format the page tables are in and what they contain. +PAGE_TABLE_ISOLATION, unfortunately, changes the rules and they +can not be used together. + +I've seen conflicting feedback from maintainers lately about whether they +want the Kconfig magic to go first or last in a patch series. It's going +last here because the partially-applied series leads to kernels that can +not boot in a bunch of cases. I did a run through the entire series with +CONFIG_PAGE_TABLE_ISOLATION=y to look for build errors, though. + +[ tglx: Removed SMP and !PARAVIRT dependencies as they not longer exist ] + +Signed-off-by: Dave Hansen +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Cc: linux-mm@kvack.org +Signed-off-by: Ingo Molnar +(cherry picked from commit 385ce0ea4c078517fa51c261882c4e72fba53005) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit ce12963b837e809f6ae048587d9377a298c1094d) +Signed-off-by: Fabian Grünbichler +--- + security/Kconfig | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/security/Kconfig b/security/Kconfig +index 305b496ff6a3..91cb8f611a0d 100644 +--- a/security/Kconfig ++++ b/security/Kconfig +@@ -96,6 +96,16 @@ config SECURITY_NETWORK + implement socket and networking access controls. + If you are unsure how to answer this question, answer N. 
+ ++config PAGE_TABLE_ISOLATION ++ bool "Remove the kernel mapping in user mode" ++ depends on X86_64 && !UML ++ help ++ This feature reduces the number of hardware side channels by ++ ensuring that the majority of kernel addresses are not mapped ++ into userspace. ++ ++ See Documentation/x86/pagetable-isolation.txt for more details. ++ + config SECURITY_INFINIBAND + bool "Infiniband Security Hooks" + depends on SECURITY && INFINIBAND +-- +2.14.2 + diff --git a/patches/kernel/0218-x86-mm-dump_pagetables-Add-page-table-directory-to-t.patch b/patches/kernel/0218-x86-mm-dump_pagetables-Add-page-table-directory-to-t.patch new file mode 100644 index 0000000..a80eab0 --- /dev/null +++ b/patches/kernel/0218-x86-mm-dump_pagetables-Add-page-table-directory-to-t.patch @@ -0,0 +1,87 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Borislav Petkov +Date: Mon, 4 Dec 2017 15:08:04 +0100 +Subject: [PATCH] x86/mm/dump_pagetables: Add page table directory to the + debugfs VFS hierarchy +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +The upcoming support for dumping the kernel and the user space page tables +of the current process would create more random files in the top level +debugfs directory. + +Add a page table directory and move the existing file to it. + +Signed-off-by: Borislav Petkov +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Signed-off-by: Ingo Molnar +(cherry picked from commit 75298aa179d56cd64f54e58a19fffc8ab922b4c0) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit ae5c4af9bbefed4adc12075c28fb5889547c99cc) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/mm/debug_pagetables.c | 15 ++++++++++----- + 1 file changed, 10 insertions(+), 5 deletions(-) + +diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c +index bfcffdf6c577..d1449fb6dc7a 100644 +--- a/arch/x86/mm/debug_pagetables.c ++++ b/arch/x86/mm/debug_pagetables.c +@@ -22,21 +22,26 @@ static const struct file_operations ptdump_fops = { + .release = single_release, + }; + +-static struct dentry *pe; ++static struct dentry *dir, *pe; + + static int __init pt_dump_debug_init(void) + { +- pe = debugfs_create_file("kernel_page_tables", S_IRUSR, NULL, NULL, +- &ptdump_fops); +- if (!pe) ++ dir = debugfs_create_dir("page_tables", NULL); ++ if (!dir) + return -ENOMEM; + ++ pe = debugfs_create_file("kernel", 0400, dir, NULL, &ptdump_fops); ++ if (!pe) ++ goto err; + return 0; ++err: ++ debugfs_remove_recursive(dir); ++ return -ENOMEM; + } + + static void __exit pt_dump_debug_exit(void) + { +- debugfs_remove_recursive(pe); ++ debugfs_remove_recursive(dir); + } + + module_init(pt_dump_debug_init); +-- +2.14.2 + diff --git a/patches/kernel/0218-x86-mm-dump_pagetables-Check-user-space-page-table-f.patch b/patches/kernel/0218-x86-mm-dump_pagetables-Check-user-space-page-table-f.patch deleted file mode 100644 index 8152b66..0000000 --- a/patches/kernel/0218-x86-mm-dump_pagetables-Check-user-space-page-table-f.patch +++ /dev/null @@ -1,139 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Mon, 4 Dec 
2017 15:08:05 +0100 -Subject: [PATCH] x86/mm/dump_pagetables: Check user space page table for WX - pages -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -ptdump_walk_pgd_level_checkwx() checks the kernel page table for WX pages, -but does not check the PAGE_TABLE_ISOLATION user space page table. - -Restructure the code so that dmesg output is selected by an explicit -argument and not implicit via checking the pgd argument for !NULL. - -Add the check for the user space page table. - -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Cc: linux-mm@kvack.org -Signed-off-by: Ingo Molnar -(cherry picked from commit b4bf4f924b1d7bade38fd51b2e401d20d0956e4d) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 1adfe82e8fe5afa2fae59efe498c461d5a52cb6c) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/pgtable.h | 1 + - arch/x86/mm/debug_pagetables.c | 2 +- - arch/x86/mm/dump_pagetables.c | 30 +++++++++++++++++++++++++----- - 3 files changed, 27 insertions(+), 6 deletions(-) - -diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h -index 25604b8a251a..4f5eb81cf8be 100644 ---- a/arch/x86/include/asm/pgtable.h -+++ b/arch/x86/include/asm/pgtable.h -@@ -17,6 +17,7 @@ - #include - - void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); -+void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd); - void ptdump_walk_pgd_level_checkwx(void); - - #ifdef CONFIG_DEBUG_WX -diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c -index 
d1449fb6dc7a..8e70c1599e51 100644 ---- a/arch/x86/mm/debug_pagetables.c -+++ b/arch/x86/mm/debug_pagetables.c -@@ -5,7 +5,7 @@ - - static int ptdump_show(struct seq_file *m, void *v) - { -- ptdump_walk_pgd_level(m, NULL); -+ ptdump_walk_pgd_level_debugfs(m, NULL); - return 0; - } - -diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c -index eed93dd4cb4a..7b022ad37c4e 100644 ---- a/arch/x86/mm/dump_pagetables.c -+++ b/arch/x86/mm/dump_pagetables.c -@@ -457,7 +457,7 @@ static inline bool is_hypervisor_range(int idx) - } - - static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, -- bool checkwx) -+ bool checkwx, bool dmesg) - { - #ifdef CONFIG_X86_64 - pgd_t *start = (pgd_t *) &init_top_pgt; -@@ -470,7 +470,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, - - if (pgd) { - start = pgd; -- st.to_dmesg = true; -+ st.to_dmesg = dmesg; - } - - st.check_wx = checkwx; -@@ -508,13 +508,33 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, - - void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) - { -- ptdump_walk_pgd_level_core(m, pgd, false); -+ ptdump_walk_pgd_level_core(m, pgd, false, true); -+} -+ -+void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd) -+{ -+ ptdump_walk_pgd_level_core(m, pgd, false, false); -+} -+EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs); -+ -+static void ptdump_walk_user_pgd_level_checkwx(void) -+{ -+#ifdef CONFIG_PAGE_TABLE_ISOLATION -+ pgd_t *pgd = (pgd_t *) &init_top_pgt; -+ -+ if (!static_cpu_has(X86_FEATURE_PTI)) -+ return; -+ -+ pr_info("x86/mm: Checking user space page tables\n"); -+ pgd = kernel_to_user_pgdp(pgd); -+ ptdump_walk_pgd_level_core(NULL, pgd, true, false); -+#endif - } --EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level); - - void ptdump_walk_pgd_level_checkwx(void) - { -- ptdump_walk_pgd_level_core(NULL, NULL, true); -+ ptdump_walk_pgd_level_core(NULL, NULL, true, false); -+ ptdump_walk_user_pgd_level_checkwx(); - } - - static 
int __init pt_dump_init(void) --- -2.14.2 - diff --git a/patches/kernel/0219-x86-mm-dump_pagetables-Allow-dumping-current-pagetab.patch b/patches/kernel/0219-x86-mm-dump_pagetables-Allow-dumping-current-pagetab.patch deleted file mode 100644 index 00aa1ce..0000000 --- a/patches/kernel/0219-x86-mm-dump_pagetables-Allow-dumping-current-pagetab.patch +++ /dev/null @@ -1,188 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Mon, 4 Dec 2017 15:08:06 +0100 -Subject: [PATCH] x86/mm/dump_pagetables: Allow dumping current pagetables -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Add two debugfs files which allow to dump the pagetable of the current -task. - -current_kernel dumps the regular page table. This is the page table which -is normally shared between kernel and user space. If kernel page table -isolation is enabled this is the kernel space mapping. - -If kernel page table isolation is enabled the second file, current_user, -dumps the user space page table. - -These files allow to verify the resulting page tables for page table -isolation, but even in the normal case its useful to be able to inspect -user space page tables of current for debugging purposes. - -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Boris Ostrovsky -Cc: Borislav Petkov -Cc: Brian Gerst -Cc: Dave Hansen -Cc: David Laight -Cc: Denys Vlasenko -Cc: Eduardo Valentin -Cc: Greg KH -Cc: H. 
Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Will Deacon -Cc: aliguori@amazon.com -Cc: daniel.gruss@iaik.tugraz.at -Cc: hughd@google.com -Cc: keescook@google.com -Cc: linux-mm@kvack.org -Signed-off-by: Ingo Molnar -(cherry picked from commit a4b51ef6552c704764684cef7e753162dc87c5fa) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit e31e0526cb47bd1d848fc3fdb10d2aeb909e46b5) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/pgtable.h | 2 +- - arch/x86/mm/debug_pagetables.c | 71 +++++++++++++++++++++++++++++++++++++++--- - arch/x86/mm/dump_pagetables.c | 6 +++- - 3 files changed, 73 insertions(+), 6 deletions(-) - -diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h -index 4f5eb81cf8be..1f9e7fea3c06 100644 ---- a/arch/x86/include/asm/pgtable.h -+++ b/arch/x86/include/asm/pgtable.h -@@ -17,7 +17,7 @@ - #include - - void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); --void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd); -+void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user); - void ptdump_walk_pgd_level_checkwx(void); - - #ifdef CONFIG_DEBUG_WX -diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c -index 8e70c1599e51..421f2664ffa0 100644 ---- a/arch/x86/mm/debug_pagetables.c -+++ b/arch/x86/mm/debug_pagetables.c -@@ -5,7 +5,7 @@ - - static int ptdump_show(struct seq_file *m, void *v) - { -- ptdump_walk_pgd_level_debugfs(m, NULL); -+ ptdump_walk_pgd_level_debugfs(m, NULL, false); - return 0; - } - -@@ -22,7 +22,57 @@ static const struct file_operations ptdump_fops = { - .release = single_release, - }; - --static struct dentry *dir, *pe; -+static int ptdump_show_curknl(struct seq_file *m, void *v) -+{ -+ if (current->mm->pgd) { -+ down_read(&current->mm->mmap_sem); -+ ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, false); -+ up_read(&current->mm->mmap_sem); -+ } -+
return 0; -+} -+ -+static int ptdump_open_curknl(struct inode *inode, struct file *filp) -+{ -+ return single_open(filp, ptdump_show_curknl, NULL); -+} -+ -+static const struct file_operations ptdump_curknl_fops = { -+ .owner = THIS_MODULE, -+ .open = ptdump_open_curknl, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = single_release, -+}; -+ -+#ifdef CONFIG_PAGE_TABLE_ISOLATION -+static struct dentry *pe_curusr; -+ -+static int ptdump_show_curusr(struct seq_file *m, void *v) -+{ -+ if (current->mm->pgd) { -+ down_read(&current->mm->mmap_sem); -+ ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, true); -+ up_read(&current->mm->mmap_sem); -+ } -+ return 0; -+} -+ -+static int ptdump_open_curusr(struct inode *inode, struct file *filp) -+{ -+ return single_open(filp, ptdump_show_curusr, NULL); -+} -+ -+static const struct file_operations ptdump_curusr_fops = { -+ .owner = THIS_MODULE, -+ .open = ptdump_open_curusr, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = single_release, -+}; -+#endif -+ -+static struct dentry *dir, *pe_knl, *pe_curknl; - - static int __init pt_dump_debug_init(void) - { -@@ -30,9 +80,22 @@ static int __init pt_dump_debug_init(void) - if (!dir) - return -ENOMEM; - -- pe = debugfs_create_file("kernel", 0400, dir, NULL, &ptdump_fops); -- if (!pe) -+ pe_knl = debugfs_create_file("kernel", 0400, dir, NULL, -+ &ptdump_fops); -+ if (!pe_knl) -+ goto err; -+ -+ pe_curknl = debugfs_create_file("current_kernel", 0400, -+ dir, NULL, &ptdump_curknl_fops); -+ if (!pe_curknl) -+ goto err; -+ -+#ifdef CONFIG_PAGE_TABLE_ISOLATION -+ pe_curusr = debugfs_create_file("current_user", 0400, -+ dir, NULL, &ptdump_curusr_fops); -+ if (!pe_curusr) - goto err; -+#endif - return 0; - err: - debugfs_remove_recursive(dir); -diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c -index 7b022ad37c4e..12b93d350480 100644 ---- a/arch/x86/mm/dump_pagetables.c -+++ b/arch/x86/mm/dump_pagetables.c -@@ -511,8 +511,12 @@ void
ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) - ptdump_walk_pgd_level_core(m, pgd, false, true); - } - --void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd) -+void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user) - { -+#ifdef CONFIG_PAGE_TABLE_ISOLATION -+ if (user && static_cpu_has(X86_FEATURE_PTI)) -+ pgd = kernel_to_user_pgdp(pgd); -+#endif - ptdump_walk_pgd_level_core(m, pgd, false, false); - } - EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs); --- -2.14.2 - diff --git a/patches/kernel/0219-x86-mm-dump_pagetables-Check-user-space-page-table-f.patch b/patches/kernel/0219-x86-mm-dump_pagetables-Check-user-space-page-table-f.patch new file mode 100644 index 0000000..8152b66 --- /dev/null +++ b/patches/kernel/0219-x86-mm-dump_pagetables-Check-user-space-page-table-f.patch @@ -0,0 +1,139 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Mon, 4 Dec 2017 15:08:05 +0100 +Subject: [PATCH] x86/mm/dump_pagetables: Check user space page table for WX + pages +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +ptdump_walk_pgd_level_checkwx() checks the kernel page table for WX pages, +but does not check the PAGE_TABLE_ISOLATION user space page table. + +Restructure the code so that dmesg output is selected by an explicit +argument and not implicit via checking the pgd argument for !NULL. + +Add the check for the user space page table. + +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Cc: linux-mm@kvack.org +Signed-off-by: Ingo Molnar +(cherry picked from commit b4bf4f924b1d7bade38fd51b2e401d20d0956e4d) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 1adfe82e8fe5afa2fae59efe498c461d5a52cb6c) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/pgtable.h | 1 + + arch/x86/mm/debug_pagetables.c | 2 +- + arch/x86/mm/dump_pagetables.c | 30 +++++++++++++++++++++++++----- + 3 files changed, 27 insertions(+), 6 deletions(-) + +diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h +index 25604b8a251a..4f5eb81cf8be 100644 +--- a/arch/x86/include/asm/pgtable.h ++++ b/arch/x86/include/asm/pgtable.h +@@ -17,6 +17,7 @@ + #include + + void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); ++void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd); + void ptdump_walk_pgd_level_checkwx(void); + + #ifdef CONFIG_DEBUG_WX +diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c +index d1449fb6dc7a..8e70c1599e51 100644 +--- a/arch/x86/mm/debug_pagetables.c ++++ b/arch/x86/mm/debug_pagetables.c +@@ -5,7 +5,7 @@ + + static int ptdump_show(struct seq_file *m, void *v) + { +- ptdump_walk_pgd_level(m, NULL); ++ ptdump_walk_pgd_level_debugfs(m, NULL); + return 0; + } + +diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c +index eed93dd4cb4a..7b022ad37c4e 100644 +--- a/arch/x86/mm/dump_pagetables.c ++++ b/arch/x86/mm/dump_pagetables.c +@@ -457,7 +457,7 @@ static inline bool is_hypervisor_range(int idx) + } + + static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, +- bool checkwx) ++ bool checkwx, bool dmesg) + { + #ifdef CONFIG_X86_64 + pgd_t *start = (pgd_t *) &init_top_pgt; +@@ -470,7 +470,7 @@ static 
void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, + + if (pgd) { + start = pgd; +- st.to_dmesg = true; ++ st.to_dmesg = dmesg; + } + + st.check_wx = checkwx; +@@ -508,13 +508,33 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, + + void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) + { +- ptdump_walk_pgd_level_core(m, pgd, false); ++ ptdump_walk_pgd_level_core(m, pgd, false, true); ++} ++ ++void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd) ++{ ++ ptdump_walk_pgd_level_core(m, pgd, false, false); ++} ++EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs); ++ ++static void ptdump_walk_user_pgd_level_checkwx(void) ++{ ++#ifdef CONFIG_PAGE_TABLE_ISOLATION ++ pgd_t *pgd = (pgd_t *) &init_top_pgt; ++ ++ if (!static_cpu_has(X86_FEATURE_PTI)) ++ return; ++ ++ pr_info("x86/mm: Checking user space page tables\n"); ++ pgd = kernel_to_user_pgdp(pgd); ++ ptdump_walk_pgd_level_core(NULL, pgd, true, false); ++#endif + } +-EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level); + + void ptdump_walk_pgd_level_checkwx(void) + { +- ptdump_walk_pgd_level_core(NULL, NULL, true); ++ ptdump_walk_pgd_level_core(NULL, NULL, true, false); ++ ptdump_walk_user_pgd_level_checkwx(); + } + + static int __init pt_dump_init(void) +-- +2.14.2 + diff --git a/patches/kernel/0220-x86-ldt-Make-the-LDT-mapping-RO.patch b/patches/kernel/0220-x86-ldt-Make-the-LDT-mapping-RO.patch deleted file mode 100644 index 4e42731..0000000 --- a/patches/kernel/0220-x86-ldt-Make-the-LDT-mapping-RO.patch +++ /dev/null @@ -1,123 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Fri, 15 Dec 2017 20:35:11 +0100 -Subject: [PATCH] x86/ldt: Make the LDT mapping RO -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Now that the LDT mapping is in a known area when PAGE_TABLE_ISOLATION is -enabled its a primary target for attacks, if a user space interface fails 
-to validate a write address correctly. That can never happen, right? - -The SDM states: - - If the segment descriptors in the GDT or an LDT are placed in ROM, the - processor can enter an indefinite loop if software or the processor - attempts to update (write to) the ROM-based segment descriptors. To - prevent this problem, set the accessed bits for all segment descriptors - placed in a ROM. Also, remove operating-system or executive code that - attempts to modify segment descriptors located in ROM. - -So its a valid approach to set the ACCESS bit when setting up the LDT entry -and to map the table RO. Fixup the selftest so it can handle that new mode. - -Remove the manual ACCESS bit setter in set_tls_desc() as this is now -pointless. Folded the patch from Peter Ziljstra. - -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Borislav Petkov -Cc: Dave Hansen -Cc: H. Peter Anvin -Cc: Josh Poimboeuf -Cc: Juergen Gross -Cc: Linus Torvalds -Cc: Peter Zijlstra -Signed-off-by: Ingo Molnar -(cherry picked from commit 9f5cb6b32d9e0a3a7453222baaf15664d92adbf2) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit f4b13d6f67b3a89d878094901a9ca834b39415c1) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/desc.h | 2 ++ - arch/x86/kernel/ldt.c | 7 ++++++- - arch/x86/kernel/tls.c | 11 ++--------- - tools/testing/selftests/x86/ldt_gdt.c | 3 +-- - 4 files changed, 11 insertions(+), 12 deletions(-) - -diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h -index de40c514ba25..c765bc294a9d 100644 ---- a/arch/x86/include/asm/desc.h -+++ b/arch/x86/include/asm/desc.h -@@ -20,6 +20,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in - - desc->type = (info->read_exec_only ^ 1) << 1; - desc->type |= info->contents << 2; -+ /* Set the ACCESS bit so it can be mapped RO */ -+ desc->type |= 1; - - desc->s = 1; - desc->dpl = 0x3; -diff --git a/arch/x86/kernel/ldt.c 
b/arch/x86/kernel/ldt.c -index eceaada581ff..2260eb6e2de7 100644 ---- a/arch/x86/kernel/ldt.c -+++ b/arch/x86/kernel/ldt.c -@@ -157,7 +157,12 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) - ptep = get_locked_pte(mm, va, &ptl); - if (!ptep) - return -ENOMEM; -- pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL)); -+ /* -+ * Map it RO so the easy to find address is not a primary -+ * target via some kernel interface which misses a -+ * permission check. -+ */ -+ pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL)); - set_pte_at(mm, va, ptep, pte); - pte_unmap_unlock(ptep, ptl); - } -diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c -index a106b9719c58..41880a2421ea 100644 ---- a/arch/x86/kernel/tls.c -+++ b/arch/x86/kernel/tls.c -@@ -92,17 +92,10 @@ static void set_tls_desc(struct task_struct *p, int idx, - cpu = get_cpu(); - - while (n-- > 0) { -- if (LDT_empty(info) || LDT_zero(info)) { -+ if (LDT_empty(info) || LDT_zero(info)) - memset(desc, 0, sizeof(*desc)); -- } else { -+ else - fill_ldt(desc, info); -- -- /* -- * Always set the accessed bit so that the CPU -- * doesn't try to write to the (read-only) GDT. -- */ -- desc->type |= 1; -- } - ++info; - ++desc; - } -diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c -index 783e1a754b78..bbd1d0e4d683 100644 ---- a/tools/testing/selftests/x86/ldt_gdt.c -+++ b/tools/testing/selftests/x86/ldt_gdt.c -@@ -121,8 +121,7 @@ static void check_valid_segment(uint16_t index, int ldt, - * NB: Different Linux versions do different things with the - * accessed bit in set_thread_area(). - */ -- if (ar != expected_ar && -- (ldt || ar != (expected_ar | AR_ACCESSED))) { -+ if (ar != expected_ar && ar != (expected_ar | AR_ACCESSED)) { - printf("[FAIL]\t%s entry %hu has AR 0x%08X but expected 0x%08X\n", - (ldt ? 
"LDT" : "GDT"), index, ar, expected_ar); - nerrs++; --- -2.14.2 - diff --git a/patches/kernel/0220-x86-mm-dump_pagetables-Allow-dumping-current-pagetab.patch b/patches/kernel/0220-x86-mm-dump_pagetables-Allow-dumping-current-pagetab.patch new file mode 100644 index 0000000..00aa1ce --- /dev/null +++ b/patches/kernel/0220-x86-mm-dump_pagetables-Allow-dumping-current-pagetab.patch @@ -0,0 +1,188 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Mon, 4 Dec 2017 15:08:06 +0100 +Subject: [PATCH] x86/mm/dump_pagetables: Allow dumping current pagetables +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Add two debugfs files which allow to dump the pagetable of the current +task. + +current_kernel dumps the regular page table. This is the page table which +is normally shared between kernel and user space. If kernel page table +isolation is enabled this is the kernel space mapping. + +If kernel page table isolation is enabled the second file, current_user, +dumps the user space page table. + +These files allow to verify the resulting page tables for page table +isolation, but even in the normal case its useful to be able to inspect +user space page tables of current for debugging purposes. + +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Boris Ostrovsky +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Dave Hansen +Cc: David Laight +Cc: Denys Vlasenko +Cc: Eduardo Valentin +Cc: Greg KH +Cc: H. 
Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Will Deacon +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Cc: linux-mm@kvack.org +Signed-off-by: Ingo Molnar +(cherry picked from commit a4b51ef6552c704764684cef7e753162dc87c5fa) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit e31e0526cb47bd1d848fc3fdb10d2aeb909e46b5) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/pgtable.h | 2 +- + arch/x86/mm/debug_pagetables.c | 71 +++++++++++++++++++++++++++++++++++++++--- + arch/x86/mm/dump_pagetables.c | 6 +++- + 3 files changed, 73 insertions(+), 6 deletions(-) + +diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h +index 4f5eb81cf8be..1f9e7fea3c06 100644 +--- a/arch/x86/include/asm/pgtable.h ++++ b/arch/x86/include/asm/pgtable.h +@@ -17,7 +17,7 @@ + #include + + void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); +-void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd); ++void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user); + void ptdump_walk_pgd_level_checkwx(void); + + #ifdef CONFIG_DEBUG_WX +diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c +index 8e70c1599e51..421f2664ffa0 100644 +--- a/arch/x86/mm/debug_pagetables.c ++++ b/arch/x86/mm/debug_pagetables.c +@@ -5,7 +5,7 @@ + + static int ptdump_show(struct seq_file *m, void *v) + { +- ptdump_walk_pgd_level_debugfs(m, NULL); ++ ptdump_walk_pgd_level_debugfs(m, NULL, false); + return 0; + } + +@@ -22,7 +22,57 @@ static const struct file_operations ptdump_fops = { + .release = single_release, + }; + +-static struct dentry *dir, *pe; ++static int ptdump_show_curknl(struct seq_file *m, void *v) ++{ ++ if (current->mm->pgd) { ++ down_read(&current->mm->mmap_sem); ++ ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, false); ++ up_read(&current->mm->mmap_sem); ++ } ++
return 0; ++} ++ ++static int ptdump_open_curknl(struct inode *inode, struct file *filp) ++{ ++ return single_open(filp, ptdump_show_curknl, NULL); ++} ++ ++static const struct file_operations ptdump_curknl_fops = { ++ .owner = THIS_MODULE, ++ .open = ptdump_open_curknl, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = single_release, ++}; ++ ++#ifdef CONFIG_PAGE_TABLE_ISOLATION ++static struct dentry *pe_curusr; ++ ++static int ptdump_show_curusr(struct seq_file *m, void *v) ++{ ++ if (current->mm->pgd) { ++ down_read(&current->mm->mmap_sem); ++ ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, true); ++ up_read(&current->mm->mmap_sem); ++ } ++ return 0; ++} ++ ++static int ptdump_open_curusr(struct inode *inode, struct file *filp) ++{ ++ return single_open(filp, ptdump_show_curusr, NULL); ++} ++ ++static const struct file_operations ptdump_curusr_fops = { ++ .owner = THIS_MODULE, ++ .open = ptdump_open_curusr, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = single_release, ++}; ++#endif ++ ++static struct dentry *dir, *pe_knl, *pe_curknl; + + static int __init pt_dump_debug_init(void) + { +@@ -30,9 +80,22 @@ static int __init pt_dump_debug_init(void) + if (!dir) + return -ENOMEM; + +- pe = debugfs_create_file("kernel", 0400, dir, NULL, &ptdump_fops); +- if (!pe) ++ pe_knl = debugfs_create_file("kernel", 0400, dir, NULL, ++ &ptdump_fops); ++ if (!pe_knl) ++ goto err; ++ ++ pe_curknl = debugfs_create_file("current_kernel", 0400, ++ dir, NULL, &ptdump_curknl_fops); ++ if (!pe_curknl) ++ goto err; ++ ++#ifdef CONFIG_PAGE_TABLE_ISOLATION ++ pe_curusr = debugfs_create_file("current_user", 0400, ++ dir, NULL, &ptdump_curusr_fops); ++ if (!pe_curusr) + goto err; ++#endif + return 0; + err: + debugfs_remove_recursive(dir); +diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c +index 7b022ad37c4e..12b93d350480 100644 +--- a/arch/x86/mm/dump_pagetables.c ++++ b/arch/x86/mm/dump_pagetables.c +@@ -511,8 +511,12 @@ void
ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) + ptdump_walk_pgd_level_core(m, pgd, false, true); + } + +-void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd) ++void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user) + { ++#ifdef CONFIG_PAGE_TABLE_ISOLATION ++ if (user && static_cpu_has(X86_FEATURE_PTI)) ++ pgd = kernel_to_user_pgdp(pgd); ++#endif + ptdump_walk_pgd_level_core(m, pgd, false, false); + } + EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs); +-- +2.14.2 + diff --git a/patches/kernel/0221-x86-ldt-Make-the-LDT-mapping-RO.patch b/patches/kernel/0221-x86-ldt-Make-the-LDT-mapping-RO.patch new file mode 100644 index 0000000..4e42731 --- /dev/null +++ b/patches/kernel/0221-x86-ldt-Make-the-LDT-mapping-RO.patch @@ -0,0 +1,123 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Fri, 15 Dec 2017 20:35:11 +0100 +Subject: [PATCH] x86/ldt: Make the LDT mapping RO +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Now that the LDT mapping is in a known area when PAGE_TABLE_ISOLATION is +enabled its a primary target for attacks, if a user space interface fails +to validate a write address correctly. That can never happen, right? + +The SDM states: + + If the segment descriptors in the GDT or an LDT are placed in ROM, the + processor can enter an indefinite loop if software or the processor + attempts to update (write to) the ROM-based segment descriptors. To + prevent this problem, set the accessed bits for all segment descriptors + placed in a ROM. Also, remove operating-system or executive code that + attempts to modify segment descriptors located in ROM. + +So its a valid approach to set the ACCESS bit when setting up the LDT entry +and to map the table RO. Fixup the selftest so it can handle that new mode. + +Remove the manual ACCESS bit setter in set_tls_desc() as this is now +pointless. 
Folded the patch from Peter Ziljstra. + +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Borislav Petkov +Cc: Dave Hansen +Cc: H. Peter Anvin +Cc: Josh Poimboeuf +Cc: Juergen Gross +Cc: Linus Torvalds +Cc: Peter Zijlstra +Signed-off-by: Ingo Molnar +(cherry picked from commit 9f5cb6b32d9e0a3a7453222baaf15664d92adbf2) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit f4b13d6f67b3a89d878094901a9ca834b39415c1) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/desc.h | 2 ++ + arch/x86/kernel/ldt.c | 7 ++++++- + arch/x86/kernel/tls.c | 11 ++--------- + tools/testing/selftests/x86/ldt_gdt.c | 3 +-- + 4 files changed, 11 insertions(+), 12 deletions(-) + +diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h +index de40c514ba25..c765bc294a9d 100644 +--- a/arch/x86/include/asm/desc.h ++++ b/arch/x86/include/asm/desc.h +@@ -20,6 +20,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in + + desc->type = (info->read_exec_only ^ 1) << 1; + desc->type |= info->contents << 2; ++ /* Set the ACCESS bit so it can be mapped RO */ ++ desc->type |= 1; + + desc->s = 1; + desc->dpl = 0x3; +diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c +index eceaada581ff..2260eb6e2de7 100644 +--- a/arch/x86/kernel/ldt.c ++++ b/arch/x86/kernel/ldt.c +@@ -157,7 +157,12 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) + ptep = get_locked_pte(mm, va, &ptl); + if (!ptep) + return -ENOMEM; +- pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL)); ++ /* ++ * Map it RO so the easy to find address is not a primary ++ * target via some kernel interface which misses a ++ * permission check. 
++ */ ++ pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL)); + set_pte_at(mm, va, ptep, pte); + pte_unmap_unlock(ptep, ptl); + } +diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c +index a106b9719c58..41880a2421ea 100644 +--- a/arch/x86/kernel/tls.c ++++ b/arch/x86/kernel/tls.c +@@ -92,17 +92,10 @@ static void set_tls_desc(struct task_struct *p, int idx, + cpu = get_cpu(); + + while (n-- > 0) { +- if (LDT_empty(info) || LDT_zero(info)) { ++ if (LDT_empty(info) || LDT_zero(info)) + memset(desc, 0, sizeof(*desc)); +- } else { ++ else + fill_ldt(desc, info); +- +- /* +- * Always set the accessed bit so that the CPU +- * doesn't try to write to the (read-only) GDT. +- */ +- desc->type |= 1; +- } + ++info; + ++desc; + } +diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c +index 783e1a754b78..bbd1d0e4d683 100644 +--- a/tools/testing/selftests/x86/ldt_gdt.c ++++ b/tools/testing/selftests/x86/ldt_gdt.c +@@ -121,8 +121,7 @@ static void check_valid_segment(uint16_t index, int ldt, + * NB: Different Linux versions do different things with the + * accessed bit in set_thread_area(). + */ +- if (ar != expected_ar && +- (ldt || ar != (expected_ar | AR_ACCESSED))) { ++ if (ar != expected_ar && ar != (expected_ar | AR_ACCESSED)) { + printf("[FAIL]\t%s entry %hu has AR 0x%08X but expected 0x%08X\n", + (ldt ? 
"LDT" : "GDT"), index, ar, expected_ar); + nerrs++; +-- +2.14.2 + diff --git a/patches/kernel/0221-x86-smpboot-Remove-stale-TLB-flush-invocations.patch b/patches/kernel/0221-x86-smpboot-Remove-stale-TLB-flush-invocations.patch deleted file mode 100644 index b5c62d0..0000000 --- a/patches/kernel/0221-x86-smpboot-Remove-stale-TLB-flush-invocations.patch +++ /dev/null @@ -1,74 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Sat, 30 Dec 2017 22:13:53 +0100 -Subject: [PATCH] x86/smpboot: Remove stale TLB flush invocations -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -smpboot_setup_warm_reset_vector() and smpboot_restore_warm_reset_vector() -invoke local_flush_tlb() for no obvious reason. - -Digging in history revealed that the original code in the 2.1 era added -those because the code manipulated a swapper_pg_dir pagetable entry. The -pagetable manipulation was removed long ago in the 2.3 timeframe, but the -TLB flush invocations stayed around forever. - -Remove them along with the pointless pr_debug()s which come from the same 2.1 -change. 
- -Reported-by: Dominik Brodowski -Signed-off-by: Thomas Gleixner -Cc: -Cc: Andy Lutomirski -Cc: Borislav Petkov -Cc: Dave Hansen -Cc: Linus Torvalds -Cc: Linus Torvalds -Cc: Peter Zijlstra -Link: http://lkml.kernel.org/r/20171230211829.586548655@linutronix.de -Signed-off-by: Ingo Molnar -(cherry picked from commit 322f8b8b340c824aef891342b0f5795d15e11562) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit fb08c4a80a22dc79c9775f493e291dfe2c642b86) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/smpboot.c | 9 --------- - 1 file changed, 9 deletions(-) - -diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c -index 03d2ba2da3b0..6ad8391b9866 100644 ---- a/arch/x86/kernel/smpboot.c -+++ b/arch/x86/kernel/smpboot.c -@@ -128,25 +128,16 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) - spin_lock_irqsave(&rtc_lock, flags); - CMOS_WRITE(0xa, 0xf); - spin_unlock_irqrestore(&rtc_lock, flags); -- local_flush_tlb(); -- pr_debug("1.\n"); - *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = - start_eip >> 4; -- pr_debug("2.\n"); - *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = - start_eip & 0xf; -- pr_debug("3.\n"); - } - - static inline void smpboot_restore_warm_reset_vector(void) - { - unsigned long flags; - -- /* -- * Install writable page 0 entry to set BIOS data area. -- */ -- local_flush_tlb(); -- - /* - * Paranoid: Set warm reset code and vector here back - * to default values. 
--- -2.14.2 - diff --git a/patches/kernel/0222-x86-mm-Remove-preempt_disable-enable-from-__native_f.patch b/patches/kernel/0222-x86-mm-Remove-preempt_disable-enable-from-__native_f.patch deleted file mode 100644 index 934ff9b..0000000 --- a/patches/kernel/0222-x86-mm-Remove-preempt_disable-enable-from-__native_f.patch +++ /dev/null @@ -1,89 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Sat, 30 Dec 2017 22:13:54 +0100 -Subject: [PATCH] x86/mm: Remove preempt_disable/enable() from - __native_flush_tlb() -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -The preempt_disable/enable() pair in __native_flush_tlb() was added in -commit: - - 5cf0791da5c1 ("x86/mm: Disable preemption during CR3 read+write") - -... to protect the UP variant of flush_tlb_mm_range(). - -That preempt_disable/enable() pair should have been added to the UP variant -of flush_tlb_mm_range() instead. - -The UP variant was removed with commit: - - ce4a4e565f52 ("x86/mm: Remove the UP asm/tlbflush.h code, always use the (formerly) SMP code") - -... but the preempt_disable/enable() pair stayed around. - -The latest change to __native_flush_tlb() in commit: - - 6fd166aae78c ("x86/mm: Use/Fix PCID to optimize user/kernel switches") - -... added an access to a per CPU variable outside the preempt disabled -regions, which makes no sense at all. __native_flush_tlb() must always -be called with at least preemption disabled. - -Remove the preempt_disable/enable() pair and add a WARN_ON_ONCE() to catch -bad callers independent of the smp_processor_id() debugging. 
- -Signed-off-by: Thomas Gleixner -Cc: -Cc: Andy Lutomirski -Cc: Borislav Petkov -Cc: Dave Hansen -Cc: Dominik Brodowski -Cc: Linus Torvalds -Cc: Linus Torvalds -Cc: Peter Zijlstra -Link: http://lkml.kernel.org/r/20171230211829.679325424@linutronix.de -Signed-off-by: Ingo Molnar -(cherry picked from commit decab0888e6e14e11d53cefa85f8b3d3b45ce73c) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit cfcf931c425b60d0092bcb4a4deb1f5d5db0e293) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/tlbflush.h | 14 ++++++++------ - 1 file changed, 8 insertions(+), 6 deletions(-) - -diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h -index 7a04a1f1ca11..ff6a6d668c32 100644 ---- a/arch/x86/include/asm/tlbflush.h -+++ b/arch/x86/include/asm/tlbflush.h -@@ -334,15 +334,17 @@ static inline void invalidate_user_asid(u16 asid) - */ - static inline void __native_flush_tlb(void) - { -- invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid)); - /* -- * If current->mm == NULL then we borrow a mm which may change -- * during a task switch and therefore we must not be preempted -- * while we write CR3 back: -+ * Preemption or interrupts must be disabled to protect the access -+ * to the per CPU variable and to prevent being preempted between -+ * read_cr3() and write_cr3(). 
- */ -- preempt_disable(); -+ WARN_ON_ONCE(preemptible()); -+ -+ invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid)); -+ -+ /* If current->mm == NULL then the read_cr3() "borrows" an mm */ - native_write_cr3(__native_read_cr3()); -- preempt_enable(); - } - - /* --- -2.14.2 - diff --git a/patches/kernel/0222-x86-smpboot-Remove-stale-TLB-flush-invocations.patch b/patches/kernel/0222-x86-smpboot-Remove-stale-TLB-flush-invocations.patch new file mode 100644 index 0000000..b5c62d0 --- /dev/null +++ b/patches/kernel/0222-x86-smpboot-Remove-stale-TLB-flush-invocations.patch @@ -0,0 +1,74 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Sat, 30 Dec 2017 22:13:53 +0100 +Subject: [PATCH] x86/smpboot: Remove stale TLB flush invocations +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +smpboot_setup_warm_reset_vector() and smpboot_restore_warm_reset_vector() +invoke local_flush_tlb() for no obvious reason. + +Digging in history revealed that the original code in the 2.1 era added +those because the code manipulated a swapper_pg_dir pagetable entry. The +pagetable manipulation was removed long ago in the 2.3 timeframe, but the +TLB flush invocations stayed around forever. + +Remove them along with the pointless pr_debug()s which come from the same 2.1 +change. 
+ +Reported-by: Dominik Brodowski +Signed-off-by: Thomas Gleixner +Cc: +Cc: Andy Lutomirski +Cc: Borislav Petkov +Cc: Dave Hansen +Cc: Linus Torvalds +Cc: Linus Torvalds +Cc: Peter Zijlstra +Link: http://lkml.kernel.org/r/20171230211829.586548655@linutronix.de +Signed-off-by: Ingo Molnar +(cherry picked from commit 322f8b8b340c824aef891342b0f5795d15e11562) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit fb08c4a80a22dc79c9775f493e291dfe2c642b86) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/smpboot.c | 9 --------- + 1 file changed, 9 deletions(-) + +diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c +index 03d2ba2da3b0..6ad8391b9866 100644 +--- a/arch/x86/kernel/smpboot.c ++++ b/arch/x86/kernel/smpboot.c +@@ -128,25 +128,16 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) + spin_lock_irqsave(&rtc_lock, flags); + CMOS_WRITE(0xa, 0xf); + spin_unlock_irqrestore(&rtc_lock, flags); +- local_flush_tlb(); +- pr_debug("1.\n"); + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = + start_eip >> 4; +- pr_debug("2.\n"); + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = + start_eip & 0xf; +- pr_debug("3.\n"); + } + + static inline void smpboot_restore_warm_reset_vector(void) + { + unsigned long flags; + +- /* +- * Install writable page 0 entry to set BIOS data area. +- */ +- local_flush_tlb(); +- + /* + * Paranoid: Set warm reset code and vector here back + * to default values. 
+-- +2.14.2 + diff --git a/patches/kernel/0223-x86-ldt-Plug-memory-leak-in-error-path.patch b/patches/kernel/0223-x86-ldt-Plug-memory-leak-in-error-path.patch deleted file mode 100644 index ef3580a..0000000 --- a/patches/kernel/0223-x86-ldt-Plug-memory-leak-in-error-path.patch +++ /dev/null @@ -1,63 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Sun, 31 Dec 2017 11:24:34 +0100 -Subject: [PATCH] x86/ldt: Plug memory leak in error path -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -The error path in write_ldt() tries to free 'old_ldt' instead of the newly -allocated 'new_ldt', resulting in a memory leak. It also misses to clean up a -half populated LDT pagetable, which is not a leak as it gets cleaned up -when the process exits. - -Free both the potentially half populated LDT pagetable and the newly -allocated LDT struct. This can be done unconditionally because once an LDT -is mapped subsequent maps will succeed, because the PTE page is already -populated and the two LDTs fit into that single page. 
- -Reported-by: Mathieu Desnoyers -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Borislav Petkov -Cc: Dave Hansen -Cc: Dominik Brodowski -Cc: Linus Torvalds -Cc: Linus Torvalds -Cc: Peter Zijlstra -Fixes: f55f0501cbf6 ("x86/pti: Put the LDT in its own PGD if PTI is on") -Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1712311121340.1899@nanos -Signed-off-by: Ingo Molnar -(cherry picked from commit a62d69857aab4caa43049e72fe0ed5c4a60518dd) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 03d02494f6253d0bdca7254d85e50786448c14f9) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/ldt.c | 8 +++++++- - 1 file changed, 7 insertions(+), 1 deletion(-) - -diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c -index 2260eb6e2de7..9a35b7e541bc 100644 ---- a/arch/x86/kernel/ldt.c -+++ b/arch/x86/kernel/ldt.c -@@ -420,7 +420,13 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) - */ - error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0); - if (error) { -- free_ldt_struct(old_ldt); -+ /* -+ * This only can fail for the first LDT setup. If an LDT is -+ * already installed then the PTE page is already -+ * populated. Mop up a half populated page table. 
-+ */ -+ free_ldt_pgtables(mm); -+ free_ldt_struct(new_ldt); - goto out_unlock; - } - --- -2.14.2 - diff --git a/patches/kernel/0223-x86-mm-Remove-preempt_disable-enable-from-__native_f.patch b/patches/kernel/0223-x86-mm-Remove-preempt_disable-enable-from-__native_f.patch new file mode 100644 index 0000000..934ff9b --- /dev/null +++ b/patches/kernel/0223-x86-mm-Remove-preempt_disable-enable-from-__native_f.patch @@ -0,0 +1,89 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Sat, 30 Dec 2017 22:13:54 +0100 +Subject: [PATCH] x86/mm: Remove preempt_disable/enable() from + __native_flush_tlb() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +The preempt_disable/enable() pair in __native_flush_tlb() was added in +commit: + + 5cf0791da5c1 ("x86/mm: Disable preemption during CR3 read+write") + +... to protect the UP variant of flush_tlb_mm_range(). + +That preempt_disable/enable() pair should have been added to the UP variant +of flush_tlb_mm_range() instead. + +The UP variant was removed with commit: + + ce4a4e565f52 ("x86/mm: Remove the UP asm/tlbflush.h code, always use the (formerly) SMP code") + +... but the preempt_disable/enable() pair stayed around. + +The latest change to __native_flush_tlb() in commit: + + 6fd166aae78c ("x86/mm: Use/Fix PCID to optimize user/kernel switches") + +... added an access to a per CPU variable outside the preempt disabled +regions, which makes no sense at all. __native_flush_tlb() must always +be called with at least preemption disabled. + +Remove the preempt_disable/enable() pair and add a WARN_ON_ONCE() to catch +bad callers independent of the smp_processor_id() debugging. 
+ +Signed-off-by: Thomas Gleixner +Cc: +Cc: Andy Lutomirski +Cc: Borislav Petkov +Cc: Dave Hansen +Cc: Dominik Brodowski +Cc: Linus Torvalds +Cc: Linus Torvalds +Cc: Peter Zijlstra +Link: http://lkml.kernel.org/r/20171230211829.679325424@linutronix.de +Signed-off-by: Ingo Molnar +(cherry picked from commit decab0888e6e14e11d53cefa85f8b3d3b45ce73c) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit cfcf931c425b60d0092bcb4a4deb1f5d5db0e293) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/tlbflush.h | 14 ++++++++------ + 1 file changed, 8 insertions(+), 6 deletions(-) + +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index 7a04a1f1ca11..ff6a6d668c32 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -334,15 +334,17 @@ static inline void invalidate_user_asid(u16 asid) + */ + static inline void __native_flush_tlb(void) + { +- invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid)); + /* +- * If current->mm == NULL then we borrow a mm which may change +- * during a task switch and therefore we must not be preempted +- * while we write CR3 back: ++ * Preemption or interrupts must be disabled to protect the access ++ * to the per CPU variable and to prevent being preempted between ++ * read_cr3() and write_cr3(). 
+ */ +- preempt_disable(); ++ WARN_ON_ONCE(preemptible()); ++ ++ invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid)); ++ ++ /* If current->mm == NULL then the read_cr3() "borrows" an mm */ + native_write_cr3(__native_read_cr3()); +- preempt_enable(); + } + + /* +-- +2.14.2 + diff --git a/patches/kernel/0224-x86-ldt-Make-LDT-pgtable-free-conditional.patch b/patches/kernel/0224-x86-ldt-Make-LDT-pgtable-free-conditional.patch deleted file mode 100644 index 5d177cd..0000000 --- a/patches/kernel/0224-x86-ldt-Make-LDT-pgtable-free-conditional.patch +++ /dev/null @@ -1,42 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Sun, 31 Dec 2017 16:52:15 +0100 -Subject: [PATCH] x86/ldt: Make LDT pgtable free conditional -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Andy prefers to be paranoid about the pagetable free in the error path of -write_ldt(). Make it conditional and warn whenever the installment of a -secondary LDT fails. - -Requested-by: Andy Lutomirski -Signed-off-by: Thomas Gleixner -(cherry picked from commit 7f414195b0c3612acd12b4611a5fe75995cf10c7) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 4e23d9d8427c9b2bd10176bd56dfcaca5e0d6b0f) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/ldt.c | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c -index 9a35b7e541bc..51af781fac85 100644 ---- a/arch/x86/kernel/ldt.c -+++ b/arch/x86/kernel/ldt.c -@@ -425,7 +425,8 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) - * already installed then the PTE page is already - * populated. Mop up a half populated page table. 
- */ -- free_ldt_pgtables(mm); -+ if (!WARN_ON_ONCE(old_ldt)) -+ free_ldt_pgtables(mm); - free_ldt_struct(new_ldt); - goto out_unlock; - } --- -2.14.2 - diff --git a/patches/kernel/0224-x86-ldt-Plug-memory-leak-in-error-path.patch b/patches/kernel/0224-x86-ldt-Plug-memory-leak-in-error-path.patch new file mode 100644 index 0000000..ef3580a --- /dev/null +++ b/patches/kernel/0224-x86-ldt-Plug-memory-leak-in-error-path.patch @@ -0,0 +1,63 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Sun, 31 Dec 2017 11:24:34 +0100 +Subject: [PATCH] x86/ldt: Plug memory leak in error path +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +The error path in write_ldt() tries to free 'old_ldt' instead of the newly +allocated 'new_ldt', resulting in a memory leak. It also misses to clean up a +half populated LDT pagetable, which is not a leak as it gets cleaned up +when the process exits. + +Free both the potentially half populated LDT pagetable and the newly +allocated LDT struct. This can be done unconditionally because once an LDT +is mapped subsequent maps will succeed, because the PTE page is already +populated and the two LDTs fit into that single page. 
+ +Reported-by: Mathieu Desnoyers +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Borislav Petkov +Cc: Dave Hansen +Cc: Dominik Brodowski +Cc: Linus Torvalds +Cc: Linus Torvalds +Cc: Peter Zijlstra +Fixes: f55f0501cbf6 ("x86/pti: Put the LDT in its own PGD if PTI is on") +Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1712311121340.1899@nanos +Signed-off-by: Ingo Molnar +(cherry picked from commit a62d69857aab4caa43049e72fe0ed5c4a60518dd) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 03d02494f6253d0bdca7254d85e50786448c14f9) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/ldt.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c +index 2260eb6e2de7..9a35b7e541bc 100644 +--- a/arch/x86/kernel/ldt.c ++++ b/arch/x86/kernel/ldt.c +@@ -420,7 +420,13 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) + */ + error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0); + if (error) { +- free_ldt_struct(old_ldt); ++ /* ++ * This only can fail for the first LDT setup. If an LDT is ++ * already installed then the PTE page is already ++ * populated. Mop up a half populated page table. 
++ */ ++ free_ldt_pgtables(mm); ++ free_ldt_struct(new_ldt); + goto out_unlock; + } + +-- +2.14.2 + diff --git a/patches/kernel/0225-UBUNTU-Config-updateconfigs-to-enable-PTI.patch b/patches/kernel/0225-UBUNTU-Config-updateconfigs-to-enable-PTI.patch deleted file mode 100644 index 66452e5..0000000 --- a/patches/kernel/0225-UBUNTU-Config-updateconfigs-to-enable-PTI.patch +++ /dev/null @@ -1,77 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Whitcroft -Date: Mon, 18 Dec 2017 12:09:25 +0000 -Subject: [PATCH] UBUNTU: [Config] updateconfigs to enable PTI -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 239497489e75fe18b55f568a43c76fd65a7cbf4f) -Signed-off-by: Fabian Grünbichler ---- - debian.master/config/amd64/config.common.amd64 | 1 + - debian.master/config/config.common.ubuntu | 5 ++++- - debian.master/config/i386/config.common.i386 | 1 + - 3 files changed, 6 insertions(+), 1 deletion(-) - -diff --git a/debian.master/config/amd64/config.common.amd64 b/debian.master/config/amd64/config.common.amd64 -index 6df8bcf72690..6412a1366160 100644 ---- a/debian.master/config/amd64/config.common.amd64 -+++ b/debian.master/config/amd64/config.common.amd64 -@@ -471,6 +471,7 @@ CONFIG_UIO_PRUSS=m - CONFIG_UIO_SERCOS3=m - CONFIG_ULTRIX_PARTITION=y - CONFIG_UNIXWARE_DISKLABEL=y -+# CONFIG_UNWINDER_FRAME_POINTER is not set - CONFIG_USB_DWC2_PCI=m - CONFIG_USB_EHCI_HCD_PLATFORM=y - CONFIG_USB_GADGET=m -diff --git a/debian.master/config/config.common.ubuntu b/debian.master/config/config.common.ubuntu -index 37a14874f7f9..ebb00db16844 100644 ---- a/debian.master/config/config.common.ubuntu -+++ b/debian.master/config/config.common.ubuntu -@@ -6201,6 +6201,7 @@ CONFIG_PADATA=y - CONFIG_PAGE_COUNTER=y - # CONFIG_PAGE_OWNER is not set - # CONFIG_PAGE_POISONING is not set 
-+CONFIG_PAGE_TABLE_ISOLATION=y - CONFIG_PALMAS_GPADC=m - CONFIG_PANASONIC_LAPTOP=m - CONFIG_PANEL=m -@@ -8659,7 +8660,7 @@ CONFIG_STACKTRACE=y - CONFIG_STACKTRACE_SUPPORT=y - CONFIG_STACK_GUARD=256 - CONFIG_STACK_TRACER=y --# CONFIG_STACK_VALIDATION is not set -+CONFIG_STACK_VALIDATION=y - # CONFIG_STAGING_BOARD is not set - CONFIG_STAGING_MEDIA=y - # CONFIG_STATIC_KEYS_SELFTEST is not set -@@ -9173,6 +9174,8 @@ CONFIG_UNIX=y - CONFIG_UNIX98_PTYS=y - CONFIG_UNIX_DIAG=m - CONFIG_UNUSED_SYMBOLS=y -+# CONFIG_UNWINDER_GUESS is not set -+CONFIG_UNWINDER_ORC=y - CONFIG_UPROBES=y - CONFIG_UPROBE_EVENTS=y - CONFIG_US5182D=m -diff --git a/debian.master/config/i386/config.common.i386 b/debian.master/config/i386/config.common.i386 -index eb973e0eb199..4b8d6a14e31c 100644 ---- a/debian.master/config/i386/config.common.i386 -+++ b/debian.master/config/i386/config.common.i386 -@@ -463,6 +463,7 @@ CONFIG_UIO_PRUSS=m - CONFIG_UIO_SERCOS3=m - CONFIG_ULTRIX_PARTITION=y - CONFIG_UNIXWARE_DISKLABEL=y -+CONFIG_UNWINDER_FRAME_POINTER=y - CONFIG_USB_DWC2_PCI=m - CONFIG_USB_EHCI_HCD_PLATFORM=y - CONFIG_USB_GADGET=m --- -2.14.2 - diff --git a/patches/kernel/0225-x86-ldt-Make-LDT-pgtable-free-conditional.patch b/patches/kernel/0225-x86-ldt-Make-LDT-pgtable-free-conditional.patch new file mode 100644 index 0000000..5d177cd --- /dev/null +++ b/patches/kernel/0225-x86-ldt-Make-LDT-pgtable-free-conditional.patch @@ -0,0 +1,42 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Sun, 31 Dec 2017 16:52:15 +0100 +Subject: [PATCH] x86/ldt: Make LDT pgtable free conditional +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Andy prefers to be paranoid about the pagetable free in the error path of +write_ldt(). Make it conditional and warn whenever the installment of a +secondary LDT fails. 
+ +Requested-by: Andy Lutomirski +Signed-off-by: Thomas Gleixner +(cherry picked from commit 7f414195b0c3612acd12b4611a5fe75995cf10c7) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 4e23d9d8427c9b2bd10176bd56dfcaca5e0d6b0f) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/ldt.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c +index 9a35b7e541bc..51af781fac85 100644 +--- a/arch/x86/kernel/ldt.c ++++ b/arch/x86/kernel/ldt.c +@@ -425,7 +425,8 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) + * already installed then the PTE page is already + * populated. Mop up a half populated page table. + */ +- free_ldt_pgtables(mm); ++ if (!WARN_ON_ONCE(old_ldt)) ++ free_ldt_pgtables(mm); + free_ldt_struct(new_ldt); + goto out_unlock; + } +-- +2.14.2 + diff --git a/patches/kernel/0226-UBUNTU-Config-updateconfigs-to-enable-PTI.patch b/patches/kernel/0226-UBUNTU-Config-updateconfigs-to-enable-PTI.patch new file mode 100644 index 0000000..66452e5 --- /dev/null +++ b/patches/kernel/0226-UBUNTU-Config-updateconfigs-to-enable-PTI.patch @@ -0,0 +1,77 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Whitcroft +Date: Mon, 18 Dec 2017 12:09:25 +0000 +Subject: [PATCH] UBUNTU: [Config] updateconfigs to enable PTI +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 239497489e75fe18b55f568a43c76fd65a7cbf4f) +Signed-off-by: Fabian Grünbichler +--- + debian.master/config/amd64/config.common.amd64 | 1 + + debian.master/config/config.common.ubuntu | 5 ++++- + debian.master/config/i386/config.common.i386 | 1 + + 3 files changed, 6 insertions(+), 1 deletion(-) + +diff --git a/debian.master/config/amd64/config.common.amd64 
b/debian.master/config/amd64/config.common.amd64 +index 6df8bcf72690..6412a1366160 100644 +--- a/debian.master/config/amd64/config.common.amd64 ++++ b/debian.master/config/amd64/config.common.amd64 +@@ -471,6 +471,7 @@ CONFIG_UIO_PRUSS=m + CONFIG_UIO_SERCOS3=m + CONFIG_ULTRIX_PARTITION=y + CONFIG_UNIXWARE_DISKLABEL=y ++# CONFIG_UNWINDER_FRAME_POINTER is not set + CONFIG_USB_DWC2_PCI=m + CONFIG_USB_EHCI_HCD_PLATFORM=y + CONFIG_USB_GADGET=m +diff --git a/debian.master/config/config.common.ubuntu b/debian.master/config/config.common.ubuntu +index 37a14874f7f9..ebb00db16844 100644 +--- a/debian.master/config/config.common.ubuntu ++++ b/debian.master/config/config.common.ubuntu +@@ -6201,6 +6201,7 @@ CONFIG_PADATA=y + CONFIG_PAGE_COUNTER=y + # CONFIG_PAGE_OWNER is not set + # CONFIG_PAGE_POISONING is not set ++CONFIG_PAGE_TABLE_ISOLATION=y + CONFIG_PALMAS_GPADC=m + CONFIG_PANASONIC_LAPTOP=m + CONFIG_PANEL=m +@@ -8659,7 +8660,7 @@ CONFIG_STACKTRACE=y + CONFIG_STACKTRACE_SUPPORT=y + CONFIG_STACK_GUARD=256 + CONFIG_STACK_TRACER=y +-# CONFIG_STACK_VALIDATION is not set ++CONFIG_STACK_VALIDATION=y + # CONFIG_STAGING_BOARD is not set + CONFIG_STAGING_MEDIA=y + # CONFIG_STATIC_KEYS_SELFTEST is not set +@@ -9173,6 +9174,8 @@ CONFIG_UNIX=y + CONFIG_UNIX98_PTYS=y + CONFIG_UNIX_DIAG=m + CONFIG_UNUSED_SYMBOLS=y ++# CONFIG_UNWINDER_GUESS is not set ++CONFIG_UNWINDER_ORC=y + CONFIG_UPROBES=y + CONFIG_UPROBE_EVENTS=y + CONFIG_US5182D=m +diff --git a/debian.master/config/i386/config.common.i386 b/debian.master/config/i386/config.common.i386 +index eb973e0eb199..4b8d6a14e31c 100644 +--- a/debian.master/config/i386/config.common.i386 ++++ b/debian.master/config/i386/config.common.i386 +@@ -463,6 +463,7 @@ CONFIG_UIO_PRUSS=m + CONFIG_UIO_SERCOS3=m + CONFIG_ULTRIX_PARTITION=y + CONFIG_UNIXWARE_DISKLABEL=y ++CONFIG_UNWINDER_FRAME_POINTER=y + CONFIG_USB_DWC2_PCI=m + CONFIG_USB_EHCI_HCD_PLATFORM=y + CONFIG_USB_GADGET=m +-- +2.14.2 + diff --git 
a/patches/kernel/0226-kvm-x86-fix-RSM-when-PCID-is-non-zero.patch b/patches/kernel/0226-kvm-x86-fix-RSM-when-PCID-is-non-zero.patch deleted file mode 100644 index e83e451..0000000 --- a/patches/kernel/0226-kvm-x86-fix-RSM-when-PCID-is-non-zero.patch +++ /dev/null @@ -1,124 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Paolo Bonzini -Date: Thu, 21 Dec 2017 00:49:14 +0100 -Subject: [PATCH] kvm: x86: fix RSM when PCID is non-zero -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -rsm_load_state_64() and rsm_enter_protected_mode() load CR3, then -CR4 & ~PCIDE, then CR0, then CR4. - -However, setting CR4.PCIDE fails if CR3[11:0] != 0. It's probably easier -in the long run to replace rsm_enter_protected_mode() with an emulator -callback that sets all the special registers (like KVM_SET_SREGS would -do). For now, set the PCID field of CR3 only after CR4.PCIDE is 1. - -Reported-by: Laszlo Ersek -Tested-by: Laszlo Ersek -Fixes: 660a5d517aaab9187f93854425c4c63f4a09195c -Cc: stable@vger.kernel.org -Signed-off-by: Paolo Bonzini -(cherry picked from commit fae1a3e775cca8c3a9e0eb34443b310871a15a92) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit dba4ceb9a91ed2d11a47722436b3c0be15e791d4) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kvm/emulate.c | 32 +++++++++++++++++++++++++------- - 1 file changed, 25 insertions(+), 7 deletions(-) - -diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c -index fb0055953fbc..155f2af2cb39 100644 ---- a/arch/x86/kvm/emulate.c -+++ b/arch/x86/kvm/emulate.c -@@ -2399,9 +2399,21 @@ static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n) - } - - static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt, -- u64 cr0, u64 cr4) -+ u64 cr0, u64 cr3, u64 cr4) - { - int bad; -+ u64 pcid; -+ -+ /* In order to later set CR4.PCIDE, CR3[11:0] must be zero. 
*/ -+ pcid = 0; -+ if (cr4 & X86_CR4_PCIDE) { -+ pcid = cr3 & 0xfff; -+ cr3 &= ~0xfff; -+ } -+ -+ bad = ctxt->ops->set_cr(ctxt, 3, cr3); -+ if (bad) -+ return X86EMUL_UNHANDLEABLE; - - /* - * First enable PAE, long mode needs it before CR0.PG = 1 is set. -@@ -2420,6 +2432,12 @@ static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt, - bad = ctxt->ops->set_cr(ctxt, 4, cr4); - if (bad) - return X86EMUL_UNHANDLEABLE; -+ if (pcid) { -+ bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid); -+ if (bad) -+ return X86EMUL_UNHANDLEABLE; -+ } -+ - } - - return X86EMUL_CONTINUE; -@@ -2430,11 +2448,11 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase) - struct desc_struct desc; - struct desc_ptr dt; - u16 selector; -- u32 val, cr0, cr4; -+ u32 val, cr0, cr3, cr4; - int i; - - cr0 = GET_SMSTATE(u32, smbase, 0x7ffc); -- ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u32, smbase, 0x7ff8)); -+ cr3 = GET_SMSTATE(u32, smbase, 0x7ff8); - ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7ff4) | X86_EFLAGS_FIXED; - ctxt->_eip = GET_SMSTATE(u32, smbase, 0x7ff0); - -@@ -2476,14 +2494,14 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase) - - ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7ef8)); - -- return rsm_enter_protected_mode(ctxt, cr0, cr4); -+ return rsm_enter_protected_mode(ctxt, cr0, cr3, cr4); - } - - static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) - { - struct desc_struct desc; - struct desc_ptr dt; -- u64 val, cr0, cr4; -+ u64 val, cr0, cr3, cr4; - u32 base3; - u16 selector; - int i, r; -@@ -2500,7 +2518,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) - ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1); - - cr0 = GET_SMSTATE(u64, smbase, 0x7f58); -- ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u64, smbase, 0x7f50)); -+ cr3 = GET_SMSTATE(u64, smbase, 0x7f50); - cr4 = GET_SMSTATE(u64, smbase, 0x7f48); - ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7f00)); - val = 
GET_SMSTATE(u64, smbase, 0x7ed0); -@@ -2528,7 +2546,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) - dt.address = GET_SMSTATE(u64, smbase, 0x7e68); - ctxt->ops->set_gdt(ctxt, &dt); - -- r = rsm_enter_protected_mode(ctxt, cr0, cr4); -+ r = rsm_enter_protected_mode(ctxt, cr0, cr3, cr4); - if (r != X86EMUL_CONTINUE) - return r; - --- -2.14.2 - diff --git a/patches/kernel/0227-kvm-x86-fix-RSM-when-PCID-is-non-zero.patch b/patches/kernel/0227-kvm-x86-fix-RSM-when-PCID-is-non-zero.patch new file mode 100644 index 0000000..e83e451 --- /dev/null +++ b/patches/kernel/0227-kvm-x86-fix-RSM-when-PCID-is-non-zero.patch @@ -0,0 +1,124 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Thu, 21 Dec 2017 00:49:14 +0100 +Subject: [PATCH] kvm: x86: fix RSM when PCID is non-zero +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +rsm_load_state_64() and rsm_enter_protected_mode() load CR3, then +CR4 & ~PCIDE, then CR0, then CR4. + +However, setting CR4.PCIDE fails if CR3[11:0] != 0. It's probably easier +in the long run to replace rsm_enter_protected_mode() with an emulator +callback that sets all the special registers (like KVM_SET_SREGS would +do). For now, set the PCID field of CR3 only after CR4.PCIDE is 1. 
+ +Reported-by: Laszlo Ersek +Tested-by: Laszlo Ersek +Fixes: 660a5d517aaab9187f93854425c4c63f4a09195c +Cc: stable@vger.kernel.org +Signed-off-by: Paolo Bonzini +(cherry picked from commit fae1a3e775cca8c3a9e0eb34443b310871a15a92) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit dba4ceb9a91ed2d11a47722436b3c0be15e791d4) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kvm/emulate.c | 32 +++++++++++++++++++++++++------- + 1 file changed, 25 insertions(+), 7 deletions(-) + +diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c +index fb0055953fbc..155f2af2cb39 100644 +--- a/arch/x86/kvm/emulate.c ++++ b/arch/x86/kvm/emulate.c +@@ -2399,9 +2399,21 @@ static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n) + } + + static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt, +- u64 cr0, u64 cr4) ++ u64 cr0, u64 cr3, u64 cr4) + { + int bad; ++ u64 pcid; ++ ++ /* In order to later set CR4.PCIDE, CR3[11:0] must be zero. */ ++ pcid = 0; ++ if (cr4 & X86_CR4_PCIDE) { ++ pcid = cr3 & 0xfff; ++ cr3 &= ~0xfff; ++ } ++ ++ bad = ctxt->ops->set_cr(ctxt, 3, cr3); ++ if (bad) ++ return X86EMUL_UNHANDLEABLE; + + /* + * First enable PAE, long mode needs it before CR0.PG = 1 is set. 
+@@ -2420,6 +2432,12 @@ static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt, + bad = ctxt->ops->set_cr(ctxt, 4, cr4); + if (bad) + return X86EMUL_UNHANDLEABLE; ++ if (pcid) { ++ bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid); ++ if (bad) ++ return X86EMUL_UNHANDLEABLE; ++ } ++ + } + + return X86EMUL_CONTINUE; +@@ -2430,11 +2448,11 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase) + struct desc_struct desc; + struct desc_ptr dt; + u16 selector; +- u32 val, cr0, cr4; ++ u32 val, cr0, cr3, cr4; + int i; + + cr0 = GET_SMSTATE(u32, smbase, 0x7ffc); +- ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u32, smbase, 0x7ff8)); ++ cr3 = GET_SMSTATE(u32, smbase, 0x7ff8); + ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7ff4) | X86_EFLAGS_FIXED; + ctxt->_eip = GET_SMSTATE(u32, smbase, 0x7ff0); + +@@ -2476,14 +2494,14 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase) + + ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7ef8)); + +- return rsm_enter_protected_mode(ctxt, cr0, cr4); ++ return rsm_enter_protected_mode(ctxt, cr0, cr3, cr4); + } + + static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) + { + struct desc_struct desc; + struct desc_ptr dt; +- u64 val, cr0, cr4; ++ u64 val, cr0, cr3, cr4; + u32 base3; + u16 selector; + int i, r; +@@ -2500,7 +2518,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) + ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1); + + cr0 = GET_SMSTATE(u64, smbase, 0x7f58); +- ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u64, smbase, 0x7f50)); ++ cr3 = GET_SMSTATE(u64, smbase, 0x7f50); + cr4 = GET_SMSTATE(u64, smbase, 0x7f48); + ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7f00)); + val = GET_SMSTATE(u64, smbase, 0x7ed0); +@@ -2528,7 +2546,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) + dt.address = GET_SMSTATE(u64, smbase, 0x7e68); + ctxt->ops->set_gdt(ctxt, &dt); + +- r = rsm_enter_protected_mode(ctxt, cr0, 
cr4); ++ r = rsm_enter_protected_mode(ctxt, cr0, cr3, cr4); + if (r != X86EMUL_CONTINUE) + return r; + +-- +2.14.2 + diff --git a/patches/kernel/0227-x86-pti-Switch-to-kernel-CR3-at-early-in-entry_SYSCA.patch b/patches/kernel/0227-x86-pti-Switch-to-kernel-CR3-at-early-in-entry_SYSCA.patch deleted file mode 100644 index 547c382..0000000 --- a/patches/kernel/0227-x86-pti-Switch-to-kernel-CR3-at-early-in-entry_SYSCA.patch +++ /dev/null @@ -1,78 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Wed, 3 Jan 2018 19:52:04 +0100 -Subject: [PATCH] x86/pti: Switch to kernel CR3 at early in - entry_SYSCALL_compat() -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -The preparation for PTI which added CR3 switching to the entry code -misplaced the CR3 switch in entry_SYSCALL_compat(). - -With PTI enabled the entry code tries to access a per cpu variable after -switching to kernel GS. This fails because that variable is not mapped to -user space. This results in a double fault and in the worst case a kernel -crash. - -Move the switch ahead of the access and clobber RSP which has been saved -already. 
- -Fixes: 8a09317b895f ("x86/mm/pti: Prepare the x86/entry assembly code for entry/exit CR3 switching") -Reported-by: Lars Wendler -Reported-by: Laura Abbott -Signed-off-by: Thomas Gleixner -Cc: Borislav Betkov -Cc: Andy Lutomirski , -Cc: Dave Hansen , -Cc: Peter Zijlstra , -Cc: Greg KH , , -Cc: Boris Ostrovsky , -Cc: Juergen Gross -Cc: stable@vger.kernel.org -Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801031949200.1957@nanos -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 2f45cd7a57da0a4d7f3a91a5f577c76b9ed9eb8a) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/entry/entry_64_compat.S | 13 ++++++------- - 1 file changed, 6 insertions(+), 7 deletions(-) - -diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S -index 973527e34887..2b5e7685823c 100644 ---- a/arch/x86/entry/entry_64_compat.S -+++ b/arch/x86/entry/entry_64_compat.S -@@ -189,8 +189,13 @@ ENTRY(entry_SYSCALL_compat) - /* Interrupts are off on entry. */ - swapgs - -- /* Stash user ESP and switch to the kernel stack. */ -+ /* Stash user ESP.*/ - movl %esp, %r8d -+ -+ /* Use %rsp as scratch reg. User ESP is stashed in r8 */ -+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp -+ -+ /* Switch to the kernel stack */ - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - - /* Construct struct pt_regs on stack */ -@@ -218,12 +223,6 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe) - pushq $0 /* pt_regs->r14 = 0 */ - pushq $0 /* pt_regs->r15 = 0 */ - -- /* -- * We just saved %rdi so it is safe to clobber. It is not -- * preserved during the C calls inside TRACE_IRQS_OFF anyway. -- */ -- SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi -- - /* - * User mode is traced as though IRQs are on, and SYSENTER - * turned them off. 
--- -2.14.2 - diff --git a/patches/kernel/0228-UBUNTU-SAUCE-bpf-reject-out-of-bounds-stack-pointer-.patch b/patches/kernel/0228-UBUNTU-SAUCE-bpf-reject-out-of-bounds-stack-pointer-.patch deleted file mode 100644 index 802ae9f..0000000 --- a/patches/kernel/0228-UBUNTU-SAUCE-bpf-reject-out-of-bounds-stack-pointer-.patch +++ /dev/null @@ -1,69 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Jann Horn -Date: Thu, 4 Jan 2018 08:01:21 -0600 -Subject: [PATCH] UBUNTU: SAUCE: bpf: reject out-of-bounds stack pointer - calculation -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Reject programs that compute wildly out-of-bounds stack pointers. -Otherwise, pointers can be computed with an offset that doesn't fit into an -`int`, causing security issues in the stack memory access check (as well as -signed integer overflow during offset addition). - -This is a fix specifically for the v4.9 stable tree because the mainline -code looks very different at this point. 
- -Fixes: 7bca0a9702edf ("bpf: enhance verifier to understand stack pointer arithmetic") -Signed-off-by: Jann Horn -Acked-by: Daniel Borkmann -CVE-2017-17863 -Link: https://www.spinics.net/lists/stable/msg206985.html -Signed-off-by: Seth Forshee -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 1c26ffd0e9b24d512824cabc6687a14d4777d0f3) -Signed-off-by: Fabian Grünbichler ---- - kernel/bpf/verifier.c | 22 ++++++++++++++++++++-- - 1 file changed, 20 insertions(+), 2 deletions(-) - -diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c -index 3940019b9740..4321625fe32a 100644 ---- a/kernel/bpf/verifier.c -+++ b/kernel/bpf/verifier.c -@@ -2122,10 +2122,28 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) - ((BPF_SRC(insn->code) == BPF_X && - regs[insn->src_reg].type == CONST_IMM) || - BPF_SRC(insn->code) == BPF_K)) { -- if (BPF_SRC(insn->code) == BPF_X) -+ if (BPF_SRC(insn->code) == BPF_X) { -+ /* check in case the register contains a big -+ * 64-bit value -+ */ -+ if (regs[insn->src_reg].imm < -MAX_BPF_STACK || -+ regs[insn->src_reg].imm > MAX_BPF_STACK) { -+ verbose("R%d value too big in R%d pointer arithmetic\n", -+ insn->src_reg, insn->dst_reg); -+ return -EACCES; -+ } - dst_reg->imm += regs[insn->src_reg].imm; -- else -+ } else { -+ /* safe against overflow: addition of 32-bit -+ * numbers in 64-bit representation -+ */ - dst_reg->imm += insn->imm; -+ } -+ if (dst_reg->imm > 0 || dst_reg->imm < -MAX_BPF_STACK) { -+ verbose("R%d out-of-bounds pointer arithmetic\n", -+ insn->dst_reg); -+ return -EACCES; -+ } - return 0; - } else if (opcode == BPF_ADD && - BPF_CLASS(insn->code) == BPF_ALU64 && --- -2.14.2 - diff --git a/patches/kernel/0228-x86-pti-Switch-to-kernel-CR3-at-early-in-entry_SYSCA.patch b/patches/kernel/0228-x86-pti-Switch-to-kernel-CR3-at-early-in-entry_SYSCA.patch new file mode 100644 index 0000000..547c382 --- /dev/null +++ 
b/patches/kernel/0228-x86-pti-Switch-to-kernel-CR3-at-early-in-entry_SYSCA.patch @@ -0,0 +1,78 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Wed, 3 Jan 2018 19:52:04 +0100 +Subject: [PATCH] x86/pti: Switch to kernel CR3 at early in + entry_SYSCALL_compat() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +The preparation for PTI which added CR3 switching to the entry code +misplaced the CR3 switch in entry_SYSCALL_compat(). + +With PTI enabled the entry code tries to access a per cpu variable after +switching to kernel GS. This fails because that variable is not mapped to +user space. This results in a double fault and in the worst case a kernel +crash. + +Move the switch ahead of the access and clobber RSP which has been saved +already. + +Fixes: 8a09317b895f ("x86/mm/pti: Prepare the x86/entry assembly code for entry/exit CR3 switching") +Reported-by: Lars Wendler +Reported-by: Laura Abbott +Signed-off-by: Thomas Gleixner +Cc: Borislav Betkov +Cc: Andy Lutomirski , +Cc: Dave Hansen , +Cc: Peter Zijlstra , +Cc: Greg KH , , +Cc: Boris Ostrovsky , +Cc: Juergen Gross +Cc: stable@vger.kernel.org +Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801031949200.1957@nanos +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 2f45cd7a57da0a4d7f3a91a5f577c76b9ed9eb8a) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/entry/entry_64_compat.S | 13 ++++++------- + 1 file changed, 6 insertions(+), 7 deletions(-) + +diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S +index 973527e34887..2b5e7685823c 100644 +--- a/arch/x86/entry/entry_64_compat.S ++++ b/arch/x86/entry/entry_64_compat.S +@@ -189,8 +189,13 @@ ENTRY(entry_SYSCALL_compat) + /* Interrupts are off on entry. */ + swapgs + +- /* Stash user ESP and switch to the kernel stack. 
*/ ++ /* Stash user ESP.*/ + movl %esp, %r8d ++ ++ /* Use %rsp as scratch reg. User ESP is stashed in r8 */ ++ SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp ++ ++ /* Switch to the kernel stack */ + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + + /* Construct struct pt_regs on stack */ +@@ -218,12 +223,6 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe) + pushq $0 /* pt_regs->r14 = 0 */ + pushq $0 /* pt_regs->r15 = 0 */ + +- /* +- * We just saved %rdi so it is safe to clobber. It is not +- * preserved during the C calls inside TRACE_IRQS_OFF anyway. +- */ +- SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi +- + /* + * User mode is traced as though IRQs are on, and SYSENTER + * turned them off. +-- +2.14.2 + diff --git a/patches/kernel/0229-UBUNTU-SAUCE-bpf-reject-out-of-bounds-stack-pointer-.patch b/patches/kernel/0229-UBUNTU-SAUCE-bpf-reject-out-of-bounds-stack-pointer-.patch new file mode 100644 index 0000000..802ae9f --- /dev/null +++ b/patches/kernel/0229-UBUNTU-SAUCE-bpf-reject-out-of-bounds-stack-pointer-.patch @@ -0,0 +1,69 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Jann Horn +Date: Thu, 4 Jan 2018 08:01:21 -0600 +Subject: [PATCH] UBUNTU: SAUCE: bpf: reject out-of-bounds stack pointer + calculation +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Reject programs that compute wildly out-of-bounds stack pointers. +Otherwise, pointers can be computed with an offset that doesn't fit into an +`int`, causing security issues in the stack memory access check (as well as +signed integer overflow during offset addition). + +This is a fix specifically for the v4.9 stable tree because the mainline +code looks very different at this point. 
+ +Fixes: 7bca0a9702edf ("bpf: enhance verifier to understand stack pointer arithmetic") +Signed-off-by: Jann Horn +Acked-by: Daniel Borkmann +CVE-2017-17863 +Link: https://www.spinics.net/lists/stable/msg206985.html +Signed-off-by: Seth Forshee +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 1c26ffd0e9b24d512824cabc6687a14d4777d0f3) +Signed-off-by: Fabian Grünbichler +--- + kernel/bpf/verifier.c | 22 ++++++++++++++++++++-- + 1 file changed, 20 insertions(+), 2 deletions(-) + +diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c +index 3940019b9740..4321625fe32a 100644 +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -2122,10 +2122,28 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) + ((BPF_SRC(insn->code) == BPF_X && + regs[insn->src_reg].type == CONST_IMM) || + BPF_SRC(insn->code) == BPF_K)) { +- if (BPF_SRC(insn->code) == BPF_X) ++ if (BPF_SRC(insn->code) == BPF_X) { ++ /* check in case the register contains a big ++ * 64-bit value ++ */ ++ if (regs[insn->src_reg].imm < -MAX_BPF_STACK || ++ regs[insn->src_reg].imm > MAX_BPF_STACK) { ++ verbose("R%d value too big in R%d pointer arithmetic\n", ++ insn->src_reg, insn->dst_reg); ++ return -EACCES; ++ } + dst_reg->imm += regs[insn->src_reg].imm; +- else ++ } else { ++ /* safe against overflow: addition of 32-bit ++ * numbers in 64-bit representation ++ */ + dst_reg->imm += insn->imm; ++ } ++ if (dst_reg->imm > 0 || dst_reg->imm < -MAX_BPF_STACK) { ++ verbose("R%d out-of-bounds pointer arithmetic\n", ++ insn->dst_reg); ++ return -EACCES; ++ } + return 0; + } else if (opcode == BPF_ADD && + BPF_CLASS(insn->code) == BPF_ALU64 && +-- +2.14.2 + diff --git a/patches/kernel/0229-bpf-fix-incorrect-sign-extension-in-check_alu_op.patch b/patches/kernel/0229-bpf-fix-incorrect-sign-extension-in-check_alu_op.patch deleted file mode 100644 index 9934332..0000000 --- 
a/patches/kernel/0229-bpf-fix-incorrect-sign-extension-in-check_alu_op.patch +++ /dev/null @@ -1,69 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Jann Horn -Date: Thu, 4 Jan 2018 08:01:22 -0600 -Subject: [PATCH] bpf: fix incorrect sign extension in check_alu_op() -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -[ Upstream commit 95a762e2c8c942780948091f8f2a4f32fce1ac6f ] - -Distinguish between -BPF_ALU64|BPF_MOV|BPF_K (load 32-bit immediate, sign-extended to 64-bit) -and BPF_ALU|BPF_MOV|BPF_K (load 32-bit immediate, zero-padded to 64-bit); -only perform sign extension in the first case. - -Starting with v4.14, this is exploitable by unprivileged users as long as -the unprivileged_bpf_disabled sysctl isn't set. - -Debian assigned CVE-2017-16995 for this issue. - -v3: - - add CVE number (Ben Hutchings) - -Fixes: 484611357c19 ("bpf: allow access into map value arrays") -Signed-off-by: Jann Horn -Acked-by: Edward Cree -Signed-off-by: Alexei Starovoitov -Signed-off-by: Daniel Borkmann -CVE-2017-16995 -Signed-off-by: Seth Forshee -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 868c88129c7567525dbde3cb6989a5acd478bd80) -Signed-off-by: Fabian Grünbichler ---- - kernel/bpf/verifier.c | 15 +++++++++++---- - 1 file changed, 11 insertions(+), 4 deletions(-) - -diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c -index 4321625fe32a..cdfa07a4ef27 100644 ---- a/kernel/bpf/verifier.c -+++ b/kernel/bpf/verifier.c -@@ -2048,12 +2048,19 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) - /* case: R = imm - * remember the value we stored into this reg - */ -+ u64 imm; -+ -+ if (BPF_CLASS(insn->code) == BPF_ALU64) -+ imm = insn->imm; -+ else -+ imm = (u32)insn->imm; -+ - regs[insn->dst_reg].type = CONST_IMM; -- regs[insn->dst_reg].imm = insn->imm; -+ regs[insn->dst_reg].imm = imm; - regs[insn->dst_reg].id = 0; 
-- regs[insn->dst_reg].max_value = insn->imm; -- regs[insn->dst_reg].min_value = insn->imm; -- regs[insn->dst_reg].min_align = calc_align(insn->imm); -+ regs[insn->dst_reg].max_value = imm; -+ regs[insn->dst_reg].min_value = imm; -+ regs[insn->dst_reg].min_align = calc_align(imm); - regs[insn->dst_reg].value_from_signed = false; - } - --- -2.14.2 - diff --git a/patches/kernel/0230-UBUNTU-SAUCE-bpf-verifier-Fix-states_equal-compariso.patch b/patches/kernel/0230-UBUNTU-SAUCE-bpf-verifier-Fix-states_equal-compariso.patch deleted file mode 100644 index 58d7259..0000000 --- a/patches/kernel/0230-UBUNTU-SAUCE-bpf-verifier-Fix-states_equal-compariso.patch +++ /dev/null @@ -1,56 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Ben Hutchings -Date: Thu, 4 Jan 2018 08:01:23 -0600 -Subject: [PATCH] UBUNTU: SAUCE: bpf/verifier: Fix states_equal() comparison of - pointer and UNKNOWN -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -An UNKNOWN_VALUE is not supposed to be derived from a pointer, unless -pointer leaks are allowed. Therefore, states_equal() must not treat -a state with a pointer in a register as "equal" to a state with an -UNKNOWN_VALUE in that register. - -This was fixed differently upstream, but the code around here was -largely rewritten in 4.14 by commit f1174f77b50c "bpf/verifier: rework -value tracking". The bug can be detected by the bpf/verifier sub-test -"pointer/scalar confusion in state equality check (way 1)". 
- -Signed-off-by: Ben Hutchings -Cc: Edward Cree -Cc: Jann Horn -Cc: Alexei Starovoitov -CVE-2017-17864 -Link: https://anonscm.debian.org/cgit/kernel/linux.git/tree/debian/patches/bugfix/all/bpf-verifier-fix-states_equal-comparison-of-pointer-and-unknown.patch?h=stretch-security -Signed-off-by: Seth Forshee -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 3fb4378083def9b22f6ae222e75d880fc5c59048) -Signed-off-by: Fabian Grünbichler ---- - kernel/bpf/verifier.c | 5 +++-- - 1 file changed, 3 insertions(+), 2 deletions(-) - -diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c -index cdfa07a4ef27..4ecb2e10c5e0 100644 ---- a/kernel/bpf/verifier.c -+++ b/kernel/bpf/verifier.c -@@ -2980,11 +2980,12 @@ static bool states_equal(struct bpf_verifier_env *env, - - /* If we didn't map access then again we don't care about the - * mismatched range values and it's ok if our old type was -- * UNKNOWN and we didn't go to a NOT_INIT'ed reg. -+ * UNKNOWN and we didn't go to a NOT_INIT'ed or pointer reg. - */ - if (rold->type == NOT_INIT || - (!varlen_map_access && rold->type == UNKNOWN_VALUE && -- rcur->type != NOT_INIT)) -+ rcur->type != NOT_INIT && -+ !__is_pointer_value(env->allow_ptr_leaks, rcur))) - continue; - - /* Don't care about the reg->id in this case. 
*/ --- -2.14.2 - diff --git a/patches/kernel/0230-bpf-fix-incorrect-sign-extension-in-check_alu_op.patch b/patches/kernel/0230-bpf-fix-incorrect-sign-extension-in-check_alu_op.patch new file mode 100644 index 0000000..9934332 --- /dev/null +++ b/patches/kernel/0230-bpf-fix-incorrect-sign-extension-in-check_alu_op.patch @@ -0,0 +1,69 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Jann Horn +Date: Thu, 4 Jan 2018 08:01:22 -0600 +Subject: [PATCH] bpf: fix incorrect sign extension in check_alu_op() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +[ Upstream commit 95a762e2c8c942780948091f8f2a4f32fce1ac6f ] + +Distinguish between +BPF_ALU64|BPF_MOV|BPF_K (load 32-bit immediate, sign-extended to 64-bit) +and BPF_ALU|BPF_MOV|BPF_K (load 32-bit immediate, zero-padded to 64-bit); +only perform sign extension in the first case. + +Starting with v4.14, this is exploitable by unprivileged users as long as +the unprivileged_bpf_disabled sysctl isn't set. + +Debian assigned CVE-2017-16995 for this issue. 
+ +v3: + - add CVE number (Ben Hutchings) + +Fixes: 484611357c19 ("bpf: allow access into map value arrays") +Signed-off-by: Jann Horn +Acked-by: Edward Cree +Signed-off-by: Alexei Starovoitov +Signed-off-by: Daniel Borkmann +CVE-2017-16995 +Signed-off-by: Seth Forshee +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 868c88129c7567525dbde3cb6989a5acd478bd80) +Signed-off-by: Fabian Grünbichler +--- + kernel/bpf/verifier.c | 15 +++++++++++---- + 1 file changed, 11 insertions(+), 4 deletions(-) + +diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c +index 4321625fe32a..cdfa07a4ef27 100644 +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -2048,12 +2048,19 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) + /* case: R = imm + * remember the value we stored into this reg + */ ++ u64 imm; ++ ++ if (BPF_CLASS(insn->code) == BPF_ALU64) ++ imm = insn->imm; ++ else ++ imm = (u32)insn->imm; ++ + regs[insn->dst_reg].type = CONST_IMM; +- regs[insn->dst_reg].imm = insn->imm; ++ regs[insn->dst_reg].imm = imm; + regs[insn->dst_reg].id = 0; +- regs[insn->dst_reg].max_value = insn->imm; +- regs[insn->dst_reg].min_value = insn->imm; +- regs[insn->dst_reg].min_align = calc_align(insn->imm); ++ regs[insn->dst_reg].max_value = imm; ++ regs[insn->dst_reg].min_value = imm; ++ regs[insn->dst_reg].min_align = calc_align(imm); + regs[insn->dst_reg].value_from_signed = false; + } + +-- +2.14.2 + diff --git a/patches/kernel/0231-UBUNTU-SAUCE-bpf-verifier-Fix-states_equal-compariso.patch b/patches/kernel/0231-UBUNTU-SAUCE-bpf-verifier-Fix-states_equal-compariso.patch new file mode 100644 index 0000000..58d7259 --- /dev/null +++ b/patches/kernel/0231-UBUNTU-SAUCE-bpf-verifier-Fix-states_equal-compariso.patch @@ -0,0 +1,56 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Ben Hutchings +Date: Thu, 4 Jan 2018 08:01:23 -0600 +Subject: [PATCH] UBUNTU: SAUCE: 
bpf/verifier: Fix states_equal() comparison of + pointer and UNKNOWN +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +An UNKNOWN_VALUE is not supposed to be derived from a pointer, unless +pointer leaks are allowed. Therefore, states_equal() must not treat +a state with a pointer in a register as "equal" to a state with an +UNKNOWN_VALUE in that register. + +This was fixed differently upstream, but the code around here was +largely rewritten in 4.14 by commit f1174f77b50c "bpf/verifier: rework +value tracking". The bug can be detected by the bpf/verifier sub-test +"pointer/scalar confusion in state equality check (way 1)". + +Signed-off-by: Ben Hutchings +Cc: Edward Cree +Cc: Jann Horn +Cc: Alexei Starovoitov +CVE-2017-17864 +Link: https://anonscm.debian.org/cgit/kernel/linux.git/tree/debian/patches/bugfix/all/bpf-verifier-fix-states_equal-comparison-of-pointer-and-unknown.patch?h=stretch-security +Signed-off-by: Seth Forshee +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 3fb4378083def9b22f6ae222e75d880fc5c59048) +Signed-off-by: Fabian Grünbichler +--- + kernel/bpf/verifier.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c +index cdfa07a4ef27..4ecb2e10c5e0 100644 +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -2980,11 +2980,12 @@ static bool states_equal(struct bpf_verifier_env *env, + + /* If we didn't map access then again we don't care about the + * mismatched range values and it's ok if our old type was +- * UNKNOWN and we didn't go to a NOT_INIT'ed reg. ++ * UNKNOWN and we didn't go to a NOT_INIT'ed or pointer reg. 
+ */ + if (rold->type == NOT_INIT || + (!varlen_map_access && rold->type == UNKNOWN_VALUE && +- rcur->type != NOT_INIT)) ++ rcur->type != NOT_INIT && ++ !__is_pointer_value(env->allow_ptr_leaks, rcur))) + continue; + + /* Don't care about the reg->id in this case. */ +-- +2.14.2 + diff --git a/patches/kernel/0231-bpf-fix-branch-pruning-logic.patch b/patches/kernel/0231-bpf-fix-branch-pruning-logic.patch deleted file mode 100644 index 977e60a..0000000 --- a/patches/kernel/0231-bpf-fix-branch-pruning-logic.patch +++ /dev/null @@ -1,129 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Alexei Starovoitov -Date: Thu, 4 Jan 2018 08:01:24 -0600 -Subject: [PATCH] bpf: fix branch pruning logic -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -when the verifier detects that register contains a runtime constant -and it's compared with another constant it will prune exploration -of the branch that is guaranteed not to be taken at runtime. -This is all correct, but malicious program may be constructed -in such a way that it always has a constant comparison and -the other branch is never taken under any conditions. -In this case such path through the program will not be explored -by the verifier. It won't be taken at run-time either, but since -all instructions are JITed the malicious program may cause JITs -to complain about using reserved fields, etc. -To fix the issue we have to track the instructions explored by -the verifier and sanitize instructions that are dead at run time -with NOPs. We cannot reject such dead code, since llvm generates -it for valid C code, since it doesn't do as much data flow -analysis as the verifier does. 
- -Fixes: 17a5267067f3 ("bpf: verifier (add verifier core)") -Signed-off-by: Alexei Starovoitov -Acked-by: Daniel Borkmann -Signed-off-by: Daniel Borkmann -(cherry picked from commit c131187db2d3fa2f8bf32fdf4e9a4ef805168467) -CVE-2017-17862 -Signed-off-by: Seth Forshee -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 2df70878d072d06f5bad0db3f2ee1ed47179dff8) -Signed-off-by: Fabian Grünbichler ---- - include/linux/bpf_verifier.h | 2 +- - kernel/bpf/verifier.c | 27 +++++++++++++++++++++++++++ - 2 files changed, 28 insertions(+), 1 deletion(-) - -diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h -index 8e5d31f6faef..effeaa64257d 100644 ---- a/include/linux/bpf_verifier.h -+++ b/include/linux/bpf_verifier.h -@@ -75,7 +75,7 @@ struct bpf_insn_aux_data { - struct bpf_map *map_ptr; /* pointer for call insn into lookup_elem */ - }; - int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ -- int converted_op_size; /* the valid value width after perceived conversion */ -+ bool seen; /* this insn was processed by the verifier */ - }; - - #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ -diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c -index 4ecb2e10c5e0..dab5ba668b97 100644 ---- a/kernel/bpf/verifier.c -+++ b/kernel/bpf/verifier.c -@@ -3152,6 +3152,7 @@ static int do_check(struct bpf_verifier_env *env) - if (err) - return err; - -+ env->insn_aux_data[insn_idx].seen = true; - if (class == BPF_ALU || class == BPF_ALU64) { - err = check_alu_op(env, insn); - if (err) -@@ -3342,6 +3343,7 @@ static int do_check(struct bpf_verifier_env *env) - return err; - - insn_idx++; -+ env->insn_aux_data[insn_idx].seen = true; - } else { - verbose("invalid BPF_LD mode\n"); - return -EINVAL; -@@ -3523,6 +3525,7 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, - u32 off, u32 cnt) - { - struct bpf_insn_aux_data *new_data, *old_data = 
env->insn_aux_data; -+ int i; - - if (cnt == 1) - return 0; -@@ -3532,6 +3535,8 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, - memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off); - memcpy(new_data + off + cnt - 1, old_data + off, - sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); -+ for (i = off; i < off + cnt - 1; i++) -+ new_data[i].seen = true; - env->insn_aux_data = new_data; - vfree(old_data); - return 0; -@@ -3550,6 +3555,25 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of - return new_prog; - } - -+/* The verifier does more data flow analysis than llvm and will not explore -+ * branches that are dead at run time. Malicious programs can have dead code -+ * too. Therefore replace all dead at-run-time code with nops. -+ */ -+static void sanitize_dead_code(struct bpf_verifier_env *env) -+{ -+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data; -+ struct bpf_insn nop = BPF_MOV64_REG(BPF_REG_0, BPF_REG_0); -+ struct bpf_insn *insn = env->prog->insnsi; -+ const int insn_cnt = env->prog->len; -+ int i; -+ -+ for (i = 0; i < insn_cnt; i++) { -+ if (aux_data[i].seen) -+ continue; -+ memcpy(insn + i, &nop, sizeof(nop)); -+ } -+} -+ - /* convert load instructions that access fields of 'struct __sk_buff' - * into sequence of instructions that access fields of 'struct sk_buff' - */ -@@ -3841,6 +3865,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) - while (pop_stack(env, NULL) >= 0); - free_states(env); - -+ if (ret == 0) -+ sanitize_dead_code(env); -+ - if (ret == 0) - /* program is valid, convert *(u32*)(ctx + off) accesses */ - ret = convert_ctx_accesses(env); --- -2.14.2 - diff --git a/patches/kernel/0232-UBUNTU-SAUCE-only-attempt-to-use-PCID-in-64-bit-buil.patch b/patches/kernel/0232-UBUNTU-SAUCE-only-attempt-to-use-PCID-in-64-bit-buil.patch deleted file mode 100644 index cbc69e6..0000000 --- 
a/patches/kernel/0232-UBUNTU-SAUCE-only-attempt-to-use-PCID-in-64-bit-buil.patch +++ /dev/null @@ -1,36 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andy Whitcroft -Date: Wed, 20 Dec 2017 13:33:50 +0000 -Subject: [PATCH] UBUNTU: SAUCE: only attempt to use PCID in 64 bit builds -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -[apw@canonical.com: need to review if this is still needed with the -latest patches.] -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit babace1d1ac19075498675cd787500cfa24d2b55) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/smpboot.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c -index 6ad8391b9866..398e8324fea4 100644 ---- a/arch/x86/kernel/smpboot.c -+++ b/arch/x86/kernel/smpboot.c -@@ -222,7 +222,7 @@ static void notrace start_secondary(void *unused) - * before cpu_init(), SMP booting is too fragile that we want to - * limit the things done here to the most necessary things. 
- */ -- if (boot_cpu_has(X86_FEATURE_PCID)) -+ if (IS_ENABLED(CONFIG_X86_64) && boot_cpu_has(X86_FEATURE_PCID)) - __write_cr4(__read_cr4() | X86_CR4_PCIDE); - cpu_init(); - x86_cpuinit.early_percpu_clock_init(); --- -2.14.2 - diff --git a/patches/kernel/0232-bpf-fix-branch-pruning-logic.patch b/patches/kernel/0232-bpf-fix-branch-pruning-logic.patch new file mode 100644 index 0000000..977e60a --- /dev/null +++ b/patches/kernel/0232-bpf-fix-branch-pruning-logic.patch @@ -0,0 +1,129 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Alexei Starovoitov +Date: Thu, 4 Jan 2018 08:01:24 -0600 +Subject: [PATCH] bpf: fix branch pruning logic +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +when the verifier detects that register contains a runtime constant +and it's compared with another constant it will prune exploration +of the branch that is guaranteed not to be taken at runtime. +This is all correct, but malicious program may be constructed +in such a way that it always has a constant comparison and +the other branch is never taken under any conditions. +In this case such path through the program will not be explored +by the verifier. It won't be taken at run-time either, but since +all instructions are JITed the malicious program may cause JITs +to complain about using reserved fields, etc. +To fix the issue we have to track the instructions explored by +the verifier and sanitize instructions that are dead at run time +with NOPs. We cannot reject such dead code, since llvm generates +it for valid C code, since it doesn't do as much data flow +analysis as the verifier does. 
+ +Fixes: 17a5267067f3 ("bpf: verifier (add verifier core)") +Signed-off-by: Alexei Starovoitov +Acked-by: Daniel Borkmann +Signed-off-by: Daniel Borkmann +(cherry picked from commit c131187db2d3fa2f8bf32fdf4e9a4ef805168467) +CVE-2017-17862 +Signed-off-by: Seth Forshee +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 2df70878d072d06f5bad0db3f2ee1ed47179dff8) +Signed-off-by: Fabian Grünbichler +--- + include/linux/bpf_verifier.h | 2 +- + kernel/bpf/verifier.c | 27 +++++++++++++++++++++++++++ + 2 files changed, 28 insertions(+), 1 deletion(-) + +diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h +index 8e5d31f6faef..effeaa64257d 100644 +--- a/include/linux/bpf_verifier.h ++++ b/include/linux/bpf_verifier.h +@@ -75,7 +75,7 @@ struct bpf_insn_aux_data { + struct bpf_map *map_ptr; /* pointer for call insn into lookup_elem */ + }; + int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ +- int converted_op_size; /* the valid value width after perceived conversion */ ++ bool seen; /* this insn was processed by the verifier */ + }; + + #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ +diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c +index 4ecb2e10c5e0..dab5ba668b97 100644 +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -3152,6 +3152,7 @@ static int do_check(struct bpf_verifier_env *env) + if (err) + return err; + ++ env->insn_aux_data[insn_idx].seen = true; + if (class == BPF_ALU || class == BPF_ALU64) { + err = check_alu_op(env, insn); + if (err) +@@ -3342,6 +3343,7 @@ static int do_check(struct bpf_verifier_env *env) + return err; + + insn_idx++; ++ env->insn_aux_data[insn_idx].seen = true; + } else { + verbose("invalid BPF_LD mode\n"); + return -EINVAL; +@@ -3523,6 +3525,7 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, + u32 off, u32 cnt) + { + struct bpf_insn_aux_data *new_data, *old_data = 
env->insn_aux_data; ++ int i; + + if (cnt == 1) + return 0; +@@ -3532,6 +3535,8 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, + memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off); + memcpy(new_data + off + cnt - 1, old_data + off, + sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); ++ for (i = off; i < off + cnt - 1; i++) ++ new_data[i].seen = true; + env->insn_aux_data = new_data; + vfree(old_data); + return 0; +@@ -3550,6 +3555,25 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of + return new_prog; + } + ++/* The verifier does more data flow analysis than llvm and will not explore ++ * branches that are dead at run time. Malicious programs can have dead code ++ * too. Therefore replace all dead at-run-time code with nops. ++ */ ++static void sanitize_dead_code(struct bpf_verifier_env *env) ++{ ++ struct bpf_insn_aux_data *aux_data = env->insn_aux_data; ++ struct bpf_insn nop = BPF_MOV64_REG(BPF_REG_0, BPF_REG_0); ++ struct bpf_insn *insn = env->prog->insnsi; ++ const int insn_cnt = env->prog->len; ++ int i; ++ ++ for (i = 0; i < insn_cnt; i++) { ++ if (aux_data[i].seen) ++ continue; ++ memcpy(insn + i, &nop, sizeof(nop)); ++ } ++} ++ + /* convert load instructions that access fields of 'struct __sk_buff' + * into sequence of instructions that access fields of 'struct sk_buff' + */ +@@ -3841,6 +3865,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) + while (pop_stack(env, NULL) >= 0); + free_states(env); + ++ if (ret == 0) ++ sanitize_dead_code(env); ++ + if (ret == 0) + /* program is valid, convert *(u32*)(ctx + off) accesses */ + ret = convert_ctx_accesses(env); +-- +2.14.2 + diff --git a/patches/kernel/0233-UBUNTU-SAUCE-BODGE-temporarily-disable-some-kprobe-t.patch b/patches/kernel/0233-UBUNTU-SAUCE-BODGE-temporarily-disable-some-kprobe-t.patch deleted file mode 100644 index 2467366..0000000 --- 
a/patches/kernel/0233-UBUNTU-SAUCE-BODGE-temporarily-disable-some-kprobe-t.patch +++ /dev/null @@ -1,273 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Colin Ian King -Date: Sat, 6 Jan 2018 10:26:31 +0000 -Subject: [PATCH] UBUNTU: SAUCE: BODGE: temporarily disable some kprobe trace - points which are cratering -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Most of the interrupt related trace points are cratering when enabled. -Simply turn them off temporarily while we are investigating this. - -CVE-2017-5754 -Based on work by Colin King -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 4ecc04d14ee2f9b46d3e252215a7622d7d47e974) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/trace/irq_vectors.h | 2 +- - arch/x86/kernel/apic/apic.c | 7 ------- - arch/x86/kernel/cpu/mcheck/mce_amd.c | 3 --- - arch/x86/kernel/cpu/mcheck/therm_throt.c | 3 --- - arch/x86/kernel/cpu/mcheck/threshold.c | 3 --- - arch/x86/kernel/irq.c | 3 --- - arch/x86/kernel/irq_work.c | 3 --- - arch/x86/kernel/smp.c | 7 ------- - arch/x86/mm/fault.c | 9 ++------- - 9 files changed, 3 insertions(+), 37 deletions(-) - -diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h -index 7825b4426e7e..cf529e274a14 100644 ---- a/arch/x86/include/asm/trace/irq_vectors.h -+++ b/arch/x86/include/asm/trace/irq_vectors.h -@@ -67,7 +67,7 @@ DEFINE_IRQ_VECTOR_EVENT(x86_platform_ipi); - * irq_work - called when entering/exiting a irq work interrupt - * vector handler - */ --DEFINE_IRQ_VECTOR_EVENT(irq_work); -+// DEFINE_IRQ_VECTOR_EVENT(irq_work); - - /* - * We must dis-allow sampling irq_work_exit() because perf event sampling -diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c -index bb63c1350524..4a018da7eca1 100644 ---- a/arch/x86/kernel/apic/apic.c -+++ b/arch/x86/kernel/apic/apic.c -@@ -35,7 +35,6 @@ - 
#include - #include - --#include - #include - #include - #include -@@ -1074,9 +1073,7 @@ __visible void __irq_entry smp_trace_apic_timer_interrupt(struct pt_regs *regs) - * interrupt lock, which is the WrongThing (tm) to do. - */ - entering_ack_irq(); -- trace_local_timer_entry(LOCAL_TIMER_VECTOR); - local_apic_timer_interrupt(); -- trace_local_timer_exit(LOCAL_TIMER_VECTOR); - exiting_irq(); - - set_irq_regs(old_regs); -@@ -1967,9 +1964,7 @@ __visible void __irq_entry smp_trace_spurious_interrupt(struct pt_regs *regs) - u8 vector = ~regs->orig_ax; - - entering_irq(); -- trace_spurious_apic_entry(vector); - __smp_spurious_interrupt(vector); -- trace_spurious_apic_exit(vector); - exiting_irq(); - } - -@@ -2023,9 +2018,7 @@ __visible void __irq_entry smp_error_interrupt(struct pt_regs *regs) - __visible void __irq_entry smp_trace_error_interrupt(struct pt_regs *regs) - { - entering_irq(); -- trace_error_apic_entry(ERROR_APIC_VECTOR); - __smp_error_interrupt(regs); -- trace_error_apic_exit(ERROR_APIC_VECTOR); - exiting_irq(); - } - -diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c -index 5ce1a5689162..c983db8ccdb8 100644 ---- a/arch/x86/kernel/cpu/mcheck/mce_amd.c -+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c -@@ -26,7 +26,6 @@ - #include - #include - #include --#include - - #define NR_BLOCKS 5 - #define THRESHOLD_MAX 0xFFF -@@ -787,9 +786,7 @@ asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(void) - asmlinkage __visible void __irq_entry smp_trace_deferred_error_interrupt(void) - { - entering_irq(); -- trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR); - __smp_deferred_error_interrupt(); -- trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR); - exiting_ack_irq(); - } - -diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c -index f7370abd33c6..f366a622e186 100644 ---- a/arch/x86/kernel/cpu/mcheck/therm_throt.c -+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c -@@ 
-28,7 +28,6 @@ - #include - #include - #include --#include - - /* How long to wait between reporting thermal events */ - #define CHECK_INTERVAL (300 * HZ) -@@ -408,9 +407,7 @@ asmlinkage __visible void __irq_entry - smp_trace_thermal_interrupt(struct pt_regs *regs) - { - entering_irq(); -- trace_thermal_apic_entry(THERMAL_APIC_VECTOR); - __smp_thermal_interrupt(); -- trace_thermal_apic_exit(THERMAL_APIC_VECTOR); - exiting_ack_irq(); - } - -diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c -index bb0e75eed10a..623f3e3515e0 100644 ---- a/arch/x86/kernel/cpu/mcheck/threshold.c -+++ b/arch/x86/kernel/cpu/mcheck/threshold.c -@@ -7,7 +7,6 @@ - #include - #include - #include --#include - - static void default_threshold_interrupt(void) - { -@@ -33,8 +32,6 @@ asmlinkage __visible void __irq_entry smp_threshold_interrupt(void) - asmlinkage __visible void __irq_entry smp_trace_threshold_interrupt(void) - { - entering_irq(); -- trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR); - __smp_threshold_interrupt(); -- trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR); - exiting_ack_irq(); - } -diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c -index a84142a910f3..792a49c3c6d9 100644 ---- a/arch/x86/kernel/irq.c -+++ b/arch/x86/kernel/irq.c -@@ -19,7 +19,6 @@ - #include - - #define CREATE_TRACE_POINTS --#include - - DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); - EXPORT_PER_CPU_SYMBOL(irq_stat); -@@ -327,9 +326,7 @@ __visible void __irq_entry smp_trace_x86_platform_ipi(struct pt_regs *regs) - struct pt_regs *old_regs = set_irq_regs(regs); - - entering_ack_irq(); -- trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR); - __smp_x86_platform_ipi(); -- trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR); - exiting_irq(); - set_irq_regs(old_regs); - } -diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c -index 275487872be2..06f12444c1b4 100644 ---- a/arch/x86/kernel/irq_work.c -+++ b/arch/x86/kernel/irq_work.c 
-@@ -8,7 +8,6 @@ - #include - #include - #include --#include - #include - - static inline void __smp_irq_work_interrupt(void) -@@ -27,9 +26,7 @@ __visible void __irq_entry smp_irq_work_interrupt(struct pt_regs *regs) - __visible void __irq_entry smp_trace_irq_work_interrupt(struct pt_regs *regs) - { - ipi_entering_ack_irq(); -- trace_irq_work_entry(IRQ_WORK_VECTOR); - __smp_irq_work_interrupt(); -- trace_irq_work_exit(IRQ_WORK_VECTOR); - exiting_irq(); - } - -diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c -index d798c0da451c..fbf36f1731ab 100644 ---- a/arch/x86/kernel/smp.c -+++ b/arch/x86/kernel/smp.c -@@ -31,7 +31,6 @@ - #include - #include - #include --#include - #include - #include - -@@ -280,9 +279,7 @@ __visible void __irq_entry smp_trace_reschedule_interrupt(struct pt_regs *regs) - * to nest. - */ - ipi_entering_ack_irq(); -- trace_reschedule_entry(RESCHEDULE_VECTOR); - __smp_reschedule_interrupt(); -- trace_reschedule_exit(RESCHEDULE_VECTOR); - exiting_irq(); - /* - * KVM uses this interrupt to force a cpu out of guest mode -@@ -306,9 +303,7 @@ __visible void __irq_entry - smp_trace_call_function_interrupt(struct pt_regs *regs) - { - ipi_entering_ack_irq(); -- trace_call_function_entry(CALL_FUNCTION_VECTOR); - __smp_call_function_interrupt(); -- trace_call_function_exit(CALL_FUNCTION_VECTOR); - exiting_irq(); - } - -@@ -330,9 +325,7 @@ __visible void __irq_entry - smp_trace_call_function_single_interrupt(struct pt_regs *regs) - { - ipi_entering_ack_irq(); -- trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); - __smp_call_function_single_interrupt(); -- trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR); - exiting_irq(); - } - -diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c -index d3a57e7ad311..4f6478d14d1f 100644 ---- a/arch/x86/mm/fault.c -+++ b/arch/x86/mm/fault.c -@@ -26,7 +26,6 @@ - #include /* vma_pkey() */ - - #define CREATE_TRACE_POINTS --#include - - /* - * Returns 0 if mmiotrace is disabled, or if the fault 
is not -@@ -1471,10 +1470,6 @@ static nokprobe_inline void - trace_page_fault_entries(unsigned long address, struct pt_regs *regs, - unsigned long error_code) - { -- if (user_mode(regs)) -- trace_page_fault_user(address, regs, error_code); -- else -- trace_page_fault_kernel(address, regs, error_code); - } - - /* -@@ -1491,8 +1486,8 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) - enum ctx_state prev_state; - - prev_state = exception_enter(); -- if (trace_irqvectors_enabled()) -- trace_page_fault_entries(address, regs, error_code); -+// if (trace_irqvectors_enabled()) -+// trace_page_fault_entries(address, regs, error_code); - - __do_page_fault(regs, error_code, address); - exception_exit(prev_state); --- -2.14.2 - diff --git a/patches/kernel/0233-UBUNTU-SAUCE-only-attempt-to-use-PCID-in-64-bit-buil.patch b/patches/kernel/0233-UBUNTU-SAUCE-only-attempt-to-use-PCID-in-64-bit-buil.patch new file mode 100644 index 0000000..cbc69e6 --- /dev/null +++ b/patches/kernel/0233-UBUNTU-SAUCE-only-attempt-to-use-PCID-in-64-bit-buil.patch @@ -0,0 +1,36 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andy Whitcroft +Date: Wed, 20 Dec 2017 13:33:50 +0000 +Subject: [PATCH] UBUNTU: SAUCE: only attempt to use PCID in 64 bit builds +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +[apw@canonical.com: need to review if this is still needed with the +latest patches.] 
+Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit babace1d1ac19075498675cd787500cfa24d2b55) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/smpboot.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c +index 6ad8391b9866..398e8324fea4 100644 +--- a/arch/x86/kernel/smpboot.c ++++ b/arch/x86/kernel/smpboot.c +@@ -222,7 +222,7 @@ static void notrace start_secondary(void *unused) + * before cpu_init(), SMP booting is too fragile that we want to + * limit the things done here to the most necessary things. + */ +- if (boot_cpu_has(X86_FEATURE_PCID)) ++ if (IS_ENABLED(CONFIG_X86_64) && boot_cpu_has(X86_FEATURE_PCID)) + __write_cr4(__read_cr4() | X86_CR4_PCIDE); + cpu_init(); + x86_cpuinit.early_percpu_clock_init(); +-- +2.14.2 + diff --git a/patches/kernel/0234-UBUNTU-SAUCE-BODGE-temporarily-disable-some-kprobe-t.patch b/patches/kernel/0234-UBUNTU-SAUCE-BODGE-temporarily-disable-some-kprobe-t.patch new file mode 100644 index 0000000..2467366 --- /dev/null +++ b/patches/kernel/0234-UBUNTU-SAUCE-BODGE-temporarily-disable-some-kprobe-t.patch @@ -0,0 +1,273 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Colin Ian King +Date: Sat, 6 Jan 2018 10:26:31 +0000 +Subject: [PATCH] UBUNTU: SAUCE: BODGE: temporarily disable some kprobe trace + points which are cratering +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Most of the interrupt related trace points are cratering when enabled. +Simply turn them off temporarily while we are investigating this. 
+ +CVE-2017-5754 +Based on work by Colin King +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 4ecc04d14ee2f9b46d3e252215a7622d7d47e974) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/trace/irq_vectors.h | 2 +- + arch/x86/kernel/apic/apic.c | 7 ------- + arch/x86/kernel/cpu/mcheck/mce_amd.c | 3 --- + arch/x86/kernel/cpu/mcheck/therm_throt.c | 3 --- + arch/x86/kernel/cpu/mcheck/threshold.c | 3 --- + arch/x86/kernel/irq.c | 3 --- + arch/x86/kernel/irq_work.c | 3 --- + arch/x86/kernel/smp.c | 7 ------- + arch/x86/mm/fault.c | 9 ++------- + 9 files changed, 3 insertions(+), 37 deletions(-) + +diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h +index 7825b4426e7e..cf529e274a14 100644 +--- a/arch/x86/include/asm/trace/irq_vectors.h ++++ b/arch/x86/include/asm/trace/irq_vectors.h +@@ -67,7 +67,7 @@ DEFINE_IRQ_VECTOR_EVENT(x86_platform_ipi); + * irq_work - called when entering/exiting a irq work interrupt + * vector handler + */ +-DEFINE_IRQ_VECTOR_EVENT(irq_work); ++// DEFINE_IRQ_VECTOR_EVENT(irq_work); + + /* + * We must dis-allow sampling irq_work_exit() because perf event sampling +diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c +index bb63c1350524..4a018da7eca1 100644 +--- a/arch/x86/kernel/apic/apic.c ++++ b/arch/x86/kernel/apic/apic.c +@@ -35,7 +35,6 @@ + #include + #include + +-#include + #include + #include + #include +@@ -1074,9 +1073,7 @@ __visible void __irq_entry smp_trace_apic_timer_interrupt(struct pt_regs *regs) + * interrupt lock, which is the WrongThing (tm) to do. 
+ */ + entering_ack_irq(); +- trace_local_timer_entry(LOCAL_TIMER_VECTOR); + local_apic_timer_interrupt(); +- trace_local_timer_exit(LOCAL_TIMER_VECTOR); + exiting_irq(); + + set_irq_regs(old_regs); +@@ -1967,9 +1964,7 @@ __visible void __irq_entry smp_trace_spurious_interrupt(struct pt_regs *regs) + u8 vector = ~regs->orig_ax; + + entering_irq(); +- trace_spurious_apic_entry(vector); + __smp_spurious_interrupt(vector); +- trace_spurious_apic_exit(vector); + exiting_irq(); + } + +@@ -2023,9 +2018,7 @@ __visible void __irq_entry smp_error_interrupt(struct pt_regs *regs) + __visible void __irq_entry smp_trace_error_interrupt(struct pt_regs *regs) + { + entering_irq(); +- trace_error_apic_entry(ERROR_APIC_VECTOR); + __smp_error_interrupt(regs); +- trace_error_apic_exit(ERROR_APIC_VECTOR); + exiting_irq(); + } + +diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c +index 5ce1a5689162..c983db8ccdb8 100644 +--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c ++++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c +@@ -26,7 +26,6 @@ + #include + #include + #include +-#include + + #define NR_BLOCKS 5 + #define THRESHOLD_MAX 0xFFF +@@ -787,9 +786,7 @@ asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(void) + asmlinkage __visible void __irq_entry smp_trace_deferred_error_interrupt(void) + { + entering_irq(); +- trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR); + __smp_deferred_error_interrupt(); +- trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR); + exiting_ack_irq(); + } + +diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c +index f7370abd33c6..f366a622e186 100644 +--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c ++++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c +@@ -28,7 +28,6 @@ + #include + #include + #include +-#include + + /* How long to wait between reporting thermal events */ + #define CHECK_INTERVAL (300 * HZ) +@@ -408,9 +407,7 @@ asmlinkage __visible void __irq_entry + 
smp_trace_thermal_interrupt(struct pt_regs *regs) + { + entering_irq(); +- trace_thermal_apic_entry(THERMAL_APIC_VECTOR); + __smp_thermal_interrupt(); +- trace_thermal_apic_exit(THERMAL_APIC_VECTOR); + exiting_ack_irq(); + } + +diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c +index bb0e75eed10a..623f3e3515e0 100644 +--- a/arch/x86/kernel/cpu/mcheck/threshold.c ++++ b/arch/x86/kernel/cpu/mcheck/threshold.c +@@ -7,7 +7,6 @@ + #include + #include + #include +-#include + + static void default_threshold_interrupt(void) + { +@@ -33,8 +32,6 @@ asmlinkage __visible void __irq_entry smp_threshold_interrupt(void) + asmlinkage __visible void __irq_entry smp_trace_threshold_interrupt(void) + { + entering_irq(); +- trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR); + __smp_threshold_interrupt(); +- trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR); + exiting_ack_irq(); + } +diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c +index a84142a910f3..792a49c3c6d9 100644 +--- a/arch/x86/kernel/irq.c ++++ b/arch/x86/kernel/irq.c +@@ -19,7 +19,6 @@ + #include + + #define CREATE_TRACE_POINTS +-#include + + DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); + EXPORT_PER_CPU_SYMBOL(irq_stat); +@@ -327,9 +326,7 @@ __visible void __irq_entry smp_trace_x86_platform_ipi(struct pt_regs *regs) + struct pt_regs *old_regs = set_irq_regs(regs); + + entering_ack_irq(); +- trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR); + __smp_x86_platform_ipi(); +- trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR); + exiting_irq(); + set_irq_regs(old_regs); + } +diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c +index 275487872be2..06f12444c1b4 100644 +--- a/arch/x86/kernel/irq_work.c ++++ b/arch/x86/kernel/irq_work.c +@@ -8,7 +8,6 @@ + #include + #include + #include +-#include + #include + + static inline void __smp_irq_work_interrupt(void) +@@ -27,9 +26,7 @@ __visible void __irq_entry smp_irq_work_interrupt(struct pt_regs *regs) 
+ __visible void __irq_entry smp_trace_irq_work_interrupt(struct pt_regs *regs) + { + ipi_entering_ack_irq(); +- trace_irq_work_entry(IRQ_WORK_VECTOR); + __smp_irq_work_interrupt(); +- trace_irq_work_exit(IRQ_WORK_VECTOR); + exiting_irq(); + } + +diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c +index d798c0da451c..fbf36f1731ab 100644 +--- a/arch/x86/kernel/smp.c ++++ b/arch/x86/kernel/smp.c +@@ -31,7 +31,6 @@ + #include + #include + #include +-#include + #include + #include + +@@ -280,9 +279,7 @@ __visible void __irq_entry smp_trace_reschedule_interrupt(struct pt_regs *regs) + * to nest. + */ + ipi_entering_ack_irq(); +- trace_reschedule_entry(RESCHEDULE_VECTOR); + __smp_reschedule_interrupt(); +- trace_reschedule_exit(RESCHEDULE_VECTOR); + exiting_irq(); + /* + * KVM uses this interrupt to force a cpu out of guest mode +@@ -306,9 +303,7 @@ __visible void __irq_entry + smp_trace_call_function_interrupt(struct pt_regs *regs) + { + ipi_entering_ack_irq(); +- trace_call_function_entry(CALL_FUNCTION_VECTOR); + __smp_call_function_interrupt(); +- trace_call_function_exit(CALL_FUNCTION_VECTOR); + exiting_irq(); + } + +@@ -330,9 +325,7 @@ __visible void __irq_entry + smp_trace_call_function_single_interrupt(struct pt_regs *regs) + { + ipi_entering_ack_irq(); +- trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); + __smp_call_function_single_interrupt(); +- trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR); + exiting_irq(); + } + +diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c +index d3a57e7ad311..4f6478d14d1f 100644 +--- a/arch/x86/mm/fault.c ++++ b/arch/x86/mm/fault.c +@@ -26,7 +26,6 @@ + #include /* vma_pkey() */ + + #define CREATE_TRACE_POINTS +-#include + + /* + * Returns 0 if mmiotrace is disabled, or if the fault is not +@@ -1471,10 +1470,6 @@ static nokprobe_inline void + trace_page_fault_entries(unsigned long address, struct pt_regs *regs, + unsigned long error_code) + { +- if (user_mode(regs)) +- 
trace_page_fault_user(address, regs, error_code); +- else +- trace_page_fault_kernel(address, regs, error_code); + } + + /* +@@ -1491,8 +1486,8 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) + enum ctx_state prev_state; + + prev_state = exception_enter(); +- if (trace_irqvectors_enabled()) +- trace_page_fault_entries(address, regs, error_code); ++// if (trace_irqvectors_enabled()) ++// trace_page_fault_entries(address, regs, error_code); + + __do_page_fault(regs, error_code, address); + exception_exit(prev_state); +-- +2.14.2 + diff --git a/patches/kernel/0234-kvm-vmx-Scrub-hardware-GPRs-at-VM-exit.patch b/patches/kernel/0234-kvm-vmx-Scrub-hardware-GPRs-at-VM-exit.patch deleted file mode 100644 index 4e1f81a..0000000 --- a/patches/kernel/0234-kvm-vmx-Scrub-hardware-GPRs-at-VM-exit.patch +++ /dev/null @@ -1,98 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Jim Mattson -Date: Wed, 3 Jan 2018 14:31:38 -0800 -Subject: [PATCH] kvm: vmx: Scrub hardware GPRs at VM-exit -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Guest GPR values are live in the hardware GPRs at VM-exit. Do not -leave any guest values in hardware GPRs after the guest GPR values are -saved to the vcpu_vmx structure. - -This is a partial mitigation for CVE 2017-5715 and CVE 2017-5753. -Specifically, it defeats the Project Zero PoC for CVE 2017-5715. 
- -Suggested-by: Eric Northup -Signed-off-by: Jim Mattson -Reviewed-by: Eric Northup -Reviewed-by: Benjamin Serebrin -Reviewed-by: Andrew Honig -[Paolo: Add AMD bits, Signed-off-by: Tom Lendacky ] -Signed-off-by: Paolo Bonzini -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kvm/svm.c | 19 +++++++++++++++++++ - arch/x86/kvm/vmx.c | 14 +++++++++++++- - 2 files changed, 32 insertions(+), 1 deletion(-) - -diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c -index af09baa3d736..92cd94d51e1f 100644 ---- a/arch/x86/kvm/svm.c -+++ b/arch/x86/kvm/svm.c -@@ -4924,6 +4924,25 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) - "mov %%r13, %c[r13](%[svm]) \n\t" - "mov %%r14, %c[r14](%[svm]) \n\t" - "mov %%r15, %c[r15](%[svm]) \n\t" -+#endif -+ /* -+ * Clear host registers marked as clobbered to prevent -+ * speculative use. -+ */ -+ "xor %%" _ASM_BX ", %%" _ASM_BX " \n\t" -+ "xor %%" _ASM_CX ", %%" _ASM_CX " \n\t" -+ "xor %%" _ASM_DX ", %%" _ASM_DX " \n\t" -+ "xor %%" _ASM_SI ", %%" _ASM_SI " \n\t" -+ "xor %%" _ASM_DI ", %%" _ASM_DI " \n\t" -+#ifdef CONFIG_X86_64 -+ "xor %%r8, %%r8 \n\t" -+ "xor %%r9, %%r9 \n\t" -+ "xor %%r10, %%r10 \n\t" -+ "xor %%r11, %%r11 \n\t" -+ "xor %%r12, %%r12 \n\t" -+ "xor %%r13, %%r13 \n\t" -+ "xor %%r14, %%r14 \n\t" -+ "xor %%r15, %%r15 \n\t" - #endif - "pop %%" _ASM_BP - : -diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c -index d61986a36575..9b4256fd589a 100644 ---- a/arch/x86/kvm/vmx.c -+++ b/arch/x86/kvm/vmx.c -@@ -9140,6 +9140,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) - /* Save guest registers, load host registers, keep flags */ - "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t" - "pop %0 \n\t" -+ "setbe %c[fail](%0)\n\t" - "mov %%" _ASM_AX ", %c[rax](%0) \n\t" - "mov %%" _ASM_BX ", %c[rbx](%0) \n\t" - __ASM_SIZE(pop) " %c[rcx](%0) \n\t" -@@ -9156,12 +9157,23 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) - "mov %%r13, %c[r13](%0) \n\t" - "mov %%r14, %c[r14](%0) \n\t" - "mov %%r15, %c[r15](%0) \n\t" -+ "xor 
%%r8d, %%r8d \n\t" -+ "xor %%r9d, %%r9d \n\t" -+ "xor %%r10d, %%r10d \n\t" -+ "xor %%r11d, %%r11d \n\t" -+ "xor %%r12d, %%r12d \n\t" -+ "xor %%r13d, %%r13d \n\t" -+ "xor %%r14d, %%r14d \n\t" -+ "xor %%r15d, %%r15d \n\t" - #endif - "mov %%cr2, %%" _ASM_AX " \n\t" - "mov %%" _ASM_AX ", %c[cr2](%0) \n\t" - -+ "xor %%eax, %%eax \n\t" -+ "xor %%ebx, %%ebx \n\t" -+ "xor %%esi, %%esi \n\t" -+ "xor %%edi, %%edi \n\t" - "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t" -- "setbe %c[fail](%0) \n\t" - ".pushsection .rodata \n\t" - ".global vmx_return \n\t" - "vmx_return: " _ASM_PTR " 2b \n\t" --- -2.14.2 - diff --git a/patches/kernel/0235-kvm-vmx-Scrub-hardware-GPRs-at-VM-exit.patch b/patches/kernel/0235-kvm-vmx-Scrub-hardware-GPRs-at-VM-exit.patch new file mode 100644 index 0000000..4e1f81a --- /dev/null +++ b/patches/kernel/0235-kvm-vmx-Scrub-hardware-GPRs-at-VM-exit.patch @@ -0,0 +1,98 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Jim Mattson +Date: Wed, 3 Jan 2018 14:31:38 -0800 +Subject: [PATCH] kvm: vmx: Scrub hardware GPRs at VM-exit +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Guest GPR values are live in the hardware GPRs at VM-exit. Do not +leave any guest values in hardware GPRs after the guest GPR values are +saved to the vcpu_vmx structure. + +This is a partial mitigation for CVE 2017-5715 and CVE 2017-5753. +Specifically, it defeats the Project Zero PoC for CVE 2017-5715. 
+ +Suggested-by: Eric Northup +Signed-off-by: Jim Mattson +Reviewed-by: Eric Northup +Reviewed-by: Benjamin Serebrin +Reviewed-by: Andrew Honig +[Paolo: Add AMD bits, Signed-off-by: Tom Lendacky ] +Signed-off-by: Paolo Bonzini +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kvm/svm.c | 19 +++++++++++++++++++ + arch/x86/kvm/vmx.c | 14 +++++++++++++- + 2 files changed, 32 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c +index af09baa3d736..92cd94d51e1f 100644 +--- a/arch/x86/kvm/svm.c ++++ b/arch/x86/kvm/svm.c +@@ -4924,6 +4924,25 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) + "mov %%r13, %c[r13](%[svm]) \n\t" + "mov %%r14, %c[r14](%[svm]) \n\t" + "mov %%r15, %c[r15](%[svm]) \n\t" ++#endif ++ /* ++ * Clear host registers marked as clobbered to prevent ++ * speculative use. ++ */ ++ "xor %%" _ASM_BX ", %%" _ASM_BX " \n\t" ++ "xor %%" _ASM_CX ", %%" _ASM_CX " \n\t" ++ "xor %%" _ASM_DX ", %%" _ASM_DX " \n\t" ++ "xor %%" _ASM_SI ", %%" _ASM_SI " \n\t" ++ "xor %%" _ASM_DI ", %%" _ASM_DI " \n\t" ++#ifdef CONFIG_X86_64 ++ "xor %%r8, %%r8 \n\t" ++ "xor %%r9, %%r9 \n\t" ++ "xor %%r10, %%r10 \n\t" ++ "xor %%r11, %%r11 \n\t" ++ "xor %%r12, %%r12 \n\t" ++ "xor %%r13, %%r13 \n\t" ++ "xor %%r14, %%r14 \n\t" ++ "xor %%r15, %%r15 \n\t" + #endif + "pop %%" _ASM_BP + : +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c +index d61986a36575..9b4256fd589a 100644 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -9140,6 +9140,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) + /* Save guest registers, load host registers, keep flags */ + "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t" + "pop %0 \n\t" ++ "setbe %c[fail](%0)\n\t" + "mov %%" _ASM_AX ", %c[rax](%0) \n\t" + "mov %%" _ASM_BX ", %c[rbx](%0) \n\t" + __ASM_SIZE(pop) " %c[rcx](%0) \n\t" +@@ -9156,12 +9157,23 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) + "mov %%r13, %c[r13](%0) \n\t" + "mov %%r14, %c[r14](%0) \n\t" + "mov %%r15, %c[r15](%0) \n\t" ++ "xor 
%%r8d, %%r8d \n\t" ++ "xor %%r9d, %%r9d \n\t" ++ "xor %%r10d, %%r10d \n\t" ++ "xor %%r11d, %%r11d \n\t" ++ "xor %%r12d, %%r12d \n\t" ++ "xor %%r13d, %%r13d \n\t" ++ "xor %%r14d, %%r14d \n\t" ++ "xor %%r15d, %%r15d \n\t" + #endif + "mov %%cr2, %%" _ASM_AX " \n\t" + "mov %%" _ASM_AX ", %c[cr2](%0) \n\t" + ++ "xor %%eax, %%eax \n\t" ++ "xor %%ebx, %%ebx \n\t" ++ "xor %%esi, %%esi \n\t" ++ "xor %%edi, %%edi \n\t" + "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t" +- "setbe %c[fail](%0) \n\t" + ".pushsection .rodata \n\t" + ".global vmx_return \n\t" + "vmx_return: " _ASM_PTR " 2b \n\t" +-- +2.14.2 + diff --git a/patches/kernel/0235-objtool-use-sh-to-invoke-sync-check.sh-in-the-Makefi.patch b/patches/kernel/0235-objtool-use-sh-to-invoke-sync-check.sh-in-the-Makefi.patch deleted file mode 100644 index f343025..0000000 --- a/patches/kernel/0235-objtool-use-sh-to-invoke-sync-check.sh-in-the-Makefi.patch +++ /dev/null @@ -1,35 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Marcelo Henrique Cerri -Date: Sat, 6 Jan 2018 17:50:34 -0200 -Subject: [PATCH] objtool: use sh to invoke sync-check.sh in the Makefile -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -This avoids build failures when building debian packages. 
- -Signed-off-by: Marcelo Henrique Cerri -(cherry picked from commit 6abf30ed000f0da24985295c206cc6f08a311301) -Signed-off-by: Fabian Grünbichler ---- - tools/objtool/Makefile | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile -index 5c71bae01064..fe022f68638d 100644 ---- a/tools/objtool/Makefile -+++ b/tools/objtool/Makefile -@@ -44,7 +44,7 @@ $(OBJTOOL_IN): fixdep FORCE - @$(MAKE) $(build)=objtool - - $(OBJTOOL): $(LIBSUBCMD) $(OBJTOOL_IN) -- @./sync-check.sh -+ @sh ./sync-check.sh - $(QUIET_LINK)$(CC) $(OBJTOOL_IN) $(LDFLAGS) -o $@ - - --- -2.14.2 - diff --git a/patches/kernel/0236-objtool-use-sh-to-invoke-sync-check.sh-in-the-Makefi.patch b/patches/kernel/0236-objtool-use-sh-to-invoke-sync-check.sh-in-the-Makefi.patch new file mode 100644 index 0000000..f343025 --- /dev/null +++ b/patches/kernel/0236-objtool-use-sh-to-invoke-sync-check.sh-in-the-Makefi.patch @@ -0,0 +1,35 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Marcelo Henrique Cerri +Date: Sat, 6 Jan 2018 17:50:34 -0200 +Subject: [PATCH] objtool: use sh to invoke sync-check.sh in the Makefile +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +This avoids build failures when building debian packages. 
+ +Signed-off-by: Marcelo Henrique Cerri +(cherry picked from commit 6abf30ed000f0da24985295c206cc6f08a311301) +Signed-off-by: Fabian Grünbichler +--- + tools/objtool/Makefile | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile +index 5c71bae01064..fe022f68638d 100644 +--- a/tools/objtool/Makefile ++++ b/tools/objtool/Makefile +@@ -44,7 +44,7 @@ $(OBJTOOL_IN): fixdep FORCE + @$(MAKE) $(build)=objtool + + $(OBJTOOL): $(LIBSUBCMD) $(OBJTOOL_IN) +- @./sync-check.sh ++ @sh ./sync-check.sh + $(QUIET_LINK)$(CC) $(OBJTOOL_IN) $(LDFLAGS) -o $@ + + +-- +2.14.2 + diff --git a/patches/kernel/0236-x86-tlb-Drop-the-_GPL-from-the-cpu_tlbstate-export.patch b/patches/kernel/0236-x86-tlb-Drop-the-_GPL-from-the-cpu_tlbstate-export.patch deleted file mode 100644 index d72b125..0000000 --- a/patches/kernel/0236-x86-tlb-Drop-the-_GPL-from-the-cpu_tlbstate-export.patch +++ /dev/null @@ -1,53 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Thu, 4 Jan 2018 22:19:04 +0100 -Subject: [PATCH] x86/tlb: Drop the _GPL from the cpu_tlbstate export -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -commit 1e5476815fd7f98b888e01a0f9522b63085f96c9 upstream. - -The recent changes for PTI touch cpu_tlbstate from various tlb_flush -inlines. cpu_tlbstate is exported as GPL symbol, so this causes a -regression when building out of tree drivers for certain graphics cards. - -Aside of that the export was wrong since it was introduced as it should -have been EXPORT_PER_CPU_SYMBOL_GPL(). - -Use the correct PER_CPU export and drop the _GPL to restore the previous -state which allows users to utilize the cards they payed for. - -As always I'm really thrilled to make this kind of change to support the -#friends (or however the hot hashtag of today is spelled) from that closet -sauce graphics corp. 
- -Fixes: 1e02ce4cccdc ("x86: Store a per-cpu shadow copy of CR4") -Fixes: 6fd166aae78c ("x86/mm: Use/Fix PCID to optimize user/kernel switches") -Reported-by: Kees Cook -Signed-off-by: Thomas Gleixner -Cc: Greg Kroah-Hartman -Cc: Peter Zijlstra -Cc: Andy Lutomirski -Signed-off-by: Greg Kroah-Hartman -Signed-off-by: Fabian Grünbichler ---- - arch/x86/mm/init.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c -index 80259ad8c386..6b462a472a7b 100644 ---- a/arch/x86/mm/init.c -+++ b/arch/x86/mm/init.c -@@ -870,7 +870,7 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { - .next_asid = 1, - .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ - }; --EXPORT_SYMBOL_GPL(cpu_tlbstate); -+EXPORT_PER_CPU_SYMBOL(cpu_tlbstate); - - void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache) - { --- -2.14.2 - diff --git a/patches/kernel/0237-x86-events-intel-ds-Use-the-proper-cache-flush-metho.patch b/patches/kernel/0237-x86-events-intel-ds-Use-the-proper-cache-flush-metho.patch deleted file mode 100644 index b3276d5..0000000 --- a/patches/kernel/0237-x86-events-intel-ds-Use-the-proper-cache-flush-metho.patch +++ /dev/null @@ -1,105 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Peter Zijlstra -Date: Thu, 4 Jan 2018 18:07:12 +0100 -Subject: [PATCH] x86/events/intel/ds: Use the proper cache flush method for - mapping ds buffers -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -commit 42f3bdc5dd962a5958bc024c1e1444248a6b8b4a upstream. 
- -Thomas reported the following warning: - - BUG: using smp_processor_id() in preemptible [00000000] code: ovsdb-server/4498 - caller is native_flush_tlb_single+0x57/0xc0 - native_flush_tlb_single+0x57/0xc0 - __set_pte_vaddr+0x2d/0x40 - set_pte_vaddr+0x2f/0x40 - cea_set_pte+0x30/0x40 - ds_update_cea.constprop.4+0x4d/0x70 - reserve_ds_buffers+0x159/0x410 - x86_reserve_hardware+0x150/0x160 - x86_pmu_event_init+0x3e/0x1f0 - perf_try_init_event+0x69/0x80 - perf_event_alloc+0x652/0x740 - SyS_perf_event_open+0x3f6/0xd60 - do_syscall_64+0x5c/0x190 - -set_pte_vaddr is used to map the ds buffers into the cpu entry area, but -there are two problems with that: - - 1) The resulting flush is not supposed to be called in preemptible context - - 2) The cpu entry area is supposed to be per CPU, but the debug store - buffers are mapped for all CPUs so these mappings need to be flushed - globally. - -Add the necessary preemption protection across the mapping code and flush -TLBs globally. - -Fixes: c1961a4631da ("x86/events/intel/ds: Map debug buffers in cpu_entry_area") -Reported-by: Thomas Zeitlhofer -Signed-off-by: Peter Zijlstra -Signed-off-by: Thomas Gleixner -Tested-by: Thomas Zeitlhofer -Cc: Greg Kroah-Hartman -Cc: Hugh Dickins -Link: https://lkml.kernel.org/r/20180104170712.GB3040@hirez.programming.kicks-ass.net -Signed-off-by: Greg Kroah-Hartman -Signed-off-by: Fabian Grünbichler ---- - arch/x86/events/intel/ds.c | 16 ++++++++++++++++ - 1 file changed, 16 insertions(+) - -diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c -index 85df1f12c49e..1d236666ee0e 100644 ---- a/arch/x86/events/intel/ds.c -+++ b/arch/x86/events/intel/ds.c -@@ -4,6 +4,7 @@ - - #include - #include -+#include - #include - - #include "../perf_event.h" -@@ -282,20 +283,35 @@ static DEFINE_PER_CPU(void *, insn_buffer); - - static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot) - { -+ unsigned long start = (unsigned long)cea; - phys_addr_t pa; - size_t msz = 0; - - 
pa = virt_to_phys(addr); -+ -+ preempt_disable(); - for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE) - cea_set_pte(cea, pa, prot); -+ -+ /* -+ * This is a cross-CPU update of the cpu_entry_area, we must shoot down -+ * all TLB entries for it. -+ */ -+ flush_tlb_kernel_range(start, start + size); -+ preempt_enable(); - } - - static void ds_clear_cea(void *cea, size_t size) - { -+ unsigned long start = (unsigned long)cea; - size_t msz = 0; - -+ preempt_disable(); - for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE) - cea_set_pte(cea, 0, PAGE_NONE); -+ -+ flush_tlb_kernel_range(start, start + size); -+ preempt_enable(); - } - - static void *dsalloc_pages(size_t size, gfp_t flags, int cpu) --- -2.14.2 - diff --git a/patches/kernel/0237-x86-tlb-Drop-the-_GPL-from-the-cpu_tlbstate-export.patch b/patches/kernel/0237-x86-tlb-Drop-the-_GPL-from-the-cpu_tlbstate-export.patch new file mode 100644 index 0000000..d72b125 --- /dev/null +++ b/patches/kernel/0237-x86-tlb-Drop-the-_GPL-from-the-cpu_tlbstate-export.patch @@ -0,0 +1,53 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Thu, 4 Jan 2018 22:19:04 +0100 +Subject: [PATCH] x86/tlb: Drop the _GPL from the cpu_tlbstate export +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit 1e5476815fd7f98b888e01a0f9522b63085f96c9 upstream. + +The recent changes for PTI touch cpu_tlbstate from various tlb_flush +inlines. cpu_tlbstate is exported as GPL symbol, so this causes a +regression when building out of tree drivers for certain graphics cards. + +Aside of that the export was wrong since it was introduced as it should +have been EXPORT_PER_CPU_SYMBOL_GPL(). + +Use the correct PER_CPU export and drop the _GPL to restore the previous +state which allows users to utilize the cards they payed for. 
+ +As always I'm really thrilled to make this kind of change to support the +#friends (or however the hot hashtag of today is spelled) from that closet +sauce graphics corp. + +Fixes: 1e02ce4cccdc ("x86: Store a per-cpu shadow copy of CR4") +Fixes: 6fd166aae78c ("x86/mm: Use/Fix PCID to optimize user/kernel switches") +Reported-by: Kees Cook +Signed-off-by: Thomas Gleixner +Cc: Greg Kroah-Hartman +Cc: Peter Zijlstra +Cc: Andy Lutomirski +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Fabian Grünbichler +--- + arch/x86/mm/init.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c +index 80259ad8c386..6b462a472a7b 100644 +--- a/arch/x86/mm/init.c ++++ b/arch/x86/mm/init.c +@@ -870,7 +870,7 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { + .next_asid = 1, + .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ + }; +-EXPORT_SYMBOL_GPL(cpu_tlbstate); ++EXPORT_PER_CPU_SYMBOL(cpu_tlbstate); + + void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache) + { +-- +2.14.2 + diff --git a/patches/kernel/0238-x86-events-intel-ds-Use-the-proper-cache-flush-metho.patch b/patches/kernel/0238-x86-events-intel-ds-Use-the-proper-cache-flush-metho.patch new file mode 100644 index 0000000..b3276d5 --- /dev/null +++ b/patches/kernel/0238-x86-events-intel-ds-Use-the-proper-cache-flush-metho.patch @@ -0,0 +1,105 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra +Date: Thu, 4 Jan 2018 18:07:12 +0100 +Subject: [PATCH] x86/events/intel/ds: Use the proper cache flush method for + mapping ds buffers +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit 42f3bdc5dd962a5958bc024c1e1444248a6b8b4a upstream. 
+ +Thomas reported the following warning: + + BUG: using smp_processor_id() in preemptible [00000000] code: ovsdb-server/4498 + caller is native_flush_tlb_single+0x57/0xc0 + native_flush_tlb_single+0x57/0xc0 + __set_pte_vaddr+0x2d/0x40 + set_pte_vaddr+0x2f/0x40 + cea_set_pte+0x30/0x40 + ds_update_cea.constprop.4+0x4d/0x70 + reserve_ds_buffers+0x159/0x410 + x86_reserve_hardware+0x150/0x160 + x86_pmu_event_init+0x3e/0x1f0 + perf_try_init_event+0x69/0x80 + perf_event_alloc+0x652/0x740 + SyS_perf_event_open+0x3f6/0xd60 + do_syscall_64+0x5c/0x190 + +set_pte_vaddr is used to map the ds buffers into the cpu entry area, but +there are two problems with that: + + 1) The resulting flush is not supposed to be called in preemptible context + + 2) The cpu entry area is supposed to be per CPU, but the debug store + buffers are mapped for all CPUs so these mappings need to be flushed + globally. + +Add the necessary preemption protection across the mapping code and flush +TLBs globally. + +Fixes: c1961a4631da ("x86/events/intel/ds: Map debug buffers in cpu_entry_area") +Reported-by: Thomas Zeitlhofer +Signed-off-by: Peter Zijlstra +Signed-off-by: Thomas Gleixner +Tested-by: Thomas Zeitlhofer +Cc: Greg Kroah-Hartman +Cc: Hugh Dickins +Link: https://lkml.kernel.org/r/20180104170712.GB3040@hirez.programming.kicks-ass.net +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Fabian Grünbichler +--- + arch/x86/events/intel/ds.c | 16 ++++++++++++++++ + 1 file changed, 16 insertions(+) + +diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c +index 85df1f12c49e..1d236666ee0e 100644 +--- a/arch/x86/events/intel/ds.c ++++ b/arch/x86/events/intel/ds.c +@@ -4,6 +4,7 @@ + + #include + #include ++#include + #include + + #include "../perf_event.h" +@@ -282,20 +283,35 @@ static DEFINE_PER_CPU(void *, insn_buffer); + + static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot) + { ++ unsigned long start = (unsigned long)cea; + phys_addr_t pa; + size_t msz = 0; + + 
pa = virt_to_phys(addr); ++ ++ preempt_disable(); + for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE) + cea_set_pte(cea, pa, prot); ++ ++ /* ++ * This is a cross-CPU update of the cpu_entry_area, we must shoot down ++ * all TLB entries for it. ++ */ ++ flush_tlb_kernel_range(start, start + size); ++ preempt_enable(); + } + + static void ds_clear_cea(void *cea, size_t size) + { ++ unsigned long start = (unsigned long)cea; + size_t msz = 0; + ++ preempt_disable(); + for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE) + cea_set_pte(cea, 0, PAGE_NONE); ++ ++ flush_tlb_kernel_range(start, start + size); ++ preempt_enable(); + } + + static void *dsalloc_pages(size_t size, gfp_t flags, int cpu) +-- +2.14.2 + diff --git a/patches/kernel/0238-x86-mm-Set-MODULES_END-to-0xffffffffff000000.patch b/patches/kernel/0238-x86-mm-Set-MODULES_END-to-0xffffffffff000000.patch deleted file mode 100644 index 4b07728..0000000 --- a/patches/kernel/0238-x86-mm-Set-MODULES_END-to-0xffffffffff000000.patch +++ /dev/null @@ -1,104 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andrey Ryabinin -Date: Thu, 28 Dec 2017 19:06:20 +0300 -Subject: [PATCH] x86/mm: Set MODULES_END to 0xffffffffff000000 -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -commit f5a40711fa58f1c109165a4fec6078bf2dfd2bdc upstream. - -Since f06bdd4001c2 ("x86/mm: Adapt MODULES_END based on fixmap section size") -kasan_mem_to_shadow(MODULES_END) could be not aligned to a page boundary. - -So passing page unaligned address to kasan_populate_zero_shadow() have two -possible effects: - -1) It may leave one page hole in supposed to be populated area. 
After commit - 21506525fb8d ("x86/kasan/64: Teach KASAN about the cpu_entry_area") that - hole happens to be in the shadow covering fixmap area and leads to crash: - - BUG: unable to handle kernel paging request at fffffbffffe8ee04 - RIP: 0010:check_memory_region+0x5c/0x190 - - Call Trace: - - memcpy+0x1f/0x50 - ghes_copy_tofrom_phys+0xab/0x180 - ghes_read_estatus+0xfb/0x280 - ghes_notify_nmi+0x2b2/0x410 - nmi_handle+0x115/0x2c0 - default_do_nmi+0x57/0x110 - do_nmi+0xf8/0x150 - end_repeat_nmi+0x1a/0x1e - -Note, the crash likely disappeared after commit 92a0f81d8957, which -changed kasan_populate_zero_shadow() call the way it was before -commit 21506525fb8d. - -2) Attempt to load module near MODULES_END will fail, because - __vmalloc_node_range() called from kasan_module_alloc() will hit the - WARN_ON(!pte_none(*pte)) in the vmap_pte_range() and bail out with error. - -To fix this we need to make kasan_mem_to_shadow(MODULES_END) page aligned -which means that MODULES_END should be 8*PAGE_SIZE aligned. - -The whole point of commit f06bdd4001c2 was to move MODULES_END down if -NR_CPUS is big, so the cpu_entry_area takes a lot of space. -But since 92a0f81d8957 ("x86/cpu_entry_area: Move it out of the fixmap") -the cpu_entry_area is no longer in fixmap, so we could just set -MODULES_END to a fixed 8*PAGE_SIZE aligned address. 
- -Fixes: f06bdd4001c2 ("x86/mm: Adapt MODULES_END based on fixmap section size") -Reported-by: Jakub Kicinski -Signed-off-by: Andrey Ryabinin -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Thomas Garnier -Link: https://lkml.kernel.org/r/20171228160620.23818-1-aryabinin@virtuozzo.com -Signed-off-by: Greg Kroah-Hartman -Signed-off-by: Fabian Grünbichler ---- - Documentation/x86/x86_64/mm.txt | 5 +---- - arch/x86/include/asm/pgtable_64_types.h | 2 +- - 2 files changed, 2 insertions(+), 5 deletions(-) - -diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt -index ad41b3813f0a..ddd5ffd31bd0 100644 ---- a/Documentation/x86/x86_64/mm.txt -+++ b/Documentation/x86/x86_64/mm.txt -@@ -43,7 +43,7 @@ ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks - ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space - ... unused hole ... - ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0 --ffffffffa0000000 - [fixmap start] (~1526 MB) module mapping space -+ffffffffa0000000 - fffffffffeffffff (1520 MB) module mapping space - [fixmap start] - ffffffffff5fffff kernel-internal fixmap range - ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI - ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole -@@ -67,9 +67,6 @@ memory window (this size is arbitrary, it can be raised later if needed). - The mappings are not part of any other kernel PGD and are only available - during EFI runtime calls. - --The module mapping space size changes based on the CONFIG requirements for the --following fixmap section. -- - Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all - physical memory, vmalloc/ioremap space and virtual memory map are randomized. - Their order is preserved but their base will be offset early at boot time. 
-diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h -index e8a809ee0bb6..c92bd73b1e46 100644 ---- a/arch/x86/include/asm/pgtable_64_types.h -+++ b/arch/x86/include/asm/pgtable_64_types.h -@@ -103,7 +103,7 @@ typedef struct { pteval_t pte; } pte_t; - - #define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) - /* The module sections ends with the start of the fixmap */ --#define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1) -+#define MODULES_END _AC(0xffffffffff000000, UL) - #define MODULES_LEN (MODULES_END - MODULES_VADDR) - - #define ESPFIX_PGD_ENTRY _AC(-2, UL) --- -2.14.2 - diff --git a/patches/kernel/0239-x86-mm-Map-cpu_entry_area-at-the-same-place-on-4-5-l.patch b/patches/kernel/0239-x86-mm-Map-cpu_entry_area-at-the-same-place-on-4-5-l.patch deleted file mode 100644 index 8f7ff69..0000000 --- a/patches/kernel/0239-x86-mm-Map-cpu_entry_area-at-the-same-place-on-4-5-l.patch +++ /dev/null @@ -1,98 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Thu, 4 Jan 2018 13:01:40 +0100 -Subject: [PATCH] x86/mm: Map cpu_entry_area at the same place on 4/5 level -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -commit f2078904810373211fb15f91888fba14c01a4acc upstream. - -There is no reason for 4 and 5 level pagetables to have a different -layout. It just makes determining vaddr_end for KASLR harder than -necessary. 
- -Fixes: 92a0f81d8957 ("x86/cpu_entry_area: Move it out of the fixmap") -Signed-off-by: Thomas Gleixner -Cc: Andy Lutomirski -Cc: Benjamin Gilbert -Cc: Greg Kroah-Hartman -Cc: Dave Hansen -Cc: Peter Zijlstra -Cc: Thomas Garnier , -Cc: Alexander Kuleshov -Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801041320360.1771@nanos -Signed-off-by: Greg Kroah-Hartman -Signed-off-by: Fabian Grünbichler ---- - Documentation/x86/x86_64/mm.txt | 7 ++++--- - arch/x86/include/asm/pgtable_64_types.h | 4 ++-- - arch/x86/mm/dump_pagetables.c | 2 +- - 3 files changed, 7 insertions(+), 6 deletions(-) - -diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt -index ddd5ffd31bd0..f7dabe1f01e9 100644 ---- a/Documentation/x86/x86_64/mm.txt -+++ b/Documentation/x86/x86_64/mm.txt -@@ -12,8 +12,8 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB) - ... unused hole ... - ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB) - ... unused hole ... --fffffe0000000000 - fffffe7fffffffff (=39 bits) LDT remap for PTI --fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping -+fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping -+fffffe8000000000 - fffffeffffffffff (=39 bits) LDT remap for PTI - ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks - ... unused hole ... - ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space -@@ -37,7 +37,8 @@ ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB) - ... unused hole ... - ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB) - ... unused hole ... --fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping -+fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping -+... unused hole ... - ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks - ... unused hole ... 
- ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space -diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h -index c92bd73b1e46..0dd48d17a4a1 100644 ---- a/arch/x86/include/asm/pgtable_64_types.h -+++ b/arch/x86/include/asm/pgtable_64_types.h -@@ -87,7 +87,7 @@ typedef struct { pteval_t pte; } pte_t; - # define VMALLOC_SIZE_TB _AC(32, UL) - # define __VMALLOC_BASE _AC(0xffffc90000000000, UL) - # define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) --# define LDT_PGD_ENTRY _AC(-4, UL) -+# define LDT_PGD_ENTRY _AC(-3, UL) - # define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) - #endif - -@@ -109,7 +109,7 @@ typedef struct { pteval_t pte; } pte_t; - #define ESPFIX_PGD_ENTRY _AC(-2, UL) - #define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT) - --#define CPU_ENTRY_AREA_PGD _AC(-3, UL) -+#define CPU_ENTRY_AREA_PGD _AC(-4, UL) - #define CPU_ENTRY_AREA_BASE (CPU_ENTRY_AREA_PGD << P4D_SHIFT) - - #define EFI_VA_START ( -4 * (_AC(1, UL) << 30)) -diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c -index 12b93d350480..a764bf6f3473 100644 ---- a/arch/x86/mm/dump_pagetables.c -+++ b/arch/x86/mm/dump_pagetables.c -@@ -61,10 +61,10 @@ enum address_markers_idx { - KASAN_SHADOW_START_NR, - KASAN_SHADOW_END_NR, - #endif -+ CPU_ENTRY_AREA_NR, - #if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL) - LDT_NR, - #endif -- CPU_ENTRY_AREA_NR, - #ifdef CONFIG_X86_ESPFIX64 - ESPFIX_START_NR, - #endif --- -2.14.2 - diff --git a/patches/kernel/0239-x86-mm-Set-MODULES_END-to-0xffffffffff000000.patch b/patches/kernel/0239-x86-mm-Set-MODULES_END-to-0xffffffffff000000.patch new file mode 100644 index 0000000..4b07728 --- /dev/null +++ b/patches/kernel/0239-x86-mm-Set-MODULES_END-to-0xffffffffff000000.patch @@ -0,0 +1,104 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andrey Ryabinin +Date: Thu, 28 Dec 2017 19:06:20 +0300 +Subject: [PATCH] x86/mm: Set MODULES_END to 
0xffffffffff000000 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit f5a40711fa58f1c109165a4fec6078bf2dfd2bdc upstream. + +Since f06bdd4001c2 ("x86/mm: Adapt MODULES_END based on fixmap section size") +kasan_mem_to_shadow(MODULES_END) could be not aligned to a page boundary. + +So passing page unaligned address to kasan_populate_zero_shadow() have two +possible effects: + +1) It may leave one page hole in supposed to be populated area. After commit + 21506525fb8d ("x86/kasan/64: Teach KASAN about the cpu_entry_area") that + hole happens to be in the shadow covering fixmap area and leads to crash: + + BUG: unable to handle kernel paging request at fffffbffffe8ee04 + RIP: 0010:check_memory_region+0x5c/0x190 + + Call Trace: + + memcpy+0x1f/0x50 + ghes_copy_tofrom_phys+0xab/0x180 + ghes_read_estatus+0xfb/0x280 + ghes_notify_nmi+0x2b2/0x410 + nmi_handle+0x115/0x2c0 + default_do_nmi+0x57/0x110 + do_nmi+0xf8/0x150 + end_repeat_nmi+0x1a/0x1e + +Note, the crash likely disappeared after commit 92a0f81d8957, which +changed kasan_populate_zero_shadow() call the way it was before +commit 21506525fb8d. + +2) Attempt to load module near MODULES_END will fail, because + __vmalloc_node_range() called from kasan_module_alloc() will hit the + WARN_ON(!pte_none(*pte)) in the vmap_pte_range() and bail out with error. + +To fix this we need to make kasan_mem_to_shadow(MODULES_END) page aligned +which means that MODULES_END should be 8*PAGE_SIZE aligned. + +The whole point of commit f06bdd4001c2 was to move MODULES_END down if +NR_CPUS is big, so the cpu_entry_area takes a lot of space. +But since 92a0f81d8957 ("x86/cpu_entry_area: Move it out of the fixmap") +the cpu_entry_area is no longer in fixmap, so we could just set +MODULES_END to a fixed 8*PAGE_SIZE aligned address. 
+ +Fixes: f06bdd4001c2 ("x86/mm: Adapt MODULES_END based on fixmap section size") +Reported-by: Jakub Kicinski +Signed-off-by: Andrey Ryabinin +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Thomas Garnier +Link: https://lkml.kernel.org/r/20171228160620.23818-1-aryabinin@virtuozzo.com +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Fabian Grünbichler +--- + Documentation/x86/x86_64/mm.txt | 5 +---- + arch/x86/include/asm/pgtable_64_types.h | 2 +- + 2 files changed, 2 insertions(+), 5 deletions(-) + +diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt +index ad41b3813f0a..ddd5ffd31bd0 100644 +--- a/Documentation/x86/x86_64/mm.txt ++++ b/Documentation/x86/x86_64/mm.txt +@@ -43,7 +43,7 @@ ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks + ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space + ... unused hole ... + ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0 +-ffffffffa0000000 - [fixmap start] (~1526 MB) module mapping space ++ffffffffa0000000 - fffffffffeffffff (1520 MB) module mapping space + [fixmap start] - ffffffffff5fffff kernel-internal fixmap range + ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI + ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole +@@ -67,9 +67,6 @@ memory window (this size is arbitrary, it can be raised later if needed). + The mappings are not part of any other kernel PGD and are only available + during EFI runtime calls. + +-The module mapping space size changes based on the CONFIG requirements for the +-following fixmap section. +- + Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all + physical memory, vmalloc/ioremap space and virtual memory map are randomized. + Their order is preserved but their base will be offset early at boot time. 
+diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h +index e8a809ee0bb6..c92bd73b1e46 100644 +--- a/arch/x86/include/asm/pgtable_64_types.h ++++ b/arch/x86/include/asm/pgtable_64_types.h +@@ -103,7 +103,7 @@ typedef struct { pteval_t pte; } pte_t; + + #define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) + /* The module sections ends with the start of the fixmap */ +-#define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1) ++#define MODULES_END _AC(0xffffffffff000000, UL) + #define MODULES_LEN (MODULES_END - MODULES_VADDR) + + #define ESPFIX_PGD_ENTRY _AC(-2, UL) +-- +2.14.2 + diff --git a/patches/kernel/0240-x86-kaslr-Fix-the-vaddr_end-mess.patch b/patches/kernel/0240-x86-kaslr-Fix-the-vaddr_end-mess.patch deleted file mode 100644 index 112e421..0000000 --- a/patches/kernel/0240-x86-kaslr-Fix-the-vaddr_end-mess.patch +++ /dev/null @@ -1,144 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Thu, 4 Jan 2018 12:32:03 +0100 -Subject: [PATCH] x86/kaslr: Fix the vaddr_end mess -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -commit 1dddd25125112ba49706518ac9077a1026a18f37 upstream. - -vaddr_end for KASLR is only documented in the KASLR code itself and is -adjusted depending on config options. So it's not surprising that a change -of the memory layout causes KASLR to have the wrong vaddr_end. This can map -arbitrary stuff into other areas causing hard to understand problems. - -Remove the whole ifdef magic and define the start of the cpu_entry_area to -be the end of the KASLR vaddr range. - -Add documentation to that effect. 
- -Fixes: 92a0f81d8957 ("x86/cpu_entry_area: Move it out of the fixmap") -Reported-by: Benjamin Gilbert -Signed-off-by: Thomas Gleixner -Tested-by: Benjamin Gilbert -Cc: Andy Lutomirski -Cc: Greg Kroah-Hartman -Cc: Dave Hansen -Cc: Peter Zijlstra -Cc: Thomas Garnier , -Cc: Alexander Kuleshov -Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801041320360.1771@nanos -Signed-off-by: Greg Kroah-Hartman -Signed-off-by: Fabian Grünbichler ---- - Documentation/x86/x86_64/mm.txt | 6 ++++++ - arch/x86/include/asm/pgtable_64_types.h | 8 +++++++- - arch/x86/mm/kaslr.c | 32 +++++++++----------------------- - 3 files changed, 22 insertions(+), 24 deletions(-) - -diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt -index f7dabe1f01e9..ea91cb61a602 100644 ---- a/Documentation/x86/x86_64/mm.txt -+++ b/Documentation/x86/x86_64/mm.txt -@@ -12,6 +12,7 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB) - ... unused hole ... - ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB) - ... unused hole ... -+ vaddr_end for KASLR - fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping - fffffe8000000000 - fffffeffffffffff (=39 bits) LDT remap for PTI - ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks -@@ -37,6 +38,7 @@ ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB) - ... unused hole ... - ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB) - ... unused hole ... -+ vaddr_end for KASLR - fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping - ... unused hole ... - ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks -@@ -71,3 +73,7 @@ during EFI runtime calls. - Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all - physical memory, vmalloc/ioremap space and virtual memory map are randomized. - Their order is preserved but their base will be offset early at boot time. -+ -+Be very careful vs. 
KASLR when changing anything here. The KASLR address -+range must not overlap with anything except the KASAN shadow area, which is -+correct as KASAN disables KASLR. -diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h -index 0dd48d17a4a1..928d558e7778 100644 ---- a/arch/x86/include/asm/pgtable_64_types.h -+++ b/arch/x86/include/asm/pgtable_64_types.h -@@ -74,7 +74,13 @@ typedef struct { pteval_t pte; } pte_t; - #define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) - #define PGDIR_MASK (~(PGDIR_SIZE - 1)) - --/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ -+/* -+ * See Documentation/x86/x86_64/mm.txt for a description of the memory map. -+ * -+ * Be very careful vs. KASLR when changing anything here. The KASLR address -+ * range must not overlap with anything except the KASAN shadow area, which -+ * is correct as KASAN disables KASLR. -+ */ - #define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) - - #ifdef CONFIG_X86_5LEVEL -diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c -index af599167fe3c..debc7cc8e152 100644 ---- a/arch/x86/mm/kaslr.c -+++ b/arch/x86/mm/kaslr.c -@@ -33,25 +33,14 @@ - #define TB_SHIFT 40 - - /* -- * Virtual address start and end range for randomization. The end changes base -- * on configuration to have the highest amount of space for randomization. -- * It increases the possible random position for each randomized region. -+ * Virtual address start and end range for randomization. - * -- * You need to add an if/def entry if you introduce a new memory region -- * compatible with KASLR. Your entry must be in logical order with memory -- * layout. For example, ESPFIX is before EFI because its virtual address is -- * before. You also need to add a BUILD_BUG_ON() in kernel_randomize_memory() to -- * ensure that this order is correct and won't be changed. 
-+ * The end address could depend on more configuration options to make the -+ * highest amount of space for randomization available, but that's too hard -+ * to keep straight and caused issues already. - */ - static const unsigned long vaddr_start = __PAGE_OFFSET_BASE; -- --#if defined(CONFIG_X86_ESPFIX64) --static const unsigned long vaddr_end = ESPFIX_BASE_ADDR; --#elif defined(CONFIG_EFI) --static const unsigned long vaddr_end = EFI_VA_END; --#else --static const unsigned long vaddr_end = __START_KERNEL_map; --#endif -+static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE; - - /* Default values */ - unsigned long page_offset_base = __PAGE_OFFSET_BASE; -@@ -100,15 +89,12 @@ void __init kernel_randomize_memory(void) - unsigned long remain_entropy; - - /* -- * All these BUILD_BUG_ON checks ensures the memory layout is -- * consistent with the vaddr_start/vaddr_end variables. -+ * These BUILD_BUG_ON checks ensure the memory layout is consistent -+ * with the vaddr_start/vaddr_end variables. These checks are very -+ * limited.... 
- */ - BUILD_BUG_ON(vaddr_start >= vaddr_end); -- BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_ESPFIX64) && -- vaddr_end >= EFI_VA_END); -- BUILD_BUG_ON((IS_ENABLED(CONFIG_X86_ESPFIX64) || -- IS_ENABLED(CONFIG_EFI)) && -- vaddr_end >= __START_KERNEL_map); -+ BUILD_BUG_ON(vaddr_end != CPU_ENTRY_AREA_BASE); - BUILD_BUG_ON(vaddr_end > __START_KERNEL_map); - - if (!kaslr_memory_enabled()) --- -2.14.2 - diff --git a/patches/kernel/0240-x86-mm-Map-cpu_entry_area-at-the-same-place-on-4-5-l.patch b/patches/kernel/0240-x86-mm-Map-cpu_entry_area-at-the-same-place-on-4-5-l.patch new file mode 100644 index 0000000..8f7ff69 --- /dev/null +++ b/patches/kernel/0240-x86-mm-Map-cpu_entry_area-at-the-same-place-on-4-5-l.patch @@ -0,0 +1,98 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Thu, 4 Jan 2018 13:01:40 +0100 +Subject: [PATCH] x86/mm: Map cpu_entry_area at the same place on 4/5 level +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit f2078904810373211fb15f91888fba14c01a4acc upstream. + +There is no reason for 4 and 5 level pagetables to have a different +layout. It just makes determining vaddr_end for KASLR harder than +necessary. 
+ +Fixes: 92a0f81d8957 ("x86/cpu_entry_area: Move it out of the fixmap") +Signed-off-by: Thomas Gleixner +Cc: Andy Lutomirski +Cc: Benjamin Gilbert +Cc: Greg Kroah-Hartman +Cc: Dave Hansen +Cc: Peter Zijlstra +Cc: Thomas Garnier , +Cc: Alexander Kuleshov +Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801041320360.1771@nanos +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Fabian Grünbichler +--- + Documentation/x86/x86_64/mm.txt | 7 ++++--- + arch/x86/include/asm/pgtable_64_types.h | 4 ++-- + arch/x86/mm/dump_pagetables.c | 2 +- + 3 files changed, 7 insertions(+), 6 deletions(-) + +diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt +index ddd5ffd31bd0..f7dabe1f01e9 100644 +--- a/Documentation/x86/x86_64/mm.txt ++++ b/Documentation/x86/x86_64/mm.txt +@@ -12,8 +12,8 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB) + ... unused hole ... + ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB) + ... unused hole ... +-fffffe0000000000 - fffffe7fffffffff (=39 bits) LDT remap for PTI +-fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping ++fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping ++fffffe8000000000 - fffffeffffffffff (=39 bits) LDT remap for PTI + ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks + ... unused hole ... + ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space +@@ -37,7 +37,8 @@ ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB) + ... unused hole ... + ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB) + ... unused hole ... +-fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping ++fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping ++... unused hole ... + ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks + ... unused hole ... 
+ ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space +diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h +index c92bd73b1e46..0dd48d17a4a1 100644 +--- a/arch/x86/include/asm/pgtable_64_types.h ++++ b/arch/x86/include/asm/pgtable_64_types.h +@@ -87,7 +87,7 @@ typedef struct { pteval_t pte; } pte_t; + # define VMALLOC_SIZE_TB _AC(32, UL) + # define __VMALLOC_BASE _AC(0xffffc90000000000, UL) + # define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) +-# define LDT_PGD_ENTRY _AC(-4, UL) ++# define LDT_PGD_ENTRY _AC(-3, UL) + # define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) + #endif + +@@ -109,7 +109,7 @@ typedef struct { pteval_t pte; } pte_t; + #define ESPFIX_PGD_ENTRY _AC(-2, UL) + #define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT) + +-#define CPU_ENTRY_AREA_PGD _AC(-3, UL) ++#define CPU_ENTRY_AREA_PGD _AC(-4, UL) + #define CPU_ENTRY_AREA_BASE (CPU_ENTRY_AREA_PGD << P4D_SHIFT) + + #define EFI_VA_START ( -4 * (_AC(1, UL) << 30)) +diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c +index 12b93d350480..a764bf6f3473 100644 +--- a/arch/x86/mm/dump_pagetables.c ++++ b/arch/x86/mm/dump_pagetables.c +@@ -61,10 +61,10 @@ enum address_markers_idx { + KASAN_SHADOW_START_NR, + KASAN_SHADOW_END_NR, + #endif ++ CPU_ENTRY_AREA_NR, + #if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL) + LDT_NR, + #endif +- CPU_ENTRY_AREA_NR, + #ifdef CONFIG_X86_ESPFIX64 + ESPFIX_START_NR, + #endif +-- +2.14.2 + diff --git a/patches/kernel/0241-x86-alternatives-Add-missing-n-at-end-of-ALTERNATIVE.patch b/patches/kernel/0241-x86-alternatives-Add-missing-n-at-end-of-ALTERNATIVE.patch deleted file mode 100644 index 79d0c2c..0000000 --- a/patches/kernel/0241-x86-alternatives-Add-missing-n-at-end-of-ALTERNATIVE.patch +++ /dev/null @@ -1,62 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: David Woodhouse -Date: Thu, 4 Jan 2018 14:37:05 +0000 -Subject: [PATCH] 
x86/alternatives: Add missing '\n' at end of ALTERNATIVE - inline asm -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -commit b9e705ef7cfaf22db0daab91ad3cd33b0fa32eb9 upstream. - -Where an ALTERNATIVE is used in the middle of an inline asm block, this -would otherwise lead to the following instruction being appended directly -to the trailing ".popsection", and a failed compile. - -Fixes: 9cebed423c84 ("x86, alternative: Use .pushsection/.popsection") -Signed-off-by: David Woodhouse -Signed-off-by: Thomas Gleixner -Cc: gnomes@lxorguk.ukuu.org.uk -Cc: Rik van Riel -Cc: ak@linux.intel.com -Cc: Tim Chen -Cc: Peter Zijlstra -Cc: Paul Turner -Cc: Jiri Kosina -Cc: Andy Lutomirski -Cc: Dave Hansen -Cc: Kees Cook -Cc: Linus Torvalds -Cc: Greg Kroah-Hartman -Link: https://lkml.kernel.org/r/20180104143710.8961-8-dwmw@amazon.co.uk -Signed-off-by: Greg Kroah-Hartman -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/alternative.h | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h -index d4aea31eec03..deca9b9c7923 100644 ---- a/arch/x86/include/asm/alternative.h -+++ b/arch/x86/include/asm/alternative.h -@@ -139,7 +139,7 @@ static inline int alternatives_text_reserved(void *start, void *end) - ".popsection\n" \ - ".pushsection .altinstr_replacement, \"ax\"\n" \ - ALTINSTR_REPLACEMENT(newinstr, feature, 1) \ -- ".popsection" -+ ".popsection\n" - - #define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\ - OLDINSTR_2(oldinstr, 1, 2) \ -@@ -150,7 +150,7 @@ static inline int alternatives_text_reserved(void *start, void *end) - ".pushsection .altinstr_replacement, \"ax\"\n" \ - ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \ - ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \ -- ".popsection" -+ ".popsection\n" - - /* - * Alternative instructions for different CPU types or capabilities. 
--- -2.14.2 - diff --git a/patches/kernel/0241-x86-kaslr-Fix-the-vaddr_end-mess.patch b/patches/kernel/0241-x86-kaslr-Fix-the-vaddr_end-mess.patch new file mode 100644 index 0000000..112e421 --- /dev/null +++ b/patches/kernel/0241-x86-kaslr-Fix-the-vaddr_end-mess.patch @@ -0,0 +1,144 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Thu, 4 Jan 2018 12:32:03 +0100 +Subject: [PATCH] x86/kaslr: Fix the vaddr_end mess +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit 1dddd25125112ba49706518ac9077a1026a18f37 upstream. + +vaddr_end for KASLR is only documented in the KASLR code itself and is +adjusted depending on config options. So it's not surprising that a change +of the memory layout causes KASLR to have the wrong vaddr_end. This can map +arbitrary stuff into other areas causing hard to understand problems. + +Remove the whole ifdef magic and define the start of the cpu_entry_area to +be the end of the KASLR vaddr range. + +Add documentation to that effect. 
+ +Fixes: 92a0f81d8957 ("x86/cpu_entry_area: Move it out of the fixmap") +Reported-by: Benjamin Gilbert +Signed-off-by: Thomas Gleixner +Tested-by: Benjamin Gilbert +Cc: Andy Lutomirski +Cc: Greg Kroah-Hartman +Cc: Dave Hansen +Cc: Peter Zijlstra +Cc: Thomas Garnier , +Cc: Alexander Kuleshov +Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801041320360.1771@nanos +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Fabian Grünbichler +--- + Documentation/x86/x86_64/mm.txt | 6 ++++++ + arch/x86/include/asm/pgtable_64_types.h | 8 +++++++- + arch/x86/mm/kaslr.c | 32 +++++++++----------------------- + 3 files changed, 22 insertions(+), 24 deletions(-) + +diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt +index f7dabe1f01e9..ea91cb61a602 100644 +--- a/Documentation/x86/x86_64/mm.txt ++++ b/Documentation/x86/x86_64/mm.txt +@@ -12,6 +12,7 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB) + ... unused hole ... + ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB) + ... unused hole ... ++ vaddr_end for KASLR + fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping + fffffe8000000000 - fffffeffffffffff (=39 bits) LDT remap for PTI + ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks +@@ -37,6 +38,7 @@ ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB) + ... unused hole ... + ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB) + ... unused hole ... ++ vaddr_end for KASLR + fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping + ... unused hole ... + ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks +@@ -71,3 +73,7 @@ during EFI runtime calls. + Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all + physical memory, vmalloc/ioremap space and virtual memory map are randomized. + Their order is preserved but their base will be offset early at boot time. ++ ++Be very careful vs. 
KASLR when changing anything here. The KASLR address ++range must not overlap with anything except the KASAN shadow area, which is ++correct as KASAN disables KASLR. +diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h +index 0dd48d17a4a1..928d558e7778 100644 +--- a/arch/x86/include/asm/pgtable_64_types.h ++++ b/arch/x86/include/asm/pgtable_64_types.h +@@ -74,7 +74,13 @@ typedef struct { pteval_t pte; } pte_t; + #define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) + #define PGDIR_MASK (~(PGDIR_SIZE - 1)) + +-/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ ++/* ++ * See Documentation/x86/x86_64/mm.txt for a description of the memory map. ++ * ++ * Be very careful vs. KASLR when changing anything here. The KASLR address ++ * range must not overlap with anything except the KASAN shadow area, which ++ * is correct as KASAN disables KASLR. ++ */ + #define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) + + #ifdef CONFIG_X86_5LEVEL +diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c +index af599167fe3c..debc7cc8e152 100644 +--- a/arch/x86/mm/kaslr.c ++++ b/arch/x86/mm/kaslr.c +@@ -33,25 +33,14 @@ + #define TB_SHIFT 40 + + /* +- * Virtual address start and end range for randomization. The end changes base +- * on configuration to have the highest amount of space for randomization. +- * It increases the possible random position for each randomized region. ++ * Virtual address start and end range for randomization. + * +- * You need to add an if/def entry if you introduce a new memory region +- * compatible with KASLR. Your entry must be in logical order with memory +- * layout. For example, ESPFIX is before EFI because its virtual address is +- * before. You also need to add a BUILD_BUG_ON() in kernel_randomize_memory() to +- * ensure that this order is correct and won't be changed. 
++ * The end address could depend on more configuration options to make the ++ * highest amount of space for randomization available, but that's too hard ++ * to keep straight and caused issues already. + */ + static const unsigned long vaddr_start = __PAGE_OFFSET_BASE; +- +-#if defined(CONFIG_X86_ESPFIX64) +-static const unsigned long vaddr_end = ESPFIX_BASE_ADDR; +-#elif defined(CONFIG_EFI) +-static const unsigned long vaddr_end = EFI_VA_END; +-#else +-static const unsigned long vaddr_end = __START_KERNEL_map; +-#endif ++static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE; + + /* Default values */ + unsigned long page_offset_base = __PAGE_OFFSET_BASE; +@@ -100,15 +89,12 @@ void __init kernel_randomize_memory(void) + unsigned long remain_entropy; + + /* +- * All these BUILD_BUG_ON checks ensures the memory layout is +- * consistent with the vaddr_start/vaddr_end variables. ++ * These BUILD_BUG_ON checks ensure the memory layout is consistent ++ * with the vaddr_start/vaddr_end variables. These checks are very ++ * limited.... 
+ */ + BUILD_BUG_ON(vaddr_start >= vaddr_end); +- BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_ESPFIX64) && +- vaddr_end >= EFI_VA_END); +- BUILD_BUG_ON((IS_ENABLED(CONFIG_X86_ESPFIX64) || +- IS_ENABLED(CONFIG_EFI)) && +- vaddr_end >= __START_KERNEL_map); ++ BUILD_BUG_ON(vaddr_end != CPU_ENTRY_AREA_BASE); + BUILD_BUG_ON(vaddr_end > __START_KERNEL_map); + + if (!kaslr_memory_enabled()) +-- +2.14.2 + diff --git a/patches/kernel/0242-x86-alternatives-Add-missing-n-at-end-of-ALTERNATIVE.patch b/patches/kernel/0242-x86-alternatives-Add-missing-n-at-end-of-ALTERNATIVE.patch new file mode 100644 index 0000000..79d0c2c --- /dev/null +++ b/patches/kernel/0242-x86-alternatives-Add-missing-n-at-end-of-ALTERNATIVE.patch @@ -0,0 +1,62 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: David Woodhouse +Date: Thu, 4 Jan 2018 14:37:05 +0000 +Subject: [PATCH] x86/alternatives: Add missing '\n' at end of ALTERNATIVE + inline asm +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit b9e705ef7cfaf22db0daab91ad3cd33b0fa32eb9 upstream. + +Where an ALTERNATIVE is used in the middle of an inline asm block, this +would otherwise lead to the following instruction being appended directly +to the trailing ".popsection", and a failed compile. 
+ +Fixes: 9cebed423c84 ("x86, alternative: Use .pushsection/.popsection") +Signed-off-by: David Woodhouse +Signed-off-by: Thomas Gleixner +Cc: gnomes@lxorguk.ukuu.org.uk +Cc: Rik van Riel +Cc: ak@linux.intel.com +Cc: Tim Chen +Cc: Peter Zijlstra +Cc: Paul Turner +Cc: Jiri Kosina +Cc: Andy Lutomirski +Cc: Dave Hansen +Cc: Kees Cook +Cc: Linus Torvalds +Cc: Greg Kroah-Hartman +Link: https://lkml.kernel.org/r/20180104143710.8961-8-dwmw@amazon.co.uk +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/alternative.h | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h +index d4aea31eec03..deca9b9c7923 100644 +--- a/arch/x86/include/asm/alternative.h ++++ b/arch/x86/include/asm/alternative.h +@@ -139,7 +139,7 @@ static inline int alternatives_text_reserved(void *start, void *end) + ".popsection\n" \ + ".pushsection .altinstr_replacement, \"ax\"\n" \ + ALTINSTR_REPLACEMENT(newinstr, feature, 1) \ +- ".popsection" ++ ".popsection\n" + + #define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\ + OLDINSTR_2(oldinstr, 1, 2) \ +@@ -150,7 +150,7 @@ static inline int alternatives_text_reserved(void *start, void *end) + ".pushsection .altinstr_replacement, \"ax\"\n" \ + ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \ + ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \ +- ".popsection" ++ ".popsection\n" + + /* + * Alternative instructions for different CPU types or capabilities. 
+-- +2.14.2 + diff --git a/patches/kernel/0242-x86-cpu-x86-pti-Do-not-enable-PTI-on-AMD-processors.patch b/patches/kernel/0242-x86-cpu-x86-pti-Do-not-enable-PTI-on-AMD-processors.patch deleted file mode 100644 index 9f6c71e..0000000 --- a/patches/kernel/0242-x86-cpu-x86-pti-Do-not-enable-PTI-on-AMD-processors.patch +++ /dev/null @@ -1,54 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Tom Lendacky -Date: Tue, 26 Dec 2017 23:43:54 -0600 -Subject: [PATCH] x86/cpu, x86/pti: Do not enable PTI on AMD processors -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -AMD processors are not subject to the types of attacks that the kernel -page table isolation feature protects against. The AMD microarchitecture -does not allow memory references, including speculative references, that -access higher privileged data when running in a lesser privileged mode -when that access would result in a page fault. - -Disable page table isolation by default on AMD processors by not setting -the X86_BUG_CPU_INSECURE feature, which controls whether X86_FEATURE_PTI -is set. 
- -Signed-off-by: Tom Lendacky -Signed-off-by: Thomas Gleixner -Reviewed-by: Borislav Petkov -Cc: Dave Hansen -Cc: Andy Lutomirski -Cc: stable@vger.kernel.org -Link: https://lkml.kernel.org/r/20171227054354.20369.94587.stgit@tlendack-t1.amdoffice.net - -(cherry picked from commit 694d99d40972f12e59a3696effee8a376b79d7c8) -Signed-off-by: Marcelo Henrique Cerri -(cherry picked from commit 9d334f48f017b9c6457c6ba321e5a53a1cc6a5c7) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/cpu/common.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c -index 99f37d1636ff..1854dd8071a6 100644 ---- a/arch/x86/kernel/cpu/common.c -+++ b/arch/x86/kernel/cpu/common.c -@@ -899,8 +899,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) - - setup_force_cpu_cap(X86_FEATURE_ALWAYS); - -- /* Assume for now that ALL x86 CPUs are insecure */ -- setup_force_cpu_bug(X86_BUG_CPU_INSECURE); -+ if (c->x86_vendor != X86_VENDOR_AMD) -+ setup_force_cpu_bug(X86_BUG_CPU_INSECURE); - - fpu__init_system(c); - } --- -2.14.2 - diff --git a/patches/kernel/0243-x86-cpu-x86-pti-Do-not-enable-PTI-on-AMD-processors.patch b/patches/kernel/0243-x86-cpu-x86-pti-Do-not-enable-PTI-on-AMD-processors.patch new file mode 100644 index 0000000..9f6c71e --- /dev/null +++ b/patches/kernel/0243-x86-cpu-x86-pti-Do-not-enable-PTI-on-AMD-processors.patch @@ -0,0 +1,54 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Tom Lendacky +Date: Tue, 26 Dec 2017 23:43:54 -0600 +Subject: [PATCH] x86/cpu, x86/pti: Do not enable PTI on AMD processors +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +AMD processors are not subject to the types of attacks that the kernel +page table isolation feature protects against. 
The AMD microarchitecture +does not allow memory references, including speculative references, that +access higher privileged data when running in a lesser privileged mode +when that access would result in a page fault. + +Disable page table isolation by default on AMD processors by not setting +the X86_BUG_CPU_INSECURE feature, which controls whether X86_FEATURE_PTI +is set. + +Signed-off-by: Tom Lendacky +Signed-off-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Cc: Dave Hansen +Cc: Andy Lutomirski +Cc: stable@vger.kernel.org +Link: https://lkml.kernel.org/r/20171227054354.20369.94587.stgit@tlendack-t1.amdoffice.net + +(cherry picked from commit 694d99d40972f12e59a3696effee8a376b79d7c8) +Signed-off-by: Marcelo Henrique Cerri +(cherry picked from commit 9d334f48f017b9c6457c6ba321e5a53a1cc6a5c7) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/cpu/common.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 99f37d1636ff..1854dd8071a6 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -899,8 +899,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) + + setup_force_cpu_cap(X86_FEATURE_ALWAYS); + +- /* Assume for now that ALL x86 CPUs are insecure */ +- setup_force_cpu_bug(X86_BUG_CPU_INSECURE); ++ if (c->x86_vendor != X86_VENDOR_AMD) ++ setup_force_cpu_bug(X86_BUG_CPU_INSECURE); + + fpu__init_system(c); + } +-- +2.14.2 + diff --git a/patches/kernel/0243-x86-microcode-AMD-Add-support-for-fam17h-microcode-l.patch b/patches/kernel/0243-x86-microcode-AMD-Add-support-for-fam17h-microcode-l.patch deleted file mode 100644 index b7d44c5..0000000 --- a/patches/kernel/0243-x86-microcode-AMD-Add-support-for-fam17h-microcode-l.patch +++ /dev/null @@ -1,51 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Tom Lendacky -Date: Thu, 30 Nov 2017 16:46:40 -0600 -Subject: [PATCH] x86/microcode/AMD: Add 
support for fam17h microcode loading -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -commit f4e9b7af0cd58dd039a0fb2cd67d57cea4889abf upstream. - -The size for the Microcode Patch Block (MPB) for an AMD family 17h -processor is 3200 bytes. Add a #define for fam17h so that it does -not default to 2048 bytes and fail a microcode load/update. - -Signed-off-by: Tom Lendacky -Signed-off-by: Thomas Gleixner -Reviewed-by: Borislav Petkov -Link: https://lkml.kernel.org/r/20171130224640.15391.40247.stgit@tlendack-t1.amdoffice.net -Signed-off-by: Ingo Molnar -Cc: Alice Ferrazzi -Signed-off-by: Greg Kroah-Hartman -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/cpu/microcode/amd.c | 4 ++++ - 1 file changed, 4 insertions(+) - -diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c -index 21b185793c80..248cad00fee6 100644 ---- a/arch/x86/kernel/cpu/microcode/amd.c -+++ b/arch/x86/kernel/cpu/microcode/amd.c -@@ -467,6 +467,7 @@ static unsigned int verify_patch_size(u8 family, u32 patch_size, - #define F14H_MPB_MAX_SIZE 1824 - #define F15H_MPB_MAX_SIZE 4096 - #define F16H_MPB_MAX_SIZE 3458 -+#define F17H_MPB_MAX_SIZE 3200 - - switch (family) { - case 0x14: -@@ -478,6 +479,9 @@ static unsigned int verify_patch_size(u8 family, u32 patch_size, - case 0x16: - max_size = F16H_MPB_MAX_SIZE; - break; -+ case 0x17: -+ max_size = F17H_MPB_MAX_SIZE; -+ break; - default: - max_size = F1XH_MPB_MAX_SIZE; - break; --- -2.14.2 - diff --git a/patches/kernel/0244-Revert-scsi-libsas-allow-async-aborts.patch b/patches/kernel/0244-Revert-scsi-libsas-allow-async-aborts.patch deleted file mode 100644 index f6b461c..0000000 --- a/patches/kernel/0244-Revert-scsi-libsas-allow-async-aborts.patch +++ /dev/null @@ -1,32 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Fabian=20Gr=C3=BCnbichler?= -Date: Mon, 8 Jan 2018 11:49:28 +0100 -Subject: [PATCH] Revert "scsi: libsas: 
allow async aborts" -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -This reverts commit 909657615d9b3ce709be4fd95b9a9e8c8c7c2be6. - -Signed-off-by: Fabian Grünbichler ---- - drivers/scsi/libsas/sas_scsi_host.c | 3 +++ - 1 file changed, 3 insertions(+) - -diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c -index 87e5079d816b..137fb586aa64 100644 ---- a/drivers/scsi/libsas/sas_scsi_host.c -+++ b/drivers/scsi/libsas/sas_scsi_host.c -@@ -491,6 +491,9 @@ int sas_eh_abort_handler(struct scsi_cmnd *cmd) - struct Scsi_Host *host = cmd->device->host; - struct sas_internal *i = to_sas_internal(host->transportt); - -+ if (current != host->ehandler) -+ return FAILED; -+ - if (!i->dft->lldd_abort_task) - return FAILED; - --- -2.14.2 - diff --git a/patches/kernel/0244-x86-microcode-AMD-Add-support-for-fam17h-microcode-l.patch b/patches/kernel/0244-x86-microcode-AMD-Add-support-for-fam17h-microcode-l.patch new file mode 100644 index 0000000..b7d44c5 --- /dev/null +++ b/patches/kernel/0244-x86-microcode-AMD-Add-support-for-fam17h-microcode-l.patch @@ -0,0 +1,51 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Tom Lendacky +Date: Thu, 30 Nov 2017 16:46:40 -0600 +Subject: [PATCH] x86/microcode/AMD: Add support for fam17h microcode loading +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit f4e9b7af0cd58dd039a0fb2cd67d57cea4889abf upstream. + +The size for the Microcode Patch Block (MPB) for an AMD family 17h +processor is 3200 bytes. Add a #define for fam17h so that it does +not default to 2048 bytes and fail a microcode load/update. 
+ +Signed-off-by: Tom Lendacky +Signed-off-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Link: https://lkml.kernel.org/r/20171130224640.15391.40247.stgit@tlendack-t1.amdoffice.net +Signed-off-by: Ingo Molnar +Cc: Alice Ferrazzi +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/cpu/microcode/amd.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c +index 21b185793c80..248cad00fee6 100644 +--- a/arch/x86/kernel/cpu/microcode/amd.c ++++ b/arch/x86/kernel/cpu/microcode/amd.c +@@ -467,6 +467,7 @@ static unsigned int verify_patch_size(u8 family, u32 patch_size, + #define F14H_MPB_MAX_SIZE 1824 + #define F15H_MPB_MAX_SIZE 4096 + #define F16H_MPB_MAX_SIZE 3458 ++#define F17H_MPB_MAX_SIZE 3200 + + switch (family) { + case 0x14: +@@ -478,6 +479,9 @@ static unsigned int verify_patch_size(u8 family, u32 patch_size, + case 0x16: + max_size = F16H_MPB_MAX_SIZE; + break; ++ case 0x17: ++ max_size = F17H_MPB_MAX_SIZE; ++ break; + default: + max_size = F1XH_MPB_MAX_SIZE; + break; +-- +2.14.2 + diff --git a/patches/kernel/0245-Revert-scsi-libsas-allow-async-aborts.patch b/patches/kernel/0245-Revert-scsi-libsas-allow-async-aborts.patch new file mode 100644 index 0000000..f6b461c --- /dev/null +++ b/patches/kernel/0245-Revert-scsi-libsas-allow-async-aborts.patch @@ -0,0 +1,32 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Fabian=20Gr=C3=BCnbichler?= +Date: Mon, 8 Jan 2018 11:49:28 +0100 +Subject: [PATCH] Revert "scsi: libsas: allow async aborts" +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This reverts commit 909657615d9b3ce709be4fd95b9a9e8c8c7c2be6. 
+ +Signed-off-by: Fabian Grünbichler +--- + drivers/scsi/libsas/sas_scsi_host.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c +index 87e5079d816b..137fb586aa64 100644 +--- a/drivers/scsi/libsas/sas_scsi_host.c ++++ b/drivers/scsi/libsas/sas_scsi_host.c +@@ -491,6 +491,9 @@ int sas_eh_abort_handler(struct scsi_cmnd *cmd) + struct Scsi_Host *host = cmd->device->host; + struct sas_internal *i = to_sas_internal(host->transportt); + ++ if (current != host->ehandler) ++ return FAILED; ++ + if (!i->dft->lldd_abort_task) + return FAILED; + +-- +2.14.2 + diff --git a/patches/kernel/0245-x86-pti-Make-sure-the-user-kernel-PTEs-match.patch b/patches/kernel/0245-x86-pti-Make-sure-the-user-kernel-PTEs-match.patch deleted file mode 100644 index 9f594a1..0000000 --- a/patches/kernel/0245-x86-pti-Make-sure-the-user-kernel-PTEs-match.patch +++ /dev/null @@ -1,66 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Wed, 3 Jan 2018 15:57:59 +0100 -Subject: [PATCH] x86/pti: Make sure the user/kernel PTEs match -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Meelis reported that his K8 Athlon64 emits MCE warnings when PTI is -enabled: - -[Hardware Error]: Error Addr: 0x0000ffff81e000e0 -[Hardware Error]: MC1 Error: L1 TLB multimatch. -[Hardware Error]: cache level: L1, tx: INSN - -The address is in the entry area, which is mapped into kernel _AND_ user -space. That's special because we switch CR3 while we are executing -there. - -User mapping: -0xffffffff81e00000-0xffffffff82000000 2M ro PSE GLB x pmd - -Kernel mapping: -0xffffffff81000000-0xffffffff82000000 16M ro PSE x pmd - -So the K8 is complaining that the TLB entries differ. They differ in the -GLB bit. - -Drop the GLB bit when installing the user shared mapping. 
- -Fixes: 6dc72c3cbca0 ("x86/mm/pti: Share entry text PMD") -Reported-by: Meelis Roos -Signed-off-by: Thomas Gleixner -Tested-by: Meelis Roos -Cc: Borislav Petkov -Cc: Tom Lendacky -Cc: stable@vger.kernel.org -Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801031407180.1957@nanos -(cherry picked from commit 52994c256df36fda9a715697431cba9daecb6b11) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 8a95d206afc447d8461815c67e618bd8b2c6457f) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/mm/pti.c | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c -index bce8aea65606..2da28ba97508 100644 ---- a/arch/x86/mm/pti.c -+++ b/arch/x86/mm/pti.c -@@ -367,7 +367,8 @@ static void __init pti_setup_espfix64(void) - static void __init pti_clone_entry_text(void) - { - pti_clone_pmds((unsigned long) __entry_text_start, -- (unsigned long) __irqentry_text_end, _PAGE_RW); -+ (unsigned long) __irqentry_text_end, -+ _PAGE_RW | _PAGE_GLOBAL); - } - - /* --- -2.14.2 - diff --git a/patches/kernel/0246-x86-dumpstack-Fix-partial-register-dumps.patch b/patches/kernel/0246-x86-dumpstack-Fix-partial-register-dumps.patch deleted file mode 100644 index c450268..0000000 --- a/patches/kernel/0246-x86-dumpstack-Fix-partial-register-dumps.patch +++ /dev/null @@ -1,172 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Josh Poimboeuf -Date: Sun, 31 Dec 2017 10:18:06 -0600 -Subject: [PATCH] x86/dumpstack: Fix partial register dumps -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -The show_regs_safe() logic is wrong. When there's an iret stack frame, -it prints the entire pt_regs -- most of which is random stack data -- -instead of just the five registers at the end. - -show_regs_safe() is also poorly named: the on_stack() checks aren't for -safety. 
Rename the function to show_regs_if_on_stack() and add a -comment to explain why the checks are needed. - -These issues were introduced with the "partial register dump" feature of -the following commit: - - b02fcf9ba121 ("x86/unwinder: Handle stack overflows more gracefully") - -That patch had gone through a few iterations of development, and the -above issues were artifacts from a previous iteration of the patch where -'regs' pointed directly to the iret frame rather than to the (partially -empty) pt_regs. - -Tested-by: Alexander Tsoy -Signed-off-by: Josh Poimboeuf -Cc: Andy Lutomirski -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Cc: Toralf Förster -Cc: stable@vger.kernel.org -Fixes: b02fcf9ba121 ("x86/unwinder: Handle stack overflows more gracefully") -Link: http://lkml.kernel.org/r/5b05b8b344f59db2d3d50dbdeba92d60f2304c54.1514736742.git.jpoimboe@redhat.com -Signed-off-by: Ingo Molnar -(cherry picked from commit a9cdbe72c4e8bf3b38781c317a79326e2e1a230d) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 3f159d02ecca1ffe81dc467767833dd6d0345147) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/unwind.h | 17 +++++++++++++---- - arch/x86/kernel/dumpstack.c | 28 ++++++++++++++++++++-------- - arch/x86/kernel/stacktrace.c | 2 +- - 3 files changed, 34 insertions(+), 13 deletions(-) - -diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h -index 38fa6154e382..e1c1cb5019bc 100644 ---- a/arch/x86/include/asm/unwind.h -+++ b/arch/x86/include/asm/unwind.h -@@ -55,18 +55,27 @@ void unwind_start(struct unwind_state *state, struct task_struct *task, - - #if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER) - /* -- * WARNING: The entire pt_regs may not be safe to dereference. In some cases, -- * only the iret frame registers are accessible. Use with caution! -+ * If 'partial' returns true, only the iret frame registers are valid. 
- */ --static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) -+static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state, -+ bool *partial) - { - if (unwind_done(state)) - return NULL; - -+ if (partial) { -+#ifdef CONFIG_UNWINDER_ORC -+ *partial = !state->full_regs; -+#else -+ *partial = false; -+#endif -+ } -+ - return state->regs; - } - #else --static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) -+static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state, -+ bool *partial) - { - return NULL; - } -diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c -index 19a936e9b259..8da5b487919f 100644 ---- a/arch/x86/kernel/dumpstack.c -+++ b/arch/x86/kernel/dumpstack.c -@@ -76,12 +76,23 @@ void show_iret_regs(struct pt_regs *regs) - regs->sp, regs->flags); - } - --static void show_regs_safe(struct stack_info *info, struct pt_regs *regs) -+static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, -+ bool partial) - { -- if (on_stack(info, regs, sizeof(*regs))) -+ /* -+ * These on_stack() checks aren't strictly necessary: the unwind code -+ * has already validated the 'regs' pointer. The checks are done for -+ * ordering reasons: if the registers are on the next stack, we don't -+ * want to print them out yet. Otherwise they'll be shown as part of -+ * the wrong stack. Later, when show_trace_log_lvl() switches to the -+ * next stack, this function will be called again with the same regs so -+ * they can be printed in the right context. -+ */ -+ if (!partial && on_stack(info, regs, sizeof(*regs))) { - __show_regs(regs, 0); -- else if (on_stack(info, (void *)regs + IRET_FRAME_OFFSET, -- IRET_FRAME_SIZE)) { -+ -+ } else if (partial && on_stack(info, (void *)regs + IRET_FRAME_OFFSET, -+ IRET_FRAME_SIZE)) { - /* - * When an interrupt or exception occurs in entry code, the - * full pt_regs might not have been saved yet. 
 In that case -@@ -98,6 +109,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - struct stack_info stack_info = {0}; - unsigned long visit_mask = 0; - int graph_idx = 0; -+ bool partial; - - printk("%sCall Trace:\n", log_lvl); - -@@ -140,7 +152,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - printk("%s <%s>\n", log_lvl, stack_name); - - if (regs) -- show_regs_safe(&stack_info, regs); -+ show_regs_if_on_stack(&stack_info, regs, partial); - - /* - * Scan the stack, printing any text addresses we find. At the -@@ -164,7 +176,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - - /* - * Don't print regs->ip again if it was already printed -- * by show_regs_safe() below. -+ * by show_regs_if_on_stack(). - */ - if (regs && stack == &regs->ip) { - unwind_next_frame(&state); -@@ -200,9 +212,9 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unwind_next_frame(&state); - - /* if the frame has entry regs, print them */ -- regs = unwind_get_entry_regs(&state); -+ regs = unwind_get_entry_regs(&state, &partial); - if (regs) -- show_regs_safe(&stack_info, regs); -+ show_regs_if_on_stack(&stack_info, regs, partial); - } - - if (stack_name) -diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c -index 8dabd7bf1673..60244bfaf88f 100644 ---- a/arch/x86/kernel/stacktrace.c -+++ b/arch/x86/kernel/stacktrace.c -@@ -98,7 +98,7 @@ static int __save_stack_trace_reliable(struct stack_trace *trace, - for (unwind_start(&state, task, NULL, NULL); !unwind_done(&state); - unwind_next_frame(&state)) { - -- regs = unwind_get_entry_regs(&state); -+ regs = unwind_get_entry_regs(&state, NULL); - if (regs) { - /* - * Kernel mode registers on the stack indicate an --- -2.14.2 - diff --git a/patches/kernel/0246-x86-pti-Make-sure-the-user-kernel-PTEs-match.patch b/patches/kernel/0246-x86-pti-Make-sure-the-user-kernel-PTEs-match.patch new file mode 100644 index 0000000..9f594a1
--- /dev/null +++ b/patches/kernel/0246-x86-pti-Make-sure-the-user-kernel-PTEs-match.patch @@ -0,0 +1,66 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Wed, 3 Jan 2018 15:57:59 +0100 +Subject: [PATCH] x86/pti: Make sure the user/kernel PTEs match +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Meelis reported that his K8 Athlon64 emits MCE warnings when PTI is +enabled: + +[Hardware Error]: Error Addr: 0x0000ffff81e000e0 +[Hardware Error]: MC1 Error: L1 TLB multimatch. +[Hardware Error]: cache level: L1, tx: INSN + +The address is in the entry area, which is mapped into kernel _AND_ user +space. That's special because we switch CR3 while we are executing +there. + +User mapping: +0xffffffff81e00000-0xffffffff82000000 2M ro PSE GLB x pmd + +Kernel mapping: +0xffffffff81000000-0xffffffff82000000 16M ro PSE x pmd + +So the K8 is complaining that the TLB entries differ. They differ in the +GLB bit. + +Drop the GLB bit when installing the user shared mapping. 
+ +Fixes: 6dc72c3cbca0 ("x86/mm/pti: Share entry text PMD") +Reported-by: Meelis Roos +Signed-off-by: Thomas Gleixner +Tested-by: Meelis Roos +Cc: Borislav Petkov +Cc: Tom Lendacky +Cc: stable@vger.kernel.org +Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801031407180.1957@nanos +(cherry picked from commit 52994c256df36fda9a715697431cba9daecb6b11) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 8a95d206afc447d8461815c67e618bd8b2c6457f) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/mm/pti.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c +index bce8aea65606..2da28ba97508 100644 +--- a/arch/x86/mm/pti.c ++++ b/arch/x86/mm/pti.c +@@ -367,7 +367,8 @@ static void __init pti_setup_espfix64(void) + static void __init pti_clone_entry_text(void) + { + pti_clone_pmds((unsigned long) __entry_text_start, +- (unsigned long) __irqentry_text_end, _PAGE_RW); ++ (unsigned long) __irqentry_text_end, ++ _PAGE_RW | _PAGE_GLOBAL); + } + + /* +-- +2.14.2 + diff --git a/patches/kernel/0247-x86-dumpstack-Fix-partial-register-dumps.patch b/patches/kernel/0247-x86-dumpstack-Fix-partial-register-dumps.patch new file mode 100644 index 0000000..c450268 --- /dev/null +++ b/patches/kernel/0247-x86-dumpstack-Fix-partial-register-dumps.patch @@ -0,0 +1,172 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf +Date: Sun, 31 Dec 2017 10:18:06 -0600 +Subject: [PATCH] x86/dumpstack: Fix partial register dumps +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +The show_regs_safe() logic is wrong. When there's an iret stack frame, +it prints the entire pt_regs -- most of which is random stack data -- +instead of just the five registers at the end. + +show_regs_safe() is also poorly named: the on_stack() checks aren't for +safety. 
Rename the function to show_regs_if_on_stack() and add a +comment to explain why the checks are needed. + +These issues were introduced with the "partial register dump" feature of +the following commit: + + b02fcf9ba121 ("x86/unwinder: Handle stack overflows more gracefully") + +That patch had gone through a few iterations of development, and the +above issues were artifacts from a previous iteration of the patch where +'regs' pointed directly to the iret frame rather than to the (partially +empty) pt_regs. + +Tested-by: Alexander Tsoy +Signed-off-by: Josh Poimboeuf +Cc: Andy Lutomirski +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: Toralf Förster +Cc: stable@vger.kernel.org +Fixes: b02fcf9ba121 ("x86/unwinder: Handle stack overflows more gracefully") +Link: http://lkml.kernel.org/r/5b05b8b344f59db2d3d50dbdeba92d60f2304c54.1514736742.git.jpoimboe@redhat.com +Signed-off-by: Ingo Molnar +(cherry picked from commit a9cdbe72c4e8bf3b38781c317a79326e2e1a230d) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 3f159d02ecca1ffe81dc467767833dd6d0345147) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/unwind.h | 17 +++++++++++++---- + arch/x86/kernel/dumpstack.c | 28 ++++++++++++++++++++-------- + arch/x86/kernel/stacktrace.c | 2 +- + 3 files changed, 34 insertions(+), 13 deletions(-) + +diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h +index 38fa6154e382..e1c1cb5019bc 100644 +--- a/arch/x86/include/asm/unwind.h ++++ b/arch/x86/include/asm/unwind.h +@@ -55,18 +55,27 @@ void unwind_start(struct unwind_state *state, struct task_struct *task, + + #if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER) + /* +- * WARNING: The entire pt_regs may not be safe to dereference. In some cases, +- * only the iret frame registers are accessible. Use with caution! ++ * If 'partial' returns true, only the iret frame registers are valid. 
+ */ +-static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) ++static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state, ++ bool *partial) + { + if (unwind_done(state)) + return NULL; + ++ if (partial) { ++#ifdef CONFIG_UNWINDER_ORC ++ *partial = !state->full_regs; ++#else ++ *partial = false; ++#endif ++ } ++ + return state->regs; + } + #else +-static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) ++static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state, ++ bool *partial) + { + return NULL; + } +diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c +index 19a936e9b259..8da5b487919f 100644 +--- a/arch/x86/kernel/dumpstack.c ++++ b/arch/x86/kernel/dumpstack.c +@@ -76,12 +76,23 @@ void show_iret_regs(struct pt_regs *regs) + regs->sp, regs->flags); + } + +-static void show_regs_safe(struct stack_info *info, struct pt_regs *regs) ++static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, ++ bool partial) + { +- if (on_stack(info, regs, sizeof(*regs))) ++ /* ++ * These on_stack() checks aren't strictly necessary: the unwind code ++ * has already validated the 'regs' pointer. The checks are done for ++ * ordering reasons: if the registers are on the next stack, we don't ++ * want to print them out yet. Otherwise they'll be shown as part of ++ * the wrong stack. Later, when show_trace_log_lvl() switches to the ++ * next stack, this function will be called again with the same regs so ++ * they can be printed in the right context. ++ */ ++ if (!partial && on_stack(info, regs, sizeof(*regs))) { + __show_regs(regs, 0); +- else if (on_stack(info, (void *)regs + IRET_FRAME_OFFSET, +- IRET_FRAME_SIZE)) { ++ ++ } else if (partial && on_stack(info, (void *)regs + IRET_FRAME_OFFSET, ++ IRET_FRAME_SIZE)) { + /* + * When an interrupt or exception occurs in entry code, the + * full pt_regs might not have been saved yet. 
 In that case +@@ -98,6 +109,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + struct stack_info stack_info = {0}; + unsigned long visit_mask = 0; + int graph_idx = 0; ++ bool partial; + + printk("%sCall Trace:\n", log_lvl); + +@@ -140,7 +152,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + printk("%s <%s>\n", log_lvl, stack_name); + + if (regs) +- show_regs_safe(&stack_info, regs); ++ show_regs_if_on_stack(&stack_info, regs, partial); + + /* + * Scan the stack, printing any text addresses we find. At the +@@ -164,7 +176,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + + /* + * Don't print regs->ip again if it was already printed +- * by show_regs_safe() below. ++ * by show_regs_if_on_stack(). + */ + if (regs && stack == &regs->ip) { + unwind_next_frame(&state); +@@ -200,9 +212,9 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unwind_next_frame(&state); + + /* if the frame has entry regs, print them */ +- regs = unwind_get_entry_regs(&state); ++ regs = unwind_get_entry_regs(&state, &partial); + if (regs) +- show_regs_safe(&stack_info, regs); ++ show_regs_if_on_stack(&stack_info, regs, partial); + } + + if (stack_name) +diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c +index 8dabd7bf1673..60244bfaf88f 100644 +--- a/arch/x86/kernel/stacktrace.c ++++ b/arch/x86/kernel/stacktrace.c +@@ -98,7 +98,7 @@ static int __save_stack_trace_reliable(struct stack_trace *trace, + for (unwind_start(&state, task, NULL, NULL); !unwind_done(&state); + unwind_next_frame(&state)) { + +- regs = unwind_get_entry_regs(&state); ++ regs = unwind_get_entry_regs(&state, NULL); + if (regs) { + /* + * Kernel mode registers on the stack indicate an +-- +2.14.2 + diff --git a/patches/kernel/0247-x86-dumpstack-Print-registers-for-first-stack-frame.patch b/patches/kernel/0247-x86-dumpstack-Print-registers-for-first-stack-frame.patch deleted file mode 100644 index
26b5a60..0000000 --- a/patches/kernel/0247-x86-dumpstack-Print-registers-for-first-stack-frame.patch +++ /dev/null @@ -1,58 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Josh Poimboeuf -Date: Sun, 31 Dec 2017 10:18:07 -0600 -Subject: [PATCH] x86/dumpstack: Print registers for first stack frame -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -In the stack dump code, if the frame after the starting pt_regs is also -a regs frame, the registers don't get printed. Fix that. - -Reported-by: Andy Lutomirski -Tested-by: Alexander Tsoy -Signed-off-by: Josh Poimboeuf -Cc: Andy Lutomirski -Cc: Linus Torvalds -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Cc: Toralf Förster -Cc: stable@vger.kernel.org -Fixes: 3b3fa11bc700 ("x86/dumpstack: Print any pt_regs found on the stack") -Link: http://lkml.kernel.org/r/396f84491d2f0ef64eda4217a2165f5712f6a115.1514736742.git.jpoimboe@redhat.com -Signed-off-by: Ingo Molnar -(cherry picked from commit 3ffdeb1a02be3086f1411a15c5b9c481fa28e21f) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 3aef1ce621ae2eb0bd58e07cf9e66a859faa17cd) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/dumpstack.c | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c -index 8da5b487919f..042f80c50e3b 100644 ---- a/arch/x86/kernel/dumpstack.c -+++ b/arch/x86/kernel/dumpstack.c -@@ -115,6 +115,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - - unwind_start(&state, task, regs, stack); - stack = stack ? : get_stack_pointer(task, regs); -+ regs = unwind_get_entry_regs(&state, &partial); - - /* - * Iterate through the stacks, starting with the current stack pointer. 
-@@ -132,7 +133,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - * - hardirq stack - * - entry stack - */ -- for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { -+ for ( ; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { - const char *stack_name; - - if (get_stack_info(stack, task, &stack_info, &visit_mask)) { --- -2.14.2 - diff --git a/patches/kernel/0248-x86-dumpstack-Print-registers-for-first-stack-frame.patch b/patches/kernel/0248-x86-dumpstack-Print-registers-for-first-stack-frame.patch new file mode 100644 index 0000000..26b5a60 --- /dev/null +++ b/patches/kernel/0248-x86-dumpstack-Print-registers-for-first-stack-frame.patch @@ -0,0 +1,58 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf +Date: Sun, 31 Dec 2017 10:18:07 -0600 +Subject: [PATCH] x86/dumpstack: Print registers for first stack frame +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +In the stack dump code, if the frame after the starting pt_regs is also +a regs frame, the registers don't get printed. Fix that. 
+ +Reported-by: Andy Lutomirski +Tested-by: Alexander Tsoy +Signed-off-by: Josh Poimboeuf +Cc: Andy Lutomirski +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: Toralf Förster +Cc: stable@vger.kernel.org +Fixes: 3b3fa11bc700 ("x86/dumpstack: Print any pt_regs found on the stack") +Link: http://lkml.kernel.org/r/396f84491d2f0ef64eda4217a2165f5712f6a115.1514736742.git.jpoimboe@redhat.com +Signed-off-by: Ingo Molnar +(cherry picked from commit 3ffdeb1a02be3086f1411a15c5b9c481fa28e21f) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 3aef1ce621ae2eb0bd58e07cf9e66a859faa17cd) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/dumpstack.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c +index 8da5b487919f..042f80c50e3b 100644 +--- a/arch/x86/kernel/dumpstack.c ++++ b/arch/x86/kernel/dumpstack.c +@@ -115,6 +115,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + + unwind_start(&state, task, regs, stack); + stack = stack ? : get_stack_pointer(task, regs); ++ regs = unwind_get_entry_regs(&state, &partial); + + /* + * Iterate through the stacks, starting with the current stack pointer. 
+@@ -132,7 +133,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + * - hardirq stack + * - entry stack + */ +- for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { ++ for ( ; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { + const char *stack_name; + + if (get_stack_info(stack, task, &stack_info, &visit_mask)) { +-- +2.14.2 + diff --git a/patches/kernel/0248-x86-process-Define-cpu_tss_rw-in-same-section-as-dec.patch b/patches/kernel/0248-x86-process-Define-cpu_tss_rw-in-same-section-as-dec.patch deleted file mode 100644 index 24c5a1b..0000000 --- a/patches/kernel/0248-x86-process-Define-cpu_tss_rw-in-same-section-as-dec.patch +++ /dev/null @@ -1,64 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Nick Desaulniers -Date: Wed, 3 Jan 2018 12:39:52 -0800 -Subject: [PATCH] x86/process: Define cpu_tss_rw in same section as declaration -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -cpu_tss_rw is declared with DECLARE_PER_CPU_PAGE_ALIGNED -but then defined with DEFINE_PER_CPU_SHARED_ALIGNED -leading to section mismatch warnings. - -Use DEFINE_PER_CPU_PAGE_ALIGNED consistently. This is necessary because -it's mapped to the cpu entry area and must be page aligned. 
- -[ tglx: Massaged changelog a bit ] - -Fixes: 1a935bc3d4ea ("x86/entry: Move SYSENTER_stack to the beginning of struct tss_struct") -Suggested-by: Thomas Gleixner -Signed-off-by: Nick Desaulniers -Signed-off-by: Thomas Gleixner -Cc: thomas.lendacky@amd.com -Cc: Borislav Petkov -Cc: tklauser@distanz.ch -Cc: minipli@googlemail.com -Cc: me@kylehuey.com -Cc: namit@vmware.com -Cc: luto@kernel.org -Cc: jpoimboe@redhat.com -Cc: tj@kernel.org -Cc: cl@linux.com -Cc: bp@suse.de -Cc: thgarnie@google.com -Cc: kirill.shutemov@linux.intel.com -Cc: stable@vger.kernel.org -Link: https://lkml.kernel.org/r/20180103203954.183360-1-ndesaulniers@google.com - -(cherry picked from commit 2fd9c41aea47f4ad071accf94b94f94f2c4d31eb) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit f45e574914ae47825d2eea46abc9d6fbabe55e56) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/process.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c -index 3688a7b9d055..07e6218ad7d9 100644 ---- a/arch/x86/kernel/process.c -+++ b/arch/x86/kernel/process.c -@@ -46,7 +46,7 @@ - * section. Since TSS's are completely CPU-local, we want them - * on exact cacheline boundaries, to eliminate cacheline ping-pong. 
- */ --__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss_rw) = { -+__visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = { - .x86_tss = { - /* - * .sp0 is only used when entering ring 0 from a lower --- -2.14.2 - diff --git a/patches/kernel/0249-x86-process-Define-cpu_tss_rw-in-same-section-as-dec.patch b/patches/kernel/0249-x86-process-Define-cpu_tss_rw-in-same-section-as-dec.patch new file mode 100644 index 0000000..24c5a1b --- /dev/null +++ b/patches/kernel/0249-x86-process-Define-cpu_tss_rw-in-same-section-as-dec.patch @@ -0,0 +1,64 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Nick Desaulniers +Date: Wed, 3 Jan 2018 12:39:52 -0800 +Subject: [PATCH] x86/process: Define cpu_tss_rw in same section as declaration +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +cpu_tss_rw is declared with DECLARE_PER_CPU_PAGE_ALIGNED +but then defined with DEFINE_PER_CPU_SHARED_ALIGNED +leading to section mismatch warnings. + +Use DEFINE_PER_CPU_PAGE_ALIGNED consistently. This is necessary because +it's mapped to the cpu entry area and must be page aligned. 
+ +[ tglx: Massaged changelog a bit ] + +Fixes: 1a935bc3d4ea ("x86/entry: Move SYSENTER_stack to the beginning of struct tss_struct") +Suggested-by: Thomas Gleixner +Signed-off-by: Nick Desaulniers +Signed-off-by: Thomas Gleixner +Cc: thomas.lendacky@amd.com +Cc: Borislav Petkov +Cc: tklauser@distanz.ch +Cc: minipli@googlemail.com +Cc: me@kylehuey.com +Cc: namit@vmware.com +Cc: luto@kernel.org +Cc: jpoimboe@redhat.com +Cc: tj@kernel.org +Cc: cl@linux.com +Cc: bp@suse.de +Cc: thgarnie@google.com +Cc: kirill.shutemov@linux.intel.com +Cc: stable@vger.kernel.org +Link: https://lkml.kernel.org/r/20180103203954.183360-1-ndesaulniers@google.com + +(cherry picked from commit 2fd9c41aea47f4ad071accf94b94f94f2c4d31eb) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit f45e574914ae47825d2eea46abc9d6fbabe55e56) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/process.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c +index 3688a7b9d055..07e6218ad7d9 100644 +--- a/arch/x86/kernel/process.c ++++ b/arch/x86/kernel/process.c +@@ -46,7 +46,7 @@ + * section. Since TSS's are completely CPU-local, we want them + * on exact cacheline boundaries, to eliminate cacheline ping-pong. 
+ */ +-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss_rw) = { ++__visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = { + .x86_tss = { + /* + * .sp0 is only used when entering ring 0 from a lower +-- +2.14.2 + diff --git a/patches/kernel/0249-x86-pti-Rename-BUG_CPU_INSECURE-to-BUG_CPU_MELTDOWN.patch b/patches/kernel/0249-x86-pti-Rename-BUG_CPU_INSECURE-to-BUG_CPU_MELTDOWN.patch deleted file mode 100644 index f8049c8..0000000 --- a/patches/kernel/0249-x86-pti-Rename-BUG_CPU_INSECURE-to-BUG_CPU_MELTDOWN.patch +++ /dev/null @@ -1,98 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Fri, 5 Jan 2018 15:27:34 +0100 -Subject: [PATCH] x86/pti: Rename BUG_CPU_INSECURE to BUG_CPU_MELTDOWN -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Use the name associated with the particular attack which needs page table -isolation for mitigation. - -Signed-off-by: Thomas Gleixner -Acked-by: David Woodhouse -Cc: Alan Cox -Cc: Jiri Koshina -Cc: Linus Torvalds -Cc: Tim Chen -Cc: Andi Lutomirski -Cc: Andi Kleen -Cc: Peter Zijlstra -Cc: Paul Turner -Cc: Tom Lendacky -Cc: Greg KH -Cc: Dave Hansen -Cc: Kees Cook -Cc: stable@vger.kernel.org -Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801051525300.1724@nanos - -(cherry picked from commit de791821c295cc61419a06fe5562288417d1bc58) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit aefb6725ee33758a2869c37e22dbc7ca80548007) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/cpufeatures.h | 2 +- - arch/x86/kernel/cpu/common.c | 2 +- - arch/x86/mm/pti.c | 6 +++--- - 3 files changed, 5 insertions(+), 5 deletions(-) - -diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h -index 9b0c283afcf0..b7900d26066c 100644 ---- a/arch/x86/include/asm/cpufeatures.h -+++ b/arch/x86/include/asm/cpufeatures.h 
-@@ -340,6 +340,6 @@ - #define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ - #define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ - #define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ --#define X86_BUG_CPU_INSECURE X86_BUG(14) /* CPU is insecure and needs kernel page table isolation */ -+#define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */ - - #endif /* _ASM_X86_CPUFEATURES_H */ -diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c -index 1854dd8071a6..142ab555dafa 100644 ---- a/arch/x86/kernel/cpu/common.c -+++ b/arch/x86/kernel/cpu/common.c -@@ -900,7 +900,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) - setup_force_cpu_cap(X86_FEATURE_ALWAYS); - - if (c->x86_vendor != X86_VENDOR_AMD) -- setup_force_cpu_bug(X86_BUG_CPU_INSECURE); -+ setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); - - fpu__init_system(c); - } -diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c -index 2da28ba97508..43d4a4a29037 100644 ---- a/arch/x86/mm/pti.c -+++ b/arch/x86/mm/pti.c -@@ -56,13 +56,13 @@ - - static void __init pti_print_if_insecure(const char *reason) - { -- if (boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) -+ if (boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) - pr_info("%s\n", reason); - } - - static void __init pti_print_if_secure(const char *reason) - { -- if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) -+ if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) - pr_info("%s\n", reason); - } - -@@ -96,7 +96,7 @@ void __init pti_check_boottime_disable(void) - } - - autosel: -- if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) -+ if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) - return; - enable: - setup_force_cpu_cap(X86_FEATURE_PTI); --- -2.14.2 - diff --git a/patches/kernel/0250-x86-pti-Rename-BUG_CPU_INSECURE-to-BUG_CPU_MELTDOWN.patch b/patches/kernel/0250-x86-pti-Rename-BUG_CPU_INSECURE-to-BUG_CPU_MELTDOWN.patch new file mode 
100644 index 0000000..f8049c8 --- /dev/null +++ b/patches/kernel/0250-x86-pti-Rename-BUG_CPU_INSECURE-to-BUG_CPU_MELTDOWN.patch @@ -0,0 +1,98 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Fri, 5 Jan 2018 15:27:34 +0100 +Subject: [PATCH] x86/pti: Rename BUG_CPU_INSECURE to BUG_CPU_MELTDOWN +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Use the name associated with the particular attack which needs page table +isolation for mitigation. + +Signed-off-by: Thomas Gleixner +Acked-by: David Woodhouse +Cc: Alan Cox +Cc: Jiri Koshina +Cc: Linus Torvalds +Cc: Tim Chen +Cc: Andi Lutomirski +Cc: Andi Kleen +Cc: Peter Zijlstra +Cc: Paul Turner +Cc: Tom Lendacky +Cc: Greg KH +Cc: Dave Hansen +Cc: Kees Cook +Cc: stable@vger.kernel.org +Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801051525300.1724@nanos + +(cherry picked from commit de791821c295cc61419a06fe5562288417d1bc58) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit aefb6725ee33758a2869c37e22dbc7ca80548007) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/cpufeatures.h | 2 +- + arch/x86/kernel/cpu/common.c | 2 +- + arch/x86/mm/pti.c | 6 +++--- + 3 files changed, 5 insertions(+), 5 deletions(-) + +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index 9b0c283afcf0..b7900d26066c 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -340,6 +340,6 @@ + #define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ + #define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ + #define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ +-#define X86_BUG_CPU_INSECURE X86_BUG(14) /* CPU is insecure and needs kernel page table isolation */ ++#define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by 
meltdown attack and needs kernel page table isolation */ + + #endif /* _ASM_X86_CPUFEATURES_H */ +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 1854dd8071a6..142ab555dafa 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -900,7 +900,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) + setup_force_cpu_cap(X86_FEATURE_ALWAYS); + + if (c->x86_vendor != X86_VENDOR_AMD) +- setup_force_cpu_bug(X86_BUG_CPU_INSECURE); ++ setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); + + fpu__init_system(c); + } +diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c +index 2da28ba97508..43d4a4a29037 100644 +--- a/arch/x86/mm/pti.c ++++ b/arch/x86/mm/pti.c +@@ -56,13 +56,13 @@ + + static void __init pti_print_if_insecure(const char *reason) + { +- if (boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) ++ if (boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) + pr_info("%s\n", reason); + } + + static void __init pti_print_if_secure(const char *reason) + { +- if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) ++ if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) + pr_info("%s\n", reason); + } + +@@ -96,7 +96,7 @@ void __init pti_check_boottime_disable(void) + } + + autosel: +- if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) ++ if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) + return; + enable: + setup_force_cpu_cap(X86_FEATURE_PTI); +-- +2.14.2 + diff --git a/patches/kernel/0250-x86-pti-Unbreak-EFI-old_memmap.patch b/patches/kernel/0250-x86-pti-Unbreak-EFI-old_memmap.patch deleted file mode 100644 index 0aca0ec..0000000 --- a/patches/kernel/0250-x86-pti-Unbreak-EFI-old_memmap.patch +++ /dev/null @@ -1,61 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Jiri Kosina -Date: Fri, 5 Jan 2018 22:35:41 +0100 -Subject: [PATCH] x86/pti: Unbreak EFI old_memmap -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -EFI_OLD_MEMMAP's efi_call_phys_prolog() calls set_pgd() with 
swapper PGD that -has PAGE_USER set, which makes PTI set NX on it, and therefore EFI can't -execute it's code. - -Fix that by forcefully clearing _PAGE_NX from the PGD (this can't be done -by the pgprot API). - -_PAGE_NX will be automatically reintroduced in efi_call_phys_epilog(), as -_set_pgd() will again notice that this is _PAGE_USER, and set _PAGE_NX on -it. - -Tested-by: Dimitri Sivanich -Signed-off-by: Jiri Kosina -Signed-off-by: Ingo Molnar -Signed-off-by: Thomas Gleixner -Acked-by: Dave Hansen -Cc: Andrea Arcangeli -Cc: Ard Biesheuvel -Cc: Linus Torvalds -Cc: Matt Fleming -Cc: Peter Zijlstra -Cc: Thomas Gleixner -Cc: linux-efi@vger.kernel.org -Cc: stable@vger.kernel.org -Link: http://lkml.kernel.org/r/nycvar.YFH.7.76.1801052215460.11852@cbobk.fhfr.pm -(cherry picked from commit de53c3786a3ce162a1c815d0c04c766c23ec9c0a) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 31afacd8089f54061e718e5d491f11747755c503) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/platform/efi/efi_64.c | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c -index b104224d3d6c..987a38e82f73 100644 ---- a/arch/x86/platform/efi/efi_64.c -+++ b/arch/x86/platform/efi/efi_64.c -@@ -133,7 +133,9 @@ pgd_t * __init efi_call_phys_prolog(void) - pud[j] = *pud_offset(p4d_k, vaddr); - } - } -+ pgd_offset_k(pgd * PGDIR_SIZE)->pgd &= ~_PAGE_NX; - } -+ - out: - __flush_tlb_all(); - --- -2.14.2 - diff --git a/patches/kernel/0251-x86-Documentation-Add-PTI-description.patch b/patches/kernel/0251-x86-Documentation-Add-PTI-description.patch deleted file mode 100644 index 6a38eaa..0000000 --- a/patches/kernel/0251-x86-Documentation-Add-PTI-description.patch +++ /dev/null @@ -1,275 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Dave Hansen -Date: Fri, 5 Jan 2018 09:44:36 -0800 -Subject: [PATCH] x86/Documentation: Add PTI description 
-MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Add some details about how PTI works, what some of the downsides -are, and how to debug it when things go wrong. - -Also document the kernel parameter: 'pti/nopti'. - -Signed-off-by: Dave Hansen -Signed-off-by: Thomas Gleixner -Reviewed-by: Randy Dunlap -Reviewed-by: Kees Cook -Cc: Moritz Lipp -Cc: Daniel Gruss -Cc: Michael Schwarz -Cc: Richard Fellner -Cc: Andy Lutomirski -Cc: Linus Torvalds -Cc: Hugh Dickins -Cc: Andi Lutomirsky -Cc: stable@vger.kernel.org -Link: https://lkml.kernel.org/r/20180105174436.1BC6FA2B@viggo.jf.intel.com - -(cherry picked from commit 01c9b17bf673b05bb401b76ec763e9730ccf1376) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 1acf87c45b0170e717fc1b06a2d6fef47e07f79b) -Signed-off-by: Fabian Grünbichler ---- - Documentation/admin-guide/kernel-parameters.txt | 21 ++- - Documentation/x86/pti.txt | 186 ++++++++++++++++++++++++ - 2 files changed, 200 insertions(+), 7 deletions(-) - create mode 100644 Documentation/x86/pti.txt - -diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index b4d2edf316db..1a6ebc6cdf26 100644 ---- a/Documentation/admin-guide/kernel-parameters.txt -+++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -2677,8 +2677,6 @@ - steal time is computed, but won't influence scheduler - behaviour - -- nopti [X86-64] Disable kernel page table isolation -- - nolapic [X86-32,APIC] Do not enable or use the local APIC. - - nolapic_timer [X86-32,APIC] Do not use the local APIC timer. -@@ -3247,11 +3245,20 @@ - pt. [PARIDE] - See Documentation/blockdev/paride.txt. - -- pti= [X86_64] -- Control user/kernel address space isolation: -- on - enable -- off - disable -- auto - default setting -+ pti= [X86_64] Control Page Table Isolation of user and -+ kernel address spaces. 
Disabling this feature -+ removes hardening, but improves performance of -+ system calls and interrupts. -+ -+ on - unconditionally enable -+ off - unconditionally disable -+ auto - kernel detects whether your CPU model is -+ vulnerable to issues that PTI mitigates -+ -+ Not specifying this option is equivalent to pti=auto. -+ -+ nopti [X86_64] -+ Equivalent to pti=off - - pty.legacy_count= - [KNL] Number of legacy pty's. Overwrites compiled-in -diff --git a/Documentation/x86/pti.txt b/Documentation/x86/pti.txt -new file mode 100644 -index 000000000000..d11eff61fc9a ---- /dev/null -+++ b/Documentation/x86/pti.txt -@@ -0,0 +1,186 @@ -+Overview -+======== -+ -+Page Table Isolation (pti, previously known as KAISER[1]) is a -+countermeasure against attacks on the shared user/kernel address -+space such as the "Meltdown" approach[2]. -+ -+To mitigate this class of attacks, we create an independent set of -+page tables for use only when running userspace applications. When -+the kernel is entered via syscalls, interrupts or exceptions, the -+page tables are switched to the full "kernel" copy. When the system -+switches back to user mode, the user copy is used again. -+ -+The userspace page tables contain only a minimal amount of kernel -+data: only what is needed to enter/exit the kernel such as the -+entry/exit functions themselves and the interrupt descriptor table -+(IDT). There are a few strictly unnecessary things that get mapped -+such as the first C function when entering an interrupt (see -+comments in pti.c). -+ -+This approach helps to ensure that side-channel attacks leveraging -+the paging structures do not function when PTI is enabled. It can be -+enabled by setting CONFIG_PAGE_TABLE_ISOLATION=y at compile time. -+Once enabled at compile-time, it can be disabled at boot with the -+'nopti' or 'pti=' kernel parameters (see kernel-parameters.txt). 
-+ -+Page Table Management -+===================== -+ -+When PTI is enabled, the kernel manages two sets of page tables. -+The first set is very similar to the single set which is present in -+kernels without PTI. This includes a complete mapping of userspace -+that the kernel can use for things like copy_to_user(). -+ -+Although _complete_, the user portion of the kernel page tables is -+crippled by setting the NX bit in the top level. This ensures -+that any missed kernel->user CR3 switch will immediately crash -+userspace upon executing its first instruction. -+ -+The userspace page tables map only the kernel data needed to enter -+and exit the kernel. This data is entirely contained in the 'struct -+cpu_entry_area' structure which is placed in the fixmap which gives -+each CPU's copy of the area a compile-time-fixed virtual address. -+ -+For new userspace mappings, the kernel makes the entries in its -+page tables like normal. The only difference is when the kernel -+makes entries in the top (PGD) level. In addition to setting the -+entry in the main kernel PGD, a copy of the entry is made in the -+userspace page tables' PGD. -+ -+This sharing at the PGD level also inherently shares all the lower -+layers of the page tables. This leaves a single, shared set of -+userspace page tables to manage. One PTE to lock, one set of -+accessed bits, dirty bits, etc... -+ -+Overhead -+======== -+ -+Protection against side-channel attacks is important. But, -+this protection comes at a cost: -+ -+1. Increased Memory Use -+ a. Each process now needs an order-1 PGD instead of order-0. -+ (Consumes an additional 4k per process). -+ b. The 'cpu_entry_area' structure must be 2MB in size and 2MB -+ aligned so that it can be mapped by setting a single PMD -+ entry. This consumes nearly 2MB of RAM once the kernel -+ is decompressed, but no space in the kernel image itself. -+ -+2. Runtime Cost -+ a. 
CR3 manipulation to switch between the page table copies -+ must be done at interrupt, syscall, and exception entry -+ and exit (it can be skipped when the kernel is interrupted, -+ though.) Moves to CR3 are on the order of a hundred -+ cycles, and are required at every entry and exit. -+ b. A "trampoline" must be used for SYSCALL entry. This -+ trampoline depends on a smaller set of resources than the -+ non-PTI SYSCALL entry code, so requires mapping fewer -+ things into the userspace page tables. The downside is -+ that stacks must be switched at entry time. -+ d. Global pages are disabled for all kernel structures not -+ mapped into both kernel and userspace page tables. This -+ feature of the MMU allows different processes to share TLB -+ entries mapping the kernel. Losing the feature means more -+ TLB misses after a context switch. The actual loss of -+ performance is very small, however, never exceeding 1%. -+ d. Process Context IDentifiers (PCID) is a CPU feature that -+ allows us to skip flushing the entire TLB when switching page -+ tables by setting a special bit in CR3 when the page tables -+ are changed. This makes switching the page tables (at context -+ switch, or kernel entry/exit) cheaper. But, on systems with -+ PCID support, the context switch code must flush both the user -+ and kernel entries out of the TLB. The user PCID TLB flush is -+ deferred until the exit to userspace, minimizing the cost. -+ See intel.com/sdm for the gory PCID/INVPCID details. -+ e. The userspace page tables must be populated for each new -+ process. Even without PTI, the shared kernel mappings -+ are created by copying top-level (PGD) entries into each -+ new process. But, with PTI, there are now *two* kernel -+ mappings: one in the kernel page tables that maps everything -+ and one for the entry/exit structures. At fork(), we need to -+ copy both. -+ f. 
In addition to the fork()-time copying, there must also -+ be an update to the userspace PGD any time a set_pgd() is done -+ on a PGD used to map userspace. This ensures that the kernel -+ and userspace copies always map the same userspace -+ memory. -+ g. On systems without PCID support, each CR3 write flushes -+ the entire TLB. That means that each syscall, interrupt -+ or exception flushes the TLB. -+ h. INVPCID is a TLB-flushing instruction which allows flushing -+ of TLB entries for non-current PCIDs. Some systems support -+ PCIDs, but do not support INVPCID. On these systems, addresses -+ can only be flushed from the TLB for the current PCID. When -+ flushing a kernel address, we need to flush all PCIDs, so a -+ single kernel address flush will require a TLB-flushing CR3 -+ write upon the next use of every PCID. -+ -+Possible Future Work -+==================== -+1. We can be more careful about not actually writing to CR3 -+ unless its value is actually changed. -+2. Allow PTI to be enabled/disabled at runtime in addition to the -+ boot-time switching. -+ -+Testing -+======== -+ -+To test stability of PTI, the following test procedure is recommended, -+ideally doing all of these in parallel: -+ -+1. Set CONFIG_DEBUG_ENTRY=y -+2. Run several copies of all of the tools/testing/selftests/x86/ tests -+ (excluding MPX and protection_keys) in a loop on multiple CPUs for -+ several minutes. These tests frequently uncover corner cases in the -+ kernel entry code. In general, old kernels might cause these tests -+ themselves to crash, but they should never crash the kernel. -+3. Run the 'perf' tool in a mode (top or record) that generates many -+ frequent performance monitoring non-maskable interrupts (see "NMI" -+ in /proc/interrupts). This exercises the NMI entry/exit code which -+ is known to trigger bugs in code paths that did not expect to be -+ interrupted, including nested NMIs. 
Using "-c" boosts the rate of -+ NMIs, and using two -c with separate counters encourages nested NMIs -+ and less deterministic behavior. -+ -+ while true; do perf record -c 10000 -e instructions,cycles -a sleep 10; done -+ -+4. Launch a KVM virtual machine. -+5. Run 32-bit binaries on systems supporting the SYSCALL instruction. -+ This has been a lightly-tested code path and needs extra scrutiny. -+ -+Debugging -+========= -+ -+Bugs in PTI cause a few different signatures of crashes -+that are worth noting here. -+ -+ * Failures of the selftests/x86 code. Usually a bug in one of the -+ more obscure corners of entry_64.S -+ * Crashes in early boot, especially around CPU bringup. Bugs -+ in the trampoline code or mappings cause these. -+ * Crashes at the first interrupt. Caused by bugs in entry_64.S, -+ like screwing up a page table switch. Also caused by -+ incorrectly mapping the IRQ handler entry code. -+ * Crashes at the first NMI. The NMI code is separate from main -+ interrupt handlers and can have bugs that do not affect -+ normal interrupts. Also caused by incorrectly mapping NMI -+ code. NMIs that interrupt the entry code must be very -+ careful and can be the cause of crashes that show up when -+ running perf. -+ * Kernel crashes at the first exit to userspace. entry_64.S -+ bugs, or failing to map some of the exit code. -+ * Crashes at first interrupt that interrupts userspace. The paths -+ in entry_64.S that return to userspace are sometimes separate -+ from the ones that return to the kernel. -+ * Double faults: overflowing the kernel stack because of page -+ faults upon page faults. Caused by touching non-pti-mapped -+ data in the entry code, or forgetting to switch to kernel -+ CR3 before calling into C functions which are not pti-mapped. -+ * Userspace segfaults early in boot, sometimes manifesting -+ as mount(8) failing to mount the rootfs. These have -+ tended to be TLB invalidation issues. 
Usually invalidating -+ the wrong PCID, or otherwise missing an invalidation. -+ -+1. https://gruss.cc/files/kaiser.pdf -+2. https://meltdownattack.com/meltdown.pdf --- -2.14.2 - diff --git a/patches/kernel/0251-x86-pti-Unbreak-EFI-old_memmap.patch b/patches/kernel/0251-x86-pti-Unbreak-EFI-old_memmap.patch new file mode 100644 index 0000000..0aca0ec --- /dev/null +++ b/patches/kernel/0251-x86-pti-Unbreak-EFI-old_memmap.patch @@ -0,0 +1,61 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Jiri Kosina +Date: Fri, 5 Jan 2018 22:35:41 +0100 +Subject: [PATCH] x86/pti: Unbreak EFI old_memmap +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +EFI_OLD_MEMMAP's efi_call_phys_prolog() calls set_pgd() with swapper PGD that +has PAGE_USER set, which makes PTI set NX on it, and therefore EFI can't +execute its code. + +Fix that by forcefully clearing _PAGE_NX from the PGD (this can't be done +by the pgprot API). + +_PAGE_NX will be automatically reintroduced in efi_call_phys_epilog(), as +_set_pgd() will again notice that this is _PAGE_USER, and set _PAGE_NX on +it.
+ +Tested-by: Dimitri Sivanich +Signed-off-by: Jiri Kosina +Signed-off-by: Ingo Molnar +Signed-off-by: Thomas Gleixner +Acked-by: Dave Hansen +Cc: Andrea Arcangeli +Cc: Ard Biesheuvel +Cc: Linus Torvalds +Cc: Matt Fleming +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: linux-efi@vger.kernel.org +Cc: stable@vger.kernel.org +Link: http://lkml.kernel.org/r/nycvar.YFH.7.76.1801052215460.11852@cbobk.fhfr.pm +(cherry picked from commit de53c3786a3ce162a1c815d0c04c766c23ec9c0a) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 31afacd8089f54061e718e5d491f11747755c503) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/platform/efi/efi_64.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c +index b104224d3d6c..987a38e82f73 100644 +--- a/arch/x86/platform/efi/efi_64.c ++++ b/arch/x86/platform/efi/efi_64.c +@@ -133,7 +133,9 @@ pgd_t * __init efi_call_phys_prolog(void) + pud[j] = *pud_offset(p4d_k, vaddr); + } + } ++ pgd_offset_k(pgd * PGDIR_SIZE)->pgd &= ~_PAGE_NX; + } ++ + out: + __flush_tlb_all(); + +-- +2.14.2 + diff --git a/patches/kernel/0252-x86-Documentation-Add-PTI-description.patch b/patches/kernel/0252-x86-Documentation-Add-PTI-description.patch new file mode 100644 index 0000000..6a38eaa --- /dev/null +++ b/patches/kernel/0252-x86-Documentation-Add-PTI-description.patch @@ -0,0 +1,275 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Dave Hansen +Date: Fri, 5 Jan 2018 09:44:36 -0800 +Subject: [PATCH] x86/Documentation: Add PTI description +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Add some details about how PTI works, what some of the downsides +are, and how to debug it when things go wrong. + +Also document the kernel parameter: 'pti/nopti'. 
+ +Signed-off-by: Dave Hansen +Signed-off-by: Thomas Gleixner +Reviewed-by: Randy Dunlap +Reviewed-by: Kees Cook +Cc: Moritz Lipp +Cc: Daniel Gruss +Cc: Michael Schwarz +Cc: Richard Fellner +Cc: Andy Lutomirski +Cc: Linus Torvalds +Cc: Hugh Dickins +Cc: Andi Lutomirsky +Cc: stable@vger.kernel.org +Link: https://lkml.kernel.org/r/20180105174436.1BC6FA2B@viggo.jf.intel.com + +(cherry picked from commit 01c9b17bf673b05bb401b76ec763e9730ccf1376) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 1acf87c45b0170e717fc1b06a2d6fef47e07f79b) +Signed-off-by: Fabian Grünbichler +--- + Documentation/admin-guide/kernel-parameters.txt | 21 ++- + Documentation/x86/pti.txt | 186 ++++++++++++++++++++++++ + 2 files changed, 200 insertions(+), 7 deletions(-) + create mode 100644 Documentation/x86/pti.txt + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index b4d2edf316db..1a6ebc6cdf26 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -2677,8 +2677,6 @@ + steal time is computed, but won't influence scheduler + behaviour + +- nopti [X86-64] Disable kernel page table isolation +- + nolapic [X86-32,APIC] Do not enable or use the local APIC. + + nolapic_timer [X86-32,APIC] Do not use the local APIC timer. +@@ -3247,11 +3245,20 @@ + pt. [PARIDE] + See Documentation/blockdev/paride.txt. + +- pti= [X86_64] +- Control user/kernel address space isolation: +- on - enable +- off - disable +- auto - default setting ++ pti= [X86_64] Control Page Table Isolation of user and ++ kernel address spaces. Disabling this feature ++ removes hardening, but improves performance of ++ system calls and interrupts. 
++ ++ on - unconditionally enable ++ off - unconditionally disable ++ auto - kernel detects whether your CPU model is ++ vulnerable to issues that PTI mitigates ++ ++ Not specifying this option is equivalent to pti=auto. ++ ++ nopti [X86_64] ++ Equivalent to pti=off + + pty.legacy_count= + [KNL] Number of legacy pty's. Overwrites compiled-in +diff --git a/Documentation/x86/pti.txt b/Documentation/x86/pti.txt +new file mode 100644 +index 000000000000..d11eff61fc9a +--- /dev/null ++++ b/Documentation/x86/pti.txt +@@ -0,0 +1,186 @@ ++Overview ++======== ++ ++Page Table Isolation (pti, previously known as KAISER[1]) is a ++countermeasure against attacks on the shared user/kernel address ++space such as the "Meltdown" approach[2]. ++ ++To mitigate this class of attacks, we create an independent set of ++page tables for use only when running userspace applications. When ++the kernel is entered via syscalls, interrupts or exceptions, the ++page tables are switched to the full "kernel" copy. When the system ++switches back to user mode, the user copy is used again. ++ ++The userspace page tables contain only a minimal amount of kernel ++data: only what is needed to enter/exit the kernel such as the ++entry/exit functions themselves and the interrupt descriptor table ++(IDT). There are a few strictly unnecessary things that get mapped ++such as the first C function when entering an interrupt (see ++comments in pti.c). ++ ++This approach helps to ensure that side-channel attacks leveraging ++the paging structures do not function when PTI is enabled. It can be ++enabled by setting CONFIG_PAGE_TABLE_ISOLATION=y at compile time. ++Once enabled at compile-time, it can be disabled at boot with the ++'nopti' or 'pti=' kernel parameters (see kernel-parameters.txt). ++ ++Page Table Management ++===================== ++ ++When PTI is enabled, the kernel manages two sets of page tables. ++The first set is very similar to the single set which is present in ++kernels without PTI. 
This includes a complete mapping of userspace ++that the kernel can use for things like copy_to_user(). ++ ++Although _complete_, the user portion of the kernel page tables is ++crippled by setting the NX bit in the top level. This ensures ++that any missed kernel->user CR3 switch will immediately crash ++userspace upon executing its first instruction. ++ ++The userspace page tables map only the kernel data needed to enter ++and exit the kernel. This data is entirely contained in the 'struct ++cpu_entry_area' structure which is placed in the fixmap which gives ++each CPU's copy of the area a compile-time-fixed virtual address. ++ ++For new userspace mappings, the kernel makes the entries in its ++page tables like normal. The only difference is when the kernel ++makes entries in the top (PGD) level. In addition to setting the ++entry in the main kernel PGD, a copy of the entry is made in the ++userspace page tables' PGD. ++ ++This sharing at the PGD level also inherently shares all the lower ++layers of the page tables. This leaves a single, shared set of ++userspace page tables to manage. One PTE to lock, one set of ++accessed bits, dirty bits, etc... ++ ++Overhead ++======== ++ ++Protection against side-channel attacks is important. But, ++this protection comes at a cost: ++ ++1. Increased Memory Use ++ a. Each process now needs an order-1 PGD instead of order-0. ++ (Consumes an additional 4k per process). ++ b. The 'cpu_entry_area' structure must be 2MB in size and 2MB ++ aligned so that it can be mapped by setting a single PMD ++ entry. This consumes nearly 2MB of RAM once the kernel ++ is decompressed, but no space in the kernel image itself. ++ ++2. Runtime Cost ++ a. CR3 manipulation to switch between the page table copies ++ must be done at interrupt, syscall, and exception entry ++ and exit (it can be skipped when the kernel is interrupted, ++ though.) Moves to CR3 are on the order of a hundred ++ cycles, and are required at every entry and exit. ++ b. 
A "trampoline" must be used for SYSCALL entry. This ++ trampoline depends on a smaller set of resources than the ++ non-PTI SYSCALL entry code, so requires mapping fewer ++ things into the userspace page tables. The downside is ++ that stacks must be switched at entry time. ++ c. Global pages are disabled for all kernel structures not ++ mapped into both kernel and userspace page tables. This ++ feature of the MMU allows different processes to share TLB ++ entries mapping the kernel. Losing the feature means more ++ TLB misses after a context switch. The actual loss of ++ performance is very small, however, never exceeding 1%. ++ d. Process Context IDentifiers (PCID) is a CPU feature that ++ allows us to skip flushing the entire TLB when switching page ++ tables by setting a special bit in CR3 when the page tables ++ are changed. This makes switching the page tables (at context ++ switch, or kernel entry/exit) cheaper. But, on systems with ++ PCID support, the context switch code must flush both the user ++ and kernel entries out of the TLB. The user PCID TLB flush is ++ deferred until the exit to userspace, minimizing the cost. ++ See intel.com/sdm for the gory PCID/INVPCID details. ++ e. The userspace page tables must be populated for each new ++ process. Even without PTI, the shared kernel mappings ++ are created by copying top-level (PGD) entries into each ++ new process. But, with PTI, there are now *two* kernel ++ mappings: one in the kernel page tables that maps everything ++ and one for the entry/exit structures. At fork(), we need to ++ copy both. ++ f. In addition to the fork()-time copying, there must also ++ be an update to the userspace PGD any time a set_pgd() is done ++ on a PGD used to map userspace. This ensures that the kernel ++ and userspace copies always map the same userspace ++ memory. ++ g. On systems without PCID support, each CR3 write flushes ++ the entire TLB. That means that each syscall, interrupt ++ or exception flushes the TLB. ++ h.
INVPCID is a TLB-flushing instruction which allows flushing ++ of TLB entries for non-current PCIDs. Some systems support ++ PCIDs, but do not support INVPCID. On these systems, addresses ++ can only be flushed from the TLB for the current PCID. When ++ flushing a kernel address, we need to flush all PCIDs, so a ++ single kernel address flush will require a TLB-flushing CR3 ++ write upon the next use of every PCID. ++ ++Possible Future Work ++==================== ++1. We can be more careful about not actually writing to CR3 ++ unless its value is actually changed. ++2. Allow PTI to be enabled/disabled at runtime in addition to the ++ boot-time switching. ++ ++Testing ++======== ++ ++To test stability of PTI, the following test procedure is recommended, ++ideally doing all of these in parallel: ++ ++1. Set CONFIG_DEBUG_ENTRY=y ++2. Run several copies of all of the tools/testing/selftests/x86/ tests ++ (excluding MPX and protection_keys) in a loop on multiple CPUs for ++ several minutes. These tests frequently uncover corner cases in the ++ kernel entry code. In general, old kernels might cause these tests ++ themselves to crash, but they should never crash the kernel. ++3. Run the 'perf' tool in a mode (top or record) that generates many ++ frequent performance monitoring non-maskable interrupts (see "NMI" ++ in /proc/interrupts). This exercises the NMI entry/exit code which ++ is known to trigger bugs in code paths that did not expect to be ++ interrupted, including nested NMIs. Using "-c" boosts the rate of ++ NMIs, and using two -c with separate counters encourages nested NMIs ++ and less deterministic behavior. ++ ++ while true; do perf record -c 10000 -e instructions,cycles -a sleep 10; done ++ ++4. Launch a KVM virtual machine. ++5. Run 32-bit binaries on systems supporting the SYSCALL instruction. ++ This has been a lightly-tested code path and needs extra scrutiny. 
++ ++Debugging ++========= ++ ++Bugs in PTI cause a few different signatures of crashes ++that are worth noting here. ++ ++ * Failures of the selftests/x86 code. Usually a bug in one of the ++ more obscure corners of entry_64.S ++ * Crashes in early boot, especially around CPU bringup. Bugs ++ in the trampoline code or mappings cause these. ++ * Crashes at the first interrupt. Caused by bugs in entry_64.S, ++ like screwing up a page table switch. Also caused by ++ incorrectly mapping the IRQ handler entry code. ++ * Crashes at the first NMI. The NMI code is separate from main ++ interrupt handlers and can have bugs that do not affect ++ normal interrupts. Also caused by incorrectly mapping NMI ++ code. NMIs that interrupt the entry code must be very ++ careful and can be the cause of crashes that show up when ++ running perf. ++ * Kernel crashes at the first exit to userspace. entry_64.S ++ bugs, or failing to map some of the exit code. ++ * Crashes at first interrupt that interrupts userspace. The paths ++ in entry_64.S that return to userspace are sometimes separate ++ from the ones that return to the kernel. ++ * Double faults: overflowing the kernel stack because of page ++ faults upon page faults. Caused by touching non-pti-mapped ++ data in the entry code, or forgetting to switch to kernel ++ CR3 before calling into C functions which are not pti-mapped. ++ * Userspace segfaults early in boot, sometimes manifesting ++ as mount(8) failing to mount the rootfs. These have ++ tended to be TLB invalidation issues. Usually invalidating ++ the wrong PCID, or otherwise missing an invalidation. ++ ++1. https://gruss.cc/files/kaiser.pdf ++2. 
https://meltdownattack.com/meltdown.pdf +-- +2.14.2 + diff --git a/patches/kernel/0252-x86-cpufeatures-Add-X86_BUG_SPECTRE_V-12.patch b/patches/kernel/0252-x86-cpufeatures-Add-X86_BUG_SPECTRE_V-12.patch deleted file mode 100644 index 0e9d104..0000000 --- a/patches/kernel/0252-x86-cpufeatures-Add-X86_BUG_SPECTRE_V-12.patch +++ /dev/null @@ -1,68 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: David Woodhouse -Date: Sat, 6 Jan 2018 11:49:23 +0000 -Subject: [PATCH] x86/cpufeatures: Add X86_BUG_SPECTRE_V[12] -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -Add the bug bits for spectre v1/2 and force them unconditionally for all -cpus. - -Signed-off-by: David Woodhouse -Signed-off-by: Thomas Gleixner -Cc: gnomes@lxorguk.ukuu.org.uk -Cc: Rik van Riel -Cc: Andi Kleen -Cc: Peter Zijlstra -Cc: Linus Torvalds -Cc: Jiri Kosina -Cc: Andy Lutomirski -Cc: Dave Hansen -Cc: Kees Cook -Cc: Tim Chen -Cc: Greg Kroah-Hartman -Cc: Paul Turner -Cc: stable@vger.kernel.org -Link: https://lkml.kernel.org/r/1515239374-23361-2-git-send-email-dwmw@amazon.co.uk -(cherry picked from commit 99c6fa2511d8a683e61468be91b83f85452115fa) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit de861dbf4587b9dac9a1978e6349199755e8c1b1) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/cpufeatures.h | 2 ++ - arch/x86/kernel/cpu/common.c | 3 +++ - 2 files changed, 5 insertions(+) - -diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h -index b7900d26066c..3928050b51b0 100644 ---- a/arch/x86/include/asm/cpufeatures.h -+++ b/arch/x86/include/asm/cpufeatures.h -@@ -341,5 +341,7 @@ - #define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ - #define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ - #define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by meltdown 
attack and needs kernel page table isolation */ -+#define X86_BUG_SPECTRE_V1 X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */ -+#define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */ - - #endif /* _ASM_X86_CPUFEATURES_H */ -diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c -index 142ab555dafa..01abbf69d522 100644 ---- a/arch/x86/kernel/cpu/common.c -+++ b/arch/x86/kernel/cpu/common.c -@@ -902,6 +902,9 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) - if (c->x86_vendor != X86_VENDOR_AMD) - setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); - -+ setup_force_cpu_bug(X86_BUG_SPECTRE_V1); -+ setup_force_cpu_bug(X86_BUG_SPECTRE_V2); -+ - fpu__init_system(c); - } - --- -2.14.2 - diff --git a/patches/kernel/0253-x86-cpufeatures-Add-X86_BUG_SPECTRE_V-12.patch b/patches/kernel/0253-x86-cpufeatures-Add-X86_BUG_SPECTRE_V-12.patch new file mode 100644 index 0000000..0e9d104 --- /dev/null +++ b/patches/kernel/0253-x86-cpufeatures-Add-X86_BUG_SPECTRE_V-12.patch @@ -0,0 +1,68 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: David Woodhouse +Date: Sat, 6 Jan 2018 11:49:23 +0000 +Subject: [PATCH] x86/cpufeatures: Add X86_BUG_SPECTRE_V[12] +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +Add the bug bits for spectre v1/2 and force them unconditionally for all +cpus. 
+ +Signed-off-by: David Woodhouse +Signed-off-by: Thomas Gleixner +Cc: gnomes@lxorguk.ukuu.org.uk +Cc: Rik van Riel +Cc: Andi Kleen +Cc: Peter Zijlstra +Cc: Linus Torvalds +Cc: Jiri Kosina +Cc: Andy Lutomirski +Cc: Dave Hansen +Cc: Kees Cook +Cc: Tim Chen +Cc: Greg Kroah-Hartman +Cc: Paul Turner +Cc: stable@vger.kernel.org +Link: https://lkml.kernel.org/r/1515239374-23361-2-git-send-email-dwmw@amazon.co.uk +(cherry picked from commit 99c6fa2511d8a683e61468be91b83f85452115fa) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit de861dbf4587b9dac9a1978e6349199755e8c1b1) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/cpufeatures.h | 2 ++ + arch/x86/kernel/cpu/common.c | 3 +++ + 2 files changed, 5 insertions(+) + +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index b7900d26066c..3928050b51b0 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -341,5 +341,7 @@ + #define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ + #define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ + #define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */ ++#define X86_BUG_SPECTRE_V1 X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */ ++#define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */ + + #endif /* _ASM_X86_CPUFEATURES_H */ +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 142ab555dafa..01abbf69d522 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -902,6 +902,9 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) + if (c->x86_vendor != X86_VENDOR_AMD) + setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); + ++ setup_force_cpu_bug(X86_BUG_SPECTRE_V1); ++ 
setup_force_cpu_bug(X86_BUG_SPECTRE_V2); ++ + fpu__init_system(c); + } + +-- +2.14.2 + diff --git a/patches/kernel/0253-x86-tboot-Unbreak-tboot-with-PTI-enabled.patch b/patches/kernel/0253-x86-tboot-Unbreak-tboot-with-PTI-enabled.patch deleted file mode 100644 index aa0048e..0000000 --- a/patches/kernel/0253-x86-tboot-Unbreak-tboot-with-PTI-enabled.patch +++ /dev/null @@ -1,58 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Dave Hansen -Date: Sat, 6 Jan 2018 18:41:14 +0100 -Subject: [PATCH] x86/tboot: Unbreak tboot with PTI enabled -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -This is another case similar to what EFI does: create a new set of -page tables, map some code at a low address, and jump to it. PTI -mistakes this low address for userspace and mistakenly marks it -non-executable in an effort to make it unusable for userspace. - -Undo the poison to allow execution. - -Fixes: 385ce0ea4c07 ("x86/mm/pti: Add Kconfig") -Signed-off-by: Dave Hansen -Signed-off-by: Andrea Arcangeli -Signed-off-by: Thomas Gleixner -Cc: Alan Cox -Cc: Tim Chen -Cc: Jon Masters -Cc: Dave Hansen -Cc: Andi Kleen -Cc: Jeff Law -Cc: Paolo Bonzini -Cc: Linus Torvalds -Cc: Greg Kroah-Hartman -Cc: David" -Cc: Nick Clifton -Cc: stable@vger.kernel.org -Link: https://lkml.kernel.org/r/20180108102805.GK25546@redhat.com -(cherry picked from commit 262b6b30087246abf09d6275eb0c0dc421bcbe38) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit f03e9108405491791f0b883a2d95e2620ddfce64) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/tboot.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c -index a4eb27918ceb..75869a4b6c41 100644 ---- a/arch/x86/kernel/tboot.c -+++ b/arch/x86/kernel/tboot.c -@@ -127,6 +127,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, - p4d = 
p4d_alloc(&tboot_mm, pgd, vaddr); - if (!p4d) - return -1; -+ pgd->pgd &= ~_PAGE_NX; - pud = pud_alloc(&tboot_mm, p4d, vaddr); - if (!pud) - return -1; --- -2.14.2 - diff --git a/patches/kernel/0254-x86-mm-pti-Remove-dead-logic-in-pti_user_pagetable_w.patch b/patches/kernel/0254-x86-mm-pti-Remove-dead-logic-in-pti_user_pagetable_w.patch deleted file mode 100644 index 8056750..0000000 --- a/patches/kernel/0254-x86-mm-pti-Remove-dead-logic-in-pti_user_pagetable_w.patch +++ /dev/null @@ -1,151 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Jike Song -Date: Tue, 9 Jan 2018 00:03:41 +0800 -Subject: [PATCH] x86/mm/pti: Remove dead logic in pti_user_pagetable_walk*() -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -The following code contains dead logic: - - 162 if (pgd_none(*pgd)) { - 163 unsigned long new_p4d_page = __get_free_page(gfp); - 164 if (!new_p4d_page) - 165 return NULL; - 166 - 167 if (pgd_none(*pgd)) { - 168 set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); - 169 new_p4d_page = 0; - 170 } - 171 if (new_p4d_page) - 172 free_page(new_p4d_page); - 173 } - -There can't be any difference between two pgd_none(*pgd) at L162 and L167, -so it's always false at L171. - -Dave Hansen explained: - - Yes, the double-test was part of an optimization where we attempted to - avoid using a global spinlock in the fork() path. We would check for - unallocated mid-level page tables without the lock. The lock was only - taken when we needed to *make* an entry to avoid collisions. - - Now that it is all single-threaded, there is no chance of a collision, - no need for a lock, and no need for the re-check. - -As all these functions are only called during init, mark them __init as -well. 
- -Fixes: 03f4424f348e ("x86/mm/pti: Add functions to clone kernel PMDs") -Signed-off-by: Jike Song -Signed-off-by: Thomas Gleixner -Cc: Alan Cox -Cc: Andi Kleen -Cc: Tom Lendacky -Cc: Peter Zijlstra -Cc: Tim Chen -Cc: Jiri Koshina -Cc: Dave Hansen -Cc: Borislav Petkov -Cc: Kees Cook -Cc: Andi Lutomirski -Cc: Linus Torvalds -Cc: Greg KH -Cc: David Woodhouse -Cc: Paul Turner -Cc: stable@vger.kernel.org -Link: https://lkml.kernel.org/r/20180108160341.3461-1-albcamus@gmail.com - -(cherry picked from commit 8d56eff266f3e41a6c39926269c4c3f58f881a8e) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit de8ab6bea570e70d1478af2c1667714bc900ae70) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/mm/pti.c | 32 ++++++-------------------------- - 1 file changed, 6 insertions(+), 26 deletions(-) - -diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c -index 43d4a4a29037..ce38f165489b 100644 ---- a/arch/x86/mm/pti.c -+++ b/arch/x86/mm/pti.c -@@ -149,7 +149,7 @@ pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) - * - * Returns a pointer to a P4D on success, or NULL on failure. - */ --static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) -+static __init p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) - { - pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address)); - gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); -@@ -164,12 +164,7 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) - if (!new_p4d_page) - return NULL; - -- if (pgd_none(*pgd)) { -- set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); -- new_p4d_page = 0; -- } -- if (new_p4d_page) -- free_page(new_p4d_page); -+ set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); - } - BUILD_BUG_ON(pgd_large(*pgd) != 0); - -@@ -182,7 +177,7 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) - * - * Returns a pointer to a PMD on success, or NULL on failure. 
- */ --static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) -+static __init pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) - { - gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); - p4d_t *p4d = pti_user_pagetable_walk_p4d(address); -@@ -194,12 +189,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) - if (!new_pud_page) - return NULL; - -- if (p4d_none(*p4d)) { -- set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page))); -- new_pud_page = 0; -- } -- if (new_pud_page) -- free_page(new_pud_page); -+ set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page))); - } - - pud = pud_offset(p4d, address); -@@ -213,12 +203,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) - if (!new_pmd_page) - return NULL; - -- if (pud_none(*pud)) { -- set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); -- new_pmd_page = 0; -- } -- if (new_pmd_page) -- free_page(new_pmd_page); -+ set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); - } - - return pmd_offset(pud, address); -@@ -251,12 +236,7 @@ static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address) - if (!new_pte_page) - return NULL; - -- if (pmd_none(*pmd)) { -- set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); -- new_pte_page = 0; -- } -- if (new_pte_page) -- free_page(new_pte_page); -+ set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); - } - - pte = pte_offset_kernel(pmd, address); --- -2.14.2 - diff --git a/patches/kernel/0254-x86-tboot-Unbreak-tboot-with-PTI-enabled.patch b/patches/kernel/0254-x86-tboot-Unbreak-tboot-with-PTI-enabled.patch new file mode 100644 index 0000000..aa0048e --- /dev/null +++ b/patches/kernel/0254-x86-tboot-Unbreak-tboot-with-PTI-enabled.patch @@ -0,0 +1,58 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Dave Hansen +Date: Sat, 6 Jan 2018 18:41:14 +0100 +Subject: [PATCH] x86/tboot: Unbreak tboot with PTI enabled +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 
+Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +This is another case similar to what EFI does: create a new set of +page tables, map some code at a low address, and jump to it. PTI +mistakes this low address for userspace and mistakenly marks it +non-executable in an effort to make it unusable for userspace. + +Undo the poison to allow execution. + +Fixes: 385ce0ea4c07 ("x86/mm/pti: Add Kconfig") +Signed-off-by: Dave Hansen +Signed-off-by: Andrea Arcangeli +Signed-off-by: Thomas Gleixner +Cc: Alan Cox +Cc: Tim Chen +Cc: Jon Masters +Cc: Dave Hansen +Cc: Andi Kleen +Cc: Jeff Law +Cc: Paolo Bonzini +Cc: Linus Torvalds +Cc: Greg Kroah-Hartman +Cc: David" +Cc: Nick Clifton +Cc: stable@vger.kernel.org +Link: https://lkml.kernel.org/r/20180108102805.GK25546@redhat.com +(cherry picked from commit 262b6b30087246abf09d6275eb0c0dc421bcbe38) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit f03e9108405491791f0b883a2d95e2620ddfce64) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/tboot.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c +index a4eb27918ceb..75869a4b6c41 100644 +--- a/arch/x86/kernel/tboot.c ++++ b/arch/x86/kernel/tboot.c +@@ -127,6 +127,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, + p4d = p4d_alloc(&tboot_mm, pgd, vaddr); + if (!p4d) + return -1; ++ pgd->pgd &= ~_PAGE_NX; + pud = pud_alloc(&tboot_mm, p4d, vaddr); + if (!pud) + return -1; +-- +2.14.2 + diff --git a/patches/kernel/0255-x86-cpu-AMD-Make-LFENCE-a-serializing-instruction.patch b/patches/kernel/0255-x86-cpu-AMD-Make-LFENCE-a-serializing-instruction.patch deleted file mode 100644 index 65d8af4..0000000 --- a/patches/kernel/0255-x86-cpu-AMD-Make-LFENCE-a-serializing-instruction.patch +++ /dev/null @@ -1,77 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Tom Lendacky -Date: Mon, 8 Jan 2018 16:09:21 -0600 -Subject: 
[PATCH] x86/cpu/AMD: Make LFENCE a serializing instruction -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -To aid in speculation control, make LFENCE a serializing instruction -since it has less overhead than MFENCE. This is done by setting bit 1 -of MSR 0xc0011029 (DE_CFG). Some families that support LFENCE do not -have this MSR. For these families, the LFENCE instruction is already -serializing. - -Signed-off-by: Tom Lendacky -Signed-off-by: Thomas Gleixner -Reviewed-by: Reviewed-by: Borislav Petkov -Cc: Peter Zijlstra -Cc: Tim Chen -Cc: Dave Hansen -Cc: Borislav Petkov -Cc: Dan Williams -Cc: Linus Torvalds -Cc: Greg Kroah-Hartman -Cc: David Woodhouse -Cc: Paul Turner -Link: https://lkml.kernel.org/r/20180108220921.12580.71694.stgit@tlendack-t1.amdoffice.net - -(cherry picked from commit e4d0e84e490790798691aaa0f2e598637f1867ec) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit bde943193168fe9a3814badaa0cae3422029dce5) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/msr-index.h | 2 ++ - arch/x86/kernel/cpu/amd.c | 10 ++++++++++ - 2 files changed, 12 insertions(+) - -diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h -index 5573c75f8e4c..25147df4acfc 100644 ---- a/arch/x86/include/asm/msr-index.h -+++ b/arch/x86/include/asm/msr-index.h -@@ -351,6 +351,8 @@ - #define FAM10H_MMIO_CONF_BASE_MASK 0xfffffffULL - #define FAM10H_MMIO_CONF_BASE_SHIFT 20 - #define MSR_FAM10H_NODE_ID 0xc001100c -+#define MSR_F10H_DECFG 0xc0011029 -+#define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT 1 - - /* K8 MSRs */ - #define MSR_K8_TOP_MEM1 0xc001001a -diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c -index 2a5328cc03a6..c9a4e4db7860 100644 ---- a/arch/x86/kernel/cpu/amd.c -+++ b/arch/x86/kernel/cpu/amd.c -@@ -785,6 +785,16 @@ static void init_amd(struct cpuinfo_x86 *c) - set_cpu_cap(c, X86_FEATURE_K8); - - if 
(cpu_has(c, X86_FEATURE_XMM2)) { -+ /* -+ * A serializing LFENCE has less overhead than MFENCE, so -+ * use it for execution serialization. On families which -+ * don't have that MSR, LFENCE is already serializing. -+ * msr_set_bit() uses the safe accessors, too, even if the MSR -+ * is not present. -+ */ -+ msr_set_bit(MSR_F10H_DECFG, -+ MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); -+ - /* MFENCE stops RDTSC speculation */ - set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); - } --- -2.14.2 - diff --git a/patches/kernel/0255-x86-mm-pti-Remove-dead-logic-in-pti_user_pagetable_w.patch b/patches/kernel/0255-x86-mm-pti-Remove-dead-logic-in-pti_user_pagetable_w.patch new file mode 100644 index 0000000..8056750 --- /dev/null +++ b/patches/kernel/0255-x86-mm-pti-Remove-dead-logic-in-pti_user_pagetable_w.patch @@ -0,0 +1,151 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Jike Song +Date: Tue, 9 Jan 2018 00:03:41 +0800 +Subject: [PATCH] x86/mm/pti: Remove dead logic in pti_user_pagetable_walk*() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +The following code contains dead logic: + + 162 if (pgd_none(*pgd)) { + 163 unsigned long new_p4d_page = __get_free_page(gfp); + 164 if (!new_p4d_page) + 165 return NULL; + 166 + 167 if (pgd_none(*pgd)) { + 168 set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); + 169 new_p4d_page = 0; + 170 } + 171 if (new_p4d_page) + 172 free_page(new_p4d_page); + 173 } + +There can't be any difference between two pgd_none(*pgd) at L162 and L167, +so it's always false at L171. + +Dave Hansen explained: + + Yes, the double-test was part of an optimization where we attempted to + avoid using a global spinlock in the fork() path. We would check for + unallocated mid-level page tables without the lock. The lock was only + taken when we needed to *make* an entry to avoid collisions. 
+ + Now that it is all single-threaded, there is no chance of a collision, + no need for a lock, and no need for the re-check. + +As all these functions are only called during init, mark them __init as +well. + +Fixes: 03f4424f348e ("x86/mm/pti: Add functions to clone kernel PMDs") +Signed-off-by: Jike Song +Signed-off-by: Thomas Gleixner +Cc: Alan Cox +Cc: Andi Kleen +Cc: Tom Lendacky +Cc: Peter Zijlstra +Cc: Tim Chen +Cc: Jiri Koshina +Cc: Dave Hansen +Cc: Borislav Petkov +Cc: Kees Cook +Cc: Andi Lutomirski +Cc: Linus Torvalds +Cc: Greg KH +Cc: David Woodhouse +Cc: Paul Turner +Cc: stable@vger.kernel.org +Link: https://lkml.kernel.org/r/20180108160341.3461-1-albcamus@gmail.com + +(cherry picked from commit 8d56eff266f3e41a6c39926269c4c3f58f881a8e) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit de8ab6bea570e70d1478af2c1667714bc900ae70) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/mm/pti.c | 32 ++++++-------------------------- + 1 file changed, 6 insertions(+), 26 deletions(-) + +diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c +index 43d4a4a29037..ce38f165489b 100644 +--- a/arch/x86/mm/pti.c ++++ b/arch/x86/mm/pti.c +@@ -149,7 +149,7 @@ pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) + * + * Returns a pointer to a P4D on success, or NULL on failure. 
+ */ +-static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) ++static __init p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) + { + pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address)); + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); +@@ -164,12 +164,7 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) + if (!new_p4d_page) + return NULL; + +- if (pgd_none(*pgd)) { +- set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); +- new_p4d_page = 0; +- } +- if (new_p4d_page) +- free_page(new_p4d_page); ++ set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); + } + BUILD_BUG_ON(pgd_large(*pgd) != 0); + +@@ -182,7 +177,7 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) + * + * Returns a pointer to a PMD on success, or NULL on failure. + */ +-static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) ++static __init pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) + { + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); + p4d_t *p4d = pti_user_pagetable_walk_p4d(address); +@@ -194,12 +189,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) + if (!new_pud_page) + return NULL; + +- if (p4d_none(*p4d)) { +- set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page))); +- new_pud_page = 0; +- } +- if (new_pud_page) +- free_page(new_pud_page); ++ set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page))); + } + + pud = pud_offset(p4d, address); +@@ -213,12 +203,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) + if (!new_pmd_page) + return NULL; + +- if (pud_none(*pud)) { +- set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); +- new_pmd_page = 0; +- } +- if (new_pmd_page) +- free_page(new_pmd_page); ++ set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); + } + + return pmd_offset(pud, address); +@@ -251,12 +236,7 @@ static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address) + if (!new_pte_page) + return NULL; + +- if (pmd_none(*pmd)) { +- 
set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); +- new_pte_page = 0; +- } +- if (new_pte_page) +- free_page(new_pte_page); ++ set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); + } + + pte = pte_offset_kernel(pmd, address); +-- +2.14.2 + diff --git a/patches/kernel/0256-x86-cpu-AMD-Make-LFENCE-a-serializing-instruction.patch b/patches/kernel/0256-x86-cpu-AMD-Make-LFENCE-a-serializing-instruction.patch new file mode 100644 index 0000000..65d8af4 --- /dev/null +++ b/patches/kernel/0256-x86-cpu-AMD-Make-LFENCE-a-serializing-instruction.patch @@ -0,0 +1,77 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Tom Lendacky +Date: Mon, 8 Jan 2018 16:09:21 -0600 +Subject: [PATCH] x86/cpu/AMD: Make LFENCE a serializing instruction +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +To aid in speculation control, make LFENCE a serializing instruction +since it has less overhead than MFENCE. This is done by setting bit 1 +of MSR 0xc0011029 (DE_CFG). Some families that support LFENCE do not +have this MSR. For these families, the LFENCE instruction is already +serializing. 
+ +Signed-off-by: Tom Lendacky +Signed-off-by: Thomas Gleixner +Reviewed-by: Reviewed-by: Borislav Petkov +Cc: Peter Zijlstra +Cc: Tim Chen +Cc: Dave Hansen +Cc: Borislav Petkov +Cc: Dan Williams +Cc: Linus Torvalds +Cc: Greg Kroah-Hartman +Cc: David Woodhouse +Cc: Paul Turner +Link: https://lkml.kernel.org/r/20180108220921.12580.71694.stgit@tlendack-t1.amdoffice.net + +(cherry picked from commit e4d0e84e490790798691aaa0f2e598637f1867ec) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit bde943193168fe9a3814badaa0cae3422029dce5) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/msr-index.h | 2 ++ + arch/x86/kernel/cpu/amd.c | 10 ++++++++++ + 2 files changed, 12 insertions(+) + +diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h +index 5573c75f8e4c..25147df4acfc 100644 +--- a/arch/x86/include/asm/msr-index.h ++++ b/arch/x86/include/asm/msr-index.h +@@ -351,6 +351,8 @@ + #define FAM10H_MMIO_CONF_BASE_MASK 0xfffffffULL + #define FAM10H_MMIO_CONF_BASE_SHIFT 20 + #define MSR_FAM10H_NODE_ID 0xc001100c ++#define MSR_F10H_DECFG 0xc0011029 ++#define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT 1 + + /* K8 MSRs */ + #define MSR_K8_TOP_MEM1 0xc001001a +diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c +index 2a5328cc03a6..c9a4e4db7860 100644 +--- a/arch/x86/kernel/cpu/amd.c ++++ b/arch/x86/kernel/cpu/amd.c +@@ -785,6 +785,16 @@ static void init_amd(struct cpuinfo_x86 *c) + set_cpu_cap(c, X86_FEATURE_K8); + + if (cpu_has(c, X86_FEATURE_XMM2)) { ++ /* ++ * A serializing LFENCE has less overhead than MFENCE, so ++ * use it for execution serialization. On families which ++ * don't have that MSR, LFENCE is already serializing. ++ * msr_set_bit() uses the safe accessors, too, even if the MSR ++ * is not present. 
++ */ ++ msr_set_bit(MSR_F10H_DECFG, ++ MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); ++ + /* MFENCE stops RDTSC speculation */ + set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); + } +-- +2.14.2 + diff --git a/patches/kernel/0256-x86-cpu-AMD-Use-LFENCE_RDTSC-in-preference-to-MFENCE.patch b/patches/kernel/0256-x86-cpu-AMD-Use-LFENCE_RDTSC-in-preference-to-MFENCE.patch deleted file mode 100644 index d2324ef..0000000 --- a/patches/kernel/0256-x86-cpu-AMD-Use-LFENCE_RDTSC-in-preference-to-MFENCE.patch +++ /dev/null @@ -1,92 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Tom Lendacky -Date: Mon, 8 Jan 2018 16:09:32 -0600 -Subject: [PATCH] x86/cpu/AMD: Use LFENCE_RDTSC in preference to MFENCE_RDTSC -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -With LFENCE now a serializing instruction, use LFENCE_RDTSC in preference -to MFENCE_RDTSC. However, since the kernel could be running under a -hypervisor that does not support writing that MSR, read the MSR back and -verify that the bit has been set successfully. If the MSR can be read -and the bit is set, then set the LFENCE_RDTSC feature, otherwise set the -MFENCE_RDTSC feature. 
- -Signed-off-by: Tom Lendacky -Signed-off-by: Thomas Gleixner -Reviewed-by: Reviewed-by: Borislav Petkov -Cc: Peter Zijlstra -Cc: Tim Chen -Cc: Dave Hansen -Cc: Borislav Petkov -Cc: Dan Williams -Cc: Linus Torvalds -Cc: Greg Kroah-Hartman -Cc: David Woodhouse -Cc: Paul Turner -Link: https://lkml.kernel.org/r/20180108220932.12580.52458.stgit@tlendack-t1.amdoffice.net - -(cherry picked from commit 9c6a73c75864ad9fa49e5fa6513e4c4071c0e29f) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit dc39f26bf11d270cb4cfd251919afb16d98d6c2b) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/msr-index.h | 1 + - arch/x86/kernel/cpu/amd.c | 18 ++++++++++++++++-- - 2 files changed, 17 insertions(+), 2 deletions(-) - -diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h -index 25147df4acfc..db88b7f852b4 100644 ---- a/arch/x86/include/asm/msr-index.h -+++ b/arch/x86/include/asm/msr-index.h -@@ -353,6 +353,7 @@ - #define MSR_FAM10H_NODE_ID 0xc001100c - #define MSR_F10H_DECFG 0xc0011029 - #define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT 1 -+#define MSR_F10H_DECFG_LFENCE_SERIALIZE BIT_ULL(MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT) - - /* K8 MSRs */ - #define MSR_K8_TOP_MEM1 0xc001001a -diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c -index c9a4e4db7860..99eef4a09fd9 100644 ---- a/arch/x86/kernel/cpu/amd.c -+++ b/arch/x86/kernel/cpu/amd.c -@@ -785,6 +785,9 @@ static void init_amd(struct cpuinfo_x86 *c) - set_cpu_cap(c, X86_FEATURE_K8); - - if (cpu_has(c, X86_FEATURE_XMM2)) { -+ unsigned long long val; -+ int ret; -+ - /* - * A serializing LFENCE has less overhead than MFENCE, so - * use it for execution serialization. 
On families which -@@ -795,8 +798,19 @@ static void init_amd(struct cpuinfo_x86 *c) - msr_set_bit(MSR_F10H_DECFG, - MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); - -- /* MFENCE stops RDTSC speculation */ -- set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); -+ /* -+ * Verify that the MSR write was successful (could be running -+ * under a hypervisor) and only then assume that LFENCE is -+ * serializing. -+ */ -+ ret = rdmsrl_safe(MSR_F10H_DECFG, &val); -+ if (!ret && (val & MSR_F10H_DECFG_LFENCE_SERIALIZE)) { -+ /* A serializing LFENCE stops RDTSC speculation */ -+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); -+ } else { -+ /* MFENCE stops RDTSC speculation */ -+ set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); -+ } - } - - /* --- -2.14.2 - diff --git a/patches/kernel/0257-x86-alternatives-Fix-optimize_nops-checking.patch b/patches/kernel/0257-x86-alternatives-Fix-optimize_nops-checking.patch deleted file mode 100644 index 81b3d15..0000000 --- a/patches/kernel/0257-x86-alternatives-Fix-optimize_nops-checking.patch +++ /dev/null @@ -1,63 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Borislav Petkov -Date: Wed, 10 Jan 2018 12:28:16 +0100 -Subject: [PATCH] x86/alternatives: Fix optimize_nops() checking -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -The alternatives code checks only the first byte whether it is a NOP, but -with NOPs in front of the payload and having actual instructions after it -breaks the "optimized' test. - -Make sure to scan all bytes before deciding to optimize the NOPs in there. 
- -Reported-by: David Woodhouse -Signed-off-by: Borislav Petkov -Signed-off-by: Thomas Gleixner -Cc: Tom Lendacky -Cc: Andi Kleen -Cc: Tim Chen -Cc: Peter Zijlstra -Cc: Jiri Kosina -Cc: Dave Hansen -Cc: Andi Kleen -Cc: Andrew Lutomirski -Cc: Linus Torvalds -Cc: Greg Kroah-Hartman -Cc: Paul Turner -Link: https://lkml.kernel.org/r/20180110112815.mgciyf5acwacphkq@pd.tnic - -(cherry picked from commit 612e8e9350fd19cae6900cf36ea0c6892d1a0dca) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit dc241f68557ee1929a92b9ec6f7a1294bbbd4f00) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/alternative.c | 7 +++++-- - 1 file changed, 5 insertions(+), 2 deletions(-) - -diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c -index 32e14d137416..5dc05755a044 100644 ---- a/arch/x86/kernel/alternative.c -+++ b/arch/x86/kernel/alternative.c -@@ -344,9 +344,12 @@ recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf) - static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr) - { - unsigned long flags; -+ int i; - -- if (instr[0] != 0x90) -- return; -+ for (i = 0; i < a->padlen; i++) { -+ if (instr[i] != 0x90) -+ return; -+ } - - local_irq_save(flags); - add_nops(instr + (a->instrlen - a->padlen), a->padlen); --- -2.14.2 - diff --git a/patches/kernel/0257-x86-cpu-AMD-Use-LFENCE_RDTSC-in-preference-to-MFENCE.patch b/patches/kernel/0257-x86-cpu-AMD-Use-LFENCE_RDTSC-in-preference-to-MFENCE.patch new file mode 100644 index 0000000..d2324ef --- /dev/null +++ b/patches/kernel/0257-x86-cpu-AMD-Use-LFENCE_RDTSC-in-preference-to-MFENCE.patch @@ -0,0 +1,92 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Tom Lendacky +Date: Mon, 8 Jan 2018 16:09:32 -0600 +Subject: [PATCH] x86/cpu/AMD: Use LFENCE_RDTSC in preference to MFENCE_RDTSC +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + 
+CVE-2017-5754 + +With LFENCE now a serializing instruction, use LFENCE_RDTSC in preference +to MFENCE_RDTSC. However, since the kernel could be running under a +hypervisor that does not support writing that MSR, read the MSR back and +verify that the bit has been set successfully. If the MSR can be read +and the bit is set, then set the LFENCE_RDTSC feature, otherwise set the +MFENCE_RDTSC feature. + +Signed-off-by: Tom Lendacky +Signed-off-by: Thomas Gleixner +Reviewed-by: Reviewed-by: Borislav Petkov +Cc: Peter Zijlstra +Cc: Tim Chen +Cc: Dave Hansen +Cc: Borislav Petkov +Cc: Dan Williams +Cc: Linus Torvalds +Cc: Greg Kroah-Hartman +Cc: David Woodhouse +Cc: Paul Turner +Link: https://lkml.kernel.org/r/20180108220932.12580.52458.stgit@tlendack-t1.amdoffice.net + +(cherry picked from commit 9c6a73c75864ad9fa49e5fa6513e4c4071c0e29f) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit dc39f26bf11d270cb4cfd251919afb16d98d6c2b) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/msr-index.h | 1 + + arch/x86/kernel/cpu/amd.c | 18 ++++++++++++++++-- + 2 files changed, 17 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h +index 25147df4acfc..db88b7f852b4 100644 +--- a/arch/x86/include/asm/msr-index.h ++++ b/arch/x86/include/asm/msr-index.h +@@ -353,6 +353,7 @@ + #define MSR_FAM10H_NODE_ID 0xc001100c + #define MSR_F10H_DECFG 0xc0011029 + #define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT 1 ++#define MSR_F10H_DECFG_LFENCE_SERIALIZE BIT_ULL(MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT) + + /* K8 MSRs */ + #define MSR_K8_TOP_MEM1 0xc001001a +diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c +index c9a4e4db7860..99eef4a09fd9 100644 +--- a/arch/x86/kernel/cpu/amd.c ++++ b/arch/x86/kernel/cpu/amd.c +@@ -785,6 +785,9 @@ static void init_amd(struct cpuinfo_x86 *c) + set_cpu_cap(c, X86_FEATURE_K8); + + if (cpu_has(c, X86_FEATURE_XMM2)) { ++ unsigned long 
long val; ++ int ret; ++ + /* + * A serializing LFENCE has less overhead than MFENCE, so + * use it for execution serialization. On families which +@@ -795,8 +798,19 @@ static void init_amd(struct cpuinfo_x86 *c) + msr_set_bit(MSR_F10H_DECFG, + MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); + +- /* MFENCE stops RDTSC speculation */ +- set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); ++ /* ++ * Verify that the MSR write was successful (could be running ++ * under a hypervisor) and only then assume that LFENCE is ++ * serializing. ++ */ ++ ret = rdmsrl_safe(MSR_F10H_DECFG, &val); ++ if (!ret && (val & MSR_F10H_DECFG_LFENCE_SERIALIZE)) { ++ /* A serializing LFENCE stops RDTSC speculation */ ++ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); ++ } else { ++ /* MFENCE stops RDTSC speculation */ ++ set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); ++ } + } + + /* +-- +2.14.2 + diff --git a/patches/kernel/0258-x86-alternatives-Fix-optimize_nops-checking.patch b/patches/kernel/0258-x86-alternatives-Fix-optimize_nops-checking.patch new file mode 100644 index 0000000..81b3d15 --- /dev/null +++ b/patches/kernel/0258-x86-alternatives-Fix-optimize_nops-checking.patch @@ -0,0 +1,63 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Borislav Petkov +Date: Wed, 10 Jan 2018 12:28:16 +0100 +Subject: [PATCH] x86/alternatives: Fix optimize_nops() checking +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +The alternatives code checks only the first byte whether it is a NOP, but +with NOPs in front of the payload and having actual instructions after it +breaks the "optimized' test. + +Make sure to scan all bytes before deciding to optimize the NOPs in there. 
+ +Reported-by: David Woodhouse +Signed-off-by: Borislav Petkov +Signed-off-by: Thomas Gleixner +Cc: Tom Lendacky +Cc: Andi Kleen +Cc: Tim Chen +Cc: Peter Zijlstra +Cc: Jiri Kosina +Cc: Dave Hansen +Cc: Andi Kleen +Cc: Andrew Lutomirski +Cc: Linus Torvalds +Cc: Greg Kroah-Hartman +Cc: Paul Turner +Link: https://lkml.kernel.org/r/20180110112815.mgciyf5acwacphkq@pd.tnic + +(cherry picked from commit 612e8e9350fd19cae6900cf36ea0c6892d1a0dca) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit dc241f68557ee1929a92b9ec6f7a1294bbbd4f00) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/alternative.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c +index 32e14d137416..5dc05755a044 100644 +--- a/arch/x86/kernel/alternative.c ++++ b/arch/x86/kernel/alternative.c +@@ -344,9 +344,12 @@ recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf) + static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr) + { + unsigned long flags; ++ int i; + +- if (instr[0] != 0x90) +- return; ++ for (i = 0; i < a->padlen; i++) { ++ if (instr[i] != 0x90) ++ return; ++ } + + local_irq_save(flags); + add_nops(instr + (a->instrlen - a->padlen), a->padlen); +-- +2.14.2 + diff --git a/patches/kernel/0258-x86-pti-Make-unpoison-of-pgd-for-trusted-boot-work-f.patch b/patches/kernel/0258-x86-pti-Make-unpoison-of-pgd-for-trusted-boot-work-f.patch deleted file mode 100644 index 774cb5d..0000000 --- a/patches/kernel/0258-x86-pti-Make-unpoison-of-pgd-for-trusted-boot-work-f.patch +++ /dev/null @@ -1,83 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Dave Hansen -Date: Wed, 10 Jan 2018 14:49:39 -0800 -Subject: [PATCH] x86/pti: Make unpoison of pgd for trusted boot work for real -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - 
-CVE-2017-5754 - -The inital fix for trusted boot and PTI potentially misses the pgd clearing -if pud_alloc() sets a PGD. It probably works in *practice* because for two -adjacent calls to map_tboot_page() that share a PGD entry, the first will -clear NX, *then* allocate and set the PGD (without NX clear). The second -call will *not* allocate but will clear the NX bit. - -Defer the NX clearing to a point after it is known that all top-level -allocations have occurred. Add a comment to clarify why. - -[ tglx: Massaged changelog ] - -Fixes: 262b6b30087 ("x86/tboot: Unbreak tboot with PTI enabled") -Signed-off-by: Dave Hansen -Signed-off-by: Thomas Gleixner -Reviewed-by: Andrea Arcangeli -Cc: Jon Masters -Cc: "Tim Chen" -Cc: gnomes@lxorguk.ukuu.org.uk -Cc: peterz@infradead.org -Cc: ning.sun@intel.com -Cc: tboot-devel@lists.sourceforge.net -Cc: andi@firstfloor.org -Cc: luto@kernel.org -Cc: law@redhat.com -Cc: pbonzini@redhat.com -Cc: torvalds@linux-foundation.org -Cc: gregkh@linux-foundation.org -Cc: dwmw@amazon.co.uk -Cc: nickc@redhat.com -Cc: stable@vger.kernel.org -Link: https://lkml.kernel.org/r/20180110224939.2695CD47@viggo.jf.intel.com -(cherry picked from commit 8a931d1e24bacf01f00a35d43bfe7917256c5c49) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 9935124a5c771c004a578423275633232fb7a006) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/tboot.c | 12 +++++++++++- - 1 file changed, 11 insertions(+), 1 deletion(-) - -diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c -index 75869a4b6c41..a2486f444073 100644 ---- a/arch/x86/kernel/tboot.c -+++ b/arch/x86/kernel/tboot.c -@@ -127,7 +127,6 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, - p4d = p4d_alloc(&tboot_mm, pgd, vaddr); - if (!p4d) - return -1; -- pgd->pgd &= ~_PAGE_NX; - pud = pud_alloc(&tboot_mm, p4d, vaddr); - if (!pud) - return -1; -@@ -139,6 +138,17 @@ static int map_tboot_page(unsigned long vaddr, unsigned 
long pfn, - return -1; - set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); - pte_unmap(pte); -+ -+ /* -+ * PTI poisons low addresses in the kernel page tables in the -+ * name of making them unusable for userspace. To execute -+ * code at such a low address, the poison must be cleared. -+ * -+ * Note: 'pgd' actually gets set in p4d_alloc() _or_ -+ * pud_alloc() depending on 4/5-level paging. -+ */ -+ pgd->pgd &= ~_PAGE_NX; -+ - return 0; - } - --- -2.14.2 - diff --git a/patches/kernel/0259-locking-barriers-introduce-new-memory-barrier-gmb.patch b/patches/kernel/0259-locking-barriers-introduce-new-memory-barrier-gmb.patch deleted file mode 100644 index 1079661..0000000 --- a/patches/kernel/0259-locking-barriers-introduce-new-memory-barrier-gmb.patch +++ /dev/null @@ -1,62 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Elena Reshetova -Date: Mon, 4 Sep 2017 13:11:43 +0300 -Subject: [PATCH] locking/barriers: introduce new memory barrier gmb() -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -In constrast to existing mb() and rmb() barriers, -gmb() barrier is arch-independent and can be used to -implement any type of memory barrier. -In x86 case, it is either lfence or mfence, based on -processor type. ARM and others can define it according -to their needs. 
- -Suggested-by: Arjan van de Ven -Signed-off-by: Elena Reshetova -Signed-off-by: Tim Chen -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 6834bd7e6159da957a6c01deebf16132a694bc23) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/barrier.h | 3 +++ - include/asm-generic/barrier.h | 4 ++++ - 2 files changed, 7 insertions(+) - -diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h -index bfb28caf97b1..aae78054cae2 100644 ---- a/arch/x86/include/asm/barrier.h -+++ b/arch/x86/include/asm/barrier.h -@@ -23,6 +23,9 @@ - #define wmb() asm volatile("sfence" ::: "memory") - #endif - -+#define gmb() alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, \ -+ "lfence", X86_FEATURE_LFENCE_RDTSC); -+ - #ifdef CONFIG_X86_PPRO_FENCE - #define dma_rmb() rmb() - #else -diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h -index fe297b599b0a..0ee1345c9222 100644 ---- a/include/asm-generic/barrier.h -+++ b/include/asm-generic/barrier.h -@@ -42,6 +42,10 @@ - #define wmb() mb() - #endif - -+#ifndef gmb -+#define gmb() do { } while (0) -+#endif -+ - #ifndef dma_rmb - #define dma_rmb() rmb() - #endif --- -2.14.2 - diff --git a/patches/kernel/0259-x86-pti-Make-unpoison-of-pgd-for-trusted-boot-work-f.patch b/patches/kernel/0259-x86-pti-Make-unpoison-of-pgd-for-trusted-boot-work-f.patch new file mode 100644 index 0000000..774cb5d --- /dev/null +++ b/patches/kernel/0259-x86-pti-Make-unpoison-of-pgd-for-trusted-boot-work-f.patch @@ -0,0 +1,83 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Dave Hansen +Date: Wed, 10 Jan 2018 14:49:39 -0800 +Subject: [PATCH] x86/pti: Make unpoison of pgd for trusted boot work for real +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +The inital fix for trusted boot and PTI potentially misses the pgd clearing +if pud_alloc() sets a PGD. 
It probably works in *practice* because for two +adjacent calls to map_tboot_page() that share a PGD entry, the first will +clear NX, *then* allocate and set the PGD (without NX clear). The second +call will *not* allocate but will clear the NX bit. + +Defer the NX clearing to a point after it is known that all top-level +allocations have occurred. Add a comment to clarify why. + +[ tglx: Massaged changelog ] + +Fixes: 262b6b30087 ("x86/tboot: Unbreak tboot with PTI enabled") +Signed-off-by: Dave Hansen +Signed-off-by: Thomas Gleixner +Reviewed-by: Andrea Arcangeli +Cc: Jon Masters +Cc: "Tim Chen" +Cc: gnomes@lxorguk.ukuu.org.uk +Cc: peterz@infradead.org +Cc: ning.sun@intel.com +Cc: tboot-devel@lists.sourceforge.net +Cc: andi@firstfloor.org +Cc: luto@kernel.org +Cc: law@redhat.com +Cc: pbonzini@redhat.com +Cc: torvalds@linux-foundation.org +Cc: gregkh@linux-foundation.org +Cc: dwmw@amazon.co.uk +Cc: nickc@redhat.com +Cc: stable@vger.kernel.org +Link: https://lkml.kernel.org/r/20180110224939.2695CD47@viggo.jf.intel.com +(cherry picked from commit 8a931d1e24bacf01f00a35d43bfe7917256c5c49) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 9935124a5c771c004a578423275633232fb7a006) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/tboot.c | 12 +++++++++++- + 1 file changed, 11 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c +index 75869a4b6c41..a2486f444073 100644 +--- a/arch/x86/kernel/tboot.c ++++ b/arch/x86/kernel/tboot.c +@@ -127,7 +127,6 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, + p4d = p4d_alloc(&tboot_mm, pgd, vaddr); + if (!p4d) + return -1; +- pgd->pgd &= ~_PAGE_NX; + pud = pud_alloc(&tboot_mm, p4d, vaddr); + if (!pud) + return -1; +@@ -139,6 +138,17 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, + return -1; + set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); + pte_unmap(pte); ++ ++ /* ++ * PTI poisons 
low addresses in the kernel page tables in the ++ * name of making them unusable for userspace. To execute ++ * code at such a low address, the poison must be cleared. ++ * ++ * Note: 'pgd' actually gets set in p4d_alloc() _or_ ++ * pud_alloc() depending on 4/5-level paging. ++ */ ++ pgd->pgd &= ~_PAGE_NX; ++ + return 0; + } + +-- +2.14.2 + diff --git a/patches/kernel/0260-bpf-prevent-speculative-execution-in-eBPF-interprete.patch b/patches/kernel/0260-bpf-prevent-speculative-execution-in-eBPF-interprete.patch deleted file mode 100644 index 4758995..0000000 --- a/patches/kernel/0260-bpf-prevent-speculative-execution-in-eBPF-interprete.patch +++ /dev/null @@ -1,60 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Elena Reshetova -Date: Mon, 4 Sep 2017 13:11:44 +0300 -Subject: [PATCH] bpf: prevent speculative execution in eBPF interpreter -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -This adds a generic memory barrier before LD_IMM_DW and -LDX_MEM_B/H/W/DW eBPF instructions during eBPF program -execution in order to prevent speculative execution on out -of bound BFP_MAP array indexes. This way an arbitary kernel -memory is not exposed through side channel attacks. 
- -For more details, please see this Google Project Zero report: tbd - -Signed-off-by: Elena Reshetova -Signed-off-by: Tim Chen -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit dd13f73106c260dea7a689d33d1457639af820aa) -Signed-off-by: Fabian Grünbichler ---- - kernel/bpf/core.c | 3 +++ - 1 file changed, 3 insertions(+) - -diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c -index 9a1bed1f3029..3f83c60e3e86 100644 ---- a/kernel/bpf/core.c -+++ b/kernel/bpf/core.c -@@ -33,6 +33,7 @@ - #include - - #include -+#include - - /* Registers */ - #define BPF_R0 regs[BPF_REG_0] -@@ -920,6 +921,7 @@ static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, - DST = IMM; - CONT; - LD_IMM_DW: -+ gmb(); - DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32; - insn++; - CONT; -@@ -1133,6 +1135,7 @@ static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, - *(SIZE *)(unsigned long) (DST + insn->off) = IMM; \ - CONT; \ - LDX_MEM_##SIZEOP: \ -+ gmb(); \ - DST = *(SIZE *)(unsigned long) (SRC + insn->off); \ - CONT; - --- -2.14.2 - diff --git a/patches/kernel/0260-locking-barriers-introduce-new-memory-barrier-gmb.patch b/patches/kernel/0260-locking-barriers-introduce-new-memory-barrier-gmb.patch new file mode 100644 index 0000000..1079661 --- /dev/null +++ b/patches/kernel/0260-locking-barriers-introduce-new-memory-barrier-gmb.patch @@ -0,0 +1,62 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Elena Reshetova +Date: Mon, 4 Sep 2017 13:11:43 +0300 +Subject: [PATCH] locking/barriers: introduce new memory barrier gmb() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +In constrast to existing mb() and rmb() barriers, +gmb() barrier is arch-independent and can be used to +implement any type of memory barrier. +In x86 case, it is either lfence or mfence, based on +processor type. 
ARM and others can define it according +to their needs. + +Suggested-by: Arjan van de Ven +Signed-off-by: Elena Reshetova +Signed-off-by: Tim Chen +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 6834bd7e6159da957a6c01deebf16132a694bc23) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/barrier.h | 3 +++ + include/asm-generic/barrier.h | 4 ++++ + 2 files changed, 7 insertions(+) + +diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h +index bfb28caf97b1..aae78054cae2 100644 +--- a/arch/x86/include/asm/barrier.h ++++ b/arch/x86/include/asm/barrier.h +@@ -23,6 +23,9 @@ + #define wmb() asm volatile("sfence" ::: "memory") + #endif + ++#define gmb() alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, \ ++ "lfence", X86_FEATURE_LFENCE_RDTSC); ++ + #ifdef CONFIG_X86_PPRO_FENCE + #define dma_rmb() rmb() + #else +diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h +index fe297b599b0a..0ee1345c9222 100644 +--- a/include/asm-generic/barrier.h ++++ b/include/asm-generic/barrier.h +@@ -42,6 +42,10 @@ + #define wmb() mb() + #endif + ++#ifndef gmb ++#define gmb() do { } while (0) ++#endif ++ + #ifndef dma_rmb + #define dma_rmb() rmb() + #endif +-- +2.14.2 + diff --git a/patches/kernel/0261-bpf-prevent-speculative-execution-in-eBPF-interprete.patch b/patches/kernel/0261-bpf-prevent-speculative-execution-in-eBPF-interprete.patch new file mode 100644 index 0000000..4758995 --- /dev/null +++ b/patches/kernel/0261-bpf-prevent-speculative-execution-in-eBPF-interprete.patch @@ -0,0 +1,60 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Elena Reshetova +Date: Mon, 4 Sep 2017 13:11:44 +0300 +Subject: [PATCH] bpf: prevent speculative execution in eBPF interpreter +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +This adds a generic memory barrier before LD_IMM_DW and 
+LDX_MEM_B/H/W/DW eBPF instructions during eBPF program +execution in order to prevent speculative execution on out +of bound BFP_MAP array indexes. This way an arbitary kernel +memory is not exposed through side channel attacks. + +For more details, please see this Google Project Zero report: tbd + +Signed-off-by: Elena Reshetova +Signed-off-by: Tim Chen +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit dd13f73106c260dea7a689d33d1457639af820aa) +Signed-off-by: Fabian Grünbichler +--- + kernel/bpf/core.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c +index 9a1bed1f3029..3f83c60e3e86 100644 +--- a/kernel/bpf/core.c ++++ b/kernel/bpf/core.c +@@ -33,6 +33,7 @@ + #include + + #include ++#include + + /* Registers */ + #define BPF_R0 regs[BPF_REG_0] +@@ -920,6 +921,7 @@ static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, + DST = IMM; + CONT; + LD_IMM_DW: ++ gmb(); + DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32; + insn++; + CONT; +@@ -1133,6 +1135,7 @@ static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, + *(SIZE *)(unsigned long) (DST + insn->off) = IMM; \ + CONT; \ + LDX_MEM_##SIZEOP: \ ++ gmb(); \ + DST = *(SIZE *)(unsigned long) (SRC + insn->off); \ + CONT; + +-- +2.14.2 + diff --git a/patches/kernel/0261-x86-bpf-jit-prevent-speculative-execution-when-JIT-i.patch b/patches/kernel/0261-x86-bpf-jit-prevent-speculative-execution-when-JIT-i.patch deleted file mode 100644 index 5e11a7b..0000000 --- a/patches/kernel/0261-x86-bpf-jit-prevent-speculative-execution-when-JIT-i.patch +++ /dev/null @@ -1,93 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Elena Reshetova -Date: Mon, 4 Sep 2017 13:11:45 +0300 -Subject: [PATCH] x86, bpf, jit: prevent speculative execution when JIT is - enabled -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 
8bit - -CVE-2017-5753 -CVE-2017-5715 - -When constant blinding is enabled (bpf_jit_harden = 1), this adds -a generic memory barrier (lfence for intel, mfence for AMD) before -emitting x86 jitted code for the BPF_ALU(64)_OR_X and BPF_ALU_LHS_X -(for BPF_REG_AX register) eBPF instructions. This is needed in order -to prevent speculative execution on out of bounds BPF_MAP array -indexes when JIT is enabled. This way an arbitary kernel memory is -not exposed through side-channel attacks. - -For more details, please see this Google Project Zero report: tbd - -Signed-off-by: Elena Reshetova -Signed-off-by: Tim Chen -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit cf9676859a05d0d784067072e8121e63888bacc7) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/net/bpf_jit_comp.c | 33 ++++++++++++++++++++++++++++++++- - 1 file changed, 32 insertions(+), 1 deletion(-) - -diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c -index 4d50ced94686..879dbfefb66d 100644 ---- a/arch/x86/net/bpf_jit_comp.c -+++ b/arch/x86/net/bpf_jit_comp.c -@@ -107,6 +107,27 @@ static void bpf_flush_icache(void *start, void *end) - set_fs(old_fs); - } - -+static void emit_memory_barrier(u8 **pprog) -+{ -+ u8 *prog = *pprog; -+ int cnt = 0; -+ -+ if (bpf_jit_blinding_enabled()) { -+ if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) -+ /* x86 LFENCE opcode 0F AE E8 */ -+ EMIT3(0x0f, 0xae, 0xe8); -+ else if (boot_cpu_has(X86_FEATURE_MFENCE_RDTSC)) -+ /* AMD MFENCE opcode 0F AE F0 */ -+ EMIT3(0x0f, 0xae, 0xf0); -+ else -+ /* we should never end up here, -+ * but if we do, better not to emit anything*/ -+ return; -+ } -+ *pprog = prog; -+ return; -+} -+ - #define CHOOSE_LOAD_FUNC(K, func) \ - ((int)K < 0 ? ((int)K >= SKF_LL_OFF ? 
func##_negative_offset : func) : func##_positive_offset) - -@@ -399,7 +420,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, - case BPF_ADD: b2 = 0x01; break; - case BPF_SUB: b2 = 0x29; break; - case BPF_AND: b2 = 0x21; break; -- case BPF_OR: b2 = 0x09; break; -+ case BPF_OR: b2 = 0x09; emit_memory_barrier(&prog); break; - case BPF_XOR: b2 = 0x31; break; - } - if (BPF_CLASS(insn->code) == BPF_ALU64) -@@ -646,6 +667,16 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, - case BPF_ALU64 | BPF_RSH | BPF_X: - case BPF_ALU64 | BPF_ARSH | BPF_X: - -+ /* If blinding is enabled, each -+ * BPF_LD | BPF_IMM | BPF_DW instruction -+ * is converted to 4 eBPF instructions with -+ * BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32) -+ * always present(number 3). Detect such cases -+ * and insert memory barriers. */ -+ if ((BPF_CLASS(insn->code) == BPF_ALU64) -+ && (BPF_OP(insn->code) == BPF_LSH) -+ && (src_reg == BPF_REG_AX)) -+ emit_memory_barrier(&prog); - /* check for bad case when dst_reg == rcx */ - if (dst_reg == BPF_REG_4) { - /* mov r11, dst_reg */ --- -2.14.2 - diff --git a/patches/kernel/0262-uvcvideo-prevent-speculative-execution.patch b/patches/kernel/0262-uvcvideo-prevent-speculative-execution.patch deleted file mode 100644 index f4d5128..0000000 --- a/patches/kernel/0262-uvcvideo-prevent-speculative-execution.patch +++ /dev/null @@ -1,38 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Elena Reshetova -Date: Mon, 4 Sep 2017 13:11:46 +0300 -Subject: [PATCH] uvcvideo: prevent speculative execution -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -real commit text tbd - -Signed-off-by: Elena Reshetova -Signed-off-by: Tim Chen -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 65d4588b16395360695525add0ca79fa6ba04fa5) -Signed-off-by: Fabian Grünbichler ---- - 
drivers/media/usb/uvc/uvc_v4l2.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/drivers/media/usb/uvc/uvc_v4l2.c b/drivers/media/usb/uvc/uvc_v4l2.c -index 3e7e283a44a8..fcedd1798e9d 100644 ---- a/drivers/media/usb/uvc/uvc_v4l2.c -+++ b/drivers/media/usb/uvc/uvc_v4l2.c -@@ -821,6 +821,7 @@ static int uvc_ioctl_enum_input(struct file *file, void *fh, - } - pin = iterm->id; - } else if (index < selector->bNrInPins) { -+ gmb(); - pin = selector->baSourceID[index]; - list_for_each_entry(iterm, &chain->entities, chain) { - if (!UVC_ENTITY_IS_ITERM(iterm)) --- -2.14.2 - diff --git a/patches/kernel/0262-x86-bpf-jit-prevent-speculative-execution-when-JIT-i.patch b/patches/kernel/0262-x86-bpf-jit-prevent-speculative-execution-when-JIT-i.patch new file mode 100644 index 0000000..5e11a7b --- /dev/null +++ b/patches/kernel/0262-x86-bpf-jit-prevent-speculative-execution-when-JIT-i.patch @@ -0,0 +1,93 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Elena Reshetova +Date: Mon, 4 Sep 2017 13:11:45 +0300 +Subject: [PATCH] x86, bpf, jit: prevent speculative execution when JIT is + enabled +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +When constant blinding is enabled (bpf_jit_harden = 1), this adds +a generic memory barrier (lfence for intel, mfence for AMD) before +emitting x86 jitted code for the BPF_ALU(64)_OR_X and BPF_ALU_LHS_X +(for BPF_REG_AX register) eBPF instructions. This is needed in order +to prevent speculative execution on out of bounds BPF_MAP array +indexes when JIT is enabled. This way an arbitary kernel memory is +not exposed through side-channel attacks. 
+ +For more details, please see this Google Project Zero report: tbd + +Signed-off-by: Elena Reshetova +Signed-off-by: Tim Chen +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit cf9676859a05d0d784067072e8121e63888bacc7) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/net/bpf_jit_comp.c | 33 ++++++++++++++++++++++++++++++++- + 1 file changed, 32 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c +index 4d50ced94686..879dbfefb66d 100644 +--- a/arch/x86/net/bpf_jit_comp.c ++++ b/arch/x86/net/bpf_jit_comp.c +@@ -107,6 +107,27 @@ static void bpf_flush_icache(void *start, void *end) + set_fs(old_fs); + } + ++static void emit_memory_barrier(u8 **pprog) ++{ ++ u8 *prog = *pprog; ++ int cnt = 0; ++ ++ if (bpf_jit_blinding_enabled()) { ++ if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) ++ /* x86 LFENCE opcode 0F AE E8 */ ++ EMIT3(0x0f, 0xae, 0xe8); ++ else if (boot_cpu_has(X86_FEATURE_MFENCE_RDTSC)) ++ /* AMD MFENCE opcode 0F AE F0 */ ++ EMIT3(0x0f, 0xae, 0xf0); ++ else ++ /* we should never end up here, ++ * but if we do, better not to emit anything*/ ++ return; ++ } ++ *pprog = prog; ++ return; ++} ++ + #define CHOOSE_LOAD_FUNC(K, func) \ + ((int)K < 0 ? ((int)K >= SKF_LL_OFF ? 
func##_negative_offset : func) : func##_positive_offset) + +@@ -399,7 +420,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, + case BPF_ADD: b2 = 0x01; break; + case BPF_SUB: b2 = 0x29; break; + case BPF_AND: b2 = 0x21; break; +- case BPF_OR: b2 = 0x09; break; ++ case BPF_OR: b2 = 0x09; emit_memory_barrier(&prog); break; + case BPF_XOR: b2 = 0x31; break; + } + if (BPF_CLASS(insn->code) == BPF_ALU64) +@@ -646,6 +667,16 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, + case BPF_ALU64 | BPF_RSH | BPF_X: + case BPF_ALU64 | BPF_ARSH | BPF_X: + ++ /* If blinding is enabled, each ++ * BPF_LD | BPF_IMM | BPF_DW instruction ++ * is converted to 4 eBPF instructions with ++ * BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32) ++ * always present(number 3). Detect such cases ++ * and insert memory barriers. */ ++ if ((BPF_CLASS(insn->code) == BPF_ALU64) ++ && (BPF_OP(insn->code) == BPF_LSH) ++ && (src_reg == BPF_REG_AX)) ++ emit_memory_barrier(&prog); + /* check for bad case when dst_reg == rcx */ + if (dst_reg == BPF_REG_4) { + /* mov r11, dst_reg */ +-- +2.14.2 + diff --git a/patches/kernel/0263-carl9170-prevent-speculative-execution.patch b/patches/kernel/0263-carl9170-prevent-speculative-execution.patch deleted file mode 100644 index aa83932..0000000 --- a/patches/kernel/0263-carl9170-prevent-speculative-execution.patch +++ /dev/null @@ -1,38 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Elena Reshetova -Date: Mon, 4 Sep 2017 13:11:47 +0300 -Subject: [PATCH] carl9170: prevent speculative execution -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -Real commit text tbd - -Signed-off-by: Elena Reshetova -Signed-off-by: Tim Chen -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit dc218eba4fe8241ab073be41a068f6796450c6d0) -Signed-off-by: Fabian Grünbichler ---- - 
drivers/net/wireless/ath/carl9170/main.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/drivers/net/wireless/ath/carl9170/main.c b/drivers/net/wireless/ath/carl9170/main.c -index 988c8857d78c..7e2c1c870a1d 100644 ---- a/drivers/net/wireless/ath/carl9170/main.c -+++ b/drivers/net/wireless/ath/carl9170/main.c -@@ -1388,6 +1388,7 @@ static int carl9170_op_conf_tx(struct ieee80211_hw *hw, - - mutex_lock(&ar->mutex); - if (queue < ar->hw->queues) { -+ gmb(); - memcpy(&ar->edcf[ar9170_qmap[queue]], param, sizeof(*param)); - ret = carl9170_set_qos(ar); - } else { --- -2.14.2 - diff --git a/patches/kernel/0263-uvcvideo-prevent-speculative-execution.patch b/patches/kernel/0263-uvcvideo-prevent-speculative-execution.patch new file mode 100644 index 0000000..f4d5128 --- /dev/null +++ b/patches/kernel/0263-uvcvideo-prevent-speculative-execution.patch @@ -0,0 +1,38 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Elena Reshetova +Date: Mon, 4 Sep 2017 13:11:46 +0300 +Subject: [PATCH] uvcvideo: prevent speculative execution +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +real commit text tbd + +Signed-off-by: Elena Reshetova +Signed-off-by: Tim Chen +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 65d4588b16395360695525add0ca79fa6ba04fa5) +Signed-off-by: Fabian Grünbichler +--- + drivers/media/usb/uvc/uvc_v4l2.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/drivers/media/usb/uvc/uvc_v4l2.c b/drivers/media/usb/uvc/uvc_v4l2.c +index 3e7e283a44a8..fcedd1798e9d 100644 +--- a/drivers/media/usb/uvc/uvc_v4l2.c ++++ b/drivers/media/usb/uvc/uvc_v4l2.c +@@ -821,6 +821,7 @@ static int uvc_ioctl_enum_input(struct file *file, void *fh, + } + pin = iterm->id; + } else if (index < selector->bNrInPins) { ++ gmb(); + pin = selector->baSourceID[index]; + list_for_each_entry(iterm, &chain->entities, chain) { + if 
(!UVC_ENTITY_IS_ITERM(iterm)) +-- +2.14.2 + diff --git a/patches/kernel/0264-carl9170-prevent-speculative-execution.patch b/patches/kernel/0264-carl9170-prevent-speculative-execution.patch new file mode 100644 index 0000000..aa83932 --- /dev/null +++ b/patches/kernel/0264-carl9170-prevent-speculative-execution.patch @@ -0,0 +1,38 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Elena Reshetova +Date: Mon, 4 Sep 2017 13:11:47 +0300 +Subject: [PATCH] carl9170: prevent speculative execution +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +Real commit text tbd + +Signed-off-by: Elena Reshetova +Signed-off-by: Tim Chen +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit dc218eba4fe8241ab073be41a068f6796450c6d0) +Signed-off-by: Fabian Grünbichler +--- + drivers/net/wireless/ath/carl9170/main.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/drivers/net/wireless/ath/carl9170/main.c b/drivers/net/wireless/ath/carl9170/main.c +index 988c8857d78c..7e2c1c870a1d 100644 +--- a/drivers/net/wireless/ath/carl9170/main.c ++++ b/drivers/net/wireless/ath/carl9170/main.c +@@ -1388,6 +1388,7 @@ static int carl9170_op_conf_tx(struct ieee80211_hw *hw, + + mutex_lock(&ar->mutex); + if (queue < ar->hw->queues) { ++ gmb(); + memcpy(&ar->edcf[ar9170_qmap[queue]], param, sizeof(*param)); + ret = carl9170_set_qos(ar); + } else { +-- +2.14.2 + diff --git a/patches/kernel/0264-p54-prevent-speculative-execution.patch b/patches/kernel/0264-p54-prevent-speculative-execution.patch deleted file mode 100644 index 3d9ba34..0000000 --- a/patches/kernel/0264-p54-prevent-speculative-execution.patch +++ /dev/null @@ -1,38 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Elena Reshetova -Date: Mon, 4 Sep 2017 13:11:48 +0300 -Subject: [PATCH] p54: prevent speculative execution -MIME-Version: 1.0 
-Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -Real commit text tbd - -Signed-off-by: Elena Reshetova -Signed-off-by: Tim Chen -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 57b537e161bb9d44475a05b2b12d64bfb50319d3) -Signed-off-by: Fabian Grünbichler ---- - drivers/net/wireless/intersil/p54/main.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/drivers/net/wireless/intersil/p54/main.c b/drivers/net/wireless/intersil/p54/main.c -index d5a3bf91a03e..7e6af1f67960 100644 ---- a/drivers/net/wireless/intersil/p54/main.c -+++ b/drivers/net/wireless/intersil/p54/main.c -@@ -415,6 +415,7 @@ static int p54_conf_tx(struct ieee80211_hw *dev, - - mutex_lock(&priv->conf_mutex); - if (queue < dev->queues) { -+ gmb(); - P54_SET_QUEUE(priv->qos_params[queue], params->aifs, - params->cw_min, params->cw_max, params->txop); - ret = p54_set_edcf(priv); --- -2.14.2 - diff --git a/patches/kernel/0265-p54-prevent-speculative-execution.patch b/patches/kernel/0265-p54-prevent-speculative-execution.patch new file mode 100644 index 0000000..3d9ba34 --- /dev/null +++ b/patches/kernel/0265-p54-prevent-speculative-execution.patch @@ -0,0 +1,38 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Elena Reshetova +Date: Mon, 4 Sep 2017 13:11:48 +0300 +Subject: [PATCH] p54: prevent speculative execution +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +Real commit text tbd + +Signed-off-by: Elena Reshetova +Signed-off-by: Tim Chen +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 57b537e161bb9d44475a05b2b12d64bfb50319d3) +Signed-off-by: Fabian Grünbichler +--- + drivers/net/wireless/intersil/p54/main.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/drivers/net/wireless/intersil/p54/main.c 
b/drivers/net/wireless/intersil/p54/main.c +index d5a3bf91a03e..7e6af1f67960 100644 +--- a/drivers/net/wireless/intersil/p54/main.c ++++ b/drivers/net/wireless/intersil/p54/main.c +@@ -415,6 +415,7 @@ static int p54_conf_tx(struct ieee80211_hw *dev, + + mutex_lock(&priv->conf_mutex); + if (queue < dev->queues) { ++ gmb(); + P54_SET_QUEUE(priv->qos_params[queue], params->aifs, + params->cw_min, params->cw_max, params->txop); + ret = p54_set_edcf(priv); +-- +2.14.2 + diff --git a/patches/kernel/0265-qla2xxx-prevent-speculative-execution.patch b/patches/kernel/0265-qla2xxx-prevent-speculative-execution.patch deleted file mode 100644 index 527c7f9..0000000 --- a/patches/kernel/0265-qla2xxx-prevent-speculative-execution.patch +++ /dev/null @@ -1,60 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Elena Reshetova -Date: Mon, 4 Sep 2017 13:11:49 +0300 -Subject: [PATCH] qla2xxx: prevent speculative execution -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -Real commit text tbd - -Signed-off-by: Elena Reshetova -Signed-off-by: Tim Chen -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit d71318e5f16371dbc0e89a786336a521551f8946) -Signed-off-by: Fabian Grünbichler ---- - drivers/scsi/qla2xxx/qla_mr.c | 12 ++++++++---- - 1 file changed, 8 insertions(+), 4 deletions(-) - -diff --git a/drivers/scsi/qla2xxx/qla_mr.c b/drivers/scsi/qla2xxx/qla_mr.c -index 10b742d27e16..ca923d8803f9 100644 ---- a/drivers/scsi/qla2xxx/qla_mr.c -+++ b/drivers/scsi/qla2xxx/qla_mr.c -@@ -2304,10 +2304,12 @@ qlafx00_status_entry(scsi_qla_host_t *vha, struct rsp_que *rsp, void *pkt) - req = ha->req_q_map[que]; - - /* Validate handle. 
*/ -- if (handle < req->num_outstanding_cmds) -+ if (handle < req->num_outstanding_cmds) { -+ gmb(); - sp = req->outstanding_cmds[handle]; -- else -+ } else { - sp = NULL; -+ } - - if (sp == NULL) { - ql_dbg(ql_dbg_io, vha, 0x3034, -@@ -2655,10 +2657,12 @@ qlafx00_multistatus_entry(struct scsi_qla_host *vha, - req = ha->req_q_map[que]; - - /* Validate handle. */ -- if (handle < req->num_outstanding_cmds) -+ if (handle < req->num_outstanding_cmds) { -+ gmb(); - sp = req->outstanding_cmds[handle]; -- else -+ } else { - sp = NULL; -+ } - - if (sp == NULL) { - ql_dbg(ql_dbg_io, vha, 0x3044, --- -2.14.2 - diff --git a/patches/kernel/0266-cw1200-prevent-speculative-execution.patch b/patches/kernel/0266-cw1200-prevent-speculative-execution.patch deleted file mode 100644 index 05a4767..0000000 --- a/patches/kernel/0266-cw1200-prevent-speculative-execution.patch +++ /dev/null @@ -1,38 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Elena Reshetova -Date: Mon, 4 Sep 2017 13:11:50 +0300 -Subject: [PATCH] cw1200: prevent speculative execution -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -Real commit text tbd - -Signed-off-by: Elena Reshetova -Signed-off-by: Tim Chen -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 30770297508b781f2c1e82c52f793bc4d2cb2356) -Signed-off-by: Fabian Grünbichler ---- - drivers/net/wireless/st/cw1200/sta.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/drivers/net/wireless/st/cw1200/sta.c b/drivers/net/wireless/st/cw1200/sta.c -index a52224836a2b..bbff06a4263e 100644 ---- a/drivers/net/wireless/st/cw1200/sta.c -+++ b/drivers/net/wireless/st/cw1200/sta.c -@@ -619,6 +619,7 @@ int cw1200_conf_tx(struct ieee80211_hw *dev, struct ieee80211_vif *vif, - mutex_lock(&priv->conf_mutex); - - if (queue < dev->queues) { -+ gmb(); - old_uapsd_flags = 
le16_to_cpu(priv->uapsd_info.uapsd_flags); - - WSM_TX_QUEUE_SET(&priv->tx_queue_params, queue, 0, 0, 0); --- -2.14.2 - diff --git a/patches/kernel/0266-qla2xxx-prevent-speculative-execution.patch b/patches/kernel/0266-qla2xxx-prevent-speculative-execution.patch new file mode 100644 index 0000000..527c7f9 --- /dev/null +++ b/patches/kernel/0266-qla2xxx-prevent-speculative-execution.patch @@ -0,0 +1,60 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Elena Reshetova +Date: Mon, 4 Sep 2017 13:11:49 +0300 +Subject: [PATCH] qla2xxx: prevent speculative execution +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +Real commit text tbd + +Signed-off-by: Elena Reshetova +Signed-off-by: Tim Chen +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit d71318e5f16371dbc0e89a786336a521551f8946) +Signed-off-by: Fabian Grünbichler +--- + drivers/scsi/qla2xxx/qla_mr.c | 12 ++++++++---- + 1 file changed, 8 insertions(+), 4 deletions(-) + +diff --git a/drivers/scsi/qla2xxx/qla_mr.c b/drivers/scsi/qla2xxx/qla_mr.c +index 10b742d27e16..ca923d8803f9 100644 +--- a/drivers/scsi/qla2xxx/qla_mr.c ++++ b/drivers/scsi/qla2xxx/qla_mr.c +@@ -2304,10 +2304,12 @@ qlafx00_status_entry(scsi_qla_host_t *vha, struct rsp_que *rsp, void *pkt) + req = ha->req_q_map[que]; + + /* Validate handle. */ +- if (handle < req->num_outstanding_cmds) ++ if (handle < req->num_outstanding_cmds) { ++ gmb(); + sp = req->outstanding_cmds[handle]; +- else ++ } else { + sp = NULL; ++ } + + if (sp == NULL) { + ql_dbg(ql_dbg_io, vha, 0x3034, +@@ -2655,10 +2657,12 @@ qlafx00_multistatus_entry(struct scsi_qla_host *vha, + req = ha->req_q_map[que]; + + /* Validate handle. 
*/ +- if (handle < req->num_outstanding_cmds) ++ if (handle < req->num_outstanding_cmds) { ++ gmb(); + sp = req->outstanding_cmds[handle]; +- else ++ } else { + sp = NULL; ++ } + + if (sp == NULL) { + ql_dbg(ql_dbg_io, vha, 0x3044, +-- +2.14.2 + diff --git a/patches/kernel/0267-Thermal-int340x-prevent-speculative-execution.patch b/patches/kernel/0267-Thermal-int340x-prevent-speculative-execution.patch deleted file mode 100644 index 58d560d..0000000 --- a/patches/kernel/0267-Thermal-int340x-prevent-speculative-execution.patch +++ /dev/null @@ -1,52 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Elena Reshetova -Date: Mon, 4 Sep 2017 13:11:51 +0300 -Subject: [PATCH] Thermal/int340x: prevent speculative execution -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -Real commit text tbd - -Signed-off-by: Elena Reshetova -Signed-off-by: Tim Chen -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 3904f4cadeeaa9370f0635eb2f66194ca238325b) -Signed-off-by: Fabian Grünbichler ---- - drivers/thermal/int340x_thermal/int340x_thermal_zone.c | 11 ++++++----- - 1 file changed, 6 insertions(+), 5 deletions(-) - -diff --git a/drivers/thermal/int340x_thermal/int340x_thermal_zone.c b/drivers/thermal/int340x_thermal/int340x_thermal_zone.c -index 145a5c53ff5c..4f9917ef3c11 100644 ---- a/drivers/thermal/int340x_thermal/int340x_thermal_zone.c -+++ b/drivers/thermal/int340x_thermal/int340x_thermal_zone.c -@@ -57,15 +57,16 @@ static int int340x_thermal_get_trip_temp(struct thermal_zone_device *zone, - if (d->override_ops && d->override_ops->get_trip_temp) - return d->override_ops->get_trip_temp(zone, trip, temp); - -- if (trip < d->aux_trip_nr) -+ if (trip < d->aux_trip_nr) { -+ gmb(); - *temp = d->aux_trips[trip]; -- else if (trip == d->crt_trip_id) -+ } else if (trip == d->crt_trip_id) { - *temp = d->crt_temp; -- else if (trip 
== d->psv_trip_id) -+ } else if (trip == d->psv_trip_id) { - *temp = d->psv_temp; -- else if (trip == d->hot_trip_id) -+ } else if (trip == d->hot_trip_id) { - *temp = d->hot_temp; -- else { -+ } else { - for (i = 0; i < INT340X_THERMAL_MAX_ACT_TRIP_COUNT; i++) { - if (d->act_trips[i].valid && - d->act_trips[i].id == trip) { --- -2.14.2 - diff --git a/patches/kernel/0267-cw1200-prevent-speculative-execution.patch b/patches/kernel/0267-cw1200-prevent-speculative-execution.patch new file mode 100644 index 0000000..05a4767 --- /dev/null +++ b/patches/kernel/0267-cw1200-prevent-speculative-execution.patch @@ -0,0 +1,38 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Elena Reshetova +Date: Mon, 4 Sep 2017 13:11:50 +0300 +Subject: [PATCH] cw1200: prevent speculative execution +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +Real commit text tbd + +Signed-off-by: Elena Reshetova +Signed-off-by: Tim Chen +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 30770297508b781f2c1e82c52f793bc4d2cb2356) +Signed-off-by: Fabian Grünbichler +--- + drivers/net/wireless/st/cw1200/sta.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/drivers/net/wireless/st/cw1200/sta.c b/drivers/net/wireless/st/cw1200/sta.c +index a52224836a2b..bbff06a4263e 100644 +--- a/drivers/net/wireless/st/cw1200/sta.c ++++ b/drivers/net/wireless/st/cw1200/sta.c +@@ -619,6 +619,7 @@ int cw1200_conf_tx(struct ieee80211_hw *dev, struct ieee80211_vif *vif, + mutex_lock(&priv->conf_mutex); + + if (queue < dev->queues) { ++ gmb(); + old_uapsd_flags = le16_to_cpu(priv->uapsd_info.uapsd_flags); + + WSM_TX_QUEUE_SET(&priv->tx_queue_params, queue, 0, 0, 0); +-- +2.14.2 + diff --git a/patches/kernel/0268-Thermal-int340x-prevent-speculative-execution.patch b/patches/kernel/0268-Thermal-int340x-prevent-speculative-execution.patch new file mode 100644 index 
0000000..58d560d --- /dev/null +++ b/patches/kernel/0268-Thermal-int340x-prevent-speculative-execution.patch @@ -0,0 +1,52 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Elena Reshetova +Date: Mon, 4 Sep 2017 13:11:51 +0300 +Subject: [PATCH] Thermal/int340x: prevent speculative execution +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +Real commit text tbd + +Signed-off-by: Elena Reshetova +Signed-off-by: Tim Chen +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 3904f4cadeeaa9370f0635eb2f66194ca238325b) +Signed-off-by: Fabian Grünbichler +--- + drivers/thermal/int340x_thermal/int340x_thermal_zone.c | 11 ++++++----- + 1 file changed, 6 insertions(+), 5 deletions(-) + +diff --git a/drivers/thermal/int340x_thermal/int340x_thermal_zone.c b/drivers/thermal/int340x_thermal/int340x_thermal_zone.c +index 145a5c53ff5c..4f9917ef3c11 100644 +--- a/drivers/thermal/int340x_thermal/int340x_thermal_zone.c ++++ b/drivers/thermal/int340x_thermal/int340x_thermal_zone.c +@@ -57,15 +57,16 @@ static int int340x_thermal_get_trip_temp(struct thermal_zone_device *zone, + if (d->override_ops && d->override_ops->get_trip_temp) + return d->override_ops->get_trip_temp(zone, trip, temp); + +- if (trip < d->aux_trip_nr) ++ if (trip < d->aux_trip_nr) { ++ gmb(); + *temp = d->aux_trips[trip]; +- else if (trip == d->crt_trip_id) ++ } else if (trip == d->crt_trip_id) { + *temp = d->crt_temp; +- else if (trip == d->psv_trip_id) ++ } else if (trip == d->psv_trip_id) { + *temp = d->psv_temp; +- else if (trip == d->hot_trip_id) ++ } else if (trip == d->hot_trip_id) { + *temp = d->hot_temp; +- else { ++ } else { + for (i = 0; i < INT340X_THERMAL_MAX_ACT_TRIP_COUNT; i++) { + if (d->act_trips[i].valid && + d->act_trips[i].id == trip) { +-- +2.14.2 + diff --git a/patches/kernel/0268-userns-prevent-speculative-execution.patch 
b/patches/kernel/0268-userns-prevent-speculative-execution.patch deleted file mode 100644 index 4854b18..0000000 --- a/patches/kernel/0268-userns-prevent-speculative-execution.patch +++ /dev/null @@ -1,42 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Elena Reshetova -Date: Mon, 4 Sep 2017 13:11:52 +0300 -Subject: [PATCH] userns: prevent speculative execution -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -Real commit text tbd - -Signed-off-by: Elena Reshetova -Signed-off-by: Tim Chen -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 1410678db6238e625775f7108c68a9e5b8d439a1) -Signed-off-by: Fabian Grünbichler ---- - kernel/user_namespace.c | 4 +++- - 1 file changed, 3 insertions(+), 1 deletion(-) - -diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c -index 4eacf186f5bc..684cc69d431c 100644 ---- a/kernel/user_namespace.c -+++ b/kernel/user_namespace.c -@@ -549,8 +549,10 @@ static void *m_start(struct seq_file *seq, loff_t *ppos, - struct uid_gid_extent *extent = NULL; - loff_t pos = *ppos; - -- if (pos < map->nr_extents) -+ if (pos < map->nr_extents) { -+ gmb(); - extent = &map->extent[pos]; -+ } - - return extent; - } --- -2.14.2 - diff --git a/patches/kernel/0269-ipv6-prevent-speculative-execution.patch b/patches/kernel/0269-ipv6-prevent-speculative-execution.patch deleted file mode 100644 index a5fa697..0000000 --- a/patches/kernel/0269-ipv6-prevent-speculative-execution.patch +++ /dev/null @@ -1,38 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Elena Reshetova -Date: Mon, 4 Sep 2017 13:11:53 +0300 -Subject: [PATCH] ipv6: prevent speculative execution -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -Real commit text tbd - -Signed-off-by: Elena Reshetova 
-Signed-off-by: Tim Chen -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit fdb98114a31aa5c0083bd7cd5b42ea569b6f77dc) -Signed-off-by: Fabian Grünbichler ---- - net/ipv6/raw.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c -index 60be012fe708..1a0eae661512 100644 ---- a/net/ipv6/raw.c -+++ b/net/ipv6/raw.c -@@ -726,6 +726,7 @@ static int raw6_getfrag(void *from, char *to, int offset, int len, int odd, - if (offset < rfv->hlen) { - int copy = min(rfv->hlen - offset, len); - -+ gmb(); - if (skb->ip_summed == CHECKSUM_PARTIAL) - memcpy(to, rfv->c + offset, copy); - else --- -2.14.2 - diff --git a/patches/kernel/0269-userns-prevent-speculative-execution.patch b/patches/kernel/0269-userns-prevent-speculative-execution.patch new file mode 100644 index 0000000..4854b18 --- /dev/null +++ b/patches/kernel/0269-userns-prevent-speculative-execution.patch @@ -0,0 +1,42 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Elena Reshetova +Date: Mon, 4 Sep 2017 13:11:52 +0300 +Subject: [PATCH] userns: prevent speculative execution +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +Real commit text tbd + +Signed-off-by: Elena Reshetova +Signed-off-by: Tim Chen +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 1410678db6238e625775f7108c68a9e5b8d439a1) +Signed-off-by: Fabian Grünbichler +--- + kernel/user_namespace.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c +index 4eacf186f5bc..684cc69d431c 100644 +--- a/kernel/user_namespace.c ++++ b/kernel/user_namespace.c +@@ -549,8 +549,10 @@ static void *m_start(struct seq_file *seq, loff_t *ppos, + struct uid_gid_extent *extent = NULL; + loff_t pos = *ppos; + +- if (pos < map->nr_extents) ++ if (pos < map->nr_extents) { 
++ gmb(); + extent = &map->extent[pos]; ++ } + + return extent; + } +-- +2.14.2 + diff --git a/patches/kernel/0270-fs-prevent-speculative-execution.patch b/patches/kernel/0270-fs-prevent-speculative-execution.patch deleted file mode 100644 index bf85225..0000000 --- a/patches/kernel/0270-fs-prevent-speculative-execution.patch +++ /dev/null @@ -1,42 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Elena Reshetova -Date: Mon, 4 Sep 2017 13:11:54 +0300 -Subject: [PATCH] fs: prevent speculative execution -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -Real commit text tbd - -Signed-off-by: Elena Reshetova -Signed-off-by: Tim Chen -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 1ca9e14b253a501f055c3ea29d992c028473676e) -Signed-off-by: Fabian Grünbichler ---- - include/linux/fdtable.h | 4 +++- - 1 file changed, 3 insertions(+), 1 deletion(-) - -diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h -index 6e84b2cae6ad..09b124542bb8 100644 ---- a/include/linux/fdtable.h -+++ b/include/linux/fdtable.h -@@ -81,8 +81,10 @@ static inline struct file *__fcheck_files(struct files_struct *files, unsigned i - { - struct fdtable *fdt = rcu_dereference_raw(files->fdt); - -- if (fd < fdt->max_fds) -+ if (fd < fdt->max_fds) { -+ gmb(); - return rcu_dereference_raw(fdt->fd[fd]); -+ } - return NULL; - } - --- -2.14.2 - diff --git a/patches/kernel/0270-ipv6-prevent-speculative-execution.patch b/patches/kernel/0270-ipv6-prevent-speculative-execution.patch new file mode 100644 index 0000000..a5fa697 --- /dev/null +++ b/patches/kernel/0270-ipv6-prevent-speculative-execution.patch @@ -0,0 +1,38 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Elena Reshetova +Date: Mon, 4 Sep 2017 13:11:53 +0300 +Subject: [PATCH] ipv6: prevent speculative execution +MIME-Version: 1.0 +Content-Type: 
text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +Real commit text tbd + +Signed-off-by: Elena Reshetova +Signed-off-by: Tim Chen +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit fdb98114a31aa5c0083bd7cd5b42ea569b6f77dc) +Signed-off-by: Fabian Grünbichler +--- + net/ipv6/raw.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c +index 60be012fe708..1a0eae661512 100644 +--- a/net/ipv6/raw.c ++++ b/net/ipv6/raw.c +@@ -726,6 +726,7 @@ static int raw6_getfrag(void *from, char *to, int offset, int len, int odd, + if (offset < rfv->hlen) { + int copy = min(rfv->hlen - offset, len); + ++ gmb(); + if (skb->ip_summed == CHECKSUM_PARTIAL) + memcpy(to, rfv->c + offset, copy); + else +-- +2.14.2 + diff --git a/patches/kernel/0271-fs-prevent-speculative-execution.patch b/patches/kernel/0271-fs-prevent-speculative-execution.patch new file mode 100644 index 0000000..bf85225 --- /dev/null +++ b/patches/kernel/0271-fs-prevent-speculative-execution.patch @@ -0,0 +1,42 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Elena Reshetova +Date: Mon, 4 Sep 2017 13:11:54 +0300 +Subject: [PATCH] fs: prevent speculative execution +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +Real commit text tbd + +Signed-off-by: Elena Reshetova +Signed-off-by: Tim Chen +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 1ca9e14b253a501f055c3ea29d992c028473676e) +Signed-off-by: Fabian Grünbichler +--- + include/linux/fdtable.h | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h +index 6e84b2cae6ad..09b124542bb8 100644 +--- a/include/linux/fdtable.h ++++ b/include/linux/fdtable.h +@@ -81,8 +81,10 @@ static inline struct file *__fcheck_files(struct 
files_struct *files, unsigned i + { + struct fdtable *fdt = rcu_dereference_raw(files->fdt); + +- if (fd < fdt->max_fds) ++ if (fd < fdt->max_fds) { ++ gmb(); + return rcu_dereference_raw(fdt->fd[fd]); ++ } + return NULL; + } + +-- +2.14.2 + diff --git a/patches/kernel/0271-net-mpls-prevent-speculative-execution.patch b/patches/kernel/0271-net-mpls-prevent-speculative-execution.patch deleted file mode 100644 index cc840c9..0000000 --- a/patches/kernel/0271-net-mpls-prevent-speculative-execution.patch +++ /dev/null @@ -1,39 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Elena Reshetova -Date: Mon, 4 Sep 2017 13:11:55 +0300 -Subject: [PATCH] net: mpls: prevent speculative execution -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -Real commit text tbd - -Signed-off-by: Elena Reshetova -Signed-off-by: Tim Chen -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 98a9550398f87c5430d5e893104e21caa1e2e8d3) -Signed-off-by: Fabian Grünbichler ---- - net/mpls/af_mpls.c | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c -index ea4f481839dd..08dfb99e19f2 100644 ---- a/net/mpls/af_mpls.c -+++ b/net/mpls/af_mpls.c -@@ -50,6 +50,8 @@ static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned index) - if (index < net->mpls.platform_labels) { - struct mpls_route __rcu **platform_label = - rcu_dereference(net->mpls.platform_label); -+ -+ gmb(); - rt = rcu_dereference(platform_label[index]); - } - return rt; --- -2.14.2 - diff --git a/patches/kernel/0272-net-mpls-prevent-speculative-execution.patch b/patches/kernel/0272-net-mpls-prevent-speculative-execution.patch new file mode 100644 index 0000000..cc840c9 --- /dev/null +++ b/patches/kernel/0272-net-mpls-prevent-speculative-execution.patch @@ -0,0 +1,39 @@ +From 0000000000000000000000000000000000000000 Mon Sep 
17 00:00:00 2001 +From: Elena Reshetova +Date: Mon, 4 Sep 2017 13:11:55 +0300 +Subject: [PATCH] net: mpls: prevent speculative execution +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +Real commit text tbd + +Signed-off-by: Elena Reshetova +Signed-off-by: Tim Chen +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 98a9550398f87c5430d5e893104e21caa1e2e8d3) +Signed-off-by: Fabian Grünbichler +--- + net/mpls/af_mpls.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c +index ea4f481839dd..08dfb99e19f2 100644 +--- a/net/mpls/af_mpls.c ++++ b/net/mpls/af_mpls.c +@@ -50,6 +50,8 @@ static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned index) + if (index < net->mpls.platform_labels) { + struct mpls_route __rcu **platform_label = + rcu_dereference(net->mpls.platform_label); ++ ++ gmb(); + rt = rcu_dereference(platform_label[index]); + } + return rt; +-- +2.14.2 + diff --git a/patches/kernel/0272-udf-prevent-speculative-execution.patch b/patches/kernel/0272-udf-prevent-speculative-execution.patch deleted file mode 100644 index 0287316..0000000 --- a/patches/kernel/0272-udf-prevent-speculative-execution.patch +++ /dev/null @@ -1,57 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Elena Reshetova -Date: Mon, 4 Sep 2017 13:11:56 +0300 -Subject: [PATCH] udf: prevent speculative execution -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -Real commit text tbd - -Signed-off-by: Elena Reshetova -Signed-off-by: Tim Chen -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit f575840dd363aa80a14faacddf90b95db1185e2c) -Signed-off-by: Fabian Grünbichler ---- - fs/udf/misc.c | 6 ++++++ - 1 file changed, 6 insertions(+) - -diff --git 
a/fs/udf/misc.c b/fs/udf/misc.c -index 3949c4bec3a3..4bd10b2e8540 100644 ---- a/fs/udf/misc.c -+++ b/fs/udf/misc.c -@@ -104,6 +104,8 @@ struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size, - iinfo->i_lenEAttr) { - uint32_t aal = - le32_to_cpu(eahd->appAttrLocation); -+ -+ gmb(); - memmove(&ea[offset - aal + size], - &ea[aal], offset - aal); - offset -= aal; -@@ -114,6 +116,8 @@ struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size, - iinfo->i_lenEAttr) { - uint32_t ial = - le32_to_cpu(eahd->impAttrLocation); -+ -+ gmb(); - memmove(&ea[offset - ial + size], - &ea[ial], offset - ial); - offset -= ial; -@@ -125,6 +129,8 @@ struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size, - iinfo->i_lenEAttr) { - uint32_t aal = - le32_to_cpu(eahd->appAttrLocation); -+ -+ gmb(); - memmove(&ea[offset - aal + size], - &ea[aal], offset - aal); - offset -= aal; --- -2.14.2 - diff --git a/patches/kernel/0273-udf-prevent-speculative-execution.patch b/patches/kernel/0273-udf-prevent-speculative-execution.patch new file mode 100644 index 0000000..0287316 --- /dev/null +++ b/patches/kernel/0273-udf-prevent-speculative-execution.patch @@ -0,0 +1,57 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Elena Reshetova +Date: Mon, 4 Sep 2017 13:11:56 +0300 +Subject: [PATCH] udf: prevent speculative execution +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +Real commit text tbd + +Signed-off-by: Elena Reshetova +Signed-off-by: Tim Chen +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit f575840dd363aa80a14faacddf90b95db1185e2c) +Signed-off-by: Fabian Grünbichler +--- + fs/udf/misc.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/fs/udf/misc.c b/fs/udf/misc.c +index 3949c4bec3a3..4bd10b2e8540 100644 +--- a/fs/udf/misc.c ++++ b/fs/udf/misc.c +@@ -104,6 
+104,8 @@ struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size, + iinfo->i_lenEAttr) { + uint32_t aal = + le32_to_cpu(eahd->appAttrLocation); ++ ++ gmb(); + memmove(&ea[offset - aal + size], + &ea[aal], offset - aal); + offset -= aal; +@@ -114,6 +116,8 @@ struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size, + iinfo->i_lenEAttr) { + uint32_t ial = + le32_to_cpu(eahd->impAttrLocation); ++ ++ gmb(); + memmove(&ea[offset - ial + size], + &ea[ial], offset - ial); + offset -= ial; +@@ -125,6 +129,8 @@ struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size, + iinfo->i_lenEAttr) { + uint32_t aal = + le32_to_cpu(eahd->appAttrLocation); ++ ++ gmb(); + memmove(&ea[offset - aal + size], + &ea[aal], offset - aal); + offset -= aal; +-- +2.14.2 + diff --git a/patches/kernel/0273-x86-feature-Enable-the-x86-feature-to-control-Specul.patch b/patches/kernel/0273-x86-feature-Enable-the-x86-feature-to-control-Specul.patch deleted file mode 100644 index 2284c1b..0000000 --- a/patches/kernel/0273-x86-feature-Enable-the-x86-feature-to-control-Specul.patch +++ /dev/null @@ -1,77 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Tim Chen -Date: Thu, 24 Aug 2017 09:34:41 -0700 -Subject: [PATCH] x86/feature: Enable the x86 feature to control Speculation -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -cpuid ax=0x7, return rdx bit 26 to indicate presence of this feature -IA32_SPEC_CTRL (0x48) and IA32_PRED_CMD (0x49) -IA32_SPEC_CTRL, bit0 – Indirect Branch Restricted Speculation (IBRS) -IA32_PRED_CMD, bit0 – Indirect Branch Prediction Barrier (IBPB) - -Signed-off-by: Tim Chen -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit f1f160a92b70c25d6e6e76788463bbec86a73313) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/cpufeatures.h | 1 + - 
arch/x86/include/asm/msr-index.h | 5 +++++ - arch/x86/kernel/cpu/scattered.c | 1 + - 3 files changed, 7 insertions(+) - -diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h -index 3928050b51b0..44be8fd069bf 100644 ---- a/arch/x86/include/asm/cpufeatures.h -+++ b/arch/x86/include/asm/cpufeatures.h -@@ -208,6 +208,7 @@ - #define X86_FEATURE_AVX512_4FMAPS ( 7*32+17) /* AVX-512 Multiply Accumulation Single precision */ - - #define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ -+#define X86_FEATURE_SPEC_CTRL ( 7*32+19) /* Control Speculation Control */ - - /* Virtualization flags: Linux defined, word 8 */ - #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ -diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h -index db88b7f852b4..4e3438a00a50 100644 ---- a/arch/x86/include/asm/msr-index.h -+++ b/arch/x86/include/asm/msr-index.h -@@ -41,6 +41,9 @@ - #define MSR_PPIN_CTL 0x0000004e - #define MSR_PPIN 0x0000004f - -+#define MSR_IA32_SPEC_CTRL 0x00000048 -+#define MSR_IA32_PRED_CMD 0x00000049 -+ - #define MSR_IA32_PERFCTR0 0x000000c1 - #define MSR_IA32_PERFCTR1 0x000000c2 - #define MSR_FSB_FREQ 0x000000cd -@@ -437,6 +440,8 @@ - #define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1) - #define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX (1<<2) - #define FEATURE_CONTROL_LMCE (1<<20) -+#define FEATURE_ENABLE_IBRS (1<<0) -+#define FEATURE_SET_IBPB (1<<0) - - #define MSR_IA32_APICBASE 0x0000001b - #define MSR_IA32_APICBASE_BSP (1<<8) -diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c -index 23c23508c012..9651ea395812 100644 ---- a/arch/x86/kernel/cpu/scattered.c -+++ b/arch/x86/kernel/cpu/scattered.c -@@ -24,6 +24,7 @@ static const struct cpuid_bit cpuid_bits[] = { - { X86_FEATURE_INTEL_PT, CPUID_EBX, 25, 0x00000007, 0 }, - { X86_FEATURE_AVX512_4VNNIW, CPUID_EDX, 2, 0x00000007, 0 }, - { X86_FEATURE_AVX512_4FMAPS, CPUID_EDX, 3, 0x00000007, 0 }, -+ { 
X86_FEATURE_SPEC_CTRL, CPUID_EDX, 26, 0x00000007, 0 }, - { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 }, - { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 }, - { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 }, --- -2.14.2 - diff --git a/patches/kernel/0274-x86-feature-Enable-the-x86-feature-to-control-Specul.patch b/patches/kernel/0274-x86-feature-Enable-the-x86-feature-to-control-Specul.patch new file mode 100644 index 0000000..2284c1b --- /dev/null +++ b/patches/kernel/0274-x86-feature-Enable-the-x86-feature-to-control-Specul.patch @@ -0,0 +1,77 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Tim Chen +Date: Thu, 24 Aug 2017 09:34:41 -0700 +Subject: [PATCH] x86/feature: Enable the x86 feature to control Speculation +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +cpuid ax=0x7, return rdx bit 26 to indicate presence of this feature +IA32_SPEC_CTRL (0x48) and IA32_PRED_CMD (0x49) +IA32_SPEC_CTRL, bit0 – Indirect Branch Restricted Speculation (IBRS) +IA32_PRED_CMD, bit0 – Indirect Branch Prediction Barrier (IBPB) + +Signed-off-by: Tim Chen +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit f1f160a92b70c25d6e6e76788463bbec86a73313) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/cpufeatures.h | 1 + + arch/x86/include/asm/msr-index.h | 5 +++++ + arch/x86/kernel/cpu/scattered.c | 1 + + 3 files changed, 7 insertions(+) + +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index 3928050b51b0..44be8fd069bf 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -208,6 +208,7 @@ + #define X86_FEATURE_AVX512_4FMAPS ( 7*32+17) /* AVX-512 Multiply Accumulation Single precision */ + + #define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ ++#define X86_FEATURE_SPEC_CTRL ( 7*32+19) /* Control Speculation 
Control */ + + /* Virtualization flags: Linux defined, word 8 */ + #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ +diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h +index db88b7f852b4..4e3438a00a50 100644 +--- a/arch/x86/include/asm/msr-index.h ++++ b/arch/x86/include/asm/msr-index.h +@@ -41,6 +41,9 @@ + #define MSR_PPIN_CTL 0x0000004e + #define MSR_PPIN 0x0000004f + ++#define MSR_IA32_SPEC_CTRL 0x00000048 ++#define MSR_IA32_PRED_CMD 0x00000049 ++ + #define MSR_IA32_PERFCTR0 0x000000c1 + #define MSR_IA32_PERFCTR1 0x000000c2 + #define MSR_FSB_FREQ 0x000000cd +@@ -437,6 +440,8 @@ + #define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1) + #define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX (1<<2) + #define FEATURE_CONTROL_LMCE (1<<20) ++#define FEATURE_ENABLE_IBRS (1<<0) ++#define FEATURE_SET_IBPB (1<<0) + + #define MSR_IA32_APICBASE 0x0000001b + #define MSR_IA32_APICBASE_BSP (1<<8) +diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c +index 23c23508c012..9651ea395812 100644 +--- a/arch/x86/kernel/cpu/scattered.c ++++ b/arch/x86/kernel/cpu/scattered.c +@@ -24,6 +24,7 @@ static const struct cpuid_bit cpuid_bits[] = { + { X86_FEATURE_INTEL_PT, CPUID_EBX, 25, 0x00000007, 0 }, + { X86_FEATURE_AVX512_4VNNIW, CPUID_EDX, 2, 0x00000007, 0 }, + { X86_FEATURE_AVX512_4FMAPS, CPUID_EDX, 3, 0x00000007, 0 }, ++ { X86_FEATURE_SPEC_CTRL, CPUID_EDX, 26, 0x00000007, 0 }, + { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 }, + { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 }, + { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 }, +-- +2.14.2 + diff --git a/patches/kernel/0274-x86-feature-Report-presence-of-IBPB-and-IBRS-control.patch b/patches/kernel/0274-x86-feature-Report-presence-of-IBPB-and-IBRS-control.patch deleted file mode 100644 index f4944f4..0000000 --- a/patches/kernel/0274-x86-feature-Report-presence-of-IBPB-and-IBRS-control.patch +++ /dev/null @@ -1,41 +0,0 @@ -From 
0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Tim Chen -Date: Wed, 27 Sep 2017 12:09:14 -0700 -Subject: [PATCH] x86/feature: Report presence of IBPB and IBRS control -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -Report presence of IBPB and IBRS. - -Signed-off-by: Tim Chen -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit c41156d893e7f48bebf8d71cfddd39d8fb2724f8) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/cpu/intel.c | 5 +++++ - 1 file changed, 5 insertions(+) - -diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c -index dfa90a3a5145..f1d94c73625a 100644 ---- a/arch/x86/kernel/cpu/intel.c -+++ b/arch/x86/kernel/cpu/intel.c -@@ -627,6 +627,11 @@ static void init_intel(struct cpuinfo_x86 *c) - init_intel_energy_perf(c); - - init_intel_misc_features(c); -+ -+ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) -+ printk_once(KERN_INFO "FEATURE SPEC_CTRL Present\n"); -+ else -+ printk_once(KERN_INFO "FEATURE SPEC_CTRL Not Present\n"); - } - - #ifdef CONFIG_X86_32 --- -2.14.2 - diff --git a/patches/kernel/0275-x86-enter-MACROS-to-set-clear-IBRS-and-set-IBPB.patch b/patches/kernel/0275-x86-enter-MACROS-to-set-clear-IBRS-and-set-IBPB.patch deleted file mode 100644 index f150ffa..0000000 --- a/patches/kernel/0275-x86-enter-MACROS-to-set-clear-IBRS-and-set-IBPB.patch +++ /dev/null @@ -1,84 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Tim Chen -Date: Fri, 15 Sep 2017 18:04:53 -0700 -Subject: [PATCH] x86/enter: MACROS to set/clear IBRS and set IBPB -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -Setup macros to control IBRS and IBPB - -Signed-off-by: Tim Chen -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 
171d754fe3b783d361555cf2569e68a7b0e0d54a) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/spec_ctrl.h | 52 ++++++++++++++++++++++++++++++++++++++++ - 1 file changed, 52 insertions(+) - create mode 100644 arch/x86/include/asm/spec_ctrl.h - -diff --git a/arch/x86/include/asm/spec_ctrl.h b/arch/x86/include/asm/spec_ctrl.h -new file mode 100644 -index 000000000000..7f8bb09b6acb ---- /dev/null -+++ b/arch/x86/include/asm/spec_ctrl.h -@@ -0,0 +1,52 @@ -+#ifndef _ASM_X86_SPEC_CTRL_H -+#define _ASM_X86_SPEC_CTRL_H -+ -+#include -+#include -+#include -+#include -+ -+#ifdef __ASSEMBLY__ -+ -+#define __ASM_ENABLE_IBRS \ -+ pushq %rax; \ -+ pushq %rcx; \ -+ pushq %rdx; \ -+ movl $MSR_IA32_SPEC_CTRL, %ecx; \ -+ movl $0, %edx; \ -+ movl $FEATURE_ENABLE_IBRS, %eax; \ -+ wrmsr; \ -+ popq %rdx; \ -+ popq %rcx; \ -+ popq %rax -+#define __ASM_ENABLE_IBRS_CLOBBER \ -+ movl $MSR_IA32_SPEC_CTRL, %ecx; \ -+ movl $0, %edx; \ -+ movl $FEATURE_ENABLE_IBRS, %eax; \ -+ wrmsr; -+#define __ASM_DISABLE_IBRS \ -+ pushq %rax; \ -+ pushq %rcx; \ -+ pushq %rdx; \ -+ movl $MSR_IA32_SPEC_CTRL, %ecx; \ -+ movl $0, %edx; \ -+ movl $0, %eax; \ -+ wrmsr; \ -+ popq %rdx; \ -+ popq %rcx; \ -+ popq %rax -+ -+.macro ENABLE_IBRS -+ALTERNATIVE "", __stringify(__ASM_ENABLE_IBRS), X86_FEATURE_SPEC_CTRL -+.endm -+ -+.macro ENABLE_IBRS_CLOBBER -+ALTERNATIVE "", __stringify(__ASM_ENABLE_IBRS_CLOBBER), X86_FEATURE_SPEC_CTRL -+.endm -+ -+.macro DISABLE_IBRS -+ALTERNATIVE "", __stringify(__ASM_DISABLE_IBRS), X86_FEATURE_SPEC_CTRL -+.endm -+ -+#endif /* __ASSEMBLY__ */ -+#endif /* _ASM_X86_SPEC_CTRL_H */ --- -2.14.2 - diff --git a/patches/kernel/0275-x86-feature-Report-presence-of-IBPB-and-IBRS-control.patch b/patches/kernel/0275-x86-feature-Report-presence-of-IBPB-and-IBRS-control.patch new file mode 100644 index 0000000..f4944f4 --- /dev/null +++ b/patches/kernel/0275-x86-feature-Report-presence-of-IBPB-and-IBRS-control.patch @@ -0,0 +1,41 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 
00:00:00 2001 +From: Tim Chen +Date: Wed, 27 Sep 2017 12:09:14 -0700 +Subject: [PATCH] x86/feature: Report presence of IBPB and IBRS control +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +Report presence of IBPB and IBRS. + +Signed-off-by: Tim Chen +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit c41156d893e7f48bebf8d71cfddd39d8fb2724f8) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/cpu/intel.c | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c +index dfa90a3a5145..f1d94c73625a 100644 +--- a/arch/x86/kernel/cpu/intel.c ++++ b/arch/x86/kernel/cpu/intel.c +@@ -627,6 +627,11 @@ static void init_intel(struct cpuinfo_x86 *c) + init_intel_energy_perf(c); + + init_intel_misc_features(c); ++ ++ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) ++ printk_once(KERN_INFO "FEATURE SPEC_CTRL Present\n"); ++ else ++ printk_once(KERN_INFO "FEATURE SPEC_CTRL Not Present\n"); + } + + #ifdef CONFIG_X86_32 +-- +2.14.2 + diff --git a/patches/kernel/0276-x86-enter-MACROS-to-set-clear-IBRS-and-set-IBPB.patch b/patches/kernel/0276-x86-enter-MACROS-to-set-clear-IBRS-and-set-IBPB.patch new file mode 100644 index 0000000..f150ffa --- /dev/null +++ b/patches/kernel/0276-x86-enter-MACROS-to-set-clear-IBRS-and-set-IBPB.patch @@ -0,0 +1,84 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Tim Chen +Date: Fri, 15 Sep 2017 18:04:53 -0700 +Subject: [PATCH] x86/enter: MACROS to set/clear IBRS and set IBPB +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +Setup macros to control IBRS and IBPB + +Signed-off-by: Tim Chen +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 171d754fe3b783d361555cf2569e68a7b0e0d54a) +Signed-off-by: Fabian 
Grünbichler +--- + arch/x86/include/asm/spec_ctrl.h | 52 ++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 52 insertions(+) + create mode 100644 arch/x86/include/asm/spec_ctrl.h + +diff --git a/arch/x86/include/asm/spec_ctrl.h b/arch/x86/include/asm/spec_ctrl.h +new file mode 100644 +index 000000000000..7f8bb09b6acb +--- /dev/null ++++ b/arch/x86/include/asm/spec_ctrl.h +@@ -0,0 +1,52 @@ ++#ifndef _ASM_X86_SPEC_CTRL_H ++#define _ASM_X86_SPEC_CTRL_H ++ ++#include ++#include ++#include ++#include ++ ++#ifdef __ASSEMBLY__ ++ ++#define __ASM_ENABLE_IBRS \ ++ pushq %rax; \ ++ pushq %rcx; \ ++ pushq %rdx; \ ++ movl $MSR_IA32_SPEC_CTRL, %ecx; \ ++ movl $0, %edx; \ ++ movl $FEATURE_ENABLE_IBRS, %eax; \ ++ wrmsr; \ ++ popq %rdx; \ ++ popq %rcx; \ ++ popq %rax ++#define __ASM_ENABLE_IBRS_CLOBBER \ ++ movl $MSR_IA32_SPEC_CTRL, %ecx; \ ++ movl $0, %edx; \ ++ movl $FEATURE_ENABLE_IBRS, %eax; \ ++ wrmsr; ++#define __ASM_DISABLE_IBRS \ ++ pushq %rax; \ ++ pushq %rcx; \ ++ pushq %rdx; \ ++ movl $MSR_IA32_SPEC_CTRL, %ecx; \ ++ movl $0, %edx; \ ++ movl $0, %eax; \ ++ wrmsr; \ ++ popq %rdx; \ ++ popq %rcx; \ ++ popq %rax ++ ++.macro ENABLE_IBRS ++ALTERNATIVE "", __stringify(__ASM_ENABLE_IBRS), X86_FEATURE_SPEC_CTRL ++.endm ++ ++.macro ENABLE_IBRS_CLOBBER ++ALTERNATIVE "", __stringify(__ASM_ENABLE_IBRS_CLOBBER), X86_FEATURE_SPEC_CTRL ++.endm ++ ++.macro DISABLE_IBRS ++ALTERNATIVE "", __stringify(__ASM_DISABLE_IBRS), X86_FEATURE_SPEC_CTRL ++.endm ++ ++#endif /* __ASSEMBLY__ */ ++#endif /* _ASM_X86_SPEC_CTRL_H */ +-- +2.14.2 + diff --git a/patches/kernel/0276-x86-enter-Use-IBRS-on-syscall-and-interrupts.patch b/patches/kernel/0276-x86-enter-Use-IBRS-on-syscall-and-interrupts.patch deleted file mode 100644 index 68f96e7..0000000 --- a/patches/kernel/0276-x86-enter-Use-IBRS-on-syscall-and-interrupts.patch +++ /dev/null @@ -1,171 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Tim Chen -Date: Fri, 13 Oct 2017 14:25:00 -0700 -Subject: [PATCH] 
x86/enter: Use IBRS on syscall and interrupts -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -Set IBRS upon kernel entrance via syscall and interrupts. Clear it upon exit. - -Signed-off-by: Tim Chen -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit d7eb5f9ed26dbdc39df793491bdcc9f80d41325e) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/entry/entry_64.S | 18 +++++++++++++++++- - arch/x86/entry/entry_64_compat.S | 7 +++++++ - 2 files changed, 24 insertions(+), 1 deletion(-) - -diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S -index b48f2c78a9bf..5f898c3c1dad 100644 ---- a/arch/x86/entry/entry_64.S -+++ b/arch/x86/entry/entry_64.S -@@ -36,6 +36,7 @@ - #include - #include - #include -+#include - #include - - #include "calling.h" -@@ -235,6 +236,8 @@ GLOBAL(entry_SYSCALL_64_after_hwframe) - sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ - UNWIND_HINT_REGS extra=0 - -+ ENABLE_IBRS -+ - /* - * If we need to do entry work or if we guess we'll need to do - * exit work, go straight to the slow path. -@@ -286,6 +289,7 @@ entry_SYSCALL_64_fastpath: - TRACE_IRQS_ON /* user mode is traced as IRQs on */ - movq RIP(%rsp), %rcx - movq EFLAGS(%rsp), %r11 -+ DISABLE_IBRS - addq $6*8, %rsp /* skip extra regs -- they were preserved */ - UNWIND_HINT_EMPTY - jmp .Lpop_c_regs_except_rcx_r11_and_sysret -@@ -379,6 +383,8 @@ return_from_SYSCALL_64: - * perf profiles. Nothing jumps here. - */ - syscall_return_via_sysret: -+ DISABLE_IBRS -+ - /* rcx and r11 are already restored (see code above) */ - UNWIND_HINT_EMPTY - POP_EXTRA_REGS -@@ -660,6 +666,10 @@ END(irq_entries_start) - /* - * IRQ from user mode. - * -+ */ -+ ENABLE_IBRS -+ -+ /* - * We need to tell lockdep that IRQs are off. We can't do this until - * we fix gsbase, and we should do it before enter_from_user_mode - * (which can take locks). 
Since TRACE_IRQS_OFF idempotent, -@@ -743,7 +753,7 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode) - * We are on the trampoline stack. All regs except RDI are live. - * We can do future final exit work right here. - */ -- -+ DISABLE_IBRS - SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi - - /* Restore RDI. */ -@@ -1277,6 +1287,7 @@ ENTRY(paranoid_entry) - - 1: - SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14 -+ ENABLE_IBRS_CLOBBER - - ret - END(paranoid_entry) -@@ -1331,6 +1342,8 @@ ENTRY(error_entry) - /* We have user CR3. Change to kernel CR3. */ - SWITCH_TO_KERNEL_CR3 scratch_reg=%rax - -+ ENABLE_IBRS -+ - .Lerror_entry_from_usermode_after_swapgs: - /* Put us onto the real thread stack. */ - popq %r12 /* save return addr in %12 */ -@@ -1377,6 +1390,7 @@ ENTRY(error_entry) - */ - SWAPGS - SWITCH_TO_KERNEL_CR3 scratch_reg=%rax -+ ENABLE_IBRS_CLOBBER - jmp .Lerror_entry_done - - .Lbstep_iret: -@@ -1391,6 +1405,7 @@ ENTRY(error_entry) - */ - SWAPGS - SWITCH_TO_KERNEL_CR3 scratch_reg=%rax -+ ENABLE_IBRS_CLOBBER - - /* - * Pretend that the exception came from user mode: set up pt_regs -@@ -1518,6 +1533,7 @@ ENTRY(nmi) - UNWIND_HINT_REGS - ENCODE_FRAME_POINTER - -+ ENABLE_IBRS - /* - * At this point we no longer need to worry about stack damage - * due to nesting -- we're on the normal thread stack and we're -diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S -index 2b5e7685823c..ee4f3edb3c50 100644 ---- a/arch/x86/entry/entry_64_compat.S -+++ b/arch/x86/entry/entry_64_compat.S -@@ -13,6 +13,7 @@ - #include - #include - #include -+#include - #include - #include - -@@ -95,6 +96,8 @@ ENTRY(entry_SYSENTER_compat) - pushq $0 /* pt_regs->r15 = 0 */ - cld - -+ ENABLE_IBRS -+ - /* - * SYSENTER doesn't filter flags, so we need to clear NT and AC - * ourselves. To save a few cycles, we can check whether -@@ -194,6 +197,7 @@ ENTRY(entry_SYSCALL_compat) - - /* Use %rsp as scratch reg. 
User ESP is stashed in r8 */ - SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp -+ ENABLE_IBRS - - /* Switch to the kernel stack */ - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp -@@ -249,6 +253,7 @@ sysret32_from_system_call: - popq %rsi /* pt_regs->si */ - popq %rdi /* pt_regs->di */ - -+ DISABLE_IBRS - /* - * USERGS_SYSRET32 does: - * GSBASE = user's GS base -@@ -348,6 +353,8 @@ ENTRY(entry_INT80_compat) - pushq %r15 /* pt_regs->r15 */ - cld - -+ ENABLE_IBRS -+ - /* - * User mode is traced as though IRQs are on, and the interrupt - * gate turned them off. --- -2.14.2 - diff --git a/patches/kernel/0277-x86-enter-Use-IBRS-on-syscall-and-interrupts.patch b/patches/kernel/0277-x86-enter-Use-IBRS-on-syscall-and-interrupts.patch new file mode 100644 index 0000000..68f96e7 --- /dev/null +++ b/patches/kernel/0277-x86-enter-Use-IBRS-on-syscall-and-interrupts.patch @@ -0,0 +1,171 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Tim Chen +Date: Fri, 13 Oct 2017 14:25:00 -0700 +Subject: [PATCH] x86/enter: Use IBRS on syscall and interrupts +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +Set IBRS upon kernel entrance via syscall and interrupts. Clear it upon exit. 
+ +Signed-off-by: Tim Chen +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit d7eb5f9ed26dbdc39df793491bdcc9f80d41325e) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/entry/entry_64.S | 18 +++++++++++++++++- + arch/x86/entry/entry_64_compat.S | 7 +++++++ + 2 files changed, 24 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index b48f2c78a9bf..5f898c3c1dad 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -36,6 +36,7 @@ + #include + #include + #include ++#include + #include + + #include "calling.h" +@@ -235,6 +236,8 @@ GLOBAL(entry_SYSCALL_64_after_hwframe) + sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ + UNWIND_HINT_REGS extra=0 + ++ ENABLE_IBRS ++ + /* + * If we need to do entry work or if we guess we'll need to do + * exit work, go straight to the slow path. +@@ -286,6 +289,7 @@ entry_SYSCALL_64_fastpath: + TRACE_IRQS_ON /* user mode is traced as IRQs on */ + movq RIP(%rsp), %rcx + movq EFLAGS(%rsp), %r11 ++ DISABLE_IBRS + addq $6*8, %rsp /* skip extra regs -- they were preserved */ + UNWIND_HINT_EMPTY + jmp .Lpop_c_regs_except_rcx_r11_and_sysret +@@ -379,6 +383,8 @@ return_from_SYSCALL_64: + * perf profiles. Nothing jumps here. + */ + syscall_return_via_sysret: ++ DISABLE_IBRS ++ + /* rcx and r11 are already restored (see code above) */ + UNWIND_HINT_EMPTY + POP_EXTRA_REGS +@@ -660,6 +666,10 @@ END(irq_entries_start) + /* + * IRQ from user mode. + * ++ */ ++ ENABLE_IBRS ++ ++ /* + * We need to tell lockdep that IRQs are off. We can't do this until + * we fix gsbase, and we should do it before enter_from_user_mode + * (which can take locks). Since TRACE_IRQS_OFF idempotent, +@@ -743,7 +753,7 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode) + * We are on the trampoline stack. All regs except RDI are live. + * We can do future final exit work right here. 
+ */ +- ++ DISABLE_IBRS + SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi + + /* Restore RDI. */ +@@ -1277,6 +1287,7 @@ ENTRY(paranoid_entry) + + 1: + SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14 ++ ENABLE_IBRS_CLOBBER + + ret + END(paranoid_entry) +@@ -1331,6 +1342,8 @@ ENTRY(error_entry) + /* We have user CR3. Change to kernel CR3. */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax + ++ ENABLE_IBRS ++ + .Lerror_entry_from_usermode_after_swapgs: + /* Put us onto the real thread stack. */ + popq %r12 /* save return addr in %12 */ +@@ -1377,6 +1390,7 @@ ENTRY(error_entry) + */ + SWAPGS + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax ++ ENABLE_IBRS_CLOBBER + jmp .Lerror_entry_done + + .Lbstep_iret: +@@ -1391,6 +1405,7 @@ ENTRY(error_entry) + */ + SWAPGS + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax ++ ENABLE_IBRS_CLOBBER + + /* + * Pretend that the exception came from user mode: set up pt_regs +@@ -1518,6 +1533,7 @@ ENTRY(nmi) + UNWIND_HINT_REGS + ENCODE_FRAME_POINTER + ++ ENABLE_IBRS + /* + * At this point we no longer need to worry about stack damage + * due to nesting -- we're on the normal thread stack and we're +diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S +index 2b5e7685823c..ee4f3edb3c50 100644 +--- a/arch/x86/entry/entry_64_compat.S ++++ b/arch/x86/entry/entry_64_compat.S +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -95,6 +96,8 @@ ENTRY(entry_SYSENTER_compat) + pushq $0 /* pt_regs->r15 = 0 */ + cld + ++ ENABLE_IBRS ++ + /* + * SYSENTER doesn't filter flags, so we need to clear NT and AC + * ourselves. To save a few cycles, we can check whether +@@ -194,6 +197,7 @@ ENTRY(entry_SYSCALL_compat) + + /* Use %rsp as scratch reg. 
User ESP is stashed in r8 */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp ++ ENABLE_IBRS + + /* Switch to the kernel stack */ + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp +@@ -249,6 +253,7 @@ sysret32_from_system_call: + popq %rsi /* pt_regs->si */ + popq %rdi /* pt_regs->di */ + ++ DISABLE_IBRS + /* + * USERGS_SYSRET32 does: + * GSBASE = user's GS base +@@ -348,6 +353,8 @@ ENTRY(entry_INT80_compat) + pushq %r15 /* pt_regs->r15 */ + cld + ++ ENABLE_IBRS ++ + /* + * User mode is traced as though IRQs are on, and the interrupt + * gate turned them off. +-- +2.14.2 + diff --git a/patches/kernel/0277-x86-idle-Disable-IBRS-entering-idle-and-enable-it-on.patch b/patches/kernel/0277-x86-idle-Disable-IBRS-entering-idle-and-enable-it-on.patch deleted file mode 100644 index cc4b348..0000000 --- a/patches/kernel/0277-x86-idle-Disable-IBRS-entering-idle-and-enable-it-on.patch +++ /dev/null @@ -1,117 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Tim Chen -Date: Mon, 6 Nov 2017 18:19:14 -0800 -Subject: [PATCH] x86/idle: Disable IBRS entering idle and enable it on wakeup -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -Clear IBRS on idle entry and set it on idle exit into kernel on mwait. 
- -Signed-off-by: Tim Chen -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 5521b04afda1d683c1ebad6c25c2529a88e6f061) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/mwait.h | 8 ++++++++ - arch/x86/kernel/process.c | 12 ++++++++++-- - arch/x86/lib/delay.c | 10 ++++++++++ - 3 files changed, 28 insertions(+), 2 deletions(-) - -diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h -index bda3c27f0da0..f15120ada161 100644 ---- a/arch/x86/include/asm/mwait.h -+++ b/arch/x86/include/asm/mwait.h -@@ -5,6 +5,8 @@ - #include - - #include -+#include -+#include - - #define MWAIT_SUBSTATE_MASK 0xf - #define MWAIT_CSTATE_MASK 0xf -@@ -105,9 +107,15 @@ static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) - mb(); - } - -+ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) -+ native_wrmsrl(MSR_IA32_SPEC_CTRL, 0); -+ - __monitor((void *)&current_thread_info()->flags, 0, 0); - if (!need_resched()) - __mwait(eax, ecx); -+ -+ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) -+ native_wrmsrl(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS); - } - current_clr_polling(); - } -diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c -index 07e6218ad7d9..3adb3806a284 100644 ---- a/arch/x86/kernel/process.c -+++ b/arch/x86/kernel/process.c -@@ -447,11 +447,19 @@ static __cpuidle void mwait_idle(void) - mb(); /* quirk */ - } - -+ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) -+ native_wrmsrl(MSR_IA32_SPEC_CTRL, 0); -+ - __monitor((void *)&current_thread_info()->flags, 0, 0); -- if (!need_resched()) -+ if (!need_resched()) { - __sti_mwait(0, 0); -- else -+ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) -+ native_wrmsrl(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS); -+ } else { -+ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) -+ native_wrmsrl(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS); - local_irq_enable(); -+ } - trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); - } else { - local_irq_enable(); -diff --git 
a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c -index cf2ac227c2ac..b088463973e4 100644 ---- a/arch/x86/lib/delay.c -+++ b/arch/x86/lib/delay.c -@@ -26,6 +26,8 @@ - # include - #endif - -+#define IBRS_DISABLE_THRESHOLD 1000 -+ - /* simple loop based delay: */ - static void delay_loop(unsigned long loops) - { -@@ -105,6 +107,10 @@ static void delay_mwaitx(unsigned long __loops) - for (;;) { - delay = min_t(u64, MWAITX_MAX_LOOPS, loops); - -+ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL) && -+ (delay > IBRS_DISABLE_THRESHOLD)) -+ native_wrmsrl(MSR_IA32_SPEC_CTRL, 0); -+ - /* - * Use cpu_tss_rw as a cacheline-aligned, seldomly - * accessed per-cpu variable as the monitor target. -@@ -118,6 +124,10 @@ static void delay_mwaitx(unsigned long __loops) - */ - __mwaitx(MWAITX_DISABLE_CSTATES, delay, MWAITX_ECX_TIMER_ENABLE); - -+ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL) && -+ (delay > IBRS_DISABLE_THRESHOLD)) -+ native_wrmsrl(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS); -+ - end = rdtsc_ordered(); - - if (loops <= end - start) --- -2.14.2 - diff --git a/patches/kernel/0278-x86-idle-Disable-IBRS-entering-idle-and-enable-it-on.patch b/patches/kernel/0278-x86-idle-Disable-IBRS-entering-idle-and-enable-it-on.patch new file mode 100644 index 0000000..cc4b348 --- /dev/null +++ b/patches/kernel/0278-x86-idle-Disable-IBRS-entering-idle-and-enable-it-on.patch @@ -0,0 +1,117 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Tim Chen +Date: Mon, 6 Nov 2017 18:19:14 -0800 +Subject: [PATCH] x86/idle: Disable IBRS entering idle and enable it on wakeup +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +Clear IBRS on idle entry and set it on idle exit into kernel on mwait. 
+ +Signed-off-by: Tim Chen +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 5521b04afda1d683c1ebad6c25c2529a88e6f061) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/mwait.h | 8 ++++++++ + arch/x86/kernel/process.c | 12 ++++++++++-- + arch/x86/lib/delay.c | 10 ++++++++++ + 3 files changed, 28 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h +index bda3c27f0da0..f15120ada161 100644 +--- a/arch/x86/include/asm/mwait.h ++++ b/arch/x86/include/asm/mwait.h +@@ -5,6 +5,8 @@ + #include + + #include ++#include ++#include + + #define MWAIT_SUBSTATE_MASK 0xf + #define MWAIT_CSTATE_MASK 0xf +@@ -105,9 +107,15 @@ static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) + mb(); + } + ++ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) ++ native_wrmsrl(MSR_IA32_SPEC_CTRL, 0); ++ + __monitor((void *)&current_thread_info()->flags, 0, 0); + if (!need_resched()) + __mwait(eax, ecx); ++ ++ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) ++ native_wrmsrl(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS); + } + current_clr_polling(); + } +diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c +index 07e6218ad7d9..3adb3806a284 100644 +--- a/arch/x86/kernel/process.c ++++ b/arch/x86/kernel/process.c +@@ -447,11 +447,19 @@ static __cpuidle void mwait_idle(void) + mb(); /* quirk */ + } + ++ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) ++ native_wrmsrl(MSR_IA32_SPEC_CTRL, 0); ++ + __monitor((void *)&current_thread_info()->flags, 0, 0); +- if (!need_resched()) ++ if (!need_resched()) { + __sti_mwait(0, 0); +- else ++ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) ++ native_wrmsrl(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS); ++ } else { ++ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) ++ native_wrmsrl(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS); + local_irq_enable(); ++ } + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); + } else { + local_irq_enable(); +diff --git 
a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c +index cf2ac227c2ac..b088463973e4 100644 +--- a/arch/x86/lib/delay.c ++++ b/arch/x86/lib/delay.c +@@ -26,6 +26,8 @@ + # include + #endif + ++#define IBRS_DISABLE_THRESHOLD 1000 ++ + /* simple loop based delay: */ + static void delay_loop(unsigned long loops) + { +@@ -105,6 +107,10 @@ static void delay_mwaitx(unsigned long __loops) + for (;;) { + delay = min_t(u64, MWAITX_MAX_LOOPS, loops); + ++ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL) && ++ (delay > IBRS_DISABLE_THRESHOLD)) ++ native_wrmsrl(MSR_IA32_SPEC_CTRL, 0); ++ + /* + * Use cpu_tss_rw as a cacheline-aligned, seldomly + * accessed per-cpu variable as the monitor target. +@@ -118,6 +124,10 @@ static void delay_mwaitx(unsigned long __loops) + */ + __mwaitx(MWAITX_DISABLE_CSTATES, delay, MWAITX_ECX_TIMER_ENABLE); + ++ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL) && ++ (delay > IBRS_DISABLE_THRESHOLD)) ++ native_wrmsrl(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS); ++ + end = rdtsc_ordered(); + + if (loops <= end - start) +-- +2.14.2 + diff --git a/patches/kernel/0278-x86-idle-Disable-IBRS-when-offlining-cpu-and-re-enab.patch b/patches/kernel/0278-x86-idle-Disable-IBRS-when-offlining-cpu-and-re-enab.patch deleted file mode 100644 index 8424c28..0000000 --- a/patches/kernel/0278-x86-idle-Disable-IBRS-when-offlining-cpu-and-re-enab.patch +++ /dev/null @@ -1,54 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Tim Chen -Date: Wed, 15 Nov 2017 12:24:19 -0800 -Subject: [PATCH] x86/idle: Disable IBRS when offlining cpu and re-enable on - wakeup -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -Clear IBRS when cpu is offlined and set it when brining it back online. 
- -Signed-off-by: Tim Chen -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 9bcf662c1690880b2464fe99d0f58dce53c0d89f) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/smpboot.c | 7 +++++++ - 1 file changed, 7 insertions(+) - -diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c -index 398e8324fea4..a652bff7add4 100644 ---- a/arch/x86/kernel/smpboot.c -+++ b/arch/x86/kernel/smpboot.c -@@ -77,6 +77,7 @@ - #include - #include - #include -+#include - - /* Number of siblings per CPU package */ - int smp_num_siblings = 1; -@@ -1692,9 +1693,15 @@ void native_play_dead(void) - play_dead_common(); - tboot_shutdown(TB_SHUTDOWN_WFS); - -+ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) -+ native_wrmsrl(MSR_IA32_SPEC_CTRL, 0); -+ - mwait_play_dead(); /* Only returns on failure */ - if (cpuidle_play_dead()) - hlt_play_dead(); -+ -+ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) -+ native_wrmsrl(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS); - } - - #else /* ... !CONFIG_HOTPLUG_CPU */ --- -2.14.2 - diff --git a/patches/kernel/0279-x86-idle-Disable-IBRS-when-offlining-cpu-and-re-enab.patch b/patches/kernel/0279-x86-idle-Disable-IBRS-when-offlining-cpu-and-re-enab.patch new file mode 100644 index 0000000..8424c28 --- /dev/null +++ b/patches/kernel/0279-x86-idle-Disable-IBRS-when-offlining-cpu-and-re-enab.patch @@ -0,0 +1,54 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Tim Chen +Date: Wed, 15 Nov 2017 12:24:19 -0800 +Subject: [PATCH] x86/idle: Disable IBRS when offlining cpu and re-enable on + wakeup +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +Clear IBRS when cpu is offlined and set it when brining it back online. 
+ +Signed-off-by: Tim Chen +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 9bcf662c1690880b2464fe99d0f58dce53c0d89f) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/smpboot.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c +index 398e8324fea4..a652bff7add4 100644 +--- a/arch/x86/kernel/smpboot.c ++++ b/arch/x86/kernel/smpboot.c +@@ -77,6 +77,7 @@ + #include + #include + #include ++#include + + /* Number of siblings per CPU package */ + int smp_num_siblings = 1; +@@ -1692,9 +1693,15 @@ void native_play_dead(void) + play_dead_common(); + tboot_shutdown(TB_SHUTDOWN_WFS); + ++ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) ++ native_wrmsrl(MSR_IA32_SPEC_CTRL, 0); ++ + mwait_play_dead(); /* Only returns on failure */ + if (cpuidle_play_dead()) + hlt_play_dead(); ++ ++ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) ++ native_wrmsrl(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS); + } + + #else /* ... !CONFIG_HOTPLUG_CPU */ +-- +2.14.2 + diff --git a/patches/kernel/0279-x86-mm-Set-IBPB-upon-context-switch.patch b/patches/kernel/0279-x86-mm-Set-IBPB-upon-context-switch.patch deleted file mode 100644 index de5de85..0000000 --- a/patches/kernel/0279-x86-mm-Set-IBPB-upon-context-switch.patch +++ /dev/null @@ -1,47 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Tim Chen -Date: Fri, 20 Oct 2017 12:56:29 -0700 -Subject: [PATCH] x86/mm: Set IBPB upon context switch -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -Set IBPB on context switch with changing of page table. 
- -Signed-off-by: Tim Chen -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit a3320203792b633fb96df5d0bbfb7036129b78e2) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/mm/tlb.c | 4 ++++ - 1 file changed, 4 insertions(+) - -diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c -index 06f3854d0a4f..bb3ded3a4e5f 100644 ---- a/arch/x86/mm/tlb.c -+++ b/arch/x86/mm/tlb.c -@@ -12,6 +12,7 @@ - #include - #include - #include -+#include - #include - - /* -@@ -218,6 +219,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, - u16 new_asid; - bool need_flush; - -+ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) -+ native_wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB); -+ - if (IS_ENABLED(CONFIG_VMAP_STACK)) { - /* - * If our current stack is in vmalloc space and isn't --- -2.14.2 - diff --git a/patches/kernel/0280-x86-mm-Only-set-IBPB-when-the-new-thread-cannot-ptra.patch b/patches/kernel/0280-x86-mm-Only-set-IBPB-when-the-new-thread-cannot-ptra.patch deleted file mode 100644 index f85c03f..0000000 --- a/patches/kernel/0280-x86-mm-Only-set-IBPB-when-the-new-thread-cannot-ptra.patch +++ /dev/null @@ -1,127 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Tim Chen -Date: Tue, 7 Nov 2017 13:52:42 -0800 -Subject: [PATCH] x86/mm: Only set IBPB when the new thread cannot ptrace - current thread -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -To reduce overhead of setting IBPB, we only do that when -the new thread cannot ptrace the current one. If the new -thread has ptrace capability on current thread, it is safe. 
- -Signed-off-by: Tim Chen -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 65941af723059ffeeca269b99ab51b3c9e320751) -Signed-off-by: Fabian Grünbichler ---- - include/linux/ptrace.h | 6 ++++++ - arch/x86/mm/tlb.c | 5 ++++- - kernel/ptrace.c | 18 ++++++++++++++---- - 3 files changed, 24 insertions(+), 5 deletions(-) - -diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h -index 0e5fcc11b1b8..d6afefd5465b 100644 ---- a/include/linux/ptrace.h -+++ b/include/linux/ptrace.h -@@ -63,12 +63,15 @@ extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); - #define PTRACE_MODE_NOAUDIT 0x04 - #define PTRACE_MODE_FSCREDS 0x08 - #define PTRACE_MODE_REALCREDS 0x10 -+#define PTRACE_MODE_NOACCESS_CHK 0x20 - - /* shorthands for READ/ATTACH and FSCREDS/REALCREDS combinations */ - #define PTRACE_MODE_READ_FSCREDS (PTRACE_MODE_READ | PTRACE_MODE_FSCREDS) - #define PTRACE_MODE_READ_REALCREDS (PTRACE_MODE_READ | PTRACE_MODE_REALCREDS) - #define PTRACE_MODE_ATTACH_FSCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_FSCREDS) - #define PTRACE_MODE_ATTACH_REALCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_REALCREDS) -+#define PTRACE_MODE_IBPB (PTRACE_MODE_ATTACH | PTRACE_MODE_NOAUDIT \ -+ | PTRACE_MODE_NOACCESS_CHK | PTRACE_MODE_REALCREDS) - - /** - * ptrace_may_access - check whether the caller is permitted to access -@@ -86,6 +89,9 @@ extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); - */ - extern bool ptrace_may_access(struct task_struct *task, unsigned int mode); - -+extern int ___ptrace_may_access(struct task_struct *cur, struct task_struct *task, -+ unsigned int mode); -+ - static inline int ptrace_reparented(struct task_struct *child) - { - return !same_thread_group(child->real_parent, child->parent); -diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c -index bb3ded3a4e5f..301e6efbc514 100644 ---- a/arch/x86/mm/tlb.c -+++ b/arch/x86/mm/tlb.c -@@ -6,6 +6,7 @@ - #include - #include - #include 
-+#include - - #include - #include -@@ -219,7 +220,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, - u16 new_asid; - bool need_flush; - -- if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) -+ /* Null tsk means switching to kernel, so that's safe */ -+ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL) && tsk && -+ ___ptrace_may_access(tsk, current, PTRACE_MODE_IBPB)) - native_wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB); - - if (IS_ENABLED(CONFIG_VMAP_STACK)) { -diff --git a/kernel/ptrace.c b/kernel/ptrace.c -index 60f356d91060..f2f0f1aeabaf 100644 ---- a/kernel/ptrace.c -+++ b/kernel/ptrace.c -@@ -268,9 +268,10 @@ static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) - } - - /* Returns 0 on success, -errno on denial. */ --static int __ptrace_may_access(struct task_struct *task, unsigned int mode) -+int ___ptrace_may_access(struct task_struct *cur, struct task_struct *task, -+ unsigned int mode) - { -- const struct cred *cred = current_cred(), *tcred; -+ const struct cred *cred = __task_cred(cur), *tcred; - struct mm_struct *mm; - kuid_t caller_uid; - kgid_t caller_gid; -@@ -290,7 +291,7 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) - */ - - /* Don't let security modules deny introspection */ -- if (same_thread_group(task, current)) -+ if (same_thread_group(task, cur)) - return 0; - rcu_read_lock(); - if (mode & PTRACE_MODE_FSCREDS) { -@@ -328,7 +329,16 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) - !ptrace_has_cap(mm->user_ns, mode))) - return -EPERM; - -- return security_ptrace_access_check(task, mode); -+ if (!(mode & PTRACE_MODE_NOACCESS_CHK)) -+ return security_ptrace_access_check(task, mode); -+ -+ return 0; -+} -+EXPORT_SYMBOL_GPL(___ptrace_may_access); -+ -+static int __ptrace_may_access(struct task_struct *task, unsigned int mode) -+{ -+ return ___ptrace_may_access(current, task, mode); - } - - bool ptrace_may_access(struct task_struct *task, unsigned int mode) --- 
-2.14.2 - diff --git a/patches/kernel/0280-x86-mm-Set-IBPB-upon-context-switch.patch b/patches/kernel/0280-x86-mm-Set-IBPB-upon-context-switch.patch new file mode 100644 index 0000000..de5de85 --- /dev/null +++ b/patches/kernel/0280-x86-mm-Set-IBPB-upon-context-switch.patch @@ -0,0 +1,47 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Tim Chen +Date: Fri, 20 Oct 2017 12:56:29 -0700 +Subject: [PATCH] x86/mm: Set IBPB upon context switch +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +Set IBPB on context switch with changing of page table. + +Signed-off-by: Tim Chen +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit a3320203792b633fb96df5d0bbfb7036129b78e2) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/mm/tlb.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c +index 06f3854d0a4f..bb3ded3a4e5f 100644 +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -12,6 +12,7 @@ + #include + #include + #include ++#include + #include + + /* +@@ -218,6 +219,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + u16 new_asid; + bool need_flush; + ++ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) ++ native_wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB); ++ + if (IS_ENABLED(CONFIG_VMAP_STACK)) { + /* + * If our current stack is in vmalloc space and isn't +-- +2.14.2 + diff --git a/patches/kernel/0281-x86-entry-Stuff-RSB-for-entry-to-kernel-for-non-SMEP.patch b/patches/kernel/0281-x86-entry-Stuff-RSB-for-entry-to-kernel-for-non-SMEP.patch deleted file mode 100644 index 0eebfdf..0000000 --- a/patches/kernel/0281-x86-entry-Stuff-RSB-for-entry-to-kernel-for-non-SMEP.patch +++ /dev/null @@ -1,202 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Tim Chen -Date: Tue, 14 Nov 2017 17:16:30 -0800 -Subject: [PATCH] x86/entry: 
Stuff RSB for entry to kernel for non-SMEP - platform -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -Stuff RSB to prevent RSB underflow on non-SMEP platform. - -Signed-off-by: Tim Chen -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit b82785ac1d33ce219c77d72b7bd80a21e1441ac8) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/spec_ctrl.h | 71 ++++++++++++++++++++++++++++++++++++++++ - arch/x86/entry/entry_64.S | 18 ++++++++-- - arch/x86/entry/entry_64_compat.S | 4 +++ - 3 files changed, 91 insertions(+), 2 deletions(-) - -diff --git a/arch/x86/include/asm/spec_ctrl.h b/arch/x86/include/asm/spec_ctrl.h -index 7f8bb09b6acb..55ee1f36bda2 100644 ---- a/arch/x86/include/asm/spec_ctrl.h -+++ b/arch/x86/include/asm/spec_ctrl.h -@@ -35,6 +35,73 @@ - popq %rdx; \ - popq %rcx; \ - popq %rax -+#define __ASM_STUFF_RSB \ -+ call 1f; \ -+ pause; \ -+1: call 2f; \ -+ pause; \ -+2: call 3f; \ -+ pause; \ -+3: call 4f; \ -+ pause; \ -+4: call 5f; \ -+ pause; \ -+5: call 6f; \ -+ pause; \ -+6: call 7f; \ -+ pause; \ -+7: call 8f; \ -+ pause; \ -+8: call 9f; \ -+ pause; \ -+9: call 10f; \ -+ pause; \ -+10: call 11f; \ -+ pause; \ -+11: call 12f; \ -+ pause; \ -+12: call 13f; \ -+ pause; \ -+13: call 14f; \ -+ pause; \ -+14: call 15f; \ -+ pause; \ -+15: call 16f; \ -+ pause; \ -+16: call 17f; \ -+ pause; \ -+17: call 18f; \ -+ pause; \ -+18: call 19f; \ -+ pause; \ -+19: call 20f; \ -+ pause; \ -+20: call 21f; \ -+ pause; \ -+21: call 22f; \ -+ pause; \ -+22: call 23f; \ -+ pause; \ -+23: call 24f; \ -+ pause; \ -+24: call 25f; \ -+ pause; \ -+25: call 26f; \ -+ pause; \ -+26: call 27f; \ -+ pause; \ -+27: call 28f; \ -+ pause; \ -+28: call 29f; \ -+ pause; \ -+29: call 30f; \ -+ pause; \ -+30: call 31f; \ -+ pause; \ -+31: call 32f; \ -+ pause; \ -+32: \ -+ add $(32*8), %rsp; - - .macro ENABLE_IBRS - ALTERNATIVE "", 
__stringify(__ASM_ENABLE_IBRS), X86_FEATURE_SPEC_CTRL -@@ -48,5 +115,9 @@ ALTERNATIVE "", __stringify(__ASM_ENABLE_IBRS_CLOBBER), X86_FEATURE_SPEC_CTRL - ALTERNATIVE "", __stringify(__ASM_DISABLE_IBRS), X86_FEATURE_SPEC_CTRL - .endm - -+.macro STUFF_RSB -+ALTERNATIVE __stringify(__ASM_STUFF_RSB), "", X86_FEATURE_SMEP -+.endm -+ - #endif /* __ASSEMBLY__ */ - #endif /* _ASM_X86_SPEC_CTRL_H */ -diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S -index 5f898c3c1dad..f6ec4ad5b114 100644 ---- a/arch/x86/entry/entry_64.S -+++ b/arch/x86/entry/entry_64.S -@@ -214,8 +214,6 @@ ENTRY(entry_SYSCALL_64) - movq %rsp, PER_CPU_VAR(rsp_scratch) - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - -- TRACE_IRQS_OFF -- - /* Construct struct pt_regs on stack */ - pushq $__USER_DS /* pt_regs->ss */ - pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ -@@ -238,6 +236,10 @@ GLOBAL(entry_SYSCALL_64_after_hwframe) - - ENABLE_IBRS - -+ STUFF_RSB -+ -+ TRACE_IRQS_OFF -+ - /* - * If we need to do entry work or if we guess we'll need to do - * exit work, go straight to the slow path. -@@ -658,6 +660,13 @@ END(irq_entries_start) - ALLOC_PT_GPREGS_ON_STACK - SAVE_C_REGS - SAVE_EXTRA_REGS -+ -+ /* -+ * Have to do stuffing before encoding frame pointer. -+ * Could add some unnecessary RSB clearing if coming -+ * from kernel for non-SMEP platform. 
-+ */ -+ STUFF_RSB - ENCODE_FRAME_POINTER - - testb $3, CS(%rsp) -@@ -1276,6 +1285,10 @@ ENTRY(paranoid_entry) - cld - SAVE_C_REGS 8 - SAVE_EXTRA_REGS 8 -+ /* -+ * Do the stuffing unconditionally from user/kernel to be safe -+ */ -+ STUFF_RSB - ENCODE_FRAME_POINTER 8 - movl $1, %ebx - movl $MSR_GS_BASE, %ecx -@@ -1329,6 +1342,7 @@ ENTRY(error_entry) - cld - SAVE_C_REGS 8 - SAVE_EXTRA_REGS 8 -+ STUFF_RSB - ENCODE_FRAME_POINTER 8 - xorl %ebx, %ebx - testb $3, CS+8(%rsp) -diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S -index ee4f3edb3c50..1480222bae02 100644 ---- a/arch/x86/entry/entry_64_compat.S -+++ b/arch/x86/entry/entry_64_compat.S -@@ -97,6 +97,7 @@ ENTRY(entry_SYSENTER_compat) - cld - - ENABLE_IBRS -+ STUFF_RSB - - /* - * SYSENTER doesn't filter flags, so we need to clear NT and AC -@@ -227,6 +228,8 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe) - pushq $0 /* pt_regs->r14 = 0 */ - pushq $0 /* pt_regs->r15 = 0 */ - -+ STUFF_RSB -+ - /* - * User mode is traced as though IRQs are on, and SYSENTER - * turned them off. 
-@@ -354,6 +357,7 @@ ENTRY(entry_INT80_compat) - cld - - ENABLE_IBRS -+ STUFF_RSB - - /* - * User mode is traced as though IRQs are on, and the interrupt --- -2.14.2 - diff --git a/patches/kernel/0281-x86-mm-Only-set-IBPB-when-the-new-thread-cannot-ptra.patch b/patches/kernel/0281-x86-mm-Only-set-IBPB-when-the-new-thread-cannot-ptra.patch new file mode 100644 index 0000000..f85c03f --- /dev/null +++ b/patches/kernel/0281-x86-mm-Only-set-IBPB-when-the-new-thread-cannot-ptra.patch @@ -0,0 +1,127 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Tim Chen +Date: Tue, 7 Nov 2017 13:52:42 -0800 +Subject: [PATCH] x86/mm: Only set IBPB when the new thread cannot ptrace + current thread +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +To reduce overhead of setting IBPB, we only do that when +the new thread cannot ptrace the current one. If the new +thread has ptrace capability on current thread, it is safe. 
+ +Signed-off-by: Tim Chen +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 65941af723059ffeeca269b99ab51b3c9e320751) +Signed-off-by: Fabian Grünbichler +--- + include/linux/ptrace.h | 6 ++++++ + arch/x86/mm/tlb.c | 5 ++++- + kernel/ptrace.c | 18 ++++++++++++++---- + 3 files changed, 24 insertions(+), 5 deletions(-) + +diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h +index 0e5fcc11b1b8..d6afefd5465b 100644 +--- a/include/linux/ptrace.h ++++ b/include/linux/ptrace.h +@@ -63,12 +63,15 @@ extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); + #define PTRACE_MODE_NOAUDIT 0x04 + #define PTRACE_MODE_FSCREDS 0x08 + #define PTRACE_MODE_REALCREDS 0x10 ++#define PTRACE_MODE_NOACCESS_CHK 0x20 + + /* shorthands for READ/ATTACH and FSCREDS/REALCREDS combinations */ + #define PTRACE_MODE_READ_FSCREDS (PTRACE_MODE_READ | PTRACE_MODE_FSCREDS) + #define PTRACE_MODE_READ_REALCREDS (PTRACE_MODE_READ | PTRACE_MODE_REALCREDS) + #define PTRACE_MODE_ATTACH_FSCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_FSCREDS) + #define PTRACE_MODE_ATTACH_REALCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_REALCREDS) ++#define PTRACE_MODE_IBPB (PTRACE_MODE_ATTACH | PTRACE_MODE_NOAUDIT \ ++ | PTRACE_MODE_NOACCESS_CHK | PTRACE_MODE_REALCREDS) + + /** + * ptrace_may_access - check whether the caller is permitted to access +@@ -86,6 +89,9 @@ extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); + */ + extern bool ptrace_may_access(struct task_struct *task, unsigned int mode); + ++extern int ___ptrace_may_access(struct task_struct *cur, struct task_struct *task, ++ unsigned int mode); ++ + static inline int ptrace_reparented(struct task_struct *child) + { + return !same_thread_group(child->real_parent, child->parent); +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c +index bb3ded3a4e5f..301e6efbc514 100644 +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -6,6 +6,7 @@ + #include + #include + #include 
++#include + + #include + #include +@@ -219,7 +220,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + u16 new_asid; + bool need_flush; + +- if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) ++ /* Null tsk means switching to kernel, so that's safe */ ++ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL) && tsk && ++ ___ptrace_may_access(tsk, current, PTRACE_MODE_IBPB)) + native_wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB); + + if (IS_ENABLED(CONFIG_VMAP_STACK)) { +diff --git a/kernel/ptrace.c b/kernel/ptrace.c +index 60f356d91060..f2f0f1aeabaf 100644 +--- a/kernel/ptrace.c ++++ b/kernel/ptrace.c +@@ -268,9 +268,10 @@ static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) + } + + /* Returns 0 on success, -errno on denial. */ +-static int __ptrace_may_access(struct task_struct *task, unsigned int mode) ++int ___ptrace_may_access(struct task_struct *cur, struct task_struct *task, ++ unsigned int mode) + { +- const struct cred *cred = current_cred(), *tcred; ++ const struct cred *cred = __task_cred(cur), *tcred; + struct mm_struct *mm; + kuid_t caller_uid; + kgid_t caller_gid; +@@ -290,7 +291,7 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) + */ + + /* Don't let security modules deny introspection */ +- if (same_thread_group(task, current)) ++ if (same_thread_group(task, cur)) + return 0; + rcu_read_lock(); + if (mode & PTRACE_MODE_FSCREDS) { +@@ -328,7 +329,16 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) + !ptrace_has_cap(mm->user_ns, mode))) + return -EPERM; + +- return security_ptrace_access_check(task, mode); ++ if (!(mode & PTRACE_MODE_NOACCESS_CHK)) ++ return security_ptrace_access_check(task, mode); ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(___ptrace_may_access); ++ ++static int __ptrace_may_access(struct task_struct *task, unsigned int mode) ++{ ++ return ___ptrace_may_access(current, task, mode); + } + + bool ptrace_may_access(struct task_struct *task, unsigned int mode) +-- 
+2.14.2 + diff --git a/patches/kernel/0282-x86-entry-Stuff-RSB-for-entry-to-kernel-for-non-SMEP.patch b/patches/kernel/0282-x86-entry-Stuff-RSB-for-entry-to-kernel-for-non-SMEP.patch new file mode 100644 index 0000000..0eebfdf --- /dev/null +++ b/patches/kernel/0282-x86-entry-Stuff-RSB-for-entry-to-kernel-for-non-SMEP.patch @@ -0,0 +1,202 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Tim Chen +Date: Tue, 14 Nov 2017 17:16:30 -0800 +Subject: [PATCH] x86/entry: Stuff RSB for entry to kernel for non-SMEP + platform +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +Stuff RSB to prevent RSB underflow on non-SMEP platform. + +Signed-off-by: Tim Chen +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit b82785ac1d33ce219c77d72b7bd80a21e1441ac8) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/spec_ctrl.h | 71 ++++++++++++++++++++++++++++++++++++++++ + arch/x86/entry/entry_64.S | 18 ++++++++-- + arch/x86/entry/entry_64_compat.S | 4 +++ + 3 files changed, 91 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/include/asm/spec_ctrl.h b/arch/x86/include/asm/spec_ctrl.h +index 7f8bb09b6acb..55ee1f36bda2 100644 +--- a/arch/x86/include/asm/spec_ctrl.h ++++ b/arch/x86/include/asm/spec_ctrl.h +@@ -35,6 +35,73 @@ + popq %rdx; \ + popq %rcx; \ + popq %rax ++#define __ASM_STUFF_RSB \ ++ call 1f; \ ++ pause; \ ++1: call 2f; \ ++ pause; \ ++2: call 3f; \ ++ pause; \ ++3: call 4f; \ ++ pause; \ ++4: call 5f; \ ++ pause; \ ++5: call 6f; \ ++ pause; \ ++6: call 7f; \ ++ pause; \ ++7: call 8f; \ ++ pause; \ ++8: call 9f; \ ++ pause; \ ++9: call 10f; \ ++ pause; \ ++10: call 11f; \ ++ pause; \ ++11: call 12f; \ ++ pause; \ ++12: call 13f; \ ++ pause; \ ++13: call 14f; \ ++ pause; \ ++14: call 15f; \ ++ pause; \ ++15: call 16f; \ ++ pause; \ ++16: call 17f; \ ++ pause; \ ++17: call 18f; \ ++ pause; \ ++18: call 19f; 
\ ++ pause; \ ++19: call 20f; \ ++ pause; \ ++20: call 21f; \ ++ pause; \ ++21: call 22f; \ ++ pause; \ ++22: call 23f; \ ++ pause; \ ++23: call 24f; \ ++ pause; \ ++24: call 25f; \ ++ pause; \ ++25: call 26f; \ ++ pause; \ ++26: call 27f; \ ++ pause; \ ++27: call 28f; \ ++ pause; \ ++28: call 29f; \ ++ pause; \ ++29: call 30f; \ ++ pause; \ ++30: call 31f; \ ++ pause; \ ++31: call 32f; \ ++ pause; \ ++32: \ ++ add $(32*8), %rsp; + + .macro ENABLE_IBRS + ALTERNATIVE "", __stringify(__ASM_ENABLE_IBRS), X86_FEATURE_SPEC_CTRL +@@ -48,5 +115,9 @@ ALTERNATIVE "", __stringify(__ASM_ENABLE_IBRS_CLOBBER), X86_FEATURE_SPEC_CTRL + ALTERNATIVE "", __stringify(__ASM_DISABLE_IBRS), X86_FEATURE_SPEC_CTRL + .endm + ++.macro STUFF_RSB ++ALTERNATIVE __stringify(__ASM_STUFF_RSB), "", X86_FEATURE_SMEP ++.endm ++ + #endif /* __ASSEMBLY__ */ + #endif /* _ASM_X86_SPEC_CTRL_H */ +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index 5f898c3c1dad..f6ec4ad5b114 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -214,8 +214,6 @@ ENTRY(entry_SYSCALL_64) + movq %rsp, PER_CPU_VAR(rsp_scratch) + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + +- TRACE_IRQS_OFF +- + /* Construct struct pt_regs on stack */ + pushq $__USER_DS /* pt_regs->ss */ + pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ +@@ -238,6 +236,10 @@ GLOBAL(entry_SYSCALL_64_after_hwframe) + + ENABLE_IBRS + ++ STUFF_RSB ++ ++ TRACE_IRQS_OFF ++ + /* + * If we need to do entry work or if we guess we'll need to do + * exit work, go straight to the slow path. +@@ -658,6 +660,13 @@ END(irq_entries_start) + ALLOC_PT_GPREGS_ON_STACK + SAVE_C_REGS + SAVE_EXTRA_REGS ++ ++ /* ++ * Have to do stuffing before encoding frame pointer. ++ * Could add some unnecessary RSB clearing if coming ++ * from kernel for non-SMEP platform. 
++ */ ++ STUFF_RSB + ENCODE_FRAME_POINTER + + testb $3, CS(%rsp) +@@ -1276,6 +1285,10 @@ ENTRY(paranoid_entry) + cld + SAVE_C_REGS 8 + SAVE_EXTRA_REGS 8 ++ /* ++ * Do the stuffing unconditionally from user/kernel to be safe ++ */ ++ STUFF_RSB + ENCODE_FRAME_POINTER 8 + movl $1, %ebx + movl $MSR_GS_BASE, %ecx +@@ -1329,6 +1342,7 @@ ENTRY(error_entry) + cld + SAVE_C_REGS 8 + SAVE_EXTRA_REGS 8 ++ STUFF_RSB + ENCODE_FRAME_POINTER 8 + xorl %ebx, %ebx + testb $3, CS+8(%rsp) +diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S +index ee4f3edb3c50..1480222bae02 100644 +--- a/arch/x86/entry/entry_64_compat.S ++++ b/arch/x86/entry/entry_64_compat.S +@@ -97,6 +97,7 @@ ENTRY(entry_SYSENTER_compat) + cld + + ENABLE_IBRS ++ STUFF_RSB + + /* + * SYSENTER doesn't filter flags, so we need to clear NT and AC +@@ -227,6 +228,8 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe) + pushq $0 /* pt_regs->r14 = 0 */ + pushq $0 /* pt_regs->r15 = 0 */ + ++ STUFF_RSB ++ + /* + * User mode is traced as though IRQs are on, and SYSENTER + * turned them off. +@@ -354,6 +357,7 @@ ENTRY(entry_INT80_compat) + cld + + ENABLE_IBRS ++ STUFF_RSB + + /* + * User mode is traced as though IRQs are on, and the interrupt +-- +2.14.2 + diff --git a/patches/kernel/0282-x86-kvm-add-MSR_IA32_SPEC_CTRL-and-MSR_IA32_PRED_CMD.patch b/patches/kernel/0282-x86-kvm-add-MSR_IA32_SPEC_CTRL-and-MSR_IA32_PRED_CMD.patch deleted file mode 100644 index e285492..0000000 --- a/patches/kernel/0282-x86-kvm-add-MSR_IA32_SPEC_CTRL-and-MSR_IA32_PRED_CMD.patch +++ /dev/null @@ -1,103 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Wei Wang -Date: Tue, 7 Nov 2017 16:47:53 +0800 -Subject: [PATCH] x86/kvm: add MSR_IA32_SPEC_CTRL and MSR_IA32_PRED_CMD to kvm -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -Add field to access guest MSR_IA332_SPEC_CTRL and MSR_IA32_PRED_CMD state. 
- -Signed-off-by: Wei Wang -Signed-off-by: Tim Chen -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 15eb187f47ee2be44d34313bc89cfb719d82cb21) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/kvm_host.h | 2 ++ - arch/x86/kvm/vmx.c | 10 ++++++++++ - arch/x86/kvm/x86.c | 2 +- - 3 files changed, 13 insertions(+), 1 deletion(-) - -diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h -index b69af3df978a..1953c0a5b972 100644 ---- a/arch/x86/include/asm/kvm_host.h -+++ b/arch/x86/include/asm/kvm_host.h -@@ -628,6 +628,8 @@ struct kvm_vcpu_arch { - u64 mcg_ext_ctl; - u64 *mce_banks; - -+ u64 spec_ctrl; -+ - /* Cache MMIO info */ - u64 mmio_gva; - unsigned access; -diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c -index 9b4256fd589a..daff9962c90a 100644 ---- a/arch/x86/kvm/vmx.c -+++ b/arch/x86/kvm/vmx.c -@@ -50,6 +50,7 @@ - #include - #include - #include -+#include - - #include "trace.h" - #include "pmu.h" -@@ -3247,6 +3248,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) - case MSR_IA32_TSC: - msr_info->data = guest_read_tsc(vcpu); - break; -+ case MSR_IA32_SPEC_CTRL: -+ msr_info->data = vcpu->arch.spec_ctrl; -+ break; - case MSR_IA32_SYSENTER_CS: - msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); - break; -@@ -3351,6 +3355,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) - case MSR_IA32_TSC: - kvm_write_tsc(vcpu, msr_info); - break; -+ case MSR_IA32_SPEC_CTRL: -+ vcpu->arch.spec_ctrl = msr_info->data; -+ break; - case MSR_IA32_CR_PAT: - if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { - if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) -@@ -6146,6 +6153,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu) - - msr_info.index = ecx; - msr_info.host_initiated = false; -+ - if (vmx_get_msr(vcpu, &msr_info)) { - trace_kvm_msr_read_ex(ecx); - kvm_inject_gp(vcpu, 0); -@@ -6699,6 +6707,8 @@ static __init int 
hardware_setup(void) - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); -+ vmx_disable_intercept_for_msr(MSR_IA32_SPEC_CTRL, false); -+ vmx_disable_intercept_for_msr(MSR_IA32_PRED_CMD, false); - - memcpy(vmx_msr_bitmap_legacy_x2apic_apicv, - vmx_msr_bitmap_legacy, PAGE_SIZE); -diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c -index 703cd4171921..eae4aecf3cfe 100644 ---- a/arch/x86/kvm/x86.c -+++ b/arch/x86/kvm/x86.c -@@ -983,7 +983,7 @@ static u32 msrs_to_save[] = { - MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, - #endif - MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, -- MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, -+ MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, MSR_IA32_SPEC_CTRL, - }; - - static unsigned num_msrs_to_save; --- -2.14.2 - diff --git a/patches/kernel/0283-x86-kvm-Set-IBPB-when-switching-VM.patch b/patches/kernel/0283-x86-kvm-Set-IBPB-when-switching-VM.patch deleted file mode 100644 index 171ed40..0000000 --- a/patches/kernel/0283-x86-kvm-Set-IBPB-when-switching-VM.patch +++ /dev/null @@ -1,46 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Tim Chen -Date: Fri, 13 Oct 2017 14:31:46 -0700 -Subject: [PATCH] x86/kvm: Set IBPB when switching VM -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -Set IBPB (Indirect branch prediction barrier) when switching VM. 
- -Signed-off-by: Tim Chen -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 665076ad780e8620505c742cfcb4b0f3fb99324a) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kvm/vmx.c | 3 +++ - 1 file changed, 3 insertions(+) - -diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c -index daff9962c90a..8df195bbb41d 100644 ---- a/arch/x86/kvm/vmx.c -+++ b/arch/x86/kvm/vmx.c -@@ -1488,6 +1488,7 @@ static void vmcs_load(struct vmcs *vmcs) - if (error) - printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n", - vmcs, phys_addr); -+ - } - - #ifdef CONFIG_KEXEC_CORE -@@ -2268,6 +2269,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) - if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) { - per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; - vmcs_load(vmx->loaded_vmcs->vmcs); -+ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) -+ native_wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB); - } - - if (!already_loaded) { --- -2.14.2 - diff --git a/patches/kernel/0283-x86-kvm-add-MSR_IA32_SPEC_CTRL-and-MSR_IA32_PRED_CMD.patch b/patches/kernel/0283-x86-kvm-add-MSR_IA32_SPEC_CTRL-and-MSR_IA32_PRED_CMD.patch new file mode 100644 index 0000000..e285492 --- /dev/null +++ b/patches/kernel/0283-x86-kvm-add-MSR_IA32_SPEC_CTRL-and-MSR_IA32_PRED_CMD.patch @@ -0,0 +1,103 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Wei Wang +Date: Tue, 7 Nov 2017 16:47:53 +0800 +Subject: [PATCH] x86/kvm: add MSR_IA32_SPEC_CTRL and MSR_IA32_PRED_CMD to kvm +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +Add field to access guest MSR_IA332_SPEC_CTRL and MSR_IA32_PRED_CMD state. 
+ +Signed-off-by: Wei Wang +Signed-off-by: Tim Chen +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 15eb187f47ee2be44d34313bc89cfb719d82cb21) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/kvm_host.h | 2 ++ + arch/x86/kvm/vmx.c | 10 ++++++++++ + arch/x86/kvm/x86.c | 2 +- + 3 files changed, 13 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index b69af3df978a..1953c0a5b972 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -628,6 +628,8 @@ struct kvm_vcpu_arch { + u64 mcg_ext_ctl; + u64 *mce_banks; + ++ u64 spec_ctrl; ++ + /* Cache MMIO info */ + u64 mmio_gva; + unsigned access; +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c +index 9b4256fd589a..daff9962c90a 100644 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -50,6 +50,7 @@ + #include + #include + #include ++#include + + #include "trace.h" + #include "pmu.h" +@@ -3247,6 +3248,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) + case MSR_IA32_TSC: + msr_info->data = guest_read_tsc(vcpu); + break; ++ case MSR_IA32_SPEC_CTRL: ++ msr_info->data = vcpu->arch.spec_ctrl; ++ break; + case MSR_IA32_SYSENTER_CS: + msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); + break; +@@ -3351,6 +3355,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) + case MSR_IA32_TSC: + kvm_write_tsc(vcpu, msr_info); + break; ++ case MSR_IA32_SPEC_CTRL: ++ vcpu->arch.spec_ctrl = msr_info->data; ++ break; + case MSR_IA32_CR_PAT: + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { + if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) +@@ -6146,6 +6153,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu) + + msr_info.index = ecx; + msr_info.host_initiated = false; ++ + if (vmx_get_msr(vcpu, &msr_info)) { + trace_kvm_msr_read_ex(ecx); + kvm_inject_gp(vcpu, 0); +@@ -6699,6 +6707,8 @@ static __init int 
hardware_setup(void) + vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); + vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); + vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); ++ vmx_disable_intercept_for_msr(MSR_IA32_SPEC_CTRL, false); ++ vmx_disable_intercept_for_msr(MSR_IA32_PRED_CMD, false); + + memcpy(vmx_msr_bitmap_legacy_x2apic_apicv, + vmx_msr_bitmap_legacy, PAGE_SIZE); +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 703cd4171921..eae4aecf3cfe 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -983,7 +983,7 @@ static u32 msrs_to_save[] = { + MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, + #endif + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, +- MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, ++ MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, MSR_IA32_SPEC_CTRL, + }; + + static unsigned num_msrs_to_save; +-- +2.14.2 + diff --git a/patches/kernel/0284-x86-kvm-Set-IBPB-when-switching-VM.patch b/patches/kernel/0284-x86-kvm-Set-IBPB-when-switching-VM.patch new file mode 100644 index 0000000..171ed40 --- /dev/null +++ b/patches/kernel/0284-x86-kvm-Set-IBPB-when-switching-VM.patch @@ -0,0 +1,46 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Tim Chen +Date: Fri, 13 Oct 2017 14:31:46 -0700 +Subject: [PATCH] x86/kvm: Set IBPB when switching VM +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +Set IBPB (Indirect branch prediction barrier) when switching VM. 
+ +Signed-off-by: Tim Chen +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 665076ad780e8620505c742cfcb4b0f3fb99324a) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kvm/vmx.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c +index daff9962c90a..8df195bbb41d 100644 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -1488,6 +1488,7 @@ static void vmcs_load(struct vmcs *vmcs) + if (error) + printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n", + vmcs, phys_addr); ++ + } + + #ifdef CONFIG_KEXEC_CORE +@@ -2268,6 +2269,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) + if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) { + per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; + vmcs_load(vmx->loaded_vmcs->vmcs); ++ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) ++ native_wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB); + } + + if (!already_loaded) { +-- +2.14.2 + diff --git a/patches/kernel/0284-x86-kvm-Toggle-IBRS-on-VM-entry-and-exit.patch b/patches/kernel/0284-x86-kvm-Toggle-IBRS-on-VM-entry-and-exit.patch deleted file mode 100644 index 5fae670..0000000 --- a/patches/kernel/0284-x86-kvm-Toggle-IBRS-on-VM-entry-and-exit.patch +++ /dev/null @@ -1,42 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Tim Chen -Date: Fri, 20 Oct 2017 17:04:35 -0700 -Subject: [PATCH] x86/kvm: Toggle IBRS on VM entry and exit -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -Restore guest IBRS on VM entry and set it to 1 on VM exit -back to kernel. 
- -Signed-off-by: Tim Chen -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 3dc28210342f174270bcefac74ef5d0b52ffd846) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kvm/vmx.c | 5 +++++ - 1 file changed, 5 insertions(+) - -diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c -index 8df195bbb41d..57d538fc7c75 100644 ---- a/arch/x86/kvm/vmx.c -+++ b/arch/x86/kvm/vmx.c -@@ -9101,6 +9101,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) - __write_pkru(vcpu->arch.pkru); - - atomic_switch_perf_msrs(vmx); -+ -+ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) -+ add_atomic_switch_msr(vmx, MSR_IA32_SPEC_CTRL, -+ vcpu->arch.spec_ctrl, FEATURE_ENABLE_IBRS); -+ - debugctlmsr = get_debugctlmsr(); - - vmx_arm_hv_timer(vcpu); --- -2.14.2 - diff --git a/patches/kernel/0285-x86-kvm-Pad-RSB-on-VM-transition.patch b/patches/kernel/0285-x86-kvm-Pad-RSB-on-VM-transition.patch deleted file mode 100644 index f337f15..0000000 --- a/patches/kernel/0285-x86-kvm-Pad-RSB-on-VM-transition.patch +++ /dev/null @@ -1,154 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Tim Chen -Date: Fri, 20 Oct 2017 17:05:54 -0700 -Subject: [PATCH] x86/kvm: Pad RSB on VM transition -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -Add code to pad the local CPU's RSB entries to protect -from previous less privilege mode. 
- -Signed-off-by: Tim Chen -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 5369368d3520addb2ffb2413cfa7e8f3efe2e31d) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/kvm_host.h | 103 ++++++++++++++++++++++++++++++++++++++++ - arch/x86/kvm/vmx.c | 2 + - 2 files changed, 105 insertions(+) - -diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h -index 1953c0a5b972..4117a97228a2 100644 ---- a/arch/x86/include/asm/kvm_host.h -+++ b/arch/x86/include/asm/kvm_host.h -@@ -125,6 +125,109 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) - - #define ASYNC_PF_PER_VCPU 64 - -+static inline void stuff_RSB(void) -+{ -+ __asm__ __volatile__(" \n\ -+ call .label1 \n\ -+ pause \n\ -+.label1: \n\ -+ call .label2 \n\ -+ pause \n\ -+.label2: \n\ -+ call .label3 \n\ -+ pause \n\ -+.label3: \n\ -+ call .label4 \n\ -+ pause \n\ -+.label4: \n\ -+ call .label5 \n\ -+ pause \n\ -+.label5: \n\ -+ call .label6 \n\ -+ pause \n\ -+.label6: \n\ -+ call .label7 \n\ -+ pause \n\ -+.label7: \n\ -+ call .label8 \n\ -+ pause \n\ -+.label8: \n\ -+ call .label9 \n\ -+ pause \n\ -+.label9: \n\ -+ call .label10 \n\ -+ pause \n\ -+.label10: \n\ -+ call .label11 \n\ -+ pause \n\ -+.label11: \n\ -+ call .label12 \n\ -+ pause \n\ -+.label12: \n\ -+ call .label13 \n\ -+ pause \n\ -+.label13: \n\ -+ call .label14 \n\ -+ pause \n\ -+.label14: \n\ -+ call .label15 \n\ -+ pause \n\ -+.label15: \n\ -+ call .label16 \n\ -+ pause \n\ -+.label16: \n\ -+ call .label17 \n\ -+ pause \n\ -+.label17: \n\ -+ call .label18 \n\ -+ pause \n\ -+.label18: \n\ -+ call .label19 \n\ -+ pause \n\ -+.label19: \n\ -+ call .label20 \n\ -+ pause \n\ -+.label20: \n\ -+ call .label21 \n\ -+ pause \n\ -+.label21: \n\ -+ call .label22 \n\ -+ pause \n\ -+.label22: \n\ -+ call .label23 \n\ -+ pause \n\ -+.label23: \n\ -+ call .label24 \n\ -+ pause \n\ -+.label24: \n\ -+ call .label25 \n\ -+ pause \n\ -+.label25: \n\ -+ call 
.label26 \n\ -+ pause \n\ -+.label26: \n\ -+ call .label27 \n\ -+ pause \n\ -+.label27: \n\ -+ call .label28 \n\ -+ pause \n\ -+.label28: \n\ -+ call .label29 \n\ -+ pause \n\ -+.label29: \n\ -+ call .label30 \n\ -+ pause \n\ -+.label30: \n\ -+ call .label31 \n\ -+ pause \n\ -+.label31: \n\ -+ call .label32 \n\ -+ pause \n\ -+.label32: \n\ -+ add $(32*8), %%rsp \n\ -+": : :"memory"); -+} -+ - enum kvm_reg { - VCPU_REGS_RAX = 0, - VCPU_REGS_RCX = 1, -diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c -index 57d538fc7c75..496884b6467f 100644 ---- a/arch/x86/kvm/vmx.c -+++ b/arch/x86/kvm/vmx.c -@@ -9228,6 +9228,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) - #endif - ); - -+ stuff_RSB(); -+ - /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ - if (debugctlmsr) - update_debugctlmsr(debugctlmsr); --- -2.14.2 - diff --git a/patches/kernel/0285-x86-kvm-Toggle-IBRS-on-VM-entry-and-exit.patch b/patches/kernel/0285-x86-kvm-Toggle-IBRS-on-VM-entry-and-exit.patch new file mode 100644 index 0000000..5fae670 --- /dev/null +++ b/patches/kernel/0285-x86-kvm-Toggle-IBRS-on-VM-entry-and-exit.patch @@ -0,0 +1,42 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Tim Chen +Date: Fri, 20 Oct 2017 17:04:35 -0700 +Subject: [PATCH] x86/kvm: Toggle IBRS on VM entry and exit +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +Restore guest IBRS on VM entry and set it to 1 on VM exit +back to kernel. 
+ +Signed-off-by: Tim Chen +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 3dc28210342f174270bcefac74ef5d0b52ffd846) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kvm/vmx.c | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c +index 8df195bbb41d..57d538fc7c75 100644 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -9101,6 +9101,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) + __write_pkru(vcpu->arch.pkru); + + atomic_switch_perf_msrs(vmx); ++ ++ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) ++ add_atomic_switch_msr(vmx, MSR_IA32_SPEC_CTRL, ++ vcpu->arch.spec_ctrl, FEATURE_ENABLE_IBRS); ++ + debugctlmsr = get_debugctlmsr(); + + vmx_arm_hv_timer(vcpu); +-- +2.14.2 + diff --git a/patches/kernel/0286-x86-kvm-Pad-RSB-on-VM-transition.patch b/patches/kernel/0286-x86-kvm-Pad-RSB-on-VM-transition.patch new file mode 100644 index 0000000..f337f15 --- /dev/null +++ b/patches/kernel/0286-x86-kvm-Pad-RSB-on-VM-transition.patch @@ -0,0 +1,154 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Tim Chen +Date: Fri, 20 Oct 2017 17:05:54 -0700 +Subject: [PATCH] x86/kvm: Pad RSB on VM transition +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +Add code to pad the local CPU's RSB entries to protect +from previous less privilege mode. 
+ +Signed-off-by: Tim Chen +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 5369368d3520addb2ffb2413cfa7e8f3efe2e31d) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/kvm_host.h | 103 ++++++++++++++++++++++++++++++++++++++++ + arch/x86/kvm/vmx.c | 2 + + 2 files changed, 105 insertions(+) + +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index 1953c0a5b972..4117a97228a2 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -125,6 +125,109 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) + + #define ASYNC_PF_PER_VCPU 64 + ++static inline void stuff_RSB(void) ++{ ++ __asm__ __volatile__(" \n\ ++ call .label1 \n\ ++ pause \n\ ++.label1: \n\ ++ call .label2 \n\ ++ pause \n\ ++.label2: \n\ ++ call .label3 \n\ ++ pause \n\ ++.label3: \n\ ++ call .label4 \n\ ++ pause \n\ ++.label4: \n\ ++ call .label5 \n\ ++ pause \n\ ++.label5: \n\ ++ call .label6 \n\ ++ pause \n\ ++.label6: \n\ ++ call .label7 \n\ ++ pause \n\ ++.label7: \n\ ++ call .label8 \n\ ++ pause \n\ ++.label8: \n\ ++ call .label9 \n\ ++ pause \n\ ++.label9: \n\ ++ call .label10 \n\ ++ pause \n\ ++.label10: \n\ ++ call .label11 \n\ ++ pause \n\ ++.label11: \n\ ++ call .label12 \n\ ++ pause \n\ ++.label12: \n\ ++ call .label13 \n\ ++ pause \n\ ++.label13: \n\ ++ call .label14 \n\ ++ pause \n\ ++.label14: \n\ ++ call .label15 \n\ ++ pause \n\ ++.label15: \n\ ++ call .label16 \n\ ++ pause \n\ ++.label16: \n\ ++ call .label17 \n\ ++ pause \n\ ++.label17: \n\ ++ call .label18 \n\ ++ pause \n\ ++.label18: \n\ ++ call .label19 \n\ ++ pause \n\ ++.label19: \n\ ++ call .label20 \n\ ++ pause \n\ ++.label20: \n\ ++ call .label21 \n\ ++ pause \n\ ++.label21: \n\ ++ call .label22 \n\ ++ pause \n\ ++.label22: \n\ ++ call .label23 \n\ ++ pause \n\ ++.label23: \n\ ++ call .label24 \n\ ++ pause \n\ ++.label24: \n\ ++ call .label25 \n\ ++ pause \n\ ++.label25: \n\ ++ call 
.label26 \n\ ++ pause \n\ ++.label26: \n\ ++ call .label27 \n\ ++ pause \n\ ++.label27: \n\ ++ call .label28 \n\ ++ pause \n\ ++.label28: \n\ ++ call .label29 \n\ ++ pause \n\ ++.label29: \n\ ++ call .label30 \n\ ++ pause \n\ ++.label30: \n\ ++ call .label31 \n\ ++ pause \n\ ++.label31: \n\ ++ call .label32 \n\ ++ pause \n\ ++.label32: \n\ ++ add $(32*8), %%rsp \n\ ++": : :"memory"); ++} ++ + enum kvm_reg { + VCPU_REGS_RAX = 0, + VCPU_REGS_RCX = 1, +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c +index 57d538fc7c75..496884b6467f 100644 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -9228,6 +9228,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) + #endif + ); + ++ stuff_RSB(); ++ + /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ + if (debugctlmsr) + update_debugctlmsr(debugctlmsr); +-- +2.14.2 + diff --git a/patches/kernel/0286-x86-spec_ctrl-Add-sysctl-knobs-to-enable-disable-SPE.patch b/patches/kernel/0286-x86-spec_ctrl-Add-sysctl-knobs-to-enable-disable-SPE.patch deleted file mode 100644 index ef9bf3f..0000000 --- a/patches/kernel/0286-x86-spec_ctrl-Add-sysctl-knobs-to-enable-disable-SPE.patch +++ /dev/null @@ -1,613 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Tim Chen -Date: Thu, 16 Nov 2017 04:47:48 -0800 -Subject: [PATCH] x86/spec_ctrl: Add sysctl knobs to enable/disable SPEC_CTRL - feature -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -There are 2 ways to control IBPB and IBRS - -1. At boot time - noibrs kernel boot parameter will disable IBRS usage - noibpb kernel boot parameter will disable IBPB usage -Otherwise if the above parameters are not specified, the system -will enable ibrs and ibpb usage if the cpu supports it. - -2. 
At run time - echo 0 > /proc/sys/kernel/ibrs_enabled will turn off IBRS - echo 1 > /proc/sys/kernel/ibrs_enabled will turn on IBRS in kernel - echo 2 > /proc/sys/kernel/ibrs_enabled will turn on IBRS in both userspace and kernel - -Signed-off-by: Tim Chen -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -[marcelo.cerri@canonical.com: add x86 guards to kernel/smp.c] -[marcelo.cerri@canonical.com: include asm/msr.h under x86 guard in kernel/sysctl.c] -Signed-off-by: Marcelo Henrique Cerri -(cherry picked from commit 23225db7b02c7f8b94e5d5050987430089e6f7cc) -Signed-off-by: Fabian Grünbichler ---- - Documentation/admin-guide/kernel-parameters.txt | 10 ++ - arch/x86/include/asm/mwait.h | 4 +- - arch/x86/include/asm/spec_ctrl.h | 24 ++++- - include/linux/smp.h | 87 +++++++++++++++++ - arch/x86/kernel/cpu/intel.c | 11 ++- - arch/x86/kernel/cpu/microcode/core.c | 11 +++ - arch/x86/kernel/process.c | 6 +- - arch/x86/kernel/smpboot.c | 4 +- - arch/x86/kvm/vmx.c | 4 +- - arch/x86/lib/delay.c | 6 +- - arch/x86/mm/tlb.c | 2 +- - kernel/smp.c | 41 ++++++++ - kernel/sysctl.c | 125 ++++++++++++++++++++++++ - 13 files changed, 316 insertions(+), 19 deletions(-) - -diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index 1a6ebc6cdf26..e7216bc05b3b 100644 ---- a/Documentation/admin-guide/kernel-parameters.txt -+++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -2566,6 +2566,16 @@ - noexec=on: enable non-executable mappings (default) - noexec=off: disable non-executable mappings - -+ noibrs [X86] -+ Don't use indirect branch restricted speculation (IBRS) -+ feature when running in secure environment, -+ to avoid performance overhead. -+ -+ noibpb [X86] -+ Don't use indirect branch prediction barrier (IBPB) -+ feature when running in secure environment, -+ to avoid performance overhead. -+ - nosmap [X86] - Disable SMAP (Supervisor Mode Access Prevention) - even if it is supported by processor. 
-diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h -index f15120ada161..d665daab3f84 100644 ---- a/arch/x86/include/asm/mwait.h -+++ b/arch/x86/include/asm/mwait.h -@@ -107,14 +107,14 @@ static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) - mb(); - } - -- if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) -+ if (ibrs_inuse) - native_wrmsrl(MSR_IA32_SPEC_CTRL, 0); - - __monitor((void *)&current_thread_info()->flags, 0, 0); - if (!need_resched()) - __mwait(eax, ecx); - -- if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) -+ if (ibrs_inuse) - native_wrmsrl(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS); - } - current_clr_polling(); -diff --git a/arch/x86/include/asm/spec_ctrl.h b/arch/x86/include/asm/spec_ctrl.h -index 55ee1f36bda2..4c69e51261cc 100644 ---- a/arch/x86/include/asm/spec_ctrl.h -+++ b/arch/x86/include/asm/spec_ctrl.h -@@ -8,6 +8,9 @@ - - #ifdef __ASSEMBLY__ - -+.extern use_ibrs -+.extern use_ibpb -+ - #define __ASM_ENABLE_IBRS \ - pushq %rax; \ - pushq %rcx; \ -@@ -104,15 +107,30 @@ - add $(32*8), %rsp; - - .macro ENABLE_IBRS --ALTERNATIVE "", __stringify(__ASM_ENABLE_IBRS), X86_FEATURE_SPEC_CTRL -+ testl $1, use_ibrs -+ jz 10f -+ __ASM_ENABLE_IBRS -+ jmp 20f -+10: -+ lfence -+20: - .endm - - .macro ENABLE_IBRS_CLOBBER --ALTERNATIVE "", __stringify(__ASM_ENABLE_IBRS_CLOBBER), X86_FEATURE_SPEC_CTRL -+ testl $1, use_ibrs -+ jz 11f -+ __ASM_ENABLE_IBRS_CLOBBER -+ jmp 21f -+11: -+ lfence -+21: - .endm - - .macro DISABLE_IBRS --ALTERNATIVE "", __stringify(__ASM_DISABLE_IBRS), X86_FEATURE_SPEC_CTRL -+ testl $1, use_ibrs -+ jz 9f -+ __ASM_DISABLE_IBRS -+9: - .endm - - .macro STUFF_RSB -diff --git a/include/linux/smp.h b/include/linux/smp.h -index 68123c1fe549..e2935c0a1bb4 100644 ---- a/include/linux/smp.h -+++ b/include/linux/smp.h -@@ -50,6 +50,93 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), - - int smp_call_function_single_async(int cpu, struct call_single_data *csd); - -+#ifdef CONFIG_X86 -+/* indicate usage of IBRS to
control execution speculation */ -+extern int use_ibrs; -+extern u32 sysctl_ibrs_enabled; -+extern struct mutex spec_ctrl_mutex; -+#define ibrs_supported (use_ibrs & 0x2) -+#define ibrs_disabled (use_ibrs & 0x4) -+static inline void set_ibrs_inuse(void) -+{ -+ if (ibrs_supported) -+ use_ibrs |= 0x1; -+} -+static inline void clear_ibrs_inuse(void) -+{ -+ use_ibrs &= ~0x1; -+} -+static inline int check_ibrs_inuse(void) -+{ -+ if (use_ibrs & 0x1) -+ return 1; -+ else -+ /* rmb to prevent wrong speculation for security */ -+ rmb(); -+ return 0; -+} -+static inline void set_ibrs_supported(void) -+{ -+ use_ibrs |= 0x2; -+ if (!ibrs_disabled) -+ set_ibrs_inuse(); -+} -+static inline void set_ibrs_disabled(void) -+{ -+ use_ibrs |= 0x4; -+ if (check_ibrs_inuse()) -+ clear_ibrs_inuse(); -+} -+static inline void clear_ibrs_disabled(void) -+{ -+ use_ibrs &= ~0x4; -+ set_ibrs_inuse(); -+} -+#define ibrs_inuse (check_ibrs_inuse()) -+ -+/* indicate usage of IBPB to control execution speculation */ -+extern int use_ibpb; -+extern u32 sysctl_ibpb_enabled; -+#define ibpb_supported (use_ibpb & 0x2) -+#define ibpb_disabled (use_ibpb & 0x4) -+static inline void set_ibpb_inuse(void) -+{ -+ if (ibpb_supported) -+ use_ibpb |= 0x1; -+} -+static inline void clear_ibpb_inuse(void) -+{ -+ use_ibpb &= ~0x1; -+} -+static inline int check_ibpb_inuse(void) -+{ -+ if (use_ibpb & 0x1) -+ return 1; -+ else -+ /* rmb to prevent wrong speculation for security */ -+ rmb(); -+ return 0; -+} -+static inline void set_ibpb_supported(void) -+{ -+ use_ibpb |= 0x2; -+ if (!ibpb_disabled) -+ set_ibpb_inuse(); -+} -+static inline void set_ibpb_disabled(void) -+{ -+ use_ibpb |= 0x4; -+ if (check_ibpb_inuse()) -+ clear_ibpb_inuse(); -+} -+static inline void clear_ibpb_disabled(void) -+{ -+ use_ibpb &= ~0x4; -+ set_ibpb_inuse(); -+} -+#define ibpb_inuse (check_ibpb_inuse()) -+#endif -+ - #ifdef CONFIG_SMP - - #include -diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c -index 
f1d94c73625a..c69ea2efbed1 100644 ---- a/arch/x86/kernel/cpu/intel.c -+++ b/arch/x86/kernel/cpu/intel.c -@@ -628,10 +628,17 @@ static void init_intel(struct cpuinfo_x86 *c) - - init_intel_misc_features(c); - -- if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) -+ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) { - printk_once(KERN_INFO "FEATURE SPEC_CTRL Present\n"); -- else -+ set_ibrs_supported(); -+ set_ibpb_supported(); -+ if (ibrs_inuse) -+ sysctl_ibrs_enabled = 1; -+ if (ibpb_inuse) -+ sysctl_ibpb_enabled = 1; -+ } else { - printk_once(KERN_INFO "FEATURE SPEC_CTRL Not Present\n"); -+ } - } - - #ifdef CONFIG_X86_32 -diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c -index c4fa4a85d4cb..6450aeda72fc 100644 ---- a/arch/x86/kernel/cpu/microcode/core.c -+++ b/arch/x86/kernel/cpu/microcode/core.c -@@ -535,6 +535,17 @@ static ssize_t reload_store(struct device *dev, - } - if (!ret) - perf_check_microcode(); -+ -+ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) { -+ printk_once(KERN_INFO "FEATURE SPEC_CTRL Present\n"); -+ set_ibrs_supported(); -+ set_ibpb_supported(); -+ if (ibrs_inuse) -+ sysctl_ibrs_enabled = 1; -+ if (ibpb_inuse) -+ sysctl_ibpb_enabled = 1; -+ } -+ - mutex_unlock(&microcode_mutex); - put_online_cpus(); - -diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c -index 3adb3806a284..3fdf5358998e 100644 ---- a/arch/x86/kernel/process.c -+++ b/arch/x86/kernel/process.c -@@ -447,16 +447,16 @@ static __cpuidle void mwait_idle(void) - mb(); /* quirk */ - } - -- if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) -+ if (ibrs_inuse) - native_wrmsrl(MSR_IA32_SPEC_CTRL, 0); - - __monitor((void *)&current_thread_info()->flags, 0, 0); - if (!need_resched()) { - __sti_mwait(0, 0); -- if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) -+ if (ibrs_inuse) - native_wrmsrl(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS); - } else { -- if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) -+ if (ibrs_inuse) - native_wrmsrl(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS); - local_irq_enable(); - }
-diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c -index a652bff7add4..9317aa4a7446 100644 ---- a/arch/x86/kernel/smpboot.c -+++ b/arch/x86/kernel/smpboot.c -@@ -1693,14 +1693,14 @@ void native_play_dead(void) - play_dead_common(); - tboot_shutdown(TB_SHUTDOWN_WFS); - -- if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) -+ if (ibrs_inuse) - native_wrmsrl(MSR_IA32_SPEC_CTRL, 0); - - mwait_play_dead(); /* Only returns on failure */ - if (cpuidle_play_dead()) - hlt_play_dead(); - -- if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) -+ if (ibrs_inuse) - native_wrmsrl(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS); - } - -diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c -index 496884b6467f..d2168203bddc 100644 ---- a/arch/x86/kvm/vmx.c -+++ b/arch/x86/kvm/vmx.c -@@ -2269,7 +2269,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) - if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) { - per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; - vmcs_load(vmx->loaded_vmcs->vmcs); -- if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) -+ if (ibpb_inuse) - native_wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB); - } - -@@ -9102,7 +9102,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) - - atomic_switch_perf_msrs(vmx); - -- if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) -+ if (ibrs_inuse) - add_atomic_switch_msr(vmx, MSR_IA32_SPEC_CTRL, - vcpu->arch.spec_ctrl, FEATURE_ENABLE_IBRS); - -diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c -index b088463973e4..72a174642550 100644 ---- a/arch/x86/lib/delay.c -+++ b/arch/x86/lib/delay.c -@@ -107,8 +107,7 @@ static void delay_mwaitx(unsigned long __loops) - for (;;) { - delay = min_t(u64, MWAITX_MAX_LOOPS, loops); - -- if (boot_cpu_has(X86_FEATURE_SPEC_CTRL) && -- (delay > IBRS_DISABLE_THRESHOLD)) -+ if (ibrs_inuse && (delay > IBRS_DISABLE_THRESHOLD)) - native_wrmsrl(MSR_IA32_SPEC_CTRL, 0); - - /* -@@ -124,8 +123,7 @@ static void delay_mwaitx(unsigned long __loops) - */ - __mwaitx(MWAITX_DISABLE_CSTATES, delay, 
MWAITX_ECX_TIMER_ENABLE); - -- if (boot_cpu_has(X86_FEATURE_SPEC_CTRL) && -- (delay > IBRS_DISABLE_THRESHOLD)) -+ if (ibrs_inuse && (delay > IBRS_DISABLE_THRESHOLD)) - native_wrmsrl(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS); - - end = rdtsc_ordered(); -diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c -index 301e6efbc514..6365f769de3d 100644 ---- a/arch/x86/mm/tlb.c -+++ b/arch/x86/mm/tlb.c -@@ -221,7 +221,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, - bool need_flush; - - /* Null tsk means switching to kernel, so that's safe */ -- if (boot_cpu_has(X86_FEATURE_SPEC_CTRL) && tsk && -+ if (ibpb_inuse && tsk && - ___ptrace_may_access(tsk, current, PTRACE_MODE_IBPB)) - native_wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB); - -diff --git a/kernel/smp.c b/kernel/smp.c -index 3061483cb3ad..3bece045f4a4 100644 ---- a/kernel/smp.c -+++ b/kernel/smp.c -@@ -498,6 +498,26 @@ EXPORT_SYMBOL(smp_call_function); - unsigned int setup_max_cpus = NR_CPUS; - EXPORT_SYMBOL(setup_max_cpus); - -+#ifdef CONFIG_X86 -+/* -+ * use IBRS -+ * bit 0 = indicate if ibrs is currently in use -+ * bit 1 = indicate if system supports ibrs -+ * bit 2 = indicate if admin disables ibrs -+*/ -+ -+int use_ibrs; -+EXPORT_SYMBOL(use_ibrs); -+ -+/* -+ * use IBRS -+ * bit 0 = indicate if ibpb is currently in use -+ * bit 1 = indicate if system supports ibpb -+ * bit 2 = indicate if admin disables ibpb -+*/ -+int use_ibpb; -+EXPORT_SYMBOL(use_ibpb); -+#endif - - /* - * Setup routine for controlling SMP activation -@@ -522,6 +542,27 @@ static int __init nosmp(char *str) - - early_param("nosmp", nosmp); - -+#ifdef CONFIG_X86 -+static int __init noibrs(char *str) -+{ -+ set_ibrs_disabled(); -+ -+ return 0; -+} -+ -+early_param("noibrs", noibrs); -+ -+static int __init noibpb(char *str) -+{ -+ set_ibpb_disabled(); -+ -+ return 0; -+} -+ -+early_param("noibpb", noibpb); -+#endif -+ -+ - /* this is hard limit */ - static int __init nrcpus(char *str) - { -diff --git a/kernel/sysctl.c 
b/kernel/sysctl.c -index 7ab08d5728e6..69c37bd6251a 100644 ---- a/kernel/sysctl.c -+++ b/kernel/sysctl.c -@@ -72,6 +72,7 @@ - #include - - #ifdef CONFIG_X86 -+#include - #include - #include - #include -@@ -222,6 +223,15 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); - #endif - -+#ifdef CONFIG_X86 -+int proc_dointvec_ibrs_ctrl(struct ctl_table *table, int write, -+ void __user *buffer, size_t *lenp, loff_t *ppos); -+int proc_dointvec_ibpb_ctrl(struct ctl_table *table, int write, -+ void __user *buffer, size_t *lenp, loff_t *ppos); -+int proc_dointvec_ibrs_dump(struct ctl_table *table, int write, -+ void __user *buffer, size_t *lenp, loff_t *ppos); -+#endif -+ - #ifdef CONFIG_MAGIC_SYSRQ - /* Note: sysrq code uses it's own private copy */ - static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE; -@@ -258,6 +268,12 @@ extern struct ctl_table epoll_table[]; - int sysctl_legacy_va_layout; - #endif - -+u32 sysctl_ibrs_dump = 0; -+u32 sysctl_ibrs_enabled = 0; -+EXPORT_SYMBOL(sysctl_ibrs_enabled); -+u32 sysctl_ibpb_enabled = 0; -+EXPORT_SYMBOL(sysctl_ibpb_enabled); -+ - /* The default sysctl tables: */ - - static struct ctl_table sysctl_base_table[] = { -@@ -1241,6 +1257,35 @@ static struct ctl_table kern_table[] = { - .extra1 = &zero, - .extra2 = &one, - }, -+#endif -+#ifdef CONFIG_X86 -+ { -+ .procname = "ibrs_enabled", -+ .data = &sysctl_ibrs_enabled, -+ .maxlen = sizeof(unsigned int), -+ .mode = 0644, -+ .proc_handler = proc_dointvec_ibrs_ctrl, -+ .extra1 = &zero, -+ .extra2 = &two, -+ }, -+ { -+ .procname = "ibpb_enabled", -+ .data = &sysctl_ibpb_enabled, -+ .maxlen = sizeof(unsigned int), -+ .mode = 0644, -+ .proc_handler = proc_dointvec_ibpb_ctrl, -+ .extra1 = &zero, -+ .extra2 = &one, -+ }, -+ { -+ .procname = "ibrs_dump", -+ .data = &sysctl_ibrs_dump, -+ .maxlen = sizeof(unsigned int), -+ .mode = 0644, -+ .proc_handler = proc_dointvec_ibrs_dump, -+ .extra1 = &zero, -+ .extra2 = 
&one, -+ }, - #endif - { } - }; -@@ -2585,6 +2630,86 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, - do_proc_dointvec_minmax_conv, &param); - } - -+#ifdef CONFIG_X86 -+int proc_dointvec_ibrs_dump(struct ctl_table *table, int write, -+ void __user *buffer, size_t *lenp, loff_t *ppos) -+{ -+ int ret; -+ unsigned int cpu; -+ -+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); -+ printk("sysctl_ibrs_enabled = %u, sysctl_ibpb_enabled = %u\n", sysctl_ibrs_enabled, sysctl_ibpb_enabled); -+ printk("use_ibrs = %d, use_ibpb = %d\n", use_ibrs, use_ibpb); -+ for_each_online_cpu(cpu) { -+ u64 val; -+ -+ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) -+ rdmsrl_on_cpu(cpu, MSR_IA32_SPEC_CTRL, &val); -+ else -+ val = 0; -+ printk("read cpu %d ibrs val %lu\n", cpu, (unsigned long) val); -+ } -+ return ret; -+} -+ -+int proc_dointvec_ibrs_ctrl(struct ctl_table *table, int write, -+ void __user *buffer, size_t *lenp, loff_t *ppos) -+{ -+ int ret; -+ unsigned int cpu; -+ -+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); -+ pr_debug("sysctl_ibrs_enabled = %u, sysctl_ibpb_enabled = %u\n", sysctl_ibrs_enabled, sysctl_ibpb_enabled); -+ pr_debug("before:use_ibrs = %d, use_ibpb = %d\n", use_ibrs, use_ibpb); -+ if (sysctl_ibrs_enabled == 0) { -+ /* always set IBRS off */ -+ set_ibrs_disabled(); -+ if (ibrs_supported) { -+ for_each_online_cpu(cpu) -+ wrmsrl_on_cpu(cpu, MSR_IA32_SPEC_CTRL, 0x0); -+ } -+ } else if (sysctl_ibrs_enabled == 2) { -+ /* always set IBRS on, even in user space */ -+ clear_ibrs_disabled(); -+ if (ibrs_supported) { -+ for_each_online_cpu(cpu) -+ wrmsrl_on_cpu(cpu, MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS); -+ } else { -+ sysctl_ibrs_enabled = 0; -+ } -+ } else if (sysctl_ibrs_enabled == 1) { -+ /* use IBRS in kernel */ -+ clear_ibrs_disabled(); -+ if (!ibrs_inuse) -+ /* platform don't support ibrs */ -+ sysctl_ibrs_enabled = 0; -+ } -+ pr_debug("after:use_ibrs = %d, use_ibpb = %d\n", use_ibrs, use_ibpb); -+ return ret; -+} -+ -+int
proc_dointvec_ibpb_ctrl(struct ctl_table *table, int write, -+ void __user *buffer, size_t *lenp, loff_t *ppos) -+{ -+ int ret; -+ -+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); -+ pr_debug("sysctl_ibrs_enabled = %u, sysctl_ibpb_enabled = %u\n", sysctl_ibrs_enabled, sysctl_ibpb_enabled); -+ pr_debug("before:use_ibrs = %d, use_ibpb = %d\n", use_ibrs, use_ibpb); -+ if (sysctl_ibpb_enabled == 0) -+ set_ibpb_disabled(); -+ else if (sysctl_ibpb_enabled == 1) { -+ clear_ibpb_disabled(); -+ if (!ibpb_inuse) -+ /* platform don't support ibpb */ -+ sysctl_ibpb_enabled = 0; -+ } -+ pr_debug("after:use_ibrs = %d, use_ibpb = %d\n", use_ibrs, use_ibpb); -+ return ret; -+} -+#endif -+ -+ - struct do_proc_douintvec_minmax_conv_param { - unsigned int *min; - unsigned int *max; --- -2.14.2 - diff --git a/patches/kernel/0287-x86-spec_ctrl-Add-lock-to-serialize-changes-to-ibrs-.patch b/patches/kernel/0287-x86-spec_ctrl-Add-lock-to-serialize-changes-to-ibrs-.patch deleted file mode 100644 index ac1928c..0000000 --- a/patches/kernel/0287-x86-spec_ctrl-Add-lock-to-serialize-changes-to-ibrs-.patch +++ /dev/null @@ -1,166 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Tim Chen -Date: Mon, 20 Nov 2017 13:47:54 -0800 -Subject: [PATCH] x86/spec_ctrl: Add lock to serialize changes to ibrs and ibpb - control -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -Signed-off-by: Tim Chen -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 85789933bc45a3e763823675bd0d80e3e617f234) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/cpu/intel.c | 22 ++++++++++++---------- - arch/x86/kernel/cpu/microcode/core.c | 2 ++ - kernel/smp.c | 4 ++++ - kernel/sysctl.c | 14 +++++++++++++- - 4 files changed, 31 insertions(+), 11 deletions(-) - -diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c -index 
c69ea2efbed1..8d558e24783c 100644 ---- a/arch/x86/kernel/cpu/intel.c -+++ b/arch/x86/kernel/cpu/intel.c -@@ -628,16 +628,18 @@ static void init_intel(struct cpuinfo_x86 *c) - - init_intel_misc_features(c); - -- if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) { -- printk_once(KERN_INFO "FEATURE SPEC_CTRL Present\n"); -- set_ibrs_supported(); -- set_ibpb_supported(); -- if (ibrs_inuse) -- sysctl_ibrs_enabled = 1; -- if (ibpb_inuse) -- sysctl_ibpb_enabled = 1; -- } else { -- printk_once(KERN_INFO "FEATURE SPEC_CTRL Not Present\n"); -+ if (!c->cpu_index) { -+ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) { -+ printk(KERN_INFO "FEATURE SPEC_CTRL Present\n"); -+ set_ibrs_supported(); -+ set_ibpb_supported(); -+ if (ibrs_inuse) -+ sysctl_ibrs_enabled = 1; -+ if (ibpb_inuse) -+ sysctl_ibpb_enabled = 1; -+ } else { -+ printk(KERN_INFO "FEATURE SPEC_CTRL Not Present\n"); -+ } - } - } - -diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c -index 6450aeda72fc..55086921d29e 100644 ---- a/arch/x86/kernel/cpu/microcode/core.c -+++ b/arch/x86/kernel/cpu/microcode/core.c -@@ -538,12 +538,14 @@ static ssize_t reload_store(struct device *dev, - - if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) { - printk_once(KERN_INFO "FEATURE SPEC_CTRL Present\n"); -+ mutex_lock(&spec_ctrl_mutex); - set_ibrs_supported(); - set_ibpb_supported(); - if (ibrs_inuse) - sysctl_ibrs_enabled = 1; - if (ibpb_inuse) - sysctl_ibpb_enabled = 1; -+ mutex_unlock(&spec_ctrl_mutex); - } - - mutex_unlock(&microcode_mutex); -diff --git a/kernel/smp.c b/kernel/smp.c -index 3bece045f4a4..a224ec0c540c 100644 ---- a/kernel/smp.c -+++ b/kernel/smp.c -@@ -519,6 +519,10 @@ int use_ibpb; - EXPORT_SYMBOL(use_ibpb); - #endif - -+/* mutex to serialize IBRS & IBPB control changes */ -+DEFINE_MUTEX(spec_ctrl_mutex); -+EXPORT_SYMBOL(spec_ctrl_mutex); -+ - /* - * Setup routine for controlling SMP activation - * -diff --git a/kernel/sysctl.c b/kernel/sysctl.c -index 69c37bd6251a..47a37792109d 100644 ----
a/kernel/sysctl.c -+++ b/kernel/sysctl.c -@@ -69,6 +69,7 @@ - #include - - #include -+#include - #include - - #ifdef CONFIG_X86 -@@ -2634,12 +2635,17 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, - int proc_dointvec_ibrs_dump(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) - { -- int ret; -+ int ret, orig_inuse; - unsigned int cpu; - -+ - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - printk("sysctl_ibrs_enabled = %u, sysctl_ibpb_enabled = %u\n", sysctl_ibrs_enabled, sysctl_ibpb_enabled); - printk("use_ibrs = %d, use_ibpb = %d\n", use_ibrs, use_ibpb); -+ mutex_lock(&spec_ctrl_mutex); -+ orig_inuse = use_ibrs; -+ /* temporary halt to ibrs usage to dump ibrs values */ -+ clear_ibrs_inuse(); - for_each_online_cpu(cpu) { - u64 val; - -@@ -2649,6 +2655,8 @@ int proc_dointvec_ibrs_dump(struct ctl_table *table, int write, - val = 0; - printk("read cpu %d ibrs val %lu\n", cpu, (unsigned long) val); - } -+ use_ibrs = orig_inuse; -+ mutex_unlock(&spec_ctrl_mutex); - return ret; - } - -@@ -2661,6 +2669,7 @@ int proc_dointvec_ibrs_ctrl(struct ctl_table *table, int write, - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - pr_debug("sysctl_ibrs_enabled = %u, sysctl_ibpb_enabled = %u\n", sysctl_ibrs_enabled, sysctl_ibpb_enabled); - pr_debug("before:use_ibrs = %d, use_ibpb = %d\n", use_ibrs, use_ibpb); -+ mutex_lock(&spec_ctrl_mutex); - if (sysctl_ibrs_enabled == 0) { - /* always set IBRS off */ - set_ibrs_disabled(); -@@ -2684,6 +2693,7 @@ int proc_dointvec_ibrs_ctrl(struct ctl_table *table, int write, - /* platform don't support ibrs */ - sysctl_ibrs_enabled = 0; - } -+ mutex_unlock(&spec_ctrl_mutex); - pr_debug("after:use_ibrs = %d, use_ibpb = %d\n", use_ibrs, use_ibpb); - return ret; - } -@@ -2696,6 +2706,7 @@ int proc_dointvec_ibpb_ctrl(struct ctl_table *table, int write, - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - pr_debug("sysctl_ibrs_enabled = %u, 
sysctl_ibpb_enabled = %u\n", sysctl_ibrs_enabled, sysctl_ibpb_enabled); - pr_debug("before:use_ibrs = %d, use_ibpb = %d\n", use_ibrs, use_ibpb); -+ mutex_lock(&spec_ctrl_mutex); - if (sysctl_ibpb_enabled == 0) - set_ibpb_disabled(); - else if (sysctl_ibpb_enabled == 1) { -@@ -2704,6 +2715,7 @@ int proc_dointvec_ibpb_ctrl(struct ctl_table *table, int write, - /* platform don't support ibpb */ - sysctl_ibpb_enabled = 0; - } -+ mutex_unlock(&spec_ctrl_mutex); - pr_debug("after:use_ibrs = %d, use_ibpb = %d\n", use_ibrs, use_ibpb); - return ret; - } --- -2.14.2 - diff --git a/patches/kernel/0287-x86-spec_ctrl-Add-sysctl-knobs-to-enable-disable-SPE.patch b/patches/kernel/0287-x86-spec_ctrl-Add-sysctl-knobs-to-enable-disable-SPE.patch new file mode 100644 index 0000000..ef9bf3f --- /dev/null +++ b/patches/kernel/0287-x86-spec_ctrl-Add-sysctl-knobs-to-enable-disable-SPE.patch @@ -0,0 +1,613 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Tim Chen +Date: Thu, 16 Nov 2017 04:47:48 -0800 +Subject: [PATCH] x86/spec_ctrl: Add sysctl knobs to enable/disable SPEC_CTRL + feature +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +There are 2 ways to control IBPB and IBRS + +1. At boot time + noibrs kernel boot parameter will disable IBRS usage + noibpb kernel boot parameter will disable IBPB usage +Otherwise if the above parameters are not specified, the system +will enable ibrs and ibpb usage if the cpu supports it. + +2. 
At run time + echo 0 > /proc/sys/kernel/ibrs_enabled will turn off IBRS + echo 1 > /proc/sys/kernel/ibrs_enabled will turn on IBRS in kernel + echo 2 > /proc/sys/kernel/ibrs_enabled will turn on IBRS in both userspace and kernel + +Signed-off-by: Tim Chen +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +[marcelo.cerri@canonical.com: add x86 guards to kernel/smp.c] +[marcelo.cerri@canonical.com: include asm/msr.h under x86 guard in kernel/sysctl.c] +Signed-off-by: Marcelo Henrique Cerri +(cherry picked from commit 23225db7b02c7f8b94e5d5050987430089e6f7cc) +Signed-off-by: Fabian Grünbichler +--- + Documentation/admin-guide/kernel-parameters.txt | 10 ++ + arch/x86/include/asm/mwait.h | 4 +- + arch/x86/include/asm/spec_ctrl.h | 24 ++++- + include/linux/smp.h | 87 +++++++++++++++++ + arch/x86/kernel/cpu/intel.c | 11 ++- + arch/x86/kernel/cpu/microcode/core.c | 11 +++ + arch/x86/kernel/process.c | 6 +- + arch/x86/kernel/smpboot.c | 4 +- + arch/x86/kvm/vmx.c | 4 +- + arch/x86/lib/delay.c | 6 +- + arch/x86/mm/tlb.c | 2 +- + kernel/smp.c | 41 ++++++++ + kernel/sysctl.c | 125 ++++++++++++++++++++++++ + 13 files changed, 316 insertions(+), 19 deletions(-) + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index 1a6ebc6cdf26..e7216bc05b3b 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -2566,6 +2566,16 @@ + noexec=on: enable non-executable mappings (default) + noexec=off: disable non-executable mappings + ++ noibrs [X86] ++ Don't use indirect branch restricted speculation (IBRS) ++ feature when running in secure environment, ++ to avoid performance overhead. ++ ++ noibpb [X86] ++ Don't use indirect branch prediction barrier (IBPB) ++ feature when running in secure environment, ++ to avoid performance overhead. ++ + nosmap [X86] + Disable SMAP (Supervisor Mode Access Prevention) + even if it is supported by processor. 
+diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h +index f15120ada161..d665daab3f84 100644 +--- a/arch/x86/include/asm/mwait.h ++++ b/arch/x86/include/asm/mwait.h +@@ -107,14 +107,14 @@ static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) + mb(); + } + +- if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) ++ if (ibrs_inuse) + native_wrmsrl(MSR_IA32_SPEC_CTRL, 0); + + __monitor((void *)&current_thread_info()->flags, 0, 0); + if (!need_resched()) + __mwait(eax, ecx); + +- if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) ++ if (ibrs_inuse) + native_wrmsrl(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS); + } + current_clr_polling(); +diff --git a/arch/x86/include/asm/spec_ctrl.h b/arch/x86/include/asm/spec_ctrl.h +index 55ee1f36bda2..4c69e51261cc 100644 +--- a/arch/x86/include/asm/spec_ctrl.h ++++ b/arch/x86/include/asm/spec_ctrl.h +@@ -8,6 +8,9 @@ + + #ifdef __ASSEMBLY__ + ++.extern use_ibrs ++.extern use_ibpb ++ + #define __ASM_ENABLE_IBRS \ + pushq %rax; \ + pushq %rcx; \ +@@ -104,15 +107,30 @@ + add $(32*8), %rsp; + + .macro ENABLE_IBRS +-ALTERNATIVE "", __stringify(__ASM_ENABLE_IBRS), X86_FEATURE_SPEC_CTRL ++ testl $1, use_ibrs ++ jz 10f ++ __ASM_ENABLE_IBRS ++ jmp 20f ++10: ++ lfence ++20: + .endm + + .macro ENABLE_IBRS_CLOBBER +-ALTERNATIVE "", __stringify(__ASM_ENABLE_IBRS_CLOBBER), X86_FEATURE_SPEC_CTRL ++ testl $1, use_ibrs ++ jz 11f ++ __ASM_ENABLE_IBRS_CLOBBER ++ jmp 21f ++11: ++ lfence ++21: + .endm + + .macro DISABLE_IBRS +-ALTERNATIVE "", __stringify(__ASM_DISABLE_IBRS), X86_FEATURE_SPEC_CTRL ++ testl $1, use_ibrs ++ jz 9f ++ __ASM_DISABLE_IBRS ++9: + .endm + + .macro STUFF_RSB +diff --git a/include/linux/smp.h b/include/linux/smp.h +index 68123c1fe549..e2935c0a1bb4 100644 +--- a/include/linux/smp.h ++++ b/include/linux/smp.h +@@ -50,6 +50,93 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), + + int smp_call_function_single_async(int cpu, struct call_single_data *csd); + ++#ifdef CONFIG_X86 ++/* indicate usage of IBRS to
control execution speculation */ ++extern int use_ibrs; ++extern u32 sysctl_ibrs_enabled; ++extern struct mutex spec_ctrl_mutex; ++#define ibrs_supported (use_ibrs & 0x2) ++#define ibrs_disabled (use_ibrs & 0x4) ++static inline void set_ibrs_inuse(void) ++{ ++ if (ibrs_supported) ++ use_ibrs |= 0x1; ++} ++static inline void clear_ibrs_inuse(void) ++{ ++ use_ibrs &= ~0x1; ++} ++static inline int check_ibrs_inuse(void) ++{ ++ if (use_ibrs & 0x1) ++ return 1; ++ else ++ /* rmb to prevent wrong speculation for security */ ++ rmb(); ++ return 0; ++} ++static inline void set_ibrs_supported(void) ++{ ++ use_ibrs |= 0x2; ++ if (!ibrs_disabled) ++ set_ibrs_inuse(); ++} ++static inline void set_ibrs_disabled(void) ++{ ++ use_ibrs |= 0x4; ++ if (check_ibrs_inuse()) ++ clear_ibrs_inuse(); ++} ++static inline void clear_ibrs_disabled(void) ++{ ++ use_ibrs &= ~0x4; ++ set_ibrs_inuse(); ++} ++#define ibrs_inuse (check_ibrs_inuse()) ++ ++/* indicate usage of IBPB to control execution speculation */ ++extern int use_ibpb; ++extern u32 sysctl_ibpb_enabled; ++#define ibpb_supported (use_ibpb & 0x2) ++#define ibpb_disabled (use_ibpb & 0x4) ++static inline void set_ibpb_inuse(void) ++{ ++ if (ibpb_supported) ++ use_ibpb |= 0x1; ++} ++static inline void clear_ibpb_inuse(void) ++{ ++ use_ibpb &= ~0x1; ++} ++static inline int check_ibpb_inuse(void) ++{ ++ if (use_ibpb & 0x1) ++ return 1; ++ else ++ /* rmb to prevent wrong speculation for security */ ++ rmb(); ++ return 0; ++} ++static inline void set_ibpb_supported(void) ++{ ++ use_ibpb |= 0x2; ++ if (!ibpb_disabled) ++ set_ibpb_inuse(); ++} ++static inline void set_ibpb_disabled(void) ++{ ++ use_ibpb |= 0x4; ++ if (check_ibpb_inuse()) ++ clear_ibpb_inuse(); ++} ++static inline void clear_ibpb_disabled(void) ++{ ++ use_ibpb &= ~0x4; ++ set_ibpb_inuse(); ++} ++#define ibpb_inuse (check_ibpb_inuse()) ++#endif ++ + #ifdef CONFIG_SMP + + #include +diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c +index 
f1d94c73625a..c69ea2efbed1 100644 +--- a/arch/x86/kernel/cpu/intel.c ++++ b/arch/x86/kernel/cpu/intel.c +@@ -628,10 +628,17 @@ static void init_intel(struct cpuinfo_x86 *c) + + init_intel_misc_features(c); + +- if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) ++ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) { + printk_once(KERN_INFO "FEATURE SPEC_CTRL Present\n"); +- else ++ set_ibrs_supported(); ++ set_ibpb_supported(); ++ if (ibrs_inuse) ++ sysctl_ibrs_enabled = 1; ++ if (ibpb_inuse) ++ sysctl_ibpb_enabled = 1; ++ } else { + printk_once(KERN_INFO "FEATURE SPEC_CTRL Not Present\n"); ++ } + } + + #ifdef CONFIG_X86_32 +diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c +index c4fa4a85d4cb..6450aeda72fc 100644 +--- a/arch/x86/kernel/cpu/microcode/core.c ++++ b/arch/x86/kernel/cpu/microcode/core.c +@@ -535,6 +535,17 @@ static ssize_t reload_store(struct device *dev, + } + if (!ret) + perf_check_microcode(); ++ ++ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) { ++ printk_once(KERN_INFO "FEATURE SPEC_CTRL Present\n"); ++ set_ibrs_supported(); ++ set_ibpb_supported(); ++ if (ibrs_inuse) ++ sysctl_ibrs_enabled = 1; ++ if (ibpb_inuse) ++ sysctl_ibpb_enabled = 1; ++ } ++ + mutex_unlock(&microcode_mutex); + put_online_cpus(); + +diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c +index 3adb3806a284..3fdf5358998e 100644 +--- a/arch/x86/kernel/process.c ++++ b/arch/x86/kernel/process.c +@@ -447,16 +447,16 @@ static __cpuidle void mwait_idle(void) + mb(); /* quirk */ + } + +- if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) ++ if (ibrs_inuse) + native_wrmsrl(MSR_IA32_SPEC_CTRL, 0); + + __monitor((void *)&current_thread_info()->flags, 0, 0); + if (!need_resched()) { + __sti_mwait(0, 0); +- if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) ++ if (ibrs_inuse) + native_wrmsrl(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS); + } else { +- if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) ++ if (ibrs_inuse) + native_wrmsrl(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS); + local_irq_enable(); + }
+diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c +index a652bff7add4..9317aa4a7446 100644 +--- a/arch/x86/kernel/smpboot.c ++++ b/arch/x86/kernel/smpboot.c +@@ -1693,14 +1693,14 @@ void native_play_dead(void) + play_dead_common(); + tboot_shutdown(TB_SHUTDOWN_WFS); + +- if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) ++ if (ibrs_inuse) + native_wrmsrl(MSR_IA32_SPEC_CTRL, 0); + + mwait_play_dead(); /* Only returns on failure */ + if (cpuidle_play_dead()) + hlt_play_dead(); + +- if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) ++ if (ibrs_inuse) + native_wrmsrl(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS); + } + +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c +index 496884b6467f..d2168203bddc 100644 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -2269,7 +2269,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) + if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) { + per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; + vmcs_load(vmx->loaded_vmcs->vmcs); +- if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) ++ if (ibpb_inuse) + native_wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB); + } + +@@ -9102,7 +9102,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) + + atomic_switch_perf_msrs(vmx); + +- if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) ++ if (ibrs_inuse) + add_atomic_switch_msr(vmx, MSR_IA32_SPEC_CTRL, + vcpu->arch.spec_ctrl, FEATURE_ENABLE_IBRS); + +diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c +index b088463973e4..72a174642550 100644 +--- a/arch/x86/lib/delay.c ++++ b/arch/x86/lib/delay.c +@@ -107,8 +107,7 @@ static void delay_mwaitx(unsigned long __loops) + for (;;) { + delay = min_t(u64, MWAITX_MAX_LOOPS, loops); + +- if (boot_cpu_has(X86_FEATURE_SPEC_CTRL) && +- (delay > IBRS_DISABLE_THRESHOLD)) ++ if (ibrs_inuse && (delay > IBRS_DISABLE_THRESHOLD)) + native_wrmsrl(MSR_IA32_SPEC_CTRL, 0); + + /* +@@ -124,8 +123,7 @@ static void delay_mwaitx(unsigned long __loops) + */ + __mwaitx(MWAITX_DISABLE_CSTATES, delay, 
MWAITX_ECX_TIMER_ENABLE); + +- if (boot_cpu_has(X86_FEATURE_SPEC_CTRL) && +- (delay > IBRS_DISABLE_THRESHOLD)) ++ if (ibrs_inuse && (delay > IBRS_DISABLE_THRESHOLD)) + native_wrmsrl(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS); + + end = rdtsc_ordered(); +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c +index 301e6efbc514..6365f769de3d 100644 +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -221,7 +221,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + bool need_flush; + + /* Null tsk means switching to kernel, so that's safe */ +- if (boot_cpu_has(X86_FEATURE_SPEC_CTRL) && tsk && ++ if (ibpb_inuse && tsk && + ___ptrace_may_access(tsk, current, PTRACE_MODE_IBPB)) + native_wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB); + +diff --git a/kernel/smp.c b/kernel/smp.c +index 3061483cb3ad..3bece045f4a4 100644 +--- a/kernel/smp.c ++++ b/kernel/smp.c +@@ -498,6 +498,26 @@ EXPORT_SYMBOL(smp_call_function); + unsigned int setup_max_cpus = NR_CPUS; + EXPORT_SYMBOL(setup_max_cpus); + ++#ifdef CONFIG_X86 ++/* ++ * use IBRS ++ * bit 0 = indicate if ibrs is currently in use ++ * bit 1 = indicate if system supports ibrs ++ * bit 2 = indicate if admin disables ibrs ++*/ ++ ++int use_ibrs; ++EXPORT_SYMBOL(use_ibrs); ++ ++/* ++ * use IBRS ++ * bit 0 = indicate if ibpb is currently in use ++ * bit 1 = indicate if system supports ibpb ++ * bit 2 = indicate if admin disables ibpb ++*/ ++int use_ibpb; ++EXPORT_SYMBOL(use_ibpb); ++#endif + + /* + * Setup routine for controlling SMP activation +@@ -522,6 +542,27 @@ static int __init nosmp(char *str) + + early_param("nosmp", nosmp); + ++#ifdef CONFIG_X86 ++static int __init noibrs(char *str) ++{ ++ set_ibrs_disabled(); ++ ++ return 0; ++} ++ ++early_param("noibrs", noibrs); ++ ++static int __init noibpb(char *str) ++{ ++ set_ibpb_disabled(); ++ ++ return 0; ++} ++ ++early_param("noibpb", noibpb); ++#endif ++ ++ + /* this is hard limit */ + static int __init nrcpus(char *str) + { +diff --git a/kernel/sysctl.c 
b/kernel/sysctl.c +index 7ab08d5728e6..69c37bd6251a 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -72,6 +72,7 @@ + #include + + #ifdef CONFIG_X86 ++#include + #include + #include + #include +@@ -222,6 +223,15 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); + #endif + ++#ifdef CONFIG_X86 ++int proc_dointvec_ibrs_ctrl(struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos); ++int proc_dointvec_ibpb_ctrl(struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos); ++int proc_dointvec_ibrs_dump(struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos); ++#endif ++ + #ifdef CONFIG_MAGIC_SYSRQ + /* Note: sysrq code uses it's own private copy */ + static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE; +@@ -258,6 +268,12 @@ extern struct ctl_table epoll_table[]; + int sysctl_legacy_va_layout; + #endif + ++u32 sysctl_ibrs_dump = 0; ++u32 sysctl_ibrs_enabled = 0; ++EXPORT_SYMBOL(sysctl_ibrs_enabled); ++u32 sysctl_ibpb_enabled = 0; ++EXPORT_SYMBOL(sysctl_ibpb_enabled); ++ + /* The default sysctl tables: */ + + static struct ctl_table sysctl_base_table[] = { +@@ -1241,6 +1257,35 @@ static struct ctl_table kern_table[] = { + .extra1 = &zero, + .extra2 = &one, + }, ++#endif ++#ifdef CONFIG_X86 ++ { ++ .procname = "ibrs_enabled", ++ .data = &sysctl_ibrs_enabled, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec_ibrs_ctrl, ++ .extra1 = &zero, ++ .extra2 = &two, ++ }, ++ { ++ .procname = "ibpb_enabled", ++ .data = &sysctl_ibpb_enabled, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec_ibpb_ctrl, ++ .extra1 = &zero, ++ .extra2 = &one, ++ }, ++ { ++ .procname = "ibrs_dump", ++ .data = &sysctl_ibrs_dump, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec_ibrs_dump, ++ .extra1 = &zero, ++ .extra2 = 
&one, ++ }, + #endif + { } + }; +@@ -2585,6 +2630,86 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, + do_proc_dointvec_minmax_conv, &param); + } + ++#ifdef CONFIG_X86 ++int proc_dointvec_ibrs_dump(struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++{ ++ int ret; ++ unsigned int cpu; ++ ++ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); ++ printk("sysctl_ibrs_enabled = %u, sysctl_ibpb_enabled = %u\n", sysctl_ibrs_enabled, sysctl_ibpb_enabled); ++ printk("use_ibrs = %d, use_ibpb = %d\n", use_ibrs, use_ibpb); ++ for_each_online_cpu(cpu) { ++ u64 val; ++ ++ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) ++ rdmsrl_on_cpu(cpu, MSR_IA32_SPEC_CTRL, &val); ++ else ++ val = 0; ++ printk("read cpu %d ibrs val %lu\n", cpu, (unsigned long) val); ++ } ++ return ret; ++} ++ ++int proc_dointvec_ibrs_ctrl(struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++{ ++ int ret; ++ unsigned int cpu; ++ ++ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); ++ pr_debug("sysctl_ibrs_enabled = %u, sysctl_ibpb_enabled = %u\n", sysctl_ibrs_enabled, sysctl_ibpb_enabled); ++ pr_debug("before:use_ibrs = %d, use_ibpb = %d\n", use_ibrs, use_ibpb); ++ if (sysctl_ibrs_enabled == 0) { ++ /* always set IBRS off */ ++ set_ibrs_disabled(); ++ if (ibrs_supported) { ++ for_each_online_cpu(cpu) ++ wrmsrl_on_cpu(cpu, MSR_IA32_SPEC_CTRL, 0x0); ++ } ++ } else if (sysctl_ibrs_enabled == 2) { ++ /* always set IBRS on, even in user space */ ++ clear_ibrs_disabled(); ++ if (ibrs_supported) { ++ for_each_online_cpu(cpu) ++ wrmsrl_on_cpu(cpu, MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS); ++ } else { ++ sysctl_ibrs_enabled = 0; ++ } ++ } else if (sysctl_ibrs_enabled == 1) { ++ /* use IBRS in kernel */ ++ clear_ibrs_disabled(); ++ if (!ibrs_inuse) ++ /* platform don't support ibrs */ ++ sysctl_ibrs_enabled = 0; ++ } ++ pr_debug("after:use_ibrs = %d, use_ibpb = %d\n", use_ibrs, use_ibpb); ++ return ret; ++} ++ ++int
proc_dointvec_ibpb_ctrl(struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++{ ++ int ret; ++ ++ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); ++ pr_debug("sysctl_ibrs_enabled = %u, sysctl_ibpb_enabled = %u\n", sysctl_ibrs_enabled, sysctl_ibpb_enabled); ++ pr_debug("before:use_ibrs = %d, use_ibpb = %d\n", use_ibrs, use_ibpb); ++ if (sysctl_ibpb_enabled == 0) ++ set_ibpb_disabled(); ++ else if (sysctl_ibpb_enabled == 1) { ++ clear_ibpb_disabled(); ++ if (!ibpb_inuse) ++ /* platform don't support ibpb */ ++ sysctl_ibpb_enabled = 0; ++ } ++ pr_debug("after:use_ibrs = %d, use_ibpb = %d\n", use_ibrs, use_ibpb); ++ return ret; ++} ++#endif ++ ++ + struct do_proc_douintvec_minmax_conv_param { + unsigned int *min; + unsigned int *max; +-- +2.14.2 + diff --git a/patches/kernel/0288-x86-spec_ctrl-Add-lock-to-serialize-changes-to-ibrs-.patch b/patches/kernel/0288-x86-spec_ctrl-Add-lock-to-serialize-changes-to-ibrs-.patch new file mode 100644 index 0000000..ac1928c --- /dev/null +++ b/patches/kernel/0288-x86-spec_ctrl-Add-lock-to-serialize-changes-to-ibrs-.patch @@ -0,0 +1,166 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Tim Chen +Date: Mon, 20 Nov 2017 13:47:54 -0800 +Subject: [PATCH] x86/spec_ctrl: Add lock to serialize changes to ibrs and ibpb + control +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +Signed-off-by: Tim Chen +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 85789933bc45a3e763823675bd0d80e3e617f234) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/cpu/intel.c | 22 ++++++++++++---------- + arch/x86/kernel/cpu/microcode/core.c | 2 ++ + kernel/smp.c | 4 ++++ + kernel/sysctl.c | 14 +++++++++++++- + 4 files changed, 31 insertions(+), 11 deletions(-) + +diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c +index 
c69ea2efbed1..8d558e24783c 100644 +--- a/arch/x86/kernel/cpu/intel.c ++++ b/arch/x86/kernel/cpu/intel.c +@@ -628,16 +628,18 @@ static void init_intel(struct cpuinfo_x86 *c) + + init_intel_misc_features(c); + +- if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) { +- printk_once(KERN_INFO "FEATURE SPEC_CTRL Present\n"); +- set_ibrs_supported(); +- set_ibpb_supported(); +- if (ibrs_inuse) +- sysctl_ibrs_enabled = 1; +- if (ibpb_inuse) +- sysctl_ibpb_enabled = 1; +- } else { +- printk_once(KERN_INFO "FEATURE SPEC_CTRL Not Present\n"); ++ if (!c->cpu_index) { ++ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) { ++ printk(KERN_INFO "FEATURE SPEC_CTRL Present\n"); ++ set_ibrs_supported(); ++ set_ibpb_supported(); ++ if (ibrs_inuse) ++ sysctl_ibrs_enabled = 1; ++ if (ibpb_inuse) ++ sysctl_ibpb_enabled = 1; ++ } else { ++ printk(KERN_INFO "FEATURE SPEC_CTRL Not Present\n"); ++ } + } + } + +diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c +index 6450aeda72fc..55086921d29e 100644 +--- a/arch/x86/kernel/cpu/microcode/core.c ++++ b/arch/x86/kernel/cpu/microcode/core.c +@@ -538,12 +538,14 @@ static ssize_t reload_store(struct device *dev, + + if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) { + printk_once(KERN_INFO "FEATURE SPEC_CTRL Present\n"); ++ mutex_lock(&spec_ctrl_mutex); + set_ibrs_supported(); + set_ibpb_supported(); + if (ibrs_inuse) + sysctl_ibrs_enabled = 1; + if (ibpb_inuse) + sysctl_ibpb_enabled = 1; ++ mutex_unlock(&spec_ctrl_mutex); + } + + mutex_unlock(&microcode_mutex); +diff --git a/kernel/smp.c b/kernel/smp.c +index 3bece045f4a4..a224ec0c540c 100644 +--- a/kernel/smp.c ++++ b/kernel/smp.c +@@ -519,6 +519,10 @@ int use_ibpb; + EXPORT_SYMBOL(use_ibpb); + #endif + ++/* mutex to serialize IBRS & IBPB control changes */ ++DEFINE_MUTEX(spec_ctrl_mutex); ++EXPORT_SYMBOL(spec_ctrl_mutex); ++ + /* + * Setup routine for controlling SMP activation + * +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index 69c37bd6251a..47a37792109d 100644 +---
a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -69,6 +69,7 @@ + #include + + #include ++#include + #include + + #ifdef CONFIG_X86 +@@ -2634,12 +2635,17 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, + int proc_dointvec_ibrs_dump(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) + { +- int ret; ++ int ret, orig_inuse; + unsigned int cpu; + ++ + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + printk("sysctl_ibrs_enabled = %u, sysctl_ibpb_enabled = %u\n", sysctl_ibrs_enabled, sysctl_ibpb_enabled); + printk("use_ibrs = %d, use_ibpb = %d\n", use_ibrs, use_ibpb); ++ mutex_lock(&spec_ctrl_mutex); ++ orig_inuse = use_ibrs; ++ /* temporary halt to ibrs usage to dump ibrs values */ ++ clear_ibrs_inuse(); + for_each_online_cpu(cpu) { + u64 val; + +@@ -2649,6 +2655,8 @@ int proc_dointvec_ibrs_dump(struct ctl_table *table, int write, + val = 0; + printk("read cpu %d ibrs val %lu\n", cpu, (unsigned long) val); + } ++ use_ibrs = orig_inuse; ++ mutex_unlock(&spec_ctrl_mutex); + return ret; + } + +@@ -2661,6 +2669,7 @@ int proc_dointvec_ibrs_ctrl(struct ctl_table *table, int write, + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + pr_debug("sysctl_ibrs_enabled = %u, sysctl_ibpb_enabled = %u\n", sysctl_ibrs_enabled, sysctl_ibpb_enabled); + pr_debug("before:use_ibrs = %d, use_ibpb = %d\n", use_ibrs, use_ibpb); ++ mutex_lock(&spec_ctrl_mutex); + if (sysctl_ibrs_enabled == 0) { + /* always set IBRS off */ + set_ibrs_disabled(); +@@ -2684,6 +2693,7 @@ int proc_dointvec_ibrs_ctrl(struct ctl_table *table, int write, + /* platform don't support ibrs */ + sysctl_ibrs_enabled = 0; + } ++ mutex_unlock(&spec_ctrl_mutex); + pr_debug("after:use_ibrs = %d, use_ibpb = %d\n", use_ibrs, use_ibpb); + return ret; + } +@@ -2696,6 +2706,7 @@ int proc_dointvec_ibpb_ctrl(struct ctl_table *table, int write, + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + pr_debug("sysctl_ibrs_enabled = %u, 
sysctl_ibpb_enabled = %u\n", sysctl_ibrs_enabled, sysctl_ibpb_enabled); + pr_debug("before:use_ibrs = %d, use_ibpb = %d\n", use_ibrs, use_ibpb); ++ mutex_lock(&spec_ctrl_mutex); + if (sysctl_ibpb_enabled == 0) + set_ibpb_disabled(); + else if (sysctl_ibpb_enabled == 1) { +@@ -2704,6 +2715,7 @@ int proc_dointvec_ibpb_ctrl(struct ctl_table *table, int write, + /* platform don't support ibpb */ + sysctl_ibpb_enabled = 0; + } ++ mutex_unlock(&spec_ctrl_mutex); + pr_debug("after:use_ibrs = %d, use_ibpb = %d\n", use_ibrs, use_ibpb); + return ret; + } +-- +2.14.2 + diff --git a/patches/kernel/0288-x86-syscall-Clear-unused-extra-registers-on-syscall-.patch b/patches/kernel/0288-x86-syscall-Clear-unused-extra-registers-on-syscall-.patch deleted file mode 100644 index c1ab5a1..0000000 --- a/patches/kernel/0288-x86-syscall-Clear-unused-extra-registers-on-syscall-.patch +++ /dev/null @@ -1,94 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Tim Chen -Date: Tue, 19 Sep 2017 15:21:40 -0700 -Subject: [PATCH] x86/syscall: Clear unused extra registers on syscall entrance -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -To prevent the unused registers %r12-%r15, %rbp and %rbx from -being used speculatively, we clear them upon syscall entrance -for code hygiene. 
- -Signed-off-by: Tim Chen -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 7b5ea16f42b5e4860cf9033897bcdfa3e1209033) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/entry/calling.h | 9 +++++++++ - arch/x86/entry/entry_64.S | 12 ++++++++---- - 2 files changed, 17 insertions(+), 4 deletions(-) - -diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h -index 015e0a84bb99..d537818ad285 100644 ---- a/arch/x86/entry/calling.h -+++ b/arch/x86/entry/calling.h -@@ -155,6 +155,15 @@ For 32-bit we have the following conventions - kernel is built with - popq %rbx - .endm - -+ .macro CLEAR_EXTRA_REGS -+ xorq %r15, %r15 -+ xorq %r14, %r14 -+ xorq %r13, %r13 -+ xorq %r12, %r12 -+ xorq %rbp, %rbp -+ xorq %rbx, %rbx -+ .endm -+ - .macro POP_C_REGS - popq %r11 - popq %r10 -diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S -index f6ec4ad5b114..1118a6256c69 100644 ---- a/arch/x86/entry/entry_64.S -+++ b/arch/x86/entry/entry_64.S -@@ -231,10 +231,16 @@ GLOBAL(entry_SYSCALL_64_after_hwframe) - pushq %r9 /* pt_regs->r9 */ - pushq %r10 /* pt_regs->r10 */ - pushq %r11 /* pt_regs->r11 */ -- sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ -+ sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not used */ - UNWIND_HINT_REGS extra=0 - - ENABLE_IBRS -+ /* -+ * Clear the unused extra regs for code hygiene. -+ * Will restore the callee saved extra regs at end of syscall. 
-+ */ -+ SAVE_EXTRA_REGS -+ CLEAR_EXTRA_REGS - - STUFF_RSB - -@@ -292,7 +298,7 @@ entry_SYSCALL_64_fastpath: - movq RIP(%rsp), %rcx - movq EFLAGS(%rsp), %r11 - DISABLE_IBRS -- addq $6*8, %rsp /* skip extra regs -- they were preserved */ -+ POP_EXTRA_REGS - UNWIND_HINT_EMPTY - jmp .Lpop_c_regs_except_rcx_r11_and_sysret - -@@ -304,14 +310,12 @@ entry_SYSCALL_64_fastpath: - */ - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_ANY) -- SAVE_EXTRA_REGS - movq %rsp, %rdi - call syscall_return_slowpath /* returns with IRQs disabled */ - jmp return_from_SYSCALL_64 - - entry_SYSCALL64_slow_path: - /* IRQs are off. */ -- SAVE_EXTRA_REGS - movq %rsp, %rdi - call do_syscall_64 /* returns with IRQs disabled */ - --- -2.14.2 - diff --git a/patches/kernel/0289-x86-syscall-Clear-unused-extra-registers-on-32-bit-c.patch b/patches/kernel/0289-x86-syscall-Clear-unused-extra-registers-on-32-bit-c.patch deleted file mode 100644 index e6f6cbc..0000000 --- a/patches/kernel/0289-x86-syscall-Clear-unused-extra-registers-on-32-bit-c.patch +++ /dev/null @@ -1,101 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Tim Chen -Date: Fri, 15 Sep 2017 19:41:24 -0700 -Subject: [PATCH] x86/syscall: Clear unused extra registers on 32-bit - compatible syscall entrance -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -To prevent the unused registers %r8-%r15, from being used speculatively, -we clear them upon syscall entrance for code hygiene in 32 bit compatible -mode. 
- -Signed-off-by: Tim Chen -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 725ad2ef81ccceb3e31a7263faae2059d05e2c48) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/entry/calling.h | 11 +++++++++++ - arch/x86/entry/entry_64_compat.S | 18 ++++++++++++++---- - 2 files changed, 25 insertions(+), 4 deletions(-) - -diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h -index d537818ad285..0e34002bc801 100644 ---- a/arch/x86/entry/calling.h -+++ b/arch/x86/entry/calling.h -@@ -155,6 +155,17 @@ For 32-bit we have the following conventions - kernel is built with - popq %rbx - .endm - -+ .macro CLEAR_R8_TO_R15 -+ xorq %r15, %r15 -+ xorq %r14, %r14 -+ xorq %r13, %r13 -+ xorq %r12, %r12 -+ xorq %r11, %r11 -+ xorq %r10, %r10 -+ xorq %r9, %r9 -+ xorq %r8, %r8 -+ .endm -+ - .macro CLEAR_EXTRA_REGS - xorq %r15, %r15 - xorq %r14, %r14 -diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S -index 1480222bae02..8d7ae9657375 100644 ---- a/arch/x86/entry/entry_64_compat.S -+++ b/arch/x86/entry/entry_64_compat.S -@@ -99,6 +99,8 @@ ENTRY(entry_SYSENTER_compat) - ENABLE_IBRS - STUFF_RSB - -+ CLEAR_R8_TO_R15 -+ - /* - * SYSENTER doesn't filter flags, so we need to clear NT and AC - * ourselves. To save a few cycles, we can check whether -@@ -223,10 +225,12 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe) - pushq $0 /* pt_regs->r11 = 0 */ - pushq %rbx /* pt_regs->rbx */ - pushq %rbp /* pt_regs->rbp (will be overwritten) */ -- pushq $0 /* pt_regs->r12 = 0 */ -- pushq $0 /* pt_regs->r13 = 0 */ -- pushq $0 /* pt_regs->r14 = 0 */ -- pushq $0 /* pt_regs->r15 = 0 */ -+ pushq %r12 /* pt_regs->r12 */ -+ pushq %r13 /* pt_regs->r13 */ -+ pushq %r14 /* pt_regs->r14 */ -+ pushq %r15 /* pt_regs->r15 */ -+ -+ CLEAR_R8_TO_R15 - - STUFF_RSB - -@@ -245,6 +249,10 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe) - /* Opportunistic SYSRET */ - sysret32_from_system_call: - TRACE_IRQS_ON /* User mode traces as IRQs on. 
*/ -+ movq R15(%rsp), %r15 /* pt_regs->r15 */ -+ movq R14(%rsp), %r14 /* pt_regs->r14 */ -+ movq R13(%rsp), %r13 /* pt_regs->r13 */ -+ movq R12(%rsp), %r12 /* pt_regs->r12 */ - movq RBX(%rsp), %rbx /* pt_regs->rbx */ - movq RBP(%rsp), %rbp /* pt_regs->rbp */ - movq EFLAGS(%rsp), %r11 /* pt_regs->flags (in r11) */ -@@ -359,6 +367,8 @@ ENTRY(entry_INT80_compat) - ENABLE_IBRS - STUFF_RSB - -+ CLEAR_R8_TO_R15 -+ - /* - * User mode is traced as though IRQs are on, and the interrupt - * gate turned them off. --- -2.14.2 - diff --git a/patches/kernel/0289-x86-syscall-Clear-unused-extra-registers-on-syscall-.patch b/patches/kernel/0289-x86-syscall-Clear-unused-extra-registers-on-syscall-.patch new file mode 100644 index 0000000..c1ab5a1 --- /dev/null +++ b/patches/kernel/0289-x86-syscall-Clear-unused-extra-registers-on-syscall-.patch @@ -0,0 +1,94 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Tim Chen +Date: Tue, 19 Sep 2017 15:21:40 -0700 +Subject: [PATCH] x86/syscall: Clear unused extra registers on syscall entrance +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +To prevent the unused registers %r12-%r15, %rbp and %rbx from +being used speculatively, we clear them upon syscall entrance +for code hygiene. 
+ +Signed-off-by: Tim Chen +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 7b5ea16f42b5e4860cf9033897bcdfa3e1209033) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/entry/calling.h | 9 +++++++++ + arch/x86/entry/entry_64.S | 12 ++++++++---- + 2 files changed, 17 insertions(+), 4 deletions(-) + +diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h +index 015e0a84bb99..d537818ad285 100644 +--- a/arch/x86/entry/calling.h ++++ b/arch/x86/entry/calling.h +@@ -155,6 +155,15 @@ For 32-bit we have the following conventions - kernel is built with + popq %rbx + .endm + ++ .macro CLEAR_EXTRA_REGS ++ xorq %r15, %r15 ++ xorq %r14, %r14 ++ xorq %r13, %r13 ++ xorq %r12, %r12 ++ xorq %rbp, %rbp ++ xorq %rbx, %rbx ++ .endm ++ + .macro POP_C_REGS + popq %r11 + popq %r10 +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index f6ec4ad5b114..1118a6256c69 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -231,10 +231,16 @@ GLOBAL(entry_SYSCALL_64_after_hwframe) + pushq %r9 /* pt_regs->r9 */ + pushq %r10 /* pt_regs->r10 */ + pushq %r11 /* pt_regs->r11 */ +- sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ ++ sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not used */ + UNWIND_HINT_REGS extra=0 + + ENABLE_IBRS ++ /* ++ * Clear the unused extra regs for code hygiene. ++ * Will restore the callee saved extra regs at end of syscall. 
++ */ ++ SAVE_EXTRA_REGS ++ CLEAR_EXTRA_REGS + + STUFF_RSB + +@@ -292,7 +298,7 @@ entry_SYSCALL_64_fastpath: + movq RIP(%rsp), %rcx + movq EFLAGS(%rsp), %r11 + DISABLE_IBRS +- addq $6*8, %rsp /* skip extra regs -- they were preserved */ ++ POP_EXTRA_REGS + UNWIND_HINT_EMPTY + jmp .Lpop_c_regs_except_rcx_r11_and_sysret + +@@ -304,14 +310,12 @@ entry_SYSCALL_64_fastpath: + */ + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_ANY) +- SAVE_EXTRA_REGS + movq %rsp, %rdi + call syscall_return_slowpath /* returns with IRQs disabled */ + jmp return_from_SYSCALL_64 + + entry_SYSCALL64_slow_path: + /* IRQs are off. */ +- SAVE_EXTRA_REGS + movq %rsp, %rdi + call do_syscall_64 /* returns with IRQs disabled */ + +-- +2.14.2 + diff --git a/patches/kernel/0290-x86-entry-Use-retpoline-for-syscall-s-indirect-calls.patch b/patches/kernel/0290-x86-entry-Use-retpoline-for-syscall-s-indirect-calls.patch deleted file mode 100644 index 831c137..0000000 --- a/patches/kernel/0290-x86-entry-Use-retpoline-for-syscall-s-indirect-calls.patch +++ /dev/null @@ -1,44 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Tim Chen -Date: Wed, 8 Nov 2017 16:30:06 -0800 -Subject: [PATCH] x86/entry: Use retpoline for syscall's indirect calls -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -Signed-off-by: Tim Chen -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit d2e0236f395e876f5303fb5021e4fe6eea881402) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/entry/entry_64.S | 10 +++++++++- - 1 file changed, 9 insertions(+), 1 deletion(-) - -diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S -index 1118a6256c69..be7196967f9f 100644 ---- a/arch/x86/entry/entry_64.S -+++ b/arch/x86/entry/entry_64.S -@@ -276,7 +276,15 @@ entry_SYSCALL_64_fastpath: - * It might end up jumping to the slow path. 
If it jumps, RAX - * and all argument registers are clobbered. - */ -- call *sys_call_table(, %rax, 8) -+ movq sys_call_table(, %rax, 8), %r10 -+ jmp 1f -+4: callq 2f -+3: nop -+ jmp 3b -+2: mov %r10, (%rsp) -+ retq -+1: callq 4b -+ - .Lentry_SYSCALL_64_after_fastpath_call: - - movq %rax, RAX(%rsp) --- -2.14.2 - diff --git a/patches/kernel/0290-x86-syscall-Clear-unused-extra-registers-on-32-bit-c.patch b/patches/kernel/0290-x86-syscall-Clear-unused-extra-registers-on-32-bit-c.patch new file mode 100644 index 0000000..e6f6cbc --- /dev/null +++ b/patches/kernel/0290-x86-syscall-Clear-unused-extra-registers-on-32-bit-c.patch @@ -0,0 +1,101 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Tim Chen +Date: Fri, 15 Sep 2017 19:41:24 -0700 +Subject: [PATCH] x86/syscall: Clear unused extra registers on 32-bit + compatible syscall entrance +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +To prevent the unused registers %r8-%r15, from being used speculatively, +we clear them upon syscall entrance for code hygiene in 32 bit compatible +mode. 
+ +Signed-off-by: Tim Chen +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 725ad2ef81ccceb3e31a7263faae2059d05e2c48) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/entry/calling.h | 11 +++++++++++ + arch/x86/entry/entry_64_compat.S | 18 ++++++++++++++---- + 2 files changed, 25 insertions(+), 4 deletions(-) + +diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h +index d537818ad285..0e34002bc801 100644 +--- a/arch/x86/entry/calling.h ++++ b/arch/x86/entry/calling.h +@@ -155,6 +155,17 @@ For 32-bit we have the following conventions - kernel is built with + popq %rbx + .endm + ++ .macro CLEAR_R8_TO_R15 ++ xorq %r15, %r15 ++ xorq %r14, %r14 ++ xorq %r13, %r13 ++ xorq %r12, %r12 ++ xorq %r11, %r11 ++ xorq %r10, %r10 ++ xorq %r9, %r9 ++ xorq %r8, %r8 ++ .endm ++ + .macro CLEAR_EXTRA_REGS + xorq %r15, %r15 + xorq %r14, %r14 +diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S +index 1480222bae02..8d7ae9657375 100644 +--- a/arch/x86/entry/entry_64_compat.S ++++ b/arch/x86/entry/entry_64_compat.S +@@ -99,6 +99,8 @@ ENTRY(entry_SYSENTER_compat) + ENABLE_IBRS + STUFF_RSB + ++ CLEAR_R8_TO_R15 ++ + /* + * SYSENTER doesn't filter flags, so we need to clear NT and AC + * ourselves. To save a few cycles, we can check whether +@@ -223,10 +225,12 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe) + pushq $0 /* pt_regs->r11 = 0 */ + pushq %rbx /* pt_regs->rbx */ + pushq %rbp /* pt_regs->rbp (will be overwritten) */ +- pushq $0 /* pt_regs->r12 = 0 */ +- pushq $0 /* pt_regs->r13 = 0 */ +- pushq $0 /* pt_regs->r14 = 0 */ +- pushq $0 /* pt_regs->r15 = 0 */ ++ pushq %r12 /* pt_regs->r12 */ ++ pushq %r13 /* pt_regs->r13 */ ++ pushq %r14 /* pt_regs->r14 */ ++ pushq %r15 /* pt_regs->r15 */ ++ ++ CLEAR_R8_TO_R15 + + STUFF_RSB + +@@ -245,6 +249,10 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe) + /* Opportunistic SYSRET */ + sysret32_from_system_call: + TRACE_IRQS_ON /* User mode traces as IRQs on. 
*/ ++ movq R15(%rsp), %r15 /* pt_regs->r15 */ ++ movq R14(%rsp), %r14 /* pt_regs->r14 */ ++ movq R13(%rsp), %r13 /* pt_regs->r13 */ ++ movq R12(%rsp), %r12 /* pt_regs->r12 */ + movq RBX(%rsp), %rbx /* pt_regs->rbx */ + movq RBP(%rsp), %rbp /* pt_regs->rbp */ + movq EFLAGS(%rsp), %r11 /* pt_regs->flags (in r11) */ +@@ -359,6 +367,8 @@ ENTRY(entry_INT80_compat) + ENABLE_IBRS + STUFF_RSB + ++ CLEAR_R8_TO_R15 ++ + /* + * User mode is traced as though IRQs are on, and the interrupt + * gate turned them off. +-- +2.14.2 + diff --git a/patches/kernel/0291-x86-cpu-AMD-Add-speculative-control-support-for-AMD.patch b/patches/kernel/0291-x86-cpu-AMD-Add-speculative-control-support-for-AMD.patch deleted file mode 100644 index febc693..0000000 --- a/patches/kernel/0291-x86-cpu-AMD-Add-speculative-control-support-for-AMD.patch +++ /dev/null @@ -1,112 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Tom Lendacky -Date: Wed, 20 Dec 2017 10:52:54 +0000 -Subject: [PATCH] x86/cpu/AMD: Add speculative control support for AMD -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -Add speculative control support for AMD processors. For AMD, speculative -control is indicated as follows: - - CPUID EAX=0x00000007, ECX=0x00 return EDX[26] indicates support for - both IBRS and IBPB. - - CPUID EAX=0x80000008, ECX=0x00 return EBX[12] indicates support for - just IBPB. - -On AMD family 0x10, 0x12 and 0x16 processors where either of the above -features are not supported, IBPB can be achieved by disabling -indirect branch predictor support in MSR 0xc0011021[14] at boot. 
- -Signed-off-by: Tom Lendacky -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 8c3fc9e98177daee2281ed40e3d61f9cf4eee576) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/cpufeatures.h | 1 + - arch/x86/include/asm/msr-index.h | 1 + - arch/x86/kernel/cpu/amd.c | 39 ++++++++++++++++++++++++++++++++++++++ - 3 files changed, 41 insertions(+) - -diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h -index 44be8fd069bf..a97b327137aa 100644 ---- a/arch/x86/include/asm/cpufeatures.h -+++ b/arch/x86/include/asm/cpufeatures.h -@@ -268,6 +268,7 @@ - #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ - #define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */ - #define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */ -+#define X86_FEATURE_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */ - - /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ - #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ -diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h -index 4e3438a00a50..954aad6c32f4 100644 ---- a/arch/x86/include/asm/msr-index.h -+++ b/arch/x86/include/asm/msr-index.h -@@ -345,6 +345,7 @@ - #define MSR_F15H_NB_PERF_CTR 0xc0010241 - #define MSR_F15H_PTSC 0xc0010280 - #define MSR_F15H_IC_CFG 0xc0011021 -+#define MSR_F15H_IC_CFG_DIS_IND BIT_ULL(14) - - /* Fam 10h MSRs */ - #define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058 -diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c -index 99eef4a09fd9..42871c1a8da8 100644 ---- a/arch/x86/kernel/cpu/amd.c -+++ b/arch/x86/kernel/cpu/amd.c -@@ -830,6 +830,45 @@ static void init_amd(struct cpuinfo_x86 *c) - /* AMD CPUs don't reset SS attributes on SYSRET, Xen does. 
*/ - if (!cpu_has(c, X86_FEATURE_XENPV)) - set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); -+ -+ /* AMD speculative control support */ -+ if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) { -+ pr_info_once("FEATURE SPEC_CTRL Present\n"); -+ set_ibrs_supported(); -+ set_ibpb_supported(); -+ if (ibrs_inuse) -+ sysctl_ibrs_enabled = 1; -+ if (ibpb_inuse) -+ sysctl_ibpb_enabled = 1; -+ } else if (cpu_has(c, X86_FEATURE_IBPB)) { -+ pr_info_once("FEATURE SPEC_CTRL Not Present\n"); -+ pr_info_once("FEATURE IBPB Present\n"); -+ set_ibpb_supported(); -+ if (ibpb_inuse) -+ sysctl_ibpb_enabled = 1; -+ } else { -+ pr_info_once("FEATURE SPEC_CTRL Not Present\n"); -+ pr_info_once("FEATURE IBPB Not Present\n"); -+ /* -+ * On AMD processors that do not support the speculative -+ * control features, IBPB type support can be achieved by -+ * disabling indirect branch predictor support. -+ */ -+ if (!ibpb_disabled) { -+ u64 val; -+ -+ switch (c->x86) { -+ case 0x10: -+ case 0x12: -+ case 0x16: -+ pr_info_once("Disabling indirect branch predictor support\n"); -+ rdmsrl(MSR_F15H_IC_CFG, val); -+ val |= MSR_F15H_IC_CFG_DIS_IND; -+ wrmsrl(MSR_F15H_IC_CFG, val); -+ break; -+ } -+ } -+ } - } - - #ifdef CONFIG_X86_32 --- -2.14.2 - diff --git a/patches/kernel/0291-x86-entry-Use-retpoline-for-syscall-s-indirect-calls.patch b/patches/kernel/0291-x86-entry-Use-retpoline-for-syscall-s-indirect-calls.patch new file mode 100644 index 0000000..831c137 --- /dev/null +++ b/patches/kernel/0291-x86-entry-Use-retpoline-for-syscall-s-indirect-calls.patch @@ -0,0 +1,44 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Tim Chen +Date: Wed, 8 Nov 2017 16:30:06 -0800 +Subject: [PATCH] x86/entry: Use retpoline for syscall's indirect calls +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +Signed-off-by: Tim Chen +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 
d2e0236f395e876f5303fb5021e4fe6eea881402) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/entry/entry_64.S | 10 +++++++++- + 1 file changed, 9 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index 1118a6256c69..be7196967f9f 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -276,7 +276,15 @@ entry_SYSCALL_64_fastpath: + * It might end up jumping to the slow path. If it jumps, RAX + * and all argument registers are clobbered. + */ +- call *sys_call_table(, %rax, 8) ++ movq sys_call_table(, %rax, 8), %r10 ++ jmp 1f ++4: callq 2f ++3: nop ++ jmp 3b ++2: mov %r10, (%rsp) ++ retq ++1: callq 4b ++ + .Lentry_SYSCALL_64_after_fastpath_call: + + movq %rax, RAX(%rsp) +-- +2.14.2 + diff --git a/patches/kernel/0292-x86-cpu-AMD-Add-speculative-control-support-for-AMD.patch b/patches/kernel/0292-x86-cpu-AMD-Add-speculative-control-support-for-AMD.patch new file mode 100644 index 0000000..febc693 --- /dev/null +++ b/patches/kernel/0292-x86-cpu-AMD-Add-speculative-control-support-for-AMD.patch @@ -0,0 +1,112 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Tom Lendacky +Date: Wed, 20 Dec 2017 10:52:54 +0000 +Subject: [PATCH] x86/cpu/AMD: Add speculative control support for AMD +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +Add speculative control support for AMD processors. For AMD, speculative +control is indicated as follows: + + CPUID EAX=0x00000007, ECX=0x00 return EDX[26] indicates support for + both IBRS and IBPB. + + CPUID EAX=0x80000008, ECX=0x00 return EBX[12] indicates support for + just IBPB. + +On AMD family 0x10, 0x12 and 0x16 processors where either of the above +features are not supported, IBPB can be achieved by disabling +indirect branch predictor support in MSR 0xc0011021[14] at boot. 
+ +Signed-off-by: Tom Lendacky +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 8c3fc9e98177daee2281ed40e3d61f9cf4eee576) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/cpufeatures.h | 1 + + arch/x86/include/asm/msr-index.h | 1 + + arch/x86/kernel/cpu/amd.c | 39 ++++++++++++++++++++++++++++++++++++++ + 3 files changed, 41 insertions(+) + +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index 44be8fd069bf..a97b327137aa 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -268,6 +268,7 @@ + #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ + #define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */ + #define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */ ++#define X86_FEATURE_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */ + + /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ + #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ +diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h +index 4e3438a00a50..954aad6c32f4 100644 +--- a/arch/x86/include/asm/msr-index.h ++++ b/arch/x86/include/asm/msr-index.h +@@ -345,6 +345,7 @@ + #define MSR_F15H_NB_PERF_CTR 0xc0010241 + #define MSR_F15H_PTSC 0xc0010280 + #define MSR_F15H_IC_CFG 0xc0011021 ++#define MSR_F15H_IC_CFG_DIS_IND BIT_ULL(14) + + /* Fam 10h MSRs */ + #define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058 +diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c +index 99eef4a09fd9..42871c1a8da8 100644 +--- a/arch/x86/kernel/cpu/amd.c ++++ b/arch/x86/kernel/cpu/amd.c +@@ -830,6 +830,45 @@ static void init_amd(struct cpuinfo_x86 *c) + /* AMD CPUs don't reset SS attributes on SYSRET, Xen does. 
*/ + if (!cpu_has(c, X86_FEATURE_XENPV)) + set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); ++ ++ /* AMD speculative control support */ ++ if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) { ++ pr_info_once("FEATURE SPEC_CTRL Present\n"); ++ set_ibrs_supported(); ++ set_ibpb_supported(); ++ if (ibrs_inuse) ++ sysctl_ibrs_enabled = 1; ++ if (ibpb_inuse) ++ sysctl_ibpb_enabled = 1; ++ } else if (cpu_has(c, X86_FEATURE_IBPB)) { ++ pr_info_once("FEATURE SPEC_CTRL Not Present\n"); ++ pr_info_once("FEATURE IBPB Present\n"); ++ set_ibpb_supported(); ++ if (ibpb_inuse) ++ sysctl_ibpb_enabled = 1; ++ } else { ++ pr_info_once("FEATURE SPEC_CTRL Not Present\n"); ++ pr_info_once("FEATURE IBPB Not Present\n"); ++ /* ++ * On AMD processors that do not support the speculative ++ * control features, IBPB type support can be achieved by ++ * disabling indirect branch predictor support. ++ */ ++ if (!ibpb_disabled) { ++ u64 val; ++ ++ switch (c->x86) { ++ case 0x10: ++ case 0x12: ++ case 0x16: ++ pr_info_once("Disabling indirect branch predictor support\n"); ++ rdmsrl(MSR_F15H_IC_CFG, val); ++ val |= MSR_F15H_IC_CFG_DIS_IND; ++ wrmsrl(MSR_F15H_IC_CFG, val); ++ break; ++ } ++ } ++ } + } + + #ifdef CONFIG_X86_32 +-- +2.14.2 + diff --git a/patches/kernel/0292-x86-microcode-Extend-post-microcode-reload-to-suppor.patch b/patches/kernel/0292-x86-microcode-Extend-post-microcode-reload-to-suppor.patch deleted file mode 100644 index eef6684..0000000 --- a/patches/kernel/0292-x86-microcode-Extend-post-microcode-reload-to-suppor.patch +++ /dev/null @@ -1,45 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Tom Lendacky -Date: Wed, 20 Dec 2017 10:55:47 +0000 -Subject: [PATCH] x86/microcode: Extend post microcode reload to support IBPB - feature -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -Add an IBPB feature check to the speculative control update check after -a microcode reload. 
- -Signed-off-by: Tom Lendacky -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 099878acd3738271fb2ade01f4649b1ed2fb72d5) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/cpu/microcode/core.c | 7 +++++++ - 1 file changed, 7 insertions(+) - -diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c -index 55086921d29e..638c08350d65 100644 ---- a/arch/x86/kernel/cpu/microcode/core.c -+++ b/arch/x86/kernel/cpu/microcode/core.c -@@ -546,6 +546,13 @@ static ssize_t reload_store(struct device *dev, - if (ibpb_inuse) - sysctl_ibpb_enabled = 1; - mutex_unlock(&spec_ctrl_mutex); -+ } else if (boot_cpu_has(X86_FEATURE_IBPB)) { -+ printk_once(KERN_INFO "FEATURE IBPB Present\n"); -+ mutex_lock(&spec_ctrl_mutex); -+ set_ibpb_supported(); -+ if (ibpb_inuse) -+ sysctl_ibpb_enabled = 1; -+ mutex_unlock(&spec_ctrl_mutex); - } - - mutex_unlock(&microcode_mutex); --- -2.14.2 - diff --git a/patches/kernel/0293-KVM-SVM-Do-not-intercept-new-speculative-control-MSR.patch b/patches/kernel/0293-KVM-SVM-Do-not-intercept-new-speculative-control-MSR.patch deleted file mode 100644 index b57fe4e..0000000 --- a/patches/kernel/0293-KVM-SVM-Do-not-intercept-new-speculative-control-MSR.patch +++ /dev/null @@ -1,40 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Tom Lendacky -Date: Wed, 20 Dec 2017 10:55:47 +0000 -Subject: [PATCH] KVM: SVM: Do not intercept new speculative control MSRs -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -Allow guest access to the speculative control MSRs without being -intercepted.
- -Signed-off-by: Paolo Bonzini -Signed-off-by: Tom Lendacky -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit ccaa77a824fd3e21f0b8ae6b5a66fc1ee7e35b14) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kvm/svm.c | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c -index 92cd94d51e1f..94adf6becc2e 100644 ---- a/arch/x86/kvm/svm.c -+++ b/arch/x86/kvm/svm.c -@@ -248,6 +248,8 @@ static const struct svm_direct_access_msrs { - { .index = MSR_CSTAR, .always = true }, - { .index = MSR_SYSCALL_MASK, .always = true }, - #endif -+ { .index = MSR_IA32_SPEC_CTRL, .always = true }, -+ { .index = MSR_IA32_PRED_CMD, .always = true }, - { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false }, - { .index = MSR_IA32_LASTBRANCHTOIP, .always = false }, - { .index = MSR_IA32_LASTINTFROMIP, .always = false }, --- -2.14.2 - diff --git a/patches/kernel/0293-x86-microcode-Extend-post-microcode-reload-to-suppor.patch b/patches/kernel/0293-x86-microcode-Extend-post-microcode-reload-to-suppor.patch new file mode 100644 index 0000000..eef6684 --- /dev/null +++ b/patches/kernel/0293-x86-microcode-Extend-post-microcode-reload-to-suppor.patch @@ -0,0 +1,45 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Tom Lendacky +Date: Wed, 20 Dec 2017 10:55:47 +0000 +Subject: [PATCH] x86/microcode: Extend post microcode reload to support IBPB + feature +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +Add an IBPB feature check to the speculative control update check after +a microcode reload. 
+ +Signed-off-by: Tom Lendacky +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 099878acd3738271fb2ade01f4649b1ed2fb72d5) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/cpu/microcode/core.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c +index 55086921d29e..638c08350d65 100644 +--- a/arch/x86/kernel/cpu/microcode/core.c ++++ b/arch/x86/kernel/cpu/microcode/core.c +@@ -546,6 +546,13 @@ static ssize_t reload_store(struct device *dev, + if (ibpb_inuse) + sysctl_ibpb_enabled = 1; + mutex_unlock(&spec_ctrl_mutex); ++ } else if (boot_cpu_has(X86_FEATURE_IBPB)) { ++ printk_once(KERN_INFO "FEATURE IBPB Present\n"); ++ mutex_lock(&spec_ctrl_mutex); ++ set_ibpb_supported(); ++ if (ibpb_inuse) ++ sysctl_ibpb_enabled = 1; ++ mutex_unlock(&spec_ctrl_mutex); + } + + mutex_unlock(&microcode_mutex); +-- +2.14.2 + diff --git a/patches/kernel/0294-KVM-SVM-Do-not-intercept-new-speculative-control-MSR.patch b/patches/kernel/0294-KVM-SVM-Do-not-intercept-new-speculative-control-MSR.patch new file mode 100644 index 0000000..b57fe4e --- /dev/null +++ b/patches/kernel/0294-KVM-SVM-Do-not-intercept-new-speculative-control-MSR.patch @@ -0,0 +1,40 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Tom Lendacky +Date: Wed, 20 Dec 2017 10:55:47 +0000 +Subject: [PATCH] KVM: SVM: Do not intercept new speculative control MSRs +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +Allow guest access to the speculative control MSRs without being +intercepted.
+ +Signed-off-by: Paolo Bonzini +Signed-off-by: Tom Lendacky +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit ccaa77a824fd3e21f0b8ae6b5a66fc1ee7e35b14) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kvm/svm.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c +index 92cd94d51e1f..94adf6becc2e 100644 +--- a/arch/x86/kvm/svm.c ++++ b/arch/x86/kvm/svm.c +@@ -248,6 +248,8 @@ static const struct svm_direct_access_msrs { + { .index = MSR_CSTAR, .always = true }, + { .index = MSR_SYSCALL_MASK, .always = true }, + #endif ++ { .index = MSR_IA32_SPEC_CTRL, .always = true }, ++ { .index = MSR_IA32_PRED_CMD, .always = true }, + { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false }, + { .index = MSR_IA32_LASTBRANCHTOIP, .always = false }, + { .index = MSR_IA32_LASTINTFROMIP, .always = false }, +-- +2.14.2 + diff --git a/patches/kernel/0294-x86-svm-Set-IBRS-value-on-VM-entry-and-exit.patch b/patches/kernel/0294-x86-svm-Set-IBRS-value-on-VM-entry-and-exit.patch deleted file mode 100644 index 880d9b4..0000000 --- a/patches/kernel/0294-x86-svm-Set-IBRS-value-on-VM-entry-and-exit.patch +++ /dev/null @@ -1,83 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Tom Lendacky -Date: Wed, 20 Dec 2017 10:55:47 +0000 -Subject: [PATCH] x86/svm: Set IBRS value on VM entry and exit -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -Set/restore the guests IBRS value on VM entry. On VM exit back to the -kernel save the guest IBRS value and then set IBRS to 1. 
- -Signed-off-by: Paolo Bonzini -Signed-off-by: Tom Lendacky -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 72f71e6826fac9a656c3994fb6f979cd65a14c64) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kvm/svm.c | 17 +++++++++++++++++ - 1 file changed, 17 insertions(+) - -diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c -index 94adf6becc2e..a1b19e810c49 100644 ---- a/arch/x86/kvm/svm.c -+++ b/arch/x86/kvm/svm.c -@@ -175,6 +175,8 @@ struct vcpu_svm { - - u64 next_rip; - -+ u64 spec_ctrl; -+ - u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; - struct { - u16 fs; -@@ -3547,6 +3549,9 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) - case MSR_VM_CR: - msr_info->data = svm->nested.vm_cr_msr; - break; -+ case MSR_IA32_SPEC_CTRL: -+ msr_info->data = svm->spec_ctrl; -+ break; - case MSR_IA32_UCODE_REV: - msr_info->data = 0x01000065; - break; -@@ -3702,6 +3707,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) - case MSR_VM_IGNNE: - vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); - break; -+ case MSR_IA32_SPEC_CTRL: -+ svm->spec_ctrl = data; -+ break; - case MSR_IA32_APICBASE: - if (kvm_vcpu_apicv_active(vcpu)) - avic_update_vapic_bar(to_svm(vcpu), data); -@@ -4883,6 +4891,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) - - local_irq_enable(); - -+ if (ibrs_inuse && (svm->spec_ctrl != FEATURE_ENABLE_IBRS)) -+ wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl); -+ - asm volatile ( - "push %%" _ASM_BP "; \n\t" - "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t" -@@ -4975,6 +4986,12 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) - #endif - ); - -+ if (ibrs_inuse) { -+ rdmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl); -+ if (svm->spec_ctrl != FEATURE_ENABLE_IBRS) -+ wrmsrl(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS); -+ } -+ - #ifdef CONFIG_X86_64 - wrmsrl(MSR_GS_BASE, svm->host.gs_base); - #else --- -2.14.2 - diff --git 
a/patches/kernel/0295-x86-svm-Set-IBPB-when-running-a-different-VCPU.patch b/patches/kernel/0295-x86-svm-Set-IBPB-when-running-a-different-VCPU.patch deleted file mode 100644 index 9b2262c..0000000 --- a/patches/kernel/0295-x86-svm-Set-IBPB-when-running-a-different-VCPU.patch +++ /dev/null @@ -1,73 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Tom Lendacky -Date: Wed, 20 Dec 2017 10:55:47 +0000 -Subject: [PATCH] x86/svm: Set IBPB when running a different VCPU -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -Set IBPB (Indirect Branch Prediction Barrier) when the current CPU is -going to run a VCPU different from what was previously run. - -Signed-off-by: Paolo Bonzini -Signed-off-by: Tom Lendacky -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 0ba3eaabbb6666ebd344ee80534e58c375a00810) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kvm/svm.c | 16 ++++++++++++++++ - 1 file changed, 16 insertions(+) - -diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c -index a1b19e810c49..fade4869856a 100644 ---- a/arch/x86/kvm/svm.c -+++ b/arch/x86/kvm/svm.c -@@ -518,6 +518,8 @@ struct svm_cpu_data { - struct kvm_ldttss_desc *tss_desc; - - struct page *save_area; -+ -+ struct vmcb *current_vmcb; - }; - - static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); -@@ -1685,11 +1687,19 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) - __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); - kvm_vcpu_uninit(vcpu); - kmem_cache_free(kvm_vcpu_cache, svm); -+ -+ /* -+ * The VMCB could be recycled, causing a false negative in svm_vcpu_load; -+ * block speculative execution. 
-+ */ -+ if (ibpb_inuse) -+ wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB); - } - - static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) - { - struct vcpu_svm *svm = to_svm(vcpu); -+ struct svm_cpu_data *sd = per_cpu(svm_data, cpu); - int i; - - if (unlikely(cpu != vcpu->cpu)) { -@@ -1718,6 +1728,12 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) - if (static_cpu_has(X86_FEATURE_RDTSCP)) - wrmsrl(MSR_TSC_AUX, svm->tsc_aux); - -+ if (sd->current_vmcb != svm->vmcb) { -+ sd->current_vmcb = svm->vmcb; -+ if (ibpb_inuse) -+ wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB); -+ } -+ - avic_vcpu_load(vcpu, cpu); - } - --- -2.14.2 - diff --git a/patches/kernel/0295-x86-svm-Set-IBRS-value-on-VM-entry-and-exit.patch b/patches/kernel/0295-x86-svm-Set-IBRS-value-on-VM-entry-and-exit.patch new file mode 100644 index 0000000..880d9b4 --- /dev/null +++ b/patches/kernel/0295-x86-svm-Set-IBRS-value-on-VM-entry-and-exit.patch @@ -0,0 +1,83 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Tom Lendacky +Date: Wed, 20 Dec 2017 10:55:47 +0000 +Subject: [PATCH] x86/svm: Set IBRS value on VM entry and exit +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +Set/restore the guests IBRS value on VM entry. On VM exit back to the +kernel save the guest IBRS value and then set IBRS to 1. 
+ +Signed-off-by: Paolo Bonzini +Signed-off-by: Tom Lendacky +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 72f71e6826fac9a656c3994fb6f979cd65a14c64) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kvm/svm.c | 17 +++++++++++++++++ + 1 file changed, 17 insertions(+) + +diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c +index 94adf6becc2e..a1b19e810c49 100644 +--- a/arch/x86/kvm/svm.c ++++ b/arch/x86/kvm/svm.c +@@ -175,6 +175,8 @@ struct vcpu_svm { + + u64 next_rip; + ++ u64 spec_ctrl; ++ + u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; + struct { + u16 fs; +@@ -3547,6 +3549,9 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) + case MSR_VM_CR: + msr_info->data = svm->nested.vm_cr_msr; + break; ++ case MSR_IA32_SPEC_CTRL: ++ msr_info->data = svm->spec_ctrl; ++ break; + case MSR_IA32_UCODE_REV: + msr_info->data = 0x01000065; + break; +@@ -3702,6 +3707,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) + case MSR_VM_IGNNE: + vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); + break; ++ case MSR_IA32_SPEC_CTRL: ++ svm->spec_ctrl = data; ++ break; + case MSR_IA32_APICBASE: + if (kvm_vcpu_apicv_active(vcpu)) + avic_update_vapic_bar(to_svm(vcpu), data); +@@ -4883,6 +4891,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) + + local_irq_enable(); + ++ if (ibrs_inuse && (svm->spec_ctrl != FEATURE_ENABLE_IBRS)) ++ wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl); ++ + asm volatile ( + "push %%" _ASM_BP "; \n\t" + "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t" +@@ -4975,6 +4986,12 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) + #endif + ); + ++ if (ibrs_inuse) { ++ rdmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl); ++ if (svm->spec_ctrl != FEATURE_ENABLE_IBRS) ++ wrmsrl(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS); ++ } ++ + #ifdef CONFIG_X86_64 + wrmsrl(MSR_GS_BASE, svm->host.gs_base); + #else +-- +2.14.2 + diff --git 
a/patches/kernel/0296-KVM-x86-Add-speculative-control-CPUID-support-for-gu.patch b/patches/kernel/0296-KVM-x86-Add-speculative-control-CPUID-support-for-gu.patch deleted file mode 100644 index 8537b7c..0000000 --- a/patches/kernel/0296-KVM-x86-Add-speculative-control-CPUID-support-for-gu.patch +++ /dev/null @@ -1,63 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Tom Lendacky -Date: Wed, 20 Dec 2017 10:55:47 +0000 -Subject: [PATCH] KVM: x86: Add speculative control CPUID support for guests -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -Provide the guest with the speculative control CPUID related values. - -Signed-off-by: Paolo Bonzini -Signed-off-by: Tom Lendacky -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit db7641e5f41cd517c4181ce90c4f9ecc93af4b2b) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kvm/cpuid.c | 12 ++++++++++-- - 1 file changed, 10 insertions(+), 2 deletions(-) - -diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c -index 19adbb418443..f64502d21a89 100644 ---- a/arch/x86/kvm/cpuid.c -+++ b/arch/x86/kvm/cpuid.c -@@ -70,6 +70,7 @@ u64 kvm_supported_xcr0(void) - /* These are scattered features in cpufeatures.h. 
*/ - #define KVM_CPUID_BIT_AVX512_4VNNIW 2 - #define KVM_CPUID_BIT_AVX512_4FMAPS 3 -+#define KVM_CPUID_BIT_SPEC_CTRL 26 - #define KF(x) bit(KVM_CPUID_BIT_##x) - - int kvm_update_cpuid(struct kvm_vcpu *vcpu) -@@ -387,7 +388,12 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, - - /* cpuid 7.0.edx*/ - const u32 kvm_cpuid_7_0_edx_x86_features = -- KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS); -+ KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS) | -+ KF(SPEC_CTRL); -+ -+ /* cpuid 0x80000008.0.ebx */ -+ const u32 kvm_cpuid_80000008_0_ebx_x86_features = -+ F(IBPB); - - /* all calls to cpuid_count() should be made on the same cpu */ - get_cpu(); -@@ -622,7 +628,9 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, - if (!g_phys_as) - g_phys_as = phys_as; - entry->eax = g_phys_as | (virt_as << 8); -- entry->ebx = entry->edx = 0; -+ entry->ebx &= kvm_cpuid_80000008_0_ebx_x86_features; -+ cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX); -+ entry->edx = 0; - break; - } - case 0x80000019: --- -2.14.2 - diff --git a/patches/kernel/0296-x86-svm-Set-IBPB-when-running-a-different-VCPU.patch b/patches/kernel/0296-x86-svm-Set-IBPB-when-running-a-different-VCPU.patch new file mode 100644 index 0000000..9b2262c --- /dev/null +++ b/patches/kernel/0296-x86-svm-Set-IBPB-when-running-a-different-VCPU.patch @@ -0,0 +1,73 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Tom Lendacky +Date: Wed, 20 Dec 2017 10:55:47 +0000 +Subject: [PATCH] x86/svm: Set IBPB when running a different VCPU +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +Set IBPB (Indirect Branch Prediction Barrier) when the current CPU is +going to run a VCPU different from what was previously run. 
+ +Signed-off-by: Paolo Bonzini +Signed-off-by: Tom Lendacky +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 0ba3eaabbb6666ebd344ee80534e58c375a00810) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kvm/svm.c | 16 ++++++++++++++++ + 1 file changed, 16 insertions(+) + +diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c +index a1b19e810c49..fade4869856a 100644 +--- a/arch/x86/kvm/svm.c ++++ b/arch/x86/kvm/svm.c +@@ -518,6 +518,8 @@ struct svm_cpu_data { + struct kvm_ldttss_desc *tss_desc; + + struct page *save_area; ++ ++ struct vmcb *current_vmcb; + }; + + static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); +@@ -1685,11 +1687,19 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) + __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); + kvm_vcpu_uninit(vcpu); + kmem_cache_free(kvm_vcpu_cache, svm); ++ ++ /* ++ * The VMCB could be recycled, causing a false negative in svm_vcpu_load; ++ * block speculative execution. 
++ */ ++ if (ibpb_inuse) ++ wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB); + } + + static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) + { + struct vcpu_svm *svm = to_svm(vcpu); ++ struct svm_cpu_data *sd = per_cpu(svm_data, cpu); + int i; + + if (unlikely(cpu != vcpu->cpu)) { +@@ -1718,6 +1728,12 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) + if (static_cpu_has(X86_FEATURE_RDTSCP)) + wrmsrl(MSR_TSC_AUX, svm->tsc_aux); + ++ if (sd->current_vmcb != svm->vmcb) { ++ sd->current_vmcb = svm->vmcb; ++ if (ibpb_inuse) ++ wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB); ++ } ++ + avic_vcpu_load(vcpu, cpu); + } + +-- +2.14.2 + diff --git a/patches/kernel/0297-KVM-x86-Add-speculative-control-CPUID-support-for-gu.patch b/patches/kernel/0297-KVM-x86-Add-speculative-control-CPUID-support-for-gu.patch new file mode 100644 index 0000000..8537b7c --- /dev/null +++ b/patches/kernel/0297-KVM-x86-Add-speculative-control-CPUID-support-for-gu.patch @@ -0,0 +1,63 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Tom Lendacky +Date: Wed, 20 Dec 2017 10:55:47 +0000 +Subject: [PATCH] KVM: x86: Add speculative control CPUID support for guests +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +Provide the guest with the speculative control CPUID related values. + +Signed-off-by: Paolo Bonzini +Signed-off-by: Tom Lendacky +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit db7641e5f41cd517c4181ce90c4f9ecc93af4b2b) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kvm/cpuid.c | 12 ++++++++++-- + 1 file changed, 10 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c +index 19adbb418443..f64502d21a89 100644 +--- a/arch/x86/kvm/cpuid.c ++++ b/arch/x86/kvm/cpuid.c +@@ -70,6 +70,7 @@ u64 kvm_supported_xcr0(void) + /* These are scattered features in cpufeatures.h. 
*/ + #define KVM_CPUID_BIT_AVX512_4VNNIW 2 + #define KVM_CPUID_BIT_AVX512_4FMAPS 3 ++#define KVM_CPUID_BIT_SPEC_CTRL 26 + #define KF(x) bit(KVM_CPUID_BIT_##x) + + int kvm_update_cpuid(struct kvm_vcpu *vcpu) +@@ -387,7 +388,12 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, + + /* cpuid 7.0.edx*/ + const u32 kvm_cpuid_7_0_edx_x86_features = +- KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS); ++ KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS) | ++ KF(SPEC_CTRL); ++ ++ /* cpuid 0x80000008.0.ebx */ ++ const u32 kvm_cpuid_80000008_0_ebx_x86_features = ++ F(IBPB); + + /* all calls to cpuid_count() should be made on the same cpu */ + get_cpu(); +@@ -622,7 +628,9 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, + if (!g_phys_as) + g_phys_as = phys_as; + entry->eax = g_phys_as | (virt_as << 8); +- entry->ebx = entry->edx = 0; ++ entry->ebx &= kvm_cpuid_80000008_0_ebx_x86_features; ++ cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX); ++ entry->edx = 0; + break; + } + case 0x80000019: +-- +2.14.2 + diff --git a/patches/kernel/0297-x86-svm-Add-code-to-clobber-the-RSB-on-VM-exit.patch b/patches/kernel/0297-x86-svm-Add-code-to-clobber-the-RSB-on-VM-exit.patch deleted file mode 100644 index d7b82f4..0000000 --- a/patches/kernel/0297-x86-svm-Add-code-to-clobber-the-RSB-on-VM-exit.patch +++ /dev/null @@ -1,39 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Tom Lendacky -Date: Wed, 20 Dec 2017 10:55:47 +0000 -Subject: [PATCH] x86/svm: Add code to clobber the RSB on VM exit -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -Add code to overwrite the local CPU RSB entries from the previous less -privileged mode. 
- -Signed-off-by: Tom Lendacky -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 9392e24469b71ff665cdbc3d81db215f9383219d) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kvm/svm.c | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c -index fade4869856a..e99bdfcc6b01 100644 ---- a/arch/x86/kvm/svm.c -+++ b/arch/x86/kvm/svm.c -@@ -5008,6 +5008,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) - wrmsrl(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS); - } - -+ stuff_RSB(); -+ - #ifdef CONFIG_X86_64 - wrmsrl(MSR_GS_BASE, svm->host.gs_base); - #else --- -2.14.2 - diff --git a/patches/kernel/0298-x86-cpu-AMD-Remove-now-unused-definition-of-MFENCE_R.patch b/patches/kernel/0298-x86-cpu-AMD-Remove-now-unused-definition-of-MFENCE_R.patch deleted file mode 100644 index 839cd53..0000000 --- a/patches/kernel/0298-x86-cpu-AMD-Remove-now-unused-definition-of-MFENCE_R.patch +++ /dev/null @@ -1,71 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Tom Lendacky -Date: Wed, 20 Dec 2017 10:55:48 +0000 -Subject: [PATCH] x86/cpu/AMD: Remove now unused definition of MFENCE_RDTSC - feature -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -With the switch to using LFENCE_RDTSC on AMD platforms there is no longer -a need for the MFENCE_RDTSC feature. Remove it usage and definition. 
- -Signed-off-by: Tom Lendacky -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 6e6c998937329e9d13d4b239233cd058e8a7730f) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/barrier.h | 3 +-- - arch/x86/include/asm/msr.h | 3 +-- - arch/x86/net/bpf_jit_comp.c | 3 --- - 3 files changed, 2 insertions(+), 7 deletions(-) - -diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h -index aae78054cae2..d00432579444 100644 ---- a/arch/x86/include/asm/barrier.h -+++ b/arch/x86/include/asm/barrier.h -@@ -23,8 +23,7 @@ - #define wmb() asm volatile("sfence" ::: "memory") - #endif - --#define gmb() alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, \ -- "lfence", X86_FEATURE_LFENCE_RDTSC); -+#define gmb() alternative("", "lfence", X86_FEATURE_LFENCE_RDTSC); - - #ifdef CONFIG_X86_PPRO_FENCE - #define dma_rmb() rmb() -diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h -index 898dba2e2e2c..3139098269f6 100644 ---- a/arch/x86/include/asm/msr.h -+++ b/arch/x86/include/asm/msr.h -@@ -213,8 +213,7 @@ static __always_inline unsigned long long rdtsc_ordered(void) - * that some other imaginary CPU is updating continuously with a - * time stamp. 
- */ -- alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, -- "lfence", X86_FEATURE_LFENCE_RDTSC); -+ alternative("", "lfence", X86_FEATURE_LFENCE_RDTSC); - return rdtsc(); - } - -diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c -index 879dbfefb66d..e20e304320f9 100644 ---- a/arch/x86/net/bpf_jit_comp.c -+++ b/arch/x86/net/bpf_jit_comp.c -@@ -116,9 +116,6 @@ static void emit_memory_barrier(u8 **pprog) - if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) - /* x86 LFENCE opcode 0F AE E8 */ - EMIT3(0x0f, 0xae, 0xe8); -- else if (boot_cpu_has(X86_FEATURE_MFENCE_RDTSC)) -- /* AMD MFENCE opcode 0F AE F0 */ -- EMIT3(0x0f, 0xae, 0xf0); - else - /* we should never end up here, - * but if we do, better not to emit anything*/ --- -2.14.2 - diff --git a/patches/kernel/0298-x86-svm-Add-code-to-clobber-the-RSB-on-VM-exit.patch b/patches/kernel/0298-x86-svm-Add-code-to-clobber-the-RSB-on-VM-exit.patch new file mode 100644 index 0000000..d7b82f4 --- /dev/null +++ b/patches/kernel/0298-x86-svm-Add-code-to-clobber-the-RSB-on-VM-exit.patch @@ -0,0 +1,39 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Tom Lendacky +Date: Wed, 20 Dec 2017 10:55:47 +0000 +Subject: [PATCH] x86/svm: Add code to clobber the RSB on VM exit +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +Add code to overwrite the local CPU RSB entries from the previous less +privileged mode. 
+ +Signed-off-by: Tom Lendacky +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 9392e24469b71ff665cdbc3d81db215f9383219d) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kvm/svm.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c +index fade4869856a..e99bdfcc6b01 100644 +--- a/arch/x86/kvm/svm.c ++++ b/arch/x86/kvm/svm.c +@@ -5008,6 +5008,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) + wrmsrl(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS); + } + ++ stuff_RSB(); ++ + #ifdef CONFIG_X86_64 + wrmsrl(MSR_GS_BASE, svm->host.gs_base); + #else +-- +2.14.2 + diff --git a/patches/kernel/0299-UBUNTU-SAUCE-x86-kvm-Fix-stuff_RSB-for-32-bit.patch b/patches/kernel/0299-UBUNTU-SAUCE-x86-kvm-Fix-stuff_RSB-for-32-bit.patch deleted file mode 100644 index 6a04663..0000000 --- a/patches/kernel/0299-UBUNTU-SAUCE-x86-kvm-Fix-stuff_RSB-for-32-bit.patch +++ /dev/null @@ -1,44 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: William Grant -Date: Thu, 11 Jan 2018 17:05:42 -0600 -Subject: [PATCH] UBUNTU: SAUCE: x86/kvm: Fix stuff_RSB() for 32-bit -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5753 -CVE-2017-5715 - -Signed-off-by: William Grant -Acked-by: Kamal Mostafa -Signed-off-by: Seth Forshee -(cherry picked from commit 306dada4f850bf537dbd8ff06cf1522074b3f327) -Signed-off-by: Fabian Grünbichler ---- - arch/x86/include/asm/kvm_host.h | 10 +++++++--- - 1 file changed, 7 insertions(+), 3 deletions(-) - -diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h -index 4117a97228a2..f39bc68efa56 100644 ---- a/arch/x86/include/asm/kvm_host.h -+++ b/arch/x86/include/asm/kvm_host.h -@@ -223,9 +223,13 @@ static inline void stuff_RSB(void) - .label31: \n\ - call .label32 \n\ - pause \n\ --.label32: \n\ -- add $(32*8), %%rsp \n\ --": : :"memory"); -+.label32: \n" -+#ifdef 
CONFIG_X86_64 -+" add $(32*8), %%rsp \n" -+#else -+" add $(32*4), %%esp \n" -+#endif -+: : :"memory"); - } - - enum kvm_reg { --- -2.14.2 - diff --git a/patches/kernel/0299-x86-cpu-AMD-Remove-now-unused-definition-of-MFENCE_R.patch b/patches/kernel/0299-x86-cpu-AMD-Remove-now-unused-definition-of-MFENCE_R.patch new file mode 100644 index 0000000..839cd53 --- /dev/null +++ b/patches/kernel/0299-x86-cpu-AMD-Remove-now-unused-definition-of-MFENCE_R.patch @@ -0,0 +1,71 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Tom Lendacky +Date: Wed, 20 Dec 2017 10:55:48 +0000 +Subject: [PATCH] x86/cpu/AMD: Remove now unused definition of MFENCE_RDTSC + feature +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +With the switch to using LFENCE_RDTSC on AMD platforms there is no longer +a need for the MFENCE_RDTSC feature. Remove it usage and definition. + +Signed-off-by: Tom Lendacky +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 6e6c998937329e9d13d4b239233cd058e8a7730f) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/barrier.h | 3 +-- + arch/x86/include/asm/msr.h | 3 +-- + arch/x86/net/bpf_jit_comp.c | 3 --- + 3 files changed, 2 insertions(+), 7 deletions(-) + +diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h +index aae78054cae2..d00432579444 100644 +--- a/arch/x86/include/asm/barrier.h ++++ b/arch/x86/include/asm/barrier.h +@@ -23,8 +23,7 @@ + #define wmb() asm volatile("sfence" ::: "memory") + #endif + +-#define gmb() alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, \ +- "lfence", X86_FEATURE_LFENCE_RDTSC); ++#define gmb() alternative("", "lfence", X86_FEATURE_LFENCE_RDTSC); + + #ifdef CONFIG_X86_PPRO_FENCE + #define dma_rmb() rmb() +diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h +index 898dba2e2e2c..3139098269f6 100644 +--- 
a/arch/x86/include/asm/msr.h ++++ b/arch/x86/include/asm/msr.h +@@ -213,8 +213,7 @@ static __always_inline unsigned long long rdtsc_ordered(void) + * that some other imaginary CPU is updating continuously with a + * time stamp. + */ +- alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, +- "lfence", X86_FEATURE_LFENCE_RDTSC); ++ alternative("", "lfence", X86_FEATURE_LFENCE_RDTSC); + return rdtsc(); + } + +diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c +index 879dbfefb66d..e20e304320f9 100644 +--- a/arch/x86/net/bpf_jit_comp.c ++++ b/arch/x86/net/bpf_jit_comp.c +@@ -116,9 +116,6 @@ static void emit_memory_barrier(u8 **pprog) + if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) + /* x86 LFENCE opcode 0F AE E8 */ + EMIT3(0x0f, 0xae, 0xe8); +- else if (boot_cpu_has(X86_FEATURE_MFENCE_RDTSC)) +- /* AMD MFENCE opcode 0F AE F0 */ +- EMIT3(0x0f, 0xae, 0xf0); + else + /* we should never end up here, + * but if we do, better not to emit anything*/ +-- +2.14.2 + diff --git a/patches/kernel/0300-UBUNTU-SAUCE-x86-kvm-Fix-stuff_RSB-for-32-bit.patch b/patches/kernel/0300-UBUNTU-SAUCE-x86-kvm-Fix-stuff_RSB-for-32-bit.patch new file mode 100644 index 0000000..6a04663 --- /dev/null +++ b/patches/kernel/0300-UBUNTU-SAUCE-x86-kvm-Fix-stuff_RSB-for-32-bit.patch @@ -0,0 +1,44 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: William Grant +Date: Thu, 11 Jan 2018 17:05:42 -0600 +Subject: [PATCH] UBUNTU: SAUCE: x86/kvm: Fix stuff_RSB() for 32-bit +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5753 +CVE-2017-5715 + +Signed-off-by: William Grant +Acked-by: Kamal Mostafa +Signed-off-by: Seth Forshee +(cherry picked from commit 306dada4f850bf537dbd8ff06cf1522074b3f327) +Signed-off-by: Fabian Grünbichler +--- + arch/x86/include/asm/kvm_host.h | 10 +++++++--- + 1 file changed, 7 insertions(+), 3 deletions(-) + +diff --git a/arch/x86/include/asm/kvm_host.h 
b/arch/x86/include/asm/kvm_host.h +index 4117a97228a2..f39bc68efa56 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -223,9 +223,13 @@ static inline void stuff_RSB(void) + .label31: \n\ + call .label32 \n\ + pause \n\ +-.label32: \n\ +- add $(32*8), %%rsp \n\ +-": : :"memory"); ++.label32: \n" ++#ifdef CONFIG_X86_64 ++" add $(32*8), %%rsp \n" ++#else ++" add $(32*4), %%esp \n" ++#endif ++: : :"memory"); + } + + enum kvm_reg { +-- +2.14.2 + diff --git a/patches/kernel/0300-x86-pti-Enable-PTI-by-default.patch b/patches/kernel/0300-x86-pti-Enable-PTI-by-default.patch deleted file mode 100644 index d720d28..0000000 --- a/patches/kernel/0300-x86-pti-Enable-PTI-by-default.patch +++ /dev/null @@ -1,39 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Thomas Gleixner -Date: Wed, 3 Jan 2018 15:18:44 +0100 -Subject: [PATCH] x86/pti: Enable PTI by default -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CVE-2017-5754 - -This really want's to be enabled by default. Users who know what they are -doing can disable it either in the config or on the kernel command line. 
- -Signed-off-by: Thomas Gleixner -Cc: stable@vger.kernel.org -(cherry picked from commit 87faa0d9b43b4755ff6963a22d1fd1bee1aa3b39) -Signed-off-by: Andy Whitcroft -Signed-off-by: Kleber Sacilotto de Souza -(cherry picked from commit 436cdbfed2112bea7943f4a0f6dfabf54088c8c6) -Signed-off-by: Fabian Grünbichler ---- - security/Kconfig | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/security/Kconfig b/security/Kconfig -index 91cb8f611a0d..529dccc22ce5 100644 ---- a/security/Kconfig -+++ b/security/Kconfig -@@ -98,6 +98,7 @@ config SECURITY_NETWORK - - config PAGE_TABLE_ISOLATION - bool "Remove the kernel mapping in user mode" -+ default y - depends on X86_64 && !UML - help - This feature reduces the number of hardware side channels by --- -2.14.2 - diff --git a/patches/kernel/0301-KVM-x86-Add-memory-barrier-on-vmcs-field-lookup.patch b/patches/kernel/0301-KVM-x86-Add-memory-barrier-on-vmcs-field-lookup.patch deleted file mode 100644 index e8b4be8..0000000 --- a/patches/kernel/0301-KVM-x86-Add-memory-barrier-on-vmcs-field-lookup.patch +++ /dev/null @@ -1,49 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Andrew Honig -Date: Wed, 10 Jan 2018 10:12:03 -0800 -Subject: [PATCH] KVM: x86: Add memory barrier on vmcs field lookup -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -commit 75f139aaf896d6fdeec2e468ddfa4b2fe469bf40 upstream. - -This adds a memory barrier when performing a lookup into -the vmcs_field_to_offset_table. This is related to -CVE-2017-5753. 
- -Signed-off-by: Andrew Honig -Reviewed-by: Jim Mattson -Signed-off-by: Paolo Bonzini -Signed-off-by: Greg Kroah-Hartman -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kvm/vmx.c | 12 ++++++++++-- - 1 file changed, 10 insertions(+), 2 deletions(-) - -diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c -index d2168203bddc..e6fa3df81fd8 100644 ---- a/arch/x86/kvm/vmx.c -+++ b/arch/x86/kvm/vmx.c -@@ -882,8 +882,16 @@ static inline short vmcs_field_to_offset(unsigned long field) - { - BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX); - -- if (field >= ARRAY_SIZE(vmcs_field_to_offset_table) || -- vmcs_field_to_offset_table[field] == 0) -+ if (field >= ARRAY_SIZE(vmcs_field_to_offset_table)) -+ return -ENOENT; -+ -+ /* -+ * FIXME: Mitigation for CVE-2017-5753. To be replaced with a -+ * generic mechanism. -+ */ -+ asm("lfence"); -+ -+ if (vmcs_field_to_offset_table[field] == 0) - return -ENOENT; - - return vmcs_field_to_offset_table[field]; --- -2.14.2 - diff --git a/patches/kernel/0301-x86-pti-Enable-PTI-by-default.patch b/patches/kernel/0301-x86-pti-Enable-PTI-by-default.patch new file mode 100644 index 0000000..d720d28 --- /dev/null +++ b/patches/kernel/0301-x86-pti-Enable-PTI-by-default.patch @@ -0,0 +1,39 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Wed, 3 Jan 2018 15:18:44 +0100 +Subject: [PATCH] x86/pti: Enable PTI by default +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +CVE-2017-5754 + +This really want's to be enabled by default. Users who know what they are +doing can disable it either in the config or on the kernel command line. 
+ +Signed-off-by: Thomas Gleixner +Cc: stable@vger.kernel.org +(cherry picked from commit 87faa0d9b43b4755ff6963a22d1fd1bee1aa3b39) +Signed-off-by: Andy Whitcroft +Signed-off-by: Kleber Sacilotto de Souza +(cherry picked from commit 436cdbfed2112bea7943f4a0f6dfabf54088c8c6) +Signed-off-by: Fabian Grünbichler +--- + security/Kconfig | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/security/Kconfig b/security/Kconfig +index 91cb8f611a0d..529dccc22ce5 100644 +--- a/security/Kconfig ++++ b/security/Kconfig +@@ -98,6 +98,7 @@ config SECURITY_NETWORK + + config PAGE_TABLE_ISOLATION + bool "Remove the kernel mapping in user mode" ++ default y + depends on X86_64 && !UML + help + This feature reduces the number of hardware side channels by +-- +2.14.2 + diff --git a/patches/kernel/0302-KVM-x86-Add-memory-barrier-on-vmcs-field-lookup.patch b/patches/kernel/0302-KVM-x86-Add-memory-barrier-on-vmcs-field-lookup.patch new file mode 100644 index 0000000..e8b4be8 --- /dev/null +++ b/patches/kernel/0302-KVM-x86-Add-memory-barrier-on-vmcs-field-lookup.patch @@ -0,0 +1,49 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Andrew Honig +Date: Wed, 10 Jan 2018 10:12:03 -0800 +Subject: [PATCH] KVM: x86: Add memory barrier on vmcs field lookup +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit 75f139aaf896d6fdeec2e468ddfa4b2fe469bf40 upstream. + +This adds a memory barrier when performing a lookup into +the vmcs_field_to_offset_table. This is related to +CVE-2017-5753. 
+ +Signed-off-by: Andrew Honig +Reviewed-by: Jim Mattson +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kvm/vmx.c | 12 ++++++++++-- + 1 file changed, 10 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c +index d2168203bddc..e6fa3df81fd8 100644 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -882,8 +882,16 @@ static inline short vmcs_field_to_offset(unsigned long field) + { + BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX); + +- if (field >= ARRAY_SIZE(vmcs_field_to_offset_table) || +- vmcs_field_to_offset_table[field] == 0) ++ if (field >= ARRAY_SIZE(vmcs_field_to_offset_table)) ++ return -ENOENT; ++ ++ /* ++ * FIXME: Mitigation for CVE-2017-5753. To be replaced with a ++ * generic mechanism. ++ */ ++ asm("lfence"); ++ ++ if (vmcs_field_to_offset_table[field] == 0) + return -ENOENT; + + return vmcs_field_to_offset_table[field]; +-- +2.14.2 + diff --git a/patches/kernel/0302-x86-tboot-Unbreak-tboot-with-PTI-enabled.patch b/patches/kernel/0302-x86-tboot-Unbreak-tboot-with-PTI-enabled.patch deleted file mode 100644 index a65f18b..0000000 --- a/patches/kernel/0302-x86-tboot-Unbreak-tboot-with-PTI-enabled.patch +++ /dev/null @@ -1,54 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Dave Hansen -Date: Sat, 6 Jan 2018 18:41:14 +0100 -Subject: [PATCH] x86/tboot: Unbreak tboot with PTI enabled -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -commit 262b6b30087246abf09d6275eb0c0dc421bcbe38 upstream. - -This is another case similar to what EFI does: create a new set of -page tables, map some code at a low address, and jump to it. PTI -mistakes this low address for userspace and mistakenly marks it -non-executable in an effort to make it unusable for userspace. - -Undo the poison to allow execution. 
- -Fixes: 385ce0ea4c07 ("x86/mm/pti: Add Kconfig") -Signed-off-by: Dave Hansen -Signed-off-by: Andrea Arcangeli -Signed-off-by: Thomas Gleixner -Cc: Alan Cox -Cc: Tim Chen -Cc: Jon Masters -Cc: Dave Hansen -Cc: Andi Kleen -Cc: Jeff Law -Cc: Paolo Bonzini -Cc: Linus Torvalds -Cc: Greg Kroah-Hartman -Cc: David" -Cc: Nick Clifton -Link: https://lkml.kernel.org/r/20180108102805.GK25546@redhat.com -Signed-off-by: Greg Kroah-Hartman -Signed-off-by: Fabian Grünbichler ---- - arch/x86/kernel/tboot.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c -index a2486f444073..8337730f0956 100644 ---- a/arch/x86/kernel/tboot.c -+++ b/arch/x86/kernel/tboot.c -@@ -127,6 +127,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, - p4d = p4d_alloc(&tboot_mm, pgd, vaddr); - if (!p4d) - return -1; -+ pgd->pgd &= ~_PAGE_NX; - pud = pud_alloc(&tboot_mm, p4d, vaddr); - if (!pud) - return -1; --- -2.14.2 - diff --git a/patches/kernel/0303-x86-perf-Disable-intel_bts-when-PTI.patch b/patches/kernel/0303-x86-perf-Disable-intel_bts-when-PTI.patch deleted file mode 100644 index 039498e..0000000 --- a/patches/kernel/0303-x86-perf-Disable-intel_bts-when-PTI.patch +++ /dev/null @@ -1,72 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Peter Zijlstra -Date: Sun, 14 Jan 2018 11:27:13 +0100 -Subject: [PATCH] x86,perf: Disable intel_bts when PTI -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -commit 99a9dc98ba52267ce5e062b52de88ea1f1b2a7d8 upstream. - -The intel_bts driver does not use the 'normal' BTS buffer which is exposed -through the cpu_entry_area but instead uses the memory allocated for the -perf AUX buffer. - -This obviously comes apart when using PTI because then the kernel mapping; -which includes that AUX buffer memory; disappears. Fixing this requires to -expose a mapping which is visible in all context and that's not trivial. 
- -As a quick fix disable this driver when PTI is enabled to prevent -malfunction. - -Fixes: 385ce0ea4c07 ("x86/mm/pti: Add Kconfig") -Reported-by: Vince Weaver -Reported-by: Robert Święcki -Signed-off-by: Peter Zijlstra (Intel) -Signed-off-by: Thomas Gleixner -Cc: Alexander Shishkin -Cc: greg@kroah.com -Cc: hughd@google.com -Cc: luto@amacapital.net -Cc: Vince Weaver -Cc: torvalds@linux-foundation.org -Cc: stable@vger.kernel.org -Link: https://lkml.kernel.org/r/20180114102713.GB6166@worktop.programming.kicks-ass.net -Signed-off-by: Greg Kroah-Hartman -Signed-off-by: Fabian Grünbichler ---- - arch/x86/events/intel/bts.c | 18 ++++++++++++++++++ - 1 file changed, 18 insertions(+) - -diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c -index ddd8d3516bfc..9a62e6fce0e0 100644 ---- a/arch/x86/events/intel/bts.c -+++ b/arch/x86/events/intel/bts.c -@@ -582,6 +582,24 @@ static __init int bts_init(void) - if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts) - return -ENODEV; - -+ if (boot_cpu_has(X86_FEATURE_PTI)) { -+ /* -+ * BTS hardware writes through a virtual memory map we must -+ * either use the kernel physical map, or the user mapping of -+ * the AUX buffer. -+ * -+ * However, since this driver supports per-CPU and per-task inherit -+ * we cannot use the user mapping since it will not be availble -+ * if we're not running the owning process. -+ * -+ * With PTI we can't use the kernal map either, because its not -+ * there when we run userspace. -+ * -+ * For now, disable this driver when using PTI. 
-+ */ -+ return -ENODEV; -+ } -+ - bts_pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE | - PERF_PMU_CAP_EXCLUSIVE; - bts_pmu.task_ctx_nr = perf_sw_context; --- -2.14.2 - diff --git a/patches/kernel/0303-x86-tboot-Unbreak-tboot-with-PTI-enabled.patch b/patches/kernel/0303-x86-tboot-Unbreak-tboot-with-PTI-enabled.patch new file mode 100644 index 0000000..a65f18b --- /dev/null +++ b/patches/kernel/0303-x86-tboot-Unbreak-tboot-with-PTI-enabled.patch @@ -0,0 +1,54 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Dave Hansen +Date: Sat, 6 Jan 2018 18:41:14 +0100 +Subject: [PATCH] x86/tboot: Unbreak tboot with PTI enabled +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit 262b6b30087246abf09d6275eb0c0dc421bcbe38 upstream. + +This is another case similar to what EFI does: create a new set of +page tables, map some code at a low address, and jump to it. PTI +mistakes this low address for userspace and mistakenly marks it +non-executable in an effort to make it unusable for userspace. + +Undo the poison to allow execution. 
+ +Fixes: 385ce0ea4c07 ("x86/mm/pti: Add Kconfig") +Signed-off-by: Dave Hansen +Signed-off-by: Andrea Arcangeli +Signed-off-by: Thomas Gleixner +Cc: Alan Cox +Cc: Tim Chen +Cc: Jon Masters +Cc: Dave Hansen +Cc: Andi Kleen +Cc: Jeff Law +Cc: Paolo Bonzini +Cc: Linus Torvalds +Cc: Greg Kroah-Hartman +Cc: David" +Cc: Nick Clifton +Link: https://lkml.kernel.org/r/20180108102805.GK25546@redhat.com +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Fabian Grünbichler +--- + arch/x86/kernel/tboot.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c +index a2486f444073..8337730f0956 100644 +--- a/arch/x86/kernel/tboot.c ++++ b/arch/x86/kernel/tboot.c +@@ -127,6 +127,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, + p4d = p4d_alloc(&tboot_mm, pgd, vaddr); + if (!p4d) + return -1; ++ pgd->pgd &= ~_PAGE_NX; + pud = pud_alloc(&tboot_mm, p4d, vaddr); + if (!pud) + return -1; +-- +2.14.2 + diff --git a/patches/kernel/0304-x86-perf-Disable-intel_bts-when-PTI.patch b/patches/kernel/0304-x86-perf-Disable-intel_bts-when-PTI.patch new file mode 100644 index 0000000..039498e --- /dev/null +++ b/patches/kernel/0304-x86-perf-Disable-intel_bts-when-PTI.patch @@ -0,0 +1,72 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra +Date: Sun, 14 Jan 2018 11:27:13 +0100 +Subject: [PATCH] x86,perf: Disable intel_bts when PTI +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit 99a9dc98ba52267ce5e062b52de88ea1f1b2a7d8 upstream. + +The intel_bts driver does not use the 'normal' BTS buffer which is exposed +through the cpu_entry_area but instead uses the memory allocated for the +perf AUX buffer. + +This obviously comes apart when using PTI because then the kernel mapping; +which includes that AUX buffer memory; disappears. Fixing this requires to +expose a mapping which is visible in all context and that's not trivial. 
+ +As a quick fix disable this driver when PTI is enabled to prevent +malfunction. + +Fixes: 385ce0ea4c07 ("x86/mm/pti: Add Kconfig") +Reported-by: Vince Weaver +Reported-by: Robert Święcki +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: Thomas Gleixner +Cc: Alexander Shishkin +Cc: greg@kroah.com +Cc: hughd@google.com +Cc: luto@amacapital.net +Cc: Vince Weaver +Cc: torvalds@linux-foundation.org +Cc: stable@vger.kernel.org +Link: https://lkml.kernel.org/r/20180114102713.GB6166@worktop.programming.kicks-ass.net +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Fabian Grünbichler +--- + arch/x86/events/intel/bts.c | 18 ++++++++++++++++++ + 1 file changed, 18 insertions(+) + +diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c +index ddd8d3516bfc..9a62e6fce0e0 100644 +--- a/arch/x86/events/intel/bts.c ++++ b/arch/x86/events/intel/bts.c +@@ -582,6 +582,24 @@ static __init int bts_init(void) + if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts) + return -ENODEV; + ++ if (boot_cpu_has(X86_FEATURE_PTI)) { ++ /* ++ * BTS hardware writes through a virtual memory map we must ++ * either use the kernel physical map, or the user mapping of ++ * the AUX buffer. ++ * ++ * However, since this driver supports per-CPU and per-task inherit ++ * we cannot use the user mapping since it will not be availble ++ * if we're not running the owning process. ++ * ++ * With PTI we can't use the kernal map either, because its not ++ * there when we run userspace. ++ * ++ * For now, disable this driver when using PTI. ++ */ ++ return -ENODEV; ++ } ++ + bts_pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE | + PERF_PMU_CAP_EXCLUSIVE; + bts_pmu.task_ctx_nr = perf_sw_context; +-- +2.14.2 +