fi
if test -z "$KBUILD_BUILD_USER"; then
LINUX_COMPILE_BY=$(whoami | sed 's/\\/\\\\/')
---
-2.14.2
-
}
if (ether_addr_equal(br->bridge_id.addr, addr))
---
-2.14.2
-
Safety option to keep boot IRQs enabled. This
should never be necessary.
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
-index 99eec22d99b7..7576c2b0c913 100644
+index 9dcd5ed5a05b..8882b8d38d7d 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
-@@ -3687,6 +3687,107 @@ static int __init pci_apply_final_quirks(void)
+@@ -3694,6 +3694,107 @@ static int __init pci_apply_final_quirks(void)
fs_initcall_sync(pci_apply_final_quirks);
/*
* Following are device-specific reset methods which can be used to
* reset a single function if other methods (e.g. FLR, PM D0->D3) are
-@@ -4529,6 +4630,7 @@ static const struct pci_dev_acs_enabled {
+@@ -4536,6 +4637,7 @@ static const struct pci_dev_acs_enabled {
{ 0x10df, 0x720, pci_quirk_mf_endpoint_acs }, /* Emulex Skyhawk-R */
/* Cavium ThunderX */
{ PCI_VENDOR_ID_CAVIUM, PCI_ANY_ID, pci_quirk_cavium_acs },
{ 0 }
};
---
-2.14.2
-
module_param(halt_poll_ns_grow, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow);
---
-2.14.2
-
if (!strcmp(token, "xattr")) {
opts->flags |= CGRP_ROOT_XATTR;
continue;
---
-2.14.2
-
index e8cb34193433..f76c4bf3d46a 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
-@@ -299,6 +299,16 @@ static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
-
+@@ -300,6 +300,16 @@ static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
-+/*
+ /*
+ * Cgroup v2 behavior is used when on default hierarchy or the
+ * cgroup_v2_mode flag is set.
+ */
+ (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
+}
+
- /*
++/*
* This is ugly, but preserves the userspace API for existing cpuset
* users. If someone tries to mount the "cpuset" filesystem, we
+ * silently switch it to mount "cgroup" instead
@@ -489,8 +499,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
/* On legacy hiearchy, we must be a subset of our parent cpuset. */
mutex_lock(&cpuset_mutex);
---
-2.14.2
-
list_add(&mad_recv_wc->recv_buf.list, &mad_recv_wc->rmpp_list);
if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) {
mad_recv_wc = ib_process_rmpp_recv_wc(mad_agent_priv,
---
-2.14.2
-
ret = ib_security_pkey_access(map->agent.device,
map->agent.port_num,
---
-2.14.2
-
1 file changed, 7 insertions(+)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
-index 6b1744499a90..5c37c1a1a949 100644
+index 068084c8e540..da10db3de636 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
-@@ -3650,6 +3650,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
+@@ -3666,6 +3666,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
u32 ecx = msr->index;
u64 data = msr->data;
switch (ecx) {
case MSR_IA32_TSC:
kvm_write_tsc(vcpu, msr);
break;
---
-2.14.2
-
-From 727ba748e110b4de50d142edca9d6a9b7e6111d8 Mon Sep 17 00:00:00 2001
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Felix Wilhelm <fwilhelm@google.com>
Date: Mon, 11 Jun 2018 09:43:44 +0200
-Subject: kvm: nVMX: Enforce cpl=0 for VMX instructions
+Subject: [PATCH] kvm: nVMX: Enforce cpl=0 for VMX instructions
VMX instructions executed inside a L1 VM will always trigger a VM exit
even when executed with cpl 3. This means we must perform the
1 file changed, 13 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
-index 709de996..4bf1f9d 100644
+index 54980817194a..b2d75b59b6e5 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
-@@ -7905,6 +7905,12 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
+@@ -7180,6 +7180,12 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
return 1;
}
if (vmx->nested.vmxon) {
nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
return kvm_skip_emulated_instruction(vcpu);
-@@ -7964,6 +7970,11 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
+@@ -7239,6 +7245,11 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
*/
static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
{
if (!to_vmx(vcpu)->nested.vmxon) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 0;
-@@ -8283,7 +8294,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
+@@ -7577,7 +7588,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
if (get_vmx_mem_address(vcpu, exit_qualification,
vmx_instruction_info, true, &gva))
return 1;
kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva,
&field_value, (is_long_mode(vcpu) ? 8 : 4), NULL);
}
-@@ -8448,7 +8459,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
+@@ -7720,7 +7731,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
if (get_vmx_mem_address(vcpu, exit_qualification,
vmx_instruction_info, true, &vmcs_gva))
return 1;
if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva,
(void *)&to_vmx(vcpu)->nested.current_vmptr,
sizeof(u64), &e)) {
---
-cgit v1.1
-
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Wolfgang Bumiller <w.bumiller@proxmox.com>
-Date: Fri, 19 Jan 2018 11:12:37 +0100
-Subject: [PATCH] net: sched: em_nbyte: don't add the data offset twice
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-'ptr' is shifted by the offset and then validated,
-the memcmp should not add it a second time.
-
-Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- net/sched/em_nbyte.c | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c
-index df3110d69585..07c10bac06a0 100644
---- a/net/sched/em_nbyte.c
-+++ b/net/sched/em_nbyte.c
-@@ -51,7 +51,7 @@ static int em_nbyte_match(struct sk_buff *skb, struct tcf_ematch *em,
- if (!tcf_valid_offset(skb, ptr, nbyte->hdr.len))
- return 0;
-
-- return !memcmp(ptr + nbyte->hdr.off, nbyte->pattern, nbyte->hdr.len);
-+ return !memcmp(ptr, nbyte->pattern, nbyte->hdr.len);
- }
-
- static struct tcf_ematch_ops em_nbyte_ops = {
---
-2.14.2
-
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Wolfgang Bumiller <w.bumiller@proxmox.com>
+Date: Fri, 19 Jan 2018 11:12:37 +0100
+Subject: [PATCH] net: sched: em_nbyte: don't add the data offset twice
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+'ptr' is shifted by the offset and then validated,
+the memcmp should not add it a second time.
+
+Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
+Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+---
+ net/sched/em_nbyte.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c
+index df3110d69585..07c10bac06a0 100644
+--- a/net/sched/em_nbyte.c
++++ b/net/sched/em_nbyte.c
+@@ -51,7 +51,7 @@ static int em_nbyte_match(struct sk_buff *skb, struct tcf_ematch *em,
+ if (!tcf_valid_offset(skb, ptr, nbyte->hdr.len))
+ return 0;
+
+- return !memcmp(ptr + nbyte->hdr.off, nbyte->pattern, nbyte->hdr.len);
++ return !memcmp(ptr, nbyte->pattern, nbyte->hdr.len);
+ }
+
+ static struct tcf_ematch_ops em_nbyte_ops = {
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Wolfgang Bumiller <w.bumiller@proxmox.com>
-Date: Fri, 19 Jan 2018 11:12:38 +0100
-Subject: [PATCH] net: sched: fix TCF_LAYER_LINK case in tcf_get_base_ptr
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-TCF_LAYER_LINK and TCF_LAYER_NETWORK returned the same pointer as
-skb->data points to the network header.
-Use skb_mac_header instead.
-
-Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- include/net/pkt_cls.h | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
-index 537d0a0ad4c4..4450961b1554 100644
---- a/include/net/pkt_cls.h
-+++ b/include/net/pkt_cls.h
-@@ -395,7 +395,7 @@ static inline unsigned char * tcf_get_base_ptr(struct sk_buff *skb, int layer)
- {
- switch (layer) {
- case TCF_LAYER_LINK:
-- return skb->data;
-+ return skb_mac_header(skb);
- case TCF_LAYER_NETWORK:
- return skb_network_header(skb);
- case TCF_LAYER_TRANSPORT:
---
-2.14.2
-
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Andrew Honig <ahonig@google.com>
-Date: Wed, 10 Jan 2018 10:12:03 -0800
-Subject: [PATCH] KVM: x86: Add memory barrier on vmcs field lookup
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-commit 75f139aaf896d6fdeec2e468ddfa4b2fe469bf40 upstream.
-
-This adds a memory barrier when performing a lookup into
-the vmcs_field_to_offset_table. This is related to
-CVE-2017-5753.
-
-Signed-off-by: Andrew Honig <ahonig@google.com>
-Reviewed-by: Jim Mattson <jmattson@google.com>
-Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- arch/x86/kvm/vmx.c | 12 ++++++++++--
- 1 file changed, 10 insertions(+), 2 deletions(-)
-
-diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
-index 0510bc11beb2..c79de3ac9d49 100644
---- a/arch/x86/kvm/vmx.c
-+++ b/arch/x86/kvm/vmx.c
-@@ -883,8 +883,16 @@ static inline short vmcs_field_to_offset(unsigned long field)
- {
- BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX);
-
-- if (field >= ARRAY_SIZE(vmcs_field_to_offset_table) ||
-- vmcs_field_to_offset_table[field] == 0)
-+ if (field >= ARRAY_SIZE(vmcs_field_to_offset_table))
-+ return -ENOENT;
-+
-+ /*
-+ * FIXME: Mitigation for CVE-2017-5753. To be replaced with a
-+ * generic mechanism.
-+ */
-+ asm("lfence");
-+
-+ if (vmcs_field_to_offset_table[field] == 0)
- return -ENOENT;
-
- return vmcs_field_to_offset_table[field];
---
-2.14.2
-
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Wolfgang Bumiller <w.bumiller@proxmox.com>
+Date: Fri, 19 Jan 2018 11:12:38 +0100
+Subject: [PATCH] net: sched: fix TCF_LAYER_LINK case in tcf_get_base_ptr
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+TCF_LAYER_LINK and TCF_LAYER_NETWORK returned the same pointer as
+skb->data points to the network header.
+Use skb_mac_header instead.
+
+Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
+Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+---
+ include/net/pkt_cls.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
+index 537d0a0ad4c4..4450961b1554 100644
+--- a/include/net/pkt_cls.h
++++ b/include/net/pkt_cls.h
+@@ -395,7 +395,7 @@ static inline unsigned char * tcf_get_base_ptr(struct sk_buff *skb, int layer)
+ {
+ switch (layer) {
+ case TCF_LAYER_LINK:
+- return skb->data;
++ return skb_mac_header(skb);
+ case TCF_LAYER_NETWORK:
+ return skb_network_header(skb);
+ case TCF_LAYER_TRANSPORT:
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: "Gustavo A. R. Silva" <garsilva@embeddedor.com>
-Date: Mon, 16 Oct 2017 12:40:29 -0500
-Subject: [PATCH] EDAC, sb_edac: Fix missing break in switch
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-Add missing break statement in order to prevent the code from falling
-through.
-
-Signed-off-by: Gustavo A. R. Silva <garsilva@embeddedor.com>
-Cc: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
-Cc: linux-edac <linux-edac@vger.kernel.org>
-Link: http://lkml.kernel.org/r/20171016174029.GA19757@embeddedor.com
-Signed-off-by: Borislav Petkov <bp@suse.de>
-(cherry picked from commit a8e9b186f153a44690ad0363a56716e7077ad28c)
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- drivers/edac/sb_edac.c | 1 +
- 1 file changed, 1 insertion(+)
-
-diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c
-index 5c3e707ff3fc..59af590b660c 100644
---- a/drivers/edac/sb_edac.c
-+++ b/drivers/edac/sb_edac.c
-@@ -2454,6 +2454,7 @@ static int ibridge_mci_bind_devs(struct mem_ctl_info *mci,
- case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TA:
- case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TA:
- pvt->pci_ta = pdev;
-+ break;
- case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_RAS:
- case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_RAS:
- pvt->pci_ras = pdev;
---
-2.14.2
-
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Andrew Honig <ahonig@google.com>
+Date: Wed, 10 Jan 2018 10:12:03 -0800
+Subject: [PATCH] KVM: x86: Add memory barrier on vmcs field lookup
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+commit 75f139aaf896d6fdeec2e468ddfa4b2fe469bf40 upstream.
+
+This adds a memory barrier when performing a lookup into
+the vmcs_field_to_offset_table. This is related to
+CVE-2017-5753.
+
+Signed-off-by: Andrew Honig <ahonig@google.com>
+Reviewed-by: Jim Mattson <jmattson@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+---
+ arch/x86/kvm/vmx.c | 12 ++++++++++--
+ 1 file changed, 10 insertions(+), 2 deletions(-)
+
+diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
+index b2d75b59b6e5..a393186d14b1 100644
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -883,8 +883,16 @@ static inline short vmcs_field_to_offset(unsigned long field)
+ {
+ BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX);
+
+- if (field >= ARRAY_SIZE(vmcs_field_to_offset_table) ||
+- vmcs_field_to_offset_table[field] == 0)
++ if (field >= ARRAY_SIZE(vmcs_field_to_offset_table))
++ return -ENOENT;
++
++ /*
++ * FIXME: Mitigation for CVE-2017-5753. To be replaced with a
++ * generic mechanism.
++ */
++ asm("lfence");
++
++ if (vmcs_field_to_offset_table[field] == 0)
+ return -ENOENT;
+
+ return vmcs_field_to_offset_table[field];
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: "Gustavo A. R. Silva" <garsilva@embeddedor.com>
+Date: Mon, 16 Oct 2017 12:40:29 -0500
+Subject: [PATCH] EDAC, sb_edac: Fix missing break in switch
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Add missing break statement in order to prevent the code from falling
+through.
+
+Signed-off-by: Gustavo A. R. Silva <garsilva@embeddedor.com>
+Cc: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
+Cc: linux-edac <linux-edac@vger.kernel.org>
+Link: http://lkml.kernel.org/r/20171016174029.GA19757@embeddedor.com
+Signed-off-by: Borislav Petkov <bp@suse.de>
+(cherry picked from commit a8e9b186f153a44690ad0363a56716e7077ad28c)
+Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+---
+ drivers/edac/sb_edac.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c
+index 5c3e707ff3fc..59af590b660c 100644
+--- a/drivers/edac/sb_edac.c
++++ b/drivers/edac/sb_edac.c
+@@ -2454,6 +2454,7 @@ static int ibridge_mci_bind_devs(struct mem_ctl_info *mci,
+ case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TA:
+ case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TA:
+ pvt->pci_ta = pdev;
++ break;
+ case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_RAS:
+ case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_RAS:
+ pvt->pci_ras = pdev;
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Omar Sandoval <osandov@fb.com>
-Date: Tue, 5 Dec 2017 23:15:31 -0800
-Subject: [PATCH] sched/wait: Fix add_wait_queue() behavioral change
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-The following cleanup commit:
-
- 50816c48997a ("sched/wait: Standardize internal naming of wait-queue entries")
-
-... unintentionally changed the behavior of add_wait_queue() from
-inserting the wait entry at the head of the wait queue to the tail
-of the wait queue.
-
-Beyond a negative performance impact this change in behavior
-theoretically also breaks wait queues which mix exclusive and
-non-exclusive waiters, as non-exclusive waiters will not be
-woken up if they are queued behind enough exclusive waiters.
-
-Signed-off-by: Omar Sandoval <osandov@fb.com>
-Reviewed-by: Jens Axboe <axboe@kernel.dk>
-Acked-by: Peter Zijlstra <peterz@infradead.org>
-Cc: Linus Torvalds <torvalds@linux-foundation.org>
-Cc: Thomas Gleixner <tglx@linutronix.de>
-Cc: kernel-team@fb.com
-Fixes: ("sched/wait: Standardize internal naming of wait-queue entries")
-Link: http://lkml.kernel.org/r/a16c8ccffd39bd08fdaa45a5192294c784b803a7.1512544324.git.osandov@fb.com
-Signed-off-by: Ingo Molnar <mingo@kernel.org>
-(cherry picked from commit c6b9d9a33029014446bd9ed84c1688f6d3d4eab9)
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- kernel/sched/wait.c | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
-index d6afed6d0752..c09ebe92a40a 100644
---- a/kernel/sched/wait.c
-+++ b/kernel/sched/wait.c
-@@ -27,7 +27,7 @@ void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq
-
- wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE;
- spin_lock_irqsave(&wq_head->lock, flags);
-- __add_wait_queue_entry_tail(wq_head, wq_entry);
-+ __add_wait_queue(wq_head, wq_entry);
- spin_unlock_irqrestore(&wq_head->lock, flags);
- }
- EXPORT_SYMBOL(add_wait_queue);
---
-2.14.2
-
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Andi Kleen <ak@linux.intel.com>
-Date: Thu, 25 Jan 2018 15:50:28 -0800
-Subject: [PATCH] module/retpoline: Warn about missing retpoline in module
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-There's a risk that a kernel which has full retpoline mitigations becomes
-vulnerable when a module gets loaded that hasn't been compiled with the
-right compiler or the right option.
-
-To enable detection of that mismatch at module load time, add a module info
-string "retpoline" at build time when the module was compiled with
-retpoline support. This only covers compiled C source, but assembler source
-or prebuilt object files are not checked.
-
-If a retpoline enabled kernel detects a non retpoline protected module at
-load time, print a warning and report it in the sysfs vulnerability file.
-
-[ tglx: Massaged changelog ]
-
-Signed-off-by: Andi Kleen <ak@linux.intel.com>
-Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
-Cc: David Woodhouse <dwmw2@infradead.org>
-Cc: gregkh@linuxfoundation.org
-Cc: torvalds@linux-foundation.org
-Cc: jeyu@kernel.org
-Cc: arjan@linux.intel.com
-Link: https://lkml.kernel.org/r/20180125235028.31211-1-andi@firstfloor.org
-(backported from commit caf7501a1b4ec964190f31f9c3f163de252273b8)
-Conflicts:
- arch/x86/kernel/cpu/bugs.c
-context changes
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- arch/x86/kernel/cpu/bugs.c | 18 +++++++++++++++++-
- include/linux/module.h | 9 +++++++++
- kernel/module.c | 11 +++++++++++
- scripts/mod/modpost.c | 9 +++++++++
- 4 files changed, 46 insertions(+), 1 deletion(-)
-
-diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
-index 2443b9580e94..e0b6aa62b253 100644
---- a/arch/x86/kernel/cpu/bugs.c
-+++ b/arch/x86/kernel/cpu/bugs.c
-@@ -11,6 +11,7 @@
- #include <linux/utsname.h>
- #include <linux/cpu.h>
- #include <linux/smp.h>
-+#include <linux/module.h>
- #include <linux/nospec.h>
- #include <linux/prctl.h>
-
-@@ -130,6 +131,19 @@ static const char *spectre_v2_strings[] = {
-
- static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init =
- SPECTRE_V2_NONE;
-+static bool spectre_v2_bad_module;
-+
-+#ifdef RETPOLINE
-+bool retpoline_module_ok(bool has_retpoline)
-+{
-+ if (spectre_v2_enabled == SPECTRE_V2_NONE || has_retpoline)
-+ return true;
-+
-+ pr_err("System may be vunerable to spectre v2\n");
-+ spectre_v2_bad_module = true;
-+ return false;
-+}
-+#endif
-
- void x86_spec_ctrl_set(u64 val)
- {
-@@ -615,7 +629,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
- return sprintf(buf, "Mitigation: OSB (observable speculation barrier, Intel v6)\n");
-
- case X86_BUG_SPECTRE_V2:
-- return sprintf(buf, "%s%s\n", spectre_v2_strings[spectre_v2_enabled], ibpb_inuse ? ", IBPB (Intel v4)" : "");
-+ return sprintf(buf, "%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
-+ ibpb_inuse ? ",IBPB (Intel v4)" : "",
-+ spectre_v2_bad_module ? " - vulnerable module loaded" : "");
-
- case X86_BUG_SPEC_STORE_BYPASS:
- return sprintf(buf, "%s\n", ssb_strings[ssb_mode]);
-diff --git a/include/linux/module.h b/include/linux/module.h
-index e7bdd549e527..c4fdf7661f82 100644
---- a/include/linux/module.h
-+++ b/include/linux/module.h
-@@ -794,6 +794,15 @@ static inline void module_bug_finalize(const Elf_Ehdr *hdr,
- static inline void module_bug_cleanup(struct module *mod) {}
- #endif /* CONFIG_GENERIC_BUG */
-
-+#ifdef RETPOLINE
-+extern bool retpoline_module_ok(bool has_retpoline);
-+#else
-+static inline bool retpoline_module_ok(bool has_retpoline)
-+{
-+ return true;
-+}
-+#endif
-+
- #ifdef CONFIG_MODULE_SIG
- static inline bool module_sig_ok(struct module *module)
- {
-diff --git a/kernel/module.c b/kernel/module.c
-index e5b878b26906..de7db074f793 100644
---- a/kernel/module.c
-+++ b/kernel/module.c
-@@ -2855,6 +2855,15 @@ static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
- }
- #endif /* CONFIG_LIVEPATCH */
-
-+static void check_modinfo_retpoline(struct module *mod, struct load_info *info)
-+{
-+ if (retpoline_module_ok(get_modinfo(info, "retpoline")))
-+ return;
-+
-+ pr_warn("%s: loading module not compiled with retpoline compiler.\n",
-+ mod->name);
-+}
-+
- /* Sets info->hdr and info->len. */
- static int copy_module_from_user(const void __user *umod, unsigned long len,
- struct load_info *info)
-@@ -3021,6 +3030,8 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
- add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
- }
-
-+ check_modinfo_retpoline(mod, info);
-+
- if (get_modinfo(info, "staging")) {
- add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
- pr_warn("%s: module is from the staging directory, the quality "
-diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
-index 48397feb08fb..cc91f81ac33e 100644
---- a/scripts/mod/modpost.c
-+++ b/scripts/mod/modpost.c
-@@ -2147,6 +2147,14 @@ static void add_intree_flag(struct buffer *b, int is_intree)
- buf_printf(b, "\nMODULE_INFO(intree, \"Y\");\n");
- }
-
-+/* Cannot check for assembler */
-+static void add_retpoline(struct buffer *b)
-+{
-+ buf_printf(b, "\n#ifdef RETPOLINE\n");
-+ buf_printf(b, "MODULE_INFO(retpoline, \"Y\");\n");
-+ buf_printf(b, "#endif\n");
-+}
-+
- static void add_staging_flag(struct buffer *b, const char *name)
- {
- static const char *staging_dir = "drivers/staging";
-@@ -2492,6 +2500,7 @@ int main(int argc, char **argv)
-
- add_header(&buf, mod);
- add_intree_flag(&buf, !external_module);
-+ add_retpoline(&buf);
- add_staging_flag(&buf, mod->name);
- err |= add_versions(&buf, mod);
- add_depends(&buf, mod, modules);
---
-2.14.2
-
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Omar Sandoval <osandov@fb.com>
+Date: Tue, 5 Dec 2017 23:15:31 -0800
+Subject: [PATCH] sched/wait: Fix add_wait_queue() behavioral change
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+The following cleanup commit:
+
+ 50816c48997a ("sched/wait: Standardize internal naming of wait-queue entries")
+
+... unintentionally changed the behavior of add_wait_queue() from
+inserting the wait entry at the head of the wait queue to the tail
+of the wait queue.
+
+Beyond a negative performance impact this change in behavior
+theoretically also breaks wait queues which mix exclusive and
+non-exclusive waiters, as non-exclusive waiters will not be
+woken up if they are queued behind enough exclusive waiters.
+
+Signed-off-by: Omar Sandoval <osandov@fb.com>
+Reviewed-by: Jens Axboe <axboe@kernel.dk>
+Acked-by: Peter Zijlstra <peterz@infradead.org>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: kernel-team@fb.com
+Fixes: 50816c48997a ("sched/wait: Standardize internal naming of wait-queue entries")
+Link: http://lkml.kernel.org/r/a16c8ccffd39bd08fdaa45a5192294c784b803a7.1512544324.git.osandov@fb.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+(cherry picked from commit c6b9d9a33029014446bd9ed84c1688f6d3d4eab9)
+Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+---
+ kernel/sched/wait.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
+index d6afed6d0752..c09ebe92a40a 100644
+--- a/kernel/sched/wait.c
++++ b/kernel/sched/wait.c
+@@ -27,7 +27,7 @@ void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq
+
+ wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE;
+ spin_lock_irqsave(&wq_head->lock, flags);
+- __add_wait_queue_entry_tail(wq_head, wq_entry);
++ __add_wait_queue(wq_head, wq_entry);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
+ }
+ EXPORT_SYMBOL(add_wait_queue);
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Andi Kleen <ak@linux.intel.com>
+Date: Thu, 25 Jan 2018 15:50:28 -0800
+Subject: [PATCH] module/retpoline: Warn about missing retpoline in module
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+There's a risk that a kernel which has full retpoline mitigations becomes
+vulnerable when a module gets loaded that hasn't been compiled with the
+right compiler or the right option.
+
+To enable detection of that mismatch at module load time, add a module info
+string "retpoline" at build time when the module was compiled with
+retpoline support. This only covers compiled C source, but assembler source
+or prebuilt object files are not checked.
+
+If a retpoline enabled kernel detects a non retpoline protected module at
+load time, print a warning and report it in the sysfs vulnerability file.
+
+[ tglx: Massaged changelog ]
+
+Signed-off-by: Andi Kleen <ak@linux.intel.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: David Woodhouse <dwmw2@infradead.org>
+Cc: gregkh@linuxfoundation.org
+Cc: torvalds@linux-foundation.org
+Cc: jeyu@kernel.org
+Cc: arjan@linux.intel.com
+Link: https://lkml.kernel.org/r/20180125235028.31211-1-andi@firstfloor.org
+(backported from commit caf7501a1b4ec964190f31f9c3f163de252273b8)
+Conflicts:
+ arch/x86/kernel/cpu/bugs.c
+context changes
+Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+---
+ arch/x86/kernel/cpu/bugs.c | 18 +++++++++++++++++-
+ include/linux/module.h | 9 +++++++++
+ kernel/module.c | 11 +++++++++++
+ scripts/mod/modpost.c | 9 +++++++++
+ 4 files changed, 46 insertions(+), 1 deletion(-)
+
+diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
+index 7e5db5aa37f3..b5bcdf7e94d7 100644
+--- a/arch/x86/kernel/cpu/bugs.c
++++ b/arch/x86/kernel/cpu/bugs.c
+@@ -11,6 +11,7 @@
+ #include <linux/utsname.h>
+ #include <linux/cpu.h>
+ #include <linux/smp.h>
++#include <linux/module.h>
+ #include <linux/nospec.h>
+ #include <linux/prctl.h>
+
+@@ -131,6 +132,19 @@ static const char *spectre_v2_strings[] = {
+
+ static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init =
+ SPECTRE_V2_NONE;
++static bool spectre_v2_bad_module;
++
++#ifdef RETPOLINE
++bool retpoline_module_ok(bool has_retpoline)
++{
++ if (spectre_v2_enabled == SPECTRE_V2_NONE || has_retpoline)
++ return true;
++
++ pr_err("System may be vunerable to spectre v2\n");
++ spectre_v2_bad_module = true;
++ return false;
++}
++#endif
+
+ void
+ x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
+@@ -627,7 +641,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
+ return sprintf(buf, "Mitigation: OSB (observable speculation barrier, Intel v6)\n");
+
+ case X86_BUG_SPECTRE_V2:
+- return sprintf(buf, "%s%s\n", spectre_v2_strings[spectre_v2_enabled], ibpb_inuse ? ", IBPB (Intel v4)" : "");
++ return sprintf(buf, "%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
++ ibpb_inuse ? ",IBPB (Intel v4)" : "",
++ spectre_v2_bad_module ? " - vulnerable module loaded" : "");
+
+ case X86_BUG_SPEC_STORE_BYPASS:
+ return sprintf(buf, "%s\n", ssb_strings[ssb_mode]);
+diff --git a/include/linux/module.h b/include/linux/module.h
+index e7bdd549e527..c4fdf7661f82 100644
+--- a/include/linux/module.h
++++ b/include/linux/module.h
+@@ -794,6 +794,15 @@ static inline void module_bug_finalize(const Elf_Ehdr *hdr,
+ static inline void module_bug_cleanup(struct module *mod) {}
+ #endif /* CONFIG_GENERIC_BUG */
+
++#ifdef RETPOLINE
++extern bool retpoline_module_ok(bool has_retpoline);
++#else
++static inline bool retpoline_module_ok(bool has_retpoline)
++{
++ return true;
++}
++#endif
++
+ #ifdef CONFIG_MODULE_SIG
+ static inline bool module_sig_ok(struct module *module)
+ {
+diff --git a/kernel/module.c b/kernel/module.c
+index 41b97a191a72..1c3fd6f767b4 100644
+--- a/kernel/module.c
++++ b/kernel/module.c
+@@ -2855,6 +2855,15 @@ static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
+ }
+ #endif /* CONFIG_LIVEPATCH */
+
++static void check_modinfo_retpoline(struct module *mod, struct load_info *info)
++{
++ if (retpoline_module_ok(get_modinfo(info, "retpoline")))
++ return;
++
++ pr_warn("%s: loading module not compiled with retpoline compiler.\n",
++ mod->name);
++}
++
+ /* Sets info->hdr and info->len. */
+ static int copy_module_from_user(const void __user *umod, unsigned long len,
+ struct load_info *info)
+@@ -3021,6 +3030,8 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
+ add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
+ }
+
++ check_modinfo_retpoline(mod, info);
++
+ if (get_modinfo(info, "staging")) {
+ add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
+ pr_warn("%s: module is from the staging directory, the quality "
+diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
+index 48397feb08fb..cc91f81ac33e 100644
+--- a/scripts/mod/modpost.c
++++ b/scripts/mod/modpost.c
+@@ -2147,6 +2147,14 @@ static void add_intree_flag(struct buffer *b, int is_intree)
+ buf_printf(b, "\nMODULE_INFO(intree, \"Y\");\n");
+ }
+
++/* Cannot check for assembler */
++static void add_retpoline(struct buffer *b)
++{
++ buf_printf(b, "\n#ifdef RETPOLINE\n");
++ buf_printf(b, "MODULE_INFO(retpoline, \"Y\");\n");
++ buf_printf(b, "#endif\n");
++}
++
+ static void add_staging_flag(struct buffer *b, const char *name)
+ {
+ static const char *staging_dir = "drivers/staging";
+@@ -2492,6 +2500,7 @@ int main(int argc, char **argv)
+
+ add_header(&buf, mod);
+ add_intree_flag(&buf, !external_module);
++ add_retpoline(&buf);
+ add_staging_flag(&buf, mod->name);
+ err |= add_versions(&buf, mod);
+ add_depends(&buf, mod, modules);
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Dan Streetman <ddstreet@ieee.org>
-Date: Thu, 18 Jan 2018 16:14:26 -0500
-Subject: [PATCH] net: tcp: close sock if net namespace is exiting
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-When a tcp socket is closed, if it detects that its net namespace is
-exiting, close immediately and do not wait for FIN sequence.
-
-For normal sockets, a reference is taken to their net namespace, so it will
-never exit while the socket is open. However, kernel sockets do not take a
-reference to their net namespace, so it may begin exiting while the kernel
-socket is still open. In this case if the kernel socket is a tcp socket,
-it will stay open trying to complete its close sequence. The sock's dst(s)
-hold a reference to their interface, which are all transferred to the
-namespace's loopback interface when the real interfaces are taken down.
-When the namespace tries to take down its loopback interface, it hangs
-waiting for all references to the loopback interface to release, which
-results in messages like:
-
-unregister_netdevice: waiting for lo to become free. Usage count = 1
-
-These messages continue until the socket finally times out and closes.
-Since the net namespace cleanup holds the net_mutex while calling its
-registered pernet callbacks, any new net namespace initialization is
-blocked until the current net namespace finishes exiting.
-
-After this change, the tcp socket notices the exiting net namespace, and
-closes immediately, releasing its dst(s) and their reference to the
-loopback interface, which lets the net namespace continue exiting.
-
-Link: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1711407
-Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=97811
-Signed-off-by: Dan Streetman <ddstreet@canonical.com>
-Signed-off-by: David S. Miller <davem@davemloft.net>
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- include/net/net_namespace.h | 10 ++++++++++
- net/ipv4/tcp.c | 3 +++
- net/ipv4/tcp_timer.c | 15 +++++++++++++++
- 3 files changed, 28 insertions(+)
-
-diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
-index 1c401bd4c2e0..a5d023fa78db 100644
---- a/include/net/net_namespace.h
-+++ b/include/net/net_namespace.h
-@@ -221,6 +221,11 @@ int net_eq(const struct net *net1, const struct net *net2)
- return net1 == net2;
- }
-
-+static inline int check_net(const struct net *net)
-+{
-+ return atomic_read(&net->count) != 0;
-+}
-+
- void net_drop_ns(void *);
-
- #else
-@@ -245,6 +250,11 @@ int net_eq(const struct net *net1, const struct net *net2)
- return 1;
- }
-
-+static inline int check_net(const struct net *net)
-+{
-+ return 1;
-+}
-+
- #define net_drop_ns NULL
- #endif
-
-diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
-index a3e91b552edc..fd2a086da910 100644
---- a/net/ipv4/tcp.c
-+++ b/net/ipv4/tcp.c
-@@ -2258,6 +2258,9 @@ void tcp_close(struct sock *sk, long timeout)
- tcp_send_active_reset(sk, GFP_ATOMIC);
- __NET_INC_STATS(sock_net(sk),
- LINUX_MIB_TCPABORTONMEMORY);
-+ } else if (!check_net(sock_net(sk))) {
-+ /* Not possible to send reset; just close */
-+ tcp_set_state(sk, TCP_CLOSE);
- }
- }
-
-diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
-index e906014890b6..ec1e5de41653 100644
---- a/net/ipv4/tcp_timer.c
-+++ b/net/ipv4/tcp_timer.c
-@@ -50,11 +50,19 @@ static void tcp_write_err(struct sock *sk)
- * to prevent DoS attacks. It is called when a retransmission timeout
- * or zero probe timeout occurs on orphaned socket.
- *
-+ * Also close if our net namespace is exiting; in that case there is no
-+ * hope of ever communicating again since all netns interfaces are already
-+ * down (or about to be down), and we need to release our dst references,
-+ * which have been moved to the netns loopback interface, so the namespace
-+ * can finish exiting. This condition is only possible if we are a kernel
-+ * socket, as those do not hold references to the namespace.
-+ *
- * Criteria is still not confirmed experimentally and may change.
- * We kill the socket, if:
- * 1. If number of orphaned sockets exceeds an administratively configured
- * limit.
- * 2. If we have strong memory pressure.
-+ * 3. If our net namespace is exiting.
- */
- static int tcp_out_of_resources(struct sock *sk, bool do_reset)
- {
-@@ -83,6 +91,13 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset)
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY);
- return 1;
- }
-+
-+ if (!check_net(sock_net(sk))) {
-+ /* Not possible to send reset; just close */
-+ tcp_done(sk);
-+ return 1;
-+ }
-+
- return 0;
- }
-
---
-2.14.2
-
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Dan Streetman <ddstreet@ieee.org>
+Date: Thu, 18 Jan 2018 16:14:26 -0500
+Subject: [PATCH] net: tcp: close sock if net namespace is exiting
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+When a tcp socket is closed, if it detects that its net namespace is
+exiting, close immediately and do not wait for FIN sequence.
+
+For normal sockets, a reference is taken to their net namespace, so it will
+never exit while the socket is open. However, kernel sockets do not take a
+reference to their net namespace, so it may begin exiting while the kernel
+socket is still open. In this case if the kernel socket is a tcp socket,
+it will stay open trying to complete its close sequence. The sock's dst(s)
+hold a reference to their interface, which are all transferred to the
+namespace's loopback interface when the real interfaces are taken down.
+When the namespace tries to take down its loopback interface, it hangs
+waiting for all references to the loopback interface to release, which
+results in messages like:
+
+unregister_netdevice: waiting for lo to become free. Usage count = 1
+
+These messages continue until the socket finally times out and closes.
+Since the net namespace cleanup holds the net_mutex while calling its
+registered pernet callbacks, any new net namespace initialization is
+blocked until the current net namespace finishes exiting.
+
+After this change, the tcp socket notices the exiting net namespace, and
+closes immediately, releasing its dst(s) and their reference to the
+loopback interface, which lets the net namespace continue exiting.
+
+Link: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1711407
+Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=97811
+Signed-off-by: Dan Streetman <ddstreet@canonical.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+---
+ include/net/net_namespace.h | 10 ++++++++++
+ net/ipv4/tcp.c | 3 +++
+ net/ipv4/tcp_timer.c | 15 +++++++++++++++
+ 3 files changed, 28 insertions(+)
+
+diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
+index 1c401bd4c2e0..a5d023fa78db 100644
+--- a/include/net/net_namespace.h
++++ b/include/net/net_namespace.h
+@@ -221,6 +221,11 @@ int net_eq(const struct net *net1, const struct net *net2)
+ return net1 == net2;
+ }
+
++static inline int check_net(const struct net *net)
++{
++ return atomic_read(&net->count) != 0;
++}
++
+ void net_drop_ns(void *);
+
+ #else
+@@ -245,6 +250,11 @@ int net_eq(const struct net *net1, const struct net *net2)
+ return 1;
+ }
+
++static inline int check_net(const struct net *net)
++{
++ return 1;
++}
++
+ #define net_drop_ns NULL
+ #endif
+
+diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
+index a3e91b552edc..fd2a086da910 100644
+--- a/net/ipv4/tcp.c
++++ b/net/ipv4/tcp.c
+@@ -2258,6 +2258,9 @@ void tcp_close(struct sock *sk, long timeout)
+ tcp_send_active_reset(sk, GFP_ATOMIC);
+ __NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPABORTONMEMORY);
++ } else if (!check_net(sock_net(sk))) {
++ /* Not possible to send reset; just close */
++ tcp_set_state(sk, TCP_CLOSE);
+ }
+ }
+
+diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
+index e906014890b6..ec1e5de41653 100644
+--- a/net/ipv4/tcp_timer.c
++++ b/net/ipv4/tcp_timer.c
+@@ -50,11 +50,19 @@ static void tcp_write_err(struct sock *sk)
+ * to prevent DoS attacks. It is called when a retransmission timeout
+ * or zero probe timeout occurs on orphaned socket.
+ *
++ * Also close if our net namespace is exiting; in that case there is no
++ * hope of ever communicating again since all netns interfaces are already
++ * down (or about to be down), and we need to release our dst references,
++ * which have been moved to the netns loopback interface, so the namespace
++ * can finish exiting. This condition is only possible if we are a kernel
++ * socket, as those do not hold references to the namespace.
++ *
+ * Criteria is still not confirmed experimentally and may change.
+ * We kill the socket, if:
+ * 1. If number of orphaned sockets exceeds an administratively configured
+ * limit.
+ * 2. If we have strong memory pressure.
++ * 3. If our net namespace is exiting.
+ */
+ static int tcp_out_of_resources(struct sock *sk, bool do_reset)
+ {
+@@ -83,6 +91,13 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset)
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY);
+ return 1;
+ }
++
++ if (!check_net(sock_net(sk))) {
++ /* Not possible to send reset; just close */
++ tcp_done(sk);
++ return 1;
++ }
++
+ return 0;
+ }
+
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Tommi Rantala <tommi.t.rantala@nokia.com>
-Date: Mon, 5 Feb 2018 21:48:14 +0200
-Subject: [PATCH] sctp: fix dst refcnt leak in sctp_v4_get_dst
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-Fix dst reference count leak in sctp_v4_get_dst() introduced in commit
-410f03831 ("sctp: add routing output fallback"):
-
-When walking the address_list, successive ip_route_output_key() calls
-may return the same rt->dst with the reference incremented on each call.
-
-The code would not decrement the dst refcount when the dst pointer was
-identical from the previous iteration, causing the dst refcnt leak.
-
-Testcase:
- ip netns add TEST
- ip netns exec TEST ip link set lo up
- ip link add dummy0 type dummy
- ip link add dummy1 type dummy
- ip link add dummy2 type dummy
- ip link set dev dummy0 netns TEST
- ip link set dev dummy1 netns TEST
- ip link set dev dummy2 netns TEST
- ip netns exec TEST ip addr add 192.168.1.1/24 dev dummy0
- ip netns exec TEST ip link set dummy0 up
- ip netns exec TEST ip addr add 192.168.1.2/24 dev dummy1
- ip netns exec TEST ip link set dummy1 up
- ip netns exec TEST ip addr add 192.168.1.3/24 dev dummy2
- ip netns exec TEST ip link set dummy2 up
- ip netns exec TEST sctp_test -H 192.168.1.2 -P 20002 -h 192.168.1.1 -p 20000 -s -B 192.168.1.3
- ip netns del TEST
-
-In 4.4 and 4.9 kernels this results to:
- [ 354.179591] unregister_netdevice: waiting for lo to become free. Usage count = 1
- [ 364.419674] unregister_netdevice: waiting for lo to become free. Usage count = 1
- [ 374.663664] unregister_netdevice: waiting for lo to become free. Usage count = 1
- [ 384.903717] unregister_netdevice: waiting for lo to become free. Usage count = 1
- [ 395.143724] unregister_netdevice: waiting for lo to become free. Usage count = 1
- [ 405.383645] unregister_netdevice: waiting for lo to become free. Usage count = 1
- ...
-
-Fixes: 410f03831 ("sctp: add routing output fallback")
-Fixes: 0ca50d12f ("sctp: fix src address selection if using secondary addresses")
-Signed-off-by: Tommi Rantala <tommi.t.rantala@nokia.com>
-Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
-Acked-by: Neil Horman <nhorman@tuxdriver.com>
-Signed-off-by: David S. Miller <davem@davemloft.net>
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- net/sctp/protocol.c | 10 ++++------
- 1 file changed, 4 insertions(+), 6 deletions(-)
-
-diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
-index 989a900383b5..e1a3ae4f3cab 100644
---- a/net/sctp/protocol.c
-+++ b/net/sctp/protocol.c
-@@ -514,22 +514,20 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
- if (IS_ERR(rt))
- continue;
-
-- if (!dst)
-- dst = &rt->dst;
--
- /* Ensure the src address belongs to the output
- * interface.
- */
- odev = __ip_dev_find(sock_net(sk), laddr->a.v4.sin_addr.s_addr,
- false);
- if (!odev || odev->ifindex != fl4->flowi4_oif) {
-- if (&rt->dst != dst)
-+ if (!dst)
-+ dst = &rt->dst;
-+ else
- dst_release(&rt->dst);
- continue;
- }
-
-- if (dst != &rt->dst)
-- dst_release(dst);
-+ dst_release(dst);
- dst = &rt->dst;
- break;
- }
---
-2.14.2
-
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Tommi Rantala <tommi.t.rantala@nokia.com>
+Date: Mon, 5 Feb 2018 21:48:14 +0200
+Subject: [PATCH] sctp: fix dst refcnt leak in sctp_v4_get_dst
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Fix dst reference count leak in sctp_v4_get_dst() introduced in commit
+410f03831 ("sctp: add routing output fallback"):
+
+When walking the address_list, successive ip_route_output_key() calls
+may return the same rt->dst with the reference incremented on each call.
+
+The code would not decrement the dst refcount when the dst pointer was
+identical from the previous iteration, causing the dst refcnt leak.
+
+Testcase:
+ ip netns add TEST
+ ip netns exec TEST ip link set lo up
+ ip link add dummy0 type dummy
+ ip link add dummy1 type dummy
+ ip link add dummy2 type dummy
+ ip link set dev dummy0 netns TEST
+ ip link set dev dummy1 netns TEST
+ ip link set dev dummy2 netns TEST
+ ip netns exec TEST ip addr add 192.168.1.1/24 dev dummy0
+ ip netns exec TEST ip link set dummy0 up
+ ip netns exec TEST ip addr add 192.168.1.2/24 dev dummy1
+ ip netns exec TEST ip link set dummy1 up
+ ip netns exec TEST ip addr add 192.168.1.3/24 dev dummy2
+ ip netns exec TEST ip link set dummy2 up
+ ip netns exec TEST sctp_test -H 192.168.1.2 -P 20002 -h 192.168.1.1 -p 20000 -s -B 192.168.1.3
+ ip netns del TEST
+
+In 4.4 and 4.9 kernels this results in:
+ [ 354.179591] unregister_netdevice: waiting for lo to become free. Usage count = 1
+ [ 364.419674] unregister_netdevice: waiting for lo to become free. Usage count = 1
+ [ 374.663664] unregister_netdevice: waiting for lo to become free. Usage count = 1
+ [ 384.903717] unregister_netdevice: waiting for lo to become free. Usage count = 1
+ [ 395.143724] unregister_netdevice: waiting for lo to become free. Usage count = 1
+ [ 405.383645] unregister_netdevice: waiting for lo to become free. Usage count = 1
+ ...
+
+Fixes: 410f03831 ("sctp: add routing output fallback")
+Fixes: 0ca50d12f ("sctp: fix src address selection if using secondary addresses")
+Signed-off-by: Tommi Rantala <tommi.t.rantala@nokia.com>
+Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+Acked-by: Neil Horman <nhorman@tuxdriver.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+---
+ net/sctp/protocol.c | 10 ++++------
+ 1 file changed, 4 insertions(+), 6 deletions(-)
+
+diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
+index 989a900383b5..e1a3ae4f3cab 100644
+--- a/net/sctp/protocol.c
++++ b/net/sctp/protocol.c
+@@ -514,22 +514,20 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
+ if (IS_ERR(rt))
+ continue;
+
+- if (!dst)
+- dst = &rt->dst;
+-
+ /* Ensure the src address belongs to the output
+ * interface.
+ */
+ odev = __ip_dev_find(sock_net(sk), laddr->a.v4.sin_addr.s_addr,
+ false);
+ if (!odev || odev->ifindex != fl4->flowi4_oif) {
+- if (&rt->dst != dst)
++ if (!dst)
++ dst = &rt->dst;
++ else
+ dst_release(&rt->dst);
+ continue;
+ }
+
+- if (dst != &rt->dst)
+- dst_release(dst);
++ dst_release(dst);
+ dst = &rt->dst;
+ break;
+ }
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Alexey Kodanev <alexey.kodanev@oracle.com>
-Date: Mon, 5 Feb 2018 15:10:35 +0300
-Subject: [PATCH] sctp: fix dst refcnt leak in sctp_v6_get_dst()
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-When going through the bind address list in sctp_v6_get_dst() and
-the previously found address is better ('matchlen > bmatchlen'),
-the code continues to the next iteration without releasing currently
-held destination.
-
-Fix it by releasing 'bdst' before continue to the next iteration, and
-instead of introducing one more '!IS_ERR(bdst)' check for dst_release(),
-move the already existed one right after ip6_dst_lookup_flow(), i.e. we
-shouldn't proceed further if we get an error for the route lookup.
-
-Fixes: dbc2b5e9a09e ("sctp: fix src address selection if using secondary addresses for ipv6")
-Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
-Acked-by: Neil Horman <nhorman@tuxdriver.com>
-Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
-Signed-off-by: David S. Miller <davem@davemloft.net>
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- net/sctp/ipv6.c | 10 +++++++---
- 1 file changed, 7 insertions(+), 3 deletions(-)
-
-diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
-index edb462b0b73b..e626d72868fe 100644
---- a/net/sctp/ipv6.c
-+++ b/net/sctp/ipv6.c
-@@ -326,8 +326,10 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
- final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final);
- bdst = ip6_dst_lookup_flow(sk, fl6, final_p);
-
-- if (!IS_ERR(bdst) &&
-- ipv6_chk_addr(dev_net(bdst->dev),
-+ if (IS_ERR(bdst))
-+ continue;
-+
-+ if (ipv6_chk_addr(dev_net(bdst->dev),
- &laddr->a.v6.sin6_addr, bdst->dev, 1)) {
- if (!IS_ERR_OR_NULL(dst))
- dst_release(dst);
-@@ -336,8 +338,10 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
- }
-
- bmatchlen = sctp_v6_addr_match_len(daddr, &laddr->a);
-- if (matchlen > bmatchlen)
-+ if (matchlen > bmatchlen) {
-+ dst_release(bdst);
- continue;
-+ }
-
- if (!IS_ERR_OR_NULL(dst))
- dst_release(dst);
---
-2.14.2
-
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Vasily Averin <vvs@virtuozzo.com>
-Date: Thu, 2 Nov 2017 13:03:42 +0300
-Subject: [PATCH] lockd: lost rollback of set_grace_period() in
- lockd_down_net()
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-Commit efda760fe95ea ("lockd: fix lockd shutdown race") is incorrect,
-it removes lockd_manager and disarm grace_period_end for init_net only.
-
-If nfsd was started from another net namespace lockd_up_net() calls
-set_grace_period() that adds lockd_manager into per-netns list
-and queues grace_period_end delayed work.
-
-These action should be reverted in lockd_down_net().
-Otherwise it can lead to double list_add on after restart nfsd in netns,
-and to use-after-free if non-disarmed delayed work will be executed after netns destroy.
-
-Fixes: efda760fe95e ("lockd: fix lockd shutdown race")
-Cc: stable@vger.kernel.org
-Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
-Signed-off-by: J. Bruce Fields <bfields@redhat.com>
-(cherry picked from commit 3a2b19d1ee5633f76ae8a88da7bc039a5d1732aa)
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- fs/lockd/svc.c | 2 ++
- 1 file changed, 2 insertions(+)
-
-diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
-index 726b6cecf430..fa8f6effcf00 100644
---- a/fs/lockd/svc.c
-+++ b/fs/lockd/svc.c
-@@ -274,6 +274,8 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net)
- if (ln->nlmsvc_users) {
- if (--ln->nlmsvc_users == 0) {
- nlm_shutdown_hosts_net(net);
-+ cancel_delayed_work_sync(&ln->grace_period_end);
-+ locks_end_grace(&ln->lockd_manager);
- svc_shutdown_net(serv, net);
- dprintk("lockd_down_net: per-net data destroyed; net=%p\n", net);
- }
---
-2.14.2
-
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Alexey Kodanev <alexey.kodanev@oracle.com>
+Date: Mon, 5 Feb 2018 15:10:35 +0300
+Subject: [PATCH] sctp: fix dst refcnt leak in sctp_v6_get_dst()
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+When going through the bind address list in sctp_v6_get_dst() and
+the previously found address is better ('matchlen > bmatchlen'),
+the code continues to the next iteration without releasing the currently
+held destination.
+
+Fix it by releasing 'bdst' before continuing to the next iteration, and
+instead of introducing one more '!IS_ERR(bdst)' check for dst_release(),
+move the already existing one right after ip6_dst_lookup_flow(), i.e. we
+shouldn't proceed further if we get an error for the route lookup.
+
+Fixes: dbc2b5e9a09e ("sctp: fix src address selection if using secondary addresses for ipv6")
+Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
+Acked-by: Neil Horman <nhorman@tuxdriver.com>
+Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+---
+ net/sctp/ipv6.c | 10 +++++++---
+ 1 file changed, 7 insertions(+), 3 deletions(-)
+
+diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
+index edb462b0b73b..e626d72868fe 100644
+--- a/net/sctp/ipv6.c
++++ b/net/sctp/ipv6.c
+@@ -326,8 +326,10 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
+ final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final);
+ bdst = ip6_dst_lookup_flow(sk, fl6, final_p);
+
+- if (!IS_ERR(bdst) &&
+- ipv6_chk_addr(dev_net(bdst->dev),
++ if (IS_ERR(bdst))
++ continue;
++
++ if (ipv6_chk_addr(dev_net(bdst->dev),
+ &laddr->a.v6.sin6_addr, bdst->dev, 1)) {
+ if (!IS_ERR_OR_NULL(dst))
+ dst_release(dst);
+@@ -336,8 +338,10 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
+ }
+
+ bmatchlen = sctp_v6_addr_match_len(daddr, &laddr->a);
+- if (matchlen > bmatchlen)
++ if (matchlen > bmatchlen) {
++ dst_release(bdst);
+ continue;
++ }
+
+ if (!IS_ERR_OR_NULL(dst))
+ dst_release(dst);
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Vasily Averin <vvs@virtuozzo.com>
+Date: Thu, 2 Nov 2017 13:03:42 +0300
+Subject: [PATCH] lockd: lost rollback of set_grace_period() in
+ lockd_down_net()
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Commit efda760fe95ea ("lockd: fix lockd shutdown race") is incorrect:
+it removes lockd_manager and disarms grace_period_end for init_net only.
+
+If nfsd was started from another net namespace lockd_up_net() calls
+set_grace_period() that adds lockd_manager into per-netns list
+and queues grace_period_end delayed work.
+
+These actions should be reverted in lockd_down_net().
+Otherwise it can lead to a double list_add after nfsd is restarted in the netns,
+and to a use-after-free if the non-disarmed delayed work is executed after netns destroy.
+
+Fixes: efda760fe95e ("lockd: fix lockd shutdown race")
+Cc: stable@vger.kernel.org
+Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
+Signed-off-by: J. Bruce Fields <bfields@redhat.com>
+(cherry picked from commit 3a2b19d1ee5633f76ae8a88da7bc039a5d1732aa)
+Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+---
+ fs/lockd/svc.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
+index 726b6cecf430..fa8f6effcf00 100644
+--- a/fs/lockd/svc.c
++++ b/fs/lockd/svc.c
+@@ -274,6 +274,8 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net)
+ if (ln->nlmsvc_users) {
+ if (--ln->nlmsvc_users == 0) {
+ nlm_shutdown_hosts_net(net);
++ cancel_delayed_work_sync(&ln->grace_period_end);
++ locks_end_grace(&ln->lockd_manager);
+ svc_shutdown_net(serv, net);
+ dprintk("lockd_down_net: per-net data destroyed; net=%p\n", net);
+ }
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Changwei Ge <ge.changwei@h3c.com>
-Date: Wed, 31 Jan 2018 16:15:02 -0800
-Subject: [PATCH] ocfs2: make metadata estimation accurate and clear
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-Current code assume that ::w_unwritten_list always has only one item on.
-This is not right and hard to get understood. So improve how to count
-unwritten item.
-
-Link: http://lkml.kernel.org/r/1515479070-32653-1-git-send-email-ge.changwei@h3c.com
-Signed-off-by: Changwei Ge <ge.changwei@h3c.com>
-Reported-by: John Lightsey <john@nixnuts.net>
-Tested-by: John Lightsey <john@nixnuts.net>
-Cc: Mark Fasheh <mfasheh@versity.com>
-Cc: Joseph Qi <jiangqi903@gmail.com>
-Cc: Junxiao Bi <junxiao.bi@oracle.com>
-Cc: Joel Becker <jlbec@evilplan.org>
-Cc: Changwei Ge <ge.changwei@h3c.com>
-Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
-Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-(cherry picked from commit 63de8bd9328bf2a778fc277503da163ae3defa3c)
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- fs/ocfs2/aops.c | 4 +++-
- 1 file changed, 3 insertions(+), 1 deletion(-)
-
-diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
-index 88a31e9340a0..77ec9b495027 100644
---- a/fs/ocfs2/aops.c
-+++ b/fs/ocfs2/aops.c
-@@ -784,6 +784,7 @@ struct ocfs2_write_ctxt {
- struct ocfs2_cached_dealloc_ctxt w_dealloc;
-
- struct list_head w_unwritten_list;
-+ unsigned int w_unwritten_count;
- };
-
- void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
-@@ -1373,6 +1374,7 @@ static int ocfs2_unwritten_check(struct inode *inode,
- desc->c_clear_unwritten = 0;
- list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list);
- list_add_tail(&new->ue_node, &wc->w_unwritten_list);
-+ wc->w_unwritten_count++;
- new = NULL;
- unlock:
- spin_unlock(&oi->ip_lock);
-@@ -2246,7 +2248,7 @@ static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock,
- ue->ue_phys = desc->c_phys;
-
- list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list);
-- dwc->dw_zero_count++;
-+ dwc->dw_zero_count += wc->w_unwritten_count;
- }
-
- ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, wc);
---
-2.14.2
-
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Changwei Ge <ge.changwei@h3c.com>
+Date: Wed, 31 Jan 2018 16:15:02 -0800
+Subject: [PATCH] ocfs2: make metadata estimation accurate and clear
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+The current code assumes that ::w_unwritten_list always contains exactly one
+item. This is not correct and is hard to understand, so improve how the
+unwritten items are counted.
+
+Link: http://lkml.kernel.org/r/1515479070-32653-1-git-send-email-ge.changwei@h3c.com
+Signed-off-by: Changwei Ge <ge.changwei@h3c.com>
+Reported-by: John Lightsey <john@nixnuts.net>
+Tested-by: John Lightsey <john@nixnuts.net>
+Cc: Mark Fasheh <mfasheh@versity.com>
+Cc: Joseph Qi <jiangqi903@gmail.com>
+Cc: Junxiao Bi <junxiao.bi@oracle.com>
+Cc: Joel Becker <jlbec@evilplan.org>
+Cc: Changwei Ge <ge.changwei@h3c.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+(cherry picked from commit 63de8bd9328bf2a778fc277503da163ae3defa3c)
+Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+---
+ fs/ocfs2/aops.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
+index 88a31e9340a0..77ec9b495027 100644
+--- a/fs/ocfs2/aops.c
++++ b/fs/ocfs2/aops.c
+@@ -784,6 +784,7 @@ struct ocfs2_write_ctxt {
+ struct ocfs2_cached_dealloc_ctxt w_dealloc;
+
+ struct list_head w_unwritten_list;
++ unsigned int w_unwritten_count;
+ };
+
+ void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
+@@ -1373,6 +1374,7 @@ static int ocfs2_unwritten_check(struct inode *inode,
+ desc->c_clear_unwritten = 0;
+ list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list);
+ list_add_tail(&new->ue_node, &wc->w_unwritten_list);
++ wc->w_unwritten_count++;
+ new = NULL;
+ unlock:
+ spin_unlock(&oi->ip_lock);
+@@ -2246,7 +2248,7 @@ static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock,
+ ue->ue_phys = desc->c_phys;
+
+ list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list);
+- dwc->dw_zero_count++;
++ dwc->dw_zero_count += wc->w_unwritten_count;
+ }
+
+ ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, wc);
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Changwei Ge <ge.changwei@h3c.com>
-Date: Wed, 31 Jan 2018 16:15:06 -0800
-Subject: [PATCH] ocfs2: try to reuse extent block in dealloc without
- meta_alloc
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-A crash issue was reported by John Lightsey with a call trace as follows:
-
- ocfs2_split_extent+0x1ad3/0x1b40 [ocfs2]
- ocfs2_change_extent_flag+0x33a/0x470 [ocfs2]
- ocfs2_mark_extent_written+0x172/0x220 [ocfs2]
- ocfs2_dio_end_io+0x62d/0x910 [ocfs2]
- dio_complete+0x19a/0x1a0
- do_blockdev_direct_IO+0x19dd/0x1eb0
- __blockdev_direct_IO+0x43/0x50
- ocfs2_direct_IO+0x8f/0xa0 [ocfs2]
- generic_file_direct_write+0xb2/0x170
- __generic_file_write_iter+0xc3/0x1b0
- ocfs2_file_write_iter+0x4bb/0xca0 [ocfs2]
- __vfs_write+0xae/0xf0
- vfs_write+0xb8/0x1b0
- SyS_write+0x4f/0xb0
- system_call_fastpath+0x16/0x75
-
-The BUG code told that extent tree wants to grow but no metadata was
-reserved ahead of time. From my investigation into this issue, the root
-cause it that although enough metadata is not reserved, there should be
-enough for following use. Rightmost extent is merged into its left one
-due to a certain times of marking extent written. Because during
-marking extent written, we got many physically continuous extents. At
-last, an empty extent showed up and the rightmost path is removed from
-extent tree.
-
-Add a new mechanism to reuse extent block cached in dealloc which were
-just unlinked from extent tree to solve this crash issue.
-
-Criteria is that during marking extents *written*, if extent rotation
-and merging results in unlinking extent with growing extent tree later
-without any metadata reserved ahead of time, try to reuse those extents
-in dealloc in which deleted extents are cached.
-
-Also, this patch addresses the issue John reported that ::dw_zero_count
-is not calculated properly.
-
-After applying this patch, the issue John reported was gone. Thanks for
-the reproducer provided by John. And this patch has passed
-ocfs2-test(29 cases) suite running by New H3C Group.
-
-[ge.changwei@h3c.com: fix static checker warnning]
- Link: http://lkml.kernel.org/r/63ADC13FD55D6546B7DECE290D39E373F29196AE@H3CMLB12-EX.srv.huawei-3com.com
-[akpm@linux-foundation.org: brelse(NULL) is legal]
-Link: http://lkml.kernel.org/r/1515479070-32653-2-git-send-email-ge.changwei@h3c.com
-Signed-off-by: Changwei Ge <ge.changwei@h3c.com>
-Reported-by: John Lightsey <john@nixnuts.net>
-Tested-by: John Lightsey <john@nixnuts.net>
-Cc: Joel Becker <jlbec@evilplan.org>
-Cc: Joseph Qi <jiangqi903@gmail.com>
-Cc: Junxiao Bi <junxiao.bi@oracle.com>
-Cc: Dan Carpenter <dan.carpenter@oracle.com>
-Cc: Mark Fasheh <mfasheh@versity.com>
-Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
-Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-(cherry picked from commit 71a36944042b7d9dd71f6a5d1c5ea1c2353b5d42)
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- fs/ocfs2/alloc.c | 206 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
- fs/ocfs2/alloc.h | 1 +
- fs/ocfs2/aops.c | 6 ++
- 3 files changed, 203 insertions(+), 10 deletions(-)
-
-diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
-index 386aecce881d..9b5e7d8ba710 100644
---- a/fs/ocfs2/alloc.c
-+++ b/fs/ocfs2/alloc.c
-@@ -165,6 +165,13 @@ static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
- struct ocfs2_extent_rec *rec);
- static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
- static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
-+
-+static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
-+ struct ocfs2_extent_tree *et,
-+ struct buffer_head **new_eb_bh,
-+ int blk_wanted, int *blk_given);
-+static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et);
-+
- static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
- .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk,
- .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk,
-@@ -448,6 +455,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
- if (!obj)
- obj = (void *)bh->b_data;
- et->et_object = obj;
-+ et->et_dealloc = NULL;
-
- et->et_ops->eo_fill_root_el(et);
- if (!et->et_ops->eo_fill_max_leaf_clusters)
-@@ -1159,7 +1167,7 @@ static int ocfs2_add_branch(handle_t *handle,
- struct buffer_head **last_eb_bh,
- struct ocfs2_alloc_context *meta_ac)
- {
-- int status, new_blocks, i;
-+ int status, new_blocks, i, block_given = 0;
- u64 next_blkno, new_last_eb_blk;
- struct buffer_head *bh;
- struct buffer_head **new_eb_bhs = NULL;
-@@ -1214,11 +1222,31 @@ static int ocfs2_add_branch(handle_t *handle,
- goto bail;
- }
-
-- status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
-- meta_ac, new_eb_bhs);
-- if (status < 0) {
-- mlog_errno(status);
-- goto bail;
-+ /* Firstyly, try to reuse dealloc since we have already estimated how
-+ * many extent blocks we may use.
-+ */
-+ if (!ocfs2_is_dealloc_empty(et)) {
-+ status = ocfs2_reuse_blk_from_dealloc(handle, et,
-+ new_eb_bhs, new_blocks,
-+ &block_given);
-+ if (status < 0) {
-+ mlog_errno(status);
-+ goto bail;
-+ }
-+ }
-+
-+ BUG_ON(block_given > new_blocks);
-+
-+ if (block_given < new_blocks) {
-+ BUG_ON(!meta_ac);
-+ status = ocfs2_create_new_meta_bhs(handle, et,
-+ new_blocks - block_given,
-+ meta_ac,
-+ &new_eb_bhs[block_given]);
-+ if (status < 0) {
-+ mlog_errno(status);
-+ goto bail;
-+ }
- }
-
- /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
-@@ -1341,15 +1369,25 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
- struct ocfs2_alloc_context *meta_ac,
- struct buffer_head **ret_new_eb_bh)
- {
-- int status, i;
-+ int status, i, block_given = 0;
- u32 new_clusters;
- struct buffer_head *new_eb_bh = NULL;
- struct ocfs2_extent_block *eb;
- struct ocfs2_extent_list *root_el;
- struct ocfs2_extent_list *eb_el;
-
-- status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
-- &new_eb_bh);
-+ if (!ocfs2_is_dealloc_empty(et)) {
-+ status = ocfs2_reuse_blk_from_dealloc(handle, et,
-+ &new_eb_bh, 1,
-+ &block_given);
-+ } else if (meta_ac) {
-+ status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
-+ &new_eb_bh);
-+
-+ } else {
-+ BUG();
-+ }
-+
- if (status < 0) {
- mlog_errno(status);
- goto bail;
-@@ -1512,7 +1550,7 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
- int depth = le16_to_cpu(el->l_tree_depth);
- struct buffer_head *bh = NULL;
-
-- BUG_ON(meta_ac == NULL);
-+ BUG_ON(meta_ac == NULL && ocfs2_is_dealloc_empty(et));
-
- shift = ocfs2_find_branch_target(et, &bh);
- if (shift < 0) {
-@@ -6593,6 +6631,154 @@ ocfs2_find_per_slot_free_list(int type,
- return fl;
- }
-
-+static struct ocfs2_per_slot_free_list *
-+ocfs2_find_preferred_free_list(int type,
-+ int preferred_slot,
-+ int *real_slot,
-+ struct ocfs2_cached_dealloc_ctxt *ctxt)
-+{
-+ struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
-+
-+ while (fl) {
-+ if (fl->f_inode_type == type && fl->f_slot == preferred_slot) {
-+ *real_slot = fl->f_slot;
-+ return fl;
-+ }
-+
-+ fl = fl->f_next_suballocator;
-+ }
-+
-+ /* If we can't find any free list matching preferred slot, just use
-+ * the first one.
-+ */
-+ fl = ctxt->c_first_suballocator;
-+ *real_slot = fl->f_slot;
-+
-+ return fl;
-+}
-+
-+/* Return Value 1 indicates empty */
-+static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et)
-+{
-+ struct ocfs2_per_slot_free_list *fl = NULL;
-+
-+ if (!et->et_dealloc)
-+ return 1;
-+
-+ fl = et->et_dealloc->c_first_suballocator;
-+ if (!fl)
-+ return 1;
-+
-+ if (!fl->f_first)
-+ return 1;
-+
-+ return 0;
-+}
-+
-+/* If extent was deleted from tree due to extent rotation and merging, and
-+ * no metadata is reserved ahead of time. Try to reuse some extents
-+ * just deleted. This is only used to reuse extent blocks.
-+ * It is supposed to find enough extent blocks in dealloc if our estimation
-+ * on metadata is accurate.
-+ */
-+static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
-+ struct ocfs2_extent_tree *et,
-+ struct buffer_head **new_eb_bh,
-+ int blk_wanted, int *blk_given)
-+{
-+ int i, status = 0, real_slot;
-+ struct ocfs2_cached_dealloc_ctxt *dealloc;
-+ struct ocfs2_per_slot_free_list *fl;
-+ struct ocfs2_cached_block_free *bf;
-+ struct ocfs2_extent_block *eb;
-+ struct ocfs2_super *osb =
-+ OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
-+
-+ *blk_given = 0;
-+
-+ /* If extent tree doesn't have a dealloc, this is not faulty. Just
-+ * tell upper caller dealloc can't provide any block and it should
-+ * ask for alloc to claim more space.
-+ */
-+ dealloc = et->et_dealloc;
-+ if (!dealloc)
-+ goto bail;
-+
-+ for (i = 0; i < blk_wanted; i++) {
-+ /* Prefer to use local slot */
-+ fl = ocfs2_find_preferred_free_list(EXTENT_ALLOC_SYSTEM_INODE,
-+ osb->slot_num, &real_slot,
-+ dealloc);
-+ /* If no more block can be reused, we should claim more
-+ * from alloc. Just return here normally.
-+ */
-+ if (!fl) {
-+ status = 0;
-+ break;
-+ }
-+
-+ bf = fl->f_first;
-+ fl->f_first = bf->free_next;
-+
-+ new_eb_bh[i] = sb_getblk(osb->sb, bf->free_blk);
-+ if (new_eb_bh[i] == NULL) {
-+ status = -ENOMEM;
-+ mlog_errno(status);
-+ goto bail;
-+ }
-+
-+ mlog(0, "Reusing block(%llu) from "
-+ "dealloc(local slot:%d, real slot:%d)\n",
-+ bf->free_blk, osb->slot_num, real_slot);
-+
-+ ocfs2_set_new_buffer_uptodate(et->et_ci, new_eb_bh[i]);
-+
-+ status = ocfs2_journal_access_eb(handle, et->et_ci,
-+ new_eb_bh[i],
-+ OCFS2_JOURNAL_ACCESS_CREATE);
-+ if (status < 0) {
-+ mlog_errno(status);
-+ goto bail;
-+ }
-+
-+ memset(new_eb_bh[i]->b_data, 0, osb->sb->s_blocksize);
-+ eb = (struct ocfs2_extent_block *) new_eb_bh[i]->b_data;
-+
-+ /* We can't guarantee that buffer head is still cached, so
-+ * polutlate the extent block again.
-+ */
-+ strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
-+ eb->h_blkno = cpu_to_le64(bf->free_blk);
-+ eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
-+ eb->h_suballoc_slot = cpu_to_le16(real_slot);
-+ eb->h_suballoc_loc = cpu_to_le64(bf->free_bg);
-+ eb->h_suballoc_bit = cpu_to_le16(bf->free_bit);
-+ eb->h_list.l_count =
-+ cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
-+
-+ /* We'll also be dirtied by the caller, so
-+ * this isn't absolutely necessary.
-+ */
-+ ocfs2_journal_dirty(handle, new_eb_bh[i]);
-+
-+ if (!fl->f_first) {
-+ dealloc->c_first_suballocator = fl->f_next_suballocator;
-+ kfree(fl);
-+ }
-+ kfree(bf);
-+ }
-+
-+ *blk_given = i;
-+
-+bail:
-+ if (unlikely(status < 0)) {
-+ for (i = 0; i < blk_wanted; i++)
-+ brelse(new_eb_bh[i]);
-+ }
-+
-+ return status;
-+}
-+
- int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
- int type, int slot, u64 suballoc,
- u64 blkno, unsigned int bit)
-diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
-index 4a5152ec88a3..571692171dd1 100644
---- a/fs/ocfs2/alloc.h
-+++ b/fs/ocfs2/alloc.h
-@@ -61,6 +61,7 @@ struct ocfs2_extent_tree {
- ocfs2_journal_access_func et_root_journal_access;
- void *et_object;
- unsigned int et_max_leaf_clusters;
-+ struct ocfs2_cached_dealloc_ctxt *et_dealloc;
- };
-
- /*
-diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
-index 77ec9b495027..2ff02dda97d8 100644
---- a/fs/ocfs2/aops.c
-+++ b/fs/ocfs2/aops.c
-@@ -2322,6 +2322,12 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
-
- ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
-
-+ /* Attach dealloc with extent tree in case that we may reuse extents
-+ * which are already unlinked from current extent tree due to extent
-+ * rotation and merging.
-+ */
-+ et.et_dealloc = &dealloc;
-+
- ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2,
- &data_ac, &meta_ac);
- if (ret) {
---
-2.14.2
-
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
-Date: Fri, 23 Mar 2018 09:19:21 +0100
-Subject: [PATCH] mm/shmem: do not wait for lock_page() in
- shmem_unused_huge_shrink()
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-shmem_unused_huge_shrink() gets called from reclaim path. Waiting for
-page lock may lead to deadlock there.
-
-There was a bug report that may be attributed to this:
-
-http://lkml.kernel.org/r/alpine.LRH.2.11.1801242349220.30642@mail.ewheeler.net
-
-Replace lock_page() with trylock_page() and skip the page if we failed to
-lock it. We will get to the page on the next scan.
-
-We can test for the PageTransHuge() outside the page lock as we only need
-protection against splitting the page under us. Holding pin oni the page
-is enough for this.
-
-Link: http://lkml.kernel.org/r/20180316210830.43738-1-kirill.shutemov@linux.intel.com
-Fixes: 779750d20b93 ("shmem: split huge pages beyond i_size under memory pressure")
-Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
-Reported-by: Eric Wheeler <linux-mm@lists.ewheeler.net>
-Acked-by: Michal Hocko <mhocko@suse.com>
-Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
-Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
-Cc: Hugh Dickins <hughd@google.com>
-Cc: <stable@vger.kernel.org> [4.8+]
-Signed-off-by: Andrew Morton <>
-(cherry-picked from https://git.kernel.org/pub/scm/linux/kernel/git/mhocko/mm.git/commit/?h=since-4.15&id=73eccc61c701ee7b4223aea2079542a712feeea7)
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- mm/shmem.c | 31 ++++++++++++++++++++-----------
- 1 file changed, 20 insertions(+), 11 deletions(-)
-
-diff --git a/mm/shmem.c b/mm/shmem.c
-index 859e4c224b80..2aae929eb90b 100644
---- a/mm/shmem.c
-+++ b/mm/shmem.c
-@@ -483,36 +483,45 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
- info = list_entry(pos, struct shmem_inode_info, shrinklist);
- inode = &info->vfs_inode;
-
-- if (nr_to_split && split >= nr_to_split) {
-- iput(inode);
-- continue;
-- }
-+ if (nr_to_split && split >= nr_to_split)
-+ goto leave;
-
-- page = find_lock_page(inode->i_mapping,
-+ page = find_get_page(inode->i_mapping,
- (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT);
- if (!page)
- goto drop;
-
-+ /* No huge page at the end of the file: nothing to split */
- if (!PageTransHuge(page)) {
-- unlock_page(page);
- put_page(page);
- goto drop;
- }
-
-+ /*
-+ * Leave the inode on the list if we failed to lock
-+ * the page at this time.
-+ *
-+ * Waiting for the lock may lead to deadlock in the
-+ * reclaim path.
-+ */
-+ if (!trylock_page(page)) {
-+ put_page(page);
-+ goto leave;
-+ }
-+
- ret = split_huge_page(page);
- unlock_page(page);
- put_page(page);
-
-- if (ret) {
-- /* split failed: leave it on the list */
-- iput(inode);
-- continue;
-- }
-+ /* If split failed leave the inode on the list */
-+ if (ret)
-+ goto leave;
-
- split++;
- drop:
- list_del_init(&info->shrinklist);
- removed++;
-+leave:
- iput(inode);
- }
-
---
-2.14.2
-
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Changwei Ge <ge.changwei@h3c.com>
+Date: Wed, 31 Jan 2018 16:15:06 -0800
+Subject: [PATCH] ocfs2: try to reuse extent block in dealloc without
+ meta_alloc
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+A crash issue was reported by John Lightsey with a call trace as follows:
+
+ ocfs2_split_extent+0x1ad3/0x1b40 [ocfs2]
+ ocfs2_change_extent_flag+0x33a/0x470 [ocfs2]
+ ocfs2_mark_extent_written+0x172/0x220 [ocfs2]
+ ocfs2_dio_end_io+0x62d/0x910 [ocfs2]
+ dio_complete+0x19a/0x1a0
+ do_blockdev_direct_IO+0x19dd/0x1eb0
+ __blockdev_direct_IO+0x43/0x50
+ ocfs2_direct_IO+0x8f/0xa0 [ocfs2]
+ generic_file_direct_write+0xb2/0x170
+ __generic_file_write_iter+0xc3/0x1b0
+ ocfs2_file_write_iter+0x4bb/0xca0 [ocfs2]
+ __vfs_write+0xae/0xf0
+ vfs_write+0xb8/0x1b0
+ SyS_write+0x4f/0xb0
+ system_call_fastpath+0x16/0x75
+
+The BUG code told that extent tree wants to grow but no metadata was
+reserved ahead of time. From my investigation into this issue, the root
+cause is that although enough metadata is not reserved, there should be
+enough for following use. Rightmost extent is merged into its left one
+due to a certain times of marking extent written. Because during
+marking extent written, we got many physically continuous extents. At
+last, an empty extent showed up and the rightmost path is removed from
+extent tree.
+
+Add a new mechanism to reuse extent block cached in dealloc which were
+just unlinked from extent tree to solve this crash issue.
+
+Criteria is that during marking extents *written*, if extent rotation
+and merging results in unlinking extent with growing extent tree later
+without any metadata reserved ahead of time, try to reuse those extents
+in dealloc in which deleted extents are cached.
+
+Also, this patch addresses the issue John reported that ::dw_zero_count
+is not calculated properly.
+
+After applying this patch, the issue John reported was gone. Thanks for
+the reproducer provided by John. And this patch has passed
+ocfs2-test(29 cases) suite running by New H3C Group.
+
+[ge.changwei@h3c.com: fix static checker warning]
+ Link: http://lkml.kernel.org/r/63ADC13FD55D6546B7DECE290D39E373F29196AE@H3CMLB12-EX.srv.huawei-3com.com
+[akpm@linux-foundation.org: brelse(NULL) is legal]
+Link: http://lkml.kernel.org/r/1515479070-32653-2-git-send-email-ge.changwei@h3c.com
+Signed-off-by: Changwei Ge <ge.changwei@h3c.com>
+Reported-by: John Lightsey <john@nixnuts.net>
+Tested-by: John Lightsey <john@nixnuts.net>
+Cc: Joel Becker <jlbec@evilplan.org>
+Cc: Joseph Qi <jiangqi903@gmail.com>
+Cc: Junxiao Bi <junxiao.bi@oracle.com>
+Cc: Dan Carpenter <dan.carpenter@oracle.com>
+Cc: Mark Fasheh <mfasheh@versity.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+(cherry picked from commit 71a36944042b7d9dd71f6a5d1c5ea1c2353b5d42)
+Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+---
+ fs/ocfs2/alloc.c | 206 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
+ fs/ocfs2/alloc.h | 1 +
+ fs/ocfs2/aops.c | 6 ++
+ 3 files changed, 203 insertions(+), 10 deletions(-)
+
+diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
+index 386aecce881d..9b5e7d8ba710 100644
+--- a/fs/ocfs2/alloc.c
++++ b/fs/ocfs2/alloc.c
+@@ -165,6 +165,13 @@ static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *rec);
+ static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
+ static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
++
++static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
++ struct ocfs2_extent_tree *et,
++ struct buffer_head **new_eb_bh,
++ int blk_wanted, int *blk_given);
++static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et);
++
+ static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
+ .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk,
+ .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk,
+@@ -448,6 +455,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
+ if (!obj)
+ obj = (void *)bh->b_data;
+ et->et_object = obj;
++ et->et_dealloc = NULL;
+
+ et->et_ops->eo_fill_root_el(et);
+ if (!et->et_ops->eo_fill_max_leaf_clusters)
+@@ -1159,7 +1167,7 @@ static int ocfs2_add_branch(handle_t *handle,
+ struct buffer_head **last_eb_bh,
+ struct ocfs2_alloc_context *meta_ac)
+ {
+- int status, new_blocks, i;
++ int status, new_blocks, i, block_given = 0;
+ u64 next_blkno, new_last_eb_blk;
+ struct buffer_head *bh;
+ struct buffer_head **new_eb_bhs = NULL;
+@@ -1214,11 +1222,31 @@ static int ocfs2_add_branch(handle_t *handle,
+ goto bail;
+ }
+
+- status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
+- meta_ac, new_eb_bhs);
+- if (status < 0) {
+- mlog_errno(status);
+- goto bail;
++ /* Firstyly, try to reuse dealloc since we have already estimated how
++ * many extent blocks we may use.
++ */
++ if (!ocfs2_is_dealloc_empty(et)) {
++ status = ocfs2_reuse_blk_from_dealloc(handle, et,
++ new_eb_bhs, new_blocks,
++ &block_given);
++ if (status < 0) {
++ mlog_errno(status);
++ goto bail;
++ }
++ }
++
++ BUG_ON(block_given > new_blocks);
++
++ if (block_given < new_blocks) {
++ BUG_ON(!meta_ac);
++ status = ocfs2_create_new_meta_bhs(handle, et,
++ new_blocks - block_given,
++ meta_ac,
++ &new_eb_bhs[block_given]);
++ if (status < 0) {
++ mlog_errno(status);
++ goto bail;
++ }
+ }
+
+ /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
+@@ -1341,15 +1369,25 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
+ struct ocfs2_alloc_context *meta_ac,
+ struct buffer_head **ret_new_eb_bh)
+ {
+- int status, i;
++ int status, i, block_given = 0;
+ u32 new_clusters;
+ struct buffer_head *new_eb_bh = NULL;
+ struct ocfs2_extent_block *eb;
+ struct ocfs2_extent_list *root_el;
+ struct ocfs2_extent_list *eb_el;
+
+- status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
+- &new_eb_bh);
++ if (!ocfs2_is_dealloc_empty(et)) {
++ status = ocfs2_reuse_blk_from_dealloc(handle, et,
++ &new_eb_bh, 1,
++ &block_given);
++ } else if (meta_ac) {
++ status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
++ &new_eb_bh);
++
++ } else {
++ BUG();
++ }
++
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+@@ -1512,7 +1550,7 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
+ int depth = le16_to_cpu(el->l_tree_depth);
+ struct buffer_head *bh = NULL;
+
+- BUG_ON(meta_ac == NULL);
++ BUG_ON(meta_ac == NULL && ocfs2_is_dealloc_empty(et));
+
+ shift = ocfs2_find_branch_target(et, &bh);
+ if (shift < 0) {
+@@ -6593,6 +6631,154 @@ ocfs2_find_per_slot_free_list(int type,
+ return fl;
+ }
+
++static struct ocfs2_per_slot_free_list *
++ocfs2_find_preferred_free_list(int type,
++ int preferred_slot,
++ int *real_slot,
++ struct ocfs2_cached_dealloc_ctxt *ctxt)
++{
++ struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
++
++ while (fl) {
++ if (fl->f_inode_type == type && fl->f_slot == preferred_slot) {
++ *real_slot = fl->f_slot;
++ return fl;
++ }
++
++ fl = fl->f_next_suballocator;
++ }
++
++ /* If we can't find any free list matching preferred slot, just use
++ * the first one.
++ */
++ fl = ctxt->c_first_suballocator;
++ *real_slot = fl->f_slot;
++
++ return fl;
++}
++
++/* Return Value 1 indicates empty */
++static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et)
++{
++ struct ocfs2_per_slot_free_list *fl = NULL;
++
++ if (!et->et_dealloc)
++ return 1;
++
++ fl = et->et_dealloc->c_first_suballocator;
++ if (!fl)
++ return 1;
++
++ if (!fl->f_first)
++ return 1;
++
++ return 0;
++}
++
++/* If extent was deleted from tree due to extent rotation and merging, and
++ * no metadata is reserved ahead of time. Try to reuse some extents
++ * just deleted. This is only used to reuse extent blocks.
++ * It is supposed to find enough extent blocks in dealloc if our estimation
++ * on metadata is accurate.
++ */
++static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
++ struct ocfs2_extent_tree *et,
++ struct buffer_head **new_eb_bh,
++ int blk_wanted, int *blk_given)
++{
++ int i, status = 0, real_slot;
++ struct ocfs2_cached_dealloc_ctxt *dealloc;
++ struct ocfs2_per_slot_free_list *fl;
++ struct ocfs2_cached_block_free *bf;
++ struct ocfs2_extent_block *eb;
++ struct ocfs2_super *osb =
++ OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
++
++ *blk_given = 0;
++
++ /* If extent tree doesn't have a dealloc, this is not faulty. Just
++ * tell upper caller dealloc can't provide any block and it should
++ * ask for alloc to claim more space.
++ */
++ dealloc = et->et_dealloc;
++ if (!dealloc)
++ goto bail;
++
++ for (i = 0; i < blk_wanted; i++) {
++ /* Prefer to use local slot */
++ fl = ocfs2_find_preferred_free_list(EXTENT_ALLOC_SYSTEM_INODE,
++ osb->slot_num, &real_slot,
++ dealloc);
++ /* If no more block can be reused, we should claim more
++ * from alloc. Just return here normally.
++ */
++ if (!fl) {
++ status = 0;
++ break;
++ }
++
++ bf = fl->f_first;
++ fl->f_first = bf->free_next;
++
++ new_eb_bh[i] = sb_getblk(osb->sb, bf->free_blk);
++ if (new_eb_bh[i] == NULL) {
++ status = -ENOMEM;
++ mlog_errno(status);
++ goto bail;
++ }
++
++ mlog(0, "Reusing block(%llu) from "
++ "dealloc(local slot:%d, real slot:%d)\n",
++ bf->free_blk, osb->slot_num, real_slot);
++
++ ocfs2_set_new_buffer_uptodate(et->et_ci, new_eb_bh[i]);
++
++ status = ocfs2_journal_access_eb(handle, et->et_ci,
++ new_eb_bh[i],
++ OCFS2_JOURNAL_ACCESS_CREATE);
++ if (status < 0) {
++ mlog_errno(status);
++ goto bail;
++ }
++
++ memset(new_eb_bh[i]->b_data, 0, osb->sb->s_blocksize);
++ eb = (struct ocfs2_extent_block *) new_eb_bh[i]->b_data;
++
++ /* We can't guarantee that buffer head is still cached, so
++ * polutlate the extent block again.
++ */
++ strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
++ eb->h_blkno = cpu_to_le64(bf->free_blk);
++ eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
++ eb->h_suballoc_slot = cpu_to_le16(real_slot);
++ eb->h_suballoc_loc = cpu_to_le64(bf->free_bg);
++ eb->h_suballoc_bit = cpu_to_le16(bf->free_bit);
++ eb->h_list.l_count =
++ cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
++
++ /* We'll also be dirtied by the caller, so
++ * this isn't absolutely necessary.
++ */
++ ocfs2_journal_dirty(handle, new_eb_bh[i]);
++
++ if (!fl->f_first) {
++ dealloc->c_first_suballocator = fl->f_next_suballocator;
++ kfree(fl);
++ }
++ kfree(bf);
++ }
++
++ *blk_given = i;
++
++bail:
++ if (unlikely(status < 0)) {
++ for (i = 0; i < blk_wanted; i++)
++ brelse(new_eb_bh[i]);
++ }
++
++ return status;
++}
++
+ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
+ int type, int slot, u64 suballoc,
+ u64 blkno, unsigned int bit)
+diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
+index 4a5152ec88a3..571692171dd1 100644
+--- a/fs/ocfs2/alloc.h
++++ b/fs/ocfs2/alloc.h
+@@ -61,6 +61,7 @@ struct ocfs2_extent_tree {
+ ocfs2_journal_access_func et_root_journal_access;
+ void *et_object;
+ unsigned int et_max_leaf_clusters;
++ struct ocfs2_cached_dealloc_ctxt *et_dealloc;
+ };
+
+ /*
+diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
+index 77ec9b495027..2ff02dda97d8 100644
+--- a/fs/ocfs2/aops.c
++++ b/fs/ocfs2/aops.c
+@@ -2322,6 +2322,12 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
+
+ ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
+
++ /* Attach dealloc with extent tree in case that we may reuse extents
++ * which are already unlinked from current extent tree due to extent
++ * rotation and merging.
++ */
++ et.et_dealloc = &dealloc;
++
+ ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2,
+ &data_ac, &meta_ac);
+ if (ret) {
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
+Date: Fri, 23 Mar 2018 09:19:21 +0100
+Subject: [PATCH] mm/shmem: do not wait for lock_page() in
+ shmem_unused_huge_shrink()
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+shmem_unused_huge_shrink() gets called from reclaim path. Waiting for
+page lock may lead to deadlock there.
+
+There was a bug report that may be attributed to this:
+
+http://lkml.kernel.org/r/alpine.LRH.2.11.1801242349220.30642@mail.ewheeler.net
+
+Replace lock_page() with trylock_page() and skip the page if we failed to
+lock it. We will get to the page on the next scan.
+
+We can test for the PageTransHuge() outside the page lock as we only need
+protection against splitting the page under us. Holding a pin on the page
+is enough for this.
+
+Link: http://lkml.kernel.org/r/20180316210830.43738-1-kirill.shutemov@linux.intel.com
+Fixes: 779750d20b93 ("shmem: split huge pages beyond i_size under memory pressure")
+Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Reported-by: Eric Wheeler <linux-mm@lists.ewheeler.net>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
+Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: <stable@vger.kernel.org> [4.8+]
+Signed-off-by: Andrew Morton <>
+(cherry-picked from https://git.kernel.org/pub/scm/linux/kernel/git/mhocko/mm.git/commit/?h=since-4.15&id=73eccc61c701ee7b4223aea2079542a712feeea7)
+Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+---
+ mm/shmem.c | 31 ++++++++++++++++++++-----------
+ 1 file changed, 20 insertions(+), 11 deletions(-)
+
+diff --git a/mm/shmem.c b/mm/shmem.c
+index 859e4c224b80..2aae929eb90b 100644
+--- a/mm/shmem.c
++++ b/mm/shmem.c
+@@ -483,36 +483,45 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
+ info = list_entry(pos, struct shmem_inode_info, shrinklist);
+ inode = &info->vfs_inode;
+
+- if (nr_to_split && split >= nr_to_split) {
+- iput(inode);
+- continue;
+- }
++ if (nr_to_split && split >= nr_to_split)
++ goto leave;
+
+- page = find_lock_page(inode->i_mapping,
++ page = find_get_page(inode->i_mapping,
+ (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT);
+ if (!page)
+ goto drop;
+
++ /* No huge page at the end of the file: nothing to split */
+ if (!PageTransHuge(page)) {
+- unlock_page(page);
+ put_page(page);
+ goto drop;
+ }
+
++ /*
++ * Leave the inode on the list if we failed to lock
++ * the page at this time.
++ *
++ * Waiting for the lock may lead to deadlock in the
++ * reclaim path.
++ */
++ if (!trylock_page(page)) {
++ put_page(page);
++ goto leave;
++ }
++
+ ret = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+
+- if (ret) {
+- /* split failed: leave it on the list */
+- iput(inode);
+- continue;
+- }
++ /* If split failed leave the inode on the list */
++ if (ret)
++ goto leave;
+
+ split++;
+ drop:
+ list_del_init(&info->shrinklist);
+ removed++;
++leave:
+ iput(inode);
+ }
+
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
-Date: Thu, 15 Mar 2018 18:07:47 +0300
-Subject: [PATCH] mm/thp: Do not wait for lock_page() in deferred_split_scan()
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-deferred_split_scan() gets called from reclaim path. Waiting for page
-lock may lead to deadlock there.
-
-Replace lock_page() with trylock_page() and skip the page if we failed
-to lock it. We will get to the page on the next scan.
-
-Fixes: 9a982250f773 ("thp: introduce deferred_split_huge_page()")
-
-Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
-Acked-by: Michal Hocko <mhocko@suse.com>
-(cherry-picked from https://patchwork.kernel.org/patch/10284703/)
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- mm/huge_memory.c | 4 +++-
- 1 file changed, 3 insertions(+), 1 deletion(-)
-
-diff --git a/mm/huge_memory.c b/mm/huge_memory.c
-index 8b887db33383..5c4093e0be8d 100644
---- a/mm/huge_memory.c
-+++ b/mm/huge_memory.c
-@@ -2621,11 +2621,13 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
-
- list_for_each_safe(pos, next, &list) {
- page = list_entry((void *)pos, struct page, mapping);
-- lock_page(page);
-+ if (!trylock_page(page))
-+ goto next;
- /* split_huge_page() removes page from list on success */
- if (!split_huge_page(page))
- split++;
- unlock_page(page);
-+next:
- put_page(page);
- }
-
---
-2.14.2
-
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: =?UTF-8?q?Fabian=20Gr=C3=BCnbichler?= <f.gruenbichler@proxmox.com>
-Date: Mon, 9 Apr 2018 09:33:25 +0200
-Subject: [PATCH] Revert Ubuntu RETPOLINE checks in kernel Makefile
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-these break builds outside of Ubuntu's packaging.
-
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- scripts/Makefile.build | 8 --------
- 1 file changed, 8 deletions(-)
-
-diff --git a/scripts/Makefile.build b/scripts/Makefile.build
-index d74c3f9f1fa8..436005392047 100644
---- a/scripts/Makefile.build
-+++ b/scripts/Makefile.build
-@@ -282,18 +282,11 @@ objtool_dep = $(objtool_obj) \
- $(wildcard include/config/orc/unwinder.h \
- include/config/stack/validation.h)
-
--ifdef CONFIG_RETPOLINE
--cmd_ubuntu_retpoline = $(CONFIG_SHELL) $(srctree)/scripts/ubuntu-retpoline-extract-one $(@) $(<) "$(filter -m16 %code16gcc.h,$(a_flags))";
--else
--cmd_ubuntu_retpoline =
--endif
--
- define rule_cc_o_c
- $(call echo-cmd,checksrc) $(cmd_checksrc) \
- $(call cmd_and_fixdep,cc_o_c) \
- $(cmd_modversions_c) \
- $(call echo-cmd,objtool) $(cmd_objtool) \
-- $(call echo-cmd,ubuntu-retpoline) $(cmd_ubuntu_retpoline) \
- $(call echo-cmd,record_mcount) $(cmd_record_mcount)
- endef
-
-@@ -301,7 +294,6 @@ define rule_as_o_S
- $(call cmd_and_fixdep,as_o_S) \
- $(cmd_modversions_S) \
- $(call echo-cmd,objtool) $(cmd_objtool)
-- $(call echo-cmd,ubuntu-retpoline) $(cmd_ubuntu_retpoline)
- endef
-
- # List module undefined symbols (or empty line if not enabled)
---
-2.14.2
-
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
+Date: Thu, 15 Mar 2018 18:07:47 +0300
+Subject: [PATCH] mm/thp: Do not wait for lock_page() in deferred_split_scan()
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+deferred_split_scan() gets called from reclaim path. Waiting for page
+lock may lead to deadlock there.
+
+Replace lock_page() with trylock_page() and skip the page if we failed
+to lock it. We will get to the page on the next scan.
+
+Fixes: 9a982250f773 ("thp: introduce deferred_split_huge_page()")
+
+Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Acked-by: Michal Hocko <mhocko@suse.com>
+(cherry-picked from https://patchwork.kernel.org/patch/10284703/)
+Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+---
+ mm/huge_memory.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/mm/huge_memory.c b/mm/huge_memory.c
+index 8b887db33383..5c4093e0be8d 100644
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -2621,11 +2621,13 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
+
+ list_for_each_safe(pos, next, &list) {
+ page = list_entry((void *)pos, struct page, mapping);
+- lock_page(page);
++ if (!trylock_page(page))
++ goto next;
+ /* split_huge_page() removes page from list on success */
+ if (!split_huge_page(page))
+ split++;
+ unlock_page(page);
++next:
+ put_page(page);
+ }
+
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Fabian=20Gr=C3=BCnbichler?= <f.gruenbichler@proxmox.com>
+Date: Mon, 9 Apr 2018 09:33:25 +0200
+Subject: [PATCH] Revert Ubuntu RETPOLINE checks in kernel Makefile
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+these break builds outside of Ubuntu's packaging.
+
+Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+---
+ scripts/Makefile.build | 8 --------
+ 1 file changed, 8 deletions(-)
+
+diff --git a/scripts/Makefile.build b/scripts/Makefile.build
+index d74c3f9f1fa8..436005392047 100644
+--- a/scripts/Makefile.build
++++ b/scripts/Makefile.build
+@@ -282,18 +282,11 @@ objtool_dep = $(objtool_obj) \
+ $(wildcard include/config/orc/unwinder.h \
+ include/config/stack/validation.h)
+
+-ifdef CONFIG_RETPOLINE
+-cmd_ubuntu_retpoline = $(CONFIG_SHELL) $(srctree)/scripts/ubuntu-retpoline-extract-one $(@) $(<) "$(filter -m16 %code16gcc.h,$(a_flags))";
+-else
+-cmd_ubuntu_retpoline =
+-endif
+-
+ define rule_cc_o_c
+ $(call echo-cmd,checksrc) $(cmd_checksrc) \
+ $(call cmd_and_fixdep,cc_o_c) \
+ $(cmd_modversions_c) \
+ $(call echo-cmd,objtool) $(cmd_objtool) \
+- $(call echo-cmd,ubuntu-retpoline) $(cmd_ubuntu_retpoline) \
+ $(call echo-cmd,record_mcount) $(cmd_record_mcount)
+ endef
+
+@@ -301,7 +294,6 @@ define rule_as_o_S
+ $(call cmd_and_fixdep,as_o_S) \
+ $(cmd_modversions_S) \
+ $(call echo-cmd,objtool) $(cmd_objtool)
+- $(call echo-cmd,ubuntu-retpoline) $(cmd_ubuntu_retpoline)
+ endef
+
+ # List module undefined symbols (or empty line if not enabled)
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Wolfgang Bumiller <w.bumiller@proxmox.com>
-Date: Mon, 9 Apr 2018 14:56:29 +0200
-Subject: [PATCH] net: fix deadlock while clearing neighbor proxy table
-
-When coming from ndisc_netdev_event() in net/ipv6/ndisc.c,
-neigh_ifdown() is called with &nd_tbl, locking this while
-clearing the proxy neighbor entries when eg. deleting an
-interface. Calling the table's pndisc_destructor() with the
-lock still held, however, can cause a deadlock: When a
-multicast listener is available an IGMP packet of type
-ICMPV6_MGM_REDUCTION may be sent out. When reaching
-ip6_finish_output2(), if no neighbor entry for the target
-address is found, __neigh_create() is called with &nd_tbl,
-which it'll want to lock.
-
-Move the elements into their own list, then unlock the table
-and perform the destruction.
-
-Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=199289
-Fixes: 6fd6ce2056de ("ipv6: Do not depend on rt->n in ip6_finish_output2().")
-Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
----
- net/core/neighbour.c | 28 ++++++++++++++++++----------
- 1 file changed, 18 insertions(+), 10 deletions(-)
-
-diff --git a/net/core/neighbour.c b/net/core/neighbour.c
-index d0713627deb6..3b495739bf65 100644
---- a/net/core/neighbour.c
-+++ b/net/core/neighbour.c
-@@ -55,7 +55,8 @@ static void neigh_timer_handler(unsigned long arg);
- static void __neigh_notify(struct neighbour *n, int type, int flags,
- u32 pid);
- static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid);
--static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev);
-+static int pneigh_ifdown_and_unlock(struct neigh_table *tbl,
-+ struct net_device *dev);
-
- #ifdef CONFIG_PROC_FS
- static const struct file_operations neigh_stat_seq_fops;
-@@ -291,8 +292,7 @@ int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
- {
- write_lock_bh(&tbl->lock);
- neigh_flush_dev(tbl, dev);
-- pneigh_ifdown(tbl, dev);
-- write_unlock_bh(&tbl->lock);
-+ pneigh_ifdown_and_unlock(tbl, dev);
-
- del_timer_sync(&tbl->proxy_timer);
- pneigh_queue_purge(&tbl->proxy_queue);
-@@ -681,9 +681,10 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey,
- return -ENOENT;
- }
-
--static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
-+static int pneigh_ifdown_and_unlock(struct neigh_table *tbl,
-+ struct net_device *dev)
- {
-- struct pneigh_entry *n, **np;
-+ struct pneigh_entry *n, **np, *freelist = NULL;
- u32 h;
-
- for (h = 0; h <= PNEIGH_HASHMASK; h++) {
-@@ -691,16 +692,23 @@ static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
- while ((n = *np) != NULL) {
- if (!dev || n->dev == dev) {
- *np = n->next;
-- if (tbl->pdestructor)
-- tbl->pdestructor(n);
-- if (n->dev)
-- dev_put(n->dev);
-- kfree(n);
-+ n->next = freelist;
-+ freelist = n;
- continue;
- }
- np = &n->next;
- }
- }
-+ write_unlock_bh(&tbl->lock);
-+ while ((n = freelist)) {
-+ freelist = n->next;
-+ n->next = NULL;
-+ if (tbl->pdestructor)
-+ tbl->pdestructor(n);
-+ if (n->dev)
-+ dev_put(n->dev);
-+ kfree(n);
-+ }
- return -ENOENT;
- }
-
---
-2.14.2
-
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Wolfgang Bumiller <w.bumiller@proxmox.com>
+Date: Mon, 9 Apr 2018 14:56:29 +0200
+Subject: [PATCH] net: fix deadlock while clearing neighbor proxy table
+
+When coming from ndisc_netdev_event() in net/ipv6/ndisc.c,
+neigh_ifdown() is called with &nd_tbl, locking this while
+clearing the proxy neighbor entries when e.g. deleting an
+interface. Calling the table's pndisc_destructor() with the
+lock still held, however, can cause a deadlock: When a
+multicast listener is available an IGMP packet of type
+ICMPV6_MGM_REDUCTION may be sent out. When reaching
+ip6_finish_output2(), if no neighbor entry for the target
+address is found, __neigh_create() is called with &nd_tbl,
+which it'll want to lock.
+
+Move the elements into their own list, then unlock the table
+and perform the destruction.
+
+Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=199289
+Fixes: 6fd6ce2056de ("ipv6: Do not depend on rt->n in ip6_finish_output2().")
+Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
+---
+ net/core/neighbour.c | 28 ++++++++++++++++++----------
+ 1 file changed, 18 insertions(+), 10 deletions(-)
+
+diff --git a/net/core/neighbour.c b/net/core/neighbour.c
+index d0713627deb6..3b495739bf65 100644
+--- a/net/core/neighbour.c
++++ b/net/core/neighbour.c
+@@ -55,7 +55,8 @@ static void neigh_timer_handler(unsigned long arg);
+ static void __neigh_notify(struct neighbour *n, int type, int flags,
+ u32 pid);
+ static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid);
+-static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev);
++static int pneigh_ifdown_and_unlock(struct neigh_table *tbl,
++ struct net_device *dev);
+
+ #ifdef CONFIG_PROC_FS
+ static const struct file_operations neigh_stat_seq_fops;
+@@ -291,8 +292,7 @@ int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
+ {
+ write_lock_bh(&tbl->lock);
+ neigh_flush_dev(tbl, dev);
+- pneigh_ifdown(tbl, dev);
+- write_unlock_bh(&tbl->lock);
++ pneigh_ifdown_and_unlock(tbl, dev);
+
+ del_timer_sync(&tbl->proxy_timer);
+ pneigh_queue_purge(&tbl->proxy_queue);
+@@ -681,9 +681,10 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey,
+ return -ENOENT;
+ }
+
+-static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
++static int pneigh_ifdown_and_unlock(struct neigh_table *tbl,
++ struct net_device *dev)
+ {
+- struct pneigh_entry *n, **np;
++ struct pneigh_entry *n, **np, *freelist = NULL;
+ u32 h;
+
+ for (h = 0; h <= PNEIGH_HASHMASK; h++) {
+@@ -691,16 +692,23 @@ static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
+ while ((n = *np) != NULL) {
+ if (!dev || n->dev == dev) {
+ *np = n->next;
+- if (tbl->pdestructor)
+- tbl->pdestructor(n);
+- if (n->dev)
+- dev_put(n->dev);
+- kfree(n);
++ n->next = freelist;
++ freelist = n;
+ continue;
+ }
+ np = &n->next;
+ }
+ }
++ write_unlock_bh(&tbl->lock);
++ while ((n = freelist)) {
++ freelist = n->next;
++ n->next = NULL;
++ if (tbl->pdestructor)
++ tbl->pdestructor(n);
++ if (n->dev)
++ dev_put(n->dev);
++ kfree(n);
++ }
+ return -ENOENT;
+ }
+