git.proxmox.com Git - pve-kernel.git/commitdiff
rebase patches
author Fabian Grünbichler <f.gruenbichler@proxmox.com>
Wed, 21 Mar 2018 10:26:28 +0000 (11:26 +0100)
committer Thomas Lamprecht <t.lamprecht@proxmox.com>
Wed, 21 Mar 2018 13:45:35 +0000 (14:45 +0100)
and drop those applied upstream

Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
39 files changed:
patches/kernel/0004-kvm-disable-default-dynamic-halt-polling-growth.patch
patches/kernel/0007-IB-core-Avoid-crash-on-pkey-enforcement-failed-in-re.patch [new file with mode: 0644]
patches/kernel/0007-KVM-x86-fix-APIC-page-invalidation.patch [deleted file]
patches/kernel/0008-IB-core-Don-t-enforce-PKey-security-on-SMI-MADs.patch [new file with mode: 0644]
patches/kernel/0008-vhost-fix-skb-leak-in-handle_rx.patch [deleted file]
patches/kernel/0009-KVM-SVM-obey-guest-PAT.patch [new file with mode: 0644]
patches/kernel/0009-tun-free-skb-in-early-errors.patch [deleted file]
patches/kernel/0010-net-sched-em_nbyte-don-t-add-the-data-offset-twice.patch [new file with mode: 0644]
patches/kernel/0010-tap-free-skb-if-flags-error.patch [deleted file]
patches/kernel/0011-IB-core-Avoid-crash-on-pkey-enforcement-failed-in-re.patch [deleted file]
patches/kernel/0011-net-sched-fix-TCF_LAYER_LINK-case-in-tcf_get_base_pt.patch [new file with mode: 0644]
patches/kernel/0012-IB-core-Don-t-enforce-PKey-security-on-SMI-MADs.patch [deleted file]
patches/kernel/0012-KVM-x86-Add-memory-barrier-on-vmcs-field-lookup.patch [new file with mode: 0644]
patches/kernel/0013-EDAC-sb_edac-Fix-missing-break-in-switch.patch [new file with mode: 0644]
patches/kernel/0013-kvm-vmx-Reinstate-support-for-CPUs-without-virtual-N.patch [deleted file]
patches/kernel/0014-KVM-SVM-obey-guest-PAT.patch [deleted file]
patches/kernel/0014-sched-wait-Fix-add_wait_queue-behavioral-change.patch [new file with mode: 0644]
patches/kernel/0015-module-retpoline-Warn-about-missing-retpoline-in-mod.patch [new file with mode: 0644]
patches/kernel/0015-net-sched-em_nbyte-don-t-add-the-data-offset-twice.patch [deleted file]
patches/kernel/0016-net-sched-fix-TCF_LAYER_LINK-case-in-tcf_get_base_pt.patch [deleted file]
patches/kernel/0016-net-tcp-close-sock-if-net-namespace-is-exiting.patch [new file with mode: 0644]
patches/kernel/0017-i40e-Fix-memory-leak-related-filter-programming-stat.patch [deleted file]
patches/kernel/0017-sctp-fix-dst-refcnt-leak-in-sctp_v4_get_dst.patch [new file with mode: 0644]
patches/kernel/0018-KVM-x86-Add-memory-barrier-on-vmcs-field-lookup.patch [deleted file]
patches/kernel/0018-sctp-fix-dst-refcnt-leak-in-sctp_v6_get_dst.patch [new file with mode: 0644]
patches/kernel/0019-EDAC-sb_edac-Don-t-create-a-second-memory-controller.patch [deleted file]
patches/kernel/0019-lockd-lost-rollback-of-set_grace_period-in-lockd_dow.patch [new file with mode: 0644]
patches/kernel/0020-EDAC-sb_edac-Fix-missing-break-in-switch.patch [deleted file]
patches/kernel/0020-ocfs2-make-metadata-estimation-accurate-and-clear.patch [new file with mode: 0644]
patches/kernel/0021-ocfs2-try-to-reuse-extent-block-in-dealloc-without-m.patch [new file with mode: 0644]
patches/kernel/0021-scsi-lpfc-Fix-loop-mode-target-discovery.patch [deleted file]
patches/kernel/0022-sched-wait-Fix-add_wait_queue-behavioral-change.patch [deleted file]
patches/kernel/0023-module-retpoline-Warn-about-missing-retpoline-in-mod.patch [deleted file]
patches/kernel/0024-net-tcp-close-sock-if-net-namespace-is-exiting.patch [deleted file]
patches/kernel/0025-sctp-fix-dst-refcnt-leak-in-sctp_v4_get_dst.patch [deleted file]
patches/kernel/0026-sctp-fix-dst-refcnt-leak-in-sctp_v6_get_dst.patch [deleted file]
patches/kernel/0027-lockd-lost-rollback-of-set_grace_period-in-lockd_dow.patch [deleted file]
patches/kernel/0028-ocfs2-make-metadata-estimation-accurate-and-clear.patch [deleted file]
patches/kernel/0029-ocfs2-try-to-reuse-extent-block-in-dealloc-without-m.patch [deleted file]

index 863f9166a25cd473e753f57790d58bd71e0f1ed0..1e8b2d82b92354b00d7a42720f130b7b443d73cc 100644 (file)
@@ -12,7 +12,7 @@ Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
  1 file changed, 1 insertion(+), 1 deletion(-)
 
 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
-index 4d81f6ded88e..bfa9c4d34102 100644
+index 3b3e54742263..d0085c9d6297 100644
 --- a/virt/kvm/kvm_main.c
 +++ b/virt/kvm/kvm_main.c
 @@ -77,7 +77,7 @@ module_param(halt_poll_ns, uint, 0644);
diff --git a/patches/kernel/0007-IB-core-Avoid-crash-on-pkey-enforcement-failed-in-re.patch b/patches/kernel/0007-IB-core-Avoid-crash-on-pkey-enforcement-failed-in-re.patch
new file mode 100644 (file)
index 0000000..f61fa4b
--- /dev/null
@@ -0,0 +1,93 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Parav Pandit <parav@mellanox.com>
+Date: Fri, 5 Jan 2018 23:51:12 +0100
+Subject: [PATCH] IB/core: Avoid crash on pkey enforcement failed in received
+ MADs
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+commit 89548bcafec7ecfeea58c553f0834b5d575a66eb upstream.
+
+Below kernel crash is observed when Pkey security enforcement fails on
+received MADs. This issue is reported in [1].
+
+ib_free_recv_mad() accesses the rmpp_list, whose initialization is
+needed before accessing it.
+When security enformcent fails on received MADs, MAD processing avoided
+due to security checks failed.
+
+OpenSM[3770]: SM port is down
+kernel: BUG: unable to handle kernel NULL pointer dereference at 0000000000000008
+kernel: IP: ib_free_recv_mad+0x44/0xa0 [ib_core]
+kernel: PGD 0
+kernel: P4D 0
+kernel:
+kernel: Oops: 0002 [#1] SMP
+kernel: CPU: 0 PID: 2833 Comm: kworker/0:1H Tainted: P          IO    4.13.4-1-pve #1
+kernel: Hardware name: Dell       XS23-TY3        /9CMP63, BIOS 1.71 09/17/2013
+kernel: Workqueue: ib-comp-wq ib_cq_poll_work [ib_core]
+kernel: task: ffffa069c6541600 task.stack: ffffb9a729054000
+kernel: RIP: 0010:ib_free_recv_mad+0x44/0xa0 [ib_core]
+kernel: RSP: 0018:ffffb9a729057d38 EFLAGS: 00010286
+kernel: RAX: ffffa069cb138a48 RBX: ffffa069cb138a10 RCX: 0000000000000000
+kernel: RDX: ffffb9a729057d38 RSI: 0000000000000000 RDI: ffffa069cb138a20
+kernel: RBP: ffffb9a729057d60 R08: ffffa072d2d49800 R09: ffffa069cb138ae0
+kernel: R10: ffffa069cb138ae0 R11: ffffa072b3994e00 R12: ffffb9a729057d38
+kernel: R13: ffffa069d1c90000 R14: 0000000000000000 R15: ffffa069d1c90880
+kernel: FS:  0000000000000000(0000) GS:ffffa069dba00000(0000) knlGS:0000000000000000
+kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+kernel: CR2: 0000000000000008 CR3: 00000011f51f2000 CR4: 00000000000006f0
+kernel: Call Trace:
+kernel:  ib_mad_recv_done+0x5cc/0xb50 [ib_core]
+kernel:  __ib_process_cq+0x5c/0xb0 [ib_core]
+kernel:  ib_cq_poll_work+0x20/0x60 [ib_core]
+kernel:  process_one_work+0x1e9/0x410
+kernel:  worker_thread+0x4b/0x410
+kernel:  kthread+0x109/0x140
+kernel:  ? process_one_work+0x410/0x410
+kernel:  ? kthread_create_on_node+0x70/0x70
+kernel:  ? SyS_exit_group+0x14/0x20
+kernel:  ret_from_fork+0x25/0x30
+kernel: RIP: ib_free_recv_mad+0x44/0xa0 [ib_core] RSP: ffffb9a729057d38
+kernel: CR2: 0000000000000008
+
+[1] : https://www.spinics.net/lists/linux-rdma/msg56190.html
+
+Fixes: 47a2b338fe63 ("IB/core: Enforce security on management datagrams")
+Signed-off-by: Parav Pandit <parav@mellanox.com>
+Reported-by: Chris Blake <chrisrblake93@gmail.com>
+Reviewed-by: Daniel Jurgens <danielj@mellanox.com>
+Reviewed-by: Hal Rosenstock <hal@mellanox.com>
+Signed-off-by: Doug Ledford <dledford@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+---
+ drivers/infiniband/core/mad.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
+index f8f53bb90837..cb91245e9163 100644
+--- a/drivers/infiniband/core/mad.c
++++ b/drivers/infiniband/core/mad.c
+@@ -1974,14 +1974,15 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
+       unsigned long flags;
+       int ret;
++      INIT_LIST_HEAD(&mad_recv_wc->rmpp_list);
+       ret = ib_mad_enforce_security(mad_agent_priv,
+                                     mad_recv_wc->wc->pkey_index);
+       if (ret) {
+               ib_free_recv_mad(mad_recv_wc);
+               deref_mad_agent(mad_agent_priv);
++              return;
+       }
+-      INIT_LIST_HEAD(&mad_recv_wc->rmpp_list);
+       list_add(&mad_recv_wc->recv_buf.list, &mad_recv_wc->rmpp_list);
+       if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) {
+               mad_recv_wc = ib_process_rmpp_recv_wc(mad_agent_priv,
+-- 
+2.14.2
+
diff --git a/patches/kernel/0007-KVM-x86-fix-APIC-page-invalidation.patch b/patches/kernel/0007-KVM-x86-fix-APIC-page-invalidation.patch
deleted file mode 100644 (file)
index 1e693f9..0000000
+++ /dev/null
@@ -1,90 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= <rkrcmar@redhat.com>
-Date: Thu, 30 Nov 2017 19:05:45 +0100
-Subject: [PATCH] KVM: x86: fix APIC page invalidation
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-Implementation of the unpinned APIC page didn't update the VMCS address
-cache when invalidation was done through range mmu notifiers.
-This became a problem when the page notifier was removed.
-
-Re-introduce the arch-specific helper and call it from ...range_start.
-
-Fixes: 38b9917350cb ("kvm: vmx: Implement set_apic_access_page_addr")
-Fixes: 369ea8242c0f ("mm/rmap: update to new mmu_notifier semantic v2")
-Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- arch/x86/include/asm/kvm_host.h |  3 +++
- arch/x86/kvm/x86.c              | 14 ++++++++++++++
- virt/kvm/kvm_main.c             |  8 ++++++++
- 3 files changed, 25 insertions(+)
-
-diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
-index 78ec3cda9429..1953c0a5b972 100644
---- a/arch/x86/include/asm/kvm_host.h
-+++ b/arch/x86/include/asm/kvm_host.h
-@@ -1439,4 +1439,7 @@ static inline int kvm_cpu_get_apicid(int mps_cpu)
- #endif
- }
-+void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
-+              unsigned long start, unsigned long end);
-+
- #endif /* _ASM_X86_KVM_HOST_H */
-diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
-index f896c441fc2c..eae4aecf3cfe 100644
---- a/arch/x86/kvm/x86.c
-+++ b/arch/x86/kvm/x86.c
-@@ -6711,6 +6711,20 @@ static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
-       kvm_x86_ops->tlb_flush(vcpu);
- }
-+void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
-+              unsigned long start, unsigned long end)
-+{
-+      unsigned long apic_address;
-+
-+      /*
-+       * The physical address of apic access page is stored in the VMCS.
-+       * Update it when it becomes invalid.
-+       */
-+      apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
-+      if (start <= apic_address && apic_address < end)
-+              kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
-+}
-+
- void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
- {
-       struct page *page = NULL;
-diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
-index bfa9c4d34102..d0085c9d6297 100644
---- a/virt/kvm/kvm_main.c
-+++ b/virt/kvm/kvm_main.c
-@@ -136,6 +136,11 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
- static unsigned long long kvm_createvm_count;
- static unsigned long long kvm_active_vms;
-+__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
-+              unsigned long start, unsigned long end)
-+{
-+}
-+
- bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
- {
-       if (pfn_valid(pfn))
-@@ -361,6 +366,9 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
-               kvm_flush_remote_tlbs(kvm);
-       spin_unlock(&kvm->mmu_lock);
-+
-+      kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
-+
-       srcu_read_unlock(&kvm->srcu, idx);
- }
--- 
-2.14.2
-
diff --git a/patches/kernel/0008-IB-core-Don-t-enforce-PKey-security-on-SMI-MADs.patch b/patches/kernel/0008-IB-core-Don-t-enforce-PKey-security-on-SMI-MADs.patch
new file mode 100644 (file)
index 0000000..10cb488
--- /dev/null
@@ -0,0 +1,47 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Daniel Jurgens <danielj@mellanox.com>
+Date: Mon, 20 Nov 2017 16:47:45 -0600
+Subject: [PATCH] IB/core: Don't enforce PKey security on SMI MADs
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Per the infiniband spec an SMI MAD can have any PKey. Checking the pkey
+on SMI MADs is not necessary, and it seems that some older adapters
+using the mthca driver don't follow the convention of using the default
+PKey, resulting in false denials, or errors querying the PKey cache.
+
+SMI MAD security is still enforced, only agents allowed to manage the
+subnet are able to receive or send SMI MADs.
+
+Reported-by: Chris Blake <chrisrblake93@gmail.com>
+Fixes: 47a2b338fe63("IB/core: Enforce security on management datagrams")
+Signed-off-by: Daniel Jurgens <danielj@mellanox.com>
+Reviewed-by: Parav Pandit <parav@mellanox.com>
+Signed-off-by: Leon Romanovsky <leon@kernel.org>
+Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+---
+ drivers/infiniband/core/security.c | 7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c
+index 70ad19c4c73e..8f9fd3b757db 100644
+--- a/drivers/infiniband/core/security.c
++++ b/drivers/infiniband/core/security.c
+@@ -692,8 +692,11 @@ int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index)
+ {
+       int ret;
+-      if (map->agent.qp->qp_type == IB_QPT_SMI && !map->agent.smp_allowed)
+-              return -EACCES;
++      if (map->agent.qp->qp_type == IB_QPT_SMI) {
++              if (!map->agent.smp_allowed)
++                      return -EACCES;
++              return 0;
++      }
+       ret = ib_security_pkey_access(map->agent.device,
+                                     map->agent.port_num,
+-- 
+2.14.2
+
diff --git a/patches/kernel/0008-vhost-fix-skb-leak-in-handle_rx.patch b/patches/kernel/0008-vhost-fix-skb-leak-in-handle_rx.patch
deleted file mode 100644 (file)
index 2176dbc..0000000
+++ /dev/null
@@ -1,72 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Wei Xu <wexu@redhat.com>
-Date: Fri, 1 Dec 2017 05:10:36 -0500
-Subject: [PATCH] vhost: fix skb leak in handle_rx()
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-Matthew found a roughly 40% tcp throughput regression with commit
-c67df11f(vhost_net: try batch dequing from skb array) as discussed
-in the following thread:
-https://www.mail-archive.com/netdev@vger.kernel.org/msg187936.html
-
-Eventually we figured out that it was a skb leak in handle_rx()
-when sending packets to the VM. This usually happens when a guest
-can not drain out vq as fast as vhost fills in, afterwards it sets
-off the traffic jam and leaks skb(s) which occurs as no headcount
-to send on the vq from vhost side.
-
-This can be avoided by making sure we have got enough headcount
-before actually consuming a skb from the batched rx array while
-transmitting, which is simply done by moving checking the zero
-headcount a bit ahead.
-
-Signed-off-by: Wei Xu <wexu@redhat.com>
-Reported-by: Matthew Rosato <mjrosato@linux.vnet.ibm.com>
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- drivers/vhost/net.c | 20 ++++++++++----------
- 1 file changed, 10 insertions(+), 10 deletions(-)
-
-diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
-index 1c75572f5a3f..010253847022 100644
---- a/drivers/vhost/net.c
-+++ b/drivers/vhost/net.c
-@@ -781,16 +781,6 @@ static void handle_rx(struct vhost_net *net)
-               /* On error, stop handling until the next kick. */
-               if (unlikely(headcount < 0))
-                       goto out;
--              if (nvq->rx_array)
--                      msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
--              /* On overrun, truncate and discard */
--              if (unlikely(headcount > UIO_MAXIOV)) {
--                      iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1);
--                      err = sock->ops->recvmsg(sock, &msg,
--                                               1, MSG_DONTWAIT | MSG_TRUNC);
--                      pr_debug("Discarded rx packet: len %zd\n", sock_len);
--                      continue;
--              }
-               /* OK, now we need to know about added descriptors. */
-               if (!headcount) {
-                       if (unlikely(vhost_enable_notify(&net->dev, vq))) {
-@@ -803,6 +793,16 @@ static void handle_rx(struct vhost_net *net)
-                        * they refilled. */
-                       goto out;
-               }
-+              if (nvq->rx_array)
-+                      msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
-+              /* On overrun, truncate and discard */
-+              if (unlikely(headcount > UIO_MAXIOV)) {
-+                      iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1);
-+                      err = sock->ops->recvmsg(sock, &msg,
-+                                               1, MSG_DONTWAIT | MSG_TRUNC);
-+                      pr_debug("Discarded rx packet: len %zd\n", sock_len);
-+                      continue;
-+              }
-               /* We don't need to be notified again. */
-               iov_iter_init(&msg.msg_iter, READ, vq->iov, in, vhost_len);
-               fixup = msg.msg_iter;
--- 
-2.14.2
-
diff --git a/patches/kernel/0009-KVM-SVM-obey-guest-PAT.patch b/patches/kernel/0009-KVM-SVM-obey-guest-PAT.patch
new file mode 100644 (file)
index 0000000..cc64a62
--- /dev/null
@@ -0,0 +1,56 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Paolo Bonzini <pbonzini@redhat.com>
+Date: Thu, 26 Oct 2017 09:13:27 +0200
+Subject: [PATCH] KVM: SVM: obey guest PAT
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+For many years some users of assigned devices have reported worse
+performance on AMD processors with NPT than on AMD without NPT,
+Intel or bare metal.
+
+The reason turned out to be that SVM is discarding the guest PAT
+setting and uses the default (PA0=PA4=WB, PA1=PA5=WT, PA2=PA6=UC-,
+PA3=UC).  The guest might be using a different setting, and
+especially might want write combining but isn't getting it
+(instead getting slow UC or UC- accesses).
+
+Thanks a lot to geoff@hostfission.com for noticing the relation
+to the g_pat setting.  The patch has been tested also by a bunch
+of people on VFIO users forums.
+
+Fixes: 709ddebf81cb40e3c36c6109a7892e8b93a09464
+Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=196409
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Reviewed-by: David Hildenbrand <david@redhat.com>
+Tested-by: Nick Sarnie <commendsarnex@gmail.com>
+Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
+(cherry picked from commit 15038e14724799b8c205beb5f20f9e54896013c3)
+Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+---
+ arch/x86/kvm/svm.c | 7 +++++++
+ 1 file changed, 7 insertions(+)
+
+diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
+index a8c911fcd73f..e9d0f80fd83a 100644
+--- a/arch/x86/kvm/svm.c
++++ b/arch/x86/kvm/svm.c
+@@ -3650,6 +3650,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
+       u32 ecx = msr->index;
+       u64 data = msr->data;
+       switch (ecx) {
++      case MSR_IA32_CR_PAT:
++              if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
++                      return 1;
++              vcpu->arch.pat = data;
++              svm->vmcb->save.g_pat = data;
++              mark_dirty(svm->vmcb, VMCB_NPT);
++              break;
+       case MSR_IA32_TSC:
+               kvm_write_tsc(vcpu, msr);
+               break;
+-- 
+2.14.2
+
diff --git a/patches/kernel/0009-tun-free-skb-in-early-errors.patch b/patches/kernel/0009-tun-free-skb-in-early-errors.patch
deleted file mode 100644 (file)
index 0185bbd..0000000
+++ /dev/null
@@ -1,86 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Wei Xu <wexu@redhat.com>
-Date: Fri, 1 Dec 2017 05:10:37 -0500
-Subject: [PATCH] tun: free skb in early errors
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-tun_recvmsg() supports accepting skb by msg_control after
-commit ac77cfd4258f ("tun: support receiving skb through msg_control"),
-the skb if presented should be freed no matter how far it can go
-along, otherwise it would be leaked.
-
-This patch fixes several missed cases.
-
-Signed-off-by: Wei Xu <wexu@redhat.com>
-Reported-by: Matthew Rosato <mjrosato@linux.vnet.ibm.com>
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- drivers/net/tun.c | 24 ++++++++++++++++++------
- 1 file changed, 18 insertions(+), 6 deletions(-)
-
-diff --git a/drivers/net/tun.c b/drivers/net/tun.c
-index d1cb1ff83251..d58ae8ad0a4e 100644
---- a/drivers/net/tun.c
-+++ b/drivers/net/tun.c
-@@ -1519,8 +1519,11 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
-       tun_debug(KERN_INFO, tun, "tun_do_read\n");
--      if (!iov_iter_count(to))
-+      if (!iov_iter_count(to)) {
-+              if (skb)
-+                      kfree_skb(skb);
-               return 0;
-+      }
-       if (!skb) {
-               /* Read frames from ring */
-@@ -1636,22 +1639,24 @@ static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len,
- {
-       struct tun_file *tfile = container_of(sock, struct tun_file, socket);
-       struct tun_struct *tun = __tun_get(tfile);
-+      struct sk_buff *skb = m->msg_control;
-       int ret;
--      if (!tun)
--              return -EBADFD;
-+      if (!tun) {
-+              ret = -EBADFD;
-+              goto out_free_skb;
-+      }
-       if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) {
-               ret = -EINVAL;
--              goto out;
-+              goto out_put_tun;
-       }
-       if (flags & MSG_ERRQUEUE) {
-               ret = sock_recv_errqueue(sock->sk, m, total_len,
-                                        SOL_PACKET, TUN_TX_TIMESTAMP);
-               goto out;
-       }
--      ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT,
--                        m->msg_control);
-+      ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT, skb);
-       if (ret > (ssize_t)total_len) {
-               m->msg_flags |= MSG_TRUNC;
-               ret = flags & MSG_TRUNC ? ret : total_len;
-@@ -1659,6 +1664,13 @@ static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len,
- out:
-       tun_put(tun);
-       return ret;
-+
-+out_put_tun:
-+      tun_put(tun);
-+out_free_skb:
-+      if (skb)
-+              kfree_skb(skb);
-+      return ret;
- }
- static int tun_peek_len(struct socket *sock)
--- 
-2.14.2
-
diff --git a/patches/kernel/0010-net-sched-em_nbyte-don-t-add-the-data-offset-twice.patch b/patches/kernel/0010-net-sched-em_nbyte-don-t-add-the-data-offset-twice.patch
new file mode 100644 (file)
index 0000000..d0b118e
--- /dev/null
@@ -0,0 +1,33 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Wolfgang Bumiller <w.bumiller@proxmox.com>
+Date: Fri, 19 Jan 2018 11:12:37 +0100
+Subject: [PATCH] net: sched: em_nbyte: don't add the data offset twice
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+'ptr' is shifted by the offset and then validated,
+the memcmp should not add it a second time.
+
+Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
+Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+---
+ net/sched/em_nbyte.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c
+index df3110d69585..07c10bac06a0 100644
+--- a/net/sched/em_nbyte.c
++++ b/net/sched/em_nbyte.c
+@@ -51,7 +51,7 @@ static int em_nbyte_match(struct sk_buff *skb, struct tcf_ematch *em,
+       if (!tcf_valid_offset(skb, ptr, nbyte->hdr.len))
+               return 0;
+-      return !memcmp(ptr + nbyte->hdr.off, nbyte->pattern, nbyte->hdr.len);
++      return !memcmp(ptr, nbyte->pattern, nbyte->hdr.len);
+ }
+ static struct tcf_ematch_ops em_nbyte_ops = {
+-- 
+2.14.2
+
diff --git a/patches/kernel/0010-tap-free-skb-if-flags-error.patch b/patches/kernel/0010-tap-free-skb-if-flags-error.patch
deleted file mode 100644 (file)
index 87f6502..0000000
+++ /dev/null
@@ -1,58 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Wei Xu <wexu@redhat.com>
-Date: Fri, 1 Dec 2017 05:10:38 -0500
-Subject: [PATCH] tap: free skb if flags error
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-tap_recvmsg() supports accepting skb by msg_control after
-commit 3b4ba04acca8 ("tap: support receiving skb from msg_control"),
-the skb if presented should be freed within the function, otherwise
-it would be leaked.
-
-Signed-off-by: Wei Xu <wexu@redhat.com>
-Reported-by: Matthew Rosato <mjrosato@linux.vnet.ibm.com>
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- drivers/net/tap.c | 14 ++++++++++----
- 1 file changed, 10 insertions(+), 4 deletions(-)
-
-diff --git a/drivers/net/tap.c b/drivers/net/tap.c
-index 7a2f6bebfd15..96e5e5b2ae39 100644
---- a/drivers/net/tap.c
-+++ b/drivers/net/tap.c
-@@ -829,8 +829,11 @@ static ssize_t tap_do_read(struct tap_queue *q,
-       DEFINE_WAIT(wait);
-       ssize_t ret = 0;
--      if (!iov_iter_count(to))
-+      if (!iov_iter_count(to)) {
-+              if (skb)
-+                      kfree_skb(skb);
-               return 0;
-+      }
-       if (skb)
-               goto put;
-@@ -1157,11 +1160,14 @@ static int tap_recvmsg(struct socket *sock, struct msghdr *m,
-                      size_t total_len, int flags)
- {
-       struct tap_queue *q = container_of(sock, struct tap_queue, sock);
-+      struct sk_buff *skb = m->msg_control;
-       int ret;
--      if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
-+      if (flags & ~(MSG_DONTWAIT|MSG_TRUNC)) {
-+              if (skb)
-+                      kfree_skb(skb);
-               return -EINVAL;
--      ret = tap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT,
--                        m->msg_control);
-+      }
-+      ret = tap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT, skb);
-       if (ret > total_len) {
-               m->msg_flags |= MSG_TRUNC;
-               ret = flags & MSG_TRUNC ? ret : total_len;
--- 
-2.14.2
-
diff --git a/patches/kernel/0011-IB-core-Avoid-crash-on-pkey-enforcement-failed-in-re.patch b/patches/kernel/0011-IB-core-Avoid-crash-on-pkey-enforcement-failed-in-re.patch
deleted file mode 100644 (file)
index f61fa4b..0000000
+++ /dev/null
@@ -1,93 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Parav Pandit <parav@mellanox.com>
-Date: Fri, 5 Jan 2018 23:51:12 +0100
-Subject: [PATCH] IB/core: Avoid crash on pkey enforcement failed in received
- MADs
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-commit 89548bcafec7ecfeea58c553f0834b5d575a66eb upstream.
-
-Below kernel crash is observed when Pkey security enforcement fails on
-received MADs. This issue is reported in [1].
-
-ib_free_recv_mad() accesses the rmpp_list, whose initialization is
-needed before accessing it.
-When security enformcent fails on received MADs, MAD processing avoided
-due to security checks failed.
-
-OpenSM[3770]: SM port is down
-kernel: BUG: unable to handle kernel NULL pointer dereference at 0000000000000008
-kernel: IP: ib_free_recv_mad+0x44/0xa0 [ib_core]
-kernel: PGD 0
-kernel: P4D 0
-kernel:
-kernel: Oops: 0002 [#1] SMP
-kernel: CPU: 0 PID: 2833 Comm: kworker/0:1H Tainted: P          IO    4.13.4-1-pve #1
-kernel: Hardware name: Dell       XS23-TY3        /9CMP63, BIOS 1.71 09/17/2013
-kernel: Workqueue: ib-comp-wq ib_cq_poll_work [ib_core]
-kernel: task: ffffa069c6541600 task.stack: ffffb9a729054000
-kernel: RIP: 0010:ib_free_recv_mad+0x44/0xa0 [ib_core]
-kernel: RSP: 0018:ffffb9a729057d38 EFLAGS: 00010286
-kernel: RAX: ffffa069cb138a48 RBX: ffffa069cb138a10 RCX: 0000000000000000
-kernel: RDX: ffffb9a729057d38 RSI: 0000000000000000 RDI: ffffa069cb138a20
-kernel: RBP: ffffb9a729057d60 R08: ffffa072d2d49800 R09: ffffa069cb138ae0
-kernel: R10: ffffa069cb138ae0 R11: ffffa072b3994e00 R12: ffffb9a729057d38
-kernel: R13: ffffa069d1c90000 R14: 0000000000000000 R15: ffffa069d1c90880
-kernel: FS:  0000000000000000(0000) GS:ffffa069dba00000(0000) knlGS:0000000000000000
-kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
-kernel: CR2: 0000000000000008 CR3: 00000011f51f2000 CR4: 00000000000006f0
-kernel: Call Trace:
-kernel:  ib_mad_recv_done+0x5cc/0xb50 [ib_core]
-kernel:  __ib_process_cq+0x5c/0xb0 [ib_core]
-kernel:  ib_cq_poll_work+0x20/0x60 [ib_core]
-kernel:  process_one_work+0x1e9/0x410
-kernel:  worker_thread+0x4b/0x410
-kernel:  kthread+0x109/0x140
-kernel:  ? process_one_work+0x410/0x410
-kernel:  ? kthread_create_on_node+0x70/0x70
-kernel:  ? SyS_exit_group+0x14/0x20
-kernel:  ret_from_fork+0x25/0x30
-kernel: RIP: ib_free_recv_mad+0x44/0xa0 [ib_core] RSP: ffffb9a729057d38
-kernel: CR2: 0000000000000008
-
-[1] : https://www.spinics.net/lists/linux-rdma/msg56190.html
-
-Fixes: 47a2b338fe63 ("IB/core: Enforce security on management datagrams")
-Signed-off-by: Parav Pandit <parav@mellanox.com>
-Reported-by: Chris Blake <chrisrblake93@gmail.com>
-Reviewed-by: Daniel Jurgens <danielj@mellanox.com>
-Reviewed-by: Hal Rosenstock <hal@mellanox.com>
-Signed-off-by: Doug Ledford <dledford@redhat.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- drivers/infiniband/core/mad.c | 3 ++-
- 1 file changed, 2 insertions(+), 1 deletion(-)
-
-diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
-index f8f53bb90837..cb91245e9163 100644
---- a/drivers/infiniband/core/mad.c
-+++ b/drivers/infiniband/core/mad.c
-@@ -1974,14 +1974,15 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
-       unsigned long flags;
-       int ret;
-+      INIT_LIST_HEAD(&mad_recv_wc->rmpp_list);
-       ret = ib_mad_enforce_security(mad_agent_priv,
-                                     mad_recv_wc->wc->pkey_index);
-       if (ret) {
-               ib_free_recv_mad(mad_recv_wc);
-               deref_mad_agent(mad_agent_priv);
-+              return;
-       }
--      INIT_LIST_HEAD(&mad_recv_wc->rmpp_list);
-       list_add(&mad_recv_wc->recv_buf.list, &mad_recv_wc->rmpp_list);
-       if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) {
-               mad_recv_wc = ib_process_rmpp_recv_wc(mad_agent_priv,
--- 
-2.14.2
-
diff --git a/patches/kernel/0011-net-sched-fix-TCF_LAYER_LINK-case-in-tcf_get_base_pt.patch b/patches/kernel/0011-net-sched-fix-TCF_LAYER_LINK-case-in-tcf_get_base_pt.patch
new file mode 100644 (file)
index 0000000..416c277
--- /dev/null
@@ -0,0 +1,34 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Wolfgang Bumiller <w.bumiller@proxmox.com>
+Date: Fri, 19 Jan 2018 11:12:38 +0100
+Subject: [PATCH] net: sched: fix TCF_LAYER_LINK case in tcf_get_base_ptr
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+TCF_LAYER_LINK and TCF_LAYER_NETWORK returned the same pointer as
+skb->data points to the network header.
+Use skb_mac_header instead.
+
+Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
+Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+---
+ include/net/pkt_cls.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
+index 537d0a0ad4c4..4450961b1554 100644
+--- a/include/net/pkt_cls.h
++++ b/include/net/pkt_cls.h
+@@ -395,7 +395,7 @@ static inline unsigned char * tcf_get_base_ptr(struct sk_buff *skb, int layer)
+ {
+       switch (layer) {
+               case TCF_LAYER_LINK:
+-                      return skb->data;
++                      return skb_mac_header(skb);
+               case TCF_LAYER_NETWORK:
+                       return skb_network_header(skb);
+               case TCF_LAYER_TRANSPORT:
+-- 
+2.14.2
+
diff --git a/patches/kernel/0012-IB-core-Don-t-enforce-PKey-security-on-SMI-MADs.patch b/patches/kernel/0012-IB-core-Don-t-enforce-PKey-security-on-SMI-MADs.patch
deleted file mode 100644 (file)
index 10cb488..0000000
+++ /dev/null
@@ -1,47 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Daniel Jurgens <danielj@mellanox.com>
-Date: Mon, 20 Nov 2017 16:47:45 -0600
-Subject: [PATCH] IB/core: Don't enforce PKey security on SMI MADs
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-Per the infiniband spec an SMI MAD can have any PKey. Checking the pkey
-on SMI MADs is not necessary, and it seems that some older adapters
-using the mthca driver don't follow the convention of using the default
-PKey, resulting in false denials, or errors querying the PKey cache.
-
-SMI MAD security is still enforced, only agents allowed to manage the
-subnet are able to receive or send SMI MADs.
-
-Reported-by: Chris Blake <chrisrblake93@gmail.com>
-Fixes: 47a2b338fe63("IB/core: Enforce security on management datagrams")
-Signed-off-by: Daniel Jurgens <danielj@mellanox.com>
-Reviewed-by: Parav Pandit <parav@mellanox.com>
-Signed-off-by: Leon Romanovsky <leon@kernel.org>
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- drivers/infiniband/core/security.c | 7 +++++--
- 1 file changed, 5 insertions(+), 2 deletions(-)
-
-diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c
-index 70ad19c4c73e..8f9fd3b757db 100644
---- a/drivers/infiniband/core/security.c
-+++ b/drivers/infiniband/core/security.c
-@@ -692,8 +692,11 @@ int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index)
- {
-       int ret;
--      if (map->agent.qp->qp_type == IB_QPT_SMI && !map->agent.smp_allowed)
--              return -EACCES;
-+      if (map->agent.qp->qp_type == IB_QPT_SMI) {
-+              if (!map->agent.smp_allowed)
-+                      return -EACCES;
-+              return 0;
-+      }
-       ret = ib_security_pkey_access(map->agent.device,
-                                     map->agent.port_num,
--- 
-2.14.2
-
diff --git a/patches/kernel/0012-KVM-x86-Add-memory-barrier-on-vmcs-field-lookup.patch b/patches/kernel/0012-KVM-x86-Add-memory-barrier-on-vmcs-field-lookup.patch
new file mode 100644 (file)
index 0000000..0523e06
--- /dev/null
@@ -0,0 +1,49 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Andrew Honig <ahonig@google.com>
+Date: Wed, 10 Jan 2018 10:12:03 -0800
+Subject: [PATCH] KVM: x86: Add memory barrier on vmcs field lookup
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+commit 75f139aaf896d6fdeec2e468ddfa4b2fe469bf40 upstream.
+
+This adds a memory barrier when performing a lookup into
+the vmcs_field_to_offset_table.  This is related to
+CVE-2017-5753.
+
+Signed-off-by: Andrew Honig <ahonig@google.com>
+Reviewed-by: Jim Mattson <jmattson@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+---
+ arch/x86/kvm/vmx.c | 12 ++++++++++--
+ 1 file changed, 10 insertions(+), 2 deletions(-)
+
+diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
+index 097f3daae037..b078b56234bb 100644
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -883,8 +883,16 @@ static inline short vmcs_field_to_offset(unsigned long field)
+ {
+       BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX);
+-      if (field >= ARRAY_SIZE(vmcs_field_to_offset_table) ||
+-          vmcs_field_to_offset_table[field] == 0)
++      if (field >= ARRAY_SIZE(vmcs_field_to_offset_table))
++              return -ENOENT;
++
++      /*
++       * FIXME: Mitigation for CVE-2017-5753.  To be replaced with a
++       * generic mechanism.
++       */
++      asm("lfence");
++
++      if (vmcs_field_to_offset_table[field] == 0)
+               return -ENOENT;
+       return vmcs_field_to_offset_table[field];
+-- 
+2.14.2
+
diff --git a/patches/kernel/0013-EDAC-sb_edac-Fix-missing-break-in-switch.patch b/patches/kernel/0013-EDAC-sb_edac-Fix-missing-break-in-switch.patch
new file mode 100644 (file)
index 0000000..0421f14
--- /dev/null
@@ -0,0 +1,37 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: "Gustavo A. R. Silva" <garsilva@embeddedor.com>
+Date: Mon, 16 Oct 2017 12:40:29 -0500
+Subject: [PATCH] EDAC, sb_edac: Fix missing break in switch
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Add missing break statement in order to prevent the code from falling
+through.
+
+Signed-off-by: Gustavo A. R. Silva <garsilva@embeddedor.com>
+Cc: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
+Cc: linux-edac <linux-edac@vger.kernel.org>
+Link: http://lkml.kernel.org/r/20171016174029.GA19757@embeddedor.com
+Signed-off-by: Borislav Petkov <bp@suse.de>
+(cherry picked from commit a8e9b186f153a44690ad0363a56716e7077ad28c)
+Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+---
+ drivers/edac/sb_edac.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c
+index 5c3e707ff3fc..59af590b660c 100644
+--- a/drivers/edac/sb_edac.c
++++ b/drivers/edac/sb_edac.c
+@@ -2454,6 +2454,7 @@ static int ibridge_mci_bind_devs(struct mem_ctl_info *mci,
+               case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TA:
+               case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TA:
+                       pvt->pci_ta = pdev;
++                      break;
+               case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_RAS:
+               case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_RAS:
+                       pvt->pci_ras = pdev;
+-- 
+2.14.2
+
diff --git a/patches/kernel/0013-kvm-vmx-Reinstate-support-for-CPUs-without-virtual-N.patch b/patches/kernel/0013-kvm-vmx-Reinstate-support-for-CPUs-without-virtual-N.patch
deleted file mode 100644 (file)
index 54a3fdb..0000000
+++ /dev/null
@@ -1,299 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Paolo Bonzini <pbonzini@redhat.com>
-Date: Mon, 6 Nov 2017 13:31:12 +0100
-Subject: [PATCH] kvm: vmx: Reinstate support for CPUs without virtual NMI
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-commit 8a1b43922d0d1279e7936ba85c4c2a870403c95f upstream.
-
-This is more or less a revert of commit 2c82878b0cb3 ("KVM: VMX: require
-virtual NMI support", 2017-03-27); it turns out that Core 2 Duo machines
-only had virtual NMIs in some SKUs.
-
-The revert is not trivial because in the meanwhile there have been several
-fixes to nested NMI injection.  Therefore, the entire vNMI state is moved
-to struct loaded_vmcs.
-
-Another change compared to before the patch is a simplification here:
-
-       if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
-           !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
-                                       get_vmcs12(vcpu))))) {
-
-The final condition here is always true (because nested_cpu_has_virtual_nmis
-is always false) and is removed.
-
-Fixes: 2c82878b0cb38fd516fd612c67852a6bbf282003
-Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1490803
-Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
-Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- arch/x86/kvm/vmx.c | 150 +++++++++++++++++++++++++++++++++++++----------------
- 1 file changed, 106 insertions(+), 44 deletions(-)
-
-diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
-index 5edf05ce45de..146caacd8fdd 100644
---- a/arch/x86/kvm/vmx.c
-+++ b/arch/x86/kvm/vmx.c
-@@ -204,6 +204,10 @@ struct loaded_vmcs {
-       bool nmi_known_unmasked;
-       unsigned long vmcs_host_cr3;    /* May not match real cr3 */
-       unsigned long vmcs_host_cr4;    /* May not match real cr4 */
-+      /* Support for vnmi-less CPUs */
-+      int soft_vnmi_blocked;
-+      ktime_t entry_time;
-+      s64 vnmi_blocked_time;
-       struct list_head loaded_vmcss_on_cpu_link;
- };
-@@ -1290,6 +1294,11 @@ static inline bool cpu_has_vmx_invpcid(void)
-               SECONDARY_EXEC_ENABLE_INVPCID;
- }
-+static inline bool cpu_has_virtual_nmis(void)
-+{
-+      return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
-+}
-+
- static inline bool cpu_has_vmx_wbinvd_exit(void)
- {
-       return vmcs_config.cpu_based_2nd_exec_ctrl &
-@@ -1341,11 +1350,6 @@ static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
-               (vmcs12->secondary_vm_exec_control & bit);
- }
--static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
--{
--      return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
--}
--
- static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12)
- {
-       return vmcs12->pin_based_vm_exec_control &
-@@ -3687,9 +3691,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
-                               &_vmexit_control) < 0)
-               return -EIO;
--      min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING |
--              PIN_BASED_VIRTUAL_NMIS;
--      opt = PIN_BASED_POSTED_INTR | PIN_BASED_VMX_PREEMPTION_TIMER;
-+      min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
-+      opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
-+               PIN_BASED_VMX_PREEMPTION_TIMER;
-       if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
-                               &_pin_based_exec_control) < 0)
-               return -EIO;
-@@ -5549,7 +5553,8 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
- static void enable_nmi_window(struct kvm_vcpu *vcpu)
- {
--      if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
-+      if (!cpu_has_virtual_nmis() ||
-+          vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
-               enable_irq_window(vcpu);
-               return;
-       }
-@@ -5589,6 +5594,19 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
- {
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-+      if (!cpu_has_virtual_nmis()) {
-+              /*
-+               * Tracking the NMI-blocked state in software is built upon
-+               * finding the next open IRQ window. This, in turn, depends on
-+               * well-behaving guests: They have to keep IRQs disabled at
-+               * least as long as the NMI handler runs. Otherwise we may
-+               * cause NMI nesting, maybe breaking the guest. But as this is
-+               * highly unlikely, we can live with the residual risk.
-+               */
-+              vmx->loaded_vmcs->soft_vnmi_blocked = 1;
-+              vmx->loaded_vmcs->vnmi_blocked_time = 0;
-+      }
-+
-       ++vcpu->stat.nmi_injections;
-       vmx->loaded_vmcs->nmi_known_unmasked = false;
-@@ -5607,6 +5625,8 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-       bool masked;
-+      if (!cpu_has_virtual_nmis())
-+              return vmx->loaded_vmcs->soft_vnmi_blocked;
-       if (vmx->loaded_vmcs->nmi_known_unmasked)
-               return false;
-       masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
-@@ -5618,13 +5638,20 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
- {
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
--      vmx->loaded_vmcs->nmi_known_unmasked = !masked;
--      if (masked)
--              vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
--                            GUEST_INTR_STATE_NMI);
--      else
--              vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
--                              GUEST_INTR_STATE_NMI);
-+      if (!cpu_has_virtual_nmis()) {
-+              if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
-+                      vmx->loaded_vmcs->soft_vnmi_blocked = masked;
-+                      vmx->loaded_vmcs->vnmi_blocked_time = 0;
-+              }
-+      } else {
-+              vmx->loaded_vmcs->nmi_known_unmasked = !masked;
-+              if (masked)
-+                      vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
-+                                    GUEST_INTR_STATE_NMI);
-+              else
-+                      vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
-+                                      GUEST_INTR_STATE_NMI);
-+      }
- }
- static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
-@@ -5632,6 +5659,10 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
-       if (to_vmx(vcpu)->nested.nested_run_pending)
-               return 0;
-+      if (!cpu_has_virtual_nmis() &&
-+          to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
-+              return 0;
-+
-       return  !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
-                 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
-                  | GUEST_INTR_STATE_NMI));
-@@ -6360,6 +6391,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
-        * AAK134, BY25.
-        */
-       if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
-+                      cpu_has_virtual_nmis() &&
-                       (exit_qualification & INTR_INFO_UNBLOCK_NMI))
-               vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
-@@ -6834,7 +6866,7 @@ static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
-       }
-       /* Create a new VMCS */
--      item = kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
-+      item = kzalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
-       if (!item)
-               return NULL;
-       item->vmcs02.vmcs = alloc_vmcs();
-@@ -7851,6 +7883,7 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
-        * "blocked by NMI" bit has to be set before next VM entry.
-        */
-       if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
-+                      cpu_has_virtual_nmis() &&
-                       (exit_qualification & INTR_INFO_UNBLOCK_NMI))
-               vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
-                               GUEST_INTR_STATE_NMI);
-@@ -8568,6 +8601,25 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
-               return 0;
-       }
-+      if (unlikely(!cpu_has_virtual_nmis() &&
-+                   vmx->loaded_vmcs->soft_vnmi_blocked)) {
-+              if (vmx_interrupt_allowed(vcpu)) {
-+                      vmx->loaded_vmcs->soft_vnmi_blocked = 0;
-+              } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
-+                         vcpu->arch.nmi_pending) {
-+                      /*
-+                       * This CPU don't support us in finding the end of an
-+                       * NMI-blocked window if the guest runs with IRQs
-+                       * disabled. So we pull the trigger after 1 s of
-+                       * futile waiting, but inform the user about this.
-+                       */
-+                      printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
-+                             "state on VCPU %d after 1 s timeout\n",
-+                             __func__, vcpu->vcpu_id);
-+                      vmx->loaded_vmcs->soft_vnmi_blocked = 0;
-+              }
-+      }
-+
-       if (exit_reason < kvm_vmx_max_exit_handlers
-           && kvm_vmx_exit_handlers[exit_reason])
-               return kvm_vmx_exit_handlers[exit_reason](vcpu);
-@@ -8850,33 +8902,38 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
-       idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
--      if (vmx->loaded_vmcs->nmi_known_unmasked)
--              return;
--      /*
--       * Can't use vmx->exit_intr_info since we're not sure what
--       * the exit reason is.
--       */
--      exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
--      unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
--      vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
--      /*
--       * SDM 3: 27.7.1.2 (September 2008)
--       * Re-set bit "block by NMI" before VM entry if vmexit caused by
--       * a guest IRET fault.
--       * SDM 3: 23.2.2 (September 2008)
--       * Bit 12 is undefined in any of the following cases:
--       *  If the VM exit sets the valid bit in the IDT-vectoring
--       *   information field.
--       *  If the VM exit is due to a double fault.
--       */
--      if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
--          vector != DF_VECTOR && !idtv_info_valid)
--              vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
--                            GUEST_INTR_STATE_NMI);
--      else
--              vmx->loaded_vmcs->nmi_known_unmasked =
--                      !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
--                        & GUEST_INTR_STATE_NMI);
-+      if (cpu_has_virtual_nmis()) {
-+              if (vmx->loaded_vmcs->nmi_known_unmasked)
-+                      return;
-+              /*
-+               * Can't use vmx->exit_intr_info since we're not sure what
-+               * the exit reason is.
-+               */
-+              exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-+              unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
-+              vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
-+              /*
-+               * SDM 3: 27.7.1.2 (September 2008)
-+               * Re-set bit "block by NMI" before VM entry if vmexit caused by
-+               * a guest IRET fault.
-+               * SDM 3: 23.2.2 (September 2008)
-+               * Bit 12 is undefined in any of the following cases:
-+               *  If the VM exit sets the valid bit in the IDT-vectoring
-+               *   information field.
-+               *  If the VM exit is due to a double fault.
-+               */
-+              if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
-+                  vector != DF_VECTOR && !idtv_info_valid)
-+                      vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
-+                                    GUEST_INTR_STATE_NMI);
-+              else
-+                      vmx->loaded_vmcs->nmi_known_unmasked =
-+                              !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
-+                                & GUEST_INTR_STATE_NMI);
-+      } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
-+              vmx->loaded_vmcs->vnmi_blocked_time +=
-+                      ktime_to_ns(ktime_sub(ktime_get(),
-+                                            vmx->loaded_vmcs->entry_time));
- }
- static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
-@@ -8993,6 +9050,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-       unsigned long debugctlmsr, cr3, cr4;
-+      /* Record the guest's net vcpu time for enforced NMI injections. */
-+      if (unlikely(!cpu_has_virtual_nmis() &&
-+                   vmx->loaded_vmcs->soft_vnmi_blocked))
-+              vmx->loaded_vmcs->entry_time = ktime_get();
-+
-       /* Don't enter VMX if guest state is invalid, let the exit handler
-          start emulation until we arrive back to a valid state */
-       if (vmx->emulation_required)
--- 
-2.14.2
-
diff --git a/patches/kernel/0014-KVM-SVM-obey-guest-PAT.patch b/patches/kernel/0014-KVM-SVM-obey-guest-PAT.patch
deleted file mode 100644 (file)
index cc64a62..0000000
+++ /dev/null
@@ -1,56 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Paolo Bonzini <pbonzini@redhat.com>
-Date: Thu, 26 Oct 2017 09:13:27 +0200
-Subject: [PATCH] KVM: SVM: obey guest PAT
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-For many years some users of assigned devices have reported worse
-performance on AMD processors with NPT than on AMD without NPT,
-Intel or bare metal.
-
-The reason turned out to be that SVM is discarding the guest PAT
-setting and uses the default (PA0=PA4=WB, PA1=PA5=WT, PA2=PA6=UC-,
-PA3=UC).  The guest might be using a different setting, and
-especially might want write combining but isn't getting it
-(instead getting slow UC or UC- accesses).
-
-Thanks a lot to geoff@hostfission.com for noticing the relation
-to the g_pat setting.  The patch has been tested also by a bunch
-of people on VFIO users forums.
-
-Fixes: 709ddebf81cb40e3c36c6109a7892e8b93a09464
-Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=196409
-Cc: stable@vger.kernel.org
-Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
-Reviewed-by: David Hildenbrand <david@redhat.com>
-Tested-by: Nick Sarnie <commendsarnex@gmail.com>
-Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
-(cherry picked from commit 15038e14724799b8c205beb5f20f9e54896013c3)
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- arch/x86/kvm/svm.c | 7 +++++++
- 1 file changed, 7 insertions(+)
-
-diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
-index a8c911fcd73f..e9d0f80fd83a 100644
---- a/arch/x86/kvm/svm.c
-+++ b/arch/x86/kvm/svm.c
-@@ -3650,6 +3650,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
-       u32 ecx = msr->index;
-       u64 data = msr->data;
-       switch (ecx) {
-+      case MSR_IA32_CR_PAT:
-+              if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
-+                      return 1;
-+              vcpu->arch.pat = data;
-+              svm->vmcb->save.g_pat = data;
-+              mark_dirty(svm->vmcb, VMCB_NPT);
-+              break;
-       case MSR_IA32_TSC:
-               kvm_write_tsc(vcpu, msr);
-               break;
--- 
-2.14.2
-
diff --git a/patches/kernel/0014-sched-wait-Fix-add_wait_queue-behavioral-change.patch b/patches/kernel/0014-sched-wait-Fix-add_wait_queue-behavioral-change.patch
new file mode 100644 (file)
index 0000000..852ce99
--- /dev/null
@@ -0,0 +1,52 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Omar Sandoval <osandov@fb.com>
+Date: Tue, 5 Dec 2017 23:15:31 -0800
+Subject: [PATCH] sched/wait: Fix add_wait_queue() behavioral change
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+The following cleanup commit:
+
+  50816c48997a ("sched/wait: Standardize internal naming of wait-queue entries")
+
+... unintentionally changed the behavior of add_wait_queue() from
+inserting the wait entry at the head of the wait queue to the tail
+of the wait queue.
+
+Beyond a negative performance impact this change in behavior
+theoretically also breaks wait queues which mix exclusive and
+non-exclusive waiters, as non-exclusive waiters will not be
+woken up if they are queued behind enough exclusive waiters.
+
+Signed-off-by: Omar Sandoval <osandov@fb.com>
+Reviewed-by: Jens Axboe <axboe@kernel.dk>
+Acked-by: Peter Zijlstra <peterz@infradead.org>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: kernel-team@fb.com
+Fixes: 50816c48997a ("sched/wait: Standardize internal naming of wait-queue entries")
+Link: http://lkml.kernel.org/r/a16c8ccffd39bd08fdaa45a5192294c784b803a7.1512544324.git.osandov@fb.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+(cherry picked from commit c6b9d9a33029014446bd9ed84c1688f6d3d4eab9)
+Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+---
+ kernel/sched/wait.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
+index d6afed6d0752..c09ebe92a40a 100644
+--- a/kernel/sched/wait.c
++++ b/kernel/sched/wait.c
+@@ -27,7 +27,7 @@ void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq
+       wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE;
+       spin_lock_irqsave(&wq_head->lock, flags);
+-      __add_wait_queue_entry_tail(wq_head, wq_entry);
++      __add_wait_queue(wq_head, wq_entry);
+       spin_unlock_irqrestore(&wq_head->lock, flags);
+ }
+ EXPORT_SYMBOL(add_wait_queue);
+-- 
+2.14.2
+
diff --git a/patches/kernel/0015-module-retpoline-Warn-about-missing-retpoline-in-mod.patch b/patches/kernel/0015-module-retpoline-Warn-about-missing-retpoline-in-mod.patch
new file mode 100644 (file)
index 0000000..27fffe4
--- /dev/null
@@ -0,0 +1,164 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Andi Kleen <ak@linux.intel.com>
+Date: Thu, 25 Jan 2018 15:50:28 -0800
+Subject: [PATCH] module/retpoline: Warn about missing retpoline in module
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+There's a risk that a kernel which has full retpoline mitigations becomes
+vulnerable when a module gets loaded that hasn't been compiled with the
+right compiler or the right option.
+
+To enable detection of that mismatch at module load time, add a module info
+string "retpoline" at build time when the module was compiled with
+retpoline support. This only covers compiled C source; assembler source
+and prebuilt object files are not checked.
+
+If a retpoline enabled kernel detects a non retpoline protected module at
+load time, print a warning and report it in the sysfs vulnerability file.
+
+[ tglx: Massaged changelog ]
+
+Signed-off-by: Andi Kleen <ak@linux.intel.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: David Woodhouse <dwmw2@infradead.org>
+Cc: gregkh@linuxfoundation.org
+Cc: torvalds@linux-foundation.org
+Cc: jeyu@kernel.org
+Cc: arjan@linux.intel.com
+Link: https://lkml.kernel.org/r/20180125235028.31211-1-andi@firstfloor.org
+(backported from commit caf7501a1b4ec964190f31f9c3f163de252273b8)
+Conflicts:
+       arch/x86/kernel/cpu/bugs.c
+context changes
+Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+---
+ include/linux/module.h     |  9 +++++++++
+ arch/x86/kernel/cpu/bugs.c | 19 +++++++++++++++++--
+ kernel/module.c            | 11 +++++++++++
+ scripts/mod/modpost.c      |  9 +++++++++
+ 4 files changed, 46 insertions(+), 2 deletions(-)
+
+diff --git a/include/linux/module.h b/include/linux/module.h
+index e7bdd549e527..c4fdf7661f82 100644
+--- a/include/linux/module.h
++++ b/include/linux/module.h
+@@ -794,6 +794,15 @@ static inline void module_bug_finalize(const Elf_Ehdr *hdr,
+ static inline void module_bug_cleanup(struct module *mod) {}
+ #endif        /* CONFIG_GENERIC_BUG */
++#ifdef RETPOLINE
++extern bool retpoline_module_ok(bool has_retpoline);
++#else
++static inline bool retpoline_module_ok(bool has_retpoline)
++{
++      return true;
++}
++#endif
++
+ #ifdef CONFIG_MODULE_SIG
+ static inline bool module_sig_ok(struct module *module)
+ {
+diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
+index d5bafcdb4891..e623bd731a74 100644
+--- a/arch/x86/kernel/cpu/bugs.c
++++ b/arch/x86/kernel/cpu/bugs.c
+@@ -11,6 +11,7 @@
+ #include <linux/utsname.h>
+ #include <linux/cpu.h>
+ #include <linux/smp.h>
++#include <linux/module.h>
+ #include <asm/nospec-branch.h>
+ #include <asm/cmdline.h>
+@@ -93,6 +94,19 @@ static const char *spectre_v2_strings[] = {
+ #define pr_fmt(fmt)     "Spectre V2 mitigation: " fmt
+ static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE;
++static bool spectre_v2_bad_module;
++
++#ifdef RETPOLINE
++bool retpoline_module_ok(bool has_retpoline)
++{
++      if (spectre_v2_enabled == SPECTRE_V2_NONE || has_retpoline)
++              return true;
++
++      pr_err("System may be vunerable to spectre v2\n");
++      spectre_v2_bad_module = true;
++      return false;
++}
++#endif
+ static void __init spec2_print_if_insecure(const char *reason)
+ {
+@@ -299,7 +313,8 @@ ssize_t cpu_show_spectre_v2(struct device *dev,
+       if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
+               return sprintf(buf, "Not affected\n");
+-      return sprintf(buf, "%s%s\n", spectre_v2_strings[spectre_v2_enabled],
+-                     ibpb_inuse ? ", IBPB (Intel v4)" : "");
++      return sprintf(buf, "%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
++                     ibpb_inuse ? ", IBPB (Intel v4)" : "",
++                     spectre_v2_bad_module ? " - vulnerable module loaded" : "");
+ }
+ #endif
+diff --git a/kernel/module.c b/kernel/module.c
+index e5b878b26906..de7db074f793 100644
+--- a/kernel/module.c
++++ b/kernel/module.c
+@@ -2855,6 +2855,15 @@ static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
+ }
+ #endif /* CONFIG_LIVEPATCH */
++static void check_modinfo_retpoline(struct module *mod, struct load_info *info)
++{
++      if (retpoline_module_ok(get_modinfo(info, "retpoline")))
++              return;
++
++      pr_warn("%s: loading module not compiled with retpoline compiler.\n",
++              mod->name);
++}
++
+ /* Sets info->hdr and info->len. */
+ static int copy_module_from_user(const void __user *umod, unsigned long len,
+                                 struct load_info *info)
+@@ -3021,6 +3030,8 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
+               add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
+       }
++      check_modinfo_retpoline(mod, info);
++
+       if (get_modinfo(info, "staging")) {
+               add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
+               pr_warn("%s: module is from the staging directory, the quality "
+diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
+index 48397feb08fb..cc91f81ac33e 100644
+--- a/scripts/mod/modpost.c
++++ b/scripts/mod/modpost.c
+@@ -2147,6 +2147,14 @@ static void add_intree_flag(struct buffer *b, int is_intree)
+               buf_printf(b, "\nMODULE_INFO(intree, \"Y\");\n");
+ }
++/* Cannot check for assembler */
++static void add_retpoline(struct buffer *b)
++{
++      buf_printf(b, "\n#ifdef RETPOLINE\n");
++      buf_printf(b, "MODULE_INFO(retpoline, \"Y\");\n");
++      buf_printf(b, "#endif\n");
++}
++
+ static void add_staging_flag(struct buffer *b, const char *name)
+ {
+       static const char *staging_dir = "drivers/staging";
+@@ -2492,6 +2500,7 @@ int main(int argc, char **argv)
+               add_header(&buf, mod);
+               add_intree_flag(&buf, !external_module);
++              add_retpoline(&buf);
+               add_staging_flag(&buf, mod->name);
+               err |= add_versions(&buf, mod);
+               add_depends(&buf, mod, modules);
+-- 
+2.14.2
+
diff --git a/patches/kernel/0015-net-sched-em_nbyte-don-t-add-the-data-offset-twice.patch b/patches/kernel/0015-net-sched-em_nbyte-don-t-add-the-data-offset-twice.patch
deleted file mode 100644 (file)
index d0b118e..0000000
+++ /dev/null
@@ -1,33 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Wolfgang Bumiller <w.bumiller@proxmox.com>
-Date: Fri, 19 Jan 2018 11:12:37 +0100
-Subject: [PATCH] net: sched: em_nbyte: don't add the data offset twice
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-'ptr' is shifted by the offset and then validated,
-the memcmp should not add it a second time.
-
-Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- net/sched/em_nbyte.c | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c
-index df3110d69585..07c10bac06a0 100644
---- a/net/sched/em_nbyte.c
-+++ b/net/sched/em_nbyte.c
-@@ -51,7 +51,7 @@ static int em_nbyte_match(struct sk_buff *skb, struct tcf_ematch *em,
-       if (!tcf_valid_offset(skb, ptr, nbyte->hdr.len))
-               return 0;
--      return !memcmp(ptr + nbyte->hdr.off, nbyte->pattern, nbyte->hdr.len);
-+      return !memcmp(ptr, nbyte->pattern, nbyte->hdr.len);
- }
- static struct tcf_ematch_ops em_nbyte_ops = {
--- 
-2.14.2
-
diff --git a/patches/kernel/0016-net-sched-fix-TCF_LAYER_LINK-case-in-tcf_get_base_pt.patch b/patches/kernel/0016-net-sched-fix-TCF_LAYER_LINK-case-in-tcf_get_base_pt.patch
deleted file mode 100644 (file)
index 416c277..0000000
+++ /dev/null
@@ -1,34 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Wolfgang Bumiller <w.bumiller@proxmox.com>
-Date: Fri, 19 Jan 2018 11:12:38 +0100
-Subject: [PATCH] net: sched: fix TCF_LAYER_LINK case in tcf_get_base_ptr
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-TCF_LAYER_LINK and TCF_LAYER_NETWORK returned the same pointer as
-skb->data points to the network header.
-Use skb_mac_header instead.
-
-Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- include/net/pkt_cls.h | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
-index 537d0a0ad4c4..4450961b1554 100644
---- a/include/net/pkt_cls.h
-+++ b/include/net/pkt_cls.h
-@@ -395,7 +395,7 @@ static inline unsigned char * tcf_get_base_ptr(struct sk_buff *skb, int layer)
- {
-       switch (layer) {
-               case TCF_LAYER_LINK:
--                      return skb->data;
-+                      return skb_mac_header(skb);
-               case TCF_LAYER_NETWORK:
-                       return skb_network_header(skb);
-               case TCF_LAYER_TRANSPORT:
--- 
-2.14.2
-
diff --git a/patches/kernel/0016-net-tcp-close-sock-if-net-namespace-is-exiting.patch b/patches/kernel/0016-net-tcp-close-sock-if-net-namespace-is-exiting.patch
new file mode 100644 (file)
index 0000000..d4eb5e6
--- /dev/null
@@ -0,0 +1,127 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Dan Streetman <ddstreet@ieee.org>
+Date: Thu, 18 Jan 2018 16:14:26 -0500
+Subject: [PATCH] net: tcp: close sock if net namespace is exiting
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+When a tcp socket is closed, if it detects that its net namespace is
+exiting, close immediately and do not wait for FIN sequence.
+
+For normal sockets, a reference is taken to their net namespace, so it will
+never exit while the socket is open.  However, kernel sockets do not take a
+reference to their net namespace, so it may begin exiting while the kernel
+socket is still open.  In this case if the kernel socket is a tcp socket,
+it will stay open trying to complete its close sequence.  The sock's dst(s)
+hold a reference to their interface, which are all transferred to the
+namespace's loopback interface when the real interfaces are taken down.
+When the namespace tries to take down its loopback interface, it hangs
+waiting for all references to the loopback interface to release, which
+results in messages like:
+
+unregister_netdevice: waiting for lo to become free. Usage count = 1
+
+These messages continue until the socket finally times out and closes.
+Since the net namespace cleanup holds the net_mutex while calling its
+registered pernet callbacks, any new net namespace initialization is
+blocked until the current net namespace finishes exiting.
+
+After this change, the tcp socket notices the exiting net namespace, and
+closes immediately, releasing its dst(s) and their reference to the
+loopback interface, which lets the net namespace continue exiting.
+
+Link: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1711407
+Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=97811
+Signed-off-by: Dan Streetman <ddstreet@canonical.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+---
+ include/net/net_namespace.h | 10 ++++++++++
+ net/ipv4/tcp.c              |  3 +++
+ net/ipv4/tcp_timer.c        | 15 +++++++++++++++
+ 3 files changed, 28 insertions(+)
+
+diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
+index 1c401bd4c2e0..a5d023fa78db 100644
+--- a/include/net/net_namespace.h
++++ b/include/net/net_namespace.h
+@@ -221,6 +221,11 @@ int net_eq(const struct net *net1, const struct net *net2)
+       return net1 == net2;
+ }
++static inline int check_net(const struct net *net)
++{
++      return atomic_read(&net->count) != 0;
++}
++
+ void net_drop_ns(void *);
+ #else
+@@ -245,6 +250,11 @@ int net_eq(const struct net *net1, const struct net *net2)
+       return 1;
+ }
++static inline int check_net(const struct net *net)
++{
++      return 1;
++}
++
+ #define net_drop_ns NULL
+ #endif
+diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
+index a3e91b552edc..fd2a086da910 100644
+--- a/net/ipv4/tcp.c
++++ b/net/ipv4/tcp.c
+@@ -2258,6 +2258,9 @@ void tcp_close(struct sock *sk, long timeout)
+                       tcp_send_active_reset(sk, GFP_ATOMIC);
+                       __NET_INC_STATS(sock_net(sk),
+                                       LINUX_MIB_TCPABORTONMEMORY);
++              } else if (!check_net(sock_net(sk))) {
++                      /* Not possible to send reset; just close */
++                      tcp_set_state(sk, TCP_CLOSE);
+               }
+       }
+diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
+index e906014890b6..ec1e5de41653 100644
+--- a/net/ipv4/tcp_timer.c
++++ b/net/ipv4/tcp_timer.c
+@@ -50,11 +50,19 @@ static void tcp_write_err(struct sock *sk)
+  *  to prevent DoS attacks. It is called when a retransmission timeout
+  *  or zero probe timeout occurs on orphaned socket.
+  *
++ *  Also close if our net namespace is exiting; in that case there is no
++ *  hope of ever communicating again since all netns interfaces are already
++ *  down (or about to be down), and we need to release our dst references,
++ *  which have been moved to the netns loopback interface, so the namespace
++ *  can finish exiting.  This condition is only possible if we are a kernel
++ *  socket, as those do not hold references to the namespace.
++ *
+  *  Criteria is still not confirmed experimentally and may change.
+  *  We kill the socket, if:
+  *  1. If number of orphaned sockets exceeds an administratively configured
+  *     limit.
+  *  2. If we have strong memory pressure.
++ *  3. If our net namespace is exiting.
+  */
+ static int tcp_out_of_resources(struct sock *sk, bool do_reset)
+ {
+@@ -83,6 +91,13 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset)
+               __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY);
+               return 1;
+       }
++
++      if (!check_net(sock_net(sk))) {
++              /* Not possible to send reset; just close */
++              tcp_done(sk);
++              return 1;
++      }
++
+       return 0;
+ }
+-- 
+2.14.2
+
diff --git a/patches/kernel/0017-i40e-Fix-memory-leak-related-filter-programming-stat.patch b/patches/kernel/0017-i40e-Fix-memory-leak-related-filter-programming-stat.patch
deleted file mode 100644 (file)
index e318a18..0000000
+++ /dev/null
@@ -1,127 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Alexander Duyck <alexander.h.duyck@intel.com>
-Date: Wed, 4 Oct 2017 08:44:43 -0700
-Subject: [PATCH] i40e: Fix memory leak related filter programming status
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-It looks like we weren't correctly placing the pages from buffers that had
-been used to return a filter programming status back on the ring. As a
-result they were being overwritten and tracking of the pages was lost.
-
-This change works to correct that by incorporating part of
-i40e_put_rx_buffer into the programming status handler code. As a result we
-should now be correctly placing the pages for those buffers on the
-re-allocation list instead of letting them stay in place.
-
-Fixes: 0e626ff7ccbf ("i40e: Fix support for flow director programming status")
-Reported-by: Anders K. Pedersen <akp@cohaesio.com>
-Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
-Tested-by: Anders K Pedersen <akp@cohaesio.com>
-Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
-(cherry picked from commit 2b9478ffc550f17c6cd8c69057234e91150f5972)
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 63 ++++++++++++++++-------------
- 1 file changed, 36 insertions(+), 27 deletions(-)
-
-diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
-index 2194960d5855..391b1878c24b 100644
---- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
-+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
-@@ -1042,6 +1042,32 @@ static bool i40e_set_new_dynamic_itr(struct i40e_ring_container *rc)
-       return false;
- }
-+/**
-+ * i40e_reuse_rx_page - page flip buffer and store it back on the ring
-+ * @rx_ring: rx descriptor ring to store buffers on
-+ * @old_buff: donor buffer to have page reused
-+ *
-+ * Synchronizes page for reuse by the adapter
-+ **/
-+static void i40e_reuse_rx_page(struct i40e_ring *rx_ring,
-+                             struct i40e_rx_buffer *old_buff)
-+{
-+      struct i40e_rx_buffer *new_buff;
-+      u16 nta = rx_ring->next_to_alloc;
-+
-+      new_buff = &rx_ring->rx_bi[nta];
-+
-+      /* update, and store next to alloc */
-+      nta++;
-+      rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
-+
-+      /* transfer page from old buffer to new buffer */
-+      new_buff->dma           = old_buff->dma;
-+      new_buff->page          = old_buff->page;
-+      new_buff->page_offset   = old_buff->page_offset;
-+      new_buff->pagecnt_bias  = old_buff->pagecnt_bias;
-+}
-+
- /**
-  * i40e_rx_is_programming_status - check for programming status descriptor
-  * @qw: qword representing status_error_len in CPU ordering
-@@ -1076,15 +1102,24 @@ static void i40e_clean_programming_status(struct i40e_ring *rx_ring,
-                                         union i40e_rx_desc *rx_desc,
-                                         u64 qw)
- {
--      u32 ntc = rx_ring->next_to_clean + 1;
-+      struct i40e_rx_buffer *rx_buffer;
-+      u32 ntc = rx_ring->next_to_clean;
-       u8 id;
-       /* fetch, update, and store next to clean */
-+      rx_buffer = &rx_ring->rx_bi[ntc++];
-       ntc = (ntc < rx_ring->count) ? ntc : 0;
-       rx_ring->next_to_clean = ntc;
-       prefetch(I40E_RX_DESC(rx_ring, ntc));
-+      /* place unused page back on the ring */
-+      i40e_reuse_rx_page(rx_ring, rx_buffer);
-+      rx_ring->rx_stats.page_reuse_count++;
-+
-+      /* clear contents of buffer_info */
-+      rx_buffer->page = NULL;
-+
-       id = (qw & I40E_RX_PROG_STATUS_DESC_QW1_PROGID_MASK) >>
-                 I40E_RX_PROG_STATUS_DESC_QW1_PROGID_SHIFT;
-@@ -1643,32 +1678,6 @@ static bool i40e_cleanup_headers(struct i40e_ring *rx_ring, struct sk_buff *skb,
-       return false;
- }
--/**
-- * i40e_reuse_rx_page - page flip buffer and store it back on the ring
-- * @rx_ring: rx descriptor ring to store buffers on
-- * @old_buff: donor buffer to have page reused
-- *
-- * Synchronizes page for reuse by the adapter
-- **/
--static void i40e_reuse_rx_page(struct i40e_ring *rx_ring,
--                             struct i40e_rx_buffer *old_buff)
--{
--      struct i40e_rx_buffer *new_buff;
--      u16 nta = rx_ring->next_to_alloc;
--
--      new_buff = &rx_ring->rx_bi[nta];
--
--      /* update, and store next to alloc */
--      nta++;
--      rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
--
--      /* transfer page from old buffer to new buffer */
--      new_buff->dma           = old_buff->dma;
--      new_buff->page          = old_buff->page;
--      new_buff->page_offset   = old_buff->page_offset;
--      new_buff->pagecnt_bias  = old_buff->pagecnt_bias;
--}
--
- /**
-  * i40e_page_is_reusable - check if any reuse is possible
-  * @page: page struct to check
--- 
-2.14.2
-
diff --git a/patches/kernel/0017-sctp-fix-dst-refcnt-leak-in-sctp_v4_get_dst.patch b/patches/kernel/0017-sctp-fix-dst-refcnt-leak-in-sctp_v4_get_dst.patch
new file mode 100644 (file)
index 0000000..1bed6b0
--- /dev/null
@@ -0,0 +1,89 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Tommi Rantala <tommi.t.rantala@nokia.com>
+Date: Mon, 5 Feb 2018 21:48:14 +0200
+Subject: [PATCH] sctp: fix dst refcnt leak in sctp_v4_get_dst
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Fix dst reference count leak in sctp_v4_get_dst() introduced in commit
+410f03831 ("sctp: add routing output fallback"):
+
+When walking the address_list, successive ip_route_output_key() calls
+may return the same rt->dst with the reference incremented on each call.
+
+The code would not decrement the dst refcount when the dst pointer was
+identical from the previous iteration, causing the dst refcnt leak.
+
+Testcase:
+  ip netns add TEST
+  ip netns exec TEST ip link set lo up
+  ip link add dummy0 type dummy
+  ip link add dummy1 type dummy
+  ip link add dummy2 type dummy
+  ip link set dev dummy0 netns TEST
+  ip link set dev dummy1 netns TEST
+  ip link set dev dummy2 netns TEST
+  ip netns exec TEST ip addr add 192.168.1.1/24 dev dummy0
+  ip netns exec TEST ip link set dummy0 up
+  ip netns exec TEST ip addr add 192.168.1.2/24 dev dummy1
+  ip netns exec TEST ip link set dummy1 up
+  ip netns exec TEST ip addr add 192.168.1.3/24 dev dummy2
+  ip netns exec TEST ip link set dummy2 up
+  ip netns exec TEST sctp_test -H 192.168.1.2 -P 20002 -h 192.168.1.1 -p 20000 -s -B 192.168.1.3
+  ip netns del TEST
+
+In 4.4 and 4.9 kernels this results in:
+  [  354.179591] unregister_netdevice: waiting for lo to become free. Usage count = 1
+  [  364.419674] unregister_netdevice: waiting for lo to become free. Usage count = 1
+  [  374.663664] unregister_netdevice: waiting for lo to become free. Usage count = 1
+  [  384.903717] unregister_netdevice: waiting for lo to become free. Usage count = 1
+  [  395.143724] unregister_netdevice: waiting for lo to become free. Usage count = 1
+  [  405.383645] unregister_netdevice: waiting for lo to become free. Usage count = 1
+  ...
+
+Fixes: 410f03831 ("sctp: add routing output fallback")
+Fixes: 0ca50d12f ("sctp: fix src address selection if using secondary addresses")
+Signed-off-by: Tommi Rantala <tommi.t.rantala@nokia.com>
+Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+Acked-by: Neil Horman <nhorman@tuxdriver.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+---
+ net/sctp/protocol.c | 10 ++++------
+ 1 file changed, 4 insertions(+), 6 deletions(-)
+
+diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
+index 989a900383b5..e1a3ae4f3cab 100644
+--- a/net/sctp/protocol.c
++++ b/net/sctp/protocol.c
+@@ -514,22 +514,20 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
+               if (IS_ERR(rt))
+                       continue;
+-              if (!dst)
+-                      dst = &rt->dst;
+-
+               /* Ensure the src address belongs to the output
+                * interface.
+                */
+               odev = __ip_dev_find(sock_net(sk), laddr->a.v4.sin_addr.s_addr,
+                                    false);
+               if (!odev || odev->ifindex != fl4->flowi4_oif) {
+-                      if (&rt->dst != dst)
++                      if (!dst)
++                              dst = &rt->dst;
++                      else
+                               dst_release(&rt->dst);
+                       continue;
+               }
+-              if (dst != &rt->dst)
+-                      dst_release(dst);
++              dst_release(dst);
+               dst = &rt->dst;
+               break;
+       }
+-- 
+2.14.2
+
diff --git a/patches/kernel/0018-KVM-x86-Add-memory-barrier-on-vmcs-field-lookup.patch b/patches/kernel/0018-KVM-x86-Add-memory-barrier-on-vmcs-field-lookup.patch
deleted file mode 100644 (file)
index 8fc4603..0000000
+++ /dev/null
@@ -1,49 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Andrew Honig <ahonig@google.com>
-Date: Wed, 10 Jan 2018 10:12:03 -0800
-Subject: [PATCH] KVM: x86: Add memory barrier on vmcs field lookup
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-commit 75f139aaf896d6fdeec2e468ddfa4b2fe469bf40 upstream.
-
-This adds a memory barrier when performing a lookup into
-the vmcs_field_to_offset_table.  This is related to
-CVE-2017-5753.
-
-Signed-off-by: Andrew Honig <ahonig@google.com>
-Reviewed-by: Jim Mattson <jmattson@google.com>
-Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- arch/x86/kvm/vmx.c | 12 ++++++++++--
- 1 file changed, 10 insertions(+), 2 deletions(-)
-
-diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
-index 146caacd8fdd..80732f87cac0 100644
---- a/arch/x86/kvm/vmx.c
-+++ b/arch/x86/kvm/vmx.c
-@@ -883,8 +883,16 @@ static inline short vmcs_field_to_offset(unsigned long field)
- {
-       BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX);
--      if (field >= ARRAY_SIZE(vmcs_field_to_offset_table) ||
--          vmcs_field_to_offset_table[field] == 0)
-+      if (field >= ARRAY_SIZE(vmcs_field_to_offset_table))
-+              return -ENOENT;
-+
-+      /*
-+       * FIXME: Mitigation for CVE-2017-5753.  To be replaced with a
-+       * generic mechanism.
-+       */
-+      asm("lfence");
-+
-+      if (vmcs_field_to_offset_table[field] == 0)
-               return -ENOENT;
-       return vmcs_field_to_offset_table[field];
--- 
-2.14.2
-
diff --git a/patches/kernel/0018-sctp-fix-dst-refcnt-leak-in-sctp_v6_get_dst.patch b/patches/kernel/0018-sctp-fix-dst-refcnt-leak-in-sctp_v6_get_dst.patch
new file mode 100644 (file)
index 0000000..e6de54a
--- /dev/null
@@ -0,0 +1,60 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Alexey Kodanev <alexey.kodanev@oracle.com>
+Date: Mon, 5 Feb 2018 15:10:35 +0300
+Subject: [PATCH] sctp: fix dst refcnt leak in sctp_v6_get_dst()
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+When going through the bind address list in sctp_v6_get_dst() and
+the previously found address is better ('matchlen > bmatchlen'),
+the code continues to the next iteration without releasing the currently
+held destination.
+
+Fix it by releasing 'bdst' before continuing to the next iteration, and
+instead of introducing one more '!IS_ERR(bdst)' check for dst_release(),
+move the already existing one right after ip6_dst_lookup_flow(), i.e. we
+shouldn't proceed further if we get an error for the route lookup.
+
+Fixes: dbc2b5e9a09e ("sctp: fix src address selection if using secondary addresses for ipv6")
+Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
+Acked-by: Neil Horman <nhorman@tuxdriver.com>
+Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+---
+ net/sctp/ipv6.c | 10 +++++++---
+ 1 file changed, 7 insertions(+), 3 deletions(-)
+
+diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
+index edb462b0b73b..e626d72868fe 100644
+--- a/net/sctp/ipv6.c
++++ b/net/sctp/ipv6.c
+@@ -326,8 +326,10 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
+               final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final);
+               bdst = ip6_dst_lookup_flow(sk, fl6, final_p);
+-              if (!IS_ERR(bdst) &&
+-                  ipv6_chk_addr(dev_net(bdst->dev),
++              if (IS_ERR(bdst))
++                      continue;
++
++              if (ipv6_chk_addr(dev_net(bdst->dev),
+                                 &laddr->a.v6.sin6_addr, bdst->dev, 1)) {
+                       if (!IS_ERR_OR_NULL(dst))
+                               dst_release(dst);
+@@ -336,8 +338,10 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
+               }
+               bmatchlen = sctp_v6_addr_match_len(daddr, &laddr->a);
+-              if (matchlen > bmatchlen)
++              if (matchlen > bmatchlen) {
++                      dst_release(bdst);
+                       continue;
++              }
+               if (!IS_ERR_OR_NULL(dst))
+                       dst_release(dst);
+-- 
+2.14.2
+
diff --git a/patches/kernel/0019-EDAC-sb_edac-Don-t-create-a-second-memory-controller.patch b/patches/kernel/0019-EDAC-sb_edac-Don-t-create-a-second-memory-controller.patch
deleted file mode 100644 (file)
index 4272010..0000000
+++ /dev/null
@@ -1,102 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
-Date: Wed, 13 Sep 2017 18:42:14 +0800
-Subject: [PATCH] EDAC, sb_edac: Don't create a second memory controller if HA1
- is not present
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-Yi Zhang reported the following failure on a 2-socket Haswell (E5-2603v3)
-server (DELL PowerEdge 730xd):
-
-  EDAC sbridge: Some needed devices are missing
-  EDAC MC: Removed device 0 for sb_edac.c Haswell SrcID#0_Ha#0: DEV 0000:7f:12.0
-  EDAC MC: Removed device 1 for sb_edac.c Haswell SrcID#1_Ha#0: DEV 0000:ff:12.0
-  EDAC sbridge: Couldn't find mci handler
-  EDAC sbridge: Couldn't find mci handler
-  EDAC sbridge: Failed to register device with error -19.
-
-The refactored sb_edac driver creates the IMC1 (the 2nd memory
-controller) if any IMC1 device is present. In this case only
-HA1_TA of IMC1 was present, but the driver expected to find
-HA1/HA1_TM/HA1_TAD[0-3] devices too, leading to the above failure.
-
-The document [1] says the 'E5-2603 v3' CPU has 4 memory channels max. Yi
-Zhang inserted one DIMM per channel for each CPU, and did random error
-address injection test with this patch:
-
-      4024  addresses fell in TOLM hole area
-     12715  addresses fell in CPU_SrcID#0_Ha#0_Chan#0_DIMM#0
-     12774  addresses fell in CPU_SrcID#0_Ha#0_Chan#1_DIMM#0
-     12798  addresses fell in CPU_SrcID#0_Ha#0_Chan#2_DIMM#0
-     12913  addresses fell in CPU_SrcID#0_Ha#0_Chan#3_DIMM#0
-     12674  addresses fell in CPU_SrcID#1_Ha#0_Chan#0_DIMM#0
-     12686  addresses fell in CPU_SrcID#1_Ha#0_Chan#1_DIMM#0
-     12882  addresses fell in CPU_SrcID#1_Ha#0_Chan#2_DIMM#0
-     12934  addresses fell in CPU_SrcID#1_Ha#0_Chan#3_DIMM#0
-    106400  addresses were injected totally.
-
-The test result shows that all the 4 channels belong to IMC0 per CPU, so
-the server really only has one IMC per CPU.
-
-In the 1st page of chapter 2 in datasheet [2], it also says 'E5-2600 v3'
-implements either one or two IMCs. For CPUs with one IMC, IMC1 is not
-used and should be ignored.
-
-Thus, do not create a second memory controller if the key HA1 is absent.
-
-[1] http://ark.intel.com/products/83349/Intel-Xeon-Processor-E5-2603-v3-15M-Cache-1_60-GHz
-[2] https://www.intel.com/content/dam/www/public/us/en/documents/datasheets/xeon-e5-v3-datasheet-vol-2.pdf
-
-Reported-and-tested-by: Yi Zhang <yizhan@redhat.com>
-Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
-Cc: Tony Luck <tony.luck@intel.com>
-Cc: linux-edac <linux-edac@vger.kernel.org>
-Fixes: e2f747b1f42a ("EDAC, sb_edac: Assign EDAC memory controller per h/w controller")
-Link: http://lkml.kernel.org/r/20170913104214.7325-1-qiuxu.zhuo@intel.com
-[ Massage commit message. ]
-Signed-off-by: Borislav Petkov <bp@suse.de>
-(cherry picked from commit 15cc3ae001873845b5d842e212478a6570c7d938)
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- drivers/edac/sb_edac.c | 9 ++++++++-
- 1 file changed, 8 insertions(+), 1 deletion(-)
-
-diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c
-index 80d860cb0746..7a3b201d51df 100644
---- a/drivers/edac/sb_edac.c
-+++ b/drivers/edac/sb_edac.c
-@@ -455,6 +455,7 @@ static const struct pci_id_table pci_dev_descr_sbridge_table[] = {
- static const struct pci_id_descr pci_dev_descr_ibridge[] = {
-               /* Processor Home Agent */
-       { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0,        0, IMC0) },
-+      { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1,        1, IMC1) },
-               /* Memory controller */
-       { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TA,     0, IMC0) },
-@@ -465,7 +466,6 @@ static const struct pci_id_descr pci_dev_descr_ibridge[] = {
-       { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD3,   0, IMC0) },
-               /* Optional, mode 2HA */
--      { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1,        1, IMC1) },
-       { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TA,     1, IMC1) },
-       { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_RAS,    1, IMC1) },
-       { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD0,   1, IMC1) },
-@@ -2260,6 +2260,13 @@ static int sbridge_get_onedevice(struct pci_dev **prev,
- next_imc:
-       sbridge_dev = get_sbridge_dev(bus, dev_descr->dom, multi_bus, sbridge_dev);
-       if (!sbridge_dev) {
-+              /* If the HA1 wasn't found, don't create EDAC second memory controller */
-+              if (dev_descr->dom == IMC1 && devno != 1) {
-+                      edac_dbg(0, "Skip IMC1: %04x:%04x (since HA1 was absent)\n",
-+                               PCI_VENDOR_ID_INTEL, dev_descr->dev_id);
-+                      pci_dev_put(pdev);
-+                      return 0;
-+              }
-               if (dev_descr->dom == SOCK)
-                       goto out_imc;
--- 
-2.14.2
-
diff --git a/patches/kernel/0019-lockd-lost-rollback-of-set_grace_period-in-lockd_dow.patch b/patches/kernel/0019-lockd-lost-rollback-of-set_grace_period-in-lockd_dow.patch
new file mode 100644 (file)
index 0000000..d7ba32d
--- /dev/null
@@ -0,0 +1,46 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Vasily Averin <vvs@virtuozzo.com>
+Date: Thu, 2 Nov 2017 13:03:42 +0300
+Subject: [PATCH] lockd: lost rollback of set_grace_period() in
+ lockd_down_net()
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Commit efda760fe95ea ("lockd: fix lockd shutdown race") is incorrect:
+it removes lockd_manager and disarms grace_period_end for init_net only.
+
+If nfsd was started from another net namespace, lockd_up_net() calls
+set_grace_period(), which adds lockd_manager to the per-netns list
+and queues the grace_period_end delayed work.
+
+These actions should be reverted in lockd_down_net().
+Otherwise it can lead to a double list_add after nfsd is restarted in the netns,
+and to a use-after-free if the non-disarmed delayed work is executed after netns destroy.
+
+Fixes: efda760fe95e ("lockd: fix lockd shutdown race")
+Cc: stable@vger.kernel.org
+Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
+Signed-off-by: J. Bruce Fields <bfields@redhat.com>
+(cherry picked from commit 3a2b19d1ee5633f76ae8a88da7bc039a5d1732aa)
+Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+---
+ fs/lockd/svc.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
+index 726b6cecf430..fa8f6effcf00 100644
+--- a/fs/lockd/svc.c
++++ b/fs/lockd/svc.c
+@@ -274,6 +274,8 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net)
+       if (ln->nlmsvc_users) {
+               if (--ln->nlmsvc_users == 0) {
+                       nlm_shutdown_hosts_net(net);
++                      cancel_delayed_work_sync(&ln->grace_period_end);
++                      locks_end_grace(&ln->lockd_manager);
+                       svc_shutdown_net(serv, net);
+                       dprintk("lockd_down_net: per-net data destroyed; net=%p\n", net);
+               }
+-- 
+2.14.2
+
diff --git a/patches/kernel/0020-EDAC-sb_edac-Fix-missing-break-in-switch.patch b/patches/kernel/0020-EDAC-sb_edac-Fix-missing-break-in-switch.patch
deleted file mode 100644 (file)
index b7ca81a..0000000
+++ /dev/null
@@ -1,37 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: "Gustavo A. R. Silva" <garsilva@embeddedor.com>
-Date: Mon, 16 Oct 2017 12:40:29 -0500
-Subject: [PATCH] EDAC, sb_edac: Fix missing break in switch
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-Add missing break statement in order to prevent the code from falling
-through.
-
-Signed-off-by: Gustavo A. R. Silva <garsilva@embeddedor.com>
-Cc: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
-Cc: linux-edac <linux-edac@vger.kernel.org>
-Link: http://lkml.kernel.org/r/20171016174029.GA19757@embeddedor.com
-Signed-off-by: Borislav Petkov <bp@suse.de>
-(cherry picked from commit a8e9b186f153a44690ad0363a56716e7077ad28c)
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- drivers/edac/sb_edac.c | 1 +
- 1 file changed, 1 insertion(+)
-
-diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c
-index 7a3b201d51df..fb0264ef83a3 100644
---- a/drivers/edac/sb_edac.c
-+++ b/drivers/edac/sb_edac.c
-@@ -2467,6 +2467,7 @@ static int ibridge_mci_bind_devs(struct mem_ctl_info *mci,
-               case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TA:
-               case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TA:
-                       pvt->pci_ta = pdev;
-+                      break;
-               case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_RAS:
-               case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_RAS:
-                       pvt->pci_ras = pdev;
--- 
-2.14.2
-
diff --git a/patches/kernel/0020-ocfs2-make-metadata-estimation-accurate-and-clear.patch b/patches/kernel/0020-ocfs2-make-metadata-estimation-accurate-and-clear.patch
new file mode 100644 (file)
index 0000000..b3cabcf
--- /dev/null
@@ -0,0 +1,61 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Changwei Ge <ge.changwei@h3c.com>
+Date: Wed, 31 Jan 2018 16:15:02 -0800
+Subject: [PATCH] ocfs2: make metadata estimation accurate and clear
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+The current code assumes that ::w_unwritten_list always has exactly one item
+on it.  This is not right and is hard to understand.  So improve how
+unwritten items are counted.
+
+Link: http://lkml.kernel.org/r/1515479070-32653-1-git-send-email-ge.changwei@h3c.com
+Signed-off-by: Changwei Ge <ge.changwei@h3c.com>
+Reported-by: John Lightsey <john@nixnuts.net>
+Tested-by: John Lightsey <john@nixnuts.net>
+Cc: Mark Fasheh <mfasheh@versity.com>
+Cc: Joseph Qi <jiangqi903@gmail.com>
+Cc: Junxiao Bi <junxiao.bi@oracle.com>
+Cc: Joel Becker <jlbec@evilplan.org>
+Cc: Changwei Ge <ge.changwei@h3c.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+(cherry picked from commit 63de8bd9328bf2a778fc277503da163ae3defa3c)
+Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+---
+ fs/ocfs2/aops.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
+index 88a31e9340a0..77ec9b495027 100644
+--- a/fs/ocfs2/aops.c
++++ b/fs/ocfs2/aops.c
+@@ -784,6 +784,7 @@ struct ocfs2_write_ctxt {
+       struct ocfs2_cached_dealloc_ctxt w_dealloc;
+       struct list_head                w_unwritten_list;
++      unsigned int                    w_unwritten_count;
+ };
+ void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
+@@ -1373,6 +1374,7 @@ static int ocfs2_unwritten_check(struct inode *inode,
+       desc->c_clear_unwritten = 0;
+       list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list);
+       list_add_tail(&new->ue_node, &wc->w_unwritten_list);
++      wc->w_unwritten_count++;
+       new = NULL;
+ unlock:
+       spin_unlock(&oi->ip_lock);
+@@ -2246,7 +2248,7 @@ static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock,
+               ue->ue_phys = desc->c_phys;
+               list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list);
+-              dwc->dw_zero_count++;
++              dwc->dw_zero_count += wc->w_unwritten_count;
+       }
+       ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, wc);
+-- 
+2.14.2
+
diff --git a/patches/kernel/0021-ocfs2-try-to-reuse-extent-block-in-dealloc-without-m.patch b/patches/kernel/0021-ocfs2-try-to-reuse-extent-block-in-dealloc-without-m.patch
new file mode 100644 (file)
index 0000000..15e3488
--- /dev/null
@@ -0,0 +1,370 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Changwei Ge <ge.changwei@h3c.com>
+Date: Wed, 31 Jan 2018 16:15:06 -0800
+Subject: [PATCH] ocfs2: try to reuse extent block in dealloc without
+ meta_alloc
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+A crash issue was reported by John Lightsey with a call trace as follows:
+
+  ocfs2_split_extent+0x1ad3/0x1b40 [ocfs2]
+  ocfs2_change_extent_flag+0x33a/0x470 [ocfs2]
+  ocfs2_mark_extent_written+0x172/0x220 [ocfs2]
+  ocfs2_dio_end_io+0x62d/0x910 [ocfs2]
+  dio_complete+0x19a/0x1a0
+  do_blockdev_direct_IO+0x19dd/0x1eb0
+  __blockdev_direct_IO+0x43/0x50
+  ocfs2_direct_IO+0x8f/0xa0 [ocfs2]
+  generic_file_direct_write+0xb2/0x170
+  __generic_file_write_iter+0xc3/0x1b0
+  ocfs2_file_write_iter+0x4bb/0xca0 [ocfs2]
+  __vfs_write+0xae/0xf0
+  vfs_write+0xb8/0x1b0
+  SyS_write+0x4f/0xb0
+  system_call_fastpath+0x16/0x75
+
+The BUG code told that extent tree wants to grow but no metadata was
+reserved ahead of time.  From my investigation into this issue, the root
+cause is that although enough metadata is not reserved, there should be
+enough for following use.  Rightmost extent is merged into its left one
+due to a certain times of marking extent written.  Because during
+marking extent written, we got many physically continuous extents.  At
+last, an empty extent showed up and the rightmost path is removed from
+extent tree.
+
+Add a new mechanism to reuse extent block cached in dealloc which were
+just unlinked from extent tree to solve this crash issue.
+
+Criteria is that during marking extents *written*, if extent rotation
+and merging results in unlinking extent with growing extent tree later
+without any metadata reserved ahead of time, try to reuse those extents
+in dealloc in which deleted extents are cached.
+
+Also, this patch addresses the issue John reported that ::dw_zero_count
+is not calculated properly.
+
+After applying this patch, the issue John reported was gone.  Thanks for
+the reproducer provided by John.  And this patch has passed
+ocfs2-test(29 cases) suite running by New H3C Group.
+
+[ge.changwei@h3c.com: fix static checker warning]
+  Link: http://lkml.kernel.org/r/63ADC13FD55D6546B7DECE290D39E373F29196AE@H3CMLB12-EX.srv.huawei-3com.com
+[akpm@linux-foundation.org: brelse(NULL) is legal]
+Link: http://lkml.kernel.org/r/1515479070-32653-2-git-send-email-ge.changwei@h3c.com
+Signed-off-by: Changwei Ge <ge.changwei@h3c.com>
+Reported-by: John Lightsey <john@nixnuts.net>
+Tested-by: John Lightsey <john@nixnuts.net>
+Cc: Joel Becker <jlbec@evilplan.org>
+Cc: Joseph Qi <jiangqi903@gmail.com>
+Cc: Junxiao Bi <junxiao.bi@oracle.com>
+Cc: Dan Carpenter <dan.carpenter@oracle.com>
+Cc: Mark Fasheh <mfasheh@versity.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+(cherry picked from commit 71a36944042b7d9dd71f6a5d1c5ea1c2353b5d42)
+Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+---
+ fs/ocfs2/alloc.h |   1 +
+ fs/ocfs2/alloc.c | 206 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
+ fs/ocfs2/aops.c  |   6 ++
+ 3 files changed, 203 insertions(+), 10 deletions(-)
+
+diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
+index 4a5152ec88a3..571692171dd1 100644
+--- a/fs/ocfs2/alloc.h
++++ b/fs/ocfs2/alloc.h
+@@ -61,6 +61,7 @@ struct ocfs2_extent_tree {
+       ocfs2_journal_access_func               et_root_journal_access;
+       void                                    *et_object;
+       unsigned int                            et_max_leaf_clusters;
++      struct ocfs2_cached_dealloc_ctxt        *et_dealloc;
+ };
+ /*
+diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
+index 386aecce881d..9b5e7d8ba710 100644
+--- a/fs/ocfs2/alloc.c
++++ b/fs/ocfs2/alloc.c
+@@ -165,6 +165,13 @@ static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
+                                    struct ocfs2_extent_rec *rec);
+ static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
+ static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
++
++static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
++                                      struct ocfs2_extent_tree *et,
++                                      struct buffer_head **new_eb_bh,
++                                      int blk_wanted, int *blk_given);
++static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et);
++
+ static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
+       .eo_set_last_eb_blk     = ocfs2_dinode_set_last_eb_blk,
+       .eo_get_last_eb_blk     = ocfs2_dinode_get_last_eb_blk,
+@@ -448,6 +455,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
+       if (!obj)
+               obj = (void *)bh->b_data;
+       et->et_object = obj;
++      et->et_dealloc = NULL;
+       et->et_ops->eo_fill_root_el(et);
+       if (!et->et_ops->eo_fill_max_leaf_clusters)
+@@ -1159,7 +1167,7 @@ static int ocfs2_add_branch(handle_t *handle,
+                           struct buffer_head **last_eb_bh,
+                           struct ocfs2_alloc_context *meta_ac)
+ {
+-      int status, new_blocks, i;
++      int status, new_blocks, i, block_given = 0;
+       u64 next_blkno, new_last_eb_blk;
+       struct buffer_head *bh;
+       struct buffer_head **new_eb_bhs = NULL;
+@@ -1214,11 +1222,31 @@ static int ocfs2_add_branch(handle_t *handle,
+               goto bail;
+       }
+-      status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
+-                                         meta_ac, new_eb_bhs);
+-      if (status < 0) {
+-              mlog_errno(status);
+-              goto bail;
++      /* Firstly, try to reuse dealloc since we have already estimated how
++       * many extent blocks we may use.
++       */
++      if (!ocfs2_is_dealloc_empty(et)) {
++              status = ocfs2_reuse_blk_from_dealloc(handle, et,
++                                                    new_eb_bhs, new_blocks,
++                                                    &block_given);
++              if (status < 0) {
++                      mlog_errno(status);
++                      goto bail;
++              }
++      }
++
++      BUG_ON(block_given > new_blocks);
++
++      if (block_given < new_blocks) {
++              BUG_ON(!meta_ac);
++              status = ocfs2_create_new_meta_bhs(handle, et,
++                                                 new_blocks - block_given,
++                                                 meta_ac,
++                                                 &new_eb_bhs[block_given]);
++              if (status < 0) {
++                      mlog_errno(status);
++                      goto bail;
++              }
+       }
+       /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
+@@ -1341,15 +1369,25 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
+                                 struct ocfs2_alloc_context *meta_ac,
+                                 struct buffer_head **ret_new_eb_bh)
+ {
+-      int status, i;
++      int status, i, block_given = 0;
+       u32 new_clusters;
+       struct buffer_head *new_eb_bh = NULL;
+       struct ocfs2_extent_block *eb;
+       struct ocfs2_extent_list  *root_el;
+       struct ocfs2_extent_list  *eb_el;
+-      status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
+-                                         &new_eb_bh);
++      if (!ocfs2_is_dealloc_empty(et)) {
++              status = ocfs2_reuse_blk_from_dealloc(handle, et,
++                                                    &new_eb_bh, 1,
++                                                    &block_given);
++      } else if (meta_ac) {
++              status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
++                                                 &new_eb_bh);
++
++      } else {
++              BUG();
++      }
++
+       if (status < 0) {
+               mlog_errno(status);
+               goto bail;
+@@ -1512,7 +1550,7 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
+       int depth = le16_to_cpu(el->l_tree_depth);
+       struct buffer_head *bh = NULL;
+-      BUG_ON(meta_ac == NULL);
++      BUG_ON(meta_ac == NULL && ocfs2_is_dealloc_empty(et));
+       shift = ocfs2_find_branch_target(et, &bh);
+       if (shift < 0) {
+@@ -6593,6 +6631,154 @@ ocfs2_find_per_slot_free_list(int type,
+       return fl;
+ }
++static struct ocfs2_per_slot_free_list *
++ocfs2_find_preferred_free_list(int type,
++                             int preferred_slot,
++                             int *real_slot,
++                             struct ocfs2_cached_dealloc_ctxt *ctxt)
++{
++      struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
++
++      while (fl) {
++              if (fl->f_inode_type == type && fl->f_slot == preferred_slot) {
++                      *real_slot = fl->f_slot;
++                      return fl;
++              }
++
++              fl = fl->f_next_suballocator;
++      }
++
++      /* If we can't find any free list matching preferred slot, just use
++       * the first one.
++       */
++      fl = ctxt->c_first_suballocator;
++      *real_slot = fl->f_slot;
++
++      return fl;
++}
++
++/* Return Value 1 indicates empty */
++static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et)
++{
++      struct ocfs2_per_slot_free_list *fl = NULL;
++
++      if (!et->et_dealloc)
++              return 1;
++
++      fl = et->et_dealloc->c_first_suballocator;
++      if (!fl)
++              return 1;
++
++      if (!fl->f_first)
++              return 1;
++
++      return 0;
++}
++
++/* If extent was deleted from tree due to extent rotation and merging, and
++ * no metadata is reserved ahead of time. Try to reuse some extents
++ * just deleted. This is only used to reuse extent blocks.
++ * It is supposed to find enough extent blocks in dealloc if our estimation
++ * on metadata is accurate.
++ */
++static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
++                                      struct ocfs2_extent_tree *et,
++                                      struct buffer_head **new_eb_bh,
++                                      int blk_wanted, int *blk_given)
++{
++      int i, status = 0, real_slot;
++      struct ocfs2_cached_dealloc_ctxt *dealloc;
++      struct ocfs2_per_slot_free_list *fl;
++      struct ocfs2_cached_block_free *bf;
++      struct ocfs2_extent_block *eb;
++      struct ocfs2_super *osb =
++              OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
++
++      *blk_given = 0;
++
++      /* If extent tree doesn't have a dealloc, this is not faulty. Just
++       * tell upper caller dealloc can't provide any block and it should
++       * ask for alloc to claim more space.
++       */
++      dealloc = et->et_dealloc;
++      if (!dealloc)
++              goto bail;
++
++      for (i = 0; i < blk_wanted; i++) {
++              /* Prefer to use local slot */
++              fl = ocfs2_find_preferred_free_list(EXTENT_ALLOC_SYSTEM_INODE,
++                                                  osb->slot_num, &real_slot,
++                                                  dealloc);
++              /* If no more block can be reused, we should claim more
++               * from alloc. Just return here normally.
++               */
++              if (!fl) {
++                      status = 0;
++                      break;
++              }
++
++              bf = fl->f_first;
++              fl->f_first = bf->free_next;
++
++              new_eb_bh[i] = sb_getblk(osb->sb, bf->free_blk);
++              if (new_eb_bh[i] == NULL) {
++                      status = -ENOMEM;
++                      mlog_errno(status);
++                      goto bail;
++              }
++
++              mlog(0, "Reusing block(%llu) from "
++                   "dealloc(local slot:%d, real slot:%d)\n",
++                   bf->free_blk, osb->slot_num, real_slot);
++
++              ocfs2_set_new_buffer_uptodate(et->et_ci, new_eb_bh[i]);
++
++              status = ocfs2_journal_access_eb(handle, et->et_ci,
++                                               new_eb_bh[i],
++                                               OCFS2_JOURNAL_ACCESS_CREATE);
++              if (status < 0) {
++                      mlog_errno(status);
++                      goto bail;
++              }
++
++              memset(new_eb_bh[i]->b_data, 0, osb->sb->s_blocksize);
++              eb = (struct ocfs2_extent_block *) new_eb_bh[i]->b_data;
++
++              /* We can't guarantee that buffer head is still cached, so
++               * populate the extent block again.
++               */
++              strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
++              eb->h_blkno = cpu_to_le64(bf->free_blk);
++              eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
++              eb->h_suballoc_slot = cpu_to_le16(real_slot);
++              eb->h_suballoc_loc = cpu_to_le64(bf->free_bg);
++              eb->h_suballoc_bit = cpu_to_le16(bf->free_bit);
++              eb->h_list.l_count =
++                      cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
++
++              /* We'll also be dirtied by the caller, so
++               * this isn't absolutely necessary.
++               */
++              ocfs2_journal_dirty(handle, new_eb_bh[i]);
++
++              if (!fl->f_first) {
++                      dealloc->c_first_suballocator = fl->f_next_suballocator;
++                      kfree(fl);
++              }
++              kfree(bf);
++      }
++
++      *blk_given = i;
++
++bail:
++      if (unlikely(status < 0)) {
++              for (i = 0; i < blk_wanted; i++)
++                      brelse(new_eb_bh[i]);
++      }
++
++      return status;
++}
++
+ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
+                             int type, int slot, u64 suballoc,
+                             u64 blkno, unsigned int bit)
+diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
+index 77ec9b495027..2ff02dda97d8 100644
+--- a/fs/ocfs2/aops.c
++++ b/fs/ocfs2/aops.c
+@@ -2322,6 +2322,12 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
+       ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
++      /* Attach dealloc with extent tree in case that we may reuse extents
++       * which are already unlinked from current extent tree due to extent
++       * rotation and merging.
++       */
++      et.et_dealloc = &dealloc;
++
+       ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2,
+                                   &data_ac, &meta_ac);
+       if (ret) {
+-- 
+2.14.2
+
diff --git a/patches/kernel/0021-scsi-lpfc-Fix-loop-mode-target-discovery.patch b/patches/kernel/0021-scsi-lpfc-Fix-loop-mode-target-discovery.patch
deleted file mode 100644 (file)
index 4f7bf71..0000000
+++ /dev/null
@@ -1,45 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Dick Kennedy <dick.kennedy@broadcom.com>
-Date: Wed, 23 Aug 2017 16:55:31 -0700
-Subject: [PATCH] scsi: lpfc: Fix loop mode target discovery
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-The driver does not discover targets when in loop mode.
-
-The NLP type is correctly getting set when a fabric connection is
-detected but, not for loop. The unknown NLP type means that the driver
-does not issue a PRLI when in loop topology. Thus target discovery
-fails.
-
-Fix by checking the topology during discovery.  If it is loop, set the
-NLP FC4 type to FCP.
-
-Signed-off-by: Dick Kennedy <dick.kennedy@broadcom.com>
-Signed-off-by: James Smart <james.smart@broadcom.com>
-Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
-Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
-(cherry picked from commit 2877cbffb79ed121a6bcc5edbe629d3aba36cd29)
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- drivers/scsi/lpfc/lpfc_nportdisc.c | 3 +++
- 1 file changed, 3 insertions(+)
-
-diff --git a/drivers/scsi/lpfc/lpfc_nportdisc.c b/drivers/scsi/lpfc/lpfc_nportdisc.c
-index f74cb0142fd4..95b2b43ac37d 100644
---- a/drivers/scsi/lpfc/lpfc_nportdisc.c
-+++ b/drivers/scsi/lpfc/lpfc_nportdisc.c
-@@ -1724,6 +1724,9 @@ lpfc_cmpl_reglogin_reglogin_issue(struct lpfc_vport *vport,
-                               lpfc_nvme_update_localport(vport);
-                       }
-+              } else if (phba->fc_topology == LPFC_TOPOLOGY_LOOP) {
-+                      ndlp->nlp_fc4_type |= NLP_FC4_FCP;
-+
-               } else if (ndlp->nlp_fc4_type == 0) {
-                       rc = lpfc_ns_cmd(vport, SLI_CTNS_GFT_ID,
-                                        0, ndlp->nlp_DID);
--- 
-2.14.2
-
diff --git a/patches/kernel/0022-sched-wait-Fix-add_wait_queue-behavioral-change.patch b/patches/kernel/0022-sched-wait-Fix-add_wait_queue-behavioral-change.patch
deleted file mode 100644 (file)
index 852ce99..0000000
+++ /dev/null
@@ -1,52 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Omar Sandoval <osandov@fb.com>
-Date: Tue, 5 Dec 2017 23:15:31 -0800
-Subject: [PATCH] sched/wait: Fix add_wait_queue() behavioral change
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-The following cleanup commit:
-
-  50816c48997a ("sched/wait: Standardize internal naming of wait-queue entries")
-
-... unintentionally changed the behavior of add_wait_queue() from
-inserting the wait entry at the head of the wait queue to the tail
-of the wait queue.
-
-Beyond a negative performance impact this change in behavior
-theoretically also breaks wait queues which mix exclusive and
-non-exclusive waiters, as non-exclusive waiters will not be
-woken up if they are queued behind enough exclusive waiters.
-
-Signed-off-by: Omar Sandoval <osandov@fb.com>
-Reviewed-by: Jens Axboe <axboe@kernel.dk>
-Acked-by: Peter Zijlstra <peterz@infradead.org>
-Cc: Linus Torvalds <torvalds@linux-foundation.org>
-Cc: Thomas Gleixner <tglx@linutronix.de>
-Cc: kernel-team@fb.com
-Fixes: ("sched/wait: Standardize internal naming of wait-queue entries")
-Link: http://lkml.kernel.org/r/a16c8ccffd39bd08fdaa45a5192294c784b803a7.1512544324.git.osandov@fb.com
-Signed-off-by: Ingo Molnar <mingo@kernel.org>
-(cherry picked from commit c6b9d9a33029014446bd9ed84c1688f6d3d4eab9)
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- kernel/sched/wait.c | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
-index d6afed6d0752..c09ebe92a40a 100644
---- a/kernel/sched/wait.c
-+++ b/kernel/sched/wait.c
-@@ -27,7 +27,7 @@ void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq
-       wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE;
-       spin_lock_irqsave(&wq_head->lock, flags);
--      __add_wait_queue_entry_tail(wq_head, wq_entry);
-+      __add_wait_queue(wq_head, wq_entry);
-       spin_unlock_irqrestore(&wq_head->lock, flags);
- }
- EXPORT_SYMBOL(add_wait_queue);
--- 
-2.14.2
-
diff --git a/patches/kernel/0023-module-retpoline-Warn-about-missing-retpoline-in-mod.patch b/patches/kernel/0023-module-retpoline-Warn-about-missing-retpoline-in-mod.patch
deleted file mode 100644 (file)
index 27fffe4..0000000
+++ /dev/null
@@ -1,164 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Andi Kleen <ak@linux.intel.com>
-Date: Thu, 25 Jan 2018 15:50:28 -0800
-Subject: [PATCH] module/retpoline: Warn about missing retpoline in module
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-There's a risk that a kernel which has full retpoline mitigations becomes
-vulnerable when a module gets loaded that hasn't been compiled with the
-right compiler or the right option.
-
-To enable detection of that mismatch at module load time, add a module info
-string "retpoline" at build time when the module was compiled with
-retpoline support. This only covers compiled C source, but assembler source
-or prebuilt object files are not checked.
-
-If a retpoline enabled kernel detects a non retpoline protected module at
-load time, print a warning and report it in the sysfs vulnerability file.
-
-[ tglx: Massaged changelog ]
-
-Signed-off-by: Andi Kleen <ak@linux.intel.com>
-Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
-Cc: David Woodhouse <dwmw2@infradead.org>
-Cc: gregkh@linuxfoundation.org
-Cc: torvalds@linux-foundation.org
-Cc: jeyu@kernel.org
-Cc: arjan@linux.intel.com
-Link: https://lkml.kernel.org/r/20180125235028.31211-1-andi@firstfloor.org
-(backported from commit caf7501a1b4ec964190f31f9c3f163de252273b8)
-Conflicts:
-       arch/x86/kernel/cpu/bugs.c
-context changes
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- include/linux/module.h     |  9 +++++++++
- arch/x86/kernel/cpu/bugs.c | 19 +++++++++++++++++--
- kernel/module.c            | 11 +++++++++++
- scripts/mod/modpost.c      |  9 +++++++++
- 4 files changed, 46 insertions(+), 2 deletions(-)
-
-diff --git a/include/linux/module.h b/include/linux/module.h
-index e7bdd549e527..c4fdf7661f82 100644
---- a/include/linux/module.h
-+++ b/include/linux/module.h
-@@ -794,6 +794,15 @@ static inline void module_bug_finalize(const Elf_Ehdr *hdr,
- static inline void module_bug_cleanup(struct module *mod) {}
- #endif        /* CONFIG_GENERIC_BUG */
-+#ifdef RETPOLINE
-+extern bool retpoline_module_ok(bool has_retpoline);
-+#else
-+static inline bool retpoline_module_ok(bool has_retpoline)
-+{
-+      return true;
-+}
-+#endif
-+
- #ifdef CONFIG_MODULE_SIG
- static inline bool module_sig_ok(struct module *module)
- {
-diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
-index d5bafcdb4891..e623bd731a74 100644
---- a/arch/x86/kernel/cpu/bugs.c
-+++ b/arch/x86/kernel/cpu/bugs.c
-@@ -11,6 +11,7 @@
- #include <linux/utsname.h>
- #include <linux/cpu.h>
- #include <linux/smp.h>
-+#include <linux/module.h>
- #include <asm/nospec-branch.h>
- #include <asm/cmdline.h>
-@@ -93,6 +94,19 @@ static const char *spectre_v2_strings[] = {
- #define pr_fmt(fmt)     "Spectre V2 mitigation: " fmt
- static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE;
-+static bool spectre_v2_bad_module;
-+
-+#ifdef RETPOLINE
-+bool retpoline_module_ok(bool has_retpoline)
-+{
-+      if (spectre_v2_enabled == SPECTRE_V2_NONE || has_retpoline)
-+              return true;
-+
-+      pr_err("System may be vunerable to spectre v2\n");
-+      spectre_v2_bad_module = true;
-+      return false;
-+}
-+#endif
- static void __init spec2_print_if_insecure(const char *reason)
- {
-@@ -299,7 +313,8 @@ ssize_t cpu_show_spectre_v2(struct device *dev,
-       if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
-               return sprintf(buf, "Not affected\n");
--      return sprintf(buf, "%s%s\n", spectre_v2_strings[spectre_v2_enabled],
--                     ibpb_inuse ? ", IBPB (Intel v4)" : "");
-+      return sprintf(buf, "%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
-+                     ibpb_inuse ? ", IBPB (Intel v4)" : "",
-+                     spectre_v2_bad_module ? " - vulnerable module loaded" : "");
- }
- #endif
-diff --git a/kernel/module.c b/kernel/module.c
-index e5b878b26906..de7db074f793 100644
---- a/kernel/module.c
-+++ b/kernel/module.c
-@@ -2855,6 +2855,15 @@ static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
- }
- #endif /* CONFIG_LIVEPATCH */
-+static void check_modinfo_retpoline(struct module *mod, struct load_info *info)
-+{
-+      if (retpoline_module_ok(get_modinfo(info, "retpoline")))
-+              return;
-+
-+      pr_warn("%s: loading module not compiled with retpoline compiler.\n",
-+              mod->name);
-+}
-+
- /* Sets info->hdr and info->len. */
- static int copy_module_from_user(const void __user *umod, unsigned long len,
-                                 struct load_info *info)
-@@ -3021,6 +3030,8 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
-               add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
-       }
-+      check_modinfo_retpoline(mod, info);
-+
-       if (get_modinfo(info, "staging")) {
-               add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
-               pr_warn("%s: module is from the staging directory, the quality "
-diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
-index 48397feb08fb..cc91f81ac33e 100644
---- a/scripts/mod/modpost.c
-+++ b/scripts/mod/modpost.c
-@@ -2147,6 +2147,14 @@ static void add_intree_flag(struct buffer *b, int is_intree)
-               buf_printf(b, "\nMODULE_INFO(intree, \"Y\");\n");
- }
-+/* Cannot check for assembler */
-+static void add_retpoline(struct buffer *b)
-+{
-+      buf_printf(b, "\n#ifdef RETPOLINE\n");
-+      buf_printf(b, "MODULE_INFO(retpoline, \"Y\");\n");
-+      buf_printf(b, "#endif\n");
-+}
-+
- static void add_staging_flag(struct buffer *b, const char *name)
- {
-       static const char *staging_dir = "drivers/staging";
-@@ -2492,6 +2500,7 @@ int main(int argc, char **argv)
-               add_header(&buf, mod);
-               add_intree_flag(&buf, !external_module);
-+              add_retpoline(&buf);
-               add_staging_flag(&buf, mod->name);
-               err |= add_versions(&buf, mod);
-               add_depends(&buf, mod, modules);
--- 
-2.14.2
-
diff --git a/patches/kernel/0024-net-tcp-close-sock-if-net-namespace-is-exiting.patch b/patches/kernel/0024-net-tcp-close-sock-if-net-namespace-is-exiting.patch
deleted file mode 100644 (file)
index d4eb5e6..0000000
+++ /dev/null
@@ -1,127 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Dan Streetman <ddstreet@ieee.org>
-Date: Thu, 18 Jan 2018 16:14:26 -0500
-Subject: [PATCH] net: tcp: close sock if net namespace is exiting
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-When a tcp socket is closed, if it detects that its net namespace is
-exiting, close immediately and do not wait for FIN sequence.
-
-For normal sockets, a reference is taken to their net namespace, so it will
-never exit while the socket is open.  However, kernel sockets do not take a
-reference to their net namespace, so it may begin exiting while the kernel
-socket is still open.  In this case if the kernel socket is a tcp socket,
-it will stay open trying to complete its close sequence.  The sock's dst(s)
-hold a reference to their interface, which are all transferred to the
-namespace's loopback interface when the real interfaces are taken down.
-When the namespace tries to take down its loopback interface, it hangs
-waiting for all references to the loopback interface to release, which
-results in messages like:
-
-unregister_netdevice: waiting for lo to become free. Usage count = 1
-
-These messages continue until the socket finally times out and closes.
-Since the net namespace cleanup holds the net_mutex while calling its
-registered pernet callbacks, any new net namespace initialization is
-blocked until the current net namespace finishes exiting.
-
-After this change, the tcp socket notices the exiting net namespace, and
-closes immediately, releasing its dst(s) and their reference to the
-loopback interface, which lets the net namespace continue exiting.
-
-Link: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1711407
-Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=97811
-Signed-off-by: Dan Streetman <ddstreet@canonical.com>
-Signed-off-by: David S. Miller <davem@davemloft.net>
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- include/net/net_namespace.h | 10 ++++++++++
- net/ipv4/tcp.c              |  3 +++
- net/ipv4/tcp_timer.c        | 15 +++++++++++++++
- 3 files changed, 28 insertions(+)
-
-diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
-index 1c401bd4c2e0..a5d023fa78db 100644
---- a/include/net/net_namespace.h
-+++ b/include/net/net_namespace.h
-@@ -221,6 +221,11 @@ int net_eq(const struct net *net1, const struct net *net2)
-       return net1 == net2;
- }
-+static inline int check_net(const struct net *net)
-+{
-+      return atomic_read(&net->count) != 0;
-+}
-+
- void net_drop_ns(void *);
- #else
-@@ -245,6 +250,11 @@ int net_eq(const struct net *net1, const struct net *net2)
-       return 1;
- }
-+static inline int check_net(const struct net *net)
-+{
-+      return 1;
-+}
-+
- #define net_drop_ns NULL
- #endif
-diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
-index a3e91b552edc..fd2a086da910 100644
---- a/net/ipv4/tcp.c
-+++ b/net/ipv4/tcp.c
-@@ -2258,6 +2258,9 @@ void tcp_close(struct sock *sk, long timeout)
-                       tcp_send_active_reset(sk, GFP_ATOMIC);
-                       __NET_INC_STATS(sock_net(sk),
-                                       LINUX_MIB_TCPABORTONMEMORY);
-+              } else if (!check_net(sock_net(sk))) {
-+                      /* Not possible to send reset; just close */
-+                      tcp_set_state(sk, TCP_CLOSE);
-               }
-       }
-diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
-index e906014890b6..ec1e5de41653 100644
---- a/net/ipv4/tcp_timer.c
-+++ b/net/ipv4/tcp_timer.c
-@@ -50,11 +50,19 @@ static void tcp_write_err(struct sock *sk)
-  *  to prevent DoS attacks. It is called when a retransmission timeout
-  *  or zero probe timeout occurs on orphaned socket.
-  *
-+ *  Also close if our net namespace is exiting; in that case there is no
-+ *  hope of ever communicating again since all netns interfaces are already
-+ *  down (or about to be down), and we need to release our dst references,
-+ *  which have been moved to the netns loopback interface, so the namespace
-+ *  can finish exiting.  This condition is only possible if we are a kernel
-+ *  socket, as those do not hold references to the namespace.
-+ *
-  *  Criteria is still not confirmed experimentally and may change.
-  *  We kill the socket, if:
-  *  1. If number of orphaned sockets exceeds an administratively configured
-  *     limit.
-  *  2. If we have strong memory pressure.
-+ *  3. If our net namespace is exiting.
-  */
- static int tcp_out_of_resources(struct sock *sk, bool do_reset)
- {
-@@ -83,6 +91,13 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset)
-               __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY);
-               return 1;
-       }
-+
-+      if (!check_net(sock_net(sk))) {
-+              /* Not possible to send reset; just close */
-+              tcp_done(sk);
-+              return 1;
-+      }
-+
-       return 0;
- }
--- 
-2.14.2
-
diff --git a/patches/kernel/0025-sctp-fix-dst-refcnt-leak-in-sctp_v4_get_dst.patch b/patches/kernel/0025-sctp-fix-dst-refcnt-leak-in-sctp_v4_get_dst.patch
deleted file mode 100644 (file)
index 1bed6b0..0000000
+++ /dev/null
@@ -1,89 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Tommi Rantala <tommi.t.rantala@nokia.com>
-Date: Mon, 5 Feb 2018 21:48:14 +0200
-Subject: [PATCH] sctp: fix dst refcnt leak in sctp_v4_get_dst
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-Fix dst reference count leak in sctp_v4_get_dst() introduced in commit
-410f03831 ("sctp: add routing output fallback"):
-
-When walking the address_list, successive ip_route_output_key() calls
-may return the same rt->dst with the reference incremented on each call.
-
-The code would not decrement the dst refcount when the dst pointer was
-identical from the previous iteration, causing the dst refcnt leak.
-
-Testcase:
-  ip netns add TEST
-  ip netns exec TEST ip link set lo up
-  ip link add dummy0 type dummy
-  ip link add dummy1 type dummy
-  ip link add dummy2 type dummy
-  ip link set dev dummy0 netns TEST
-  ip link set dev dummy1 netns TEST
-  ip link set dev dummy2 netns TEST
-  ip netns exec TEST ip addr add 192.168.1.1/24 dev dummy0
-  ip netns exec TEST ip link set dummy0 up
-  ip netns exec TEST ip addr add 192.168.1.2/24 dev dummy1
-  ip netns exec TEST ip link set dummy1 up
-  ip netns exec TEST ip addr add 192.168.1.3/24 dev dummy2
-  ip netns exec TEST ip link set dummy2 up
-  ip netns exec TEST sctp_test -H 192.168.1.2 -P 20002 -h 192.168.1.1 -p 20000 -s -B 192.168.1.3
-  ip netns del TEST
-
-In 4.4 and 4.9 kernels this results to:
-  [  354.179591] unregister_netdevice: waiting for lo to become free. Usage count = 1
-  [  364.419674] unregister_netdevice: waiting for lo to become free. Usage count = 1
-  [  374.663664] unregister_netdevice: waiting for lo to become free. Usage count = 1
-  [  384.903717] unregister_netdevice: waiting for lo to become free. Usage count = 1
-  [  395.143724] unregister_netdevice: waiting for lo to become free. Usage count = 1
-  [  405.383645] unregister_netdevice: waiting for lo to become free. Usage count = 1
-  ...
-
-Fixes: 410f03831 ("sctp: add routing output fallback")
-Fixes: 0ca50d12f ("sctp: fix src address selection if using secondary addresses")
-Signed-off-by: Tommi Rantala <tommi.t.rantala@nokia.com>
-Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
-Acked-by: Neil Horman <nhorman@tuxdriver.com>
-Signed-off-by: David S. Miller <davem@davemloft.net>
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- net/sctp/protocol.c | 10 ++++------
- 1 file changed, 4 insertions(+), 6 deletions(-)
-
-diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
-index 989a900383b5..e1a3ae4f3cab 100644
---- a/net/sctp/protocol.c
-+++ b/net/sctp/protocol.c
-@@ -514,22 +514,20 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
-               if (IS_ERR(rt))
-                       continue;
--              if (!dst)
--                      dst = &rt->dst;
--
-               /* Ensure the src address belongs to the output
-                * interface.
-                */
-               odev = __ip_dev_find(sock_net(sk), laddr->a.v4.sin_addr.s_addr,
-                                    false);
-               if (!odev || odev->ifindex != fl4->flowi4_oif) {
--                      if (&rt->dst != dst)
-+                      if (!dst)
-+                              dst = &rt->dst;
-+                      else
-                               dst_release(&rt->dst);
-                       continue;
-               }
--              if (dst != &rt->dst)
--                      dst_release(dst);
-+              dst_release(dst);
-               dst = &rt->dst;
-               break;
-       }
--- 
-2.14.2
-
diff --git a/patches/kernel/0026-sctp-fix-dst-refcnt-leak-in-sctp_v6_get_dst.patch b/patches/kernel/0026-sctp-fix-dst-refcnt-leak-in-sctp_v6_get_dst.patch
deleted file mode 100644 (file)
index 58087ed..0000000
+++ /dev/null
@@ -1,60 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Alexey Kodanev <alexey.kodanev@oracle.com>
-Date: Mon, 5 Feb 2018 15:10:35 +0300
-Subject: [PATCH] sctp: fix dst refcnt leak in sctp_v6_get_dst()
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-When going through the bind address list in sctp_v6_get_dst() and
-the previously found address is better ('matchlen > bmatchlen'),
-the code continues to the next iteration without releasing currently
-held destination.
-
-Fix it by releasing 'bdst' before continue to the next iteration, and
-instead of introducing one more '!IS_ERR(bdst)' check for dst_release(),
-move the already existed one right after ip6_dst_lookup_flow(), i.e. we
-shouldn't proceed further if we get an error for the route lookup.
-
-Fixes: dbc2b5e9a09e ("sctp: fix src address selection if using secondary addresses for ipv6")
-Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
-Acked-by: Neil Horman <nhorman@tuxdriver.com>
-Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
-Signed-off-by: David S. Miller <davem@davemloft.net>
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- net/sctp/ipv6.c | 10 +++++++---
- 1 file changed, 7 insertions(+), 3 deletions(-)
-
-diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
-index a4b6ffb61495..c5a5ad8ac00f 100644
---- a/net/sctp/ipv6.c
-+++ b/net/sctp/ipv6.c
-@@ -326,8 +326,10 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
-               final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final);
-               bdst = ip6_dst_lookup_flow(sk, fl6, final_p);
--              if (!IS_ERR(bdst) &&
--                  ipv6_chk_addr(dev_net(bdst->dev),
-+              if (IS_ERR(bdst))
-+                      continue;
-+
-+              if (ipv6_chk_addr(dev_net(bdst->dev),
-                                 &laddr->a.v6.sin6_addr, bdst->dev, 1)) {
-                       if (!IS_ERR_OR_NULL(dst))
-                               dst_release(dst);
-@@ -336,8 +338,10 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
-               }
-               bmatchlen = sctp_v6_addr_match_len(daddr, &laddr->a);
--              if (matchlen > bmatchlen)
-+              if (matchlen > bmatchlen) {
-+                      dst_release(bdst);
-                       continue;
-+              }
-               if (!IS_ERR_OR_NULL(dst))
-                       dst_release(dst);
--- 
-2.14.2
-
diff --git a/patches/kernel/0027-lockd-lost-rollback-of-set_grace_period-in-lockd_dow.patch b/patches/kernel/0027-lockd-lost-rollback-of-set_grace_period-in-lockd_dow.patch
deleted file mode 100644 (file)
index d7ba32d..0000000
+++ /dev/null
@@ -1,46 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Vasily Averin <vvs@virtuozzo.com>
-Date: Thu, 2 Nov 2017 13:03:42 +0300
-Subject: [PATCH] lockd: lost rollback of set_grace_period() in
- lockd_down_net()
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-Commit efda760fe95ea ("lockd: fix lockd shutdown race") is incorrect,
-it removes lockd_manager and disarm grace_period_end for init_net only.
-
-If nfsd was started from another net namespace lockd_up_net() calls
-set_grace_period() that adds lockd_manager into per-netns list
-and queues grace_period_end delayed work.
-
-These action should be reverted in lockd_down_net().
-Otherwise it can lead to double list_add on after restart nfsd in netns,
-and to use-after-free if non-disarmed delayed work will be executed after netns destroy.
-
-Fixes: efda760fe95e ("lockd: fix lockd shutdown race")
-Cc: stable@vger.kernel.org
-Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
-Signed-off-by: J. Bruce Fields <bfields@redhat.com>
-(cherry picked from commit 3a2b19d1ee5633f76ae8a88da7bc039a5d1732aa)
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- fs/lockd/svc.c | 2 ++
- 1 file changed, 2 insertions(+)
-
-diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
-index 726b6cecf430..fa8f6effcf00 100644
---- a/fs/lockd/svc.c
-+++ b/fs/lockd/svc.c
-@@ -274,6 +274,8 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net)
-       if (ln->nlmsvc_users) {
-               if (--ln->nlmsvc_users == 0) {
-                       nlm_shutdown_hosts_net(net);
-+                      cancel_delayed_work_sync(&ln->grace_period_end);
-+                      locks_end_grace(&ln->lockd_manager);
-                       svc_shutdown_net(serv, net);
-                       dprintk("lockd_down_net: per-net data destroyed; net=%p\n", net);
-               }
--- 
-2.14.2
-
diff --git a/patches/kernel/0028-ocfs2-make-metadata-estimation-accurate-and-clear.patch b/patches/kernel/0028-ocfs2-make-metadata-estimation-accurate-and-clear.patch
deleted file mode 100644 (file)
index b3cabcf..0000000
+++ /dev/null
@@ -1,61 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Changwei Ge <ge.changwei@h3c.com>
-Date: Wed, 31 Jan 2018 16:15:02 -0800
-Subject: [PATCH] ocfs2: make metadata estimation accurate and clear
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-Current code assume that ::w_unwritten_list always has only one item on.
-This is not right and hard to get understood.  So improve how to count
-unwritten item.
-
-Link: http://lkml.kernel.org/r/1515479070-32653-1-git-send-email-ge.changwei@h3c.com
-Signed-off-by: Changwei Ge <ge.changwei@h3c.com>
-Reported-by: John Lightsey <john@nixnuts.net>
-Tested-by: John Lightsey <john@nixnuts.net>
-Cc: Mark Fasheh <mfasheh@versity.com>
-Cc: Joseph Qi <jiangqi903@gmail.com>
-Cc: Junxiao Bi <junxiao.bi@oracle.com>
-Cc: Joel Becker <jlbec@evilplan.org>
-Cc: Changwei Ge <ge.changwei@h3c.com>
-Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
-Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-(cherry picked from commit 63de8bd9328bf2a778fc277503da163ae3defa3c)
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- fs/ocfs2/aops.c | 4 +++-
- 1 file changed, 3 insertions(+), 1 deletion(-)
-
-diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
-index 88a31e9340a0..77ec9b495027 100644
---- a/fs/ocfs2/aops.c
-+++ b/fs/ocfs2/aops.c
-@@ -784,6 +784,7 @@ struct ocfs2_write_ctxt {
-       struct ocfs2_cached_dealloc_ctxt w_dealloc;
-       struct list_head                w_unwritten_list;
-+      unsigned int                    w_unwritten_count;
- };
- void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
-@@ -1373,6 +1374,7 @@ static int ocfs2_unwritten_check(struct inode *inode,
-       desc->c_clear_unwritten = 0;
-       list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list);
-       list_add_tail(&new->ue_node, &wc->w_unwritten_list);
-+      wc->w_unwritten_count++;
-       new = NULL;
- unlock:
-       spin_unlock(&oi->ip_lock);
-@@ -2246,7 +2248,7 @@ static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock,
-               ue->ue_phys = desc->c_phys;
-               list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list);
--              dwc->dw_zero_count++;
-+              dwc->dw_zero_count += wc->w_unwritten_count;
-       }
-       ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, wc);
--- 
-2.14.2
-
diff --git a/patches/kernel/0029-ocfs2-try-to-reuse-extent-block-in-dealloc-without-m.patch b/patches/kernel/0029-ocfs2-try-to-reuse-extent-block-in-dealloc-without-m.patch
deleted file mode 100644 (file)
index 15e3488..0000000
+++ /dev/null
@@ -1,370 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Changwei Ge <ge.changwei@h3c.com>
-Date: Wed, 31 Jan 2018 16:15:06 -0800
-Subject: [PATCH] ocfs2: try to reuse extent block in dealloc without
- meta_alloc
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-A crash issue was reported by John Lightsey with a call trace as follows:
-
-  ocfs2_split_extent+0x1ad3/0x1b40 [ocfs2]
-  ocfs2_change_extent_flag+0x33a/0x470 [ocfs2]
-  ocfs2_mark_extent_written+0x172/0x220 [ocfs2]
-  ocfs2_dio_end_io+0x62d/0x910 [ocfs2]
-  dio_complete+0x19a/0x1a0
-  do_blockdev_direct_IO+0x19dd/0x1eb0
-  __blockdev_direct_IO+0x43/0x50
-  ocfs2_direct_IO+0x8f/0xa0 [ocfs2]
-  generic_file_direct_write+0xb2/0x170
-  __generic_file_write_iter+0xc3/0x1b0
-  ocfs2_file_write_iter+0x4bb/0xca0 [ocfs2]
-  __vfs_write+0xae/0xf0
-  vfs_write+0xb8/0x1b0
-  SyS_write+0x4f/0xb0
-  system_call_fastpath+0x16/0x75
-
-The BUG code told that extent tree wants to grow but no metadata was
-reserved ahead of time.  From my investigation into this issue, the root
-cause it that although enough metadata is not reserved, there should be
-enough for following use.  Rightmost extent is merged into its left one
-due to a certain times of marking extent written.  Because during
-marking extent written, we got many physically continuous extents.  At
-last, an empty extent showed up and the rightmost path is removed from
-extent tree.
-
-Add a new mechanism to reuse extent block cached in dealloc which were
-just unlinked from extent tree to solve this crash issue.
-
-Criteria is that during marking extents *written*, if extent rotation
-and merging results in unlinking extent with growing extent tree later
-without any metadata reserved ahead of time, try to reuse those extents
-in dealloc in which deleted extents are cached.
-
-Also, this patch addresses the issue John reported that ::dw_zero_count
-is not calculated properly.
-
-After applying this patch, the issue John reported was gone.  Thanks for
-the reproducer provided by John.  And this patch has passed
-ocfs2-test(29 cases) suite running by New H3C Group.
-
-[ge.changwei@h3c.com: fix static checker warnning]
-  Link: http://lkml.kernel.org/r/63ADC13FD55D6546B7DECE290D39E373F29196AE@H3CMLB12-EX.srv.huawei-3com.com
-[akpm@linux-foundation.org: brelse(NULL) is legal]
-Link: http://lkml.kernel.org/r/1515479070-32653-2-git-send-email-ge.changwei@h3c.com
-Signed-off-by: Changwei Ge <ge.changwei@h3c.com>
-Reported-by: John Lightsey <john@nixnuts.net>
-Tested-by: John Lightsey <john@nixnuts.net>
-Cc: Joel Becker <jlbec@evilplan.org>
-Cc: Joseph Qi <jiangqi903@gmail.com>
-Cc: Junxiao Bi <junxiao.bi@oracle.com>
-Cc: Dan Carpenter <dan.carpenter@oracle.com>
-Cc: Mark Fasheh <mfasheh@versity.com>
-Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
-Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-(cherry picked from commit 71a36944042b7d9dd71f6a5d1c5ea1c2353b5d42)
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- fs/ocfs2/alloc.h |   1 +
- fs/ocfs2/alloc.c | 206 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
- fs/ocfs2/aops.c  |   6 ++
- 3 files changed, 203 insertions(+), 10 deletions(-)
-
-diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
-index 4a5152ec88a3..571692171dd1 100644
---- a/fs/ocfs2/alloc.h
-+++ b/fs/ocfs2/alloc.h
-@@ -61,6 +61,7 @@ struct ocfs2_extent_tree {
-       ocfs2_journal_access_func               et_root_journal_access;
-       void                                    *et_object;
-       unsigned int                            et_max_leaf_clusters;
-+      struct ocfs2_cached_dealloc_ctxt        *et_dealloc;
- };
- /*
-diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
-index 386aecce881d..9b5e7d8ba710 100644
---- a/fs/ocfs2/alloc.c
-+++ b/fs/ocfs2/alloc.c
-@@ -165,6 +165,13 @@ static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
-                                    struct ocfs2_extent_rec *rec);
- static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
- static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
-+
-+static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
-+                                      struct ocfs2_extent_tree *et,
-+                                      struct buffer_head **new_eb_bh,
-+                                      int blk_wanted, int *blk_given);
-+static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et);
-+
- static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
-       .eo_set_last_eb_blk     = ocfs2_dinode_set_last_eb_blk,
-       .eo_get_last_eb_blk     = ocfs2_dinode_get_last_eb_blk,
-@@ -448,6 +455,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
-       if (!obj)
-               obj = (void *)bh->b_data;
-       et->et_object = obj;
-+      et->et_dealloc = NULL;
-       et->et_ops->eo_fill_root_el(et);
-       if (!et->et_ops->eo_fill_max_leaf_clusters)
-@@ -1159,7 +1167,7 @@ static int ocfs2_add_branch(handle_t *handle,
-                           struct buffer_head **last_eb_bh,
-                           struct ocfs2_alloc_context *meta_ac)
- {
--      int status, new_blocks, i;
-+      int status, new_blocks, i, block_given = 0;
-       u64 next_blkno, new_last_eb_blk;
-       struct buffer_head *bh;
-       struct buffer_head **new_eb_bhs = NULL;
-@@ -1214,11 +1222,31 @@ static int ocfs2_add_branch(handle_t *handle,
-               goto bail;
-       }
--      status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
--                                         meta_ac, new_eb_bhs);
--      if (status < 0) {
--              mlog_errno(status);
--              goto bail;
-+      /* Firstyly, try to reuse dealloc since we have already estimated how
-+       * many extent blocks we may use.
-+       */
-+      if (!ocfs2_is_dealloc_empty(et)) {
-+              status = ocfs2_reuse_blk_from_dealloc(handle, et,
-+                                                    new_eb_bhs, new_blocks,
-+                                                    &block_given);
-+              if (status < 0) {
-+                      mlog_errno(status);
-+                      goto bail;
-+              }
-+      }
-+
-+      BUG_ON(block_given > new_blocks);
-+
-+      if (block_given < new_blocks) {
-+              BUG_ON(!meta_ac);
-+              status = ocfs2_create_new_meta_bhs(handle, et,
-+                                                 new_blocks - block_given,
-+                                                 meta_ac,
-+                                                 &new_eb_bhs[block_given]);
-+              if (status < 0) {
-+                      mlog_errno(status);
-+                      goto bail;
-+              }
-       }
-       /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
-@@ -1341,15 +1369,25 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
-                                 struct ocfs2_alloc_context *meta_ac,
-                                 struct buffer_head **ret_new_eb_bh)
- {
--      int status, i;
-+      int status, i, block_given = 0;
-       u32 new_clusters;
-       struct buffer_head *new_eb_bh = NULL;
-       struct ocfs2_extent_block *eb;
-       struct ocfs2_extent_list  *root_el;
-       struct ocfs2_extent_list  *eb_el;
--      status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
--                                         &new_eb_bh);
-+      if (!ocfs2_is_dealloc_empty(et)) {
-+              status = ocfs2_reuse_blk_from_dealloc(handle, et,
-+                                                    &new_eb_bh, 1,
-+                                                    &block_given);
-+      } else if (meta_ac) {
-+              status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
-+                                                 &new_eb_bh);
-+
-+      } else {
-+              BUG();
-+      }
-+
-       if (status < 0) {
-               mlog_errno(status);
-               goto bail;
-@@ -1512,7 +1550,7 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
-       int depth = le16_to_cpu(el->l_tree_depth);
-       struct buffer_head *bh = NULL;
--      BUG_ON(meta_ac == NULL);
-+      BUG_ON(meta_ac == NULL && ocfs2_is_dealloc_empty(et));
-       shift = ocfs2_find_branch_target(et, &bh);
-       if (shift < 0) {
-@@ -6593,6 +6631,154 @@ ocfs2_find_per_slot_free_list(int type,
-       return fl;
- }
-+static struct ocfs2_per_slot_free_list *
-+ocfs2_find_preferred_free_list(int type,
-+                             int preferred_slot,
-+                             int *real_slot,
-+                             struct ocfs2_cached_dealloc_ctxt *ctxt)
-+{
-+      struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
-+
-+      while (fl) {
-+              if (fl->f_inode_type == type && fl->f_slot == preferred_slot) {
-+                      *real_slot = fl->f_slot;
-+                      return fl;
-+              }
-+
-+              fl = fl->f_next_suballocator;
-+      }
-+
-+      /* If we can't find any free list matching preferred slot, just use
-+       * the first one.
-+       */
-+      fl = ctxt->c_first_suballocator;
-+      *real_slot = fl->f_slot;
-+
-+      return fl;
-+}
-+
-+/* Return Value 1 indicates empty */
-+static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et)
-+{
-+      struct ocfs2_per_slot_free_list *fl = NULL;
-+
-+      if (!et->et_dealloc)
-+              return 1;
-+
-+      fl = et->et_dealloc->c_first_suballocator;
-+      if (!fl)
-+              return 1;
-+
-+      if (!fl->f_first)
-+              return 1;
-+
-+      return 0;
-+}
-+
-+/* If extent was deleted from tree due to extent rotation and merging, and
-+ * no metadata is reserved ahead of time. Try to reuse some extents
-+ * just deleted. This is only used to reuse extent blocks.
-+ * It is supposed to find enough extent blocks in dealloc if our estimation
-+ * on metadata is accurate.
-+ */
-+static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
-+                                      struct ocfs2_extent_tree *et,
-+                                      struct buffer_head **new_eb_bh,
-+                                      int blk_wanted, int *blk_given)
-+{
-+      int i, status = 0, real_slot;
-+      struct ocfs2_cached_dealloc_ctxt *dealloc;
-+      struct ocfs2_per_slot_free_list *fl;
-+      struct ocfs2_cached_block_free *bf;
-+      struct ocfs2_extent_block *eb;
-+      struct ocfs2_super *osb =
-+              OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
-+
-+      *blk_given = 0;
-+
-+      /* If extent tree doesn't have a dealloc, this is not faulty. Just
-+       * tell upper caller dealloc can't provide any block and it should
-+       * ask for alloc to claim more space.
-+       */
-+      dealloc = et->et_dealloc;
-+      if (!dealloc)
-+              goto bail;
-+
-+      for (i = 0; i < blk_wanted; i++) {
-+              /* Prefer to use local slot */
-+              fl = ocfs2_find_preferred_free_list(EXTENT_ALLOC_SYSTEM_INODE,
-+                                                  osb->slot_num, &real_slot,
-+                                                  dealloc);
-+              /* If no more block can be reused, we should claim more
-+               * from alloc. Just return here normally.
-+               */
-+              if (!fl) {
-+                      status = 0;
-+                      break;
-+              }
-+
-+              bf = fl->f_first;
-+              fl->f_first = bf->free_next;
-+
-+              new_eb_bh[i] = sb_getblk(osb->sb, bf->free_blk);
-+              if (new_eb_bh[i] == NULL) {
-+                      status = -ENOMEM;
-+                      mlog_errno(status);
-+                      goto bail;
-+              }
-+
-+              mlog(0, "Reusing block(%llu) from "
-+                   "dealloc(local slot:%d, real slot:%d)\n",
-+                   bf->free_blk, osb->slot_num, real_slot);
-+
-+              ocfs2_set_new_buffer_uptodate(et->et_ci, new_eb_bh[i]);
-+
-+              status = ocfs2_journal_access_eb(handle, et->et_ci,
-+                                               new_eb_bh[i],
-+                                               OCFS2_JOURNAL_ACCESS_CREATE);
-+              if (status < 0) {
-+                      mlog_errno(status);
-+                      goto bail;
-+              }
-+
-+              memset(new_eb_bh[i]->b_data, 0, osb->sb->s_blocksize);
-+              eb = (struct ocfs2_extent_block *) new_eb_bh[i]->b_data;
-+
-+              /* We can't guarantee that buffer head is still cached, so
-+               * polutlate the extent block again.
-+               */
-+              strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
-+              eb->h_blkno = cpu_to_le64(bf->free_blk);
-+              eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
-+              eb->h_suballoc_slot = cpu_to_le16(real_slot);
-+              eb->h_suballoc_loc = cpu_to_le64(bf->free_bg);
-+              eb->h_suballoc_bit = cpu_to_le16(bf->free_bit);
-+              eb->h_list.l_count =
-+                      cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
-+
-+              /* We'll also be dirtied by the caller, so
-+               * this isn't absolutely necessary.
-+               */
-+              ocfs2_journal_dirty(handle, new_eb_bh[i]);
-+
-+              if (!fl->f_first) {
-+                      dealloc->c_first_suballocator = fl->f_next_suballocator;
-+                      kfree(fl);
-+              }
-+              kfree(bf);
-+      }
-+
-+      *blk_given = i;
-+
-+bail:
-+      if (unlikely(status < 0)) {
-+              for (i = 0; i < blk_wanted; i++)
-+                      brelse(new_eb_bh[i]);
-+      }
-+
-+      return status;
-+}
-+
- int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
-                             int type, int slot, u64 suballoc,
-                             u64 blkno, unsigned int bit)
-diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
-index 77ec9b495027..2ff02dda97d8 100644
---- a/fs/ocfs2/aops.c
-+++ b/fs/ocfs2/aops.c
-@@ -2322,6 +2322,12 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
-       ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
-+      /* Attach dealloc with extent tree in case that we may reuse extents
-+       * which are already unlinked from current extent tree due to extent
-+       * rotation and merging.
-+       */
-+      et.et_dealloc = &dealloc;
-+
-       ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2,
-                                   &data_ac, &meta_ac);
-       if (ret) {
--- 
-2.14.2
-