]> git.proxmox.com Git - mirror_ubuntu-artful-kernel.git/commitdiff
Merge branch 'cgroup/for-4.11-rdmacg' into cgroup/for-4.11
authorTejun Heo <tj@kernel.org>
Thu, 2 Feb 2017 18:50:35 +0000 (13:50 -0500)
committerTejun Heo <tj@kernel.org>
Thu, 2 Feb 2017 18:50:35 +0000 (13:50 -0500)
Merge in to resolve conflicts in Documentation/cgroup-v2.txt.  The
conflicts are from multiple section additions and trivial to resolve.

Signed-off-by: Tejun Heo <tj@kernel.org>
14 files changed:
Documentation/cgroup-v1/rdma.txt [new file with mode: 0644]
Documentation/cgroup-v2.txt
drivers/infiniband/core/Makefile
drivers/infiniband/core/cgroup.c [new file with mode: 0644]
drivers/infiniband/core/core_priv.h
drivers/infiniband/core/device.c
drivers/infiniband/core/uverbs_cmd.c
drivers/infiniband/core/uverbs_main.c
include/linux/cgroup_rdma.h [new file with mode: 0644]
include/linux/cgroup_subsys.h
include/rdma/ib_verbs.h
init/Kconfig
kernel/cgroup/Makefile
kernel/cgroup/rdma.c [new file with mode: 0644]

diff --git a/Documentation/cgroup-v1/rdma.txt b/Documentation/cgroup-v1/rdma.txt
new file mode 100644 (file)
index 0000000..af61817
--- /dev/null
@@ -0,0 +1,109 @@
+                               RDMA Controller
+                               ----------------
+
+Contents
+--------
+
+1. Overview
+  1-1. What is RDMA controller?
+  1-2. Why RDMA controller needed?
+  1-3. How is RDMA controller implemented?
+2. Usage Examples
+
+1. Overview
+
+1-1. What is RDMA controller?
+-----------------------------
+
+RDMA controller allows user to limit RDMA/IB specific resources that a given
+set of processes can use. These processes are grouped using RDMA controller.
+
+RDMA controller defines two resources which can be limited for processes of a
+cgroup.
+
+1-2. Why RDMA controller needed?
+--------------------------------
+
+Currently user space applications can easily take away all the rdma verb
+specific resources such as AH, CQ, QP, MR etc. As a result, other applications
+in other cgroups or kernel space ULPs may not even get a chance to allocate any
+rdma resources. This can lead to service unavailability.
+
+Therefore RDMA controller is needed through which resource consumption
+of processes can be limited. Through this controller different rdma
+resources can be accounted.
+
+1-3. How is RDMA controller implemented?
+----------------------------------------
+
+RDMA cgroup allows limit configuration of resources. Rdma cgroup maintains
+resource accounting per cgroup, per device using resource pool structure.
+Each such resource pool is limited by rdma cgroup to at most 64 resources;
+this limit can be extended later if required.
+
+This resource pool object is linked to the cgroup css. Typically there
+are 0 to 4 resource pool instances per cgroup, per device in most use cases.
+But nothing prevents having more. At present hundreds of RDMA devices per
+single cgroup may not be handled optimally, however there is no
+known use case or requirement for such configuration either.
+
+Since RDMA resources can be allocated from any process and can be freed by any
+of the child processes which shares the address space, rdma resources are
+always owned by the creator cgroup css. This allows process migration from one
+to other cgroup without major complexity of transferring resource ownership;
+because such ownership is not really present due to shared nature of
+rdma resources. Linking resources around css also ensures that cgroups can be
+deleted after processes have migrated. This also allows process migration with
+active resources, even though that is not a primary use case.
+
+Whenever RDMA resource charging occurs, owner rdma cgroup is returned to
+the caller. Same rdma cgroup should be passed while uncharging the resource.
+This also allows a process that migrated with active RDMA resources to charge
+new resources to its new owner cgroup. It also allows uncharging a resource
+from the previously charged cgroup after the process has migrated to a new
+cgroup, even though that is not a primary use case.
+
+Resource pool object is created in following situations.
+(a) User sets the limit and no previous resource pool exists for the device
+of interest for the cgroup.
+(b) No resource limits were configured, but IB/RDMA stack tries to
+charge the resource. The pool is created so that resources charged while
+applications run without limits are correctly uncharged if limits are enforced
+later; otherwise the usage count would drop to negative.
+
+Resource pool is destroyed if all the resource limits are set to max and
+it is the last resource getting deallocated.
+
+User should set all the limits to max value if it intends to remove/unconfigure
+the resource pool for a particular device.
+
+IB stack honors limits enforced by the rdma controller. When an application
+queries the maximum resource limits of an IB device, the IB stack returns the
+minimum of what is configured by the user for a given cgroup and what is
+supported by the IB device.
+
+Following resources can be accounted by rdma controller.
+  hca_handle   Maximum number of HCA Handles
+  hca_object   Maximum number of HCA Objects
+
+2. Usage Examples
+-----------------
+
+(a) Configure resource limit:
+echo mlx4_0 hca_handle=2 hca_object=2000 > /sys/fs/cgroup/rdma/1/rdma.max
+echo ocrdma1 hca_handle=3 > /sys/fs/cgroup/rdma/2/rdma.max
+
+(b) Query resource limit:
+cat /sys/fs/cgroup/rdma/2/rdma.max
+#Output:
+mlx4_0 hca_handle=2 hca_object=2000
+ocrdma1 hca_handle=3 hca_object=max
+
+(c) Query current usage:
+cat /sys/fs/cgroup/rdma/2/rdma.current
+#Output:
+mlx4_0 hca_handle=1 hca_object=20
+ocrdma1 hca_handle=1 hca_object=23
+
+(d) Delete resource limit:
+echo mlx4_0 hca_handle=max hca_object=max > /sys/fs/cgroup/rdma/1/rdma.max
index 1d101423ca9253abeec77a7ef26549d4d4602141..3b8449f8ac7e80a0ebeaf6dfe8c64b15503f3954 100644 (file)
@@ -49,8 +49,10 @@ CONTENTS
     5-3-2. Writeback
   5-4. PID
     5-4-1. PID Interface Files
-  5-5. Misc
-    5-5-1. perf_event
+  5-5. RDMA
+    5-5-1. RDMA Interface Files
+  5-6. Misc
+    5-6-1. perf_event
 6. Namespace
   6-1. Basics
   6-2. The Root and Views
@@ -1160,9 +1162,45 @@ through fork() or clone(). These will return -EAGAIN if the creation
 of a new process would cause a cgroup policy to be violated.
 
 
-5-5. Misc
+5-5. RDMA
 
-5-5-1. perf_event
+The "rdma" controller regulates the distribution and accounting of
+RDMA resources.
+
+5-5-1. RDMA Interface Files
+
+  rdma.max
+       A read-write nested-keyed file that exists for all the cgroups
+       except root and describes the currently configured resource
+       limits for an RDMA/IB device.
+
+       Lines are keyed by device name and are not ordered.
+       Each line contains space separated resource name and its configured
+       limit that can be distributed.
+
+       The following nested keys are defined.
+
+         hca_handle    Maximum number of HCA Handles
+         hca_object    Maximum number of HCA Objects
+
+       An example for mlx4 and ocrdma device follows.
+
+         mlx4_0 hca_handle=2 hca_object=2000
+         ocrdma1 hca_handle=3 hca_object=max
+
+  rdma.current
+       A read-only file that describes current resource usage.
+       It exists for all the cgroups except root.
+
+       An example for mlx4 and ocrdma device follows.
+
+         mlx4_0 hca_handle=1 hca_object=20
+         ocrdma1 hca_handle=1 hca_object=23
+
+
+5-6. Misc
+
+5-6-1. perf_event
 
 perf_event controller, if not mounted on a legacy hierarchy, is
 automatically enabled on the v2 hierarchy so that perf events can
index edaae9f9853c73b2f990ccbd82ebcee3868010fd..e426ac877d19fb65ab9fb3edd5b106d1c6d02c9a 100644 (file)
@@ -13,6 +13,7 @@ ib_core-y :=                  packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
                                multicast.o mad.o smi.o agent.o mad_rmpp.o
 ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
 ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
+ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o
 
 ib_cm-y :=                     cm.o
 
diff --git a/drivers/infiniband/core/cgroup.c b/drivers/infiniband/core/cgroup.c
new file mode 100644 (file)
index 0000000..126ac5f
--- /dev/null
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include "core_priv.h"
+
+/**
+ * ib_device_register_rdmacg - register with rdma cgroup.
+ * @device: device to register to participate in resource
+ *          accounting by rdma cgroup.
+ *
+ * Register with the rdma cgroup. Should be called before
+ * exposing rdma device to user space applications to avoid
+ * resource accounting leak.
+ * Returns 0 on success or otherwise failure code.
+ */
+int ib_device_register_rdmacg(struct ib_device *device)
+{
+       device->cg_device.name = device->name;
+       return rdmacg_register_device(&device->cg_device);
+}
+
+/**
+ * ib_device_unregister_rdmacg - unregister with rdma cgroup.
+ * @device: device to unregister.
+ *
+ * Unregister with the rdma cgroup. Should be called after
+ * all the resources are deallocated, and after a stage when any
+ * other resource allocation by user application cannot be done
+ * for this device to avoid any leak in accounting.
+ */
+void ib_device_unregister_rdmacg(struct ib_device *device)
+{
+       rdmacg_unregister_device(&device->cg_device);
+}
+
+int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
+                        struct ib_device *device,
+                        enum rdmacg_resource_type resource_index)
+{
+       return rdmacg_try_charge(&cg_obj->cg, &device->cg_device,
+                                resource_index);
+}
+EXPORT_SYMBOL(ib_rdmacg_try_charge);
+
+void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
+                       struct ib_device *device,
+                       enum rdmacg_resource_type resource_index)
+{
+       rdmacg_uncharge(cg_obj->cg, &device->cg_device,
+                       resource_index);
+}
+EXPORT_SYMBOL(ib_rdmacg_uncharge);
index d29372624f3a055719e9592a99094bbc951d9f18..389f6192bddcbf9225102c06c1a891556ffa0074 100644 (file)
@@ -35,6 +35,7 @@
 
 #include <linux/list.h>
 #include <linux/spinlock.h>
+#include <linux/cgroup_rdma.h>
 
 #include <rdma/ib_verbs.h>
 
@@ -121,6 +122,35 @@ int ib_cache_setup_one(struct ib_device *device);
 void ib_cache_cleanup_one(struct ib_device *device);
 void ib_cache_release_one(struct ib_device *device);
 
+#ifdef CONFIG_CGROUP_RDMA
+int ib_device_register_rdmacg(struct ib_device *device);
+void ib_device_unregister_rdmacg(struct ib_device *device);
+
+int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
+                        struct ib_device *device,
+                        enum rdmacg_resource_type resource_index);
+
+void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
+                       struct ib_device *device,
+                       enum rdmacg_resource_type resource_index);
+#else
+static inline int ib_device_register_rdmacg(struct ib_device *device)
+{ return 0; }
+
+static inline void ib_device_unregister_rdmacg(struct ib_device *device)
+{ }
+
+static inline int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
+                                      struct ib_device *device,
+                                      enum rdmacg_resource_type resource_index)
+{ return 0; }
+
+static inline void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
+                                     struct ib_device *device,
+                                     enum rdmacg_resource_type resource_index)
+{ }
+#endif
+
 static inline bool rdma_is_upper_dev_rcu(struct net_device *dev,
                                         struct net_device *upper)
 {
index 571974cd391981a6009abfdedf6b848029a366c1..70065386acbc3cb51aa30f92072e499340c735ee 100644 (file)
@@ -360,10 +360,18 @@ int ib_register_device(struct ib_device *device,
                goto out;
        }
 
+       ret = ib_device_register_rdmacg(device);
+       if (ret) {
+               pr_warn("Couldn't register device with rdma cgroup\n");
+               ib_cache_cleanup_one(device);
+               goto out;
+       }
+
        memset(&device->attrs, 0, sizeof(device->attrs));
        ret = device->query_device(device, &device->attrs, &uhw);
        if (ret) {
                pr_warn("Couldn't query the device attributes\n");
+               ib_device_unregister_rdmacg(device);
                ib_cache_cleanup_one(device);
                goto out;
        }
@@ -372,6 +380,7 @@ int ib_register_device(struct ib_device *device,
        if (ret) {
                pr_warn("Couldn't register device %s with driver model\n",
                        device->name);
+               ib_device_unregister_rdmacg(device);
                ib_cache_cleanup_one(device);
                goto out;
        }
@@ -421,6 +430,7 @@ void ib_unregister_device(struct ib_device *device)
 
        mutex_unlock(&device_mutex);
 
+       ib_device_unregister_rdmacg(device);
        ib_device_unregister_sysfs(device);
        ib_cache_cleanup_one(device);
 
index 70078220348383eec62b9b6252a5607f9255e9fa..33bc88a38574445131e82a9a7af29026dc576848 100644 (file)
@@ -316,6 +316,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
        struct ib_udata                   udata;
        struct ib_ucontext               *ucontext;
        struct file                      *filp;
+       struct ib_rdmacg_object          cg_obj;
        int ret;
 
        if (out_len < sizeof resp)
@@ -335,13 +336,18 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
                   (unsigned long) cmd.response + sizeof resp,
                   in_len - sizeof cmd, out_len - sizeof resp);
 
+       ret = ib_rdmacg_try_charge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE);
+       if (ret)
+               goto err;
+
        ucontext = ib_dev->alloc_ucontext(ib_dev, &udata);
        if (IS_ERR(ucontext)) {
                ret = PTR_ERR(ucontext);
-               goto err;
+               goto err_alloc;
        }
 
        ucontext->device = ib_dev;
+       ucontext->cg_obj = cg_obj;
        INIT_LIST_HEAD(&ucontext->pd_list);
        INIT_LIST_HEAD(&ucontext->mr_list);
        INIT_LIST_HEAD(&ucontext->mw_list);
@@ -407,6 +413,9 @@ err_free:
        put_pid(ucontext->tgid);
        ib_dev->dealloc_ucontext(ucontext);
 
+err_alloc:
+       ib_rdmacg_uncharge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE);
+
 err:
        mutex_unlock(&file->mutex);
        return ret;
@@ -561,6 +570,13 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
                return -ENOMEM;
 
        init_uobj(uobj, 0, file->ucontext, &pd_lock_class);
+       ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
+                                  RDMACG_RESOURCE_HCA_OBJECT);
+       if (ret) {
+               kfree(uobj);
+               return ret;
+       }
+
        down_write(&uobj->mutex);
 
        pd = ib_dev->alloc_pd(ib_dev, file->ucontext, &udata);
@@ -605,6 +621,7 @@ err_idr:
        ib_dealloc_pd(pd);
 
 err:
+       ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
        put_uobj_write(uobj);
        return ret;
 }
@@ -637,6 +654,8 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
        if (ret)
                goto err_put;
 
+       ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
        uobj->live = 0;
        put_uobj_write(uobj);
 
@@ -1006,6 +1025,10 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
                        goto err_put;
                }
        }
+       ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
+                                  RDMACG_RESOURCE_HCA_OBJECT);
+       if (ret)
+               goto err_charge;
 
        mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va,
                                     cmd.access_flags, &udata);
@@ -1054,6 +1077,9 @@ err_unreg:
        ib_dereg_mr(mr);
 
 err_put:
+       ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
+err_charge:
        put_pd_read(pd);
 
 err_free:
@@ -1178,6 +1204,8 @@ ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
        if (ret)
                return ret;
 
+       ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
        idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
 
        mutex_lock(&file->mutex);
@@ -1226,6 +1254,11 @@ ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file,
                   in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
                   out_len - sizeof(resp));
 
+       ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
+                                  RDMACG_RESOURCE_HCA_OBJECT);
+       if (ret)
+               goto err_charge;
+
        mw = pd->device->alloc_mw(pd, cmd.mw_type, &udata);
        if (IS_ERR(mw)) {
                ret = PTR_ERR(mw);
@@ -1271,6 +1304,9 @@ err_unalloc:
        uverbs_dealloc_mw(mw);
 
 err_put:
+       ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
+err_charge:
        put_pd_read(pd);
 
 err_free:
@@ -1306,6 +1342,8 @@ ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,
        if (ret)
                return ret;
 
+       ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
        idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
 
        mutex_lock(&file->mutex);
@@ -1405,6 +1443,11 @@ static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file,
        if (cmd_sz > offsetof(typeof(*cmd), flags) + sizeof(cmd->flags))
                attr.flags = cmd->flags;
 
+       ret = ib_rdmacg_try_charge(&obj->uobject.cg_obj, ib_dev,
+                                  RDMACG_RESOURCE_HCA_OBJECT);
+       if (ret)
+               goto err_charge;
+
        cq = ib_dev->create_cq(ib_dev, &attr,
                                             file->ucontext, uhw);
        if (IS_ERR(cq)) {
@@ -1452,6 +1495,10 @@ err_free:
        ib_destroy_cq(cq);
 
 err_file:
+       ib_rdmacg_uncharge(&obj->uobject.cg_obj, ib_dev,
+                          RDMACG_RESOURCE_HCA_OBJECT);
+
+err_charge:
        if (ev_file)
                ib_uverbs_release_ucq(file, ev_file, obj);
 
@@ -1732,6 +1779,8 @@ ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,
        if (ret)
                return ret;
 
+       ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
        idr_remove_uobj(&ib_uverbs_cq_idr, uobj);
 
        mutex_lock(&file->mutex);
@@ -1904,6 +1953,11 @@ static int create_qp(struct ib_uverbs_file *file,
                        goto err_put;
                }
 
+       ret = ib_rdmacg_try_charge(&obj->uevent.uobject.cg_obj, device,
+                                  RDMACG_RESOURCE_HCA_OBJECT);
+       if (ret)
+               goto err_put;
+
        if (cmd->qp_type == IB_QPT_XRC_TGT)
                qp = ib_create_qp(pd, &attr);
        else
@@ -1911,7 +1965,7 @@ static int create_qp(struct ib_uverbs_file *file,
 
        if (IS_ERR(qp)) {
                ret = PTR_ERR(qp);
-               goto err_put;
+               goto err_create;
        }
 
        if (cmd->qp_type != IB_QPT_XRC_TGT) {
@@ -1992,6 +2046,10 @@ err_cb:
 err_destroy:
        ib_destroy_qp(qp);
 
+err_create:
+       ib_rdmacg_uncharge(&obj->uevent.uobject.cg_obj, device,
+                          RDMACG_RESOURCE_HCA_OBJECT);
+
 err_put:
        if (xrcd)
                put_xrcd_read(xrcd_uobj);
@@ -2518,6 +2576,8 @@ ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
        if (ret)
                return ret;
 
+       ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
        if (obj->uxrcd)
                atomic_dec(&obj->uxrcd->refcnt);
 
@@ -2969,11 +3029,16 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
        memset(&attr.dmac, 0, sizeof(attr.dmac));
        memcpy(attr.grh.dgid.raw, cmd.attr.grh.dgid, 16);
 
+       ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
+                                  RDMACG_RESOURCE_HCA_OBJECT);
+       if (ret)
+               goto err_charge;
+
        ah = pd->device->create_ah(pd, &attr, &udata);
 
        if (IS_ERR(ah)) {
                ret = PTR_ERR(ah);
-               goto err_put;
+               goto err_create;
        }
 
        ah->device  = pd->device;
@@ -3012,7 +3077,10 @@ err_copy:
 err_destroy:
        ib_destroy_ah(ah);
 
-err_put:
+err_create:
+       ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
+err_charge:
        put_pd_read(pd);
 
 err:
@@ -3046,6 +3114,8 @@ ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
        if (ret)
                return ret;
 
+       ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
        idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
 
        mutex_lock(&file->mutex);
@@ -3822,10 +3892,16 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
                err = -EINVAL;
                goto err_free;
        }
+
+       err = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
+                                  RDMACG_RESOURCE_HCA_OBJECT);
+       if (err)
+               goto err_free;
+
        flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER);
        if (IS_ERR(flow_id)) {
                err = PTR_ERR(flow_id);
-               goto err_free;
+               goto err_create;
        }
        flow_id->uobject = uobj;
        uobj->object = flow_id;
@@ -3858,6 +3934,8 @@ err_copy:
        idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
 destroy_flow:
        ib_destroy_flow(flow_id);
+err_create:
+       ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
 err_free:
        kfree(flow_attr);
 err_put:
@@ -3897,8 +3975,11 @@ int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file,
        flow_id = uobj->object;
 
        ret = ib_destroy_flow(flow_id);
-       if (!ret)
+       if (!ret) {
+               ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev,
+                                  RDMACG_RESOURCE_HCA_OBJECT);
                uobj->live = 0;
+       }
 
        put_uobj_write(uobj);
 
@@ -3966,6 +4047,11 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file,
        obj->uevent.events_reported = 0;
        INIT_LIST_HEAD(&obj->uevent.event_list);
 
+       ret = ib_rdmacg_try_charge(&obj->uevent.uobject.cg_obj, ib_dev,
+                                  RDMACG_RESOURCE_HCA_OBJECT);
+       if (ret)
+               goto err_put_cq;
+
        srq = pd->device->create_srq(pd, &attr, udata);
        if (IS_ERR(srq)) {
                ret = PTR_ERR(srq);
@@ -4030,6 +4116,8 @@ err_destroy:
        ib_destroy_srq(srq);
 
 err_put:
+       ib_rdmacg_uncharge(&obj->uevent.uobject.cg_obj, ib_dev,
+                          RDMACG_RESOURCE_HCA_OBJECT);
        put_pd_read(pd);
 
 err_put_cq:
@@ -4216,6 +4304,8 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
        if (ret)
                return ret;
 
+       ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
        if (srq_type == IB_SRQT_XRC) {
                us = container_of(obj, struct ib_usrq_object, uevent);
                atomic_dec(&us->uxrcd->refcnt);
index b3f95d453fba73073c42bcbdebe96cce98eabd72..cdbd26d6574b4146d3eb58b59ec42ae99f6b1805 100644 (file)
@@ -51,6 +51,7 @@
 #include <rdma/ib.h>
 
 #include "uverbs.h"
+#include "core_priv.h"
 
 MODULE_AUTHOR("Roland Dreier");
 MODULE_DESCRIPTION("InfiniBand userspace verbs access");
@@ -237,6 +238,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
                idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
                ib_destroy_ah(ah);
+               ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+                                  RDMACG_RESOURCE_HCA_OBJECT);
                kfree(uobj);
        }
 
@@ -246,6 +249,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
                idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
                uverbs_dealloc_mw(mw);
+               ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+                                  RDMACG_RESOURCE_HCA_OBJECT);
                kfree(uobj);
        }
 
@@ -254,6 +259,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
                idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
                ib_destroy_flow(flow_id);
+               ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+                                  RDMACG_RESOURCE_HCA_OBJECT);
                kfree(uobj);
        }
 
@@ -266,6 +273,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
                if (qp == qp->real_qp)
                        ib_uverbs_detach_umcast(qp, uqp);
                ib_destroy_qp(qp);
+               ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+                                  RDMACG_RESOURCE_HCA_OBJECT);
                ib_uverbs_release_uevent(file, &uqp->uevent);
                kfree(uqp);
        }
@@ -298,6 +307,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
                idr_remove_uobj(&ib_uverbs_srq_idr, uobj);
                ib_destroy_srq(srq);
+               ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+                                  RDMACG_RESOURCE_HCA_OBJECT);
                ib_uverbs_release_uevent(file, uevent);
                kfree(uevent);
        }
@@ -310,6 +321,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
                idr_remove_uobj(&ib_uverbs_cq_idr, uobj);
                ib_destroy_cq(cq);
+               ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+                                  RDMACG_RESOURCE_HCA_OBJECT);
                ib_uverbs_release_ucq(file, ev_file, ucq);
                kfree(ucq);
        }
@@ -319,6 +332,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
                idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
                ib_dereg_mr(mr);
+               ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+                                  RDMACG_RESOURCE_HCA_OBJECT);
                kfree(uobj);
        }
 
@@ -339,11 +354,16 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
                idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
                ib_dealloc_pd(pd);
+               ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+                                  RDMACG_RESOURCE_HCA_OBJECT);
                kfree(uobj);
        }
 
        put_pid(context->tgid);
 
+       ib_rdmacg_uncharge(&context->cg_obj, context->device,
+                          RDMACG_RESOURCE_HCA_HANDLE);
+
        return context->device->dealloc_ucontext(context);
 }
 
diff --git a/include/linux/cgroup_rdma.h b/include/linux/cgroup_rdma.h
new file mode 100644 (file)
index 0000000..e94290b
--- /dev/null
@@ -0,0 +1,53 @@
+/*
+ * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License. See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#ifndef _CGROUP_RDMA_H
+#define _CGROUP_RDMA_H
+
+#include <linux/cgroup.h>
+
+enum rdmacg_resource_type {
+       RDMACG_RESOURCE_HCA_HANDLE,
+       RDMACG_RESOURCE_HCA_OBJECT,
+       RDMACG_RESOURCE_MAX,
+};
+
+#ifdef CONFIG_CGROUP_RDMA
+
+struct rdma_cgroup {
+       struct cgroup_subsys_state      css;
+
+       /*
+        * head to keep track of all resource pools
+        * that belongs to this cgroup.
+        */
+       struct list_head                rpools;
+};
+
+struct rdmacg_device {
+       struct list_head        dev_node;
+       struct list_head        rpools;
+       char                    *name;
+};
+
+/*
+ * APIs for RDMA/IB stack to publish when a device wants to
+ * participate in resource accounting
+ */
+int rdmacg_register_device(struct rdmacg_device *device);
+void rdmacg_unregister_device(struct rdmacg_device *device);
+
+/* APIs for RDMA/IB stack to charge/uncharge pool specific resources */
+int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
+                     struct rdmacg_device *device,
+                     enum rdmacg_resource_type index);
+void rdmacg_uncharge(struct rdma_cgroup *cg,
+                    struct rdmacg_device *device,
+                    enum rdmacg_resource_type index);
+#endif /* CONFIG_CGROUP_RDMA */
+#endif /* _CGROUP_RDMA_H */
index 0df0336acee9ec10fef79f878c299f1aae33b143..d0e597c445854531b21a0915d1c42190833e5a2e 100644 (file)
@@ -56,6 +56,10 @@ SUBSYS(hugetlb)
 SUBSYS(pids)
 #endif
 
+#if IS_ENABLED(CONFIG_CGROUP_RDMA)
+SUBSYS(rdma)
+#endif
+
 /*
  * The following subsystems are not supported on the default hierarchy.
  */
index 958a24d8fae794547c486b5b025f3815c96f82e7..63896a477896982d4863ee9dba757f2729cd9a06 100644 (file)
@@ -60,6 +60,7 @@
 #include <linux/atomic.h>
 #include <linux/mmu_notifier.h>
 #include <linux/uaccess.h>
+#include <linux/cgroup_rdma.h>
 
 extern struct workqueue_struct *ib_wq;
 extern struct workqueue_struct *ib_comp_wq;
@@ -1331,6 +1332,12 @@ struct ib_fmr_attr {
 
 struct ib_umem;
 
+struct ib_rdmacg_object {
+#ifdef CONFIG_CGROUP_RDMA
+       struct rdma_cgroup      *cg;            /* owner rdma cgroup */
+#endif
+};
+
 struct ib_ucontext {
        struct ib_device       *device;
        struct list_head        pd_list;
@@ -1363,6 +1370,8 @@ struct ib_ucontext {
        struct list_head        no_private_counters;
        int                     odp_mrs_count;
 #endif
+
+       struct ib_rdmacg_object cg_obj;
 };
 
 struct ib_uobject {
@@ -1370,6 +1379,7 @@ struct ib_uobject {
        struct ib_ucontext     *context;        /* associated user context */
        void                   *object;         /* containing object */
        struct list_head        list;           /* link to context's list */
+       struct ib_rdmacg_object cg_obj;         /* rdmacg object */
        int                     id;             /* index into kernel idr */
        struct kref             ref;
        struct rw_semaphore     mutex;          /* protects .live */
@@ -2118,6 +2128,10 @@ struct ib_device {
        struct attribute_group       *hw_stats_ag;
        struct rdma_hw_stats         *hw_stats;
 
+#ifdef CONFIG_CGROUP_RDMA
+       struct rdmacg_device         cg_device;
+#endif
+
        /**
         * The following mandatory functions are used only at device
         * registration.  Keep functions such as these at the end of this
index 223b734abccdc3b7f3baae457261dd66862fcd1d..ef80d46a32b633f671121344e77392e6055e3cf0 100644 (file)
@@ -1090,6 +1090,16 @@ config CGROUP_PIDS
          since the PIDs limit only affects a process's ability to fork, not to
          attach to a cgroup.
 
+config CGROUP_RDMA
+       bool "RDMA controller"
+       help
+         Provides enforcement of RDMA resources defined by IB stack.
+         It is fairly easy for consumers to exhaust RDMA resources, which
+         can result in resource unavailability for other consumers.
+         RDMA controller is designed to stop this from happening.
+         Attaching processes with active RDMA resources to the cgroup
+         hierarchy is allowed even if it can cross the hierarchy's limit.
+
 config CGROUP_FREEZER
        bool "Freezer controller"
        help
index 6d42a3211164d1f1866895effd7da4c4f8975447..387348a40c647ba8f9ee4e34ba1f363a9449a458 100644 (file)
@@ -2,4 +2,5 @@ obj-y := cgroup.o namespace.o cgroup-v1.o
 
 obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
 obj-$(CONFIG_CGROUP_PIDS) += pids.o
+obj-$(CONFIG_CGROUP_RDMA) += rdma.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
new file mode 100644 (file)
index 0000000..defad3c
--- /dev/null
@@ -0,0 +1,619 @@
+/*
+ * RDMA resource limiting controller for cgroups.
+ *
+ * Used to allow a cgroup hierarchy to stop processes from consuming
+ * additional RDMA resources after a certain limit is reached.
+ *
+ * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License. See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/cgroup.h>
+#include <linux/parser.h>
+#include <linux/cgroup_rdma.h>
+
+#define RDMACG_MAX_STR "max"
+
+/*
+ * Protects list of resource pools maintained on per cgroup basis
+ * and rdma device list.
+ */
+static DEFINE_MUTEX(rdmacg_mutex);
+static LIST_HEAD(rdmacg_devices);
+
+/* Selects which per-resource value an interface file reports. */
+enum rdmacg_file_type {
+       RDMACG_RESOURCE_TYPE_MAX,       /* configured limits ("max" file) */
+       RDMACG_RESOURCE_TYPE_STAT,      /* current usage ("current" file) */
+};
+
+/*
+ * User-visible resource names, indexed by the RDMACG_RESOURCE_* constants.
+ * These are the keys accepted when limits are written and printed when
+ * limits or usage are read. Add an entry here whenever a new resource is
+ * added/defined at the IB verb/core layer.
+ */
+static char const *rdmacg_resource_names[] = {
+       [RDMACG_RESOURCE_HCA_HANDLE]    = "hca_handle",
+       [RDMACG_RESOURCE_HCA_OBJECT]    = "hca_object",
+};
+
+/* resource tracker for each resource of rdma cgroup */
+struct rdmacg_resource {
+       int max;        /* configured limit; S32_MAX is parsed/shown as "max" */
+       int usage;      /* number of charges currently held */
+};
+
+/*
+ * resource pool object which represents per cgroup, per device
+ * resources. There are multiple instances of this object per cgroup,
+ * therefore it cannot be embedded within rdma_cgroup structure. It
+ * is maintained as list.
+ */
+struct rdmacg_resource_pool {
+       /* device whose resources this pool accounts */
+       struct rdmacg_device    *device;
+       /* per-resource limit and usage, indexed like rdmacg_resource_names[] */
+       struct rdmacg_resource  resources[RDMACG_RESOURCE_MAX];
+
+       /* link in the owning cgroup's rpools list */
+       struct list_head        cg_node;
+       /* link in the device's rpools list */
+       struct list_head        dev_node;
+
+       /*
+        * number of charges currently held in this pool, summed over all
+        * resource types: incremented per charge, decremented per uncharge
+        */
+       u64                     usage_sum;
+       /* number of resources whose limit is currently S32_MAX ("max") */
+       int                     num_max_cnt;
+};
+
+/*
+ * Map a css to the rdma_cgroup that embeds it.
+ * NOTE(review): the hierarchy walks in this file stop when this returns
+ * NULL for a NULL css, which presumably relies on css being the first
+ * member of struct rdma_cgroup -- confirm against cgroup_rdma.h.
+ */
+static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css)
+{
+       struct rdma_cgroup *cg = container_of(css, struct rdma_cgroup, css);
+
+       return cg;
+}
+
+/* Return the rdma cgroup one level up from @cg in the hierarchy. */
+static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg)
+{
+       struct cgroup_subsys_state *parent_css = cg->css.parent;
+
+       return css_rdmacg(parent_css);
+}
+
+/*
+ * Look up the rdma cgroup of the current task via task_get_css(). The css
+ * obtained here is later released with css_put() on the uncharge path.
+ */
+static inline struct rdma_cgroup *get_current_rdmacg(void)
+{
+       struct cgroup_subsys_state *css = task_get_css(current, rdma_cgrp_id);
+
+       return css_rdmacg(css);
+}
+
+/*
+ * Set the limit of resource @index in @rpool to @new_max, keeping
+ * num_max_cnt (the number of resources sitting at S32_MAX) in sync.
+ */
+static void set_resource_limit(struct rdmacg_resource_pool *rpool,
+                              int index, int new_max)
+{
+       struct rdmacg_resource *res = &rpool->resources[index];
+       bool was_max = (res->max == S32_MAX);
+       bool now_max = (new_max == S32_MAX);
+
+       /* only a transition into or out of "max" changes the counter */
+       if (now_max && !was_max)
+               rpool->num_max_cnt++;
+       else if (!now_max && was_max)
+               rpool->num_max_cnt--;
+
+       res->max = new_max;
+}
+
+/* Reset every resource limit of @rpool to S32_MAX ("max"). */
+static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool)
+{
+       int idx = 0;
+
+       while (idx < RDMACG_RESOURCE_MAX) {
+               set_resource_limit(rpool, idx, S32_MAX);
+               idx++;
+       }
+}
+
+/*
+ * Unlink @rpool from both its device list and its cgroup list, then free
+ * it. Caller must hold rdmacg_mutex.
+ */
+static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool)
+{
+       lockdep_assert_held(&rdmacg_mutex);
+
+       list_del(&rpool->dev_node);
+       list_del(&rpool->cg_node);
+
+       kfree(rpool);
+}
+
+/*
+ * Return the resource pool of @cg for @device, or NULL if @cg has none.
+ * Caller must hold rdmacg_mutex.
+ */
+static struct rdmacg_resource_pool *
+find_cg_rpool_locked(struct rdma_cgroup *cg,
+                    struct rdmacg_device *device)
+{
+       struct rdmacg_resource_pool *iter;
+
+       lockdep_assert_held(&rdmacg_mutex);
+
+       list_for_each_entry(iter, &cg->rpools, cg_node) {
+               if (iter->device != device)
+                       continue;
+               return iter;
+       }
+
+       return NULL;
+}
+
+/*
+ * Return the resource pool of @cg for @device, creating one with every
+ * limit at "max" if none exists yet. Returns ERR_PTR(-ENOMEM) when the
+ * allocation fails. The lookup helper asserts that rdmacg_mutex is held.
+ */
+static struct rdmacg_resource_pool *
+get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
+{
+       struct rdmacg_resource_pool *rpool = find_cg_rpool_locked(cg, device);
+
+       if (!rpool) {
+               rpool = kzalloc(sizeof(*rpool), GFP_KERNEL);
+               if (!rpool)
+                       return ERR_PTR(-ENOMEM);
+
+               rpool->device = device;
+               /* zeroed limits -> all "max", which also sets num_max_cnt */
+               set_all_resource_max_limit(rpool);
+
+               /* make the new pool reachable from both cgroup and device */
+               INIT_LIST_HEAD(&rpool->cg_node);
+               INIT_LIST_HEAD(&rpool->dev_node);
+               list_add_tail(&rpool->cg_node, &cg->rpools);
+               list_add_tail(&rpool->dev_node, &device->rpools);
+       }
+
+       return rpool;
+}
+
+/**
+ * uncharge_cg_locked - uncharge one resource from a single rdma cgroup
+ * @cg: pointer to cg whose resource pool is uncharged
+ * @device: pointer to rdmacg device
+ * @index: index of the resource to uncharge in cg (resource pool)
+ *
+ * Only @cg itself is uncharged; walking up the hierarchy is done by the
+ * caller. Must be called with rdmacg_mutex held.
+ *
+ * It also frees the resource pool which was created as part of
+ * charging operation when there are no resources attached to
+ * resource pool.
+ */
+static void
+uncharge_cg_locked(struct rdma_cgroup *cg,
+                  struct rdmacg_device *device,
+                  enum rdmacg_resource_type index)
+{
+       struct rdmacg_resource_pool *rpool;
+
+       rpool = find_cg_rpool_locked(cg, device);
+
+       /*
+        * rpool cannot be null at this stage. Let kernel operate in case
+        * if there a bug in IB stack or rdma controller, instead of crashing
+        * the system.
+        */
+       if (unlikely(!rpool)) {
+               /* arguments follow the order named in the message */
+               pr_warn("Invalid device %p or rdma cgroup %p\n", device, cg);
+               return;
+       }
+
+       rpool->resources[index].usage--;
+
+       /*
+        * A negative count (or overflow) is invalid,
+        * it indicates a bug in the rdma controller.
+        */
+       WARN_ON_ONCE(rpool->resources[index].usage < 0);
+       rpool->usage_sum--;
+       if (rpool->usage_sum == 0 &&
+           rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
+               /*
+                * No user of the rpool and all entries are set to max, so
+                * safe to delete this rpool.
+                */
+               free_cg_rpool_locked(rpool);
+       }
+}
+
+/**
+ * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
+ * @cg: pointer to the cgroup at which uncharging starts
+ * @device: pointer to rdmacg device
+ * @stop_cg: while traversing the hierarchy, stop uncharging when this
+ *           cgroup is reached (it is not uncharged itself); NULL walks
+ *           all the way up
+ * @index: index of the resource to uncharge in cg in given resource pool
+ *
+ * Also drops one css reference on @cg once the walk is done.
+ */
+static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
+                                    struct rdmacg_device *device,
+                                    struct rdma_cgroup *stop_cg,
+                                    enum rdmacg_resource_type index)
+{
+       struct rdma_cgroup *pos = cg;
+
+       mutex_lock(&rdmacg_mutex);
+
+       while (pos != stop_cg) {
+               uncharge_cg_locked(pos, device, index);
+               pos = parent_rdmacg(pos);
+       }
+
+       mutex_unlock(&rdmacg_mutex);
+
+       css_put(&cg->css);
+}
+
+/**
+ * rdmacg_uncharge - hierarchically uncharge rdma resource count
+ * @cg: pointer to the cgroup that was charged for the resource
+ * @device: pointer to rdmacg device
+ * @index: index of the resource to uncharge in cgroup in given resource pool
+ *
+ * Uncharges @cg and every ancestor. An out-of-range @index is ignored.
+ */
+void rdmacg_uncharge(struct rdma_cgroup *cg,
+                    struct rdmacg_device *device,
+                    enum rdmacg_resource_type index)
+{
+       if (index < RDMACG_RESOURCE_MAX)
+               rdmacg_uncharge_hierarchy(cg, device, NULL, index);
+}
+EXPORT_SYMBOL(rdmacg_uncharge);
+
+/**
+ * rdmacg_try_charge - hierarchically try to charge the rdma resource
+ * @rdmacg: pointer to rdma cgroup which will own this resource
+ * @device: pointer to rdmacg device
+ * @index: index of the resource to charge in cgroup (resource pool)
+ *
+ * This function follows charging resource in hierarchical way.
+ * It will fail if the charge would cause the new value to exceed the
+ * hierarchical limit.
+ * Returns 0 if the charge succeeded, otherwise -EAGAIN (a limit would be
+ * exceeded), -ENOMEM (resource pool allocation failed) or -EINVAL
+ * (@index out of range).
+ * On success *@rdmacg is set to the charged cgroup and the css reference
+ * taken here stays held until the charge is dropped via rdmacg_uncharge().
+ * On failure every partial charge is rolled back and the reference is
+ * released.
+ *
+ * Charger needs to account resources on two criteria.
+ * (a) per cgroup & (b) per device resource usage.
+ * Per cgroup resource usage ensures that tasks of cgroup doesn't cross
+ * the configured limits. Per device provides granular configuration
+ * in multi device usage. It allocates resource pool in the hierarchy
+ * for each parent it come across for first resource. Later on resource
+ * pool will be available. Therefore it will be much faster thereon
+ * to charge/uncharge.
+ */
+int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
+                     struct rdmacg_device *device,
+                     enum rdmacg_resource_type index)
+{
+       struct rdma_cgroup *cg, *p;
+       struct rdmacg_resource_pool *rpool;
+       s64 new;
+       int ret = 0;
+
+       if (index >= RDMACG_RESOURCE_MAX)
+               return -EINVAL;
+
+       /*
+        * hold on to css, as cgroup can be removed but resource
+        * accounting happens on css.
+        */
+       cg = get_current_rdmacg();
+
+       mutex_lock(&rdmacg_mutex);
+       for (p = cg; p; p = parent_rdmacg(p)) {
+               rpool = get_cg_rpool_locked(p, device);
+               if (IS_ERR(rpool)) {
+                       ret = PTR_ERR(rpool);
+                       goto err;
+               }
+
+               /*
+                * Widen before adding: with a "max" limit usage may reach
+                * S32_MAX, and an int addition would then overflow.
+                */
+               new = (s64)rpool->resources[index].usage + 1;
+               if (new > rpool->resources[index].max) {
+                       ret = -EAGAIN;
+                       goto err;
+               }
+
+               rpool->resources[index].usage = new;
+               rpool->usage_sum++;
+       }
+       mutex_unlock(&rdmacg_mutex);
+
+       *rdmacg = cg;
+       return 0;
+
+err:
+       mutex_unlock(&rdmacg_mutex);
+       /* undo the charges made on cg..p (p excluded) and put the css */
+       rdmacg_uncharge_hierarchy(cg, device, p, index);
+       return ret;
+}
+EXPORT_SYMBOL(rdmacg_try_charge);
+
+/**
+ * rdmacg_register_device - register rdmacg device to rdma controller.
+ * @device: pointer to rdmacg device whose resources need to be accounted.
+ *
+ * If IB stack wish a device to participate in rdma cgroup resource
+ * tracking, it must invoke this API to register with rdma cgroup before
+ * any user space application can start using the RDMA resources.
+ * The device is looked up and listed by @device->name afterwards, so the
+ * caller presumably has to set the name before registering.
+ * Always returns 0.
+ */
+int rdmacg_register_device(struct rdmacg_device *device)
+{
+       INIT_LIST_HEAD(&device->rpools);
+       INIT_LIST_HEAD(&device->dev_node);
+
+       /* publish the device on the global list under the controller lock */
+       mutex_lock(&rdmacg_mutex);
+       list_add_tail(&device->dev_node, &rdmacg_devices);
+       mutex_unlock(&rdmacg_mutex);
+
+       return 0;
+}
+EXPORT_SYMBOL(rdmacg_register_device);
+
+/**
+ * rdmacg_unregister_device - unregister rdmacg device from rdma controller.
+ * @device: pointer to rdmacg device which was previously registered with rdma
+ *          controller using rdmacg_register_device().
+ *
+ * IB stack must invoke this after all the resources of the IB device
+ * are destroyed and after ensuring that no more resources will be created
+ * when this API is invoked.
+ */
+void rdmacg_unregister_device(struct rdmacg_device *device)
+{
+       struct rdmacg_resource_pool *rpool;
+
+       /*
+        * Taking the lock synchronizes with limit updates and usage
+        * queries coming in through the cgroup interface files.
+        */
+       mutex_lock(&rdmacg_mutex);
+
+       /* hide the device from name lookups and listings first */
+       list_del_init(&device->dev_node);
+
+       /* each free unlinks the pool, so keep taking the head until empty */
+       while (!list_empty(&device->rpools)) {
+               rpool = list_first_entry(&device->rpools,
+                                        struct rdmacg_resource_pool,
+                                        dev_node);
+               free_cg_rpool_locked(rpool);
+       }
+
+       mutex_unlock(&rdmacg_mutex);
+}
+EXPORT_SYMBOL(rdmacg_unregister_device);
+
+/*
+ * Parse one "<name>=<value>" token.
+ * @c: token to parse; modified in place (the '=' is overwritten)
+ * @intval: receives the parsed limit, S32_MAX when the value is "max"
+ *
+ * <value> must be either a non-negative integer or exactly "max"; an
+ * empty value or a mere prefix of "max" is rejected.
+ * Returns the index of <name> in rdmacg_resource_names[] on success,
+ * -EINVAL on a malformed token, unknown name or invalid value.
+ */
+static int parse_resource(char *c, int *intval)
+{
+       substring_t argstr;
+       char *name, *value = c;
+       int i;
+
+       name = strsep(&value, "=");
+       if (!name || !value)
+               return -EINVAL;
+
+       for (i = 0; i < RDMACG_RESOURCE_MAX; i++)
+               if (!strcmp(rdmacg_resource_names[i], name))
+                       break;
+       if (i == RDMACG_RESOURCE_MAX)
+               return -EINVAL;
+
+       /* exact match only: "", "m" and "ma" must not be taken for "max" */
+       if (!strcmp(value, RDMACG_MAX_STR)) {
+               *intval = S32_MAX;
+               return i;
+       }
+
+       argstr.from = value;
+       argstr.to = value + strlen(value);
+       if (match_int(&argstr, intval) < 0 || *intval < 0)
+               return -EINVAL;
+
+       return i;
+}
+
+/*
+ * Parse a space separated list of "<name>=<value>" tokens.
+ * @options: string to parse, consumed in place; may be NULL (no tokens)
+ * @new_limits: array indexed by resource; parsed limits are stored here
+ * @enables: bitmap in which the bit of every parsed resource is set
+ *
+ * Returns 0 on success or -EINVAL as soon as a token fails to parse.
+ */
+static int rdmacg_parse_limits(char *options,
+                              int *new_limits, unsigned long *enables)
+{
+       char *token;
+
+       while ((token = strsep(&options, " ")) != NULL) {
+               int value;
+               int index = parse_resource(token, &value);
+
+               if (index < 0)
+                       return -EINVAL;
+
+               new_limits[index] = value;
+               *enables |= BIT(index);
+       }
+
+       return 0;
+}
+
+/*
+ * Find a registered device by @name. Returns NULL when no such device is
+ * registered. Caller must hold rdmacg_mutex.
+ */
+static struct rdmacg_device *rdmacg_get_device_locked(const char *name)
+{
+       struct rdmacg_device *dev;
+
+       lockdep_assert_held(&rdmacg_mutex);
+
+       list_for_each_entry(dev, &rdmacg_devices, dev_node) {
+               if (strcmp(name, dev->name) == 0)
+                       return dev;
+       }
+
+       return NULL;
+}
+
+/*
+ * Write handler of the "max" file.
+ *
+ * Expects "<device> <name>=<value> [<name>=<value> ...]" in @buf and
+ * applies the given limits to the resource pool of this cgroup for
+ * <device>. Resources not mentioned keep their current limit.
+ *
+ * Returns @nbytes on success, -EINVAL on a parse error, -ENOMEM on
+ * allocation failure or -ENODEV when <device> is not registered.
+ */
+static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
+                                      char *buf, size_t nbytes, loff_t off)
+{
+       struct rdma_cgroup *cg = css_rdmacg(of_css(of));
+       const char *dev_name;
+       struct rdmacg_resource_pool *rpool;
+       struct rdmacg_device *device;
+       char *options = strstrip(buf);
+       int *new_limits;
+       unsigned long enables = 0;
+       int i = 0, ret = 0;
+
+       /* extract the device name first */
+       dev_name = strsep(&options, " ");
+       if (!dev_name) {
+               ret = -EINVAL;
+               goto err;
+       }
+
+       /* scratch table so that nothing is applied unless all tokens parse */
+       new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL);
+       if (!new_limits) {
+               ret = -ENOMEM;
+               goto err;
+       }
+
+       ret = rdmacg_parse_limits(options, new_limits, &enables);
+       if (ret)
+               goto parse_err;
+
+       /* acquire lock to synchronize with hot plug devices */
+       mutex_lock(&rdmacg_mutex);
+
+       device = rdmacg_get_device_locked(dev_name);
+       if (!device) {
+               ret = -ENODEV;
+               goto dev_err;
+       }
+
+       /* creates the pool on first use, with every limit at "max" */
+       rpool = get_cg_rpool_locked(cg, device);
+       if (IS_ERR(rpool)) {
+               ret = PTR_ERR(rpool);
+               goto dev_err;
+       }
+
+       /* now set the new limits of the rpool */
+       for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX)
+               set_resource_limit(rpool, i, new_limits[i]);
+
+       if (rpool->usage_sum == 0 &&
+           rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
+               /*
+                * No user of the rpool and all entries are set to max, so
+                * safe to delete this rpool.
+                */
+               free_cg_rpool_locked(rpool);
+       }
+
+       /* the success path falls through the labels below with ret == 0 */
+dev_err:
+       mutex_unlock(&rdmacg_mutex);
+
+parse_err:
+       kfree(new_limits);
+
+err:
+       /* a zero ret means success: report the whole write as consumed */
+       return ret ?: nbytes;
+}
+
+/*
+ * Emit "<name>=<value> " for every resource into @sf.
+ *
+ * The file type stored in the cftype selects limits or usage. A NULL
+ * @rpool stands for "no pool yet": limits read as "max", usage as 0.
+ * Any value equal to S32_MAX is printed as "max".
+ */
+static void print_rpool_values(struct seq_file *sf,
+                              struct rdmacg_resource_pool *rpool)
+{
+       enum rdmacg_file_type sf_type = seq_cft(sf)->private;
+       bool show_max = (sf_type == RDMACG_RESOURCE_TYPE_MAX);
+       int i;
+
+       for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
+               u32 value;
+
+               if (!rpool)
+                       value = show_max ? S32_MAX : 0;
+               else if (show_max)
+                       value = rpool->resources[i].max;
+               else
+                       value = rpool->resources[i].usage;
+
+               seq_puts(sf, rdmacg_resource_names[i]);
+               seq_putc(sf, '=');
+
+               if (value != S32_MAX)
+                       seq_printf(sf, "%d", value);
+               else
+                       seq_puts(sf, RDMACG_MAX_STR);
+
+               seq_putc(sf, ' ');
+       }
+}
+
+/*
+ * seq_show handler shared by the "max" and "current" files: prints one
+ * line per registered device, "<device> <name>=<value> ...".
+ */
+static int rdmacg_resource_read(struct seq_file *sf, void *v)
+{
+       struct rdma_cgroup *cg = css_rdmacg(seq_css(sf));
+       struct rdmacg_device *dev;
+
+       mutex_lock(&rdmacg_mutex);
+
+       list_for_each_entry(dev, &rdmacg_devices, dev_node) {
+               seq_printf(sf, "%s ", dev->name);
+               /* a cgroup without a pool for this device passes NULL here */
+               print_rpool_values(sf, find_cg_rpool_locked(cg, dev));
+               seq_putc(sf, '\n');
+       }
+
+       mutex_unlock(&rdmacg_mutex);
+       return 0;
+}
+
+/*
+ * Interface files of the rdma controller. Both share one show handler;
+ * .private tells it whether to print limits or usage. Neither file is
+ * created on the root cgroup.
+ */
+static struct cftype rdmacg_files[] = {
+       {
+               /* read/write: per-device resource limits */
+               .name = "max",
+               .write = rdmacg_resource_set_max,
+               .seq_show = rdmacg_resource_read,
+               .private = RDMACG_RESOURCE_TYPE_MAX,
+               .flags = CFTYPE_NOT_ON_ROOT,
+       },
+       {
+               /* read-only: per-device resource usage */
+               .name = "current",
+               .seq_show = rdmacg_resource_read,
+               .private = RDMACG_RESOURCE_TYPE_STAT,
+               .flags = CFTYPE_NOT_ON_ROOT,
+       },
+       { }     /* terminate */
+};
+
+/*
+ * css_alloc callback: allocate a zeroed rdma_cgroup with an empty
+ * resource pool list. @parent is not used.
+ * Returns the embedded css or ERR_PTR(-ENOMEM).
+ */
+static struct cgroup_subsys_state *
+rdmacg_css_alloc(struct cgroup_subsys_state *parent)
+{
+       struct rdma_cgroup *cg = kzalloc(sizeof(*cg), GFP_KERNEL);
+
+       if (!cg)
+               return ERR_PTR(-ENOMEM);
+
+       INIT_LIST_HEAD(&cg->rpools);
+
+       return &cg->css;
+}
+
+/* css_free callback: release the rdma_cgroup that embeds @css. */
+static void rdmacg_css_free(struct cgroup_subsys_state *css)
+{
+       kfree(css_rdmacg(css));
+}
+
+/**
+ * rdmacg_css_offline - cgroup css_offline callback
+ * @css: css of interest
+ *
+ * This function is called when @css is about to go away and responsible
+ * for shooting down all rdmacg associated with @css. As part of that it
+ * marks all the resource pool entries to max value, so that when resources are
+ * uncharged, associated resource pool can be freed as well.
+ * Resource pools that hold no charges at this point are freed right away,
+ * since no later uncharge would ever release them.
+ */
+static void rdmacg_css_offline(struct cgroup_subsys_state *css)
+{
+       struct rdma_cgroup *cg = css_rdmacg(css);
+       struct rdmacg_resource_pool *rpool, *tmp;
+
+       mutex_lock(&rdmacg_mutex);
+
+       /* _safe variant: the current entry may be unlinked and freed */
+       list_for_each_entry_safe(rpool, tmp, &cg->rpools, cg_node) {
+               set_all_resource_max_limit(rpool);
+
+               /*
+                * A pool that only carried a configured limit has no
+                * pending uncharge to trigger its release. Free it now so
+                * it does not stay linked to this cgroup's list after the
+                * cgroup itself has been freed.
+                */
+               if (rpool->usage_sum == 0)
+                       free_cg_rpool_locked(rpool);
+       }
+
+       mutex_unlock(&rdmacg_mutex);
+}
+
+/*
+ * rdma controller descriptor: css lifecycle callbacks plus the interface
+ * files, which are the same table for legacy and default cftypes.
+ */
+struct cgroup_subsys rdma_cgrp_subsys = {
+       .css_alloc      = rdmacg_css_alloc,
+       .css_free       = rdmacg_css_free,
+       .css_offline    = rdmacg_css_offline,
+       .legacy_cftypes = rdmacg_files,
+       .dfl_cftypes    = rdmacg_files,
+};