bpf: introduce cgroup storage maps

author Roman Gushchin <guro@fb.com>

Thu, 2 Aug 2018 21:27:18 +0000 (14:27 -0700)

committer Daniel Borkmann <daniel@iogearbox.net>

Thu, 2 Aug 2018 22:47:32 +0000 (00:47 +0200)
author Roman Gushchin <guro@fb.com>
Thu, 2 Aug 2018 21:27:18 +0000 (14:27 -0700)
committer Daniel Borkmann <daniel@iogearbox.net>
Thu, 2 Aug 2018 22:47:32 +0000 (00:47 +0200)
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h

index d50c2f0a655ae3f95271d5f8de40f8eabc917c65..7d00d58869edc785590961de2fd176df4a0e6c4a 100644 (file)
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -4,19 +4,39 @@
  
  #include <linux/errno.h>
  #include <linux/jump_label.h>
+#include <linux/rbtree.h>
  #include <uapi/linux/bpf.h>
  
  struct sock;
  struct sockaddr;
  struct cgroup;
  struct sk_buff;
+struct bpf_map;
+struct bpf_prog;
  struct bpf_sock_ops_kern;
+struct bpf_cgroup_storage;
  
  #ifdef CONFIG_CGROUP_BPF
  
  extern struct static_key_false cgroup_bpf_enabled_key;
  #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key)
  
+struct bpf_cgroup_storage_map;
+
+/*
+ * Value buffer of one cgroup storage element.  An update swaps in a whole
+ * new buffer and releases the old one with kfree_rcu(), hence the embedded
+ * rcu_head.
+ */
+struct bpf_storage_buffer {
+       struct rcu_head rcu;
+       /* map->value_size bytes, allocated together with the struct */
+       char data[];
+};
+
+/*
+ * One storage element: the data behind a single (cgroup, attach type)
+ * entry of a BPF_MAP_TYPE_CGROUP_STORAGE map.
+ */
+struct bpf_cgroup_storage {
+       /* current value; replaced with xchg() on map update */
+       struct bpf_storage_buffer *buf;
+       /* owning map, set by bpf_cgroup_storage_alloc() */
+       struct bpf_cgroup_storage_map *map;
+       /* lookup key, filled in by bpf_cgroup_storage_link() */
+       struct bpf_cgroup_storage_key key;
+       /* entry in map->list, walked by get_next_key */
+       struct list_head list;
+       /* entry in map->root, used for lookup by key */
+       struct rb_node node;
+       /* used by bpf_cgroup_storage_free() for kfree_rcu() */
+       struct rcu_head rcu;
+};
+
  struct bpf_prog_list {
         struct list_head node;
         struct bpf_prog *prog;
@@ -77,6 +97,15 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
  int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
                                       short access, enum bpf_attach_type type);
  
+/*
+ * cgroup storage API, implemented in kernel/bpf/local_storage.c.
+ * _alloc() returns NULL when the program uses no cgroup storage map and an
+ * ERR_PTR() on failure; _free(), _link() and _unlink() accept a NULL
+ * storage and do nothing.  _assign() returns 0 or -EBUSY.
+ */
+struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog);
+void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage);
+void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage,
+                            struct cgroup *cgroup,
+                            enum bpf_attach_type type);
+void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage);
+int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map);
+void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map);
+
+
  /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
  #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)                            \
  ({                                                                           \
@@ -221,6 +250,15 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr,
         return -EINVAL;
  }
  
+/*
+ * No-op stand-ins used when CONFIG_CGROUP_BPF is disabled: assigning a map
+ * always succeeds, allocation yields no storage (NULL), and release/free
+ * do nothing.
+ */
+static inline int bpf_cgroup_storage_assign(struct bpf_prog *prog,
+                                           struct bpf_map *map) { return 0; }
+static inline void bpf_cgroup_storage_release(struct bpf_prog *prog,
+                                             struct bpf_map *map) {}
+static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(
+       struct bpf_prog *prog) { return NULL; }
+static inline void bpf_cgroup_storage_free(
+       struct bpf_cgroup_storage *storage) {}
+
+
  #define cgroup_bpf_enabled (0)
  #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
  #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
diff --git a/include/linux/bpf.h b/include/linux/bpf.h

index 5a4a256473c34f2c4d3c155588821da1db958873..9d1e4727495e028e7dccdade36dc6373bfdb9fcd 100644 (file)
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -282,6 +282,7 @@ struct bpf_prog_aux {
         struct bpf_prog *prog;
         struct user_struct *user;
         u64 load_time; /* ns since boottime */
+       struct bpf_map *cgroup_storage;
         char name[BPF_OBJ_NAME_LEN];
  #ifdef CONFIG_SECURITY
         void *security;
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h

index c5700c2d554900f58e53dd29b868593af349f533..add08be53b6f8620e722b4a1bac67f68a72be333 100644 (file)
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -37,6 +37,9 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_PERF_EVENT_ARRAY, perf_event_array_map_ops)
  #ifdef CONFIG_CGROUPS
  BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY, cgroup_array_map_ops)
  #endif
+#ifdef CONFIG_CGROUP_BPF
+BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE, cgroup_storage_map_ops)
+#endif
  BPF_MAP_TYPE(BPF_MAP_TYPE_HASH, htab_map_ops)
  BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_HASH, htab_percpu_map_ops)
  BPF_MAP_TYPE(BPF_MAP_TYPE_LRU_HASH, htab_lru_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h

index 0ebaaf7f35681d709fc798ff4fcadd4190ecc4ac..b10118ee5afeea687b65d19ddb659e201182abfd 100644 (file)
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -75,6 +75,11 @@ struct bpf_lpm_trie_key {
         __u8    data[0];        /* Arbitrary size */
  };
  
+/* Key of a BPF_MAP_TYPE_CGROUP_STORAGE map element */
+struct bpf_cgroup_storage_key {
+       __u64   cgroup_inode_id;        /* cgroup inode id */
+       __u32   attach_type;            /* program attach type */
+};
+
  /* BPF syscall commands, see bpf(2) man-page for details. */
  enum bpf_cmd {
         BPF_MAP_CREATE,
@@ -120,6 +125,7 @@ enum bpf_map_type {
         BPF_MAP_TYPE_CPUMAP,
         BPF_MAP_TYPE_XSKMAP,
         BPF_MAP_TYPE_SOCKHASH,
+       BPF_MAP_TYPE_CGROUP_STORAGE,
  };
  
  enum bpf_prog_type {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile

index f27f5496d6fe2d4d58d007d78cdc2d1efa682d87..e8906cbad81f09848a5524f5c699e7f3a8f5b12b 100644 (file)
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -3,6 +3,7 @@ obj-y := core.o
  
  obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
  obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
+obj-$(CONFIG_BPF_SYSCALL) += local_storage.o
  obj-$(CONFIG_BPF_SYSCALL) += disasm.o
  obj-$(CONFIG_BPF_SYSCALL) += btf.o
  ifeq ($(CONFIG_NET),y)
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c

new file mode 100644 (file)

index 0000000..f23d3fd
--- /dev/null
+++ b/kernel/bpf/local_storage.c
@@ -0,0 +1,376 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf-cgroup.h>
+#include <linux/bpf.h>
+#include <linux/bug.h>
+#include <linux/filter.h>
+#include <linux/mm.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+
+#ifdef CONFIG_CGROUP_BPF
+
+/*
+ * map_flags bits accepted when creating a cgroup storage map; any other
+ * bit makes cgroup_storage_map_alloc() fail with -EINVAL.
+ */
+#define LOCAL_STORAGE_CREATE_FLAG_MASK                                 \
+       (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
+/* A cgroup storage map: the generic bpf_map plus the set of linked storages. */
+struct bpf_cgroup_storage_map {
+       /* generic map; map_to_storage() converts back from a pointer to it */
+       struct bpf_map map;
+
+       /* taken (BH-safe) around every access to prog, root and list below */
+       spinlock_t lock;
+       /* the program bound by bpf_cgroup_storage_assign(), or NULL */
+       struct bpf_prog *prog;
+       /* linked storages, ordered by key, for lookup */
+       struct rb_root root;
+       /* the same storages as a list, for get_next_key iteration */
+       struct list_head list;
+};
+
+/* Convert a pointer to the embedded generic map into its cgroup storage map. */
+static struct bpf_cgroup_storage_map *map_to_storage(struct bpf_map *map)
+{
+       struct bpf_cgroup_storage_map *storage_map;
+
+       storage_map = container_of(map, struct bpf_cgroup_storage_map, map);
+
+       return storage_map;
+}
+
+/*
+ * Three-way comparison of two storage keys: ordered by cgroup inode id
+ * first and by attach type second.  Returns -1, 0 or 1.
+ */
+static int bpf_cgroup_storage_key_cmp(
+       const struct bpf_cgroup_storage_key *key1,
+       const struct bpf_cgroup_storage_key *key2)
+{
+       if (key1->cgroup_inode_id != key2->cgroup_inode_id)
+               return key1->cgroup_inode_id < key2->cgroup_inode_id ? -1 : 1;
+
+       if (key1->attach_type != key2->attach_type)
+               return key1->attach_type < key2->attach_type ? -1 : 1;
+
+       return 0;
+}
+
+/*
+ * Find the storage with the given key in the map's rbtree.
+ *
+ * @locked: true when the caller already holds map->lock; otherwise the
+ *          lock is taken (BH-safe) for the duration of the tree walk.
+ *
+ * Returns the matching storage, or NULL if the key is not present.
+ */
+static struct bpf_cgroup_storage *cgroup_storage_lookup(
+       struct bpf_cgroup_storage_map *map, struct bpf_cgroup_storage_key *key,
+       bool locked)
+{
+       struct bpf_cgroup_storage *found = NULL;
+       struct rb_node *pos;
+
+       if (!locked)
+               spin_lock_bh(&map->lock);
+
+       pos = map->root.rb_node;
+       while (pos) {
+               struct bpf_cgroup_storage *cur;
+               int cmp;
+
+               cur = container_of(pos, struct bpf_cgroup_storage, node);
+               cmp = bpf_cgroup_storage_key_cmp(key, &cur->key);
+
+               if (cmp == -1) {
+                       pos = pos->rb_left;
+               } else if (cmp == 1) {
+                       pos = pos->rb_right;
+               } else {
+                       found = cur;
+                       break;
+               }
+       }
+
+       if (!locked)
+               spin_unlock_bh(&map->lock);
+
+       return found;
+}
+
+/*
+ * Link @storage into the map's rbtree at the position given by
+ * storage->key.  Takes no lock itself; the caller in this file,
+ * bpf_cgroup_storage_link(), holds map->lock around the call.
+ *
+ * Returns 0 on success or -EEXIST if the key is already in the tree.
+ */
+static int cgroup_storage_insert(struct bpf_cgroup_storage_map *map,
+                                struct bpf_cgroup_storage *storage)
+{
+       struct rb_root *root = &map->root;
+       struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+       /* descend to the empty child slot where the new node belongs */
+       while (*new) {
+               struct bpf_cgroup_storage *this;
+
+               this = container_of(*new, struct bpf_cgroup_storage, node);
+
+               parent = *new;
+               switch (bpf_cgroup_storage_key_cmp(&storage->key, &this->key)) {
+               case -1:
+                       new = &((*new)->rb_left);
+                       break;
+               case 1:
+                       new = &((*new)->rb_right);
+                       break;
+               default:
+                       return -EEXIST;
+               }
+       }
+
+       /* attach at the slot found above, then rebalance the tree */
+       rb_link_node(&storage->node, parent, new);
+       rb_insert_color(&storage->node, root);
+
+       return 0;
+}
+
+/*
+ * map_lookup_elem: return a pointer to the value currently stored under
+ * @_key, or NULL when the map has no element with that key.
+ */
+static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *_key)
+{
+       struct bpf_cgroup_storage_key *key = _key;
+       struct bpf_cgroup_storage *storage;
+
+       storage = cgroup_storage_lookup(map_to_storage(_map), key, false);
+       if (storage)
+               return &READ_ONCE(storage->buf)->data[0];
+
+       return NULL;
+}
+
+/*
+ * map_update_elem: replace the value of an existing element.
+ *
+ * An update never creates an element (a missing key yields -ENOENT), so
+ * BPF_NOEXIST makes no sense here.  Only BPF_ANY and BPF_EXIST are
+ * accepted; every other flag value, including unknown bits, is rejected.
+ *
+ * Returns 0 on success, -EINVAL for unsupported flags, -ENOENT when no
+ * element has the given key, -ENOMEM when the new buffer cannot be
+ * allocated.
+ */
+static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
+                                     void *value, u64 flags)
+{
+       struct bpf_cgroup_storage_key *key = _key;
+       struct bpf_cgroup_storage *storage;
+       struct bpf_storage_buffer *new;
+
+       if (flags != BPF_ANY && flags != BPF_EXIST)
+               return -EINVAL;
+
+       storage = cgroup_storage_lookup(map_to_storage(map), key, false);
+       if (!storage)
+               return -ENOENT;
+
+       new = kmalloc_node(sizeof(struct bpf_storage_buffer) +
+                          map->value_size, __GFP_ZERO | GFP_USER,
+                          map->numa_node);
+       if (!new)
+               return -ENOMEM;
+
+       memcpy(&new->data[0], value, map->value_size);
+
+       /* swap in the new buffer; the old one is freed after a grace period */
+       new = xchg(&storage->buf, new);
+       kfree_rcu(new, rcu);
+
+       return 0;
+}
+
+/*
+ * map_get_next_key: iterate over the keys of all linked storages.
+ *
+ * With a NULL @_key the key of the first storage on the map's list is
+ * returned; otherwise the key of the storage following the one identified
+ * by @_key.
+ *
+ * Returns 0 and fills @_next_key on success, -ENOENT when the map is
+ * empty, @_key is not present, or @_key belongs to the last storage.
+ */
+static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key,
+                                      void *_next_key)
+{
+       struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+       struct bpf_cgroup_storage_key *key = _key;
+       struct bpf_cgroup_storage_key *next = _next_key;
+       struct bpf_cgroup_storage *storage;
+
+       spin_lock_bh(&map->lock);
+
+       if (list_empty(&map->list))
+               goto enoent;
+
+       if (key) {
+               storage = cgroup_storage_lookup(map, key, true);
+               if (!storage)
+                       goto enoent;
+
+               /*
+                * list_next_entry() never yields NULL: past the last
+                * element it returns a bogus entry computed from the list
+                * head, so the end of the list is tested explicitly.
+                */
+               if (list_is_last(&storage->list, &map->list))
+                       goto enoent;
+
+               storage = list_next_entry(storage, list);
+       } else {
+               storage = list_first_entry(&map->list,
+                                        struct bpf_cgroup_storage, list);
+       }
+
+       /* copy the key out while the lock still keeps the storage linked */
+       next->attach_type = storage->key.attach_type;
+       next->cgroup_inode_id = storage->key.cgroup_inode_id;
+       spin_unlock_bh(&map->lock);
+       return 0;
+
+enoent:
+       spin_unlock_bh(&map->lock);
+       return -ENOENT;
+}
+
+/*
+ * map_alloc: validate the creation attributes and allocate an empty
+ * cgroup storage map.
+ *
+ * Returns the embedded bpf_map on success, or an ERR_PTR():
+ *   -EINVAL  key_size is not sizeof(struct bpf_cgroup_storage_key),
+ *            map_flags has bits outside LOCAL_STORAGE_CREATE_FLAG_MASK,
+ *            or max_entries is non-zero
+ *   -E2BIG   value_size is larger than PAGE_SIZE
+ *   -ENOMEM  the map structure could not be allocated
+ */
+static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
+{
+       int numa_node = bpf_map_attr_numa_node(attr);
+       struct bpf_cgroup_storage_map *map;
+
+       if (attr->key_size != sizeof(struct bpf_cgroup_storage_key))
+               return ERR_PTR(-EINVAL);
+
+       if (attr->value_size > PAGE_SIZE)
+               return ERR_PTR(-E2BIG);
+
+       if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK)
+               /* reserved bits should not be used */
+               return ERR_PTR(-EINVAL);
+
+       if (attr->max_entries)
+               /* max_entries is not used and enforced to be 0 */
+               return ERR_PTR(-EINVAL);
+
+       map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map),
+                          __GFP_ZERO | GFP_USER, numa_node);
+       if (!map)
+               return ERR_PTR(-ENOMEM);
+
+       /* size of the map structure itself, in whole pages */
+       map->map.pages = round_up(sizeof(struct bpf_cgroup_storage_map),
+                                 PAGE_SIZE) >> PAGE_SHIFT;
+
+       /* copy mandatory map attributes */
+       bpf_map_init_from_attr(&map->map, attr);
+
+       /* start with no linked storages */
+       spin_lock_init(&map->lock);
+       map->root = RB_ROOT;
+       INIT_LIST_HEAD(&map->list);
+
+       return &map->map;
+}
+
+/*
+ * map_free: release the map structure.  No storage is expected to be
+ * linked any more; a warning is emitted if the tree or the list is not
+ * empty.
+ */
+static void cgroup_storage_map_free(struct bpf_map *_map)
+{
+       struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+
+       WARN_ON(!RB_EMPTY_ROOT(&map->root));
+       WARN_ON(!list_empty(&map->list));
+
+       kfree(map);
+}
+
+/* map_delete_elem: deleting elements is not supported; always -EINVAL. */
+static int cgroup_storage_delete_elem(struct bpf_map *map, void *key)
+{
+       return -EINVAL;
+}
+
+/* Operations table registered for BPF_MAP_TYPE_CGROUP_STORAGE. */
+const struct bpf_map_ops cgroup_storage_map_ops = {
+       .map_alloc = cgroup_storage_map_alloc,
+       .map_free = cgroup_storage_map_free,
+       .map_get_next_key = cgroup_storage_get_next_key,
+       .map_lookup_elem = cgroup_storage_lookup_elem,
+       .map_update_elem = cgroup_storage_update_elem,
+       .map_delete_elem = cgroup_storage_delete_elem,
+};
+
+/*
+ * Bind @prog and @_map to each other.  A map can be bound to one program
+ * and a program to one cgroup storage map; re-assigning an existing
+ * pairing is allowed.
+ *
+ * Returns 0 on success, -EBUSY if the map is bound to another program or
+ * the program already uses another cgroup storage map.
+ */
+int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map)
+{
+       struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+       struct bpf_map *cur_map;
+       int ret;
+
+       spin_lock_bh(&map->lock);
+
+       cur_map = prog->aux->cgroup_storage;
+       if ((map->prog && map->prog != prog) ||
+           (cur_map && cur_map != _map)) {
+               ret = -EBUSY;
+       } else {
+               map->prog = prog;
+               prog->aux->cgroup_storage = _map;
+               ret = 0;
+       }
+
+       spin_unlock_bh(&map->lock);
+
+       return ret;
+}
+
+/*
+ * Undo bpf_cgroup_storage_assign(): clear the binding between @prog and
+ * @_map.  Nothing happens unless @prog is the program currently bound to
+ * the map.
+ */
+void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *_map)
+{
+       struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+
+       spin_lock_bh(&map->lock);
+       if (map->prog == prog) {
+               /* both sides of the binding are expected to agree */
+               WARN_ON(prog->aux->cgroup_storage != _map);
+               map->prog = NULL;
+               prog->aux->cgroup_storage = NULL;
+       }
+       spin_unlock_bh(&map->lock);
+}
+
+/*
+ * Allocate a zeroed storage element, including its value buffer, for the
+ * cgroup storage map used by @prog and charge it to the map's memlock
+ * accounting.
+ *
+ * Returns NULL if @prog uses no cgroup storage map, ERR_PTR(-EPERM) if
+ * the charge is refused, ERR_PTR(-ENOMEM) if an allocation fails, and the
+ * new storage otherwise.
+ */
+struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog)
+{
+       struct bpf_map *map = prog->aux->cgroup_storage;
+       struct bpf_cgroup_storage *storage;
+       u32 pages;
+
+       if (!map)
+               return NULL;
+
+       /* storage header plus value buffer, rounded up to whole pages */
+       pages = round_up(sizeof(struct bpf_cgroup_storage) +
+                        sizeof(struct bpf_storage_buffer) +
+                        map->value_size, PAGE_SIZE) >> PAGE_SHIFT;
+       if (bpf_map_charge_memlock(map, pages))
+               return ERR_PTR(-EPERM);
+
+       storage = kmalloc_node(sizeof(struct bpf_cgroup_storage),
+                              __GFP_ZERO | GFP_USER, map->numa_node);
+       if (!storage)
+               goto uncharge;
+
+       storage->buf = kmalloc_node(sizeof(struct bpf_storage_buffer) +
+                                   map->value_size, __GFP_ZERO | GFP_USER,
+                                   map->numa_node);
+       if (!storage->buf)
+               goto free_storage;
+
+       storage->map = (struct bpf_cgroup_storage_map *)map;
+
+       return storage;
+
+free_storage:
+       kfree(storage);
+uncharge:
+       bpf_map_uncharge_memlock(map, pages);
+       return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Release a storage obtained from bpf_cgroup_storage_alloc() and return
+ * its memlock charge to the map.  A NULL @storage is accepted and ignored.
+ */
+void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage)
+{
+       u32 pages;
+       struct bpf_map *map;
+
+       if (!storage)
+               return;
+
+       map = &storage->map->map;
+       /* same page count that bpf_cgroup_storage_alloc() charged */
+       pages = round_up(sizeof(struct bpf_cgroup_storage) +
+                        sizeof(struct bpf_storage_buffer) +
+                        map->value_size, PAGE_SIZE) >> PAGE_SHIFT;
+       bpf_map_uncharge_memlock(map, pages);
+
+       /* the actual kfree of buffer and storage is deferred via RCU */
+       kfree_rcu(storage->buf, rcu);
+       kfree_rcu(storage, rcu);
+}
+
+/*
+ * Make @storage visible in its map under the key (@cgroup's kernfs node
+ * id, @type): it is inserted into the map's rbtree and added to the map's
+ * list.  A NULL @storage is accepted and ignored.
+ */
+void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage,
+                            struct cgroup *cgroup,
+                            enum bpf_attach_type type)
+{
+       struct bpf_cgroup_storage_map *map;
+
+       if (!storage)
+               return;
+
+       storage->key.attach_type = type;
+       storage->key.cgroup_inode_id = cgroup->kn->id.id;
+
+       map = storage->map;
+
+       spin_lock_bh(&map->lock);
+       /*
+        * A duplicate key is only warned about.
+        * NOTE(review): the storage is still added to the list in that
+        * case, although it is not in the tree - confirm this is intended.
+        */
+       WARN_ON(cgroup_storage_insert(map, storage));
+       list_add(&storage->list, &map->list);
+       spin_unlock_bh(&map->lock);
+}
+
+/*
+ * Remove @storage from its map's rbtree and list, under the map lock.
+ * A NULL @storage is accepted and ignored.
+ */
+void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage)
+{
+       struct bpf_cgroup_storage_map *map;
+
+       if (!storage)
+               return;
+
+       map = storage->map;
+
+       spin_lock_bh(&map->lock);
+       rb_erase(&storage->node, &map->root);
+       list_del(&storage->list);
+       spin_unlock_bh(&map->lock);
+}
+
+#endif
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c

index 7958252a4d29627e2fb8256914561682077285be..5af4e9e2722dbf8b0e745f0b321d8d05ac6bbf17 100644 (file)
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -957,6 +957,9 @@ static void free_used_maps(struct bpf_prog_aux *aux)
  {
         int i;
  
+       if (aux->cgroup_storage)
+               bpf_cgroup_storage_release(aux->prog, aux->cgroup_storage);
+
         for (i = 0; i < aux->used_map_cnt; i++)
                 bpf_map_put(aux->used_maps[i]);
  
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c

index e948303a0ea8d3bdcafc43957cca469ce412f6de..7e75434a9e54040d3a37b76ba7f986fea288538a 100644 (file)
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5154,6 +5154,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
                         }
                         env->used_maps[env->used_map_cnt++] = map;
  
+                       if (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE &&
+                           bpf_cgroup_storage_assign(env->prog, map)) {
+                               verbose(env,
+                                       "only one cgroup storage is allowed\n");
+                               fdput(f);
+                               return -EBUSY;
+                       }
+
                         fdput(f);
  next_insn:
                         insn++;
@@ -5180,6 +5188,10 @@ static void release_maps(struct bpf_verifier_env *env)
  {
         int i;
  
+       if (env->prog->aux->cgroup_storage)
+               bpf_cgroup_storage_release(env->prog,
+                                          env->prog->aux->cgroup_storage);
+
         for (i = 0; i < env->used_map_cnt; i++)
                 bpf_map_put(env->used_maps[i]);
  }
author	Roman Gushchin <guro@fb.com>
	Thu, 2 Aug 2018 21:27:18 +0000 (14:27 -0700)
committer	Daniel Borkmann <daniel@iogearbox.net>
	Thu, 2 Aug 2018 22:47:32 +0000 (00:47 +0200)
include/linux/bpf-cgroup.h		patch \| blob \| blame \| history
include/linux/bpf.h		patch \| blob \| blame \| history
include/linux/bpf_types.h		patch \| blob \| blame \| history
include/uapi/linux/bpf.h		patch \| blob \| blame \| history
kernel/bpf/Makefile		patch \| blob \| blame \| history
kernel/bpf/local_storage.c	[new file with mode: 0644]	patch \| blob
kernel/bpf/syscall.c		patch \| blob \| blame \| history
kernel/bpf/verifier.c		patch \| blob \| blame \| history