kernel/bpf/cgroup.c
/*
 * Functions to manage eBPF programs attached to cgroups
 *
 * Copyright (c) 2016 Daniel Mack
 *
 * This file is subject to the terms and conditions of version 2 of the GNU
 * General Public License. See the file COPYING in the main directory of the
 * Linux distribution for more details.
 */

#include <linux/kernel.h>
#include <linux/atomic.h>
#include <linux/cgroup.h>
#include <linux/slab.h>
#include <linux/bpf.h>
#include <linux/bpf-cgroup.h>
#include <net/sock.h>

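/*
 * cgroup_bpf_enabled_key backs the cgroup_bpf_enabled static branch used by
 * the hooks in <linux/bpf-cgroup.h>: it is incremented for every attached
 * program (see __cgroup_bpf_attach() below) and decremented on detach and
 * release, so the per-packet and per-socket hooks stay patched out as long
 * as no cgroup-bpf program is attached anywhere.
 */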
DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
EXPORT_SYMBOL(cgroup_bpf_enabled_key);

/**
 * cgroup_bpf_put() - put references of all bpf programs
 * @cgrp: the cgroup to modify
 */
void cgroup_bpf_put(struct cgroup *cgrp)
{
        unsigned int type;

        for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
                struct list_head *progs = &cgrp->bpf.progs[type];
                struct bpf_prog_list *pl, *tmp;

                list_for_each_entry_safe(pl, tmp, progs, node) {
                        list_del(&pl->node);
                        bpf_prog_put(pl->prog);
                        kfree(pl);
                        static_branch_dec(&cgroup_bpf_enabled_key);
                }
                bpf_prog_array_free(cgrp->bpf.effective[type]);
        }
}

/* Count the number of elements in the list.
 * It's slow, but the list cannot be long.
 */
static u32 prog_list_length(struct list_head *head)
{
        struct bpf_prog_list *pl;
        u32 cnt = 0;

        list_for_each_entry(pl, head, node) {
                if (!pl->prog)
                        continue;
                cnt++;
        }
        return cnt;
}

/* If the parent has a non-overridable prog attached,
 * disallow attaching new programs to the descendant cgroup.
 * If the parent allows overriding or multiple programs, allow attaching.
 */
static bool hierarchy_allows_attach(struct cgroup *cgrp,
                                    enum bpf_attach_type type,
                                    u32 new_flags)
{
        struct cgroup *p;

        p = cgroup_parent(cgrp);
        if (!p)
                return true;
        do {
                u32 flags = p->bpf.flags[type];
                u32 cnt;

                if (flags & BPF_F_ALLOW_MULTI)
                        return true;
                cnt = prog_list_length(&p->bpf.progs[type]);
                WARN_ON_ONCE(cnt > 1);
                if (cnt == 1)
                        return !!(flags & BPF_F_ALLOW_OVERRIDE);
                p = cgroup_parent(p);
        } while (p);
        return true;
}
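
/* Example of the rule enforced above: if /A has a program attached with
 * flags == 0, attaching to /A/B fails with -EPERM; if /A attached with
 * BPF_F_ALLOW_OVERRIDE, a descendant may attach a single program that
 * overrides it; with BPF_F_ALLOW_MULTI, descendants may attach freely.
 */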

/* compute a chain of effective programs for a given cgroup:
 * start from the list of programs in this cgroup and add
 * all parent programs.
 * Note that a parent's F_ALLOW_OVERRIDE-type program yields
 * to programs in this cgroup.
 */
static int compute_effective_progs(struct cgroup *cgrp,
                                   enum bpf_attach_type type,
                                   struct bpf_prog_array __rcu **array)
{
        struct bpf_prog_array __rcu *progs;
        struct bpf_prog_list *pl;
        struct cgroup *p = cgrp;
        int cnt = 0;

        /* count number of effective programs by walking parents */
        do {
                if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
                        cnt += prog_list_length(&p->bpf.progs[type]);
                p = cgroup_parent(p);
        } while (p);

        progs = bpf_prog_array_alloc(cnt, GFP_KERNEL);
        if (!progs)
                return -ENOMEM;

        /* populate the array with effective progs */
        cnt = 0;
        p = cgrp;
        do {
                if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
                        list_for_each_entry(pl,
                                            &p->bpf.progs[type], node) {
                                if (!pl->prog)
                                        continue;
                                rcu_dereference_protected(progs, 1)->
                                        progs[cnt++] = pl->prog;
                        }
                p = cgroup_parent(p);
        } while (p);

        *array = progs;
        return 0;
}
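
/* Note on ordering, as implemented by the walk above: the effective array
 * is filled starting at @cgrp and walking towards the root, so this
 * cgroup's own programs occupy the lowest indices and are executed by
 * BPF_PROG_RUN_ARRAY() before any BPF_F_ALLOW_MULTI programs inherited
 * from its ancestors.
 */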

static void activate_effective_progs(struct cgroup *cgrp,
                                     enum bpf_attach_type type,
                                     struct bpf_prog_array __rcu *array)
{
        struct bpf_prog_array __rcu *old_array;

        old_array = xchg(&cgrp->bpf.effective[type], array);
        /* free prog array after grace period, since __cgroup_bpf_run_*()
         * might still be walking the array
         */
        bpf_prog_array_free(old_array);
}

/**
 * cgroup_bpf_inherit() - inherit effective programs from parent
 * @cgrp: the cgroup to modify
 */
int cgroup_bpf_inherit(struct cgroup *cgrp)
{
        /* has to use a macro instead of a const int, since the compiler
         * thinks that the array below is variable length
         */
#define NR ARRAY_SIZE(cgrp->bpf.effective)
        struct bpf_prog_array __rcu *arrays[NR] = {};
        int i;

        for (i = 0; i < NR; i++)
                INIT_LIST_HEAD(&cgrp->bpf.progs[i]);

        for (i = 0; i < NR; i++)
                if (compute_effective_progs(cgrp, i, &arrays[i]))
                        goto cleanup;

        for (i = 0; i < NR; i++)
                activate_effective_progs(cgrp, i, arrays[i]);

        return 0;
cleanup:
        for (i = 0; i < NR; i++)
                bpf_prog_array_free(arrays[i]);
        return -ENOMEM;
}

#define BPF_CGROUP_MAX_PROGS 64

/**
 * __cgroup_bpf_attach() - Attach the program to a cgroup, and
 *                         propagate the change to descendants
 * @cgrp: The cgroup whose descendants to traverse
 * @prog: A program to attach
 * @type: Type of attach operation
 * @flags: %BPF_F_ALLOW_OVERRIDE, %BPF_F_ALLOW_MULTI or 0
 *
 * Must be called with cgroup_mutex held.
 */
int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
                        enum bpf_attach_type type, u32 flags)
{
        struct list_head *progs = &cgrp->bpf.progs[type];
        struct bpf_prog *old_prog = NULL;
        struct cgroup_subsys_state *css;
        struct bpf_prog_list *pl;
        bool pl_was_allocated;
        int err;

        if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI))
                /* invalid combination */
                return -EINVAL;

        if (!hierarchy_allows_attach(cgrp, type, flags))
                return -EPERM;

        if (!list_empty(progs) && cgrp->bpf.flags[type] != flags)
                /* Disallow attaching non-overridable on top
                 * of existing overridable in this cgroup.
                 * Disallow attaching multi-prog if overridable or none
                 */
                return -EPERM;

        if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
                return -E2BIG;

        if (flags & BPF_F_ALLOW_MULTI) {
                list_for_each_entry(pl, progs, node)
                        if (pl->prog == prog)
                                /* disallow attaching the same prog twice */
                                return -EINVAL;

                pl = kmalloc(sizeof(*pl), GFP_KERNEL);
                if (!pl)
                        return -ENOMEM;
                pl_was_allocated = true;
                pl->prog = prog;
                list_add_tail(&pl->node, progs);
        } else {
                if (list_empty(progs)) {
                        pl = kmalloc(sizeof(*pl), GFP_KERNEL);
                        if (!pl)
                                return -ENOMEM;
                        pl_was_allocated = true;
                        list_add_tail(&pl->node, progs);
                } else {
                        pl = list_first_entry(progs, typeof(*pl), node);
                        old_prog = pl->prog;
                        pl_was_allocated = false;
                }
                pl->prog = prog;
        }

        cgrp->bpf.flags[type] = flags;

        /* allocate and recompute effective prog arrays */
        css_for_each_descendant_pre(css, &cgrp->self) {
                struct cgroup *desc = container_of(css, struct cgroup, self);

                err = compute_effective_progs(desc, type, &desc->bpf.inactive);
                if (err)
                        goto cleanup;
        }

        /* all allocations were successful. Activate all prog arrays */
        css_for_each_descendant_pre(css, &cgrp->self) {
                struct cgroup *desc = container_of(css, struct cgroup, self);

                activate_effective_progs(desc, type, desc->bpf.inactive);
                desc->bpf.inactive = NULL;
        }

        static_branch_inc(&cgroup_bpf_enabled_key);
        if (old_prog) {
                bpf_prog_put(old_prog);
                static_branch_dec(&cgroup_bpf_enabled_key);
        }
        return 0;

cleanup:
        /* oom while computing effective. Free all computed effective arrays
         * since they were not activated
         */
        css_for_each_descendant_pre(css, &cgrp->self) {
                struct cgroup *desc = container_of(css, struct cgroup, self);

                bpf_prog_array_free(desc->bpf.inactive);
                desc->bpf.inactive = NULL;
        }

        /* and cleanup the prog list */
        pl->prog = old_prog;
        if (pl_was_allocated) {
                list_del(&pl->node);
                kfree(pl);
        }
        return err;
}
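
/* For reference only (user-space side, not part of this file): this path is
 * reached via the bpf(2) syscall with the BPF_PROG_ATTACH command. A minimal
 * sketch, assuming cgroup_fd is an open cgroup v2 directory and prog_fd came
 * from BPF_PROG_LOAD (both are the caller's names, not defined here):
 *
 *      union bpf_attr attr = {};
 *
 *      attr.target_fd     = cgroup_fd;
 *      attr.attach_bpf_fd = prog_fd;
 *      attr.attach_type   = BPF_CGROUP_INET_INGRESS;
 *      attr.attach_flags  = BPF_F_ALLOW_MULTI;   (or BPF_F_ALLOW_OVERRIDE, or 0)
 *      err = syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));
 */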

/**
 * __cgroup_bpf_detach() - Detach the program from a cgroup, and
 *                         propagate the change to descendants
 * @cgrp: The cgroup whose descendants to traverse
 * @prog: A program to detach or NULL
 * @type: Type of detach operation
 *
 * Must be called with cgroup_mutex held.
 */
int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
                        enum bpf_attach_type type, u32 unused_flags)
{
        struct list_head *progs = &cgrp->bpf.progs[type];
        u32 flags = cgrp->bpf.flags[type];
        struct bpf_prog *old_prog = NULL;
        struct cgroup_subsys_state *css;
        struct bpf_prog_list *pl;
        int err;

        if (flags & BPF_F_ALLOW_MULTI) {
                if (!prog)
                        /* to detach a MULTI prog the user has to specify
                         * a valid FD of the program to be detached
                         */
                        return -EINVAL;
        } else {
                if (list_empty(progs))
                        /* report error when trying to detach and nothing is attached */
                        return -ENOENT;
        }

        if (flags & BPF_F_ALLOW_MULTI) {
                /* find the prog and detach it */
                list_for_each_entry(pl, progs, node) {
                        if (pl->prog != prog)
                                continue;
                        old_prog = prog;
                        /* mark it deleted, so it's ignored while
                         * recomputing effective
                         */
                        pl->prog = NULL;
                        break;
                }
                if (!old_prog)
                        return -ENOENT;
        } else {
                /* to maintain backward compatibility, NONE and OVERRIDE cgroups
                 * allow detaching with an invalid FD (prog == NULL)
                 */
                pl = list_first_entry(progs, typeof(*pl), node);
                old_prog = pl->prog;
                pl->prog = NULL;
        }

        /* allocate and recompute effective prog arrays */
        css_for_each_descendant_pre(css, &cgrp->self) {
                struct cgroup *desc = container_of(css, struct cgroup, self);

                err = compute_effective_progs(desc, type, &desc->bpf.inactive);
                if (err)
                        goto cleanup;
        }

        /* all allocations were successful. Activate all prog arrays */
        css_for_each_descendant_pre(css, &cgrp->self) {
                struct cgroup *desc = container_of(css, struct cgroup, self);

                activate_effective_progs(desc, type, desc->bpf.inactive);
                desc->bpf.inactive = NULL;
        }

        /* now can actually delete it from this cgroup list */
        list_del(&pl->node);
        kfree(pl);
        if (list_empty(progs))
                /* last program was detached, reset flags to zero */
                cgrp->bpf.flags[type] = 0;

        bpf_prog_put(old_prog);
        static_branch_dec(&cgroup_bpf_enabled_key);
        return 0;

cleanup:
        /* oom while computing effective. Free all computed effective arrays
         * since they were not activated
         */
        css_for_each_descendant_pre(css, &cgrp->self) {
                struct cgroup *desc = container_of(css, struct cgroup, self);

                bpf_prog_array_free(desc->bpf.inactive);
                desc->bpf.inactive = NULL;
        }

        /* and restore back old_prog */
        pl->prog = old_prog;
        return err;
}
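
/* For reference only (user-space side): the matching call is bpf(2) with
 * BPF_PROG_DETACH. A minimal sketch, reusing the cgroup_fd/prog_fd names
 * from the attach example above:
 *
 *      union bpf_attr attr = {};
 *
 *      attr.target_fd     = cgroup_fd;
 *      attr.attach_bpf_fd = prog_fd;   (only needed for BPF_F_ALLOW_MULTI)
 *      attr.attach_type   = BPF_CGROUP_INET_INGRESS;
 *      err = syscall(__NR_bpf, BPF_PROG_DETACH, &attr, sizeof(attr));
 */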

/* Must be called with cgroup_mutex held to avoid races. */
int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
                       union bpf_attr __user *uattr)
{
        __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
        enum bpf_attach_type type = attr->query.attach_type;
        struct list_head *progs = &cgrp->bpf.progs[type];
        u32 flags = cgrp->bpf.flags[type];
        int cnt, ret = 0, i;

        if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
                cnt = bpf_prog_array_length(cgrp->bpf.effective[type]);
        else
                cnt = prog_list_length(progs);

        if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
                return -EFAULT;
        if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt)))
                return -EFAULT;
        if (attr->query.prog_cnt == 0 || !prog_ids || !cnt)
                /* return early if user requested only program count + flags */
                return 0;
        if (attr->query.prog_cnt < cnt) {
                cnt = attr->query.prog_cnt;
                ret = -ENOSPC;
        }

        if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
                return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type],
                                                   prog_ids, cnt);
        } else {
                struct bpf_prog_list *pl;
                u32 id;

                i = 0;
                list_for_each_entry(pl, progs, node) {
                        id = pl->prog->aux->id;
                        if (copy_to_user(prog_ids + i, &id, sizeof(id)))
                                return -EFAULT;
                        if (++i == cnt)
                                break;
                }
        }
        return ret;
}
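
/* For reference only (user-space side): this query is reached via bpf(2)
 * with BPF_PROG_QUERY. A minimal sketch, where ids[] is a caller-provided
 * buffer (hypothetical name):
 *
 *      union bpf_attr attr = {};
 *      __u32 ids[64];
 *
 *      attr.query.target_fd   = cgroup_fd;
 *      attr.query.attach_type = BPF_CGROUP_INET_INGRESS;
 *      attr.query.query_flags = BPF_F_QUERY_EFFECTIVE;   (or 0 for this cgroup only)
 *      attr.query.prog_ids    = (__u64)(unsigned long)ids;
 *      attr.query.prog_cnt    = 64;
 *      err = syscall(__NR_bpf, BPF_PROG_QUERY, &attr, sizeof(attr));
 *
 * On return, attr.query.prog_cnt holds the total program count and
 * attr.query.attach_flags the flags written by the function above.
 */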

/**
 * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
 * @sk: The socket sending or receiving traffic
 * @skb: The skb that is being sent or received
 * @type: The type of program to be executed
 *
 * If no socket is passed, or the socket is not of type INET or INET6,
 * this function does nothing and returns 0.
 *
 * The program type passed in via @type must be suitable for network
 * filtering. No further check is performed to assert that.
 *
 * This function will return %-EPERM if an attached program was found and
 * it returned != 1 during execution. In all other cases, 0 is returned.
 */
int __cgroup_bpf_run_filter_skb(struct sock *sk,
                                struct sk_buff *skb,
                                enum bpf_attach_type type)
{
        unsigned int offset = skb->data - skb_network_header(skb);
        struct sock *save_sk;
        struct cgroup *cgrp;
        int ret;

        if (!sk || !sk_fullsock(sk))
                return 0;

        if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
                return 0;

        cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
        save_sk = skb->sk;
        skb->sk = sk;
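        /* Run the programs with skb->data pushed back to the network
         * header, so they see the packet starting at the IP header;
         * the original offset and skb->sk are restored afterwards.
         */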
        __skb_push(skb, offset);
        ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
                                 bpf_prog_run_save_cb);
        __skb_pull(skb, offset);
        skb->sk = save_sk;
        return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);

/**
 * __cgroup_bpf_run_filter_sk() - Run a program on a sock
 * @sk: sock structure to manipulate
 * @type: The type of program to be executed
 *
 * The socket passed is expected to be of type INET or INET6.
 *
 * The program type passed in via @type must be suitable for sock
 * filtering. No further check is performed to assert that.
 *
 * This function will return %-EPERM if an attached program was found and
 * it returned != 1 during execution. In all other cases, 0 is returned.
 */
int __cgroup_bpf_run_filter_sk(struct sock *sk,
                               enum bpf_attach_type type)
{
        struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
        int ret;

        ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN);
        return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);

/**
 * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock
 * @sk: socket to get cgroup from
 * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
 * sk with connection information (IP addresses, etc.). May not contain
 * cgroup info if it is a req sock.
 * @type: The type of program to be executed
 *
 * The socket passed is expected to be of type INET or INET6.
 *
 * The program type passed in via @type must be suitable for sock_ops
 * filtering. No further check is performed to assert that.
 *
 * This function will return %-EPERM if an attached program was found and
 * it returned != 1 during execution. In all other cases, 0 is returned.
 */
int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
                                     struct bpf_sock_ops_kern *sock_ops,
                                     enum bpf_attach_type type)
{
        struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
        int ret;

        ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops,
                                 BPF_PROG_RUN);
        return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);

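/* Run the BPF_CGROUP_DEVICE programs of the current task's cgroup.
 * @access and @dev_type are packed into ctx.access_type (access in the
 * upper 16 bits, device type in the lower 16), and the return value is
 * 0 when access is allowed and non-zero when any attached program
 * denied it.
 */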
int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
                                      short access, enum bpf_attach_type type)
{
        struct cgroup *cgrp;
        struct bpf_cgroup_dev_ctx ctx = {
                .access_type = (access << 16) | dev_type,
                .major = major,
                .minor = minor,
        };
        int allow = 1;

        rcu_read_lock();
        cgrp = task_dfl_cgroup(current);
        allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx,
                                   BPF_PROG_RUN);
        rcu_read_unlock();

        return !allow;
}
EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission);

static const struct bpf_func_proto *
cgroup_dev_func_proto(enum bpf_func_id func_id)
{
        switch (func_id) {
        case BPF_FUNC_map_lookup_elem:
                return &bpf_map_lookup_elem_proto;
        case BPF_FUNC_map_update_elem:
                return &bpf_map_update_elem_proto;
        case BPF_FUNC_map_delete_elem:
                return &bpf_map_delete_elem_proto;
        case BPF_FUNC_get_current_uid_gid:
                return &bpf_get_current_uid_gid_proto;
        case BPF_FUNC_trace_printk:
                if (capable(CAP_SYS_ADMIN))
                        return bpf_get_trace_printk_proto();
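                /* without CAP_SYS_ADMIN, fall through and refuse the helper */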
        default:
                return NULL;
        }
}

static bool cgroup_dev_is_valid_access(int off, int size,
                                       enum bpf_access_type type,
                                       struct bpf_insn_access_aux *info)
{
        const int size_default = sizeof(__u32);

        if (type == BPF_WRITE)
                return false;

        if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx))
                return false;
        /* The verifier guarantees that size > 0. */
        if (off % size != 0)
                return false;

        switch (off) {
        case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type):
                bpf_ctx_record_field_size(info, size_default);
                if (!bpf_ctx_narrow_access_ok(off, size, size_default))
                        return false;
                break;
        default:
                if (size != size_default)
                        return false;
        }

        return true;
}

const struct bpf_prog_ops cg_dev_prog_ops = {
};

const struct bpf_verifier_ops cg_dev_verifier_ops = {
        .get_func_proto         = cgroup_dev_func_proto,
        .is_valid_access        = cgroup_dev_is_valid_access,
};