datapath/linux/compat/nf_conncount.c

   1 /*
   2  * Backported from upstream commit 5c789e131cbb ("netfilter:
   3  * nf_conncount: Add list lock and gc worker, and RCU for init tree search")
   4  *
   5  * count the number of connections matching an arbitrary key.
   6  *
   7  * (C) 2017 Red Hat GmbH
   8  * Author: Florian Westphal <fw@strlen.de>
   9  *
  10  * split from xt_connlimit.c:
  11  *   (c) 2000 Gerd Knorr <kraxel@bytesex.org>
  12  *   Nov 2002: Martin Bene <martin.bene@icomedias.com>:
  13  *              only ignore TIME_WAIT or gone connections
  14  *   (C) CC Computer Consultants GmbH, 2007
  15  */
  16 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  17 #include <linux/in.h>
  18 #include <linux/in6.h>
  19 #include <linux/ip.h>
  20 #include <linux/ipv6.h>
  21 #include <linux/jhash.h>
  22 #include <linux/slab.h>
  23 #include <linux/list.h>
  24 #include <linux/rbtree.h>
  25 #include <linux/module.h>
  26 #include <linux/random.h>
  27 #include <linux/skbuff.h>
  28 #include <linux/spinlock.h>
  29 #include <linux/netfilter/nf_conntrack_tcp.h>
  30 #include <linux/netfilter/x_tables.h>
  31 #include <net/netfilter/nf_conntrack.h>
  32 #include <net/netfilter/nf_conntrack_count.h>
  33 #include <net/netfilter/nf_conntrack_core.h>
  34 #include <net/netfilter/nf_conntrack_tuple.h>
  35 #include <net/netfilter/nf_conntrack_zones.h>
  36
  37 #define CONNCOUNT_SLOTS         256U
  38
  39 #ifdef CONFIG_LOCKDEP
  40 #define CONNCOUNT_LOCK_SLOTS    8U
  41 #else
  42 #define CONNCOUNT_LOCK_SLOTS    256U
  43 #endif
  44
  45 #define CONNCOUNT_GC_MAX_NODES  8
  46 #define MAX_KEYLEN              5
  47
  48 /* we will save the tuples of all connections we care about */
  49 struct nf_conncount_tuple {
  50         struct list_head                node;
  51         struct nf_conntrack_tuple       tuple;
  52         struct nf_conntrack_zone        zone;
  53         int                             cpu;
  54         u32                             jiffies32;
  55         struct rcu_head                 rcu_head;
  56 };
  57
  58 struct nf_conncount_rb {
  59         struct rb_node node;
  60         struct nf_conncount_list list;
  61         u32 key[MAX_KEYLEN];
  62         struct rcu_head rcu_head;
  63 };
  64
  65 static spinlock_t nf_conncount_locks[CONNCOUNT_LOCK_SLOTS] __cacheline_aligned_in_smp;
  66
  67 struct nf_conncount_data {
  68         unsigned int keylen;
  69         struct rb_root root[CONNCOUNT_SLOTS];
  70         struct net *net;
  71         struct work_struct gc_work;
  72         unsigned long pending_trees[BITS_TO_LONGS(CONNCOUNT_SLOTS)];
  73         unsigned int gc_tree;
  74 };
  75
  76 static u_int32_t conncount_rnd __read_mostly;
  77 static struct kmem_cache *conncount_rb_cachep __read_mostly;
  78 static struct kmem_cache *conncount_conn_cachep __read_mostly;
  79
  80 static inline bool already_closed(const struct nf_conn *conn)
  81 {
  82         if (nf_ct_protonum(conn) == IPPROTO_TCP)
  83                 return conn->proto.tcp.state == TCP_CONNTRACK_TIME_WAIT ||
  84                        conn->proto.tcp.state == TCP_CONNTRACK_CLOSE;
  85         else
  86                 return false;
  87 }
  88
  89 static int key_diff(const u32 *a, const u32 *b, unsigned int klen)
  90 {
  91         return memcmp(a, b, klen * sizeof(u32));
  92 }
  93
  94 static enum nf_conncount_list_add
  95 nf_conncount_add(struct nf_conncount_list *list,
  96                  const struct nf_conntrack_tuple *tuple,
  97                  const struct nf_conntrack_zone *zone)
  98 {
  99         struct nf_conncount_tuple *conn;
 100
 101         if (WARN_ON_ONCE(list->count > INT_MAX))
 102                 return NF_CONNCOUNT_ERR;
 103
 104         conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);
 105         if (conn == NULL)
 106                 return NF_CONNCOUNT_ERR;
 107
 108         conn->tuple = *tuple;
 109         conn->zone = *zone;
 110         conn->cpu = raw_smp_processor_id();
 111         conn->jiffies32 = (u32)jiffies;
 112         spin_lock(&list->list_lock);
 113         if (list->dead == true) {
 114                 kmem_cache_free(conncount_conn_cachep, conn);
 115                 spin_unlock(&list->list_lock);
 116                 return NF_CONNCOUNT_SKIP;
 117         }
 118         list_add_tail(&conn->node, &list->head);
 119         list->count++;
 120         spin_unlock(&list->list_lock);
 121         return NF_CONNCOUNT_ADDED;
 122 }
 123
 124 static void __conn_free(struct rcu_head *h)
 125 {
 126         struct nf_conncount_tuple *conn;
 127
 128         conn = container_of(h, struct nf_conncount_tuple, rcu_head);
 129         kmem_cache_free(conncount_conn_cachep, conn);
 130 }
 131
 132 static bool conn_free(struct nf_conncount_list *list,
 133                       struct nf_conncount_tuple *conn)
 134 {
 135         bool free_entry = false;
 136
 137         spin_lock(&list->list_lock);
 138
 139         if (list->count == 0) {
 140                 spin_unlock(&list->list_lock);
 141                 return free_entry;
 142         }
 143
 144         list->count--;
 145         list_del_rcu(&conn->node);
 146         if (list->count == 0)
 147                 free_entry = true;
 148
 149         spin_unlock(&list->list_lock);
 150         call_rcu(&conn->rcu_head, __conn_free);
 151         return free_entry;
 152 }
 153
 154 static const struct nf_conntrack_tuple_hash *
 155 find_or_evict(struct net *net, struct nf_conncount_list *list,
 156               struct nf_conncount_tuple *conn, bool *free_entry)
 157 {
 158         const struct nf_conntrack_tuple_hash *found;
 159         unsigned long a, b;
 160         int cpu = raw_smp_processor_id();
 161         __s32 age;
 162
 163         found = nf_conntrack_find_get(net, &conn->zone, &conn->tuple);
 164         if (found)
 165                 return found;
 166         b = conn->jiffies32;
 167         a = (u32)jiffies;
 168
 169         /* conn might have been added just before by another cpu and
 170          * might still be unconfirmed.  In this case, nf_conntrack_find()
 171          * returns no result.  Thus only evict if this cpu added the
 172          * stale entry or if the entry is older than two jiffies.
 173          */
 174         age = a - b;
 175         if (conn->cpu == cpu || age >= 2) {
 176                 *free_entry = conn_free(list, conn);
 177                 return ERR_PTR(-ENOENT);
 178         }
 179
 180         return ERR_PTR(-EAGAIN);
 181 }
 182
 183 static void nf_conncount_lookup(struct net *net,
 184                          struct nf_conncount_list *list,
 185                          const struct nf_conntrack_tuple *tuple,
 186                          const struct nf_conntrack_zone *zone,
 187                          bool *addit)
 188 {
 189         const struct nf_conntrack_tuple_hash *found;
 190         struct nf_conncount_tuple *conn, *conn_n;
 191         struct nf_conn *found_ct;
 192         unsigned int collect = 0;
 193         bool free_entry = false;
 194
 195         /* best effort only */
 196         *addit = tuple ? true : false;
 197
 198         /* check the saved connections */
 199         list_for_each_entry_safe(conn, conn_n, &list->head, node) {
 200                 if (collect > CONNCOUNT_GC_MAX_NODES)
 201                         break;
 202
 203                 found = find_or_evict(net, list, conn, &free_entry);
 204                 if (IS_ERR(found)) {
 205                         /* Not found, but might be about to be confirmed */
 206                         if (PTR_ERR(found) == -EAGAIN) {
 207                                 if (!tuple)
 208                                         continue;
 209
 210                                 if (nf_ct_tuple_equal(&conn->tuple, tuple) &&
 211                                     nf_ct_zone_id(&conn->zone, conn->zone.dir) ==
 212                                     nf_ct_zone_id(zone, zone->dir))
 213                                         *addit = false;
 214                         } else if (PTR_ERR(found) == -ENOENT)
 215                                 collect++;
 216                         continue;
 217                 }
 218
 219                 found_ct = nf_ct_tuplehash_to_ctrack(found);
 220
 221                 if (tuple && nf_ct_tuple_equal(&conn->tuple, tuple) &&
 222                     nf_ct_zone_equal(found_ct, zone, zone->dir)) {
 223                         /*
 224                          * We should not see tuples twice unless someone hooks
 225                          * this into a table without "-p tcp --syn".
 226                          *
 227                          * Attempt to avoid a re-add in this case.
 228                          */
 229                         *addit = false;
 230                 } else if (already_closed(found_ct)) {
 231                         /*
 232                          * we do not care about connections which are
 233                          * closed already -> ditch it
 234                          */
 235                         nf_ct_put(found_ct);
 236                         conn_free(list, conn);
 237                         collect++;
 238                         continue;
 239                 }
 240
 241                 nf_ct_put(found_ct);
 242         }
 243 }
 244
 245 static void nf_conncount_list_init(struct nf_conncount_list *list)
 246 {
 247         spin_lock_init(&list->list_lock);
 248         INIT_LIST_HEAD(&list->head);
 249         list->count = 1;
 250         list->dead = false;
 251 }
 252
 253 /* Return true if the list is empty */
 254 static bool nf_conncount_gc_list(struct net *net,
 255                           struct nf_conncount_list *list)
 256 {
 257         const struct nf_conntrack_tuple_hash *found;
 258         struct nf_conncount_tuple *conn, *conn_n;
 259         struct nf_conn *found_ct;
 260         unsigned int collected = 0;
 261         bool free_entry = false;
 262
 263         list_for_each_entry_safe(conn, conn_n, &list->head, node) {
 264                 found = find_or_evict(net, list, conn, &free_entry);
 265                 if (IS_ERR(found)) {
 266                         if (PTR_ERR(found) == -ENOENT)  {
 267                                 if (free_entry)
 268                                         return true;
 269                                 collected++;
 270                         }
 271                         continue;
 272                 }
 273
 274                 found_ct = nf_ct_tuplehash_to_ctrack(found);
 275                 if (already_closed(found_ct)) {
 276                         /*
 277                          * we do not care about connections which are
 278                          * closed already -> ditch it
 279                          */
 280                         nf_ct_put(found_ct);
 281                         if (conn_free(list, conn))
 282                                 return true;
 283                         collected++;
 284                         continue;
 285                 }
 286
 287                 nf_ct_put(found_ct);
 288                 if (collected > CONNCOUNT_GC_MAX_NODES)
 289                         return false;
 290         }
 291         return false;
 292 }
 293
 294 static void __tree_nodes_free(struct rcu_head *h)
 295 {
 296         struct nf_conncount_rb *rbconn;
 297
 298         rbconn = container_of(h, struct nf_conncount_rb, rcu_head);
 299         kmem_cache_free(conncount_rb_cachep, rbconn);
 300 }
 301
 302 static void tree_nodes_free(struct rb_root *root,
 303                             struct nf_conncount_rb *gc_nodes[],
 304                             unsigned int gc_count)
 305 {
 306         struct nf_conncount_rb *rbconn;
 307
 308         while (gc_count) {
 309                 rbconn = gc_nodes[--gc_count];
 310                 spin_lock(&rbconn->list.list_lock);
 311                 if (rbconn->list.count == 0 && rbconn->list.dead == false) {
 312                         rbconn->list.dead = true;
 313                         rb_erase(&rbconn->node, root);
 314                         call_rcu(&rbconn->rcu_head, __tree_nodes_free);
 315                 }
 316                 spin_unlock(&rbconn->list.list_lock);
 317         }
 318 }
 319
 320 static void schedule_gc_worker(struct nf_conncount_data *data, int tree)
 321 {
 322         set_bit(tree, data->pending_trees);
 323         schedule_work(&data->gc_work);
 324 }
 325
 326 static unsigned int
 327 insert_tree(struct net *net,
 328             struct nf_conncount_data *data,
 329             struct rb_root *root,
 330             unsigned int hash,
 331             const u32 *key,
 332             u8 keylen,
 333             const struct nf_conntrack_tuple *tuple,
 334             const struct nf_conntrack_zone *zone)
 335 {
 336         enum nf_conncount_list_add ret;
 337         struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES];
 338         struct rb_node **rbnode, *parent;
 339         struct nf_conncount_rb *rbconn;
 340         struct nf_conncount_tuple *conn;
 341         unsigned int count = 0, gc_count = 0;
 342         bool node_found = false;
 343
 344         spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
 345
 346         parent = NULL;
 347         rbnode = &(root->rb_node);
 348         while (*rbnode) {
 349                 int diff;
 350                 rbconn = rb_entry(*rbnode, struct nf_conncount_rb, node);
 351
 352                 parent = *rbnode;
 353                 diff = key_diff(key, rbconn->key, keylen);
 354                 if (diff < 0) {
 355                         rbnode = &((*rbnode)->rb_left);
 356                 } else if (diff > 0) {
 357                         rbnode = &((*rbnode)->rb_right);
 358                 } else {
 359                         /* unlikely: other cpu added node already */
 360                         node_found = true;
 361                         ret = nf_conncount_add(&rbconn->list, tuple, zone);
 362                         if (ret == NF_CONNCOUNT_ERR) {
 363                                 count = 0; /* hotdrop */
 364                         } else if (ret == NF_CONNCOUNT_ADDED) {
 365                                 count = rbconn->list.count;
 366                         } else {
 367                                 /* NF_CONNCOUNT_SKIP, rbconn is already
 368                                  * reclaimed by gc, insert a new tree node
 369                                  */
 370                                 node_found = false;
 371                         }
 372                         break;
 373                 }
 374
 375                 if (gc_count >= ARRAY_SIZE(gc_nodes))
 376                         continue;
 377
 378                 if (nf_conncount_gc_list(net, &rbconn->list))
 379                         gc_nodes[gc_count++] = rbconn;
 380         }
 381
 382         if (gc_count) {
 383                 tree_nodes_free(root, gc_nodes, gc_count);
 384                 /* tree_node_free before new allocation permits
 385                  * allocator to re-use newly free'd object.
 386                  *
 387                  * This is a rare event; in most cases we will find
 388                  * existing node to re-use. (or gc_count is 0).
 389                  */
 390
 391                 if (gc_count >= ARRAY_SIZE(gc_nodes))
 392                         schedule_gc_worker(data, hash);
 393         }
 394
 395         if (node_found)
 396                 goto out_unlock;
 397
 398         /* expected case: match, insert new node */
 399         rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC);
 400         if (rbconn == NULL)
 401                 goto out_unlock;
 402
 403         conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);
 404         if (conn == NULL) {
 405                 kmem_cache_free(conncount_rb_cachep, rbconn);
 406                 goto out_unlock;
 407         }
 408
 409         conn->tuple = *tuple;
 410         conn->zone = *zone;
 411         memcpy(rbconn->key, key, sizeof(u32) * keylen);
 412
 413         nf_conncount_list_init(&rbconn->list);
 414         list_add(&conn->node, &rbconn->list.head);
 415         count = 1;
 416
 417         rb_link_node(&rbconn->node, parent, rbnode);
 418         rb_insert_color(&rbconn->node, root);
 419 out_unlock:
 420         spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
 421         return count;
 422 }
 423
 424 static unsigned int
 425 count_tree(struct net *net,
 426            struct nf_conncount_data *data,
 427            const u32 *key,
 428            const struct nf_conntrack_tuple *tuple,
 429            const struct nf_conntrack_zone *zone)
 430 {
 431         enum nf_conncount_list_add ret;
 432         struct rb_root *root;
 433         struct rb_node *parent;
 434         struct nf_conncount_rb *rbconn;
 435         unsigned int hash;
 436         u8 keylen = data->keylen;
 437
 438         hash = jhash2(key, data->keylen, conncount_rnd) % CONNCOUNT_SLOTS;
 439         root = &data->root[hash];
 440
 441         parent = rcu_dereference_raw(root->rb_node);
 442         while (parent) {
 443                 int diff;
 444                 bool addit;
 445
 446                 rbconn = rb_entry(parent, struct nf_conncount_rb, node);
 447
 448                 diff = key_diff(key, rbconn->key, keylen);
 449                 if (diff < 0) {
 450                         parent = rcu_dereference_raw(parent->rb_left);
 451                 } else if (diff > 0) {
 452                         parent = rcu_dereference_raw(parent->rb_right);
 453                 } else {
 454                         /* same source network -> be counted! */
 455                         nf_conncount_lookup(net, &rbconn->list, tuple, zone,
 456                                             &addit);
 457
 458                         if (!addit)
 459                                 return rbconn->list.count;
 460
 461                         ret = nf_conncount_add(&rbconn->list, tuple, zone);
 462                         if (ret == NF_CONNCOUNT_ERR) {
 463                                 return 0; /* hotdrop */
 464                         } else if (ret == NF_CONNCOUNT_ADDED) {
 465                                 return rbconn->list.count;
 466                         } else {
 467                                 /* NF_CONNCOUNT_SKIP, rbconn is already
 468                                  * reclaimed by gc, insert a new tree node
 469                                  */
 470                                 break;
 471                         }
 472                 }
 473         }
 474
 475         if (!tuple)
 476                 return 0;
 477
 478         return insert_tree(net, data, root, hash, key, keylen, tuple, zone);
 479 }
 480
 481 static void tree_gc_worker(struct work_struct *work)
 482 {
 483         struct nf_conncount_data *data = container_of(work, struct nf_conncount_data, gc_work);
 484         struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES], *rbconn;
 485         struct rb_root *root;
 486         struct rb_node *node;
 487         unsigned int tree, next_tree, gc_count = 0;
 488
 489         tree = data->gc_tree % CONNCOUNT_LOCK_SLOTS;
 490         root = &data->root[tree];
 491
 492         rcu_read_lock();
 493         for (node = rb_first(root); node != NULL; node = rb_next(node)) {
 494                 rbconn = rb_entry(node, struct nf_conncount_rb, node);
 495                 if (nf_conncount_gc_list(data->net, &rbconn->list))
 496                         gc_nodes[gc_count++] = rbconn;
 497         }
 498         rcu_read_unlock();
 499
 500         spin_lock_bh(&nf_conncount_locks[tree]);
 501
 502         if (gc_count) {
 503                 tree_nodes_free(root, gc_nodes, gc_count);
 504         }
 505
 506         clear_bit(tree, data->pending_trees);
 507
 508         next_tree = (tree + 1) % CONNCOUNT_SLOTS;
 509         next_tree = find_next_bit(data->pending_trees, next_tree, CONNCOUNT_SLOTS);
 510
 511         if (next_tree < CONNCOUNT_SLOTS) {
 512                 data->gc_tree = next_tree;
 513                 schedule_work(work);
 514         }
 515
 516         spin_unlock_bh(&nf_conncount_locks[tree]);
 517 }
 518
 519 /* Count and return number of conntrack entries in 'net' with particular 'key'.
 520  * If 'tuple' is not null, insert it into the accounting data structure.
 521  * Call with RCU read lock.
 522  */
 523 unsigned int rpl_nf_conncount_count(struct net *net,
 524                                     struct nf_conncount_data *data,
 525                                     const u32 *key,
 526                                     const struct nf_conntrack_tuple *tuple,
 527                                     const struct nf_conntrack_zone *zone)
 528 {
 529         return count_tree(net, data, key, tuple, zone);
 530 }
 531 EXPORT_SYMBOL_GPL(rpl_nf_conncount_count);
 532
 533 struct nf_conncount_data *rpl_nf_conncount_init(struct net *net, unsigned int family,
 534                                             unsigned int keylen)
 535 {
 536         struct nf_conncount_data *data;
 537         int ret, i;
 538
 539         if (keylen % sizeof(u32) ||
 540             keylen / sizeof(u32) > MAX_KEYLEN ||
 541             keylen == 0)
 542                 return ERR_PTR(-EINVAL);
 543
 544         net_get_random_once(&conncount_rnd, sizeof(conncount_rnd));
 545
 546         data = kmalloc(sizeof(*data), GFP_KERNEL);
 547         if (!data)
 548                 return ERR_PTR(-ENOMEM);
 549
 550         ret = nf_ct_netns_get(net, family);
 551         if (ret < 0) {
 552                 kfree(data);
 553                 return ERR_PTR(ret);
 554         }
 555
 556         for (i = 0; i < ARRAY_SIZE(data->root); ++i)
 557                 data->root[i] = RB_ROOT;
 558
 559         data->keylen = keylen / sizeof(u32);
 560         data->net = net;
 561         INIT_WORK(&data->gc_work, tree_gc_worker);
 562
 563         return data;
 564 }
 565 EXPORT_SYMBOL_GPL(rpl_nf_conncount_init);
 566
 567 static void nf_conncount_cache_free(struct nf_conncount_list *list)
 568 {
 569         struct nf_conncount_tuple *conn, *conn_n;
 570
 571         list_for_each_entry_safe(conn, conn_n, &list->head, node)
 572                 kmem_cache_free(conncount_conn_cachep, conn);
 573 }
 574
 575 static void destroy_tree(struct rb_root *r)
 576 {
 577         struct nf_conncount_rb *rbconn;
 578         struct rb_node *node;
 579
 580         while ((node = rb_first(r)) != NULL) {
 581                 rbconn = rb_entry(node, struct nf_conncount_rb, node);
 582
 583                 rb_erase(node, r);
 584
 585                 nf_conncount_cache_free(&rbconn->list);
 586
 587                 kmem_cache_free(conncount_rb_cachep, rbconn);
 588         }
 589 }
 590
 591 void rpl_nf_conncount_destroy(struct net *net, unsigned int family,
 592                               struct nf_conncount_data *data)
 593 {
 594         unsigned int i;
 595
 596         cancel_work_sync(&data->gc_work);
 597         nf_ct_netns_put(net, family);
 598
 599         for (i = 0; i < ARRAY_SIZE(data->root); ++i)
 600                 destroy_tree(&data->root[i]);
 601
 602         kfree(data);
 603 }
 604 EXPORT_SYMBOL_GPL(rpl_nf_conncount_destroy);
 605
 606 int rpl_nf_conncount_modinit(void)
 607 {
 608         int i;
 609
 610         BUILD_BUG_ON(CONNCOUNT_LOCK_SLOTS > CONNCOUNT_SLOTS);
 611         BUILD_BUG_ON((CONNCOUNT_SLOTS % CONNCOUNT_LOCK_SLOTS) != 0);
 612
 613         for (i = 0; i < CONNCOUNT_LOCK_SLOTS; ++i)
 614                 spin_lock_init(&nf_conncount_locks[i]);
 615
 616         conncount_conn_cachep = kmem_cache_create("nf_conncount_tuple",
 617                                            sizeof(struct nf_conncount_tuple),
 618                                            0, 0, NULL);
 619         if (!conncount_conn_cachep)
 620                 return -ENOMEM;
 621
 622         conncount_rb_cachep = kmem_cache_create("nf_conncount_rb",
 623                                            sizeof(struct nf_conncount_rb),
 624                                            0, 0, NULL);
 625         if (!conncount_rb_cachep) {
 626                 kmem_cache_destroy(conncount_conn_cachep);
 627                 return -ENOMEM;
 628         }
 629
 630         return 0;
 631 }
 632
 633 void rpl_nf_conncount_modexit(void)
 634 {
 635         kmem_cache_destroy(conncount_conn_cachep);
 636         kmem_cache_destroy(conncount_rb_cachep);
 637 }