net/netfilter/nf_conntrack_core.c

   1 /* Connection state tracking for netfilter.  This is separated from,
   2    but required by, the NAT layer; it can also be used by an iptables
   3    extension. */
   4
   5 /* (C) 1999-2001 Paul `Rusty' Russell
   6  * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
   7  * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
   8  * (C) 2005-2012 Patrick McHardy <kaber@trash.net>
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License version 2 as
  12  * published by the Free Software Foundation.
  13  */
  14
  15 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  16
  17 #include <linux/types.h>
  18 #include <linux/netfilter.h>
  19 #include <linux/module.h>
  20 #include <linux/sched.h>
  21 #include <linux/skbuff.h>
  22 #include <linux/proc_fs.h>
  23 #include <linux/vmalloc.h>
  24 #include <linux/stddef.h>
  25 #include <linux/slab.h>
  26 #include <linux/random.h>
  27 #include <linux/jhash.h>
  28 #include <linux/err.h>
  29 #include <linux/percpu.h>
  30 #include <linux/moduleparam.h>
  31 #include <linux/notifier.h>
  32 #include <linux/kernel.h>
  33 #include <linux/netdevice.h>
  34 #include <linux/socket.h>
  35 #include <linux/mm.h>
  36 #include <linux/nsproxy.h>
  37 #include <linux/rculist_nulls.h>
  38
  39 #include <net/netfilter/nf_conntrack.h>
  40 #include <net/netfilter/nf_conntrack_l3proto.h>
  41 #include <net/netfilter/nf_conntrack_l4proto.h>
  42 #include <net/netfilter/nf_conntrack_expect.h>
  43 #include <net/netfilter/nf_conntrack_helper.h>
  44 #include <net/netfilter/nf_conntrack_seqadj.h>
  45 #include <net/netfilter/nf_conntrack_core.h>
  46 #include <net/netfilter/nf_conntrack_extend.h>
  47 #include <net/netfilter/nf_conntrack_acct.h>
  48 #include <net/netfilter/nf_conntrack_ecache.h>
  49 #include <net/netfilter/nf_conntrack_zones.h>
  50 #include <net/netfilter/nf_conntrack_timestamp.h>
  51 #include <net/netfilter/nf_conntrack_timeout.h>
  52 #include <net/netfilter/nf_conntrack_labels.h>
  53 #include <net/netfilter/nf_conntrack_synproxy.h>
  54 #include <net/netfilter/nf_nat.h>
  55 #include <net/netfilter/nf_nat_core.h>
  56 #include <net/netfilter/nf_nat_helper.h>
  57 #include <net/netns/hash.h>
  58
  59 #include "nf_internals.h"
  60
  61 #define NF_CONNTRACK_VERSION    "0.5.0"
  62
/* Exported function-pointer hook.  Nothing in this part of the file assigns
 * it, so it starts out NULL.
 * NOTE(review): judging by the name and signature it parses a netlink NAT
 * setup attribute for @ct; presumably the NAT code installs it -- confirm
 * against the nf_nat sources.
 */
int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
                                      enum nf_nat_manip_type manip,
                                      const struct nlattr *attr) __read_mostly;
EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook);
  67
/* Array of CONNTRACK_LOCKS spinlocks.  The locking helpers in this file
 * index it with a bucket hash reduced modulo CONNTRACK_LOCKS.
 */
__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
EXPORT_SYMBOL_GPL(nf_conntrack_locks);

/* NOTE(review): by its name this protects expectation state; it is not
 * taken anywhere in this part of the file.
 */
__cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);

/* The conntrack hash table: an array of hlist_nulls bucket heads. */
struct hlist_nulls_head *nf_conntrack_hash __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_hash);
  76
/* State carried by the conntrack garbage-collection work item.
 * NOTE(review): the worker that reads these fields is not visible in this
 * part of the file; the per-field notes below are inferred from the names
 * and should be confirmed against the worker.
 */
struct conntrack_gc_work {
        struct delayed_work     dwork;          /* the delayed work item */
        u32                     last_bucket;    /* presumably: bucket where the last scan stopped */
        bool                    exiting;        /* presumably: set to stop rescheduling */
        bool                    early_drop;     /* presumably: request for a more aggressive run */
        long                    next_gc_run;    /* presumably: delay until the next run */
};
  84
  85 static __read_mostly struct kmem_cache *nf_conntrack_cachep;
  86 static __read_mostly spinlock_t nf_conntrack_locks_all_lock;
  87 static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
  88 static __read_mostly bool nf_conntrack_locks_all;
  89
/* every gc cycle scans at most 1/GC_MAX_BUCKETS_DIV part of table */
#define GC_MAX_BUCKETS_DIV      128u
/* upper bound, in jiffies, of a full table scan (16 seconds) */
#define GC_MAX_SCAN_JIFFIES     (16u * HZ)
/* desired ratio of entries found to be expired */
#define GC_EVICT_RATIO  50u

/* The single file-scope instance of the gc work state. */
static struct conntrack_gc_work conntrack_gc_work;
  98
  99 void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
 100 {
 101         /* 1) Acquire the lock */
 102         spin_lock(lock);
 103
 104         /* 2) read nf_conntrack_locks_all, with ACQUIRE semantics
 105          * It pairs with the smp_store_release() in nf_conntrack_all_unlock()
 106          */
 107         if (likely(smp_load_acquire(&nf_conntrack_locks_all) == false))
 108                 return;
 109
 110         /* fast path failed, unlock */
 111         spin_unlock(lock);
 112
 113         /* Slow path 1) get global lock */
 114         spin_lock(&nf_conntrack_locks_all_lock);
 115
 116         /* Slow path 2) get the lock we want */
 117         spin_lock(lock);
 118
 119         /* Slow path 3) release the global lock */
 120         spin_unlock(&nf_conntrack_locks_all_lock);
 121 }
 122 EXPORT_SYMBOL_GPL(nf_conntrack_lock);
 123
 124 static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
 125 {
 126         h1 %= CONNTRACK_LOCKS;
 127         h2 %= CONNTRACK_LOCKS;
 128         spin_unlock(&nf_conntrack_locks[h1]);
 129         if (h1 != h2)
 130                 spin_unlock(&nf_conntrack_locks[h2]);
 131 }
 132
 133 /* return true if we need to recompute hashes (in case hash table was resized) */
 134 static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
 135                                      unsigned int h2, unsigned int sequence)
 136 {
 137         h1 %= CONNTRACK_LOCKS;
 138         h2 %= CONNTRACK_LOCKS;
 139         if (h1 <= h2) {
 140                 nf_conntrack_lock(&nf_conntrack_locks[h1]);
 141                 if (h1 != h2)
 142                         spin_lock_nested(&nf_conntrack_locks[h2],
 143                                          SINGLE_DEPTH_NESTING);
 144         } else {
 145                 nf_conntrack_lock(&nf_conntrack_locks[h2]);
 146                 spin_lock_nested(&nf_conntrack_locks[h1],
 147                                  SINGLE_DEPTH_NESTING);
 148         }
 149         if (read_seqcount_retry(&nf_conntrack_generation, sequence)) {
 150                 nf_conntrack_double_unlock(h1, h2);
 151                 return true;
 152         }
 153         return false;
 154 }
 155
 156 static void nf_conntrack_all_lock(void)
 157 {
 158         int i;
 159
 160         spin_lock(&nf_conntrack_locks_all_lock);
 161
 162         nf_conntrack_locks_all = true;
 163
 164         for (i = 0; i < CONNTRACK_LOCKS; i++) {
 165                 spin_lock(&nf_conntrack_locks[i]);
 166
 167                 /* This spin_unlock provides the "release" to ensure that
 168                  * nf_conntrack_locks_all==true is visible to everyone that
 169                  * acquired spin_lock(&nf_conntrack_locks[]).
 170                  */
 171                 spin_unlock(&nf_conntrack_locks[i]);
 172         }
 173 }
 174
/* Undo nf_conntrack_all_lock(): clear nf_conntrack_locks_all and release
 * nf_conntrack_locks_all_lock.
 */
static void nf_conntrack_all_unlock(void)
{
        /* All prior stores must be complete before we clear
         * 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock()
         * might observe the false value but not the entire
         * critical section.
         * It pairs with the smp_load_acquire() in nf_conntrack_lock()
         */
        smp_store_release(&nf_conntrack_locks_all, false);
        /* Lets any nf_conntrack_lock() slow-path waiter proceed. */
        spin_unlock(&nf_conntrack_locks_all_lock);
}
 186
/* Number of buckets used by scale_hash() to reduce a raw hash. */
unsigned int nf_conntrack_htable_size __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);

/* NOTE(review): by its name the upper limit on tracked connections; it is
 * not read in this part of the file.
 */
unsigned int nf_conntrack_max __read_mostly;
/* Sequence counter sampled before hashing and rechecked in
 * nf_conntrack_double_lock() to detect a hash table resize.
 */
seqcount_t nf_conntrack_generation __read_mostly;
/* Random hash seed, filled in once by hash_conntrack_raw(). */
static unsigned int nf_conntrack_hash_rnd __read_mostly;
 193
 194 static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
 195                               const struct net *net)
 196 {
 197         unsigned int n;
 198         u32 seed;
 199
 200         get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd));
 201
 202         /* The direction must be ignored, so we hash everything up to the
 203          * destination ports (which is a multiple of 4) and treat the last
 204          * three bytes manually.
 205          */
 206         seed = nf_conntrack_hash_rnd ^ net_hash_mix(net);
 207         n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
 208         return jhash2((u32 *)tuple, n, seed ^
 209                       (((__force __u16)tuple->dst.u.all << 16) |
 210                       tuple->dst.protonum));
 211 }
 212
 213 static u32 scale_hash(u32 hash)
 214 {
 215         return reciprocal_scale(hash, nf_conntrack_htable_size);
 216 }
 217
 218 static u32 __hash_conntrack(const struct net *net,
 219                             const struct nf_conntrack_tuple *tuple,
 220                             unsigned int size)
 221 {
 222         return reciprocal_scale(hash_conntrack_raw(tuple, net), size);
 223 }
 224
 225 static u32 hash_conntrack(const struct net *net,
 226                           const struct nf_conntrack_tuple *tuple)
 227 {
 228         return scale_hash(hash_conntrack_raw(tuple, net));
 229 }
 230
 231 bool
 232 nf_ct_get_tuple(const struct sk_buff *skb,
 233                 unsigned int nhoff,
 234                 unsigned int dataoff,
 235                 u_int16_t l3num,
 236                 u_int8_t protonum,
 237                 struct net *net,
 238                 struct nf_conntrack_tuple *tuple,
 239                 const struct nf_conntrack_l3proto *l3proto,
 240                 const struct nf_conntrack_l4proto *l4proto)
 241 {
 242         memset(tuple, 0, sizeof(*tuple));
 243
 244         tuple->src.l3num = l3num;
 245         if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
 246                 return false;
 247
 248         tuple->dst.protonum = protonum;
 249         tuple->dst.dir = IP_CT_DIR_ORIGINAL;
 250
 251         return l4proto->pkt_to_tuple(skb, dataoff, net, tuple);
 252 }
 253 EXPORT_SYMBOL_GPL(nf_ct_get_tuple);
 254
 255 bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
 256                        u_int16_t l3num,
 257                        struct net *net, struct nf_conntrack_tuple *tuple)
 258 {
 259         const struct nf_conntrack_l3proto *l3proto;
 260         const struct nf_conntrack_l4proto *l4proto;
 261         unsigned int protoff;
 262         u_int8_t protonum;
 263         int ret;
 264
 265         rcu_read_lock();
 266
 267         l3proto = __nf_ct_l3proto_find(l3num);
 268         ret = l3proto->get_l4proto(skb, nhoff, &protoff, &protonum);
 269         if (ret != NF_ACCEPT) {
 270                 rcu_read_unlock();
 271                 return false;
 272         }
 273
 274         l4proto = __nf_ct_l4proto_find(l3num, protonum);
 275
 276         ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple,
 277                               l3proto, l4proto);
 278
 279         rcu_read_unlock();
 280         return ret;
 281 }
 282 EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);
 283
 284 bool
 285 nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
 286                    const struct nf_conntrack_tuple *orig,
 287                    const struct nf_conntrack_l3proto *l3proto,
 288                    const struct nf_conntrack_l4proto *l4proto)
 289 {
 290         memset(inverse, 0, sizeof(*inverse));
 291
 292         inverse->src.l3num = orig->src.l3num;
 293         if (l3proto->invert_tuple(inverse, orig) == 0)
 294                 return false;
 295
 296         inverse->dst.dir = !orig->dst.dir;
 297
 298         inverse->dst.protonum = orig->dst.protonum;
 299         return l4proto->invert_tuple(inverse, orig);
 300 }
 301 EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);
 302
 303 static void
 304 clean_from_lists(struct nf_conn *ct)
 305 {
 306         pr_debug("clean_from_lists(%p)\n", ct);
 307         hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
 308         hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);
 309
 310         /* Destroy all pending expectations */
 311         nf_ct_remove_expectations(ct);
 312 }
 313
 314 /* must be called with local_bh_disable */
 315 static void nf_ct_add_to_dying_list(struct nf_conn *ct)
 316 {
 317         struct ct_pcpu *pcpu;
 318
 319         /* add this conntrack to the (per cpu) dying list */
 320         ct->cpu = smp_processor_id();
 321         pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
 322
 323         spin_lock(&pcpu->lock);
 324         hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
 325                              &pcpu->dying);
 326         spin_unlock(&pcpu->lock);
 327 }
 328
 329 /* must be called with local_bh_disable */
 330 static void nf_ct_add_to_unconfirmed_list(struct nf_conn *ct)
 331 {
 332         struct ct_pcpu *pcpu;
 333
 334         /* add this conntrack to the (per cpu) unconfirmed list */
 335         ct->cpu = smp_processor_id();
 336         pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
 337
 338         spin_lock(&pcpu->lock);
 339         hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
 340                              &pcpu->unconfirmed);
 341         spin_unlock(&pcpu->lock);
 342 }
 343
 344 /* must be called with local_bh_disable */
 345 static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct)
 346 {
 347         struct ct_pcpu *pcpu;
 348
 349         /* We overload first tuple to link into unconfirmed or dying list.*/
 350         pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
 351
 352         spin_lock(&pcpu->lock);
 353         BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
 354         hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
 355         spin_unlock(&pcpu->lock);
 356 }
 357
 358 #define NFCT_ALIGN(len) (((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK)
 359
 360 /* Released via destroy_conntrack() */
 361 struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
 362                                  const struct nf_conntrack_zone *zone,
 363                                  gfp_t flags)
 364 {
 365         struct nf_conn *tmpl, *p;
 366
 367         if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) {
 368                 tmpl = kzalloc(sizeof(*tmpl) + NFCT_INFOMASK, flags);
 369                 if (!tmpl)
 370                         return NULL;
 371
 372                 p = tmpl;
 373                 tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
 374                 if (tmpl != p) {
 375                         tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
 376                         tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p;
 377                 }
 378         } else {
 379                 tmpl = kzalloc(sizeof(*tmpl), flags);
 380                 if (!tmpl)
 381                         return NULL;
 382         }
 383
 384         tmpl->status = IPS_TEMPLATE;
 385         write_pnet(&tmpl->ct_net, net);
 386         nf_ct_zone_add(tmpl, zone);
 387         atomic_set(&tmpl->ct_general.use, 0);
 388
 389         return tmpl;
 390 }
 391 EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);
 392
 393 void nf_ct_tmpl_free(struct nf_conn *tmpl)
 394 {
 395         nf_ct_ext_destroy(tmpl);
 396         nf_ct_ext_free(tmpl);
 397
 398         if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK)
 399                 kfree((char *)tmpl - tmpl->proto.tmpl_padto);
 400         else
 401                 kfree(tmpl);
 402 }
 403 EXPORT_SYMBOL_GPL(nf_ct_tmpl_free);
 404
/* Final teardown of a conntrack object; warns if the use count is not 0.
 *
 * Templates are simply freed.  For a real conntrack: run the layer 4
 * destroy callback if there is one, remove remaining expectations, unlink
 * the entry from its per-cpu dying/unconfirmed list, drop the reference on
 * the master conntrack (if any) and free the object.
 */
static void
destroy_conntrack(struct nf_conntrack *nfct)
{
        struct nf_conn *ct = (struct nf_conn *)nfct;
        const struct nf_conntrack_l4proto *l4proto;

        pr_debug("destroy_conntrack(%p)\n", ct);
        WARN_ON(atomic_read(&nfct->use) != 0);

        /* Templates go straight to nf_ct_tmpl_free(); none of the
         * teardown below is applied to them.
         */
        if (unlikely(nf_ct_is_template(ct))) {
                nf_ct_tmpl_free(ct);
                return;
        }
        l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
        if (l4proto->destroy)
                l4proto->destroy(ct);

        local_bh_disable();
        /* Expectations will have been removed in clean_from_lists,
         * except TFTP can create an expectation on the first packet,
         * before connection is in the list, so we need to clean here,
         * too.
         */
        nf_ct_remove_expectations(ct);

        nf_ct_del_from_dying_or_unconfirmed_list(ct);

        local_bh_enable();

        /* Drop the reference this conntrack holds on its master. */
        if (ct->master)
                nf_ct_put(ct->master);

        pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct);
        nf_conntrack_free(ct);
}
 440
 441 static void nf_ct_delete_from_lists(struct nf_conn *ct)
 442 {
 443         struct net *net = nf_ct_net(ct);
 444         unsigned int hash, reply_hash;
 445         unsigned int sequence;
 446
 447         nf_ct_helper_destroy(ct);
 448
 449         local_bh_disable();
 450         do {
 451                 sequence = read_seqcount_begin(&nf_conntrack_generation);
 452                 hash = hash_conntrack(net,
 453                                       &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
 454                 reply_hash = hash_conntrack(net,
 455                                            &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 456         } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
 457
 458         clean_from_lists(ct);
 459         nf_conntrack_double_unlock(hash, reply_hash);
 460
 461         nf_ct_add_to_dying_list(ct);
 462
 463         local_bh_enable();
 464 }
 465
/* Mark @ct as dying, report its destruction and unhash it.
 *
 * @portid and @report are passed through to the IPCT_DESTROY event report.
 *
 * Returns false if the DYING bit was already set (nothing else is done),
 * or if the destroy event could not be delivered (the entry is still
 * unhashed, but its reference is left for the event cache worker).
 * Returns true once the entry has been unhashed and nf_ct_put() called.
 */
bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
{
        struct nf_conn_tstamp *tstamp;

        /* Only the caller that sets the bit gets to do the deletion. */
        if (test_and_set_bit(IPS_DYING_BIT, &ct->status))
                return false;

        /* Record the stop time once, if the timestamp extension is present. */
        tstamp = nf_conn_tstamp_find(ct);
        if (tstamp && tstamp->stop == 0)
                tstamp->stop = ktime_get_real_ns();

        if (nf_conntrack_event_report(IPCT_DESTROY, ct,
                                    portid, report) < 0) {
                /* destroy event was not delivered. nf_ct_put will
                 * be done by event cache worker on redelivery.
                 */
                nf_ct_delete_from_lists(ct);
                nf_conntrack_ecache_delayed_work(nf_ct_net(ct));
                return false;
        }

        nf_conntrack_ecache_work(nf_ct_net(ct));
        nf_ct_delete_from_lists(ct);
        nf_ct_put(ct);
        return true;
}
EXPORT_SYMBOL_GPL(nf_ct_delete);
 493
 494 static inline bool
 495 nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
 496                 const struct nf_conntrack_tuple *tuple,
 497                 const struct nf_conntrack_zone *zone,
 498                 const struct net *net)
 499 {
 500         struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
 501
 502         /* A conntrack can be recreated with the equal tuple,
 503          * so we need to check that the conntrack is confirmed
 504          */
 505         return nf_ct_tuple_equal(tuple, &h->tuple) &&
 506                nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) &&
 507                nf_ct_is_confirmed(ct) &&
 508                net_eq(net, nf_ct_net(ct));
 509 }
 510
 511 static inline bool
 512 nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2)
 513 {
 514         return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
 515                                  &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
 516                nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple,
 517                                  &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) &&
 518                nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) &&
 519                nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) &&
 520                net_eq(nf_ct_net(ct1), nf_ct_net(ct2));
 521 }
 522
 523 /* caller must hold rcu readlock and none of the nf_conntrack_locks */
 524 static void nf_ct_gc_expired(struct nf_conn *ct)
 525 {
 526         if (!atomic_inc_not_zero(&ct->ct_general.use))
 527                 return;
 528
 529         if (nf_ct_should_gc(ct))
 530                 nf_ct_kill(ct);
 531
 532         nf_ct_put(ct);
 533 }
 534
/*
 * Lockless lookup of @tuple in zone @zone of @net.  @hash is the raw
 * (unscaled) tuple hash; it is scaled here to the size of the table
 * returned by nf_conntrack_get_ht().
 *
 * Returns the matching hash entry, or NULL if there is none.  Expired
 * entries met on the way are handed to nf_ct_gc_expired(); expired and
 * dying entries are never returned.
 *
 * Warning :
 * - Caller must take a reference on returned object
 *   and recheck nf_ct_tuple_equal(tuple, &h->tuple)
 */
static struct nf_conntrack_tuple_hash *
____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
                      const struct nf_conntrack_tuple *tuple, u32 hash)
{
        struct nf_conntrack_tuple_hash *h;
        struct hlist_nulls_head *ct_hash;
        struct hlist_nulls_node *n;
        unsigned int bucket, hsize;

begin:
        /* Re-read table and size on every (re)start of the walk. */
        nf_conntrack_get_ht(&ct_hash, &hsize);
        bucket = reciprocal_scale(hash, hsize);

        hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) {
                struct nf_conn *ct;

                ct = nf_ct_tuplehash_to_ctrack(h);
                if (nf_ct_is_expired(ct)) {
                        nf_ct_gc_expired(ct);
                        continue;
                }

                if (nf_ct_is_dying(ct))
                        continue;

                if (nf_ct_key_equal(h, tuple, zone, net))
                        return h;
        }
        /*
         * if the nulls value we got at the end of this lookup is
         * not the expected one, we must restart lookup.
         * We probably met an item that was moved to another chain.
         */
        if (get_nulls_value(n) != bucket) {
                NF_CT_STAT_INC_ATOMIC(net, search_restart);
                goto begin;
        }

        return NULL;
}
 580
/* Find a connection corresponding to a tuple.
 *
 * Runs the lockless lookup under rcu_read_lock() and takes a reference on
 * the entry found.  Returns NULL if nothing matches, or if the match is
 * dying or its use count has already dropped to zero.
 */
static struct nf_conntrack_tuple_hash *
__nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
                        const struct nf_conntrack_tuple *tuple, u32 hash)
{
        struct nf_conntrack_tuple_hash *h;
        struct nf_conn *ct;

        rcu_read_lock();
begin:
        h = ____nf_conntrack_find(net, zone, tuple, hash);
        if (h) {
                ct = nf_ct_tuplehash_to_ctrack(h);
                if (unlikely(nf_ct_is_dying(ct) ||
                             !atomic_inc_not_zero(&ct->ct_general.use)))
                        h = NULL;
                else {
                        /* Recheck the key now that a reference is held;
                         * on mismatch drop the reference and search again.
                         */
                        if (unlikely(!nf_ct_key_equal(h, tuple, zone, net))) {
                                nf_ct_put(ct);
                                goto begin;
                        }
                }
        }
        rcu_read_unlock();

        return h;
}
 608
 609 struct nf_conntrack_tuple_hash *
 610 nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
 611                       const struct nf_conntrack_tuple *tuple)
 612 {
 613         return __nf_conntrack_find_get(net, zone, tuple,
 614                                        hash_conntrack_raw(tuple, net));
 615 }
 616 EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
 617
 618 static void __nf_conntrack_hash_insert(struct nf_conn *ct,
 619                                        unsigned int hash,
 620                                        unsigned int reply_hash)
 621 {
 622         hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
 623                            &nf_conntrack_hash[hash]);
 624         hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
 625                            &nf_conntrack_hash[reply_hash]);
 626 }
 627
/* Insert @ct into the hash table unless an equal entry already exists.
 *
 * Both buckets are locked and both chains are searched for an entry whose
 * key equals @ct's original or reply tuple.  If none is found the use
 * count is set to 2 and @ct is hashed.
 *
 * Returns 0 on success, -EEXIST if a matching entry was found.
 */
int
nf_conntrack_hash_check_insert(struct nf_conn *ct)
{
        const struct nf_conntrack_zone *zone;
        struct net *net = nf_ct_net(ct);
        unsigned int hash, reply_hash;
        struct nf_conntrack_tuple_hash *h;
        struct hlist_nulls_node *n;
        unsigned int sequence;

        zone = nf_ct_zone(ct);

        local_bh_disable();
        /* Retry with fresh hashes whenever the table generation changed. */
        do {
                sequence = read_seqcount_begin(&nf_conntrack_generation);
                hash = hash_conntrack(net,
                                      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
                reply_hash = hash_conntrack(net,
                                           &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
        } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

        /* See if there's one in the list already, including reverse */
        hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
                if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
                                    zone, net))
                        goto out;

        hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
                if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
                                    zone, net))
                        goto out;

        smp_wmb();
        /* The caller holds a reference to this object */
        atomic_set(&ct->ct_general.use, 2);
        __nf_conntrack_hash_insert(ct, hash, reply_hash);
        nf_conntrack_double_unlock(hash, reply_hash);
        NF_CT_STAT_INC(net, insert);
        local_bh_enable();
        return 0;

out:
        /* Duplicate found: count the failure and leave @ct unhashed. */
        nf_conntrack_double_unlock(hash, reply_hash);
        NF_CT_STAT_INC(net, insert_failed);
        local_bh_enable();
        return -EEXIST;
}
EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
 676
 677 static inline void nf_ct_acct_update(struct nf_conn *ct,
 678                                      enum ip_conntrack_info ctinfo,
 679                                      unsigned int len)
 680 {
 681         struct nf_conn_acct *acct;
 682
 683         acct = nf_conn_acct_find(ct);
 684         if (acct) {
 685                 struct nf_conn_counter *counter = acct->counter;
 686
 687                 atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets);
 688                 atomic64_add(len, &counter[CTINFO2DIR(ctinfo)].bytes);
 689         }
 690 }
 691
 692 static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
 693                              const struct nf_conn *loser_ct)
 694 {
 695         struct nf_conn_acct *acct;
 696
 697         acct = nf_conn_acct_find(loser_ct);
 698         if (acct) {
 699                 struct nf_conn_counter *counter = acct->counter;
 700                 unsigned int bytes;
 701
 702                 /* u32 should be fine since we must have seen one packet. */
 703                 bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes);
 704                 nf_ct_acct_update(ct, ctinfo, bytes);
 705         }
 706 }
 707
 708 /* Resolve race on insertion if this protocol allows this. */
 709 static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
 710                                enum ip_conntrack_info ctinfo,
 711                                struct nf_conntrack_tuple_hash *h)
 712 {
 713         /* This is the conntrack entry already in hashes that won race. */
 714         struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
 715         const struct nf_conntrack_l4proto *l4proto;
 716         enum ip_conntrack_info oldinfo;
 717         struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
 718
 719         l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
 720         if (l4proto->allow_clash &&
 721             !nf_ct_is_dying(ct) &&
 722             atomic_inc_not_zero(&ct->ct_general.use)) {
 723                 if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
 724                     nf_ct_match(ct, loser_ct)) {
 725                         nf_ct_acct_merge(ct, ctinfo, loser_ct);
 726                         nf_conntrack_put(&loser_ct->ct_general);
 727                         nf_ct_set(skb, ct, oldinfo);
 728                         return NF_ACCEPT;
 729                 }
 730                 nf_ct_put(ct);
 731         }
 732         NF_CT_STAT_INC(net, drop);
 733         return NF_DROP;
 734 }
 735
/* Confirm a connection given skb; places it in hash table
 *
 * Returns NF_ACCEPT when the packet is not in the original direction
 * (nothing is done), when the conntrack was inserted, or when a clash with
 * an existing entry could be resolved.  Returns NF_DROP when the conntrack
 * was already confirmed, is dying, or lost an unresolvable clash.
 */
int
__nf_conntrack_confirm(struct sk_buff *skb)
{
        const struct nf_conntrack_zone *zone;
        unsigned int hash, reply_hash;
        struct nf_conntrack_tuple_hash *h;
        struct nf_conn *ct;
        struct nf_conn_help *help;
        struct nf_conn_tstamp *tstamp;
        struct hlist_nulls_node *n;
        enum ip_conntrack_info ctinfo;
        struct net *net;
        unsigned int sequence;
        int ret = NF_DROP;

        ct = nf_ct_get(skb, &ctinfo);
        net = nf_ct_net(ct);

        /* ipt_REJECT uses nf_conntrack_attach to attach related
           ICMP/TCP RST packets in other direction.  Actual packet
           which created connection will be IP_CT_NEW or for an
           expected connection, IP_CT_RELATED. */
        if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
                return NF_ACCEPT;

        zone = nf_ct_zone(ct);
        local_bh_disable();

        /* Retry with fresh bucket indexes whenever the table generation
         * changed while taking the locks.
         */
        do {
                sequence = read_seqcount_begin(&nf_conntrack_generation);
                /* reuse the hash saved before */
                hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
                hash = scale_hash(hash);
                reply_hash = hash_conntrack(net,
                                           &ct->tuplehash[IP_CT_DIR_REPLY].tuple);

        } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

        /* We're not in hash table, and we refuse to set up related
         * connections for unconfirmed conns.  But packet copies and
         * REJECT will give spurious warnings here.
         */

        /* Another skb with the same unconfirmed conntrack may
         * win the race. This may happen for bridge(br_flood)
         * or broadcast/multicast packets do skb_clone with
         * unconfirmed conntrack.
         */
        if (unlikely(nf_ct_is_confirmed(ct))) {
                WARN_ON_ONCE(1);
                nf_conntrack_double_unlock(hash, reply_hash);
                local_bh_enable();
                return NF_DROP;
        }

        pr_debug("Confirming conntrack %p\n", ct);
        /* We have to check the DYING flag after unlink to prevent
         * a race against nf_ct_get_next_corpse() possibly called from
         * user context, else we insert an already 'dead' hash, blocking
         * further use of that particular connection -JM.
         */
        nf_ct_del_from_dying_or_unconfirmed_list(ct);

        if (unlikely(nf_ct_is_dying(ct))) {
                nf_ct_add_to_dying_list(ct);
                goto dying;
        }

        /* See if there's one in the list already, including reverse:
           NAT could have grabbed it without realizing, since we're
           not in the hash.  If there is, we lost race. */
        hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
                if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
                                    zone, net))
                        goto out;

        hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
                if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
                                    zone, net))
                        goto out;

        /* Timer relative to confirmation time, not original
           setting time, otherwise we'd get timer wrap in
           weird delay cases. */
        ct->timeout += nfct_time_stamp;
        /* Extra reference taken before the entry is hashed. */
        atomic_inc(&ct->ct_general.use);
        ct->status |= IPS_CONFIRMED;

        /* set conntrack timestamp, if enabled. */
        tstamp = nf_conn_tstamp_find(ct);
        if (tstamp) {
                if (skb->tstamp == 0)
                        __net_timestamp(skb);

                tstamp->start = ktime_to_ns(skb->tstamp);
        }
        /* Since the lookup is lockless, hash insertion must be done after
         * starting the timer and setting the CONFIRMED bit. The RCU barriers
         * guarantee that no other CPU can find the conntrack before the above
         * stores are visible.
         */
        __nf_conntrack_hash_insert(ct, hash, reply_hash);
        nf_conntrack_double_unlock(hash, reply_hash);
        local_bh_enable();

        /* Events are cached only after the locks have been dropped. */
        help = nfct_help(ct);
        if (help && help->helper)
                nf_conntrack_event_cache(IPCT_HELPER, ct);

        nf_conntrack_event_cache(master_ct(ct) ?
                                 IPCT_RELATED : IPCT_NEW, ct);
        return NF_ACCEPT;

out:
        /* Lost the race: park our conntrack on the dying list and try to
         * continue with the entry that is already hashed.
         */
        nf_ct_add_to_dying_list(ct);
        ret = nf_ct_resolve_clash(net, skb, ctinfo, h);
dying:
        nf_conntrack_double_unlock(hash, reply_hash);
        NF_CT_STAT_INC(net, insert_failed);
        local_bh_enable();
        return ret;
}
EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
 860
 861 /* Returns 1 if a connection other than @ignored_conntrack already
 862  * corresponds to @tuple (required for NAT), 0 otherwise.
      *
      * The lookup uses the netns and zone of @ignored_conntrack and runs under
      * rcu_read_lock() only.  Expired entries met on the way are handed to
      * nf_ct_gc_expired() and skipped; a reported hit bumps the "found" stat.
      */
 863 int
 864 nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
 865                          const struct nf_conn *ignored_conntrack)
 866 {
 867         struct net *net = nf_ct_net(ignored_conntrack);
 868         const struct nf_conntrack_zone *zone;
 869         struct nf_conntrack_tuple_hash *h;
 870         struct hlist_nulls_head *ct_hash;
 871         unsigned int hash, hsize;
 872         struct hlist_nulls_node *n;
 873         struct nf_conn *ct;
 874
 875         zone = nf_ct_zone(ignored_conntrack);
 876
 877         rcu_read_lock();
 878  begin:
             /* Table pointer, size and bucket are recomputed on every (re)start. */
 879         nf_conntrack_get_ht(&ct_hash, &hsize);
 880         hash = __hash_conntrack(net, tuple, hsize);
 881
 882         hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
 883                 ct = nf_ct_tuplehash_to_ctrack(h);
 884
 885                 if (ct == ignored_conntrack)
 886                         continue;
 887
 888                 if (nf_ct_is_expired(ct)) {
 889                         nf_ct_gc_expired(ct);
 890                         continue;
 891                 }
 892
 893                 if (nf_ct_key_equal(h, tuple, zone, net)) {
 894                         /* Tuple is taken already, so caller will need to find
 895                          * a new source port to use.
 896                          *
 897                          * Only exception:
 898                          * If the *original tuples* are identical, then both
 899                          * conntracks refer to the same flow.
 900                          * This is a rare situation, it can occur e.g. when
 901                          * more than one UDP packet is sent from same socket
 902                          * in different threads.
 903                          *
 904                          * Let nf_ct_resolve_clash() deal with this later.
 905                          */
 906                         if (nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
 907                                               &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple))
 908                                 continue;
 909
 910                         NF_CT_STAT_INC_ATOMIC(net, found);
 911                         rcu_read_unlock();
 912                         return 1;
 913                 }
 914         }
 915
             /* The nulls marker that ended the walk does not carry the bucket
              * we started in: presumably an entry was moved to another chain
              * while we walked it, so count a restart and look up again.
              */
 916         if (get_nulls_value(n) != hash) {
 917                 NF_CT_STAT_INC_ATOMIC(net, search_restart);
 918                 goto begin;
 919         }
 920
 921         rcu_read_unlock();
 922
 923         return 0;
 924 }
 925 EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);
 926
 927 #define NF_CT_EVICTION_RANGE    8       /* max buckets early_drop() scans */
 928
 929 /* Walk one hash chain and delete the entries that may be evicted early.
 930  *
      * Called by early_drop() under rcu_read_lock().  Expired entries are
      * passed to nf_ct_gc_expired().  Entries that are assured, belong to a
      * netns other than @net, or are dying are left alone.  For the rest a
      * reference is taken, netns and confirmed state are rechecked, and
      * nf_ct_delete() is attempted.
      *
      * There's a small race here where we may free a just-assured
      * connection.  Too bad: we're in trouble anyway.
      *
      * Returns the number of entries for which nf_ct_delete() returned nonzero.
      */
 931 static unsigned int early_drop_list(struct net *net,
 932                                     struct hlist_nulls_head *head)
 933 {
 934         struct nf_conntrack_tuple_hash *h;
 935         struct hlist_nulls_node *n;
 936         unsigned int drops = 0;
 937         struct nf_conn *tmp;
 938
 939         hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
 940                 tmp = nf_ct_tuplehash_to_ctrack(h);
 941
 942                 if (nf_ct_is_expired(tmp)) {
 943                         nf_ct_gc_expired(tmp);
 944                         continue;
 945                 }
 946
 947                 if (test_bit(IPS_ASSURED_BIT, &tmp->status) ||
 948                     !net_eq(nf_ct_net(tmp), net) ||
 949                     nf_ct_is_dying(tmp))
 950                         continue;
 951
                     /* Refcount already zero: skip, nothing to pin. */
 952                 if (!atomic_inc_not_zero(&tmp->ct_general.use))
 953                         continue;
 954
 955                 /* kill only if still in same netns -- might have moved due to
 956                  * SLAB_TYPESAFE_BY_RCU rules.
 957                  *
 958                  * We steal the timer reference.  If that fails timer has
 959                  * already fired or someone else deleted it. Just drop ref
 960                  * and move to next entry.
 961                  */
 962                 if (net_eq(nf_ct_net(tmp), net) &&
 963                     nf_ct_is_confirmed(tmp) &&
 964                     nf_ct_delete(tmp, 0, 0))
 965                         drops++;
 966
 967                 nf_ct_put(tmp);
 968         }
 969
 970         return drops;
 971 }
 972
 /* Try to make room by evicting entries from up to NF_CT_EVICTION_RANGE
  * buckets: the one @hash scales to, then the following ones (wrapping at the
  * table size).
  *
  * The table pointer and size are re-read under rcu_read_lock() for every
  * bucket.  Returns true, after adding the drop count to the early_drop stat,
  * as soon as one bucket yields at least one drop; false if none did.
  */
 973 static noinline int early_drop(struct net *net, unsigned int hash)
 974 {
 975         unsigned int i, bucket;
 976
 977         for (i = 0; i < NF_CT_EVICTION_RANGE; i++) {
 978                 struct hlist_nulls_head *ct_hash;
 979                 unsigned int hsize, drops;
 980
 981                 rcu_read_lock();
 982                 nf_conntrack_get_ht(&ct_hash, &hsize);
                     /* First pass picks the start bucket, later passes step on. */
 983                 if (!i)
 984                         bucket = reciprocal_scale(hash, hsize);
 985                 else
 986                         bucket = (bucket + 1) % hsize;
 987
 988                 drops = early_drop_list(net, &ct_hash[bucket]);
 989                 rcu_read_unlock();
 990
 991                 if (drops) {
 992                         NF_CT_STAT_ADD_ATOMIC(net, early_drop, drops);
 993                         return true;
 994                 }
 995         }
 996
 997         return false;
 998 }
 999
/* True if the gc worker should leave @ct alone: it is not confirmed or it is
 * already dying.
 */
1000 static bool gc_worker_skip_ct(const struct nf_conn *ct)
1001 {
1002         return !nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct);
1003 }
1004
/* True if the gc worker may kill @ct: either it is not assured, or its l4
 * protocol tracker provides a can_early_drop() hook that returns true for it.
 */
1005 static bool gc_worker_can_early_drop(const struct nf_conn *ct)
1006 {
1007         const struct nf_conntrack_l4proto *l4proto;
1008
1009         if (!test_bit(IPS_ASSURED_BIT, &ct->status))
1010                 return true;
1011
             /* Assured entry: only the protocol tracker can allow the drop. */
1012         l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
1013         if (l4proto->can_early_drop && l4proto->can_early_drop(ct))
1014                 return true;
1015
1016         return false;
1017 }
1018
/* Delayed-work handler that scans part of the conntrack hash table.
 *
 * Starting after gc_work->last_bucket it visits
 * nf_conntrack_htable_size / GC_MAX_BUCKETS_DIV buckets (at least one, since
 * the loop is do/while) and hands expired entries to nf_ct_gc_expired().  If
 * gc_work->early_drop was set, confirmed non-dying entries whose netns count
 * has reached nf_conntrack_max95 are additionally killed when
 * gc_worker_can_early_drop() allows it.
 *
 * Unless gc_work->exiting is set, the work re-queues itself: the delay drops
 * to min_interval when more than GC_EVICT_RATIO percent of the scanned entries
 * had expired, and otherwise grows by min_interval up to
 * GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV.
 */
1019 static void gc_worker(struct work_struct *work)
1020 {
1021         unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u);
1022         unsigned int i, goal, buckets = 0, expired_count = 0;
1023         unsigned int nf_conntrack_max95 = 0;
1024         struct conntrack_gc_work *gc_work;
1025         unsigned int ratio, scanned = 0;
1026         unsigned long next_run;
1027
1028         gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
1029
1030         goal = nf_conntrack_htable_size / GC_MAX_BUCKETS_DIV;
1031         i = gc_work->last_bucket;
             /* Stays 0, which disables the early-drop part below, unless the
              * packet path asked for it.
              */
1032         if (gc_work->early_drop)
1033                 nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
1034
1035         do {
1036                 struct nf_conntrack_tuple_hash *h;
1037                 struct hlist_nulls_head *ct_hash;
1038                 struct hlist_nulls_node *n;
1039                 unsigned int hashsz;
1040                 struct nf_conn *tmp;
1041
1042                 i++;
1043                 rcu_read_lock();
1044
                     /* Table and size are re-read per bucket; wrap at the end. */
1045                 nf_conntrack_get_ht(&ct_hash, &hashsz);
1046                 if (i >= hashsz)
1047                         i = 0;
1048
1049                 hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
1050                         struct net *net;
1051
1052                         tmp = nf_ct_tuplehash_to_ctrack(h);
1053
1054                         scanned++;
1055                         if (nf_ct_is_expired(tmp)) {
1056                                 nf_ct_gc_expired(tmp);
1057                                 expired_count++;
1058                                 continue;
1059                         }
1060
1061                         if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
1062                                 continue;
1063
1064                         net = nf_ct_net(tmp);
1065                         if (atomic_read(&net->ct.count) < nf_conntrack_max95)
1066                                 continue;
1067
1068                         /* need to take reference to avoid possible races */
1069                         if (!atomic_inc_not_zero(&tmp->ct_general.use))
1070                                 continue;
1071
                             /* Recheck now that the entry is pinned. */
1072                         if (gc_worker_skip_ct(tmp)) {
1073                                 nf_ct_put(tmp);
1074                                 continue;
1075                         }
1076
1077                         if (gc_worker_can_early_drop(tmp))
1078                                 nf_ct_kill(tmp);
1079
1080                         nf_ct_put(tmp);
1081                 }
1082
1083                 /* could check get_nulls_value() here and restart if ct
1084                  * was moved to another chain.  But given gc is best-effort
1085                  * we will just continue with next hash slot.
1086                  */
1087                 rcu_read_unlock();
1088                 cond_resched_rcu_qs();
1089         } while (++buckets < goal);
1090
             /* Do not re-arm the work once exiting has been set. */
1091         if (gc_work->exiting)
1092                 return;
1093
1094         /*
1095          * Eviction will normally happen from the packet path, and not
1096          * from this gc worker.
1097          *
1098          * This worker is only here to reap expired entries when system went
1099          * idle after a busy period.
1100          *
1101          * The heuristics below are supposed to balance conflicting goals:
1102          *
1103          * 1. Minimize time until we notice a stale entry
1104          * 2. Maximize scan intervals to not waste cycles
1105          *
1106          * Normally, expire ratio will be close to 0.
1107          *
1108          * As soon as a sizeable fraction of the entries have expired
1109          * increase scan frequency.
1110          */
1111         ratio = scanned ? expired_count * 100 / scanned : 0;
1112         if (ratio > GC_EVICT_RATIO) {
1113                 gc_work->next_gc_run = min_interval;
1114         } else {
1115                 unsigned int max = GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV;
1116
1117                 BUILD_BUG_ON((GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV) == 0);
1118
1119                 gc_work->next_gc_run += min_interval;
1120                 if (gc_work->next_gc_run > max)
1121                         gc_work->next_gc_run = max;
1122         }
1123
             /* Remember where to resume; the early-drop request is one-shot. */
1124         next_run = gc_work->next_gc_run;
1125         gc_work->last_bucket = i;
1126         gc_work->early_drop = false;
1127         queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run);
1128 }
1129
/* Initialise @gc_work: bind its delayed work to gc_worker() via
 * INIT_DEFERRABLE_WORK(), set the first next_gc_run to HZ and clear the
 * exiting flag.  The work is not queued here.
 */
1130 static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
1131 {
1132         INIT_DEFERRABLE_WORK(&gc_work->dwork, gc_worker);
1133         gc_work->next_gc_run = HZ;
1134         gc_work->exiting = false;
1135 }
1136
/* Allocate and minimally initialise a conntrack for @orig/@repl in @net.
 *
 * net->ct.count is bumped first.  If that puts it above a nonzero
 * nf_conntrack_max, early_drop() must free something; otherwise the count is
 * restored, conntrack_gc_work.early_drop is set, a rate-limited warning is
 * printed and ERR_PTR(-ENOMEM) is returned.
 *
 * @hash selects the early_drop() start bucket and is stashed in the reply
 * direction's hnnode.pprev slot for reuse when confirming.
 *
 * Returns the new conntrack with ct_general.use == 0, or ERR_PTR(-ENOMEM);
 * on every failure path the count increment is undone.
 */
1137 static struct nf_conn *
1138 __nf_conntrack_alloc(struct net *net,
1139                      const struct nf_conntrack_zone *zone,
1140                      const struct nf_conntrack_tuple *orig,
1141                      const struct nf_conntrack_tuple *repl,
1142                      gfp_t gfp, u32 hash)
1143 {
1144         struct nf_conn *ct;
1145
1146         /* We don't want any race condition at early drop stage */
1147         atomic_inc(&net->ct.count);
1148
1149         if (nf_conntrack_max &&
1150             unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
1151                 if (!early_drop(net, hash)) {
                             /* Ask the gc worker to early-drop on its next run. */
1152                         if (!conntrack_gc_work.early_drop)
1153                                 conntrack_gc_work.early_drop = true;
1154                         atomic_dec(&net->ct.count);
1155                         net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
1156                         return ERR_PTR(-ENOMEM);
1157                 }
1158         }
1159
1160         /*
1161          * Do not use kmem_cache_zalloc(), as this cache uses
1162          * SLAB_TYPESAFE_BY_RCU.
1163          */
1164         ct = kmem_cache_alloc(nf_conntrack_cachep, gfp);
1165         if (ct == NULL)
1166                 goto out;
1167
1168         spin_lock_init(&ct->lock);
1169         ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
1170         ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
1171         ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
1172         /* save hash for reusing when confirming */
1173         *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
1174         ct->status = 0;
1175         write_pnet(&ct->ct_net, net);
             /* Zero only the members from __nfct_init_offset up to proto. */
1176         memset(&ct->__nfct_init_offset[0], 0,
1177                offsetof(struct nf_conn, proto) -
1178                offsetof(struct nf_conn, __nfct_init_offset[0]));
1179
1180         nf_ct_zone_add(ct, zone);
1181
1182         /* Because we use RCU lookups, we set ct_general.use to zero before
1183          * this is inserted in any list.
1184          */
1185         atomic_set(&ct->ct_general.use, 0);
1186         return ct;
1187 out:
1188         atomic_dec(&net->ct.count);
1189         return ERR_PTR(-ENOMEM);
1190 }
1191
/* Exported allocator: forwards to __nf_conntrack_alloc() with a hash of 0 and
 * returns its result (a conntrack or an ERR_PTR()).
 */
1192 struct nf_conn *nf_conntrack_alloc(struct net *net,
1193                                    const struct nf_conntrack_zone *zone,
1194                                    const struct nf_conntrack_tuple *orig,
1195                                    const struct nf_conntrack_tuple *repl,
1196                                    gfp_t gfp)
1197 {
1198         return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0);
1199 }
1200 EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
1201
/* Release @ct: destroy and free its extensions, return the object to
 * nf_conntrack_cachep and decrement the per-netns conntrack count.  Warns if
 * the reference count is not zero on entry.
 */
1202 void nf_conntrack_free(struct nf_conn *ct)
1203 {
             /* Looked up before @ct is freed; needed for the count below. */
1204         struct net *net = nf_ct_net(ct);
1205
1206         /* A freed object has refcnt == 0, that's
1207          * the golden rule for SLAB_TYPESAFE_BY_RCU
1208          */
1209         WARN_ON(atomic_read(&ct->ct_general.use) != 0);
1210
1211         nf_ct_ext_destroy(ct);
1212         nf_ct_ext_free(ct);
1213         kmem_cache_free(nf_conntrack_cachep, ct);
1214         smp_mb__before_atomic();
1215         atomic_dec(&net->ct.count);
1216 }
1217 EXPORT_SYMBOL_GPL(nf_conntrack_free);
1218
1219
1220 /* Allocate a new conntrack: we return -ENOMEM if classification
1221  * failed due to stress.  Otherwise it really is unclassifiable.
      *
      * Returns the ORIGINAL-direction tuplehash of the new conntrack, which has
      * been put on the unconfirmed list with a reference taken; NULL if @tuple
      * cannot be inverted or l4proto->new() rejects the packet; an ERR_PTR() if
      * allocation or nf_ct_add_synproxy() fails.
      *
      * Timeout policy and event masks are copied from @tmpl when it carries
      * them.  If a matching expectation is found, the conntrack is marked
      * expected, takes the expectation's master (plus mark/secmark when
      * configured) and helper, and exp->expectfn runs after bottom halves are
      * re-enabled.  Without an expectation, __nf_ct_try_assign_helper() is
      * called with @tmpl.
      */
1222 static noinline struct nf_conntrack_tuple_hash *
1223 init_conntrack(struct net *net, struct nf_conn *tmpl,
1224                const struct nf_conntrack_tuple *tuple,
1225                const struct nf_conntrack_l3proto *l3proto,
1226                const struct nf_conntrack_l4proto *l4proto,
1227                struct sk_buff *skb,
1228                unsigned int dataoff, u32 hash)
1229 {
1230         struct nf_conn *ct;
1231         struct nf_conn_help *help;
1232         struct nf_conntrack_tuple repl_tuple;
1233         struct nf_conntrack_ecache *ecache;
1234         struct nf_conntrack_expect *exp = NULL;
1235         const struct nf_conntrack_zone *zone;
1236         struct nf_conn_timeout *timeout_ext;
1237         struct nf_conntrack_zone tmp;
1238         unsigned int *timeouts;
1239
1240         if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) {
1241                 pr_debug("Can't invert tuple.\n");
1242                 return NULL;
1243         }
1244
1245         zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
1246         ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
1247                                   hash);
             /* Pass the ERR_PTR() through unchanged, only the type differs. */
1248         if (IS_ERR(ct))
1249                 return (struct nf_conntrack_tuple_hash *)ct;
1250
1251         if (!nf_ct_add_synproxy(ct, tmpl)) {
1252                 nf_conntrack_free(ct);
1253                 return ERR_PTR(-ENOMEM);
1254         }
1255
             /* Template timeouts win; fall back to the l4 tracker's defaults. */
1256         timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;
1257         if (timeout_ext) {
1258                 timeouts = nf_ct_timeout_data(timeout_ext);
1259                 if (unlikely(!timeouts))
1260                         timeouts = l4proto->get_timeouts(net);
1261         } else {
1262                 timeouts = l4proto->get_timeouts(net);
1263         }
1264
1265         if (!l4proto->new(ct, skb, dataoff, timeouts)) {
1266                 nf_conntrack_free(ct);
1267                 pr_debug("can't track with proto module\n");
1268                 return NULL;
1269         }
1270
1271         if (timeout_ext)
1272                 nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout),
1273                                       GFP_ATOMIC);
1274
1275         nf_ct_acct_ext_add(ct, GFP_ATOMIC);
1276         nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
1277         nf_ct_labels_ext_add(ct);
1278
1279         ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;
1280         nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
1281                                  ecache ? ecache->expmask : 0,
1282                              GFP_ATOMIC);
1283
1284         local_bh_disable();
             /* Only take the expect lock when this netns has expectations. */
1285         if (net->ct.expect_count) {
1286                 spin_lock(&nf_conntrack_expect_lock);
1287                 exp = nf_ct_find_expectation(net, zone, tuple);
1288                 if (exp) {
1289                         pr_debug("expectation arrives ct=%p exp=%p\n",
1290                                  ct, exp);
1291                         /* Welcome, Mr. Bond.  We've been expecting you... */
1292                         __set_bit(IPS_EXPECTED_BIT, &ct->status);
1293                         /* exp->master safe, refcnt bumped in nf_ct_find_expectation */
1294                         ct->master = exp->master;
1295                         if (exp->helper) {
1296                                 help = nf_ct_helper_ext_add(ct, exp->helper,
1297                                                             GFP_ATOMIC);
1298                                 if (help)
1299                                         rcu_assign_pointer(help->helper, exp->helper);
1300                         }
1301
1302 #ifdef CONFIG_NF_CONNTRACK_MARK
1303                         ct->mark = exp->master->mark;
1304 #endif
1305 #ifdef CONFIG_NF_CONNTRACK_SECMARK
1306                         ct->secmark = exp->master->secmark;
1307 #endif
1308                         NF_CT_STAT_INC(net, expect_new);
1309                 }
1310                 spin_unlock(&nf_conntrack_expect_lock);
1311         }
1312         if (!exp)
1313                 __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);
1314
1315         /* Now it is inserted into the unconfirmed list, bump refcount */
1316         nf_conntrack_get(&ct->ct_general);
1317         nf_ct_add_to_unconfirmed_list(ct);
1318
1319         local_bh_enable();
1320
             /* Run the expectation callback outside the bh-disabled section,
              * then drop our reference to the expectation.
              */
1321         if (exp) {
1322                 if (exp->expectfn)
1323                         exp->expectfn(ct, exp);
1324                 nf_ct_expect_put(exp);
1325         }
1326
1327         return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
1328 }
1329
1330 /* On success, returns 0, sets skb->_nfct | ctinfo
      *
      * Builds the tuple for @skb, looks it up and, when nothing is found,
      * creates a new conntrack through init_conntrack().  Also returns 0,
      * without attaching anything to the skb, when no tuple can be extracted
      * or init_conntrack() returns NULL.  Returns a negative errno when
      * init_conntrack() returns an ERR_PTR().
      */
1331 static int
1332 resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
1333                   struct sk_buff *skb,
1334                   unsigned int dataoff,
1335                   u_int16_t l3num,
1336                   u_int8_t protonum,
1337                   const struct nf_conntrack_l3proto *l3proto,
1338                   const struct nf_conntrack_l4proto *l4proto)
1339 {
1340         const struct nf_conntrack_zone *zone;
1341         struct nf_conntrack_tuple tuple;
1342         struct nf_conntrack_tuple_hash *h;
1343         enum ip_conntrack_info ctinfo;
1344         struct nf_conntrack_zone tmp;
1345         struct nf_conn *ct;
1346         u32 hash;
1347
1348         if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
1349                              dataoff, l3num, protonum, net, &tuple, l3proto,
1350                              l4proto)) {
1351                 pr_debug("Can't get tuple\n");
1352                 return 0;
1353         }
1354
1355         /* look for tuple match */
1356         zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
1357         hash = hash_conntrack_raw(&tuple, net);
1358         h = __nf_conntrack_find_get(net, zone, &tuple, hash);
1359         if (!h) {
1360                 h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
1361                                    skb, dataoff, hash);
1362                 if (!h)
1363                         return 0;
1364                 if (IS_ERR(h))
1365                         return PTR_ERR(h);
1366         }
1367         ct = nf_ct_tuplehash_to_ctrack(h);
1368
1369         /* It exists; we have (non-exclusive) reference. */
1370         if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
1371                 ctinfo = IP_CT_ESTABLISHED_REPLY;
1372         } else {
1373                 /* Once we've had two way comms, always ESTABLISHED. */
1374                 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
1375                         pr_debug("normal packet for %p\n", ct);
1376                         ctinfo = IP_CT_ESTABLISHED;
1377                 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
1378                         pr_debug("related packet for %p\n", ct);
1379                         ctinfo = IP_CT_RELATED;
1380                 } else {
1381                         pr_debug("new packet for %p\n", ct);
1382                         ctinfo = IP_CT_NEW;
1383                 }
1384         }
1385         nf_ct_set(skb, ct, ctinfo);
1386         return 0;
1387 }
1388
/* Track @skb: find or create its conntrack and run the l4 tracker's packet()
 * handler on it.  Returns a netfilter verdict.
 *
 * A packet that already carries a non-template conntrack, or is marked
 * untracked, is accepted untouched.  A template conntrack found on the skb is
 * detached, passed on as @tmpl and released before returning.
 *
 * Non-positive return codes from get_l4proto(), the l4proto error() hook and
 * packet() are negated to form the verdict; -NF_REPEAT from packet() restarts
 * the lookup instead.  A negative result from resolve_normal_ct() yields
 * NF_DROP, and a packet left without a conntrack is accepted and counted as
 * invalid.
 */
1389 unsigned int
1390 nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
1391                 struct sk_buff *skb)
1392 {
1393         const struct nf_conntrack_l3proto *l3proto;
1394         const struct nf_conntrack_l4proto *l4proto;
1395         struct nf_conn *ct, *tmpl;
1396         enum ip_conntrack_info ctinfo;
1397         unsigned int *timeouts;
1398         unsigned int dataoff;
1399         u_int8_t protonum;
1400         int ret;
1401
1402         tmpl = nf_ct_get(skb, &ctinfo);
1403         if (tmpl || ctinfo == IP_CT_UNTRACKED) {
1404                 /* Previously seen (loopback or untracked)?  Ignore. */
1405                 if ((tmpl && !nf_ct_is_template(tmpl)) ||
1406                      ctinfo == IP_CT_UNTRACKED) {
1407                         NF_CT_STAT_INC_ATOMIC(net, ignore);
1408                         return NF_ACCEPT;
1409                 }
                     /* Template: detach it; the reference is dropped at "out". */
1410                 skb->_nfct = 0;
1411         }
1412
1413         /* rcu_read_lock()ed by nf_hook_thresh */
1414         l3proto = __nf_ct_l3proto_find(pf);
1415         ret = l3proto->get_l4proto(skb, skb_network_offset(skb),
1416                                    &dataoff, &protonum);
1417         if (ret <= 0) {
1418                 pr_debug("not prepared to track yet or error occurred\n");
1419                 NF_CT_STAT_INC_ATOMIC(net, error);
1420                 NF_CT_STAT_INC_ATOMIC(net, invalid);
1421                 ret = -ret;
1422                 goto out;
1423         }
1424
1425         l4proto = __nf_ct_l4proto_find(pf, protonum);
1426
1427         /* It may be an special packet, error, unclean...
1428          * inverse of the return code tells to the netfilter
1429          * core what to do with the packet. */
1430         if (l4proto->error != NULL) {
1431                 ret = l4proto->error(net, tmpl, skb, dataoff, pf, hooknum);
1432                 if (ret <= 0) {
1433                         NF_CT_STAT_INC_ATOMIC(net, error);
1434                         NF_CT_STAT_INC_ATOMIC(net, invalid);
1435                         ret = -ret;
1436                         goto out;
1437                 }
1438                 /* ICMP[v6] protocol trackers may assign one conntrack. */
1439                 if (skb->_nfct)
1440                         goto out;
1441         }
1442 repeat:
1443         ret = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum,
1444                                 l3proto, l4proto);
1445         if (ret < 0) {
1446                 /* Too stressed to deal. */
1447                 NF_CT_STAT_INC_ATOMIC(net, drop);
1448                 ret = NF_DROP;
1449                 goto out;
1450         }
1451
1452         ct = nf_ct_get(skb, &ctinfo);
1453         if (!ct) {
1454                 /* Not valid part of a connection */
1455                 NF_CT_STAT_INC_ATOMIC(net, invalid);
1456                 ret = NF_ACCEPT;
1457                 goto out;
1458         }
1459
1460         /* Decide what timeout policy we want to apply to this flow. */
1461         timeouts = nf_ct_timeout_lookup(net, ct, l4proto);
1462
1463         ret = l4proto->packet(ct, skb, dataoff, ctinfo, timeouts);
1464         if (ret <= 0) {
1465                 /* Invalid: inverse of the return code tells
1466                  * the netfilter core what to do */
1467                 pr_debug("nf_conntrack_in: Can't track with proto module\n");
1468                 nf_conntrack_put(&ct->ct_general);
1469                 skb->_nfct = 0;
1470                 NF_CT_STAT_INC_ATOMIC(net, invalid);
1471                 if (ret == -NF_DROP)
1472                         NF_CT_STAT_INC_ATOMIC(net, drop);
1473                 /* Special case: TCP tracker reports an attempt to reopen a
1474                  * closed/aborted connection. We have to go back and create a
1475                  * fresh conntrack.
1476                  */
1477                 if (ret == -NF_REPEAT)
1478                         goto repeat;
1479                 ret = -ret;
1480                 goto out;
1481         }
1482
             /* First packet seen in reply direction: set the bit, cache event. */
1483         if (ctinfo == IP_CT_ESTABLISHED_REPLY &&
1484             !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
1485                 nf_conntrack_event_cache(IPCT_REPLY, ct);
1486 out:
1487         if (tmpl)
1488                 nf_ct_put(tmpl);
1489
1490         return ret;
1491 }
1492 EXPORT_SYMBOL_GPL(nf_conntrack_in);
1493
/* Invert @orig into @inverse using the l3/l4 protocol trackers looked up from
 * orig->src.l3num and orig->dst.protonum.  The lookups and the inversion run
 * under rcu_read_lock().  Returns the result of nf_ct_invert_tuple().
 */
1494 bool nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
1495                           const struct nf_conntrack_tuple *orig)
1496 {
1497         bool ret;
1498
1499         rcu_read_lock();
1500         ret = nf_ct_invert_tuple(inverse, orig,
1501                                  __nf_ct_l3proto_find(orig->src.l3num),
1502                                  __nf_ct_l4proto_find(orig->src.l3num,
1503                                                       orig->dst.protonum));
1504         rcu_read_unlock();
1505         return ret;
1506 }
1507 EXPORT_SYMBOL_GPL(nf_ct_invert_tuplepr);
1508
1509 /* Alter reply tuple (maybe alter helper).  This is for NAT, and is
1510  * implicitly racy: see __nf_conntrack_confirm
      *
      * @ct is expected to be unconfirmed; a WARN fires otherwise.  After the
      * reply tuple is overwritten with @newreply, the helper is re-evaluated via
      * __nf_ct_try_assign_helper() unless @ct has a master or its helper
      * extension already holds expectations.
      */
1511 void nf_conntrack_alter_reply(struct nf_conn *ct,
1512                               const struct nf_conntrack_tuple *newreply)
1513 {
1514         struct nf_conn_help *help = nfct_help(ct);
1515
1516         /* Should be unconfirmed, so not in hash table yet */
1517         WARN_ON(nf_ct_is_confirmed(ct));
1518
1519         pr_debug("Altering reply tuple of %p to ", ct);
1520         nf_ct_dump_tuple(newreply);
1521
1522         ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1523         if (ct->master || (help && !hlist_empty(&help->expectations)))
1524                 return;
1525
1526         rcu_read_lock();
1527         __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC);
1528         rcu_read_unlock();
1529 }
1530 EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply);
1531
1532 /* Refresh conntrack for this many jiffies and do accounting if do_acct is
      * nonzero.
      *
      * The timeout is left alone when IPS_FIXED_TIMEOUT_BIT is set.  For a
      * confirmed conntrack @extra_jiffies is added to nfct_time_stamp; for an
      * unconfirmed one it is stored as-is.  Accounting uses skb->len, so @skb
      * must not be NULL (a WARN fires if it is).
      */
1533 void __nf_ct_refresh_acct(struct nf_conn *ct,
1534                           enum ip_conntrack_info ctinfo,
1535                           const struct sk_buff *skb,
1536                           unsigned long extra_jiffies,
1537                           int do_acct)
1538 {
1539         WARN_ON(!skb);
1540
1541         /* Only update if this is not a fixed timeout */
1542         if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
1543                 goto acct;
1544
1545         /* If not in hash table, timer will not be active yet */
1546         if (nf_ct_is_confirmed(ct))
1547                 extra_jiffies += nfct_time_stamp;
1548
1549         ct->timeout = extra_jiffies;
1550 acct:
1551         if (do_acct)
1552                 nf_ct_acct_update(ct, ctinfo, skb->len);
1553 }
1554 EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
1555
/* Account @skb's length against @ct via nf_ct_acct_update(), then delete the
 * conntrack.  Returns the result of nf_ct_delete().
 */
1556 bool nf_ct_kill_acct(struct nf_conn *ct,
1557                      enum ip_conntrack_info ctinfo,
1558                      const struct sk_buff *skb)
1559 {
1560         nf_ct_acct_update(ct, ctinfo, skb->len);
1561
1562         return nf_ct_delete(ct, 0, 0);
1563 }
1564 EXPORT_SYMBOL_GPL(nf_ct_kill_acct);
1565
1566 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
1567
1568 #include <linux/netfilter/nfnetlink.h>
1569 #include <linux/netfilter/nfnetlink_conntrack.h>
1570 #include <linux/mutex.h>
1571
1572 /* Generic function for tcp/udp/sctp/dccp and alike. This needs to be
1573  * in ip_conntrack_core, since we don't want the protocols to autoload
1574  * or depend on ctnetlink
      *
      * Puts the source and destination port of @tuple into @skb as
      * CTA_PROTO_SRC_PORT and CTA_PROTO_DST_PORT.  Returns 0 on success, -1 if
      * either attribute could not be added.
      */
1575 int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,
1576                                const struct nf_conntrack_tuple *tuple)
1577 {
1578         if (nla_put_be16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port) ||
1579             nla_put_be16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port))
1580                 goto nla_put_failure;
1581         return 0;
1582
1583 nla_put_failure:
1584         return -1;
1585 }
1586 EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr);
1587
/* Netlink attribute policy for the port part of a tuple: both the source and
 * the destination port attribute are declared as NLA_U16.
 */
1588 const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = {
1589         [CTA_PROTO_SRC_PORT]  = { .type = NLA_U16 },
1590         [CTA_PROTO_DST_PORT]  = { .type = NLA_U16 },
1591 };
1592 EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy);
1593
/* Fill the source and destination port of @t from the CTA_PROTO_SRC_PORT and
 * CTA_PROTO_DST_PORT attributes in @tb.  Returns 0 on success, or -EINVAL if
 * either attribute is missing (in which case @t is not modified).
 */
1594 int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[],
1595                                struct nf_conntrack_tuple *t)
1596 {
1597         if (!tb[CTA_PROTO_SRC_PORT] || !tb[CTA_PROTO_DST_PORT])
1598                 return -EINVAL;
1599
1600         t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]);
1601         t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]);
1602
1603         return 0;
1604 }
1605 EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple);
1606
1607 unsigned int nf_ct_port_nlattr_tuple_size(void)
1608 {
1609         static unsigned int size __read_mostly;
1610
1611         if (!size)
1612                 size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);
1613
1614         return size;
1615 }
1616 EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size);
1617 #endif
1618
1619 /* Used by ipt_REJECT and ip6t_REJECT. */
1620 static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
1621 {
1622         struct nf_conn *ct;
1623         enum ip_conntrack_info ctinfo;
1624
1625         /* This ICMP is in reverse direction to the packet which caused it */
1626         ct = nf_ct_get(skb, &ctinfo);
1627         if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1628                 ctinfo = IP_CT_RELATED_REPLY;
1629         else
1630                 ctinfo = IP_CT_RELATED;
1631
1632         /* Attach to new skbuff, and increment count */
1633         nf_ct_set(nskb, ct, ctinfo);
1634         nf_conntrack_get(skb_nfct(nskb));
1635 }
1636
1637 /* Bring out ya dead! */
1638 static struct nf_conn *
1639 get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
1640                 void *data, unsigned int *bucket)
1641 {
1642         struct nf_conntrack_tuple_hash *h;
1643         struct nf_conn *ct;
1644         struct hlist_nulls_node *n;
1645         spinlock_t *lockp;
1646
1647         for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
1648                 lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
1649                 local_bh_disable();
1650                 nf_conntrack_lock(lockp);
1651                 if (*bucket < nf_conntrack_htable_size) {
1652                         hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) {
1653                                 if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
1654                                         continue;
1655                                 ct = nf_ct_tuplehash_to_ctrack(h);
1656                                 if (iter(ct, data))
1657                                         goto found;
1658                         }
1659                 }
1660                 spin_unlock(lockp);
1661                 local_bh_enable();
1662                 cond_resched();
1663         }
1664
1665         return NULL;
1666 found:
1667         atomic_inc(&ct->ct_general.use);
1668         spin_unlock(lockp);
1669         local_bh_enable();
1670         return ct;
1671 }
1672
1673 static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data),
1674                                   void *data, u32 portid, int report)
1675 {
1676         unsigned int bucket = 0, sequence;
1677         struct nf_conn *ct;
1678
1679         might_sleep();
1680
1681         for (;;) {
1682                 sequence = read_seqcount_begin(&nf_conntrack_generation);
1683
1684                 while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
1685                         /* Time to push up daises... */
1686
1687                         nf_ct_delete(ct, portid, report);
1688                         nf_ct_put(ct);
1689                         cond_resched();
1690                 }
1691
1692                 if (!read_seqcount_retry(&nf_conntrack_generation, sequence))
1693                         break;
1694                 bucket = 0;
1695         }
1696 }
1697
/* Context for iter_net_only(): the caller's callback and argument plus
 * the namespace the callback is restricted to.
 */
struct iter_data {
	/* callback to run on conntracks that belong to @net */
	int (*iter)(struct nf_conn *ct, void *data);
	/* opaque argument handed to @iter */
	void *data;
	/* only conntracks of this namespace are passed to @iter */
	struct net *net;
};
1703
1704 static int iter_net_only(struct nf_conn *i, void *data)
1705 {
1706         struct iter_data *d = data;
1707
1708         if (!net_eq(d->net, nf_ct_net(i)))
1709                 return 0;
1710
1711         return d->iter(i, d->data);
1712 }
1713
1714 static void
1715 __nf_ct_unconfirmed_destroy(struct net *net)
1716 {
1717         int cpu;
1718
1719         for_each_possible_cpu(cpu) {
1720                 struct nf_conntrack_tuple_hash *h;
1721                 struct hlist_nulls_node *n;
1722                 struct ct_pcpu *pcpu;
1723
1724                 pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
1725
1726                 spin_lock_bh(&pcpu->lock);
1727                 hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) {
1728                         struct nf_conn *ct;
1729
1730                         ct = nf_ct_tuplehash_to_ctrack(h);
1731
1732                         /* we cannot call iter() on unconfirmed list, the
1733                          * owning cpu can reallocate ct->ext at any time.
1734                          */
1735                         set_bit(IPS_DYING_BIT, &ct->status);
1736                 }
1737                 spin_unlock_bh(&pcpu->lock);
1738                 cond_resched();
1739         }
1740 }
1741
1742 void nf_ct_unconfirmed_destroy(struct net *net)
1743 {
1744         might_sleep();
1745
1746         if (atomic_read(&net->ct.count) > 0) {
1747                 __nf_ct_unconfirmed_destroy(net);
1748                 nf_queue_nf_hook_drop(net);
1749                 synchronize_net();
1750         }
1751 }
1752 EXPORT_SYMBOL_GPL(nf_ct_unconfirmed_destroy);
1753
1754 void nf_ct_iterate_cleanup_net(struct net *net,
1755                                int (*iter)(struct nf_conn *i, void *data),
1756                                void *data, u32 portid, int report)
1757 {
1758         struct iter_data d;
1759
1760         might_sleep();
1761
1762         if (atomic_read(&net->ct.count) == 0)
1763                 return;
1764
1765         d.iter = iter;
1766         d.data = data;
1767         d.net = net;
1768
1769         nf_ct_iterate_cleanup(iter_net_only, &d, portid, report);
1770 }
1771 EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net);
1772
1773 /**
1774  * nf_ct_iterate_destroy - destroy unconfirmed conntracks and iterate table
1775  * @iter: callback to invoke for each conntrack
1776  * @data: data to pass to @iter
1777  *
1778  * Like nf_ct_iterate_cleanup, but first marks conntracks on the
1779  * unconfirmed list as dying (so they will not be inserted into
1780  * main table).
1781  *
1782  * Can only be called in module exit path.
1783  */
1784 void
1785 nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data)
1786 {
1787         struct net *net;
1788
1789         rtnl_lock();
1790         for_each_net(net) {
1791                 if (atomic_read(&net->ct.count) == 0)
1792                         continue;
1793                 __nf_ct_unconfirmed_destroy(net);
1794                 nf_queue_nf_hook_drop(net);
1795         }
1796         rtnl_unlock();
1797
1798         /* Need to wait for netns cleanup worker to finish, if its
1799          * running -- it might have deleted a net namespace from
1800          * the global list, so our __nf_ct_unconfirmed_destroy() might
1801          * not have affected all namespaces.
1802          */
1803         net_ns_barrier();
1804
1805         /* a conntrack could have been unlinked from unconfirmed list
1806          * before we grabbed pcpu lock in __nf_ct_unconfirmed_destroy().
1807          * This makes sure its inserted into conntrack table.
1808          */
1809         synchronize_net();
1810
1811         nf_ct_iterate_cleanup(iter, data, 0, 0);
1812 }
1813 EXPORT_SYMBOL_GPL(nf_ct_iterate_destroy);
1814
/* Iterator callback: @data is a struct net; matches every conntrack that
 * belongs to that namespace.
 */
static int kill_all(struct nf_conn *i, void *data)
{
	const struct net *net = data;

	return net_eq(nf_ct_net(i), net);
}
1819
1820 void nf_ct_free_hashtable(void *hash, unsigned int size)
1821 {
1822         if (is_vmalloc_addr(hash))
1823                 vfree(hash);
1824         else
1825                 free_pages((unsigned long)hash,
1826                            get_order(sizeof(struct hlist_head) * size));
1827 }
1828 EXPORT_SYMBOL_GPL(nf_ct_free_hashtable);
1829
1830 void nf_conntrack_cleanup_start(void)
1831 {
1832         conntrack_gc_work.exiting = true;
1833         RCU_INIT_POINTER(ip_ct_attach, NULL);
1834 }
1835
1836 void nf_conntrack_cleanup_end(void)
1837 {
1838         RCU_INIT_POINTER(nf_ct_destroy, NULL);
1839
1840         cancel_delayed_work_sync(&conntrack_gc_work.dwork);
1841         nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
1842
1843         nf_conntrack_proto_fini();
1844         nf_conntrack_seqadj_fini();
1845         nf_conntrack_labels_fini();
1846         nf_conntrack_helper_fini();
1847         nf_conntrack_timeout_fini();
1848         nf_conntrack_ecache_fini();
1849         nf_conntrack_tstamp_fini();
1850         nf_conntrack_acct_fini();
1851         nf_conntrack_expect_fini();
1852
1853         kmem_cache_destroy(nf_conntrack_cachep);
1854 }
1855
1856 /*
1857  * Mishearing the voices in his head, our hero wonders how he's
1858  * supposed to kill the mall.
1859  */
1860 void nf_conntrack_cleanup_net(struct net *net)
1861 {
1862         LIST_HEAD(single);
1863
1864         list_add(&net->exit_list, &single);
1865         nf_conntrack_cleanup_net_list(&single);
1866 }
1867
1868 void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
1869 {
1870         int busy;
1871         struct net *net;
1872
1873         /*
1874          * This makes sure all current packets have passed through
1875          *  netfilter framework.  Roll on, two-stage module
1876          *  delete...
1877          */
1878         synchronize_net();
1879 i_see_dead_people:
1880         busy = 0;
1881         list_for_each_entry(net, net_exit_list, exit_list) {
1882                 nf_ct_iterate_cleanup(kill_all, net, 0, 0);
1883                 if (atomic_read(&net->ct.count) != 0)
1884                         busy = 1;
1885         }
1886         if (busy) {
1887                 schedule();
1888                 goto i_see_dead_people;
1889         }
1890
1891         list_for_each_entry(net, net_exit_list, exit_list) {
1892                 nf_conntrack_proto_pernet_fini(net);
1893                 nf_conntrack_helper_pernet_fini(net);
1894                 nf_conntrack_ecache_pernet_fini(net);
1895                 nf_conntrack_tstamp_pernet_fini(net);
1896                 nf_conntrack_acct_pernet_fini(net);
1897                 nf_conntrack_expect_pernet_fini(net);
1898                 free_percpu(net->ct.stat);
1899                 free_percpu(net->ct.pcpu_lists);
1900         }
1901 }
1902
1903 void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
1904 {
1905         struct hlist_nulls_head *hash;
1906         unsigned int nr_slots, i;
1907         size_t sz;
1908
1909         if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head)))
1910                 return NULL;
1911
1912         BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
1913         nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
1914
1915         if (nr_slots > (UINT_MAX / sizeof(struct hlist_nulls_head)))
1916                 return NULL;
1917
1918         sz = nr_slots * sizeof(struct hlist_nulls_head);
1919         hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
1920                                         get_order(sz));
1921         if (!hash)
1922                 hash = vzalloc(sz);
1923
1924         if (hash && nulls)
1925                 for (i = 0; i < nr_slots; i++)
1926                         INIT_HLIST_NULLS_HEAD(&hash[i], i);
1927
1928         return hash;
1929 }
1930 EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);
1931
1932 int nf_conntrack_hash_resize(unsigned int hashsize)
1933 {
1934         int i, bucket;
1935         unsigned int old_size;
1936         struct hlist_nulls_head *hash, *old_hash;
1937         struct nf_conntrack_tuple_hash *h;
1938         struct nf_conn *ct;
1939
1940         if (!hashsize)
1941                 return -EINVAL;
1942
1943         hash = nf_ct_alloc_hashtable(&hashsize, 1);
1944         if (!hash)
1945                 return -ENOMEM;
1946
1947         old_size = nf_conntrack_htable_size;
1948         if (old_size == hashsize) {
1949                 nf_ct_free_hashtable(hash, hashsize);
1950                 return 0;
1951         }
1952
1953         local_bh_disable();
1954         nf_conntrack_all_lock();
1955         write_seqcount_begin(&nf_conntrack_generation);
1956
1957         /* Lookups in the old hash might happen in parallel, which means we
1958          * might get false negatives during connection lookup. New connections
1959          * created because of a false negative won't make it into the hash
1960          * though since that required taking the locks.
1961          */
1962
1963         for (i = 0; i < nf_conntrack_htable_size; i++) {
1964                 while (!hlist_nulls_empty(&nf_conntrack_hash[i])) {
1965                         h = hlist_nulls_entry(nf_conntrack_hash[i].first,
1966                                               struct nf_conntrack_tuple_hash, hnnode);
1967                         ct = nf_ct_tuplehash_to_ctrack(h);
1968                         hlist_nulls_del_rcu(&h->hnnode);
1969                         bucket = __hash_conntrack(nf_ct_net(ct),
1970                                                   &h->tuple, hashsize);
1971                         hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
1972                 }
1973         }
1974         old_size = nf_conntrack_htable_size;
1975         old_hash = nf_conntrack_hash;
1976
1977         nf_conntrack_hash = hash;
1978         nf_conntrack_htable_size = hashsize;
1979
1980         write_seqcount_end(&nf_conntrack_generation);
1981         nf_conntrack_all_unlock();
1982         local_bh_enable();
1983
1984         synchronize_net();
1985         nf_ct_free_hashtable(old_hash, old_size);
1986         return 0;
1987 }
1988
1989 int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp)
1990 {
1991         unsigned int hashsize;
1992         int rc;
1993
1994         if (current->nsproxy->net_ns != &init_net)
1995                 return -EOPNOTSUPP;
1996
1997         /* On boot, we can set this without any fancy locking. */
1998         if (!nf_conntrack_hash)
1999                 return param_set_uint(val, kp);
2000
2001         rc = kstrtouint(val, 0, &hashsize);
2002         if (rc)
2003                 return rc;
2004
2005         return nf_conntrack_hash_resize(hashsize);
2006 }
2007 EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);
2008
2009 module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
2010                   &nf_conntrack_htable_size, 0600);
2011
2012 static __always_inline unsigned int total_extension_size(void)
2013 {
2014         /* remember to add new extensions below */
2015         BUILD_BUG_ON(NF_CT_EXT_NUM > 9);
2016
2017         return sizeof(struct nf_ct_ext) +
2018                sizeof(struct nf_conn_help)
2019 #if IS_ENABLED(CONFIG_NF_NAT)
2020                 + sizeof(struct nf_conn_nat)
2021 #endif
2022                 + sizeof(struct nf_conn_seqadj)
2023                 + sizeof(struct nf_conn_acct)
2024 #ifdef CONFIG_NF_CONNTRACK_EVENTS
2025                 + sizeof(struct nf_conntrack_ecache)
2026 #endif
2027 #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
2028                 + sizeof(struct nf_conn_tstamp)
2029 #endif
2030 #ifdef CONFIG_NF_CONNTRACK_TIMEOUT
2031                 + sizeof(struct nf_conn_timeout)
2032 #endif
2033 #ifdef CONFIG_NF_CONNTRACK_LABELS
2034                 + sizeof(struct nf_conn_labels)
2035 #endif
2036 #if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
2037                 + sizeof(struct nf_conn_synproxy)
2038 #endif
2039         ;
2040 };
2041
2042 int nf_conntrack_init_start(void)
2043 {
2044         int max_factor = 8;
2045         int ret = -ENOMEM;
2046         int i;
2047
2048         /* struct nf_ct_ext uses u8 to store offsets/size */
2049         BUILD_BUG_ON(total_extension_size() > 255u);
2050
2051         seqcount_init(&nf_conntrack_generation);
2052
2053         for (i = 0; i < CONNTRACK_LOCKS; i++)
2054                 spin_lock_init(&nf_conntrack_locks[i]);
2055
2056         if (!nf_conntrack_htable_size) {
2057                 /* Idea from tcp.c: use 1/16384 of memory.
2058                  * On i386: 32MB machine has 512 buckets.
2059                  * >= 1GB machines have 16384 buckets.
2060                  * >= 4GB machines have 65536 buckets.
2061                  */
2062                 nf_conntrack_htable_size
2063                         = (((totalram_pages << PAGE_SHIFT) / 16384)
2064                            / sizeof(struct hlist_head));
2065                 if (totalram_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE)))
2066                         nf_conntrack_htable_size = 65536;
2067                 else if (totalram_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
2068                         nf_conntrack_htable_size = 16384;
2069                 if (nf_conntrack_htable_size < 32)
2070                         nf_conntrack_htable_size = 32;
2071
2072                 /* Use a max. factor of four by default to get the same max as
2073                  * with the old struct list_heads. When a table size is given
2074                  * we use the old value of 8 to avoid reducing the max.
2075                  * entries. */
2076                 max_factor = 4;
2077         }
2078
2079         nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1);
2080         if (!nf_conntrack_hash)
2081                 return -ENOMEM;
2082
2083         nf_conntrack_max = max_factor * nf_conntrack_htable_size;
2084
2085         nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
2086                                                 sizeof(struct nf_conn),
2087                                                 NFCT_INFOMASK + 1,
2088                                                 SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL);
2089         if (!nf_conntrack_cachep)
2090                 goto err_cachep;
2091
2092         printk(KERN_INFO "nf_conntrack version %s (%u buckets, %d max)\n",
2093                NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
2094                nf_conntrack_max);
2095
2096         ret = nf_conntrack_expect_init();
2097         if (ret < 0)
2098                 goto err_expect;
2099
2100         ret = nf_conntrack_acct_init();
2101         if (ret < 0)
2102                 goto err_acct;
2103
2104         ret = nf_conntrack_tstamp_init();
2105         if (ret < 0)
2106                 goto err_tstamp;
2107
2108         ret = nf_conntrack_ecache_init();
2109         if (ret < 0)
2110                 goto err_ecache;
2111
2112         ret = nf_conntrack_timeout_init();
2113         if (ret < 0)
2114                 goto err_timeout;
2115
2116         ret = nf_conntrack_helper_init();
2117         if (ret < 0)
2118                 goto err_helper;
2119
2120         ret = nf_conntrack_labels_init();
2121         if (ret < 0)
2122                 goto err_labels;
2123
2124         ret = nf_conntrack_seqadj_init();
2125         if (ret < 0)
2126                 goto err_seqadj;
2127
2128         ret = nf_conntrack_proto_init();
2129         if (ret < 0)
2130                 goto err_proto;
2131
2132         conntrack_gc_work_init(&conntrack_gc_work);
2133         queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ);
2134
2135         return 0;
2136
2137 err_proto:
2138         nf_conntrack_seqadj_fini();
2139 err_seqadj:
2140         nf_conntrack_labels_fini();
2141 err_labels:
2142         nf_conntrack_helper_fini();
2143 err_helper:
2144         nf_conntrack_timeout_fini();
2145 err_timeout:
2146         nf_conntrack_ecache_fini();
2147 err_ecache:
2148         nf_conntrack_tstamp_fini();
2149 err_tstamp:
2150         nf_conntrack_acct_fini();
2151 err_acct:
2152         nf_conntrack_expect_fini();
2153 err_expect:
2154         kmem_cache_destroy(nf_conntrack_cachep);
2155 err_cachep:
2156         nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
2157         return ret;
2158 }
2159
2160 void nf_conntrack_init_end(void)
2161 {
2162         /* For use by REJECT target */
2163         RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach);
2164         RCU_INIT_POINTER(nf_ct_destroy, destroy_conntrack);
2165 }
2166
2167 /*
2168  * We need to use special "null" values, not used in hash table
2169  */
2170 #define UNCONFIRMED_NULLS_VAL   ((1<<30)+0)
2171 #define DYING_NULLS_VAL         ((1<<30)+1)
2172 #define TEMPLATE_NULLS_VAL      ((1<<30)+2)
2173
2174 int nf_conntrack_init_net(struct net *net)
2175 {
2176         int ret = -ENOMEM;
2177         int cpu;
2178
2179         BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER);
2180         atomic_set(&net->ct.count, 0);
2181
2182         net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu);
2183         if (!net->ct.pcpu_lists)
2184                 goto err_stat;
2185
2186         for_each_possible_cpu(cpu) {
2187                 struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
2188
2189                 spin_lock_init(&pcpu->lock);
2190                 INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL);
2191                 INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL);
2192         }
2193
2194         net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
2195         if (!net->ct.stat)
2196                 goto err_pcpu_lists;
2197
2198         ret = nf_conntrack_expect_pernet_init(net);
2199         if (ret < 0)
2200                 goto err_expect;
2201         ret = nf_conntrack_acct_pernet_init(net);
2202         if (ret < 0)
2203                 goto err_acct;
2204         ret = nf_conntrack_tstamp_pernet_init(net);
2205         if (ret < 0)
2206                 goto err_tstamp;
2207         ret = nf_conntrack_ecache_pernet_init(net);
2208         if (ret < 0)
2209                 goto err_ecache;
2210         ret = nf_conntrack_helper_pernet_init(net);
2211         if (ret < 0)
2212                 goto err_helper;
2213         ret = nf_conntrack_proto_pernet_init(net);
2214         if (ret < 0)
2215                 goto err_proto;
2216         return 0;
2217
2218 err_proto:
2219         nf_conntrack_helper_pernet_fini(net);
2220 err_helper:
2221         nf_conntrack_ecache_pernet_fini(net);
2222 err_ecache:
2223         nf_conntrack_tstamp_pernet_fini(net);
2224 err_tstamp:
2225         nf_conntrack_acct_pernet_fini(net);
2226 err_acct:
2227         nf_conntrack_expect_pernet_fini(net);
2228 err_expect:
2229         free_percpu(net->ct.stat);
2230 err_pcpu_lists:
2231         free_percpu(net->ct.pcpu_lists);
2232 err_stat:
2233         return ret;
2234 }