net/netfilter/nf_conntrack_core.c

   1 /* Connection state tracking for netfilter.  This is separated from,
   2    but required by, the NAT layer; it can also be used by an iptables
   3    extension. */
   4
   5 /* (C) 1999-2001 Paul `Rusty' Russell
   6  * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
   7  * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
   8  * (C) 2005-2012 Patrick McHardy <kaber@trash.net>
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License version 2 as
  12  * published by the Free Software Foundation.
  13  */
  14
  15 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  16
  17 #include <linux/types.h>
  18 #include <linux/netfilter.h>
  19 #include <linux/module.h>
  20 #include <linux/sched.h>
  21 #include <linux/skbuff.h>
  22 #include <linux/proc_fs.h>
  23 #include <linux/vmalloc.h>
  24 #include <linux/stddef.h>
  25 #include <linux/slab.h>
  26 #include <linux/random.h>
  27 #include <linux/jhash.h>
  28 #include <linux/err.h>
  29 #include <linux/percpu.h>
  30 #include <linux/moduleparam.h>
  31 #include <linux/notifier.h>
  32 #include <linux/kernel.h>
  33 #include <linux/netdevice.h>
  34 #include <linux/socket.h>
  35 #include <linux/mm.h>
  36 #include <linux/nsproxy.h>
  37 #include <linux/rculist_nulls.h>
  38
  39 #include <net/netfilter/nf_conntrack.h>
  40 #include <net/netfilter/nf_conntrack_l3proto.h>
  41 #include <net/netfilter/nf_conntrack_l4proto.h>
  42 #include <net/netfilter/nf_conntrack_expect.h>
  43 #include <net/netfilter/nf_conntrack_helper.h>
  44 #include <net/netfilter/nf_conntrack_seqadj.h>
  45 #include <net/netfilter/nf_conntrack_core.h>
  46 #include <net/netfilter/nf_conntrack_extend.h>
  47 #include <net/netfilter/nf_conntrack_acct.h>
  48 #include <net/netfilter/nf_conntrack_ecache.h>
  49 #include <net/netfilter/nf_conntrack_zones.h>
  50 #include <net/netfilter/nf_conntrack_timestamp.h>
  51 #include <net/netfilter/nf_conntrack_timeout.h>
  52 #include <net/netfilter/nf_conntrack_labels.h>
  53 #include <net/netfilter/nf_conntrack_synproxy.h>
  54 #include <net/netfilter/nf_nat.h>
  55 #include <net/netfilter/nf_nat_core.h>
  56 #include <net/netfilter/nf_nat_helper.h>
  57 #include <net/netns/hash.h>
  58
  59 #define NF_CONNTRACK_VERSION    "0.5.0"
  60
  61 int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
  62                                       enum nf_nat_manip_type manip,
  63                                       const struct nlattr *attr) __read_mostly;
  64 EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook);
  65
  66 __cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
  67 EXPORT_SYMBOL_GPL(nf_conntrack_locks);
  68
  69 __cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
  70 EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);
  71
  72 struct hlist_nulls_head *nf_conntrack_hash __read_mostly;
  73 EXPORT_SYMBOL_GPL(nf_conntrack_hash);
  74
  75 struct conntrack_gc_work {
  76         struct delayed_work     dwork;
  77         u32                     last_bucket;
  78         bool                    exiting;
  79 };
  80
  81 static __read_mostly struct kmem_cache *nf_conntrack_cachep;
  82 static __read_mostly spinlock_t nf_conntrack_locks_all_lock;
  83 static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
  84 static __read_mostly bool nf_conntrack_locks_all;
  85
  86 #define GC_MAX_BUCKETS_DIV      64u
  87 #define GC_MAX_BUCKETS          8192u
  88 #define GC_INTERVAL             (5 * HZ)
  89 #define GC_MAX_EVICTS           256u
  90
  91 static struct conntrack_gc_work conntrack_gc_work;
  92
  93 void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
  94 {
  95         spin_lock(lock);
  96         while (unlikely(nf_conntrack_locks_all)) {
  97                 spin_unlock(lock);
  98
  99                 /*
 100                  * Order the 'nf_conntrack_locks_all' load vs. the
 101                  * spin_unlock_wait() loads below, to ensure
 102                  * that 'nf_conntrack_locks_all_lock' is indeed held:
 103                  */
 104                 smp_rmb(); /* spin_lock(&nf_conntrack_locks_all_lock) */
 105                 spin_unlock_wait(&nf_conntrack_locks_all_lock);
 106                 spin_lock(lock);
 107         }
 108 }
 109 EXPORT_SYMBOL_GPL(nf_conntrack_lock);
 110
 111 static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
 112 {
 113         h1 %= CONNTRACK_LOCKS;
 114         h2 %= CONNTRACK_LOCKS;
 115         spin_unlock(&nf_conntrack_locks[h1]);
 116         if (h1 != h2)
 117                 spin_unlock(&nf_conntrack_locks[h2]);
 118 }
 119
 120 /* return true if we need to recompute hashes (in case hash table was resized) */
 121 static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
 122                                      unsigned int h2, unsigned int sequence)
 123 {
 124         h1 %= CONNTRACK_LOCKS;
 125         h2 %= CONNTRACK_LOCKS;
 126         if (h1 <= h2) {
 127                 nf_conntrack_lock(&nf_conntrack_locks[h1]);
 128                 if (h1 != h2)
 129                         spin_lock_nested(&nf_conntrack_locks[h2],
 130                                          SINGLE_DEPTH_NESTING);
 131         } else {
 132                 nf_conntrack_lock(&nf_conntrack_locks[h2]);
 133                 spin_lock_nested(&nf_conntrack_locks[h1],
 134                                  SINGLE_DEPTH_NESTING);
 135         }
 136         if (read_seqcount_retry(&nf_conntrack_generation, sequence)) {
 137                 nf_conntrack_double_unlock(h1, h2);
 138                 return true;
 139         }
 140         return false;
 141 }
 142
 143 static void nf_conntrack_all_lock(void)
 144 {
 145         int i;
 146
 147         spin_lock(&nf_conntrack_locks_all_lock);
 148         nf_conntrack_locks_all = true;
 149
 150         /*
 151          * Order the above store of 'nf_conntrack_locks_all' against
 152          * the spin_unlock_wait() loads below, such that if
 153          * nf_conntrack_lock() observes 'nf_conntrack_locks_all'
 154          * we must observe nf_conntrack_locks[] held:
 155          */
 156         smp_mb(); /* spin_lock(&nf_conntrack_locks_all_lock) */
 157
 158         for (i = 0; i < CONNTRACK_LOCKS; i++) {
 159                 spin_unlock_wait(&nf_conntrack_locks[i]);
 160         }
 161 }
 162
 163 static void nf_conntrack_all_unlock(void)
 164 {
 165         /*
 166          * All prior stores must be complete before we clear
 167          * 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock()
 168          * might observe the false value but not the entire
 169          * critical section:
 170          */
 171         smp_store_release(&nf_conntrack_locks_all, false);
 172         spin_unlock(&nf_conntrack_locks_all_lock);
 173 }
 174
 175 unsigned int nf_conntrack_htable_size __read_mostly;
 176 EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
 177
 178 unsigned int nf_conntrack_max __read_mostly;
 179 seqcount_t nf_conntrack_generation __read_mostly;
 180
 181 DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
 182 EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
 183
 184 static unsigned int nf_conntrack_hash_rnd __read_mostly;
 185
 186 static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
 187                               const struct net *net)
 188 {
 189         unsigned int n;
 190         u32 seed;
 191
 192         get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd));
 193
 194         /* The direction must be ignored, so we hash everything up to the
 195          * destination ports (which is a multiple of 4) and treat the last
 196          * three bytes manually.
 197          */
 198         seed = nf_conntrack_hash_rnd ^ net_hash_mix(net);
 199         n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
 200         return jhash2((u32 *)tuple, n, seed ^
 201                       (((__force __u16)tuple->dst.u.all << 16) |
 202                       tuple->dst.protonum));
 203 }
 204
 205 static u32 scale_hash(u32 hash)
 206 {
 207         return reciprocal_scale(hash, nf_conntrack_htable_size);
 208 }
 209
 210 static u32 __hash_conntrack(const struct net *net,
 211                             const struct nf_conntrack_tuple *tuple,
 212                             unsigned int size)
 213 {
 214         return reciprocal_scale(hash_conntrack_raw(tuple, net), size);
 215 }
 216
 217 static u32 hash_conntrack(const struct net *net,
 218                           const struct nf_conntrack_tuple *tuple)
 219 {
 220         return scale_hash(hash_conntrack_raw(tuple, net));
 221 }
 222
 223 bool
 224 nf_ct_get_tuple(const struct sk_buff *skb,
 225                 unsigned int nhoff,
 226                 unsigned int dataoff,
 227                 u_int16_t l3num,
 228                 u_int8_t protonum,
 229                 struct net *net,
 230                 struct nf_conntrack_tuple *tuple,
 231                 const struct nf_conntrack_l3proto *l3proto,
 232                 const struct nf_conntrack_l4proto *l4proto)
 233 {
 234         memset(tuple, 0, sizeof(*tuple));
 235
 236         tuple->src.l3num = l3num;
 237         if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
 238                 return false;
 239
 240         tuple->dst.protonum = protonum;
 241         tuple->dst.dir = IP_CT_DIR_ORIGINAL;
 242
 243         return l4proto->pkt_to_tuple(skb, dataoff, net, tuple);
 244 }
 245 EXPORT_SYMBOL_GPL(nf_ct_get_tuple);
 246
 247 bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
 248                        u_int16_t l3num,
 249                        struct net *net, struct nf_conntrack_tuple *tuple)
 250 {
 251         struct nf_conntrack_l3proto *l3proto;
 252         struct nf_conntrack_l4proto *l4proto;
 253         unsigned int protoff;
 254         u_int8_t protonum;
 255         int ret;
 256
 257         rcu_read_lock();
 258
 259         l3proto = __nf_ct_l3proto_find(l3num);
 260         ret = l3proto->get_l4proto(skb, nhoff, &protoff, &protonum);
 261         if (ret != NF_ACCEPT) {
 262                 rcu_read_unlock();
 263                 return false;
 264         }
 265
 266         l4proto = __nf_ct_l4proto_find(l3num, protonum);
 267
 268         ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple,
 269                               l3proto, l4proto);
 270
 271         rcu_read_unlock();
 272         return ret;
 273 }
 274 EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);
 275
 276 bool
 277 nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
 278                    const struct nf_conntrack_tuple *orig,
 279                    const struct nf_conntrack_l3proto *l3proto,
 280                    const struct nf_conntrack_l4proto *l4proto)
 281 {
 282         memset(inverse, 0, sizeof(*inverse));
 283
 284         inverse->src.l3num = orig->src.l3num;
 285         if (l3proto->invert_tuple(inverse, orig) == 0)
 286                 return false;
 287
 288         inverse->dst.dir = !orig->dst.dir;
 289
 290         inverse->dst.protonum = orig->dst.protonum;
 291         return l4proto->invert_tuple(inverse, orig);
 292 }
 293 EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);
 294
 295 static void
 296 clean_from_lists(struct nf_conn *ct)
 297 {
 298         pr_debug("clean_from_lists(%p)\n", ct);
 299         hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
 300         hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);
 301
 302         /* Destroy all pending expectations */
 303         nf_ct_remove_expectations(ct);
 304 }
 305
 306 /* must be called with local_bh_disable */
 307 static void nf_ct_add_to_dying_list(struct nf_conn *ct)
 308 {
 309         struct ct_pcpu *pcpu;
 310
 311         /* add this conntrack to the (per cpu) dying list */
 312         ct->cpu = smp_processor_id();
 313         pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
 314
 315         spin_lock(&pcpu->lock);
 316         hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
 317                              &pcpu->dying);
 318         spin_unlock(&pcpu->lock);
 319 }
 320
 321 /* must be called with local_bh_disable */
 322 static void nf_ct_add_to_unconfirmed_list(struct nf_conn *ct)
 323 {
 324         struct ct_pcpu *pcpu;
 325
 326         /* add this conntrack to the (per cpu) unconfirmed list */
 327         ct->cpu = smp_processor_id();
 328         pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
 329
 330         spin_lock(&pcpu->lock);
 331         hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
 332                              &pcpu->unconfirmed);
 333         spin_unlock(&pcpu->lock);
 334 }
 335
 336 /* must be called with local_bh_disable */
 337 static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct)
 338 {
 339         struct ct_pcpu *pcpu;
 340
 341         /* We overload first tuple to link into unconfirmed or dying list.*/
 342         pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
 343
 344         spin_lock(&pcpu->lock);
 345         BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
 346         hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
 347         spin_unlock(&pcpu->lock);
 348 }
 349
 350 /* Released via destroy_conntrack() */
 351 struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
 352                                  const struct nf_conntrack_zone *zone,
 353                                  gfp_t flags)
 354 {
 355         struct nf_conn *tmpl;
 356
 357         tmpl = kzalloc(sizeof(*tmpl), flags);
 358         if (tmpl == NULL)
 359                 return NULL;
 360
 361         tmpl->status = IPS_TEMPLATE;
 362         write_pnet(&tmpl->ct_net, net);
 363         nf_ct_zone_add(tmpl, zone);
 364         atomic_set(&tmpl->ct_general.use, 0);
 365
 366         return tmpl;
 367 }
 368 EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);
 369
 370 void nf_ct_tmpl_free(struct nf_conn *tmpl)
 371 {
 372         nf_ct_ext_destroy(tmpl);
 373         nf_ct_ext_free(tmpl);
 374         kfree(tmpl);
 375 }
 376 EXPORT_SYMBOL_GPL(nf_ct_tmpl_free);
 377
 378 static void
 379 destroy_conntrack(struct nf_conntrack *nfct)
 380 {
 381         struct nf_conn *ct = (struct nf_conn *)nfct;
 382         struct net *net = nf_ct_net(ct);
 383         struct nf_conntrack_l4proto *l4proto;
 384
 385         pr_debug("destroy_conntrack(%p)\n", ct);
 386         NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
 387
 388         if (unlikely(nf_ct_is_template(ct))) {
 389                 nf_ct_tmpl_free(ct);
 390                 return;
 391         }
 392         rcu_read_lock();
 393         l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
 394         if (l4proto->destroy)
 395                 l4proto->destroy(ct);
 396
 397         rcu_read_unlock();
 398
 399         local_bh_disable();
 400         /* Expectations will have been removed in clean_from_lists,
 401          * except TFTP can create an expectation on the first packet,
 402          * before connection is in the list, so we need to clean here,
 403          * too.
 404          */
 405         nf_ct_remove_expectations(ct);
 406
 407         nf_ct_del_from_dying_or_unconfirmed_list(ct);
 408
 409         NF_CT_STAT_INC(net, delete);
 410         local_bh_enable();
 411
 412         if (ct->master)
 413                 nf_ct_put(ct->master);
 414
 415         pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct);
 416         nf_conntrack_free(ct);
 417 }
 418
 419 static void nf_ct_delete_from_lists(struct nf_conn *ct)
 420 {
 421         struct net *net = nf_ct_net(ct);
 422         unsigned int hash, reply_hash;
 423         unsigned int sequence;
 424
 425         nf_ct_helper_destroy(ct);
 426
 427         local_bh_disable();
 428         do {
 429                 sequence = read_seqcount_begin(&nf_conntrack_generation);
 430                 hash = hash_conntrack(net,
 431                                       &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
 432                 reply_hash = hash_conntrack(net,
 433                                            &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 434         } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
 435
 436         clean_from_lists(ct);
 437         nf_conntrack_double_unlock(hash, reply_hash);
 438
 439         nf_ct_add_to_dying_list(ct);
 440
 441         NF_CT_STAT_INC(net, delete_list);
 442         local_bh_enable();
 443 }
 444
 445 bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
 446 {
 447         struct nf_conn_tstamp *tstamp;
 448
 449         if (test_and_set_bit(IPS_DYING_BIT, &ct->status))
 450                 return false;
 451
 452         tstamp = nf_conn_tstamp_find(ct);
 453         if (tstamp && tstamp->stop == 0)
 454                 tstamp->stop = ktime_get_real_ns();
 455
 456         if (nf_conntrack_event_report(IPCT_DESTROY, ct,
 457                                     portid, report) < 0) {
 458                 /* destroy event was not delivered. nf_ct_put will
 459                  * be done by event cache worker on redelivery.
 460                  */
 461                 nf_ct_delete_from_lists(ct);
 462                 nf_conntrack_ecache_delayed_work(nf_ct_net(ct));
 463                 return false;
 464         }
 465
 466         nf_conntrack_ecache_work(nf_ct_net(ct));
 467         nf_ct_delete_from_lists(ct);
 468         nf_ct_put(ct);
 469         return true;
 470 }
 471 EXPORT_SYMBOL_GPL(nf_ct_delete);
 472
 473 static inline bool
 474 nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
 475                 const struct nf_conntrack_tuple *tuple,
 476                 const struct nf_conntrack_zone *zone,
 477                 const struct net *net)
 478 {
 479         struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
 480
 481         /* A conntrack can be recreated with the equal tuple,
 482          * so we need to check that the conntrack is confirmed
 483          */
 484         return nf_ct_tuple_equal(tuple, &h->tuple) &&
 485                nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) &&
 486                nf_ct_is_confirmed(ct) &&
 487                net_eq(net, nf_ct_net(ct));
 488 }
 489
 490 /* caller must hold rcu readlock and none of the nf_conntrack_locks */
 491 static void nf_ct_gc_expired(struct nf_conn *ct)
 492 {
 493         if (!atomic_inc_not_zero(&ct->ct_general.use))
 494                 return;
 495
 496         if (nf_ct_should_gc(ct))
 497                 nf_ct_kill(ct);
 498
 499         nf_ct_put(ct);
 500 }
 501
 502 /*
 503  * Warning :
 504  * - Caller must take a reference on returned object
 505  *   and recheck nf_ct_tuple_equal(tuple, &h->tuple)
 506  */
 507 static struct nf_conntrack_tuple_hash *
 508 ____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
 509                       const struct nf_conntrack_tuple *tuple, u32 hash)
 510 {
 511         struct nf_conntrack_tuple_hash *h;
 512         struct hlist_nulls_head *ct_hash;
 513         struct hlist_nulls_node *n;
 514         unsigned int bucket, hsize;
 515
 516 begin:
 517         nf_conntrack_get_ht(&ct_hash, &hsize);
 518         bucket = reciprocal_scale(hash, hsize);
 519
 520         hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) {
 521                 struct nf_conn *ct;
 522
 523                 ct = nf_ct_tuplehash_to_ctrack(h);
 524                 if (nf_ct_is_expired(ct)) {
 525                         nf_ct_gc_expired(ct);
 526                         continue;
 527                 }
 528
 529                 if (nf_ct_is_dying(ct))
 530                         continue;
 531
 532                 if (nf_ct_key_equal(h, tuple, zone, net)) {
 533                         NF_CT_STAT_INC_ATOMIC(net, found);
 534                         return h;
 535                 }
 536                 NF_CT_STAT_INC_ATOMIC(net, searched);
 537         }
 538         /*
 539          * if the nulls value we got at the end of this lookup is
 540          * not the expected one, we must restart lookup.
 541          * We probably met an item that was moved to another chain.
 542          */
 543         if (get_nulls_value(n) != bucket) {
 544                 NF_CT_STAT_INC_ATOMIC(net, search_restart);
 545                 goto begin;
 546         }
 547
 548         return NULL;
 549 }
 550
 551 /* Find a connection corresponding to a tuple. */
 552 static struct nf_conntrack_tuple_hash *
 553 __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
 554                         const struct nf_conntrack_tuple *tuple, u32 hash)
 555 {
 556         struct nf_conntrack_tuple_hash *h;
 557         struct nf_conn *ct;
 558
 559         rcu_read_lock();
 560 begin:
 561         h = ____nf_conntrack_find(net, zone, tuple, hash);
 562         if (h) {
 563                 ct = nf_ct_tuplehash_to_ctrack(h);
 564                 if (unlikely(nf_ct_is_dying(ct) ||
 565                              !atomic_inc_not_zero(&ct->ct_general.use)))
 566                         h = NULL;
 567                 else {
 568                         if (unlikely(!nf_ct_key_equal(h, tuple, zone, net))) {
 569                                 nf_ct_put(ct);
 570                                 goto begin;
 571                         }
 572                 }
 573         }
 574         rcu_read_unlock();
 575
 576         return h;
 577 }
 578
 579 struct nf_conntrack_tuple_hash *
 580 nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
 581                       const struct nf_conntrack_tuple *tuple)
 582 {
 583         return __nf_conntrack_find_get(net, zone, tuple,
 584                                        hash_conntrack_raw(tuple, net));
 585 }
 586 EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
 587
 588 static void __nf_conntrack_hash_insert(struct nf_conn *ct,
 589                                        unsigned int hash,
 590                                        unsigned int reply_hash)
 591 {
 592         hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
 593                            &nf_conntrack_hash[hash]);
 594         hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
 595                            &nf_conntrack_hash[reply_hash]);
 596 }
 597
 598 int
 599 nf_conntrack_hash_check_insert(struct nf_conn *ct)
 600 {
 601         const struct nf_conntrack_zone *zone;
 602         struct net *net = nf_ct_net(ct);
 603         unsigned int hash, reply_hash;
 604         struct nf_conntrack_tuple_hash *h;
 605         struct hlist_nulls_node *n;
 606         unsigned int sequence;
 607
 608         zone = nf_ct_zone(ct);
 609
 610         local_bh_disable();
 611         do {
 612                 sequence = read_seqcount_begin(&nf_conntrack_generation);
 613                 hash = hash_conntrack(net,
 614                                       &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
 615                 reply_hash = hash_conntrack(net,
 616                                            &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 617         } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
 618
 619         /* See if there's one in the list already, including reverse */
 620         hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
 621                 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
 622                                     zone, net))
 623                         goto out;
 624
 625         hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
 626                 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
 627                                     zone, net))
 628                         goto out;
 629
 630         smp_wmb();
 631         /* The caller holds a reference to this object */
 632         atomic_set(&ct->ct_general.use, 2);
 633         __nf_conntrack_hash_insert(ct, hash, reply_hash);
 634         nf_conntrack_double_unlock(hash, reply_hash);
 635         NF_CT_STAT_INC(net, insert);
 636         local_bh_enable();
 637         return 0;
 638
 639 out:
 640         nf_conntrack_double_unlock(hash, reply_hash);
 641         NF_CT_STAT_INC(net, insert_failed);
 642         local_bh_enable();
 643         return -EEXIST;
 644 }
 645 EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
 646
 647 static inline void nf_ct_acct_update(struct nf_conn *ct,
 648                                      enum ip_conntrack_info ctinfo,
 649                                      unsigned int len)
 650 {
 651         struct nf_conn_acct *acct;
 652
 653         acct = nf_conn_acct_find(ct);
 654         if (acct) {
 655                 struct nf_conn_counter *counter = acct->counter;
 656
 657                 atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets);
 658                 atomic64_add(len, &counter[CTINFO2DIR(ctinfo)].bytes);
 659         }
 660 }
 661
 662 static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
 663                              const struct nf_conn *loser_ct)
 664 {
 665         struct nf_conn_acct *acct;
 666
 667         acct = nf_conn_acct_find(loser_ct);
 668         if (acct) {
 669                 struct nf_conn_counter *counter = acct->counter;
 670                 unsigned int bytes;
 671
 672                 /* u32 should be fine since we must have seen one packet. */
 673                 bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes);
 674                 nf_ct_acct_update(ct, ctinfo, bytes);
 675         }
 676 }
 677
 678 /* Resolve race on insertion if this protocol allows this. */
 679 static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
 680                                enum ip_conntrack_info ctinfo,
 681                                struct nf_conntrack_tuple_hash *h)
 682 {
 683         /* This is the conntrack entry already in hashes that won race. */
 684         struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
 685         struct nf_conntrack_l4proto *l4proto;
 686
 687         l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
 688         if (l4proto->allow_clash &&
 689             !nfct_nat(ct) &&
 690             !nf_ct_is_dying(ct) &&
 691             atomic_inc_not_zero(&ct->ct_general.use)) {
 692                 nf_ct_acct_merge(ct, ctinfo, (struct nf_conn *)skb->nfct);
 693                 nf_conntrack_put(skb->nfct);
 694                 /* Assign conntrack already in hashes to this skbuff. Don't
 695                  * modify skb->nfctinfo to ensure consistent stateful filtering.
 696                  */
 697                 skb->nfct = &ct->ct_general;
 698                 return NF_ACCEPT;
 699         }
 700         NF_CT_STAT_INC(net, drop);
 701         return NF_DROP;
 702 }
 703
 704 /* Confirm a connection given skb; places it in hash table */
 705 int
 706 __nf_conntrack_confirm(struct sk_buff *skb)
 707 {
 708         const struct nf_conntrack_zone *zone;
 709         unsigned int hash, reply_hash;
 710         struct nf_conntrack_tuple_hash *h;
 711         struct nf_conn *ct;
 712         struct nf_conn_help *help;
 713         struct nf_conn_tstamp *tstamp;
 714         struct hlist_nulls_node *n;
 715         enum ip_conntrack_info ctinfo;
 716         struct net *net;
 717         unsigned int sequence;
 718         int ret = NF_DROP;
 719
 720         ct = nf_ct_get(skb, &ctinfo);
 721         net = nf_ct_net(ct);
 722
 723         /* ipt_REJECT uses nf_conntrack_attach to attach related
 724            ICMP/TCP RST packets in other direction.  Actual packet
 725            which created connection will be IP_CT_NEW or for an
 726            expected connection, IP_CT_RELATED. */
 727         if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
 728                 return NF_ACCEPT;
 729
 730         zone = nf_ct_zone(ct);
 731         local_bh_disable();
 732
 733         do {
 734                 sequence = read_seqcount_begin(&nf_conntrack_generation);
 735                 /* reuse the hash saved before */
 736                 hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
 737                 hash = scale_hash(hash);
 738                 reply_hash = hash_conntrack(net,
 739                                            &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 740
 741         } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
 742
 743         /* We're not in hash table, and we refuse to set up related
 744          * connections for unconfirmed conns.  But packet copies and
 745          * REJECT will give spurious warnings here.
 746          */
 747         /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
 748
 749         /* No external references means no one else could have
 750          * confirmed us.
 751          */
 752         NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
 753         pr_debug("Confirming conntrack %p\n", ct);
 754         /* We have to check the DYING flag after unlink to prevent
 755          * a race against nf_ct_get_next_corpse() possibly called from
 756          * user context, else we insert an already 'dead' hash, blocking
 757          * further use of that particular connection -JM.
 758          */
 759         nf_ct_del_from_dying_or_unconfirmed_list(ct);
 760
 761         if (unlikely(nf_ct_is_dying(ct))) {
 762                 nf_ct_add_to_dying_list(ct);
 763                 goto dying;
 764         }
 765
 766         /* See if there's one in the list already, including reverse:
 767            NAT could have grabbed it without realizing, since we're
 768            not in the hash.  If there is, we lost race. */
 769         hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
 770                 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
 771                                     zone, net))
 772                         goto out;
 773
 774         hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
 775                 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
 776                                     zone, net))
 777                         goto out;
 778
 779         /* Timer relative to confirmation time, not original
 780            setting time, otherwise we'd get timer wrap in
 781            weird delay cases. */
 782         ct->timeout += nfct_time_stamp;
 783         atomic_inc(&ct->ct_general.use);
 784         ct->status |= IPS_CONFIRMED;
 785
 786         /* set conntrack timestamp, if enabled. */
 787         tstamp = nf_conn_tstamp_find(ct);
 788         if (tstamp) {
 789                 if (skb->tstamp.tv64 == 0)
 790                         __net_timestamp(skb);
 791
 792                 tstamp->start = ktime_to_ns(skb->tstamp);
 793         }
 794         /* Since the lookup is lockless, hash insertion must be done after
 795          * starting the timer and setting the CONFIRMED bit. The RCU barriers
 796          * guarantee that no other CPU can find the conntrack before the above
 797          * stores are visible.
 798          */
 799         __nf_conntrack_hash_insert(ct, hash, reply_hash);
 800         nf_conntrack_double_unlock(hash, reply_hash);
 801         NF_CT_STAT_INC(net, insert);
 802         local_bh_enable();
 803
 804         help = nfct_help(ct);
 805         if (help && help->helper)
 806                 nf_conntrack_event_cache(IPCT_HELPER, ct);
 807
 808         nf_conntrack_event_cache(master_ct(ct) ?
 809                                  IPCT_RELATED : IPCT_NEW, ct);
 810         return NF_ACCEPT;
 811
 812 out:
 813         nf_ct_add_to_dying_list(ct);
 814         ret = nf_ct_resolve_clash(net, skb, ctinfo, h);
 815 dying:
 816         nf_conntrack_double_unlock(hash, reply_hash);
 817         NF_CT_STAT_INC(net, insert_failed);
 818         local_bh_enable();
 819         return ret;
 820 }
 821 EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
 822
 823 /* Returns true if a connection correspondings to the tuple (required
 824    for NAT). */
 825 int
 826 nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
 827                          const struct nf_conn *ignored_conntrack)
 828 {
 829         struct net *net = nf_ct_net(ignored_conntrack);
 830         const struct nf_conntrack_zone *zone;
 831         struct nf_conntrack_tuple_hash *h;
 832         struct hlist_nulls_head *ct_hash;
 833         unsigned int hash, hsize;
 834         struct hlist_nulls_node *n;
 835         struct nf_conn *ct;
 836
 837         zone = nf_ct_zone(ignored_conntrack);
 838
 839         rcu_read_lock();
 840  begin:
 841         nf_conntrack_get_ht(&ct_hash, &hsize);
 842         hash = __hash_conntrack(net, tuple, hsize);
 843
 844         hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
 845                 ct = nf_ct_tuplehash_to_ctrack(h);
 846
 847                 if (ct == ignored_conntrack)
 848                         continue;
 849
 850                 if (nf_ct_is_expired(ct)) {
 851                         nf_ct_gc_expired(ct);
 852                         continue;
 853                 }
 854
 855                 if (nf_ct_key_equal(h, tuple, zone, net)) {
 856                         NF_CT_STAT_INC_ATOMIC(net, found);
 857                         rcu_read_unlock();
 858                         return 1;
 859                 }
 860                 NF_CT_STAT_INC_ATOMIC(net, searched);
 861         }
 862
 863         if (get_nulls_value(n) != hash) {
 864                 NF_CT_STAT_INC_ATOMIC(net, search_restart);
 865                 goto begin;
 866         }
 867
 868         rcu_read_unlock();
 869
 870         return 0;
 871 }
 872 EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);
 873
 874 #define NF_CT_EVICTION_RANGE    8
 875
 876 /* There's a small race here where we may free a just-assured
 877    connection.  Too bad: we're in trouble anyway. */
 878 static unsigned int early_drop_list(struct net *net,
 879                                     struct hlist_nulls_head *head)
 880 {
 881         struct nf_conntrack_tuple_hash *h;
 882         struct hlist_nulls_node *n;
 883         unsigned int drops = 0;
 884         struct nf_conn *tmp;
 885
 886         hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
 887                 tmp = nf_ct_tuplehash_to_ctrack(h);
 888
 889                 if (nf_ct_is_expired(tmp)) {
 890                         nf_ct_gc_expired(tmp);
 891                         continue;
 892                 }
 893
 894                 if (test_bit(IPS_ASSURED_BIT, &tmp->status) ||
 895                     !net_eq(nf_ct_net(tmp), net) ||
 896                     nf_ct_is_dying(tmp))
 897                         continue;
 898
 899                 if (!atomic_inc_not_zero(&tmp->ct_general.use))
 900                         continue;
 901
 902                 /* kill only if still in same netns -- might have moved due to
 903                  * SLAB_DESTROY_BY_RCU rules.
 904                  *
 905                  * We steal the timer reference.  If that fails timer has
 906                  * already fired or someone else deleted it. Just drop ref
 907                  * and move to next entry.
 908                  */
 909                 if (net_eq(nf_ct_net(tmp), net) &&
 910                     nf_ct_is_confirmed(tmp) &&
 911                     nf_ct_delete(tmp, 0, 0))
 912                         drops++;
 913
 914                 nf_ct_put(tmp);
 915         }
 916
 917         return drops;
 918 }
 919
 920 static noinline int early_drop(struct net *net, unsigned int _hash)
 921 {
 922         unsigned int i;
 923
 924         for (i = 0; i < NF_CT_EVICTION_RANGE; i++) {
 925                 struct hlist_nulls_head *ct_hash;
 926                 unsigned int hash, hsize, drops;
 927
 928                 rcu_read_lock();
 929                 nf_conntrack_get_ht(&ct_hash, &hsize);
 930                 hash = reciprocal_scale(_hash++, hsize);
 931
 932                 drops = early_drop_list(net, &ct_hash[hash]);
 933                 rcu_read_unlock();
 934
 935                 if (drops) {
 936                         NF_CT_STAT_ADD_ATOMIC(net, early_drop, drops);
 937                         return true;
 938                 }
 939         }
 940
 941         return false;
 942 }
 943
 944 static void gc_worker(struct work_struct *work)
 945 {
 946         unsigned int i, goal, buckets = 0, expired_count = 0;
 947         unsigned long next_run = GC_INTERVAL;
 948         unsigned int ratio, scanned = 0;
 949         struct conntrack_gc_work *gc_work;
 950
 951         gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
 952
 953         goal = min(nf_conntrack_htable_size / GC_MAX_BUCKETS_DIV, GC_MAX_BUCKETS);
 954         i = gc_work->last_bucket;
 955
 956         do {
 957                 struct nf_conntrack_tuple_hash *h;
 958                 struct hlist_nulls_head *ct_hash;
 959                 struct hlist_nulls_node *n;
 960                 unsigned int hashsz;
 961                 struct nf_conn *tmp;
 962
 963                 i++;
 964                 rcu_read_lock();
 965
 966                 nf_conntrack_get_ht(&ct_hash, &hashsz);
 967                 if (i >= hashsz)
 968                         i = 0;
 969
 970                 hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
 971                         tmp = nf_ct_tuplehash_to_ctrack(h);
 972
 973                         scanned++;
 974                         if (nf_ct_is_expired(tmp)) {
 975                                 nf_ct_gc_expired(tmp);
 976                                 expired_count++;
 977                                 continue;
 978                         }
 979                 }
 980
 981                 /* could check get_nulls_value() here and restart if ct
 982                  * was moved to another chain.  But given gc is best-effort
 983                  * we will just continue with next hash slot.
 984                  */
 985                 rcu_read_unlock();
 986                 cond_resched_rcu_qs();
 987         } while (++buckets < goal &&
 988                  expired_count < GC_MAX_EVICTS);
 989
 990         if (gc_work->exiting)
 991                 return;
 992
 993         ratio = scanned ? expired_count * 100 / scanned : 0;
 994         if (ratio >= 90)
 995                 next_run = 0;
 996
 997         gc_work->last_bucket = i;
 998         schedule_delayed_work(&gc_work->dwork, next_run);
 999 }
1000
1001 static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
1002 {
1003         INIT_DELAYED_WORK(&gc_work->dwork, gc_worker);
1004         gc_work->exiting = false;
1005 }
1006
1007 static struct nf_conn *
1008 __nf_conntrack_alloc(struct net *net,
1009                      const struct nf_conntrack_zone *zone,
1010                      const struct nf_conntrack_tuple *orig,
1011                      const struct nf_conntrack_tuple *repl,
1012                      gfp_t gfp, u32 hash)
1013 {
1014         struct nf_conn *ct;
1015
1016         /* We don't want any race condition at early drop stage */
1017         atomic_inc(&net->ct.count);
1018
1019         if (nf_conntrack_max &&
1020             unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
1021                 if (!early_drop(net, hash)) {
1022                         atomic_dec(&net->ct.count);
1023                         net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
1024                         return ERR_PTR(-ENOMEM);
1025                 }
1026         }
1027
1028         /*
1029          * Do not use kmem_cache_zalloc(), as this cache uses
1030          * SLAB_DESTROY_BY_RCU.
1031          */
1032         ct = kmem_cache_alloc(nf_conntrack_cachep, gfp);
1033         if (ct == NULL)
1034                 goto out;
1035
1036         spin_lock_init(&ct->lock);
1037         ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
1038         ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
1039         ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
1040         /* save hash for reusing when confirming */
1041         *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
1042         ct->status = 0;
1043         write_pnet(&ct->ct_net, net);
1044         memset(&ct->__nfct_init_offset[0], 0,
1045                offsetof(struct nf_conn, proto) -
1046                offsetof(struct nf_conn, __nfct_init_offset[0]));
1047
1048         nf_ct_zone_add(ct, zone);
1049
1050         /* Because we use RCU lookups, we set ct_general.use to zero before
1051          * this is inserted in any list.
1052          */
1053         atomic_set(&ct->ct_general.use, 0);
1054         return ct;
1055 out:
1056         atomic_dec(&net->ct.count);
1057         return ERR_PTR(-ENOMEM);
1058 }
1059
1060 struct nf_conn *nf_conntrack_alloc(struct net *net,
1061                                    const struct nf_conntrack_zone *zone,
1062                                    const struct nf_conntrack_tuple *orig,
1063                                    const struct nf_conntrack_tuple *repl,
1064                                    gfp_t gfp)
1065 {
1066         return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0);
1067 }
1068 EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
1069
1070 void nf_conntrack_free(struct nf_conn *ct)
1071 {
1072         struct net *net = nf_ct_net(ct);
1073
1074         /* A freed object has refcnt == 0, that's
1075          * the golden rule for SLAB_DESTROY_BY_RCU
1076          */
1077         NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0);
1078
1079         nf_ct_ext_destroy(ct);
1080         nf_ct_ext_free(ct);
1081         kmem_cache_free(nf_conntrack_cachep, ct);
1082         smp_mb__before_atomic();
1083         atomic_dec(&net->ct.count);
1084 }
1085 EXPORT_SYMBOL_GPL(nf_conntrack_free);
1086
1087
1088 /* Allocate a new conntrack: we return -ENOMEM if classification
1089    failed due to stress.  Otherwise it really is unclassifiable. */
1090 static struct nf_conntrack_tuple_hash *
1091 init_conntrack(struct net *net, struct nf_conn *tmpl,
1092                const struct nf_conntrack_tuple *tuple,
1093                struct nf_conntrack_l3proto *l3proto,
1094                struct nf_conntrack_l4proto *l4proto,
1095                struct sk_buff *skb,
1096                unsigned int dataoff, u32 hash)
1097 {
1098         struct nf_conn *ct;
1099         struct nf_conn_help *help;
1100         struct nf_conntrack_tuple repl_tuple;
1101         struct nf_conntrack_ecache *ecache;
1102         struct nf_conntrack_expect *exp = NULL;
1103         const struct nf_conntrack_zone *zone;
1104         struct nf_conn_timeout *timeout_ext;
1105         struct nf_conntrack_zone tmp;
1106         unsigned int *timeouts;
1107
1108         if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) {
1109                 pr_debug("Can't invert tuple.\n");
1110                 return NULL;
1111         }
1112
1113         zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
1114         ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
1115                                   hash);
1116         if (IS_ERR(ct))
1117                 return (struct nf_conntrack_tuple_hash *)ct;
1118
1119         if (tmpl && nfct_synproxy(tmpl)) {
1120                 nfct_seqadj_ext_add(ct);
1121                 nfct_synproxy_ext_add(ct);
1122         }
1123
1124         timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;
1125         if (timeout_ext) {
1126                 timeouts = nf_ct_timeout_data(timeout_ext);
1127                 if (unlikely(!timeouts))
1128                         timeouts = l4proto->get_timeouts(net);
1129         } else {
1130                 timeouts = l4proto->get_timeouts(net);
1131         }
1132
1133         if (!l4proto->new(ct, skb, dataoff, timeouts)) {
1134                 nf_conntrack_free(ct);
1135                 pr_debug("can't track with proto module\n");
1136                 return NULL;
1137         }
1138
1139         if (timeout_ext)
1140                 nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout),
1141                                       GFP_ATOMIC);
1142
1143         nf_ct_acct_ext_add(ct, GFP_ATOMIC);
1144         nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
1145         nf_ct_labels_ext_add(ct);
1146
1147         ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;
1148         nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
1149                                  ecache ? ecache->expmask : 0,
1150                              GFP_ATOMIC);
1151
1152         local_bh_disable();
1153         if (net->ct.expect_count) {
1154                 spin_lock(&nf_conntrack_expect_lock);
1155                 exp = nf_ct_find_expectation(net, zone, tuple);
1156                 if (exp) {
1157                         pr_debug("expectation arrives ct=%p exp=%p\n",
1158                                  ct, exp);
1159                         /* Welcome, Mr. Bond.  We've been expecting you... */
1160                         __set_bit(IPS_EXPECTED_BIT, &ct->status);
1161                         /* exp->master safe, refcnt bumped in nf_ct_find_expectation */
1162                         ct->master = exp->master;
1163                         if (exp->helper) {
1164                                 help = nf_ct_helper_ext_add(ct, exp->helper,
1165                                                             GFP_ATOMIC);
1166                                 if (help)
1167                                         rcu_assign_pointer(help->helper, exp->helper);
1168                         }
1169
1170 #ifdef CONFIG_NF_CONNTRACK_MARK
1171                         ct->mark = exp->master->mark;
1172 #endif
1173 #ifdef CONFIG_NF_CONNTRACK_SECMARK
1174                         ct->secmark = exp->master->secmark;
1175 #endif
1176                         NF_CT_STAT_INC(net, expect_new);
1177                 }
1178                 spin_unlock(&nf_conntrack_expect_lock);
1179         }
1180         if (!exp) {
1181                 __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);
1182                 NF_CT_STAT_INC(net, new);
1183         }
1184
1185         /* Now it is inserted into the unconfirmed list, bump refcount */
1186         nf_conntrack_get(&ct->ct_general);
1187         nf_ct_add_to_unconfirmed_list(ct);
1188
1189         local_bh_enable();
1190
1191         if (exp) {
1192                 if (exp->expectfn)
1193                         exp->expectfn(ct, exp);
1194                 nf_ct_expect_put(exp);
1195         }
1196
1197         return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
1198 }
1199
1200 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
1201 static inline struct nf_conn *
1202 resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
1203                   struct sk_buff *skb,
1204                   unsigned int dataoff,
1205                   u_int16_t l3num,
1206                   u_int8_t protonum,
1207                   struct nf_conntrack_l3proto *l3proto,
1208                   struct nf_conntrack_l4proto *l4proto,
1209                   int *set_reply,
1210                   enum ip_conntrack_info *ctinfo)
1211 {
1212         const struct nf_conntrack_zone *zone;
1213         struct nf_conntrack_tuple tuple;
1214         struct nf_conntrack_tuple_hash *h;
1215         struct nf_conntrack_zone tmp;
1216         struct nf_conn *ct;
1217         u32 hash;
1218
1219         if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
1220                              dataoff, l3num, protonum, net, &tuple, l3proto,
1221                              l4proto)) {
1222                 pr_debug("Can't get tuple\n");
1223                 return NULL;
1224         }
1225
1226         /* look for tuple match */
1227         zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
1228         hash = hash_conntrack_raw(&tuple, net);
1229         h = __nf_conntrack_find_get(net, zone, &tuple, hash);
1230         if (!h) {
1231                 h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
1232                                    skb, dataoff, hash);
1233                 if (!h)
1234                         return NULL;
1235                 if (IS_ERR(h))
1236                         return (void *)h;
1237         }
1238         ct = nf_ct_tuplehash_to_ctrack(h);
1239
1240         /* It exists; we have (non-exclusive) reference. */
1241         if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
1242                 *ctinfo = IP_CT_ESTABLISHED_REPLY;
1243                 /* Please set reply bit if this packet OK */
1244                 *set_reply = 1;
1245         } else {
1246                 /* Once we've had two way comms, always ESTABLISHED. */
1247                 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
1248                         pr_debug("normal packet for %p\n", ct);
1249                         *ctinfo = IP_CT_ESTABLISHED;
1250                 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
1251                         pr_debug("related packet for %p\n", ct);
1252                         *ctinfo = IP_CT_RELATED;
1253                 } else {
1254                         pr_debug("new packet for %p\n", ct);
1255                         *ctinfo = IP_CT_NEW;
1256                 }
1257                 *set_reply = 0;
1258         }
1259         skb->nfct = &ct->ct_general;
1260         skb->nfctinfo = *ctinfo;
1261         return ct;
1262 }
1263
1264 unsigned int
1265 nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
1266                 struct sk_buff *skb)
1267 {
1268         struct nf_conn *ct, *tmpl = NULL;
1269         enum ip_conntrack_info ctinfo;
1270         struct nf_conntrack_l3proto *l3proto;
1271         struct nf_conntrack_l4proto *l4proto;
1272         unsigned int *timeouts;
1273         unsigned int dataoff;
1274         u_int8_t protonum;
1275         int set_reply = 0;
1276         int ret;
1277
1278         if (skb->nfct) {
1279                 /* Previously seen (loopback or untracked)?  Ignore. */
1280                 tmpl = (struct nf_conn *)skb->nfct;
1281                 if (!nf_ct_is_template(tmpl)) {
1282                         NF_CT_STAT_INC_ATOMIC(net, ignore);
1283                         return NF_ACCEPT;
1284                 }
1285                 skb->nfct = NULL;
1286         }
1287
1288         /* rcu_read_lock()ed by nf_hook_slow */
1289         l3proto = __nf_ct_l3proto_find(pf);
1290         ret = l3proto->get_l4proto(skb, skb_network_offset(skb),
1291                                    &dataoff, &protonum);
1292         if (ret <= 0) {
1293                 pr_debug("not prepared to track yet or error occurred\n");
1294                 NF_CT_STAT_INC_ATOMIC(net, error);
1295                 NF_CT_STAT_INC_ATOMIC(net, invalid);
1296                 ret = -ret;
1297                 goto out;
1298         }
1299
1300         l4proto = __nf_ct_l4proto_find(pf, protonum);
1301
1302         /* It may be an special packet, error, unclean...
1303          * inverse of the return code tells to the netfilter
1304          * core what to do with the packet. */
1305         if (l4proto->error != NULL) {
1306                 ret = l4proto->error(net, tmpl, skb, dataoff, &ctinfo,
1307                                      pf, hooknum);
1308                 if (ret <= 0) {
1309                         NF_CT_STAT_INC_ATOMIC(net, error);
1310                         NF_CT_STAT_INC_ATOMIC(net, invalid);
1311                         ret = -ret;
1312                         goto out;
1313                 }
1314                 /* ICMP[v6] protocol trackers may assign one conntrack. */
1315                 if (skb->nfct)
1316                         goto out;
1317         }
1318
1319         ct = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum,
1320                                l3proto, l4proto, &set_reply, &ctinfo);
1321         if (!ct) {
1322                 /* Not valid part of a connection */
1323                 NF_CT_STAT_INC_ATOMIC(net, invalid);
1324                 ret = NF_ACCEPT;
1325                 goto out;
1326         }
1327
1328         if (IS_ERR(ct)) {
1329                 /* Too stressed to deal. */
1330                 NF_CT_STAT_INC_ATOMIC(net, drop);
1331                 ret = NF_DROP;
1332                 goto out;
1333         }
1334
1335         NF_CT_ASSERT(skb->nfct);
1336
1337         /* Decide what timeout policy we want to apply to this flow. */
1338         timeouts = nf_ct_timeout_lookup(net, ct, l4proto);
1339
1340         ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum, timeouts);
1341         if (ret <= 0) {
1342                 /* Invalid: inverse of the return code tells
1343                  * the netfilter core what to do */
1344                 pr_debug("nf_conntrack_in: Can't track with proto module\n");
1345                 nf_conntrack_put(skb->nfct);
1346                 skb->nfct = NULL;
1347                 NF_CT_STAT_INC_ATOMIC(net, invalid);
1348                 if (ret == -NF_DROP)
1349                         NF_CT_STAT_INC_ATOMIC(net, drop);
1350                 ret = -ret;
1351                 goto out;
1352         }
1353
1354         if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
1355                 nf_conntrack_event_cache(IPCT_REPLY, ct);
1356 out:
1357         if (tmpl) {
1358                 /* Special case: we have to repeat this hook, assign the
1359                  * template again to this packet. We assume that this packet
1360                  * has no conntrack assigned. This is used by nf_ct_tcp. */
1361                 if (ret == NF_REPEAT)
1362                         skb->nfct = (struct nf_conntrack *)tmpl;
1363                 else
1364                         nf_ct_put(tmpl);
1365         }
1366
1367         return ret;
1368 }
1369 EXPORT_SYMBOL_GPL(nf_conntrack_in);
1370
1371 bool nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
1372                           const struct nf_conntrack_tuple *orig)
1373 {
1374         bool ret;
1375
1376         rcu_read_lock();
1377         ret = nf_ct_invert_tuple(inverse, orig,
1378                                  __nf_ct_l3proto_find(orig->src.l3num),
1379                                  __nf_ct_l4proto_find(orig->src.l3num,
1380                                                       orig->dst.protonum));
1381         rcu_read_unlock();
1382         return ret;
1383 }
1384 EXPORT_SYMBOL_GPL(nf_ct_invert_tuplepr);
1385
1386 /* Alter reply tuple (maybe alter helper).  This is for NAT, and is
1387    implicitly racy: see __nf_conntrack_confirm */
1388 void nf_conntrack_alter_reply(struct nf_conn *ct,
1389                               const struct nf_conntrack_tuple *newreply)
1390 {
1391         struct nf_conn_help *help = nfct_help(ct);
1392
1393         /* Should be unconfirmed, so not in hash table yet */
1394         NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
1395
1396         pr_debug("Altering reply tuple of %p to ", ct);
1397         nf_ct_dump_tuple(newreply);
1398
1399         ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1400         if (ct->master || (help && !hlist_empty(&help->expectations)))
1401                 return;
1402
1403         rcu_read_lock();
1404         __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC);
1405         rcu_read_unlock();
1406 }
1407 EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply);
1408
1409 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1410 void __nf_ct_refresh_acct(struct nf_conn *ct,
1411                           enum ip_conntrack_info ctinfo,
1412                           const struct sk_buff *skb,
1413                           unsigned long extra_jiffies,
1414                           int do_acct)
1415 {
1416         NF_CT_ASSERT(skb);
1417
1418         /* Only update if this is not a fixed timeout */
1419         if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
1420                 goto acct;
1421
1422         /* If not in hash table, timer will not be active yet */
1423         if (nf_ct_is_confirmed(ct))
1424                 extra_jiffies += nfct_time_stamp;
1425
1426         ct->timeout = extra_jiffies;
1427 acct:
1428         if (do_acct)
1429                 nf_ct_acct_update(ct, ctinfo, skb->len);
1430 }
1431 EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
1432
1433 bool __nf_ct_kill_acct(struct nf_conn *ct,
1434                        enum ip_conntrack_info ctinfo,
1435                        const struct sk_buff *skb,
1436                        int do_acct)
1437 {
1438         if (do_acct)
1439                 nf_ct_acct_update(ct, ctinfo, skb->len);
1440
1441         return nf_ct_delete(ct, 0, 0);
1442 }
1443 EXPORT_SYMBOL_GPL(__nf_ct_kill_acct);
1444
1445 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
1446
1447 #include <linux/netfilter/nfnetlink.h>
1448 #include <linux/netfilter/nfnetlink_conntrack.h>
1449 #include <linux/mutex.h>
1450
1451 /* Generic function for tcp/udp/sctp/dccp and alike. This needs to be
1452  * in ip_conntrack_core, since we don't want the protocols to autoload
1453  * or depend on ctnetlink */
1454 int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,
1455                                const struct nf_conntrack_tuple *tuple)
1456 {
1457         if (nla_put_be16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port) ||
1458             nla_put_be16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port))
1459                 goto nla_put_failure;
1460         return 0;
1461
1462 nla_put_failure:
1463         return -1;
1464 }
1465 EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr);
1466
1467 const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = {
1468         [CTA_PROTO_SRC_PORT]  = { .type = NLA_U16 },
1469         [CTA_PROTO_DST_PORT]  = { .type = NLA_U16 },
1470 };
1471 EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy);
1472
1473 int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[],
1474                                struct nf_conntrack_tuple *t)
1475 {
1476         if (!tb[CTA_PROTO_SRC_PORT] || !tb[CTA_PROTO_DST_PORT])
1477                 return -EINVAL;
1478
1479         t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]);
1480         t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]);
1481
1482         return 0;
1483 }
1484 EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple);
1485
1486 int nf_ct_port_nlattr_tuple_size(void)
1487 {
1488         return nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);
1489 }
1490 EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size);
1491 #endif
1492
1493 /* Used by ipt_REJECT and ip6t_REJECT. */
1494 static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
1495 {
1496         struct nf_conn *ct;
1497         enum ip_conntrack_info ctinfo;
1498
1499         /* This ICMP is in reverse direction to the packet which caused it */
1500         ct = nf_ct_get(skb, &ctinfo);
1501         if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1502                 ctinfo = IP_CT_RELATED_REPLY;
1503         else
1504                 ctinfo = IP_CT_RELATED;
1505
1506         /* Attach to new skbuff, and increment count */
1507         nskb->nfct = &ct->ct_general;
1508         nskb->nfctinfo = ctinfo;
1509         nf_conntrack_get(nskb->nfct);
1510 }
1511
1512 /* Bring out ya dead! */
1513 static struct nf_conn *
1514 get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
1515                 void *data, unsigned int *bucket)
1516 {
1517         struct nf_conntrack_tuple_hash *h;
1518         struct nf_conn *ct;
1519         struct hlist_nulls_node *n;
1520         int cpu;
1521         spinlock_t *lockp;
1522
1523         for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
1524                 lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
1525                 local_bh_disable();
1526                 nf_conntrack_lock(lockp);
1527                 if (*bucket < nf_conntrack_htable_size) {
1528                         hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) {
1529                                 if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
1530                                         continue;
1531                                 ct = nf_ct_tuplehash_to_ctrack(h);
1532                                 if (net_eq(nf_ct_net(ct), net) &&
1533                                     iter(ct, data))
1534                                         goto found;
1535                         }
1536                 }
1537                 spin_unlock(lockp);
1538                 local_bh_enable();
1539                 cond_resched();
1540         }
1541
1542         for_each_possible_cpu(cpu) {
1543                 struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
1544
1545                 spin_lock_bh(&pcpu->lock);
1546                 hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) {
1547                         ct = nf_ct_tuplehash_to_ctrack(h);
1548                         if (iter(ct, data))
1549                                 set_bit(IPS_DYING_BIT, &ct->status);
1550                 }
1551                 spin_unlock_bh(&pcpu->lock);
1552                 cond_resched();
1553         }
1554         return NULL;
1555 found:
1556         atomic_inc(&ct->ct_general.use);
1557         spin_unlock(lockp);
1558         local_bh_enable();
1559         return ct;
1560 }
1561
1562 void nf_ct_iterate_cleanup(struct net *net,
1563                            int (*iter)(struct nf_conn *i, void *data),
1564                            void *data, u32 portid, int report)
1565 {
1566         struct nf_conn *ct;
1567         unsigned int bucket = 0;
1568
1569         might_sleep();
1570
1571         if (atomic_read(&net->ct.count) == 0)
1572                 return;
1573
1574         while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) {
1575                 /* Time to push up daises... */
1576
1577                 nf_ct_delete(ct, portid, report);
1578                 nf_ct_put(ct);
1579                 cond_resched();
1580         }
1581 }
1582 EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup);
1583
1584 static int kill_all(struct nf_conn *i, void *data)
1585 {
1586         return 1;
1587 }
1588
1589 void nf_ct_free_hashtable(void *hash, unsigned int size)
1590 {
1591         if (is_vmalloc_addr(hash))
1592                 vfree(hash);
1593         else
1594                 free_pages((unsigned long)hash,
1595                            get_order(sizeof(struct hlist_head) * size));
1596 }
1597 EXPORT_SYMBOL_GPL(nf_ct_free_hashtable);
1598
1599 static int untrack_refs(void)
1600 {
1601         int cnt = 0, cpu;
1602
1603         for_each_possible_cpu(cpu) {
1604                 struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu);
1605
1606                 cnt += atomic_read(&ct->ct_general.use) - 1;
1607         }
1608         return cnt;
1609 }
1610
1611 void nf_conntrack_cleanup_start(void)
1612 {
1613         conntrack_gc_work.exiting = true;
1614         RCU_INIT_POINTER(ip_ct_attach, NULL);
1615 }
1616
1617 void nf_conntrack_cleanup_end(void)
1618 {
1619         RCU_INIT_POINTER(nf_ct_destroy, NULL);
1620         while (untrack_refs() > 0)
1621                 schedule();
1622
1623         cancel_delayed_work_sync(&conntrack_gc_work.dwork);
1624         nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
1625
1626         nf_conntrack_proto_fini();
1627         nf_conntrack_seqadj_fini();
1628         nf_conntrack_labels_fini();
1629         nf_conntrack_helper_fini();
1630         nf_conntrack_timeout_fini();
1631         nf_conntrack_ecache_fini();
1632         nf_conntrack_tstamp_fini();
1633         nf_conntrack_acct_fini();
1634         nf_conntrack_expect_fini();
1635
1636         kmem_cache_destroy(nf_conntrack_cachep);
1637 }
1638
1639 /*
1640  * Mishearing the voices in his head, our hero wonders how he's
1641  * supposed to kill the mall.
1642  */
1643 void nf_conntrack_cleanup_net(struct net *net)
1644 {
1645         LIST_HEAD(single);
1646
1647         list_add(&net->exit_list, &single);
1648         nf_conntrack_cleanup_net_list(&single);
1649 }
1650
1651 void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
1652 {
1653         int busy;
1654         struct net *net;
1655
1656         /*
1657          * This makes sure all current packets have passed through
1658          *  netfilter framework.  Roll on, two-stage module
1659          *  delete...
1660          */
1661         synchronize_net();
1662 i_see_dead_people:
1663         busy = 0;
1664         list_for_each_entry(net, net_exit_list, exit_list) {
1665                 nf_ct_iterate_cleanup(net, kill_all, NULL, 0, 0);
1666                 if (atomic_read(&net->ct.count) != 0)
1667                         busy = 1;
1668         }
1669         if (busy) {
1670                 schedule();
1671                 goto i_see_dead_people;
1672         }
1673
1674         list_for_each_entry(net, net_exit_list, exit_list) {
1675                 nf_conntrack_proto_pernet_fini(net);
1676                 nf_conntrack_helper_pernet_fini(net);
1677                 nf_conntrack_ecache_pernet_fini(net);
1678                 nf_conntrack_tstamp_pernet_fini(net);
1679                 nf_conntrack_acct_pernet_fini(net);
1680                 nf_conntrack_expect_pernet_fini(net);
1681                 free_percpu(net->ct.stat);
1682                 free_percpu(net->ct.pcpu_lists);
1683         }
1684 }
1685
1686 void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
1687 {
1688         struct hlist_nulls_head *hash;
1689         unsigned int nr_slots, i;
1690         size_t sz;
1691
1692         if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head)))
1693                 return NULL;
1694
1695         BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
1696         nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
1697
1698         if (nr_slots > (UINT_MAX / sizeof(struct hlist_nulls_head)))
1699                 return NULL;
1700
1701         sz = nr_slots * sizeof(struct hlist_nulls_head);
1702         hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
1703                                         get_order(sz));
1704         if (!hash)
1705                 hash = vzalloc(sz);
1706
1707         if (hash && nulls)
1708                 for (i = 0; i < nr_slots; i++)
1709                         INIT_HLIST_NULLS_HEAD(&hash[i], i);
1710
1711         return hash;
1712 }
1713 EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);
1714
1715 int nf_conntrack_hash_resize(unsigned int hashsize)
1716 {
1717         int i, bucket;
1718         unsigned int old_size;
1719         struct hlist_nulls_head *hash, *old_hash;
1720         struct nf_conntrack_tuple_hash *h;
1721         struct nf_conn *ct;
1722
1723         if (!hashsize)
1724                 return -EINVAL;
1725
1726         hash = nf_ct_alloc_hashtable(&hashsize, 1);
1727         if (!hash)
1728                 return -ENOMEM;
1729
1730         old_size = nf_conntrack_htable_size;
1731         if (old_size == hashsize) {
1732                 nf_ct_free_hashtable(hash, hashsize);
1733                 return 0;
1734         }
1735
1736         local_bh_disable();
1737         nf_conntrack_all_lock();
1738         write_seqcount_begin(&nf_conntrack_generation);
1739
1740         /* Lookups in the old hash might happen in parallel, which means we
1741          * might get false negatives during connection lookup. New connections
1742          * created because of a false negative won't make it into the hash
1743          * though since that required taking the locks.
1744          */
1745
1746         for (i = 0; i < nf_conntrack_htable_size; i++) {
1747                 while (!hlist_nulls_empty(&nf_conntrack_hash[i])) {
1748                         h = hlist_nulls_entry(nf_conntrack_hash[i].first,
1749                                               struct nf_conntrack_tuple_hash, hnnode);
1750                         ct = nf_ct_tuplehash_to_ctrack(h);
1751                         hlist_nulls_del_rcu(&h->hnnode);
1752                         bucket = __hash_conntrack(nf_ct_net(ct),
1753                                                   &h->tuple, hashsize);
1754                         hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
1755                 }
1756         }
1757         old_size = nf_conntrack_htable_size;
1758         old_hash = nf_conntrack_hash;
1759
1760         nf_conntrack_hash = hash;
1761         nf_conntrack_htable_size = hashsize;
1762
1763         write_seqcount_end(&nf_conntrack_generation);
1764         nf_conntrack_all_unlock();
1765         local_bh_enable();
1766
1767         synchronize_net();
1768         nf_ct_free_hashtable(old_hash, old_size);
1769         return 0;
1770 }
1771
1772 int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
1773 {
1774         unsigned int hashsize;
1775         int rc;
1776
1777         if (current->nsproxy->net_ns != &init_net)
1778                 return -EOPNOTSUPP;
1779
1780         /* On boot, we can set this without any fancy locking. */
1781         if (!nf_conntrack_htable_size)
1782                 return param_set_uint(val, kp);
1783
1784         rc = kstrtouint(val, 0, &hashsize);
1785         if (rc)
1786                 return rc;
1787
1788         return nf_conntrack_hash_resize(hashsize);
1789 }
1790 EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);
1791
1792 module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
1793                   &nf_conntrack_htable_size, 0600);
1794
1795 void nf_ct_untracked_status_or(unsigned long bits)
1796 {
1797         int cpu;
1798
1799         for_each_possible_cpu(cpu)
1800                 per_cpu(nf_conntrack_untracked, cpu).status |= bits;
1801 }
1802 EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or);
1803
1804 int nf_conntrack_init_start(void)
1805 {
1806         int max_factor = 8;
1807         int ret = -ENOMEM;
1808         int i, cpu;
1809
1810         seqcount_init(&nf_conntrack_generation);
1811
1812         for (i = 0; i < CONNTRACK_LOCKS; i++)
1813                 spin_lock_init(&nf_conntrack_locks[i]);
1814
1815         if (!nf_conntrack_htable_size) {
1816                 /* Idea from tcp.c: use 1/16384 of memory.
1817                  * On i386: 32MB machine has 512 buckets.
1818                  * >= 1GB machines have 16384 buckets.
1819                  * >= 4GB machines have 65536 buckets.
1820                  */
1821                 nf_conntrack_htable_size
1822                         = (((totalram_pages << PAGE_SHIFT) / 16384)
1823                            / sizeof(struct hlist_head));
1824                 if (totalram_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE)))
1825                         nf_conntrack_htable_size = 65536;
1826                 else if (totalram_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
1827                         nf_conntrack_htable_size = 16384;
1828                 if (nf_conntrack_htable_size < 32)
1829                         nf_conntrack_htable_size = 32;
1830
1831                 /* Use a max. factor of four by default to get the same max as
1832                  * with the old struct list_heads. When a table size is given
1833                  * we use the old value of 8 to avoid reducing the max.
1834                  * entries. */
1835                 max_factor = 4;
1836         }
1837
1838         nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1);
1839         if (!nf_conntrack_hash)
1840                 return -ENOMEM;
1841
1842         nf_conntrack_max = max_factor * nf_conntrack_htable_size;
1843
1844         nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
1845                                                 sizeof(struct nf_conn), 0,
1846                                                 SLAB_DESTROY_BY_RCU | SLAB_HWCACHE_ALIGN, NULL);
1847         if (!nf_conntrack_cachep)
1848                 goto err_cachep;
1849
1850         printk(KERN_INFO "nf_conntrack version %s (%u buckets, %d max)\n",
1851                NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
1852                nf_conntrack_max);
1853
1854         ret = nf_conntrack_expect_init();
1855         if (ret < 0)
1856                 goto err_expect;
1857
1858         ret = nf_conntrack_acct_init();
1859         if (ret < 0)
1860                 goto err_acct;
1861
1862         ret = nf_conntrack_tstamp_init();
1863         if (ret < 0)
1864                 goto err_tstamp;
1865
1866         ret = nf_conntrack_ecache_init();
1867         if (ret < 0)
1868                 goto err_ecache;
1869
1870         ret = nf_conntrack_timeout_init();
1871         if (ret < 0)
1872                 goto err_timeout;
1873
1874         ret = nf_conntrack_helper_init();
1875         if (ret < 0)
1876                 goto err_helper;
1877
1878         ret = nf_conntrack_labels_init();
1879         if (ret < 0)
1880                 goto err_labels;
1881
1882         ret = nf_conntrack_seqadj_init();
1883         if (ret < 0)
1884                 goto err_seqadj;
1885
1886         ret = nf_conntrack_proto_init();
1887         if (ret < 0)
1888                 goto err_proto;
1889
1890         /* Set up fake conntrack: to never be deleted, not in any hashes */
1891         for_each_possible_cpu(cpu) {
1892                 struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu);
1893                 write_pnet(&ct->ct_net, &init_net);
1894                 atomic_set(&ct->ct_general.use, 1);
1895         }
1896         /*  - and look it like as a confirmed connection */
1897         nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED);
1898
1899         conntrack_gc_work_init(&conntrack_gc_work);
1900         schedule_delayed_work(&conntrack_gc_work.dwork, GC_INTERVAL);
1901
1902         return 0;
1903
1904 err_proto:
1905         nf_conntrack_seqadj_fini();
1906 err_seqadj:
1907         nf_conntrack_labels_fini();
1908 err_labels:
1909         nf_conntrack_helper_fini();
1910 err_helper:
1911         nf_conntrack_timeout_fini();
1912 err_timeout:
1913         nf_conntrack_ecache_fini();
1914 err_ecache:
1915         nf_conntrack_tstamp_fini();
1916 err_tstamp:
1917         nf_conntrack_acct_fini();
1918 err_acct:
1919         nf_conntrack_expect_fini();
1920 err_expect:
1921         kmem_cache_destroy(nf_conntrack_cachep);
1922 err_cachep:
1923         nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
1924         return ret;
1925 }
1926
1927 void nf_conntrack_init_end(void)
1928 {
1929         /* For use by REJECT target */
1930         RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach);
1931         RCU_INIT_POINTER(nf_ct_destroy, destroy_conntrack);
1932 }
1933
1934 /*
1935  * We need to use special "null" values, not used in hash table
1936  */
1937 #define UNCONFIRMED_NULLS_VAL   ((1<<30)+0)
1938 #define DYING_NULLS_VAL         ((1<<30)+1)
1939 #define TEMPLATE_NULLS_VAL      ((1<<30)+2)
1940
1941 int nf_conntrack_init_net(struct net *net)
1942 {
1943         int ret = -ENOMEM;
1944         int cpu;
1945
1946         atomic_set(&net->ct.count, 0);
1947
1948         net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu);
1949         if (!net->ct.pcpu_lists)
1950                 goto err_stat;
1951
1952         for_each_possible_cpu(cpu) {
1953                 struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
1954
1955                 spin_lock_init(&pcpu->lock);
1956                 INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL);
1957                 INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL);
1958         }
1959
1960         net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
1961         if (!net->ct.stat)
1962                 goto err_pcpu_lists;
1963
1964         ret = nf_conntrack_expect_pernet_init(net);
1965         if (ret < 0)
1966                 goto err_expect;
1967         ret = nf_conntrack_acct_pernet_init(net);
1968         if (ret < 0)
1969                 goto err_acct;
1970         ret = nf_conntrack_tstamp_pernet_init(net);
1971         if (ret < 0)
1972                 goto err_tstamp;
1973         ret = nf_conntrack_ecache_pernet_init(net);
1974         if (ret < 0)
1975                 goto err_ecache;
1976         ret = nf_conntrack_helper_pernet_init(net);
1977         if (ret < 0)
1978                 goto err_helper;
1979         ret = nf_conntrack_proto_pernet_init(net);
1980         if (ret < 0)
1981                 goto err_proto;
1982         return 0;
1983
1984 err_proto:
1985         nf_conntrack_helper_pernet_fini(net);
1986 err_helper:
1987         nf_conntrack_ecache_pernet_fini(net);
1988 err_ecache:
1989         nf_conntrack_tstamp_pernet_fini(net);
1990 err_tstamp:
1991         nf_conntrack_acct_pernet_fini(net);
1992 err_acct:
1993         nf_conntrack_expect_pernet_fini(net);
1994 err_expect:
1995         free_percpu(net->ct.stat);
1996 err_pcpu_lists:
1997         free_percpu(net->ct.pcpu_lists);
1998 err_stat:
1999         return ret;
2000 }