Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/kaber/nf-next-2.6
author     David S. Miller <davem@davemloft.net>
           Sat, 13 Jun 2009 23:43:28 +0000 (16:43 -0700)
committer  David S. Miller <davem@davemloft.net>
           Sat, 13 Jun 2009 23:43:28 +0000 (16:43 -0700)
12 files changed:
include/linux/list_nulls.h
include/net/netfilter/nf_conntrack.h
include/net/netfilter/nf_conntrack_ecache.h
include/net/netfilter/nf_conntrack_extend.h
include/net/netfilter/nf_conntrack_helper.h
include/net/netns/conntrack.h
net/netfilter/nf_conntrack_core.c
net/netfilter/nf_conntrack_ecache.c
net/netfilter/nf_conntrack_helper.c
net/netfilter/nf_conntrack_netlink.c
net/netfilter/nf_log.c
net/netfilter/x_tables.c

diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h
index 93150ecf3ea404d1b5bcb9c1090ea290c22d64ae..5d10ae364b5eb5d0a50d220938dd2f020a8cafb2 100644
--- a/include/linux/list_nulls.h
+++ b/include/linux/list_nulls.h
@@ -56,6 +56,18 @@ static inline int hlist_nulls_empty(const struct hlist_nulls_head *h)
        return is_a_nulls(h->first);
 }
 
+static inline void hlist_nulls_add_head(struct hlist_nulls_node *n,
+                                       struct hlist_nulls_head *h)
+{
+       struct hlist_nulls_node *first = h->first;
+
+       n->next = first;
+       n->pprev = &h->first;
+       h->first = n;
+       if (!is_a_nulls(first))
+               first->pprev = &n->next;
+}
+
 static inline void __hlist_nulls_del(struct hlist_nulls_node *n)
 {
        struct hlist_nulls_node *next = n->next;
@@ -65,6 +77,12 @@ static inline void __hlist_nulls_del(struct hlist_nulls_node *n)
                next->pprev = pprev;
 }
 
+static inline void hlist_nulls_del(struct hlist_nulls_node *n)
+{
+       __hlist_nulls_del(n);
+       n->pprev = LIST_POISON2;
+}
+
 /**
  * hlist_nulls_for_each_entry  - iterate over list of given type
  * @tpos:      the type * to use as a loop cursor.
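
The two helpers added above are the non-RCU counterparts of hlist_nulls_add_head_rcu()/hlist_nulls_del_rcu(); later in this patch they maintain the new per-netns dying list under nf_conntrack_lock. A minimal usage sketch (kernel context, condensed from nf_ct_insert_dying_list() and death_by_event() further down; illustrative only, not part of the commit):

	/* writers are serialized by a spinlock; the nulls marker (0 here)
	 * terminates the list instead of a NULL pointer */
	spin_lock_bh(&nf_conntrack_lock);
	hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
			     &net->ct.dying);
	spin_unlock_bh(&nf_conntrack_lock);

	/* removal from timer (softirq) context once the destroy event
	 * has finally been delivered */
	spin_lock(&nf_conntrack_lock);
	hlist_nulls_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
	spin_unlock(&nf_conntrack_lock);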
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index ecc79f9590762d9f233ae35186aa96adec14a9f8..a632689b61b40f581e4cc1d25fe65ebcac053995 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -201,6 +201,8 @@ extern struct nf_conntrack_tuple_hash *
 __nf_conntrack_find(struct net *net, const struct nf_conntrack_tuple *tuple);
 
 extern void nf_conntrack_hash_insert(struct nf_conn *ct);
+extern void nf_ct_delete_from_lists(struct nf_conn *ct);
+extern void nf_ct_insert_dying_list(struct nf_conn *ct);
 
 extern void nf_conntrack_flush_report(struct net *net, u32 pid, int report);
 
diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h
index 1afb907e015a98315f414602459ae4870a544934..4f20d58e2ab74a5155c89d9284eedbec6d2bd19f 100644
--- a/include/net/netfilter/nf_conntrack_ecache.h
+++ b/include/net/netfilter/nf_conntrack_ecache.h
@@ -6,61 +6,54 @@
 #define _NF_CONNTRACK_ECACHE_H
 #include <net/netfilter/nf_conntrack.h>
 
-#include <linux/interrupt.h>
 #include <net/net_namespace.h>
 #include <net/netfilter/nf_conntrack_expect.h>
+#include <linux/netfilter/nf_conntrack_common.h>
+#include <linux/netfilter/nf_conntrack_tuple_common.h>
+#include <net/netfilter/nf_conntrack_extend.h>
 
-/* Connection tracking event bits */
+/* Connection tracking event types */
 enum ip_conntrack_events
 {
-       /* New conntrack */
-       IPCT_NEW_BIT = 0,
-       IPCT_NEW = (1 << IPCT_NEW_BIT),
-
-       /* Expected connection */
-       IPCT_RELATED_BIT = 1,
-       IPCT_RELATED = (1 << IPCT_RELATED_BIT),
-
-       /* Destroyed conntrack */
-       IPCT_DESTROY_BIT = 2,
-       IPCT_DESTROY = (1 << IPCT_DESTROY_BIT),
-
-       /* Status has changed */
-       IPCT_STATUS_BIT = 3,
-       IPCT_STATUS = (1 << IPCT_STATUS_BIT),
+       IPCT_NEW                = 0,    /* new conntrack */
+       IPCT_RELATED            = 1,    /* related conntrack */
+       IPCT_DESTROY            = 2,    /* destroyed conntrack */
+       IPCT_STATUS             = 3,    /* status has changed */
+       IPCT_PROTOINFO          = 4,    /* protocol information has changed */
+       IPCT_HELPER             = 5,    /* new helper has been set */
+       IPCT_MARK               = 6,    /* new mark has been set */
+       IPCT_NATSEQADJ          = 7,    /* NAT is doing sequence adjustment */
+       IPCT_SECMARK            = 8,    /* new security mark has been set */
+};
 
-       /* Update of protocol info */
-       IPCT_PROTOINFO_BIT = 4,
-       IPCT_PROTOINFO = (1 << IPCT_PROTOINFO_BIT),
+enum ip_conntrack_expect_events {
+       IPEXP_NEW               = 0,    /* new expectation */
+};
 
-       /* New helper for conntrack */
-       IPCT_HELPER_BIT = 5,
-       IPCT_HELPER = (1 << IPCT_HELPER_BIT),
+struct nf_conntrack_ecache {
+       unsigned long cache;            /* bitops want long */
+       unsigned long missed;           /* missed events */
+       u32 pid;                        /* netlink pid of destroyer */
+};
 
-       /* Mark is set */
-       IPCT_MARK_BIT = 6,
-       IPCT_MARK = (1 << IPCT_MARK_BIT),
+static inline struct nf_conntrack_ecache *
+nf_ct_ecache_find(const struct nf_conn *ct)
+{
+       return nf_ct_ext_find(ct, NF_CT_EXT_ECACHE);
+}
 
-       /* NAT sequence adjustment */
-       IPCT_NATSEQADJ_BIT = 7,
-       IPCT_NATSEQADJ = (1 << IPCT_NATSEQADJ_BIT),
+static inline struct nf_conntrack_ecache *
+nf_ct_ecache_ext_add(struct nf_conn *ct, gfp_t gfp)
+{
+       struct net *net = nf_ct_net(ct);
 
-       /* Secmark is set */
-       IPCT_SECMARK_BIT = 8,
-       IPCT_SECMARK = (1 << IPCT_SECMARK_BIT),
-};
+       if (!net->ct.sysctl_events)
+               return NULL;
 
-enum ip_conntrack_expect_events {
-       IPEXP_NEW_BIT = 0,
-       IPEXP_NEW = (1 << IPEXP_NEW_BIT),
+       return nf_ct_ext_add(ct, NF_CT_EXT_ECACHE, gfp);
 };
 
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
-struct nf_conntrack_ecache {
-       struct nf_conn *ct;
-       unsigned int events;
-};
-
 /* This structure is passed to event handler */
 struct nf_ct_event {
        struct nf_conn *ct;
@@ -76,53 +69,88 @@ extern struct nf_ct_event_notifier *nf_conntrack_event_cb;
 extern int nf_conntrack_register_notifier(struct nf_ct_event_notifier *nb);
 extern void nf_conntrack_unregister_notifier(struct nf_ct_event_notifier *nb);
 
-extern void nf_ct_deliver_cached_events(const struct nf_conn *ct);
-extern void __nf_ct_event_cache_init(struct nf_conn *ct);
-extern void nf_ct_event_cache_flush(struct net *net);
+extern void nf_ct_deliver_cached_events(struct nf_conn *ct);
 
 static inline void
 nf_conntrack_event_cache(enum ip_conntrack_events event, struct nf_conn *ct)
 {
-       struct net *net = nf_ct_net(ct);
-       struct nf_conntrack_ecache *ecache;
-
-       local_bh_disable();
-       ecache = per_cpu_ptr(net->ct.ecache, raw_smp_processor_id());
-       if (ct != ecache->ct)
-               __nf_ct_event_cache_init(ct);
-       ecache->events |= event;
-       local_bh_enable();
+       struct nf_conntrack_ecache *e;
+
+       if (nf_conntrack_event_cb == NULL)
+               return;
+
+       e = nf_ct_ecache_find(ct);
+       if (e == NULL)
+               return;
+
+       set_bit(event, &e->cache);
 }
 
-static inline void
-nf_conntrack_event_report(enum ip_conntrack_events event,
-                         struct nf_conn *ct,
-                         u32 pid,
-                         int report)
+static inline int
+nf_conntrack_eventmask_report(unsigned int eventmask,
+                             struct nf_conn *ct,
+                             u32 pid,
+                             int report)
 {
+       int ret = 0;
+       struct net *net = nf_ct_net(ct);
        struct nf_ct_event_notifier *notify;
+       struct nf_conntrack_ecache *e;
 
        rcu_read_lock();
        notify = rcu_dereference(nf_conntrack_event_cb);
        if (notify == NULL)
                goto out_unlock;
 
+       if (!net->ct.sysctl_events)
+               goto out_unlock;
+
+       e = nf_ct_ecache_find(ct);
+       if (e == NULL)
+               goto out_unlock;
+
        if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct)) {
                struct nf_ct_event item = {
                        .ct     = ct,
-                       .pid    = pid,
+                       .pid    = e->pid ? e->pid : pid,
                        .report = report
                };
-               notify->fcn(event, &item);
+               /* This is a resent of a destroy event? If so, skip missed */
+               unsigned long missed = e->pid ? 0 : e->missed;
+
+               ret = notify->fcn(eventmask | missed, &item);
+               if (unlikely(ret < 0 || missed)) {
+                       spin_lock_bh(&ct->lock);
+                       if (ret < 0) {
+                               /* This is a destroy event that has been
+                                * triggered by a process, we store the PID
+                                * to include it in the retransmission. */
+                               if (eventmask & (1 << IPCT_DESTROY) &&
+                                   e->pid == 0 && pid != 0)
+                                       e->pid = pid;
+                               else
+                                       e->missed |= eventmask;
+                       } else
+                               e->missed &= ~missed;
+                       spin_unlock_bh(&ct->lock);
+               }
        }
 out_unlock:
        rcu_read_unlock();
+       return ret;
 }
 
-static inline void
+static inline int
+nf_conntrack_event_report(enum ip_conntrack_events event, struct nf_conn *ct,
+                         u32 pid, int report)
+{
+       return nf_conntrack_eventmask_report(1 << event, ct, pid, report);
+}
+
+static inline int
 nf_conntrack_event(enum ip_conntrack_events event, struct nf_conn *ct)
 {
-       nf_conntrack_event_report(event, ct, 0, 0);
+       return nf_conntrack_eventmask_report(1 << event, ct, 0, 0);
 }
 
 struct nf_exp_event {
@@ -145,6 +173,7 @@ nf_ct_expect_event_report(enum ip_conntrack_expect_events event,
                          u32 pid,
                          int report)
 {
+       struct net *net = nf_ct_exp_net(exp);
        struct nf_exp_event_notifier *notify;
 
        rcu_read_lock();
@@ -152,13 +181,16 @@ nf_ct_expect_event_report(enum ip_conntrack_expect_events event,
        if (notify == NULL)
                goto out_unlock;
 
+       if (!net->ct.sysctl_events)
+               goto out_unlock;
+
        {
                struct nf_exp_event item = {
                        .exp    = exp,
                        .pid    = pid,
                        .report = report
                };
-               notify->fcn(event, &item);
+               notify->fcn(1 << event, &item);
        }
 out_unlock:
        rcu_read_unlock();
@@ -178,12 +210,16 @@ extern void nf_conntrack_ecache_fini(struct net *net);
 
 static inline void nf_conntrack_event_cache(enum ip_conntrack_events event,
                                            struct nf_conn *ct) {}
-static inline void nf_conntrack_event(enum ip_conntrack_events event,
-                                     struct nf_conn *ct) {}
-static inline void nf_conntrack_event_report(enum ip_conntrack_events event,
-                                            struct nf_conn *ct,
-                                            u32 pid,
-                                            int report) {}
+static inline int nf_conntrack_eventmask_report(unsigned int eventmask,
+                                               struct nf_conn *ct,
+                                               u32 pid,
+                                               int report) { return 0; }
+static inline int nf_conntrack_event(enum ip_conntrack_events event,
+                                    struct nf_conn *ct) { return 0; }
+static inline int nf_conntrack_event_report(enum ip_conntrack_events event,
+                                           struct nf_conn *ct,
+                                           u32 pid,
+                                           int report) { return 0; }
 static inline void nf_ct_deliver_cached_events(const struct nf_conn *ct) {}
 static inline void nf_ct_expect_event(enum ip_conntrack_expect_events event,
                                      struct nf_conntrack_expect *exp) {}
@@ -191,7 +227,6 @@ static inline void nf_ct_expect_event_report(enum ip_conntrack_expect_events e,
                                             struct nf_conntrack_expect *exp,
                                             u32 pid,
                                             int report) {}
-static inline void nf_ct_event_cache_flush(struct net *net) {}
 
 static inline int nf_conntrack_ecache_init(struct net *net)
 {
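
Taken together, the header changes above replace the old per-cpu event cache with a per-conntrack extension and give event delivery a return value. A rough sketch of the intended call flow, using only functions that appear in this patch (kernel context, error handling elided; illustrative only):

	/* at conntrack creation (see init_conntrack() below): attach the
	 * ecache extension if net->ct.sysctl_events allows it */
	nf_ct_ecache_ext_add(ct, GFP_ATOMIC);

	/* during packet processing: set bits in the per-ct cache */
	nf_conntrack_event_cache(IPCT_PROTOINFO, ct);

	/* when conntrack is done with the skb: flush the cached bits to
	 * the registered notifier, remembering anything it could not take */
	nf_ct_deliver_cached_events(ct);

	/* synchronous delivery now reports failure to the caller */
	if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) {
		/* listener overrun: the caller must arrange a retry */
	}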
diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h
index da8ee52613a59f202e341fb96b1d5c6c097ab37d..7f8fc5d123c550e437be2279039ea5d4f4bd6d43 100644
--- a/include/net/netfilter/nf_conntrack_extend.h
+++ b/include/net/netfilter/nf_conntrack_extend.h
@@ -8,12 +8,14 @@ enum nf_ct_ext_id
        NF_CT_EXT_HELPER,
        NF_CT_EXT_NAT,
        NF_CT_EXT_ACCT,
+       NF_CT_EXT_ECACHE,
        NF_CT_EXT_NUM,
 };
 
 #define NF_CT_EXT_HELPER_TYPE struct nf_conn_help
 #define NF_CT_EXT_NAT_TYPE struct nf_conn_nat
 #define NF_CT_EXT_ACCT_TYPE struct nf_conn_counter
+#define NF_CT_EXT_ECACHE_TYPE struct nf_conntrack_ecache
 
 /* Extensions: optional stuff which isn't permanently in struct. */
 struct nf_ct_ext {
diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h
index ee2a4b369a0466ac884af3f282dec210e41c2a56..1b7068000927c564be434e32b7e24240a2d44c14 100644
--- a/include/net/netfilter/nf_conntrack_helper.h
+++ b/include/net/netfilter/nf_conntrack_helper.h
@@ -50,6 +50,8 @@ extern struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp);
 
 extern int __nf_ct_try_assign_helper(struct nf_conn *ct, gfp_t flags);
 
+extern void nf_ct_helper_destroy(struct nf_conn *ct);
+
 static inline struct nf_conn_help *nfct_help(const struct nf_conn *ct)
 {
        return nf_ct_ext_find(ct, NF_CT_EXT_HELPER);
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index 9dc58402bc09e4b5d1c210cf081313cfbccd968d..ba1ba0c5efd1b5047d9251acee2211043e4d0805 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -14,16 +14,17 @@ struct netns_ct {
        struct hlist_nulls_head *hash;
        struct hlist_head       *expect_hash;
        struct hlist_nulls_head unconfirmed;
+       struct hlist_nulls_head dying;
        struct ip_conntrack_stat *stat;
-#ifdef CONFIG_NF_CONNTRACK_EVENTS
-       struct nf_conntrack_ecache *ecache;
-#endif
+       int                     sysctl_events;
+       unsigned int            sysctl_events_retry_timeout;
        int                     sysctl_acct;
        int                     sysctl_checksum;
        unsigned int            sysctl_log_invalid; /* Log invalid packets */
 #ifdef CONFIG_SYSCTL
        struct ctl_table_header *sysctl_header;
        struct ctl_table_header *acct_sysctl_header;
+       struct ctl_table_header *event_sysctl_header;
 #endif
        int                     hash_vmalloc;
        int                     expect_vmalloc;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index edf95695e0aac511f54184b2d18dd0383d68ebbc..5f72b94b4918745f3491b48ba06cbc1714890280 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -39,6 +39,7 @@
 #include <net/netfilter/nf_conntrack_core.h>
 #include <net/netfilter/nf_conntrack_extend.h>
 #include <net/netfilter/nf_conntrack_acct.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
 #include <net/netfilter/nf_nat.h>
 #include <net/netfilter/nf_nat_core.h>
 
@@ -182,10 +183,6 @@ destroy_conntrack(struct nf_conntrack *nfct)
        NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
        NF_CT_ASSERT(!timer_pending(&ct->timeout));
 
-       if (!test_bit(IPS_DYING_BIT, &ct->status))
-               nf_conntrack_event(IPCT_DESTROY, ct);
-       set_bit(IPS_DYING_BIT, &ct->status);
-
        /* To make sure we don't get any weird locking issues here:
         * destroy_conntrack() MUST NOT be called with a write lock
         * to nf_conntrack_lock!!! -HW */
@@ -219,27 +216,70 @@ destroy_conntrack(struct nf_conntrack *nfct)
        nf_conntrack_free(ct);
 }
 
-static void death_by_timeout(unsigned long ul_conntrack)
+void nf_ct_delete_from_lists(struct nf_conn *ct)
 {
-       struct nf_conn *ct = (void *)ul_conntrack;
        struct net *net = nf_ct_net(ct);
-       struct nf_conn_help *help = nfct_help(ct);
-       struct nf_conntrack_helper *helper;
-
-       if (help) {
-               rcu_read_lock();
-               helper = rcu_dereference(help->helper);
-               if (helper && helper->destroy)
-                       helper->destroy(ct);
-               rcu_read_unlock();
-       }
 
+       nf_ct_helper_destroy(ct);
        spin_lock_bh(&nf_conntrack_lock);
        /* Inside lock so preempt is disabled on module removal path.
         * Otherwise we can get spurious warnings. */
        NF_CT_STAT_INC(net, delete_list);
        clean_from_lists(ct);
        spin_unlock_bh(&nf_conntrack_lock);
+}
+EXPORT_SYMBOL_GPL(nf_ct_delete_from_lists);
+
+static void death_by_event(unsigned long ul_conntrack)
+{
+       struct nf_conn *ct = (void *)ul_conntrack;
+       struct net *net = nf_ct_net(ct);
+
+       if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) {
+               /* bad luck, let's retry again */
+               ct->timeout.expires = jiffies +
+                       (random32() % net->ct.sysctl_events_retry_timeout);
+               add_timer(&ct->timeout);
+               return;
+       }
+       /* we've got the event delivered, now it's dying */
+       set_bit(IPS_DYING_BIT, &ct->status);
+       spin_lock(&nf_conntrack_lock);
+       hlist_nulls_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
+       spin_unlock(&nf_conntrack_lock);
+       nf_ct_put(ct);
+}
+
+void nf_ct_insert_dying_list(struct nf_conn *ct)
+{
+       struct net *net = nf_ct_net(ct);
+
+       /* add this conntrack to the dying list */
+       spin_lock_bh(&nf_conntrack_lock);
+       hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
+                            &net->ct.dying);
+       spin_unlock_bh(&nf_conntrack_lock);
+       /* set a new timer to retry event delivery */
+       setup_timer(&ct->timeout, death_by_event, (unsigned long)ct);
+       ct->timeout.expires = jiffies +
+               (random32() % net->ct.sysctl_events_retry_timeout);
+       add_timer(&ct->timeout);
+}
+EXPORT_SYMBOL_GPL(nf_ct_insert_dying_list);
+
+static void death_by_timeout(unsigned long ul_conntrack)
+{
+       struct nf_conn *ct = (void *)ul_conntrack;
+
+       if (!test_bit(IPS_DYING_BIT, &ct->status) &&
+           unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) {
+               /* destroy event was not delivered */
+               nf_ct_delete_from_lists(ct);
+               nf_ct_insert_dying_list(ct);
+               return;
+       }
+       set_bit(IPS_DYING_BIT, &ct->status);
+       nf_ct_delete_from_lists(ct);
        nf_ct_put(ct);
 }
 
@@ -577,6 +617,7 @@ init_conntrack(struct net *net,
        }
 
        nf_ct_acct_ext_add(ct, GFP_ATOMIC);
+       nf_ct_ecache_ext_add(ct, GFP_ATOMIC);
 
        spin_lock_bh(&nf_conntrack_lock);
        exp = nf_ct_find_expectation(net, tuple);
@@ -807,8 +848,6 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,
        NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
        NF_CT_ASSERT(skb);
 
-       spin_lock_bh(&nf_conntrack_lock);
-
        /* Only update if this is not a fixed timeout */
        if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
                goto acct;
@@ -822,11 +861,8 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,
                /* Only update the timeout if the new timeout is at least
                   HZ jiffies from the old timeout. Need del_timer for race
                   avoidance (may already be dying). */
-               if (newtime - ct->timeout.expires >= HZ
-                   && del_timer(&ct->timeout)) {
-                       ct->timeout.expires = newtime;
-                       add_timer(&ct->timeout);
-               }
+               if (newtime - ct->timeout.expires >= HZ)
+                       mod_timer_pending(&ct->timeout, newtime);
        }
 
 acct:
@@ -835,13 +871,13 @@ acct:
 
                acct = nf_conn_acct_find(ct);
                if (acct) {
+                       spin_lock_bh(&ct->lock);
                        acct[CTINFO2DIR(ctinfo)].packets++;
                        acct[CTINFO2DIR(ctinfo)].bytes +=
                                skb->len - skb_network_offset(skb);
+                       spin_unlock_bh(&ct->lock);
                }
        }
-
-       spin_unlock_bh(&nf_conntrack_lock);
 }
 EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
 
@@ -853,14 +889,14 @@ bool __nf_ct_kill_acct(struct nf_conn *ct,
        if (do_acct) {
                struct nf_conn_counter *acct;
 
-               spin_lock_bh(&nf_conntrack_lock);
                acct = nf_conn_acct_find(ct);
                if (acct) {
+                       spin_lock_bh(&ct->lock);
                        acct[CTINFO2DIR(ctinfo)].packets++;
                        acct[CTINFO2DIR(ctinfo)].bytes +=
                                skb->len - skb_network_offset(skb);
+                       spin_unlock_bh(&ct->lock);
                }
-               spin_unlock_bh(&nf_conntrack_lock);
        }
 
        if (del_timer(&ct->timeout)) {
@@ -994,11 +1030,13 @@ static int kill_report(struct nf_conn *i, void *data)
 {
        struct __nf_ct_flush_report *fr = (struct __nf_ct_flush_report *)data;
 
-       /* get_next_corpse sets the dying bit for us */
-       nf_conntrack_event_report(IPCT_DESTROY,
-                                 i,
-                                 fr->pid,
-                                 fr->report);
+       /* If we fail to deliver the event, death_by_timeout() will retry */
+       if (nf_conntrack_event_report(IPCT_DESTROY, i,
+                                     fr->pid, fr->report) < 0)
+               return 1;
+
+       /* Avoid the delivery of the destroy event in death_by_timeout(). */
+       set_bit(IPS_DYING_BIT, &i->status);
        return 1;
 }
 
@@ -1027,6 +1065,21 @@ void nf_conntrack_flush_report(struct net *net, u32 pid, int report)
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_flush_report);
 
+static void nf_ct_release_dying_list(void)
+{
+       struct nf_conntrack_tuple_hash *h;
+       struct nf_conn *ct;
+       struct hlist_nulls_node *n;
+
+       spin_lock_bh(&nf_conntrack_lock);
+       hlist_nulls_for_each_entry(h, n, &init_net.ct.dying, hnnode) {
+               ct = nf_ct_tuplehash_to_ctrack(h);
+               /* never fails to remove them, no listeners at this point */
+               nf_ct_kill(ct);
+       }
+       spin_unlock_bh(&nf_conntrack_lock);
+}
+
 static void nf_conntrack_cleanup_init_net(void)
 {
        nf_conntrack_helper_fini();
@@ -1036,10 +1089,9 @@ static void nf_conntrack_cleanup_init_net(void)
 
 static void nf_conntrack_cleanup_net(struct net *net)
 {
-       nf_ct_event_cache_flush(net);
-       nf_conntrack_ecache_fini(net);
  i_see_dead_people:
        nf_ct_iterate_cleanup(net, kill_all, NULL);
+       nf_ct_release_dying_list();
        if (atomic_read(&net->ct.count) != 0) {
                schedule();
                goto i_see_dead_people;
@@ -1050,6 +1102,7 @@ static void nf_conntrack_cleanup_net(struct net *net)
 
        nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
                             nf_conntrack_htable_size);
+       nf_conntrack_ecache_fini(net);
        nf_conntrack_acct_fini(net);
        nf_conntrack_expect_fini(net);
        free_percpu(net->ct.stat);
@@ -1220,14 +1273,12 @@ static int nf_conntrack_init_net(struct net *net)
 
        atomic_set(&net->ct.count, 0);
        INIT_HLIST_NULLS_HEAD(&net->ct.unconfirmed, 0);
+       INIT_HLIST_NULLS_HEAD(&net->ct.dying, 0);
        net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
        if (!net->ct.stat) {
                ret = -ENOMEM;
                goto err_stat;
        }
-       ret = nf_conntrack_ecache_init(net);
-       if (ret < 0)
-               goto err_ecache;
        net->ct.hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size,
                                             &net->ct.hash_vmalloc, 1);
        if (!net->ct.hash) {
@@ -1241,6 +1292,9 @@ static int nf_conntrack_init_net(struct net *net)
        ret = nf_conntrack_acct_init(net);
        if (ret < 0)
                goto err_acct;
+       ret = nf_conntrack_ecache_init(net);
+       if (ret < 0)
+               goto err_ecache;
 
        /* Set up fake conntrack:
            - to never be deleted, not in any hashes */
@@ -1253,14 +1307,14 @@ static int nf_conntrack_init_net(struct net *net)
 
        return 0;
 
+err_ecache:
+       nf_conntrack_acct_fini(net);
 err_acct:
        nf_conntrack_expect_fini(net);
 err_expect:
        nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
                             nf_conntrack_htable_size);
 err_hash:
-       nf_conntrack_ecache_fini(net);
-err_ecache:
        free_percpu(net->ct.stat);
 err_stat:
        return ret;
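
For readers tracing the new destroy-event retry machinery spread over the functions above, the lifecycle reduces to roughly the following (a condensed paraphrase of death_by_timeout(), nf_ct_insert_dying_list() and death_by_event(); not a separate API):

	/* 1. the conntrack timeout fires and IPCT_DESTROY delivery fails */
	nf_ct_delete_from_lists(ct);	/* unhash, run helper->destroy */
	nf_ct_insert_dying_list(ct);	/* park on net->ct.dying; the timer
					 * now points at death_by_event() and
					 * expires after random32() %
					 * sysctl_events_retry_timeout */

	/* 2. the retry timer fires */
	if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) {
		add_timer(&ct->timeout);	/* still no room: re-arm */
	} else {
		set_bit(IPS_DYING_BIT, &ct->status);
		hlist_nulls_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
		nf_ct_put(ct);			/* delivered: free */
	}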
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index 5516b3e64b4330baa7ec14778d762cf760de1df1..aee560b4768dda0522107a9aefeff5d382022516 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -21,6 +21,7 @@
 
 #include <net/netfilter/nf_conntrack.h>
 #include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_extend.h>
 
 static DEFINE_MUTEX(nf_ct_ecache_mutex);
 
@@ -32,94 +33,51 @@ EXPORT_SYMBOL_GPL(nf_expect_event_cb);
 
 /* deliver cached events and clear cache entry - must be called with locally
  * disabled softirqs */
-static inline void
-__nf_ct_deliver_cached_events(struct nf_conntrack_ecache *ecache)
+void nf_ct_deliver_cached_events(struct nf_conn *ct)
 {
+       unsigned long events;
        struct nf_ct_event_notifier *notify;
+       struct nf_conntrack_ecache *e;
 
        rcu_read_lock();
        notify = rcu_dereference(nf_conntrack_event_cb);
        if (notify == NULL)
                goto out_unlock;
 
-       if (nf_ct_is_confirmed(ecache->ct) && !nf_ct_is_dying(ecache->ct)
-           && ecache->events) {
+       e = nf_ct_ecache_find(ct);
+       if (e == NULL)
+               goto out_unlock;
+
+       events = xchg(&e->cache, 0);
+
+       if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct) && events) {
                struct nf_ct_event item = {
-                       .ct     = ecache->ct,
+                       .ct     = ct,
                        .pid    = 0,
                        .report = 0
                };
+               int ret;
+               /* We make a copy of the missed event cache without taking
+                * the lock, thus we may send missed events twice. However,
+                * this does not harm and it happens very rarely. */
+               unsigned long missed = e->missed;
 
-               notify->fcn(ecache->events, &item);
+               ret = notify->fcn(events | missed, &item);
+               if (unlikely(ret < 0 || missed)) {
+                       spin_lock_bh(&ct->lock);
+                       if (ret < 0)
+                               e->missed |= events;
+                       else
+                               e->missed &= ~missed;
+                       spin_unlock_bh(&ct->lock);
+               } 
        }
 
-       ecache->events = 0;
-       nf_ct_put(ecache->ct);
-       ecache->ct = NULL;
-
 out_unlock:
        rcu_read_unlock();
 }
-
-/* Deliver all cached events for a particular conntrack. This is called
- * by code prior to async packet handling for freeing the skb */
-void nf_ct_deliver_cached_events(const struct nf_conn *ct)
-{
-       struct net *net = nf_ct_net(ct);
-       struct nf_conntrack_ecache *ecache;
-
-       local_bh_disable();
-       ecache = per_cpu_ptr(net->ct.ecache, raw_smp_processor_id());
-       if (ecache->ct == ct)
-               __nf_ct_deliver_cached_events(ecache);
-       local_bh_enable();
-}
 EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events);
 
-/* Deliver cached events for old pending events, if current conntrack != old */
-void __nf_ct_event_cache_init(struct nf_conn *ct)
-{
-       struct net *net = nf_ct_net(ct);
-       struct nf_conntrack_ecache *ecache;
-
-       /* take care of delivering potentially old events */
-       ecache = per_cpu_ptr(net->ct.ecache, raw_smp_processor_id());
-       BUG_ON(ecache->ct == ct);
-       if (ecache->ct)
-               __nf_ct_deliver_cached_events(ecache);
-       /* initialize for this conntrack/packet */
-       ecache->ct = ct;
-       nf_conntrack_get(&ct->ct_general);
-}
-EXPORT_SYMBOL_GPL(__nf_ct_event_cache_init);
-
-/* flush the event cache - touches other CPU's data and must not be called
- * while packets are still passing through the code */
-void nf_ct_event_cache_flush(struct net *net)
-{
-       struct nf_conntrack_ecache *ecache;
-       int cpu;
-
-       for_each_possible_cpu(cpu) {
-               ecache = per_cpu_ptr(net->ct.ecache, cpu);
-               if (ecache->ct)
-                       nf_ct_put(ecache->ct);
-       }
-}
-
-int nf_conntrack_ecache_init(struct net *net)
-{
-       net->ct.ecache = alloc_percpu(struct nf_conntrack_ecache);
-       if (!net->ct.ecache)
-               return -ENOMEM;
-       return 0;
-}
-
-void nf_conntrack_ecache_fini(struct net *net)
-{
-       free_percpu(net->ct.ecache);
-}
-
 int nf_conntrack_register_notifier(struct nf_ct_event_notifier *new)
 {
        int ret = 0;
@@ -185,3 +143,118 @@ void nf_ct_expect_unregister_notifier(struct nf_exp_event_notifier *new)
        mutex_unlock(&nf_ct_ecache_mutex);
 }
 EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier);
+
+#define NF_CT_EVENTS_DEFAULT 1
+static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT;
+static int nf_ct_events_retry_timeout __read_mostly = 15*HZ;
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table event_sysctl_table[] = {
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "nf_conntrack_events",
+               .data           = &init_net.ct.sysctl_events,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "nf_conntrack_events_retry_timeout",
+               .data           = &init_net.ct.sysctl_events_retry_timeout,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_jiffies,
+       },
+       {}
+};
+#endif /* CONFIG_SYSCTL */
+
+static struct nf_ct_ext_type event_extend __read_mostly = {
+       .len    = sizeof(struct nf_conntrack_ecache),
+       .align  = __alignof__(struct nf_conntrack_ecache),
+       .id     = NF_CT_EXT_ECACHE,
+};
+
+#ifdef CONFIG_SYSCTL
+static int nf_conntrack_event_init_sysctl(struct net *net)
+{
+       struct ctl_table *table;
+
+       table = kmemdup(event_sysctl_table, sizeof(event_sysctl_table),
+                       GFP_KERNEL);
+       if (!table)
+               goto out;
+
+       table[0].data = &net->ct.sysctl_events;
+       table[1].data = &net->ct.sysctl_events_retry_timeout;
+
+       net->ct.event_sysctl_header =
+               register_net_sysctl_table(net,
+                                         nf_net_netfilter_sysctl_path, table);
+       if (!net->ct.event_sysctl_header) {
+               printk(KERN_ERR "nf_ct_event: can't register to sysctl.\n");
+               goto out_register;
+       }
+       return 0;
+
+out_register:
+       kfree(table);
+out:
+       return -ENOMEM;
+}
+
+static void nf_conntrack_event_fini_sysctl(struct net *net)
+{
+       struct ctl_table *table;
+
+       table = net->ct.event_sysctl_header->ctl_table_arg;
+       unregister_net_sysctl_table(net->ct.event_sysctl_header);
+       kfree(table);
+}
+#else
+static int nf_conntrack_event_init_sysctl(struct net *net)
+{
+       return 0;
+}
+
+static void nf_conntrack_event_fini_sysctl(struct net *net)
+{
+}
+#endif /* CONFIG_SYSCTL */
+
+int nf_conntrack_ecache_init(struct net *net)
+{
+       int ret;
+
+       net->ct.sysctl_events = nf_ct_events;
+       net->ct.sysctl_events_retry_timeout = nf_ct_events_retry_timeout;
+
+       if (net_eq(net, &init_net)) {
+               ret = nf_ct_extend_register(&event_extend);
+               if (ret < 0) {
+                       printk(KERN_ERR "nf_ct_event: Unable to register "
+                                       "event extension.\n");
+                       goto out_extend_register;
+               }
+       }
+
+       ret = nf_conntrack_event_init_sysctl(net);
+       if (ret < 0)
+               goto out_sysctl;
+
+       return 0;
+
+out_sysctl:
+       if (net_eq(net, &init_net))
+               nf_ct_extend_unregister(&event_extend);
+out_extend_register:
+       return ret;
+}
+
+void nf_conntrack_ecache_fini(struct net *net)
+{
+       nf_conntrack_event_fini_sysctl(net);
+       if (net_eq(net, &init_net))
+               nf_ct_extend_unregister(&event_extend);
+}
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 0fa5a422959fc63c13c75abb86747a77fc4c185c..65c2a7bc3afcdccef4ce0eefb35b38f6beda99d7 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -136,6 +136,20 @@ static inline int unhelp(struct nf_conntrack_tuple_hash *i,
        return 0;
 }
 
+void nf_ct_helper_destroy(struct nf_conn *ct)
+{
+       struct nf_conn_help *help = nfct_help(ct);
+       struct nf_conntrack_helper *helper;
+
+       if (help) {
+               rcu_read_lock();
+               helper = rcu_dereference(help->helper);
+               if (helper && helper->destroy)
+                       helper->destroy(ct);
+               rcu_read_unlock();
+       }
+}
+
 int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
 {
        unsigned int h = helper_hash(&me->tuple);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 4e503ada572857442faa754aab3e528a07d26ae6..49479d1945700b14a8e86de3c6781ed471411694 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -463,15 +463,16 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
        struct sk_buff *skb;
        unsigned int type;
        unsigned int flags = 0, group;
+       int err;
 
        /* ignore our fake conntrack entry */
        if (ct == &nf_conntrack_untracked)
                return 0;
 
-       if (events & IPCT_DESTROY) {
+       if (events & (1 << IPCT_DESTROY)) {
                type = IPCTNL_MSG_CT_DELETE;
                group = NFNLGRP_CONNTRACK_DESTROY;
-       } else  if (events & (IPCT_NEW | IPCT_RELATED)) {
+       } else  if (events & ((1 << IPCT_NEW) | (1 << IPCT_RELATED))) {
                type = IPCTNL_MSG_CT_NEW;
                flags = NLM_F_CREATE|NLM_F_EXCL;
                group = NFNLGRP_CONNTRACK_NEW;
@@ -519,7 +520,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
        if (ctnetlink_dump_status(skb, ct) < 0)
                goto nla_put_failure;
 
-       if (events & IPCT_DESTROY) {
+       if (events & (1 << IPCT_DESTROY)) {
                if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
                    ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0)
                        goto nla_put_failure;
@@ -527,38 +528,41 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
                if (ctnetlink_dump_timeout(skb, ct) < 0)
                        goto nla_put_failure;
 
-               if (events & IPCT_PROTOINFO
+               if (events & (1 << IPCT_PROTOINFO)
                    && ctnetlink_dump_protoinfo(skb, ct) < 0)
                        goto nla_put_failure;
 
-               if ((events & IPCT_HELPER || nfct_help(ct))
+               if ((events & (1 << IPCT_HELPER) || nfct_help(ct))
                    && ctnetlink_dump_helpinfo(skb, ct) < 0)
                        goto nla_put_failure;
 
 #ifdef CONFIG_NF_CONNTRACK_SECMARK
-               if ((events & IPCT_SECMARK || ct->secmark)
+               if ((events & (1 << IPCT_SECMARK) || ct->secmark)
                    && ctnetlink_dump_secmark(skb, ct) < 0)
                        goto nla_put_failure;
 #endif
 
-               if (events & IPCT_RELATED &&
+               if (events & (1 << IPCT_RELATED) &&
                    ctnetlink_dump_master(skb, ct) < 0)
                        goto nla_put_failure;
 
-               if (events & IPCT_NATSEQADJ &&
+               if (events & (1 << IPCT_NATSEQADJ) &&
                    ctnetlink_dump_nat_seq_adj(skb, ct) < 0)
                        goto nla_put_failure;
        }
 
 #ifdef CONFIG_NF_CONNTRACK_MARK
-       if ((events & IPCT_MARK || ct->mark)
+       if ((events & (1 << IPCT_MARK) || ct->mark)
            && ctnetlink_dump_mark(skb, ct) < 0)
                goto nla_put_failure;
 #endif
        rcu_read_unlock();
 
        nlmsg_end(skb, nlh);
-       nfnetlink_send(skb, item->pid, group, item->report, GFP_ATOMIC);
+       err = nfnetlink_send(skb, item->pid, group, item->report, GFP_ATOMIC);
+       if (err == -ENOBUFS || err == -EAGAIN)
+               return -ENOBUFS;
+
        return 0;
 
 nla_put_failure:
@@ -798,10 +802,15 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
                }
        }
 
-       nf_conntrack_event_report(IPCT_DESTROY,
-                                 ct,
-                                 NETLINK_CB(skb).pid,
-                                 nlmsg_report(nlh));
+       if (nf_conntrack_event_report(IPCT_DESTROY, ct,
+                                     NETLINK_CB(skb).pid,
+                                     nlmsg_report(nlh)) < 0) {
+               nf_ct_delete_from_lists(ct);
+               /* we failed to report the event, try later */
+               nf_ct_insert_dying_list(ct);
+               nf_ct_put(ct);
+               return 0;
+       }
 
        /* death_by_timeout would report the event again */
        set_bit(IPS_DYING_BIT, &ct->status);
@@ -1253,6 +1262,7 @@ ctnetlink_create_conntrack(struct nlattr *cda[],
        }
 
        nf_ct_acct_ext_add(ct, GFP_ATOMIC);
+       nf_ct_ecache_ext_add(ct, GFP_ATOMIC);
 
 #if defined(CONFIG_NF_CONNTRACK_MARK)
        if (cda[CTA_MARK])
@@ -1340,13 +1350,13 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
                        else
                                events = IPCT_NEW;
 
-                       nf_conntrack_event_report(IPCT_STATUS |
-                                                 IPCT_HELPER |
-                                                 IPCT_PROTOINFO |
-                                                 IPCT_NATSEQADJ |
-                                                 IPCT_MARK | events,
-                                                 ct, NETLINK_CB(skb).pid,
-                                                 nlmsg_report(nlh));
+                       nf_conntrack_eventmask_report((1 << IPCT_STATUS) |
+                                                     (1 << IPCT_HELPER) |
+                                                     (1 << IPCT_PROTOINFO) |
+                                                     (1 << IPCT_NATSEQADJ) |
+                                                     (1 << IPCT_MARK) | events,
+                                                     ct, NETLINK_CB(skb).pid,
+                                                     nlmsg_report(nlh));
                        nf_ct_put(ct);
                } else
                        spin_unlock_bh(&nf_conntrack_lock);
@@ -1365,13 +1375,13 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
                if (err == 0) {
                        nf_conntrack_get(&ct->ct_general);
                        spin_unlock_bh(&nf_conntrack_lock);
-                       nf_conntrack_event_report(IPCT_STATUS |
-                                                 IPCT_HELPER |
-                                                 IPCT_PROTOINFO |
-                                                 IPCT_NATSEQADJ |
-                                                 IPCT_MARK,
-                                                 ct, NETLINK_CB(skb).pid,
-                                                 nlmsg_report(nlh));
+                       nf_conntrack_eventmask_report((1 << IPCT_STATUS) |
+                                                     (1 << IPCT_HELPER) |
+                                                     (1 << IPCT_PROTOINFO) |
+                                                     (1 << IPCT_NATSEQADJ) |
+                                                     (1 << IPCT_MARK),
+                                                     ct, NETLINK_CB(skb).pid,
+                                                     nlmsg_report(nlh));
                        nf_ct_put(ct);
                } else
                        spin_unlock_bh(&nf_conntrack_lock);
@@ -1515,7 +1525,7 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item)
        unsigned int type;
        int flags = 0;
 
-       if (events & IPEXP_NEW) {
+       if (events & (1 << IPEXP_NEW)) {
                type = IPCTNL_MSG_EXP_NEW;
                flags = NLM_F_CREATE|NLM_F_EXCL;
        } else
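
A pattern worth noting throughout the ctnetlink changes: event identifiers are now plain enum values rather than pre-shifted flags, so producers build masks explicitly and notifiers test individual bits. Illustrative fragments, following the forms used in this file (variables such as pid and report stand in for the caller's values):

	/* producer side: several events OR'ed into a single report */
	nf_conntrack_eventmask_report((1 << IPCT_STATUS) |
				      (1 << IPCT_HELPER) |
				      (1 << IPCT_PROTOINFO),
				      ct, pid, report);

	/* notifier side: test bits of the delivered mask */
	if (events & (1 << IPCT_DESTROY))
		type = IPCTNL_MSG_CT_DELETE;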
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index beb37311e1a5c3d3c20972d4bab070010572083d..2fefe147750a09b2f8d94f0e78f3b5f35e5b08b0 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -248,14 +248,14 @@ static int nf_log_proc_dostring(ctl_table *table, int write, struct file *filp,
                rcu_assign_pointer(nf_loggers[tindex], logger);
                mutex_unlock(&nf_log_mutex);
        } else {
-               rcu_read_lock();
-               logger = rcu_dereference(nf_loggers[tindex]);
+               mutex_lock(&nf_log_mutex);
+               logger = nf_loggers[tindex];
                if (!logger)
                        table->data = "NONE";
                else
                        table->data = logger->name;
                r = proc_dostring(table, write, filp, buffer, lenp, ppos);
-               rcu_read_unlock();
+               mutex_unlock(&nf_log_mutex);
        }
 
        return r;
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 46dba5f043d53045055da0b3ba7fa573c412612f..025d1a0af78b43c14a38cc0c5b406b36dd5887f5 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -364,14 +364,14 @@ int xt_check_match(struct xt_mtchk_param *par,
                 * ebt_among is exempt from centralized matchsize checking
                 * because it uses a dynamic-size data set.
                 */
-               printk("%s_tables: %s match: invalid size %Zu != %u\n",
+               pr_err("%s_tables: %s match: invalid size %Zu != %u\n",
                       xt_prefix[par->family], par->match->name,
                       XT_ALIGN(par->match->matchsize), size);
                return -EINVAL;
        }
        if (par->match->table != NULL &&
            strcmp(par->match->table, par->table) != 0) {
-               printk("%s_tables: %s match: only valid in %s table, not %s\n",
+               pr_err("%s_tables: %s match: only valid in %s table, not %s\n",
                       xt_prefix[par->family], par->match->name,
                       par->match->table, par->table);
                return -EINVAL;
@@ -379,7 +379,7 @@ int xt_check_match(struct xt_mtchk_param *par,
        if (par->match->hooks && (par->hook_mask & ~par->match->hooks) != 0) {
                char used[64], allow[64];
 
-               printk("%s_tables: %s match: used from hooks %s, but only "
+               pr_err("%s_tables: %s match: used from hooks %s, but only "
                       "valid from %s\n",
                       xt_prefix[par->family], par->match->name,
                       textify_hooks(used, sizeof(used), par->hook_mask),
@@ -387,7 +387,7 @@ int xt_check_match(struct xt_mtchk_param *par,
                return -EINVAL;
        }
        if (par->match->proto && (par->match->proto != proto || inv_proto)) {
-               printk("%s_tables: %s match: only valid for protocol %u\n",
+               pr_err("%s_tables: %s match: only valid for protocol %u\n",
                       xt_prefix[par->family], par->match->name,
                       par->match->proto);
                return -EINVAL;
@@ -514,14 +514,14 @@ int xt_check_target(struct xt_tgchk_param *par,
                    unsigned int size, u_int8_t proto, bool inv_proto)
 {
        if (XT_ALIGN(par->target->targetsize) != size) {
-               printk("%s_tables: %s target: invalid size %Zu != %u\n",
+               pr_err("%s_tables: %s target: invalid size %Zu != %u\n",
                       xt_prefix[par->family], par->target->name,
                       XT_ALIGN(par->target->targetsize), size);
                return -EINVAL;
        }
        if (par->target->table != NULL &&
            strcmp(par->target->table, par->table) != 0) {
-               printk("%s_tables: %s target: only valid in %s table, not %s\n",
+               pr_err("%s_tables: %s target: only valid in %s table, not %s\n",
                       xt_prefix[par->family], par->target->name,
                       par->target->table, par->table);
                return -EINVAL;
@@ -529,7 +529,7 @@ int xt_check_target(struct xt_tgchk_param *par,
        if (par->target->hooks && (par->hook_mask & ~par->target->hooks) != 0) {
                char used[64], allow[64];
 
-               printk("%s_tables: %s target: used from hooks %s, but only "
+               pr_err("%s_tables: %s target: used from hooks %s, but only "
                       "usable from %s\n",
                       xt_prefix[par->family], par->target->name,
                       textify_hooks(used, sizeof(used), par->hook_mask),
@@ -537,7 +537,7 @@ int xt_check_target(struct xt_tgchk_param *par,
                return -EINVAL;
        }
        if (par->target->proto && (par->target->proto != proto || inv_proto)) {
-               printk("%s_tables: %s target: only valid for protocol %u\n",
+               pr_err("%s_tables: %s target: only valid for protocol %u\n",
                       xt_prefix[par->family], par->target->name,
                       par->target->proto);
                return -EINVAL;