/* Event cache for netfilter. */

/*
 * (C) 2005 Harald Welte <laforge@gnumonks.org>
 * (C) 2005 Patrick McHardy <kaber@trash.net>
 * (C) 2005-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2005 USAGI/WIDE Project <http://www.linux-ipv6.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <linux/stddef.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/slab.h>
#include <linux/export.h>

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_extend.h>
/* Serializes (un)registration of conntrack and expectation event
 * notifiers against each other.
 */
static DEFINE_MUTEX(nf_ct_ecache_mutex);

/* Delay before retrying delivery after a congested notifier. */
#define ECACHE_RETRY_WAIT (HZ/10)

/* Outcome of one pass of ecache_work_evict_list() over a per-cpu list. */
enum retry_state {
	STATE_CONGESTED,	/* notifier refused an event; back off */
	STATE_RESTART,		/* local refs[] batch filled; rescan soon */
	STATE_DONE,		/* whole list processed */
};
39 | ||
/* Walk one per-cpu dying list and retry delivery of destroy events whose
 * earlier delivery failed (state NFCT_ECACHE_DESTROY_FAIL).  Entries that
 * are delivered get marked NFCT_ECACHE_DESTROY_SENT and their reference
 * dropped, in batches of up to ARRAY_SIZE(refs) per call.
 *
 * Returns STATE_CONGESTED if the notifier pushed back, STATE_RESTART if
 * the batch filled up (caller should rescan), STATE_DONE otherwise.
 */
static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu)
{
	struct nf_conn *refs[16];
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	unsigned int evicted = 0;
	enum retry_state ret = STATE_DONE;

	spin_lock(&pcpu->lock);

	hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) {
		struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
		struct nf_conntrack_ecache *e;

		/* Unconfirmed entries never had a destroy event pending. */
		if (!nf_ct_is_confirmed(ct))
			continue;

		e = nf_ct_ecache_find(ct);
		if (!e || e->state != NFCT_ECACHE_DESTROY_FAIL)
			continue;

		/* Notifier still congested: give up and let the caller
		 * reschedule after ECACHE_RETRY_WAIT.
		 */
		if (nf_conntrack_event(IPCT_DESTROY, ct)) {
			ret = STATE_CONGESTED;
			break;
		}

		e->state = NFCT_ECACHE_DESTROY_SENT;
		refs[evicted] = ct;

		if (++evicted >= ARRAY_SIZE(refs)) {
			ret = STATE_RESTART;
			break;
		}
	}

	spin_unlock(&pcpu->lock);

	/* can't _put while holding lock */
	while (evicted)
		nf_ct_put(refs[--evicted]);

	return ret;
}
83 | ||
84 | static void ecache_work(struct work_struct *work) | |
85 | { | |
86 | struct netns_ct *ctnet = | |
87 | container_of(work, struct netns_ct, ecache_dwork.work); | |
88 | int cpu, delay = -1; | |
89 | struct ct_pcpu *pcpu; | |
90 | ||
91 | local_bh_disable(); | |
92 | ||
93 | for_each_possible_cpu(cpu) { | |
94 | enum retry_state ret; | |
95 | ||
96 | pcpu = per_cpu_ptr(ctnet->pcpu_lists, cpu); | |
97 | ||
98 | ret = ecache_work_evict_list(pcpu); | |
99 | ||
100 | switch (ret) { | |
101 | case STATE_CONGESTED: | |
102 | delay = ECACHE_RETRY_WAIT; | |
103 | goto out; | |
104 | case STATE_RESTART: | |
105 | delay = 0; | |
106 | break; | |
107 | case STATE_DONE: | |
108 | break; | |
109 | } | |
110 | } | |
111 | ||
112 | out: | |
113 | local_bh_enable(); | |
114 | ||
115 | ctnet->ecache_dwork_pending = delay > 0; | |
116 | if (delay >= 0) | |
117 | schedule_delayed_work(&ctnet->ecache_dwork, delay); | |
118 | } | |
119 | ||
3c435e2e FW |
/* Deliver the events in @eventmask for @ct to the registered conntrack
 * event notifier, merging in previously missed events.  On delivery
 * failure of a destroy event, the ecache is flagged for the resend
 * worker; other failed events are accumulated in e->missed.
 *
 * @portid/@report identify the requesting netlink socket, if any.
 * Returns the notifier's result, or 0 if nothing was delivered.
 */
int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct,
				  u32 portid, int report)
{
	int ret = 0;
	struct net *net = nf_ct_net(ct);
	struct nf_ct_event_notifier *notify;
	struct nf_conntrack_ecache *e;

	rcu_read_lock();
	notify = rcu_dereference(net->ct.nf_conntrack_event_cb);
	if (!notify)
		goto out_unlock;

	e = nf_ct_ecache_find(ct);
	if (!e)
		goto out_unlock;

	/* Events for unconfirmed conntracks are not delivered here. */
	if (nf_ct_is_confirmed(ct)) {
		struct nf_ct_event item = {
			.ct	= ct,
			.portid	= e->portid ? e->portid : portid,
			.report = report
		};
		/* This is a resent of a destroy event? If so, skip missed */
		unsigned long missed = e->portid ? 0 : e->missed;

		/* Nothing in the (merged) mask is subscribed to. */
		if (!((eventmask | missed) & e->ctmask))
			goto out_unlock;

		ret = notify->fcn(eventmask | missed, &item);
		if (unlikely(ret < 0 || missed)) {
			spin_lock_bh(&ct->lock);
			if (ret < 0) {
				/* This is a destroy event that has been
				 * triggered by a process, we store the PORTID
				 * to include it in the retransmission.
				 */
				if (eventmask & (1 << IPCT_DESTROY)) {
					if (e->portid == 0 && portid != 0)
						e->portid = portid;
					e->state = NFCT_ECACHE_DESTROY_FAIL;
				} else {
					e->missed |= eventmask;
				}
			} else {
				/* Delivered: clear what we just resent. */
				e->missed &= ~missed;
			}
			spin_unlock_bh(&ct->lock);
		}
	}
out_unlock:
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(nf_conntrack_eventmask_report);
175 | ||
f6180121 MJ |
/* deliver cached events and clear cache entry - must be called with locally
 * disabled softirqs */
void nf_ct_deliver_cached_events(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);
	unsigned long events, missed;
	struct nf_ct_event_notifier *notify;
	struct nf_conntrack_ecache *e;
	struct nf_ct_event item;
	int ret;

	rcu_read_lock();
	notify = rcu_dereference(net->ct.nf_conntrack_event_cb);
	if (notify == NULL)
		goto out_unlock;

	e = nf_ct_ecache_find(ct);
	if (e == NULL)
		goto out_unlock;

	/* Atomically grab and clear the cached event bits. */
	events = xchg(&e->cache, 0);

	/* Unconfirmed or dying conntracks get no cached-event delivery;
	 * note the cache was still cleared above.
	 */
	if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct))
		goto out_unlock;

	/* We make a copy of the missed event cache without taking
	 * the lock, thus we may send missed events twice. However,
	 * this does not harm and it happens very rarely. */
	missed = e->missed;

	if (!((events | missed) & e->ctmask))
		goto out_unlock;

	item.ct = ct;
	item.portid = 0;
	item.report = 0;

	ret = notify->fcn(events | missed, &item);

	if (likely(ret == 0 && !missed))
		goto out_unlock;

	/* Record newly missed events on failure, or clear the ones we
	 * just managed to resend.
	 */
	spin_lock_bh(&ct->lock);
	if (ret < 0)
		e->missed |= events;
	else
		e->missed &= ~missed;
	spin_unlock_bh(&ct->lock);

out_unlock:
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events);
f6180121 | 229 | |
ecdfb48c FW |
230 | void nf_ct_expect_event_report(enum ip_conntrack_expect_events event, |
231 | struct nf_conntrack_expect *exp, | |
232 | u32 portid, int report) | |
233 | ||
234 | { | |
235 | struct net *net = nf_ct_exp_net(exp); | |
236 | struct nf_exp_event_notifier *notify; | |
237 | struct nf_conntrack_ecache *e; | |
238 | ||
239 | rcu_read_lock(); | |
240 | notify = rcu_dereference(net->ct.nf_expect_event_cb); | |
241 | if (!notify) | |
242 | goto out_unlock; | |
243 | ||
244 | e = nf_ct_ecache_find(exp->master); | |
245 | if (!e) | |
246 | goto out_unlock; | |
247 | ||
248 | if (e->expmask & (1 << event)) { | |
249 | struct nf_exp_event item = { | |
250 | .exp = exp, | |
251 | .portid = portid, | |
252 | .report = report | |
253 | }; | |
254 | notify->fcn(1 << event, &item); | |
255 | } | |
256 | out_unlock: | |
257 | rcu_read_unlock(); | |
258 | } | |
259 | ||
70e9942f PNA |
260 | int nf_conntrack_register_notifier(struct net *net, |
261 | struct nf_ct_event_notifier *new) | |
010c7d6f | 262 | { |
031d7709 | 263 | int ret; |
b56f2d55 | 264 | struct nf_ct_event_notifier *notify; |
e34d5c1a PNA |
265 | |
266 | mutex_lock(&nf_ct_ecache_mutex); | |
70e9942f | 267 | notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb, |
b56f2d55 PM |
268 | lockdep_is_held(&nf_ct_ecache_mutex)); |
269 | if (notify != NULL) { | |
e34d5c1a PNA |
270 | ret = -EBUSY; |
271 | goto out_unlock; | |
272 | } | |
cf778b00 | 273 | rcu_assign_pointer(net->ct.nf_conntrack_event_cb, new); |
031d7709 | 274 | ret = 0; |
e34d5c1a PNA |
275 | |
276 | out_unlock: | |
277 | mutex_unlock(&nf_ct_ecache_mutex); | |
278 | return ret; | |
010c7d6f PM |
279 | } |
280 | EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier); | |
281 | ||
70e9942f PNA |
282 | void nf_conntrack_unregister_notifier(struct net *net, |
283 | struct nf_ct_event_notifier *new) | |
010c7d6f | 284 | { |
b56f2d55 PM |
285 | struct nf_ct_event_notifier *notify; |
286 | ||
e34d5c1a | 287 | mutex_lock(&nf_ct_ecache_mutex); |
70e9942f | 288 | notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb, |
b56f2d55 PM |
289 | lockdep_is_held(&nf_ct_ecache_mutex)); |
290 | BUG_ON(notify != new); | |
70e9942f | 291 | RCU_INIT_POINTER(net->ct.nf_conntrack_event_cb, NULL); |
e34d5c1a | 292 | mutex_unlock(&nf_ct_ecache_mutex); |
3b7dabf0 | 293 | /* synchronize_rcu() is called from ctnetlink_exit. */ |
010c7d6f PM |
294 | } |
295 | EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier); | |
296 | ||
70e9942f PNA |
297 | int nf_ct_expect_register_notifier(struct net *net, |
298 | struct nf_exp_event_notifier *new) | |
010c7d6f | 299 | { |
031d7709 | 300 | int ret; |
b56f2d55 | 301 | struct nf_exp_event_notifier *notify; |
e34d5c1a PNA |
302 | |
303 | mutex_lock(&nf_ct_ecache_mutex); | |
70e9942f | 304 | notify = rcu_dereference_protected(net->ct.nf_expect_event_cb, |
b56f2d55 PM |
305 | lockdep_is_held(&nf_ct_ecache_mutex)); |
306 | if (notify != NULL) { | |
e34d5c1a PNA |
307 | ret = -EBUSY; |
308 | goto out_unlock; | |
309 | } | |
cf778b00 | 310 | rcu_assign_pointer(net->ct.nf_expect_event_cb, new); |
031d7709 | 311 | ret = 0; |
e34d5c1a PNA |
312 | |
313 | out_unlock: | |
314 | mutex_unlock(&nf_ct_ecache_mutex); | |
315 | return ret; | |
010c7d6f | 316 | } |
6823645d | 317 | EXPORT_SYMBOL_GPL(nf_ct_expect_register_notifier); |
010c7d6f | 318 | |
70e9942f PNA |
319 | void nf_ct_expect_unregister_notifier(struct net *net, |
320 | struct nf_exp_event_notifier *new) | |
010c7d6f | 321 | { |
b56f2d55 PM |
322 | struct nf_exp_event_notifier *notify; |
323 | ||
e34d5c1a | 324 | mutex_lock(&nf_ct_ecache_mutex); |
70e9942f | 325 | notify = rcu_dereference_protected(net->ct.nf_expect_event_cb, |
b56f2d55 PM |
326 | lockdep_is_held(&nf_ct_ecache_mutex)); |
327 | BUG_ON(notify != new); | |
70e9942f | 328 | RCU_INIT_POINTER(net->ct.nf_expect_event_cb, NULL); |
e34d5c1a | 329 | mutex_unlock(&nf_ct_ecache_mutex); |
3b7dabf0 | 330 | /* synchronize_rcu() is called from ctnetlink_exit. */ |
010c7d6f | 331 | } |
6823645d | 332 | EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier); |
a0891aa6 PNA |
333 | |
/* Module default for the per-netns "nf_conntrack_events" sysctl. */
#define NF_CT_EVENTS_DEFAULT 1
static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT;

#ifdef CONFIG_SYSCTL
/* Template sysctl table; the .data pointer is redirected to each
 * netns' own sysctl_events at registration time.
 */
static struct ctl_table event_sysctl_table[] = {
	{
		.procname	= "nf_conntrack_events",
		.data		= &init_net.ct.sysctl_events,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{}
};
#endif /* CONFIG_SYSCTL */
349 | ||
/* Conntrack extension descriptor for the per-conntrack event cache
 * (struct nf_conntrack_ecache), attached as NF_CT_EXT_ECACHE.
 */
static const struct nf_ct_ext_type event_extend = {
	.len	= sizeof(struct nf_conntrack_ecache),
	.align	= __alignof__(struct nf_conntrack_ecache),
	.id	= NF_CT_EXT_ECACHE,
};
355 | ||
#ifdef CONFIG_SYSCTL
/* Register the per-netns "net.netfilter.nf_conntrack_events" sysctl.
 * Returns 0 on success, -ENOMEM on allocation or registration failure.
 */
static int nf_conntrack_event_init_sysctl(struct net *net)
{
	struct ctl_table *table;

	/* Each netns gets a private copy of the template table. */
	table = kmemdup(event_sysctl_table, sizeof(event_sysctl_table),
			GFP_KERNEL);
	if (!table)
		goto out;

	/* Point the entry at this netns' knob instead of init_net's. */
	table[0].data = &net->ct.sysctl_events;

	/* Don't export sysctls to unprivileged users */
	if (net->user_ns != &init_user_ns)
		table[0].procname = NULL;

	net->ct.event_sysctl_header =
		register_net_sysctl(net, "net/netfilter", table);
	if (!net->ct.event_sysctl_header) {
		printk(KERN_ERR "nf_ct_event: can't register to sysctl.\n");
		goto out_register;
	}
	return 0;

out_register:
	kfree(table);
out:
	return -ENOMEM;
}

/* Unregister the sysctl and free the per-netns table copy. */
static void nf_conntrack_event_fini_sysctl(struct net *net)
{
	struct ctl_table *table;

	table = net->ct.event_sysctl_header->ctl_table_arg;
	unregister_net_sysctl_table(net->ct.event_sysctl_header);
	kfree(table);
}
#else
/* CONFIG_SYSCTL disabled: no-op stubs. */
static int nf_conntrack_event_init_sysctl(struct net *net)
{
	return 0;
}

static void nf_conntrack_event_fini_sysctl(struct net *net)
{
}
#endif /* CONFIG_SYSCTL */
404 | ||
/* Per-netns setup: seed the events sysctl from the module default,
 * initialize (but do not schedule) the destroy-event resend worker,
 * and register the sysctl.  Returns 0 or a negative errno.
 */
int nf_conntrack_ecache_pernet_init(struct net *net)
{
	net->ct.sysctl_events = nf_ct_events;
	INIT_DELAYED_WORK(&net->ct.ecache_dwork, ecache_work);
	return nf_conntrack_event_init_sysctl(net);
}
a0891aa6 | 411 | |
3fe0f943 G |
/* Per-netns teardown: make sure the resend worker has finished before
 * the sysctl (and eventually the netns) goes away.
 */
void nf_conntrack_ecache_pernet_fini(struct net *net)
{
	cancel_delayed_work_sync(&net->ct.ecache_dwork);
	nf_conntrack_event_fini_sysctl(net);
}
a0891aa6 | 417 | |
3fe0f943 G |
418 | int nf_conntrack_ecache_init(void) |
419 | { | |
420 | int ret = nf_ct_extend_register(&event_extend); | |
a0891aa6 | 421 | if (ret < 0) |
3fe0f943 | 422 | pr_err("nf_ct_event: Unable to register event extension.\n"); |
01026ede FW |
423 | |
424 | BUILD_BUG_ON(__IPCT_MAX >= 16); /* ctmask, missed use u16 */ | |
425 | ||
a0891aa6 PNA |
426 | return ret; |
427 | } | |
428 | ||
/* Module teardown: remove the ecache conntrack extension type. */
void nf_conntrack_ecache_fini(void)
{
	nf_ct_extend_unregister(&event_extend);
}