/*
 * inet fragments management
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Authors:	Pavel Emelyanov <xemul@openvz.org>
 *		Started as consolidation of ipv4/ip_fragment.c,
 *		ipv6/reassembly.c and ipv6 nf conntrack reassembly
 */

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>

#include <net/sock.h>
#include <net/inet_frag.h>
#include <net/inet_ecn.h>
#include <net/ip.h>
#include <net/ipv6.h>

/* Use skb->cb to track consecutive/adjacent fragments coming at
 * the end of the queue. Nodes in the rb-tree queue will
 * contain "runs" of one or more adjacent fragments.
 *
 * Invariants:
 * - next_frag is NULL at the tail of a "run";
 * - the head of a "run" has the sum of all fragment lengths in frag_run_len.
 */
struct ipfrag_skb_cb {
	union {
		struct inet_skb_parm	h4;
		struct inet6_skb_parm	h6;
	};
	struct sk_buff		*next_frag;
	int			frag_run_len;
};

#define FRAG_CB(skb)		((struct ipfrag_skb_cb *)((skb)->cb))

static void fragcb_clear(struct sk_buff *skb)
{
	RB_CLEAR_NODE(&skb->rbnode);
	FRAG_CB(skb)->next_frag = NULL;
	FRAG_CB(skb)->frag_run_len = skb->len;
}

/* Append skb to the last "run". */
static void fragrun_append_to_last(struct inet_frag_queue *q,
				   struct sk_buff *skb)
{
	fragcb_clear(skb);

	FRAG_CB(q->last_run_head)->frag_run_len += skb->len;
	FRAG_CB(q->fragments_tail)->next_frag = skb;
	q->fragments_tail = skb;
}

/* Create a new "run" with the skb. */
static void fragrun_create(struct inet_frag_queue *q, struct sk_buff *skb)
{
	BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb));
	fragcb_clear(skb);

	if (q->last_run_head)
		rb_link_node(&skb->rbnode, &q->last_run_head->rbnode,
			     &q->last_run_head->rbnode.rb_right);
	else
		rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node);
	rb_insert_color(&skb->rbnode, &q->rb_fragments);

	q->fragments_tail = skb;
	q->last_run_head = skb;
}
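
/* Illustrative sketch, not part of the original file: for in-order
 * fragments of 1000 bytes each at offsets 0, 1000 and 2000, the helpers
 * above build a single rb-tree node (one "run") whose head ends up with
 * frag_run_len == 3000. The skb_at_* names are hypothetical:
 *
 *	fragrun_create(q, skb_at_0);		// first fragment: new run
 *	fragrun_append_to_last(q, skb_at_1000);	// adjacent: extends the run
 *	fragrun_append_to_last(q, skb_at_2000);	// adjacent: extends the run
 *
 * A non-adjacent fragment would instead start a fresh run (a new tree
 * node) via fragrun_create(); inet_frag_queue_insert() below decides
 * which of the two applies.
 */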

/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
 * Value : 0xff if frame should be dropped.
 *         0 or INET_ECN_CE value, to be ORed into the final iph->tos field
 */
const u8 ip_frag_ecn_table[16] = {
	/* at least one fragment had CE, and others ECT_0 or ECT_1 */
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0]			= INET_ECN_CE,
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1]			= INET_ECN_CE,
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1]	= INET_ECN_CE,

	/* invalid combinations : drop frame */
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
};
EXPORT_SYMBOL(ip_frag_ecn_table);
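
/* Illustrative sketch, not part of the original file: reassembly code ORs
 * each arriving fragment's IPFRAG_ECN_* value into a per-queue accumulator
 * and consults the table once, when the datagram is rebuilt:
 *
 *	u8 ecn = ip_frag_ecn_table[q->ecn];	// q->ecn: OR of all fragments
 *
 *	if (unlikely(ecn == 0xff))
 *		goto drop;			// invalid ECN mix (RFC 3168)
 *	iph->tos |= ecn;			// 0 or INET_ECN_CE
 */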

int inet_frags_init(struct inet_frags *f)
{
	f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
					    NULL);
	if (!f->frags_cachep)
		return -ENOMEM;

	return 0;
}
EXPORT_SYMBOL(inet_frags_init);
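
/* Illustrative sketch, not part of the original file: a protocol fills in
 * an inet_frags descriptor once and registers it, roughly the way ipv4's
 * ipfrag_init() does (the callback and struct names here follow ipv4 but
 * are shown as an abridged example):
 *
 *	static struct inet_frags ip4_frags = {
 *		.constructor		= ip4_frag_init,
 *		.destructor		= ip4_frag_free,
 *		.qsize			= sizeof(struct ipq),
 *		.frag_expire		= ip_expire,
 *		.frags_cache_name	= "ip4-frags",
 *		.rhash_params		= ip4_rhash_params,
 *	};
 *
 *	if (inet_frags_init(&ip4_frags))
 *		panic("IP: failed to allocate ip4_frags cache\n");
 */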

void inet_frags_fini(struct inet_frags *f)
{
	/* We must wait until all inet_frag_destroy_rcu() callbacks
	 * have completed.
	 */
	rcu_barrier();

	kmem_cache_destroy(f->frags_cachep);
	f->frags_cachep = NULL;
}
EXPORT_SYMBOL(inet_frags_fini);

static void inet_frags_free_cb(void *ptr, void *arg)
{
	struct inet_frag_queue *fq = ptr;

	/* If we cannot cancel the timer, it means this frag_queue
	 * is already disappearing, and we have nothing to do.
	 * Otherwise, we own a refcount until the end of this function.
	 */
	if (!del_timer(&fq->timer))
		return;

	spin_lock_bh(&fq->lock);
	if (!(fq->flags & INET_FRAG_COMPLETE)) {
		fq->flags |= INET_FRAG_COMPLETE;
		refcount_dec(&fq->refcnt);	/* drop the hash table's ref */
	}
	spin_unlock_bh(&fq->lock);

	inet_frag_put(fq);			/* drop the timer's ref */
}

void inet_frags_exit_net(struct netns_frags *nf)
{
	nf->low_thresh = 0; /* prevent creation of new frags */

	rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL);
}
EXPORT_SYMBOL(inet_frags_exit_net);
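
/* Illustrative sketch, not part of the original file: a per-netns exit
 * handler tears its fragment state down roughly the way ipv4 does
 * (ipv4_frags_exit_net() is the upstream counterpart):
 *
 *	static void __net_exit ipv4_frags_exit_net(struct net *net)
 *	{
 *		inet_frags_exit_net(&net->ipv4.frags);
 *	}
 */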

void inet_frag_kill(struct inet_frag_queue *fq)
{
	if (del_timer(&fq->timer))
		refcount_dec(&fq->refcnt);

	if (!(fq->flags & INET_FRAG_COMPLETE)) {
		struct netns_frags *nf = fq->net;

		fq->flags |= INET_FRAG_COMPLETE;
		rhashtable_remove_fast(&nf->rhashtable, &fq->node, nf->f->rhash_params);
		refcount_dec(&fq->refcnt);
	}
}
EXPORT_SYMBOL(inet_frag_kill);
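
/* Illustrative sketch, not part of the original file: expiry and error
 * paths typically kill the queue under its lock and then drop their own
 * reference, e.g. in a timer handler:
 *
 *	spin_lock(&fq->lock);
 *	inet_frag_kill(fq);		// detach from hash, stop timer
 *	spin_unlock(&fq->lock);
 *	inet_frag_put(fq);		// may free via RCU once refs hit 0
 */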

static void inet_frag_destroy_rcu(struct rcu_head *head)
{
	struct inet_frag_queue *q = container_of(head, struct inet_frag_queue,
						 rcu);
	struct inet_frags *f = q->net->f;

	if (f->destructor)
		f->destructor(q);
	kmem_cache_free(f->frags_cachep, q);
}

void inet_frag_destroy(struct inet_frag_queue *q)
{
	struct sk_buff *fp;
	struct netns_frags *nf;
	unsigned int sum, sum_truesize = 0;
	struct inet_frags *f;

	WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
	WARN_ON(del_timer(&q->timer) != 0);

	/* Release all fragment data. */
	fp = q->fragments;
	nf = q->net;
	f = nf->f;
	if (fp) {
		do {
			struct sk_buff *xp = fp->next;

			sum_truesize += fp->truesize;
			kfree_skb(fp);
			fp = xp;
		} while (fp);
	} else {
		sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments);
	}
	sum = sum_truesize + f->qsize;

	call_rcu(&q->rcu, inet_frag_destroy_rcu);

	sub_frag_mem_limit(nf, sum);
}
EXPORT_SYMBOL(inet_frag_destroy);

static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
					       struct inet_frags *f,
					       void *arg)
{
	struct inet_frag_queue *q;

	if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh)
		return NULL;

	q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
	if (!q)
		return NULL;

	q->net = nf;
	f->constructor(q, arg);
	add_frag_mem_limit(nf, f->qsize);

	timer_setup(&q->timer, f->frag_expire, 0);
	spin_lock_init(&q->lock);
	/* One ref for the hash table, one for the timer, one for the caller. */
	refcount_set(&q->refcnt, 3);

	return q;
}

static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
						void *arg,
						struct inet_frag_queue **prev)
{
	struct inet_frags *f = nf->f;
	struct inet_frag_queue *q;

	q = inet_frag_alloc(nf, f, arg);
	if (!q) {
		*prev = ERR_PTR(-ENOMEM);
		return NULL;
	}
	mod_timer(&q->timer, jiffies + nf->timeout);

	*prev = rhashtable_lookup_get_insert_key(&nf->rhashtable, &q->key,
						 &q->node, f->rhash_params);
	if (*prev) {
		/* We lost a race: another CPU inserted a queue with the same
		 * key first. Dispose of ours; the caller will use *prev.
		 */
		q->flags |= INET_FRAG_COMPLETE;
		inet_frag_kill(q);
		inet_frag_destroy(q);
		return NULL;
	}
	return q;
}

/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key)
{
	struct inet_frag_queue *fq = NULL, *prev;

	rcu_read_lock();

	prev = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params);
	if (!prev)
		fq = inet_frag_create(nf, key, &prev);
	if (prev && !IS_ERR(prev)) {
		fq = prev;
		if (!refcount_inc_not_zero(&fq->refcnt))
			fq = NULL;
	}
	rcu_read_unlock();

	return fq;
}
EXPORT_SYMBOL(inet_frag_find);
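
/* Illustrative sketch, not part of the original file: a lookup helper such
 * as ipv4's ip_find() builds a protocol-specific key and resolves the
 * generic queue to its own type. The key fields shown are abridged from
 * the ipv4 ones:
 *
 *	struct frag_v4_compare_key key = {
 *		.saddr		= iph->saddr,
 *		.daddr		= iph->daddr,
 *		.id		= iph->id,
 *		.protocol	= iph->protocol,
 *	};
 *	struct inet_frag_queue *q;
 *
 *	q = inet_frag_find(&net->ipv4.frags, &key);
 *	if (!q)
 *		return NULL;
 *	return container_of(q, struct ipq, q);
 */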

int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb,
			   int offset, int end)
{
	struct sk_buff *last = q->fragments_tail;

	/* RFC5722, Section 4, amended by Errata ID : 3089
	 * When reassembling an IPv6 datagram, if
	 * one or more of its constituent fragments is determined to be an
	 * overlapping fragment, the entire datagram (and any constituent
	 * fragments) MUST be silently discarded.
	 *
	 * Duplicates, however, should be ignored (i.e. skb dropped, but the
	 * queue/fragments kept for later reassembly).
	 */
	if (!last)
		fragrun_create(q, skb);  /* First fragment. */
	else if (last->ip_defrag_offset + last->len < end) {
		/* This is the common case: skb goes to the end. */
		/* Detect and discard overlaps. */
		if (offset < last->ip_defrag_offset + last->len)
			return IPFRAG_OVERLAP;
		if (offset == last->ip_defrag_offset + last->len)
			fragrun_append_to_last(q, skb);
		else
			fragrun_create(q, skb);
	} else {
		/* Binary search. Note that skb can become the first fragment,
		 * but not the last (covered above).
		 */
		struct rb_node **rbn, *parent;

		rbn = &q->rb_fragments.rb_node;
		do {
			struct sk_buff *curr;
			int curr_run_end;

			parent = *rbn;
			curr = rb_to_skb(parent);
			curr_run_end = curr->ip_defrag_offset +
					FRAG_CB(curr)->frag_run_len;
			if (end <= curr->ip_defrag_offset)
				rbn = &parent->rb_left;
			else if (offset >= curr_run_end)
				rbn = &parent->rb_right;
			else if (offset >= curr->ip_defrag_offset &&
				 end <= curr_run_end)
				return IPFRAG_DUP;
			else
				return IPFRAG_OVERLAP;
		} while (*rbn);
		/* Here we have parent properly set, and rbn pointing to
		 * one of its NULL left/right children. Insert skb.
		 */
		fragcb_clear(skb);
		rb_link_node(&skb->rbnode, parent, rbn);
		rb_insert_color(&skb->rbnode, &q->rb_fragments);
	}

	skb->ip_defrag_offset = offset;

	return IPFRAG_OK;
}
EXPORT_SYMBOL(inet_frag_queue_insert);
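
/* Illustrative sketch, not part of the original file: callers act on the
 * three verdicts roughly the way ipv4's ip_frag_queue() does:
 *
 *	err = inet_frag_queue_insert(q, skb, offset, end);
 *	if (err == IPFRAG_DUP) {
 *		kfree_skb(skb);		// duplicate: drop skb, keep queue
 *		return -EINVAL;
 *	}
 *	if (err == IPFRAG_OVERLAP) {
 *		inet_frag_kill(q);	// overlap: discard whole datagram
 *		goto discard;
 *	}
 *	// IPFRAG_OK: inserted; check whether the datagram is now complete
 */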

void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
			      struct sk_buff *parent)
{
	struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments);
	struct sk_buff **nextp;
	int delta;

	if (head != skb) {
		fp = skb_clone(skb, GFP_ATOMIC);
		if (!fp)
			return NULL;
		FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
		if (RB_EMPTY_NODE(&skb->rbnode))
			FRAG_CB(parent)->next_frag = fp;
		else
			rb_replace_node(&skb->rbnode, &fp->rbnode,
					&q->rb_fragments);
		if (q->fragments_tail == skb)
			q->fragments_tail = fp;
		skb_morph(skb, head);
		FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
		rb_replace_node(&head->rbnode, &skb->rbnode,
				&q->rb_fragments);
		consume_skb(head);
		head = skb;
	}
	WARN_ON(head->ip_defrag_offset != 0);

	delta = -head->truesize;

	/* Head of list must not be cloned. */
	if (skb_unclone(head, GFP_ATOMIC))
		return NULL;

	delta += head->truesize;
	if (delta)
		add_frag_mem_limit(q->net, delta);

	/* If the first fragment is fragmented itself, we split
	 * it into two chunks: the first with data and paged part
	 * and the second, holding only fragments.
	 */
	if (skb_has_frag_list(head)) {
		struct sk_buff *clone;
		int i, plen = 0;

		clone = alloc_skb(0, GFP_ATOMIC);
		if (!clone)
			return NULL;
		skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
		skb_frag_list_init(head);
		for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
			plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
		clone->data_len = head->data_len - plen;
		clone->len = clone->data_len;
		head->truesize += clone->truesize;
		clone->csum = 0;
		clone->ip_summed = head->ip_summed;
		add_frag_mem_limit(q->net, clone->truesize);
		skb_shinfo(head)->frag_list = clone;
		nextp = &clone->next;
	} else {
		nextp = &skb_shinfo(head)->frag_list;
	}

	return nextp;
}
EXPORT_SYMBOL(inet_frag_reasm_prepare);
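
/* Illustrative sketch, not part of the original file: once all fragments
 * are in, reassembly is a two-step dance around these helpers. Here "skb"
 * is the fragment that completed the datagram and "prev_tail" the tail of
 * the run preceding it (the names follow the ipv4 caller):
 *
 *	void *reasm_data;
 *
 *	reasm_data = inet_frag_reasm_prepare(q, skb, prev_tail);
 *	if (!reasm_data)
 *		goto out_oom;
 *	// ... rewrite the network header for the full datagram ...
 *	inet_frag_reasm_finish(q, skb, reasm_data);
 */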

void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
			    void *reasm_data)
{
	struct sk_buff **nextp = (struct sk_buff **)reasm_data;
	struct rb_node *rbn;
	struct sk_buff *fp;

	skb_push(head, head->data - skb_network_header(head));

	/* Traverse the tree in order, to build frag_list. */
	fp = FRAG_CB(head)->next_frag;
	rbn = rb_next(&head->rbnode);
	rb_erase(&head->rbnode, &q->rb_fragments);
	while (rbn || fp) {
		/* fp points to the next sk_buff in the current run;
		 * rbn points to the next run.
		 */
		/* Go through the current run. */
		while (fp) {
			*nextp = fp;
			nextp = &fp->next;
			fp->prev = NULL;
			memset(&fp->rbnode, 0, sizeof(fp->rbnode));
			fp->sk = NULL;
			head->data_len += fp->len;
			head->len += fp->len;
			if (head->ip_summed != fp->ip_summed)
				head->ip_summed = CHECKSUM_NONE;
			else if (head->ip_summed == CHECKSUM_COMPLETE)
				head->csum = csum_add(head->csum, fp->csum);
			head->truesize += fp->truesize;
			fp = FRAG_CB(fp)->next_frag;
		}
		/* Move to the next run. */
		if (rbn) {
			struct rb_node *rbnext = rb_next(rbn);

			fp = rb_to_skb(rbn);
			rb_erase(rbn, &q->rb_fragments);
			rbn = rbnext;
		}
	}
	sub_frag_mem_limit(q->net, head->truesize);

	*nextp = NULL;
	head->next = NULL;
	head->prev = NULL;
	head->tstamp = q->stamp;
}
EXPORT_SYMBOL(inet_frag_reasm_finish);

struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q)
{
	struct sk_buff *head;

	if (q->fragments) {
		head = q->fragments;
		q->fragments = head->next;
	} else {
		struct sk_buff *skb;

		head = skb_rb_first(&q->rb_fragments);
		if (!head)
			return NULL;
		skb = FRAG_CB(head)->next_frag;
		if (skb)
			rb_replace_node(&head->rbnode, &skb->rbnode,
					&q->rb_fragments);
		else
			rb_erase(&head->rbnode, &q->rb_fragments);
		memset(&head->rbnode, 0, sizeof(head->rbnode));
		barrier();
	}
	if (head == q->fragments_tail)
		q->fragments_tail = NULL;

	sub_frag_mem_limit(q->net, head->truesize);

	return head;
}
EXPORT_SYMBOL(inet_frag_pull_head);
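
/* Illustrative sketch, not part of the original file: ipv4's ip_expire()
 * uses this helper to detach the first fragment so the ICMP error sent on
 * reassembly timeout can echo that fragment's payload:
 *
 *	head = inet_frag_pull_head(&qp->q);
 *	if (!head)
 *		goto out;
 *	head->dev = dev_get_by_index_rcu(net, qp->iif);
 *	if (!head->dev)
 *		goto out;
 *	icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
 */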