// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	inet fragments management
 *
 *	Authors:	Pavel Emelyanov <xemul@openvz.org>
 *			Started as consolidation of ipv4/ip_fragment.c,
 *			ipv6/reassembly, and ipv6 nf conntrack reassembly
 */
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
#include <linux/rhashtable.h>

#include <net/sock.h>
#include <net/inet_frag.h>
#include <net/inet_ecn.h>
#include <net/ip.h>
#include <net/ipv6.h>
/* Use skb->cb to track consecutive/adjacent fragments coming at
 * the end of the queue. Nodes in the rb-tree queue will
 * contain "runs" of one or more adjacent fragments.
 *
 * Invariants:
 * - next_frag is NULL at the tail of a "run";
 * - the head of a "run" has the sum of all fragment lengths in frag_run_len.
 */
struct ipfrag_skb_cb {
	union {
		struct inet_skb_parm	h4;
		struct inet6_skb_parm	h6;
	};
	struct sk_buff		*next_frag;
	int			frag_run_len;
};

#define FRAG_CB(skb)		((struct ipfrag_skb_cb *)((skb)->cb))
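
/* Illustrative sketch (editorial, not from the original source): with
 * fragments arriving in order at offsets 0, 1200 and 2400 (1200 bytes each),
 * the tree holds a single run whose head carries the aggregate length:
 *
 *	rb_fragments
 *	     |
 *	[skb @0, frag_run_len = 3600] -> [skb @1200] -> [skb @2400] -> NULL
 *	                                  (next_frag links within the run)
 *
 * An out-of-order fragment at, say, offset 6000 would start a second rb-tree
 * node (a new run) rather than extend this one.
 */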
static void fragcb_clear(struct sk_buff *skb)
{
	RB_CLEAR_NODE(&skb->rbnode);
	FRAG_CB(skb)->next_frag = NULL;
	FRAG_CB(skb)->frag_run_len = skb->len;
}
/* Append skb to the last "run". */
static void fragrun_append_to_last(struct inet_frag_queue *q,
				   struct sk_buff *skb)
{
	fragcb_clear(skb);

	FRAG_CB(q->last_run_head)->frag_run_len += skb->len;
	FRAG_CB(q->fragments_tail)->next_frag = skb;
	q->fragments_tail = skb;
}
/* Create a new "run" with the skb. */
static void fragrun_create(struct inet_frag_queue *q, struct sk_buff *skb)
{
	BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb));
	fragcb_clear(skb);

	if (q->last_run_head)
		rb_link_node(&skb->rbnode, &q->last_run_head->rbnode,
			     &q->last_run_head->rbnode.rb_right);
	else
		rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node);
	rb_insert_color(&skb->rbnode, &q->rb_fragments);

	q->fragments_tail = skb;
	q->last_run_head = skb;
}
/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements.
 * Value : 0xff if frame should be dropped.
 *         0 or INET_ECN_CE value, to be ORed in to final iph->tos field
 */
const u8 ip_frag_ecn_table[16] = {
	/* at least one fragment had CE, and others ECT_0 or ECT_1 */
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0]			= INET_ECN_CE,
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1]			= INET_ECN_CE,
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1]	= INET_ECN_CE,

	/* invalid combinations : drop frame */
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
};
EXPORT_SYMBOL(ip_frag_ecn_table);
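
/* Worked example (editorial, not from the original source): a datagram whose
 * fragments carried ECT(0), ECT(0) and CE accumulates the per-queue index
 * IPFRAG_ECN_ECT_0 | IPFRAG_ECN_CE, which maps to INET_ECN_CE above, so the
 * reassembled header is marked CE. Mixing Not-ECT with any ECT/CE fragment
 * indexes a 0xff slot and the caller drops the whole datagram, roughly as
 * the IPv4 reassembler does with its per-queue ECN accumulator:
 *
 *	u8 ecn = ip_frag_ecn_table[qp->ecn];
 *	if (unlikely(ecn == 0xff))
 *		goto out_fail;	// invalid combination per RFC 3168 5.3
 */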
int inet_frags_init(struct inet_frags *f)
{
	f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
					    NULL);
	if (!f->frags_cachep)
		return -ENOMEM;

	return 0;
}
EXPORT_SYMBOL(inet_frags_init);
void inet_frags_fini(struct inet_frags *f)
{
	/* We must wait until all inet_frag_destroy_rcu() calls have completed. */
	rcu_barrier();

	kmem_cache_destroy(f->frags_cachep);
	f->frags_cachep = NULL;
}
EXPORT_SYMBOL(inet_frags_fini);
static void inet_frags_free_cb(void *ptr, void *arg)
{
	struct inet_frag_queue *fq = ptr;

	/* If we cannot cancel the timer, it means this frag_queue
	 * is already disappearing, and we have nothing to do.
	 * Otherwise, we own a refcount until the end of this function.
	 */
	if (!del_timer(&fq->timer))
		return;

	spin_lock_bh(&fq->lock);
	if (!(fq->flags & INET_FRAG_COMPLETE)) {
		fq->flags |= INET_FRAG_COMPLETE;
		refcount_dec(&fq->refcnt);
	}
	spin_unlock_bh(&fq->lock);

	inet_frag_put(fq);
}

void inet_frags_exit_net(struct netns_frags *nf)
{
	nf->high_thresh = 0; /* prevent creation of new frags */

	rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL);
}
EXPORT_SYMBOL(inet_frags_exit_net);
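
/* Teardown ordering (editorial note): zeroing high_thresh first makes
 * inet_frag_find() below bail out before creating queues, so no new entries
 * can race into the rhashtable while inet_frags_free_cb() drops the timer's
 * reference on each existing queue.
 */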
void inet_frag_kill(struct inet_frag_queue *fq)
{
	if (del_timer(&fq->timer))
		refcount_dec(&fq->refcnt);

	if (!(fq->flags & INET_FRAG_COMPLETE)) {
		struct netns_frags *nf = fq->net;

		fq->flags |= INET_FRAG_COMPLETE;
		rhashtable_remove_fast(&nf->rhashtable, &fq->node,
				       nf->f->rhash_params);
		refcount_dec(&fq->refcnt);
	}
}
EXPORT_SYMBOL(inet_frag_kill);
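
/* Refcounting sketch (editorial note): inet_frag_alloc() below starts each
 * queue at refcount 3 - one reference for the pending timer, one for the
 * rhashtable entry, one for the caller. inet_frag_kill() releases the first
 * two; the caller's reference is dropped separately via inet_frag_put().
 */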
static void inet_frag_destroy_rcu(struct rcu_head *head)
{
	struct inet_frag_queue *q = container_of(head, struct inet_frag_queue,
						 rcu);
	struct inet_frags *f = q->net->f;

	if (f->destructor)
		f->destructor(q);
	kmem_cache_free(f->frags_cachep, q);
}
unsigned int inet_frag_rbtree_purge(struct rb_root *root)
{
	struct rb_node *p = rb_first(root);
	unsigned int sum = 0;

	while (p) {
		struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);

		p = rb_next(p);
		rb_erase(&skb->rbnode, root);
		while (skb) {
			struct sk_buff *next = FRAG_CB(skb)->next_frag;

			sum += skb->truesize;
			kfree_skb(skb);
			skb = next;
		}
	}
	return sum;
}
EXPORT_SYMBOL(inet_frag_rbtree_purge);
void inet_frag_destroy(struct inet_frag_queue *q)
{
	struct netns_frags *nf;
	unsigned int sum, sum_truesize = 0;
	struct inet_frags *f;

	WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
	WARN_ON(del_timer(&q->timer) != 0);

	/* Release all fragment data. */
	nf = q->net;
	f = nf->f;
	sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments);
	sum = sum_truesize + f->qsize;

	call_rcu(&q->rcu, inet_frag_destroy_rcu);

	sub_frag_mem_limit(nf, sum);
}
EXPORT_SYMBOL(inet_frag_destroy);
static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
					       struct inet_frags *f,
					       void *arg)
{
	struct inet_frag_queue *q;

	q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
	if (!q)
		return NULL;

	q->net = nf;
	f->constructor(q, arg);
	add_frag_mem_limit(nf, f->qsize);

	timer_setup(&q->timer, f->frag_expire, 0);
	spin_lock_init(&q->lock);
	refcount_set(&q->refcnt, 3);

	return q;
}
static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
						void *arg,
						struct inet_frag_queue **prev)
{
	struct inet_frags *f = nf->f;
	struct inet_frag_queue *q;

	q = inet_frag_alloc(nf, f, arg);
	if (!q) {
		*prev = ERR_PTR(-ENOMEM);
		return NULL;
	}
	mod_timer(&q->timer, jiffies + nf->timeout);

	*prev = rhashtable_lookup_get_insert_key(&nf->rhashtable, &q->key,
						 &q->node, f->rhash_params);
	if (*prev) {
		q->flags |= INET_FRAG_COMPLETE;
		inet_frag_kill(q);
		inet_frag_destroy(q);
		return NULL;
	}
	return q;
}
/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key)
{
	struct inet_frag_queue *fq = NULL, *prev;

	if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh)
		return NULL;

	rcu_read_lock();

	prev = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params);
	if (!prev)
		fq = inet_frag_create(nf, key, &prev);
	if (prev && !IS_ERR(prev)) {
		fq = prev;
		if (!refcount_inc_not_zero(&fq->refcnt))
			fq = NULL;
	}
	rcu_read_unlock();
	return fq;
}
EXPORT_SYMBOL(inet_frag_find);
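
/* Caller sketch (illustrative; the key type here is hypothetical, not from
 * this file): a protocol reassembler builds a lookup key from the packet,
 * hands it to inet_frag_find(), then works under the queue lock:
 *
 *	struct frag_lookup_key key = { .saddr = ..., .daddr = ..., .id = ... };
 *	struct inet_frag_queue *q = inet_frag_find(&net->ipv4.frags, &key);
 *
 *	if (!q)
 *		return -ENOMEM;	// over the memory limit, or allocation failed
 *	spin_lock(&q->lock);
 *	// ... inet_frag_queue_insert() / reassembly ...
 *	spin_unlock(&q->lock);
 *	inet_frag_put(q);	// drop the caller's reference
 */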
int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb,
			   int offset, int end)
{
	struct sk_buff *last = q->fragments_tail;

	/* RFC5722, Section 4, amended by Errata ID : 3089
	 *   When reassembling an IPv6 datagram, if
	 *   one or more of its constituent fragments is determined to be an
	 *   overlapping fragment, the entire datagram (and any constituent
	 *   fragments) MUST be silently discarded.
	 *
	 * Duplicates, however, should be ignored (i.e. skb dropped, but the
	 * queue/fragments kept for later reassembly).
	 */
	if (!last)
		fragrun_create(q, skb);  /* First fragment. */
	else if (last->ip_defrag_offset + last->len < end) {
		/* This is the common case: skb goes to the end. */
		/* Detect and discard overlaps. */
		if (offset < last->ip_defrag_offset + last->len)
			return IPFRAG_OVERLAP;
		if (offset == last->ip_defrag_offset + last->len)
			fragrun_append_to_last(q, skb);
		else
			fragrun_create(q, skb);
	} else {
		/* Binary search. Note that skb can become the first fragment,
		 * but not the last (covered above).
		 */
		struct rb_node **rbn, *parent;

		rbn = &q->rb_fragments.rb_node;
		do {
			struct sk_buff *curr;
			int curr_run_end;

			parent = *rbn;
			curr = rb_to_skb(parent);
			curr_run_end = curr->ip_defrag_offset +
				       FRAG_CB(curr)->frag_run_len;
			if (end <= curr->ip_defrag_offset)
				rbn = &parent->rb_left;
			else if (offset >= curr_run_end)
				rbn = &parent->rb_right;
			else if (offset >= curr->ip_defrag_offset &&
				 end <= curr_run_end)
				return IPFRAG_DUP;
			else
				return IPFRAG_OVERLAP;
		} while (*rbn);
		/* Here we have parent properly set, and rbn pointing to
		 * one of its NULL left/right children. Insert skb.
		 */
		fragcb_clear(skb);
		rb_link_node(&skb->rbnode, parent, rbn);
		rb_insert_color(&skb->rbnode, &q->rb_fragments);
	}

	skb->ip_defrag_offset = offset;

	return IPFRAG_OK;
}
EXPORT_SYMBOL(inet_frag_queue_insert);
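
/* Worked example (editorial, not from the original source): suppose the
 * queue holds one run covering bytes [0, 1200) + [1200, 2400). Then:
 *   - offset=2400, end=3600 : appended to the run         -> IPFRAG_OK;
 *   - offset=1200, end=2400 : falls fully inside the run  -> IPFRAG_DUP;
 *   - offset=2000, end=3000 : straddles the run's edge    -> IPFRAG_OVERLAP,
 *     and per RFC 5722 the caller tears down the whole queue.
 */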
void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
			      struct sk_buff *parent)
{
	struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments);
	struct sk_buff **nextp;
	int delta;

	if (head != skb) {
		fp = skb_clone(skb, GFP_ATOMIC);
		if (!fp)
			return NULL;
		FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
		if (RB_EMPTY_NODE(&skb->rbnode))
			FRAG_CB(parent)->next_frag = fp;
		else
			rb_replace_node(&skb->rbnode, &fp->rbnode,
					&q->rb_fragments);
		if (q->fragments_tail == skb)
			q->fragments_tail = fp;
		skb_morph(skb, head);
		FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
		rb_replace_node(&head->rbnode, &skb->rbnode,
				&q->rb_fragments);
		consume_skb(head);
		head = skb;
	}
	WARN_ON(head->ip_defrag_offset != 0);

	delta = -head->truesize;

	/* Head of list must not be cloned. */
	if (skb_unclone(head, GFP_ATOMIC))
		return NULL;

	delta += head->truesize;
	if (delta)
		add_frag_mem_limit(q->net, delta);

	/* If the first fragment is fragmented itself, we split
	 * it to two chunks: the first with data and paged part
	 * and the second, holding only fragments.
	 */
	if (skb_has_frag_list(head)) {
		struct sk_buff *clone;
		int i, plen = 0;

		clone = alloc_skb(0, GFP_ATOMIC);
		if (!clone)
			return NULL;
		skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
		skb_frag_list_init(head);
		for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
			plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
		clone->data_len = head->data_len - plen;
		clone->len = clone->data_len;
		head->truesize += clone->truesize;
		clone->csum = 0;
		clone->ip_summed = head->ip_summed;
		add_frag_mem_limit(q->net, clone->truesize);
		skb_shinfo(head)->frag_list = clone;
		nextp = &clone->next;
	} else {
		nextp = &skb_shinfo(head)->frag_list;
	}

	return nextp;
}
EXPORT_SYMBOL(inet_frag_reasm_prepare);
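
/* Editorial note: the pointer returned above is the position at which
 * inet_frag_reasm_finish() continues chaining fragments - either
 * &clone->next when the head carried its own frag_list, or the head's
 * &skb_shinfo(head)->frag_list otherwise.
 */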
void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
			    void *reasm_data)
{
	struct sk_buff **nextp = (struct sk_buff **)reasm_data;
	struct rb_node *rbn;
	struct sk_buff *fp;

	skb_push(head, head->data - skb_network_header(head));

	/* Traverse the tree in order, to build frag_list. */
	fp = FRAG_CB(head)->next_frag;
	rbn = rb_next(&head->rbnode);
	rb_erase(&head->rbnode, &q->rb_fragments);
	while (rbn || fp) {
		/* fp points to the next sk_buff in the current run;
		 * rbn points to the next run.
		 */
		/* Go through the current run. */
		while (fp) {
			*nextp = fp;
			nextp = &fp->next;
			fp->prev = NULL;
			memset(&fp->rbnode, 0, sizeof(fp->rbnode));
			fp->sk = NULL;
			head->data_len += fp->len;
			head->len += fp->len;
			if (head->ip_summed != fp->ip_summed)
				head->ip_summed = CHECKSUM_NONE;
			else if (head->ip_summed == CHECKSUM_COMPLETE)
				head->csum = csum_add(head->csum, fp->csum);
			head->truesize += fp->truesize;
			fp = FRAG_CB(fp)->next_frag;
		}
		/* Move to the next run. */
		if (rbn) {
			struct rb_node *rbnext = rb_next(rbn);

			fp = rb_to_skb(rbn);
			rb_erase(rbn, &q->rb_fragments);
			rbn = rbnext;
		}
	}
	sub_frag_mem_limit(q->net, head->truesize);

	*nextp = NULL;
	skb_mark_not_on_list(head);
	head->prev = NULL;
	head->tstamp = q->stamp;
}
EXPORT_SYMBOL(inet_frag_reasm_finish);
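
/* Usage sketch (illustrative; simplified from how the IPv4 reassembler
 * drives this API, details may differ by version):
 *
 *	void *reasm_data;
 *
 *	// 'skb' is the fragment that completed the datagram; 'prev_tail'
 *	// was q->fragments_tail before inserting it.
 *	reasm_data = inet_frag_reasm_prepare(&qp->q, skb, prev_tail);
 *	if (!reasm_data)
 *		goto out_oom;
 *	inet_frag_reasm_finish(&qp->q, skb, reasm_data);
 *	// skb is now the full datagram; fix up the network header
 *	// (total length, frag flags, checksum) before delivery.
 */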
struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q)
{
	struct sk_buff *head, *skb;

	head = skb_rb_first(&q->rb_fragments);
	if (!head)
		return NULL;
	skb = FRAG_CB(head)->next_frag;
	if (skb)
		rb_replace_node(&head->rbnode, &skb->rbnode,
				&q->rb_fragments);
	else
		rb_erase(&head->rbnode, &q->rb_fragments);
	memset(&head->rbnode, 0, sizeof(head->rbnode));
	barrier();

	if (head == q->fragments_tail)
		q->fragments_tail = NULL;

	sub_frag_mem_limit(q->net, head->truesize);

	return head;
}
EXPORT_SYMBOL(inet_frag_pull_head);
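
/* Editorial note: callers use this to detach the first fragment while the
 * rest of the queue stays intact - e.g. an expiry handler that wants to
 * report the timeout (ICMP "fragment reassembly time exceeded") using the
 * head fragment it pulled. If the head led a run, the run's second skb is
 * promoted into its rb-tree slot, so the tree stays consistent.
 */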