1 /*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
11 * Agricultural Sciences.
12 *
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally described in:
16 *
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 *
26 * Code from fib_hash has been reused which includes the following header:
27 *
28 *
29 * INET An implementation of the TCP/IP protocol suite for the LINUX
30 * operating system. INET is implemented using the BSD Socket
31 * interface as the means of communication with the user level.
32 *
33 * IPv4 FIB: lookup engine and maintenance routines.
34 *
35 *
36 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
37 *
38 * This program is free software; you can redistribute it and/or
39 * modify it under the terms of the GNU General Public License
40 * as published by the Free Software Foundation; either version
41 * 2 of the License, or (at your option) any later version.
42 *
43 * Substantial contributions to this work comes from:
44 *
45 * David S. Miller, <davem@davemloft.net>
46 * Stephen Hemminger <shemminger@osdl.org>
47 * Paul E. McKenney <paulmck@us.ibm.com>
48 * Patrick McHardy <kaber@trash.net>
49 */
50
51 #define VERSION "0.409"
52
53 #include <asm/uaccess.h>
54 #include <linux/bitops.h>
55 #include <linux/types.h>
56 #include <linux/kernel.h>
57 #include <linux/mm.h>
58 #include <linux/string.h>
59 #include <linux/socket.h>
60 #include <linux/sockios.h>
61 #include <linux/errno.h>
62 #include <linux/in.h>
63 #include <linux/inet.h>
64 #include <linux/inetdevice.h>
65 #include <linux/netdevice.h>
66 #include <linux/if_arp.h>
67 #include <linux/proc_fs.h>
68 #include <linux/rcupdate.h>
69 #include <linux/skbuff.h>
70 #include <linux/netlink.h>
71 #include <linux/init.h>
72 #include <linux/list.h>
73 #include <linux/slab.h>
74 #include <linux/export.h>
75 #include <net/net_namespace.h>
76 #include <net/ip.h>
77 #include <net/protocol.h>
78 #include <net/route.h>
79 #include <net/tcp.h>
80 #include <net/sock.h>
81 #include <net/ip_fib.h>
82 #include "fib_lookup.h"
83
84 #define MAX_STAT_DEPTH 32
85
86 #define KEYLENGTH (8*sizeof(t_key))
87
88 typedef unsigned int t_key;
89
90 #define IS_TNODE(n) ((n)->bits)
91 #define IS_LEAF(n) (!(n)->bits)
92
93 #define get_shift(_kv) (KEYLENGTH - (_kv)->pos - (_kv)->bits)
94 #define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> get_shift(_kv))
95
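/* Illustrative worked example of the macros above (hypothetical values,
 * not used by the code): assume KEYLENGTH = 32 and a tnode kv with
 *
 *	kv->key  = 0xc0a80000	(192.168.0.0, low 8 bits zeroed)
 *	kv->pos  = 24		(24 prefix bits, counted from the MSB)
 *	kv->bits = 2		(4 children, indexed by the next 2 bits)
 *
 * For the lookup key 0xc0a80040 (192.168.0.64):
 *
 *	get_shift(kv)      = 32 - 24 - 2 = 6
 *	get_index(key, kv) = (0xc0a80040 ^ 0xc0a80000) >> 6 = 1
 *
 * index >> kv->bits == 0, so the prefix matched and child 1 is taken.
 * For a key such as 0xc0a90040 the XOR leaves bits set above the index
 * field, index >> kv->bits != 0, and the prefix check fails.
 */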
96 struct tnode {
97 t_key key;
98 unsigned char bits; /* log2(KEYLENGTH) bits needed */
99 unsigned char pos; /* log2(KEYLENGTH) bits needed */
100 struct tnode __rcu *parent;
101 struct rcu_head rcu;
102 union {
103 /* The fields in this struct are valid if bits > 0 (TNODE) */
104 struct {
105 unsigned int full_children; /* KEYLENGTH bits needed */
106 unsigned int empty_children; /* KEYLENGTH bits needed */
107 struct tnode __rcu *child[0];
108 };
109 /* This list pointer is valid if bits == 0 (LEAF) */
110 struct hlist_head list;
111 };
112 };
113
114 struct leaf_info {
115 struct hlist_node hlist;
116 int plen;
117 u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
118 struct list_head falh;
119 struct rcu_head rcu;
120 };
121
122 #ifdef CONFIG_IP_FIB_TRIE_STATS
123 struct trie_use_stats {
124 unsigned int gets;
125 unsigned int backtrack;
126 unsigned int semantic_match_passed;
127 unsigned int semantic_match_miss;
128 unsigned int null_node_hit;
129 unsigned int resize_node_skipped;
130 };
131 #endif
132
133 struct trie_stat {
134 unsigned int totdepth;
135 unsigned int maxdepth;
136 unsigned int tnodes;
137 unsigned int leaves;
138 unsigned int nullpointers;
139 unsigned int prefixes;
140 unsigned int nodesizes[MAX_STAT_DEPTH];
141 };
142
143 struct trie {
144 struct tnode __rcu *trie;
145 #ifdef CONFIG_IP_FIB_TRIE_STATS
146 struct trie_use_stats __percpu *stats;
147 #endif
148 };
149
150 static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n,
151 int wasfull);
152 static struct tnode *resize(struct trie *t, struct tnode *tn);
153 static struct tnode *inflate(struct trie *t, struct tnode *tn);
154 static struct tnode *halve(struct trie *t, struct tnode *tn);
155 /* tnodes to free after resize(); protected by RTNL */
156 static struct callback_head *tnode_free_head;
157 static size_t tnode_free_size;
158
159 /*
160 * synchronize_rcu after call_rcu for that many pages; it should be especially
161 * useful before resizing the root node with PREEMPT_NONE configs; the value was
162 * obtained experimentally, aiming to avoid visible slowdown.
163 */
164 static const int sync_pages = 128;
165
166 static struct kmem_cache *fn_alias_kmem __read_mostly;
167 static struct kmem_cache *trie_leaf_kmem __read_mostly;
168
169 /* caller must hold RTNL */
170 #define node_parent(n) rtnl_dereference((n)->parent)
171
172 /* caller must hold RCU read lock or RTNL */
173 #define node_parent_rcu(n) rcu_dereference_rtnl((n)->parent)
174
175 /* wrapper for rcu_assign_pointer */
176 static inline void node_set_parent(struct tnode *n, struct tnode *tp)
177 {
178 if (n)
179 rcu_assign_pointer(n->parent, tp);
180 }
181
182 #define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER((n)->parent, p)
183
184 /* This provides us with the number of children in this node; in the
185 * case of a leaf this returns 0, meaning none of the children are accessible.
186 */
187 static inline int tnode_child_length(const struct tnode *tn)
188 {
189 return (1ul << tn->bits) & ~(1ul);
190 }
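/* A note on the expression above: for a tnode, 1ul << bits is a power
 * of two >= 2, so clearing bit 0 with & ~(1ul) leaves it unchanged
 * (e.g. bits == 4 gives 16 child slots). For a leaf, bits == 0, so
 * (1ul << 0) & ~(1ul) == 0, which encodes "no children" without a branch.
 */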
191
192 /*
193 * caller must hold RTNL
194 */
195 static inline struct tnode *tnode_get_child(const struct tnode *tn, unsigned int i)
196 {
197 BUG_ON(i >= tnode_child_length(tn));
198
199 return rtnl_dereference(tn->child[i]);
200 }
201
202 /*
203 * caller must hold RCU read lock or RTNL
204 */
205 static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
206 {
207 BUG_ON(i >= tnode_child_length(tn));
208
209 return rcu_dereference_rtnl(tn->child[i]);
210 }
211
212 static inline t_key mask_pfx(t_key k, unsigned int l)
213 {
214 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
215 }
216
217 static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
218 {
219 if (offset < KEYLENGTH)
220 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
221 else
222 return 0;
223 }
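/* Worked example for mask_pfx() and tkey_extract_bits() (illustrative,
 * hypothetical values):
 *
 *	mask_pfx(0xc0a80164, 24)             = 0xc0a80100
 *	tkey_extract_bits(0xc0a80164, 24, 4) = 0x6
 *
 * mask_pfx() keeps the l most significant bits by shifting the low
 * bits out and back in; tkey_extract_bits() left-aligns the field at
 * 'offset' and right-shifts so that only 'bits' bits remain.
 */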
224
225 /*
226 To understand this code, you need to understand keys and all their
227 bits. Every node in the trie has a key associated with it, but not
228 all of the bits in that key are significant.
229
230 Consider a node 'n' and its parent 'tp'.
231
232 If n is a leaf, every bit in its key is significant. Its presence is
233 necessitated by path compression, since during a tree traversal (when
234 searching for a leaf - unless we are doing an insertion) we will completely
235 ignore all skipped bits we encounter. Thus we need to verify, at the end of
236 a potentially successful search, that we have indeed been walking the
237 correct key path.
238
239 Note that we can never "miss" the correct key in the tree if present by
240 following the wrong path. Path compression ensures that segments of the key
241 that are the same for all keys with a given prefix are skipped, but the
242 skipped part *is* identical for each node in the subtrie below the skipped
243 bit! trie_insert() in this implementation takes care of that - note the
244 call to tkey_sub_equals() in trie_insert().
245
246 If n is an internal node - a 'tnode' here - the various parts of its key
247 have many different meanings.
248
249 Example:
250 _________________________________________________________________
251 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
252 -----------------------------------------------------------------
253 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
254
255 _________________________________________________________________
256 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
257 -----------------------------------------------------------------
258 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
259
260 tp->pos = 7
261 tp->bits = 3
262 n->pos = 15
263 n->bits = 4
264
265 First, let's just ignore the bits that come before the parent tp, that is
266 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
267 not use them for anything.
268
269 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
270 index into the parent's child array. That is, they will be used to find
271 'n' among tp's children.
272
273 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
274 for the node n.
275
276 All the bits we have seen so far are significant to the node n. The rest
277 of the bits are really not needed or indeed known in n->key.
278
279 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
280 n's child array, and will of course be different for each child.
281
282
283 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
284 at this point.
285
286 */
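/* A concrete (hypothetical) instance of the layout above, KEYLENGTH = 32:
 *
 *	tp->key = 0xc2000000, tp->pos = 7,  tp->bits = 3
 *	n->key  = 0xc2b40000, n->pos  = 15, n->bits  = 4
 *
 *	get_index(n->key, tp) = (0xc2b40000 ^ 0xc2000000) >> 22 = 2
 *
 * so n sits in slot 2 (the "N" bits) of tp's child array, bits 10-14
 * (the "S" bits) are skipped on the way down, and bits 15-18 (the "C"
 * bits) in turn index n's own 16-entry child array.
 */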
287
288 static const int halve_threshold = 25;
289 static const int inflate_threshold = 50;
290 static const int halve_threshold_root = 15;
291 static const int inflate_threshold_root = 30;
292
293 static void __alias_free_mem(struct rcu_head *head)
294 {
295 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
296 kmem_cache_free(fn_alias_kmem, fa);
297 }
298
299 static inline void alias_free_mem_rcu(struct fib_alias *fa)
300 {
301 call_rcu(&fa->rcu, __alias_free_mem);
302 }
303
304 #define TNODE_KMALLOC_MAX \
305 ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct tnode *))
306
307 static void __node_free_rcu(struct rcu_head *head)
308 {
309 struct tnode *n = container_of(head, struct tnode, rcu);
310
311 if (IS_LEAF(n))
312 kmem_cache_free(trie_leaf_kmem, n);
313 else if (n->bits <= TNODE_KMALLOC_MAX)
314 kfree(n);
315 else
316 vfree(n);
317 }
318
319 #define node_free(n) call_rcu(&n->rcu, __node_free_rcu)
320
321 static inline void free_leaf_info(struct leaf_info *leaf)
322 {
323 kfree_rcu(leaf, rcu);
324 }
325
326 static struct tnode *tnode_alloc(size_t size)
327 {
328 if (size <= PAGE_SIZE)
329 return kzalloc(size, GFP_KERNEL);
330 else
331 return vzalloc(size);
332 }
333
334 static void tnode_free_safe(struct tnode *tn)
335 {
336 BUG_ON(IS_LEAF(tn));
337 tn->rcu.next = tnode_free_head;
338 tnode_free_head = &tn->rcu;
339 }
340
341 static void tnode_free_flush(void)
342 {
343 struct callback_head *head;
344
345 while ((head = tnode_free_head)) {
346 struct tnode *tn = container_of(head, struct tnode, rcu);
347
348 tnode_free_head = head->next;
349 tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]);
350
351 node_free(tn);
352 }
353
354 if (tnode_free_size >= PAGE_SIZE * sync_pages) {
355 tnode_free_size = 0;
356 synchronize_rcu();
357 }
358 }
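/* e.g. (hypothetical numbers): with 4 kB pages and sync_pages = 128,
 * a long rebalance defers synchronize_rcu() until roughly 512 kB of
 * dead tnodes are pending, trading peak memory for fewer grace-period
 * waits.
 */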
359
360 static struct tnode *leaf_new(t_key key)
361 {
362 struct tnode *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
363 if (l) {
364 l->parent = NULL;
365 /* set key and pos to reflect full key value
366 * any trailing zeros in the key should be ignored
367 * as the nodes are searched
368 */
369 l->key = key;
370 l->pos = KEYLENGTH;
371 /* set bits to 0 indicating we are not a tnode */
372 l->bits = 0;
373
374 INIT_HLIST_HEAD(&l->list);
375 }
376 return l;
377 }
378
379 static struct leaf_info *leaf_info_new(int plen)
380 {
381 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
382 if (li) {
383 li->plen = plen;
384 li->mask_plen = ntohl(inet_make_mask(plen));
385 INIT_LIST_HEAD(&li->falh);
386 }
387 return li;
388 }
389
390 static struct tnode *tnode_new(t_key key, int pos, int bits)
391 {
392 size_t sz = offsetof(struct tnode, child[1 << bits]);
393 struct tnode *tn = tnode_alloc(sz);
394 unsigned int shift = pos + bits;
395
396 /* verify that bits is nonzero and that pos + bits does not exceed KEYLENGTH */
397 BUG_ON(!bits || (shift > KEYLENGTH));
398
399 if (tn) {
400 tn->parent = NULL;
401 tn->pos = pos;
402 tn->bits = bits;
403 tn->key = mask_pfx(key, pos);
404 tn->full_children = 0;
405 tn->empty_children = 1<<bits;
406 }
407
408 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
409 sizeof(struct tnode *) << bits);
410 return tn;
411 }
412
413 /*
414 * Check whether a tnode 'n' is "full", i.e. it is an internal node
415 * and no bits are skipped. See discussion in dyntree paper p. 6
416 */
417
418 static inline int tnode_full(const struct tnode *tn, const struct tnode *n)
419 {
420 return n && IS_TNODE(n) && (n->pos == (tn->pos + tn->bits));
421 }
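/* e.g. (hypothetical): a child whose pos equals tn->pos + tn->bits has
 * no skipped bits between itself and tn, so it is "full"; inflate()
 * splits such a child across two slots of the doubled node, while a
 * leaf or a child with skipped bits moves as a single entry instead.
 */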
422
423 static inline void put_child(struct tnode *tn, int i,
424 struct tnode *n)
425 {
426 tnode_put_child_reorg(tn, i, n, -1);
427 }
428
429 /*
430 * Add a child at position i overwriting the old value.
431 * Update the value of full_children and empty_children.
432 */
433
434 static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n,
435 int wasfull)
436 {
437 struct tnode *chi = rtnl_dereference(tn->child[i]);
438 int isfull;
439
440 BUG_ON(i >= 1<<tn->bits);
441
442 /* update emptyChildren */
443 if (n == NULL && chi != NULL)
444 tn->empty_children++;
445 else if (n != NULL && chi == NULL)
446 tn->empty_children--;
447
448 /* update fullChildren */
449 if (wasfull == -1)
450 wasfull = tnode_full(tn, chi);
451
452 isfull = tnode_full(tn, n);
453 if (wasfull && !isfull)
454 tn->full_children--;
455 else if (!wasfull && isfull)
456 tn->full_children++;
457
458 node_set_parent(n, tn);
459
460 rcu_assign_pointer(tn->child[i], n);
461 }
462
463 static void put_child_root(struct tnode *tp, struct trie *t,
464 t_key key, struct tnode *n)
465 {
466 if (tp)
467 put_child(tp, get_index(key, tp), n);
468 else
469 rcu_assign_pointer(t->trie, n);
470 }
471
472 #define MAX_WORK 10
473 static struct tnode *resize(struct trie *t, struct tnode *tn)
474 {
475 struct tnode *old_tn, *n = NULL;
476 int inflate_threshold_use;
477 int halve_threshold_use;
478 int max_work;
479
480 if (!tn)
481 return NULL;
482
483 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
484 tn, inflate_threshold, halve_threshold);
485
486 /* No children */
487 if (tn->empty_children > (tnode_child_length(tn) - 1))
488 goto no_children;
489
490 /* One child */
491 if (tn->empty_children == (tnode_child_length(tn) - 1))
492 goto one_child;
493 /*
494 * Double as long as the resulting node's ratio of nonempty
495 * children to all children is above the threshold.
496 */
497
498 /*
499 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
500 * the Helsinki University of Technology and Matti Tikkanen of Nokia
501 * Telecommunications, page 6:
502 * "A node is doubled if the ratio of non-empty children to all
503 * children in the *doubled* node is at least 'high'."
504 *
505 * 'high' in this instance is the variable 'inflate_threshold'. It
506 * is expressed as a percentage, so we multiply it with
507 * tnode_child_length() and instead of multiplying by 2 (since the
508 * child array will be doubled by inflate()) and multiplying
509 * the left-hand side by 100 (to handle the percentage thing) we
510 * multiply the left-hand side by 50.
511 *
512 * The left-hand side may look a bit weird: tnode_child_length(tn)
513 * - tn->empty_children is of course the number of non-null children
514 * in the current node. tn->full_children is the number of "full"
515 * children, that is non-null tnodes with a skip value of 0.
516 * All of those will be doubled in the resulting inflated tnode, so
517 * we just count them one extra time here.
518 *
519 * A clearer way to write this would be:
520 *
521 * to_be_doubled = tn->full_children;
522 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
523 * tn->full_children;
524 *
525 * new_child_length = tnode_child_length(tn) * 2;
526 *
527 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
528 * new_child_length;
529 * if (new_fill_factor >= inflate_threshold)
530 *
531 * ...and so on, though it would mess up the while () loop.
532 *
533 * anyway,
534 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
535 * inflate_threshold
536 *
537 * avoid a division:
538 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
539 * inflate_threshold * new_child_length
540 *
541 * expand not_to_be_doubled and to_be_doubled, and shorten:
542 * 100 * (tnode_child_length(tn) - tn->empty_children +
543 * tn->full_children) >= inflate_threshold * new_child_length
544 *
545 * expand new_child_length:
546 * 100 * (tnode_child_length(tn) - tn->empty_children +
547 * tn->full_children) >=
548 * inflate_threshold * tnode_child_length(tn) * 2
549 *
550 * shorten again:
551 * 50 * (tn->full_children + tnode_child_length(tn) -
552 * tn->empty_children) >= inflate_threshold *
553 * tnode_child_length(tn)
554 *
555 */
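/* A minimal sketch of the check this derivation arrives at
 * (hypothetical helper, not part of the kernel code):
 *
 *	static bool should_inflate(const struct tnode *tn, int threshold)
 *	{
 *		unsigned long used = tnode_child_length(tn) -
 *				     tn->empty_children;
 *
 *		// full children count twice: inflate() doubles them;
 *		// 50 = 100 (percent) / 2 (doubled child array length)
 *		return 50 * (used + tn->full_children) >=
 *		       threshold * tnode_child_length(tn);
 *	}
 */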
556
557 /* Keep root node larger */
558
559 if (!node_parent(tn)) {
560 inflate_threshold_use = inflate_threshold_root;
561 halve_threshold_use = halve_threshold_root;
562 } else {
563 inflate_threshold_use = inflate_threshold;
564 halve_threshold_use = halve_threshold;
565 }
566
567 max_work = MAX_WORK;
568 while ((tn->full_children > 0 && max_work-- &&
569 50 * (tn->full_children + tnode_child_length(tn)
570 - tn->empty_children)
571 >= inflate_threshold_use * tnode_child_length(tn))) {
572
573 old_tn = tn;
574 tn = inflate(t, tn);
575
576 if (IS_ERR(tn)) {
577 tn = old_tn;
578 #ifdef CONFIG_IP_FIB_TRIE_STATS
579 this_cpu_inc(t->stats->resize_node_skipped);
580 #endif
581 break;
582 }
583 }
584
585 /* Return if at least one inflate has run */
586 if (max_work != MAX_WORK)
587 return tn;
588
589 /*
590 * Halve as long as the number of empty children in this
591 * node is above threshold.
592 */
593
594 max_work = MAX_WORK;
595 while (tn->bits > 1 && max_work-- &&
596 100 * (tnode_child_length(tn) - tn->empty_children) <
597 halve_threshold_use * tnode_child_length(tn)) {
598
599 old_tn = tn;
600 tn = halve(t, tn);
601 if (IS_ERR(tn)) {
602 tn = old_tn;
603 #ifdef CONFIG_IP_FIB_TRIE_STATS
604 this_cpu_inc(t->stats->resize_node_skipped);
605 #endif
606 break;
607 }
608 }
609
610
611 /* Only one child remains */
612 if (tn->empty_children == (tnode_child_length(tn) - 1)) {
613 unsigned long i;
614 one_child:
615 for (i = tnode_child_length(tn); !n && i;)
616 n = tnode_get_child(tn, --i);
617 no_children:
618 /* compress one level */
619 node_set_parent(n, NULL);
620 tnode_free_safe(tn);
621 return n;
622 }
623 return tn;
624 }
625
626
627 static void tnode_clean_free(struct tnode *tn)
628 {
629 struct tnode *tofree;
630 int i;
631
632 for (i = 0; i < tnode_child_length(tn); i++) {
633 tofree = rtnl_dereference(tn->child[i]);
634 if (tofree)
635 node_free(tofree);
636 }
637 node_free(tn);
638 }
639
640 static struct tnode *inflate(struct trie *t, struct tnode *oldtnode)
641 {
642 int olen = tnode_child_length(oldtnode);
643 struct tnode *tn;
644 int i;
645
646 pr_debug("In inflate\n");
647
648 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
649
650 if (!tn)
651 return ERR_PTR(-ENOMEM);
652
653 /*
654 * Preallocate and store tnodes before the actual work so we
655 * don't get into an inconsistent state if memory allocation
656 * fails. In case of failure we return the old node and the
657 * inflate of the tnode is abandoned.
658 */
659
660 for (i = 0; i < olen; i++) {
661 struct tnode *inode;
662
663 inode = tnode_get_child(oldtnode, i);
664 if (tnode_full(oldtnode, inode) && inode->bits > 1) {
665 struct tnode *left, *right;
666 t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
667
668 left = tnode_new(inode->key&(~m), inode->pos + 1,
669 inode->bits - 1);
670 if (!left)
671 goto nomem;
672
673 right = tnode_new(inode->key|m, inode->pos + 1,
674 inode->bits - 1);
675
676 if (!right) {
677 node_free(left);
678 goto nomem;
679 }
680
681 put_child(tn, 2*i, left);
682 put_child(tn, 2*i+1, right);
683 }
684 }
685
686 for (i = 0; i < olen; i++) {
687 struct tnode *inode = tnode_get_child(oldtnode, i);
688 struct tnode *left, *right;
689 int size, j;
690
691 /* An empty child */
692 if (inode == NULL)
693 continue;
694
695 /* A leaf or an internal node with skipped bits */
696 if (!tnode_full(oldtnode, inode)) {
697 put_child(tn,
698 tkey_extract_bits(inode->key, tn->pos, tn->bits),
699 inode);
700 continue;
701 }
702
703 /* An internal node with two children */
704 if (inode->bits == 1) {
705 put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
706 put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
707
708 tnode_free_safe(inode);
709 continue;
710 }
711
712 /* An internal node with more than two children */
713
714 /* We will replace this node 'inode' with two new
715 * ones, 'left' and 'right', each with half of the
716 * original children. The two new nodes will have
717 * a position one bit further down the key and this
718 * means that the "significant" part of their keys
719 * (see the discussion near the top of this file)
720 * will differ by one bit, which will be "0" in
721 * left's key and "1" in right's key. Since we are
722 * moving the key position by one step, the bit that
723 * we are moving away from - the bit at position
724 * (inode->pos) - is the one that will differ between
725 * left and right. So... we synthesize that bit in the
726 * two new keys.
727 * The mask 'm' below will be a single "one" bit at
728 * the position (inode->pos)
729 */
730
731 /* Use the old key, but set the new significant
732 * bit to zero.
733 */
734
735 left = tnode_get_child(tn, 2*i);
736 put_child(tn, 2*i, NULL);
737
738 BUG_ON(!left);
739
740 right = tnode_get_child(tn, 2*i+1);
741 put_child(tn, 2*i+1, NULL);
742
743 BUG_ON(!right);
744
745 size = tnode_child_length(left);
746 for (j = 0; j < size; j++) {
747 put_child(left, j, rtnl_dereference(inode->child[j]));
748 put_child(right, j, rtnl_dereference(inode->child[j + size]));
749 }
750 put_child(tn, 2*i, resize(t, left));
751 put_child(tn, 2*i+1, resize(t, right));
752
753 tnode_free_safe(inode);
754 }
755 tnode_free_safe(oldtnode);
756 return tn;
757 nomem:
758 tnode_clean_free(tn);
759 return ERR_PTR(-ENOMEM);
760 }
761
762 static struct tnode *halve(struct trie *t, struct tnode *oldtnode)
763 {
764 int olen = tnode_child_length(oldtnode);
765 struct tnode *tn, *left, *right;
766 int i;
767
768 pr_debug("In halve\n");
769
770 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
771
772 if (!tn)
773 return ERR_PTR(-ENOMEM);
774
775 /*
776 * Preallocate and store tnodes before the actual work so we
777 * don't get into an inconsistent state if memory allocation
778 * fails. In case of failure we return the old node and the
779 * halve of the tnode is abandoned.
780 */
781
782 for (i = 0; i < olen; i += 2) {
783 left = tnode_get_child(oldtnode, i);
784 right = tnode_get_child(oldtnode, i+1);
785
786 /* Two nonempty children */
787 if (left && right) {
788 struct tnode *newn;
789
790 newn = tnode_new(left->key, tn->pos + tn->bits, 1);
791
792 if (!newn)
793 goto nomem;
794
795 put_child(tn, i/2, newn);
796 }
797
798 }
799
800 for (i = 0; i < olen; i += 2) {
801 struct tnode *newBinNode;
802
803 left = tnode_get_child(oldtnode, i);
804 right = tnode_get_child(oldtnode, i+1);
805
806 /* At least one of the children is empty */
807 if (left == NULL) {
808 if (right == NULL) /* Both are empty */
809 continue;
810 put_child(tn, i/2, right);
811 continue;
812 }
813
814 if (right == NULL) {
815 put_child(tn, i/2, left);
816 continue;
817 }
818
819 /* Two nonempty children */
820 newBinNode = tnode_get_child(tn, i/2);
821 put_child(tn, i/2, NULL);
822 put_child(newBinNode, 0, left);
823 put_child(newBinNode, 1, right);
824 put_child(tn, i/2, resize(t, newBinNode));
825 }
826 tnode_free_safe(oldtnode);
827 return tn;
828 nomem:
829 tnode_clean_free(tn);
830 return ERR_PTR(-ENOMEM);
831 }
832
833 /* The read side must use rcu_read_lock(); currently the dump routines
834 do so, via get_fa_head and dump */
835
836 static struct leaf_info *find_leaf_info(struct tnode *l, int plen)
837 {
838 struct hlist_head *head = &l->list;
839 struct leaf_info *li;
840
841 hlist_for_each_entry_rcu(li, head, hlist)
842 if (li->plen == plen)
843 return li;
844
845 return NULL;
846 }
847
848 static inline struct list_head *get_fa_head(struct tnode *l, int plen)
849 {
850 struct leaf_info *li = find_leaf_info(l, plen);
851
852 if (!li)
853 return NULL;
854
855 return &li->falh;
856 }
857
858 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
859 {
860 struct leaf_info *li = NULL, *last = NULL;
861
862 if (hlist_empty(head)) {
863 hlist_add_head_rcu(&new->hlist, head);
864 } else {
865 hlist_for_each_entry(li, head, hlist) {
866 if (new->plen > li->plen)
867 break;
868
869 last = li;
870 }
871 if (last)
872 hlist_add_behind_rcu(&new->hlist, &last->hlist);
873 else
874 hlist_add_before_rcu(&new->hlist, &li->hlist);
875 }
876 }
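/* The hlist above is kept sorted by descending plen; e.g. inserting
 * leaf_info entries for /24, /16 and then /28 (hypothetical prefixes)
 * yields the order /28, /24, /16, so check_leaf() visits the longest
 * prefix first.
 */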
877
878 /* rcu_read_lock needs to be held by the caller on the read side */
879 static struct tnode *fib_find_node(struct trie *t, u32 key)
880 {
881 struct tnode *n = rcu_dereference_rtnl(t->trie);
882
883 while (n) {
884 unsigned long index = get_index(key, n);
885
886 /* This bit of code is a bit tricky but it combines multiple
887 * checks into a single check. The prefix consists of the
888 * prefix plus zeros for the bits in the cindex. The index
889 * is the difference between the key and this value. From
890 * this we can actually derive several pieces of data.
891 * if !(index >> bits)
892 * we know the value is cindex
893 * else
894 * we have a mismatch in skip bits and failed
895 */
896 if (index >> n->bits)
897 return NULL;
898
899 /* we have found a leaf. Prefixes have already been compared */
900 if (IS_LEAF(n))
901 break;
902
903 n = rcu_dereference_rtnl(n->child[index]);
904 }
905
906 return n;
907 }
908
909 static void trie_rebalance(struct trie *t, struct tnode *tn)
910 {
911 int wasfull;
912 t_key cindex, key;
913 struct tnode *tp;
914
915 key = tn->key;
916
917 while (tn != NULL && (tp = node_parent(tn)) != NULL) {
918 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
919 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
920 tn = resize(t, tn);
921
922 tnode_put_child_reorg(tp, cindex, tn, wasfull);
923
924 tp = node_parent(tn);
925 if (!tp)
926 rcu_assign_pointer(t->trie, tn);
927
928 tnode_free_flush();
929 if (!tp)
930 break;
931 tn = tp;
932 }
933
934 /* Handle last (top) tnode */
935 if (IS_TNODE(tn))
936 tn = resize(t, tn);
937
938 rcu_assign_pointer(t->trie, tn);
939 tnode_free_flush();
940 }
941
942 /* only used from updater-side */
943
944 static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
945 {
946 struct list_head *fa_head = NULL;
947 struct tnode *l, *n, *tp = NULL;
948 struct leaf_info *li;
949
950 li = leaf_info_new(plen);
951 if (!li)
952 return NULL;
953 fa_head = &li->falh;
954
955 n = rtnl_dereference(t->trie);
956
957 /* If we point to NULL, stop. Either the tree is empty and we should
958 * just put a new leaf in it, or we have reached an empty child slot,
959 * and we should just put our new leaf in that.
960 *
961 * If we hit a node with a key that doesn't match then we should stop
962 * and create a new tnode to replace that node and insert ourselves
963 * and the other node into the new tnode.
964 */
965 while (n) {
966 unsigned long index = get_index(key, n);
967
968 /* This bit of code is a bit tricky but it combines multiple
969 * checks into a single check. The prefix consists of the
970 * prefix plus zeros for the "bits" in the prefix. The index
971 * is the difference between the key and this value. From
972 * this we can actually derive several pieces of data.
973 * if !(index >> bits)
974 * we know the value is child index
975 * else
976 * we have a mismatch in skip bits and failed
977 */
978 if (index >> n->bits)
979 break;
980
981 /* we have found a leaf. Prefixes have already been compared */
982 if (IS_LEAF(n)) {
983 /* Case 1: n is a leaf, and prefixes match */
984 insert_leaf_info(&n->list, li);
985 return fa_head;
986 }
987
988 tp = n;
989 n = rcu_dereference_rtnl(n->child[index]);
990 }
991
992 l = leaf_new(key);
993 if (!l) {
994 free_leaf_info(li);
995 return NULL;
996 }
997
998 insert_leaf_info(&l->list, li);
999
1000 /* Case 2: n is a LEAF or a TNODE and the key doesn't match.
1001 *
1002 * Add a new tnode here; the first tnode needs some special
1003 * handling, and this leaves us in position to handle the
1004 * remaining insertion as case 3
1005 */
1006 if (n) {
1007 struct tnode *tn;
1008 int newpos;
1009
1010 newpos = KEYLENGTH - __fls(n->key ^ key) - 1;
1011
1012 tn = tnode_new(key, newpos, 1);
1013 if (!tn) {
1014 free_leaf_info(li);
1015 node_free(l);
1016 return NULL;
1017 }
1018
1019 /* initialize routes out of node */
1020 NODE_INIT_PARENT(tn, tp);
1021 put_child(tn, get_index(key, tn) ^ 1, n);
1022
1023 /* start adding routes into the node */
1024 put_child_root(tp, t, key, tn);
1025 node_set_parent(n, tn);
1026
1027 /* parent now has a NULL spot where the leaf can go */
1028 tp = tn;
1029 }
1030
1031 /* Case 3: n is NULL, and we will just insert a new leaf */
1032 if (tp) {
1033 NODE_INIT_PARENT(l, tp);
1034 put_child(tp, get_index(key, tp), l);
1035 trie_rebalance(t, tp);
1036 } else {
1037 rcu_assign_pointer(t->trie, l);
1038 }
1039
1040 return fa_head;
1041 }
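/* Illustrative insertion trace (hypothetical addresses): inserting
 * 10.0.0.0/8 into an empty trie stores a single leaf (case 3 with
 * tp == NULL). Inserting 192.0.0.0/8 afterwards misses at that leaf
 * (index >> n->bits != 0), so case 2 creates a 1-bit tnode at the
 * first differing bit (here newpos == 0, since 10.x and 192.x already
 * differ in the MSB) and hangs both leaves off it.
 */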
1042
1043 /*
1044 * Caller must hold RTNL.
1045 */
1046 int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1047 {
1048 struct trie *t = (struct trie *) tb->tb_data;
1049 struct fib_alias *fa, *new_fa;
1050 struct list_head *fa_head = NULL;
1051 struct fib_info *fi;
1052 int plen = cfg->fc_dst_len;
1053 u8 tos = cfg->fc_tos;
1054 u32 key, mask;
1055 int err;
1056 struct tnode *l;
1057
1058 if (plen > 32)
1059 return -EINVAL;
1060
1061 key = ntohl(cfg->fc_dst);
1062
1063 pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1064
1065 mask = ntohl(inet_make_mask(plen));
1066
1067 if (key & ~mask)
1068 return -EINVAL;
1069
1070 key = key & mask;
1071
1072 fi = fib_create_info(cfg);
1073 if (IS_ERR(fi)) {
1074 err = PTR_ERR(fi);
1075 goto err;
1076 }
1077
1078 l = fib_find_node(t, key);
1079 fa = NULL;
1080
1081 if (l) {
1082 fa_head = get_fa_head(l, plen);
1083 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1084 }
1085
1086 /* Now fa, if non-NULL, points to the first fib alias
1087 * with the same keys [prefix,tos,priority], if such key already
1088 * exists, or to the node before which we will insert the new one.
1089 *
1090 * If fa is NULL, we will need to allocate a new one and
1091 * insert it at the head of the alias list.
1092 *
1093 * If fa_head is NULL, no fib node matched the destination key
1094 * and we need to allocate a new one of those as well.
1095 */
1096
1097 if (fa && fa->fa_tos == tos &&
1098 fa->fa_info->fib_priority == fi->fib_priority) {
1099 struct fib_alias *fa_first, *fa_match;
1100
1101 err = -EEXIST;
1102 if (cfg->fc_nlflags & NLM_F_EXCL)
1103 goto out;
1104
1105 /* We have 2 goals:
1106 * 1. Find exact match for type, scope, fib_info to avoid
1107 * duplicate routes
1108 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1109 */
1110 fa_match = NULL;
1111 fa_first = fa;
1112 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1113 list_for_each_entry_continue(fa, fa_head, fa_list) {
1114 if (fa->fa_tos != tos)
1115 break;
1116 if (fa->fa_info->fib_priority != fi->fib_priority)
1117 break;
1118 if (fa->fa_type == cfg->fc_type &&
1119 fa->fa_info == fi) {
1120 fa_match = fa;
1121 break;
1122 }
1123 }
1124
1125 if (cfg->fc_nlflags & NLM_F_REPLACE) {
1126 struct fib_info *fi_drop;
1127 u8 state;
1128
1129 fa = fa_first;
1130 if (fa_match) {
1131 if (fa == fa_match)
1132 err = 0;
1133 goto out;
1134 }
1135 err = -ENOBUFS;
1136 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1137 if (new_fa == NULL)
1138 goto out;
1139
1140 fi_drop = fa->fa_info;
1141 new_fa->fa_tos = fa->fa_tos;
1142 new_fa->fa_info = fi;
1143 new_fa->fa_type = cfg->fc_type;
1144 state = fa->fa_state;
1145 new_fa->fa_state = state & ~FA_S_ACCESSED;
1146
1147 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1148 alias_free_mem_rcu(fa);
1149
1150 fib_release_info(fi_drop);
1151 if (state & FA_S_ACCESSED)
1152 rt_cache_flush(cfg->fc_nlinfo.nl_net);
1153 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1154 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1155
1156 goto succeeded;
1157 }
1158 /* Error if we find a perfect match which
1159 * uses the same scope, type, and nexthop
1160 * information.
1161 */
1162 if (fa_match)
1163 goto out;
1164
1165 if (!(cfg->fc_nlflags & NLM_F_APPEND))
1166 fa = fa_first;
1167 }
1168 err = -ENOENT;
1169 if (!(cfg->fc_nlflags & NLM_F_CREATE))
1170 goto out;
1171
1172 err = -ENOBUFS;
1173 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1174 if (new_fa == NULL)
1175 goto out;
1176
1177 new_fa->fa_info = fi;
1178 new_fa->fa_tos = tos;
1179 new_fa->fa_type = cfg->fc_type;
1180 new_fa->fa_state = 0;
1181 /*
1182 * Insert new entry to the list.
1183 */
1184
1185 if (!fa_head) {
1186 fa_head = fib_insert_node(t, key, plen);
1187 if (unlikely(!fa_head)) {
1188 err = -ENOMEM;
1189 goto out_free_new_fa;
1190 }
1191 }
1192
1193 if (!plen)
1194 tb->tb_num_default++;
1195
1196 list_add_tail_rcu(&new_fa->fa_list,
1197 (fa ? &fa->fa_list : fa_head));
1198
1199 rt_cache_flush(cfg->fc_nlinfo.nl_net);
1200 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1201 &cfg->fc_nlinfo, 0);
1202 succeeded:
1203 return 0;
1204
1205 out_free_new_fa:
1206 kmem_cache_free(fn_alias_kmem, new_fa);
1207 out:
1208 fib_release_info(fi);
1209 err:
1210 return err;
1211 }
1212
1213 /* should be called with rcu_read_lock */
1214 static int check_leaf(struct fib_table *tb, struct trie *t, struct tnode *l,
1215 t_key key, const struct flowi4 *flp,
1216 struct fib_result *res, int fib_flags)
1217 {
1218 struct leaf_info *li;
1219 struct hlist_head *hhead = &l->list;
1220
1221 hlist_for_each_entry_rcu(li, hhead, hlist) {
1222 struct fib_alias *fa;
1223
1224 if (l->key != (key & li->mask_plen))
1225 continue;
1226
1227 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1228 struct fib_info *fi = fa->fa_info;
1229 int nhsel, err;
1230
1231 if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
1232 continue;
1233 if (fi->fib_dead)
1234 continue;
1235 if (fa->fa_info->fib_scope < flp->flowi4_scope)
1236 continue;
1237 fib_alias_accessed(fa);
1238 err = fib_props[fa->fa_type].error;
1239 if (unlikely(err < 0)) {
1240 #ifdef CONFIG_IP_FIB_TRIE_STATS
1241 this_cpu_inc(t->stats->semantic_match_passed);
1242 #endif
1243 return err;
1244 }
1245 if (fi->fib_flags & RTNH_F_DEAD)
1246 continue;
1247 for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
1248 const struct fib_nh *nh = &fi->fib_nh[nhsel];
1249
1250 if (nh->nh_flags & RTNH_F_DEAD)
1251 continue;
1252 if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
1253 continue;
1254
1255 #ifdef CONFIG_IP_FIB_TRIE_STATS
1256 this_cpu_inc(t->stats->semantic_match_passed);
1257 #endif
1258 res->prefixlen = li->plen;
1259 res->nh_sel = nhsel;
1260 res->type = fa->fa_type;
1261 res->scope = fi->fib_scope;
1262 res->fi = fi;
1263 res->table = tb;
1264 res->fa_head = &li->falh;
1265 if (!(fib_flags & FIB_LOOKUP_NOREF))
1266 atomic_inc(&fi->fib_clntref);
1267 return 0;
1268 }
1269 }
1270
1271 #ifdef CONFIG_IP_FIB_TRIE_STATS
1272 this_cpu_inc(t->stats->semantic_match_miss);
1273 #endif
1274 }
1275
1276 return 1;
1277 }
1278
1279 static inline t_key prefix_mismatch(t_key key, struct tnode *n)
1280 {
1281 t_key prefix = n->key;
1282
1283 return (key ^ prefix) & (prefix | -prefix);
1284 }
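/* Worked example (hypothetical values): (prefix | -prefix) is a mask
 * covering the least significant set bit of the prefix and everything
 * above it, i.e. the bits that are significant for a node key whose
 * trailing bits are zeroed:
 *
 *	prefix           = 0xc0a80000 (192.168.0.0)
 *	prefix | -prefix = 0xfff80000
 *
 * key 0xc0a80101: (key ^ prefix) & mask == 0          -> no mismatch
 * key 0xc0b80101: (key ^ prefix) & mask == 0x00100000 -> mismatch
 */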
1285
1286 int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
1287 struct fib_result *res, int fib_flags)
1288 {
1289 struct trie *t = (struct trie *)tb->tb_data;
1290 #ifdef CONFIG_IP_FIB_TRIE_STATS
1291 struct trie_use_stats __percpu *stats = t->stats;
1292 #endif
1293 const t_key key = ntohl(flp->daddr);
1294 struct tnode *n, *pn;
1295 t_key cindex;
1296 int ret = 1;
1297
1298 rcu_read_lock();
1299
1300 n = rcu_dereference(t->trie);
1301 if (!n)
1302 goto failed;
1303
1304 #ifdef CONFIG_IP_FIB_TRIE_STATS
1305 this_cpu_inc(stats->gets);
1306 #endif
1307
1308 pn = n;
1309 cindex = 0;
1310
1311 /* Step 1: Travel to the longest prefix match in the trie */
1312 for (;;) {
1313 unsigned long index = get_index(key, n);
1314
1315 /* This bit of code is a bit tricky but it combines multiple
1316 * checks into a single check. The prefix consists of the
1317 * prefix plus zeros for the "bits" in the prefix. The index
1318 * is the difference between the key and this value. From
1319 * this we can actually derive several pieces of data.
1320 * if !(index >> bits)
1321 * we know the value is child index
1322 * else
1323 * we have a mismatch in skip bits and failed
1324 */
1325 if (index >> n->bits)
1326 break;
1327
1328 /* we have found a leaf. Prefixes have already been compared */
1329 if (IS_LEAF(n))
1330 goto found;
1331
1332 /* only record pn and cindex if we are going to be chopping
1333 * bits later. Otherwise we are just wasting cycles.
1334 */
1335 if (index) {
1336 pn = n;
1337 cindex = index;
1338 }
1339
1340 n = rcu_dereference(n->child[index]);
1341 if (unlikely(!n))
1342 goto backtrace;
1343 }
1344
1345 /* Step 2: Sort out leaves and begin backtracing for longest prefix */
1346 for (;;) {
1347 /* record the pointer where our next node pointer is stored */
1348 struct tnode __rcu **cptr = n->child;
1349
1350 /* This test verifies that none of the bits that differ
1351 * between the key and the prefix exist in the region of
1352 * the lsb and higher in the prefix.
1353 */
1354 if (unlikely(prefix_mismatch(key, n)))
1355 goto backtrace;
1356
1357 /* exit out and process leaf */
1358 if (unlikely(IS_LEAF(n)))
1359 break;
1360
1361 /* Don't bother recording parent info. Since we are in
1362 * prefix match mode we will have to come back to wherever
1363 * we started this traversal anyway
1364 */
1365
1366 while ((n = rcu_dereference(*cptr)) == NULL) {
1367 backtrace:
1368 #ifdef CONFIG_IP_FIB_TRIE_STATS
1369 if (!n)
1370 this_cpu_inc(stats->null_node_hit);
1371 #endif
1372 /* If we are at cindex 0 there are no more bits for
1373 * us to strip at this level so we must ascend back
1374 * up one level to see if there are any more bits to
1375 * be stripped there.
1376 */
1377 while (!cindex) {
1378 t_key pkey = pn->key;
1379
1380 pn = node_parent_rcu(pn);
1381 if (unlikely(!pn))
1382 goto failed;
1383 #ifdef CONFIG_IP_FIB_TRIE_STATS
1384 this_cpu_inc(stats->backtrack);
1385 #endif
1386 /* Get Child's index */
1387 cindex = get_index(pkey, pn);
1388 }
1389
1390 /* strip the least significant bit from the cindex */
1391 cindex &= cindex - 1;
1392
1393 /* grab pointer for next child node */
1394 cptr = &pn->child[cindex];
1395 }
1396 }
1397
1398 found:
1399 /* Step 3: Process the leaf, if that fails fall back to backtracing */
1400 ret = check_leaf(tb, t, n, key, flp, res, fib_flags);
1401 if (unlikely(ret > 0))
1402 goto backtrace;
1403 failed:
1404 rcu_read_unlock();
1405 return ret;
1406 }
1407 EXPORT_SYMBOL_GPL(fib_table_lookup);
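/* A note on the backtracking in fib_table_lookup() (hypothetical
 * example): cindex &= cindex - 1 clears the least significant set bit
 * of the child index, yielding the next shorter candidate prefix at
 * the current level. With cindex == 0b0110 the lookup retries 0b0100,
 * then 0b0000, and only once cindex is exhausted does it ascend to the
 * parent via node_parent_rcu().
 */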
1408
1409 /*
1410 * Remove the leaf and return parent.
1411 */
1412 static void trie_leaf_remove(struct trie *t, struct tnode *l)
1413 {
1414 struct tnode *tp = node_parent(l);
1415
1416 pr_debug("entering trie_leaf_remove(%p)\n", l);
1417
1418 if (tp) {
1419 put_child(tp, get_index(l->key, tp), NULL);
1420 trie_rebalance(t, tp);
1421 } else {
1422 RCU_INIT_POINTER(t->trie, NULL);
1423 }
1424
1425 node_free(l);
1426 }
1427
1428 /*
1429 * Caller must hold RTNL.
1430 */
1431 int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1432 {
1433 struct trie *t = (struct trie *) tb->tb_data;
1434 u32 key, mask;
1435 int plen = cfg->fc_dst_len;
1436 u8 tos = cfg->fc_tos;
1437 struct fib_alias *fa, *fa_to_delete;
1438 struct list_head *fa_head;
1439 struct tnode *l;
1440 struct leaf_info *li;
1441
1442 if (plen > 32)
1443 return -EINVAL;
1444
1445 key = ntohl(cfg->fc_dst);
1446 mask = ntohl(inet_make_mask(plen));
1447
1448 if (key & ~mask)
1449 return -EINVAL;
1450
1451 key = key & mask;
1452 l = fib_find_node(t, key);
1453
1454 if (!l)
1455 return -ESRCH;
1456
1457 li = find_leaf_info(l, plen);
1458
1459 if (!li)
1460 return -ESRCH;
1461
1462 fa_head = &li->falh;
1463 fa = fib_find_alias(fa_head, tos, 0);
1464
1465 if (!fa)
1466 return -ESRCH;
1467
1468 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1469
1470 fa_to_delete = NULL;
1471 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1472 list_for_each_entry_continue(fa, fa_head, fa_list) {
1473 struct fib_info *fi = fa->fa_info;
1474
1475 if (fa->fa_tos != tos)
1476 break;
1477
1478 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1479 (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1480 fa->fa_info->fib_scope == cfg->fc_scope) &&
1481 (!cfg->fc_prefsrc ||
1482 fi->fib_prefsrc == cfg->fc_prefsrc) &&
1483 (!cfg->fc_protocol ||
1484 fi->fib_protocol == cfg->fc_protocol) &&
1485 fib_nh_match(cfg, fi) == 0) {
1486 fa_to_delete = fa;
1487 break;
1488 }
1489 }
1490
1491 if (!fa_to_delete)
1492 return -ESRCH;
1493
1494 fa = fa_to_delete;
1495 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1496 &cfg->fc_nlinfo, 0);
1497
1498 list_del_rcu(&fa->fa_list);
1499
1500 if (!plen)
1501 tb->tb_num_default--;
1502
1503 if (list_empty(fa_head)) {
1504 hlist_del_rcu(&li->hlist);
1505 free_leaf_info(li);
1506 }
1507
1508 if (hlist_empty(&l->list))
1509 trie_leaf_remove(t, l);
1510
1511 if (fa->fa_state & FA_S_ACCESSED)
1512 rt_cache_flush(cfg->fc_nlinfo.nl_net);
1513
1514 fib_release_info(fa->fa_info);
1515 alias_free_mem_rcu(fa);
1516 return 0;
1517 }
1518
1519 static int trie_flush_list(struct list_head *head)
1520 {
1521 struct fib_alias *fa, *fa_node;
1522 int found = 0;
1523
1524 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1525 struct fib_info *fi = fa->fa_info;
1526
1527 if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1528 list_del_rcu(&fa->fa_list);
1529 fib_release_info(fa->fa_info);
1530 alias_free_mem_rcu(fa);
1531 found++;
1532 }
1533 }
1534 return found;
1535 }
1536
1537 static int trie_flush_leaf(struct tnode *l)
1538 {
1539 int found = 0;
1540 struct hlist_head *lih = &l->list;
1541 struct hlist_node *tmp;
1542 struct leaf_info *li = NULL;
1543
1544 hlist_for_each_entry_safe(li, tmp, lih, hlist) {
1545 found += trie_flush_list(&li->falh);
1546
1547 if (list_empty(&li->falh)) {
1548 hlist_del_rcu(&li->hlist);
1549 free_leaf_info(li);
1550 }
1551 }
1552 return found;
1553 }
1554
1555 /*
1556 * Scan for the next right leaf starting at node p->child[idx]
1557 * Since we have a back pointer, no recursion is necessary.
1558 */
1559 static struct tnode *leaf_walk_rcu(struct tnode *p, struct tnode *c)
1560 {
1561 do {
1562 t_key idx;
1563
1564 if (c)
1565 idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1566 else
1567 idx = 0;
1568
1569 while (idx < 1u << p->bits) {
1570 c = tnode_get_child_rcu(p, idx++);
1571 if (!c)
1572 continue;
1573
1574 if (IS_LEAF(c))
1575 return c;
1576
1577 /* Descend and restart the scan in the new node */
1578 p = c;
1579 idx = 0;
1580 }
1581
1582 /* Node empty, walk back up to parent */
1583 c = p;
1584 } while ((p = node_parent_rcu(c)) != NULL);
1585
1586 return NULL; /* Root of trie */
1587 }
1588
1589 static struct tnode *trie_firstleaf(struct trie *t)
1590 {
1591 struct tnode *n = rcu_dereference_rtnl(t->trie);
1592
1593 if (!n)
1594 return NULL;
1595
1596 if (IS_LEAF(n)) /* trie is just a leaf */
1597 return n;
1598
1599 return leaf_walk_rcu(n, NULL);
1600 }
1601
1602 static struct tnode *trie_nextleaf(struct tnode *l)
1603 {
1604 struct tnode *p = node_parent_rcu(l);
1605
1606 if (!p)
1607 return NULL; /* trie with just one leaf */
1608
1609 return leaf_walk_rcu(p, l);
1610 }
1611
1612 static struct tnode *trie_leafindex(struct trie *t, int index)
1613 {
1614 struct tnode *l = trie_firstleaf(t);
1615
1616 while (l && index-- > 0)
1617 l = trie_nextleaf(l);
1618
1619 return l;
1620 }
1621
1622
1623 /*
1624 * Caller must hold RTNL.
1625 */
1626 int fib_table_flush(struct fib_table *tb)
1627 {
1628 struct trie *t = (struct trie *) tb->tb_data;
1629 struct tnode *l, *ll = NULL;
1630 int found = 0;
1631
1632 for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1633 found += trie_flush_leaf(l);
1634
1635 if (ll && hlist_empty(&ll->list))
1636 trie_leaf_remove(t, ll);
1637 ll = l;
1638 }
1639
1640 if (ll && hlist_empty(&ll->list))
1641 trie_leaf_remove(t, ll);
1642
1643 pr_debug("trie_flush found=%d\n", found);
1644 return found;
1645 }
1646
1647 void fib_free_table(struct fib_table *tb)
1648 {
1649 #ifdef CONFIG_IP_FIB_TRIE_STATS
1650 struct trie *t = (struct trie *)tb->tb_data;
1651
1652 free_percpu(t->stats);
1653 #endif /* CONFIG_IP_FIB_TRIE_STATS */
1654 kfree(tb);
1655 }
1656
1657 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1658 struct fib_table *tb,
1659 struct sk_buff *skb, struct netlink_callback *cb)
1660 {
1661 int i, s_i;
1662 struct fib_alias *fa;
1663 __be32 xkey = htonl(key);
1664
1665 s_i = cb->args[5];
1666 i = 0;
1667
1668 /* rcu_read_lock is held by caller */
1669
1670 list_for_each_entry_rcu(fa, fah, fa_list) {
1671 if (i < s_i) {
1672 i++;
1673 continue;
1674 }
1675
1676 if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
1677 cb->nlh->nlmsg_seq,
1678 RTM_NEWROUTE,
1679 tb->tb_id,
1680 fa->fa_type,
1681 xkey,
1682 plen,
1683 fa->fa_tos,
1684 fa->fa_info, NLM_F_MULTI) < 0) {
1685 cb->args[5] = i;
1686 return -1;
1687 }
1688 i++;
1689 }
1690 cb->args[5] = i;
1691 return skb->len;
1692 }
1693
1694 static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb,
1695 struct sk_buff *skb, struct netlink_callback *cb)
1696 {
1697 struct leaf_info *li;
1698 int i, s_i;
1699
1700 s_i = cb->args[4];
1701 i = 0;
1702
1703 /* rcu_read_lock is held by caller */
1704 hlist_for_each_entry_rcu(li, &l->list, hlist) {
1705 if (i < s_i) {
1706 i++;
1707 continue;
1708 }
1709
1710 if (i > s_i)
1711 cb->args[5] = 0;
1712
1713 if (list_empty(&li->falh))
1714 continue;
1715
1716 if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1717 cb->args[4] = i;
1718 return -1;
1719 }
1720 i++;
1721 }
1722
1723 cb->args[4] = i;
1724 return skb->len;
1725 }
1726
1727 int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1728 struct netlink_callback *cb)
1729 {
1730 struct tnode *l;
1731 struct trie *t = (struct trie *) tb->tb_data;
1732 t_key key = cb->args[2];
1733 int count = cb->args[3];
1734
1735 rcu_read_lock();
1736 /* Dump starting at last key.
1737 * Note: 0.0.0.0/0 (i.e. the default route) is the first key.
1738 */
1739 if (count == 0)
1740 l = trie_firstleaf(t);
1741 else {
1742 /* Normally, continue from last key, but if that is missing
1743 * fall back to using a slow rescan
1744 */
1745 l = fib_find_node(t, key);
1746 if (!l)
1747 l = trie_leafindex(t, count);
1748 }
1749
1750 while (l) {
1751 cb->args[2] = l->key;
1752 if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1753 cb->args[3] = count;
1754 rcu_read_unlock();
1755 return -1;
1756 }
1757
1758 ++count;
1759 l = trie_nextleaf(l);
1760 memset(&cb->args[4], 0,
1761 sizeof(cb->args) - 4*sizeof(cb->args[0]));
1762 }
1763 cb->args[3] = count;
1764 rcu_read_unlock();
1765
1766 return skb->len;
1767 }
1768
1769 void __init fib_trie_init(void)
1770 {
1771 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1772 sizeof(struct fib_alias),
1773 0, SLAB_PANIC, NULL);
1774
1775 trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
1776 max(sizeof(struct tnode),
1777 sizeof(struct leaf_info)),
1778 0, SLAB_PANIC, NULL);
1779 }
1780
1781
1782 struct fib_table *fib_trie_table(u32 id)
1783 {
1784 struct fib_table *tb;
1785 struct trie *t;
1786
1787 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1788 GFP_KERNEL);
1789 if (tb == NULL)
1790 return NULL;
1791
1792 tb->tb_id = id;
1793 tb->tb_default = -1;
1794 tb->tb_num_default = 0;
1795
1796 t = (struct trie *) tb->tb_data;
1797 RCU_INIT_POINTER(t->trie, NULL);
1798 #ifdef CONFIG_IP_FIB_TRIE_STATS
1799 t->stats = alloc_percpu(struct trie_use_stats);
1800 if (!t->stats) {
1801 kfree(tb);
1802 tb = NULL;
1803 }
1804 #endif
1805
1806 return tb;
1807 }
1808
1809 #ifdef CONFIG_PROC_FS
1810 /* Depth first Trie walk iterator */
1811 struct fib_trie_iter {
1812 struct seq_net_private p;
1813 struct fib_table *tb;
1814 struct tnode *tnode;
1815 unsigned int index;
1816 unsigned int depth;
1817 };
1818
1819 static struct tnode *fib_trie_get_next(struct fib_trie_iter *iter)
1820 {
1821 struct tnode *tn = iter->tnode;
1822 unsigned int cindex = iter->index;
1823 struct tnode *p;
1824
1825 /* A single entry routing table */
1826 if (!tn)
1827 return NULL;
1828
1829 pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
1830 iter->tnode, iter->index, iter->depth);
1831 rescan:
1832 while (cindex < (1<<tn->bits)) {
1833 struct tnode *n = tnode_get_child_rcu(tn, cindex);
1834
1835 if (n) {
1836 if (IS_LEAF(n)) {
1837 iter->tnode = tn;
1838 iter->index = cindex + 1;
1839 } else {
1840 /* push down one level */
1841 iter->tnode = n;
1842 iter->index = 0;
1843 ++iter->depth;
1844 }
1845 return n;
1846 }
1847
1848 ++cindex;
1849 }
1850
1851 /* Current node exhausted, pop back up */
1852 p = node_parent_rcu(tn);
1853 if (p) {
1854 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
1855 tn = p;
1856 --iter->depth;
1857 goto rescan;
1858 }
1859
1860 /* got root? */
1861 return NULL;
1862 }
1863
1864 static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter,
1865 struct trie *t)
1866 {
1867 struct tnode *n;
1868
1869 if (!t)
1870 return NULL;
1871
1872 n = rcu_dereference(t->trie);
1873 if (!n)
1874 return NULL;
1875
1876 if (IS_TNODE(n)) {
1877 iter->tnode = n;
1878 iter->index = 0;
1879 iter->depth = 1;
1880 } else {
1881 iter->tnode = NULL;
1882 iter->index = 0;
1883 iter->depth = 0;
1884 }
1885
1886 return n;
1887 }
1888
1889 static void trie_collect_stats(struct trie *t, struct trie_stat *s)
1890 {
1891 struct tnode *n;
1892 struct fib_trie_iter iter;
1893
1894 memset(s, 0, sizeof(*s));
1895
1896 rcu_read_lock();
1897 for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
1898 if (IS_LEAF(n)) {
1899 struct leaf_info *li;
1900
1901 s->leaves++;
1902 s->totdepth += iter.depth;
1903 if (iter.depth > s->maxdepth)
1904 s->maxdepth = iter.depth;
1905
1906 hlist_for_each_entry_rcu(li, &n->list, hlist)
1907 ++s->prefixes;
1908 } else {
1909 int i;
1910
1911 s->tnodes++;
1912 if (n->bits < MAX_STAT_DEPTH)
1913 s->nodesizes[n->bits]++;
1914
1915 for (i = 0; i < tnode_child_length(n); i++)
1916 if (!rcu_access_pointer(n->child[i]))
1917 s->nullpointers++;
1918 }
1919 }
1920 rcu_read_unlock();
1921 }
1922
1923 /*
1924 * This outputs /proc/net/fib_triestats
1925 */
1926 static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
1927 {
1928 unsigned int i, max, pointers, bytes, avdepth;
1929
1930 if (stat->leaves)
1931 avdepth = stat->totdepth*100 / stat->leaves;
1932 else
1933 avdepth = 0;
1934
1935 seq_printf(seq, "\tAver depth: %u.%02d\n",
1936 avdepth / 100, avdepth % 100);
1937 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
1938
1939 seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
1940 bytes = sizeof(struct tnode) * stat->leaves;
1941
1942 seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
1943 bytes += sizeof(struct leaf_info) * stat->prefixes;
1944
1945 seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
1946 bytes += sizeof(struct tnode) * stat->tnodes;
1947
1948 max = MAX_STAT_DEPTH;
1949 while (max > 0 && stat->nodesizes[max-1] == 0)
1950 max--;
1951
1952 pointers = 0;
1953 for (i = 1; i < max; i++)
1954 if (stat->nodesizes[i] != 0) {
1955 seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
1956 pointers += (1<<i) * stat->nodesizes[i];
1957 }
1958 seq_putc(seq, '\n');
1959 seq_printf(seq, "\tPointers: %u\n", pointers);
1960
1961 bytes += sizeof(struct tnode *) * pointers;
1962 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
1963 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
1964 }
1965
1966 #ifdef CONFIG_IP_FIB_TRIE_STATS
1967 static void trie_show_usage(struct seq_file *seq,
1968 const struct trie_use_stats __percpu *stats)
1969 {
1970 struct trie_use_stats s = { 0 };
1971 int cpu;
1972
1973 /* loop through all of the CPUs and gather up the stats */
1974 for_each_possible_cpu(cpu) {
1975 const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu);
1976
1977 s.gets += pcpu->gets;
1978 s.backtrack += pcpu->backtrack;
1979 s.semantic_match_passed += pcpu->semantic_match_passed;
1980 s.semantic_match_miss += pcpu->semantic_match_miss;
1981 s.null_node_hit += pcpu->null_node_hit;
1982 s.resize_node_skipped += pcpu->resize_node_skipped;
1983 }
1984
1985 seq_printf(seq, "\nCounters:\n---------\n");
1986 seq_printf(seq, "gets = %u\n", s.gets);
1987 seq_printf(seq, "backtracks = %u\n", s.backtrack);
1988 seq_printf(seq, "semantic match passed = %u\n",
1989 s.semantic_match_passed);
1990 seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss);
1991 seq_printf(seq, "null node hit= %u\n", s.null_node_hit);
1992 seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped);
1993 }
1994 #endif /* CONFIG_IP_FIB_TRIE_STATS */
1995
1996 static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
1997 {
1998 if (tb->tb_id == RT_TABLE_LOCAL)
1999 seq_puts(seq, "Local:\n");
2000 else if (tb->tb_id == RT_TABLE_MAIN)
2001 seq_puts(seq, "Main:\n");
2002 else
2003 seq_printf(seq, "Id %d:\n", tb->tb_id);
2004 }
2005
2006
2007 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2008 {
2009 struct net *net = (struct net *)seq->private;
2010 unsigned int h;
2011
2012 seq_printf(seq,
2013 "Basic info: size of leaf:"
2014 " %Zd bytes, size of tnode: %Zd bytes.\n",
2015 sizeof(struct tnode), sizeof(struct tnode));
2016
2017 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2018 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2019 struct fib_table *tb;
2020
2021 hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2022 struct trie *t = (struct trie *) tb->tb_data;
2023 struct trie_stat stat;
2024
2025 if (!t)
2026 continue;
2027
2028 fib_table_print(seq, tb);
2029
2030 trie_collect_stats(t, &stat);
2031 trie_show_stats(seq, &stat);
2032 #ifdef CONFIG_IP_FIB_TRIE_STATS
2033 trie_show_usage(seq, t->stats);
2034 #endif
2035 }
2036 }
2037
2038 return 0;
2039 }
2040
2041 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2042 {
2043 return single_open_net(inode, file, fib_triestat_seq_show);
2044 }
2045
2046 static const struct file_operations fib_triestat_fops = {
2047 .owner = THIS_MODULE,
2048 .open = fib_triestat_seq_open,
2049 .read = seq_read,
2050 .llseek = seq_lseek,
2051 .release = single_release_net,
2052 };
2053
2054 static struct tnode *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2055 {
2056 struct fib_trie_iter *iter = seq->private;
2057 struct net *net = seq_file_net(seq);
2058 loff_t idx = 0;
2059 unsigned int h;
2060
2061 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2062 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2063 struct fib_table *tb;
2064
2065 hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2066 struct tnode *n;
2067
2068 for (n = fib_trie_get_first(iter,
2069 (struct trie *) tb->tb_data);
2070 n; n = fib_trie_get_next(iter))
2071 if (pos == idx++) {
2072 iter->tb = tb;
2073 return n;
2074 }
2075 }
2076 }
2077
2078 return NULL;
2079 }
2080
2081 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2082 __acquires(RCU)
2083 {
2084 rcu_read_lock();
2085 return fib_trie_get_idx(seq, *pos);
2086 }
2087
2088 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2089 {
2090 struct fib_trie_iter *iter = seq->private;
2091 struct net *net = seq_file_net(seq);
2092 struct fib_table *tb = iter->tb;
2093 struct hlist_node *tb_node;
2094 unsigned int h;
2095 struct tnode *n;
2096
2097 ++*pos;
2098 /* next node in same table */
2099 n = fib_trie_get_next(iter);
2100 if (n)
2101 return n;
2102
2103 /* walk rest of this hash chain */
2104 h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2105 while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
2106 tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2107 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2108 if (n)
2109 goto found;
2110 }
2111
2112 /* new hash chain */
2113 while (++h < FIB_TABLE_HASHSZ) {
2114 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2115 hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2116 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2117 if (n)
2118 goto found;
2119 }
2120 }
2121 return NULL;
2122
2123 found:
2124 iter->tb = tb;
2125 return n;
2126 }
2127
2128 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2129 __releases(RCU)
2130 {
2131 rcu_read_unlock();
2132 }
2133
2134 static void seq_indent(struct seq_file *seq, int n)
2135 {
2136 while (n-- > 0)
2137 seq_puts(seq, " ");
2138 }
2139
2140 static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2141 {
2142 switch (s) {
2143 case RT_SCOPE_UNIVERSE: return "universe";
2144 case RT_SCOPE_SITE: return "site";
2145 case RT_SCOPE_LINK: return "link";
2146 case RT_SCOPE_HOST: return "host";
2147 case RT_SCOPE_NOWHERE: return "nowhere";
2148 default:
2149 snprintf(buf, len, "scope=%d", s);
2150 return buf;
2151 }
2152 }
2153
2154 static const char *const rtn_type_names[__RTN_MAX] = {
2155 [RTN_UNSPEC] = "UNSPEC",
2156 [RTN_UNICAST] = "UNICAST",
2157 [RTN_LOCAL] = "LOCAL",
2158 [RTN_BROADCAST] = "BROADCAST",
2159 [RTN_ANYCAST] = "ANYCAST",
2160 [RTN_MULTICAST] = "MULTICAST",
2161 [RTN_BLACKHOLE] = "BLACKHOLE",
2162 [RTN_UNREACHABLE] = "UNREACHABLE",
2163 [RTN_PROHIBIT] = "PROHIBIT",
2164 [RTN_THROW] = "THROW",
2165 [RTN_NAT] = "NAT",
2166 [RTN_XRESOLVE] = "XRESOLVE",
2167 };
2168
2169 static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2170 {
2171 if (t < __RTN_MAX && rtn_type_names[t])
2172 return rtn_type_names[t];
2173 snprintf(buf, len, "type %u", t);
2174 return buf;
2175 }
2176
2177 /* Pretty print the trie */
2178 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2179 {
2180 const struct fib_trie_iter *iter = seq->private;
2181 struct tnode *n = v;
2182
2183 if (!node_parent_rcu(n))
2184 fib_table_print(seq, iter->tb);
2185
2186 if (IS_TNODE(n)) {
2187 __be32 prf = htonl(n->key);
2188
2189 seq_indent(seq, iter->depth - 1);
2190 seq_printf(seq, " +-- %pI4/%d %d %d %d\n",
2191 &prf, n->pos, n->bits, n->full_children,
2192 n->empty_children);
2193 } else {
2194 struct leaf_info *li;
2195 __be32 val = htonl(n->key);
2196
2197 seq_indent(seq, iter->depth);
2198 seq_printf(seq, " |-- %pI4\n", &val);
2199
2200 hlist_for_each_entry_rcu(li, &n->list, hlist) {
2201 struct fib_alias *fa;
2202
2203 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2204 char buf1[32], buf2[32];
2205
2206 seq_indent(seq, iter->depth+1);
2207 seq_printf(seq, " /%d %s %s", li->plen,
2208 rtn_scope(buf1, sizeof(buf1),
2209 fa->fa_info->fib_scope),
2210 rtn_type(buf2, sizeof(buf2),
2211 fa->fa_type));
2212 if (fa->fa_tos)
2213 seq_printf(seq, " tos=%d", fa->fa_tos);
2214 seq_putc(seq, '\n');
2215 }
2216 }
2217 }
2218
2219 return 0;
2220 }
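
/*
 * Illustrative only (addresses and counts are made up): a dump in the
 * format produced above might look like
 *
 *   Main:
 *     +-- 0.0.0.0/0 3 0 5
 *        |-- 0.0.0.0
 *           /0 universe UNICAST
 *        |-- 192.168.0.0
 *           /24 link UNICAST
 *
 * where a tnode line shows key/pos, bits, full_children and
 * empty_children, and each leaf lists its prefixes with scope and type.
 */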
2221
2222 static const struct seq_operations fib_trie_seq_ops = {
2223 .start = fib_trie_seq_start,
2224 .next = fib_trie_seq_next,
2225 .stop = fib_trie_seq_stop,
2226 .show = fib_trie_seq_show,
2227 };
2228
2229 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2230 {
2231 return seq_open_net(inode, file, &fib_trie_seq_ops,
2232 sizeof(struct fib_trie_iter));
2233 }
2234
2235 static const struct file_operations fib_trie_fops = {
2236 .owner = THIS_MODULE,
2237 .open = fib_trie_seq_open,
2238 .read = seq_read,
2239 .llseek = seq_lseek,
2240 .release = seq_release_net,
2241 };
2242
2243 struct fib_route_iter {
2244 struct seq_net_private p;
2245 struct trie *main_trie;
2246 loff_t pos;
2247 t_key key;
2248 };
2249
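/*
 * seq_file may restart a read at an arbitrary offset; cache the key and
 * position of the last leaf returned so a continuation can resume via
 * fib_find_node() instead of rewalking from the first leaf.
 */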
2250 static struct tnode *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2251 {
2252 struct tnode *l = NULL;
2253 struct trie *t = iter->main_trie;
2254
2255 /* use cache location of last found key */
2256 if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2257 pos -= iter->pos;
2258 else {
2259 iter->pos = 0;
2260 l = trie_firstleaf(t);
2261 }
2262
2263 while (l && pos-- > 0) {
2264 iter->pos++;
2265 l = trie_nextleaf(l);
2266 }
2267
2268 if (l)
2269 iter->key = l->key; /* remember the key we stopped at, not the walked offset */
2270 else
2271 iter->pos = 0; /* forget it */
2272
2273 return l;
2274 }
2275
2276 static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2277 __acquires(RCU)
2278 {
2279 struct fib_route_iter *iter = seq->private;
2280 struct fib_table *tb;
2281
2282 rcu_read_lock();
2283 tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2284 if (!tb)
2285 return NULL;
2286
2287 iter->main_trie = (struct trie *) tb->tb_data;
2288 if (*pos == 0)
2289 return SEQ_START_TOKEN;
2290 else
2291 return fib_route_get_idx(iter, *pos - 1);
2292 }
2293
2294 static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2295 {
2296 struct fib_route_iter *iter = seq->private;
2297 struct tnode *l = v;
2298
2299 ++*pos;
2300 if (v == SEQ_START_TOKEN) {
2301 iter->pos = 0;
2302 l = trie_firstleaf(iter->main_trie);
2303 } else {
2304 iter->pos++;
2305 l = trie_nextleaf(l);
2306 }
2307
2308 if (l)
2309 iter->key = l->key;
2310 else
2311 iter->pos = 0;
2312 return l;
2313 }
2314
2315 static void fib_route_seq_stop(struct seq_file *seq, void *v)
2316 __releases(RCU)
2317 {
2318 rcu_read_unlock();
2319 }
2320
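/*
 * Translate a fib entry into the legacy RTF_* flag set expected by
 * readers of /proc/net/route.
 */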
2321 static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2322 {
2323 unsigned int flags = 0;
2324
2325 if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
2326 flags = RTF_REJECT;
2327 if (fi && fi->fib_nh->nh_gw)
2328 flags |= RTF_GATEWAY;
2329 if (mask == htonl(0xFFFFFFFF))
2330 flags |= RTF_HOST;
2331 flags |= RTF_UP;
2332 return flags;
2333 }
2334
2335 /*
2336 * This outputs /proc/net/route.
2337 * The format of the file is not supposed to be changed
2338 * and needs to be same as fib_hash output to avoid breaking
2339 * legacy utilities
2340 */
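/*
 * Illustrative only (values are made up): each address is the hex of
 * its network-byte-order value read as a host u32, so on little-endian
 * a default route via 192.168.0.1 would render roughly as
 *
 *   Iface  Destination  Gateway   Flags  RefCnt  Use  Metric  Mask      MTU  Window  IRTT
 *   eth0   00000000     0100A8C0  0003   0       0    0       00000000  0    0       0
 *
 * with Flags 0003 = RTF_UP | RTF_GATEWAY.
 */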
2341 static int fib_route_seq_show(struct seq_file *seq, void *v)
2342 {
2343 struct tnode *l = v;
2344 struct leaf_info *li;
2345
2346 if (v == SEQ_START_TOKEN) {
2347 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2348 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2349 "\tWindow\tIRTT");
2350 return 0;
2351 }
2352
2353 hlist_for_each_entry_rcu(li, &l->list, hlist) {
2354 struct fib_alias *fa;
2355 __be32 mask, prefix;
2356
2357 mask = inet_make_mask(li->plen);
2358 prefix = htonl(l->key);
2359
2360 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2361 const struct fib_info *fi = fa->fa_info;
2362 unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
2363
2364 if (fa->fa_type == RTN_BROADCAST ||
2365 fa->fa_type == RTN_MULTICAST)
2366 continue;
2367
2368 seq_setwidth(seq, 127);
2369
2370 if (fi)
2371 seq_printf(seq,
2372 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2373 "%d\t%08X\t%d\t%u\t%u",
2374 fi->fib_dev ? fi->fib_dev->name : "*",
2375 prefix,
2376 fi->fib_nh->nh_gw, flags, 0, 0,
2377 fi->fib_priority,
2378 mask,
2379 (fi->fib_advmss ?
2380 fi->fib_advmss + 40 : 0), /* MTU = MSS + 40 bytes of IP/TCP headers */
2381 fi->fib_window,
2382 fi->fib_rtt >> 3);
2383 else
2384 seq_printf(seq,
2385 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2386 "%d\t%08X\t%d\t%u\t%u",
2387 prefix, 0, flags, 0, 0, 0,
2388 mask, 0, 0, 0);
2389
2390 seq_pad(seq, '\n');
2391 }
2392 }
2393
2394 return 0;
2395 }
2396
2397 static const struct seq_operations fib_route_seq_ops = {
2398 .start = fib_route_seq_start,
2399 .next = fib_route_seq_next,
2400 .stop = fib_route_seq_stop,
2401 .show = fib_route_seq_show,
2402 };
2403
2404 static int fib_route_seq_open(struct inode *inode, struct file *file)
2405 {
2406 return seq_open_net(inode, file, &fib_route_seq_ops,
2407 sizeof(struct fib_route_iter));
2408 }
2409
2410 static const struct file_operations fib_route_fops = {
2411 .owner = THIS_MODULE,
2412 .open = fib_route_seq_open,
2413 .read = seq_read,
2414 .llseek = seq_lseek,
2415 .release = seq_release_net,
2416 };
2417
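/*
 * Register the three proc entries for this namespace; on failure,
 * unwind whatever was already created, in reverse order.
 */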
2418 int __net_init fib_proc_init(struct net *net)
2419 {
2420 if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
2421 goto out1;
2422
2423 if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
2424 &fib_triestat_fops))
2425 goto out2;
2426
2427 if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
2428 goto out3;
2429
2430 return 0;
2431
2432 out3:
2433 remove_proc_entry("fib_triestat", net->proc_net);
2434 out2:
2435 remove_proc_entry("fib_trie", net->proc_net);
2436 out1:
2437 return -ENOMEM;
2438 }
2439
2440 void __net_exit fib_proc_exit(struct net *net)
2441 {
2442 remove_proc_entry("fib_trie", net->proc_net);
2443 remove_proc_entry("fib_triestat", net->proc_net);
2444 remove_proc_entry("route", net->proc_net);
2445 }
2446
2447 #endif /* CONFIG_PROC_FS */