]> git.proxmox.com Git - mirror_ubuntu-zesty-kernel.git/blame - net/ipv4/fib_trie.c
fib_trie: Optimize fib_find_node
[mirror_ubuntu-zesty-kernel.git] / net / ipv4 / fib_trie.c
CommitLineData
19baf839
RO
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
e905a9ed 10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
19baf839 11 * Agricultural Sciences.
e905a9ed 12 *
19baf839
RO
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
25985edc 15 * This work is based on the LPC-trie which is originally described in:
e905a9ed 16 *
19baf839
RO
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
631dd1a8 19 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
19baf839
RO
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
19baf839
RO
25 *
26 * Code from fib_hash has been reused which includes the following header:
27 *
28 *
29 * INET An implementation of the TCP/IP protocol suite for the LINUX
30 * operating system. INET is implemented using the BSD Socket
31 * interface as the means of communication with the user level.
32 *
33 * IPv4 FIB: lookup engine and maintenance routines.
34 *
35 *
36 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
37 *
38 * This program is free software; you can redistribute it and/or
39 * modify it under the terms of the GNU General Public License
40 * as published by the Free Software Foundation; either version
41 * 2 of the License, or (at your option) any later version.
fd966255
RO
42 *
43 * Substantial contributions to this work comes from:
44 *
45 * David S. Miller, <davem@davemloft.net>
46 * Stephen Hemminger <shemminger@osdl.org>
47 * Paul E. McKenney <paulmck@us.ibm.com>
48 * Patrick McHardy <kaber@trash.net>
19baf839
RO
49 */
50
80b71b80 51#define VERSION "0.409"
19baf839 52
19baf839 53#include <asm/uaccess.h>
1977f032 54#include <linux/bitops.h>
19baf839
RO
55#include <linux/types.h>
56#include <linux/kernel.h>
19baf839
RO
57#include <linux/mm.h>
58#include <linux/string.h>
59#include <linux/socket.h>
60#include <linux/sockios.h>
61#include <linux/errno.h>
62#include <linux/in.h>
63#include <linux/inet.h>
cd8787ab 64#include <linux/inetdevice.h>
19baf839
RO
65#include <linux/netdevice.h>
66#include <linux/if_arp.h>
67#include <linux/proc_fs.h>
2373ce1c 68#include <linux/rcupdate.h>
19baf839
RO
69#include <linux/skbuff.h>
70#include <linux/netlink.h>
71#include <linux/init.h>
72#include <linux/list.h>
5a0e3ad6 73#include <linux/slab.h>
bc3b2d7f 74#include <linux/export.h>
457c4cbc 75#include <net/net_namespace.h>
19baf839
RO
76#include <net/ip.h>
77#include <net/protocol.h>
78#include <net/route.h>
79#include <net/tcp.h>
80#include <net/sock.h>
81#include <net/ip_fib.h>
82#include "fib_lookup.h"
83
06ef921d 84#define MAX_STAT_DEPTH 32
19baf839 85
19baf839 86#define KEYLENGTH (8*sizeof(t_key))
19baf839 87
19baf839
RO
88typedef unsigned int t_key;
89
64c9b6fb
AD
90#define IS_TNODE(n) ((n)->bits)
91#define IS_LEAF(n) (!(n)->bits)
2373ce1c 92
9f9e636d
AD
93#define get_shift(_kv) (KEYLENGTH - (_kv)->pos - (_kv)->bits)
94#define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> get_shift(_kv))
95
64c9b6fb
AD
96struct tnode {
97 t_key key;
98 unsigned char bits; /* 2log(KEYLENGTH) bits needed */
99 unsigned char pos; /* 2log(KEYLENGTH) bits needed */
100 struct tnode __rcu *parent;
37fd30f2 101 struct rcu_head rcu;
adaf9816
AD
102 union {
103 /* The fields in this struct are valid if bits > 0 (TNODE) */
104 struct {
105 unsigned int full_children; /* KEYLENGTH bits needed */
106 unsigned int empty_children; /* KEYLENGTH bits needed */
107 struct tnode __rcu *child[0];
108 };
109 /* This list pointer if valid if bits == 0 (LEAF) */
110 struct hlist_head list;
111 };
19baf839
RO
112};
113
114struct leaf_info {
115 struct hlist_node hlist;
116 int plen;
5c74501f 117 u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
19baf839 118 struct list_head falh;
5c74501f 119 struct rcu_head rcu;
19baf839
RO
120};
121
19baf839
RO
122#ifdef CONFIG_IP_FIB_TRIE_STATS
123struct trie_use_stats {
124 unsigned int gets;
125 unsigned int backtrack;
126 unsigned int semantic_match_passed;
127 unsigned int semantic_match_miss;
128 unsigned int null_node_hit;
2f36895a 129 unsigned int resize_node_skipped;
19baf839
RO
130};
131#endif
132
133struct trie_stat {
134 unsigned int totdepth;
135 unsigned int maxdepth;
136 unsigned int tnodes;
137 unsigned int leaves;
138 unsigned int nullpointers;
93672292 139 unsigned int prefixes;
06ef921d 140 unsigned int nodesizes[MAX_STAT_DEPTH];
c877efb2 141};
19baf839
RO
142
143struct trie {
adaf9816 144 struct tnode __rcu *trie;
19baf839 145#ifdef CONFIG_IP_FIB_TRIE_STATS
8274a97a 146 struct trie_use_stats __percpu *stats;
19baf839 147#endif
19baf839
RO
148};
149
adaf9816 150static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n,
a07f5f50 151 int wasfull);
adaf9816 152static struct tnode *resize(struct trie *t, struct tnode *tn);
2f80b3c8
RO
153static struct tnode *inflate(struct trie *t, struct tnode *tn);
154static struct tnode *halve(struct trie *t, struct tnode *tn);
e0f7cb8c 155/* tnodes to free after resize(); protected by RTNL */
37fd30f2 156static struct callback_head *tnode_free_head;
c3059477
JP
157static size_t tnode_free_size;
158
159/*
160 * synchronize_rcu after call_rcu for that many pages; it should be especially
161 * useful before resizing the root node with PREEMPT_NONE configs; the value was
162 * obtained experimentally, aiming to avoid visible slowdown.
163 */
164static const int sync_pages = 128;
19baf839 165
e18b890b 166static struct kmem_cache *fn_alias_kmem __read_mostly;
bc3c8c1e 167static struct kmem_cache *trie_leaf_kmem __read_mostly;
19baf839 168
64c9b6fb
AD
169/* caller must hold RTNL */
170#define node_parent(n) rtnl_dereference((n)->parent)
0a5c0475 171
64c9b6fb
AD
172/* caller must hold RCU read lock or RTNL */
173#define node_parent_rcu(n) rcu_dereference_rtnl((n)->parent)
0a5c0475 174
64c9b6fb 175/* wrapper for rcu_assign_pointer */
adaf9816 176static inline void node_set_parent(struct tnode *n, struct tnode *tp)
b59cfbf7 177{
adaf9816
AD
178 if (n)
179 rcu_assign_pointer(n->parent, tp);
06801916
SH
180}
181
64c9b6fb
AD
182#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER((n)->parent, p)
183
184/* This provides us with the number of children in this node, in the case of a
185 * leaf this will return 0 meaning none of the children are accessible.
6440cc9e 186 */
64c9b6fb 187static inline int tnode_child_length(const struct tnode *tn)
06801916 188{
64c9b6fb 189 return (1ul << tn->bits) & ~(1ul);
06801916 190}
2373ce1c 191
0a5c0475
ED
192/*
193 * caller must hold RTNL
194 */
adaf9816 195static inline struct tnode *tnode_get_child(const struct tnode *tn, unsigned int i)
b59cfbf7 196{
64c9b6fb 197 BUG_ON(i >= tnode_child_length(tn));
2373ce1c 198
0a5c0475 199 return rtnl_dereference(tn->child[i]);
b59cfbf7
ED
200}
201
0a5c0475
ED
202/*
203 * caller must hold RCU read lock or RTNL
204 */
adaf9816 205static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
19baf839 206{
64c9b6fb 207 BUG_ON(i >= tnode_child_length(tn));
19baf839 208
0a5c0475 209 return rcu_dereference_rtnl(tn->child[i]);
19baf839
RO
210}
211
3b004569 212static inline t_key mask_pfx(t_key k, unsigned int l)
ab66b4a7
SH
213{
214 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
215}
216
3b004569 217static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
19baf839 218{
91b9a277 219 if (offset < KEYLENGTH)
19baf839 220 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
91b9a277 221 else
19baf839
RO
222 return 0;
223}
224
225static inline int tkey_equals(t_key a, t_key b)
226{
c877efb2 227 return a == b;
19baf839
RO
228}
229
230static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
231{
c877efb2
SH
232 if (bits == 0 || offset >= KEYLENGTH)
233 return 1;
91b9a277
OJ
234 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
235 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
c877efb2 236}
19baf839
RO
237
238static inline int tkey_mismatch(t_key a, int offset, t_key b)
239{
240 t_key diff = a ^ b;
241 int i = offset;
242
c877efb2
SH
243 if (!diff)
244 return 0;
245 while ((diff << i) >> (KEYLENGTH-1) == 0)
19baf839
RO
246 i++;
247 return i;
248}
249
19baf839 250/*
e905a9ed
YH
251 To understand this stuff, an understanding of keys and all their bits is
252 necessary. Every node in the trie has a key associated with it, but not
19baf839
RO
253 all of the bits in that key are significant.
254
255 Consider a node 'n' and its parent 'tp'.
256
e905a9ed
YH
257 If n is a leaf, every bit in its key is significant. Its presence is
258 necessitated by path compression, since during a tree traversal (when
259 searching for a leaf - unless we are doing an insertion) we will completely
260 ignore all skipped bits we encounter. Thus we need to verify, at the end of
261 a potentially successful search, that we have indeed been walking the
19baf839
RO
262 correct key path.
263
e905a9ed
YH
264 Note that we can never "miss" the correct key in the tree if present by
265 following the wrong path. Path compression ensures that segments of the key
266 that are the same for all keys with a given prefix are skipped, but the
267 skipped part *is* identical for each node in the subtrie below the skipped
268 bit! trie_insert() in this implementation takes care of that - note the
19baf839
RO
269 call to tkey_sub_equals() in trie_insert().
270
e905a9ed 271 if n is an internal node - a 'tnode' here, the various parts of its key
19baf839
RO
272 have many different meanings.
273
e905a9ed 274 Example:
19baf839
RO
275 _________________________________________________________________
276 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
277 -----------------------------------------------------------------
e905a9ed 278 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
19baf839
RO
279
280 _________________________________________________________________
281 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
282 -----------------------------------------------------------------
283 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
284
285 tp->pos = 7
286 tp->bits = 3
287 n->pos = 15
91b9a277 288 n->bits = 4
19baf839 289
e905a9ed
YH
290 First, let's just ignore the bits that come before the parent tp, that is
291 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
19baf839
RO
292 not use them for anything.
293
294 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
e905a9ed 295 index into the parent's child array. That is, they will be used to find
19baf839
RO
296 'n' among tp's children.
297
298 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
299 for the node n.
300
e905a9ed 301 All the bits we have seen so far are significant to the node n. The rest
19baf839
RO
302 of the bits are really not needed or indeed known in n->key.
303
e905a9ed 304 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
19baf839 305 n's child array, and will of course be different for each child.
e905a9ed 306
c877efb2 307
19baf839
RO
308 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
309 at this point.
310
311*/
312
f5026fab
DL
313static const int halve_threshold = 25;
314static const int inflate_threshold = 50;
345aa031 315static const int halve_threshold_root = 15;
80b71b80 316static const int inflate_threshold_root = 30;
2373ce1c
RO
317
318static void __alias_free_mem(struct rcu_head *head)
19baf839 319{
2373ce1c
RO
320 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
321 kmem_cache_free(fn_alias_kmem, fa);
19baf839
RO
322}
323
2373ce1c 324static inline void alias_free_mem_rcu(struct fib_alias *fa)
19baf839 325{
2373ce1c
RO
326 call_rcu(&fa->rcu, __alias_free_mem);
327}
91b9a277 328
37fd30f2 329#define TNODE_KMALLOC_MAX \
adaf9816 330 ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct tnode *))
91b9a277 331
37fd30f2 332static void __node_free_rcu(struct rcu_head *head)
387a5487 333{
adaf9816 334 struct tnode *n = container_of(head, struct tnode, rcu);
37fd30f2
AD
335
336 if (IS_LEAF(n))
337 kmem_cache_free(trie_leaf_kmem, n);
338 else if (n->bits <= TNODE_KMALLOC_MAX)
339 kfree(n);
340 else
341 vfree(n);
387a5487
SH
342}
343
37fd30f2
AD
344#define node_free(n) call_rcu(&n->rcu, __node_free_rcu)
345
2373ce1c 346static inline void free_leaf_info(struct leaf_info *leaf)
19baf839 347{
bceb0f45 348 kfree_rcu(leaf, rcu);
19baf839
RO
349}
350
8d965444 351static struct tnode *tnode_alloc(size_t size)
f0e36f8c 352{
2373ce1c 353 if (size <= PAGE_SIZE)
8d965444 354 return kzalloc(size, GFP_KERNEL);
15be75cd 355 else
7a1c8e5a 356 return vzalloc(size);
15be75cd 357}
2373ce1c 358
e0f7cb8c
JP
359static void tnode_free_safe(struct tnode *tn)
360{
361 BUG_ON(IS_LEAF(tn));
37fd30f2
AD
362 tn->rcu.next = tnode_free_head;
363 tnode_free_head = &tn->rcu;
e0f7cb8c
JP
364}
365
366static void tnode_free_flush(void)
367{
37fd30f2
AD
368 struct callback_head *head;
369
370 while ((head = tnode_free_head)) {
371 struct tnode *tn = container_of(head, struct tnode, rcu);
372
373 tnode_free_head = head->next;
374 tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]);
e0f7cb8c 375
37fd30f2 376 node_free(tn);
e0f7cb8c 377 }
c3059477
JP
378
379 if (tnode_free_size >= PAGE_SIZE * sync_pages) {
380 tnode_free_size = 0;
381 synchronize_rcu();
382 }
e0f7cb8c
JP
383}
384
adaf9816 385static struct tnode *leaf_new(t_key key)
2373ce1c 386{
adaf9816 387 struct tnode *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
2373ce1c 388 if (l) {
64c9b6fb
AD
389 l->parent = NULL;
390 /* set key and pos to reflect full key value
391 * any trailing zeros in the key should be ignored
392 * as the nodes are searched
393 */
394 l->key = key;
395 l->pos = KEYLENGTH;
396 /* set bits to 0 indicating we are not a tnode */
397 l->bits = 0;
398
2373ce1c
RO
399 INIT_HLIST_HEAD(&l->list);
400 }
401 return l;
402}
403
404static struct leaf_info *leaf_info_new(int plen)
405{
406 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
407 if (li) {
408 li->plen = plen;
5c74501f 409 li->mask_plen = ntohl(inet_make_mask(plen));
2373ce1c
RO
410 INIT_LIST_HEAD(&li->falh);
411 }
412 return li;
413}
414
a07f5f50 415static struct tnode *tnode_new(t_key key, int pos, int bits)
19baf839 416{
37fd30f2 417 size_t sz = offsetof(struct tnode, child[1 << bits]);
f0e36f8c 418 struct tnode *tn = tnode_alloc(sz);
64c9b6fb
AD
419 unsigned int shift = pos + bits;
420
421 /* verify bits and pos their msb bits clear and values are valid */
422 BUG_ON(!bits || (shift > KEYLENGTH));
19baf839 423
91b9a277 424 if (tn) {
64c9b6fb 425 tn->parent = NULL;
19baf839
RO
426 tn->pos = pos;
427 tn->bits = bits;
64c9b6fb 428 tn->key = mask_pfx(key, pos);
19baf839
RO
429 tn->full_children = 0;
430 tn->empty_children = 1<<bits;
431 }
c877efb2 432
a034ee3c 433 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
adaf9816 434 sizeof(struct tnode *) << bits);
19baf839
RO
435 return tn;
436}
437
19baf839
RO
438/*
439 * Check whether a tnode 'n' is "full", i.e. it is an internal node
440 * and no bits are skipped. See discussion in dyntree paper p. 6
441 */
442
adaf9816 443static inline int tnode_full(const struct tnode *tn, const struct tnode *n)
19baf839 444{
64c9b6fb 445 return n && IS_TNODE(n) && (n->pos == (tn->pos + tn->bits));
19baf839
RO
446}
447
61648d91 448static inline void put_child(struct tnode *tn, int i,
adaf9816 449 struct tnode *n)
19baf839
RO
450{
451 tnode_put_child_reorg(tn, i, n, -1);
452}
453
c877efb2 454 /*
19baf839
RO
455 * Add a child at position i overwriting the old value.
456 * Update the value of full_children and empty_children.
457 */
458
adaf9816 459static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n,
a07f5f50 460 int wasfull)
19baf839 461{
adaf9816 462 struct tnode *chi = rtnl_dereference(tn->child[i]);
19baf839
RO
463 int isfull;
464
0c7770c7
SH
465 BUG_ON(i >= 1<<tn->bits);
466
19baf839
RO
467 /* update emptyChildren */
468 if (n == NULL && chi != NULL)
469 tn->empty_children++;
470 else if (n != NULL && chi == NULL)
471 tn->empty_children--;
c877efb2 472
19baf839 473 /* update fullChildren */
91b9a277 474 if (wasfull == -1)
19baf839
RO
475 wasfull = tnode_full(tn, chi);
476
477 isfull = tnode_full(tn, n);
c877efb2 478 if (wasfull && !isfull)
19baf839 479 tn->full_children--;
c877efb2 480 else if (!wasfull && isfull)
19baf839 481 tn->full_children++;
91b9a277 482
64c9b6fb 483 node_set_parent(n, tn);
19baf839 484
cf778b00 485 rcu_assign_pointer(tn->child[i], n);
19baf839
RO
486}
487
80b71b80 488#define MAX_WORK 10
adaf9816 489static struct tnode *resize(struct trie *t, struct tnode *tn)
19baf839 490{
adaf9816 491 struct tnode *old_tn, *n = NULL;
e6308be8
RO
492 int inflate_threshold_use;
493 int halve_threshold_use;
80b71b80 494 int max_work;
19baf839 495
e905a9ed 496 if (!tn)
19baf839
RO
497 return NULL;
498
0c7770c7
SH
499 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
500 tn, inflate_threshold, halve_threshold);
19baf839
RO
501
502 /* No children */
64c9b6fb
AD
503 if (tn->empty_children > (tnode_child_length(tn) - 1))
504 goto no_children;
505
19baf839 506 /* One child */
64c9b6fb 507 if (tn->empty_children == (tnode_child_length(tn) - 1))
80b71b80 508 goto one_child;
c877efb2 509 /*
19baf839
RO
510 * Double as long as the resulting node has a number of
511 * nonempty nodes that are above the threshold.
512 */
513
514 /*
c877efb2
SH
515 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
516 * the Helsinki University of Technology and Matti Tikkanen of Nokia
19baf839 517 * Telecommunications, page 6:
c877efb2 518 * "A node is doubled if the ratio of non-empty children to all
19baf839
RO
519 * children in the *doubled* node is at least 'high'."
520 *
c877efb2
SH
521 * 'high' in this instance is the variable 'inflate_threshold'. It
522 * is expressed as a percentage, so we multiply it with
523 * tnode_child_length() and instead of multiplying by 2 (since the
524 * child array will be doubled by inflate()) and multiplying
525 * the left-hand side by 100 (to handle the percentage thing) we
19baf839 526 * multiply the left-hand side by 50.
c877efb2
SH
527 *
528 * The left-hand side may look a bit weird: tnode_child_length(tn)
529 * - tn->empty_children is of course the number of non-null children
530 * in the current node. tn->full_children is the number of "full"
19baf839 531 * children, that is non-null tnodes with a skip value of 0.
c877efb2 532 * All of those will be doubled in the resulting inflated tnode, so
19baf839 533 * we just count them one extra time here.
c877efb2 534 *
19baf839 535 * A clearer way to write this would be:
c877efb2 536 *
19baf839 537 * to_be_doubled = tn->full_children;
c877efb2 538 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
19baf839
RO
539 * tn->full_children;
540 *
541 * new_child_length = tnode_child_length(tn) * 2;
542 *
c877efb2 543 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
19baf839
RO
544 * new_child_length;
545 * if (new_fill_factor >= inflate_threshold)
c877efb2
SH
546 *
547 * ...and so on, tho it would mess up the while () loop.
548 *
19baf839
RO
549 * anyway,
550 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
551 * inflate_threshold
c877efb2 552 *
19baf839
RO
553 * avoid a division:
554 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
555 * inflate_threshold * new_child_length
c877efb2 556 *
19baf839 557 * expand not_to_be_doubled and to_be_doubled, and shorten:
c877efb2 558 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 559 * tn->full_children) >= inflate_threshold * new_child_length
c877efb2 560 *
19baf839 561 * expand new_child_length:
c877efb2 562 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 563 * tn->full_children) >=
19baf839 564 * inflate_threshold * tnode_child_length(tn) * 2
c877efb2 565 *
19baf839 566 * shorten again:
c877efb2 567 * 50 * (tn->full_children + tnode_child_length(tn) -
91b9a277 568 * tn->empty_children) >= inflate_threshold *
19baf839 569 * tnode_child_length(tn)
c877efb2 570 *
19baf839
RO
571 */
572
e6308be8
RO
573 /* Keep root node larger */
574
64c9b6fb 575 if (!node_parent(tn)) {
80b71b80
JL
576 inflate_threshold_use = inflate_threshold_root;
577 halve_threshold_use = halve_threshold_root;
a034ee3c 578 } else {
e6308be8 579 inflate_threshold_use = inflate_threshold;
80b71b80
JL
580 halve_threshold_use = halve_threshold;
581 }
e6308be8 582
80b71b80
JL
583 max_work = MAX_WORK;
584 while ((tn->full_children > 0 && max_work-- &&
a07f5f50
SH
585 50 * (tn->full_children + tnode_child_length(tn)
586 - tn->empty_children)
587 >= inflate_threshold_use * tnode_child_length(tn))) {
19baf839 588
2f80b3c8
RO
589 old_tn = tn;
590 tn = inflate(t, tn);
a07f5f50 591
2f80b3c8
RO
592 if (IS_ERR(tn)) {
593 tn = old_tn;
2f36895a 594#ifdef CONFIG_IP_FIB_TRIE_STATS
8274a97a 595 this_cpu_inc(t->stats->resize_node_skipped);
2f36895a
RO
596#endif
597 break;
598 }
19baf839
RO
599 }
600
80b71b80 601 /* Return if at least one inflate is run */
a034ee3c 602 if (max_work != MAX_WORK)
adaf9816 603 return tn;
80b71b80 604
19baf839
RO
605 /*
606 * Halve as long as the number of empty children in this
607 * node is above threshold.
608 */
2f36895a 609
80b71b80
JL
610 max_work = MAX_WORK;
611 while (tn->bits > 1 && max_work-- &&
19baf839 612 100 * (tnode_child_length(tn) - tn->empty_children) <
e6308be8 613 halve_threshold_use * tnode_child_length(tn)) {
2f36895a 614
2f80b3c8
RO
615 old_tn = tn;
616 tn = halve(t, tn);
617 if (IS_ERR(tn)) {
618 tn = old_tn;
2f36895a 619#ifdef CONFIG_IP_FIB_TRIE_STATS
8274a97a 620 this_cpu_inc(t->stats->resize_node_skipped);
2f36895a
RO
621#endif
622 break;
623 }
624 }
19baf839 625
c877efb2 626
19baf839 627 /* Only one child remains */
64c9b6fb
AD
628 if (tn->empty_children == (tnode_child_length(tn) - 1)) {
629 unsigned long i;
80b71b80 630one_child:
64c9b6fb
AD
631 for (i = tnode_child_length(tn); !n && i;)
632 n = tnode_get_child(tn, --i);
633no_children:
634 /* compress one level */
635 node_set_parent(n, NULL);
636 tnode_free_safe(tn);
637 return n;
80b71b80 638 }
adaf9816 639 return tn;
19baf839
RO
640}
641
0a5c0475
ED
642
643static void tnode_clean_free(struct tnode *tn)
644{
adaf9816 645 struct tnode *tofree;
0a5c0475 646 int i;
0a5c0475
ED
647
648 for (i = 0; i < tnode_child_length(tn); i++) {
37fd30f2 649 tofree = rtnl_dereference(tn->child[i]);
0a5c0475 650 if (tofree)
37fd30f2 651 node_free(tofree);
0a5c0475 652 }
37fd30f2 653 node_free(tn);
0a5c0475
ED
654}
655
adaf9816 656static struct tnode *inflate(struct trie *t, struct tnode *oldtnode)
19baf839 657{
adaf9816
AD
658 int olen = tnode_child_length(oldtnode);
659 struct tnode *tn;
19baf839
RO
660 int i;
661
0c7770c7 662 pr_debug("In inflate\n");
19baf839
RO
663
664 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
665
0c7770c7 666 if (!tn)
2f80b3c8 667 return ERR_PTR(-ENOMEM);
2f36895a
RO
668
669 /*
c877efb2
SH
670 * Preallocate and store tnodes before the actual work so we
671 * don't get into an inconsistent state if memory allocation
672 * fails. In case of failure we return the oldnode and inflate
2f36895a
RO
673 * of tnode is ignored.
674 */
91b9a277
OJ
675
676 for (i = 0; i < olen; i++) {
a07f5f50 677 struct tnode *inode;
2f36895a 678
adaf9816
AD
679 inode = tnode_get_child(oldtnode, i);
680 if (tnode_full(oldtnode, inode) && inode->bits > 1) {
2f36895a 681 struct tnode *left, *right;
ab66b4a7 682 t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
c877efb2 683
2f36895a
RO
684 left = tnode_new(inode->key&(~m), inode->pos + 1,
685 inode->bits - 1);
2f80b3c8
RO
686 if (!left)
687 goto nomem;
91b9a277 688
2f36895a
RO
689 right = tnode_new(inode->key|m, inode->pos + 1,
690 inode->bits - 1);
691
e905a9ed 692 if (!right) {
37fd30f2 693 node_free(left);
2f80b3c8 694 goto nomem;
e905a9ed 695 }
2f36895a 696
adaf9816
AD
697 put_child(tn, 2*i, left);
698 put_child(tn, 2*i+1, right);
2f36895a
RO
699 }
700 }
701
91b9a277 702 for (i = 0; i < olen; i++) {
adaf9816 703 struct tnode *inode = tnode_get_child(oldtnode, i);
91b9a277
OJ
704 struct tnode *left, *right;
705 int size, j;
c877efb2 706
19baf839 707 /* An empty child */
adaf9816 708 if (inode == NULL)
19baf839
RO
709 continue;
710
711 /* A leaf or an internal node with skipped bits */
adaf9816 712 if (!tnode_full(oldtnode, inode)) {
bbe34cf8 713 put_child(tn,
adaf9816
AD
714 tkey_extract_bits(inode->key, tn->pos, tn->bits),
715 inode);
19baf839
RO
716 continue;
717 }
718
719 /* An internal node with two children */
19baf839 720 if (inode->bits == 1) {
61648d91
LM
721 put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
722 put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
19baf839 723
e0f7cb8c 724 tnode_free_safe(inode);
91b9a277 725 continue;
19baf839
RO
726 }
727
91b9a277
OJ
728 /* An internal node with more than two children */
729
730 /* We will replace this node 'inode' with two new
731 * ones, 'left' and 'right', each with half of the
732 * original children. The two new nodes will have
733 * a position one bit further down the key and this
734 * means that the "significant" part of their keys
735 * (see the discussion near the top of this file)
736 * will differ by one bit, which will be "0" in
737 * left's key and "1" in right's key. Since we are
738 * moving the key position by one step, the bit that
739 * we are moving away from - the bit at position
740 * (inode->pos) - is the one that will differ between
741 * left and right. So... we synthesize that bit in the
742 * two new keys.
743 * The mask 'm' below will be a single "one" bit at
744 * the position (inode->pos)
745 */
19baf839 746
91b9a277
OJ
747 /* Use the old key, but set the new significant
748 * bit to zero.
749 */
2f36895a 750
adaf9816 751 left = tnode_get_child(tn, 2*i);
61648d91 752 put_child(tn, 2*i, NULL);
2f36895a 753
91b9a277 754 BUG_ON(!left);
2f36895a 755
adaf9816 756 right = tnode_get_child(tn, 2*i+1);
61648d91 757 put_child(tn, 2*i+1, NULL);
19baf839 758
91b9a277 759 BUG_ON(!right);
19baf839 760
91b9a277
OJ
761 size = tnode_child_length(left);
762 for (j = 0; j < size; j++) {
61648d91
LM
763 put_child(left, j, rtnl_dereference(inode->child[j]));
764 put_child(right, j, rtnl_dereference(inode->child[j + size]));
19baf839 765 }
61648d91
LM
766 put_child(tn, 2*i, resize(t, left));
767 put_child(tn, 2*i+1, resize(t, right));
91b9a277 768
e0f7cb8c 769 tnode_free_safe(inode);
19baf839 770 }
e0f7cb8c 771 tnode_free_safe(oldtnode);
19baf839 772 return tn;
2f80b3c8 773nomem:
0a5c0475
ED
774 tnode_clean_free(tn);
775 return ERR_PTR(-ENOMEM);
19baf839
RO
776}
777
adaf9816 778static struct tnode *halve(struct trie *t, struct tnode *oldtnode)
19baf839 779{
adaf9816
AD
780 int olen = tnode_child_length(oldtnode);
781 struct tnode *tn, *left, *right;
19baf839 782 int i;
19baf839 783
0c7770c7 784 pr_debug("In halve\n");
c877efb2
SH
785
786 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
19baf839 787
2f80b3c8
RO
788 if (!tn)
789 return ERR_PTR(-ENOMEM);
2f36895a
RO
790
791 /*
c877efb2
SH
792 * Preallocate and store tnodes before the actual work so we
793 * don't get into an inconsistent state if memory allocation
794 * fails. In case of failure we return the oldnode and halve
2f36895a
RO
795 * of tnode is ignored.
796 */
797
91b9a277 798 for (i = 0; i < olen; i += 2) {
2f36895a
RO
799 left = tnode_get_child(oldtnode, i);
800 right = tnode_get_child(oldtnode, i+1);
c877efb2 801
2f36895a 802 /* Two nonempty children */
0c7770c7 803 if (left && right) {
2f80b3c8 804 struct tnode *newn;
0c7770c7 805
2f80b3c8 806 newn = tnode_new(left->key, tn->pos + tn->bits, 1);
0c7770c7
SH
807
808 if (!newn)
2f80b3c8 809 goto nomem;
0c7770c7 810
adaf9816 811 put_child(tn, i/2, newn);
2f36895a 812 }
2f36895a 813
2f36895a 814 }
19baf839 815
91b9a277
OJ
816 for (i = 0; i < olen; i += 2) {
817 struct tnode *newBinNode;
818
19baf839
RO
819 left = tnode_get_child(oldtnode, i);
820 right = tnode_get_child(oldtnode, i+1);
c877efb2 821
19baf839
RO
822 /* At least one of the children is empty */
823 if (left == NULL) {
824 if (right == NULL) /* Both are empty */
825 continue;
61648d91 826 put_child(tn, i/2, right);
91b9a277 827 continue;
0c7770c7 828 }
91b9a277
OJ
829
830 if (right == NULL) {
61648d91 831 put_child(tn, i/2, left);
91b9a277
OJ
832 continue;
833 }
c877efb2 834
19baf839 835 /* Two nonempty children */
adaf9816 836 newBinNode = tnode_get_child(tn, i/2);
61648d91
LM
837 put_child(tn, i/2, NULL);
838 put_child(newBinNode, 0, left);
839 put_child(newBinNode, 1, right);
840 put_child(tn, i/2, resize(t, newBinNode));
19baf839 841 }
e0f7cb8c 842 tnode_free_safe(oldtnode);
19baf839 843 return tn;
2f80b3c8 844nomem:
0a5c0475
ED
845 tnode_clean_free(tn);
846 return ERR_PTR(-ENOMEM);
19baf839
RO
847}
848
772cb712 849/* readside must use rcu_read_lock currently dump routines
2373ce1c
RO
850 via get_fa_head and dump */
851
adaf9816 852static struct leaf_info *find_leaf_info(struct tnode *l, int plen)
19baf839 853{
772cb712 854 struct hlist_head *head = &l->list;
19baf839
RO
855 struct leaf_info *li;
856
b67bfe0d 857 hlist_for_each_entry_rcu(li, head, hlist)
c877efb2 858 if (li->plen == plen)
19baf839 859 return li;
91b9a277 860
19baf839
RO
861 return NULL;
862}
863
adaf9816 864static inline struct list_head *get_fa_head(struct tnode *l, int plen)
19baf839 865{
772cb712 866 struct leaf_info *li = find_leaf_info(l, plen);
c877efb2 867
91b9a277
OJ
868 if (!li)
869 return NULL;
c877efb2 870
91b9a277 871 return &li->falh;
19baf839
RO
872}
873
874static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
875{
e905a9ed 876 struct leaf_info *li = NULL, *last = NULL;
e905a9ed
YH
877
878 if (hlist_empty(head)) {
879 hlist_add_head_rcu(&new->hlist, head);
880 } else {
b67bfe0d 881 hlist_for_each_entry(li, head, hlist) {
e905a9ed
YH
882 if (new->plen > li->plen)
883 break;
884
885 last = li;
886 }
887 if (last)
1d023284 888 hlist_add_behind_rcu(&new->hlist, &last->hlist);
e905a9ed
YH
889 else
890 hlist_add_before_rcu(&new->hlist, &li->hlist);
891 }
19baf839
RO
892}
893
2373ce1c 894/* rcu_read_lock needs to be hold by caller from readside */
adaf9816 895static struct tnode *fib_find_node(struct trie *t, u32 key)
19baf839 896{
adaf9816 897 struct tnode *n = rcu_dereference_rtnl(t->trie);
939afb06
AD
898
899 while (n) {
900 unsigned long index = get_index(key, n);
901
902 /* This bit of code is a bit tricky but it combines multiple
903 * checks into a single check. The prefix consists of the
904 * prefix plus zeros for the bits in the cindex. The index
905 * is the difference between the key and this value. From
906 * this we can actually derive several pieces of data.
907 * if !(index >> bits)
908 * we know the value is cindex
909 * else
910 * we have a mismatch in skip bits and failed
911 */
912 if (index >> n->bits)
913 return NULL;
914
915 /* we have found a leaf. Prefixes have already been compared */
916 if (IS_LEAF(n))
19baf839 917 break;
19baf839 918
939afb06
AD
919 n = rcu_dereference_rtnl(n->child[index]);
920 }
91b9a277 921
939afb06 922 return n;
19baf839
RO
923}
924
7b85576d 925static void trie_rebalance(struct trie *t, struct tnode *tn)
19baf839 926{
19baf839 927 int wasfull;
3ed18d76 928 t_key cindex, key;
06801916 929 struct tnode *tp;
19baf839 930
3ed18d76
RO
931 key = tn->key;
932
64c9b6fb 933 while (tn != NULL && (tp = node_parent(tn)) != NULL) {
19baf839
RO
934 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
935 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
adaf9816 936 tn = resize(t, tn);
a07f5f50 937
adaf9816 938 tnode_put_child_reorg(tp, cindex, tn, wasfull);
91b9a277 939
64c9b6fb 940 tp = node_parent(tn);
008440e3 941 if (!tp)
adaf9816 942 rcu_assign_pointer(t->trie, tn);
008440e3 943
e0f7cb8c 944 tnode_free_flush();
06801916 945 if (!tp)
19baf839 946 break;
06801916 947 tn = tp;
19baf839 948 }
06801916 949
19baf839 950 /* Handle last (top) tnode */
7b85576d 951 if (IS_TNODE(tn))
adaf9816 952 tn = resize(t, tn);
19baf839 953
adaf9816 954 rcu_assign_pointer(t->trie, tn);
7b85576d 955 tnode_free_flush();
19baf839
RO
956}
957
2373ce1c
RO
958/* only used from updater-side */
959
fea86ad8 960static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
19baf839
RO
961{
962 int pos, newpos;
963 struct tnode *tp = NULL, *tn = NULL;
adaf9816
AD
964 struct tnode *n;
965 struct tnode *l;
19baf839 966 int missbit;
c877efb2 967 struct list_head *fa_head = NULL;
19baf839
RO
968 struct leaf_info *li;
969 t_key cindex;
970
971 pos = 0;
0a5c0475 972 n = rtnl_dereference(t->trie);
19baf839 973
c877efb2
SH
974 /* If we point to NULL, stop. Either the tree is empty and we should
975 * just put a new leaf in if, or we have reached an empty child slot,
19baf839 976 * and we should just put our new leaf in that.
c877efb2
SH
977 * If we point to a T_TNODE, check if it matches our key. Note that
978 * a T_TNODE might be skipping any number of bits - its 'pos' need
19baf839
RO
979 * not be the parent's 'pos'+'bits'!
980 *
c877efb2 981 * If it does match the current key, get pos/bits from it, extract
19baf839
RO
982 * the index from our key, push the T_TNODE and walk the tree.
983 *
984 * If it doesn't, we have to replace it with a new T_TNODE.
985 *
c877efb2
SH
986 * If we point to a T_LEAF, it might or might not have the same key
987 * as we do. If it does, just change the value, update the T_LEAF's
988 * value, and return it.
19baf839
RO
989 * If it doesn't, we need to replace it with a T_TNODE.
990 */
991
64c9b6fb 992 while (n && IS_TNODE(n)) {
adaf9816
AD
993 if (tkey_sub_equals(n->key, pos, n->pos-pos, key)) {
994 tp = n;
995 pos = n->pos + n->bits;
996 n = tnode_get_child(n,
a07f5f50 997 tkey_extract_bits(key,
adaf9816
AD
998 n->pos,
999 n->bits));
19baf839 1000
adaf9816 1001 BUG_ON(n && node_parent(n) != tp);
91b9a277 1002 } else
19baf839
RO
1003 break;
1004 }
1005
1006 /*
1007 * n ----> NULL, LEAF or TNODE
1008 *
c877efb2 1009 * tp is n's (parent) ----> NULL or TNODE
19baf839
RO
1010 */
1011
91b9a277 1012 BUG_ON(tp && IS_LEAF(tp));
19baf839
RO
1013
1014 /* Case 1: n is a leaf. Compare prefixes */
1015
c877efb2 1016 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
19baf839 1017 li = leaf_info_new(plen);
91b9a277 1018
fea86ad8
SH
1019 if (!li)
1020 return NULL;
19baf839
RO
1021
1022 fa_head = &li->falh;
adaf9816 1023 insert_leaf_info(&n->list, li);
19baf839
RO
1024 goto done;
1025 }
64c9b6fb 1026 l = leaf_new(key);
19baf839 1027
fea86ad8
SH
1028 if (!l)
1029 return NULL;
19baf839 1030
19baf839
RO
1031 li = leaf_info_new(plen);
1032
c877efb2 1033 if (!li) {
37fd30f2 1034 node_free(l);
fea86ad8 1035 return NULL;
f835e471 1036 }
19baf839
RO
1037
1038 fa_head = &li->falh;
1039 insert_leaf_info(&l->list, li);
1040
19baf839 1041 if (t->trie && n == NULL) {
91b9a277 1042 /* Case 2: n is NULL, and will just insert a new leaf */
19baf839 1043
adaf9816 1044 node_set_parent(l, tp);
19baf839 1045
91b9a277 1046 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
adaf9816 1047 put_child(tp, cindex, l);
91b9a277
OJ
1048 } else {
1049 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
c877efb2
SH
1050 /*
1051 * Add a new tnode here
19baf839
RO
1052 * first tnode need some special handling
1053 */
1054
c877efb2 1055 if (n) {
4c60f1d6 1056 pos = tp ? tp->pos+tp->bits : 0;
19baf839
RO
1057 newpos = tkey_mismatch(key, pos, n->key);
1058 tn = tnode_new(n->key, newpos, 1);
91b9a277 1059 } else {
19baf839 1060 newpos = 0;
c877efb2 1061 tn = tnode_new(key, newpos, 1); /* First tnode */
19baf839 1062 }
19baf839 1063
c877efb2 1064 if (!tn) {
f835e471 1065 free_leaf_info(li);
37fd30f2 1066 node_free(l);
fea86ad8 1067 return NULL;
91b9a277
OJ
1068 }
1069
adaf9816 1070 node_set_parent(tn, tp);
19baf839 1071
91b9a277 1072 missbit = tkey_extract_bits(key, newpos, 1);
adaf9816 1073 put_child(tn, missbit, l);
61648d91 1074 put_child(tn, 1-missbit, n);
19baf839 1075
c877efb2 1076 if (tp) {
19baf839 1077 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
adaf9816 1078 put_child(tp, cindex, tn);
91b9a277 1079 } else {
adaf9816 1080 rcu_assign_pointer(t->trie, tn);
19baf839 1081 }
e962f302
AD
1082
1083 tp = tn;
19baf839 1084 }
91b9a277
OJ
1085
1086 if (tp && tp->pos + tp->bits > 32)
058bd4d2
JP
1087 pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1088 tp, tp->pos, tp->bits, key, plen);
91b9a277 1089
19baf839 1090 /* Rebalance the trie */
2373ce1c 1091
7b85576d 1092 trie_rebalance(t, tp);
f835e471 1093done:
19baf839
RO
1094 return fa_head;
1095}
1096
d562f1f8
RO
1097/*
1098 * Caller must hold RTNL.
1099 */
16c6cf8b 1100int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
19baf839
RO
1101{
1102 struct trie *t = (struct trie *) tb->tb_data;
1103 struct fib_alias *fa, *new_fa;
c877efb2 1104 struct list_head *fa_head = NULL;
19baf839 1105 struct fib_info *fi;
4e902c57
TG
1106 int plen = cfg->fc_dst_len;
1107 u8 tos = cfg->fc_tos;
19baf839
RO
1108 u32 key, mask;
1109 int err;
adaf9816 1110 struct tnode *l;
19baf839
RO
1111
1112 if (plen > 32)
1113 return -EINVAL;
1114
4e902c57 1115 key = ntohl(cfg->fc_dst);
19baf839 1116
2dfe55b4 1117 pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
19baf839 1118
91b9a277 1119 mask = ntohl(inet_make_mask(plen));
19baf839 1120
c877efb2 1121 if (key & ~mask)
19baf839
RO
1122 return -EINVAL;
1123
1124 key = key & mask;
1125
4e902c57
TG
1126 fi = fib_create_info(cfg);
1127 if (IS_ERR(fi)) {
1128 err = PTR_ERR(fi);
19baf839 1129 goto err;
4e902c57 1130 }
19baf839
RO
1131
1132 l = fib_find_node(t, key);
c877efb2 1133 fa = NULL;
19baf839 1134
c877efb2 1135 if (l) {
19baf839
RO
1136 fa_head = get_fa_head(l, plen);
1137 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1138 }
1139
1140 /* Now fa, if non-NULL, points to the first fib alias
1141 * with the same keys [prefix,tos,priority], if such key already
1142 * exists or to the node before which we will insert new one.
1143 *
1144 * If fa is NULL, we will need to allocate a new one and
1145 * insert to the head of f.
1146 *
1147 * If f is NULL, no fib node matched the destination key
1148 * and we need to allocate a new one of those as well.
1149 */
1150
936f6f8e
JA
1151 if (fa && fa->fa_tos == tos &&
1152 fa->fa_info->fib_priority == fi->fib_priority) {
1153 struct fib_alias *fa_first, *fa_match;
19baf839
RO
1154
1155 err = -EEXIST;
4e902c57 1156 if (cfg->fc_nlflags & NLM_F_EXCL)
19baf839
RO
1157 goto out;
1158
936f6f8e
JA
1159 /* We have 2 goals:
1160 * 1. Find exact match for type, scope, fib_info to avoid
1161 * duplicate routes
1162 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1163 */
1164 fa_match = NULL;
1165 fa_first = fa;
1166 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1167 list_for_each_entry_continue(fa, fa_head, fa_list) {
1168 if (fa->fa_tos != tos)
1169 break;
1170 if (fa->fa_info->fib_priority != fi->fib_priority)
1171 break;
1172 if (fa->fa_type == cfg->fc_type &&
936f6f8e
JA
1173 fa->fa_info == fi) {
1174 fa_match = fa;
1175 break;
1176 }
1177 }
1178
4e902c57 1179 if (cfg->fc_nlflags & NLM_F_REPLACE) {
19baf839
RO
1180 struct fib_info *fi_drop;
1181 u8 state;
1182
936f6f8e
JA
1183 fa = fa_first;
1184 if (fa_match) {
1185 if (fa == fa_match)
1186 err = 0;
6725033f 1187 goto out;
936f6f8e 1188 }
2373ce1c 1189 err = -ENOBUFS;
e94b1766 1190 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
2373ce1c
RO
1191 if (new_fa == NULL)
1192 goto out;
19baf839
RO
1193
1194 fi_drop = fa->fa_info;
2373ce1c
RO
1195 new_fa->fa_tos = fa->fa_tos;
1196 new_fa->fa_info = fi;
4e902c57 1197 new_fa->fa_type = cfg->fc_type;
19baf839 1198 state = fa->fa_state;
936f6f8e 1199 new_fa->fa_state = state & ~FA_S_ACCESSED;
19baf839 1200
2373ce1c
RO
1201 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1202 alias_free_mem_rcu(fa);
19baf839
RO
1203
1204 fib_release_info(fi_drop);
1205 if (state & FA_S_ACCESSED)
4ccfe6d4 1206 rt_cache_flush(cfg->fc_nlinfo.nl_net);
b8f55831
MK
1207 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1208 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
19baf839 1209
91b9a277 1210 goto succeeded;
19baf839
RO
1211 }
1212 /* Error if we find a perfect match which
1213 * uses the same scope, type, and nexthop
1214 * information.
1215 */
936f6f8e
JA
1216 if (fa_match)
1217 goto out;
a07f5f50 1218
4e902c57 1219 if (!(cfg->fc_nlflags & NLM_F_APPEND))
936f6f8e 1220 fa = fa_first;
19baf839
RO
1221 }
1222 err = -ENOENT;
4e902c57 1223 if (!(cfg->fc_nlflags & NLM_F_CREATE))
19baf839
RO
1224 goto out;
1225
1226 err = -ENOBUFS;
e94b1766 1227 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
19baf839
RO
1228 if (new_fa == NULL)
1229 goto out;
1230
1231 new_fa->fa_info = fi;
1232 new_fa->fa_tos = tos;
4e902c57 1233 new_fa->fa_type = cfg->fc_type;
19baf839 1234 new_fa->fa_state = 0;
19baf839
RO
1235 /*
1236 * Insert new entry to the list.
1237 */
1238
c877efb2 1239 if (!fa_head) {
fea86ad8
SH
1240 fa_head = fib_insert_node(t, key, plen);
1241 if (unlikely(!fa_head)) {
1242 err = -ENOMEM;
f835e471 1243 goto out_free_new_fa;
fea86ad8 1244 }
f835e471 1245 }
19baf839 1246
21d8c49e
DM
1247 if (!plen)
1248 tb->tb_num_default++;
1249
2373ce1c
RO
1250 list_add_tail_rcu(&new_fa->fa_list,
1251 (fa ? &fa->fa_list : fa_head));
19baf839 1252
4ccfe6d4 1253 rt_cache_flush(cfg->fc_nlinfo.nl_net);
4e902c57 1254 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
b8f55831 1255 &cfg->fc_nlinfo, 0);
19baf839
RO
1256succeeded:
1257 return 0;
f835e471
RO
1258
1259out_free_new_fa:
1260 kmem_cache_free(fn_alias_kmem, new_fa);
19baf839
RO
1261out:
1262 fib_release_info(fi);
91b9a277 1263err:
19baf839
RO
1264 return err;
1265}
1266
772cb712 1267/* should be called with rcu_read_lock */
adaf9816 1268static int check_leaf(struct fib_table *tb, struct trie *t, struct tnode *l,
22bd5b9b 1269 t_key key, const struct flowi4 *flp,
ebc0ffae 1270 struct fib_result *res, int fib_flags)
19baf839 1271{
19baf839
RO
1272 struct leaf_info *li;
1273 struct hlist_head *hhead = &l->list;
c877efb2 1274
b67bfe0d 1275 hlist_for_each_entry_rcu(li, hhead, hlist) {
3be0686b 1276 struct fib_alias *fa;
a07f5f50 1277
5c74501f 1278 if (l->key != (key & li->mask_plen))
19baf839
RO
1279 continue;
1280
3be0686b
DM
1281 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1282 struct fib_info *fi = fa->fa_info;
1283 int nhsel, err;
a07f5f50 1284
22bd5b9b 1285 if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
3be0686b 1286 continue;
dccd9ecc
DM
1287 if (fi->fib_dead)
1288 continue;
37e826c5 1289 if (fa->fa_info->fib_scope < flp->flowi4_scope)
3be0686b
DM
1290 continue;
1291 fib_alias_accessed(fa);
1292 err = fib_props[fa->fa_type].error;
9f9e636d 1293 if (unlikely(err < 0)) {
19baf839 1294#ifdef CONFIG_IP_FIB_TRIE_STATS
8274a97a 1295 this_cpu_inc(t->stats->semantic_match_passed);
3be0686b 1296#endif
1fbc7843 1297 return err;
3be0686b
DM
1298 }
1299 if (fi->fib_flags & RTNH_F_DEAD)
1300 continue;
1301 for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
1302 const struct fib_nh *nh = &fi->fib_nh[nhsel];
1303
1304 if (nh->nh_flags & RTNH_F_DEAD)
1305 continue;
22bd5b9b 1306 if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
3be0686b
DM
1307 continue;
1308
1309#ifdef CONFIG_IP_FIB_TRIE_STATS
8274a97a 1310 this_cpu_inc(t->stats->semantic_match_passed);
3be0686b 1311#endif
5c74501f 1312 res->prefixlen = li->plen;
3be0686b
DM
1313 res->nh_sel = nhsel;
1314 res->type = fa->fa_type;
9f9e636d 1315 res->scope = fi->fib_scope;
3be0686b
DM
1316 res->fi = fi;
1317 res->table = tb;
1318 res->fa_head = &li->falh;
1319 if (!(fib_flags & FIB_LOOKUP_NOREF))
5c74501f 1320 atomic_inc(&fi->fib_clntref);
3be0686b
DM
1321 return 0;
1322 }
1323 }
1324
1325#ifdef CONFIG_IP_FIB_TRIE_STATS
8274a97a 1326 this_cpu_inc(t->stats->semantic_match_miss);
19baf839 1327#endif
19baf839 1328 }
a07f5f50 1329
2e655571 1330 return 1;
19baf839
RO
1331}
1332
9f9e636d
AD
1333static inline t_key prefix_mismatch(t_key key, struct tnode *n)
1334{
1335 t_key prefix = n->key;
1336
1337 return (key ^ prefix) & (prefix | -prefix);
1338}
1339
22bd5b9b 1340int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
ebc0ffae 1341 struct fib_result *res, int fib_flags)
19baf839 1342{
9f9e636d 1343 struct trie *t = (struct trie *)tb->tb_data;
8274a97a
AD
1344#ifdef CONFIG_IP_FIB_TRIE_STATS
1345 struct trie_use_stats __percpu *stats = t->stats;
1346#endif
9f9e636d
AD
1347 const t_key key = ntohl(flp->daddr);
1348 struct tnode *n, *pn;
1349 t_key cindex;
1350 int ret = 1;
91b9a277 1351
2373ce1c 1352 rcu_read_lock();
91b9a277 1353
2373ce1c 1354 n = rcu_dereference(t->trie);
c877efb2 1355 if (!n)
19baf839
RO
1356 goto failed;
1357
1358#ifdef CONFIG_IP_FIB_TRIE_STATS
8274a97a 1359 this_cpu_inc(stats->gets);
19baf839
RO
1360#endif
1361
adaf9816 1362 pn = n;
9f9e636d
AD
1363 cindex = 0;
1364
1365 /* Step 1: Travel to the longest prefix match in the trie */
1366 for (;;) {
1367 unsigned long index = get_index(key, n);
1368
1369 /* This bit of code is a bit tricky but it combines multiple
1370 * checks into a single check. The prefix consists of the
1371 * prefix plus zeros for the "bits" in the prefix. The index
1372 * is the difference between the key and this value. From
1373 * this we can actually derive several pieces of data.
1374 * if !(index >> bits)
1375 * we know the value is child index
1376 * else
1377 * we have a mismatch in skip bits and failed
1378 */
1379 if (index >> n->bits)
1380 break;
19baf839 1381
9f9e636d
AD
1382 /* we have found a leaf. Prefixes have already been compared */
1383 if (IS_LEAF(n))
a07f5f50 1384 goto found;
19baf839 1385
9f9e636d
AD
1386 /* only record pn and cindex if we are going to be chopping
1387 * bits later. Otherwise we are just wasting cycles.
91b9a277 1388 */
9f9e636d
AD
1389 if (index) {
1390 pn = n;
1391 cindex = index;
91b9a277 1392 }
19baf839 1393
9f9e636d
AD
1394 n = rcu_dereference(n->child[index]);
1395 if (unlikely(!n))
1396 goto backtrace;
1397 }
19baf839 1398
9f9e636d
AD
1399 /* Step 2: Sort out leaves and begin backtracing for longest prefix */
1400 for (;;) {
1401 /* record the pointer where our next node pointer is stored */
1402 struct tnode __rcu **cptr = n->child;
19baf839 1403
9f9e636d
AD
1404 /* This test verifies that none of the bits that differ
1405 * between the key and the prefix exist in the region of
1406 * the lsb and higher in the prefix.
91b9a277 1407 */
9f9e636d
AD
1408 if (unlikely(prefix_mismatch(key, n)))
1409 goto backtrace;
91b9a277 1410
9f9e636d
AD
1411 /* exit out and process leaf */
1412 if (unlikely(IS_LEAF(n)))
1413 break;
91b9a277 1414
9f9e636d
AD
1415 /* Don't bother recording parent info. Since we are in
1416 * prefix match mode we will have to come back to wherever
1417 * we started this traversal anyway
91b9a277 1418 */
91b9a277 1419
9f9e636d 1420 while ((n = rcu_dereference(*cptr)) == NULL) {
19baf839 1421backtrace:
19baf839 1422#ifdef CONFIG_IP_FIB_TRIE_STATS
9f9e636d
AD
1423 if (!n)
1424 this_cpu_inc(stats->null_node_hit);
19baf839 1425#endif
9f9e636d
AD
1426 /* If we are at cindex 0 there are no more bits for
1427 * us to strip at this level so we must ascend back
1428 * up one level to see if there are any more bits to
1429 * be stripped there.
1430 */
1431 while (!cindex) {
1432 t_key pkey = pn->key;
1433
1434 pn = node_parent_rcu(pn);
1435 if (unlikely(!pn))
1436 goto failed;
1437#ifdef CONFIG_IP_FIB_TRIE_STATS
1438 this_cpu_inc(stats->backtrack);
1439#endif
1440 /* Get Child's index */
1441 cindex = get_index(pkey, pn);
1442 }
1443
1444 /* strip the least significant bit from the cindex */
1445 cindex &= cindex - 1;
1446
1447 /* grab pointer for next child node */
1448 cptr = &pn->child[cindex];
c877efb2 1449 }
19baf839 1450 }
9f9e636d 1451
19baf839 1452found:
9f9e636d
AD
1453 /* Step 3: Process the leaf, if that fails fall back to backtracing */
1454 ret = check_leaf(tb, t, n, key, flp, res, fib_flags);
1455 if (unlikely(ret > 0))
1456 goto backtrace;
1457failed:
2373ce1c 1458 rcu_read_unlock();
19baf839
RO
1459 return ret;
1460}
6fc01438 1461EXPORT_SYMBOL_GPL(fib_table_lookup);
19baf839 1462
9195bef7
SH
1463/*
1464 * Remove the leaf and return parent.
1465 */
adaf9816 1466static void trie_leaf_remove(struct trie *t, struct tnode *l)
19baf839 1467{
64c9b6fb 1468 struct tnode *tp = node_parent(l);
c877efb2 1469
9195bef7 1470 pr_debug("entering trie_leaf_remove(%p)\n", l);
19baf839 1471
c877efb2 1472 if (tp) {
9195bef7 1473 t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
61648d91 1474 put_child(tp, cindex, NULL);
7b85576d 1475 trie_rebalance(t, tp);
91b9a277 1476 } else
a9b3cd7f 1477 RCU_INIT_POINTER(t->trie, NULL);
19baf839 1478
37fd30f2 1479 node_free(l);
19baf839
RO
1480}
1481
d562f1f8
RO
1482/*
1483 * Caller must hold RTNL.
1484 */
16c6cf8b 1485int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
19baf839
RO
1486{
1487 struct trie *t = (struct trie *) tb->tb_data;
1488 u32 key, mask;
4e902c57
TG
1489 int plen = cfg->fc_dst_len;
1490 u8 tos = cfg->fc_tos;
19baf839
RO
1491 struct fib_alias *fa, *fa_to_delete;
1492 struct list_head *fa_head;
adaf9816 1493 struct tnode *l;
91b9a277
OJ
1494 struct leaf_info *li;
1495
c877efb2 1496 if (plen > 32)
19baf839
RO
1497 return -EINVAL;
1498
4e902c57 1499 key = ntohl(cfg->fc_dst);
91b9a277 1500 mask = ntohl(inet_make_mask(plen));
19baf839 1501
c877efb2 1502 if (key & ~mask)
19baf839
RO
1503 return -EINVAL;
1504
1505 key = key & mask;
1506 l = fib_find_node(t, key);
1507
c877efb2 1508 if (!l)
19baf839
RO
1509 return -ESRCH;
1510
ad5b3102
IM
1511 li = find_leaf_info(l, plen);
1512
1513 if (!li)
1514 return -ESRCH;
1515
1516 fa_head = &li->falh;
19baf839
RO
1517 fa = fib_find_alias(fa_head, tos, 0);
1518
1519 if (!fa)
1520 return -ESRCH;
1521
0c7770c7 1522 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
19baf839
RO
1523
1524 fa_to_delete = NULL;
936f6f8e
JA
1525 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1526 list_for_each_entry_continue(fa, fa_head, fa_list) {
19baf839
RO
1527 struct fib_info *fi = fa->fa_info;
1528
1529 if (fa->fa_tos != tos)
1530 break;
1531
4e902c57
TG
1532 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1533 (cfg->fc_scope == RT_SCOPE_NOWHERE ||
37e826c5 1534 fa->fa_info->fib_scope == cfg->fc_scope) &&
74cb3c10
JA
1535 (!cfg->fc_prefsrc ||
1536 fi->fib_prefsrc == cfg->fc_prefsrc) &&
4e902c57
TG
1537 (!cfg->fc_protocol ||
1538 fi->fib_protocol == cfg->fc_protocol) &&
1539 fib_nh_match(cfg, fi) == 0) {
19baf839
RO
1540 fa_to_delete = fa;
1541 break;
1542 }
1543 }
1544
91b9a277
OJ
1545 if (!fa_to_delete)
1546 return -ESRCH;
19baf839 1547
91b9a277 1548 fa = fa_to_delete;
4e902c57 1549 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
b8f55831 1550 &cfg->fc_nlinfo, 0);
91b9a277 1551
2373ce1c 1552 list_del_rcu(&fa->fa_list);
19baf839 1553
21d8c49e
DM
1554 if (!plen)
1555 tb->tb_num_default--;
1556
91b9a277 1557 if (list_empty(fa_head)) {
2373ce1c 1558 hlist_del_rcu(&li->hlist);
91b9a277 1559 free_leaf_info(li);
2373ce1c 1560 }
19baf839 1561
91b9a277 1562 if (hlist_empty(&l->list))
9195bef7 1563 trie_leaf_remove(t, l);
19baf839 1564
91b9a277 1565 if (fa->fa_state & FA_S_ACCESSED)
4ccfe6d4 1566 rt_cache_flush(cfg->fc_nlinfo.nl_net);
19baf839 1567
2373ce1c
RO
1568 fib_release_info(fa->fa_info);
1569 alias_free_mem_rcu(fa);
91b9a277 1570 return 0;
19baf839
RO
1571}
1572
ef3660ce 1573static int trie_flush_list(struct list_head *head)
19baf839
RO
1574{
1575 struct fib_alias *fa, *fa_node;
1576 int found = 0;
1577
1578 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1579 struct fib_info *fi = fa->fa_info;
19baf839 1580
2373ce1c
RO
1581 if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1582 list_del_rcu(&fa->fa_list);
1583 fib_release_info(fa->fa_info);
1584 alias_free_mem_rcu(fa);
19baf839
RO
1585 found++;
1586 }
1587 }
1588 return found;
1589}
1590
adaf9816 1591static int trie_flush_leaf(struct tnode *l)
19baf839
RO
1592{
1593 int found = 0;
1594 struct hlist_head *lih = &l->list;
b67bfe0d 1595 struct hlist_node *tmp;
19baf839
RO
1596 struct leaf_info *li = NULL;
1597
b67bfe0d 1598 hlist_for_each_entry_safe(li, tmp, lih, hlist) {
ef3660ce 1599 found += trie_flush_list(&li->falh);
19baf839
RO
1600
1601 if (list_empty(&li->falh)) {
2373ce1c 1602 hlist_del_rcu(&li->hlist);
19baf839
RO
1603 free_leaf_info(li);
1604 }
1605 }
1606 return found;
1607}
1608
82cfbb00
SH
1609/*
1610 * Scan for the next right leaf starting at node p->child[idx]
1611 * Since we have back pointer, no recursion necessary.
1612 */
adaf9816 1613static struct tnode *leaf_walk_rcu(struct tnode *p, struct tnode *c)
19baf839 1614{
82cfbb00
SH
1615 do {
1616 t_key idx;
c877efb2 1617
c877efb2 1618 if (c)
82cfbb00 1619 idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
c877efb2 1620 else
82cfbb00 1621 idx = 0;
2373ce1c 1622
82cfbb00
SH
1623 while (idx < 1u << p->bits) {
1624 c = tnode_get_child_rcu(p, idx++);
2373ce1c 1625 if (!c)
91b9a277
OJ
1626 continue;
1627
aab515d7 1628 if (IS_LEAF(c))
adaf9816 1629 return c;
82cfbb00
SH
1630
1631 /* Rescan start scanning in new node */
adaf9816 1632 p = c;
82cfbb00 1633 idx = 0;
19baf839 1634 }
82cfbb00
SH
1635
1636 /* Node empty, walk back up to parent */
adaf9816 1637 c = p;
a034ee3c 1638 } while ((p = node_parent_rcu(c)) != NULL);
82cfbb00
SH
1639
1640 return NULL; /* Root of trie */
1641}
1642
adaf9816 1643static struct tnode *trie_firstleaf(struct trie *t)
82cfbb00 1644{
adaf9816 1645 struct tnode *n = rcu_dereference_rtnl(t->trie);
82cfbb00
SH
1646
1647 if (!n)
1648 return NULL;
1649
1650 if (IS_LEAF(n)) /* trie is just a leaf */
adaf9816 1651 return n;
82cfbb00
SH
1652
1653 return leaf_walk_rcu(n, NULL);
1654}
1655
adaf9816 1656static struct tnode *trie_nextleaf(struct tnode *l)
82cfbb00 1657{
adaf9816 1658 struct tnode *p = node_parent_rcu(l);
82cfbb00
SH
1659
1660 if (!p)
1661 return NULL; /* trie with just one leaf */
1662
adaf9816 1663 return leaf_walk_rcu(p, l);
19baf839
RO
1664}
1665
adaf9816 1666static struct tnode *trie_leafindex(struct trie *t, int index)
71d67e66 1667{
adaf9816 1668 struct tnode *l = trie_firstleaf(t);
71d67e66 1669
ec28cf73 1670 while (l && index-- > 0)
71d67e66 1671 l = trie_nextleaf(l);
ec28cf73 1672
71d67e66
SH
1673 return l;
1674}
1675
1676
d562f1f8
RO
1677/*
1678 * Caller must hold RTNL.
1679 */
16c6cf8b 1680int fib_table_flush(struct fib_table *tb)
19baf839
RO
1681{
1682 struct trie *t = (struct trie *) tb->tb_data;
adaf9816 1683 struct tnode *l, *ll = NULL;
82cfbb00 1684 int found = 0;
19baf839 1685
82cfbb00 1686 for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
ef3660ce 1687 found += trie_flush_leaf(l);
19baf839
RO
1688
1689 if (ll && hlist_empty(&ll->list))
9195bef7 1690 trie_leaf_remove(t, ll);
19baf839
RO
1691 ll = l;
1692 }
1693
1694 if (ll && hlist_empty(&ll->list))
9195bef7 1695 trie_leaf_remove(t, ll);
19baf839 1696
0c7770c7 1697 pr_debug("trie_flush found=%d\n", found);
19baf839
RO
1698 return found;
1699}
1700
4aa2c466
PE
1701void fib_free_table(struct fib_table *tb)
1702{
8274a97a
AD
1703#ifdef CONFIG_IP_FIB_TRIE_STATS
1704 struct trie *t = (struct trie *)tb->tb_data;
1705
1706 free_percpu(t->stats);
1707#endif /* CONFIG_IP_FIB_TRIE_STATS */
4aa2c466
PE
1708 kfree(tb);
1709}
1710
a07f5f50
SH
1711static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1712 struct fib_table *tb,
19baf839
RO
1713 struct sk_buff *skb, struct netlink_callback *cb)
1714{
1715 int i, s_i;
1716 struct fib_alias *fa;
32ab5f80 1717 __be32 xkey = htonl(key);
19baf839 1718
71d67e66 1719 s_i = cb->args[5];
19baf839
RO
1720 i = 0;
1721
2373ce1c
RO
1722 /* rcu_read_lock is hold by caller */
1723
1724 list_for_each_entry_rcu(fa, fah, fa_list) {
19baf839
RO
1725 if (i < s_i) {
1726 i++;
1727 continue;
1728 }
19baf839 1729
15e47304 1730 if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
19baf839
RO
1731 cb->nlh->nlmsg_seq,
1732 RTM_NEWROUTE,
1733 tb->tb_id,
1734 fa->fa_type,
be403ea1 1735 xkey,
19baf839
RO
1736 plen,
1737 fa->fa_tos,
64347f78 1738 fa->fa_info, NLM_F_MULTI) < 0) {
71d67e66 1739 cb->args[5] = i;
19baf839 1740 return -1;
91b9a277 1741 }
19baf839
RO
1742 i++;
1743 }
71d67e66 1744 cb->args[5] = i;
19baf839
RO
1745 return skb->len;
1746}
1747
adaf9816 1748static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb,
a88ee229 1749 struct sk_buff *skb, struct netlink_callback *cb)
19baf839 1750{
a88ee229 1751 struct leaf_info *li;
a88ee229 1752 int i, s_i;
19baf839 1753
71d67e66 1754 s_i = cb->args[4];
a88ee229 1755 i = 0;
19baf839 1756
a88ee229 1757 /* rcu_read_lock is hold by caller */
b67bfe0d 1758 hlist_for_each_entry_rcu(li, &l->list, hlist) {
a88ee229
SH
1759 if (i < s_i) {
1760 i++;
19baf839 1761 continue;
a88ee229 1762 }
91b9a277 1763
a88ee229 1764 if (i > s_i)
71d67e66 1765 cb->args[5] = 0;
19baf839 1766
a88ee229 1767 if (list_empty(&li->falh))
19baf839
RO
1768 continue;
1769
a88ee229 1770 if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
71d67e66 1771 cb->args[4] = i;
19baf839
RO
1772 return -1;
1773 }
a88ee229 1774 i++;
19baf839 1775 }
a88ee229 1776
71d67e66 1777 cb->args[4] = i;
19baf839
RO
1778 return skb->len;
1779}
1780
16c6cf8b
SH
1781int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1782 struct netlink_callback *cb)
19baf839 1783{
adaf9816 1784 struct tnode *l;
19baf839 1785 struct trie *t = (struct trie *) tb->tb_data;
d5ce8a0e 1786 t_key key = cb->args[2];
71d67e66 1787 int count = cb->args[3];
19baf839 1788
2373ce1c 1789 rcu_read_lock();
d5ce8a0e
SH
1790 /* Dump starting at last key.
1791 * Note: 0.0.0.0/0 (ie default) is first key.
1792 */
71d67e66 1793 if (count == 0)
d5ce8a0e
SH
1794 l = trie_firstleaf(t);
1795 else {
71d67e66
SH
1796 /* Normally, continue from last key, but if that is missing
1797 * fallback to using slow rescan
1798 */
d5ce8a0e 1799 l = fib_find_node(t, key);
71d67e66
SH
1800 if (!l)
1801 l = trie_leafindex(t, count);
d5ce8a0e 1802 }
a88ee229 1803
d5ce8a0e
SH
1804 while (l) {
1805 cb->args[2] = l->key;
a88ee229 1806 if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
71d67e66 1807 cb->args[3] = count;
a88ee229 1808 rcu_read_unlock();
a88ee229 1809 return -1;
19baf839 1810 }
d5ce8a0e 1811
71d67e66 1812 ++count;
d5ce8a0e 1813 l = trie_nextleaf(l);
71d67e66
SH
1814 memset(&cb->args[4], 0,
1815 sizeof(cb->args) - 4*sizeof(cb->args[0]));
19baf839 1816 }
71d67e66 1817 cb->args[3] = count;
2373ce1c 1818 rcu_read_unlock();
a88ee229 1819
19baf839 1820 return skb->len;
19baf839
RO
1821}
1822
5348ba85 1823void __init fib_trie_init(void)
7f9b8052 1824{
a07f5f50
SH
1825 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1826 sizeof(struct fib_alias),
bc3c8c1e
SH
1827 0, SLAB_PANIC, NULL);
1828
1829 trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
adaf9816 1830 max(sizeof(struct tnode),
bc3c8c1e
SH
1831 sizeof(struct leaf_info)),
1832 0, SLAB_PANIC, NULL);
7f9b8052 1833}
19baf839 1834
7f9b8052 1835
5348ba85 1836struct fib_table *fib_trie_table(u32 id)
19baf839
RO
1837{
1838 struct fib_table *tb;
1839 struct trie *t;
1840
19baf839
RO
1841 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1842 GFP_KERNEL);
1843 if (tb == NULL)
1844 return NULL;
1845
1846 tb->tb_id = id;
971b893e 1847 tb->tb_default = -1;
21d8c49e 1848 tb->tb_num_default = 0;
19baf839
RO
1849
1850 t = (struct trie *) tb->tb_data;
8274a97a
AD
1851 RCU_INIT_POINTER(t->trie, NULL);
1852#ifdef CONFIG_IP_FIB_TRIE_STATS
1853 t->stats = alloc_percpu(struct trie_use_stats);
1854 if (!t->stats) {
1855 kfree(tb);
1856 tb = NULL;
1857 }
1858#endif
19baf839 1859
19baf839
RO
1860 return tb;
1861}
1862
cb7b593c
SH
1863#ifdef CONFIG_PROC_FS
1864/* Depth first Trie walk iterator */
1865struct fib_trie_iter {
1c340b2f 1866 struct seq_net_private p;
3d3b2d25 1867 struct fib_table *tb;
cb7b593c 1868 struct tnode *tnode;
a034ee3c
ED
1869 unsigned int index;
1870 unsigned int depth;
cb7b593c 1871};
19baf839 1872
adaf9816 1873static struct tnode *fib_trie_get_next(struct fib_trie_iter *iter)
19baf839 1874{
cb7b593c 1875 struct tnode *tn = iter->tnode;
a034ee3c 1876 unsigned int cindex = iter->index;
cb7b593c 1877 struct tnode *p;
19baf839 1878
6640e697
EB
1879 /* A single entry routing table */
1880 if (!tn)
1881 return NULL;
1882
cb7b593c
SH
1883 pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
1884 iter->tnode, iter->index, iter->depth);
1885rescan:
1886 while (cindex < (1<<tn->bits)) {
adaf9816 1887 struct tnode *n = tnode_get_child_rcu(tn, cindex);
19baf839 1888
cb7b593c
SH
1889 if (n) {
1890 if (IS_LEAF(n)) {
1891 iter->tnode = tn;
1892 iter->index = cindex + 1;
1893 } else {
1894 /* push down one level */
adaf9816 1895 iter->tnode = n;
cb7b593c
SH
1896 iter->index = 0;
1897 ++iter->depth;
1898 }
1899 return n;
1900 }
19baf839 1901
cb7b593c
SH
1902 ++cindex;
1903 }
91b9a277 1904
cb7b593c 1905 /* Current node exhausted, pop back up */
adaf9816 1906 p = node_parent_rcu(tn);
cb7b593c
SH
1907 if (p) {
1908 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
1909 tn = p;
1910 --iter->depth;
1911 goto rescan;
19baf839 1912 }
cb7b593c
SH
1913
1914 /* got root? */
1915 return NULL;
19baf839
RO
1916}
1917
adaf9816 1918static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter,
cb7b593c 1919 struct trie *t)
19baf839 1920{
adaf9816 1921 struct tnode *n;
5ddf0eb2 1922
132adf54 1923 if (!t)
5ddf0eb2
RO
1924 return NULL;
1925
1926 n = rcu_dereference(t->trie);
3d3b2d25 1927 if (!n)
5ddf0eb2 1928 return NULL;
19baf839 1929
3d3b2d25 1930 if (IS_TNODE(n)) {
adaf9816 1931 iter->tnode = n;
3d3b2d25
SH
1932 iter->index = 0;
1933 iter->depth = 1;
1934 } else {
1935 iter->tnode = NULL;
1936 iter->index = 0;
1937 iter->depth = 0;
91b9a277 1938 }
3d3b2d25
SH
1939
1940 return n;
cb7b593c 1941}
91b9a277 1942
cb7b593c
SH
1943static void trie_collect_stats(struct trie *t, struct trie_stat *s)
1944{
adaf9816 1945 struct tnode *n;
cb7b593c 1946 struct fib_trie_iter iter;
91b9a277 1947
cb7b593c 1948 memset(s, 0, sizeof(*s));
91b9a277 1949
cb7b593c 1950 rcu_read_lock();
3d3b2d25 1951 for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
cb7b593c 1952 if (IS_LEAF(n)) {
93672292 1953 struct leaf_info *li;
93672292 1954
cb7b593c
SH
1955 s->leaves++;
1956 s->totdepth += iter.depth;
1957 if (iter.depth > s->maxdepth)
1958 s->maxdepth = iter.depth;
93672292 1959
adaf9816 1960 hlist_for_each_entry_rcu(li, &n->list, hlist)
93672292 1961 ++s->prefixes;
cb7b593c 1962 } else {
cb7b593c
SH
1963 int i;
1964
1965 s->tnodes++;
adaf9816
AD
1966 if (n->bits < MAX_STAT_DEPTH)
1967 s->nodesizes[n->bits]++;
06ef921d 1968
adaf9816
AD
1969 for (i = 0; i < tnode_child_length(n); i++)
1970 if (!rcu_access_pointer(n->child[i]))
cb7b593c 1971 s->nullpointers++;
19baf839 1972 }
19baf839 1973 }
2373ce1c 1974 rcu_read_unlock();
19baf839
RO
1975}
1976
cb7b593c
SH
1977/*
1978 * This outputs /proc/net/fib_triestats
1979 */
1980static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
19baf839 1981{
a034ee3c 1982 unsigned int i, max, pointers, bytes, avdepth;
c877efb2 1983
cb7b593c
SH
1984 if (stat->leaves)
1985 avdepth = stat->totdepth*100 / stat->leaves;
1986 else
1987 avdepth = 0;
91b9a277 1988
a07f5f50
SH
1989 seq_printf(seq, "\tAver depth: %u.%02d\n",
1990 avdepth / 100, avdepth % 100);
cb7b593c 1991 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
91b9a277 1992
cb7b593c 1993 seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
adaf9816 1994 bytes = sizeof(struct tnode) * stat->leaves;
93672292
SH
1995
1996 seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
1997 bytes += sizeof(struct leaf_info) * stat->prefixes;
1998
187b5188 1999 seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
cb7b593c 2000 bytes += sizeof(struct tnode) * stat->tnodes;
19baf839 2001
06ef921d
RO
2002 max = MAX_STAT_DEPTH;
2003 while (max > 0 && stat->nodesizes[max-1] == 0)
cb7b593c 2004 max--;
19baf839 2005
cb7b593c 2006 pointers = 0;
f585a991 2007 for (i = 1; i < max; i++)
cb7b593c 2008 if (stat->nodesizes[i] != 0) {
187b5188 2009 seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
cb7b593c
SH
2010 pointers += (1<<i) * stat->nodesizes[i];
2011 }
2012 seq_putc(seq, '\n');
187b5188 2013 seq_printf(seq, "\tPointers: %u\n", pointers);
2373ce1c 2014
adaf9816 2015 bytes += sizeof(struct tnode *) * pointers;
187b5188
SH
2016 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2017 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
66a2f7fd 2018}
2373ce1c 2019
cb7b593c 2020#ifdef CONFIG_IP_FIB_TRIE_STATS
66a2f7fd 2021static void trie_show_usage(struct seq_file *seq,
8274a97a 2022 const struct trie_use_stats __percpu *stats)
66a2f7fd 2023{
8274a97a
AD
2024 struct trie_use_stats s = { 0 };
2025 int cpu;
2026
2027 /* loop through all of the CPUs and gather up the stats */
2028 for_each_possible_cpu(cpu) {
2029 const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu);
2030
2031 s.gets += pcpu->gets;
2032 s.backtrack += pcpu->backtrack;
2033 s.semantic_match_passed += pcpu->semantic_match_passed;
2034 s.semantic_match_miss += pcpu->semantic_match_miss;
2035 s.null_node_hit += pcpu->null_node_hit;
2036 s.resize_node_skipped += pcpu->resize_node_skipped;
2037 }
2038
66a2f7fd 2039 seq_printf(seq, "\nCounters:\n---------\n");
8274a97a
AD
2040 seq_printf(seq, "gets = %u\n", s.gets);
2041 seq_printf(seq, "backtracks = %u\n", s.backtrack);
a07f5f50 2042 seq_printf(seq, "semantic match passed = %u\n",
8274a97a
AD
2043 s.semantic_match_passed);
2044 seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss);
2045 seq_printf(seq, "null node hit= %u\n", s.null_node_hit);
2046 seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped);
cb7b593c 2047}
66a2f7fd
SH
2048#endif /* CONFIG_IP_FIB_TRIE_STATS */
2049
3d3b2d25 2050static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
d717a9a6 2051{
3d3b2d25
SH
2052 if (tb->tb_id == RT_TABLE_LOCAL)
2053 seq_puts(seq, "Local:\n");
2054 else if (tb->tb_id == RT_TABLE_MAIN)
2055 seq_puts(seq, "Main:\n");
2056 else
2057 seq_printf(seq, "Id %d:\n", tb->tb_id);
d717a9a6 2058}
19baf839 2059
3d3b2d25 2060
cb7b593c
SH
2061static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2062{
1c340b2f 2063 struct net *net = (struct net *)seq->private;
3d3b2d25 2064 unsigned int h;
877a9bff 2065
d717a9a6 2066 seq_printf(seq,
a07f5f50
SH
2067 "Basic info: size of leaf:"
2068 " %Zd bytes, size of tnode: %Zd bytes.\n",
adaf9816 2069 sizeof(struct tnode), sizeof(struct tnode));
d717a9a6 2070
3d3b2d25
SH
2071 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2072 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
3d3b2d25
SH
2073 struct fib_table *tb;
2074
b67bfe0d 2075 hlist_for_each_entry_rcu(tb, head, tb_hlist) {
3d3b2d25
SH
2076 struct trie *t = (struct trie *) tb->tb_data;
2077 struct trie_stat stat;
877a9bff 2078
3d3b2d25
SH
2079 if (!t)
2080 continue;
2081
2082 fib_table_print(seq, tb);
2083
2084 trie_collect_stats(t, &stat);
2085 trie_show_stats(seq, &stat);
2086#ifdef CONFIG_IP_FIB_TRIE_STATS
8274a97a 2087 trie_show_usage(seq, t->stats);
3d3b2d25
SH
2088#endif
2089 }
2090 }
19baf839 2091
cb7b593c 2092 return 0;
19baf839
RO
2093}
2094
cb7b593c 2095static int fib_triestat_seq_open(struct inode *inode, struct file *file)
19baf839 2096{
de05c557 2097 return single_open_net(inode, file, fib_triestat_seq_show);
1c340b2f
DL
2098}
2099
9a32144e 2100static const struct file_operations fib_triestat_fops = {
cb7b593c
SH
2101 .owner = THIS_MODULE,
2102 .open = fib_triestat_seq_open,
2103 .read = seq_read,
2104 .llseek = seq_lseek,
b6fcbdb4 2105 .release = single_release_net,
cb7b593c
SH
2106};
2107
adaf9816 2108static struct tnode *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
19baf839 2109{
1218854a
YH
2110 struct fib_trie_iter *iter = seq->private;
2111 struct net *net = seq_file_net(seq);
cb7b593c 2112 loff_t idx = 0;
3d3b2d25 2113 unsigned int h;
cb7b593c 2114
3d3b2d25
SH
2115 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2116 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
3d3b2d25 2117 struct fib_table *tb;
cb7b593c 2118
b67bfe0d 2119 hlist_for_each_entry_rcu(tb, head, tb_hlist) {
adaf9816 2120 struct tnode *n;
3d3b2d25
SH
2121
2122 for (n = fib_trie_get_first(iter,
2123 (struct trie *) tb->tb_data);
2124 n; n = fib_trie_get_next(iter))
2125 if (pos == idx++) {
2126 iter->tb = tb;
2127 return n;
2128 }
2129 }
cb7b593c 2130 }
3d3b2d25 2131
19baf839
RO
2132 return NULL;
2133}
2134
cb7b593c 2135static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
c95aaf9a 2136 __acquires(RCU)
19baf839 2137{
cb7b593c 2138 rcu_read_lock();
1218854a 2139 return fib_trie_get_idx(seq, *pos);
19baf839
RO
2140}
2141
cb7b593c 2142static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
19baf839 2143{
cb7b593c 2144 struct fib_trie_iter *iter = seq->private;
1218854a 2145 struct net *net = seq_file_net(seq);
3d3b2d25
SH
2146 struct fib_table *tb = iter->tb;
2147 struct hlist_node *tb_node;
2148 unsigned int h;
adaf9816 2149 struct tnode *n;
cb7b593c 2150
19baf839 2151 ++*pos;
3d3b2d25
SH
2152 /* next node in same table */
2153 n = fib_trie_get_next(iter);
2154 if (n)
2155 return n;
19baf839 2156
3d3b2d25
SH
2157 /* walk rest of this hash chain */
2158 h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
0a5c0475 2159 while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
3d3b2d25
SH
2160 tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2161 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2162 if (n)
2163 goto found;
2164 }
19baf839 2165
3d3b2d25
SH
2166 /* new hash chain */
2167 while (++h < FIB_TABLE_HASHSZ) {
2168 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
b67bfe0d 2169 hlist_for_each_entry_rcu(tb, head, tb_hlist) {
3d3b2d25
SH
2170 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2171 if (n)
2172 goto found;
2173 }
2174 }
cb7b593c 2175 return NULL;
3d3b2d25
SH
2176
2177found:
2178 iter->tb = tb;
2179 return n;
cb7b593c 2180}
19baf839 2181
cb7b593c 2182static void fib_trie_seq_stop(struct seq_file *seq, void *v)
c95aaf9a 2183 __releases(RCU)
19baf839 2184{
cb7b593c
SH
2185 rcu_read_unlock();
2186}
91b9a277 2187
cb7b593c
SH
2188static void seq_indent(struct seq_file *seq, int n)
2189{
a034ee3c
ED
2190 while (n-- > 0)
2191 seq_puts(seq, " ");
cb7b593c 2192}
19baf839 2193
28d36e37 2194static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
cb7b593c 2195{
132adf54 2196 switch (s) {
cb7b593c
SH
2197 case RT_SCOPE_UNIVERSE: return "universe";
2198 case RT_SCOPE_SITE: return "site";
2199 case RT_SCOPE_LINK: return "link";
2200 case RT_SCOPE_HOST: return "host";
2201 case RT_SCOPE_NOWHERE: return "nowhere";
2202 default:
28d36e37 2203 snprintf(buf, len, "scope=%d", s);
cb7b593c
SH
2204 return buf;
2205 }
2206}
19baf839 2207
36cbd3dc 2208static const char *const rtn_type_names[__RTN_MAX] = {
cb7b593c
SH
2209 [RTN_UNSPEC] = "UNSPEC",
2210 [RTN_UNICAST] = "UNICAST",
2211 [RTN_LOCAL] = "LOCAL",
2212 [RTN_BROADCAST] = "BROADCAST",
2213 [RTN_ANYCAST] = "ANYCAST",
2214 [RTN_MULTICAST] = "MULTICAST",
2215 [RTN_BLACKHOLE] = "BLACKHOLE",
2216 [RTN_UNREACHABLE] = "UNREACHABLE",
2217 [RTN_PROHIBIT] = "PROHIBIT",
2218 [RTN_THROW] = "THROW",
2219 [RTN_NAT] = "NAT",
2220 [RTN_XRESOLVE] = "XRESOLVE",
2221};
19baf839 2222
a034ee3c 2223static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
cb7b593c 2224{
cb7b593c
SH
2225 if (t < __RTN_MAX && rtn_type_names[t])
2226 return rtn_type_names[t];
28d36e37 2227 snprintf(buf, len, "type %u", t);
cb7b593c 2228 return buf;
19baf839
RO
2229}
2230
cb7b593c
SH
2231/* Pretty print the trie */
2232static int fib_trie_seq_show(struct seq_file *seq, void *v)
19baf839 2233{
cb7b593c 2234 const struct fib_trie_iter *iter = seq->private;
adaf9816 2235 struct tnode *n = v;
c877efb2 2236
3d3b2d25
SH
2237 if (!node_parent_rcu(n))
2238 fib_table_print(seq, iter->tb);
095b8501 2239
cb7b593c 2240 if (IS_TNODE(n)) {
adaf9816 2241 __be32 prf = htonl(n->key);
91b9a277 2242
adaf9816 2243 seq_indent(seq, iter->depth - 1);
673d57e7 2244 seq_printf(seq, " +-- %pI4/%d %d %d %d\n",
adaf9816
AD
2245 &prf, n->pos, n->bits, n->full_children,
2246 n->empty_children);
cb7b593c 2247 } else {
1328042e 2248 struct leaf_info *li;
adaf9816 2249 __be32 val = htonl(n->key);
cb7b593c
SH
2250
2251 seq_indent(seq, iter->depth);
673d57e7 2252 seq_printf(seq, " |-- %pI4\n", &val);
1328042e 2253
adaf9816 2254 hlist_for_each_entry_rcu(li, &n->list, hlist) {
1328042e
SH
2255 struct fib_alias *fa;
2256
2257 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2258 char buf1[32], buf2[32];
2259
2260 seq_indent(seq, iter->depth+1);
2261 seq_printf(seq, " /%d %s %s", li->plen,
2262 rtn_scope(buf1, sizeof(buf1),
37e826c5 2263 fa->fa_info->fib_scope),
1328042e
SH
2264 rtn_type(buf2, sizeof(buf2),
2265 fa->fa_type));
2266 if (fa->fa_tos)
b9c4d82a 2267 seq_printf(seq, " tos=%d", fa->fa_tos);
1328042e 2268 seq_putc(seq, '\n');
cb7b593c
SH
2269 }
2270 }
19baf839 2271 }
cb7b593c 2272
19baf839
RO
2273 return 0;
2274}
2275
f690808e 2276static const struct seq_operations fib_trie_seq_ops = {
cb7b593c
SH
2277 .start = fib_trie_seq_start,
2278 .next = fib_trie_seq_next,
2279 .stop = fib_trie_seq_stop,
2280 .show = fib_trie_seq_show,
19baf839
RO
2281};
2282
cb7b593c 2283static int fib_trie_seq_open(struct inode *inode, struct file *file)
19baf839 2284{
1c340b2f
DL
2285 return seq_open_net(inode, file, &fib_trie_seq_ops,
2286 sizeof(struct fib_trie_iter));
19baf839
RO
2287}
2288
9a32144e 2289static const struct file_operations fib_trie_fops = {
cb7b593c
SH
2290 .owner = THIS_MODULE,
2291 .open = fib_trie_seq_open,
2292 .read = seq_read,
2293 .llseek = seq_lseek,
1c340b2f 2294 .release = seq_release_net,
19baf839
RO
2295};
2296
8315f5d8
SH
2297struct fib_route_iter {
2298 struct seq_net_private p;
2299 struct trie *main_trie;
2300 loff_t pos;
2301 t_key key;
2302};
2303
adaf9816 2304static struct tnode *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
8315f5d8 2305{
adaf9816 2306 struct tnode *l = NULL;
8315f5d8
SH
2307 struct trie *t = iter->main_trie;
2308
2309 /* use cache location of last found key */
2310 if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2311 pos -= iter->pos;
2312 else {
2313 iter->pos = 0;
2314 l = trie_firstleaf(t);
2315 }
2316
2317 while (l && pos-- > 0) {
2318 iter->pos++;
2319 l = trie_nextleaf(l);
2320 }
2321
2322 if (l)
2323 iter->key = pos; /* remember it */
2324 else
2325 iter->pos = 0; /* forget it */
2326
2327 return l;
2328}
2329
2330static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2331 __acquires(RCU)
2332{
2333 struct fib_route_iter *iter = seq->private;
2334 struct fib_table *tb;
2335
2336 rcu_read_lock();
1218854a 2337 tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
8315f5d8
SH
2338 if (!tb)
2339 return NULL;
2340
2341 iter->main_trie = (struct trie *) tb->tb_data;
2342 if (*pos == 0)
2343 return SEQ_START_TOKEN;
2344 else
2345 return fib_route_get_idx(iter, *pos - 1);
2346}
2347
2348static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2349{
2350 struct fib_route_iter *iter = seq->private;
adaf9816 2351 struct tnode *l = v;
8315f5d8
SH
2352
2353 ++*pos;
2354 if (v == SEQ_START_TOKEN) {
2355 iter->pos = 0;
2356 l = trie_firstleaf(iter->main_trie);
2357 } else {
2358 iter->pos++;
2359 l = trie_nextleaf(l);
2360 }
2361
2362 if (l)
2363 iter->key = l->key;
2364 else
2365 iter->pos = 0;
2366 return l;
2367}
2368
2369static void fib_route_seq_stop(struct seq_file *seq, void *v)
2370 __releases(RCU)
2371{
2372 rcu_read_unlock();
2373}
2374
a034ee3c 2375static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
19baf839 2376{
a034ee3c 2377 unsigned int flags = 0;
19baf839 2378
a034ee3c
ED
2379 if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
2380 flags = RTF_REJECT;
cb7b593c
SH
2381 if (fi && fi->fib_nh->nh_gw)
2382 flags |= RTF_GATEWAY;
32ab5f80 2383 if (mask == htonl(0xFFFFFFFF))
cb7b593c
SH
2384 flags |= RTF_HOST;
2385 flags |= RTF_UP;
2386 return flags;
19baf839
RO
2387}
2388
cb7b593c
SH
2389/*
2390 * This outputs /proc/net/route.
2391 * The format of the file is not supposed to be changed
a034ee3c 2392 * and needs to be same as fib_hash output to avoid breaking
cb7b593c
SH
2393 * legacy utilities
2394 */
2395static int fib_route_seq_show(struct seq_file *seq, void *v)
19baf839 2396{
adaf9816 2397 struct tnode *l = v;
1328042e 2398 struct leaf_info *li;
19baf839 2399
cb7b593c
SH
2400 if (v == SEQ_START_TOKEN) {
2401 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2402 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2403 "\tWindow\tIRTT");
2404 return 0;
2405 }
19baf839 2406
b67bfe0d 2407 hlist_for_each_entry_rcu(li, &l->list, hlist) {
cb7b593c 2408 struct fib_alias *fa;
32ab5f80 2409 __be32 mask, prefix;
91b9a277 2410
cb7b593c
SH
2411 mask = inet_make_mask(li->plen);
2412 prefix = htonl(l->key);
19baf839 2413
cb7b593c 2414 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1371e37d 2415 const struct fib_info *fi = fa->fa_info;
a034ee3c 2416 unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
19baf839 2417
cb7b593c
SH
2418 if (fa->fa_type == RTN_BROADCAST
2419 || fa->fa_type == RTN_MULTICAST)
2420 continue;
19baf839 2421
652586df
TH
2422 seq_setwidth(seq, 127);
2423
cb7b593c 2424 if (fi)
5e659e4c
PE
2425 seq_printf(seq,
2426 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
652586df 2427 "%d\t%08X\t%d\t%u\t%u",
cb7b593c
SH
2428 fi->fib_dev ? fi->fib_dev->name : "*",
2429 prefix,
2430 fi->fib_nh->nh_gw, flags, 0, 0,
2431 fi->fib_priority,
2432 mask,
a07f5f50
SH
2433 (fi->fib_advmss ?
2434 fi->fib_advmss + 40 : 0),
cb7b593c 2435 fi->fib_window,
652586df 2436 fi->fib_rtt >> 3);
cb7b593c 2437 else
5e659e4c
PE
2438 seq_printf(seq,
2439 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
652586df 2440 "%d\t%08X\t%d\t%u\t%u",
cb7b593c 2441 prefix, 0, flags, 0, 0, 0,
652586df 2442 mask, 0, 0, 0);
19baf839 2443
652586df 2444 seq_pad(seq, '\n');
cb7b593c 2445 }
19baf839
RO
2446 }
2447
2448 return 0;
2449}
2450
f690808e 2451static const struct seq_operations fib_route_seq_ops = {
8315f5d8
SH
2452 .start = fib_route_seq_start,
2453 .next = fib_route_seq_next,
2454 .stop = fib_route_seq_stop,
cb7b593c 2455 .show = fib_route_seq_show,
19baf839
RO
2456};
2457
cb7b593c 2458static int fib_route_seq_open(struct inode *inode, struct file *file)
19baf839 2459{
1c340b2f 2460 return seq_open_net(inode, file, &fib_route_seq_ops,
8315f5d8 2461 sizeof(struct fib_route_iter));
19baf839
RO
2462}
2463
9a32144e 2464static const struct file_operations fib_route_fops = {
cb7b593c
SH
2465 .owner = THIS_MODULE,
2466 .open = fib_route_seq_open,
2467 .read = seq_read,
2468 .llseek = seq_lseek,
1c340b2f 2469 .release = seq_release_net,
19baf839
RO
2470};
2471
61a02653 2472int __net_init fib_proc_init(struct net *net)
19baf839 2473{
d4beaa66 2474 if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
cb7b593c
SH
2475 goto out1;
2476
d4beaa66
G
2477 if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
2478 &fib_triestat_fops))
cb7b593c
SH
2479 goto out2;
2480
d4beaa66 2481 if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
cb7b593c
SH
2482 goto out3;
2483
19baf839 2484 return 0;
cb7b593c
SH
2485
2486out3:
ece31ffd 2487 remove_proc_entry("fib_triestat", net->proc_net);
cb7b593c 2488out2:
ece31ffd 2489 remove_proc_entry("fib_trie", net->proc_net);
cb7b593c
SH
2490out1:
2491 return -ENOMEM;
19baf839
RO
2492}
2493
61a02653 2494void __net_exit fib_proc_exit(struct net *net)
19baf839 2495{
ece31ffd
G
2496 remove_proc_entry("fib_trie", net->proc_net);
2497 remove_proc_entry("fib_triestat", net->proc_net);
2498 remove_proc_entry("route", net->proc_net);
19baf839
RO
2499}
2500
2501#endif /* CONFIG_PROC_FS */