]> git.proxmox.com Git - mirror_ubuntu-jammy-kernel.git/blame - net/ipv4/fib_trie.c
fib_trie: Optimize fib_table_insert
[mirror_ubuntu-jammy-kernel.git] / net / ipv4 / fib_trie.c
CommitLineData
19baf839
RO
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
e905a9ed 10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
19baf839 11 * Agricultural Sciences.
e905a9ed 12 *
19baf839
RO
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
25985edc 15 * This work is based on the LPC-trie which is originally described in:
e905a9ed 16 *
19baf839
RO
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
631dd1a8 19 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
19baf839
RO
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
19baf839
RO
25 *
26 * Code from fib_hash has been reused which includes the following header:
27 *
28 *
29 * INET An implementation of the TCP/IP protocol suite for the LINUX
30 * operating system. INET is implemented using the BSD Socket
31 * interface as the means of communication with the user level.
32 *
33 * IPv4 FIB: lookup engine and maintenance routines.
34 *
35 *
36 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
37 *
38 * This program is free software; you can redistribute it and/or
39 * modify it under the terms of the GNU General Public License
40 * as published by the Free Software Foundation; either version
41 * 2 of the License, or (at your option) any later version.
fd966255
RO
42 *
43 * Substantial contributions to this work comes from:
44 *
45 * David S. Miller, <davem@davemloft.net>
46 * Stephen Hemminger <shemminger@osdl.org>
47 * Paul E. McKenney <paulmck@us.ibm.com>
48 * Patrick McHardy <kaber@trash.net>
19baf839
RO
49 */
50
80b71b80 51#define VERSION "0.409"
19baf839 52
19baf839 53#include <asm/uaccess.h>
1977f032 54#include <linux/bitops.h>
19baf839
RO
55#include <linux/types.h>
56#include <linux/kernel.h>
19baf839
RO
57#include <linux/mm.h>
58#include <linux/string.h>
59#include <linux/socket.h>
60#include <linux/sockios.h>
61#include <linux/errno.h>
62#include <linux/in.h>
63#include <linux/inet.h>
cd8787ab 64#include <linux/inetdevice.h>
19baf839
RO
65#include <linux/netdevice.h>
66#include <linux/if_arp.h>
67#include <linux/proc_fs.h>
2373ce1c 68#include <linux/rcupdate.h>
19baf839
RO
69#include <linux/skbuff.h>
70#include <linux/netlink.h>
71#include <linux/init.h>
72#include <linux/list.h>
5a0e3ad6 73#include <linux/slab.h>
bc3b2d7f 74#include <linux/export.h>
457c4cbc 75#include <net/net_namespace.h>
19baf839
RO
76#include <net/ip.h>
77#include <net/protocol.h>
78#include <net/route.h>
79#include <net/tcp.h>
80#include <net/sock.h>
81#include <net/ip_fib.h>
82#include "fib_lookup.h"
83
06ef921d 84#define MAX_STAT_DEPTH 32
19baf839 85
19baf839 86#define KEYLENGTH (8*sizeof(t_key))
19baf839 87
19baf839
RO
88typedef unsigned int t_key;
89
64c9b6fb
AD
90#define IS_TNODE(n) ((n)->bits)
91#define IS_LEAF(n) (!(n)->bits)
2373ce1c 92
9f9e636d
AD
93#define get_shift(_kv) (KEYLENGTH - (_kv)->pos - (_kv)->bits)
94#define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> get_shift(_kv))
95
64c9b6fb
AD
96struct tnode {
97 t_key key;
98 unsigned char bits; /* 2log(KEYLENGTH) bits needed */
99 unsigned char pos; /* 2log(KEYLENGTH) bits needed */
100 struct tnode __rcu *parent;
37fd30f2 101 struct rcu_head rcu;
adaf9816
AD
102 union {
103 /* The fields in this struct are valid if bits > 0 (TNODE) */
104 struct {
105 unsigned int full_children; /* KEYLENGTH bits needed */
106 unsigned int empty_children; /* KEYLENGTH bits needed */
107 struct tnode __rcu *child[0];
108 };
109 /* This list pointer if valid if bits == 0 (LEAF) */
110 struct hlist_head list;
111 };
19baf839
RO
112};
113
114struct leaf_info {
115 struct hlist_node hlist;
116 int plen;
5c74501f 117 u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
19baf839 118 struct list_head falh;
5c74501f 119 struct rcu_head rcu;
19baf839
RO
120};
121
19baf839
RO
122#ifdef CONFIG_IP_FIB_TRIE_STATS
123struct trie_use_stats {
124 unsigned int gets;
125 unsigned int backtrack;
126 unsigned int semantic_match_passed;
127 unsigned int semantic_match_miss;
128 unsigned int null_node_hit;
2f36895a 129 unsigned int resize_node_skipped;
19baf839
RO
130};
131#endif
132
133struct trie_stat {
134 unsigned int totdepth;
135 unsigned int maxdepth;
136 unsigned int tnodes;
137 unsigned int leaves;
138 unsigned int nullpointers;
93672292 139 unsigned int prefixes;
06ef921d 140 unsigned int nodesizes[MAX_STAT_DEPTH];
c877efb2 141};
19baf839
RO
142
143struct trie {
adaf9816 144 struct tnode __rcu *trie;
19baf839 145#ifdef CONFIG_IP_FIB_TRIE_STATS
8274a97a 146 struct trie_use_stats __percpu *stats;
19baf839 147#endif
19baf839
RO
148};
149
adaf9816 150static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n,
a07f5f50 151 int wasfull);
adaf9816 152static struct tnode *resize(struct trie *t, struct tnode *tn);
2f80b3c8
RO
153static struct tnode *inflate(struct trie *t, struct tnode *tn);
154static struct tnode *halve(struct trie *t, struct tnode *tn);
e0f7cb8c 155/* tnodes to free after resize(); protected by RTNL */
37fd30f2 156static struct callback_head *tnode_free_head;
c3059477
JP
157static size_t tnode_free_size;
158
159/*
160 * synchronize_rcu after call_rcu for that many pages; it should be especially
161 * useful before resizing the root node with PREEMPT_NONE configs; the value was
162 * obtained experimentally, aiming to avoid visible slowdown.
163 */
164static const int sync_pages = 128;
19baf839 165
e18b890b 166static struct kmem_cache *fn_alias_kmem __read_mostly;
bc3c8c1e 167static struct kmem_cache *trie_leaf_kmem __read_mostly;
19baf839 168
64c9b6fb
AD
169/* caller must hold RTNL */
170#define node_parent(n) rtnl_dereference((n)->parent)
0a5c0475 171
64c9b6fb
AD
172/* caller must hold RCU read lock or RTNL */
173#define node_parent_rcu(n) rcu_dereference_rtnl((n)->parent)
0a5c0475 174
64c9b6fb 175/* wrapper for rcu_assign_pointer */
adaf9816 176static inline void node_set_parent(struct tnode *n, struct tnode *tp)
b59cfbf7 177{
adaf9816
AD
178 if (n)
179 rcu_assign_pointer(n->parent, tp);
06801916
SH
180}
181
64c9b6fb
AD
182#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER((n)->parent, p)
183
184/* This provides us with the number of children in this node, in the case of a
185 * leaf this will return 0 meaning none of the children are accessible.
6440cc9e 186 */
64c9b6fb 187static inline int tnode_child_length(const struct tnode *tn)
06801916 188{
64c9b6fb 189 return (1ul << tn->bits) & ~(1ul);
06801916 190}
2373ce1c 191
0a5c0475
ED
192/*
193 * caller must hold RTNL
194 */
adaf9816 195static inline struct tnode *tnode_get_child(const struct tnode *tn, unsigned int i)
b59cfbf7 196{
64c9b6fb 197 BUG_ON(i >= tnode_child_length(tn));
2373ce1c 198
0a5c0475 199 return rtnl_dereference(tn->child[i]);
b59cfbf7
ED
200}
201
0a5c0475
ED
202/*
203 * caller must hold RCU read lock or RTNL
204 */
adaf9816 205static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
19baf839 206{
64c9b6fb 207 BUG_ON(i >= tnode_child_length(tn));
19baf839 208
0a5c0475 209 return rcu_dereference_rtnl(tn->child[i]);
19baf839
RO
210}
211
3b004569 212static inline t_key mask_pfx(t_key k, unsigned int l)
ab66b4a7
SH
213{
214 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
215}
216
3b004569 217static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
19baf839 218{
91b9a277 219 if (offset < KEYLENGTH)
19baf839 220 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
91b9a277 221 else
19baf839
RO
222 return 0;
223}
224
19baf839 225/*
e905a9ed
YH
226 To understand this stuff, an understanding of keys and all their bits is
227 necessary. Every node in the trie has a key associated with it, but not
19baf839
RO
228 all of the bits in that key are significant.
229
230 Consider a node 'n' and its parent 'tp'.
231
e905a9ed
YH
232 If n is a leaf, every bit in its key is significant. Its presence is
233 necessitated by path compression, since during a tree traversal (when
234 searching for a leaf - unless we are doing an insertion) we will completely
235 ignore all skipped bits we encounter. Thus we need to verify, at the end of
236 a potentially successful search, that we have indeed been walking the
19baf839
RO
237 correct key path.
238
e905a9ed
YH
239 Note that we can never "miss" the correct key in the tree if present by
240 following the wrong path. Path compression ensures that segments of the key
241 that are the same for all keys with a given prefix are skipped, but the
242 skipped part *is* identical for each node in the subtrie below the skipped
243 bit! trie_insert() in this implementation takes care of that - note the
19baf839
RO
244 call to tkey_sub_equals() in trie_insert().
245
e905a9ed 246 if n is an internal node - a 'tnode' here, the various parts of its key
19baf839
RO
247 have many different meanings.
248
e905a9ed 249 Example:
19baf839
RO
250 _________________________________________________________________
251 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
252 -----------------------------------------------------------------
e905a9ed 253 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
19baf839
RO
254
255 _________________________________________________________________
256 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
257 -----------------------------------------------------------------
258 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
259
260 tp->pos = 7
261 tp->bits = 3
262 n->pos = 15
91b9a277 263 n->bits = 4
19baf839 264
e905a9ed
YH
265 First, let's just ignore the bits that come before the parent tp, that is
266 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
19baf839
RO
267 not use them for anything.
268
269 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
e905a9ed 270 index into the parent's child array. That is, they will be used to find
19baf839
RO
271 'n' among tp's children.
272
273 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
274 for the node n.
275
e905a9ed 276 All the bits we have seen so far are significant to the node n. The rest
19baf839
RO
277 of the bits are really not needed or indeed known in n->key.
278
e905a9ed 279 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
19baf839 280 n's child array, and will of course be different for each child.
e905a9ed 281
c877efb2 282
19baf839
RO
283 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
284 at this point.
285
286*/
287
f5026fab
DL
288static const int halve_threshold = 25;
289static const int inflate_threshold = 50;
345aa031 290static const int halve_threshold_root = 15;
80b71b80 291static const int inflate_threshold_root = 30;
2373ce1c
RO
292
293static void __alias_free_mem(struct rcu_head *head)
19baf839 294{
2373ce1c
RO
295 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
296 kmem_cache_free(fn_alias_kmem, fa);
19baf839
RO
297}
298
2373ce1c 299static inline void alias_free_mem_rcu(struct fib_alias *fa)
19baf839 300{
2373ce1c
RO
301 call_rcu(&fa->rcu, __alias_free_mem);
302}
91b9a277 303
37fd30f2 304#define TNODE_KMALLOC_MAX \
adaf9816 305 ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct tnode *))
91b9a277 306
37fd30f2 307static void __node_free_rcu(struct rcu_head *head)
387a5487 308{
adaf9816 309 struct tnode *n = container_of(head, struct tnode, rcu);
37fd30f2
AD
310
311 if (IS_LEAF(n))
312 kmem_cache_free(trie_leaf_kmem, n);
313 else if (n->bits <= TNODE_KMALLOC_MAX)
314 kfree(n);
315 else
316 vfree(n);
387a5487
SH
317}
318
37fd30f2
AD
319#define node_free(n) call_rcu(&n->rcu, __node_free_rcu)
320
2373ce1c 321static inline void free_leaf_info(struct leaf_info *leaf)
19baf839 322{
bceb0f45 323 kfree_rcu(leaf, rcu);
19baf839
RO
324}
325
8d965444 326static struct tnode *tnode_alloc(size_t size)
f0e36f8c 327{
2373ce1c 328 if (size <= PAGE_SIZE)
8d965444 329 return kzalloc(size, GFP_KERNEL);
15be75cd 330 else
7a1c8e5a 331 return vzalloc(size);
15be75cd 332}
2373ce1c 333
e0f7cb8c
JP
334static void tnode_free_safe(struct tnode *tn)
335{
336 BUG_ON(IS_LEAF(tn));
37fd30f2
AD
337 tn->rcu.next = tnode_free_head;
338 tnode_free_head = &tn->rcu;
e0f7cb8c
JP
339}
340
341static void tnode_free_flush(void)
342{
37fd30f2
AD
343 struct callback_head *head;
344
345 while ((head = tnode_free_head)) {
346 struct tnode *tn = container_of(head, struct tnode, rcu);
347
348 tnode_free_head = head->next;
349 tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]);
e0f7cb8c 350
37fd30f2 351 node_free(tn);
e0f7cb8c 352 }
c3059477
JP
353
354 if (tnode_free_size >= PAGE_SIZE * sync_pages) {
355 tnode_free_size = 0;
356 synchronize_rcu();
357 }
e0f7cb8c
JP
358}
359
adaf9816 360static struct tnode *leaf_new(t_key key)
2373ce1c 361{
adaf9816 362 struct tnode *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
2373ce1c 363 if (l) {
64c9b6fb
AD
364 l->parent = NULL;
365 /* set key and pos to reflect full key value
366 * any trailing zeros in the key should be ignored
367 * as the nodes are searched
368 */
369 l->key = key;
370 l->pos = KEYLENGTH;
371 /* set bits to 0 indicating we are not a tnode */
372 l->bits = 0;
373
2373ce1c
RO
374 INIT_HLIST_HEAD(&l->list);
375 }
376 return l;
377}
378
379static struct leaf_info *leaf_info_new(int plen)
380{
381 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
382 if (li) {
383 li->plen = plen;
5c74501f 384 li->mask_plen = ntohl(inet_make_mask(plen));
2373ce1c
RO
385 INIT_LIST_HEAD(&li->falh);
386 }
387 return li;
388}
389
a07f5f50 390static struct tnode *tnode_new(t_key key, int pos, int bits)
19baf839 391{
37fd30f2 392 size_t sz = offsetof(struct tnode, child[1 << bits]);
f0e36f8c 393 struct tnode *tn = tnode_alloc(sz);
64c9b6fb
AD
394 unsigned int shift = pos + bits;
395
396 /* verify bits and pos their msb bits clear and values are valid */
397 BUG_ON(!bits || (shift > KEYLENGTH));
19baf839 398
91b9a277 399 if (tn) {
64c9b6fb 400 tn->parent = NULL;
19baf839
RO
401 tn->pos = pos;
402 tn->bits = bits;
64c9b6fb 403 tn->key = mask_pfx(key, pos);
19baf839
RO
404 tn->full_children = 0;
405 tn->empty_children = 1<<bits;
406 }
c877efb2 407
a034ee3c 408 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
adaf9816 409 sizeof(struct tnode *) << bits);
19baf839
RO
410 return tn;
411}
412
19baf839
RO
413/*
414 * Check whether a tnode 'n' is "full", i.e. it is an internal node
415 * and no bits are skipped. See discussion in dyntree paper p. 6
416 */
417
adaf9816 418static inline int tnode_full(const struct tnode *tn, const struct tnode *n)
19baf839 419{
64c9b6fb 420 return n && IS_TNODE(n) && (n->pos == (tn->pos + tn->bits));
19baf839
RO
421}
422
61648d91 423static inline void put_child(struct tnode *tn, int i,
adaf9816 424 struct tnode *n)
19baf839
RO
425{
426 tnode_put_child_reorg(tn, i, n, -1);
427}
428
c877efb2 429 /*
19baf839
RO
430 * Add a child at position i overwriting the old value.
431 * Update the value of full_children and empty_children.
432 */
433
adaf9816 434static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n,
a07f5f50 435 int wasfull)
19baf839 436{
adaf9816 437 struct tnode *chi = rtnl_dereference(tn->child[i]);
19baf839
RO
438 int isfull;
439
0c7770c7
SH
440 BUG_ON(i >= 1<<tn->bits);
441
19baf839
RO
442 /* update emptyChildren */
443 if (n == NULL && chi != NULL)
444 tn->empty_children++;
445 else if (n != NULL && chi == NULL)
446 tn->empty_children--;
c877efb2 447
19baf839 448 /* update fullChildren */
91b9a277 449 if (wasfull == -1)
19baf839
RO
450 wasfull = tnode_full(tn, chi);
451
452 isfull = tnode_full(tn, n);
c877efb2 453 if (wasfull && !isfull)
19baf839 454 tn->full_children--;
c877efb2 455 else if (!wasfull && isfull)
19baf839 456 tn->full_children++;
91b9a277 457
64c9b6fb 458 node_set_parent(n, tn);
19baf839 459
cf778b00 460 rcu_assign_pointer(tn->child[i], n);
19baf839
RO
461}
462
836a0123
AD
463static void put_child_root(struct tnode *tp, struct trie *t,
464 t_key key, struct tnode *n)
465{
466 if (tp)
467 put_child(tp, get_index(key, tp), n);
468 else
469 rcu_assign_pointer(t->trie, n);
470}
471
80b71b80 472#define MAX_WORK 10
adaf9816 473static struct tnode *resize(struct trie *t, struct tnode *tn)
19baf839 474{
adaf9816 475 struct tnode *old_tn, *n = NULL;
e6308be8
RO
476 int inflate_threshold_use;
477 int halve_threshold_use;
80b71b80 478 int max_work;
19baf839 479
e905a9ed 480 if (!tn)
19baf839
RO
481 return NULL;
482
0c7770c7
SH
483 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
484 tn, inflate_threshold, halve_threshold);
19baf839
RO
485
486 /* No children */
64c9b6fb
AD
487 if (tn->empty_children > (tnode_child_length(tn) - 1))
488 goto no_children;
489
19baf839 490 /* One child */
64c9b6fb 491 if (tn->empty_children == (tnode_child_length(tn) - 1))
80b71b80 492 goto one_child;
c877efb2 493 /*
19baf839
RO
494 * Double as long as the resulting node has a number of
495 * nonempty nodes that are above the threshold.
496 */
497
498 /*
c877efb2
SH
499 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
500 * the Helsinki University of Technology and Matti Tikkanen of Nokia
19baf839 501 * Telecommunications, page 6:
c877efb2 502 * "A node is doubled if the ratio of non-empty children to all
19baf839
RO
503 * children in the *doubled* node is at least 'high'."
504 *
c877efb2
SH
505 * 'high' in this instance is the variable 'inflate_threshold'. It
506 * is expressed as a percentage, so we multiply it with
507 * tnode_child_length() and instead of multiplying by 2 (since the
508 * child array will be doubled by inflate()) and multiplying
509 * the left-hand side by 100 (to handle the percentage thing) we
19baf839 510 * multiply the left-hand side by 50.
c877efb2
SH
511 *
512 * The left-hand side may look a bit weird: tnode_child_length(tn)
513 * - tn->empty_children is of course the number of non-null children
514 * in the current node. tn->full_children is the number of "full"
19baf839 515 * children, that is non-null tnodes with a skip value of 0.
c877efb2 516 * All of those will be doubled in the resulting inflated tnode, so
19baf839 517 * we just count them one extra time here.
c877efb2 518 *
19baf839 519 * A clearer way to write this would be:
c877efb2 520 *
19baf839 521 * to_be_doubled = tn->full_children;
c877efb2 522 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
19baf839
RO
523 * tn->full_children;
524 *
525 * new_child_length = tnode_child_length(tn) * 2;
526 *
c877efb2 527 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
19baf839
RO
528 * new_child_length;
529 * if (new_fill_factor >= inflate_threshold)
c877efb2
SH
530 *
531 * ...and so on, tho it would mess up the while () loop.
532 *
19baf839
RO
533 * anyway,
534 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
535 * inflate_threshold
c877efb2 536 *
19baf839
RO
537 * avoid a division:
538 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
539 * inflate_threshold * new_child_length
c877efb2 540 *
19baf839 541 * expand not_to_be_doubled and to_be_doubled, and shorten:
c877efb2 542 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 543 * tn->full_children) >= inflate_threshold * new_child_length
c877efb2 544 *
19baf839 545 * expand new_child_length:
c877efb2 546 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 547 * tn->full_children) >=
19baf839 548 * inflate_threshold * tnode_child_length(tn) * 2
c877efb2 549 *
19baf839 550 * shorten again:
c877efb2 551 * 50 * (tn->full_children + tnode_child_length(tn) -
91b9a277 552 * tn->empty_children) >= inflate_threshold *
19baf839 553 * tnode_child_length(tn)
c877efb2 554 *
19baf839
RO
555 */
556
e6308be8
RO
557 /* Keep root node larger */
558
64c9b6fb 559 if (!node_parent(tn)) {
80b71b80
JL
560 inflate_threshold_use = inflate_threshold_root;
561 halve_threshold_use = halve_threshold_root;
a034ee3c 562 } else {
e6308be8 563 inflate_threshold_use = inflate_threshold;
80b71b80
JL
564 halve_threshold_use = halve_threshold;
565 }
e6308be8 566
80b71b80
JL
567 max_work = MAX_WORK;
568 while ((tn->full_children > 0 && max_work-- &&
a07f5f50
SH
569 50 * (tn->full_children + tnode_child_length(tn)
570 - tn->empty_children)
571 >= inflate_threshold_use * tnode_child_length(tn))) {
19baf839 572
2f80b3c8
RO
573 old_tn = tn;
574 tn = inflate(t, tn);
a07f5f50 575
2f80b3c8
RO
576 if (IS_ERR(tn)) {
577 tn = old_tn;
2f36895a 578#ifdef CONFIG_IP_FIB_TRIE_STATS
8274a97a 579 this_cpu_inc(t->stats->resize_node_skipped);
2f36895a
RO
580#endif
581 break;
582 }
19baf839
RO
583 }
584
80b71b80 585 /* Return if at least one inflate is run */
a034ee3c 586 if (max_work != MAX_WORK)
adaf9816 587 return tn;
80b71b80 588
19baf839
RO
589 /*
590 * Halve as long as the number of empty children in this
591 * node is above threshold.
592 */
2f36895a 593
80b71b80
JL
594 max_work = MAX_WORK;
595 while (tn->bits > 1 && max_work-- &&
19baf839 596 100 * (tnode_child_length(tn) - tn->empty_children) <
e6308be8 597 halve_threshold_use * tnode_child_length(tn)) {
2f36895a 598
2f80b3c8
RO
599 old_tn = tn;
600 tn = halve(t, tn);
601 if (IS_ERR(tn)) {
602 tn = old_tn;
2f36895a 603#ifdef CONFIG_IP_FIB_TRIE_STATS
8274a97a 604 this_cpu_inc(t->stats->resize_node_skipped);
2f36895a
RO
605#endif
606 break;
607 }
608 }
19baf839 609
c877efb2 610
19baf839 611 /* Only one child remains */
64c9b6fb
AD
612 if (tn->empty_children == (tnode_child_length(tn) - 1)) {
613 unsigned long i;
80b71b80 614one_child:
64c9b6fb
AD
615 for (i = tnode_child_length(tn); !n && i;)
616 n = tnode_get_child(tn, --i);
617no_children:
618 /* compress one level */
619 node_set_parent(n, NULL);
620 tnode_free_safe(tn);
621 return n;
80b71b80 622 }
adaf9816 623 return tn;
19baf839
RO
624}
625
0a5c0475
ED
626
627static void tnode_clean_free(struct tnode *tn)
628{
adaf9816 629 struct tnode *tofree;
0a5c0475 630 int i;
0a5c0475
ED
631
632 for (i = 0; i < tnode_child_length(tn); i++) {
37fd30f2 633 tofree = rtnl_dereference(tn->child[i]);
0a5c0475 634 if (tofree)
37fd30f2 635 node_free(tofree);
0a5c0475 636 }
37fd30f2 637 node_free(tn);
0a5c0475
ED
638}
639
adaf9816 640static struct tnode *inflate(struct trie *t, struct tnode *oldtnode)
19baf839 641{
adaf9816
AD
642 int olen = tnode_child_length(oldtnode);
643 struct tnode *tn;
19baf839
RO
644 int i;
645
0c7770c7 646 pr_debug("In inflate\n");
19baf839
RO
647
648 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
649
0c7770c7 650 if (!tn)
2f80b3c8 651 return ERR_PTR(-ENOMEM);
2f36895a
RO
652
653 /*
c877efb2
SH
654 * Preallocate and store tnodes before the actual work so we
655 * don't get into an inconsistent state if memory allocation
656 * fails. In case of failure we return the oldnode and inflate
2f36895a
RO
657 * of tnode is ignored.
658 */
91b9a277
OJ
659
660 for (i = 0; i < olen; i++) {
a07f5f50 661 struct tnode *inode;
2f36895a 662
adaf9816
AD
663 inode = tnode_get_child(oldtnode, i);
664 if (tnode_full(oldtnode, inode) && inode->bits > 1) {
2f36895a 665 struct tnode *left, *right;
ab66b4a7 666 t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
c877efb2 667
2f36895a
RO
668 left = tnode_new(inode->key&(~m), inode->pos + 1,
669 inode->bits - 1);
2f80b3c8
RO
670 if (!left)
671 goto nomem;
91b9a277 672
2f36895a
RO
673 right = tnode_new(inode->key|m, inode->pos + 1,
674 inode->bits - 1);
675
e905a9ed 676 if (!right) {
37fd30f2 677 node_free(left);
2f80b3c8 678 goto nomem;
e905a9ed 679 }
2f36895a 680
adaf9816
AD
681 put_child(tn, 2*i, left);
682 put_child(tn, 2*i+1, right);
2f36895a
RO
683 }
684 }
685
91b9a277 686 for (i = 0; i < olen; i++) {
adaf9816 687 struct tnode *inode = tnode_get_child(oldtnode, i);
91b9a277
OJ
688 struct tnode *left, *right;
689 int size, j;
c877efb2 690
19baf839 691 /* An empty child */
adaf9816 692 if (inode == NULL)
19baf839
RO
693 continue;
694
695 /* A leaf or an internal node with skipped bits */
adaf9816 696 if (!tnode_full(oldtnode, inode)) {
bbe34cf8 697 put_child(tn,
adaf9816
AD
698 tkey_extract_bits(inode->key, tn->pos, tn->bits),
699 inode);
19baf839
RO
700 continue;
701 }
702
703 /* An internal node with two children */
19baf839 704 if (inode->bits == 1) {
61648d91
LM
705 put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
706 put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
19baf839 707
e0f7cb8c 708 tnode_free_safe(inode);
91b9a277 709 continue;
19baf839
RO
710 }
711
91b9a277
OJ
712 /* An internal node with more than two children */
713
714 /* We will replace this node 'inode' with two new
715 * ones, 'left' and 'right', each with half of the
716 * original children. The two new nodes will have
717 * a position one bit further down the key and this
718 * means that the "significant" part of their keys
719 * (see the discussion near the top of this file)
720 * will differ by one bit, which will be "0" in
721 * left's key and "1" in right's key. Since we are
722 * moving the key position by one step, the bit that
723 * we are moving away from - the bit at position
724 * (inode->pos) - is the one that will differ between
725 * left and right. So... we synthesize that bit in the
726 * two new keys.
727 * The mask 'm' below will be a single "one" bit at
728 * the position (inode->pos)
729 */
19baf839 730
91b9a277
OJ
731 /* Use the old key, but set the new significant
732 * bit to zero.
733 */
2f36895a 734
adaf9816 735 left = tnode_get_child(tn, 2*i);
61648d91 736 put_child(tn, 2*i, NULL);
2f36895a 737
91b9a277 738 BUG_ON(!left);
2f36895a 739
adaf9816 740 right = tnode_get_child(tn, 2*i+1);
61648d91 741 put_child(tn, 2*i+1, NULL);
19baf839 742
91b9a277 743 BUG_ON(!right);
19baf839 744
91b9a277
OJ
745 size = tnode_child_length(left);
746 for (j = 0; j < size; j++) {
61648d91
LM
747 put_child(left, j, rtnl_dereference(inode->child[j]));
748 put_child(right, j, rtnl_dereference(inode->child[j + size]));
19baf839 749 }
61648d91
LM
750 put_child(tn, 2*i, resize(t, left));
751 put_child(tn, 2*i+1, resize(t, right));
91b9a277 752
e0f7cb8c 753 tnode_free_safe(inode);
19baf839 754 }
e0f7cb8c 755 tnode_free_safe(oldtnode);
19baf839 756 return tn;
2f80b3c8 757nomem:
0a5c0475
ED
758 tnode_clean_free(tn);
759 return ERR_PTR(-ENOMEM);
19baf839
RO
760}
761
adaf9816 762static struct tnode *halve(struct trie *t, struct tnode *oldtnode)
19baf839 763{
adaf9816
AD
764 int olen = tnode_child_length(oldtnode);
765 struct tnode *tn, *left, *right;
19baf839 766 int i;
19baf839 767
0c7770c7 768 pr_debug("In halve\n");
c877efb2
SH
769
770 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
19baf839 771
2f80b3c8
RO
772 if (!tn)
773 return ERR_PTR(-ENOMEM);
2f36895a
RO
774
775 /*
c877efb2
SH
776 * Preallocate and store tnodes before the actual work so we
777 * don't get into an inconsistent state if memory allocation
778 * fails. In case of failure we return the oldnode and halve
2f36895a
RO
779 * of tnode is ignored.
780 */
781
91b9a277 782 for (i = 0; i < olen; i += 2) {
2f36895a
RO
783 left = tnode_get_child(oldtnode, i);
784 right = tnode_get_child(oldtnode, i+1);
c877efb2 785
2f36895a 786 /* Two nonempty children */
0c7770c7 787 if (left && right) {
2f80b3c8 788 struct tnode *newn;
0c7770c7 789
2f80b3c8 790 newn = tnode_new(left->key, tn->pos + tn->bits, 1);
0c7770c7
SH
791
792 if (!newn)
2f80b3c8 793 goto nomem;
0c7770c7 794
adaf9816 795 put_child(tn, i/2, newn);
2f36895a 796 }
2f36895a 797
2f36895a 798 }
19baf839 799
91b9a277
OJ
800 for (i = 0; i < olen; i += 2) {
801 struct tnode *newBinNode;
802
19baf839
RO
803 left = tnode_get_child(oldtnode, i);
804 right = tnode_get_child(oldtnode, i+1);
c877efb2 805
19baf839
RO
806 /* At least one of the children is empty */
807 if (left == NULL) {
808 if (right == NULL) /* Both are empty */
809 continue;
61648d91 810 put_child(tn, i/2, right);
91b9a277 811 continue;
0c7770c7 812 }
91b9a277
OJ
813
814 if (right == NULL) {
61648d91 815 put_child(tn, i/2, left);
91b9a277
OJ
816 continue;
817 }
c877efb2 818
19baf839 819 /* Two nonempty children */
adaf9816 820 newBinNode = tnode_get_child(tn, i/2);
61648d91
LM
821 put_child(tn, i/2, NULL);
822 put_child(newBinNode, 0, left);
823 put_child(newBinNode, 1, right);
824 put_child(tn, i/2, resize(t, newBinNode));
19baf839 825 }
e0f7cb8c 826 tnode_free_safe(oldtnode);
19baf839 827 return tn;
2f80b3c8 828nomem:
0a5c0475
ED
829 tnode_clean_free(tn);
830 return ERR_PTR(-ENOMEM);
19baf839
RO
831}
832
772cb712 833/* readside must use rcu_read_lock currently dump routines
2373ce1c
RO
834 via get_fa_head and dump */
835
adaf9816 836static struct leaf_info *find_leaf_info(struct tnode *l, int plen)
19baf839 837{
772cb712 838 struct hlist_head *head = &l->list;
19baf839
RO
839 struct leaf_info *li;
840
b67bfe0d 841 hlist_for_each_entry_rcu(li, head, hlist)
c877efb2 842 if (li->plen == plen)
19baf839 843 return li;
91b9a277 844
19baf839
RO
845 return NULL;
846}
847
adaf9816 848static inline struct list_head *get_fa_head(struct tnode *l, int plen)
19baf839 849{
772cb712 850 struct leaf_info *li = find_leaf_info(l, plen);
c877efb2 851
91b9a277
OJ
852 if (!li)
853 return NULL;
c877efb2 854
91b9a277 855 return &li->falh;
19baf839
RO
856}
857
858static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
859{
e905a9ed 860 struct leaf_info *li = NULL, *last = NULL;
e905a9ed
YH
861
862 if (hlist_empty(head)) {
863 hlist_add_head_rcu(&new->hlist, head);
864 } else {
b67bfe0d 865 hlist_for_each_entry(li, head, hlist) {
e905a9ed
YH
866 if (new->plen > li->plen)
867 break;
868
869 last = li;
870 }
871 if (last)
1d023284 872 hlist_add_behind_rcu(&new->hlist, &last->hlist);
e905a9ed
YH
873 else
874 hlist_add_before_rcu(&new->hlist, &li->hlist);
875 }
19baf839
RO
876}
877
2373ce1c 878/* rcu_read_lock needs to be hold by caller from readside */
adaf9816 879static struct tnode *fib_find_node(struct trie *t, u32 key)
19baf839 880{
adaf9816 881 struct tnode *n = rcu_dereference_rtnl(t->trie);
939afb06
AD
882
883 while (n) {
884 unsigned long index = get_index(key, n);
885
886 /* This bit of code is a bit tricky but it combines multiple
887 * checks into a single check. The prefix consists of the
888 * prefix plus zeros for the bits in the cindex. The index
889 * is the difference between the key and this value. From
890 * this we can actually derive several pieces of data.
891 * if !(index >> bits)
892 * we know the value is cindex
893 * else
894 * we have a mismatch in skip bits and failed
895 */
896 if (index >> n->bits)
897 return NULL;
898
899 /* we have found a leaf. Prefixes have already been compared */
900 if (IS_LEAF(n))
19baf839 901 break;
19baf839 902
939afb06
AD
903 n = rcu_dereference_rtnl(n->child[index]);
904 }
91b9a277 905
939afb06 906 return n;
19baf839
RO
907}
908
7b85576d 909static void trie_rebalance(struct trie *t, struct tnode *tn)
19baf839 910{
19baf839 911 int wasfull;
3ed18d76 912 t_key cindex, key;
06801916 913 struct tnode *tp;
19baf839 914
3ed18d76
RO
915 key = tn->key;
916
64c9b6fb 917 while (tn != NULL && (tp = node_parent(tn)) != NULL) {
19baf839
RO
918 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
919 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
adaf9816 920 tn = resize(t, tn);
a07f5f50 921
adaf9816 922 tnode_put_child_reorg(tp, cindex, tn, wasfull);
91b9a277 923
64c9b6fb 924 tp = node_parent(tn);
008440e3 925 if (!tp)
adaf9816 926 rcu_assign_pointer(t->trie, tn);
008440e3 927
e0f7cb8c 928 tnode_free_flush();
06801916 929 if (!tp)
19baf839 930 break;
06801916 931 tn = tp;
19baf839 932 }
06801916 933
19baf839 934 /* Handle last (top) tnode */
7b85576d 935 if (IS_TNODE(tn))
adaf9816 936 tn = resize(t, tn);
19baf839 937
adaf9816 938 rcu_assign_pointer(t->trie, tn);
7b85576d 939 tnode_free_flush();
19baf839
RO
940}
941
2373ce1c
RO
942/* only used from updater-side */
943
fea86ad8 944static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
19baf839 945{
c877efb2 946 struct list_head *fa_head = NULL;
836a0123 947 struct tnode *l, *n, *tp = NULL;
19baf839 948 struct leaf_info *li;
19baf839 949
836a0123
AD
950 li = leaf_info_new(plen);
951 if (!li)
952 return NULL;
953 fa_head = &li->falh;
954
0a5c0475 955 n = rtnl_dereference(t->trie);
19baf839 956
c877efb2
SH
957 /* If we point to NULL, stop. Either the tree is empty and we should
958 * just put a new leaf in if, or we have reached an empty child slot,
19baf839 959 * and we should just put our new leaf in that.
19baf839 960 *
836a0123
AD
961 * If we hit a node with a key that does't match then we should stop
962 * and create a new tnode to replace that node and insert ourselves
963 * and the other node into the new tnode.
19baf839 964 */
836a0123
AD
965 while (n) {
966 unsigned long index = get_index(key, n);
19baf839 967
836a0123
AD
968 /* This bit of code is a bit tricky but it combines multiple
969 * checks into a single check. The prefix consists of the
970 * prefix plus zeros for the "bits" in the prefix. The index
971 * is the difference between the key and this value. From
972 * this we can actually derive several pieces of data.
973 * if !(index >> bits)
974 * we know the value is child index
975 * else
976 * we have a mismatch in skip bits and failed
977 */
978 if (index >> n->bits)
19baf839 979 break;
19baf839 980
836a0123
AD
981 /* we have found a leaf. Prefixes have already been compared */
982 if (IS_LEAF(n)) {
983 /* Case 1: n is a leaf, and prefixes match*/
984 insert_leaf_info(&n->list, li);
985 return fa_head;
986 }
19baf839 987
836a0123
AD
988 tp = n;
989 n = rcu_dereference_rtnl(n->child[index]);
19baf839 990 }
19baf839 991
836a0123
AD
992 l = leaf_new(key);
993 if (!l) {
994 free_leaf_info(li);
fea86ad8 995 return NULL;
f835e471 996 }
19baf839 997
19baf839
RO
998 insert_leaf_info(&l->list, li);
999
836a0123
AD
1000 /* Case 2: n is a LEAF or a TNODE and the key doesn't match.
1001 *
1002 * Add a new tnode here
1003 * first tnode need some special handling
1004 * leaves us in position for handling as case 3
1005 */
1006 if (n) {
1007 struct tnode *tn;
1008 int newpos;
19baf839 1009
836a0123 1010 newpos = KEYLENGTH - __fls(n->key ^ key) - 1;
19baf839 1011
836a0123 1012 tn = tnode_new(key, newpos, 1);
c877efb2 1013 if (!tn) {
f835e471 1014 free_leaf_info(li);
37fd30f2 1015 node_free(l);
fea86ad8 1016 return NULL;
91b9a277
OJ
1017 }
1018
836a0123
AD
1019 /* initialize routes out of node */
1020 NODE_INIT_PARENT(tn, tp);
1021 put_child(tn, get_index(key, tn) ^ 1, n);
19baf839 1022
836a0123
AD
1023 /* start adding routes into the node */
1024 put_child_root(tp, t, key, tn);
1025 node_set_parent(n, tn);
e962f302 1026
836a0123 1027 /* parent now has a NULL spot where the leaf can go */
e962f302 1028 tp = tn;
19baf839 1029 }
91b9a277 1030
836a0123
AD
1031 /* Case 3: n is NULL, and will just insert a new leaf */
1032 if (tp) {
1033 NODE_INIT_PARENT(l, tp);
1034 put_child(tp, get_index(key, tp), l);
1035 trie_rebalance(t, tp);
1036 } else {
1037 rcu_assign_pointer(t->trie, l);
1038 }
2373ce1c 1039
19baf839
RO
1040 return fa_head;
1041}
1042
d562f1f8
RO
1043/*
1044 * Caller must hold RTNL.
1045 */
16c6cf8b 1046int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
19baf839
RO
1047{
1048 struct trie *t = (struct trie *) tb->tb_data;
1049 struct fib_alias *fa, *new_fa;
c877efb2 1050 struct list_head *fa_head = NULL;
19baf839 1051 struct fib_info *fi;
4e902c57
TG
1052 int plen = cfg->fc_dst_len;
1053 u8 tos = cfg->fc_tos;
19baf839
RO
1054 u32 key, mask;
1055 int err;
adaf9816 1056 struct tnode *l;
19baf839
RO
1057
1058 if (plen > 32)
1059 return -EINVAL;
1060
4e902c57 1061 key = ntohl(cfg->fc_dst);
19baf839 1062
2dfe55b4 1063 pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
19baf839 1064
91b9a277 1065 mask = ntohl(inet_make_mask(plen));
19baf839 1066
c877efb2 1067 if (key & ~mask)
19baf839
RO
1068 return -EINVAL;
1069
1070 key = key & mask;
1071
4e902c57
TG
1072 fi = fib_create_info(cfg);
1073 if (IS_ERR(fi)) {
1074 err = PTR_ERR(fi);
19baf839 1075 goto err;
4e902c57 1076 }
19baf839
RO
1077
1078 l = fib_find_node(t, key);
c877efb2 1079 fa = NULL;
19baf839 1080
c877efb2 1081 if (l) {
19baf839
RO
1082 fa_head = get_fa_head(l, plen);
1083 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1084 }
1085
1086 /* Now fa, if non-NULL, points to the first fib alias
1087 * with the same keys [prefix,tos,priority], if such key already
1088 * exists or to the node before which we will insert new one.
1089 *
1090 * If fa is NULL, we will need to allocate a new one and
1091 * insert to the head of f.
1092 *
1093 * If f is NULL, no fib node matched the destination key
1094 * and we need to allocate a new one of those as well.
1095 */
1096
936f6f8e
JA
1097 if (fa && fa->fa_tos == tos &&
1098 fa->fa_info->fib_priority == fi->fib_priority) {
1099 struct fib_alias *fa_first, *fa_match;
19baf839
RO
1100
1101 err = -EEXIST;
4e902c57 1102 if (cfg->fc_nlflags & NLM_F_EXCL)
19baf839
RO
1103 goto out;
1104
936f6f8e
JA
1105 /* We have 2 goals:
1106 * 1. Find exact match for type, scope, fib_info to avoid
1107 * duplicate routes
1108 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1109 */
1110 fa_match = NULL;
1111 fa_first = fa;
1112 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1113 list_for_each_entry_continue(fa, fa_head, fa_list) {
1114 if (fa->fa_tos != tos)
1115 break;
1116 if (fa->fa_info->fib_priority != fi->fib_priority)
1117 break;
1118 if (fa->fa_type == cfg->fc_type &&
936f6f8e
JA
1119 fa->fa_info == fi) {
1120 fa_match = fa;
1121 break;
1122 }
1123 }
1124
4e902c57 1125 if (cfg->fc_nlflags & NLM_F_REPLACE) {
19baf839
RO
1126 struct fib_info *fi_drop;
1127 u8 state;
1128
936f6f8e
JA
1129 fa = fa_first;
1130 if (fa_match) {
1131 if (fa == fa_match)
1132 err = 0;
6725033f 1133 goto out;
936f6f8e 1134 }
2373ce1c 1135 err = -ENOBUFS;
e94b1766 1136 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
2373ce1c
RO
1137 if (new_fa == NULL)
1138 goto out;
19baf839
RO
1139
1140 fi_drop = fa->fa_info;
2373ce1c
RO
1141 new_fa->fa_tos = fa->fa_tos;
1142 new_fa->fa_info = fi;
4e902c57 1143 new_fa->fa_type = cfg->fc_type;
19baf839 1144 state = fa->fa_state;
936f6f8e 1145 new_fa->fa_state = state & ~FA_S_ACCESSED;
19baf839 1146
2373ce1c
RO
1147 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1148 alias_free_mem_rcu(fa);
19baf839
RO
1149
1150 fib_release_info(fi_drop);
1151 if (state & FA_S_ACCESSED)
4ccfe6d4 1152 rt_cache_flush(cfg->fc_nlinfo.nl_net);
b8f55831
MK
1153 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1154 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
19baf839 1155
91b9a277 1156 goto succeeded;
19baf839
RO
1157 }
1158 /* Error if we find a perfect match which
1159 * uses the same scope, type, and nexthop
1160 * information.
1161 */
936f6f8e
JA
1162 if (fa_match)
1163 goto out;
a07f5f50 1164
4e902c57 1165 if (!(cfg->fc_nlflags & NLM_F_APPEND))
936f6f8e 1166 fa = fa_first;
19baf839
RO
1167 }
1168 err = -ENOENT;
4e902c57 1169 if (!(cfg->fc_nlflags & NLM_F_CREATE))
19baf839
RO
1170 goto out;
1171
1172 err = -ENOBUFS;
e94b1766 1173 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
19baf839
RO
1174 if (new_fa == NULL)
1175 goto out;
1176
1177 new_fa->fa_info = fi;
1178 new_fa->fa_tos = tos;
4e902c57 1179 new_fa->fa_type = cfg->fc_type;
19baf839 1180 new_fa->fa_state = 0;
19baf839
RO
1181 /*
1182 * Insert new entry to the list.
1183 */
1184
c877efb2 1185 if (!fa_head) {
fea86ad8
SH
1186 fa_head = fib_insert_node(t, key, plen);
1187 if (unlikely(!fa_head)) {
1188 err = -ENOMEM;
f835e471 1189 goto out_free_new_fa;
fea86ad8 1190 }
f835e471 1191 }
19baf839 1192
21d8c49e
DM
1193 if (!plen)
1194 tb->tb_num_default++;
1195
2373ce1c
RO
1196 list_add_tail_rcu(&new_fa->fa_list,
1197 (fa ? &fa->fa_list : fa_head));
19baf839 1198
4ccfe6d4 1199 rt_cache_flush(cfg->fc_nlinfo.nl_net);
4e902c57 1200 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
b8f55831 1201 &cfg->fc_nlinfo, 0);
19baf839
RO
1202succeeded:
1203 return 0;
f835e471
RO
1204
1205out_free_new_fa:
1206 kmem_cache_free(fn_alias_kmem, new_fa);
19baf839
RO
1207out:
1208 fib_release_info(fi);
91b9a277 1209err:
19baf839
RO
1210 return err;
1211}
1212
772cb712 1213/* should be called with rcu_read_lock */
adaf9816 1214static int check_leaf(struct fib_table *tb, struct trie *t, struct tnode *l,
22bd5b9b 1215 t_key key, const struct flowi4 *flp,
ebc0ffae 1216 struct fib_result *res, int fib_flags)
19baf839 1217{
19baf839
RO
1218 struct leaf_info *li;
1219 struct hlist_head *hhead = &l->list;
c877efb2 1220
b67bfe0d 1221 hlist_for_each_entry_rcu(li, hhead, hlist) {
3be0686b 1222 struct fib_alias *fa;
a07f5f50 1223
5c74501f 1224 if (l->key != (key & li->mask_plen))
19baf839
RO
1225 continue;
1226
3be0686b
DM
1227 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1228 struct fib_info *fi = fa->fa_info;
1229 int nhsel, err;
a07f5f50 1230
22bd5b9b 1231 if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
3be0686b 1232 continue;
dccd9ecc
DM
1233 if (fi->fib_dead)
1234 continue;
37e826c5 1235 if (fa->fa_info->fib_scope < flp->flowi4_scope)
3be0686b
DM
1236 continue;
1237 fib_alias_accessed(fa);
1238 err = fib_props[fa->fa_type].error;
9f9e636d 1239 if (unlikely(err < 0)) {
19baf839 1240#ifdef CONFIG_IP_FIB_TRIE_STATS
8274a97a 1241 this_cpu_inc(t->stats->semantic_match_passed);
3be0686b 1242#endif
1fbc7843 1243 return err;
3be0686b
DM
1244 }
1245 if (fi->fib_flags & RTNH_F_DEAD)
1246 continue;
1247 for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
1248 const struct fib_nh *nh = &fi->fib_nh[nhsel];
1249
1250 if (nh->nh_flags & RTNH_F_DEAD)
1251 continue;
22bd5b9b 1252 if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
3be0686b
DM
1253 continue;
1254
1255#ifdef CONFIG_IP_FIB_TRIE_STATS
8274a97a 1256 this_cpu_inc(t->stats->semantic_match_passed);
3be0686b 1257#endif
5c74501f 1258 res->prefixlen = li->plen;
3be0686b
DM
1259 res->nh_sel = nhsel;
1260 res->type = fa->fa_type;
9f9e636d 1261 res->scope = fi->fib_scope;
3be0686b
DM
1262 res->fi = fi;
1263 res->table = tb;
1264 res->fa_head = &li->falh;
1265 if (!(fib_flags & FIB_LOOKUP_NOREF))
5c74501f 1266 atomic_inc(&fi->fib_clntref);
3be0686b
DM
1267 return 0;
1268 }
1269 }
1270
1271#ifdef CONFIG_IP_FIB_TRIE_STATS
8274a97a 1272 this_cpu_inc(t->stats->semantic_match_miss);
19baf839 1273#endif
19baf839 1274 }
a07f5f50 1275
2e655571 1276 return 1;
19baf839
RO
1277}
1278
9f9e636d
AD
1279static inline t_key prefix_mismatch(t_key key, struct tnode *n)
1280{
1281 t_key prefix = n->key;
1282
1283 return (key ^ prefix) & (prefix | -prefix);
1284}
1285
22bd5b9b 1286int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
ebc0ffae 1287 struct fib_result *res, int fib_flags)
19baf839 1288{
9f9e636d 1289 struct trie *t = (struct trie *)tb->tb_data;
8274a97a
AD
1290#ifdef CONFIG_IP_FIB_TRIE_STATS
1291 struct trie_use_stats __percpu *stats = t->stats;
1292#endif
9f9e636d
AD
1293 const t_key key = ntohl(flp->daddr);
1294 struct tnode *n, *pn;
1295 t_key cindex;
1296 int ret = 1;
91b9a277 1297
2373ce1c 1298 rcu_read_lock();
91b9a277 1299
2373ce1c 1300 n = rcu_dereference(t->trie);
c877efb2 1301 if (!n)
19baf839
RO
1302 goto failed;
1303
1304#ifdef CONFIG_IP_FIB_TRIE_STATS
8274a97a 1305 this_cpu_inc(stats->gets);
19baf839
RO
1306#endif
1307
adaf9816 1308 pn = n;
9f9e636d
AD
1309 cindex = 0;
1310
1311 /* Step 1: Travel to the longest prefix match in the trie */
1312 for (;;) {
1313 unsigned long index = get_index(key, n);
1314
1315 /* This bit of code is a bit tricky but it combines multiple
1316 * checks into a single check. The prefix consists of the
1317 * prefix plus zeros for the "bits" in the prefix. The index
1318 * is the difference between the key and this value. From
1319 * this we can actually derive several pieces of data.
1320 * if !(index >> bits)
1321 * we know the value is child index
1322 * else
1323 * we have a mismatch in skip bits and failed
1324 */
1325 if (index >> n->bits)
1326 break;
19baf839 1327
9f9e636d
AD
1328 /* we have found a leaf. Prefixes have already been compared */
1329 if (IS_LEAF(n))
a07f5f50 1330 goto found;
19baf839 1331
9f9e636d
AD
1332 /* only record pn and cindex if we are going to be chopping
1333 * bits later. Otherwise we are just wasting cycles.
91b9a277 1334 */
9f9e636d
AD
1335 if (index) {
1336 pn = n;
1337 cindex = index;
91b9a277 1338 }
19baf839 1339
9f9e636d
AD
1340 n = rcu_dereference(n->child[index]);
1341 if (unlikely(!n))
1342 goto backtrace;
1343 }
19baf839 1344
9f9e636d
AD
1345 /* Step 2: Sort out leaves and begin backtracing for longest prefix */
1346 for (;;) {
1347 /* record the pointer where our next node pointer is stored */
1348 struct tnode __rcu **cptr = n->child;
19baf839 1349
9f9e636d
AD
1350 /* This test verifies that none of the bits that differ
1351 * between the key and the prefix exist in the region of
1352 * the lsb and higher in the prefix.
91b9a277 1353 */
9f9e636d
AD
1354 if (unlikely(prefix_mismatch(key, n)))
1355 goto backtrace;
91b9a277 1356
9f9e636d
AD
1357 /* exit out and process leaf */
1358 if (unlikely(IS_LEAF(n)))
1359 break;
91b9a277 1360
9f9e636d
AD
1361 /* Don't bother recording parent info. Since we are in
1362 * prefix match mode we will have to come back to wherever
1363 * we started this traversal anyway
91b9a277 1364 */
91b9a277 1365
9f9e636d 1366 while ((n = rcu_dereference(*cptr)) == NULL) {
19baf839 1367backtrace:
19baf839 1368#ifdef CONFIG_IP_FIB_TRIE_STATS
9f9e636d
AD
1369 if (!n)
1370 this_cpu_inc(stats->null_node_hit);
19baf839 1371#endif
9f9e636d
AD
1372 /* If we are at cindex 0 there are no more bits for
1373 * us to strip at this level so we must ascend back
1374 * up one level to see if there are any more bits to
1375 * be stripped there.
1376 */
1377 while (!cindex) {
1378 t_key pkey = pn->key;
1379
1380 pn = node_parent_rcu(pn);
1381 if (unlikely(!pn))
1382 goto failed;
1383#ifdef CONFIG_IP_FIB_TRIE_STATS
1384 this_cpu_inc(stats->backtrack);
1385#endif
1386 /* Get Child's index */
1387 cindex = get_index(pkey, pn);
1388 }
1389
1390 /* strip the least significant bit from the cindex */
1391 cindex &= cindex - 1;
1392
1393 /* grab pointer for next child node */
1394 cptr = &pn->child[cindex];
c877efb2 1395 }
19baf839 1396 }
9f9e636d 1397
19baf839 1398found:
9f9e636d
AD
1399 /* Step 3: Process the leaf, if that fails fall back to backtracing */
1400 ret = check_leaf(tb, t, n, key, flp, res, fib_flags);
1401 if (unlikely(ret > 0))
1402 goto backtrace;
1403failed:
2373ce1c 1404 rcu_read_unlock();
19baf839
RO
1405 return ret;
1406}
6fc01438 1407EXPORT_SYMBOL_GPL(fib_table_lookup);
19baf839 1408
9195bef7
SH
1409/*
1410 * Remove the leaf and return parent.
1411 */
adaf9816 1412static void trie_leaf_remove(struct trie *t, struct tnode *l)
19baf839 1413{
64c9b6fb 1414 struct tnode *tp = node_parent(l);
c877efb2 1415
9195bef7 1416 pr_debug("entering trie_leaf_remove(%p)\n", l);
19baf839 1417
c877efb2 1418 if (tp) {
836a0123 1419 put_child(tp, get_index(l->key, tp), NULL);
7b85576d 1420 trie_rebalance(t, tp);
836a0123 1421 } else {
a9b3cd7f 1422 RCU_INIT_POINTER(t->trie, NULL);
836a0123 1423 }
19baf839 1424
37fd30f2 1425 node_free(l);
19baf839
RO
1426}
1427
d562f1f8
RO
1428/*
1429 * Caller must hold RTNL.
1430 */
16c6cf8b 1431int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
19baf839
RO
1432{
1433 struct trie *t = (struct trie *) tb->tb_data;
1434 u32 key, mask;
4e902c57
TG
1435 int plen = cfg->fc_dst_len;
1436 u8 tos = cfg->fc_tos;
19baf839
RO
1437 struct fib_alias *fa, *fa_to_delete;
1438 struct list_head *fa_head;
adaf9816 1439 struct tnode *l;
91b9a277
OJ
1440 struct leaf_info *li;
1441
c877efb2 1442 if (plen > 32)
19baf839
RO
1443 return -EINVAL;
1444
4e902c57 1445 key = ntohl(cfg->fc_dst);
91b9a277 1446 mask = ntohl(inet_make_mask(plen));
19baf839 1447
c877efb2 1448 if (key & ~mask)
19baf839
RO
1449 return -EINVAL;
1450
1451 key = key & mask;
1452 l = fib_find_node(t, key);
1453
c877efb2 1454 if (!l)
19baf839
RO
1455 return -ESRCH;
1456
ad5b3102
IM
1457 li = find_leaf_info(l, plen);
1458
1459 if (!li)
1460 return -ESRCH;
1461
1462 fa_head = &li->falh;
19baf839
RO
1463 fa = fib_find_alias(fa_head, tos, 0);
1464
1465 if (!fa)
1466 return -ESRCH;
1467
0c7770c7 1468 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
19baf839
RO
1469
1470 fa_to_delete = NULL;
936f6f8e
JA
1471 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1472 list_for_each_entry_continue(fa, fa_head, fa_list) {
19baf839
RO
1473 struct fib_info *fi = fa->fa_info;
1474
1475 if (fa->fa_tos != tos)
1476 break;
1477
4e902c57
TG
1478 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1479 (cfg->fc_scope == RT_SCOPE_NOWHERE ||
37e826c5 1480 fa->fa_info->fib_scope == cfg->fc_scope) &&
74cb3c10
JA
1481 (!cfg->fc_prefsrc ||
1482 fi->fib_prefsrc == cfg->fc_prefsrc) &&
4e902c57
TG
1483 (!cfg->fc_protocol ||
1484 fi->fib_protocol == cfg->fc_protocol) &&
1485 fib_nh_match(cfg, fi) == 0) {
19baf839
RO
1486 fa_to_delete = fa;
1487 break;
1488 }
1489 }
1490
91b9a277
OJ
1491 if (!fa_to_delete)
1492 return -ESRCH;
19baf839 1493
91b9a277 1494 fa = fa_to_delete;
4e902c57 1495 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
b8f55831 1496 &cfg->fc_nlinfo, 0);
91b9a277 1497
2373ce1c 1498 list_del_rcu(&fa->fa_list);
19baf839 1499
21d8c49e
DM
1500 if (!plen)
1501 tb->tb_num_default--;
1502
91b9a277 1503 if (list_empty(fa_head)) {
2373ce1c 1504 hlist_del_rcu(&li->hlist);
91b9a277 1505 free_leaf_info(li);
2373ce1c 1506 }
19baf839 1507
91b9a277 1508 if (hlist_empty(&l->list))
9195bef7 1509 trie_leaf_remove(t, l);
19baf839 1510
91b9a277 1511 if (fa->fa_state & FA_S_ACCESSED)
4ccfe6d4 1512 rt_cache_flush(cfg->fc_nlinfo.nl_net);
19baf839 1513
2373ce1c
RO
1514 fib_release_info(fa->fa_info);
1515 alias_free_mem_rcu(fa);
91b9a277 1516 return 0;
19baf839
RO
1517}
1518
ef3660ce 1519static int trie_flush_list(struct list_head *head)
19baf839
RO
1520{
1521 struct fib_alias *fa, *fa_node;
1522 int found = 0;
1523
1524 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1525 struct fib_info *fi = fa->fa_info;
19baf839 1526
2373ce1c
RO
1527 if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1528 list_del_rcu(&fa->fa_list);
1529 fib_release_info(fa->fa_info);
1530 alias_free_mem_rcu(fa);
19baf839
RO
1531 found++;
1532 }
1533 }
1534 return found;
1535}
1536
adaf9816 1537static int trie_flush_leaf(struct tnode *l)
19baf839
RO
1538{
1539 int found = 0;
1540 struct hlist_head *lih = &l->list;
b67bfe0d 1541 struct hlist_node *tmp;
19baf839
RO
1542 struct leaf_info *li = NULL;
1543
b67bfe0d 1544 hlist_for_each_entry_safe(li, tmp, lih, hlist) {
ef3660ce 1545 found += trie_flush_list(&li->falh);
19baf839
RO
1546
1547 if (list_empty(&li->falh)) {
2373ce1c 1548 hlist_del_rcu(&li->hlist);
19baf839
RO
1549 free_leaf_info(li);
1550 }
1551 }
1552 return found;
1553}
1554
82cfbb00
SH
1555/*
1556 * Scan for the next right leaf starting at node p->child[idx]
1557 * Since we have back pointer, no recursion necessary.
1558 */
adaf9816 1559static struct tnode *leaf_walk_rcu(struct tnode *p, struct tnode *c)
19baf839 1560{
82cfbb00
SH
1561 do {
1562 t_key idx;
c877efb2 1563
c877efb2 1564 if (c)
82cfbb00 1565 idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
c877efb2 1566 else
82cfbb00 1567 idx = 0;
2373ce1c 1568
82cfbb00
SH
1569 while (idx < 1u << p->bits) {
1570 c = tnode_get_child_rcu(p, idx++);
2373ce1c 1571 if (!c)
91b9a277
OJ
1572 continue;
1573
aab515d7 1574 if (IS_LEAF(c))
adaf9816 1575 return c;
82cfbb00
SH
1576
1577 /* Rescan start scanning in new node */
adaf9816 1578 p = c;
82cfbb00 1579 idx = 0;
19baf839 1580 }
82cfbb00
SH
1581
1582 /* Node empty, walk back up to parent */
adaf9816 1583 c = p;
a034ee3c 1584 } while ((p = node_parent_rcu(c)) != NULL);
82cfbb00
SH
1585
1586 return NULL; /* Root of trie */
1587}
1588
adaf9816 1589static struct tnode *trie_firstleaf(struct trie *t)
82cfbb00 1590{
adaf9816 1591 struct tnode *n = rcu_dereference_rtnl(t->trie);
82cfbb00
SH
1592
1593 if (!n)
1594 return NULL;
1595
1596 if (IS_LEAF(n)) /* trie is just a leaf */
adaf9816 1597 return n;
82cfbb00
SH
1598
1599 return leaf_walk_rcu(n, NULL);
1600}
1601
adaf9816 1602static struct tnode *trie_nextleaf(struct tnode *l)
82cfbb00 1603{
adaf9816 1604 struct tnode *p = node_parent_rcu(l);
82cfbb00
SH
1605
1606 if (!p)
1607 return NULL; /* trie with just one leaf */
1608
adaf9816 1609 return leaf_walk_rcu(p, l);
19baf839
RO
1610}
1611
adaf9816 1612static struct tnode *trie_leafindex(struct trie *t, int index)
71d67e66 1613{
adaf9816 1614 struct tnode *l = trie_firstleaf(t);
71d67e66 1615
ec28cf73 1616 while (l && index-- > 0)
71d67e66 1617 l = trie_nextleaf(l);
ec28cf73 1618
71d67e66
SH
1619 return l;
1620}
1621
1622
d562f1f8
RO
1623/*
1624 * Caller must hold RTNL.
1625 */
16c6cf8b 1626int fib_table_flush(struct fib_table *tb)
19baf839
RO
1627{
1628 struct trie *t = (struct trie *) tb->tb_data;
adaf9816 1629 struct tnode *l, *ll = NULL;
82cfbb00 1630 int found = 0;
19baf839 1631
82cfbb00 1632 for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
ef3660ce 1633 found += trie_flush_leaf(l);
19baf839
RO
1634
1635 if (ll && hlist_empty(&ll->list))
9195bef7 1636 trie_leaf_remove(t, ll);
19baf839
RO
1637 ll = l;
1638 }
1639
1640 if (ll && hlist_empty(&ll->list))
9195bef7 1641 trie_leaf_remove(t, ll);
19baf839 1642
0c7770c7 1643 pr_debug("trie_flush found=%d\n", found);
19baf839
RO
1644 return found;
1645}
1646
4aa2c466
PE
1647void fib_free_table(struct fib_table *tb)
1648{
8274a97a
AD
1649#ifdef CONFIG_IP_FIB_TRIE_STATS
1650 struct trie *t = (struct trie *)tb->tb_data;
1651
1652 free_percpu(t->stats);
1653#endif /* CONFIG_IP_FIB_TRIE_STATS */
4aa2c466
PE
1654 kfree(tb);
1655}
1656
a07f5f50
SH
1657static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1658 struct fib_table *tb,
19baf839
RO
1659 struct sk_buff *skb, struct netlink_callback *cb)
1660{
1661 int i, s_i;
1662 struct fib_alias *fa;
32ab5f80 1663 __be32 xkey = htonl(key);
19baf839 1664
71d67e66 1665 s_i = cb->args[5];
19baf839
RO
1666 i = 0;
1667
2373ce1c
RO
1668 /* rcu_read_lock is hold by caller */
1669
1670 list_for_each_entry_rcu(fa, fah, fa_list) {
19baf839
RO
1671 if (i < s_i) {
1672 i++;
1673 continue;
1674 }
19baf839 1675
15e47304 1676 if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
19baf839
RO
1677 cb->nlh->nlmsg_seq,
1678 RTM_NEWROUTE,
1679 tb->tb_id,
1680 fa->fa_type,
be403ea1 1681 xkey,
19baf839
RO
1682 plen,
1683 fa->fa_tos,
64347f78 1684 fa->fa_info, NLM_F_MULTI) < 0) {
71d67e66 1685 cb->args[5] = i;
19baf839 1686 return -1;
91b9a277 1687 }
19baf839
RO
1688 i++;
1689 }
71d67e66 1690 cb->args[5] = i;
19baf839
RO
1691 return skb->len;
1692}
1693
adaf9816 1694static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb,
a88ee229 1695 struct sk_buff *skb, struct netlink_callback *cb)
19baf839 1696{
a88ee229 1697 struct leaf_info *li;
a88ee229 1698 int i, s_i;
19baf839 1699
71d67e66 1700 s_i = cb->args[4];
a88ee229 1701 i = 0;
19baf839 1702
a88ee229 1703 /* rcu_read_lock is hold by caller */
b67bfe0d 1704 hlist_for_each_entry_rcu(li, &l->list, hlist) {
a88ee229
SH
1705 if (i < s_i) {
1706 i++;
19baf839 1707 continue;
a88ee229 1708 }
91b9a277 1709
a88ee229 1710 if (i > s_i)
71d67e66 1711 cb->args[5] = 0;
19baf839 1712
a88ee229 1713 if (list_empty(&li->falh))
19baf839
RO
1714 continue;
1715
a88ee229 1716 if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
71d67e66 1717 cb->args[4] = i;
19baf839
RO
1718 return -1;
1719 }
a88ee229 1720 i++;
19baf839 1721 }
a88ee229 1722
71d67e66 1723 cb->args[4] = i;
19baf839
RO
1724 return skb->len;
1725}
1726
16c6cf8b
SH
1727int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1728 struct netlink_callback *cb)
19baf839 1729{
adaf9816 1730 struct tnode *l;
19baf839 1731 struct trie *t = (struct trie *) tb->tb_data;
d5ce8a0e 1732 t_key key = cb->args[2];
71d67e66 1733 int count = cb->args[3];
19baf839 1734
2373ce1c 1735 rcu_read_lock();
d5ce8a0e
SH
1736 /* Dump starting at last key.
1737 * Note: 0.0.0.0/0 (ie default) is first key.
1738 */
71d67e66 1739 if (count == 0)
d5ce8a0e
SH
1740 l = trie_firstleaf(t);
1741 else {
71d67e66
SH
1742 /* Normally, continue from last key, but if that is missing
1743 * fallback to using slow rescan
1744 */
d5ce8a0e 1745 l = fib_find_node(t, key);
71d67e66
SH
1746 if (!l)
1747 l = trie_leafindex(t, count);
d5ce8a0e 1748 }
a88ee229 1749
d5ce8a0e
SH
1750 while (l) {
1751 cb->args[2] = l->key;
a88ee229 1752 if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
71d67e66 1753 cb->args[3] = count;
a88ee229 1754 rcu_read_unlock();
a88ee229 1755 return -1;
19baf839 1756 }
d5ce8a0e 1757
71d67e66 1758 ++count;
d5ce8a0e 1759 l = trie_nextleaf(l);
71d67e66
SH
1760 memset(&cb->args[4], 0,
1761 sizeof(cb->args) - 4*sizeof(cb->args[0]));
19baf839 1762 }
71d67e66 1763 cb->args[3] = count;
2373ce1c 1764 rcu_read_unlock();
a88ee229 1765
19baf839 1766 return skb->len;
19baf839
RO
1767}
1768
5348ba85 1769void __init fib_trie_init(void)
7f9b8052 1770{
a07f5f50
SH
1771 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1772 sizeof(struct fib_alias),
bc3c8c1e
SH
1773 0, SLAB_PANIC, NULL);
1774
1775 trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
adaf9816 1776 max(sizeof(struct tnode),
bc3c8c1e
SH
1777 sizeof(struct leaf_info)),
1778 0, SLAB_PANIC, NULL);
7f9b8052 1779}
19baf839 1780
7f9b8052 1781
5348ba85 1782struct fib_table *fib_trie_table(u32 id)
19baf839
RO
1783{
1784 struct fib_table *tb;
1785 struct trie *t;
1786
19baf839
RO
1787 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1788 GFP_KERNEL);
1789 if (tb == NULL)
1790 return NULL;
1791
1792 tb->tb_id = id;
971b893e 1793 tb->tb_default = -1;
21d8c49e 1794 tb->tb_num_default = 0;
19baf839
RO
1795
1796 t = (struct trie *) tb->tb_data;
8274a97a
AD
1797 RCU_INIT_POINTER(t->trie, NULL);
1798#ifdef CONFIG_IP_FIB_TRIE_STATS
1799 t->stats = alloc_percpu(struct trie_use_stats);
1800 if (!t->stats) {
1801 kfree(tb);
1802 tb = NULL;
1803 }
1804#endif
19baf839 1805
19baf839
RO
1806 return tb;
1807}
1808
cb7b593c
SH
1809#ifdef CONFIG_PROC_FS
1810/* Depth first Trie walk iterator */
1811struct fib_trie_iter {
1c340b2f 1812 struct seq_net_private p;
3d3b2d25 1813 struct fib_table *tb;
cb7b593c 1814 struct tnode *tnode;
a034ee3c
ED
1815 unsigned int index;
1816 unsigned int depth;
cb7b593c 1817};
19baf839 1818
adaf9816 1819static struct tnode *fib_trie_get_next(struct fib_trie_iter *iter)
19baf839 1820{
cb7b593c 1821 struct tnode *tn = iter->tnode;
a034ee3c 1822 unsigned int cindex = iter->index;
cb7b593c 1823 struct tnode *p;
19baf839 1824
6640e697
EB
1825 /* A single entry routing table */
1826 if (!tn)
1827 return NULL;
1828
cb7b593c
SH
1829 pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
1830 iter->tnode, iter->index, iter->depth);
1831rescan:
1832 while (cindex < (1<<tn->bits)) {
adaf9816 1833 struct tnode *n = tnode_get_child_rcu(tn, cindex);
19baf839 1834
cb7b593c
SH
1835 if (n) {
1836 if (IS_LEAF(n)) {
1837 iter->tnode = tn;
1838 iter->index = cindex + 1;
1839 } else {
1840 /* push down one level */
adaf9816 1841 iter->tnode = n;
cb7b593c
SH
1842 iter->index = 0;
1843 ++iter->depth;
1844 }
1845 return n;
1846 }
19baf839 1847
cb7b593c
SH
1848 ++cindex;
1849 }
91b9a277 1850
cb7b593c 1851 /* Current node exhausted, pop back up */
adaf9816 1852 p = node_parent_rcu(tn);
cb7b593c
SH
1853 if (p) {
1854 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
1855 tn = p;
1856 --iter->depth;
1857 goto rescan;
19baf839 1858 }
cb7b593c
SH
1859
1860 /* got root? */
1861 return NULL;
19baf839
RO
1862}
1863
adaf9816 1864static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter,
cb7b593c 1865 struct trie *t)
19baf839 1866{
adaf9816 1867 struct tnode *n;
5ddf0eb2 1868
132adf54 1869 if (!t)
5ddf0eb2
RO
1870 return NULL;
1871
1872 n = rcu_dereference(t->trie);
3d3b2d25 1873 if (!n)
5ddf0eb2 1874 return NULL;
19baf839 1875
3d3b2d25 1876 if (IS_TNODE(n)) {
adaf9816 1877 iter->tnode = n;
3d3b2d25
SH
1878 iter->index = 0;
1879 iter->depth = 1;
1880 } else {
1881 iter->tnode = NULL;
1882 iter->index = 0;
1883 iter->depth = 0;
91b9a277 1884 }
3d3b2d25
SH
1885
1886 return n;
cb7b593c 1887}
91b9a277 1888
cb7b593c
SH
1889static void trie_collect_stats(struct trie *t, struct trie_stat *s)
1890{
adaf9816 1891 struct tnode *n;
cb7b593c 1892 struct fib_trie_iter iter;
91b9a277 1893
cb7b593c 1894 memset(s, 0, sizeof(*s));
91b9a277 1895
cb7b593c 1896 rcu_read_lock();
3d3b2d25 1897 for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
cb7b593c 1898 if (IS_LEAF(n)) {
93672292 1899 struct leaf_info *li;
93672292 1900
cb7b593c
SH
1901 s->leaves++;
1902 s->totdepth += iter.depth;
1903 if (iter.depth > s->maxdepth)
1904 s->maxdepth = iter.depth;
93672292 1905
adaf9816 1906 hlist_for_each_entry_rcu(li, &n->list, hlist)
93672292 1907 ++s->prefixes;
cb7b593c 1908 } else {
cb7b593c
SH
1909 int i;
1910
1911 s->tnodes++;
adaf9816
AD
1912 if (n->bits < MAX_STAT_DEPTH)
1913 s->nodesizes[n->bits]++;
06ef921d 1914
adaf9816
AD
1915 for (i = 0; i < tnode_child_length(n); i++)
1916 if (!rcu_access_pointer(n->child[i]))
cb7b593c 1917 s->nullpointers++;
19baf839 1918 }
19baf839 1919 }
2373ce1c 1920 rcu_read_unlock();
19baf839
RO
1921}
1922
cb7b593c
SH
1923/*
1924 * This outputs /proc/net/fib_triestats
1925 */
1926static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
19baf839 1927{
a034ee3c 1928 unsigned int i, max, pointers, bytes, avdepth;
c877efb2 1929
cb7b593c
SH
1930 if (stat->leaves)
1931 avdepth = stat->totdepth*100 / stat->leaves;
1932 else
1933 avdepth = 0;
91b9a277 1934
a07f5f50
SH
1935 seq_printf(seq, "\tAver depth: %u.%02d\n",
1936 avdepth / 100, avdepth % 100);
cb7b593c 1937 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
91b9a277 1938
cb7b593c 1939 seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
adaf9816 1940 bytes = sizeof(struct tnode) * stat->leaves;
93672292
SH
1941
1942 seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
1943 bytes += sizeof(struct leaf_info) * stat->prefixes;
1944
187b5188 1945 seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
cb7b593c 1946 bytes += sizeof(struct tnode) * stat->tnodes;
19baf839 1947
06ef921d
RO
1948 max = MAX_STAT_DEPTH;
1949 while (max > 0 && stat->nodesizes[max-1] == 0)
cb7b593c 1950 max--;
19baf839 1951
cb7b593c 1952 pointers = 0;
f585a991 1953 for (i = 1; i < max; i++)
cb7b593c 1954 if (stat->nodesizes[i] != 0) {
187b5188 1955 seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
cb7b593c
SH
1956 pointers += (1<<i) * stat->nodesizes[i];
1957 }
1958 seq_putc(seq, '\n');
187b5188 1959 seq_printf(seq, "\tPointers: %u\n", pointers);
2373ce1c 1960
adaf9816 1961 bytes += sizeof(struct tnode *) * pointers;
187b5188
SH
1962 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
1963 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
66a2f7fd 1964}
2373ce1c 1965
cb7b593c 1966#ifdef CONFIG_IP_FIB_TRIE_STATS
66a2f7fd 1967static void trie_show_usage(struct seq_file *seq,
8274a97a 1968 const struct trie_use_stats __percpu *stats)
66a2f7fd 1969{
8274a97a
AD
1970 struct trie_use_stats s = { 0 };
1971 int cpu;
1972
1973 /* loop through all of the CPUs and gather up the stats */
1974 for_each_possible_cpu(cpu) {
1975 const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu);
1976
1977 s.gets += pcpu->gets;
1978 s.backtrack += pcpu->backtrack;
1979 s.semantic_match_passed += pcpu->semantic_match_passed;
1980 s.semantic_match_miss += pcpu->semantic_match_miss;
1981 s.null_node_hit += pcpu->null_node_hit;
1982 s.resize_node_skipped += pcpu->resize_node_skipped;
1983 }
1984
66a2f7fd 1985 seq_printf(seq, "\nCounters:\n---------\n");
8274a97a
AD
1986 seq_printf(seq, "gets = %u\n", s.gets);
1987 seq_printf(seq, "backtracks = %u\n", s.backtrack);
a07f5f50 1988 seq_printf(seq, "semantic match passed = %u\n",
8274a97a
AD
1989 s.semantic_match_passed);
1990 seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss);
1991 seq_printf(seq, "null node hit= %u\n", s.null_node_hit);
1992 seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped);
cb7b593c 1993}
66a2f7fd
SH
1994#endif /* CONFIG_IP_FIB_TRIE_STATS */
1995
3d3b2d25 1996static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
d717a9a6 1997{
3d3b2d25
SH
1998 if (tb->tb_id == RT_TABLE_LOCAL)
1999 seq_puts(seq, "Local:\n");
2000 else if (tb->tb_id == RT_TABLE_MAIN)
2001 seq_puts(seq, "Main:\n");
2002 else
2003 seq_printf(seq, "Id %d:\n", tb->tb_id);
d717a9a6 2004}
19baf839 2005
3d3b2d25 2006
cb7b593c
SH
2007static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2008{
1c340b2f 2009 struct net *net = (struct net *)seq->private;
3d3b2d25 2010 unsigned int h;
877a9bff 2011
d717a9a6 2012 seq_printf(seq,
a07f5f50
SH
2013 "Basic info: size of leaf:"
2014 " %Zd bytes, size of tnode: %Zd bytes.\n",
adaf9816 2015 sizeof(struct tnode), sizeof(struct tnode));
d717a9a6 2016
3d3b2d25
SH
2017 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2018 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
3d3b2d25
SH
2019 struct fib_table *tb;
2020
b67bfe0d 2021 hlist_for_each_entry_rcu(tb, head, tb_hlist) {
3d3b2d25
SH
2022 struct trie *t = (struct trie *) tb->tb_data;
2023 struct trie_stat stat;
877a9bff 2024
3d3b2d25
SH
2025 if (!t)
2026 continue;
2027
2028 fib_table_print(seq, tb);
2029
2030 trie_collect_stats(t, &stat);
2031 trie_show_stats(seq, &stat);
2032#ifdef CONFIG_IP_FIB_TRIE_STATS
8274a97a 2033 trie_show_usage(seq, t->stats);
3d3b2d25
SH
2034#endif
2035 }
2036 }
19baf839 2037
cb7b593c 2038 return 0;
19baf839
RO
2039}
2040
cb7b593c 2041static int fib_triestat_seq_open(struct inode *inode, struct file *file)
19baf839 2042{
de05c557 2043 return single_open_net(inode, file, fib_triestat_seq_show);
1c340b2f
DL
2044}
2045
9a32144e 2046static const struct file_operations fib_triestat_fops = {
cb7b593c
SH
2047 .owner = THIS_MODULE,
2048 .open = fib_triestat_seq_open,
2049 .read = seq_read,
2050 .llseek = seq_lseek,
b6fcbdb4 2051 .release = single_release_net,
cb7b593c
SH
2052};
2053
adaf9816 2054static struct tnode *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
19baf839 2055{
1218854a
YH
2056 struct fib_trie_iter *iter = seq->private;
2057 struct net *net = seq_file_net(seq);
cb7b593c 2058 loff_t idx = 0;
3d3b2d25 2059 unsigned int h;
cb7b593c 2060
3d3b2d25
SH
2061 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2062 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
3d3b2d25 2063 struct fib_table *tb;
cb7b593c 2064
b67bfe0d 2065 hlist_for_each_entry_rcu(tb, head, tb_hlist) {
adaf9816 2066 struct tnode *n;
3d3b2d25
SH
2067
2068 for (n = fib_trie_get_first(iter,
2069 (struct trie *) tb->tb_data);
2070 n; n = fib_trie_get_next(iter))
2071 if (pos == idx++) {
2072 iter->tb = tb;
2073 return n;
2074 }
2075 }
cb7b593c 2076 }
3d3b2d25 2077
19baf839
RO
2078 return NULL;
2079}
2080
cb7b593c 2081static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
c95aaf9a 2082 __acquires(RCU)
19baf839 2083{
cb7b593c 2084 rcu_read_lock();
1218854a 2085 return fib_trie_get_idx(seq, *pos);
19baf839
RO
2086}
2087
cb7b593c 2088static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
19baf839 2089{
cb7b593c 2090 struct fib_trie_iter *iter = seq->private;
1218854a 2091 struct net *net = seq_file_net(seq);
3d3b2d25
SH
2092 struct fib_table *tb = iter->tb;
2093 struct hlist_node *tb_node;
2094 unsigned int h;
adaf9816 2095 struct tnode *n;
cb7b593c 2096
19baf839 2097 ++*pos;
3d3b2d25
SH
2098 /* next node in same table */
2099 n = fib_trie_get_next(iter);
2100 if (n)
2101 return n;
19baf839 2102
3d3b2d25
SH
2103 /* walk rest of this hash chain */
2104 h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
0a5c0475 2105 while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
3d3b2d25
SH
2106 tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2107 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2108 if (n)
2109 goto found;
2110 }
19baf839 2111
3d3b2d25
SH
2112 /* new hash chain */
2113 while (++h < FIB_TABLE_HASHSZ) {
2114 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
b67bfe0d 2115 hlist_for_each_entry_rcu(tb, head, tb_hlist) {
3d3b2d25
SH
2116 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2117 if (n)
2118 goto found;
2119 }
2120 }
cb7b593c 2121 return NULL;
3d3b2d25
SH
2122
2123found:
2124 iter->tb = tb;
2125 return n;
cb7b593c 2126}
19baf839 2127
cb7b593c 2128static void fib_trie_seq_stop(struct seq_file *seq, void *v)
c95aaf9a 2129 __releases(RCU)
19baf839 2130{
cb7b593c
SH
2131 rcu_read_unlock();
2132}
91b9a277 2133
cb7b593c
SH
2134static void seq_indent(struct seq_file *seq, int n)
2135{
a034ee3c
ED
2136 while (n-- > 0)
2137 seq_puts(seq, " ");
cb7b593c 2138}
19baf839 2139
28d36e37 2140static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
cb7b593c 2141{
132adf54 2142 switch (s) {
cb7b593c
SH
2143 case RT_SCOPE_UNIVERSE: return "universe";
2144 case RT_SCOPE_SITE: return "site";
2145 case RT_SCOPE_LINK: return "link";
2146 case RT_SCOPE_HOST: return "host";
2147 case RT_SCOPE_NOWHERE: return "nowhere";
2148 default:
28d36e37 2149 snprintf(buf, len, "scope=%d", s);
cb7b593c
SH
2150 return buf;
2151 }
2152}
19baf839 2153
36cbd3dc 2154static const char *const rtn_type_names[__RTN_MAX] = {
cb7b593c
SH
2155 [RTN_UNSPEC] = "UNSPEC",
2156 [RTN_UNICAST] = "UNICAST",
2157 [RTN_LOCAL] = "LOCAL",
2158 [RTN_BROADCAST] = "BROADCAST",
2159 [RTN_ANYCAST] = "ANYCAST",
2160 [RTN_MULTICAST] = "MULTICAST",
2161 [RTN_BLACKHOLE] = "BLACKHOLE",
2162 [RTN_UNREACHABLE] = "UNREACHABLE",
2163 [RTN_PROHIBIT] = "PROHIBIT",
2164 [RTN_THROW] = "THROW",
2165 [RTN_NAT] = "NAT",
2166 [RTN_XRESOLVE] = "XRESOLVE",
2167};
19baf839 2168
a034ee3c 2169static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
cb7b593c 2170{
cb7b593c
SH
2171 if (t < __RTN_MAX && rtn_type_names[t])
2172 return rtn_type_names[t];
28d36e37 2173 snprintf(buf, len, "type %u", t);
cb7b593c 2174 return buf;
19baf839
RO
2175}
2176
cb7b593c
SH
2177/* Pretty print the trie */
2178static int fib_trie_seq_show(struct seq_file *seq, void *v)
19baf839 2179{
cb7b593c 2180 const struct fib_trie_iter *iter = seq->private;
adaf9816 2181 struct tnode *n = v;
c877efb2 2182
3d3b2d25
SH
2183 if (!node_parent_rcu(n))
2184 fib_table_print(seq, iter->tb);
095b8501 2185
cb7b593c 2186 if (IS_TNODE(n)) {
adaf9816 2187 __be32 prf = htonl(n->key);
91b9a277 2188
adaf9816 2189 seq_indent(seq, iter->depth - 1);
673d57e7 2190 seq_printf(seq, " +-- %pI4/%d %d %d %d\n",
adaf9816
AD
2191 &prf, n->pos, n->bits, n->full_children,
2192 n->empty_children);
cb7b593c 2193 } else {
1328042e 2194 struct leaf_info *li;
adaf9816 2195 __be32 val = htonl(n->key);
cb7b593c
SH
2196
2197 seq_indent(seq, iter->depth);
673d57e7 2198 seq_printf(seq, " |-- %pI4\n", &val);
1328042e 2199
adaf9816 2200 hlist_for_each_entry_rcu(li, &n->list, hlist) {
1328042e
SH
2201 struct fib_alias *fa;
2202
2203 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2204 char buf1[32], buf2[32];
2205
2206 seq_indent(seq, iter->depth+1);
2207 seq_printf(seq, " /%d %s %s", li->plen,
2208 rtn_scope(buf1, sizeof(buf1),
37e826c5 2209 fa->fa_info->fib_scope),
1328042e
SH
2210 rtn_type(buf2, sizeof(buf2),
2211 fa->fa_type));
2212 if (fa->fa_tos)
b9c4d82a 2213 seq_printf(seq, " tos=%d", fa->fa_tos);
1328042e 2214 seq_putc(seq, '\n');
cb7b593c
SH
2215 }
2216 }
19baf839 2217 }
cb7b593c 2218
19baf839
RO
2219 return 0;
2220}
2221
f690808e 2222static const struct seq_operations fib_trie_seq_ops = {
cb7b593c
SH
2223 .start = fib_trie_seq_start,
2224 .next = fib_trie_seq_next,
2225 .stop = fib_trie_seq_stop,
2226 .show = fib_trie_seq_show,
19baf839
RO
2227};
2228
cb7b593c 2229static int fib_trie_seq_open(struct inode *inode, struct file *file)
19baf839 2230{
1c340b2f
DL
2231 return seq_open_net(inode, file, &fib_trie_seq_ops,
2232 sizeof(struct fib_trie_iter));
19baf839
RO
2233}
2234
9a32144e 2235static const struct file_operations fib_trie_fops = {
cb7b593c
SH
2236 .owner = THIS_MODULE,
2237 .open = fib_trie_seq_open,
2238 .read = seq_read,
2239 .llseek = seq_lseek,
1c340b2f 2240 .release = seq_release_net,
19baf839
RO
2241};
2242
8315f5d8
SH
2243struct fib_route_iter {
2244 struct seq_net_private p;
2245 struct trie *main_trie;
2246 loff_t pos;
2247 t_key key;
2248};
2249
adaf9816 2250static struct tnode *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
8315f5d8 2251{
adaf9816 2252 struct tnode *l = NULL;
8315f5d8
SH
2253 struct trie *t = iter->main_trie;
2254
2255 /* use cache location of last found key */
2256 if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2257 pos -= iter->pos;
2258 else {
2259 iter->pos = 0;
2260 l = trie_firstleaf(t);
2261 }
2262
2263 while (l && pos-- > 0) {
2264 iter->pos++;
2265 l = trie_nextleaf(l);
2266 }
2267
2268 if (l)
2269 iter->key = pos; /* remember it */
2270 else
2271 iter->pos = 0; /* forget it */
2272
2273 return l;
2274}
2275
2276static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2277 __acquires(RCU)
2278{
2279 struct fib_route_iter *iter = seq->private;
2280 struct fib_table *tb;
2281
2282 rcu_read_lock();
1218854a 2283 tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
8315f5d8
SH
2284 if (!tb)
2285 return NULL;
2286
2287 iter->main_trie = (struct trie *) tb->tb_data;
2288 if (*pos == 0)
2289 return SEQ_START_TOKEN;
2290 else
2291 return fib_route_get_idx(iter, *pos - 1);
2292}
2293
2294static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2295{
2296 struct fib_route_iter *iter = seq->private;
adaf9816 2297 struct tnode *l = v;
8315f5d8
SH
2298
2299 ++*pos;
2300 if (v == SEQ_START_TOKEN) {
2301 iter->pos = 0;
2302 l = trie_firstleaf(iter->main_trie);
2303 } else {
2304 iter->pos++;
2305 l = trie_nextleaf(l);
2306 }
2307
2308 if (l)
2309 iter->key = l->key;
2310 else
2311 iter->pos = 0;
2312 return l;
2313}
2314
2315static void fib_route_seq_stop(struct seq_file *seq, void *v)
2316 __releases(RCU)
2317{
2318 rcu_read_unlock();
2319}
2320
a034ee3c 2321static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
19baf839 2322{
a034ee3c 2323 unsigned int flags = 0;
19baf839 2324
a034ee3c
ED
2325 if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
2326 flags = RTF_REJECT;
cb7b593c
SH
2327 if (fi && fi->fib_nh->nh_gw)
2328 flags |= RTF_GATEWAY;
32ab5f80 2329 if (mask == htonl(0xFFFFFFFF))
cb7b593c
SH
2330 flags |= RTF_HOST;
2331 flags |= RTF_UP;
2332 return flags;
19baf839
RO
2333}
2334
cb7b593c
SH
2335/*
2336 * This outputs /proc/net/route.
2337 * The format of the file is not supposed to be changed
a034ee3c 2338 * and needs to be same as fib_hash output to avoid breaking
cb7b593c
SH
2339 * legacy utilities
2340 */
2341static int fib_route_seq_show(struct seq_file *seq, void *v)
19baf839 2342{
adaf9816 2343 struct tnode *l = v;
1328042e 2344 struct leaf_info *li;
19baf839 2345
cb7b593c
SH
2346 if (v == SEQ_START_TOKEN) {
2347 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2348 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2349 "\tWindow\tIRTT");
2350 return 0;
2351 }
19baf839 2352
b67bfe0d 2353 hlist_for_each_entry_rcu(li, &l->list, hlist) {
cb7b593c 2354 struct fib_alias *fa;
32ab5f80 2355 __be32 mask, prefix;
91b9a277 2356
cb7b593c
SH
2357 mask = inet_make_mask(li->plen);
2358 prefix = htonl(l->key);
19baf839 2359
cb7b593c 2360 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1371e37d 2361 const struct fib_info *fi = fa->fa_info;
a034ee3c 2362 unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
19baf839 2363
cb7b593c
SH
2364 if (fa->fa_type == RTN_BROADCAST
2365 || fa->fa_type == RTN_MULTICAST)
2366 continue;
19baf839 2367
652586df
TH
2368 seq_setwidth(seq, 127);
2369
cb7b593c 2370 if (fi)
5e659e4c
PE
2371 seq_printf(seq,
2372 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
652586df 2373 "%d\t%08X\t%d\t%u\t%u",
cb7b593c
SH
2374 fi->fib_dev ? fi->fib_dev->name : "*",
2375 prefix,
2376 fi->fib_nh->nh_gw, flags, 0, 0,
2377 fi->fib_priority,
2378 mask,
a07f5f50
SH
2379 (fi->fib_advmss ?
2380 fi->fib_advmss + 40 : 0),
cb7b593c 2381 fi->fib_window,
652586df 2382 fi->fib_rtt >> 3);
cb7b593c 2383 else
5e659e4c
PE
2384 seq_printf(seq,
2385 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
652586df 2386 "%d\t%08X\t%d\t%u\t%u",
cb7b593c 2387 prefix, 0, flags, 0, 0, 0,
652586df 2388 mask, 0, 0, 0);
19baf839 2389
652586df 2390 seq_pad(seq, '\n');
cb7b593c 2391 }
19baf839
RO
2392 }
2393
2394 return 0;
2395}
2396
f690808e 2397static const struct seq_operations fib_route_seq_ops = {
8315f5d8
SH
2398 .start = fib_route_seq_start,
2399 .next = fib_route_seq_next,
2400 .stop = fib_route_seq_stop,
cb7b593c 2401 .show = fib_route_seq_show,
19baf839
RO
2402};
2403
cb7b593c 2404static int fib_route_seq_open(struct inode *inode, struct file *file)
19baf839 2405{
1c340b2f 2406 return seq_open_net(inode, file, &fib_route_seq_ops,
8315f5d8 2407 sizeof(struct fib_route_iter));
19baf839
RO
2408}
2409
9a32144e 2410static const struct file_operations fib_route_fops = {
cb7b593c
SH
2411 .owner = THIS_MODULE,
2412 .open = fib_route_seq_open,
2413 .read = seq_read,
2414 .llseek = seq_lseek,
1c340b2f 2415 .release = seq_release_net,
19baf839
RO
2416};
2417
61a02653 2418int __net_init fib_proc_init(struct net *net)
19baf839 2419{
d4beaa66 2420 if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
cb7b593c
SH
2421 goto out1;
2422
d4beaa66
G
2423 if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
2424 &fib_triestat_fops))
cb7b593c
SH
2425 goto out2;
2426
d4beaa66 2427 if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
cb7b593c
SH
2428 goto out3;
2429
19baf839 2430 return 0;
cb7b593c
SH
2431
2432out3:
ece31ffd 2433 remove_proc_entry("fib_triestat", net->proc_net);
cb7b593c 2434out2:
ece31ffd 2435 remove_proc_entry("fib_trie", net->proc_net);
cb7b593c
SH
2436out1:
2437 return -ENOMEM;
19baf839
RO
2438}
2439
61a02653 2440void __net_exit fib_proc_exit(struct net *net)
19baf839 2441{
ece31ffd
G
2442 remove_proc_entry("fib_trie", net->proc_net);
2443 remove_proc_entry("fib_triestat", net->proc_net);
2444 remove_proc_entry("route", net->proc_net);
19baf839
RO
2445}
2446
2447#endif /* CONFIG_PROC_FS */