]> git.proxmox.com Git - mirror_ubuntu-jammy-kernel.git/blame - net/ipv4/fib_trie.c
[IPV6]: Remove unused declarations in include/net/ip6_route.h.
[mirror_ubuntu-jammy-kernel.git] / net / ipv4 / fib_trie.c
CommitLineData
19baf839
RO
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
e905a9ed 10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
19baf839 11 * Agricultural Sciences.
e905a9ed 12 *
19baf839
RO
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally descibed in:
e905a9ed 16 *
19baf839
RO
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26 *
27 *
28 * Code from fib_hash has been reused which includes the following header:
29 *
30 *
31 * INET An implementation of the TCP/IP protocol suite for the LINUX
32 * operating system. INET is implemented using the BSD Socket
33 * interface as the means of communication with the user level.
34 *
35 * IPv4 FIB: lookup engine and maintenance routines.
36 *
37 *
38 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39 *
40 * This program is free software; you can redistribute it and/or
41 * modify it under the terms of the GNU General Public License
42 * as published by the Free Software Foundation; either version
43 * 2 of the License, or (at your option) any later version.
fd966255
RO
44 *
45 * Substantial contributions to this work comes from:
46 *
47 * David S. Miller, <davem@davemloft.net>
48 * Stephen Hemminger <shemminger@osdl.org>
49 * Paul E. McKenney <paulmck@us.ibm.com>
50 * Patrick McHardy <kaber@trash.net>
19baf839
RO
51 */
52
05eee48c 53#define VERSION "0.408"
19baf839 54
19baf839
RO
55#include <asm/uaccess.h>
56#include <asm/system.h>
1977f032 57#include <linux/bitops.h>
19baf839
RO
58#include <linux/types.h>
59#include <linux/kernel.h>
19baf839
RO
60#include <linux/mm.h>
61#include <linux/string.h>
62#include <linux/socket.h>
63#include <linux/sockios.h>
64#include <linux/errno.h>
65#include <linux/in.h>
66#include <linux/inet.h>
cd8787ab 67#include <linux/inetdevice.h>
19baf839
RO
68#include <linux/netdevice.h>
69#include <linux/if_arp.h>
70#include <linux/proc_fs.h>
2373ce1c 71#include <linux/rcupdate.h>
19baf839
RO
72#include <linux/skbuff.h>
73#include <linux/netlink.h>
74#include <linux/init.h>
75#include <linux/list.h>
457c4cbc 76#include <net/net_namespace.h>
19baf839
RO
77#include <net/ip.h>
78#include <net/protocol.h>
79#include <net/route.h>
80#include <net/tcp.h>
81#include <net/sock.h>
82#include <net/ip_fib.h>
83#include "fib_lookup.h"
84
06ef921d 85#define MAX_STAT_DEPTH 32
19baf839 86
19baf839 87#define KEYLENGTH (8*sizeof(t_key))
19baf839 88
19baf839
RO
89typedef unsigned int t_key;
90
91#define T_TNODE 0
92#define T_LEAF 1
93#define NODE_TYPE_MASK 0x1UL
2373ce1c
RO
94#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
95
91b9a277
OJ
96#define IS_TNODE(n) (!(n->parent & T_LEAF))
97#define IS_LEAF(n) (n->parent & T_LEAF)
19baf839
RO
98
99struct node {
91b9a277 100 unsigned long parent;
8d965444 101 t_key key;
19baf839
RO
102};
103
104struct leaf {
91b9a277 105 unsigned long parent;
8d965444 106 t_key key;
19baf839 107 struct hlist_head list;
2373ce1c 108 struct rcu_head rcu;
19baf839
RO
109};
110
111struct leaf_info {
112 struct hlist_node hlist;
2373ce1c 113 struct rcu_head rcu;
19baf839
RO
114 int plen;
115 struct list_head falh;
116};
117
118struct tnode {
91b9a277 119 unsigned long parent;
8d965444 120 t_key key;
112d8cfc
ED
121 unsigned char pos; /* 2log(KEYLENGTH) bits needed */
122 unsigned char bits; /* 2log(KEYLENGTH) bits needed */
8d965444
ED
123 unsigned int full_children; /* KEYLENGTH bits needed */
124 unsigned int empty_children; /* KEYLENGTH bits needed */
2373ce1c 125 struct rcu_head rcu;
91b9a277 126 struct node *child[0];
19baf839
RO
127};
128
129#ifdef CONFIG_IP_FIB_TRIE_STATS
130struct trie_use_stats {
131 unsigned int gets;
132 unsigned int backtrack;
133 unsigned int semantic_match_passed;
134 unsigned int semantic_match_miss;
135 unsigned int null_node_hit;
2f36895a 136 unsigned int resize_node_skipped;
19baf839
RO
137};
138#endif
139
140struct trie_stat {
141 unsigned int totdepth;
142 unsigned int maxdepth;
143 unsigned int tnodes;
144 unsigned int leaves;
145 unsigned int nullpointers;
93672292 146 unsigned int prefixes;
06ef921d 147 unsigned int nodesizes[MAX_STAT_DEPTH];
c877efb2 148};
19baf839
RO
149
150struct trie {
91b9a277 151 struct node *trie;
19baf839
RO
152#ifdef CONFIG_IP_FIB_TRIE_STATS
153 struct trie_use_stats stats;
154#endif
19baf839
RO
155};
156
19baf839 157static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
a07f5f50
SH
158static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
159 int wasfull);
19baf839 160static struct node *resize(struct trie *t, struct tnode *tn);
2f80b3c8
RO
161static struct tnode *inflate(struct trie *t, struct tnode *tn);
162static struct tnode *halve(struct trie *t, struct tnode *tn);
19baf839 163static void tnode_free(struct tnode *tn);
19baf839 164
e18b890b 165static struct kmem_cache *fn_alias_kmem __read_mostly;
bc3c8c1e 166static struct kmem_cache *trie_leaf_kmem __read_mostly;
19baf839 167
06801916
SH
168static inline struct tnode *node_parent(struct node *node)
169{
b59cfbf7
ED
170 return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
171}
172
173static inline struct tnode *node_parent_rcu(struct node *node)
174{
175 struct tnode *ret = node_parent(node);
06801916 176
06801916
SH
177 return rcu_dereference(ret);
178}
179
6440cc9e
SH
180/* Same as rcu_assign_pointer
181 * but that macro() assumes that value is a pointer.
182 */
06801916
SH
183static inline void node_set_parent(struct node *node, struct tnode *ptr)
184{
6440cc9e
SH
185 smp_wmb();
186 node->parent = (unsigned long)ptr | NODE_TYPE(node);
06801916 187}
2373ce1c 188
b59cfbf7
ED
189static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
190{
191 BUG_ON(i >= 1U << tn->bits);
2373ce1c 192
b59cfbf7
ED
193 return tn->child[i];
194}
195
196static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
19baf839 197{
b59cfbf7 198 struct node *ret = tnode_get_child(tn, i);
19baf839 199
b59cfbf7 200 return rcu_dereference(ret);
19baf839
RO
201}
202
bb435b8d 203static inline int tnode_child_length(const struct tnode *tn)
19baf839 204{
91b9a277 205 return 1 << tn->bits;
19baf839
RO
206}
207
ab66b4a7
SH
208static inline t_key mask_pfx(t_key k, unsigned short l)
209{
210 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
211}
212
19baf839
RO
213static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
214{
91b9a277 215 if (offset < KEYLENGTH)
19baf839 216 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
91b9a277 217 else
19baf839
RO
218 return 0;
219}
220
221static inline int tkey_equals(t_key a, t_key b)
222{
c877efb2 223 return a == b;
19baf839
RO
224}
225
226static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
227{
c877efb2
SH
228 if (bits == 0 || offset >= KEYLENGTH)
229 return 1;
91b9a277
OJ
230 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
231 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
c877efb2 232}
19baf839
RO
233
234static inline int tkey_mismatch(t_key a, int offset, t_key b)
235{
236 t_key diff = a ^ b;
237 int i = offset;
238
c877efb2
SH
239 if (!diff)
240 return 0;
241 while ((diff << i) >> (KEYLENGTH-1) == 0)
19baf839
RO
242 i++;
243 return i;
244}
245
19baf839 246/*
e905a9ed
YH
247 To understand this stuff, an understanding of keys and all their bits is
248 necessary. Every node in the trie has a key associated with it, but not
19baf839
RO
249 all of the bits in that key are significant.
250
251 Consider a node 'n' and its parent 'tp'.
252
e905a9ed
YH
253 If n is a leaf, every bit in its key is significant. Its presence is
254 necessitated by path compression, since during a tree traversal (when
255 searching for a leaf - unless we are doing an insertion) we will completely
256 ignore all skipped bits we encounter. Thus we need to verify, at the end of
257 a potentially successful search, that we have indeed been walking the
19baf839
RO
258 correct key path.
259
e905a9ed
YH
260 Note that we can never "miss" the correct key in the tree if present by
261 following the wrong path. Path compression ensures that segments of the key
262 that are the same for all keys with a given prefix are skipped, but the
263 skipped part *is* identical for each node in the subtrie below the skipped
264 bit! trie_insert() in this implementation takes care of that - note the
19baf839
RO
265 call to tkey_sub_equals() in trie_insert().
266
e905a9ed 267 if n is an internal node - a 'tnode' here, the various parts of its key
19baf839
RO
268 have many different meanings.
269
e905a9ed 270 Example:
19baf839
RO
271 _________________________________________________________________
272 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
273 -----------------------------------------------------------------
e905a9ed 274 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
19baf839
RO
275
276 _________________________________________________________________
277 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
278 -----------------------------------------------------------------
279 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
280
281 tp->pos = 7
282 tp->bits = 3
283 n->pos = 15
91b9a277 284 n->bits = 4
19baf839 285
e905a9ed
YH
286 First, let's just ignore the bits that come before the parent tp, that is
287 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
19baf839
RO
288 not use them for anything.
289
290 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
e905a9ed 291 index into the parent's child array. That is, they will be used to find
19baf839
RO
292 'n' among tp's children.
293
294 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
295 for the node n.
296
e905a9ed 297 All the bits we have seen so far are significant to the node n. The rest
19baf839
RO
298 of the bits are really not needed or indeed known in n->key.
299
e905a9ed 300 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
19baf839 301 n's child array, and will of course be different for each child.
e905a9ed 302
c877efb2 303
19baf839
RO
304 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
305 at this point.
306
307*/
308
0c7770c7 309static inline void check_tnode(const struct tnode *tn)
19baf839 310{
0c7770c7 311 WARN_ON(tn && tn->pos+tn->bits > 32);
19baf839
RO
312}
313
f5026fab
DL
314static const int halve_threshold = 25;
315static const int inflate_threshold = 50;
316static const int halve_threshold_root = 8;
317static const int inflate_threshold_root = 15;
19baf839 318
2373ce1c
RO
319
320static void __alias_free_mem(struct rcu_head *head)
19baf839 321{
2373ce1c
RO
322 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
323 kmem_cache_free(fn_alias_kmem, fa);
19baf839
RO
324}
325
2373ce1c 326static inline void alias_free_mem_rcu(struct fib_alias *fa)
19baf839 327{
2373ce1c
RO
328 call_rcu(&fa->rcu, __alias_free_mem);
329}
91b9a277 330
2373ce1c
RO
331static void __leaf_free_rcu(struct rcu_head *head)
332{
bc3c8c1e
SH
333 struct leaf *l = container_of(head, struct leaf, rcu);
334 kmem_cache_free(trie_leaf_kmem, l);
2373ce1c 335}
91b9a277 336
2373ce1c 337static void __leaf_info_free_rcu(struct rcu_head *head)
19baf839 338{
2373ce1c 339 kfree(container_of(head, struct leaf_info, rcu));
19baf839
RO
340}
341
2373ce1c 342static inline void free_leaf_info(struct leaf_info *leaf)
19baf839 343{
2373ce1c 344 call_rcu(&leaf->rcu, __leaf_info_free_rcu);
19baf839
RO
345}
346
8d965444 347static struct tnode *tnode_alloc(size_t size)
f0e36f8c 348{
2373ce1c
RO
349 struct page *pages;
350
351 if (size <= PAGE_SIZE)
8d965444 352 return kzalloc(size, GFP_KERNEL);
2373ce1c
RO
353
354 pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
355 if (!pages)
356 return NULL;
357
358 return page_address(pages);
f0e36f8c
PM
359}
360
2373ce1c 361static void __tnode_free_rcu(struct rcu_head *head)
f0e36f8c 362{
2373ce1c 363 struct tnode *tn = container_of(head, struct tnode, rcu);
8d965444
ED
364 size_t size = sizeof(struct tnode) +
365 (sizeof(struct node *) << tn->bits);
f0e36f8c
PM
366
367 if (size <= PAGE_SIZE)
368 kfree(tn);
369 else
370 free_pages((unsigned long)tn, get_order(size));
371}
372
2373ce1c
RO
373static inline void tnode_free(struct tnode *tn)
374{
132adf54 375 if (IS_LEAF(tn)) {
550e29bc
RO
376 struct leaf *l = (struct leaf *) tn;
377 call_rcu_bh(&l->rcu, __leaf_free_rcu);
132adf54 378 } else
550e29bc 379 call_rcu(&tn->rcu, __tnode_free_rcu);
2373ce1c
RO
380}
381
382static struct leaf *leaf_new(void)
383{
bc3c8c1e 384 struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
2373ce1c
RO
385 if (l) {
386 l->parent = T_LEAF;
387 INIT_HLIST_HEAD(&l->list);
388 }
389 return l;
390}
391
392static struct leaf_info *leaf_info_new(int plen)
393{
394 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
395 if (li) {
396 li->plen = plen;
397 INIT_LIST_HEAD(&li->falh);
398 }
399 return li;
400}
401
a07f5f50 402static struct tnode *tnode_new(t_key key, int pos, int bits)
19baf839 403{
8d965444 404 size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
f0e36f8c 405 struct tnode *tn = tnode_alloc(sz);
19baf839 406
91b9a277 407 if (tn) {
2373ce1c 408 tn->parent = T_TNODE;
19baf839
RO
409 tn->pos = pos;
410 tn->bits = bits;
411 tn->key = key;
412 tn->full_children = 0;
413 tn->empty_children = 1<<bits;
414 }
c877efb2 415
8d965444
ED
416 pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode),
417 (unsigned long) (sizeof(struct node) << bits));
19baf839
RO
418 return tn;
419}
420
19baf839
RO
421/*
422 * Check whether a tnode 'n' is "full", i.e. it is an internal node
423 * and no bits are skipped. See discussion in dyntree paper p. 6
424 */
425
bb435b8d 426static inline int tnode_full(const struct tnode *tn, const struct node *n)
19baf839 427{
c877efb2 428 if (n == NULL || IS_LEAF(n))
19baf839
RO
429 return 0;
430
431 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
432}
433
a07f5f50
SH
434static inline void put_child(struct trie *t, struct tnode *tn, int i,
435 struct node *n)
19baf839
RO
436{
437 tnode_put_child_reorg(tn, i, n, -1);
438}
439
c877efb2 440 /*
19baf839
RO
441 * Add a child at position i overwriting the old value.
442 * Update the value of full_children and empty_children.
443 */
444
a07f5f50
SH
445static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
446 int wasfull)
19baf839 447{
2373ce1c 448 struct node *chi = tn->child[i];
19baf839
RO
449 int isfull;
450
0c7770c7
SH
451 BUG_ON(i >= 1<<tn->bits);
452
19baf839
RO
453 /* update emptyChildren */
454 if (n == NULL && chi != NULL)
455 tn->empty_children++;
456 else if (n != NULL && chi == NULL)
457 tn->empty_children--;
c877efb2 458
19baf839 459 /* update fullChildren */
91b9a277 460 if (wasfull == -1)
19baf839
RO
461 wasfull = tnode_full(tn, chi);
462
463 isfull = tnode_full(tn, n);
c877efb2 464 if (wasfull && !isfull)
19baf839 465 tn->full_children--;
c877efb2 466 else if (!wasfull && isfull)
19baf839 467 tn->full_children++;
91b9a277 468
c877efb2 469 if (n)
06801916 470 node_set_parent(n, tn);
19baf839 471
2373ce1c 472 rcu_assign_pointer(tn->child[i], n);
19baf839
RO
473}
474
c877efb2 475static struct node *resize(struct trie *t, struct tnode *tn)
19baf839
RO
476{
477 int i;
2f36895a 478 int err = 0;
2f80b3c8 479 struct tnode *old_tn;
e6308be8
RO
480 int inflate_threshold_use;
481 int halve_threshold_use;
05eee48c 482 int max_resize;
19baf839 483
e905a9ed 484 if (!tn)
19baf839
RO
485 return NULL;
486
0c7770c7
SH
487 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
488 tn, inflate_threshold, halve_threshold);
19baf839
RO
489
490 /* No children */
491 if (tn->empty_children == tnode_child_length(tn)) {
492 tnode_free(tn);
493 return NULL;
494 }
495 /* One child */
496 if (tn->empty_children == tnode_child_length(tn) - 1)
497 for (i = 0; i < tnode_child_length(tn); i++) {
91b9a277 498 struct node *n;
19baf839 499
91b9a277 500 n = tn->child[i];
2373ce1c 501 if (!n)
91b9a277 502 continue;
91b9a277
OJ
503
504 /* compress one level */
06801916 505 node_set_parent(n, NULL);
91b9a277
OJ
506 tnode_free(tn);
507 return n;
19baf839 508 }
c877efb2 509 /*
19baf839
RO
510 * Double as long as the resulting node has a number of
511 * nonempty nodes that are above the threshold.
512 */
513
514 /*
c877efb2
SH
515 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
516 * the Helsinki University of Technology and Matti Tikkanen of Nokia
19baf839 517 * Telecommunications, page 6:
c877efb2 518 * "A node is doubled if the ratio of non-empty children to all
19baf839
RO
519 * children in the *doubled* node is at least 'high'."
520 *
c877efb2
SH
521 * 'high' in this instance is the variable 'inflate_threshold'. It
522 * is expressed as a percentage, so we multiply it with
523 * tnode_child_length() and instead of multiplying by 2 (since the
524 * child array will be doubled by inflate()) and multiplying
525 * the left-hand side by 100 (to handle the percentage thing) we
19baf839 526 * multiply the left-hand side by 50.
c877efb2
SH
527 *
528 * The left-hand side may look a bit weird: tnode_child_length(tn)
529 * - tn->empty_children is of course the number of non-null children
530 * in the current node. tn->full_children is the number of "full"
19baf839 531 * children, that is non-null tnodes with a skip value of 0.
c877efb2 532 * All of those will be doubled in the resulting inflated tnode, so
19baf839 533 * we just count them one extra time here.
c877efb2 534 *
19baf839 535 * A clearer way to write this would be:
c877efb2 536 *
19baf839 537 * to_be_doubled = tn->full_children;
c877efb2 538 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
19baf839
RO
539 * tn->full_children;
540 *
541 * new_child_length = tnode_child_length(tn) * 2;
542 *
c877efb2 543 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
19baf839
RO
544 * new_child_length;
545 * if (new_fill_factor >= inflate_threshold)
c877efb2
SH
546 *
547 * ...and so on, tho it would mess up the while () loop.
548 *
19baf839
RO
549 * anyway,
550 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
551 * inflate_threshold
c877efb2 552 *
19baf839
RO
553 * avoid a division:
554 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
555 * inflate_threshold * new_child_length
c877efb2 556 *
19baf839 557 * expand not_to_be_doubled and to_be_doubled, and shorten:
c877efb2 558 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 559 * tn->full_children) >= inflate_threshold * new_child_length
c877efb2 560 *
19baf839 561 * expand new_child_length:
c877efb2 562 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 563 * tn->full_children) >=
19baf839 564 * inflate_threshold * tnode_child_length(tn) * 2
c877efb2 565 *
19baf839 566 * shorten again:
c877efb2 567 * 50 * (tn->full_children + tnode_child_length(tn) -
91b9a277 568 * tn->empty_children) >= inflate_threshold *
19baf839 569 * tnode_child_length(tn)
c877efb2 570 *
19baf839
RO
571 */
572
573 check_tnode(tn);
c877efb2 574
e6308be8
RO
575 /* Keep root node larger */
576
132adf54 577 if (!tn->parent)
e6308be8 578 inflate_threshold_use = inflate_threshold_root;
e905a9ed 579 else
e6308be8
RO
580 inflate_threshold_use = inflate_threshold;
581
2f36895a 582 err = 0;
05eee48c
RO
583 max_resize = 10;
584 while ((tn->full_children > 0 && max_resize-- &&
a07f5f50
SH
585 50 * (tn->full_children + tnode_child_length(tn)
586 - tn->empty_children)
587 >= inflate_threshold_use * tnode_child_length(tn))) {
19baf839 588
2f80b3c8
RO
589 old_tn = tn;
590 tn = inflate(t, tn);
a07f5f50 591
2f80b3c8
RO
592 if (IS_ERR(tn)) {
593 tn = old_tn;
2f36895a
RO
594#ifdef CONFIG_IP_FIB_TRIE_STATS
595 t->stats.resize_node_skipped++;
596#endif
597 break;
598 }
19baf839
RO
599 }
600
05eee48c
RO
601 if (max_resize < 0) {
602 if (!tn->parent)
a07f5f50
SH
603 pr_warning("Fix inflate_threshold_root."
604 " Now=%d size=%d bits\n",
605 inflate_threshold_root, tn->bits);
05eee48c 606 else
a07f5f50
SH
607 pr_warning("Fix inflate_threshold."
608 " Now=%d size=%d bits\n",
609 inflate_threshold, tn->bits);
05eee48c
RO
610 }
611
19baf839
RO
612 check_tnode(tn);
613
614 /*
615 * Halve as long as the number of empty children in this
616 * node is above threshold.
617 */
2f36895a 618
e6308be8
RO
619
620 /* Keep root node larger */
621
132adf54 622 if (!tn->parent)
e6308be8 623 halve_threshold_use = halve_threshold_root;
e905a9ed 624 else
e6308be8
RO
625 halve_threshold_use = halve_threshold;
626
2f36895a 627 err = 0;
05eee48c
RO
628 max_resize = 10;
629 while (tn->bits > 1 && max_resize-- &&
19baf839 630 100 * (tnode_child_length(tn) - tn->empty_children) <
e6308be8 631 halve_threshold_use * tnode_child_length(tn)) {
2f36895a 632
2f80b3c8
RO
633 old_tn = tn;
634 tn = halve(t, tn);
635 if (IS_ERR(tn)) {
636 tn = old_tn;
2f36895a
RO
637#ifdef CONFIG_IP_FIB_TRIE_STATS
638 t->stats.resize_node_skipped++;
639#endif
640 break;
641 }
642 }
19baf839 643
05eee48c
RO
644 if (max_resize < 0) {
645 if (!tn->parent)
a07f5f50
SH
646 pr_warning("Fix halve_threshold_root."
647 " Now=%d size=%d bits\n",
648 halve_threshold_root, tn->bits);
05eee48c 649 else
a07f5f50
SH
650 pr_warning("Fix halve_threshold."
651 " Now=%d size=%d bits\n",
652 halve_threshold, tn->bits);
05eee48c 653 }
c877efb2 654
19baf839 655 /* Only one child remains */
19baf839
RO
656 if (tn->empty_children == tnode_child_length(tn) - 1)
657 for (i = 0; i < tnode_child_length(tn); i++) {
91b9a277 658 struct node *n;
19baf839 659
91b9a277 660 n = tn->child[i];
2373ce1c 661 if (!n)
91b9a277 662 continue;
91b9a277
OJ
663
664 /* compress one level */
665
06801916 666 node_set_parent(n, NULL);
91b9a277
OJ
667 tnode_free(tn);
668 return n;
19baf839
RO
669 }
670
671 return (struct node *) tn;
672}
673
2f80b3c8 674static struct tnode *inflate(struct trie *t, struct tnode *tn)
19baf839 675{
19baf839
RO
676 struct tnode *oldtnode = tn;
677 int olen = tnode_child_length(tn);
678 int i;
679
0c7770c7 680 pr_debug("In inflate\n");
19baf839
RO
681
682 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
683
0c7770c7 684 if (!tn)
2f80b3c8 685 return ERR_PTR(-ENOMEM);
2f36895a
RO
686
687 /*
c877efb2
SH
688 * Preallocate and store tnodes before the actual work so we
689 * don't get into an inconsistent state if memory allocation
690 * fails. In case of failure we return the oldnode and inflate
2f36895a
RO
691 * of tnode is ignored.
692 */
91b9a277
OJ
693
694 for (i = 0; i < olen; i++) {
a07f5f50 695 struct tnode *inode;
2f36895a 696
a07f5f50 697 inode = (struct tnode *) tnode_get_child(oldtnode, i);
2f36895a
RO
698 if (inode &&
699 IS_TNODE(inode) &&
700 inode->pos == oldtnode->pos + oldtnode->bits &&
701 inode->bits > 1) {
702 struct tnode *left, *right;
ab66b4a7 703 t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
c877efb2 704
2f36895a
RO
705 left = tnode_new(inode->key&(~m), inode->pos + 1,
706 inode->bits - 1);
2f80b3c8
RO
707 if (!left)
708 goto nomem;
91b9a277 709
2f36895a
RO
710 right = tnode_new(inode->key|m, inode->pos + 1,
711 inode->bits - 1);
712
e905a9ed 713 if (!right) {
2f80b3c8
RO
714 tnode_free(left);
715 goto nomem;
e905a9ed 716 }
2f36895a
RO
717
718 put_child(t, tn, 2*i, (struct node *) left);
719 put_child(t, tn, 2*i+1, (struct node *) right);
720 }
721 }
722
91b9a277 723 for (i = 0; i < olen; i++) {
c95aaf9a 724 struct tnode *inode;
19baf839 725 struct node *node = tnode_get_child(oldtnode, i);
91b9a277
OJ
726 struct tnode *left, *right;
727 int size, j;
c877efb2 728
19baf839
RO
729 /* An empty child */
730 if (node == NULL)
731 continue;
732
733 /* A leaf or an internal node with skipped bits */
734
c877efb2 735 if (IS_LEAF(node) || ((struct tnode *) node)->pos >
19baf839 736 tn->pos + tn->bits - 1) {
a07f5f50
SH
737 if (tkey_extract_bits(node->key,
738 oldtnode->pos + oldtnode->bits,
739 1) == 0)
19baf839
RO
740 put_child(t, tn, 2*i, node);
741 else
742 put_child(t, tn, 2*i+1, node);
743 continue;
744 }
745
746 /* An internal node with two children */
747 inode = (struct tnode *) node;
748
749 if (inode->bits == 1) {
750 put_child(t, tn, 2*i, inode->child[0]);
751 put_child(t, tn, 2*i+1, inode->child[1]);
752
753 tnode_free(inode);
91b9a277 754 continue;
19baf839
RO
755 }
756
91b9a277
OJ
757 /* An internal node with more than two children */
758
759 /* We will replace this node 'inode' with two new
760 * ones, 'left' and 'right', each with half of the
761 * original children. The two new nodes will have
762 * a position one bit further down the key and this
763 * means that the "significant" part of their keys
764 * (see the discussion near the top of this file)
765 * will differ by one bit, which will be "0" in
766 * left's key and "1" in right's key. Since we are
767 * moving the key position by one step, the bit that
768 * we are moving away from - the bit at position
769 * (inode->pos) - is the one that will differ between
770 * left and right. So... we synthesize that bit in the
771 * two new keys.
772 * The mask 'm' below will be a single "one" bit at
773 * the position (inode->pos)
774 */
19baf839 775
91b9a277
OJ
776 /* Use the old key, but set the new significant
777 * bit to zero.
778 */
2f36895a 779
91b9a277
OJ
780 left = (struct tnode *) tnode_get_child(tn, 2*i);
781 put_child(t, tn, 2*i, NULL);
2f36895a 782
91b9a277 783 BUG_ON(!left);
2f36895a 784
91b9a277
OJ
785 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
786 put_child(t, tn, 2*i+1, NULL);
19baf839 787
91b9a277 788 BUG_ON(!right);
19baf839 789
91b9a277
OJ
790 size = tnode_child_length(left);
791 for (j = 0; j < size; j++) {
792 put_child(t, left, j, inode->child[j]);
793 put_child(t, right, j, inode->child[j + size]);
19baf839 794 }
91b9a277
OJ
795 put_child(t, tn, 2*i, resize(t, left));
796 put_child(t, tn, 2*i+1, resize(t, right));
797
798 tnode_free(inode);
19baf839
RO
799 }
800 tnode_free(oldtnode);
801 return tn;
2f80b3c8
RO
802nomem:
803 {
804 int size = tnode_child_length(tn);
805 int j;
806
0c7770c7 807 for (j = 0; j < size; j++)
2f80b3c8
RO
808 if (tn->child[j])
809 tnode_free((struct tnode *)tn->child[j]);
810
811 tnode_free(tn);
0c7770c7 812
2f80b3c8
RO
813 return ERR_PTR(-ENOMEM);
814 }
19baf839
RO
815}
816
2f80b3c8 817static struct tnode *halve(struct trie *t, struct tnode *tn)
19baf839
RO
818{
819 struct tnode *oldtnode = tn;
820 struct node *left, *right;
821 int i;
822 int olen = tnode_child_length(tn);
823
0c7770c7 824 pr_debug("In halve\n");
c877efb2
SH
825
826 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
19baf839 827
2f80b3c8
RO
828 if (!tn)
829 return ERR_PTR(-ENOMEM);
2f36895a
RO
830
831 /*
c877efb2
SH
832 * Preallocate and store tnodes before the actual work so we
833 * don't get into an inconsistent state if memory allocation
834 * fails. In case of failure we return the oldnode and halve
2f36895a
RO
835 * of tnode is ignored.
836 */
837
91b9a277 838 for (i = 0; i < olen; i += 2) {
2f36895a
RO
839 left = tnode_get_child(oldtnode, i);
840 right = tnode_get_child(oldtnode, i+1);
c877efb2 841
2f36895a 842 /* Two nonempty children */
0c7770c7 843 if (left && right) {
2f80b3c8 844 struct tnode *newn;
0c7770c7 845
2f80b3c8 846 newn = tnode_new(left->key, tn->pos + tn->bits, 1);
0c7770c7
SH
847
848 if (!newn)
2f80b3c8 849 goto nomem;
0c7770c7 850
2f80b3c8 851 put_child(t, tn, i/2, (struct node *)newn);
2f36895a 852 }
2f36895a 853
2f36895a 854 }
19baf839 855
91b9a277
OJ
856 for (i = 0; i < olen; i += 2) {
857 struct tnode *newBinNode;
858
19baf839
RO
859 left = tnode_get_child(oldtnode, i);
860 right = tnode_get_child(oldtnode, i+1);
c877efb2 861
19baf839
RO
862 /* At least one of the children is empty */
863 if (left == NULL) {
864 if (right == NULL) /* Both are empty */
865 continue;
866 put_child(t, tn, i/2, right);
91b9a277 867 continue;
0c7770c7 868 }
91b9a277
OJ
869
870 if (right == NULL) {
19baf839 871 put_child(t, tn, i/2, left);
91b9a277
OJ
872 continue;
873 }
c877efb2 874
19baf839 875 /* Two nonempty children */
91b9a277
OJ
876 newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
877 put_child(t, tn, i/2, NULL);
91b9a277
OJ
878 put_child(t, newBinNode, 0, left);
879 put_child(t, newBinNode, 1, right);
880 put_child(t, tn, i/2, resize(t, newBinNode));
19baf839
RO
881 }
882 tnode_free(oldtnode);
883 return tn;
2f80b3c8
RO
884nomem:
885 {
886 int size = tnode_child_length(tn);
887 int j;
888
0c7770c7 889 for (j = 0; j < size; j++)
2f80b3c8
RO
890 if (tn->child[j])
891 tnode_free((struct tnode *)tn->child[j]);
892
893 tnode_free(tn);
0c7770c7 894
2f80b3c8
RO
895 return ERR_PTR(-ENOMEM);
896 }
19baf839
RO
897}
898
772cb712 899/* readside must use rcu_read_lock currently dump routines
2373ce1c
RO
900 via get_fa_head and dump */
901
772cb712 902static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
19baf839 903{
772cb712 904 struct hlist_head *head = &l->list;
19baf839
RO
905 struct hlist_node *node;
906 struct leaf_info *li;
907
2373ce1c 908 hlist_for_each_entry_rcu(li, node, head, hlist)
c877efb2 909 if (li->plen == plen)
19baf839 910 return li;
91b9a277 911
19baf839
RO
912 return NULL;
913}
914
a07f5f50 915static inline struct list_head *get_fa_head(struct leaf *l, int plen)
19baf839 916{
772cb712 917 struct leaf_info *li = find_leaf_info(l, plen);
c877efb2 918
91b9a277
OJ
919 if (!li)
920 return NULL;
c877efb2 921
91b9a277 922 return &li->falh;
19baf839
RO
923}
924
925static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
926{
e905a9ed
YH
927 struct leaf_info *li = NULL, *last = NULL;
928 struct hlist_node *node;
929
930 if (hlist_empty(head)) {
931 hlist_add_head_rcu(&new->hlist, head);
932 } else {
933 hlist_for_each_entry(li, node, head, hlist) {
934 if (new->plen > li->plen)
935 break;
936
937 last = li;
938 }
939 if (last)
940 hlist_add_after_rcu(&last->hlist, &new->hlist);
941 else
942 hlist_add_before_rcu(&new->hlist, &li->hlist);
943 }
19baf839
RO
944}
945
2373ce1c
RO
946/* rcu_read_lock needs to be hold by caller from readside */
947
19baf839
RO
948static struct leaf *
949fib_find_node(struct trie *t, u32 key)
950{
951 int pos;
952 struct tnode *tn;
953 struct node *n;
954
955 pos = 0;
2373ce1c 956 n = rcu_dereference(t->trie);
19baf839
RO
957
958 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
959 tn = (struct tnode *) n;
91b9a277 960
19baf839 961 check_tnode(tn);
91b9a277 962
c877efb2 963 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
91b9a277 964 pos = tn->pos + tn->bits;
a07f5f50
SH
965 n = tnode_get_child_rcu(tn,
966 tkey_extract_bits(key,
967 tn->pos,
968 tn->bits));
91b9a277 969 } else
19baf839
RO
970 break;
971 }
972 /* Case we have found a leaf. Compare prefixes */
973
91b9a277
OJ
974 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
975 return (struct leaf *)n;
976
19baf839
RO
977 return NULL;
978}
979
980static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
981{
19baf839 982 int wasfull;
06801916
SH
983 t_key cindex, key = tn->key;
984 struct tnode *tp;
19baf839 985
06801916 986 while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
19baf839
RO
987 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
988 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
a07f5f50
SH
989 tn = (struct tnode *) resize(t, (struct tnode *)tn);
990
991 tnode_put_child_reorg((struct tnode *)tp, cindex,
992 (struct node *)tn, wasfull);
91b9a277 993
06801916
SH
994 tp = node_parent((struct node *) tn);
995 if (!tp)
19baf839 996 break;
06801916 997 tn = tp;
19baf839 998 }
06801916 999
19baf839 1000 /* Handle last (top) tnode */
c877efb2 1001 if (IS_TNODE(tn))
a07f5f50 1002 tn = (struct tnode *)resize(t, (struct tnode *)tn);
19baf839 1003
a07f5f50 1004 return (struct node *)tn;
19baf839
RO
1005}
1006
2373ce1c
RO
1007/* only used from updater-side */
1008
fea86ad8 1009static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
19baf839
RO
1010{
1011 int pos, newpos;
1012 struct tnode *tp = NULL, *tn = NULL;
1013 struct node *n;
1014 struct leaf *l;
1015 int missbit;
c877efb2 1016 struct list_head *fa_head = NULL;
19baf839
RO
1017 struct leaf_info *li;
1018 t_key cindex;
1019
1020 pos = 0;
c877efb2 1021 n = t->trie;
19baf839 1022
c877efb2
SH
1023 /* If we point to NULL, stop. Either the tree is empty and we should
1024 * just put a new leaf in if, or we have reached an empty child slot,
19baf839 1025 * and we should just put our new leaf in that.
c877efb2
SH
1026 * If we point to a T_TNODE, check if it matches our key. Note that
1027 * a T_TNODE might be skipping any number of bits - its 'pos' need
19baf839
RO
1028 * not be the parent's 'pos'+'bits'!
1029 *
c877efb2 1030 * If it does match the current key, get pos/bits from it, extract
19baf839
RO
1031 * the index from our key, push the T_TNODE and walk the tree.
1032 *
1033 * If it doesn't, we have to replace it with a new T_TNODE.
1034 *
c877efb2
SH
1035 * If we point to a T_LEAF, it might or might not have the same key
1036 * as we do. If it does, just change the value, update the T_LEAF's
1037 * value, and return it.
19baf839
RO
1038 * If it doesn't, we need to replace it with a T_TNODE.
1039 */
1040
1041 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
1042 tn = (struct tnode *) n;
91b9a277 1043
c877efb2 1044 check_tnode(tn);
91b9a277 1045
c877efb2 1046 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
19baf839 1047 tp = tn;
91b9a277 1048 pos = tn->pos + tn->bits;
a07f5f50
SH
1049 n = tnode_get_child(tn,
1050 tkey_extract_bits(key,
1051 tn->pos,
1052 tn->bits));
19baf839 1053
06801916 1054 BUG_ON(n && node_parent(n) != tn);
91b9a277 1055 } else
19baf839
RO
1056 break;
1057 }
1058
1059 /*
1060 * n ----> NULL, LEAF or TNODE
1061 *
c877efb2 1062 * tp is n's (parent) ----> NULL or TNODE
19baf839
RO
1063 */
1064
91b9a277 1065 BUG_ON(tp && IS_LEAF(tp));
19baf839
RO
1066
1067 /* Case 1: n is a leaf. Compare prefixes */
1068
c877efb2 1069 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
c95aaf9a 1070 l = (struct leaf *) n;
19baf839 1071 li = leaf_info_new(plen);
91b9a277 1072
fea86ad8
SH
1073 if (!li)
1074 return NULL;
19baf839
RO
1075
1076 fa_head = &li->falh;
1077 insert_leaf_info(&l->list, li);
1078 goto done;
1079 }
19baf839
RO
1080 l = leaf_new();
1081
fea86ad8
SH
1082 if (!l)
1083 return NULL;
19baf839
RO
1084
1085 l->key = key;
1086 li = leaf_info_new(plen);
1087
c877efb2 1088 if (!li) {
f835e471 1089 tnode_free((struct tnode *) l);
fea86ad8 1090 return NULL;
f835e471 1091 }
19baf839
RO
1092
1093 fa_head = &li->falh;
1094 insert_leaf_info(&l->list, li);
1095
19baf839 1096 if (t->trie && n == NULL) {
91b9a277 1097 /* Case 2: n is NULL, and will just insert a new leaf */
19baf839 1098
06801916 1099 node_set_parent((struct node *)l, tp);
19baf839 1100
91b9a277
OJ
1101 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1102 put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1103 } else {
1104 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
c877efb2
SH
1105 /*
1106 * Add a new tnode here
19baf839
RO
1107 * first tnode need some special handling
1108 */
1109
1110 if (tp)
91b9a277 1111 pos = tp->pos+tp->bits;
19baf839 1112 else
91b9a277
OJ
1113 pos = 0;
1114
c877efb2 1115 if (n) {
19baf839
RO
1116 newpos = tkey_mismatch(key, pos, n->key);
1117 tn = tnode_new(n->key, newpos, 1);
91b9a277 1118 } else {
19baf839 1119 newpos = 0;
c877efb2 1120 tn = tnode_new(key, newpos, 1); /* First tnode */
19baf839 1121 }
19baf839 1122
c877efb2 1123 if (!tn) {
f835e471
RO
1124 free_leaf_info(li);
1125 tnode_free((struct tnode *) l);
fea86ad8 1126 return NULL;
91b9a277
OJ
1127 }
1128
06801916 1129 node_set_parent((struct node *)tn, tp);
19baf839 1130
91b9a277 1131 missbit = tkey_extract_bits(key, newpos, 1);
19baf839
RO
1132 put_child(t, tn, missbit, (struct node *)l);
1133 put_child(t, tn, 1-missbit, n);
1134
c877efb2 1135 if (tp) {
19baf839 1136 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
a07f5f50
SH
1137 put_child(t, (struct tnode *)tp, cindex,
1138 (struct node *)tn);
91b9a277 1139 } else {
a07f5f50 1140 rcu_assign_pointer(t->trie, (struct node *)tn);
19baf839
RO
1141 tp = tn;
1142 }
1143 }
91b9a277
OJ
1144
1145 if (tp && tp->pos + tp->bits > 32)
a07f5f50
SH
1146 pr_warning("fib_trie"
1147 " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1148 tp, tp->pos, tp->bits, key, plen);
91b9a277 1149
19baf839 1150 /* Rebalance the trie */
2373ce1c
RO
1151
1152 rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
f835e471 1153done:
19baf839
RO
1154 return fa_head;
1155}
1156
d562f1f8
RO
1157/*
1158 * Caller must hold RTNL.
1159 */
4e902c57 1160static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
19baf839
RO
1161{
1162 struct trie *t = (struct trie *) tb->tb_data;
1163 struct fib_alias *fa, *new_fa;
c877efb2 1164 struct list_head *fa_head = NULL;
19baf839 1165 struct fib_info *fi;
4e902c57
TG
1166 int plen = cfg->fc_dst_len;
1167 u8 tos = cfg->fc_tos;
19baf839
RO
1168 u32 key, mask;
1169 int err;
1170 struct leaf *l;
1171
1172 if (plen > 32)
1173 return -EINVAL;
1174
4e902c57 1175 key = ntohl(cfg->fc_dst);
19baf839 1176
2dfe55b4 1177 pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
19baf839 1178
91b9a277 1179 mask = ntohl(inet_make_mask(plen));
19baf839 1180
c877efb2 1181 if (key & ~mask)
19baf839
RO
1182 return -EINVAL;
1183
1184 key = key & mask;
1185
4e902c57
TG
1186 fi = fib_create_info(cfg);
1187 if (IS_ERR(fi)) {
1188 err = PTR_ERR(fi);
19baf839 1189 goto err;
4e902c57 1190 }
19baf839
RO
1191
1192 l = fib_find_node(t, key);
c877efb2 1193 fa = NULL;
19baf839 1194
c877efb2 1195 if (l) {
19baf839
RO
1196 fa_head = get_fa_head(l, plen);
1197 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1198 }
1199
1200 /* Now fa, if non-NULL, points to the first fib alias
1201 * with the same keys [prefix,tos,priority], if such key already
1202 * exists or to the node before which we will insert new one.
1203 *
1204 * If fa is NULL, we will need to allocate a new one and
1205 * insert to the head of f.
1206 *
1207 * If f is NULL, no fib node matched the destination key
1208 * and we need to allocate a new one of those as well.
1209 */
1210
936f6f8e
JA
1211 if (fa && fa->fa_tos == tos &&
1212 fa->fa_info->fib_priority == fi->fib_priority) {
1213 struct fib_alias *fa_first, *fa_match;
19baf839
RO
1214
1215 err = -EEXIST;
4e902c57 1216 if (cfg->fc_nlflags & NLM_F_EXCL)
19baf839
RO
1217 goto out;
1218
936f6f8e
JA
1219 /* We have 2 goals:
1220 * 1. Find exact match for type, scope, fib_info to avoid
1221 * duplicate routes
1222 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1223 */
1224 fa_match = NULL;
1225 fa_first = fa;
1226 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1227 list_for_each_entry_continue(fa, fa_head, fa_list) {
1228 if (fa->fa_tos != tos)
1229 break;
1230 if (fa->fa_info->fib_priority != fi->fib_priority)
1231 break;
1232 if (fa->fa_type == cfg->fc_type &&
1233 fa->fa_scope == cfg->fc_scope &&
1234 fa->fa_info == fi) {
1235 fa_match = fa;
1236 break;
1237 }
1238 }
1239
4e902c57 1240 if (cfg->fc_nlflags & NLM_F_REPLACE) {
19baf839
RO
1241 struct fib_info *fi_drop;
1242 u8 state;
1243
936f6f8e
JA
1244 fa = fa_first;
1245 if (fa_match) {
1246 if (fa == fa_match)
1247 err = 0;
6725033f 1248 goto out;
936f6f8e 1249 }
2373ce1c 1250 err = -ENOBUFS;
e94b1766 1251 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
2373ce1c
RO
1252 if (new_fa == NULL)
1253 goto out;
19baf839
RO
1254
1255 fi_drop = fa->fa_info;
2373ce1c
RO
1256 new_fa->fa_tos = fa->fa_tos;
1257 new_fa->fa_info = fi;
4e902c57
TG
1258 new_fa->fa_type = cfg->fc_type;
1259 new_fa->fa_scope = cfg->fc_scope;
19baf839 1260 state = fa->fa_state;
936f6f8e 1261 new_fa->fa_state = state & ~FA_S_ACCESSED;
19baf839 1262
2373ce1c
RO
1263 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1264 alias_free_mem_rcu(fa);
19baf839
RO
1265
1266 fib_release_info(fi_drop);
1267 if (state & FA_S_ACCESSED)
91b9a277 1268 rt_cache_flush(-1);
b8f55831
MK
1269 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1270 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
19baf839 1271
91b9a277 1272 goto succeeded;
19baf839
RO
1273 }
1274 /* Error if we find a perfect match which
1275 * uses the same scope, type, and nexthop
1276 * information.
1277 */
936f6f8e
JA
1278 if (fa_match)
1279 goto out;
a07f5f50 1280
4e902c57 1281 if (!(cfg->fc_nlflags & NLM_F_APPEND))
936f6f8e 1282 fa = fa_first;
19baf839
RO
1283 }
1284 err = -ENOENT;
4e902c57 1285 if (!(cfg->fc_nlflags & NLM_F_CREATE))
19baf839
RO
1286 goto out;
1287
1288 err = -ENOBUFS;
e94b1766 1289 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
19baf839
RO
1290 if (new_fa == NULL)
1291 goto out;
1292
1293 new_fa->fa_info = fi;
1294 new_fa->fa_tos = tos;
4e902c57
TG
1295 new_fa->fa_type = cfg->fc_type;
1296 new_fa->fa_scope = cfg->fc_scope;
19baf839 1297 new_fa->fa_state = 0;
19baf839
RO
1298 /*
1299 * Insert new entry to the list.
1300 */
1301
c877efb2 1302 if (!fa_head) {
fea86ad8
SH
1303 fa_head = fib_insert_node(t, key, plen);
1304 if (unlikely(!fa_head)) {
1305 err = -ENOMEM;
f835e471 1306 goto out_free_new_fa;
fea86ad8 1307 }
f835e471 1308 }
19baf839 1309
2373ce1c
RO
1310 list_add_tail_rcu(&new_fa->fa_list,
1311 (fa ? &fa->fa_list : fa_head));
19baf839
RO
1312
1313 rt_cache_flush(-1);
4e902c57 1314 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
b8f55831 1315 &cfg->fc_nlinfo, 0);
19baf839
RO
1316succeeded:
1317 return 0;
f835e471
RO
1318
1319out_free_new_fa:
1320 kmem_cache_free(fn_alias_kmem, new_fa);
19baf839
RO
1321out:
1322 fib_release_info(fi);
91b9a277 1323err:
19baf839
RO
1324 return err;
1325}
1326
772cb712 1327/* should be called with rcu_read_lock */
a07f5f50
SH
1328static int check_leaf(struct trie *t, struct leaf *l,
1329 t_key key, const struct flowi *flp,
1330 struct fib_result *res)
19baf839 1331{
19baf839
RO
1332 struct leaf_info *li;
1333 struct hlist_head *hhead = &l->list;
1334 struct hlist_node *node;
c877efb2 1335
2373ce1c 1336 hlist_for_each_entry_rcu(li, node, hhead, hlist) {
a07f5f50
SH
1337 int err;
1338 int plen = li->plen;
1339 __be32 mask = inet_make_mask(plen);
1340
888454c5 1341 if (l->key != (key & ntohl(mask)))
19baf839
RO
1342 continue;
1343
a07f5f50
SH
1344 err = fib_semantic_match(&li->falh, flp, res,
1345 htonl(l->key), mask, plen);
1346
19baf839 1347#ifdef CONFIG_IP_FIB_TRIE_STATS
a07f5f50 1348 if (err <= 0)
19baf839 1349 t->stats.semantic_match_passed++;
a07f5f50
SH
1350 else
1351 t->stats.semantic_match_miss++;
19baf839 1352#endif
a07f5f50
SH
1353 if (err <= 0)
1354 return plen;
19baf839 1355 }
a07f5f50
SH
1356
1357 return -1;
19baf839
RO
1358}
1359
a07f5f50
SH
1360static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp,
1361 struct fib_result *res)
19baf839
RO
1362{
1363 struct trie *t = (struct trie *) tb->tb_data;
1364 int plen, ret = 0;
1365 struct node *n;
1366 struct tnode *pn;
1367 int pos, bits;
91b9a277 1368 t_key key = ntohl(flp->fl4_dst);
19baf839
RO
1369 int chopped_off;
1370 t_key cindex = 0;
1371 int current_prefix_length = KEYLENGTH;
91b9a277
OJ
1372 struct tnode *cn;
1373 t_key node_prefix, key_prefix, pref_mismatch;
1374 int mp;
1375
2373ce1c 1376 rcu_read_lock();
91b9a277 1377
2373ce1c 1378 n = rcu_dereference(t->trie);
c877efb2 1379 if (!n)
19baf839
RO
1380 goto failed;
1381
1382#ifdef CONFIG_IP_FIB_TRIE_STATS
1383 t->stats.gets++;
1384#endif
1385
1386 /* Just a leaf? */
1387 if (IS_LEAF(n)) {
a07f5f50
SH
1388 plen = check_leaf(t, (struct leaf *)n, key, flp, res);
1389 if (plen < 0)
1390 goto failed;
1391 ret = 0;
1392 goto found;
19baf839 1393 }
a07f5f50 1394
19baf839
RO
1395 pn = (struct tnode *) n;
1396 chopped_off = 0;
c877efb2 1397
91b9a277 1398 while (pn) {
19baf839
RO
1399 pos = pn->pos;
1400 bits = pn->bits;
1401
c877efb2 1402 if (!chopped_off)
ab66b4a7
SH
1403 cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1404 pos, bits);
19baf839
RO
1405
1406 n = tnode_get_child(pn, cindex);
1407
1408 if (n == NULL) {
1409#ifdef CONFIG_IP_FIB_TRIE_STATS
1410 t->stats.null_node_hit++;
1411#endif
1412 goto backtrace;
1413 }
1414
91b9a277 1415 if (IS_LEAF(n)) {
a07f5f50
SH
1416 plen = check_leaf(t, (struct leaf *)n, key, flp, res);
1417 if (plen < 0)
91b9a277 1418 goto backtrace;
a07f5f50
SH
1419
1420 ret = 0;
1421 goto found;
91b9a277
OJ
1422 }
1423
91b9a277 1424 cn = (struct tnode *)n;
19baf839 1425
91b9a277
OJ
1426 /*
1427 * It's a tnode, and we can do some extra checks here if we
1428 * like, to avoid descending into a dead-end branch.
1429 * This tnode is in the parent's child array at index
1430 * key[p_pos..p_pos+p_bits] but potentially with some bits
1431 * chopped off, so in reality the index may be just a
1432 * subprefix, padded with zero at the end.
1433 * We can also take a look at any skipped bits in this
1434 * tnode - everything up to p_pos is supposed to be ok,
1435 * and the non-chopped bits of the index (se previous
1436 * paragraph) are also guaranteed ok, but the rest is
1437 * considered unknown.
1438 *
1439 * The skipped bits are key[pos+bits..cn->pos].
1440 */
19baf839 1441
91b9a277
OJ
1442 /* If current_prefix_length < pos+bits, we are already doing
1443 * actual prefix matching, which means everything from
1444 * pos+(bits-chopped_off) onward must be zero along some
1445 * branch of this subtree - otherwise there is *no* valid
1446 * prefix present. Here we can only check the skipped
1447 * bits. Remember, since we have already indexed into the
1448 * parent's child array, we know that the bits we chopped of
1449 * *are* zero.
1450 */
19baf839 1451
a07f5f50
SH
1452 /* NOTA BENE: Checking only skipped bits
1453 for the new node here */
19baf839 1454
91b9a277
OJ
1455 if (current_prefix_length < pos+bits) {
1456 if (tkey_extract_bits(cn->key, current_prefix_length,
a07f5f50
SH
1457 cn->pos - current_prefix_length)
1458 || !(cn->child[0]))
91b9a277
OJ
1459 goto backtrace;
1460 }
19baf839 1461
91b9a277
OJ
1462 /*
1463 * If chopped_off=0, the index is fully validated and we
1464 * only need to look at the skipped bits for this, the new,
1465 * tnode. What we actually want to do is to find out if
1466 * these skipped bits match our key perfectly, or if we will
1467 * have to count on finding a matching prefix further down,
1468 * because if we do, we would like to have some way of
1469 * verifying the existence of such a prefix at this point.
1470 */
19baf839 1471
91b9a277
OJ
1472 /* The only thing we can do at this point is to verify that
1473 * any such matching prefix can indeed be a prefix to our
1474 * key, and if the bits in the node we are inspecting that
1475 * do not match our key are not ZERO, this cannot be true.
1476 * Thus, find out where there is a mismatch (before cn->pos)
1477 * and verify that all the mismatching bits are zero in the
1478 * new tnode's key.
1479 */
19baf839 1480
a07f5f50
SH
1481 /*
1482 * Note: We aren't very concerned about the piece of
1483 * the key that precede pn->pos+pn->bits, since these
1484 * have already been checked. The bits after cn->pos
1485 * aren't checked since these are by definition
1486 * "unknown" at this point. Thus, what we want to see
1487 * is if we are about to enter the "prefix matching"
1488 * state, and in that case verify that the skipped
1489 * bits that will prevail throughout this subtree are
1490 * zero, as they have to be if we are to find a
1491 * matching prefix.
91b9a277
OJ
1492 */
1493
ab66b4a7
SH
1494 node_prefix = mask_pfx(cn->key, cn->pos);
1495 key_prefix = mask_pfx(key, cn->pos);
91b9a277
OJ
1496 pref_mismatch = key_prefix^node_prefix;
1497 mp = 0;
1498
a07f5f50
SH
1499 /*
1500 * In short: If skipped bits in this node do not match
1501 * the search key, enter the "prefix matching"
1502 * state.directly.
91b9a277
OJ
1503 */
1504 if (pref_mismatch) {
1505 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1506 mp++;
a07f5f50 1507 pref_mismatch = pref_mismatch << 1;
91b9a277
OJ
1508 }
1509 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1510
1511 if (key_prefix != 0)
1512 goto backtrace;
1513
1514 if (current_prefix_length >= cn->pos)
1515 current_prefix_length = mp;
c877efb2 1516 }
a07f5f50 1517
91b9a277
OJ
1518 pn = (struct tnode *)n; /* Descend */
1519 chopped_off = 0;
1520 continue;
1521
19baf839
RO
1522backtrace:
1523 chopped_off++;
1524
1525 /* As zero don't change the child key (cindex) */
a07f5f50
SH
1526 while ((chopped_off <= pn->bits)
1527 && !(cindex & (1<<(chopped_off-1))))
19baf839 1528 chopped_off++;
19baf839
RO
1529
1530 /* Decrease current_... with bits chopped off */
1531 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
a07f5f50
SH
1532 current_prefix_length = pn->pos + pn->bits
1533 - chopped_off;
91b9a277 1534
19baf839 1535 /*
c877efb2 1536 * Either we do the actual chop off according or if we have
19baf839
RO
1537 * chopped off all bits in this tnode walk up to our parent.
1538 */
1539
91b9a277 1540 if (chopped_off <= pn->bits) {
19baf839 1541 cindex &= ~(1 << (chopped_off-1));
91b9a277 1542 } else {
06801916
SH
1543 struct tnode *parent = node_parent((struct node *) pn);
1544 if (!parent)
19baf839 1545 goto failed;
91b9a277 1546
19baf839 1547 /* Get Child's index */
06801916
SH
1548 cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1549 pn = parent;
19baf839
RO
1550 chopped_off = 0;
1551
1552#ifdef CONFIG_IP_FIB_TRIE_STATS
1553 t->stats.backtrack++;
1554#endif
1555 goto backtrace;
c877efb2 1556 }
19baf839
RO
1557 }
1558failed:
c877efb2 1559 ret = 1;
19baf839 1560found:
2373ce1c 1561 rcu_read_unlock();
19baf839
RO
1562 return ret;
1563}
1564
9195bef7
SH
1565/*
1566 * Remove the leaf and return parent.
1567 */
1568static void trie_leaf_remove(struct trie *t, struct leaf *l)
19baf839 1569{
9195bef7 1570 struct tnode *tp = node_parent((struct node *) l);
c877efb2 1571
9195bef7 1572 pr_debug("entering trie_leaf_remove(%p)\n", l);
19baf839 1573
c877efb2 1574 if (tp) {
9195bef7 1575 t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
19baf839 1576 put_child(t, (struct tnode *)tp, cindex, NULL);
2373ce1c 1577 rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
91b9a277 1578 } else
2373ce1c 1579 rcu_assign_pointer(t->trie, NULL);
19baf839 1580
9195bef7 1581 tnode_free((struct tnode *) l);
19baf839
RO
1582}
1583
d562f1f8
RO
1584/*
1585 * Caller must hold RTNL.
1586 */
4e902c57 1587static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
19baf839
RO
1588{
1589 struct trie *t = (struct trie *) tb->tb_data;
1590 u32 key, mask;
4e902c57
TG
1591 int plen = cfg->fc_dst_len;
1592 u8 tos = cfg->fc_tos;
19baf839
RO
1593 struct fib_alias *fa, *fa_to_delete;
1594 struct list_head *fa_head;
1595 struct leaf *l;
91b9a277
OJ
1596 struct leaf_info *li;
1597
c877efb2 1598 if (plen > 32)
19baf839
RO
1599 return -EINVAL;
1600
4e902c57 1601 key = ntohl(cfg->fc_dst);
91b9a277 1602 mask = ntohl(inet_make_mask(plen));
19baf839 1603
c877efb2 1604 if (key & ~mask)
19baf839
RO
1605 return -EINVAL;
1606
1607 key = key & mask;
1608 l = fib_find_node(t, key);
1609
c877efb2 1610 if (!l)
19baf839
RO
1611 return -ESRCH;
1612
1613 fa_head = get_fa_head(l, plen);
1614 fa = fib_find_alias(fa_head, tos, 0);
1615
1616 if (!fa)
1617 return -ESRCH;
1618
0c7770c7 1619 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
19baf839
RO
1620
1621 fa_to_delete = NULL;
936f6f8e
JA
1622 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1623 list_for_each_entry_continue(fa, fa_head, fa_list) {
19baf839
RO
1624 struct fib_info *fi = fa->fa_info;
1625
1626 if (fa->fa_tos != tos)
1627 break;
1628
4e902c57
TG
1629 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1630 (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1631 fa->fa_scope == cfg->fc_scope) &&
1632 (!cfg->fc_protocol ||
1633 fi->fib_protocol == cfg->fc_protocol) &&
1634 fib_nh_match(cfg, fi) == 0) {
19baf839
RO
1635 fa_to_delete = fa;
1636 break;
1637 }
1638 }
1639
91b9a277
OJ
1640 if (!fa_to_delete)
1641 return -ESRCH;
19baf839 1642
91b9a277 1643 fa = fa_to_delete;
4e902c57 1644 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
b8f55831 1645 &cfg->fc_nlinfo, 0);
91b9a277
OJ
1646
1647 l = fib_find_node(t, key);
772cb712 1648 li = find_leaf_info(l, plen);
19baf839 1649
2373ce1c 1650 list_del_rcu(&fa->fa_list);
19baf839 1651
91b9a277 1652 if (list_empty(fa_head)) {
2373ce1c 1653 hlist_del_rcu(&li->hlist);
91b9a277 1654 free_leaf_info(li);
2373ce1c 1655 }
19baf839 1656
91b9a277 1657 if (hlist_empty(&l->list))
9195bef7 1658 trie_leaf_remove(t, l);
19baf839 1659
91b9a277
OJ
1660 if (fa->fa_state & FA_S_ACCESSED)
1661 rt_cache_flush(-1);
19baf839 1662
2373ce1c
RO
1663 fib_release_info(fa->fa_info);
1664 alias_free_mem_rcu(fa);
91b9a277 1665 return 0;
19baf839
RO
1666}
1667
1668static int trie_flush_list(struct trie *t, struct list_head *head)
1669{
1670 struct fib_alias *fa, *fa_node;
1671 int found = 0;
1672
1673 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1674 struct fib_info *fi = fa->fa_info;
19baf839 1675
2373ce1c
RO
1676 if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1677 list_del_rcu(&fa->fa_list);
1678 fib_release_info(fa->fa_info);
1679 alias_free_mem_rcu(fa);
19baf839
RO
1680 found++;
1681 }
1682 }
1683 return found;
1684}
1685
1686static int trie_flush_leaf(struct trie *t, struct leaf *l)
1687{
1688 int found = 0;
1689 struct hlist_head *lih = &l->list;
1690 struct hlist_node *node, *tmp;
1691 struct leaf_info *li = NULL;
1692
1693 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
19baf839
RO
1694 found += trie_flush_list(t, &li->falh);
1695
1696 if (list_empty(&li->falh)) {
2373ce1c 1697 hlist_del_rcu(&li->hlist);
19baf839
RO
1698 free_leaf_info(li);
1699 }
1700 }
1701 return found;
1702}
1703
82cfbb00
SH
1704/*
1705 * Scan for the next right leaf starting at node p->child[idx]
1706 * Since we have back pointer, no recursion necessary.
1707 */
1708static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
19baf839 1709{
82cfbb00
SH
1710 do {
1711 t_key idx;
c877efb2 1712
c877efb2 1713 if (c)
82cfbb00 1714 idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
c877efb2 1715 else
82cfbb00 1716 idx = 0;
2373ce1c 1717
82cfbb00
SH
1718 while (idx < 1u << p->bits) {
1719 c = tnode_get_child_rcu(p, idx++);
2373ce1c 1720 if (!c)
91b9a277
OJ
1721 continue;
1722
82cfbb00
SH
1723 if (IS_LEAF(c)) {
1724 prefetch(p->child[idx]);
1725 return (struct leaf *) c;
19baf839 1726 }
82cfbb00
SH
1727
1728 /* Rescan start scanning in new node */
1729 p = (struct tnode *) c;
1730 idx = 0;
19baf839 1731 }
82cfbb00
SH
1732
1733 /* Node empty, walk back up to parent */
91b9a277 1734 c = (struct node *) p;
82cfbb00
SH
1735 } while ( (p = node_parent_rcu(c)) != NULL);
1736
1737 return NULL; /* Root of trie */
1738}
1739
82cfbb00
SH
1740static struct leaf *trie_firstleaf(struct trie *t)
1741{
1742 struct tnode *n = (struct tnode *) rcu_dereference(t->trie);
1743
1744 if (!n)
1745 return NULL;
1746
1747 if (IS_LEAF(n)) /* trie is just a leaf */
1748 return (struct leaf *) n;
1749
1750 return leaf_walk_rcu(n, NULL);
1751}
1752
1753static struct leaf *trie_nextleaf(struct leaf *l)
1754{
1755 struct node *c = (struct node *) l;
1756 struct tnode *p = node_parent(c);
1757
1758 if (!p)
1759 return NULL; /* trie with just one leaf */
1760
1761 return leaf_walk_rcu(p, c);
19baf839
RO
1762}
1763
71d67e66
SH
1764static struct leaf *trie_leafindex(struct trie *t, int index)
1765{
1766 struct leaf *l = trie_firstleaf(t);
1767
ec28cf73 1768 while (l && index-- > 0)
71d67e66 1769 l = trie_nextleaf(l);
ec28cf73 1770
71d67e66
SH
1771 return l;
1772}
1773
1774
d562f1f8
RO
1775/*
1776 * Caller must hold RTNL.
1777 */
19baf839
RO
1778static int fn_trie_flush(struct fib_table *tb)
1779{
1780 struct trie *t = (struct trie *) tb->tb_data;
9195bef7 1781 struct leaf *l, *ll = NULL;
82cfbb00 1782 int found = 0;
19baf839 1783
82cfbb00 1784 for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
19baf839
RO
1785 found += trie_flush_leaf(t, l);
1786
1787 if (ll && hlist_empty(&ll->list))
9195bef7 1788 trie_leaf_remove(t, ll);
19baf839
RO
1789 ll = l;
1790 }
1791
1792 if (ll && hlist_empty(&ll->list))
9195bef7 1793 trie_leaf_remove(t, ll);
19baf839 1794
0c7770c7 1795 pr_debug("trie_flush found=%d\n", found);
19baf839
RO
1796 return found;
1797}
1798
a07f5f50
SH
1799static void fn_trie_select_default(struct fib_table *tb,
1800 const struct flowi *flp,
1801 struct fib_result *res)
19baf839
RO
1802{
1803 struct trie *t = (struct trie *) tb->tb_data;
1804 int order, last_idx;
1805 struct fib_info *fi = NULL;
1806 struct fib_info *last_resort;
1807 struct fib_alias *fa = NULL;
1808 struct list_head *fa_head;
1809 struct leaf *l;
1810
1811 last_idx = -1;
1812 last_resort = NULL;
1813 order = -1;
1814
2373ce1c 1815 rcu_read_lock();
c877efb2 1816
19baf839 1817 l = fib_find_node(t, 0);
c877efb2 1818 if (!l)
19baf839
RO
1819 goto out;
1820
1821 fa_head = get_fa_head(l, 0);
c877efb2 1822 if (!fa_head)
19baf839
RO
1823 goto out;
1824
c877efb2 1825 if (list_empty(fa_head))
19baf839
RO
1826 goto out;
1827
2373ce1c 1828 list_for_each_entry_rcu(fa, fa_head, fa_list) {
19baf839 1829 struct fib_info *next_fi = fa->fa_info;
91b9a277 1830
19baf839
RO
1831 if (fa->fa_scope != res->scope ||
1832 fa->fa_type != RTN_UNICAST)
1833 continue;
91b9a277 1834
19baf839
RO
1835 if (next_fi->fib_priority > res->fi->fib_priority)
1836 break;
1837 if (!next_fi->fib_nh[0].nh_gw ||
1838 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1839 continue;
1840 fa->fa_state |= FA_S_ACCESSED;
91b9a277 1841
19baf839
RO
1842 if (fi == NULL) {
1843 if (next_fi != res->fi)
1844 break;
1845 } else if (!fib_detect_death(fi, order, &last_resort,
971b893e 1846 &last_idx, tb->tb_default)) {
a2bbe682 1847 fib_result_assign(res, fi);
971b893e 1848 tb->tb_default = order;
19baf839
RO
1849 goto out;
1850 }
1851 fi = next_fi;
1852 order++;
1853 }
1854 if (order <= 0 || fi == NULL) {
971b893e 1855 tb->tb_default = -1;
19baf839
RO
1856 goto out;
1857 }
1858
971b893e
DL
1859 if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1860 tb->tb_default)) {
a2bbe682 1861 fib_result_assign(res, fi);
971b893e 1862 tb->tb_default = order;
19baf839
RO
1863 goto out;
1864 }
a2bbe682
DL
1865 if (last_idx >= 0)
1866 fib_result_assign(res, last_resort);
971b893e
DL
1867 tb->tb_default = last_idx;
1868out:
2373ce1c 1869 rcu_read_unlock();
19baf839
RO
1870}
1871
a07f5f50
SH
1872static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1873 struct fib_table *tb,
19baf839
RO
1874 struct sk_buff *skb, struct netlink_callback *cb)
1875{
1876 int i, s_i;
1877 struct fib_alias *fa;
32ab5f80 1878 __be32 xkey = htonl(key);
19baf839 1879
71d67e66 1880 s_i = cb->args[5];
19baf839
RO
1881 i = 0;
1882
2373ce1c
RO
1883 /* rcu_read_lock is hold by caller */
1884
1885 list_for_each_entry_rcu(fa, fah, fa_list) {
19baf839
RO
1886 if (i < s_i) {
1887 i++;
1888 continue;
1889 }
19baf839
RO
1890
1891 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1892 cb->nlh->nlmsg_seq,
1893 RTM_NEWROUTE,
1894 tb->tb_id,
1895 fa->fa_type,
1896 fa->fa_scope,
be403ea1 1897 xkey,
19baf839
RO
1898 plen,
1899 fa->fa_tos,
64347f78 1900 fa->fa_info, NLM_F_MULTI) < 0) {
71d67e66 1901 cb->args[5] = i;
19baf839 1902 return -1;
91b9a277 1903 }
19baf839
RO
1904 i++;
1905 }
71d67e66 1906 cb->args[5] = i;
19baf839
RO
1907 return skb->len;
1908}
1909
a88ee229
SH
1910static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1911 struct sk_buff *skb, struct netlink_callback *cb)
19baf839 1912{
a88ee229
SH
1913 struct leaf_info *li;
1914 struct hlist_node *node;
1915 int i, s_i;
19baf839 1916
71d67e66 1917 s_i = cb->args[4];
a88ee229 1918 i = 0;
19baf839 1919
a88ee229
SH
1920 /* rcu_read_lock is hold by caller */
1921 hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
1922 if (i < s_i) {
1923 i++;
19baf839 1924 continue;
a88ee229 1925 }
91b9a277 1926
a88ee229 1927 if (i > s_i)
71d67e66 1928 cb->args[5] = 0;
19baf839 1929
a88ee229 1930 if (list_empty(&li->falh))
19baf839
RO
1931 continue;
1932
a88ee229 1933 if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
71d67e66 1934 cb->args[4] = i;
19baf839
RO
1935 return -1;
1936 }
a88ee229 1937 i++;
19baf839 1938 }
a88ee229 1939
71d67e66 1940 cb->args[4] = i;
19baf839
RO
1941 return skb->len;
1942}
1943
a07f5f50
SH
1944static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb,
1945 struct netlink_callback *cb)
19baf839 1946{
a88ee229 1947 struct leaf *l;
19baf839 1948 struct trie *t = (struct trie *) tb->tb_data;
d5ce8a0e 1949 t_key key = cb->args[2];
71d67e66 1950 int count = cb->args[3];
19baf839 1951
2373ce1c 1952 rcu_read_lock();
d5ce8a0e
SH
1953 /* Dump starting at last key.
1954 * Note: 0.0.0.0/0 (ie default) is first key.
1955 */
71d67e66 1956 if (count == 0)
d5ce8a0e
SH
1957 l = trie_firstleaf(t);
1958 else {
71d67e66
SH
1959 /* Normally, continue from last key, but if that is missing
1960 * fallback to using slow rescan
1961 */
d5ce8a0e 1962 l = fib_find_node(t, key);
71d67e66
SH
1963 if (!l)
1964 l = trie_leafindex(t, count);
d5ce8a0e 1965 }
a88ee229 1966
d5ce8a0e
SH
1967 while (l) {
1968 cb->args[2] = l->key;
a88ee229 1969 if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
71d67e66 1970 cb->args[3] = count;
a88ee229 1971 rcu_read_unlock();
a88ee229 1972 return -1;
19baf839 1973 }
d5ce8a0e 1974
71d67e66 1975 ++count;
d5ce8a0e 1976 l = trie_nextleaf(l);
71d67e66
SH
1977 memset(&cb->args[4], 0,
1978 sizeof(cb->args) - 4*sizeof(cb->args[0]));
19baf839 1979 }
71d67e66 1980 cb->args[3] = count;
2373ce1c 1981 rcu_read_unlock();
a88ee229 1982
19baf839 1983 return skb->len;
19baf839
RO
1984}
1985
7f9b8052
SH
1986void __init fib_hash_init(void)
1987{
a07f5f50
SH
1988 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1989 sizeof(struct fib_alias),
bc3c8c1e
SH
1990 0, SLAB_PANIC, NULL);
1991
1992 trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
1993 max(sizeof(struct leaf),
1994 sizeof(struct leaf_info)),
1995 0, SLAB_PANIC, NULL);
7f9b8052 1996}
19baf839 1997
7f9b8052
SH
1998
1999/* Fix more generic FIB names for init later */
2000struct fib_table *fib_hash_table(u32 id)
19baf839
RO
2001{
2002 struct fib_table *tb;
2003 struct trie *t;
2004
19baf839
RO
2005 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
2006 GFP_KERNEL);
2007 if (tb == NULL)
2008 return NULL;
2009
2010 tb->tb_id = id;
971b893e 2011 tb->tb_default = -1;
19baf839
RO
2012 tb->tb_lookup = fn_trie_lookup;
2013 tb->tb_insert = fn_trie_insert;
2014 tb->tb_delete = fn_trie_delete;
2015 tb->tb_flush = fn_trie_flush;
2016 tb->tb_select_default = fn_trie_select_default;
2017 tb->tb_dump = fn_trie_dump;
19baf839
RO
2018
2019 t = (struct trie *) tb->tb_data;
c28a1cf4 2020 memset(t, 0, sizeof(*t));
19baf839 2021
19baf839 2022 if (id == RT_TABLE_LOCAL)
a07f5f50 2023 pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
19baf839
RO
2024
2025 return tb;
2026}
2027
cb7b593c
SH
2028#ifdef CONFIG_PROC_FS
2029/* Depth first Trie walk iterator */
2030struct fib_trie_iter {
1c340b2f 2031 struct seq_net_private p;
3d3b2d25 2032 struct fib_table *tb;
cb7b593c 2033 struct tnode *tnode;
cb7b593c
SH
2034 unsigned index;
2035 unsigned depth;
2036};
19baf839 2037
cb7b593c 2038static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
19baf839 2039{
cb7b593c
SH
2040 struct tnode *tn = iter->tnode;
2041 unsigned cindex = iter->index;
2042 struct tnode *p;
19baf839 2043
6640e697
EB
2044 /* A single entry routing table */
2045 if (!tn)
2046 return NULL;
2047
cb7b593c
SH
2048 pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2049 iter->tnode, iter->index, iter->depth);
2050rescan:
2051 while (cindex < (1<<tn->bits)) {
b59cfbf7 2052 struct node *n = tnode_get_child_rcu(tn, cindex);
19baf839 2053
cb7b593c
SH
2054 if (n) {
2055 if (IS_LEAF(n)) {
2056 iter->tnode = tn;
2057 iter->index = cindex + 1;
2058 } else {
2059 /* push down one level */
2060 iter->tnode = (struct tnode *) n;
2061 iter->index = 0;
2062 ++iter->depth;
2063 }
2064 return n;
2065 }
19baf839 2066
cb7b593c
SH
2067 ++cindex;
2068 }
91b9a277 2069
cb7b593c 2070 /* Current node exhausted, pop back up */
b59cfbf7 2071 p = node_parent_rcu((struct node *)tn);
cb7b593c
SH
2072 if (p) {
2073 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2074 tn = p;
2075 --iter->depth;
2076 goto rescan;
19baf839 2077 }
cb7b593c
SH
2078
2079 /* got root? */
2080 return NULL;
19baf839
RO
2081}
2082
cb7b593c
SH
2083static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2084 struct trie *t)
19baf839 2085{
3d3b2d25 2086 struct node *n;
5ddf0eb2 2087
132adf54 2088 if (!t)
5ddf0eb2
RO
2089 return NULL;
2090
2091 n = rcu_dereference(t->trie);
3d3b2d25 2092 if (!n)
5ddf0eb2 2093 return NULL;
19baf839 2094
3d3b2d25
SH
2095 if (IS_TNODE(n)) {
2096 iter->tnode = (struct tnode *) n;
2097 iter->index = 0;
2098 iter->depth = 1;
2099 } else {
2100 iter->tnode = NULL;
2101 iter->index = 0;
2102 iter->depth = 0;
91b9a277 2103 }
3d3b2d25
SH
2104
2105 return n;
cb7b593c 2106}
91b9a277 2107
cb7b593c
SH
2108static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2109{
2110 struct node *n;
2111 struct fib_trie_iter iter;
91b9a277 2112
cb7b593c 2113 memset(s, 0, sizeof(*s));
91b9a277 2114
cb7b593c 2115 rcu_read_lock();
3d3b2d25 2116 for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
cb7b593c 2117 if (IS_LEAF(n)) {
93672292
SH
2118 struct leaf *l = (struct leaf *)n;
2119 struct leaf_info *li;
2120 struct hlist_node *tmp;
2121
cb7b593c
SH
2122 s->leaves++;
2123 s->totdepth += iter.depth;
2124 if (iter.depth > s->maxdepth)
2125 s->maxdepth = iter.depth;
93672292
SH
2126
2127 hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
2128 ++s->prefixes;
cb7b593c
SH
2129 } else {
2130 const struct tnode *tn = (const struct tnode *) n;
2131 int i;
2132
2133 s->tnodes++;
132adf54 2134 if (tn->bits < MAX_STAT_DEPTH)
06ef921d
RO
2135 s->nodesizes[tn->bits]++;
2136
cb7b593c
SH
2137 for (i = 0; i < (1<<tn->bits); i++)
2138 if (!tn->child[i])
2139 s->nullpointers++;
19baf839 2140 }
19baf839 2141 }
2373ce1c 2142 rcu_read_unlock();
19baf839
RO
2143}
2144
cb7b593c
SH
2145/*
2146 * This outputs /proc/net/fib_triestats
2147 */
2148static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
19baf839 2149{
cb7b593c 2150 unsigned i, max, pointers, bytes, avdepth;
c877efb2 2151
cb7b593c
SH
2152 if (stat->leaves)
2153 avdepth = stat->totdepth*100 / stat->leaves;
2154 else
2155 avdepth = 0;
91b9a277 2156
a07f5f50
SH
2157 seq_printf(seq, "\tAver depth: %u.%02d\n",
2158 avdepth / 100, avdepth % 100);
cb7b593c 2159 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
91b9a277 2160
cb7b593c 2161 seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
cb7b593c 2162 bytes = sizeof(struct leaf) * stat->leaves;
93672292
SH
2163
2164 seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
2165 bytes += sizeof(struct leaf_info) * stat->prefixes;
2166
187b5188 2167 seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
cb7b593c 2168 bytes += sizeof(struct tnode) * stat->tnodes;
19baf839 2169
06ef921d
RO
2170 max = MAX_STAT_DEPTH;
2171 while (max > 0 && stat->nodesizes[max-1] == 0)
cb7b593c 2172 max--;
19baf839 2173
cb7b593c
SH
2174 pointers = 0;
2175 for (i = 1; i <= max; i++)
2176 if (stat->nodesizes[i] != 0) {
187b5188 2177 seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
cb7b593c
SH
2178 pointers += (1<<i) * stat->nodesizes[i];
2179 }
2180 seq_putc(seq, '\n');
187b5188 2181 seq_printf(seq, "\tPointers: %u\n", pointers);
2373ce1c 2182
cb7b593c 2183 bytes += sizeof(struct node *) * pointers;
187b5188
SH
2184 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2185 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
66a2f7fd 2186}
2373ce1c 2187
cb7b593c 2188#ifdef CONFIG_IP_FIB_TRIE_STATS
66a2f7fd
SH
2189static void trie_show_usage(struct seq_file *seq,
2190 const struct trie_use_stats *stats)
2191{
2192 seq_printf(seq, "\nCounters:\n---------\n");
a07f5f50
SH
2193 seq_printf(seq, "gets = %u\n", stats->gets);
2194 seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2195 seq_printf(seq, "semantic match passed = %u\n",
2196 stats->semantic_match_passed);
2197 seq_printf(seq, "semantic match miss = %u\n",
2198 stats->semantic_match_miss);
2199 seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2200 seq_printf(seq, "skipped node resize = %u\n\n",
2201 stats->resize_node_skipped);
cb7b593c 2202}
66a2f7fd
SH
2203#endif /* CONFIG_IP_FIB_TRIE_STATS */
2204
3d3b2d25 2205static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
d717a9a6 2206{
3d3b2d25
SH
2207 if (tb->tb_id == RT_TABLE_LOCAL)
2208 seq_puts(seq, "Local:\n");
2209 else if (tb->tb_id == RT_TABLE_MAIN)
2210 seq_puts(seq, "Main:\n");
2211 else
2212 seq_printf(seq, "Id %d:\n", tb->tb_id);
d717a9a6 2213}
19baf839 2214
3d3b2d25 2215
cb7b593c
SH
2216static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2217{
1c340b2f 2218 struct net *net = (struct net *)seq->private;
3d3b2d25 2219 unsigned int h;
877a9bff 2220
d717a9a6 2221 seq_printf(seq,
a07f5f50
SH
2222 "Basic info: size of leaf:"
2223 " %Zd bytes, size of tnode: %Zd bytes.\n",
d717a9a6
SH
2224 sizeof(struct leaf), sizeof(struct tnode));
2225
3d3b2d25
SH
2226 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2227 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2228 struct hlist_node *node;
2229 struct fib_table *tb;
2230
2231 hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2232 struct trie *t = (struct trie *) tb->tb_data;
2233 struct trie_stat stat;
877a9bff 2234
3d3b2d25
SH
2235 if (!t)
2236 continue;
2237
2238 fib_table_print(seq, tb);
2239
2240 trie_collect_stats(t, &stat);
2241 trie_show_stats(seq, &stat);
2242#ifdef CONFIG_IP_FIB_TRIE_STATS
2243 trie_show_usage(seq, &t->stats);
2244#endif
2245 }
2246 }
19baf839 2247
cb7b593c 2248 return 0;
19baf839
RO
2249}
2250
cb7b593c 2251static int fib_triestat_seq_open(struct inode *inode, struct file *file)
19baf839 2252{
1c340b2f
DL
2253 int err;
2254 struct net *net;
2255
2256 net = get_proc_net(inode);
2257 if (net == NULL)
2258 return -ENXIO;
2259 err = single_open(file, fib_triestat_seq_show, net);
2260 if (err < 0) {
2261 put_net(net);
2262 return err;
2263 }
2264 return 0;
2265}
2266
2267static int fib_triestat_seq_release(struct inode *ino, struct file *f)
2268{
2269 struct seq_file *seq = f->private_data;
2270 put_net(seq->private);
2271 return single_release(ino, f);
19baf839
RO
2272}
2273
9a32144e 2274static const struct file_operations fib_triestat_fops = {
cb7b593c
SH
2275 .owner = THIS_MODULE,
2276 .open = fib_triestat_seq_open,
2277 .read = seq_read,
2278 .llseek = seq_lseek,
1c340b2f 2279 .release = fib_triestat_seq_release,
cb7b593c
SH
2280};
2281
1218854a 2282static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
19baf839 2283{
1218854a
YH
2284 struct fib_trie_iter *iter = seq->private;
2285 struct net *net = seq_file_net(seq);
cb7b593c 2286 loff_t idx = 0;
3d3b2d25 2287 unsigned int h;
cb7b593c 2288
3d3b2d25
SH
2289 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2290 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2291 struct hlist_node *node;
2292 struct fib_table *tb;
cb7b593c 2293
3d3b2d25
SH
2294 hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2295 struct node *n;
2296
2297 for (n = fib_trie_get_first(iter,
2298 (struct trie *) tb->tb_data);
2299 n; n = fib_trie_get_next(iter))
2300 if (pos == idx++) {
2301 iter->tb = tb;
2302 return n;
2303 }
2304 }
cb7b593c 2305 }
3d3b2d25 2306
19baf839
RO
2307 return NULL;
2308}
2309
cb7b593c 2310static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
c95aaf9a 2311 __acquires(RCU)
19baf839 2312{
cb7b593c 2313 rcu_read_lock();
1218854a 2314 return fib_trie_get_idx(seq, *pos);
19baf839
RO
2315}
2316
cb7b593c 2317static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
19baf839 2318{
cb7b593c 2319 struct fib_trie_iter *iter = seq->private;
1218854a 2320 struct net *net = seq_file_net(seq);
3d3b2d25
SH
2321 struct fib_table *tb = iter->tb;
2322 struct hlist_node *tb_node;
2323 unsigned int h;
2324 struct node *n;
cb7b593c 2325
19baf839 2326 ++*pos;
3d3b2d25
SH
2327 /* next node in same table */
2328 n = fib_trie_get_next(iter);
2329 if (n)
2330 return n;
19baf839 2331
3d3b2d25
SH
2332 /* walk rest of this hash chain */
2333 h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2334 while ( (tb_node = rcu_dereference(tb->tb_hlist.next)) ) {
2335 tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2336 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2337 if (n)
2338 goto found;
2339 }
19baf839 2340
3d3b2d25
SH
2341 /* new hash chain */
2342 while (++h < FIB_TABLE_HASHSZ) {
2343 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2344 hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
2345 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2346 if (n)
2347 goto found;
2348 }
2349 }
cb7b593c 2350 return NULL;
3d3b2d25
SH
2351
2352found:
2353 iter->tb = tb;
2354 return n;
cb7b593c 2355}
19baf839 2356
cb7b593c 2357static void fib_trie_seq_stop(struct seq_file *seq, void *v)
c95aaf9a 2358 __releases(RCU)
19baf839 2359{
cb7b593c
SH
2360 rcu_read_unlock();
2361}
91b9a277 2362
cb7b593c
SH
2363static void seq_indent(struct seq_file *seq, int n)
2364{
2365 while (n-- > 0) seq_puts(seq, " ");
2366}
19baf839 2367
28d36e37 2368static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
cb7b593c 2369{
132adf54 2370 switch (s) {
cb7b593c
SH
2371 case RT_SCOPE_UNIVERSE: return "universe";
2372 case RT_SCOPE_SITE: return "site";
2373 case RT_SCOPE_LINK: return "link";
2374 case RT_SCOPE_HOST: return "host";
2375 case RT_SCOPE_NOWHERE: return "nowhere";
2376 default:
28d36e37 2377 snprintf(buf, len, "scope=%d", s);
cb7b593c
SH
2378 return buf;
2379 }
2380}
19baf839 2381
cb7b593c
SH
2382static const char *rtn_type_names[__RTN_MAX] = {
2383 [RTN_UNSPEC] = "UNSPEC",
2384 [RTN_UNICAST] = "UNICAST",
2385 [RTN_LOCAL] = "LOCAL",
2386 [RTN_BROADCAST] = "BROADCAST",
2387 [RTN_ANYCAST] = "ANYCAST",
2388 [RTN_MULTICAST] = "MULTICAST",
2389 [RTN_BLACKHOLE] = "BLACKHOLE",
2390 [RTN_UNREACHABLE] = "UNREACHABLE",
2391 [RTN_PROHIBIT] = "PROHIBIT",
2392 [RTN_THROW] = "THROW",
2393 [RTN_NAT] = "NAT",
2394 [RTN_XRESOLVE] = "XRESOLVE",
2395};
19baf839 2396
28d36e37 2397static inline const char *rtn_type(char *buf, size_t len, unsigned t)
cb7b593c 2398{
cb7b593c
SH
2399 if (t < __RTN_MAX && rtn_type_names[t])
2400 return rtn_type_names[t];
28d36e37 2401 snprintf(buf, len, "type %u", t);
cb7b593c 2402 return buf;
19baf839
RO
2403}
2404
cb7b593c
SH
2405/* Pretty print the trie */
2406static int fib_trie_seq_show(struct seq_file *seq, void *v)
19baf839 2407{
cb7b593c
SH
2408 const struct fib_trie_iter *iter = seq->private;
2409 struct node *n = v;
c877efb2 2410
3d3b2d25
SH
2411 if (!node_parent_rcu(n))
2412 fib_table_print(seq, iter->tb);
095b8501 2413
cb7b593c
SH
2414 if (IS_TNODE(n)) {
2415 struct tnode *tn = (struct tnode *) n;
ab66b4a7 2416 __be32 prf = htonl(mask_pfx(tn->key, tn->pos));
91b9a277 2417
1d25cd6c
RO
2418 seq_indent(seq, iter->depth-1);
2419 seq_printf(seq, " +-- %d.%d.%d.%d/%d %d %d %d\n",
e905a9ed 2420 NIPQUAD(prf), tn->pos, tn->bits, tn->full_children,
1d25cd6c 2421 tn->empty_children);
e905a9ed 2422
cb7b593c
SH
2423 } else {
2424 struct leaf *l = (struct leaf *) n;
1328042e
SH
2425 struct leaf_info *li;
2426 struct hlist_node *node;
32ab5f80 2427 __be32 val = htonl(l->key);
cb7b593c
SH
2428
2429 seq_indent(seq, iter->depth);
2430 seq_printf(seq, " |-- %d.%d.%d.%d\n", NIPQUAD(val));
1328042e
SH
2431
2432 hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2433 struct fib_alias *fa;
2434
2435 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2436 char buf1[32], buf2[32];
2437
2438 seq_indent(seq, iter->depth+1);
2439 seq_printf(seq, " /%d %s %s", li->plen,
2440 rtn_scope(buf1, sizeof(buf1),
2441 fa->fa_scope),
2442 rtn_type(buf2, sizeof(buf2),
2443 fa->fa_type));
2444 if (fa->fa_tos)
b9c4d82a 2445 seq_printf(seq, " tos=%d", fa->fa_tos);
1328042e 2446 seq_putc(seq, '\n');
cb7b593c
SH
2447 }
2448 }
19baf839 2449 }
cb7b593c 2450
19baf839
RO
2451 return 0;
2452}
2453
f690808e 2454static const struct seq_operations fib_trie_seq_ops = {
cb7b593c
SH
2455 .start = fib_trie_seq_start,
2456 .next = fib_trie_seq_next,
2457 .stop = fib_trie_seq_stop,
2458 .show = fib_trie_seq_show,
19baf839
RO
2459};
2460
cb7b593c 2461static int fib_trie_seq_open(struct inode *inode, struct file *file)
19baf839 2462{
1c340b2f
DL
2463 return seq_open_net(inode, file, &fib_trie_seq_ops,
2464 sizeof(struct fib_trie_iter));
19baf839
RO
2465}
2466
9a32144e 2467static const struct file_operations fib_trie_fops = {
cb7b593c
SH
2468 .owner = THIS_MODULE,
2469 .open = fib_trie_seq_open,
2470 .read = seq_read,
2471 .llseek = seq_lseek,
1c340b2f 2472 .release = seq_release_net,
19baf839
RO
2473};
2474
8315f5d8
SH
2475struct fib_route_iter {
2476 struct seq_net_private p;
2477 struct trie *main_trie;
2478 loff_t pos;
2479 t_key key;
2480};
2481
2482static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2483{
2484 struct leaf *l = NULL;
2485 struct trie *t = iter->main_trie;
2486
2487 /* use cache location of last found key */
2488 if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2489 pos -= iter->pos;
2490 else {
2491 iter->pos = 0;
2492 l = trie_firstleaf(t);
2493 }
2494
2495 while (l && pos-- > 0) {
2496 iter->pos++;
2497 l = trie_nextleaf(l);
2498 }
2499
2500 if (l)
2501 iter->key = pos; /* remember it */
2502 else
2503 iter->pos = 0; /* forget it */
2504
2505 return l;
2506}
2507
2508static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2509 __acquires(RCU)
2510{
2511 struct fib_route_iter *iter = seq->private;
2512 struct fib_table *tb;
2513
2514 rcu_read_lock();
1218854a 2515 tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
8315f5d8
SH
2516 if (!tb)
2517 return NULL;
2518
2519 iter->main_trie = (struct trie *) tb->tb_data;
2520 if (*pos == 0)
2521 return SEQ_START_TOKEN;
2522 else
2523 return fib_route_get_idx(iter, *pos - 1);
2524}
2525
2526static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2527{
2528 struct fib_route_iter *iter = seq->private;
2529 struct leaf *l = v;
2530
2531 ++*pos;
2532 if (v == SEQ_START_TOKEN) {
2533 iter->pos = 0;
2534 l = trie_firstleaf(iter->main_trie);
2535 } else {
2536 iter->pos++;
2537 l = trie_nextleaf(l);
2538 }
2539
2540 if (l)
2541 iter->key = l->key;
2542 else
2543 iter->pos = 0;
2544 return l;
2545}
2546
2547static void fib_route_seq_stop(struct seq_file *seq, void *v)
2548 __releases(RCU)
2549{
2550 rcu_read_unlock();
2551}
2552
32ab5f80 2553static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
19baf839 2554{
cb7b593c
SH
2555 static unsigned type2flags[RTN_MAX + 1] = {
2556 [7] = RTF_REJECT, [8] = RTF_REJECT,
2557 };
2558 unsigned flags = type2flags[type];
19baf839 2559
cb7b593c
SH
2560 if (fi && fi->fib_nh->nh_gw)
2561 flags |= RTF_GATEWAY;
32ab5f80 2562 if (mask == htonl(0xFFFFFFFF))
cb7b593c
SH
2563 flags |= RTF_HOST;
2564 flags |= RTF_UP;
2565 return flags;
19baf839
RO
2566}
2567
cb7b593c
SH
2568/*
2569 * This outputs /proc/net/route.
2570 * The format of the file is not supposed to be changed
2571 * and needs to be same as fib_hash output to avoid breaking
2572 * legacy utilities
2573 */
2574static int fib_route_seq_show(struct seq_file *seq, void *v)
19baf839 2575{
cb7b593c 2576 struct leaf *l = v;
1328042e
SH
2577 struct leaf_info *li;
2578 struct hlist_node *node;
19baf839 2579
cb7b593c
SH
2580 if (v == SEQ_START_TOKEN) {
2581 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2582 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2583 "\tWindow\tIRTT");
2584 return 0;
2585 }
19baf839 2586
1328042e 2587 hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
cb7b593c 2588 struct fib_alias *fa;
32ab5f80 2589 __be32 mask, prefix;
91b9a277 2590
cb7b593c
SH
2591 mask = inet_make_mask(li->plen);
2592 prefix = htonl(l->key);
19baf839 2593
cb7b593c 2594 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1371e37d 2595 const struct fib_info *fi = fa->fa_info;
cb7b593c 2596 unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
1328042e 2597 char bf[128];
19baf839 2598
cb7b593c
SH
2599 if (fa->fa_type == RTN_BROADCAST
2600 || fa->fa_type == RTN_MULTICAST)
2601 continue;
19baf839 2602
cb7b593c
SH
2603 if (fi)
2604 snprintf(bf, sizeof(bf),
2605 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2606 fi->fib_dev ? fi->fib_dev->name : "*",
2607 prefix,
2608 fi->fib_nh->nh_gw, flags, 0, 0,
2609 fi->fib_priority,
2610 mask,
a07f5f50
SH
2611 (fi->fib_advmss ?
2612 fi->fib_advmss + 40 : 0),
cb7b593c
SH
2613 fi->fib_window,
2614 fi->fib_rtt >> 3);
2615 else
2616 snprintf(bf, sizeof(bf),
2617 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2618 prefix, 0, flags, 0, 0, 0,
2619 mask, 0, 0, 0);
19baf839 2620
cb7b593c
SH
2621 seq_printf(seq, "%-127s\n", bf);
2622 }
19baf839
RO
2623 }
2624
2625 return 0;
2626}
2627
f690808e 2628static const struct seq_operations fib_route_seq_ops = {
8315f5d8
SH
2629 .start = fib_route_seq_start,
2630 .next = fib_route_seq_next,
2631 .stop = fib_route_seq_stop,
cb7b593c 2632 .show = fib_route_seq_show,
19baf839
RO
2633};
2634
cb7b593c 2635static int fib_route_seq_open(struct inode *inode, struct file *file)
19baf839 2636{
1c340b2f 2637 return seq_open_net(inode, file, &fib_route_seq_ops,
8315f5d8 2638 sizeof(struct fib_route_iter));
19baf839
RO
2639}
2640
9a32144e 2641static const struct file_operations fib_route_fops = {
cb7b593c
SH
2642 .owner = THIS_MODULE,
2643 .open = fib_route_seq_open,
2644 .read = seq_read,
2645 .llseek = seq_lseek,
1c340b2f 2646 .release = seq_release_net,
19baf839
RO
2647};
2648
61a02653 2649int __net_init fib_proc_init(struct net *net)
19baf839 2650{
61a02653 2651 if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
cb7b593c
SH
2652 goto out1;
2653
61a02653
DL
2654 if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2655 &fib_triestat_fops))
cb7b593c
SH
2656 goto out2;
2657
61a02653 2658 if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
cb7b593c
SH
2659 goto out3;
2660
19baf839 2661 return 0;
cb7b593c
SH
2662
2663out3:
61a02653 2664 proc_net_remove(net, "fib_triestat");
cb7b593c 2665out2:
61a02653 2666 proc_net_remove(net, "fib_trie");
cb7b593c
SH
2667out1:
2668 return -ENOMEM;
19baf839
RO
2669}
2670
61a02653 2671void __net_exit fib_proc_exit(struct net *net)
19baf839 2672{
61a02653
DL
2673 proc_net_remove(net, "fib_trie");
2674 proc_net_remove(net, "fib_triestat");
2675 proc_net_remove(net, "route");
19baf839
RO
2676}
2677
2678#endif /* CONFIG_PROC_FS */