]> git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blame - net/ipv4/fib_trie.c
[IPV4]: Prepare FIB core for RCU.
[mirror_ubuntu-bionic-kernel.git] / net / ipv4 / fib_trie.c
CommitLineData
19baf839
RO
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
11 * Agricultural Sciences.
12 *
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally descibed in:
16 *
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26 *
27 *
28 * Code from fib_hash has been reused which includes the following header:
29 *
30 *
31 * INET An implementation of the TCP/IP protocol suite for the LINUX
32 * operating system. INET is implemented using the BSD Socket
33 * interface as the means of communication with the user level.
34 *
35 * IPv4 FIB: lookup engine and maintenance routines.
36 *
37 *
38 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39 *
40 * This program is free software; you can redistribute it and/or
41 * modify it under the terms of the GNU General Public License
42 * as published by the Free Software Foundation; either version
43 * 2 of the License, or (at your option) any later version.
44 */
45
2f36895a 46#define VERSION "0.325"
19baf839
RO
47
48#include <linux/config.h>
49#include <asm/uaccess.h>
50#include <asm/system.h>
51#include <asm/bitops.h>
52#include <linux/types.h>
53#include <linux/kernel.h>
54#include <linux/sched.h>
55#include <linux/mm.h>
56#include <linux/string.h>
57#include <linux/socket.h>
58#include <linux/sockios.h>
59#include <linux/errno.h>
60#include <linux/in.h>
61#include <linux/inet.h>
62#include <linux/netdevice.h>
63#include <linux/if_arp.h>
64#include <linux/proc_fs.h>
65#include <linux/skbuff.h>
66#include <linux/netlink.h>
67#include <linux/init.h>
68#include <linux/list.h>
69#include <net/ip.h>
70#include <net/protocol.h>
71#include <net/route.h>
72#include <net/tcp.h>
73#include <net/sock.h>
74#include <net/ip_fib.h>
75#include "fib_lookup.h"
76
77#undef CONFIG_IP_FIB_TRIE_STATS
78#define MAX_CHILDS 16384
79
19baf839
RO
80#define KEYLENGTH (8*sizeof(t_key))
81#define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
82#define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
83
84static DEFINE_RWLOCK(fib_lock);
85
86typedef unsigned int t_key;
87
88#define T_TNODE 0
89#define T_LEAF 1
90#define NODE_TYPE_MASK 0x1UL
91b9a277
OJ
91#define NODE_PARENT(node) \
92 ((struct tnode *)((node)->parent & ~NODE_TYPE_MASK))
93#define NODE_SET_PARENT(node, ptr) \
94 ((node)->parent = (((unsigned long)(ptr)) | \
95 ((node)->parent & NODE_TYPE_MASK)))
96#define NODE_INIT_PARENT(node, type) \
97 ((node)->parent = (type))
98#define NODE_TYPE(node) \
99 ((node)->parent & NODE_TYPE_MASK)
100
101#define IS_TNODE(n) (!(n->parent & T_LEAF))
102#define IS_LEAF(n) (n->parent & T_LEAF)
19baf839
RO
103
104struct node {
91b9a277
OJ
105 t_key key;
106 unsigned long parent;
19baf839
RO
107};
108
109struct leaf {
91b9a277
OJ
110 t_key key;
111 unsigned long parent;
19baf839
RO
112 struct hlist_head list;
113};
114
115struct leaf_info {
116 struct hlist_node hlist;
117 int plen;
118 struct list_head falh;
119};
120
121struct tnode {
91b9a277
OJ
122 t_key key;
123 unsigned long parent;
124 unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */
125 unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */
126 unsigned short full_children; /* KEYLENGTH bits needed */
127 unsigned short empty_children; /* KEYLENGTH bits needed */
128 struct node *child[0];
19baf839
RO
129};
130
131#ifdef CONFIG_IP_FIB_TRIE_STATS
132struct trie_use_stats {
133 unsigned int gets;
134 unsigned int backtrack;
135 unsigned int semantic_match_passed;
136 unsigned int semantic_match_miss;
137 unsigned int null_node_hit;
2f36895a 138 unsigned int resize_node_skipped;
19baf839
RO
139};
140#endif
141
142struct trie_stat {
143 unsigned int totdepth;
144 unsigned int maxdepth;
145 unsigned int tnodes;
146 unsigned int leaves;
147 unsigned int nullpointers;
148 unsigned int nodesizes[MAX_CHILDS];
c877efb2 149};
19baf839
RO
150
151struct trie {
91b9a277 152 struct node *trie;
19baf839
RO
153#ifdef CONFIG_IP_FIB_TRIE_STATS
154 struct trie_use_stats stats;
155#endif
91b9a277 156 int size;
19baf839
RO
157 unsigned int revision;
158};
159
19baf839
RO
160static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
161static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
19baf839 162static struct node *resize(struct trie *t, struct tnode *tn);
2f80b3c8
RO
163static struct tnode *inflate(struct trie *t, struct tnode *tn);
164static struct tnode *halve(struct trie *t, struct tnode *tn);
19baf839
RO
165static void tnode_free(struct tnode *tn);
166static void trie_dump_seq(struct seq_file *seq, struct trie *t);
19baf839
RO
167
168static kmem_cache_t *fn_alias_kmem;
169static struct trie *trie_local = NULL, *trie_main = NULL;
170
c877efb2 171static inline struct node *tnode_get_child(struct tnode *tn, int i)
19baf839 172{
91b9a277 173 BUG_ON(i >= 1 << tn->bits);
19baf839 174
91b9a277 175 return tn->child[i];
19baf839
RO
176}
177
bb435b8d 178static inline int tnode_child_length(const struct tnode *tn)
19baf839 179{
91b9a277 180 return 1 << tn->bits;
19baf839
RO
181}
182
19baf839
RO
183static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
184{
91b9a277 185 if (offset < KEYLENGTH)
19baf839 186 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
91b9a277 187 else
19baf839
RO
188 return 0;
189}
190
191static inline int tkey_equals(t_key a, t_key b)
192{
c877efb2 193 return a == b;
19baf839
RO
194}
195
196static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
197{
c877efb2
SH
198 if (bits == 0 || offset >= KEYLENGTH)
199 return 1;
91b9a277
OJ
200 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
201 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
c877efb2 202}
19baf839
RO
203
204static inline int tkey_mismatch(t_key a, int offset, t_key b)
205{
206 t_key diff = a ^ b;
207 int i = offset;
208
c877efb2
SH
209 if (!diff)
210 return 0;
211 while ((diff << i) >> (KEYLENGTH-1) == 0)
19baf839
RO
212 i++;
213 return i;
214}
215
91b9a277 216/* Candidate for fib_semantics */
19baf839
RO
217
218static void fn_free_alias(struct fib_alias *fa)
219{
220 fib_release_info(fa->fa_info);
221 kmem_cache_free(fn_alias_kmem, fa);
222}
223
224/*
225 To understand this stuff, an understanding of keys and all their bits is
226 necessary. Every node in the trie has a key associated with it, but not
227 all of the bits in that key are significant.
228
229 Consider a node 'n' and its parent 'tp'.
230
231 If n is a leaf, every bit in its key is significant. Its presence is
232 necessitaded by path compression, since during a tree traversal (when
233 searching for a leaf - unless we are doing an insertion) we will completely
234 ignore all skipped bits we encounter. Thus we need to verify, at the end of
235 a potentially successful search, that we have indeed been walking the
236 correct key path.
237
238 Note that we can never "miss" the correct key in the tree if present by
239 following the wrong path. Path compression ensures that segments of the key
240 that are the same for all keys with a given prefix are skipped, but the
241 skipped part *is* identical for each node in the subtrie below the skipped
242 bit! trie_insert() in this implementation takes care of that - note the
243 call to tkey_sub_equals() in trie_insert().
244
245 if n is an internal node - a 'tnode' here, the various parts of its key
246 have many different meanings.
247
248 Example:
249 _________________________________________________________________
250 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
251 -----------------------------------------------------------------
252 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
253
254 _________________________________________________________________
255 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
256 -----------------------------------------------------------------
257 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
258
259 tp->pos = 7
260 tp->bits = 3
261 n->pos = 15
91b9a277 262 n->bits = 4
19baf839
RO
263
264 First, let's just ignore the bits that come before the parent tp, that is
265 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
266 not use them for anything.
267
268 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
269 index into the parent's child array. That is, they will be used to find
270 'n' among tp's children.
271
272 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
273 for the node n.
274
275 All the bits we have seen so far are significant to the node n. The rest
276 of the bits are really not needed or indeed known in n->key.
277
278 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
279 n's child array, and will of course be different for each child.
280
c877efb2 281
19baf839
RO
282 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
283 at this point.
284
285*/
286
0c7770c7 287static inline void check_tnode(const struct tnode *tn)
19baf839 288{
0c7770c7 289 WARN_ON(tn && tn->pos+tn->bits > 32);
19baf839
RO
290}
291
292static int halve_threshold = 25;
293static int inflate_threshold = 50;
294
295static struct leaf *leaf_new(void)
296{
297 struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
c877efb2 298 if (l) {
19baf839
RO
299 NODE_INIT_PARENT(l, T_LEAF);
300 INIT_HLIST_HEAD(&l->list);
301 }
302 return l;
303}
304
305static struct leaf_info *leaf_info_new(int plen)
306{
307 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
91b9a277
OJ
308
309 if (!li)
310 return NULL;
311
312 li->plen = plen;
313 INIT_LIST_HEAD(&li->falh);
314
19baf839
RO
315 return li;
316}
317
318static inline void free_leaf(struct leaf *l)
319{
320 kfree(l);
321}
322
323static inline void free_leaf_info(struct leaf_info *li)
324{
325 kfree(li);
326}
327
f0e36f8c
PM
328static struct tnode *tnode_alloc(unsigned int size)
329{
330 if (size <= PAGE_SIZE) {
331 return kmalloc(size, GFP_KERNEL);
332 } else {
333 return (struct tnode *)
c877efb2 334 __get_free_pages(GFP_KERNEL, get_order(size));
f0e36f8c
PM
335 }
336}
337
338static void __tnode_free(struct tnode *tn)
339{
340 unsigned int size = sizeof(struct tnode) +
91b9a277 341 (1 << tn->bits) * sizeof(struct node *);
f0e36f8c
PM
342
343 if (size <= PAGE_SIZE)
344 kfree(tn);
345 else
346 free_pages((unsigned long)tn, get_order(size));
347}
348
19baf839
RO
349static struct tnode* tnode_new(t_key key, int pos, int bits)
350{
351 int nchildren = 1<<bits;
352 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
f0e36f8c 353 struct tnode *tn = tnode_alloc(sz);
19baf839 354
91b9a277 355 if (tn) {
19baf839
RO
356 memset(tn, 0, sz);
357 NODE_INIT_PARENT(tn, T_TNODE);
358 tn->pos = pos;
359 tn->bits = bits;
360 tn->key = key;
361 tn->full_children = 0;
362 tn->empty_children = 1<<bits;
363 }
c877efb2 364
0c7770c7
SH
365 pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
366 (unsigned int) (sizeof(struct node) * 1<<bits));
19baf839
RO
367 return tn;
368}
369
370static void tnode_free(struct tnode *tn)
371{
c877efb2 372 if (IS_LEAF(tn)) {
19baf839 373 free_leaf((struct leaf *)tn);
0c7770c7 374 pr_debug("FL %p \n", tn);
91b9a277 375 } else {
f0e36f8c 376 __tnode_free(tn);
0c7770c7 377 pr_debug("FT %p \n", tn);
19baf839
RO
378 }
379}
380
381/*
382 * Check whether a tnode 'n' is "full", i.e. it is an internal node
383 * and no bits are skipped. See discussion in dyntree paper p. 6
384 */
385
bb435b8d 386static inline int tnode_full(const struct tnode *tn, const struct node *n)
19baf839 387{
c877efb2 388 if (n == NULL || IS_LEAF(n))
19baf839
RO
389 return 0;
390
391 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
392}
393
c877efb2 394static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
19baf839
RO
395{
396 tnode_put_child_reorg(tn, i, n, -1);
397}
398
c877efb2 399 /*
19baf839
RO
400 * Add a child at position i overwriting the old value.
401 * Update the value of full_children and empty_children.
402 */
403
c877efb2 404static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
19baf839
RO
405{
406 struct node *chi;
407 int isfull;
408
0c7770c7
SH
409 BUG_ON(i >= 1<<tn->bits);
410
19baf839 411 write_lock_bh(&fib_lock);
c877efb2 412 chi = tn->child[i];
19baf839
RO
413
414 /* update emptyChildren */
415 if (n == NULL && chi != NULL)
416 tn->empty_children++;
417 else if (n != NULL && chi == NULL)
418 tn->empty_children--;
c877efb2 419
19baf839 420 /* update fullChildren */
91b9a277 421 if (wasfull == -1)
19baf839
RO
422 wasfull = tnode_full(tn, chi);
423
424 isfull = tnode_full(tn, n);
c877efb2 425 if (wasfull && !isfull)
19baf839 426 tn->full_children--;
c877efb2 427 else if (!wasfull && isfull)
19baf839 428 tn->full_children++;
91b9a277 429
c877efb2
SH
430 if (n)
431 NODE_SET_PARENT(n, tn);
19baf839
RO
432
433 tn->child[i] = n;
434 write_unlock_bh(&fib_lock);
435}
436
c877efb2 437static struct node *resize(struct trie *t, struct tnode *tn)
19baf839
RO
438{
439 int i;
2f36895a 440 int err = 0;
2f80b3c8 441 struct tnode *old_tn;
19baf839
RO
442
443 if (!tn)
444 return NULL;
445
0c7770c7
SH
446 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
447 tn, inflate_threshold, halve_threshold);
19baf839
RO
448
449 /* No children */
450 if (tn->empty_children == tnode_child_length(tn)) {
451 tnode_free(tn);
452 return NULL;
453 }
454 /* One child */
455 if (tn->empty_children == tnode_child_length(tn) - 1)
456 for (i = 0; i < tnode_child_length(tn); i++) {
91b9a277 457 struct node *n;
19baf839
RO
458
459 write_lock_bh(&fib_lock);
91b9a277
OJ
460 n = tn->child[i];
461 if (!n) {
19baf839 462 write_unlock_bh(&fib_lock);
91b9a277 463 continue;
19baf839 464 }
91b9a277
OJ
465
466 /* compress one level */
467 NODE_INIT_PARENT(n, NODE_TYPE(n));
468
19baf839 469 write_unlock_bh(&fib_lock);
91b9a277
OJ
470 tnode_free(tn);
471 return n;
19baf839 472 }
c877efb2 473 /*
19baf839
RO
474 * Double as long as the resulting node has a number of
475 * nonempty nodes that are above the threshold.
476 */
477
478 /*
c877efb2
SH
479 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
480 * the Helsinki University of Technology and Matti Tikkanen of Nokia
19baf839 481 * Telecommunications, page 6:
c877efb2 482 * "A node is doubled if the ratio of non-empty children to all
19baf839
RO
483 * children in the *doubled* node is at least 'high'."
484 *
c877efb2
SH
485 * 'high' in this instance is the variable 'inflate_threshold'. It
486 * is expressed as a percentage, so we multiply it with
487 * tnode_child_length() and instead of multiplying by 2 (since the
488 * child array will be doubled by inflate()) and multiplying
489 * the left-hand side by 100 (to handle the percentage thing) we
19baf839 490 * multiply the left-hand side by 50.
c877efb2
SH
491 *
492 * The left-hand side may look a bit weird: tnode_child_length(tn)
493 * - tn->empty_children is of course the number of non-null children
494 * in the current node. tn->full_children is the number of "full"
19baf839 495 * children, that is non-null tnodes with a skip value of 0.
c877efb2 496 * All of those will be doubled in the resulting inflated tnode, so
19baf839 497 * we just count them one extra time here.
c877efb2 498 *
19baf839 499 * A clearer way to write this would be:
c877efb2 500 *
19baf839 501 * to_be_doubled = tn->full_children;
c877efb2 502 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
19baf839
RO
503 * tn->full_children;
504 *
505 * new_child_length = tnode_child_length(tn) * 2;
506 *
c877efb2 507 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
19baf839
RO
508 * new_child_length;
509 * if (new_fill_factor >= inflate_threshold)
c877efb2
SH
510 *
511 * ...and so on, tho it would mess up the while () loop.
512 *
19baf839
RO
513 * anyway,
514 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
515 * inflate_threshold
c877efb2 516 *
19baf839
RO
517 * avoid a division:
518 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
519 * inflate_threshold * new_child_length
c877efb2 520 *
19baf839 521 * expand not_to_be_doubled and to_be_doubled, and shorten:
c877efb2 522 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 523 * tn->full_children) >= inflate_threshold * new_child_length
c877efb2 524 *
19baf839 525 * expand new_child_length:
c877efb2 526 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 527 * tn->full_children) >=
19baf839 528 * inflate_threshold * tnode_child_length(tn) * 2
c877efb2 529 *
19baf839 530 * shorten again:
c877efb2 531 * 50 * (tn->full_children + tnode_child_length(tn) -
91b9a277 532 * tn->empty_children) >= inflate_threshold *
19baf839 533 * tnode_child_length(tn)
c877efb2 534 *
19baf839
RO
535 */
536
537 check_tnode(tn);
c877efb2 538
2f36895a 539 err = 0;
19baf839
RO
540 while ((tn->full_children > 0 &&
541 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
542 inflate_threshold * tnode_child_length(tn))) {
543
2f80b3c8
RO
544 old_tn = tn;
545 tn = inflate(t, tn);
546 if (IS_ERR(tn)) {
547 tn = old_tn;
2f36895a
RO
548#ifdef CONFIG_IP_FIB_TRIE_STATS
549 t->stats.resize_node_skipped++;
550#endif
551 break;
552 }
19baf839
RO
553 }
554
555 check_tnode(tn);
556
557 /*
558 * Halve as long as the number of empty children in this
559 * node is above threshold.
560 */
2f36895a
RO
561
562 err = 0;
19baf839
RO
563 while (tn->bits > 1 &&
564 100 * (tnode_child_length(tn) - tn->empty_children) <
2f36895a
RO
565 halve_threshold * tnode_child_length(tn)) {
566
2f80b3c8
RO
567 old_tn = tn;
568 tn = halve(t, tn);
569 if (IS_ERR(tn)) {
570 tn = old_tn;
2f36895a
RO
571#ifdef CONFIG_IP_FIB_TRIE_STATS
572 t->stats.resize_node_skipped++;
573#endif
574 break;
575 }
576 }
19baf839 577
c877efb2 578
19baf839
RO
579 /* Only one child remains */
580
581 if (tn->empty_children == tnode_child_length(tn) - 1)
582 for (i = 0; i < tnode_child_length(tn); i++) {
91b9a277 583 struct node *n;
19baf839 584
91b9a277 585 write_lock_bh(&fib_lock);
19baf839 586
91b9a277
OJ
587 n = tn->child[i];
588 if (!n) {
19baf839 589 write_unlock_bh(&fib_lock);
91b9a277 590 continue;
19baf839 591 }
91b9a277
OJ
592
593 /* compress one level */
594
595 NODE_INIT_PARENT(n, NODE_TYPE(n));
596
19baf839 597 write_unlock_bh(&fib_lock);
91b9a277
OJ
598 tnode_free(tn);
599 return n;
19baf839
RO
600 }
601
602 return (struct node *) tn;
603}
604
2f80b3c8 605static struct tnode *inflate(struct trie *t, struct tnode *tn)
19baf839
RO
606{
607 struct tnode *inode;
608 struct tnode *oldtnode = tn;
609 int olen = tnode_child_length(tn);
610 int i;
611
0c7770c7 612 pr_debug("In inflate\n");
19baf839
RO
613
614 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
615
0c7770c7 616 if (!tn)
2f80b3c8 617 return ERR_PTR(-ENOMEM);
2f36895a
RO
618
619 /*
c877efb2
SH
620 * Preallocate and store tnodes before the actual work so we
621 * don't get into an inconsistent state if memory allocation
622 * fails. In case of failure we return the oldnode and inflate
2f36895a
RO
623 * of tnode is ignored.
624 */
91b9a277
OJ
625
626 for (i = 0; i < olen; i++) {
2f36895a
RO
627 struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
628
629 if (inode &&
630 IS_TNODE(inode) &&
631 inode->pos == oldtnode->pos + oldtnode->bits &&
632 inode->bits > 1) {
633 struct tnode *left, *right;
2f36895a 634 t_key m = TKEY_GET_MASK(inode->pos, 1);
c877efb2 635
2f36895a
RO
636 left = tnode_new(inode->key&(~m), inode->pos + 1,
637 inode->bits - 1);
2f80b3c8
RO
638 if (!left)
639 goto nomem;
91b9a277 640
2f36895a
RO
641 right = tnode_new(inode->key|m, inode->pos + 1,
642 inode->bits - 1);
643
2f80b3c8
RO
644 if (!right) {
645 tnode_free(left);
646 goto nomem;
647 }
2f36895a
RO
648
649 put_child(t, tn, 2*i, (struct node *) left);
650 put_child(t, tn, 2*i+1, (struct node *) right);
651 }
652 }
653
91b9a277 654 for (i = 0; i < olen; i++) {
19baf839 655 struct node *node = tnode_get_child(oldtnode, i);
91b9a277
OJ
656 struct tnode *left, *right;
657 int size, j;
c877efb2 658
19baf839
RO
659 /* An empty child */
660 if (node == NULL)
661 continue;
662
663 /* A leaf or an internal node with skipped bits */
664
c877efb2 665 if (IS_LEAF(node) || ((struct tnode *) node)->pos >
19baf839 666 tn->pos + tn->bits - 1) {
c877efb2 667 if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
19baf839
RO
668 1) == 0)
669 put_child(t, tn, 2*i, node);
670 else
671 put_child(t, tn, 2*i+1, node);
672 continue;
673 }
674
675 /* An internal node with two children */
676 inode = (struct tnode *) node;
677
678 if (inode->bits == 1) {
679 put_child(t, tn, 2*i, inode->child[0]);
680 put_child(t, tn, 2*i+1, inode->child[1]);
681
682 tnode_free(inode);
91b9a277 683 continue;
19baf839
RO
684 }
685
91b9a277
OJ
686 /* An internal node with more than two children */
687
688 /* We will replace this node 'inode' with two new
689 * ones, 'left' and 'right', each with half of the
690 * original children. The two new nodes will have
691 * a position one bit further down the key and this
692 * means that the "significant" part of their keys
693 * (see the discussion near the top of this file)
694 * will differ by one bit, which will be "0" in
695 * left's key and "1" in right's key. Since we are
696 * moving the key position by one step, the bit that
697 * we are moving away from - the bit at position
698 * (inode->pos) - is the one that will differ between
699 * left and right. So... we synthesize that bit in the
700 * two new keys.
701 * The mask 'm' below will be a single "one" bit at
702 * the position (inode->pos)
703 */
19baf839 704
91b9a277
OJ
705 /* Use the old key, but set the new significant
706 * bit to zero.
707 */
2f36895a 708
91b9a277
OJ
709 left = (struct tnode *) tnode_get_child(tn, 2*i);
710 put_child(t, tn, 2*i, NULL);
2f36895a 711
91b9a277 712 BUG_ON(!left);
2f36895a 713
91b9a277
OJ
714 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
715 put_child(t, tn, 2*i+1, NULL);
19baf839 716
91b9a277 717 BUG_ON(!right);
19baf839 718
91b9a277
OJ
719 size = tnode_child_length(left);
720 for (j = 0; j < size; j++) {
721 put_child(t, left, j, inode->child[j]);
722 put_child(t, right, j, inode->child[j + size]);
19baf839 723 }
91b9a277
OJ
724 put_child(t, tn, 2*i, resize(t, left));
725 put_child(t, tn, 2*i+1, resize(t, right));
726
727 tnode_free(inode);
19baf839
RO
728 }
729 tnode_free(oldtnode);
730 return tn;
2f80b3c8
RO
731nomem:
732 {
733 int size = tnode_child_length(tn);
734 int j;
735
0c7770c7 736 for (j = 0; j < size; j++)
2f80b3c8
RO
737 if (tn->child[j])
738 tnode_free((struct tnode *)tn->child[j]);
739
740 tnode_free(tn);
0c7770c7 741
2f80b3c8
RO
742 return ERR_PTR(-ENOMEM);
743 }
19baf839
RO
744}
745
2f80b3c8 746static struct tnode *halve(struct trie *t, struct tnode *tn)
19baf839
RO
747{
748 struct tnode *oldtnode = tn;
749 struct node *left, *right;
750 int i;
751 int olen = tnode_child_length(tn);
752
0c7770c7 753 pr_debug("In halve\n");
c877efb2
SH
754
755 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
19baf839 756
2f80b3c8
RO
757 if (!tn)
758 return ERR_PTR(-ENOMEM);
2f36895a
RO
759
760 /*
c877efb2
SH
761 * Preallocate and store tnodes before the actual work so we
762 * don't get into an inconsistent state if memory allocation
763 * fails. In case of failure we return the oldnode and halve
2f36895a
RO
764 * of tnode is ignored.
765 */
766
91b9a277 767 for (i = 0; i < olen; i += 2) {
2f36895a
RO
768 left = tnode_get_child(oldtnode, i);
769 right = tnode_get_child(oldtnode, i+1);
c877efb2 770
2f36895a 771 /* Two nonempty children */
0c7770c7 772 if (left && right) {
2f80b3c8 773 struct tnode *newn;
0c7770c7 774
2f80b3c8 775 newn = tnode_new(left->key, tn->pos + tn->bits, 1);
0c7770c7
SH
776
777 if (!newn)
2f80b3c8 778 goto nomem;
0c7770c7 779
2f80b3c8 780 put_child(t, tn, i/2, (struct node *)newn);
2f36895a 781 }
2f36895a 782
2f36895a 783 }
19baf839 784
91b9a277
OJ
785 for (i = 0; i < olen; i += 2) {
786 struct tnode *newBinNode;
787
19baf839
RO
788 left = tnode_get_child(oldtnode, i);
789 right = tnode_get_child(oldtnode, i+1);
c877efb2 790
19baf839
RO
791 /* At least one of the children is empty */
792 if (left == NULL) {
793 if (right == NULL) /* Both are empty */
794 continue;
795 put_child(t, tn, i/2, right);
91b9a277 796 continue;
0c7770c7 797 }
91b9a277
OJ
798
799 if (right == NULL) {
19baf839 800 put_child(t, tn, i/2, left);
91b9a277
OJ
801 continue;
802 }
c877efb2 803
19baf839 804 /* Two nonempty children */
91b9a277
OJ
805 newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
806 put_child(t, tn, i/2, NULL);
91b9a277
OJ
807 put_child(t, newBinNode, 0, left);
808 put_child(t, newBinNode, 1, right);
809 put_child(t, tn, i/2, resize(t, newBinNode));
19baf839
RO
810 }
811 tnode_free(oldtnode);
812 return tn;
2f80b3c8
RO
813nomem:
814 {
815 int size = tnode_child_length(tn);
816 int j;
817
0c7770c7 818 for (j = 0; j < size; j++)
2f80b3c8
RO
819 if (tn->child[j])
820 tnode_free((struct tnode *)tn->child[j]);
821
822 tnode_free(tn);
0c7770c7 823
2f80b3c8
RO
824 return ERR_PTR(-ENOMEM);
825 }
19baf839
RO
826}
827
91b9a277 828static void trie_init(struct trie *t)
19baf839 829{
91b9a277
OJ
830 if (!t)
831 return;
832
833 t->size = 0;
834 t->trie = NULL;
835 t->revision = 0;
19baf839 836#ifdef CONFIG_IP_FIB_TRIE_STATS
91b9a277 837 memset(&t->stats, 0, sizeof(struct trie_use_stats));
19baf839 838#endif
19baf839
RO
839}
840
841static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
842{
843 struct hlist_node *node;
844 struct leaf_info *li;
845
91b9a277 846 hlist_for_each_entry(li, node, head, hlist)
c877efb2 847 if (li->plen == plen)
19baf839 848 return li;
91b9a277 849
19baf839
RO
850 return NULL;
851}
852
853static inline struct list_head * get_fa_head(struct leaf *l, int plen)
854{
19baf839 855 struct leaf_info *li = find_leaf_info(&l->list, plen);
c877efb2 856
91b9a277
OJ
857 if (!li)
858 return NULL;
c877efb2 859
91b9a277 860 return &li->falh;
19baf839
RO
861}
862
863static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
864{
c877efb2 865 struct leaf_info *li = NULL, *last = NULL;
91b9a277 866 struct hlist_node *node;
19baf839
RO
867
868 write_lock_bh(&fib_lock);
c877efb2 869
91b9a277 870 if (hlist_empty(head)) {
19baf839 871 hlist_add_head(&new->hlist, head);
91b9a277
OJ
872 } else {
873 hlist_for_each_entry(li, node, head, hlist) {
c877efb2 874 if (new->plen > li->plen)
19baf839 875 break;
91b9a277 876
19baf839
RO
877 last = li;
878 }
c877efb2 879 if (last)
19baf839 880 hlist_add_after(&last->hlist, &new->hlist);
c877efb2 881 else
19baf839
RO
882 hlist_add_before(&new->hlist, &li->hlist);
883 }
884 write_unlock_bh(&fib_lock);
885}
886
887static struct leaf *
888fib_find_node(struct trie *t, u32 key)
889{
890 int pos;
891 struct tnode *tn;
892 struct node *n;
893
894 pos = 0;
c877efb2 895 n = t->trie;
19baf839
RO
896
897 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
898 tn = (struct tnode *) n;
91b9a277 899
19baf839 900 check_tnode(tn);
91b9a277 901
c877efb2 902 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
91b9a277 903 pos = tn->pos + tn->bits;
19baf839 904 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
91b9a277 905 } else
19baf839
RO
906 break;
907 }
908 /* Case we have found a leaf. Compare prefixes */
909
91b9a277
OJ
910 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
911 return (struct leaf *)n;
912
19baf839
RO
913 return NULL;
914}
915
916static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
917{
91b9a277 918 int i;
19baf839
RO
919 int wasfull;
920 t_key cindex, key;
921 struct tnode *tp = NULL;
922
19baf839
RO
923 key = tn->key;
924 i = 0;
925
926 while (tn != NULL && NODE_PARENT(tn) != NULL) {
91b9a277 927 BUG_ON(i > 12); /* Why is this a bug? -ojn */
19baf839
RO
928 i++;
929
930 tp = NODE_PARENT(tn);
931 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
932 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
933 tn = (struct tnode *) resize (t, (struct tnode *)tn);
934 tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
91b9a277 935
c877efb2 936 if (!NODE_PARENT(tn))
19baf839
RO
937 break;
938
939 tn = NODE_PARENT(tn);
940 }
941 /* Handle last (top) tnode */
c877efb2 942 if (IS_TNODE(tn))
19baf839
RO
943 tn = (struct tnode*) resize(t, (struct tnode *)tn);
944
945 return (struct node*) tn;
946}
947
f835e471
RO
948static struct list_head *
949fib_insert_node(struct trie *t, int *err, u32 key, int plen)
19baf839
RO
950{
951 int pos, newpos;
952 struct tnode *tp = NULL, *tn = NULL;
953 struct node *n;
954 struct leaf *l;
955 int missbit;
c877efb2 956 struct list_head *fa_head = NULL;
19baf839
RO
957 struct leaf_info *li;
958 t_key cindex;
959
960 pos = 0;
c877efb2 961 n = t->trie;
19baf839 962
c877efb2
SH
963 /* If we point to NULL, stop. Either the tree is empty and we should
964 * just put a new leaf in if, or we have reached an empty child slot,
19baf839 965 * and we should just put our new leaf in that.
c877efb2
SH
966 * If we point to a T_TNODE, check if it matches our key. Note that
967 * a T_TNODE might be skipping any number of bits - its 'pos' need
19baf839
RO
968 * not be the parent's 'pos'+'bits'!
969 *
c877efb2 970 * If it does match the current key, get pos/bits from it, extract
19baf839
RO
971 * the index from our key, push the T_TNODE and walk the tree.
972 *
973 * If it doesn't, we have to replace it with a new T_TNODE.
974 *
c877efb2
SH
975 * If we point to a T_LEAF, it might or might not have the same key
976 * as we do. If it does, just change the value, update the T_LEAF's
977 * value, and return it.
19baf839
RO
978 * If it doesn't, we need to replace it with a T_TNODE.
979 */
980
981 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
982 tn = (struct tnode *) n;
91b9a277 983
c877efb2 984 check_tnode(tn);
91b9a277 985
c877efb2 986 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
19baf839 987 tp = tn;
91b9a277 988 pos = tn->pos + tn->bits;
19baf839
RO
989 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
990
0c7770c7 991 BUG_ON(n && NODE_PARENT(n) != tn);
91b9a277 992 } else
19baf839
RO
993 break;
994 }
995
996 /*
997 * n ----> NULL, LEAF or TNODE
998 *
c877efb2 999 * tp is n's (parent) ----> NULL or TNODE
19baf839
RO
1000 */
1001
91b9a277 1002 BUG_ON(tp && IS_LEAF(tp));
19baf839
RO
1003
1004 /* Case 1: n is a leaf. Compare prefixes */
1005
c877efb2 1006 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
91b9a277
OJ
1007 struct leaf *l = (struct leaf *) n;
1008
19baf839 1009 li = leaf_info_new(plen);
91b9a277 1010
c877efb2 1011 if (!li) {
f835e471
RO
1012 *err = -ENOMEM;
1013 goto err;
1014 }
19baf839
RO
1015
1016 fa_head = &li->falh;
1017 insert_leaf_info(&l->list, li);
1018 goto done;
1019 }
1020 t->size++;
1021 l = leaf_new();
1022
c877efb2 1023 if (!l) {
f835e471
RO
1024 *err = -ENOMEM;
1025 goto err;
1026 }
19baf839
RO
1027
1028 l->key = key;
1029 li = leaf_info_new(plen);
1030
c877efb2 1031 if (!li) {
f835e471
RO
1032 tnode_free((struct tnode *) l);
1033 *err = -ENOMEM;
1034 goto err;
1035 }
19baf839
RO
1036
1037 fa_head = &li->falh;
1038 insert_leaf_info(&l->list, li);
1039
19baf839 1040 if (t->trie && n == NULL) {
91b9a277 1041 /* Case 2: n is NULL, and will just insert a new leaf */
19baf839
RO
1042
1043 NODE_SET_PARENT(l, tp);
19baf839 1044
91b9a277
OJ
1045 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1046 put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1047 } else {
1048 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
c877efb2
SH
1049 /*
1050 * Add a new tnode here
19baf839
RO
1051 * first tnode need some special handling
1052 */
1053
1054 if (tp)
91b9a277 1055 pos = tp->pos+tp->bits;
19baf839 1056 else
91b9a277
OJ
1057 pos = 0;
1058
c877efb2 1059 if (n) {
19baf839
RO
1060 newpos = tkey_mismatch(key, pos, n->key);
1061 tn = tnode_new(n->key, newpos, 1);
91b9a277 1062 } else {
19baf839 1063 newpos = 0;
c877efb2 1064 tn = tnode_new(key, newpos, 1); /* First tnode */
19baf839 1065 }
19baf839 1066
c877efb2 1067 if (!tn) {
f835e471
RO
1068 free_leaf_info(li);
1069 tnode_free((struct tnode *) l);
1070 *err = -ENOMEM;
1071 goto err;
91b9a277
OJ
1072 }
1073
19baf839
RO
1074 NODE_SET_PARENT(tn, tp);
1075
91b9a277 1076 missbit = tkey_extract_bits(key, newpos, 1);
19baf839
RO
1077 put_child(t, tn, missbit, (struct node *)l);
1078 put_child(t, tn, 1-missbit, n);
1079
c877efb2 1080 if (tp) {
19baf839
RO
1081 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1082 put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
91b9a277 1083 } else {
19baf839
RO
1084 t->trie = (struct node*) tn; /* First tnode */
1085 tp = tn;
1086 }
1087 }
91b9a277
OJ
1088
1089 if (tp && tp->pos + tp->bits > 32)
c877efb2 1090 printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
19baf839 1091 tp, tp->pos, tp->bits, key, plen);
91b9a277 1092
19baf839
RO
1093 /* Rebalance the trie */
1094 t->trie = trie_rebalance(t, tp);
f835e471
RO
1095done:
1096 t->revision++;
91b9a277 1097err:
19baf839
RO
1098 return fa_head;
1099}
1100
1101static int
1102fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1103 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1104{
1105 struct trie *t = (struct trie *) tb->tb_data;
1106 struct fib_alias *fa, *new_fa;
c877efb2 1107 struct list_head *fa_head = NULL;
19baf839
RO
1108 struct fib_info *fi;
1109 int plen = r->rtm_dst_len;
1110 int type = r->rtm_type;
1111 u8 tos = r->rtm_tos;
1112 u32 key, mask;
1113 int err;
1114 struct leaf *l;
1115
1116 if (plen > 32)
1117 return -EINVAL;
1118
1119 key = 0;
c877efb2 1120 if (rta->rta_dst)
19baf839
RO
1121 memcpy(&key, rta->rta_dst, 4);
1122
1123 key = ntohl(key);
1124
0c7770c7 1125 pr_debug("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
19baf839 1126
91b9a277 1127 mask = ntohl(inet_make_mask(plen));
19baf839 1128
c877efb2 1129 if (key & ~mask)
19baf839
RO
1130 return -EINVAL;
1131
1132 key = key & mask;
1133
91b9a277
OJ
1134 fi = fib_create_info(r, rta, nlhdr, &err);
1135
1136 if (!fi)
19baf839
RO
1137 goto err;
1138
1139 l = fib_find_node(t, key);
c877efb2 1140 fa = NULL;
19baf839 1141
c877efb2 1142 if (l) {
19baf839
RO
1143 fa_head = get_fa_head(l, plen);
1144 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1145 }
1146
1147 /* Now fa, if non-NULL, points to the first fib alias
1148 * with the same keys [prefix,tos,priority], if such key already
1149 * exists or to the node before which we will insert new one.
1150 *
1151 * If fa is NULL, we will need to allocate a new one and
1152 * insert to the head of f.
1153 *
1154 * If f is NULL, no fib node matched the destination key
1155 * and we need to allocate a new one of those as well.
1156 */
1157
91b9a277 1158 if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
19baf839
RO
1159 struct fib_alias *fa_orig;
1160
1161 err = -EEXIST;
1162 if (nlhdr->nlmsg_flags & NLM_F_EXCL)
1163 goto out;
1164
1165 if (nlhdr->nlmsg_flags & NLM_F_REPLACE) {
1166 struct fib_info *fi_drop;
1167 u8 state;
1168
1169 write_lock_bh(&fib_lock);
1170
1171 fi_drop = fa->fa_info;
1172 fa->fa_info = fi;
1173 fa->fa_type = type;
1174 fa->fa_scope = r->rtm_scope;
1175 state = fa->fa_state;
1176 fa->fa_state &= ~FA_S_ACCESSED;
1177
1178 write_unlock_bh(&fib_lock);
1179
1180 fib_release_info(fi_drop);
1181 if (state & FA_S_ACCESSED)
91b9a277 1182 rt_cache_flush(-1);
19baf839 1183
91b9a277 1184 goto succeeded;
19baf839
RO
1185 }
1186 /* Error if we find a perfect match which
1187 * uses the same scope, type, and nexthop
1188 * information.
1189 */
1190 fa_orig = fa;
1191 list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1192 if (fa->fa_tos != tos)
1193 break;
1194 if (fa->fa_info->fib_priority != fi->fib_priority)
1195 break;
1196 if (fa->fa_type == type &&
1197 fa->fa_scope == r->rtm_scope &&
1198 fa->fa_info == fi) {
1199 goto out;
1200 }
1201 }
1202 if (!(nlhdr->nlmsg_flags & NLM_F_APPEND))
1203 fa = fa_orig;
1204 }
1205 err = -ENOENT;
91b9a277 1206 if (!(nlhdr->nlmsg_flags & NLM_F_CREATE))
19baf839
RO
1207 goto out;
1208
1209 err = -ENOBUFS;
1210 new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1211 if (new_fa == NULL)
1212 goto out;
1213
1214 new_fa->fa_info = fi;
1215 new_fa->fa_tos = tos;
1216 new_fa->fa_type = type;
1217 new_fa->fa_scope = r->rtm_scope;
1218 new_fa->fa_state = 0;
19baf839
RO
1219 /*
1220 * Insert new entry to the list.
1221 */
1222
c877efb2 1223 if (!fa_head) {
f835e471
RO
1224 fa_head = fib_insert_node(t, &err, key, plen);
1225 err = 0;
c877efb2 1226 if (err)
f835e471
RO
1227 goto out_free_new_fa;
1228 }
19baf839
RO
1229
1230 write_lock_bh(&fib_lock);
1231
91b9a277 1232 list_add_tail(&new_fa->fa_list, (fa ? &fa->fa_list : fa_head));
19baf839
RO
1233
1234 write_unlock_bh(&fib_lock);
1235
1236 rt_cache_flush(-1);
1237 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
1238succeeded:
1239 return 0;
f835e471
RO
1240
1241out_free_new_fa:
1242 kmem_cache_free(fn_alias_kmem, new_fa);
19baf839
RO
1243out:
1244 fib_release_info(fi);
91b9a277 1245err:
19baf839
RO
1246 return err;
1247}
1248
0c7770c7
SH
1249static inline int check_leaf(struct trie *t, struct leaf *l,
1250 t_key key, int *plen, const struct flowi *flp,
06c74270 1251 struct fib_result *res)
19baf839 1252{
06c74270 1253 int err, i;
19baf839
RO
1254 t_key mask;
1255 struct leaf_info *li;
1256 struct hlist_head *hhead = &l->list;
1257 struct hlist_node *node;
c877efb2 1258
19baf839 1259 hlist_for_each_entry(li, node, hhead, hlist) {
19baf839
RO
1260 i = li->plen;
1261 mask = ntohl(inet_make_mask(i));
c877efb2 1262 if (l->key != (key & mask))
19baf839
RO
1263 continue;
1264
06c74270 1265 if ((err = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) <= 0) {
19baf839
RO
1266 *plen = i;
1267#ifdef CONFIG_IP_FIB_TRIE_STATS
1268 t->stats.semantic_match_passed++;
1269#endif
06c74270 1270 return err;
19baf839
RO
1271 }
1272#ifdef CONFIG_IP_FIB_TRIE_STATS
1273 t->stats.semantic_match_miss++;
1274#endif
1275 }
06c74270 1276 return 1;
19baf839
RO
1277}
1278
1279static int
1280fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1281{
1282 struct trie *t = (struct trie *) tb->tb_data;
1283 int plen, ret = 0;
1284 struct node *n;
1285 struct tnode *pn;
1286 int pos, bits;
91b9a277 1287 t_key key = ntohl(flp->fl4_dst);
19baf839
RO
1288 int chopped_off;
1289 t_key cindex = 0;
1290 int current_prefix_length = KEYLENGTH;
91b9a277
OJ
1291 struct tnode *cn;
1292 t_key node_prefix, key_prefix, pref_mismatch;
1293 int mp;
1294
19baf839
RO
1295 n = t->trie;
1296
1297 read_lock(&fib_lock);
91b9a277 1298
c877efb2 1299 if (!n)
19baf839
RO
1300 goto failed;
1301
1302#ifdef CONFIG_IP_FIB_TRIE_STATS
1303 t->stats.gets++;
1304#endif
1305
1306 /* Just a leaf? */
1307 if (IS_LEAF(n)) {
06c74270 1308 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
19baf839
RO
1309 goto found;
1310 goto failed;
1311 }
1312 pn = (struct tnode *) n;
1313 chopped_off = 0;
c877efb2 1314
91b9a277 1315 while (pn) {
19baf839
RO
1316 pos = pn->pos;
1317 bits = pn->bits;
1318
c877efb2 1319 if (!chopped_off)
19baf839
RO
1320 cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1321
1322 n = tnode_get_child(pn, cindex);
1323
1324 if (n == NULL) {
1325#ifdef CONFIG_IP_FIB_TRIE_STATS
1326 t->stats.null_node_hit++;
1327#endif
1328 goto backtrace;
1329 }
1330
91b9a277
OJ
1331 if (IS_LEAF(n)) {
1332 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1333 goto found;
1334 else
1335 goto backtrace;
1336 }
1337
19baf839
RO
1338#define HL_OPTIMIZE
1339#ifdef HL_OPTIMIZE
91b9a277 1340 cn = (struct tnode *)n;
19baf839 1341
91b9a277
OJ
1342 /*
1343 * It's a tnode, and we can do some extra checks here if we
1344 * like, to avoid descending into a dead-end branch.
1345 * This tnode is in the parent's child array at index
1346 * key[p_pos..p_pos+p_bits] but potentially with some bits
1347 * chopped off, so in reality the index may be just a
1348 * subprefix, padded with zero at the end.
1349 * We can also take a look at any skipped bits in this
1350 * tnode - everything up to p_pos is supposed to be ok,
1351 * and the non-chopped bits of the index (se previous
1352 * paragraph) are also guaranteed ok, but the rest is
1353 * considered unknown.
1354 *
1355 * The skipped bits are key[pos+bits..cn->pos].
1356 */
19baf839 1357
91b9a277
OJ
1358 /* If current_prefix_length < pos+bits, we are already doing
1359 * actual prefix matching, which means everything from
1360 * pos+(bits-chopped_off) onward must be zero along some
1361 * branch of this subtree - otherwise there is *no* valid
1362 * prefix present. Here we can only check the skipped
1363 * bits. Remember, since we have already indexed into the
1364 * parent's child array, we know that the bits we chopped of
1365 * *are* zero.
1366 */
19baf839 1367
91b9a277 1368 /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
19baf839 1369
91b9a277
OJ
1370 if (current_prefix_length < pos+bits) {
1371 if (tkey_extract_bits(cn->key, current_prefix_length,
1372 cn->pos - current_prefix_length) != 0 ||
1373 !(cn->child[0]))
1374 goto backtrace;
1375 }
19baf839 1376
91b9a277
OJ
1377 /*
1378 * If chopped_off=0, the index is fully validated and we
1379 * only need to look at the skipped bits for this, the new,
1380 * tnode. What we actually want to do is to find out if
1381 * these skipped bits match our key perfectly, or if we will
1382 * have to count on finding a matching prefix further down,
1383 * because if we do, we would like to have some way of
1384 * verifying the existence of such a prefix at this point.
1385 */
19baf839 1386
91b9a277
OJ
1387 /* The only thing we can do at this point is to verify that
1388 * any such matching prefix can indeed be a prefix to our
1389 * key, and if the bits in the node we are inspecting that
1390 * do not match our key are not ZERO, this cannot be true.
1391 * Thus, find out where there is a mismatch (before cn->pos)
1392 * and verify that all the mismatching bits are zero in the
1393 * new tnode's key.
1394 */
19baf839 1395
91b9a277
OJ
1396 /* Note: We aren't very concerned about the piece of the key
1397 * that precede pn->pos+pn->bits, since these have already been
1398 * checked. The bits after cn->pos aren't checked since these are
1399 * by definition "unknown" at this point. Thus, what we want to
1400 * see is if we are about to enter the "prefix matching" state,
1401 * and in that case verify that the skipped bits that will prevail
1402 * throughout this subtree are zero, as they have to be if we are
1403 * to find a matching prefix.
1404 */
1405
1406 node_prefix = MASK_PFX(cn->key, cn->pos);
1407 key_prefix = MASK_PFX(key, cn->pos);
1408 pref_mismatch = key_prefix^node_prefix;
1409 mp = 0;
1410
1411 /* In short: If skipped bits in this node do not match the search
1412 * key, enter the "prefix matching" state.directly.
1413 */
1414 if (pref_mismatch) {
1415 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1416 mp++;
1417 pref_mismatch = pref_mismatch <<1;
1418 }
1419 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1420
1421 if (key_prefix != 0)
1422 goto backtrace;
1423
1424 if (current_prefix_length >= cn->pos)
1425 current_prefix_length = mp;
c877efb2 1426 }
91b9a277
OJ
1427#endif
1428 pn = (struct tnode *)n; /* Descend */
1429 chopped_off = 0;
1430 continue;
1431
19baf839
RO
1432backtrace:
1433 chopped_off++;
1434
1435 /* As zero don't change the child key (cindex) */
91b9a277 1436 while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
19baf839 1437 chopped_off++;
19baf839
RO
1438
1439 /* Decrease current_... with bits chopped off */
1440 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1441 current_prefix_length = pn->pos + pn->bits - chopped_off;
91b9a277 1442
19baf839 1443 /*
c877efb2 1444 * Either we do the actual chop off according or if we have
19baf839
RO
1445 * chopped off all bits in this tnode walk up to our parent.
1446 */
1447
91b9a277 1448 if (chopped_off <= pn->bits) {
19baf839 1449 cindex &= ~(1 << (chopped_off-1));
91b9a277 1450 } else {
c877efb2 1451 if (NODE_PARENT(pn) == NULL)
19baf839 1452 goto failed;
91b9a277 1453
19baf839
RO
1454 /* Get Child's index */
1455 cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
1456 pn = NODE_PARENT(pn);
1457 chopped_off = 0;
1458
1459#ifdef CONFIG_IP_FIB_TRIE_STATS
1460 t->stats.backtrack++;
1461#endif
1462 goto backtrace;
c877efb2 1463 }
19baf839
RO
1464 }
1465failed:
c877efb2 1466 ret = 1;
19baf839
RO
1467found:
1468 read_unlock(&fib_lock);
1469 return ret;
1470}
1471
1472static int trie_leaf_remove(struct trie *t, t_key key)
1473{
1474 t_key cindex;
1475 struct tnode *tp = NULL;
1476 struct node *n = t->trie;
1477 struct leaf *l;
1478
0c7770c7 1479 pr_debug("entering trie_leaf_remove(%p)\n", n);
19baf839
RO
1480
1481 /* Note that in the case skipped bits, those bits are *not* checked!
c877efb2 1482 * When we finish this, we will have NULL or a T_LEAF, and the
19baf839
RO
1483 * T_LEAF may or may not match our key.
1484 */
1485
91b9a277 1486 while (n != NULL && IS_TNODE(n)) {
19baf839
RO
1487 struct tnode *tn = (struct tnode *) n;
1488 check_tnode(tn);
1489 n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
1490
0c7770c7 1491 BUG_ON(n && NODE_PARENT(n) != tn);
91b9a277 1492 }
19baf839
RO
1493 l = (struct leaf *) n;
1494
c877efb2 1495 if (!n || !tkey_equals(l->key, key))
19baf839 1496 return 0;
c877efb2
SH
1497
1498 /*
1499 * Key found.
1500 * Remove the leaf and rebalance the tree
19baf839
RO
1501 */
1502
1503 t->revision++;
1504 t->size--;
1505
1506 tp = NODE_PARENT(n);
1507 tnode_free((struct tnode *) n);
1508
c877efb2 1509 if (tp) {
19baf839
RO
1510 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1511 put_child(t, (struct tnode *)tp, cindex, NULL);
1512 t->trie = trie_rebalance(t, tp);
91b9a277 1513 } else
19baf839
RO
1514 t->trie = NULL;
1515
1516 return 1;
1517}
1518
1519static int
1520fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
91b9a277 1521 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
19baf839
RO
1522{
1523 struct trie *t = (struct trie *) tb->tb_data;
1524 u32 key, mask;
1525 int plen = r->rtm_dst_len;
1526 u8 tos = r->rtm_tos;
1527 struct fib_alias *fa, *fa_to_delete;
1528 struct list_head *fa_head;
1529 struct leaf *l;
91b9a277
OJ
1530 int kill_li = 0;
1531 struct leaf_info *li;
1532
19baf839 1533
c877efb2 1534 if (plen > 32)
19baf839
RO
1535 return -EINVAL;
1536
1537 key = 0;
c877efb2 1538 if (rta->rta_dst)
19baf839
RO
1539 memcpy(&key, rta->rta_dst, 4);
1540
1541 key = ntohl(key);
91b9a277 1542 mask = ntohl(inet_make_mask(plen));
19baf839 1543
c877efb2 1544 if (key & ~mask)
19baf839
RO
1545 return -EINVAL;
1546
1547 key = key & mask;
1548 l = fib_find_node(t, key);
1549
c877efb2 1550 if (!l)
19baf839
RO
1551 return -ESRCH;
1552
1553 fa_head = get_fa_head(l, plen);
1554 fa = fib_find_alias(fa_head, tos, 0);
1555
1556 if (!fa)
1557 return -ESRCH;
1558
0c7770c7 1559 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
19baf839
RO
1560
1561 fa_to_delete = NULL;
1562 fa_head = fa->fa_list.prev;
1563 list_for_each_entry(fa, fa_head, fa_list) {
1564 struct fib_info *fi = fa->fa_info;
1565
1566 if (fa->fa_tos != tos)
1567 break;
1568
1569 if ((!r->rtm_type ||
1570 fa->fa_type == r->rtm_type) &&
1571 (r->rtm_scope == RT_SCOPE_NOWHERE ||
1572 fa->fa_scope == r->rtm_scope) &&
1573 (!r->rtm_protocol ||
1574 fi->fib_protocol == r->rtm_protocol) &&
1575 fib_nh_match(r, nlhdr, rta, fi) == 0) {
1576 fa_to_delete = fa;
1577 break;
1578 }
1579 }
1580
91b9a277
OJ
1581 if (!fa_to_delete)
1582 return -ESRCH;
19baf839 1583
91b9a277
OJ
1584 fa = fa_to_delete;
1585 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
1586
1587 l = fib_find_node(t, key);
1588 li = find_leaf_info(&l->list, plen);
19baf839 1589
91b9a277 1590 write_lock_bh(&fib_lock);
19baf839 1591
91b9a277 1592 list_del(&fa->fa_list);
19baf839 1593
91b9a277
OJ
1594 if (list_empty(fa_head)) {
1595 hlist_del(&li->hlist);
1596 kill_li = 1;
1597 }
1598 write_unlock_bh(&fib_lock);
19baf839 1599
91b9a277
OJ
1600 if (kill_li)
1601 free_leaf_info(li);
19baf839 1602
91b9a277
OJ
1603 if (hlist_empty(&l->list))
1604 trie_leaf_remove(t, key);
19baf839 1605
91b9a277
OJ
1606 if (fa->fa_state & FA_S_ACCESSED)
1607 rt_cache_flush(-1);
19baf839 1608
91b9a277
OJ
1609 fn_free_alias(fa);
1610 return 0;
19baf839
RO
1611}
1612
1613static int trie_flush_list(struct trie *t, struct list_head *head)
1614{
1615 struct fib_alias *fa, *fa_node;
1616 int found = 0;
1617
1618 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1619 struct fib_info *fi = fa->fa_info;
19baf839 1620
91b9a277 1621 if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
c877efb2 1622 write_lock_bh(&fib_lock);
19baf839 1623 list_del(&fa->fa_list);
c877efb2 1624 write_unlock_bh(&fib_lock);
19baf839
RO
1625
1626 fn_free_alias(fa);
1627 found++;
1628 }
1629 }
1630 return found;
1631}
1632
1633static int trie_flush_leaf(struct trie *t, struct leaf *l)
1634{
1635 int found = 0;
1636 struct hlist_head *lih = &l->list;
1637 struct hlist_node *node, *tmp;
1638 struct leaf_info *li = NULL;
1639
1640 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
19baf839
RO
1641 found += trie_flush_list(t, &li->falh);
1642
1643 if (list_empty(&li->falh)) {
c877efb2 1644 write_lock_bh(&fib_lock);
19baf839 1645 hlist_del(&li->hlist);
c877efb2 1646 write_unlock_bh(&fib_lock);
19baf839
RO
1647
1648 free_leaf_info(li);
1649 }
1650 }
1651 return found;
1652}
1653
1654static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1655{
1656 struct node *c = (struct node *) thisleaf;
1657 struct tnode *p;
1658 int idx;
1659
c877efb2
SH
1660 if (c == NULL) {
1661 if (t->trie == NULL)
19baf839
RO
1662 return NULL;
1663
1664 if (IS_LEAF(t->trie)) /* trie w. just a leaf */
1665 return (struct leaf *) t->trie;
1666
1667 p = (struct tnode*) t->trie; /* Start */
91b9a277 1668 } else
19baf839 1669 p = (struct tnode *) NODE_PARENT(c);
c877efb2 1670
19baf839
RO
1671 while (p) {
1672 int pos, last;
1673
1674 /* Find the next child of the parent */
c877efb2
SH
1675 if (c)
1676 pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1677 else
19baf839
RO
1678 pos = 0;
1679
1680 last = 1 << p->bits;
91b9a277
OJ
1681 for (idx = pos; idx < last ; idx++) {
1682 if (!p->child[idx])
1683 continue;
1684
1685 /* Decend if tnode */
1686 while (IS_TNODE(p->child[idx])) {
1687 p = (struct tnode*) p->child[idx];
1688 idx = 0;
1689
1690 /* Rightmost non-NULL branch */
1691 if (p && IS_TNODE(p))
1692 while (p->child[idx] == NULL && idx < (1 << p->bits)) idx++;
1693
1694 /* Done with this tnode? */
1695 if (idx >= (1 << p->bits) || p->child[idx] == NULL)
1696 goto up;
19baf839 1697 }
91b9a277 1698 return (struct leaf*) p->child[idx];
19baf839
RO
1699 }
1700up:
1701 /* No more children go up one step */
91b9a277 1702 c = (struct node *) p;
19baf839
RO
1703 p = (struct tnode *) NODE_PARENT(p);
1704 }
1705 return NULL; /* Ready. Root of trie */
1706}
1707
1708static int fn_trie_flush(struct fib_table *tb)
1709{
1710 struct trie *t = (struct trie *) tb->tb_data;
1711 struct leaf *ll = NULL, *l = NULL;
1712 int found = 0, h;
1713
1714 t->revision++;
1715
91b9a277 1716 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
19baf839
RO
1717 found += trie_flush_leaf(t, l);
1718
1719 if (ll && hlist_empty(&ll->list))
1720 trie_leaf_remove(t, ll->key);
1721 ll = l;
1722 }
1723
1724 if (ll && hlist_empty(&ll->list))
1725 trie_leaf_remove(t, ll->key);
1726
0c7770c7 1727 pr_debug("trie_flush found=%d\n", found);
19baf839
RO
1728 return found;
1729}
1730
91b9a277 1731static int trie_last_dflt = -1;
19baf839
RO
1732
1733static void
1734fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1735{
1736 struct trie *t = (struct trie *) tb->tb_data;
1737 int order, last_idx;
1738 struct fib_info *fi = NULL;
1739 struct fib_info *last_resort;
1740 struct fib_alias *fa = NULL;
1741 struct list_head *fa_head;
1742 struct leaf *l;
1743
1744 last_idx = -1;
1745 last_resort = NULL;
1746 order = -1;
1747
1748 read_lock(&fib_lock);
c877efb2 1749
19baf839 1750 l = fib_find_node(t, 0);
c877efb2 1751 if (!l)
19baf839
RO
1752 goto out;
1753
1754 fa_head = get_fa_head(l, 0);
c877efb2 1755 if (!fa_head)
19baf839
RO
1756 goto out;
1757
c877efb2 1758 if (list_empty(fa_head))
19baf839
RO
1759 goto out;
1760
1761 list_for_each_entry(fa, fa_head, fa_list) {
1762 struct fib_info *next_fi = fa->fa_info;
91b9a277 1763
19baf839
RO
1764 if (fa->fa_scope != res->scope ||
1765 fa->fa_type != RTN_UNICAST)
1766 continue;
91b9a277 1767
19baf839
RO
1768 if (next_fi->fib_priority > res->fi->fib_priority)
1769 break;
1770 if (!next_fi->fib_nh[0].nh_gw ||
1771 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1772 continue;
1773 fa->fa_state |= FA_S_ACCESSED;
91b9a277 1774
19baf839
RO
1775 if (fi == NULL) {
1776 if (next_fi != res->fi)
1777 break;
1778 } else if (!fib_detect_death(fi, order, &last_resort,
1779 &last_idx, &trie_last_dflt)) {
1780 if (res->fi)
1781 fib_info_put(res->fi);
1782 res->fi = fi;
1783 atomic_inc(&fi->fib_clntref);
1784 trie_last_dflt = order;
1785 goto out;
1786 }
1787 fi = next_fi;
1788 order++;
1789 }
1790 if (order <= 0 || fi == NULL) {
1791 trie_last_dflt = -1;
1792 goto out;
1793 }
1794
1795 if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
1796 if (res->fi)
1797 fib_info_put(res->fi);
1798 res->fi = fi;
1799 atomic_inc(&fi->fib_clntref);
1800 trie_last_dflt = order;
1801 goto out;
1802 }
1803 if (last_idx >= 0) {
1804 if (res->fi)
1805 fib_info_put(res->fi);
1806 res->fi = last_resort;
1807 if (last_resort)
1808 atomic_inc(&last_resort->fib_clntref);
1809 }
1810 trie_last_dflt = last_idx;
1811 out:;
c877efb2 1812 read_unlock(&fib_lock);
19baf839
RO
1813}
1814
c877efb2 1815static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
19baf839
RO
1816 struct sk_buff *skb, struct netlink_callback *cb)
1817{
1818 int i, s_i;
1819 struct fib_alias *fa;
1820
91b9a277 1821 u32 xkey = htonl(key);
19baf839 1822
91b9a277 1823 s_i = cb->args[3];
19baf839
RO
1824 i = 0;
1825
1826 list_for_each_entry(fa, fah, fa_list) {
1827 if (i < s_i) {
1828 i++;
1829 continue;
1830 }
1831 if (fa->fa_info->fib_nh == NULL) {
1832 printk("Trie error _fib_nh=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1833 i++;
1834 continue;
1835 }
1836 if (fa->fa_info == NULL) {
1837 printk("Trie error fa_info=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1838 i++;
1839 continue;
1840 }
1841
1842 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1843 cb->nlh->nlmsg_seq,
1844 RTM_NEWROUTE,
1845 tb->tb_id,
1846 fa->fa_type,
1847 fa->fa_scope,
1848 &xkey,
1849 plen,
1850 fa->fa_tos,
90f66914 1851 fa->fa_info, 0) < 0) {
19baf839
RO
1852 cb->args[3] = i;
1853 return -1;
91b9a277 1854 }
19baf839
RO
1855 i++;
1856 }
91b9a277 1857 cb->args[3] = i;
19baf839
RO
1858 return skb->len;
1859}
1860
c877efb2 1861static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
19baf839
RO
1862 struct netlink_callback *cb)
1863{
1864 int h, s_h;
1865 struct list_head *fa_head;
1866 struct leaf *l = NULL;
19baf839 1867
91b9a277 1868 s_h = cb->args[2];
19baf839 1869
91b9a277 1870 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
19baf839
RO
1871 if (h < s_h)
1872 continue;
1873 if (h > s_h)
1874 memset(&cb->args[3], 0,
1875 sizeof(cb->args) - 3*sizeof(cb->args[0]));
1876
1877 fa_head = get_fa_head(l, plen);
91b9a277 1878
c877efb2 1879 if (!fa_head)
19baf839
RO
1880 continue;
1881
c877efb2 1882 if (list_empty(fa_head))
19baf839
RO
1883 continue;
1884
1885 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
91b9a277 1886 cb->args[2] = h;
19baf839
RO
1887 return -1;
1888 }
1889 }
91b9a277 1890 cb->args[2] = h;
19baf839
RO
1891 return skb->len;
1892}
1893
1894static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1895{
1896 int m, s_m;
1897 struct trie *t = (struct trie *) tb->tb_data;
1898
1899 s_m = cb->args[1];
1900
1901 read_lock(&fib_lock);
91b9a277 1902 for (m = 0; m <= 32; m++) {
19baf839
RO
1903 if (m < s_m)
1904 continue;
1905 if (m > s_m)
1906 memset(&cb->args[2], 0,
91b9a277 1907 sizeof(cb->args) - 2*sizeof(cb->args[0]));
19baf839
RO
1908
1909 if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
1910 cb->args[1] = m;
1911 goto out;
1912 }
1913 }
1914 read_unlock(&fib_lock);
1915 cb->args[1] = m;
1916 return skb->len;
91b9a277 1917out:
19baf839
RO
1918 read_unlock(&fib_lock);
1919 return -1;
1920}
1921
1922/* Fix more generic FIB names for init later */
1923
1924#ifdef CONFIG_IP_MULTIPLE_TABLES
1925struct fib_table * fib_hash_init(int id)
1926#else
1927struct fib_table * __init fib_hash_init(int id)
1928#endif
1929{
1930 struct fib_table *tb;
1931 struct trie *t;
1932
1933 if (fn_alias_kmem == NULL)
1934 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1935 sizeof(struct fib_alias),
1936 0, SLAB_HWCACHE_ALIGN,
1937 NULL, NULL);
1938
1939 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1940 GFP_KERNEL);
1941 if (tb == NULL)
1942 return NULL;
1943
1944 tb->tb_id = id;
1945 tb->tb_lookup = fn_trie_lookup;
1946 tb->tb_insert = fn_trie_insert;
1947 tb->tb_delete = fn_trie_delete;
1948 tb->tb_flush = fn_trie_flush;
1949 tb->tb_select_default = fn_trie_select_default;
1950 tb->tb_dump = fn_trie_dump;
1951 memset(tb->tb_data, 0, sizeof(struct trie));
1952
1953 t = (struct trie *) tb->tb_data;
1954
1955 trie_init(t);
1956
c877efb2 1957 if (id == RT_TABLE_LOCAL)
91b9a277 1958 trie_local = t;
c877efb2 1959 else if (id == RT_TABLE_MAIN)
91b9a277 1960 trie_main = t;
19baf839
RO
1961
1962 if (id == RT_TABLE_LOCAL)
1963 printk("IPv4 FIB: Using LC-trie version %s\n", VERSION);
1964
1965 return tb;
1966}
1967
1968/* Trie dump functions */
1969
1970static void putspace_seq(struct seq_file *seq, int n)
1971{
91b9a277
OJ
1972 while (n--)
1973 seq_printf(seq, " ");
19baf839
RO
1974}
1975
1976static void printbin_seq(struct seq_file *seq, unsigned int v, int bits)
1977{
1978 while (bits--)
1979 seq_printf(seq, "%s", (v & (1<<bits))?"1":"0");
1980}
1981
c877efb2 1982static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
19baf839
RO
1983 int pend, int cindex, int bits)
1984{
1985 putspace_seq(seq, indent);
1986 if (IS_LEAF(n))
1987 seq_printf(seq, "|");
1988 else
1989 seq_printf(seq, "+");
1990 if (bits) {
1991 seq_printf(seq, "%d/", cindex);
1992 printbin_seq(seq, cindex, bits);
1993 seq_printf(seq, ": ");
91b9a277 1994 } else
19baf839
RO
1995 seq_printf(seq, "<root>: ");
1996 seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n);
1997
19baf839 1998 if (IS_LEAF(n)) {
91b9a277 1999 struct leaf *l = (struct leaf *)n;
19baf839
RO
2000 struct fib_alias *fa;
2001 int i;
91b9a277
OJ
2002
2003 seq_printf(seq, "key=%d.%d.%d.%d\n",
2004 n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256);
2005
2006 for (i = 32; i >= 0; i--)
2007 if (find_leaf_info(&l->list, i)) {
19baf839 2008 struct list_head *fa_head = get_fa_head(l, i);
91b9a277 2009
c877efb2 2010 if (!fa_head)
19baf839
RO
2011 continue;
2012
c877efb2 2013 if (list_empty(fa_head))
19baf839
RO
2014 continue;
2015
2016 putspace_seq(seq, indent+2);
2017 seq_printf(seq, "{/%d...dumping}\n", i);
2018
19baf839
RO
2019 list_for_each_entry(fa, fa_head, fa_list) {
2020 putspace_seq(seq, indent+2);
19baf839
RO
2021 if (fa->fa_info == NULL) {
2022 seq_printf(seq, "Error fa_info=NULL\n");
2023 continue;
2024 }
91b9a277
OJ
2025 if (fa->fa_info->fib_nh == NULL) {
2026 seq_printf(seq, "Error _fib_nh=NULL\n");
2027 continue;
2028 }
19baf839
RO
2029
2030 seq_printf(seq, "{type=%d scope=%d TOS=%d}\n",
2031 fa->fa_type,
2032 fa->fa_scope,
2033 fa->fa_tos);
2034 }
2035 }
91b9a277 2036 } else {
c877efb2 2037 struct tnode *tn = (struct tnode *)n;
91b9a277
OJ
2038 int plen = ((struct tnode *)n)->pos;
2039 t_key prf = MASK_PFX(n->key, plen);
2040
2041 seq_printf(seq, "key=%d.%d.%d.%d/%d\n",
2042 prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen);
2043
19baf839 2044 putspace_seq(seq, indent); seq_printf(seq, "| ");
91b9a277 2045 seq_printf(seq, "{key prefix=%08x/", tn->key & TKEY_GET_MASK(0, tn->pos));
19baf839
RO
2046 printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos);
2047 seq_printf(seq, "}\n");
2048 putspace_seq(seq, indent); seq_printf(seq, "| ");
2049 seq_printf(seq, "{pos=%d", tn->pos);
2050 seq_printf(seq, " (skip=%d bits)", tn->pos - pend);
2051 seq_printf(seq, " bits=%d (%u children)}\n", tn->bits, (1 << tn->bits));
2052 putspace_seq(seq, indent); seq_printf(seq, "| ");
2053 seq_printf(seq, "{empty=%d full=%d}\n", tn->empty_children, tn->full_children);
2054 }
2055}
2056
2057static void trie_dump_seq(struct seq_file *seq, struct trie *t)
2058{
c877efb2 2059 struct node *n = t->trie;
91b9a277
OJ
2060 int cindex = 0;
2061 int indent = 1;
2062 int pend = 0;
19baf839 2063 int depth = 0;
91b9a277 2064 struct tnode *tn;
19baf839
RO
2065
2066 read_lock(&fib_lock);
2067
2068 seq_printf(seq, "------ trie_dump of t=%p ------\n", t);
19baf839 2069
91b9a277
OJ
2070 if (!n) {
2071 seq_printf(seq, "------ trie is empty\n");
2072
2073 read_unlock(&fib_lock);
2074 return;
2075 }
2076
2077 printnode_seq(seq, indent, n, pend, cindex, 0);
2078
2079 if (!IS_TNODE(n)) {
2080 read_unlock(&fib_lock);
2081 return;
2082 }
2083
2084 tn = (struct tnode *)n;
2085 pend = tn->pos+tn->bits;
2086 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2087 indent += 3;
2088 depth++;
2089
2090 while (tn && cindex < (1 << tn->bits)) {
2091 if (tn->child[cindex]) {
2092 /* Got a child */
2093
2094 printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits);
2095 if (IS_LEAF(tn->child[cindex])) {
2096 cindex++;
2097 } else {
19baf839 2098 /*
91b9a277 2099 * New tnode. Decend one level
19baf839 2100 */
19baf839 2101
91b9a277
OJ
2102 depth++;
2103 tn = (struct tnode *)tn->child[cindex];
2104 pend = tn->pos + tn->bits;
2105 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2106 indent += 3;
2107 cindex = 0;
19baf839 2108 }
91b9a277
OJ
2109 } else
2110 cindex++;
2111
2112 /*
2113 * Test if we are done
2114 */
2115
2116 while (cindex >= (1 << tn->bits)) {
2117 /*
2118 * Move upwards and test for root
2119 * pop off all traversed nodes
2120 */
2121
2122 if (NODE_PARENT(tn) == NULL) {
2123 tn = NULL;
2124 break;
2125 }
2126
2127 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2128 cindex++;
2129 tn = NODE_PARENT(tn);
2130 pend = tn->pos + tn->bits;
2131 indent -= 3;
2132 depth--;
19baf839 2133 }
19baf839 2134 }
19baf839
RO
2135
2136 read_unlock(&fib_lock);
2137}
2138
2139static struct trie_stat *trie_stat_new(void)
2140{
91b9a277 2141 struct trie_stat *s;
19baf839 2142 int i;
c877efb2 2143
91b9a277
OJ
2144 s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL);
2145 if (!s)
2146 return NULL;
2147
2148 s->totdepth = 0;
2149 s->maxdepth = 0;
2150 s->tnodes = 0;
2151 s->leaves = 0;
2152 s->nullpointers = 0;
2153
2154 for (i = 0; i < MAX_CHILDS; i++)
2155 s->nodesizes[i] = 0;
2156
19baf839 2157 return s;
c877efb2 2158}
19baf839
RO
2159
2160static struct trie_stat *trie_collect_stats(struct trie *t)
2161{
c877efb2 2162 struct node *n = t->trie;
19baf839
RO
2163 struct trie_stat *s = trie_stat_new();
2164 int cindex = 0;
19baf839
RO
2165 int pend = 0;
2166 int depth = 0;
2167
91b9a277
OJ
2168 if (!s)
2169 return NULL;
2170 if (!n)
2171 return s;
19baf839 2172
91b9a277 2173 read_lock(&fib_lock);
19baf839 2174
91b9a277
OJ
2175 if (IS_TNODE(n)) {
2176 struct tnode *tn = (struct tnode *)n;
2177 pend = tn->pos+tn->bits;
2178 s->nodesizes[tn->bits]++;
2179 depth++;
2180
2181 while (tn && cindex < (1 << tn->bits)) {
2182 if (tn->child[cindex]) {
2183 /* Got a child */
19baf839 2184
91b9a277
OJ
2185 if (IS_LEAF(tn->child[cindex])) {
2186 cindex++;
2187
2188 /* stats */
2189 if (depth > s->maxdepth)
2190 s->maxdepth = depth;
2191 s->totdepth += depth;
2192 s->leaves++;
2193 } else {
19baf839 2194 /*
91b9a277 2195 * New tnode. Decend one level
19baf839 2196 */
91b9a277
OJ
2197
2198 s->tnodes++;
2199 s->nodesizes[tn->bits]++;
2200 depth++;
2201
2202 n = tn->child[cindex];
2203 tn = (struct tnode *)n;
2204 pend = tn->pos+tn->bits;
2205
2206 cindex = 0;
19baf839 2207 }
91b9a277
OJ
2208 } else {
2209 cindex++;
2210 s->nullpointers++;
19baf839 2211 }
91b9a277
OJ
2212
2213 /*
2214 * Test if we are done
2215 */
2216
2217 while (cindex >= (1 << tn->bits)) {
2218 /*
2219 * Move upwards and test for root
2220 * pop off all traversed nodes
2221 */
2222
2223 if (NODE_PARENT(tn) == NULL) {
2224 tn = NULL;
2225 n = NULL;
2226 break;
2227 }
2228
2229 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2230 tn = NODE_PARENT(tn);
2231 cindex++;
2232 n = (struct node *)tn;
2233 pend = tn->pos+tn->bits;
2234 depth--;
2235 }
19baf839
RO
2236 }
2237 }
2238
91b9a277 2239 read_unlock(&fib_lock);
19baf839
RO
2240 return s;
2241}
2242
2243#ifdef CONFIG_PROC_FS
2244
2245static struct fib_alias *fib_triestat_get_first(struct seq_file *seq)
2246{
2247 return NULL;
2248}
2249
2250static struct fib_alias *fib_triestat_get_next(struct seq_file *seq)
2251{
2252 return NULL;
2253}
2254
2255static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos)
2256{
91b9a277
OJ
2257 if (!ip_fib_main_table)
2258 return NULL;
19baf839 2259
91b9a277
OJ
2260 if (*pos)
2261 return fib_triestat_get_next(seq);
2262 else
2263 return SEQ_START_TOKEN;
19baf839
RO
2264}
2265
2266static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2267{
2268 ++*pos;
91b9a277
OJ
2269 if (v == SEQ_START_TOKEN)
2270 return fib_triestat_get_first(seq);
2271 else
2272 return fib_triestat_get_next(seq);
19baf839
RO
2273}
2274
2275static void fib_triestat_seq_stop(struct seq_file *seq, void *v)
2276{
2277
2278}
2279
c877efb2 2280/*
19baf839
RO
2281 * This outputs /proc/net/fib_triestats
2282 *
2283 * It always works in backward compatibility mode.
2284 * The format of the file is not supposed to be changed.
2285 */
2286
2287static void collect_and_show(struct trie *t, struct seq_file *seq)
2288{
2289 int bytes = 0; /* How many bytes are used, a ref is 4 bytes */
2290 int i, max, pointers;
91b9a277 2291 struct trie_stat *stat;
19baf839
RO
2292 int avdepth;
2293
2294 stat = trie_collect_stats(t);
2295
91b9a277 2296 bytes = 0;
19baf839
RO
2297 seq_printf(seq, "trie=%p\n", t);
2298
2299 if (stat) {
2300 if (stat->leaves)
91b9a277 2301 avdepth = stat->totdepth*100 / stat->leaves;
19baf839 2302 else
91b9a277
OJ
2303 avdepth = 0;
2304 seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100);
19baf839 2305 seq_printf(seq, "Max depth: %4d\n", stat->maxdepth);
91b9a277 2306
19baf839
RO
2307 seq_printf(seq, "Leaves: %d\n", stat->leaves);
2308 bytes += sizeof(struct leaf) * stat->leaves;
2309 seq_printf(seq, "Internal nodes: %d\n", stat->tnodes);
2310 bytes += sizeof(struct tnode) * stat->tnodes;
2311
2312 max = MAX_CHILDS-1;
2313
2314 while (max >= 0 && stat->nodesizes[max] == 0)
2315 max--;
2316 pointers = 0;
2317
c877efb2 2318 for (i = 1; i <= max; i++)
19baf839
RO
2319 if (stat->nodesizes[i] != 0) {
2320 seq_printf(seq, " %d: %d", i, stat->nodesizes[i]);
2321 pointers += (1<<i) * stat->nodesizes[i];
2322 }
2323 seq_printf(seq, "\n");
2324 seq_printf(seq, "Pointers: %d\n", pointers);
2325 bytes += sizeof(struct node *) * pointers;
2326 seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2327 seq_printf(seq, "Total size: %d kB\n", bytes / 1024);
2328
2329 kfree(stat);
2330 }
2331
2332#ifdef CONFIG_IP_FIB_TRIE_STATS
2333 seq_printf(seq, "Counters:\n---------\n");
2334 seq_printf(seq,"gets = %d\n", t->stats.gets);
2335 seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
2336 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2337 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2338 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2f36895a 2339 seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
19baf839
RO
2340#ifdef CLEAR_STATS
2341 memset(&(t->stats), 0, sizeof(t->stats));
2342#endif
2343#endif /* CONFIG_IP_FIB_TRIE_STATS */
2344}
2345
2346static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2347{
2348 char bf[128];
c877efb2 2349
19baf839 2350 if (v == SEQ_START_TOKEN) {
c877efb2 2351 seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
19baf839 2352 sizeof(struct leaf), sizeof(struct tnode));
c877efb2 2353 if (trie_local)
19baf839
RO
2354 collect_and_show(trie_local, seq);
2355
c877efb2 2356 if (trie_main)
19baf839 2357 collect_and_show(trie_main, seq);
91b9a277
OJ
2358 } else {
2359 snprintf(bf, sizeof(bf), "*\t%08X\t%08X", 200, 400);
2360
19baf839
RO
2361 seq_printf(seq, "%-127s\n", bf);
2362 }
2363 return 0;
2364}
2365
2366static struct seq_operations fib_triestat_seq_ops = {
c877efb2
SH
2367 .start = fib_triestat_seq_start,
2368 .next = fib_triestat_seq_next,
2369 .stop = fib_triestat_seq_stop,
2370 .show = fib_triestat_seq_show,
19baf839
RO
2371};
2372
2373static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2374{
2375 struct seq_file *seq;
2376 int rc = -ENOMEM;
2377
2378 rc = seq_open(file, &fib_triestat_seq_ops);
2379 if (rc)
2380 goto out_kfree;
2381
c877efb2 2382 seq = file->private_data;
19baf839
RO
2383out:
2384 return rc;
2385out_kfree:
2386 goto out;
2387}
2388
2389static struct file_operations fib_triestat_seq_fops = {
c877efb2
SH
2390 .owner = THIS_MODULE,
2391 .open = fib_triestat_seq_open,
2392 .read = seq_read,
2393 .llseek = seq_lseek,
2394 .release = seq_release_private,
19baf839
RO
2395};
2396
2397int __init fib_stat_proc_init(void)
2398{
2399 if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_seq_fops))
2400 return -ENOMEM;
2401 return 0;
2402}
2403
2404void __init fib_stat_proc_exit(void)
2405{
2406 proc_net_remove("fib_triestat");
2407}
2408
2409static struct fib_alias *fib_trie_get_first(struct seq_file *seq)
2410{
2411 return NULL;
2412}
2413
2414static struct fib_alias *fib_trie_get_next(struct seq_file *seq)
2415{
2416 return NULL;
2417}
2418
2419static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2420{
91b9a277
OJ
2421 if (!ip_fib_main_table)
2422 return NULL;
19baf839 2423
91b9a277
OJ
2424 if (*pos)
2425 return fib_trie_get_next(seq);
2426 else
2427 return SEQ_START_TOKEN;
19baf839
RO
2428}
2429
2430static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2431{
2432 ++*pos;
91b9a277
OJ
2433 if (v == SEQ_START_TOKEN)
2434 return fib_trie_get_first(seq);
2435 else
2436 return fib_trie_get_next(seq);
2437
19baf839
RO
2438}
2439
2440static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2441{
19baf839
RO
2442}
2443
c877efb2 2444/*
19baf839
RO
2445 * This outputs /proc/net/fib_trie.
2446 *
2447 * It always works in backward compatibility mode.
2448 * The format of the file is not supposed to be changed.
2449 */
2450
2451static int fib_trie_seq_show(struct seq_file *seq, void *v)
2452{
2453 char bf[128];
2454
2455 if (v == SEQ_START_TOKEN) {
c877efb2 2456 if (trie_local)
19baf839
RO
2457 trie_dump_seq(seq, trie_local);
2458
c877efb2 2459 if (trie_main)
19baf839 2460 trie_dump_seq(seq, trie_main);
91b9a277 2461 } else {
19baf839
RO
2462 snprintf(bf, sizeof(bf),
2463 "*\t%08X\t%08X", 200, 400);
2464 seq_printf(seq, "%-127s\n", bf);
2465 }
2466
2467 return 0;
2468}
2469
2470static struct seq_operations fib_trie_seq_ops = {
c877efb2
SH
2471 .start = fib_trie_seq_start,
2472 .next = fib_trie_seq_next,
2473 .stop = fib_trie_seq_stop,
2474 .show = fib_trie_seq_show,
19baf839
RO
2475};
2476
2477static int fib_trie_seq_open(struct inode *inode, struct file *file)
2478{
2479 struct seq_file *seq;
2480 int rc = -ENOMEM;
2481
2482 rc = seq_open(file, &fib_trie_seq_ops);
2483 if (rc)
2484 goto out_kfree;
2485
c877efb2 2486 seq = file->private_data;
19baf839
RO
2487out:
2488 return rc;
2489out_kfree:
2490 goto out;
2491}
2492
2493static struct file_operations fib_trie_seq_fops = {
c877efb2
SH
2494 .owner = THIS_MODULE,
2495 .open = fib_trie_seq_open,
2496 .read = seq_read,
2497 .llseek = seq_lseek,
2498 .release= seq_release_private,
19baf839
RO
2499};
2500
2501int __init fib_proc_init(void)
2502{
2503 if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_seq_fops))
2504 return -ENOMEM;
2505 return 0;
2506}
2507
2508void __init fib_proc_exit(void)
2509{
2510 proc_net_remove("fib_trie");
2511}
2512
2513#endif /* CONFIG_PROC_FS */