]> git.proxmox.com Git - mirror_ubuntu-zesty-kernel.git/blame - net/ipv4/fib_trie.c
[DCCP]: Initial dccp_poll implementation
[mirror_ubuntu-zesty-kernel.git] / net / ipv4 / fib_trie.c
CommitLineData
19baf839
RO
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
11 * Agricultural Sciences.
12 *
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally descibed in:
16 *
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26 *
27 *
28 * Code from fib_hash has been reused which includes the following header:
29 *
30 *
31 * INET An implementation of the TCP/IP protocol suite for the LINUX
32 * operating system. INET is implemented using the BSD Socket
33 * interface as the means of communication with the user level.
34 *
35 * IPv4 FIB: lookup engine and maintenance routines.
36 *
37 *
38 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39 *
40 * This program is free software; you can redistribute it and/or
41 * modify it under the terms of the GNU General Public License
42 * as published by the Free Software Foundation; either version
43 * 2 of the License, or (at your option) any later version.
44 */
45
2f36895a 46#define VERSION "0.325"
19baf839
RO
47
48#include <linux/config.h>
49#include <asm/uaccess.h>
50#include <asm/system.h>
51#include <asm/bitops.h>
52#include <linux/types.h>
53#include <linux/kernel.h>
54#include <linux/sched.h>
55#include <linux/mm.h>
56#include <linux/string.h>
57#include <linux/socket.h>
58#include <linux/sockios.h>
59#include <linux/errno.h>
60#include <linux/in.h>
61#include <linux/inet.h>
62#include <linux/netdevice.h>
63#include <linux/if_arp.h>
64#include <linux/proc_fs.h>
65#include <linux/skbuff.h>
66#include <linux/netlink.h>
67#include <linux/init.h>
68#include <linux/list.h>
69#include <net/ip.h>
70#include <net/protocol.h>
71#include <net/route.h>
72#include <net/tcp.h>
73#include <net/sock.h>
74#include <net/ip_fib.h>
75#include "fib_lookup.h"
76
77#undef CONFIG_IP_FIB_TRIE_STATS
78#define MAX_CHILDS 16384
79
19baf839
RO
80#define KEYLENGTH (8*sizeof(t_key))
81#define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
82#define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
83
84static DEFINE_RWLOCK(fib_lock);
85
86typedef unsigned int t_key;
87
88#define T_TNODE 0
89#define T_LEAF 1
90#define NODE_TYPE_MASK 0x1UL
91b9a277
OJ
91#define NODE_PARENT(node) \
92 ((struct tnode *)((node)->parent & ~NODE_TYPE_MASK))
93#define NODE_SET_PARENT(node, ptr) \
94 ((node)->parent = (((unsigned long)(ptr)) | \
95 ((node)->parent & NODE_TYPE_MASK)))
96#define NODE_INIT_PARENT(node, type) \
97 ((node)->parent = (type))
98#define NODE_TYPE(node) \
99 ((node)->parent & NODE_TYPE_MASK)
100
101#define IS_TNODE(n) (!(n->parent & T_LEAF))
102#define IS_LEAF(n) (n->parent & T_LEAF)
19baf839
RO
103
104struct node {
91b9a277
OJ
105 t_key key;
106 unsigned long parent;
19baf839
RO
107};
108
109struct leaf {
91b9a277
OJ
110 t_key key;
111 unsigned long parent;
19baf839
RO
112 struct hlist_head list;
113};
114
115struct leaf_info {
116 struct hlist_node hlist;
117 int plen;
118 struct list_head falh;
119};
120
121struct tnode {
91b9a277
OJ
122 t_key key;
123 unsigned long parent;
124 unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */
125 unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */
126 unsigned short full_children; /* KEYLENGTH bits needed */
127 unsigned short empty_children; /* KEYLENGTH bits needed */
128 struct node *child[0];
19baf839
RO
129};
130
131#ifdef CONFIG_IP_FIB_TRIE_STATS
132struct trie_use_stats {
133 unsigned int gets;
134 unsigned int backtrack;
135 unsigned int semantic_match_passed;
136 unsigned int semantic_match_miss;
137 unsigned int null_node_hit;
2f36895a 138 unsigned int resize_node_skipped;
19baf839
RO
139};
140#endif
141
142struct trie_stat {
143 unsigned int totdepth;
144 unsigned int maxdepth;
145 unsigned int tnodes;
146 unsigned int leaves;
147 unsigned int nullpointers;
148 unsigned int nodesizes[MAX_CHILDS];
c877efb2 149};
19baf839
RO
150
151struct trie {
91b9a277 152 struct node *trie;
19baf839
RO
153#ifdef CONFIG_IP_FIB_TRIE_STATS
154 struct trie_use_stats stats;
155#endif
91b9a277 156 int size;
19baf839
RO
157 unsigned int revision;
158};
159
160static int trie_debug = 0;
161
91b9a277
OJ
162#define DBG(x...) do { if (trie_debug) printk(x); } while (0)
163
19baf839
RO
164static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
165static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
19baf839 166static struct node *resize(struct trie *t, struct tnode *tn);
2f80b3c8
RO
167static struct tnode *inflate(struct trie *t, struct tnode *tn);
168static struct tnode *halve(struct trie *t, struct tnode *tn);
19baf839
RO
169static void tnode_free(struct tnode *tn);
170static void trie_dump_seq(struct seq_file *seq, struct trie *t);
171extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
172extern int fib_detect_death(struct fib_info *fi, int order,
91b9a277 173 struct fib_info **last_resort, int *last_idx, int *dflt);
19baf839
RO
174
175extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, int z, int tb_id,
91b9a277 176 struct nlmsghdr *n, struct netlink_skb_parms *req);
19baf839
RO
177
178static kmem_cache_t *fn_alias_kmem;
179static struct trie *trie_local = NULL, *trie_main = NULL;
180
c877efb2 181static inline struct node *tnode_get_child(struct tnode *tn, int i)
19baf839 182{
91b9a277 183 BUG_ON(i >= 1 << tn->bits);
19baf839 184
91b9a277 185 return tn->child[i];
19baf839
RO
186}
187
bb435b8d 188static inline int tnode_child_length(const struct tnode *tn)
19baf839 189{
91b9a277 190 return 1 << tn->bits;
19baf839
RO
191}
192
19baf839
RO
193static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
194{
91b9a277 195 if (offset < KEYLENGTH)
19baf839 196 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
91b9a277 197 else
19baf839
RO
198 return 0;
199}
200
201static inline int tkey_equals(t_key a, t_key b)
202{
c877efb2 203 return a == b;
19baf839
RO
204}
205
206static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
207{
c877efb2
SH
208 if (bits == 0 || offset >= KEYLENGTH)
209 return 1;
91b9a277
OJ
210 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
211 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
c877efb2 212}
19baf839
RO
213
214static inline int tkey_mismatch(t_key a, int offset, t_key b)
215{
216 t_key diff = a ^ b;
217 int i = offset;
218
c877efb2
SH
219 if (!diff)
220 return 0;
221 while ((diff << i) >> (KEYLENGTH-1) == 0)
19baf839
RO
222 i++;
223 return i;
224}
225
91b9a277 226/* Candidate for fib_semantics */
19baf839
RO
227
228static void fn_free_alias(struct fib_alias *fa)
229{
230 fib_release_info(fa->fa_info);
231 kmem_cache_free(fn_alias_kmem, fa);
232}
233
234/*
235 To understand this stuff, an understanding of keys and all their bits is
236 necessary. Every node in the trie has a key associated with it, but not
237 all of the bits in that key are significant.
238
239 Consider a node 'n' and its parent 'tp'.
240
241 If n is a leaf, every bit in its key is significant. Its presence is
242 necessitaded by path compression, since during a tree traversal (when
243 searching for a leaf - unless we are doing an insertion) we will completely
244 ignore all skipped bits we encounter. Thus we need to verify, at the end of
245 a potentially successful search, that we have indeed been walking the
246 correct key path.
247
248 Note that we can never "miss" the correct key in the tree if present by
249 following the wrong path. Path compression ensures that segments of the key
250 that are the same for all keys with a given prefix are skipped, but the
251 skipped part *is* identical for each node in the subtrie below the skipped
252 bit! trie_insert() in this implementation takes care of that - note the
253 call to tkey_sub_equals() in trie_insert().
254
255 if n is an internal node - a 'tnode' here, the various parts of its key
256 have many different meanings.
257
258 Example:
259 _________________________________________________________________
260 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
261 -----------------------------------------------------------------
262 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
263
264 _________________________________________________________________
265 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
266 -----------------------------------------------------------------
267 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
268
269 tp->pos = 7
270 tp->bits = 3
271 n->pos = 15
91b9a277 272 n->bits = 4
19baf839
RO
273
274 First, let's just ignore the bits that come before the parent tp, that is
275 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
276 not use them for anything.
277
278 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
279 index into the parent's child array. That is, they will be used to find
280 'n' among tp's children.
281
282 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
283 for the node n.
284
285 All the bits we have seen so far are significant to the node n. The rest
286 of the bits are really not needed or indeed known in n->key.
287
288 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
289 n's child array, and will of course be different for each child.
290
c877efb2 291
19baf839
RO
292 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
293 at this point.
294
295*/
296
297static void check_tnode(struct tnode *tn)
298{
c877efb2 299 if (tn && tn->pos+tn->bits > 32) {
19baf839
RO
300 printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits);
301 }
302}
303
304static int halve_threshold = 25;
305static int inflate_threshold = 50;
306
307static struct leaf *leaf_new(void)
308{
309 struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
c877efb2 310 if (l) {
19baf839
RO
311 NODE_INIT_PARENT(l, T_LEAF);
312 INIT_HLIST_HEAD(&l->list);
313 }
314 return l;
315}
316
317static struct leaf_info *leaf_info_new(int plen)
318{
319 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
91b9a277
OJ
320
321 if (!li)
322 return NULL;
323
324 li->plen = plen;
325 INIT_LIST_HEAD(&li->falh);
326
19baf839
RO
327 return li;
328}
329
330static inline void free_leaf(struct leaf *l)
331{
332 kfree(l);
333}
334
335static inline void free_leaf_info(struct leaf_info *li)
336{
337 kfree(li);
338}
339
f0e36f8c
PM
340static struct tnode *tnode_alloc(unsigned int size)
341{
342 if (size <= PAGE_SIZE) {
343 return kmalloc(size, GFP_KERNEL);
344 } else {
345 return (struct tnode *)
c877efb2 346 __get_free_pages(GFP_KERNEL, get_order(size));
f0e36f8c
PM
347 }
348}
349
350static void __tnode_free(struct tnode *tn)
351{
352 unsigned int size = sizeof(struct tnode) +
91b9a277 353 (1 << tn->bits) * sizeof(struct node *);
f0e36f8c
PM
354
355 if (size <= PAGE_SIZE)
356 kfree(tn);
357 else
358 free_pages((unsigned long)tn, get_order(size));
359}
360
19baf839
RO
361static struct tnode* tnode_new(t_key key, int pos, int bits)
362{
363 int nchildren = 1<<bits;
364 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
f0e36f8c 365 struct tnode *tn = tnode_alloc(sz);
19baf839 366
91b9a277 367 if (tn) {
19baf839
RO
368 memset(tn, 0, sz);
369 NODE_INIT_PARENT(tn, T_TNODE);
370 tn->pos = pos;
371 tn->bits = bits;
372 tn->key = key;
373 tn->full_children = 0;
374 tn->empty_children = 1<<bits;
375 }
c877efb2 376
91b9a277
OJ
377 DBG("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
378 (unsigned int) (sizeof(struct node) * 1<<bits));
19baf839
RO
379 return tn;
380}
381
382static void tnode_free(struct tnode *tn)
383{
91b9a277
OJ
384 BUG_ON(!tn);
385
c877efb2 386 if (IS_LEAF(tn)) {
19baf839 387 free_leaf((struct leaf *)tn);
91b9a277
OJ
388 DBG("FL %p \n", tn);
389 } else {
f0e36f8c 390 __tnode_free(tn);
91b9a277 391 DBG("FT %p \n", tn);
19baf839
RO
392 }
393}
394
395/*
396 * Check whether a tnode 'n' is "full", i.e. it is an internal node
397 * and no bits are skipped. See discussion in dyntree paper p. 6
398 */
399
bb435b8d 400static inline int tnode_full(const struct tnode *tn, const struct node *n)
19baf839 401{
c877efb2 402 if (n == NULL || IS_LEAF(n))
19baf839
RO
403 return 0;
404
405 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
406}
407
c877efb2 408static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
19baf839
RO
409{
410 tnode_put_child_reorg(tn, i, n, -1);
411}
412
c877efb2 413 /*
19baf839
RO
414 * Add a child at position i overwriting the old value.
415 * Update the value of full_children and empty_children.
416 */
417
c877efb2 418static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
19baf839
RO
419{
420 struct node *chi;
421 int isfull;
422
c877efb2 423 if (i >= 1<<tn->bits) {
19baf839 424 printk("bits=%d, i=%d\n", tn->bits, i);
91b9a277 425 BUG();
19baf839
RO
426 }
427 write_lock_bh(&fib_lock);
c877efb2 428 chi = tn->child[i];
19baf839
RO
429
430 /* update emptyChildren */
431 if (n == NULL && chi != NULL)
432 tn->empty_children++;
433 else if (n != NULL && chi == NULL)
434 tn->empty_children--;
c877efb2 435
19baf839 436 /* update fullChildren */
91b9a277 437 if (wasfull == -1)
19baf839
RO
438 wasfull = tnode_full(tn, chi);
439
440 isfull = tnode_full(tn, n);
c877efb2 441 if (wasfull && !isfull)
19baf839 442 tn->full_children--;
c877efb2 443 else if (!wasfull && isfull)
19baf839 444 tn->full_children++;
91b9a277 445
c877efb2
SH
446 if (n)
447 NODE_SET_PARENT(n, tn);
19baf839
RO
448
449 tn->child[i] = n;
450 write_unlock_bh(&fib_lock);
451}
452
c877efb2 453static struct node *resize(struct trie *t, struct tnode *tn)
19baf839
RO
454{
455 int i;
2f36895a 456 int err = 0;
2f80b3c8 457 struct tnode *old_tn;
19baf839
RO
458
459 if (!tn)
460 return NULL;
461
91b9a277
OJ
462 DBG("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
463 tn, inflate_threshold, halve_threshold);
19baf839
RO
464
465 /* No children */
466 if (tn->empty_children == tnode_child_length(tn)) {
467 tnode_free(tn);
468 return NULL;
469 }
470 /* One child */
471 if (tn->empty_children == tnode_child_length(tn) - 1)
472 for (i = 0; i < tnode_child_length(tn); i++) {
91b9a277 473 struct node *n;
19baf839
RO
474
475 write_lock_bh(&fib_lock);
91b9a277
OJ
476 n = tn->child[i];
477 if (!n) {
19baf839 478 write_unlock_bh(&fib_lock);
91b9a277 479 continue;
19baf839 480 }
91b9a277
OJ
481
482 /* compress one level */
483 NODE_INIT_PARENT(n, NODE_TYPE(n));
484
19baf839 485 write_unlock_bh(&fib_lock);
91b9a277
OJ
486 tnode_free(tn);
487 return n;
19baf839 488 }
c877efb2 489 /*
19baf839
RO
490 * Double as long as the resulting node has a number of
491 * nonempty nodes that are above the threshold.
492 */
493
494 /*
c877efb2
SH
495 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
496 * the Helsinki University of Technology and Matti Tikkanen of Nokia
19baf839 497 * Telecommunications, page 6:
c877efb2 498 * "A node is doubled if the ratio of non-empty children to all
19baf839
RO
499 * children in the *doubled* node is at least 'high'."
500 *
c877efb2
SH
501 * 'high' in this instance is the variable 'inflate_threshold'. It
502 * is expressed as a percentage, so we multiply it with
503 * tnode_child_length() and instead of multiplying by 2 (since the
504 * child array will be doubled by inflate()) and multiplying
505 * the left-hand side by 100 (to handle the percentage thing) we
19baf839 506 * multiply the left-hand side by 50.
c877efb2
SH
507 *
508 * The left-hand side may look a bit weird: tnode_child_length(tn)
509 * - tn->empty_children is of course the number of non-null children
510 * in the current node. tn->full_children is the number of "full"
19baf839 511 * children, that is non-null tnodes with a skip value of 0.
c877efb2 512 * All of those will be doubled in the resulting inflated tnode, so
19baf839 513 * we just count them one extra time here.
c877efb2 514 *
19baf839 515 * A clearer way to write this would be:
c877efb2 516 *
19baf839 517 * to_be_doubled = tn->full_children;
c877efb2 518 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
19baf839
RO
519 * tn->full_children;
520 *
521 * new_child_length = tnode_child_length(tn) * 2;
522 *
c877efb2 523 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
19baf839
RO
524 * new_child_length;
525 * if (new_fill_factor >= inflate_threshold)
c877efb2
SH
526 *
527 * ...and so on, tho it would mess up the while () loop.
528 *
19baf839
RO
529 * anyway,
530 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
531 * inflate_threshold
c877efb2 532 *
19baf839
RO
533 * avoid a division:
534 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
535 * inflate_threshold * new_child_length
c877efb2 536 *
19baf839 537 * expand not_to_be_doubled and to_be_doubled, and shorten:
c877efb2 538 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 539 * tn->full_children) >= inflate_threshold * new_child_length
c877efb2 540 *
19baf839 541 * expand new_child_length:
c877efb2 542 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 543 * tn->full_children) >=
19baf839 544 * inflate_threshold * tnode_child_length(tn) * 2
c877efb2 545 *
19baf839 546 * shorten again:
c877efb2 547 * 50 * (tn->full_children + tnode_child_length(tn) -
91b9a277 548 * tn->empty_children) >= inflate_threshold *
19baf839 549 * tnode_child_length(tn)
c877efb2 550 *
19baf839
RO
551 */
552
553 check_tnode(tn);
c877efb2 554
2f36895a 555 err = 0;
19baf839
RO
556 while ((tn->full_children > 0 &&
557 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
558 inflate_threshold * tnode_child_length(tn))) {
559
2f80b3c8
RO
560 old_tn = tn;
561 tn = inflate(t, tn);
562 if (IS_ERR(tn)) {
563 tn = old_tn;
2f36895a
RO
564#ifdef CONFIG_IP_FIB_TRIE_STATS
565 t->stats.resize_node_skipped++;
566#endif
567 break;
568 }
19baf839
RO
569 }
570
571 check_tnode(tn);
572
573 /*
574 * Halve as long as the number of empty children in this
575 * node is above threshold.
576 */
2f36895a
RO
577
578 err = 0;
19baf839
RO
579 while (tn->bits > 1 &&
580 100 * (tnode_child_length(tn) - tn->empty_children) <
2f36895a
RO
581 halve_threshold * tnode_child_length(tn)) {
582
2f80b3c8
RO
583 old_tn = tn;
584 tn = halve(t, tn);
585 if (IS_ERR(tn)) {
586 tn = old_tn;
2f36895a
RO
587#ifdef CONFIG_IP_FIB_TRIE_STATS
588 t->stats.resize_node_skipped++;
589#endif
590 break;
591 }
592 }
19baf839 593
c877efb2 594
19baf839
RO
595 /* Only one child remains */
596
597 if (tn->empty_children == tnode_child_length(tn) - 1)
598 for (i = 0; i < tnode_child_length(tn); i++) {
91b9a277 599 struct node *n;
19baf839 600
91b9a277 601 write_lock_bh(&fib_lock);
19baf839 602
91b9a277
OJ
603 n = tn->child[i];
604 if (!n) {
19baf839 605 write_unlock_bh(&fib_lock);
91b9a277 606 continue;
19baf839 607 }
91b9a277
OJ
608
609 /* compress one level */
610
611 NODE_INIT_PARENT(n, NODE_TYPE(n));
612
19baf839 613 write_unlock_bh(&fib_lock);
91b9a277
OJ
614 tnode_free(tn);
615 return n;
19baf839
RO
616 }
617
618 return (struct node *) tn;
619}
620
2f80b3c8 621static struct tnode *inflate(struct trie *t, struct tnode *tn)
19baf839
RO
622{
623 struct tnode *inode;
624 struct tnode *oldtnode = tn;
625 int olen = tnode_child_length(tn);
626 int i;
627
91b9a277 628 DBG("In inflate\n");
19baf839
RO
629
630 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
631
2f80b3c8
RO
632 if (!tn)
633 return ERR_PTR(-ENOMEM);
2f36895a
RO
634
635 /*
c877efb2
SH
636 * Preallocate and store tnodes before the actual work so we
637 * don't get into an inconsistent state if memory allocation
638 * fails. In case of failure we return the oldnode and inflate
2f36895a
RO
639 * of tnode is ignored.
640 */
91b9a277
OJ
641
642 for (i = 0; i < olen; i++) {
2f36895a
RO
643 struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
644
645 if (inode &&
646 IS_TNODE(inode) &&
647 inode->pos == oldtnode->pos + oldtnode->bits &&
648 inode->bits > 1) {
649 struct tnode *left, *right;
2f36895a 650 t_key m = TKEY_GET_MASK(inode->pos, 1);
c877efb2 651
2f36895a
RO
652 left = tnode_new(inode->key&(~m), inode->pos + 1,
653 inode->bits - 1);
2f80b3c8
RO
654 if (!left)
655 goto nomem;
91b9a277 656
2f36895a
RO
657 right = tnode_new(inode->key|m, inode->pos + 1,
658 inode->bits - 1);
659
2f80b3c8
RO
660 if (!right) {
661 tnode_free(left);
662 goto nomem;
663 }
2f36895a
RO
664
665 put_child(t, tn, 2*i, (struct node *) left);
666 put_child(t, tn, 2*i+1, (struct node *) right);
667 }
668 }
669
91b9a277 670 for (i = 0; i < olen; i++) {
19baf839 671 struct node *node = tnode_get_child(oldtnode, i);
91b9a277
OJ
672 struct tnode *left, *right;
673 int size, j;
c877efb2 674
19baf839
RO
675 /* An empty child */
676 if (node == NULL)
677 continue;
678
679 /* A leaf or an internal node with skipped bits */
680
c877efb2 681 if (IS_LEAF(node) || ((struct tnode *) node)->pos >
19baf839 682 tn->pos + tn->bits - 1) {
c877efb2 683 if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
19baf839
RO
684 1) == 0)
685 put_child(t, tn, 2*i, node);
686 else
687 put_child(t, tn, 2*i+1, node);
688 continue;
689 }
690
691 /* An internal node with two children */
692 inode = (struct tnode *) node;
693
694 if (inode->bits == 1) {
695 put_child(t, tn, 2*i, inode->child[0]);
696 put_child(t, tn, 2*i+1, inode->child[1]);
697
698 tnode_free(inode);
91b9a277 699 continue;
19baf839
RO
700 }
701
91b9a277
OJ
702 /* An internal node with more than two children */
703
704 /* We will replace this node 'inode' with two new
705 * ones, 'left' and 'right', each with half of the
706 * original children. The two new nodes will have
707 * a position one bit further down the key and this
708 * means that the "significant" part of their keys
709 * (see the discussion near the top of this file)
710 * will differ by one bit, which will be "0" in
711 * left's key and "1" in right's key. Since we are
712 * moving the key position by one step, the bit that
713 * we are moving away from - the bit at position
714 * (inode->pos) - is the one that will differ between
715 * left and right. So... we synthesize that bit in the
716 * two new keys.
717 * The mask 'm' below will be a single "one" bit at
718 * the position (inode->pos)
719 */
19baf839 720
91b9a277
OJ
721 /* Use the old key, but set the new significant
722 * bit to zero.
723 */
2f36895a 724
91b9a277
OJ
725 left = (struct tnode *) tnode_get_child(tn, 2*i);
726 put_child(t, tn, 2*i, NULL);
2f36895a 727
91b9a277 728 BUG_ON(!left);
2f36895a 729
91b9a277
OJ
730 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
731 put_child(t, tn, 2*i+1, NULL);
19baf839 732
91b9a277 733 BUG_ON(!right);
19baf839 734
91b9a277
OJ
735 size = tnode_child_length(left);
736 for (j = 0; j < size; j++) {
737 put_child(t, left, j, inode->child[j]);
738 put_child(t, right, j, inode->child[j + size]);
19baf839 739 }
91b9a277
OJ
740 put_child(t, tn, 2*i, resize(t, left));
741 put_child(t, tn, 2*i+1, resize(t, right));
742
743 tnode_free(inode);
19baf839
RO
744 }
745 tnode_free(oldtnode);
746 return tn;
2f80b3c8
RO
747nomem:
748 {
749 int size = tnode_child_length(tn);
750 int j;
751
752 for(j = 0; j < size; j++)
753 if (tn->child[j])
754 tnode_free((struct tnode *)tn->child[j]);
755
756 tnode_free(tn);
757
758 return ERR_PTR(-ENOMEM);
759 }
19baf839
RO
760}
761
2f80b3c8 762static struct tnode *halve(struct trie *t, struct tnode *tn)
19baf839
RO
763{
764 struct tnode *oldtnode = tn;
765 struct node *left, *right;
766 int i;
767 int olen = tnode_child_length(tn);
768
91b9a277 769 DBG("In halve\n");
c877efb2
SH
770
771 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
19baf839 772
2f80b3c8
RO
773 if (!tn)
774 return ERR_PTR(-ENOMEM);
2f36895a
RO
775
776 /*
c877efb2
SH
777 * Preallocate and store tnodes before the actual work so we
778 * don't get into an inconsistent state if memory allocation
779 * fails. In case of failure we return the oldnode and halve
2f36895a
RO
780 * of tnode is ignored.
781 */
782
91b9a277 783 for (i = 0; i < olen; i += 2) {
2f36895a
RO
784 left = tnode_get_child(oldtnode, i);
785 right = tnode_get_child(oldtnode, i+1);
c877efb2 786
2f36895a 787 /* Two nonempty children */
c877efb2 788 if (left && right) {
2f80b3c8
RO
789 struct tnode *newn;
790
791 newn = tnode_new(left->key, tn->pos + tn->bits, 1);
792
793 if (!newn)
794 goto nomem;
795
796 put_child(t, tn, i/2, (struct node *)newn);
2f36895a 797 }
2f36895a 798
2f36895a 799 }
19baf839 800
91b9a277
OJ
801 for (i = 0; i < olen; i += 2) {
802 struct tnode *newBinNode;
803
19baf839
RO
804 left = tnode_get_child(oldtnode, i);
805 right = tnode_get_child(oldtnode, i+1);
c877efb2 806
19baf839
RO
807 /* At least one of the children is empty */
808 if (left == NULL) {
809 if (right == NULL) /* Both are empty */
810 continue;
811 put_child(t, tn, i/2, right);
91b9a277
OJ
812 continue;
813 }
814
815 if (right == NULL) {
19baf839 816 put_child(t, tn, i/2, left);
91b9a277
OJ
817 continue;
818 }
c877efb2 819
19baf839 820 /* Two nonempty children */
91b9a277
OJ
821 newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
822 put_child(t, tn, i/2, NULL);
19baf839 823
91b9a277 824 BUG_ON(!newBinNode);
19baf839 825
91b9a277
OJ
826 put_child(t, newBinNode, 0, left);
827 put_child(t, newBinNode, 1, right);
828 put_child(t, tn, i/2, resize(t, newBinNode));
19baf839
RO
829 }
830 tnode_free(oldtnode);
831 return tn;
2f80b3c8
RO
832nomem:
833 {
834 int size = tnode_child_length(tn);
835 int j;
836
837 for(j = 0; j < size; j++)
838 if (tn->child[j])
839 tnode_free((struct tnode *)tn->child[j]);
840
841 tnode_free(tn);
842
843 return ERR_PTR(-ENOMEM);
844 }
19baf839
RO
845}
846
91b9a277 847static void trie_init(struct trie *t)
19baf839 848{
91b9a277
OJ
849 if (!t)
850 return;
851
852 t->size = 0;
853 t->trie = NULL;
854 t->revision = 0;
19baf839 855#ifdef CONFIG_IP_FIB_TRIE_STATS
91b9a277 856 memset(&t->stats, 0, sizeof(struct trie_use_stats));
19baf839 857#endif
19baf839
RO
858}
859
860static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
861{
862 struct hlist_node *node;
863 struct leaf_info *li;
864
91b9a277 865 hlist_for_each_entry(li, node, head, hlist)
c877efb2 866 if (li->plen == plen)
19baf839 867 return li;
91b9a277 868
19baf839
RO
869 return NULL;
870}
871
872static inline struct list_head * get_fa_head(struct leaf *l, int plen)
873{
19baf839 874 struct leaf_info *li = find_leaf_info(&l->list, plen);
c877efb2 875
91b9a277
OJ
876 if (!li)
877 return NULL;
c877efb2 878
91b9a277 879 return &li->falh;
19baf839
RO
880}
881
882static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
883{
c877efb2 884 struct leaf_info *li = NULL, *last = NULL;
91b9a277 885 struct hlist_node *node;
19baf839
RO
886
887 write_lock_bh(&fib_lock);
c877efb2 888
91b9a277 889 if (hlist_empty(head)) {
19baf839 890 hlist_add_head(&new->hlist, head);
91b9a277
OJ
891 } else {
892 hlist_for_each_entry(li, node, head, hlist) {
c877efb2 893 if (new->plen > li->plen)
19baf839 894 break;
91b9a277 895
19baf839
RO
896 last = li;
897 }
c877efb2 898 if (last)
19baf839 899 hlist_add_after(&last->hlist, &new->hlist);
c877efb2 900 else
19baf839
RO
901 hlist_add_before(&new->hlist, &li->hlist);
902 }
903 write_unlock_bh(&fib_lock);
904}
905
906static struct leaf *
907fib_find_node(struct trie *t, u32 key)
908{
909 int pos;
910 struct tnode *tn;
911 struct node *n;
912
913 pos = 0;
c877efb2 914 n = t->trie;
19baf839
RO
915
916 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
917 tn = (struct tnode *) n;
91b9a277 918
19baf839 919 check_tnode(tn);
91b9a277 920
c877efb2 921 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
91b9a277 922 pos = tn->pos + tn->bits;
19baf839 923 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
91b9a277 924 } else
19baf839
RO
925 break;
926 }
927 /* Case we have found a leaf. Compare prefixes */
928
91b9a277
OJ
929 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
930 return (struct leaf *)n;
931
19baf839
RO
932 return NULL;
933}
934
935static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
936{
91b9a277 937 int i;
19baf839
RO
938 int wasfull;
939 t_key cindex, key;
940 struct tnode *tp = NULL;
941
91b9a277 942 BUG_ON(!tn);
c877efb2 943
19baf839
RO
944 key = tn->key;
945 i = 0;
946
947 while (tn != NULL && NODE_PARENT(tn) != NULL) {
c877efb2 948 if (i > 10) {
19baf839 949 printk("Rebalance tn=%p \n", tn);
91b9a277
OJ
950 if (tn)
951 printk("tn->parent=%p \n", NODE_PARENT(tn));
952
19baf839 953 printk("Rebalance tp=%p \n", tp);
91b9a277
OJ
954 if (tp)
955 printk("tp->parent=%p \n", NODE_PARENT(tp));
19baf839
RO
956 }
957
91b9a277 958 BUG_ON(i > 12); /* Why is this a bug? -ojn */
19baf839
RO
959 i++;
960
961 tp = NODE_PARENT(tn);
962 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
963 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
964 tn = (struct tnode *) resize (t, (struct tnode *)tn);
965 tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
91b9a277 966
c877efb2 967 if (!NODE_PARENT(tn))
19baf839
RO
968 break;
969
970 tn = NODE_PARENT(tn);
971 }
972 /* Handle last (top) tnode */
c877efb2 973 if (IS_TNODE(tn))
19baf839
RO
974 tn = (struct tnode*) resize(t, (struct tnode *)tn);
975
976 return (struct node*) tn;
977}
978
f835e471
RO
979static struct list_head *
980fib_insert_node(struct trie *t, int *err, u32 key, int plen)
19baf839
RO
981{
982 int pos, newpos;
983 struct tnode *tp = NULL, *tn = NULL;
984 struct node *n;
985 struct leaf *l;
986 int missbit;
c877efb2 987 struct list_head *fa_head = NULL;
19baf839
RO
988 struct leaf_info *li;
989 t_key cindex;
990
991 pos = 0;
c877efb2 992 n = t->trie;
19baf839 993
c877efb2
SH
994 /* If we point to NULL, stop. Either the tree is empty and we should
995 * just put a new leaf in if, or we have reached an empty child slot,
19baf839 996 * and we should just put our new leaf in that.
c877efb2
SH
997 * If we point to a T_TNODE, check if it matches our key. Note that
998 * a T_TNODE might be skipping any number of bits - its 'pos' need
19baf839
RO
999 * not be the parent's 'pos'+'bits'!
1000 *
c877efb2 1001 * If it does match the current key, get pos/bits from it, extract
19baf839
RO
1002 * the index from our key, push the T_TNODE and walk the tree.
1003 *
1004 * If it doesn't, we have to replace it with a new T_TNODE.
1005 *
c877efb2
SH
1006 * If we point to a T_LEAF, it might or might not have the same key
1007 * as we do. If it does, just change the value, update the T_LEAF's
1008 * value, and return it.
19baf839
RO
1009 * If it doesn't, we need to replace it with a T_TNODE.
1010 */
1011
1012 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
1013 tn = (struct tnode *) n;
91b9a277 1014
c877efb2 1015 check_tnode(tn);
91b9a277 1016
c877efb2 1017 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
19baf839 1018 tp = tn;
91b9a277 1019 pos = tn->pos + tn->bits;
19baf839
RO
1020 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1021
c877efb2 1022 if (n && NODE_PARENT(n) != tn) {
19baf839
RO
1023 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
1024 BUG();
1025 }
91b9a277 1026 } else
19baf839
RO
1027 break;
1028 }
1029
1030 /*
1031 * n ----> NULL, LEAF or TNODE
1032 *
c877efb2 1033 * tp is n's (parent) ----> NULL or TNODE
19baf839
RO
1034 */
1035
91b9a277 1036 BUG_ON(tp && IS_LEAF(tp));
19baf839
RO
1037
1038 /* Case 1: n is a leaf. Compare prefixes */
1039
c877efb2 1040 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
91b9a277
OJ
1041 struct leaf *l = (struct leaf *) n;
1042
19baf839 1043 li = leaf_info_new(plen);
91b9a277 1044
c877efb2 1045 if (!li) {
f835e471
RO
1046 *err = -ENOMEM;
1047 goto err;
1048 }
19baf839
RO
1049
1050 fa_head = &li->falh;
1051 insert_leaf_info(&l->list, li);
1052 goto done;
1053 }
1054 t->size++;
1055 l = leaf_new();
1056
c877efb2 1057 if (!l) {
f835e471
RO
1058 *err = -ENOMEM;
1059 goto err;
1060 }
19baf839
RO
1061
1062 l->key = key;
1063 li = leaf_info_new(plen);
1064
c877efb2 1065 if (!li) {
f835e471
RO
1066 tnode_free((struct tnode *) l);
1067 *err = -ENOMEM;
1068 goto err;
1069 }
19baf839
RO
1070
1071 fa_head = &li->falh;
1072 insert_leaf_info(&l->list, li);
1073
19baf839 1074 if (t->trie && n == NULL) {
91b9a277 1075 /* Case 2: n is NULL, and will just insert a new leaf */
19baf839
RO
1076
1077 NODE_SET_PARENT(l, tp);
19baf839 1078
91b9a277
OJ
1079 BUG_ON(!tp);
1080
1081 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1082 put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1083 } else {
1084 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
c877efb2
SH
1085 /*
1086 * Add a new tnode here
19baf839
RO
1087 * first tnode need some special handling
1088 */
1089
1090 if (tp)
91b9a277 1091 pos = tp->pos+tp->bits;
19baf839 1092 else
91b9a277
OJ
1093 pos = 0;
1094
c877efb2 1095 if (n) {
19baf839
RO
1096 newpos = tkey_mismatch(key, pos, n->key);
1097 tn = tnode_new(n->key, newpos, 1);
91b9a277 1098 } else {
19baf839 1099 newpos = 0;
c877efb2 1100 tn = tnode_new(key, newpos, 1); /* First tnode */
19baf839 1101 }
19baf839 1102
c877efb2 1103 if (!tn) {
f835e471
RO
1104 free_leaf_info(li);
1105 tnode_free((struct tnode *) l);
1106 *err = -ENOMEM;
1107 goto err;
91b9a277
OJ
1108 }
1109
19baf839
RO
1110 NODE_SET_PARENT(tn, tp);
1111
91b9a277 1112 missbit = tkey_extract_bits(key, newpos, 1);
19baf839
RO
1113 put_child(t, tn, missbit, (struct node *)l);
1114 put_child(t, tn, 1-missbit, n);
1115
c877efb2 1116 if (tp) {
19baf839
RO
1117 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1118 put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
91b9a277 1119 } else {
19baf839
RO
1120 t->trie = (struct node*) tn; /* First tnode */
1121 tp = tn;
1122 }
1123 }
91b9a277
OJ
1124
1125 if (tp && tp->pos + tp->bits > 32)
c877efb2 1126 printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
19baf839 1127 tp, tp->pos, tp->bits, key, plen);
91b9a277 1128
19baf839
RO
1129 /* Rebalance the trie */
1130 t->trie = trie_rebalance(t, tp);
f835e471
RO
1131done:
1132 t->revision++;
91b9a277 1133err:
19baf839
RO
1134 return fa_head;
1135}
1136
1137static int
1138fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1139 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1140{
1141 struct trie *t = (struct trie *) tb->tb_data;
1142 struct fib_alias *fa, *new_fa;
c877efb2 1143 struct list_head *fa_head = NULL;
19baf839
RO
1144 struct fib_info *fi;
1145 int plen = r->rtm_dst_len;
1146 int type = r->rtm_type;
1147 u8 tos = r->rtm_tos;
1148 u32 key, mask;
1149 int err;
1150 struct leaf *l;
1151
1152 if (plen > 32)
1153 return -EINVAL;
1154
1155 key = 0;
c877efb2 1156 if (rta->rta_dst)
19baf839
RO
1157 memcpy(&key, rta->rta_dst, 4);
1158
1159 key = ntohl(key);
1160
91b9a277 1161 DBG("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
19baf839 1162
91b9a277 1163 mask = ntohl(inet_make_mask(plen));
19baf839 1164
c877efb2 1165 if (key & ~mask)
19baf839
RO
1166 return -EINVAL;
1167
1168 key = key & mask;
1169
91b9a277
OJ
1170 fi = fib_create_info(r, rta, nlhdr, &err);
1171
1172 if (!fi)
19baf839
RO
1173 goto err;
1174
1175 l = fib_find_node(t, key);
c877efb2 1176 fa = NULL;
19baf839 1177
c877efb2 1178 if (l) {
19baf839
RO
1179 fa_head = get_fa_head(l, plen);
1180 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1181 }
1182
1183 /* Now fa, if non-NULL, points to the first fib alias
1184 * with the same keys [prefix,tos,priority], if such key already
1185 * exists or to the node before which we will insert new one.
1186 *
1187 * If fa is NULL, we will need to allocate a new one and
1188 * insert to the head of f.
1189 *
1190 * If f is NULL, no fib node matched the destination key
1191 * and we need to allocate a new one of those as well.
1192 */
1193
91b9a277 1194 if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
19baf839
RO
1195 struct fib_alias *fa_orig;
1196
1197 err = -EEXIST;
1198 if (nlhdr->nlmsg_flags & NLM_F_EXCL)
1199 goto out;
1200
1201 if (nlhdr->nlmsg_flags & NLM_F_REPLACE) {
1202 struct fib_info *fi_drop;
1203 u8 state;
1204
1205 write_lock_bh(&fib_lock);
1206
1207 fi_drop = fa->fa_info;
1208 fa->fa_info = fi;
1209 fa->fa_type = type;
1210 fa->fa_scope = r->rtm_scope;
1211 state = fa->fa_state;
1212 fa->fa_state &= ~FA_S_ACCESSED;
1213
1214 write_unlock_bh(&fib_lock);
1215
1216 fib_release_info(fi_drop);
1217 if (state & FA_S_ACCESSED)
91b9a277 1218 rt_cache_flush(-1);
19baf839 1219
91b9a277 1220 goto succeeded;
19baf839
RO
1221 }
1222 /* Error if we find a perfect match which
1223 * uses the same scope, type, and nexthop
1224 * information.
1225 */
1226 fa_orig = fa;
1227 list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1228 if (fa->fa_tos != tos)
1229 break;
1230 if (fa->fa_info->fib_priority != fi->fib_priority)
1231 break;
1232 if (fa->fa_type == type &&
1233 fa->fa_scope == r->rtm_scope &&
1234 fa->fa_info == fi) {
1235 goto out;
1236 }
1237 }
1238 if (!(nlhdr->nlmsg_flags & NLM_F_APPEND))
1239 fa = fa_orig;
1240 }
1241 err = -ENOENT;
91b9a277 1242 if (!(nlhdr->nlmsg_flags & NLM_F_CREATE))
19baf839
RO
1243 goto out;
1244
1245 err = -ENOBUFS;
1246 new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1247 if (new_fa == NULL)
1248 goto out;
1249
1250 new_fa->fa_info = fi;
1251 new_fa->fa_tos = tos;
1252 new_fa->fa_type = type;
1253 new_fa->fa_scope = r->rtm_scope;
1254 new_fa->fa_state = 0;
19baf839
RO
1255 /*
1256 * Insert new entry to the list.
1257 */
1258
c877efb2 1259 if (!fa_head) {
f835e471
RO
1260 fa_head = fib_insert_node(t, &err, key, plen);
1261 err = 0;
c877efb2 1262 if (err)
f835e471
RO
1263 goto out_free_new_fa;
1264 }
19baf839
RO
1265
1266 write_lock_bh(&fib_lock);
1267
91b9a277 1268 list_add_tail(&new_fa->fa_list, (fa ? &fa->fa_list : fa_head));
19baf839
RO
1269
1270 write_unlock_bh(&fib_lock);
1271
1272 rt_cache_flush(-1);
1273 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
1274succeeded:
1275 return 0;
f835e471
RO
1276
1277out_free_new_fa:
1278 kmem_cache_free(fn_alias_kmem, new_fa);
19baf839
RO
1279out:
1280 fib_release_info(fi);
91b9a277 1281err:
19baf839
RO
1282 return err;
1283}
1284
c877efb2 1285static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *plen, const struct flowi *flp,
06c74270 1286 struct fib_result *res)
19baf839 1287{
06c74270 1288 int err, i;
19baf839
RO
1289 t_key mask;
1290 struct leaf_info *li;
1291 struct hlist_head *hhead = &l->list;
1292 struct hlist_node *node;
c877efb2 1293
19baf839 1294 hlist_for_each_entry(li, node, hhead, hlist) {
19baf839
RO
1295 i = li->plen;
1296 mask = ntohl(inet_make_mask(i));
c877efb2 1297 if (l->key != (key & mask))
19baf839
RO
1298 continue;
1299
06c74270 1300 if ((err = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) <= 0) {
19baf839
RO
1301 *plen = i;
1302#ifdef CONFIG_IP_FIB_TRIE_STATS
1303 t->stats.semantic_match_passed++;
1304#endif
06c74270 1305 return err;
19baf839
RO
1306 }
1307#ifdef CONFIG_IP_FIB_TRIE_STATS
1308 t->stats.semantic_match_miss++;
1309#endif
1310 }
06c74270 1311 return 1;
19baf839
RO
1312}
1313
1314static int
1315fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1316{
1317 struct trie *t = (struct trie *) tb->tb_data;
1318 int plen, ret = 0;
1319 struct node *n;
1320 struct tnode *pn;
1321 int pos, bits;
91b9a277 1322 t_key key = ntohl(flp->fl4_dst);
19baf839
RO
1323 int chopped_off;
1324 t_key cindex = 0;
1325 int current_prefix_length = KEYLENGTH;
91b9a277
OJ
1326 struct tnode *cn;
1327 t_key node_prefix, key_prefix, pref_mismatch;
1328 int mp;
1329
19baf839
RO
1330 n = t->trie;
1331
1332 read_lock(&fib_lock);
91b9a277 1333
c877efb2 1334 if (!n)
19baf839
RO
1335 goto failed;
1336
1337#ifdef CONFIG_IP_FIB_TRIE_STATS
1338 t->stats.gets++;
1339#endif
1340
1341 /* Just a leaf? */
1342 if (IS_LEAF(n)) {
06c74270 1343 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
19baf839
RO
1344 goto found;
1345 goto failed;
1346 }
1347 pn = (struct tnode *) n;
1348 chopped_off = 0;
c877efb2 1349
91b9a277 1350 while (pn) {
19baf839
RO
1351 pos = pn->pos;
1352 bits = pn->bits;
1353
c877efb2 1354 if (!chopped_off)
19baf839
RO
1355 cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1356
1357 n = tnode_get_child(pn, cindex);
1358
1359 if (n == NULL) {
1360#ifdef CONFIG_IP_FIB_TRIE_STATS
1361 t->stats.null_node_hit++;
1362#endif
1363 goto backtrace;
1364 }
1365
91b9a277
OJ
1366 if (IS_LEAF(n)) {
1367 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1368 goto found;
1369 else
1370 goto backtrace;
1371 }
1372
19baf839
RO
1373#define HL_OPTIMIZE
1374#ifdef HL_OPTIMIZE
91b9a277 1375 cn = (struct tnode *)n;
19baf839 1376
91b9a277
OJ
1377 /*
1378 * It's a tnode, and we can do some extra checks here if we
1379 * like, to avoid descending into a dead-end branch.
1380 * This tnode is in the parent's child array at index
1381 * key[p_pos..p_pos+p_bits] but potentially with some bits
1382 * chopped off, so in reality the index may be just a
1383 * subprefix, padded with zero at the end.
1384 * We can also take a look at any skipped bits in this
1385 * tnode - everything up to p_pos is supposed to be ok,
1386 * and the non-chopped bits of the index (se previous
1387 * paragraph) are also guaranteed ok, but the rest is
1388 * considered unknown.
1389 *
1390 * The skipped bits are key[pos+bits..cn->pos].
1391 */
19baf839 1392
91b9a277
OJ
1393 /* If current_prefix_length < pos+bits, we are already doing
1394 * actual prefix matching, which means everything from
1395 * pos+(bits-chopped_off) onward must be zero along some
1396 * branch of this subtree - otherwise there is *no* valid
1397 * prefix present. Here we can only check the skipped
1398 * bits. Remember, since we have already indexed into the
1399 * parent's child array, we know that the bits we chopped of
1400 * *are* zero.
1401 */
19baf839 1402
91b9a277 1403 /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
19baf839 1404
91b9a277
OJ
1405 if (current_prefix_length < pos+bits) {
1406 if (tkey_extract_bits(cn->key, current_prefix_length,
1407 cn->pos - current_prefix_length) != 0 ||
1408 !(cn->child[0]))
1409 goto backtrace;
1410 }
19baf839 1411
91b9a277
OJ
1412 /*
1413 * If chopped_off=0, the index is fully validated and we
1414 * only need to look at the skipped bits for this, the new,
1415 * tnode. What we actually want to do is to find out if
1416 * these skipped bits match our key perfectly, or if we will
1417 * have to count on finding a matching prefix further down,
1418 * because if we do, we would like to have some way of
1419 * verifying the existence of such a prefix at this point.
1420 */
19baf839 1421
91b9a277
OJ
1422 /* The only thing we can do at this point is to verify that
1423 * any such matching prefix can indeed be a prefix to our
1424 * key, and if the bits in the node we are inspecting that
1425 * do not match our key are not ZERO, this cannot be true.
1426 * Thus, find out where there is a mismatch (before cn->pos)
1427 * and verify that all the mismatching bits are zero in the
1428 * new tnode's key.
1429 */
19baf839 1430
91b9a277
OJ
1431 /* Note: We aren't very concerned about the piece of the key
1432 * that precede pn->pos+pn->bits, since these have already been
1433 * checked. The bits after cn->pos aren't checked since these are
1434 * by definition "unknown" at this point. Thus, what we want to
1435 * see is if we are about to enter the "prefix matching" state,
1436 * and in that case verify that the skipped bits that will prevail
1437 * throughout this subtree are zero, as they have to be if we are
1438 * to find a matching prefix.
1439 */
1440
1441 node_prefix = MASK_PFX(cn->key, cn->pos);
1442 key_prefix = MASK_PFX(key, cn->pos);
1443 pref_mismatch = key_prefix^node_prefix;
1444 mp = 0;
1445
1446 /* In short: If skipped bits in this node do not match the search
1447 * key, enter the "prefix matching" state.directly.
1448 */
1449 if (pref_mismatch) {
1450 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1451 mp++;
1452 pref_mismatch = pref_mismatch <<1;
1453 }
1454 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1455
1456 if (key_prefix != 0)
1457 goto backtrace;
1458
1459 if (current_prefix_length >= cn->pos)
1460 current_prefix_length = mp;
c877efb2 1461 }
91b9a277
OJ
1462#endif
1463 pn = (struct tnode *)n; /* Descend */
1464 chopped_off = 0;
1465 continue;
1466
19baf839
RO
1467backtrace:
1468 chopped_off++;
1469
1470 /* As zero don't change the child key (cindex) */
91b9a277 1471 while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
19baf839 1472 chopped_off++;
19baf839
RO
1473
1474 /* Decrease current_... with bits chopped off */
1475 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1476 current_prefix_length = pn->pos + pn->bits - chopped_off;
91b9a277 1477
19baf839 1478 /*
c877efb2 1479 * Either we do the actual chop off according or if we have
19baf839
RO
1480 * chopped off all bits in this tnode walk up to our parent.
1481 */
1482
91b9a277 1483 if (chopped_off <= pn->bits) {
19baf839 1484 cindex &= ~(1 << (chopped_off-1));
91b9a277 1485 } else {
c877efb2 1486 if (NODE_PARENT(pn) == NULL)
19baf839 1487 goto failed;
91b9a277 1488
19baf839
RO
1489 /* Get Child's index */
1490 cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
1491 pn = NODE_PARENT(pn);
1492 chopped_off = 0;
1493
1494#ifdef CONFIG_IP_FIB_TRIE_STATS
1495 t->stats.backtrack++;
1496#endif
1497 goto backtrace;
c877efb2 1498 }
19baf839
RO
1499 }
1500failed:
c877efb2 1501 ret = 1;
19baf839
RO
1502found:
1503 read_unlock(&fib_lock);
1504 return ret;
1505}
1506
1507static int trie_leaf_remove(struct trie *t, t_key key)
1508{
1509 t_key cindex;
1510 struct tnode *tp = NULL;
1511 struct node *n = t->trie;
1512 struct leaf *l;
1513
91b9a277 1514 DBG("entering trie_leaf_remove(%p)\n", n);
19baf839
RO
1515
1516 /* Note that in the case skipped bits, those bits are *not* checked!
c877efb2 1517 * When we finish this, we will have NULL or a T_LEAF, and the
19baf839
RO
1518 * T_LEAF may or may not match our key.
1519 */
1520
91b9a277 1521 while (n != NULL && IS_TNODE(n)) {
19baf839
RO
1522 struct tnode *tn = (struct tnode *) n;
1523 check_tnode(tn);
1524 n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
1525
91b9a277
OJ
1526 if (n && NODE_PARENT(n) != tn) {
1527 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
1528 BUG();
1529 }
1530 }
19baf839
RO
1531 l = (struct leaf *) n;
1532
c877efb2 1533 if (!n || !tkey_equals(l->key, key))
19baf839 1534 return 0;
c877efb2
SH
1535
1536 /*
1537 * Key found.
1538 * Remove the leaf and rebalance the tree
19baf839
RO
1539 */
1540
1541 t->revision++;
1542 t->size--;
1543
1544 tp = NODE_PARENT(n);
1545 tnode_free((struct tnode *) n);
1546
c877efb2 1547 if (tp) {
19baf839
RO
1548 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1549 put_child(t, (struct tnode *)tp, cindex, NULL);
1550 t->trie = trie_rebalance(t, tp);
91b9a277 1551 } else
19baf839
RO
1552 t->trie = NULL;
1553
1554 return 1;
1555}
1556
1557static int
1558fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
91b9a277 1559 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
19baf839
RO
1560{
1561 struct trie *t = (struct trie *) tb->tb_data;
1562 u32 key, mask;
1563 int plen = r->rtm_dst_len;
1564 u8 tos = r->rtm_tos;
1565 struct fib_alias *fa, *fa_to_delete;
1566 struct list_head *fa_head;
1567 struct leaf *l;
91b9a277
OJ
1568 int kill_li = 0;
1569 struct leaf_info *li;
1570
19baf839 1571
c877efb2 1572 if (plen > 32)
19baf839
RO
1573 return -EINVAL;
1574
1575 key = 0;
c877efb2 1576 if (rta->rta_dst)
19baf839
RO
1577 memcpy(&key, rta->rta_dst, 4);
1578
1579 key = ntohl(key);
91b9a277 1580 mask = ntohl(inet_make_mask(plen));
19baf839 1581
c877efb2 1582 if (key & ~mask)
19baf839
RO
1583 return -EINVAL;
1584
1585 key = key & mask;
1586 l = fib_find_node(t, key);
1587
c877efb2 1588 if (!l)
19baf839
RO
1589 return -ESRCH;
1590
1591 fa_head = get_fa_head(l, plen);
1592 fa = fib_find_alias(fa_head, tos, 0);
1593
1594 if (!fa)
1595 return -ESRCH;
1596
91b9a277 1597 DBG("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
19baf839
RO
1598
1599 fa_to_delete = NULL;
1600 fa_head = fa->fa_list.prev;
1601 list_for_each_entry(fa, fa_head, fa_list) {
1602 struct fib_info *fi = fa->fa_info;
1603
1604 if (fa->fa_tos != tos)
1605 break;
1606
1607 if ((!r->rtm_type ||
1608 fa->fa_type == r->rtm_type) &&
1609 (r->rtm_scope == RT_SCOPE_NOWHERE ||
1610 fa->fa_scope == r->rtm_scope) &&
1611 (!r->rtm_protocol ||
1612 fi->fib_protocol == r->rtm_protocol) &&
1613 fib_nh_match(r, nlhdr, rta, fi) == 0) {
1614 fa_to_delete = fa;
1615 break;
1616 }
1617 }
1618
91b9a277
OJ
1619 if (!fa_to_delete)
1620 return -ESRCH;
19baf839 1621
91b9a277
OJ
1622 fa = fa_to_delete;
1623 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
1624
1625 l = fib_find_node(t, key);
1626 li = find_leaf_info(&l->list, plen);
19baf839 1627
91b9a277 1628 write_lock_bh(&fib_lock);
19baf839 1629
91b9a277 1630 list_del(&fa->fa_list);
19baf839 1631
91b9a277
OJ
1632 if (list_empty(fa_head)) {
1633 hlist_del(&li->hlist);
1634 kill_li = 1;
1635 }
1636 write_unlock_bh(&fib_lock);
19baf839 1637
91b9a277
OJ
1638 if (kill_li)
1639 free_leaf_info(li);
19baf839 1640
91b9a277
OJ
1641 if (hlist_empty(&l->list))
1642 trie_leaf_remove(t, key);
19baf839 1643
91b9a277
OJ
1644 if (fa->fa_state & FA_S_ACCESSED)
1645 rt_cache_flush(-1);
19baf839 1646
91b9a277
OJ
1647 fn_free_alias(fa);
1648 return 0;
19baf839
RO
1649}
1650
1651static int trie_flush_list(struct trie *t, struct list_head *head)
1652{
1653 struct fib_alias *fa, *fa_node;
1654 int found = 0;
1655
1656 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1657 struct fib_info *fi = fa->fa_info;
19baf839 1658
91b9a277 1659 if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
c877efb2 1660 write_lock_bh(&fib_lock);
19baf839 1661 list_del(&fa->fa_list);
c877efb2 1662 write_unlock_bh(&fib_lock);
19baf839
RO
1663
1664 fn_free_alias(fa);
1665 found++;
1666 }
1667 }
1668 return found;
1669}
1670
1671static int trie_flush_leaf(struct trie *t, struct leaf *l)
1672{
1673 int found = 0;
1674 struct hlist_head *lih = &l->list;
1675 struct hlist_node *node, *tmp;
1676 struct leaf_info *li = NULL;
1677
1678 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
19baf839
RO
1679 found += trie_flush_list(t, &li->falh);
1680
1681 if (list_empty(&li->falh)) {
c877efb2 1682 write_lock_bh(&fib_lock);
19baf839 1683 hlist_del(&li->hlist);
c877efb2 1684 write_unlock_bh(&fib_lock);
19baf839
RO
1685
1686 free_leaf_info(li);
1687 }
1688 }
1689 return found;
1690}
1691
1692static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1693{
1694 struct node *c = (struct node *) thisleaf;
1695 struct tnode *p;
1696 int idx;
1697
c877efb2
SH
1698 if (c == NULL) {
1699 if (t->trie == NULL)
19baf839
RO
1700 return NULL;
1701
1702 if (IS_LEAF(t->trie)) /* trie w. just a leaf */
1703 return (struct leaf *) t->trie;
1704
1705 p = (struct tnode*) t->trie; /* Start */
91b9a277 1706 } else
19baf839 1707 p = (struct tnode *) NODE_PARENT(c);
c877efb2 1708
19baf839
RO
1709 while (p) {
1710 int pos, last;
1711
1712 /* Find the next child of the parent */
c877efb2
SH
1713 if (c)
1714 pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1715 else
19baf839
RO
1716 pos = 0;
1717
1718 last = 1 << p->bits;
91b9a277
OJ
1719 for (idx = pos; idx < last ; idx++) {
1720 if (!p->child[idx])
1721 continue;
1722
1723 /* Decend if tnode */
1724 while (IS_TNODE(p->child[idx])) {
1725 p = (struct tnode*) p->child[idx];
1726 idx = 0;
1727
1728 /* Rightmost non-NULL branch */
1729 if (p && IS_TNODE(p))
1730 while (p->child[idx] == NULL && idx < (1 << p->bits)) idx++;
1731
1732 /* Done with this tnode? */
1733 if (idx >= (1 << p->bits) || p->child[idx] == NULL)
1734 goto up;
19baf839 1735 }
91b9a277 1736 return (struct leaf*) p->child[idx];
19baf839
RO
1737 }
1738up:
1739 /* No more children go up one step */
91b9a277 1740 c = (struct node *) p;
19baf839
RO
1741 p = (struct tnode *) NODE_PARENT(p);
1742 }
1743 return NULL; /* Ready. Root of trie */
1744}
1745
1746static int fn_trie_flush(struct fib_table *tb)
1747{
1748 struct trie *t = (struct trie *) tb->tb_data;
1749 struct leaf *ll = NULL, *l = NULL;
1750 int found = 0, h;
1751
1752 t->revision++;
1753
91b9a277 1754 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
19baf839
RO
1755 found += trie_flush_leaf(t, l);
1756
1757 if (ll && hlist_empty(&ll->list))
1758 trie_leaf_remove(t, ll->key);
1759 ll = l;
1760 }
1761
1762 if (ll && hlist_empty(&ll->list))
1763 trie_leaf_remove(t, ll->key);
1764
91b9a277 1765 DBG("trie_flush found=%d\n", found);
19baf839
RO
1766 return found;
1767}
1768
91b9a277 1769static int trie_last_dflt = -1;
19baf839
RO
1770
1771static void
1772fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1773{
1774 struct trie *t = (struct trie *) tb->tb_data;
1775 int order, last_idx;
1776 struct fib_info *fi = NULL;
1777 struct fib_info *last_resort;
1778 struct fib_alias *fa = NULL;
1779 struct list_head *fa_head;
1780 struct leaf *l;
1781
1782 last_idx = -1;
1783 last_resort = NULL;
1784 order = -1;
1785
1786 read_lock(&fib_lock);
c877efb2 1787
19baf839 1788 l = fib_find_node(t, 0);
c877efb2 1789 if (!l)
19baf839
RO
1790 goto out;
1791
1792 fa_head = get_fa_head(l, 0);
c877efb2 1793 if (!fa_head)
19baf839
RO
1794 goto out;
1795
c877efb2 1796 if (list_empty(fa_head))
19baf839
RO
1797 goto out;
1798
1799 list_for_each_entry(fa, fa_head, fa_list) {
1800 struct fib_info *next_fi = fa->fa_info;
91b9a277 1801
19baf839
RO
1802 if (fa->fa_scope != res->scope ||
1803 fa->fa_type != RTN_UNICAST)
1804 continue;
91b9a277 1805
19baf839
RO
1806 if (next_fi->fib_priority > res->fi->fib_priority)
1807 break;
1808 if (!next_fi->fib_nh[0].nh_gw ||
1809 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1810 continue;
1811 fa->fa_state |= FA_S_ACCESSED;
91b9a277 1812
19baf839
RO
1813 if (fi == NULL) {
1814 if (next_fi != res->fi)
1815 break;
1816 } else if (!fib_detect_death(fi, order, &last_resort,
1817 &last_idx, &trie_last_dflt)) {
1818 if (res->fi)
1819 fib_info_put(res->fi);
1820 res->fi = fi;
1821 atomic_inc(&fi->fib_clntref);
1822 trie_last_dflt = order;
1823 goto out;
1824 }
1825 fi = next_fi;
1826 order++;
1827 }
1828 if (order <= 0 || fi == NULL) {
1829 trie_last_dflt = -1;
1830 goto out;
1831 }
1832
1833 if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
1834 if (res->fi)
1835 fib_info_put(res->fi);
1836 res->fi = fi;
1837 atomic_inc(&fi->fib_clntref);
1838 trie_last_dflt = order;
1839 goto out;
1840 }
1841 if (last_idx >= 0) {
1842 if (res->fi)
1843 fib_info_put(res->fi);
1844 res->fi = last_resort;
1845 if (last_resort)
1846 atomic_inc(&last_resort->fib_clntref);
1847 }
1848 trie_last_dflt = last_idx;
1849 out:;
c877efb2 1850 read_unlock(&fib_lock);
19baf839
RO
1851}
1852
c877efb2 1853static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
19baf839
RO
1854 struct sk_buff *skb, struct netlink_callback *cb)
1855{
1856 int i, s_i;
1857 struct fib_alias *fa;
1858
91b9a277 1859 u32 xkey = htonl(key);
19baf839 1860
91b9a277 1861 s_i = cb->args[3];
19baf839
RO
1862 i = 0;
1863
1864 list_for_each_entry(fa, fah, fa_list) {
1865 if (i < s_i) {
1866 i++;
1867 continue;
1868 }
1869 if (fa->fa_info->fib_nh == NULL) {
1870 printk("Trie error _fib_nh=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1871 i++;
1872 continue;
1873 }
1874 if (fa->fa_info == NULL) {
1875 printk("Trie error fa_info=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1876 i++;
1877 continue;
1878 }
1879
1880 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1881 cb->nlh->nlmsg_seq,
1882 RTM_NEWROUTE,
1883 tb->tb_id,
1884 fa->fa_type,
1885 fa->fa_scope,
1886 &xkey,
1887 plen,
1888 fa->fa_tos,
90f66914 1889 fa->fa_info, 0) < 0) {
19baf839
RO
1890 cb->args[3] = i;
1891 return -1;
91b9a277 1892 }
19baf839
RO
1893 i++;
1894 }
91b9a277 1895 cb->args[3] = i;
19baf839
RO
1896 return skb->len;
1897}
1898
c877efb2 1899static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
19baf839
RO
1900 struct netlink_callback *cb)
1901{
1902 int h, s_h;
1903 struct list_head *fa_head;
1904 struct leaf *l = NULL;
19baf839 1905
91b9a277 1906 s_h = cb->args[2];
19baf839 1907
91b9a277 1908 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
19baf839
RO
1909 if (h < s_h)
1910 continue;
1911 if (h > s_h)
1912 memset(&cb->args[3], 0,
1913 sizeof(cb->args) - 3*sizeof(cb->args[0]));
1914
1915 fa_head = get_fa_head(l, plen);
91b9a277 1916
c877efb2 1917 if (!fa_head)
19baf839
RO
1918 continue;
1919
c877efb2 1920 if (list_empty(fa_head))
19baf839
RO
1921 continue;
1922
1923 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
91b9a277 1924 cb->args[2] = h;
19baf839
RO
1925 return -1;
1926 }
1927 }
91b9a277 1928 cb->args[2] = h;
19baf839
RO
1929 return skb->len;
1930}
1931
1932static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1933{
1934 int m, s_m;
1935 struct trie *t = (struct trie *) tb->tb_data;
1936
1937 s_m = cb->args[1];
1938
1939 read_lock(&fib_lock);
91b9a277 1940 for (m = 0; m <= 32; m++) {
19baf839
RO
1941 if (m < s_m)
1942 continue;
1943 if (m > s_m)
1944 memset(&cb->args[2], 0,
91b9a277 1945 sizeof(cb->args) - 2*sizeof(cb->args[0]));
19baf839
RO
1946
1947 if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
1948 cb->args[1] = m;
1949 goto out;
1950 }
1951 }
1952 read_unlock(&fib_lock);
1953 cb->args[1] = m;
1954 return skb->len;
91b9a277 1955out:
19baf839
RO
1956 read_unlock(&fib_lock);
1957 return -1;
1958}
1959
1960/* Fix more generic FIB names for init later */
1961
1962#ifdef CONFIG_IP_MULTIPLE_TABLES
1963struct fib_table * fib_hash_init(int id)
1964#else
1965struct fib_table * __init fib_hash_init(int id)
1966#endif
1967{
1968 struct fib_table *tb;
1969 struct trie *t;
1970
1971 if (fn_alias_kmem == NULL)
1972 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1973 sizeof(struct fib_alias),
1974 0, SLAB_HWCACHE_ALIGN,
1975 NULL, NULL);
1976
1977 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1978 GFP_KERNEL);
1979 if (tb == NULL)
1980 return NULL;
1981
1982 tb->tb_id = id;
1983 tb->tb_lookup = fn_trie_lookup;
1984 tb->tb_insert = fn_trie_insert;
1985 tb->tb_delete = fn_trie_delete;
1986 tb->tb_flush = fn_trie_flush;
1987 tb->tb_select_default = fn_trie_select_default;
1988 tb->tb_dump = fn_trie_dump;
1989 memset(tb->tb_data, 0, sizeof(struct trie));
1990
1991 t = (struct trie *) tb->tb_data;
1992
1993 trie_init(t);
1994
c877efb2 1995 if (id == RT_TABLE_LOCAL)
91b9a277 1996 trie_local = t;
c877efb2 1997 else if (id == RT_TABLE_MAIN)
91b9a277 1998 trie_main = t;
19baf839
RO
1999
2000 if (id == RT_TABLE_LOCAL)
2001 printk("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2002
2003 return tb;
2004}
2005
2006/* Trie dump functions */
2007
2008static void putspace_seq(struct seq_file *seq, int n)
2009{
91b9a277
OJ
2010 while (n--)
2011 seq_printf(seq, " ");
19baf839
RO
2012}
2013
2014static void printbin_seq(struct seq_file *seq, unsigned int v, int bits)
2015{
2016 while (bits--)
2017 seq_printf(seq, "%s", (v & (1<<bits))?"1":"0");
2018}
2019
c877efb2 2020static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
19baf839
RO
2021 int pend, int cindex, int bits)
2022{
2023 putspace_seq(seq, indent);
2024 if (IS_LEAF(n))
2025 seq_printf(seq, "|");
2026 else
2027 seq_printf(seq, "+");
2028 if (bits) {
2029 seq_printf(seq, "%d/", cindex);
2030 printbin_seq(seq, cindex, bits);
2031 seq_printf(seq, ": ");
91b9a277 2032 } else
19baf839
RO
2033 seq_printf(seq, "<root>: ");
2034 seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n);
2035
19baf839 2036 if (IS_LEAF(n)) {
91b9a277 2037 struct leaf *l = (struct leaf *)n;
19baf839
RO
2038 struct fib_alias *fa;
2039 int i;
91b9a277
OJ
2040
2041 seq_printf(seq, "key=%d.%d.%d.%d\n",
2042 n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256);
2043
2044 for (i = 32; i >= 0; i--)
2045 if (find_leaf_info(&l->list, i)) {
19baf839 2046 struct list_head *fa_head = get_fa_head(l, i);
91b9a277 2047
c877efb2 2048 if (!fa_head)
19baf839
RO
2049 continue;
2050
c877efb2 2051 if (list_empty(fa_head))
19baf839
RO
2052 continue;
2053
2054 putspace_seq(seq, indent+2);
2055 seq_printf(seq, "{/%d...dumping}\n", i);
2056
19baf839
RO
2057 list_for_each_entry(fa, fa_head, fa_list) {
2058 putspace_seq(seq, indent+2);
19baf839
RO
2059 if (fa->fa_info == NULL) {
2060 seq_printf(seq, "Error fa_info=NULL\n");
2061 continue;
2062 }
91b9a277
OJ
2063 if (fa->fa_info->fib_nh == NULL) {
2064 seq_printf(seq, "Error _fib_nh=NULL\n");
2065 continue;
2066 }
19baf839
RO
2067
2068 seq_printf(seq, "{type=%d scope=%d TOS=%d}\n",
2069 fa->fa_type,
2070 fa->fa_scope,
2071 fa->fa_tos);
2072 }
2073 }
91b9a277 2074 } else {
c877efb2 2075 struct tnode *tn = (struct tnode *)n;
91b9a277
OJ
2076 int plen = ((struct tnode *)n)->pos;
2077 t_key prf = MASK_PFX(n->key, plen);
2078
2079 seq_printf(seq, "key=%d.%d.%d.%d/%d\n",
2080 prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen);
2081
19baf839 2082 putspace_seq(seq, indent); seq_printf(seq, "| ");
91b9a277 2083 seq_printf(seq, "{key prefix=%08x/", tn->key & TKEY_GET_MASK(0, tn->pos));
19baf839
RO
2084 printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos);
2085 seq_printf(seq, "}\n");
2086 putspace_seq(seq, indent); seq_printf(seq, "| ");
2087 seq_printf(seq, "{pos=%d", tn->pos);
2088 seq_printf(seq, " (skip=%d bits)", tn->pos - pend);
2089 seq_printf(seq, " bits=%d (%u children)}\n", tn->bits, (1 << tn->bits));
2090 putspace_seq(seq, indent); seq_printf(seq, "| ");
2091 seq_printf(seq, "{empty=%d full=%d}\n", tn->empty_children, tn->full_children);
2092 }
2093}
2094
2095static void trie_dump_seq(struct seq_file *seq, struct trie *t)
2096{
c877efb2 2097 struct node *n = t->trie;
91b9a277
OJ
2098 int cindex = 0;
2099 int indent = 1;
2100 int pend = 0;
19baf839 2101 int depth = 0;
91b9a277 2102 struct tnode *tn;
19baf839
RO
2103
2104 read_lock(&fib_lock);
2105
2106 seq_printf(seq, "------ trie_dump of t=%p ------\n", t);
19baf839 2107
91b9a277
OJ
2108 if (!n) {
2109 seq_printf(seq, "------ trie is empty\n");
2110
2111 read_unlock(&fib_lock);
2112 return;
2113 }
2114
2115 printnode_seq(seq, indent, n, pend, cindex, 0);
2116
2117 if (!IS_TNODE(n)) {
2118 read_unlock(&fib_lock);
2119 return;
2120 }
2121
2122 tn = (struct tnode *)n;
2123 pend = tn->pos+tn->bits;
2124 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2125 indent += 3;
2126 depth++;
2127
2128 while (tn && cindex < (1 << tn->bits)) {
2129 if (tn->child[cindex]) {
2130 /* Got a child */
2131
2132 printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits);
2133 if (IS_LEAF(tn->child[cindex])) {
2134 cindex++;
2135 } else {
19baf839 2136 /*
91b9a277 2137 * New tnode. Decend one level
19baf839 2138 */
19baf839 2139
91b9a277
OJ
2140 depth++;
2141 tn = (struct tnode *)tn->child[cindex];
2142 pend = tn->pos + tn->bits;
2143 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2144 indent += 3;
2145 cindex = 0;
19baf839 2146 }
91b9a277
OJ
2147 } else
2148 cindex++;
2149
2150 /*
2151 * Test if we are done
2152 */
2153
2154 while (cindex >= (1 << tn->bits)) {
2155 /*
2156 * Move upwards and test for root
2157 * pop off all traversed nodes
2158 */
2159
2160 if (NODE_PARENT(tn) == NULL) {
2161 tn = NULL;
2162 break;
2163 }
2164
2165 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2166 cindex++;
2167 tn = NODE_PARENT(tn);
2168 pend = tn->pos + tn->bits;
2169 indent -= 3;
2170 depth--;
19baf839 2171 }
19baf839 2172 }
19baf839
RO
2173
2174 read_unlock(&fib_lock);
2175}
2176
2177static struct trie_stat *trie_stat_new(void)
2178{
91b9a277 2179 struct trie_stat *s;
19baf839 2180 int i;
c877efb2 2181
91b9a277
OJ
2182 s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL);
2183 if (!s)
2184 return NULL;
2185
2186 s->totdepth = 0;
2187 s->maxdepth = 0;
2188 s->tnodes = 0;
2189 s->leaves = 0;
2190 s->nullpointers = 0;
2191
2192 for (i = 0; i < MAX_CHILDS; i++)
2193 s->nodesizes[i] = 0;
2194
19baf839 2195 return s;
c877efb2 2196}
19baf839
RO
2197
2198static struct trie_stat *trie_collect_stats(struct trie *t)
2199{
c877efb2 2200 struct node *n = t->trie;
19baf839
RO
2201 struct trie_stat *s = trie_stat_new();
2202 int cindex = 0;
19baf839
RO
2203 int pend = 0;
2204 int depth = 0;
2205
91b9a277
OJ
2206 if (!s)
2207 return NULL;
2208 if (!n)
2209 return s;
19baf839 2210
91b9a277 2211 read_lock(&fib_lock);
19baf839 2212
91b9a277
OJ
2213 if (IS_TNODE(n)) {
2214 struct tnode *tn = (struct tnode *)n;
2215 pend = tn->pos+tn->bits;
2216 s->nodesizes[tn->bits]++;
2217 depth++;
2218
2219 while (tn && cindex < (1 << tn->bits)) {
2220 if (tn->child[cindex]) {
2221 /* Got a child */
19baf839 2222
91b9a277
OJ
2223 if (IS_LEAF(tn->child[cindex])) {
2224 cindex++;
2225
2226 /* stats */
2227 if (depth > s->maxdepth)
2228 s->maxdepth = depth;
2229 s->totdepth += depth;
2230 s->leaves++;
2231 } else {
19baf839 2232 /*
91b9a277 2233 * New tnode. Decend one level
19baf839 2234 */
91b9a277
OJ
2235
2236 s->tnodes++;
2237 s->nodesizes[tn->bits]++;
2238 depth++;
2239
2240 n = tn->child[cindex];
2241 tn = (struct tnode *)n;
2242 pend = tn->pos+tn->bits;
2243
2244 cindex = 0;
19baf839 2245 }
91b9a277
OJ
2246 } else {
2247 cindex++;
2248 s->nullpointers++;
19baf839 2249 }
91b9a277
OJ
2250
2251 /*
2252 * Test if we are done
2253 */
2254
2255 while (cindex >= (1 << tn->bits)) {
2256 /*
2257 * Move upwards and test for root
2258 * pop off all traversed nodes
2259 */
2260
2261 if (NODE_PARENT(tn) == NULL) {
2262 tn = NULL;
2263 n = NULL;
2264 break;
2265 }
2266
2267 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2268 tn = NODE_PARENT(tn);
2269 cindex++;
2270 n = (struct node *)tn;
2271 pend = tn->pos+tn->bits;
2272 depth--;
2273 }
19baf839
RO
2274 }
2275 }
2276
91b9a277 2277 read_unlock(&fib_lock);
19baf839
RO
2278 return s;
2279}
2280
2281#ifdef CONFIG_PROC_FS
2282
2283static struct fib_alias *fib_triestat_get_first(struct seq_file *seq)
2284{
2285 return NULL;
2286}
2287
2288static struct fib_alias *fib_triestat_get_next(struct seq_file *seq)
2289{
2290 return NULL;
2291}
2292
2293static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos)
2294{
91b9a277
OJ
2295 if (!ip_fib_main_table)
2296 return NULL;
19baf839 2297
91b9a277
OJ
2298 if (*pos)
2299 return fib_triestat_get_next(seq);
2300 else
2301 return SEQ_START_TOKEN;
19baf839
RO
2302}
2303
2304static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2305{
2306 ++*pos;
91b9a277
OJ
2307 if (v == SEQ_START_TOKEN)
2308 return fib_triestat_get_first(seq);
2309 else
2310 return fib_triestat_get_next(seq);
19baf839
RO
2311}
2312
2313static void fib_triestat_seq_stop(struct seq_file *seq, void *v)
2314{
2315
2316}
2317
c877efb2 2318/*
19baf839
RO
2319 * This outputs /proc/net/fib_triestats
2320 *
2321 * It always works in backward compatibility mode.
2322 * The format of the file is not supposed to be changed.
2323 */
2324
2325static void collect_and_show(struct trie *t, struct seq_file *seq)
2326{
2327 int bytes = 0; /* How many bytes are used, a ref is 4 bytes */
2328 int i, max, pointers;
91b9a277 2329 struct trie_stat *stat;
19baf839
RO
2330 int avdepth;
2331
2332 stat = trie_collect_stats(t);
2333
91b9a277 2334 bytes = 0;
19baf839
RO
2335 seq_printf(seq, "trie=%p\n", t);
2336
2337 if (stat) {
2338 if (stat->leaves)
91b9a277 2339 avdepth = stat->totdepth*100 / stat->leaves;
19baf839 2340 else
91b9a277
OJ
2341 avdepth = 0;
2342 seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100);
19baf839 2343 seq_printf(seq, "Max depth: %4d\n", stat->maxdepth);
91b9a277 2344
19baf839
RO
2345 seq_printf(seq, "Leaves: %d\n", stat->leaves);
2346 bytes += sizeof(struct leaf) * stat->leaves;
2347 seq_printf(seq, "Internal nodes: %d\n", stat->tnodes);
2348 bytes += sizeof(struct tnode) * stat->tnodes;
2349
2350 max = MAX_CHILDS-1;
2351
2352 while (max >= 0 && stat->nodesizes[max] == 0)
2353 max--;
2354 pointers = 0;
2355
c877efb2 2356 for (i = 1; i <= max; i++)
19baf839
RO
2357 if (stat->nodesizes[i] != 0) {
2358 seq_printf(seq, " %d: %d", i, stat->nodesizes[i]);
2359 pointers += (1<<i) * stat->nodesizes[i];
2360 }
2361 seq_printf(seq, "\n");
2362 seq_printf(seq, "Pointers: %d\n", pointers);
2363 bytes += sizeof(struct node *) * pointers;
2364 seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2365 seq_printf(seq, "Total size: %d kB\n", bytes / 1024);
2366
2367 kfree(stat);
2368 }
2369
2370#ifdef CONFIG_IP_FIB_TRIE_STATS
2371 seq_printf(seq, "Counters:\n---------\n");
2372 seq_printf(seq,"gets = %d\n", t->stats.gets);
2373 seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
2374 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2375 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2376 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2f36895a 2377 seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
19baf839
RO
2378#ifdef CLEAR_STATS
2379 memset(&(t->stats), 0, sizeof(t->stats));
2380#endif
2381#endif /* CONFIG_IP_FIB_TRIE_STATS */
2382}
2383
2384static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2385{
2386 char bf[128];
c877efb2 2387
19baf839 2388 if (v == SEQ_START_TOKEN) {
c877efb2 2389 seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
19baf839 2390 sizeof(struct leaf), sizeof(struct tnode));
c877efb2 2391 if (trie_local)
19baf839
RO
2392 collect_and_show(trie_local, seq);
2393
c877efb2 2394 if (trie_main)
19baf839 2395 collect_and_show(trie_main, seq);
91b9a277
OJ
2396 } else {
2397 snprintf(bf, sizeof(bf), "*\t%08X\t%08X", 200, 400);
2398
19baf839
RO
2399 seq_printf(seq, "%-127s\n", bf);
2400 }
2401 return 0;
2402}
2403
2404static struct seq_operations fib_triestat_seq_ops = {
c877efb2
SH
2405 .start = fib_triestat_seq_start,
2406 .next = fib_triestat_seq_next,
2407 .stop = fib_triestat_seq_stop,
2408 .show = fib_triestat_seq_show,
19baf839
RO
2409};
2410
2411static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2412{
2413 struct seq_file *seq;
2414 int rc = -ENOMEM;
2415
2416 rc = seq_open(file, &fib_triestat_seq_ops);
2417 if (rc)
2418 goto out_kfree;
2419
c877efb2 2420 seq = file->private_data;
19baf839
RO
2421out:
2422 return rc;
2423out_kfree:
2424 goto out;
2425}
2426
2427static struct file_operations fib_triestat_seq_fops = {
c877efb2
SH
2428 .owner = THIS_MODULE,
2429 .open = fib_triestat_seq_open,
2430 .read = seq_read,
2431 .llseek = seq_lseek,
2432 .release = seq_release_private,
19baf839
RO
2433};
2434
2435int __init fib_stat_proc_init(void)
2436{
2437 if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_seq_fops))
2438 return -ENOMEM;
2439 return 0;
2440}
2441
2442void __init fib_stat_proc_exit(void)
2443{
2444 proc_net_remove("fib_triestat");
2445}
2446
2447static struct fib_alias *fib_trie_get_first(struct seq_file *seq)
2448{
2449 return NULL;
2450}
2451
2452static struct fib_alias *fib_trie_get_next(struct seq_file *seq)
2453{
2454 return NULL;
2455}
2456
2457static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2458{
91b9a277
OJ
2459 if (!ip_fib_main_table)
2460 return NULL;
19baf839 2461
91b9a277
OJ
2462 if (*pos)
2463 return fib_trie_get_next(seq);
2464 else
2465 return SEQ_START_TOKEN;
19baf839
RO
2466}
2467
2468static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2469{
2470 ++*pos;
91b9a277
OJ
2471 if (v == SEQ_START_TOKEN)
2472 return fib_trie_get_first(seq);
2473 else
2474 return fib_trie_get_next(seq);
2475
19baf839
RO
2476}
2477
2478static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2479{
19baf839
RO
2480}
2481
c877efb2 2482/*
19baf839
RO
2483 * This outputs /proc/net/fib_trie.
2484 *
2485 * It always works in backward compatibility mode.
2486 * The format of the file is not supposed to be changed.
2487 */
2488
2489static int fib_trie_seq_show(struct seq_file *seq, void *v)
2490{
2491 char bf[128];
2492
2493 if (v == SEQ_START_TOKEN) {
c877efb2 2494 if (trie_local)
19baf839
RO
2495 trie_dump_seq(seq, trie_local);
2496
c877efb2 2497 if (trie_main)
19baf839 2498 trie_dump_seq(seq, trie_main);
91b9a277 2499 } else {
19baf839
RO
2500 snprintf(bf, sizeof(bf),
2501 "*\t%08X\t%08X", 200, 400);
2502 seq_printf(seq, "%-127s\n", bf);
2503 }
2504
2505 return 0;
2506}
2507
2508static struct seq_operations fib_trie_seq_ops = {
c877efb2
SH
2509 .start = fib_trie_seq_start,
2510 .next = fib_trie_seq_next,
2511 .stop = fib_trie_seq_stop,
2512 .show = fib_trie_seq_show,
19baf839
RO
2513};
2514
2515static int fib_trie_seq_open(struct inode *inode, struct file *file)
2516{
2517 struct seq_file *seq;
2518 int rc = -ENOMEM;
2519
2520 rc = seq_open(file, &fib_trie_seq_ops);
2521 if (rc)
2522 goto out_kfree;
2523
c877efb2 2524 seq = file->private_data;
19baf839
RO
2525out:
2526 return rc;
2527out_kfree:
2528 goto out;
2529}
2530
2531static struct file_operations fib_trie_seq_fops = {
c877efb2
SH
2532 .owner = THIS_MODULE,
2533 .open = fib_trie_seq_open,
2534 .read = seq_read,
2535 .llseek = seq_lseek,
2536 .release= seq_release_private,
19baf839
RO
2537};
2538
2539int __init fib_proc_init(void)
2540{
2541 if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_seq_fops))
2542 return -ENOMEM;
2543 return 0;
2544}
2545
2546void __init fib_proc_exit(void)
2547{
2548 proc_net_remove("fib_trie");
2549}
2550
2551#endif /* CONFIG_PROC_FS */