]> git.proxmox.com Git - mirror_ubuntu-jammy-kernel.git/blame - net/ipv4/fib_trie.c
[NETROM] lockdep: fix false positive
[mirror_ubuntu-jammy-kernel.git] / net / ipv4 / fib_trie.c
CommitLineData
19baf839
RO
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
11 * Agricultural Sciences.
12 *
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which was originally described in:
16 *
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26 *
27 *
28 * Code from fib_hash has been reused which includes the following header:
29 *
30 *
31 * INET An implementation of the TCP/IP protocol suite for the LINUX
32 * operating system. INET is implemented using the BSD Socket
33 * interface as the means of communication with the user level.
34 *
35 * IPv4 FIB: lookup engine and maintenance routines.
36 *
37 *
38 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39 *
40 * This program is free software; you can redistribute it and/or
41 * modify it under the terms of the GNU General Public License
42 * as published by the Free Software Foundation; either version
43 * 2 of the License, or (at your option) any later version.
fd966255
RO
44 *
45 * Substantial contributions to this work come from:
46 *
47 * David S. Miller, <davem@davemloft.net>
48 * Stephen Hemminger <shemminger@osdl.org>
49 * Paul E. McKenney <paulmck@us.ibm.com>
50 * Patrick McHardy <kaber@trash.net>
19baf839
RO
51 */
52
550e29bc 53#define VERSION "0.407"
19baf839 54
19baf839
RO
55#include <asm/uaccess.h>
56#include <asm/system.h>
57#include <asm/bitops.h>
58#include <linux/types.h>
59#include <linux/kernel.h>
60#include <linux/sched.h>
61#include <linux/mm.h>
62#include <linux/string.h>
63#include <linux/socket.h>
64#include <linux/sockios.h>
65#include <linux/errno.h>
66#include <linux/in.h>
67#include <linux/inet.h>
cd8787ab 68#include <linux/inetdevice.h>
19baf839
RO
69#include <linux/netdevice.h>
70#include <linux/if_arp.h>
71#include <linux/proc_fs.h>
2373ce1c 72#include <linux/rcupdate.h>
19baf839
RO
73#include <linux/skbuff.h>
74#include <linux/netlink.h>
75#include <linux/init.h>
76#include <linux/list.h>
77#include <net/ip.h>
78#include <net/protocol.h>
79#include <net/route.h>
80#include <net/tcp.h>
81#include <net/sock.h>
82#include <net/ip_fib.h>
83#include "fib_lookup.h"
84
85#undef CONFIG_IP_FIB_TRIE_STATS
06ef921d 86#define MAX_STAT_DEPTH 32
19baf839 87
19baf839
RO
/* Number of bits in a trie key. */
#define KEYLENGTH (8*sizeof(t_key))
/*
 * Keep the top 'l' bits of 'k' and clear the rest.  l == 0 is special-cased
 * to 0 so that we never shift by the full width of the type.
 */
#define MASK_PFX(k, l) (((l)==0)?0:((k) >> (KEYLENGTH-(l))) << (KEYLENGTH-(l)))
/*
 * Mask of 'bits' consecutive one-bits whose first bit sits 'offset'
 * positions below the most significant bit; 0 when bits == 0.
 */
#define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - (bits)) >> (offset)))
91
19baf839
RO
92typedef unsigned int t_key;
93
/* Node kind; it is kept in the lowest bit of node->parent. */
#define T_TNODE 0
#define T_LEAF 1
#define NODE_TYPE_MASK 0x1UL
/* Parent pointer of 'node' with the type bit masked off. */
#define NODE_PARENT(node) \
	((struct tnode *)rcu_dereference(((node)->parent & ~NODE_TYPE_MASK)))

#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)

/* Store a new parent pointer while preserving the node's type bit. */
#define NODE_SET_PARENT(node, ptr) \
	rcu_assign_pointer((node)->parent, \
		((unsigned long)(ptr)) | NODE_TYPE(node))

/* The argument is parenthesized so that expressions such as 'p + 1' work. */
#define IS_TNODE(n) (!((n)->parent & T_LEAF))
#define IS_LEAF(n) ((n)->parent & T_LEAF)
19baf839
RO
108
109struct node {
91b9a277
OJ
110 t_key key;
111 unsigned long parent;
19baf839
RO
112};
113
114struct leaf {
91b9a277
OJ
115 t_key key;
116 unsigned long parent;
19baf839 117 struct hlist_head list;
2373ce1c 118 struct rcu_head rcu;
19baf839
RO
119};
120
121struct leaf_info {
122 struct hlist_node hlist;
2373ce1c 123 struct rcu_head rcu;
19baf839
RO
124 int plen;
125 struct list_head falh;
126};
127
128struct tnode {
91b9a277
OJ
129 t_key key;
130 unsigned long parent;
131 unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */
132 unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */
133 unsigned short full_children; /* KEYLENGTH bits needed */
134 unsigned short empty_children; /* KEYLENGTH bits needed */
2373ce1c 135 struct rcu_head rcu;
91b9a277 136 struct node *child[0];
19baf839
RO
137};
138
139#ifdef CONFIG_IP_FIB_TRIE_STATS
/* Usage counters, only compiled in with CONFIG_IP_FIB_TRIE_STATS. */
struct trie_use_stats {
	unsigned int gets;
	unsigned int backtrack;
	unsigned int semantic_match_passed;
	unsigned int semantic_match_miss;
	unsigned int null_node_hit;
	unsigned int resize_node_skipped;	/* bumped when inflate()/halve() fails in resize() */
};
148#endif
149
150struct trie_stat {
151 unsigned int totdepth;
152 unsigned int maxdepth;
153 unsigned int tnodes;
154 unsigned int leaves;
155 unsigned int nullpointers;
06ef921d 156 unsigned int nodesizes[MAX_STAT_DEPTH];
c877efb2 157};
19baf839
RO
158
/* Top-level handle for one trie; initialised by trie_init(). */
struct trie {
	struct node *trie;		/* root node, NULL while the trie is empty */
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats stats;
#endif
	int size;
	unsigned int revision;
};
167
19baf839
RO
168static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
169static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
19baf839 170static struct node *resize(struct trie *t, struct tnode *tn);
2f80b3c8
RO
171static struct tnode *inflate(struct trie *t, struct tnode *tn);
172static struct tnode *halve(struct trie *t, struct tnode *tn);
19baf839 173static void tnode_free(struct tnode *tn);
19baf839 174
ba89966c 175static kmem_cache_t *fn_alias_kmem __read_mostly;
19baf839
RO
176static struct trie *trie_local = NULL, *trie_main = NULL;
177
2373ce1c
RO
178
179/* rcu_read_lock needs to be held by the caller on the read side */
180
c877efb2 181static inline struct node *tnode_get_child(struct tnode *tn, int i)
19baf839 182{
91b9a277 183 BUG_ON(i >= 1 << tn->bits);
19baf839 184
2373ce1c 185 return rcu_dereference(tn->child[i]);
19baf839
RO
186}
187
bb435b8d 188static inline int tnode_child_length(const struct tnode *tn)
19baf839 189{
91b9a277 190 return 1 << tn->bits;
19baf839
RO
191}
192
19baf839
RO
193static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
194{
91b9a277 195 if (offset < KEYLENGTH)
19baf839 196 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
91b9a277 197 else
19baf839
RO
198 return 0;
199}
200
201static inline int tkey_equals(t_key a, t_key b)
202{
c877efb2 203 return a == b;
19baf839
RO
204}
205
206static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
207{
c877efb2
SH
208 if (bits == 0 || offset >= KEYLENGTH)
209 return 1;
91b9a277
OJ
210 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
211 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
c877efb2 212}
19baf839
RO
213
214static inline int tkey_mismatch(t_key a, int offset, t_key b)
215{
216 t_key diff = a ^ b;
217 int i = offset;
218
c877efb2
SH
219 if (!diff)
220 return 0;
221 while ((diff << i) >> (KEYLENGTH-1) == 0)
19baf839
RO
222 i++;
223 return i;
224}
225
19baf839
RO
226/*
227 To understand this stuff, an understanding of keys and all their bits is
228 necessary. Every node in the trie has a key associated with it, but not
229 all of the bits in that key are significant.
230
231 Consider a node 'n' and its parent 'tp'.
232
233 If n is a leaf, every bit in its key is significant. Its presence is
772cb712 234 necessitated by path compression, since during a tree traversal (when
19baf839
RO
235 searching for a leaf - unless we are doing an insertion) we will completely
236 ignore all skipped bits we encounter. Thus we need to verify, at the end of
237 a potentially successful search, that we have indeed been walking the
238 correct key path.
239
240 Note that we can never "miss" the correct key in the tree if present by
241 following the wrong path. Path compression ensures that segments of the key
242 that are the same for all keys with a given prefix are skipped, but the
243 skipped part *is* identical for each node in the subtrie below the skipped
244 bit! trie_insert() in this implementation takes care of that - note the
245 call to tkey_sub_equals() in trie_insert().
246
247 if n is an internal node - a 'tnode' here, the various parts of its key
248 have many different meanings.
249
250 Example:
251 _________________________________________________________________
252 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
253 -----------------------------------------------------------------
254 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
255
256 _________________________________________________________________
257 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
258 -----------------------------------------------------------------
259 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
260
261 tp->pos = 7
262 tp->bits = 3
263 n->pos = 15
91b9a277 264 n->bits = 4
19baf839
RO
265
266 First, let's just ignore the bits that come before the parent tp, that is
267 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
268 not use them for anything.
269
270 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
271 index into the parent's child array. That is, they will be used to find
272 'n' among tp's children.
273
274 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
275 for the node n.
276
277 All the bits we have seen so far are significant to the node n. The rest
278 of the bits are really not needed or indeed known in n->key.
279
280 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
281 n's child array, and will of course be different for each child.
282
c877efb2 283
19baf839
RO
284 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
285 at this point.
286
287*/
288
0c7770c7 289static inline void check_tnode(const struct tnode *tn)
19baf839 290{
0c7770c7 291 WARN_ON(tn && tn->pos+tn->bits > 32);
19baf839
RO
292}
293
/*
 * Fill-factor thresholds, in percent, used by resize().  The *_root
 * variants are applied to a node without a parent ("keep root node
 * larger").
 */
static int halve_threshold = 25;
static int inflate_threshold = 50;
static int halve_threshold_root = 15;
static int inflate_threshold_root = 25;
19baf839 298
2373ce1c
RO
299
300static void __alias_free_mem(struct rcu_head *head)
19baf839 301{
2373ce1c
RO
302 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
303 kmem_cache_free(fn_alias_kmem, fa);
19baf839
RO
304}
305
2373ce1c 306static inline void alias_free_mem_rcu(struct fib_alias *fa)
19baf839 307{
2373ce1c
RO
308 call_rcu(&fa->rcu, __alias_free_mem);
309}
91b9a277 310
2373ce1c
RO
311static void __leaf_free_rcu(struct rcu_head *head)
312{
313 kfree(container_of(head, struct leaf, rcu));
314}
91b9a277 315
2373ce1c 316static void __leaf_info_free_rcu(struct rcu_head *head)
19baf839 317{
2373ce1c 318 kfree(container_of(head, struct leaf_info, rcu));
19baf839
RO
319}
320
2373ce1c 321static inline void free_leaf_info(struct leaf_info *leaf)
19baf839 322{
2373ce1c 323 call_rcu(&leaf->rcu, __leaf_info_free_rcu);
19baf839
RO
324}
325
f0e36f8c
PM
326static struct tnode *tnode_alloc(unsigned int size)
327{
2373ce1c
RO
328 struct page *pages;
329
330 if (size <= PAGE_SIZE)
331 return kcalloc(size, 1, GFP_KERNEL);
332
333 pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
334 if (!pages)
335 return NULL;
336
337 return page_address(pages);
f0e36f8c
PM
338}
339
2373ce1c 340static void __tnode_free_rcu(struct rcu_head *head)
f0e36f8c 341{
2373ce1c 342 struct tnode *tn = container_of(head, struct tnode, rcu);
f0e36f8c 343 unsigned int size = sizeof(struct tnode) +
2373ce1c 344 (1 << tn->bits) * sizeof(struct node *);
f0e36f8c
PM
345
346 if (size <= PAGE_SIZE)
347 kfree(tn);
348 else
349 free_pages((unsigned long)tn, get_order(size));
350}
351
2373ce1c
RO
352static inline void tnode_free(struct tnode *tn)
353{
550e29bc
RO
354 if(IS_LEAF(tn)) {
355 struct leaf *l = (struct leaf *) tn;
356 call_rcu_bh(&l->rcu, __leaf_free_rcu);
357 }
358 else
359 call_rcu(&tn->rcu, __tnode_free_rcu);
2373ce1c
RO
360}
361
362static struct leaf *leaf_new(void)
363{
364 struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
365 if (l) {
366 l->parent = T_LEAF;
367 INIT_HLIST_HEAD(&l->list);
368 }
369 return l;
370}
371
372static struct leaf_info *leaf_info_new(int plen)
373{
374 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
375 if (li) {
376 li->plen = plen;
377 INIT_LIST_HEAD(&li->falh);
378 }
379 return li;
380}
381
19baf839
RO
382static struct tnode* tnode_new(t_key key, int pos, int bits)
383{
384 int nchildren = 1<<bits;
385 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
f0e36f8c 386 struct tnode *tn = tnode_alloc(sz);
19baf839 387
91b9a277 388 if (tn) {
19baf839 389 memset(tn, 0, sz);
2373ce1c 390 tn->parent = T_TNODE;
19baf839
RO
391 tn->pos = pos;
392 tn->bits = bits;
393 tn->key = key;
394 tn->full_children = 0;
395 tn->empty_children = 1<<bits;
396 }
c877efb2 397
0c7770c7
SH
398 pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
399 (unsigned int) (sizeof(struct node) * 1<<bits));
19baf839
RO
400 return tn;
401}
402
19baf839
RO
403/*
404 * Check whether a tnode 'n' is "full", i.e. it is an internal node
405 * and no bits are skipped. See discussion in dyntree paper p. 6
406 */
407
bb435b8d 408static inline int tnode_full(const struct tnode *tn, const struct node *n)
19baf839 409{
c877efb2 410 if (n == NULL || IS_LEAF(n))
19baf839
RO
411 return 0;
412
413 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
414}
415
/*
 * Store 'n' in slot 'i' of 'tn'.  Passing -1 as 'wasfull' makes
 * tnode_put_child_reorg() work out the old slot's fullness itself.
 * 't' is not used here.
 */
static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
{
	const int wasfull_unknown = -1;

	tnode_put_child_reorg(tn, i, n, wasfull_unknown);
}
420
c877efb2 421 /*
19baf839
RO
422 * Add a child at position i overwriting the old value.
423 * Update the value of full_children and empty_children.
424 */
425
c877efb2 426static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
19baf839 427{
2373ce1c 428 struct node *chi = tn->child[i];
19baf839
RO
429 int isfull;
430
0c7770c7
SH
431 BUG_ON(i >= 1<<tn->bits);
432
19baf839
RO
433
434 /* update emptyChildren */
435 if (n == NULL && chi != NULL)
436 tn->empty_children++;
437 else if (n != NULL && chi == NULL)
438 tn->empty_children--;
c877efb2 439
19baf839 440 /* update fullChildren */
91b9a277 441 if (wasfull == -1)
19baf839
RO
442 wasfull = tnode_full(tn, chi);
443
444 isfull = tnode_full(tn, n);
c877efb2 445 if (wasfull && !isfull)
19baf839 446 tn->full_children--;
c877efb2 447 else if (!wasfull && isfull)
19baf839 448 tn->full_children++;
91b9a277 449
c877efb2
SH
450 if (n)
451 NODE_SET_PARENT(n, tn);
19baf839 452
2373ce1c 453 rcu_assign_pointer(tn->child[i], n);
19baf839
RO
454}
455
c877efb2 456static struct node *resize(struct trie *t, struct tnode *tn)
19baf839
RO
457{
458 int i;
2f36895a 459 int err = 0;
2f80b3c8 460 struct tnode *old_tn;
e6308be8
RO
461 int inflate_threshold_use;
462 int halve_threshold_use;
19baf839
RO
463
464 if (!tn)
465 return NULL;
466
0c7770c7
SH
467 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
468 tn, inflate_threshold, halve_threshold);
19baf839
RO
469
470 /* No children */
471 if (tn->empty_children == tnode_child_length(tn)) {
472 tnode_free(tn);
473 return NULL;
474 }
475 /* One child */
476 if (tn->empty_children == tnode_child_length(tn) - 1)
477 for (i = 0; i < tnode_child_length(tn); i++) {
91b9a277 478 struct node *n;
19baf839 479
91b9a277 480 n = tn->child[i];
2373ce1c 481 if (!n)
91b9a277 482 continue;
91b9a277
OJ
483
484 /* compress one level */
2373ce1c 485 NODE_SET_PARENT(n, NULL);
91b9a277
OJ
486 tnode_free(tn);
487 return n;
19baf839 488 }
c877efb2 489 /*
19baf839
RO
490 * Double as long as the resulting node has a number of
491 * nonempty nodes that are above the threshold.
492 */
493
494 /*
c877efb2
SH
495 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
496 * the Helsinki University of Technology and Matti Tikkanen of Nokia
19baf839 497 * Telecommunications, page 6:
c877efb2 498 * "A node is doubled if the ratio of non-empty children to all
19baf839
RO
499 * children in the *doubled* node is at least 'high'."
500 *
c877efb2
SH
501 * 'high' in this instance is the variable 'inflate_threshold'. It
502 * is expressed as a percentage, so we multiply it with
503 * tnode_child_length() and instead of multiplying by 2 (since the
504 * child array will be doubled by inflate()) and multiplying
505 * the left-hand side by 100 (to handle the percentage thing) we
19baf839 506 * multiply the left-hand side by 50.
c877efb2
SH
507 *
508 * The left-hand side may look a bit weird: tnode_child_length(tn)
509 * - tn->empty_children is of course the number of non-null children
510 * in the current node. tn->full_children is the number of "full"
19baf839 511 * children, that is non-null tnodes with a skip value of 0.
c877efb2 512 * All of those will be doubled in the resulting inflated tnode, so
19baf839 513 * we just count them one extra time here.
c877efb2 514 *
19baf839 515 * A clearer way to write this would be:
c877efb2 516 *
19baf839 517 * to_be_doubled = tn->full_children;
c877efb2 518 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
19baf839
RO
519 * tn->full_children;
520 *
521 * new_child_length = tnode_child_length(tn) * 2;
522 *
c877efb2 523 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
19baf839
RO
524 * new_child_length;
525 * if (new_fill_factor >= inflate_threshold)
c877efb2
SH
526 *
527 * ...and so on, tho it would mess up the while () loop.
528 *
19baf839
RO
529 * anyway,
530 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
531 * inflate_threshold
c877efb2 532 *
19baf839
RO
533 * avoid a division:
534 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
535 * inflate_threshold * new_child_length
c877efb2 536 *
19baf839 537 * expand not_to_be_doubled and to_be_doubled, and shorten:
c877efb2 538 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 539 * tn->full_children) >= inflate_threshold * new_child_length
c877efb2 540 *
19baf839 541 * expand new_child_length:
c877efb2 542 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 543 * tn->full_children) >=
19baf839 544 * inflate_threshold * tnode_child_length(tn) * 2
c877efb2 545 *
19baf839 546 * shorten again:
c877efb2 547 * 50 * (tn->full_children + tnode_child_length(tn) -
91b9a277 548 * tn->empty_children) >= inflate_threshold *
19baf839 549 * tnode_child_length(tn)
c877efb2 550 *
19baf839
RO
551 */
552
553 check_tnode(tn);
c877efb2 554
e6308be8
RO
555 /* Keep root node larger */
556
557 if(!tn->parent)
558 inflate_threshold_use = inflate_threshold_root;
559 else
560 inflate_threshold_use = inflate_threshold;
561
2f36895a 562 err = 0;
19baf839
RO
563 while ((tn->full_children > 0 &&
564 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
e6308be8 565 inflate_threshold_use * tnode_child_length(tn))) {
19baf839 566
2f80b3c8
RO
567 old_tn = tn;
568 tn = inflate(t, tn);
569 if (IS_ERR(tn)) {
570 tn = old_tn;
2f36895a
RO
571#ifdef CONFIG_IP_FIB_TRIE_STATS
572 t->stats.resize_node_skipped++;
573#endif
574 break;
575 }
19baf839
RO
576 }
577
578 check_tnode(tn);
579
580 /*
581 * Halve as long as the number of empty children in this
582 * node is above threshold.
583 */
2f36895a 584
e6308be8
RO
585
586 /* Keep root node larger */
587
588 if(!tn->parent)
589 halve_threshold_use = halve_threshold_root;
590 else
591 halve_threshold_use = halve_threshold;
592
2f36895a 593 err = 0;
19baf839
RO
594 while (tn->bits > 1 &&
595 100 * (tnode_child_length(tn) - tn->empty_children) <
e6308be8 596 halve_threshold_use * tnode_child_length(tn)) {
2f36895a 597
2f80b3c8
RO
598 old_tn = tn;
599 tn = halve(t, tn);
600 if (IS_ERR(tn)) {
601 tn = old_tn;
2f36895a
RO
602#ifdef CONFIG_IP_FIB_TRIE_STATS
603 t->stats.resize_node_skipped++;
604#endif
605 break;
606 }
607 }
19baf839 608
c877efb2 609
19baf839 610 /* Only one child remains */
19baf839
RO
611 if (tn->empty_children == tnode_child_length(tn) - 1)
612 for (i = 0; i < tnode_child_length(tn); i++) {
91b9a277 613 struct node *n;
19baf839 614
91b9a277 615 n = tn->child[i];
2373ce1c 616 if (!n)
91b9a277 617 continue;
91b9a277
OJ
618
619 /* compress one level */
620
2373ce1c 621 NODE_SET_PARENT(n, NULL);
91b9a277
OJ
622 tnode_free(tn);
623 return n;
19baf839
RO
624 }
625
626 return (struct node *) tn;
627}
628
2f80b3c8 629static struct tnode *inflate(struct trie *t, struct tnode *tn)
19baf839
RO
630{
631 struct tnode *inode;
632 struct tnode *oldtnode = tn;
633 int olen = tnode_child_length(tn);
634 int i;
635
0c7770c7 636 pr_debug("In inflate\n");
19baf839
RO
637
638 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
639
0c7770c7 640 if (!tn)
2f80b3c8 641 return ERR_PTR(-ENOMEM);
2f36895a
RO
642
643 /*
c877efb2
SH
644 * Preallocate and store tnodes before the actual work so we
645 * don't get into an inconsistent state if memory allocation
646 * fails. In case of failure we return the oldnode and inflate
2f36895a
RO
647 * of tnode is ignored.
648 */
91b9a277
OJ
649
650 for (i = 0; i < olen; i++) {
2f36895a
RO
651 struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
652
653 if (inode &&
654 IS_TNODE(inode) &&
655 inode->pos == oldtnode->pos + oldtnode->bits &&
656 inode->bits > 1) {
657 struct tnode *left, *right;
2f36895a 658 t_key m = TKEY_GET_MASK(inode->pos, 1);
c877efb2 659
2f36895a
RO
660 left = tnode_new(inode->key&(~m), inode->pos + 1,
661 inode->bits - 1);
2f80b3c8
RO
662 if (!left)
663 goto nomem;
91b9a277 664
2f36895a
RO
665 right = tnode_new(inode->key|m, inode->pos + 1,
666 inode->bits - 1);
667
2f80b3c8
RO
668 if (!right) {
669 tnode_free(left);
670 goto nomem;
671 }
2f36895a
RO
672
673 put_child(t, tn, 2*i, (struct node *) left);
674 put_child(t, tn, 2*i+1, (struct node *) right);
675 }
676 }
677
91b9a277 678 for (i = 0; i < olen; i++) {
19baf839 679 struct node *node = tnode_get_child(oldtnode, i);
91b9a277
OJ
680 struct tnode *left, *right;
681 int size, j;
c877efb2 682
19baf839
RO
683 /* An empty child */
684 if (node == NULL)
685 continue;
686
687 /* A leaf or an internal node with skipped bits */
688
c877efb2 689 if (IS_LEAF(node) || ((struct tnode *) node)->pos >
19baf839 690 tn->pos + tn->bits - 1) {
c877efb2 691 if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
19baf839
RO
692 1) == 0)
693 put_child(t, tn, 2*i, node);
694 else
695 put_child(t, tn, 2*i+1, node);
696 continue;
697 }
698
699 /* An internal node with two children */
700 inode = (struct tnode *) node;
701
702 if (inode->bits == 1) {
703 put_child(t, tn, 2*i, inode->child[0]);
704 put_child(t, tn, 2*i+1, inode->child[1]);
705
706 tnode_free(inode);
91b9a277 707 continue;
19baf839
RO
708 }
709
91b9a277
OJ
710 /* An internal node with more than two children */
711
712 /* We will replace this node 'inode' with two new
713 * ones, 'left' and 'right', each with half of the
714 * original children. The two new nodes will have
715 * a position one bit further down the key and this
716 * means that the "significant" part of their keys
717 * (see the discussion near the top of this file)
718 * will differ by one bit, which will be "0" in
719 * left's key and "1" in right's key. Since we are
720 * moving the key position by one step, the bit that
721 * we are moving away from - the bit at position
722 * (inode->pos) - is the one that will differ between
723 * left and right. So... we synthesize that bit in the
724 * two new keys.
725 * The mask 'm' below will be a single "one" bit at
726 * the position (inode->pos)
727 */
19baf839 728
91b9a277
OJ
729 /* Use the old key, but set the new significant
730 * bit to zero.
731 */
2f36895a 732
91b9a277
OJ
733 left = (struct tnode *) tnode_get_child(tn, 2*i);
734 put_child(t, tn, 2*i, NULL);
2f36895a 735
91b9a277 736 BUG_ON(!left);
2f36895a 737
91b9a277
OJ
738 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
739 put_child(t, tn, 2*i+1, NULL);
19baf839 740
91b9a277 741 BUG_ON(!right);
19baf839 742
91b9a277
OJ
743 size = tnode_child_length(left);
744 for (j = 0; j < size; j++) {
745 put_child(t, left, j, inode->child[j]);
746 put_child(t, right, j, inode->child[j + size]);
19baf839 747 }
91b9a277
OJ
748 put_child(t, tn, 2*i, resize(t, left));
749 put_child(t, tn, 2*i+1, resize(t, right));
750
751 tnode_free(inode);
19baf839
RO
752 }
753 tnode_free(oldtnode);
754 return tn;
2f80b3c8
RO
755nomem:
756 {
757 int size = tnode_child_length(tn);
758 int j;
759
0c7770c7 760 for (j = 0; j < size; j++)
2f80b3c8
RO
761 if (tn->child[j])
762 tnode_free((struct tnode *)tn->child[j]);
763
764 tnode_free(tn);
0c7770c7 765
2f80b3c8
RO
766 return ERR_PTR(-ENOMEM);
767 }
19baf839
RO
768}
769
2f80b3c8 770static struct tnode *halve(struct trie *t, struct tnode *tn)
19baf839
RO
771{
772 struct tnode *oldtnode = tn;
773 struct node *left, *right;
774 int i;
775 int olen = tnode_child_length(tn);
776
0c7770c7 777 pr_debug("In halve\n");
c877efb2
SH
778
779 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
19baf839 780
2f80b3c8
RO
781 if (!tn)
782 return ERR_PTR(-ENOMEM);
2f36895a
RO
783
784 /*
c877efb2
SH
785 * Preallocate and store tnodes before the actual work so we
786 * don't get into an inconsistent state if memory allocation
787 * fails. In case of failure we return the oldnode and halve
2f36895a
RO
788 * of tnode is ignored.
789 */
790
91b9a277 791 for (i = 0; i < olen; i += 2) {
2f36895a
RO
792 left = tnode_get_child(oldtnode, i);
793 right = tnode_get_child(oldtnode, i+1);
c877efb2 794
2f36895a 795 /* Two nonempty children */
0c7770c7 796 if (left && right) {
2f80b3c8 797 struct tnode *newn;
0c7770c7 798
2f80b3c8 799 newn = tnode_new(left->key, tn->pos + tn->bits, 1);
0c7770c7
SH
800
801 if (!newn)
2f80b3c8 802 goto nomem;
0c7770c7 803
2f80b3c8 804 put_child(t, tn, i/2, (struct node *)newn);
2f36895a 805 }
2f36895a 806
2f36895a 807 }
19baf839 808
91b9a277
OJ
809 for (i = 0; i < olen; i += 2) {
810 struct tnode *newBinNode;
811
19baf839
RO
812 left = tnode_get_child(oldtnode, i);
813 right = tnode_get_child(oldtnode, i+1);
c877efb2 814
19baf839
RO
815 /* At least one of the children is empty */
816 if (left == NULL) {
817 if (right == NULL) /* Both are empty */
818 continue;
819 put_child(t, tn, i/2, right);
91b9a277 820 continue;
0c7770c7 821 }
91b9a277
OJ
822
823 if (right == NULL) {
19baf839 824 put_child(t, tn, i/2, left);
91b9a277
OJ
825 continue;
826 }
c877efb2 827
19baf839 828 /* Two nonempty children */
91b9a277
OJ
829 newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
830 put_child(t, tn, i/2, NULL);
91b9a277
OJ
831 put_child(t, newBinNode, 0, left);
832 put_child(t, newBinNode, 1, right);
833 put_child(t, tn, i/2, resize(t, newBinNode));
19baf839
RO
834 }
835 tnode_free(oldtnode);
836 return tn;
2f80b3c8
RO
837nomem:
838 {
839 int size = tnode_child_length(tn);
840 int j;
841
0c7770c7 842 for (j = 0; j < size; j++)
2f80b3c8
RO
843 if (tn->child[j])
844 tnode_free((struct tnode *)tn->child[j]);
845
846 tnode_free(tn);
0c7770c7 847
2f80b3c8
RO
848 return ERR_PTR(-ENOMEM);
849 }
19baf839
RO
850}
851
91b9a277 852static void trie_init(struct trie *t)
19baf839 853{
91b9a277
OJ
854 if (!t)
855 return;
856
857 t->size = 0;
2373ce1c 858 rcu_assign_pointer(t->trie, NULL);
91b9a277 859 t->revision = 0;
19baf839 860#ifdef CONFIG_IP_FIB_TRIE_STATS
91b9a277 861 memset(&t->stats, 0, sizeof(struct trie_use_stats));
19baf839 862#endif
19baf839
RO
863}
864
772cb712 865/* Read-side callers must hold rcu_read_lock(); currently these are
2373ce1c
RO
866 the dump routines, via get_fa_head() and dump. */
867
772cb712 868static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
19baf839 869{
772cb712 870 struct hlist_head *head = &l->list;
19baf839
RO
871 struct hlist_node *node;
872 struct leaf_info *li;
873
2373ce1c 874 hlist_for_each_entry_rcu(li, node, head, hlist)
c877efb2 875 if (li->plen == plen)
19baf839 876 return li;
91b9a277 877
19baf839
RO
878 return NULL;
879}
880
881static inline struct list_head * get_fa_head(struct leaf *l, int plen)
882{
772cb712 883 struct leaf_info *li = find_leaf_info(l, plen);
c877efb2 884
91b9a277
OJ
885 if (!li)
886 return NULL;
c877efb2 887
91b9a277 888 return &li->falh;
19baf839
RO
889}
890
891static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
892{
2373ce1c
RO
893 struct leaf_info *li = NULL, *last = NULL;
894 struct hlist_node *node;
895
896 if (hlist_empty(head)) {
897 hlist_add_head_rcu(&new->hlist, head);
898 } else {
899 hlist_for_each_entry(li, node, head, hlist) {
900 if (new->plen > li->plen)
901 break;
902
903 last = li;
904 }
905 if (last)
906 hlist_add_after_rcu(&last->hlist, &new->hlist);
907 else
908 hlist_add_before_rcu(&new->hlist, &li->hlist);
909 }
19baf839
RO
910}
911
2373ce1c
RO
912/* rcu_read_lock needs to be held by the caller on the read side */
913
19baf839
RO
914static struct leaf *
915fib_find_node(struct trie *t, u32 key)
916{
917 int pos;
918 struct tnode *tn;
919 struct node *n;
920
921 pos = 0;
2373ce1c 922 n = rcu_dereference(t->trie);
19baf839
RO
923
924 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
925 tn = (struct tnode *) n;
91b9a277 926
19baf839 927 check_tnode(tn);
91b9a277 928
c877efb2 929 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
91b9a277 930 pos = tn->pos + tn->bits;
19baf839 931 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
91b9a277 932 } else
19baf839
RO
933 break;
934 }
935 /* Case we have found a leaf. Compare prefixes */
936
91b9a277
OJ
937 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
938 return (struct leaf *)n;
939
19baf839
RO
940 return NULL;
941}
942
943static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
944{
19baf839
RO
945 int wasfull;
946 t_key cindex, key;
947 struct tnode *tp = NULL;
948
19baf839 949 key = tn->key;
19baf839
RO
950
951 while (tn != NULL && NODE_PARENT(tn) != NULL) {
19baf839
RO
952
953 tp = NODE_PARENT(tn);
954 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
955 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
956 tn = (struct tnode *) resize (t, (struct tnode *)tn);
957 tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
91b9a277 958
c877efb2 959 if (!NODE_PARENT(tn))
19baf839
RO
960 break;
961
962 tn = NODE_PARENT(tn);
963 }
964 /* Handle last (top) tnode */
c877efb2 965 if (IS_TNODE(tn))
19baf839
RO
966 tn = (struct tnode*) resize(t, (struct tnode *)tn);
967
968 return (struct node*) tn;
969}
970
2373ce1c
RO
971/* only used from updater-side */
972
f835e471
RO
973static struct list_head *
974fib_insert_node(struct trie *t, int *err, u32 key, int plen)
19baf839
RO
975{
976 int pos, newpos;
977 struct tnode *tp = NULL, *tn = NULL;
978 struct node *n;
979 struct leaf *l;
980 int missbit;
c877efb2 981 struct list_head *fa_head = NULL;
19baf839
RO
982 struct leaf_info *li;
983 t_key cindex;
984
985 pos = 0;
c877efb2 986 n = t->trie;
19baf839 987
c877efb2
SH
988 /* If we point to NULL, stop. Either the tree is empty and we should
989 * just put a new leaf in if, or we have reached an empty child slot,
19baf839 990 * and we should just put our new leaf in that.
c877efb2
SH
991 * If we point to a T_TNODE, check if it matches our key. Note that
992 * a T_TNODE might be skipping any number of bits - its 'pos' need
19baf839
RO
993 * not be the parent's 'pos'+'bits'!
994 *
c877efb2 995 * If it does match the current key, get pos/bits from it, extract
19baf839
RO
996 * the index from our key, push the T_TNODE and walk the tree.
997 *
998 * If it doesn't, we have to replace it with a new T_TNODE.
999 *
c877efb2
SH
1000 * If we point to a T_LEAF, it might or might not have the same key
1001 * as we do. If it does, just change the value, update the T_LEAF's
1002 * value, and return it.
19baf839
RO
1003 * If it doesn't, we need to replace it with a T_TNODE.
1004 */
1005
1006 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
1007 tn = (struct tnode *) n;
91b9a277 1008
c877efb2 1009 check_tnode(tn);
91b9a277 1010
c877efb2 1011 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
19baf839 1012 tp = tn;
91b9a277 1013 pos = tn->pos + tn->bits;
19baf839
RO
1014 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1015
0c7770c7 1016 BUG_ON(n && NODE_PARENT(n) != tn);
91b9a277 1017 } else
19baf839
RO
1018 break;
1019 }
1020
1021 /*
1022 * n ----> NULL, LEAF or TNODE
1023 *
c877efb2 1024 * tp is n's (parent) ----> NULL or TNODE
19baf839
RO
1025 */
1026
91b9a277 1027 BUG_ON(tp && IS_LEAF(tp));
19baf839
RO
1028
1029 /* Case 1: n is a leaf. Compare prefixes */
1030
c877efb2 1031 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
91b9a277
OJ
1032 struct leaf *l = (struct leaf *) n;
1033
19baf839 1034 li = leaf_info_new(plen);
91b9a277 1035
c877efb2 1036 if (!li) {
f835e471
RO
1037 *err = -ENOMEM;
1038 goto err;
1039 }
19baf839
RO
1040
1041 fa_head = &li->falh;
1042 insert_leaf_info(&l->list, li);
1043 goto done;
1044 }
1045 t->size++;
1046 l = leaf_new();
1047
c877efb2 1048 if (!l) {
f835e471
RO
1049 *err = -ENOMEM;
1050 goto err;
1051 }
19baf839
RO
1052
1053 l->key = key;
1054 li = leaf_info_new(plen);
1055
c877efb2 1056 if (!li) {
f835e471
RO
1057 tnode_free((struct tnode *) l);
1058 *err = -ENOMEM;
1059 goto err;
1060 }
19baf839
RO
1061
1062 fa_head = &li->falh;
1063 insert_leaf_info(&l->list, li);
1064
19baf839 1065 if (t->trie && n == NULL) {
91b9a277 1066 /* Case 2: n is NULL, and will just insert a new leaf */
19baf839
RO
1067
1068 NODE_SET_PARENT(l, tp);
19baf839 1069
91b9a277
OJ
1070 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1071 put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1072 } else {
1073 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
c877efb2
SH
1074 /*
1075 * Add a new tnode here
19baf839
RO
1076 * first tnode need some special handling
1077 */
1078
1079 if (tp)
91b9a277 1080 pos = tp->pos+tp->bits;
19baf839 1081 else
91b9a277
OJ
1082 pos = 0;
1083
c877efb2 1084 if (n) {
19baf839
RO
1085 newpos = tkey_mismatch(key, pos, n->key);
1086 tn = tnode_new(n->key, newpos, 1);
91b9a277 1087 } else {
19baf839 1088 newpos = 0;
c877efb2 1089 tn = tnode_new(key, newpos, 1); /* First tnode */
19baf839 1090 }
19baf839 1091
c877efb2 1092 if (!tn) {
f835e471
RO
1093 free_leaf_info(li);
1094 tnode_free((struct tnode *) l);
1095 *err = -ENOMEM;
1096 goto err;
91b9a277
OJ
1097 }
1098
19baf839
RO
1099 NODE_SET_PARENT(tn, tp);
1100
91b9a277 1101 missbit = tkey_extract_bits(key, newpos, 1);
19baf839
RO
1102 put_child(t, tn, missbit, (struct node *)l);
1103 put_child(t, tn, 1-missbit, n);
1104
c877efb2 1105 if (tp) {
19baf839
RO
1106 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1107 put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
91b9a277 1108 } else {
2373ce1c 1109 rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
19baf839
RO
1110 tp = tn;
1111 }
1112 }
91b9a277
OJ
1113
1114 if (tp && tp->pos + tp->bits > 32)
78c6671a 1115 printk(KERN_WARNING "fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
19baf839 1116 tp, tp->pos, tp->bits, key, plen);
91b9a277 1117
19baf839 1118 /* Rebalance the trie */
2373ce1c
RO
1119
1120 rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
f835e471
RO
1121done:
1122 t->revision++;
91b9a277 1123err:
19baf839
RO
1124 return fa_head;
1125}
1126
1127static int
1128fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1129 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1130{
1131 struct trie *t = (struct trie *) tb->tb_data;
1132 struct fib_alias *fa, *new_fa;
c877efb2 1133 struct list_head *fa_head = NULL;
19baf839
RO
1134 struct fib_info *fi;
1135 int plen = r->rtm_dst_len;
1136 int type = r->rtm_type;
1137 u8 tos = r->rtm_tos;
1138 u32 key, mask;
1139 int err;
1140 struct leaf *l;
1141
1142 if (plen > 32)
1143 return -EINVAL;
1144
1145 key = 0;
c877efb2 1146 if (rta->rta_dst)
19baf839
RO
1147 memcpy(&key, rta->rta_dst, 4);
1148
1149 key = ntohl(key);
1150
0c7770c7 1151 pr_debug("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
19baf839 1152
91b9a277 1153 mask = ntohl(inet_make_mask(plen));
19baf839 1154
c877efb2 1155 if (key & ~mask)
19baf839
RO
1156 return -EINVAL;
1157
1158 key = key & mask;
1159
91b9a277
OJ
1160 fi = fib_create_info(r, rta, nlhdr, &err);
1161
1162 if (!fi)
19baf839
RO
1163 goto err;
1164
1165 l = fib_find_node(t, key);
c877efb2 1166 fa = NULL;
19baf839 1167
c877efb2 1168 if (l) {
19baf839
RO
1169 fa_head = get_fa_head(l, plen);
1170 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1171 }
1172
1173 /* Now fa, if non-NULL, points to the first fib alias
1174 * with the same keys [prefix,tos,priority], if such key already
1175 * exists or to the node before which we will insert new one.
1176 *
1177 * If fa is NULL, we will need to allocate a new one and
1178 * insert to the head of f.
1179 *
1180 * If f is NULL, no fib node matched the destination key
1181 * and we need to allocate a new one of those as well.
1182 */
1183
91b9a277 1184 if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
19baf839
RO
1185 struct fib_alias *fa_orig;
1186
1187 err = -EEXIST;
1188 if (nlhdr->nlmsg_flags & NLM_F_EXCL)
1189 goto out;
1190
1191 if (nlhdr->nlmsg_flags & NLM_F_REPLACE) {
1192 struct fib_info *fi_drop;
1193 u8 state;
1194
2373ce1c
RO
1195 err = -ENOBUFS;
1196 new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1197 if (new_fa == NULL)
1198 goto out;
19baf839
RO
1199
1200 fi_drop = fa->fa_info;
2373ce1c
RO
1201 new_fa->fa_tos = fa->fa_tos;
1202 new_fa->fa_info = fi;
1203 new_fa->fa_type = type;
1204 new_fa->fa_scope = r->rtm_scope;
19baf839 1205 state = fa->fa_state;
2373ce1c 1206 new_fa->fa_state &= ~FA_S_ACCESSED;
19baf839 1207
2373ce1c
RO
1208 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1209 alias_free_mem_rcu(fa);
19baf839
RO
1210
1211 fib_release_info(fi_drop);
1212 if (state & FA_S_ACCESSED)
91b9a277 1213 rt_cache_flush(-1);
19baf839 1214
91b9a277 1215 goto succeeded;
19baf839
RO
1216 }
1217 /* Error if we find a perfect match which
1218 * uses the same scope, type, and nexthop
1219 * information.
1220 */
1221 fa_orig = fa;
1222 list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1223 if (fa->fa_tos != tos)
1224 break;
1225 if (fa->fa_info->fib_priority != fi->fib_priority)
1226 break;
1227 if (fa->fa_type == type &&
1228 fa->fa_scope == r->rtm_scope &&
1229 fa->fa_info == fi) {
1230 goto out;
1231 }
1232 }
1233 if (!(nlhdr->nlmsg_flags & NLM_F_APPEND))
1234 fa = fa_orig;
1235 }
1236 err = -ENOENT;
91b9a277 1237 if (!(nlhdr->nlmsg_flags & NLM_F_CREATE))
19baf839
RO
1238 goto out;
1239
1240 err = -ENOBUFS;
1241 new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1242 if (new_fa == NULL)
1243 goto out;
1244
1245 new_fa->fa_info = fi;
1246 new_fa->fa_tos = tos;
1247 new_fa->fa_type = type;
1248 new_fa->fa_scope = r->rtm_scope;
1249 new_fa->fa_state = 0;
19baf839
RO
1250 /*
1251 * Insert new entry to the list.
1252 */
1253
c877efb2 1254 if (!fa_head) {
f835e471
RO
1255 fa_head = fib_insert_node(t, &err, key, plen);
1256 err = 0;
c877efb2 1257 if (err)
f835e471
RO
1258 goto out_free_new_fa;
1259 }
19baf839 1260
2373ce1c
RO
1261 list_add_tail_rcu(&new_fa->fa_list,
1262 (fa ? &fa->fa_list : fa_head));
19baf839
RO
1263
1264 rt_cache_flush(-1);
1265 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
1266succeeded:
1267 return 0;
f835e471
RO
1268
1269out_free_new_fa:
1270 kmem_cache_free(fn_alias_kmem, new_fa);
19baf839
RO
1271out:
1272 fib_release_info(fi);
91b9a277 1273err:
19baf839
RO
1274 return err;
1275}
1276
2373ce1c 1277
772cb712 1278/* should be called with rcu_read_lock */
0c7770c7
SH
1279static inline int check_leaf(struct trie *t, struct leaf *l,
1280 t_key key, int *plen, const struct flowi *flp,
06c74270 1281 struct fib_result *res)
19baf839 1282{
06c74270 1283 int err, i;
19baf839
RO
1284 t_key mask;
1285 struct leaf_info *li;
1286 struct hlist_head *hhead = &l->list;
1287 struct hlist_node *node;
c877efb2 1288
2373ce1c 1289 hlist_for_each_entry_rcu(li, node, hhead, hlist) {
19baf839
RO
1290 i = li->plen;
1291 mask = ntohl(inet_make_mask(i));
c877efb2 1292 if (l->key != (key & mask))
19baf839
RO
1293 continue;
1294
06c74270 1295 if ((err = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) <= 0) {
19baf839
RO
1296 *plen = i;
1297#ifdef CONFIG_IP_FIB_TRIE_STATS
1298 t->stats.semantic_match_passed++;
1299#endif
06c74270 1300 return err;
19baf839
RO
1301 }
1302#ifdef CONFIG_IP_FIB_TRIE_STATS
1303 t->stats.semantic_match_miss++;
1304#endif
1305 }
06c74270 1306 return 1;
19baf839
RO
1307}
1308
/*
 * Look up flp->fl4_dst in the trie of @tb under rcu_read_lock().
 *
 * The walk descends using the bits of the key; whenever it hits an empty
 * slot, a leaf that does not match, or a subtree that cannot contain a
 * matching prefix, it backtracks by clearing set bits of the child index
 * (shortening the effective prefix) and, once a tnode's index bits are
 * exhausted, by moving up to the parent.
 *
 * Returns the check_leaf() result (<= 0) for the first leaf that
 * matched, or 1 when the trie is empty or nothing matched.
 */
static int
fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
{
	struct trie *t = (struct trie *) tb->tb_data;
	int plen, ret = 0;
	struct node *n;
	struct tnode *pn;
	int pos, bits;
	t_key key = ntohl(flp->fl4_dst);
	int chopped_off;
	t_key cindex = 0;
	int current_prefix_length = KEYLENGTH;
	struct tnode *cn;
	t_key node_prefix, key_prefix, pref_mismatch;
	int mp;

	rcu_read_lock();

	n = rcu_dereference(t->trie);
	if (!n)
		goto failed;

#ifdef CONFIG_IP_FIB_TRIE_STATS
	t->stats.gets++;
#endif

	/* Just a leaf? */
	if (IS_LEAF(n)) {
		if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
			goto found;
		goto failed;
	}
	pn = (struct tnode *) n;
	chopped_off = 0;

	while (pn) {
		pos = pn->pos;
		bits = pn->bits;

		/* Only recompute the index on a fresh visit; while
		 * backtracking within this tnode cindex is edited in place. */
		if (!chopped_off)
			cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);

		n = tnode_get_child(pn, cindex);

		if (n == NULL) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.null_node_hit++;
#endif
			goto backtrace;
		}

		if (IS_LEAF(n)) {
			if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
				goto found;
			else
				goto backtrace;
		}

#define HL_OPTIMIZE
#ifdef HL_OPTIMIZE
		cn = (struct tnode *)n;

		/*
		 * It's a tnode, and we can do some extra checks here if we
		 * like, to avoid descending into a dead-end branch.
		 * This tnode is in the parent's child array at index
		 * key[p_pos..p_pos+p_bits] but potentially with some bits
		 * chopped off, so in reality the index may be just a
		 * subprefix, padded with zero at the end.
		 * We can also take a look at any skipped bits in this
		 * tnode - everything up to p_pos is supposed to be ok,
		 * and the non-chopped bits of the index (see previous
		 * paragraph) are also guaranteed ok, but the rest is
		 * considered unknown.
		 *
		 * The skipped bits are key[pos+bits..cn->pos].
		 */

		/* If current_prefix_length < pos+bits, we are already doing
		 * actual prefix  matching, which means everything from
		 * pos+(bits-chopped_off) onward must be zero along some
		 * branch of this subtree - otherwise there is *no* valid
		 * prefix present. Here we can only check the skipped
		 * bits. Remember, since we have already indexed into the
		 * parent's child array, we know that the bits we chopped off
		 * *are* zero.
		 */

		/* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */

		if (current_prefix_length < pos+bits) {
			if (tkey_extract_bits(cn->key, current_prefix_length,
						cn->pos - current_prefix_length) != 0 ||
			    !(cn->child[0]))
				goto backtrace;
		}

		/*
		 * If chopped_off=0, the index is fully validated and we
		 * only need to look at the skipped bits for this, the new,
		 * tnode. What we actually want to do is to find out if
		 * these skipped bits match our key perfectly, or if we will
		 * have to count on finding a matching prefix further down,
		 * because if we do, we would like to have some way of
		 * verifying the existence of such a prefix at this point.
		 */

		/* The only thing we can do at this point is to verify that
		 * any such matching prefix can indeed be a prefix to our
		 * key, and if the bits in the node we are inspecting that
		 * do not match our key are not ZERO, this cannot be true.
		 * Thus, find out where there is a mismatch (before cn->pos)
		 * and verify that all the mismatching bits are zero in the
		 * new tnode's key.
		 */

		/* Note: We aren't very concerned about the piece of the key
		 * that precedes pn->pos+pn->bits, since these have already been
		 * checked. The bits after cn->pos aren't checked since these are
		 * by definition "unknown" at this point. Thus, what we want to
		 * see is if we are about to enter the "prefix matching" state,
		 * and in that case verify that the skipped bits that will prevail
		 * throughout this subtree are zero, as they have to be if we are
		 * to find a matching prefix.
		 */

		node_prefix = MASK_PFX(cn->key, cn->pos);
		key_prefix = MASK_PFX(key, cn->pos);
		pref_mismatch = key_prefix^node_prefix;
		mp = 0;

		/* In short: If skipped bits in this node do not match the search
		 * key, enter the "prefix matching" state directly.
		 */
		if (pref_mismatch) {
			/* mp becomes the position of the first differing bit,
			 * counted from the most significant end. */
			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
				mp++;
				pref_mismatch = pref_mismatch <<1;
			}
			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);

			if (key_prefix != 0)
				goto backtrace;

			if (current_prefix_length >= cn->pos)
				current_prefix_length = mp;
		}
#endif
		pn = (struct tnode *)n; /* Descend */
		chopped_off = 0;
		continue;

backtrace:
		chopped_off++;

		/* As zero don't change the child key (cindex) */
		while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
			chopped_off++;

		/* Decrease current_... with bits chopped off */
		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
			current_prefix_length = pn->pos + pn->bits - chopped_off;

		/*
		 * Either we do the actual chop off according or if we have
		 * chopped off all bits in this tnode walk up to our parent.
		 */

		if (chopped_off <= pn->bits) {
			cindex &= ~(1 << (chopped_off-1));
		} else {
			if (NODE_PARENT(pn) == NULL)
				goto failed;

			/* Get Child's index */
			cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
			pn = NODE_PARENT(pn);
			chopped_off = 0;

#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.backtrack++;
#endif
			goto backtrace;
		}
	}
failed:
	ret = 1;
found:
	rcu_read_unlock();
	return ret;
}
1500
2373ce1c 1501/* only called from updater side */
19baf839
RO
1502static int trie_leaf_remove(struct trie *t, t_key key)
1503{
1504 t_key cindex;
1505 struct tnode *tp = NULL;
1506 struct node *n = t->trie;
1507 struct leaf *l;
1508
0c7770c7 1509 pr_debug("entering trie_leaf_remove(%p)\n", n);
19baf839
RO
1510
1511 /* Note that in the case skipped bits, those bits are *not* checked!
c877efb2 1512 * When we finish this, we will have NULL or a T_LEAF, and the
19baf839
RO
1513 * T_LEAF may or may not match our key.
1514 */
1515
91b9a277 1516 while (n != NULL && IS_TNODE(n)) {
19baf839
RO
1517 struct tnode *tn = (struct tnode *) n;
1518 check_tnode(tn);
1519 n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
1520
0c7770c7 1521 BUG_ON(n && NODE_PARENT(n) != tn);
91b9a277 1522 }
19baf839
RO
1523 l = (struct leaf *) n;
1524
c877efb2 1525 if (!n || !tkey_equals(l->key, key))
19baf839 1526 return 0;
c877efb2
SH
1527
1528 /*
1529 * Key found.
1530 * Remove the leaf and rebalance the tree
19baf839
RO
1531 */
1532
1533 t->revision++;
1534 t->size--;
1535
2373ce1c 1536 preempt_disable();
19baf839
RO
1537 tp = NODE_PARENT(n);
1538 tnode_free((struct tnode *) n);
1539
c877efb2 1540 if (tp) {
19baf839
RO
1541 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1542 put_child(t, (struct tnode *)tp, cindex, NULL);
2373ce1c 1543 rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
91b9a277 1544 } else
2373ce1c
RO
1545 rcu_assign_pointer(t->trie, NULL);
1546 preempt_enable();
19baf839
RO
1547
1548 return 1;
1549}
1550
/*
 * Delete one route from the trie of @tb.
 *
 * The destination comes from @rta->rta_dst (0 if absent) and the prefix
 * length from @r->rtm_dst_len.  Among the aliases for that prefix with a
 * matching TOS, the first one that also matches the requested type, scope,
 * protocol (each treated as a wildcard when zero / RT_SCOPE_NOWHERE) and
 * nexthop (fib_nh_match() == 0) is removed.
 *
 * Returns 0 on success, -EINVAL for a bad prefix length or host bits set
 * in the destination, -ESRCH if no matching route exists.
 */
static int
fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
		struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
{
	struct trie *t = (struct trie *) tb->tb_data;
	u32 key, mask;
	int plen = r->rtm_dst_len;
	u8 tos = r->rtm_tos;
	struct fib_alias *fa, *fa_to_delete;
	struct list_head *fa_head;
	struct leaf *l;
	struct leaf_info *li;


	if (plen > 32)
		return -EINVAL;

	key = 0;
	if (rta->rta_dst)
		memcpy(&key, rta->rta_dst, 4);

	key = ntohl(key);
	mask = ntohl(inet_make_mask(plen));

	/* Reject destinations with bits set beyond the prefix length. */
	if (key & ~mask)
		return -EINVAL;

	key = key & mask;
	l = fib_find_node(t, key);

	if (!l)
		return -ESRCH;

	fa_head = get_fa_head(l, plen);
	fa = fib_find_alias(fa_head, tos, 0);

	if (!fa)
		return -ESRCH;

	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);

	fa_to_delete = NULL;
	/* From here on fa_head is the list node just before the first
	 * candidate, so the iteration below starts AT that candidate. */
	fa_head = fa->fa_list.prev;

	list_for_each_entry(fa, fa_head, fa_list) {
		struct fib_info *fi = fa->fa_info;

		if (fa->fa_tos != tos)
			break;

		if ((!r->rtm_type ||
		     fa->fa_type == r->rtm_type) &&
		    (r->rtm_scope == RT_SCOPE_NOWHERE ||
		     fa->fa_scope == r->rtm_scope) &&
		    (!r->rtm_protocol ||
		     fi->fib_protocol == r->rtm_protocol) &&
		    fib_nh_match(r, nlhdr, rta, fi) == 0) {
			fa_to_delete = fa;
			break;
		}
	}

	if (!fa_to_delete)
		return -ESRCH;

	fa = fa_to_delete;
	/* Notify before the alias is unlinked. */
	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);

	l = fib_find_node(t, key);
	li = find_leaf_info(l, plen);

	list_del_rcu(&fa->fa_list);

	/* Last alias for this prefix length gone: drop the leaf_info too. */
	if (list_empty(fa_head)) {
		hlist_del_rcu(&li->hlist);
		free_leaf_info(li);
	}

	/* No prefix lengths left on the leaf: remove it from the trie. */
	if (hlist_empty(&l->list))
		trie_leaf_remove(t, key);

	if (fa->fa_state & FA_S_ACCESSED)
		rt_cache_flush(-1);

	fib_release_info(fa->fa_info);
	alias_free_mem_rcu(fa);
	return 0;
}
1639
1640static int trie_flush_list(struct trie *t, struct list_head *head)
1641{
1642 struct fib_alias *fa, *fa_node;
1643 int found = 0;
1644
1645 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1646 struct fib_info *fi = fa->fa_info;
19baf839 1647
2373ce1c
RO
1648 if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1649 list_del_rcu(&fa->fa_list);
1650 fib_release_info(fa->fa_info);
1651 alias_free_mem_rcu(fa);
19baf839
RO
1652 found++;
1653 }
1654 }
1655 return found;
1656}
1657
1658static int trie_flush_leaf(struct trie *t, struct leaf *l)
1659{
1660 int found = 0;
1661 struct hlist_head *lih = &l->list;
1662 struct hlist_node *node, *tmp;
1663 struct leaf_info *li = NULL;
1664
1665 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
19baf839
RO
1666 found += trie_flush_list(t, &li->falh);
1667
1668 if (list_empty(&li->falh)) {
2373ce1c 1669 hlist_del_rcu(&li->hlist);
19baf839
RO
1670 free_leaf_info(li);
1671 }
1672 }
1673 return found;
1674}
1675
2373ce1c
RO
1676/* rcu_read_lock must be held by the caller when used from the read side */
1677
19baf839
RO
1678static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1679{
1680 struct node *c = (struct node *) thisleaf;
1681 struct tnode *p;
1682 int idx;
2373ce1c 1683 struct node *trie = rcu_dereference(t->trie);
19baf839 1684
c877efb2 1685 if (c == NULL) {
2373ce1c 1686 if (trie == NULL)
19baf839
RO
1687 return NULL;
1688
2373ce1c
RO
1689 if (IS_LEAF(trie)) /* trie w. just a leaf */
1690 return (struct leaf *) trie;
19baf839 1691
2373ce1c 1692 p = (struct tnode*) trie; /* Start */
91b9a277 1693 } else
19baf839 1694 p = (struct tnode *) NODE_PARENT(c);
c877efb2 1695
19baf839
RO
1696 while (p) {
1697 int pos, last;
1698
1699 /* Find the next child of the parent */
c877efb2
SH
1700 if (c)
1701 pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1702 else
19baf839
RO
1703 pos = 0;
1704
1705 last = 1 << p->bits;
91b9a277 1706 for (idx = pos; idx < last ; idx++) {
2373ce1c
RO
1707 c = rcu_dereference(p->child[idx]);
1708
1709 if (!c)
91b9a277
OJ
1710 continue;
1711
1712 /* Decend if tnode */
2373ce1c
RO
1713 while (IS_TNODE(c)) {
1714 p = (struct tnode *) c;
1715 idx = 0;
91b9a277
OJ
1716
1717 /* Rightmost non-NULL branch */
1718 if (p && IS_TNODE(p))
2373ce1c
RO
1719 while (!(c = rcu_dereference(p->child[idx]))
1720 && idx < (1<<p->bits)) idx++;
91b9a277
OJ
1721
1722 /* Done with this tnode? */
2373ce1c 1723 if (idx >= (1 << p->bits) || !c)
91b9a277 1724 goto up;
19baf839 1725 }
2373ce1c 1726 return (struct leaf *) c;
19baf839
RO
1727 }
1728up:
1729 /* No more children go up one step */
91b9a277 1730 c = (struct node *) p;
19baf839
RO
1731 p = (struct tnode *) NODE_PARENT(p);
1732 }
1733 return NULL; /* Ready. Root of trie */
1734}
1735
1736static int fn_trie_flush(struct fib_table *tb)
1737{
1738 struct trie *t = (struct trie *) tb->tb_data;
1739 struct leaf *ll = NULL, *l = NULL;
1740 int found = 0, h;
1741
1742 t->revision++;
1743
91b9a277 1744 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
19baf839
RO
1745 found += trie_flush_leaf(t, l);
1746
1747 if (ll && hlist_empty(&ll->list))
1748 trie_leaf_remove(t, ll->key);
1749 ll = l;
1750 }
1751
1752 if (ll && hlist_empty(&ll->list))
1753 trie_leaf_remove(t, ll->key);
1754
0c7770c7 1755 pr_debug("trie_flush found=%d\n", found);
19baf839
RO
1756 return found;
1757}
1758
/* Value last stored by fn_trie_select_default(): the chosen order, the
 * last_idx reported through fib_detect_death(), or -1.  It is also passed
 * by address to fib_detect_death(). */
static int trie_last_dflt = -1;

/*
 * Possibly replace res->fi with another fib_info taken from the aliases
 * stored for key 0 / prefix length 0.
 *
 * Only aliases with res->scope and type RTN_UNICAST, whose first nexthop
 * has a gateway with link scope, are considered; the scan stops at the
 * first alias whose priority exceeds that of res->fi.  Candidates are
 * vetted with fib_detect_death().
 * NOTE(review): the exact meaning of fib_detect_death()'s result and of
 * last_resort/last_idx is defined outside this chunk - confirm there.
 *
 * Runs under rcu_read_lock().  When res->fi is replaced, the old reference
 * is dropped with fib_info_put() and fib_clntref of the new one is
 * incremented.
 */
static void
fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
{
	struct trie *t = (struct trie *) tb->tb_data;
	int order, last_idx;
	struct fib_info *fi = NULL;
	struct fib_info *last_resort;
	struct fib_alias *fa = NULL;
	struct list_head *fa_head;
	struct leaf *l;

	last_idx = -1;
	last_resort = NULL;
	order = -1;

	rcu_read_lock();

	l = fib_find_node(t, 0);
	if (!l)
		goto out;

	fa_head = get_fa_head(l, 0);
	if (!fa_head)
		goto out;

	if (list_empty(fa_head))
		goto out;

	list_for_each_entry_rcu(fa, fa_head, fa_list) {
		struct fib_info *next_fi = fa->fa_info;

		if (fa->fa_scope != res->scope ||
		    fa->fa_type != RTN_UNICAST)
			continue;

		if (next_fi->fib_priority > res->fi->fib_priority)
			break;
		if (!next_fi->fib_nh[0].nh_gw ||
		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
			continue;
		fa->fa_state |= FA_S_ACCESSED;

		if (fi == NULL) {
			/* The first eligible alias must be the one already
			 * selected in res, otherwise give up. */
			if (next_fi != res->fi)
				break;
		} else if (!fib_detect_death(fi, order, &last_resort,
					     &last_idx, &trie_last_dflt)) {
			if (res->fi)
				fib_info_put(res->fi);
			res->fi = fi;
			atomic_inc(&fi->fib_clntref);
			trie_last_dflt = order;
			goto out;
		}
		fi = next_fi;
		order++;
	}
	if (order <= 0 || fi == NULL) {
		trie_last_dflt = -1;
		goto out;
	}

	/* The last candidate seen in the loop has not been vetted yet. */
	if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
		if (res->fi)
			fib_info_put(res->fi);
		res->fi = fi;
		atomic_inc(&fi->fib_clntref);
		trie_last_dflt = order;
		goto out;
	}
	if (last_idx >= 0) {
		if (res->fi)
			fib_info_put(res->fi);
		res->fi = last_resort;
		if (last_resort)
			atomic_inc(&last_resort->fib_clntref);
	}
	trie_last_dflt = last_idx;
 out:;
	rcu_read_unlock();
}
1842
c877efb2 1843static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
19baf839
RO
1844 struct sk_buff *skb, struct netlink_callback *cb)
1845{
1846 int i, s_i;
1847 struct fib_alias *fa;
1848
91b9a277 1849 u32 xkey = htonl(key);
19baf839 1850
91b9a277 1851 s_i = cb->args[3];
19baf839
RO
1852 i = 0;
1853
2373ce1c
RO
1854 /* rcu_read_lock is hold by caller */
1855
1856 list_for_each_entry_rcu(fa, fah, fa_list) {
19baf839
RO
1857 if (i < s_i) {
1858 i++;
1859 continue;
1860 }
78c6671a 1861 BUG_ON(!fa->fa_info);
19baf839
RO
1862
1863 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1864 cb->nlh->nlmsg_seq,
1865 RTM_NEWROUTE,
1866 tb->tb_id,
1867 fa->fa_type,
1868 fa->fa_scope,
1869 &xkey,
1870 plen,
1871 fa->fa_tos,
90f66914 1872 fa->fa_info, 0) < 0) {
19baf839
RO
1873 cb->args[3] = i;
1874 return -1;
91b9a277 1875 }
19baf839
RO
1876 i++;
1877 }
91b9a277 1878 cb->args[3] = i;
19baf839
RO
1879 return skb->len;
1880}
1881
c877efb2 1882static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
19baf839
RO
1883 struct netlink_callback *cb)
1884{
1885 int h, s_h;
1886 struct list_head *fa_head;
1887 struct leaf *l = NULL;
19baf839 1888
91b9a277 1889 s_h = cb->args[2];
19baf839 1890
91b9a277 1891 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
19baf839
RO
1892 if (h < s_h)
1893 continue;
1894 if (h > s_h)
1895 memset(&cb->args[3], 0,
1896 sizeof(cb->args) - 3*sizeof(cb->args[0]));
1897
1898 fa_head = get_fa_head(l, plen);
91b9a277 1899
c877efb2 1900 if (!fa_head)
19baf839
RO
1901 continue;
1902
c877efb2 1903 if (list_empty(fa_head))
19baf839
RO
1904 continue;
1905
1906 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
91b9a277 1907 cb->args[2] = h;
19baf839
RO
1908 return -1;
1909 }
1910 }
91b9a277 1911 cb->args[2] = h;
19baf839
RO
1912 return skb->len;
1913}
1914
1915static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1916{
1917 int m, s_m;
1918 struct trie *t = (struct trie *) tb->tb_data;
1919
1920 s_m = cb->args[1];
1921
2373ce1c 1922 rcu_read_lock();
91b9a277 1923 for (m = 0; m <= 32; m++) {
19baf839
RO
1924 if (m < s_m)
1925 continue;
1926 if (m > s_m)
1927 memset(&cb->args[2], 0,
91b9a277 1928 sizeof(cb->args) - 2*sizeof(cb->args[0]));
19baf839
RO
1929
1930 if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
1931 cb->args[1] = m;
1932 goto out;
1933 }
1934 }
2373ce1c 1935 rcu_read_unlock();
19baf839
RO
1936 cb->args[1] = m;
1937 return skb->len;
91b9a277 1938out:
2373ce1c 1939 rcu_read_unlock();
19baf839
RO
1940 return -1;
1941}
1942
1943/* Fix more generic FIB names for init later */
1944
1945#ifdef CONFIG_IP_MULTIPLE_TABLES
1946struct fib_table * fib_hash_init(int id)
1947#else
1948struct fib_table * __init fib_hash_init(int id)
1949#endif
1950{
1951 struct fib_table *tb;
1952 struct trie *t;
1953
1954 if (fn_alias_kmem == NULL)
1955 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1956 sizeof(struct fib_alias),
1957 0, SLAB_HWCACHE_ALIGN,
1958 NULL, NULL);
1959
1960 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1961 GFP_KERNEL);
1962 if (tb == NULL)
1963 return NULL;
1964
1965 tb->tb_id = id;
1966 tb->tb_lookup = fn_trie_lookup;
1967 tb->tb_insert = fn_trie_insert;
1968 tb->tb_delete = fn_trie_delete;
1969 tb->tb_flush = fn_trie_flush;
1970 tb->tb_select_default = fn_trie_select_default;
1971 tb->tb_dump = fn_trie_dump;
1972 memset(tb->tb_data, 0, sizeof(struct trie));
1973
1974 t = (struct trie *) tb->tb_data;
1975
1976 trie_init(t);
1977
c877efb2 1978 if (id == RT_TABLE_LOCAL)
91b9a277 1979 trie_local = t;
c877efb2 1980 else if (id == RT_TABLE_MAIN)
91b9a277 1981 trie_main = t;
19baf839
RO
1982
1983 if (id == RT_TABLE_LOCAL)
78c6671a 1984 printk(KERN_INFO "IPv4 FIB: Using LC-trie version %s\n", VERSION);
19baf839
RO
1985
1986 return tb;
1987}
1988
cb7b593c
SH
1989#ifdef CONFIG_PROC_FS
/*
 * Depth first Trie walk iterator.
 * Initialised by fib_trie_get_first() and advanced by fib_trie_get_next().
 */
struct fib_trie_iter {
	struct tnode *tnode;	/* tnode whose children are being scanned */
	struct trie *trie;	/* trie being walked, set by get_first */
	unsigned index;		/* next child index of tnode to examine */
	unsigned depth;		/* starts at 1; +1 on descend, -1 on pop */
};
19baf839 1997
cb7b593c 1998static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
19baf839 1999{
cb7b593c
SH
2000 struct tnode *tn = iter->tnode;
2001 unsigned cindex = iter->index;
2002 struct tnode *p;
19baf839 2003
cb7b593c
SH
2004 pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2005 iter->tnode, iter->index, iter->depth);
2006rescan:
2007 while (cindex < (1<<tn->bits)) {
2008 struct node *n = tnode_get_child(tn, cindex);
19baf839 2009
cb7b593c
SH
2010 if (n) {
2011 if (IS_LEAF(n)) {
2012 iter->tnode = tn;
2013 iter->index = cindex + 1;
2014 } else {
2015 /* push down one level */
2016 iter->tnode = (struct tnode *) n;
2017 iter->index = 0;
2018 ++iter->depth;
2019 }
2020 return n;
2021 }
19baf839 2022
cb7b593c
SH
2023 ++cindex;
2024 }
91b9a277 2025
cb7b593c
SH
2026 /* Current node exhausted, pop back up */
2027 p = NODE_PARENT(tn);
2028 if (p) {
2029 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2030 tn = p;
2031 --iter->depth;
2032 goto rescan;
19baf839 2033 }
cb7b593c
SH
2034
2035 /* got root? */
2036 return NULL;
19baf839
RO
2037}
2038
cb7b593c
SH
2039static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2040 struct trie *t)
19baf839 2041{
5ddf0eb2
RO
2042 struct node *n ;
2043
2044 if(!t)
2045 return NULL;
2046
2047 n = rcu_dereference(t->trie);
2048
2049 if(!iter)
2050 return NULL;
19baf839 2051
cb7b593c
SH
2052 if (n && IS_TNODE(n)) {
2053 iter->tnode = (struct tnode *) n;
2054 iter->trie = t;
2055 iter->index = 0;
1d25cd6c 2056 iter->depth = 1;
cb7b593c 2057 return n;
91b9a277 2058 }
cb7b593c
SH
2059 return NULL;
2060}
91b9a277 2061
cb7b593c
SH
2062static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2063{
2064 struct node *n;
2065 struct fib_trie_iter iter;
91b9a277 2066
cb7b593c 2067 memset(s, 0, sizeof(*s));
91b9a277 2068
cb7b593c
SH
2069 rcu_read_lock();
2070 for (n = fib_trie_get_first(&iter, t); n;
2071 n = fib_trie_get_next(&iter)) {
2072 if (IS_LEAF(n)) {
2073 s->leaves++;
2074 s->totdepth += iter.depth;
2075 if (iter.depth > s->maxdepth)
2076 s->maxdepth = iter.depth;
2077 } else {
2078 const struct tnode *tn = (const struct tnode *) n;
2079 int i;
2080
2081 s->tnodes++;
06ef921d
RO
2082 if(tn->bits < MAX_STAT_DEPTH)
2083 s->nodesizes[tn->bits]++;
2084
cb7b593c
SH
2085 for (i = 0; i < (1<<tn->bits); i++)
2086 if (!tn->child[i])
2087 s->nullpointers++;
19baf839 2088 }
19baf839 2089 }
2373ce1c 2090 rcu_read_unlock();
19baf839
RO
2091}
2092
cb7b593c
SH
2093/*
2094 * This outputs /proc/net/fib_triestats
2095 */
2096static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
19baf839 2097{
cb7b593c 2098 unsigned i, max, pointers, bytes, avdepth;
c877efb2 2099
cb7b593c
SH
2100 if (stat->leaves)
2101 avdepth = stat->totdepth*100 / stat->leaves;
2102 else
2103 avdepth = 0;
91b9a277 2104
cb7b593c
SH
2105 seq_printf(seq, "\tAver depth: %d.%02d\n", avdepth / 100, avdepth % 100 );
2106 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
91b9a277 2107
cb7b593c 2108 seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
91b9a277 2109
cb7b593c
SH
2110 bytes = sizeof(struct leaf) * stat->leaves;
2111 seq_printf(seq, "\tInternal nodes: %d\n\t", stat->tnodes);
2112 bytes += sizeof(struct tnode) * stat->tnodes;
19baf839 2113
06ef921d
RO
2114 max = MAX_STAT_DEPTH;
2115 while (max > 0 && stat->nodesizes[max-1] == 0)
cb7b593c 2116 max--;
19baf839 2117
cb7b593c
SH
2118 pointers = 0;
2119 for (i = 1; i <= max; i++)
2120 if (stat->nodesizes[i] != 0) {
2121 seq_printf(seq, " %d: %d", i, stat->nodesizes[i]);
2122 pointers += (1<<i) * stat->nodesizes[i];
2123 }
2124 seq_putc(seq, '\n');
2125 seq_printf(seq, "\tPointers: %d\n", pointers);
2373ce1c 2126
cb7b593c
SH
2127 bytes += sizeof(struct node *) * pointers;
2128 seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2129 seq_printf(seq, "Total size: %d kB\n", (bytes + 1023) / 1024);
2373ce1c 2130
cb7b593c
SH
2131#ifdef CONFIG_IP_FIB_TRIE_STATS
2132 seq_printf(seq, "Counters:\n---------\n");
2133 seq_printf(seq,"gets = %d\n", t->stats.gets);
2134 seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
2135 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2136 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2137 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2138 seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
2139#ifdef CLEAR_STATS
2140 memset(&(t->stats), 0, sizeof(t->stats));
2141#endif
2142#endif /* CONFIG_IP_FIB_TRIE_STATS */
2143}
19baf839 2144
cb7b593c
SH
2145static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2146{
2147 struct trie_stat *stat;
91b9a277 2148
cb7b593c
SH
2149 stat = kmalloc(sizeof(*stat), GFP_KERNEL);
2150 if (!stat)
2151 return -ENOMEM;
91b9a277 2152
cb7b593c
SH
2153 seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2154 sizeof(struct leaf), sizeof(struct tnode));
91b9a277 2155
cb7b593c
SH
2156 if (trie_local) {
2157 seq_printf(seq, "Local:\n");
2158 trie_collect_stats(trie_local, stat);
2159 trie_show_stats(seq, stat);
2160 }
91b9a277 2161
cb7b593c
SH
2162 if (trie_main) {
2163 seq_printf(seq, "Main:\n");
2164 trie_collect_stats(trie_main, stat);
2165 trie_show_stats(seq, stat);
19baf839 2166 }
cb7b593c 2167 kfree(stat);
19baf839 2168
cb7b593c 2169 return 0;
19baf839
RO
2170}
2171
cb7b593c 2172static int fib_triestat_seq_open(struct inode *inode, struct file *file)
19baf839 2173{
cb7b593c 2174 return single_open(file, fib_triestat_seq_show, NULL);
19baf839
RO
2175}
2176
cb7b593c
SH
2177static struct file_operations fib_triestat_fops = {
2178 .owner = THIS_MODULE,
2179 .open = fib_triestat_seq_open,
2180 .read = seq_read,
2181 .llseek = seq_lseek,
2182 .release = single_release,
2183};
2184
2185static struct node *fib_trie_get_idx(struct fib_trie_iter *iter,
2186 loff_t pos)
19baf839 2187{
cb7b593c
SH
2188 loff_t idx = 0;
2189 struct node *n;
2190
2191 for (n = fib_trie_get_first(iter, trie_local);
2192 n; ++idx, n = fib_trie_get_next(iter)) {
2193 if (pos == idx)
2194 return n;
2195 }
2196
2197 for (n = fib_trie_get_first(iter, trie_main);
2198 n; ++idx, n = fib_trie_get_next(iter)) {
2199 if (pos == idx)
2200 return n;
2201 }
19baf839
RO
2202 return NULL;
2203}
2204
cb7b593c 2205static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
19baf839 2206{
cb7b593c
SH
2207 rcu_read_lock();
2208 if (*pos == 0)
91b9a277 2209 return SEQ_START_TOKEN;
cb7b593c 2210 return fib_trie_get_idx(seq->private, *pos - 1);
19baf839
RO
2211}
2212
cb7b593c 2213static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
19baf839 2214{
cb7b593c
SH
2215 struct fib_trie_iter *iter = seq->private;
2216 void *l = v;
2217
19baf839 2218 ++*pos;
91b9a277 2219 if (v == SEQ_START_TOKEN)
cb7b593c 2220 return fib_trie_get_idx(iter, 0);
19baf839 2221
cb7b593c
SH
2222 v = fib_trie_get_next(iter);
2223 BUG_ON(v == l);
2224 if (v)
2225 return v;
19baf839 2226
cb7b593c
SH
2227 /* continue scan in next trie */
2228 if (iter->trie == trie_local)
2229 return fib_trie_get_first(iter, trie_main);
19baf839 2230
cb7b593c
SH
2231 return NULL;
2232}
19baf839 2233
/*
 * seq_file stop callback: drops the RCU read lock taken in
 * fib_trie_seq_start().  Neither argument is used.
 */
static void fib_trie_seq_stop(struct seq_file *seq, void *v)
{
	rcu_read_unlock();
}
91b9a277 2238
cb7b593c
SH
/* Emit @n indentation units to @seq; nothing is written when @n <= 0. */
static void seq_indent(struct seq_file *seq, int n)
{
	int level;

	for (level = 0; level < n; level++)
		seq_puts(seq, " ");
}
19baf839 2243
cb7b593c
SH
2244static inline const char *rtn_scope(enum rt_scope_t s)
2245{
2246 static char buf[32];
19baf839 2247
cb7b593c
SH
2248 switch(s) {
2249 case RT_SCOPE_UNIVERSE: return "universe";
2250 case RT_SCOPE_SITE: return "site";
2251 case RT_SCOPE_LINK: return "link";
2252 case RT_SCOPE_HOST: return "host";
2253 case RT_SCOPE_NOWHERE: return "nowhere";
2254 default:
2255 snprintf(buf, sizeof(buf), "scope=%d", s);
2256 return buf;
2257 }
2258}
19baf839 2259
cb7b593c
SH
2260static const char *rtn_type_names[__RTN_MAX] = {
2261 [RTN_UNSPEC] = "UNSPEC",
2262 [RTN_UNICAST] = "UNICAST",
2263 [RTN_LOCAL] = "LOCAL",
2264 [RTN_BROADCAST] = "BROADCAST",
2265 [RTN_ANYCAST] = "ANYCAST",
2266 [RTN_MULTICAST] = "MULTICAST",
2267 [RTN_BLACKHOLE] = "BLACKHOLE",
2268 [RTN_UNREACHABLE] = "UNREACHABLE",
2269 [RTN_PROHIBIT] = "PROHIBIT",
2270 [RTN_THROW] = "THROW",
2271 [RTN_NAT] = "NAT",
2272 [RTN_XRESOLVE] = "XRESOLVE",
2273};
19baf839 2274
cb7b593c
SH
2275static inline const char *rtn_type(unsigned t)
2276{
2277 static char buf[32];
19baf839 2278
cb7b593c
SH
2279 if (t < __RTN_MAX && rtn_type_names[t])
2280 return rtn_type_names[t];
2281 snprintf(buf, sizeof(buf), "type %d", t);
2282 return buf;
19baf839
RO
2283}
2284
cb7b593c
SH
2285/* Pretty print the trie */
2286static int fib_trie_seq_show(struct seq_file *seq, void *v)
19baf839 2287{
cb7b593c
SH
2288 const struct fib_trie_iter *iter = seq->private;
2289 struct node *n = v;
c877efb2 2290
cb7b593c
SH
2291 if (v == SEQ_START_TOKEN)
2292 return 0;
19baf839 2293
cb7b593c
SH
2294 if (IS_TNODE(n)) {
2295 struct tnode *tn = (struct tnode *) n;
2296 t_key prf = ntohl(MASK_PFX(tn->key, tn->pos));
91b9a277 2297
cb7b593c
SH
2298 if (!NODE_PARENT(n)) {
2299 if (iter->trie == trie_local)
2300 seq_puts(seq, "<local>:\n");
2301 else
2302 seq_puts(seq, "<main>:\n");
1d25cd6c
RO
2303 }
2304 seq_indent(seq, iter->depth-1);
2305 seq_printf(seq, " +-- %d.%d.%d.%d/%d %d %d %d\n",
2306 NIPQUAD(prf), tn->pos, tn->bits, tn->full_children,
2307 tn->empty_children);
2308
cb7b593c
SH
2309 } else {
2310 struct leaf *l = (struct leaf *) n;
2311 int i;
2312 u32 val = ntohl(l->key);
2313
2314 seq_indent(seq, iter->depth);
2315 seq_printf(seq, " |-- %d.%d.%d.%d\n", NIPQUAD(val));
2316 for (i = 32; i >= 0; i--) {
772cb712 2317 struct leaf_info *li = find_leaf_info(l, i);
cb7b593c
SH
2318 if (li) {
2319 struct fib_alias *fa;
2320 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2321 seq_indent(seq, iter->depth+1);
2322 seq_printf(seq, " /%d %s %s", i,
2323 rtn_scope(fa->fa_scope),
2324 rtn_type(fa->fa_type));
2325 if (fa->fa_tos)
2326 seq_printf(seq, "tos =%d\n",
2327 fa->fa_tos);
2328 seq_putc(seq, '\n');
2329 }
2330 }
2331 }
19baf839 2332 }
cb7b593c 2333
19baf839
RO
2334 return 0;
2335}
2336
cb7b593c
SH
2337static struct seq_operations fib_trie_seq_ops = {
2338 .start = fib_trie_seq_start,
2339 .next = fib_trie_seq_next,
2340 .stop = fib_trie_seq_stop,
2341 .show = fib_trie_seq_show,
19baf839
RO
2342};
2343
cb7b593c 2344static int fib_trie_seq_open(struct inode *inode, struct file *file)
19baf839
RO
2345{
2346 struct seq_file *seq;
2347 int rc = -ENOMEM;
cb7b593c 2348 struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
19baf839 2349
cb7b593c
SH
2350 if (!s)
2351 goto out;
2352
2353 rc = seq_open(file, &fib_trie_seq_ops);
19baf839
RO
2354 if (rc)
2355 goto out_kfree;
2356
cb7b593c
SH
2357 seq = file->private_data;
2358 seq->private = s;
2359 memset(s, 0, sizeof(*s));
19baf839
RO
2360out:
2361 return rc;
2362out_kfree:
cb7b593c 2363 kfree(s);
19baf839
RO
2364 goto out;
2365}
2366
cb7b593c
SH
2367static struct file_operations fib_trie_fops = {
2368 .owner = THIS_MODULE,
2369 .open = fib_trie_seq_open,
2370 .read = seq_read,
2371 .llseek = seq_lseek,
c877efb2 2372 .release = seq_release_private,
19baf839
RO
2373};
2374
cb7b593c 2375static unsigned fib_flag_trans(int type, u32 mask, const struct fib_info *fi)
19baf839 2376{
cb7b593c
SH
2377 static unsigned type2flags[RTN_MAX + 1] = {
2378 [7] = RTF_REJECT, [8] = RTF_REJECT,
2379 };
2380 unsigned flags = type2flags[type];
19baf839 2381
cb7b593c
SH
2382 if (fi && fi->fib_nh->nh_gw)
2383 flags |= RTF_GATEWAY;
2384 if (mask == 0xFFFFFFFF)
2385 flags |= RTF_HOST;
2386 flags |= RTF_UP;
2387 return flags;
19baf839
RO
2388}
2389
cb7b593c
SH
2390/*
2391 * This outputs /proc/net/route.
2392 * The format of the file is not supposed to be changed
2393 * and needs to be same as fib_hash output to avoid breaking
2394 * legacy utilities
2395 */
2396static int fib_route_seq_show(struct seq_file *seq, void *v)
19baf839 2397{
c9e53cbe 2398 const struct fib_trie_iter *iter = seq->private;
cb7b593c
SH
2399 struct leaf *l = v;
2400 int i;
2401 char bf[128];
19baf839 2402
cb7b593c
SH
2403 if (v == SEQ_START_TOKEN) {
2404 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2405 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2406 "\tWindow\tIRTT");
2407 return 0;
2408 }
19baf839 2409
c9e53cbe
PM
2410 if (iter->trie == trie_local)
2411 return 0;
cb7b593c
SH
2412 if (IS_TNODE(l))
2413 return 0;
19baf839 2414
cb7b593c 2415 for (i=32; i>=0; i--) {
772cb712 2416 struct leaf_info *li = find_leaf_info(l, i);
cb7b593c
SH
2417 struct fib_alias *fa;
2418 u32 mask, prefix;
91b9a277 2419
cb7b593c
SH
2420 if (!li)
2421 continue;
19baf839 2422
cb7b593c
SH
2423 mask = inet_make_mask(li->plen);
2424 prefix = htonl(l->key);
19baf839 2425
cb7b593c 2426 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1371e37d 2427 const struct fib_info *fi = fa->fa_info;
cb7b593c 2428 unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
19baf839 2429
cb7b593c
SH
2430 if (fa->fa_type == RTN_BROADCAST
2431 || fa->fa_type == RTN_MULTICAST)
2432 continue;
19baf839 2433
cb7b593c
SH
2434 if (fi)
2435 snprintf(bf, sizeof(bf),
2436 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2437 fi->fib_dev ? fi->fib_dev->name : "*",
2438 prefix,
2439 fi->fib_nh->nh_gw, flags, 0, 0,
2440 fi->fib_priority,
2441 mask,
2442 (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
2443 fi->fib_window,
2444 fi->fib_rtt >> 3);
2445 else
2446 snprintf(bf, sizeof(bf),
2447 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2448 prefix, 0, flags, 0, 0, 0,
2449 mask, 0, 0, 0);
19baf839 2450
cb7b593c
SH
2451 seq_printf(seq, "%-127s\n", bf);
2452 }
19baf839
RO
2453 }
2454
2455 return 0;
2456}
2457
cb7b593c
SH
2458static struct seq_operations fib_route_seq_ops = {
2459 .start = fib_trie_seq_start,
2460 .next = fib_trie_seq_next,
2461 .stop = fib_trie_seq_stop,
2462 .show = fib_route_seq_show,
19baf839
RO
2463};
2464
cb7b593c 2465static int fib_route_seq_open(struct inode *inode, struct file *file)
19baf839
RO
2466{
2467 struct seq_file *seq;
2468 int rc = -ENOMEM;
cb7b593c 2469 struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
19baf839 2470
cb7b593c
SH
2471 if (!s)
2472 goto out;
2473
2474 rc = seq_open(file, &fib_route_seq_ops);
19baf839
RO
2475 if (rc)
2476 goto out_kfree;
2477
cb7b593c
SH
2478 seq = file->private_data;
2479 seq->private = s;
2480 memset(s, 0, sizeof(*s));
19baf839
RO
2481out:
2482 return rc;
2483out_kfree:
cb7b593c 2484 kfree(s);
19baf839
RO
2485 goto out;
2486}
2487
cb7b593c
SH
2488static struct file_operations fib_route_fops = {
2489 .owner = THIS_MODULE,
2490 .open = fib_route_seq_open,
2491 .read = seq_read,
2492 .llseek = seq_lseek,
2493 .release = seq_release_private,
19baf839
RO
2494};
2495
2496int __init fib_proc_init(void)
2497{
cb7b593c
SH
2498 if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_fops))
2499 goto out1;
2500
2501 if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_fops))
2502 goto out2;
2503
2504 if (!proc_net_fops_create("route", S_IRUGO, &fib_route_fops))
2505 goto out3;
2506
19baf839 2507 return 0;
cb7b593c
SH
2508
2509out3:
2510 proc_net_remove("fib_triestat");
2511out2:
2512 proc_net_remove("fib_trie");
2513out1:
2514 return -ENOMEM;
19baf839
RO
2515}
2516
2517void __init fib_proc_exit(void)
2518{
2519 proc_net_remove("fib_trie");
cb7b593c
SH
2520 proc_net_remove("fib_triestat");
2521 proc_net_remove("route");
19baf839
RO
2522}
2523
2524#endif /* CONFIG_PROC_FS */