]> git.proxmox.com Git - mirror_ubuntu-jammy-kernel.git/blame - net/ipv4/fib_trie.c
tcp: gso: do not generate out of order packets
[mirror_ubuntu-jammy-kernel.git] / net / ipv4 / fib_trie.c
CommitLineData
19baf839
RO
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
e905a9ed 10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
19baf839 11 * Agricultural Sciences.
e905a9ed 12 *
19baf839
RO
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
25985edc 15 * This work is based on the LPC-trie which is originally described in:
e905a9ed 16 *
19baf839
RO
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
631dd1a8 19 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
19baf839
RO
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
19baf839
RO
25 *
26 * Code from fib_hash has been reused which includes the following header:
27 *
28 *
29 * INET An implementation of the TCP/IP protocol suite for the LINUX
30 * operating system. INET is implemented using the BSD Socket
31 * interface as the means of communication with the user level.
32 *
33 * IPv4 FIB: lookup engine and maintenance routines.
34 *
35 *
36 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
37 *
38 * This program is free software; you can redistribute it and/or
39 * modify it under the terms of the GNU General Public License
40 * as published by the Free Software Foundation; either version
41 * 2 of the License, or (at your option) any later version.
fd966255
RO
42 *
43 * Substantial contributions to this work comes from:
44 *
45 * David S. Miller, <davem@davemloft.net>
46 * Stephen Hemminger <shemminger@osdl.org>
47 * Paul E. McKenney <paulmck@us.ibm.com>
48 * Patrick McHardy <kaber@trash.net>
19baf839
RO
49 */
50
80b71b80 51#define VERSION "0.409"
19baf839 52
19baf839 53#include <asm/uaccess.h>
1977f032 54#include <linux/bitops.h>
19baf839
RO
55#include <linux/types.h>
56#include <linux/kernel.h>
19baf839
RO
57#include <linux/mm.h>
58#include <linux/string.h>
59#include <linux/socket.h>
60#include <linux/sockios.h>
61#include <linux/errno.h>
62#include <linux/in.h>
63#include <linux/inet.h>
cd8787ab 64#include <linux/inetdevice.h>
19baf839
RO
65#include <linux/netdevice.h>
66#include <linux/if_arp.h>
67#include <linux/proc_fs.h>
2373ce1c 68#include <linux/rcupdate.h>
19baf839
RO
69#include <linux/skbuff.h>
70#include <linux/netlink.h>
71#include <linux/init.h>
72#include <linux/list.h>
5a0e3ad6 73#include <linux/slab.h>
70c71606 74#include <linux/prefetch.h>
bc3b2d7f 75#include <linux/export.h>
457c4cbc 76#include <net/net_namespace.h>
19baf839
RO
77#include <net/ip.h>
78#include <net/protocol.h>
79#include <net/route.h>
80#include <net/tcp.h>
81#include <net/sock.h>
82#include <net/ip_fib.h>
83#include "fib_lookup.h"
84
06ef921d 85#define MAX_STAT_DEPTH 32
19baf839 86
19baf839 87#define KEYLENGTH (8*sizeof(t_key))
19baf839 88
19baf839
RO
89typedef unsigned int t_key;
90
91#define T_TNODE 0
92#define T_LEAF 1
93#define NODE_TYPE_MASK 0x1UL
2373ce1c
RO
94#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
95
91b9a277
OJ
96#define IS_TNODE(n) (!(n->parent & T_LEAF))
97#define IS_LEAF(n) (n->parent & T_LEAF)
19baf839 98
b299e4f0 99struct rt_trie_node {
91b9a277 100 unsigned long parent;
8d965444 101 t_key key;
19baf839
RO
102};
103
104struct leaf {
91b9a277 105 unsigned long parent;
8d965444 106 t_key key;
19baf839 107 struct hlist_head list;
2373ce1c 108 struct rcu_head rcu;
19baf839
RO
109};
110
111struct leaf_info {
112 struct hlist_node hlist;
113 int plen;
5c74501f 114 u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
19baf839 115 struct list_head falh;
5c74501f 116 struct rcu_head rcu;
19baf839
RO
117};
118
119struct tnode {
91b9a277 120 unsigned long parent;
8d965444 121 t_key key;
112d8cfc
ED
122 unsigned char pos; /* 2log(KEYLENGTH) bits needed */
123 unsigned char bits; /* 2log(KEYLENGTH) bits needed */
8d965444
ED
124 unsigned int full_children; /* KEYLENGTH bits needed */
125 unsigned int empty_children; /* KEYLENGTH bits needed */
15be75cd
SH
126 union {
127 struct rcu_head rcu;
e0f7cb8c 128 struct tnode *tnode_free;
15be75cd 129 };
0a5c0475 130 struct rt_trie_node __rcu *child[0];
19baf839
RO
131};
132
133#ifdef CONFIG_IP_FIB_TRIE_STATS
134struct trie_use_stats {
135 unsigned int gets;
136 unsigned int backtrack;
137 unsigned int semantic_match_passed;
138 unsigned int semantic_match_miss;
139 unsigned int null_node_hit;
2f36895a 140 unsigned int resize_node_skipped;
19baf839
RO
141};
142#endif
143
144struct trie_stat {
145 unsigned int totdepth;
146 unsigned int maxdepth;
147 unsigned int tnodes;
148 unsigned int leaves;
149 unsigned int nullpointers;
93672292 150 unsigned int prefixes;
06ef921d 151 unsigned int nodesizes[MAX_STAT_DEPTH];
c877efb2 152};
19baf839
RO
153
154struct trie {
0a5c0475 155 struct rt_trie_node __rcu *trie;
19baf839
RO
156#ifdef CONFIG_IP_FIB_TRIE_STATS
157 struct trie_use_stats stats;
158#endif
19baf839
RO
159};
160
b299e4f0 161static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
a07f5f50 162 int wasfull);
b299e4f0 163static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
2f80b3c8
RO
164static struct tnode *inflate(struct trie *t, struct tnode *tn);
165static struct tnode *halve(struct trie *t, struct tnode *tn);
e0f7cb8c
JP
166/* tnodes to free after resize(); protected by RTNL */
167static struct tnode *tnode_free_head;
c3059477
JP
168static size_t tnode_free_size;
169
170/*
171 * synchronize_rcu after call_rcu for that many pages; it should be especially
172 * useful before resizing the root node with PREEMPT_NONE configs; the value was
173 * obtained experimentally, aiming to avoid visible slowdown.
174 */
175static const int sync_pages = 128;
19baf839 176
e18b890b 177static struct kmem_cache *fn_alias_kmem __read_mostly;
bc3c8c1e 178static struct kmem_cache *trie_leaf_kmem __read_mostly;
19baf839 179
0a5c0475
ED
180/*
181 * caller must hold RTNL
182 */
183static inline struct tnode *node_parent(const struct rt_trie_node *node)
06801916 184{
0a5c0475
ED
185 unsigned long parent;
186
187 parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held());
188
189 return (struct tnode *)(parent & ~NODE_TYPE_MASK);
b59cfbf7
ED
190}
191
0a5c0475
ED
192/*
193 * caller must hold RCU read lock or RTNL
194 */
195static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node)
b59cfbf7 196{
0a5c0475
ED
197 unsigned long parent;
198
199 parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() ||
200 lockdep_rtnl_is_held());
06801916 201
0a5c0475 202 return (struct tnode *)(parent & ~NODE_TYPE_MASK);
06801916
SH
203}
204
cf778b00 205/* Same as rcu_assign_pointer
6440cc9e
SH
206 * but that macro() assumes that value is a pointer.
207 */
b299e4f0 208static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
06801916 209{
6440cc9e
SH
210 smp_wmb();
211 node->parent = (unsigned long)ptr | NODE_TYPE(node);
06801916 212}
2373ce1c 213
0a5c0475
ED
214/*
215 * caller must hold RTNL
216 */
217static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)
b59cfbf7
ED
218{
219 BUG_ON(i >= 1U << tn->bits);
2373ce1c 220
0a5c0475 221 return rtnl_dereference(tn->child[i]);
b59cfbf7
ED
222}
223
0a5c0475
ED
224/*
225 * caller must hold RCU read lock or RTNL
226 */
227static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
19baf839 228{
0a5c0475 229 BUG_ON(i >= 1U << tn->bits);
19baf839 230
0a5c0475 231 return rcu_dereference_rtnl(tn->child[i]);
19baf839
RO
232}
233
bb435b8d 234static inline int tnode_child_length(const struct tnode *tn)
19baf839 235{
91b9a277 236 return 1 << tn->bits;
19baf839
RO
237}
238
3b004569 239static inline t_key mask_pfx(t_key k, unsigned int l)
ab66b4a7
SH
240{
241 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
242}
243
3b004569 244static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
19baf839 245{
91b9a277 246 if (offset < KEYLENGTH)
19baf839 247 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
91b9a277 248 else
19baf839
RO
249 return 0;
250}
251
252static inline int tkey_equals(t_key a, t_key b)
253{
c877efb2 254 return a == b;
19baf839
RO
255}
256
257static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
258{
c877efb2
SH
259 if (bits == 0 || offset >= KEYLENGTH)
260 return 1;
91b9a277
OJ
261 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
262 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
c877efb2 263}
19baf839
RO
264
265static inline int tkey_mismatch(t_key a, int offset, t_key b)
266{
267 t_key diff = a ^ b;
268 int i = offset;
269
c877efb2
SH
270 if (!diff)
271 return 0;
272 while ((diff << i) >> (KEYLENGTH-1) == 0)
19baf839
RO
273 i++;
274 return i;
275}
276
19baf839 277/*
e905a9ed
YH
278 To understand this stuff, an understanding of keys and all their bits is
279 necessary. Every node in the trie has a key associated with it, but not
19baf839
RO
280 all of the bits in that key are significant.
281
282 Consider a node 'n' and its parent 'tp'.
283
e905a9ed
YH
284 If n is a leaf, every bit in its key is significant. Its presence is
285 necessitated by path compression, since during a tree traversal (when
286 searching for a leaf - unless we are doing an insertion) we will completely
287 ignore all skipped bits we encounter. Thus we need to verify, at the end of
288 a potentially successful search, that we have indeed been walking the
19baf839
RO
289 correct key path.
290
e905a9ed
YH
291 Note that we can never "miss" the correct key in the tree if present by
292 following the wrong path. Path compression ensures that segments of the key
293 that are the same for all keys with a given prefix are skipped, but the
294 skipped part *is* identical for each node in the subtrie below the skipped
295 bit! trie_insert() in this implementation takes care of that - note the
19baf839
RO
296 call to tkey_sub_equals() in trie_insert().
297
e905a9ed 298 if n is an internal node - a 'tnode' here, the various parts of its key
19baf839
RO
299 have many different meanings.
300
e905a9ed 301 Example:
19baf839
RO
302 _________________________________________________________________
303 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
304 -----------------------------------------------------------------
e905a9ed 305 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
19baf839
RO
306
307 _________________________________________________________________
308 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
309 -----------------------------------------------------------------
310 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
311
312 tp->pos = 7
313 tp->bits = 3
314 n->pos = 15
91b9a277 315 n->bits = 4
19baf839 316
e905a9ed
YH
317 First, let's just ignore the bits that come before the parent tp, that is
318 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
19baf839
RO
319 not use them for anything.
320
321 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
e905a9ed 322 index into the parent's child array. That is, they will be used to find
19baf839
RO
323 'n' among tp's children.
324
325 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
326 for the node n.
327
e905a9ed 328 All the bits we have seen so far are significant to the node n. The rest
19baf839
RO
329 of the bits are really not needed or indeed known in n->key.
330
e905a9ed 331 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
19baf839 332 n's child array, and will of course be different for each child.
e905a9ed 333
c877efb2 334
19baf839
RO
335 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
336 at this point.
337
338*/
339
0c7770c7 340static inline void check_tnode(const struct tnode *tn)
19baf839 341{
0c7770c7 342 WARN_ON(tn && tn->pos+tn->bits > 32);
19baf839
RO
343}
344
f5026fab
DL
345static const int halve_threshold = 25;
346static const int inflate_threshold = 50;
345aa031 347static const int halve_threshold_root = 15;
80b71b80 348static const int inflate_threshold_root = 30;
2373ce1c
RO
349
350static void __alias_free_mem(struct rcu_head *head)
19baf839 351{
2373ce1c
RO
352 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
353 kmem_cache_free(fn_alias_kmem, fa);
19baf839
RO
354}
355
2373ce1c 356static inline void alias_free_mem_rcu(struct fib_alias *fa)
19baf839 357{
2373ce1c
RO
358 call_rcu(&fa->rcu, __alias_free_mem);
359}
91b9a277 360
2373ce1c
RO
361static void __leaf_free_rcu(struct rcu_head *head)
362{
bc3c8c1e
SH
363 struct leaf *l = container_of(head, struct leaf, rcu);
364 kmem_cache_free(trie_leaf_kmem, l);
2373ce1c 365}
91b9a277 366
387a5487
SH
367static inline void free_leaf(struct leaf *l)
368{
0c03eca3 369 call_rcu(&l->rcu, __leaf_free_rcu);
387a5487
SH
370}
371
2373ce1c 372static inline void free_leaf_info(struct leaf_info *leaf)
19baf839 373{
bceb0f45 374 kfree_rcu(leaf, rcu);
19baf839
RO
375}
376
8d965444 377static struct tnode *tnode_alloc(size_t size)
f0e36f8c 378{
2373ce1c 379 if (size <= PAGE_SIZE)
8d965444 380 return kzalloc(size, GFP_KERNEL);
15be75cd 381 else
7a1c8e5a 382 return vzalloc(size);
15be75cd 383}
2373ce1c 384
2373ce1c 385static void __tnode_free_rcu(struct rcu_head *head)
f0e36f8c 386{
2373ce1c 387 struct tnode *tn = container_of(head, struct tnode, rcu);
8d965444 388 size_t size = sizeof(struct tnode) +
b299e4f0 389 (sizeof(struct rt_trie_node *) << tn->bits);
f0e36f8c
PM
390
391 if (size <= PAGE_SIZE)
392 kfree(tn);
00203563
AV
393 else
394 vfree(tn);
f0e36f8c
PM
395}
396
2373ce1c
RO
397static inline void tnode_free(struct tnode *tn)
398{
387a5487
SH
399 if (IS_LEAF(tn))
400 free_leaf((struct leaf *) tn);
401 else
550e29bc 402 call_rcu(&tn->rcu, __tnode_free_rcu);
2373ce1c
RO
403}
404
e0f7cb8c
JP
405static void tnode_free_safe(struct tnode *tn)
406{
407 BUG_ON(IS_LEAF(tn));
7b85576d
JP
408 tn->tnode_free = tnode_free_head;
409 tnode_free_head = tn;
c3059477 410 tnode_free_size += sizeof(struct tnode) +
b299e4f0 411 (sizeof(struct rt_trie_node *) << tn->bits);
e0f7cb8c
JP
412}
413
414static void tnode_free_flush(void)
415{
416 struct tnode *tn;
417
418 while ((tn = tnode_free_head)) {
419 tnode_free_head = tn->tnode_free;
420 tn->tnode_free = NULL;
421 tnode_free(tn);
422 }
c3059477
JP
423
424 if (tnode_free_size >= PAGE_SIZE * sync_pages) {
425 tnode_free_size = 0;
426 synchronize_rcu();
427 }
e0f7cb8c
JP
428}
429
2373ce1c
RO
430static struct leaf *leaf_new(void)
431{
bc3c8c1e 432 struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
2373ce1c
RO
433 if (l) {
434 l->parent = T_LEAF;
435 INIT_HLIST_HEAD(&l->list);
436 }
437 return l;
438}
439
440static struct leaf_info *leaf_info_new(int plen)
441{
442 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
443 if (li) {
444 li->plen = plen;
5c74501f 445 li->mask_plen = ntohl(inet_make_mask(plen));
2373ce1c
RO
446 INIT_LIST_HEAD(&li->falh);
447 }
448 return li;
449}
450
a07f5f50 451static struct tnode *tnode_new(t_key key, int pos, int bits)
19baf839 452{
b299e4f0 453 size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
f0e36f8c 454 struct tnode *tn = tnode_alloc(sz);
19baf839 455
91b9a277 456 if (tn) {
2373ce1c 457 tn->parent = T_TNODE;
19baf839
RO
458 tn->pos = pos;
459 tn->bits = bits;
460 tn->key = key;
461 tn->full_children = 0;
462 tn->empty_children = 1<<bits;
463 }
c877efb2 464
a034ee3c 465 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
4ea4bf7e 466 sizeof(struct rt_trie_node *) << bits);
19baf839
RO
467 return tn;
468}
469
19baf839
RO
470/*
471 * Check whether a tnode 'n' is "full", i.e. it is an internal node
472 * and no bits are skipped. See discussion in dyntree paper p. 6
473 */
474
b299e4f0 475static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
19baf839 476{
c877efb2 477 if (n == NULL || IS_LEAF(n))
19baf839
RO
478 return 0;
479
480 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
481}
482
61648d91 483static inline void put_child(struct tnode *tn, int i,
b299e4f0 484 struct rt_trie_node *n)
19baf839
RO
485{
486 tnode_put_child_reorg(tn, i, n, -1);
487}
488
c877efb2 489 /*
19baf839
RO
490 * Add a child at position i overwriting the old value.
491 * Update the value of full_children and empty_children.
492 */
493
b299e4f0 494static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
a07f5f50 495 int wasfull)
19baf839 496{
0a5c0475 497 struct rt_trie_node *chi = rtnl_dereference(tn->child[i]);
19baf839
RO
498 int isfull;
499
0c7770c7
SH
500 BUG_ON(i >= 1<<tn->bits);
501
19baf839
RO
502 /* update emptyChildren */
503 if (n == NULL && chi != NULL)
504 tn->empty_children++;
505 else if (n != NULL && chi == NULL)
506 tn->empty_children--;
c877efb2 507
19baf839 508 /* update fullChildren */
91b9a277 509 if (wasfull == -1)
19baf839
RO
510 wasfull = tnode_full(tn, chi);
511
512 isfull = tnode_full(tn, n);
c877efb2 513 if (wasfull && !isfull)
19baf839 514 tn->full_children--;
c877efb2 515 else if (!wasfull && isfull)
19baf839 516 tn->full_children++;
91b9a277 517
c877efb2 518 if (n)
06801916 519 node_set_parent(n, tn);
19baf839 520
cf778b00 521 rcu_assign_pointer(tn->child[i], n);
19baf839
RO
522}
523
80b71b80 524#define MAX_WORK 10
b299e4f0 525static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
19baf839
RO
526{
527 int i;
2f80b3c8 528 struct tnode *old_tn;
e6308be8
RO
529 int inflate_threshold_use;
530 int halve_threshold_use;
80b71b80 531 int max_work;
19baf839 532
e905a9ed 533 if (!tn)
19baf839
RO
534 return NULL;
535
0c7770c7
SH
536 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
537 tn, inflate_threshold, halve_threshold);
19baf839
RO
538
539 /* No children */
540 if (tn->empty_children == tnode_child_length(tn)) {
e0f7cb8c 541 tnode_free_safe(tn);
19baf839
RO
542 return NULL;
543 }
544 /* One child */
545 if (tn->empty_children == tnode_child_length(tn) - 1)
80b71b80 546 goto one_child;
c877efb2 547 /*
19baf839
RO
548 * Double as long as the resulting node has a number of
549 * nonempty nodes that are above the threshold.
550 */
551
552 /*
c877efb2
SH
553 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
554 * the Helsinki University of Technology and Matti Tikkanen of Nokia
19baf839 555 * Telecommunications, page 6:
c877efb2 556 * "A node is doubled if the ratio of non-empty children to all
19baf839
RO
557 * children in the *doubled* node is at least 'high'."
558 *
c877efb2
SH
559 * 'high' in this instance is the variable 'inflate_threshold'. It
560 * is expressed as a percentage, so we multiply it with
561 * tnode_child_length() and instead of multiplying by 2 (since the
562 * child array will be doubled by inflate()) and multiplying
563 * the left-hand side by 100 (to handle the percentage thing) we
19baf839 564 * multiply the left-hand side by 50.
c877efb2
SH
565 *
566 * The left-hand side may look a bit weird: tnode_child_length(tn)
567 * - tn->empty_children is of course the number of non-null children
568 * in the current node. tn->full_children is the number of "full"
19baf839 569 * children, that is non-null tnodes with a skip value of 0.
c877efb2 570 * All of those will be doubled in the resulting inflated tnode, so
19baf839 571 * we just count them one extra time here.
c877efb2 572 *
19baf839 573 * A clearer way to write this would be:
c877efb2 574 *
19baf839 575 * to_be_doubled = tn->full_children;
c877efb2 576 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
19baf839
RO
577 * tn->full_children;
578 *
579 * new_child_length = tnode_child_length(tn) * 2;
580 *
c877efb2 581 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
19baf839
RO
582 * new_child_length;
583 * if (new_fill_factor >= inflate_threshold)
c877efb2
SH
584 *
585 * ...and so on, tho it would mess up the while () loop.
586 *
19baf839
RO
587 * anyway,
588 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
589 * inflate_threshold
c877efb2 590 *
19baf839
RO
591 * avoid a division:
592 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
593 * inflate_threshold * new_child_length
c877efb2 594 *
19baf839 595 * expand not_to_be_doubled and to_be_doubled, and shorten:
c877efb2 596 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 597 * tn->full_children) >= inflate_threshold * new_child_length
c877efb2 598 *
19baf839 599 * expand new_child_length:
c877efb2 600 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 601 * tn->full_children) >=
19baf839 602 * inflate_threshold * tnode_child_length(tn) * 2
c877efb2 603 *
19baf839 604 * shorten again:
c877efb2 605 * 50 * (tn->full_children + tnode_child_length(tn) -
91b9a277 606 * tn->empty_children) >= inflate_threshold *
19baf839 607 * tnode_child_length(tn)
c877efb2 608 *
19baf839
RO
609 */
610
611 check_tnode(tn);
c877efb2 612
e6308be8
RO
613 /* Keep root node larger */
614
b299e4f0 615 if (!node_parent((struct rt_trie_node *)tn)) {
80b71b80
JL
616 inflate_threshold_use = inflate_threshold_root;
617 halve_threshold_use = halve_threshold_root;
a034ee3c 618 } else {
e6308be8 619 inflate_threshold_use = inflate_threshold;
80b71b80
JL
620 halve_threshold_use = halve_threshold;
621 }
e6308be8 622
80b71b80
JL
623 max_work = MAX_WORK;
624 while ((tn->full_children > 0 && max_work-- &&
a07f5f50
SH
625 50 * (tn->full_children + tnode_child_length(tn)
626 - tn->empty_children)
627 >= inflate_threshold_use * tnode_child_length(tn))) {
19baf839 628
2f80b3c8
RO
629 old_tn = tn;
630 tn = inflate(t, tn);
a07f5f50 631
2f80b3c8
RO
632 if (IS_ERR(tn)) {
633 tn = old_tn;
2f36895a
RO
634#ifdef CONFIG_IP_FIB_TRIE_STATS
635 t->stats.resize_node_skipped++;
636#endif
637 break;
638 }
19baf839
RO
639 }
640
641 check_tnode(tn);
642
80b71b80 643 /* Return if at least one inflate is run */
a034ee3c 644 if (max_work != MAX_WORK)
b299e4f0 645 return (struct rt_trie_node *) tn;
80b71b80 646
19baf839
RO
647 /*
648 * Halve as long as the number of empty children in this
649 * node is above threshold.
650 */
2f36895a 651
80b71b80
JL
652 max_work = MAX_WORK;
653 while (tn->bits > 1 && max_work-- &&
19baf839 654 100 * (tnode_child_length(tn) - tn->empty_children) <
e6308be8 655 halve_threshold_use * tnode_child_length(tn)) {
2f36895a 656
2f80b3c8
RO
657 old_tn = tn;
658 tn = halve(t, tn);
659 if (IS_ERR(tn)) {
660 tn = old_tn;
2f36895a
RO
661#ifdef CONFIG_IP_FIB_TRIE_STATS
662 t->stats.resize_node_skipped++;
663#endif
664 break;
665 }
666 }
19baf839 667
c877efb2 668
19baf839 669 /* Only one child remains */
80b71b80
JL
670 if (tn->empty_children == tnode_child_length(tn) - 1) {
671one_child:
19baf839 672 for (i = 0; i < tnode_child_length(tn); i++) {
b299e4f0 673 struct rt_trie_node *n;
19baf839 674
0a5c0475 675 n = rtnl_dereference(tn->child[i]);
2373ce1c 676 if (!n)
91b9a277 677 continue;
91b9a277
OJ
678
679 /* compress one level */
680
06801916 681 node_set_parent(n, NULL);
e0f7cb8c 682 tnode_free_safe(tn);
91b9a277 683 return n;
19baf839 684 }
80b71b80 685 }
b299e4f0 686 return (struct rt_trie_node *) tn;
19baf839
RO
687}
688
0a5c0475
ED
689
690static void tnode_clean_free(struct tnode *tn)
691{
692 int i;
693 struct tnode *tofree;
694
695 for (i = 0; i < tnode_child_length(tn); i++) {
696 tofree = (struct tnode *)rtnl_dereference(tn->child[i]);
697 if (tofree)
698 tnode_free(tofree);
699 }
700 tnode_free(tn);
701}
702
2f80b3c8 703static struct tnode *inflate(struct trie *t, struct tnode *tn)
19baf839 704{
19baf839
RO
705 struct tnode *oldtnode = tn;
706 int olen = tnode_child_length(tn);
707 int i;
708
0c7770c7 709 pr_debug("In inflate\n");
19baf839
RO
710
711 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
712
0c7770c7 713 if (!tn)
2f80b3c8 714 return ERR_PTR(-ENOMEM);
2f36895a
RO
715
716 /*
c877efb2
SH
717 * Preallocate and store tnodes before the actual work so we
718 * don't get into an inconsistent state if memory allocation
719 * fails. In case of failure we return the oldnode and inflate
2f36895a
RO
720 * of tnode is ignored.
721 */
91b9a277
OJ
722
723 for (i = 0; i < olen; i++) {
a07f5f50 724 struct tnode *inode;
2f36895a 725
a07f5f50 726 inode = (struct tnode *) tnode_get_child(oldtnode, i);
2f36895a
RO
727 if (inode &&
728 IS_TNODE(inode) &&
729 inode->pos == oldtnode->pos + oldtnode->bits &&
730 inode->bits > 1) {
731 struct tnode *left, *right;
ab66b4a7 732 t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
c877efb2 733
2f36895a
RO
734 left = tnode_new(inode->key&(~m), inode->pos + 1,
735 inode->bits - 1);
2f80b3c8
RO
736 if (!left)
737 goto nomem;
91b9a277 738
2f36895a
RO
739 right = tnode_new(inode->key|m, inode->pos + 1,
740 inode->bits - 1);
741
e905a9ed 742 if (!right) {
2f80b3c8
RO
743 tnode_free(left);
744 goto nomem;
e905a9ed 745 }
2f36895a 746
61648d91
LM
747 put_child(tn, 2*i, (struct rt_trie_node *) left);
748 put_child(tn, 2*i+1, (struct rt_trie_node *) right);
2f36895a
RO
749 }
750 }
751
91b9a277 752 for (i = 0; i < olen; i++) {
c95aaf9a 753 struct tnode *inode;
b299e4f0 754 struct rt_trie_node *node = tnode_get_child(oldtnode, i);
91b9a277
OJ
755 struct tnode *left, *right;
756 int size, j;
c877efb2 757
19baf839
RO
758 /* An empty child */
759 if (node == NULL)
760 continue;
761
762 /* A leaf or an internal node with skipped bits */
763
c877efb2 764 if (IS_LEAF(node) || ((struct tnode *) node)->pos >
19baf839 765 tn->pos + tn->bits - 1) {
a07f5f50
SH
766 if (tkey_extract_bits(node->key,
767 oldtnode->pos + oldtnode->bits,
768 1) == 0)
61648d91 769 put_child(tn, 2*i, node);
19baf839 770 else
61648d91 771 put_child(tn, 2*i+1, node);
19baf839
RO
772 continue;
773 }
774
775 /* An internal node with two children */
776 inode = (struct tnode *) node;
777
778 if (inode->bits == 1) {
61648d91
LM
779 put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
780 put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
19baf839 781
e0f7cb8c 782 tnode_free_safe(inode);
91b9a277 783 continue;
19baf839
RO
784 }
785
91b9a277
OJ
786 /* An internal node with more than two children */
787
788 /* We will replace this node 'inode' with two new
789 * ones, 'left' and 'right', each with half of the
790 * original children. The two new nodes will have
791 * a position one bit further down the key and this
792 * means that the "significant" part of their keys
793 * (see the discussion near the top of this file)
794 * will differ by one bit, which will be "0" in
795 * left's key and "1" in right's key. Since we are
796 * moving the key position by one step, the bit that
797 * we are moving away from - the bit at position
798 * (inode->pos) - is the one that will differ between
799 * left and right. So... we synthesize that bit in the
800 * two new keys.
801 * The mask 'm' below will be a single "one" bit at
802 * the position (inode->pos)
803 */
19baf839 804
91b9a277
OJ
805 /* Use the old key, but set the new significant
806 * bit to zero.
807 */
2f36895a 808
91b9a277 809 left = (struct tnode *) tnode_get_child(tn, 2*i);
61648d91 810 put_child(tn, 2*i, NULL);
2f36895a 811
91b9a277 812 BUG_ON(!left);
2f36895a 813
91b9a277 814 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
61648d91 815 put_child(tn, 2*i+1, NULL);
19baf839 816
91b9a277 817 BUG_ON(!right);
19baf839 818
91b9a277
OJ
819 size = tnode_child_length(left);
820 for (j = 0; j < size; j++) {
61648d91
LM
821 put_child(left, j, rtnl_dereference(inode->child[j]));
822 put_child(right, j, rtnl_dereference(inode->child[j + size]));
19baf839 823 }
61648d91
LM
824 put_child(tn, 2*i, resize(t, left));
825 put_child(tn, 2*i+1, resize(t, right));
91b9a277 826
e0f7cb8c 827 tnode_free_safe(inode);
19baf839 828 }
e0f7cb8c 829 tnode_free_safe(oldtnode);
19baf839 830 return tn;
2f80b3c8 831nomem:
0a5c0475
ED
832 tnode_clean_free(tn);
833 return ERR_PTR(-ENOMEM);
19baf839
RO
834}
835
2f80b3c8 836static struct tnode *halve(struct trie *t, struct tnode *tn)
19baf839
RO
837{
838 struct tnode *oldtnode = tn;
b299e4f0 839 struct rt_trie_node *left, *right;
19baf839
RO
840 int i;
841 int olen = tnode_child_length(tn);
842
0c7770c7 843 pr_debug("In halve\n");
c877efb2
SH
844
845 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
19baf839 846
2f80b3c8
RO
847 if (!tn)
848 return ERR_PTR(-ENOMEM);
2f36895a
RO
849
850 /*
c877efb2
SH
851 * Preallocate and store tnodes before the actual work so we
852 * don't get into an inconsistent state if memory allocation
853 * fails. In case of failure we return the oldnode and halve
2f36895a
RO
854 * of tnode is ignored.
855 */
856
91b9a277 857 for (i = 0; i < olen; i += 2) {
2f36895a
RO
858 left = tnode_get_child(oldtnode, i);
859 right = tnode_get_child(oldtnode, i+1);
c877efb2 860
2f36895a 861 /* Two nonempty children */
0c7770c7 862 if (left && right) {
2f80b3c8 863 struct tnode *newn;
0c7770c7 864
2f80b3c8 865 newn = tnode_new(left->key, tn->pos + tn->bits, 1);
0c7770c7
SH
866
867 if (!newn)
2f80b3c8 868 goto nomem;
0c7770c7 869
61648d91 870 put_child(tn, i/2, (struct rt_trie_node *)newn);
2f36895a 871 }
2f36895a 872
2f36895a 873 }
19baf839 874
91b9a277
OJ
875 for (i = 0; i < olen; i += 2) {
876 struct tnode *newBinNode;
877
19baf839
RO
878 left = tnode_get_child(oldtnode, i);
879 right = tnode_get_child(oldtnode, i+1);
c877efb2 880
19baf839
RO
881 /* At least one of the children is empty */
882 if (left == NULL) {
883 if (right == NULL) /* Both are empty */
884 continue;
61648d91 885 put_child(tn, i/2, right);
91b9a277 886 continue;
0c7770c7 887 }
91b9a277
OJ
888
889 if (right == NULL) {
61648d91 890 put_child(tn, i/2, left);
91b9a277
OJ
891 continue;
892 }
c877efb2 893
19baf839 894 /* Two nonempty children */
91b9a277 895 newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
61648d91
LM
896 put_child(tn, i/2, NULL);
897 put_child(newBinNode, 0, left);
898 put_child(newBinNode, 1, right);
899 put_child(tn, i/2, resize(t, newBinNode));
19baf839 900 }
e0f7cb8c 901 tnode_free_safe(oldtnode);
19baf839 902 return tn;
2f80b3c8 903nomem:
0a5c0475
ED
904 tnode_clean_free(tn);
905 return ERR_PTR(-ENOMEM);
19baf839
RO
906}
907
772cb712 908/* readside must use rcu_read_lock currently dump routines
2373ce1c
RO
909 via get_fa_head and dump */
910
772cb712 911static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
19baf839 912{
772cb712 913 struct hlist_head *head = &l->list;
19baf839
RO
914 struct leaf_info *li;
915
b67bfe0d 916 hlist_for_each_entry_rcu(li, head, hlist)
c877efb2 917 if (li->plen == plen)
19baf839 918 return li;
91b9a277 919
19baf839
RO
920 return NULL;
921}
922
a07f5f50 923static inline struct list_head *get_fa_head(struct leaf *l, int plen)
19baf839 924{
772cb712 925 struct leaf_info *li = find_leaf_info(l, plen);
c877efb2 926
91b9a277
OJ
927 if (!li)
928 return NULL;
c877efb2 929
91b9a277 930 return &li->falh;
19baf839
RO
931}
932
933static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
934{
e905a9ed 935 struct leaf_info *li = NULL, *last = NULL;
e905a9ed
YH
936
937 if (hlist_empty(head)) {
938 hlist_add_head_rcu(&new->hlist, head);
939 } else {
b67bfe0d 940 hlist_for_each_entry(li, head, hlist) {
e905a9ed
YH
941 if (new->plen > li->plen)
942 break;
943
944 last = li;
945 }
946 if (last)
947 hlist_add_after_rcu(&last->hlist, &new->hlist);
948 else
949 hlist_add_before_rcu(&new->hlist, &li->hlist);
950 }
19baf839
RO
951}
952
2373ce1c
RO
953/* rcu_read_lock needs to be hold by caller from readside */
954
19baf839
RO
955static struct leaf *
956fib_find_node(struct trie *t, u32 key)
957{
958 int pos;
959 struct tnode *tn;
b299e4f0 960 struct rt_trie_node *n;
19baf839
RO
961
962 pos = 0;
a034ee3c 963 n = rcu_dereference_rtnl(t->trie);
19baf839
RO
964
965 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
966 tn = (struct tnode *) n;
91b9a277 967
19baf839 968 check_tnode(tn);
91b9a277 969
c877efb2 970 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
91b9a277 971 pos = tn->pos + tn->bits;
a07f5f50
SH
972 n = tnode_get_child_rcu(tn,
973 tkey_extract_bits(key,
974 tn->pos,
975 tn->bits));
91b9a277 976 } else
19baf839
RO
977 break;
978 }
979 /* Case we have found a leaf. Compare prefixes */
980
91b9a277
OJ
981 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
982 return (struct leaf *)n;
983
19baf839
RO
984 return NULL;
985}
986
7b85576d 987static void trie_rebalance(struct trie *t, struct tnode *tn)
19baf839 988{
19baf839 989 int wasfull;
3ed18d76 990 t_key cindex, key;
06801916 991 struct tnode *tp;
19baf839 992
3ed18d76
RO
993 key = tn->key;
994
b299e4f0 995 while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
19baf839
RO
996 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
997 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
e3192690 998 tn = (struct tnode *)resize(t, tn);
a07f5f50 999
e3192690 1000 tnode_put_child_reorg(tp, cindex,
b299e4f0 1001 (struct rt_trie_node *)tn, wasfull);
91b9a277 1002
b299e4f0 1003 tp = node_parent((struct rt_trie_node *) tn);
008440e3 1004 if (!tp)
cf778b00 1005 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
008440e3 1006
e0f7cb8c 1007 tnode_free_flush();
06801916 1008 if (!tp)
19baf839 1009 break;
06801916 1010 tn = tp;
19baf839 1011 }
06801916 1012
19baf839 1013 /* Handle last (top) tnode */
7b85576d 1014 if (IS_TNODE(tn))
e3192690 1015 tn = (struct tnode *)resize(t, tn);
19baf839 1016
cf778b00 1017 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
7b85576d 1018 tnode_free_flush();
19baf839
RO
1019}
1020
2373ce1c
RO
1021/* only used from updater-side */
1022
fea86ad8 1023static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
19baf839
RO
1024{
1025 int pos, newpos;
1026 struct tnode *tp = NULL, *tn = NULL;
b299e4f0 1027 struct rt_trie_node *n;
19baf839
RO
1028 struct leaf *l;
1029 int missbit;
c877efb2 1030 struct list_head *fa_head = NULL;
19baf839
RO
1031 struct leaf_info *li;
1032 t_key cindex;
1033
1034 pos = 0;
0a5c0475 1035 n = rtnl_dereference(t->trie);
19baf839 1036
c877efb2
SH
1037 /* If we point to NULL, stop. Either the tree is empty and we should
1038 * just put a new leaf in if, or we have reached an empty child slot,
19baf839 1039 * and we should just put our new leaf in that.
c877efb2
SH
1040 * If we point to a T_TNODE, check if it matches our key. Note that
1041 * a T_TNODE might be skipping any number of bits - its 'pos' need
19baf839
RO
1042 * not be the parent's 'pos'+'bits'!
1043 *
c877efb2 1044 * If it does match the current key, get pos/bits from it, extract
19baf839
RO
1045 * the index from our key, push the T_TNODE and walk the tree.
1046 *
1047 * If it doesn't, we have to replace it with a new T_TNODE.
1048 *
c877efb2
SH
1049 * If we point to a T_LEAF, it might or might not have the same key
1050 * as we do. If it does, just change the value, update the T_LEAF's
1051 * value, and return it.
19baf839
RO
1052 * If it doesn't, we need to replace it with a T_TNODE.
1053 */
1054
1055 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
1056 tn = (struct tnode *) n;
91b9a277 1057
c877efb2 1058 check_tnode(tn);
91b9a277 1059
c877efb2 1060 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
19baf839 1061 tp = tn;
91b9a277 1062 pos = tn->pos + tn->bits;
a07f5f50
SH
1063 n = tnode_get_child(tn,
1064 tkey_extract_bits(key,
1065 tn->pos,
1066 tn->bits));
19baf839 1067
06801916 1068 BUG_ON(n && node_parent(n) != tn);
91b9a277 1069 } else
19baf839
RO
1070 break;
1071 }
1072
1073 /*
1074 * n ----> NULL, LEAF or TNODE
1075 *
c877efb2 1076 * tp is n's (parent) ----> NULL or TNODE
19baf839
RO
1077 */
1078
91b9a277 1079 BUG_ON(tp && IS_LEAF(tp));
19baf839
RO
1080
1081 /* Case 1: n is a leaf. Compare prefixes */
1082
c877efb2 1083 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
c95aaf9a 1084 l = (struct leaf *) n;
19baf839 1085 li = leaf_info_new(plen);
91b9a277 1086
fea86ad8
SH
1087 if (!li)
1088 return NULL;
19baf839
RO
1089
1090 fa_head = &li->falh;
1091 insert_leaf_info(&l->list, li);
1092 goto done;
1093 }
19baf839
RO
1094 l = leaf_new();
1095
fea86ad8
SH
1096 if (!l)
1097 return NULL;
19baf839
RO
1098
1099 l->key = key;
1100 li = leaf_info_new(plen);
1101
c877efb2 1102 if (!li) {
387a5487 1103 free_leaf(l);
fea86ad8 1104 return NULL;
f835e471 1105 }
19baf839
RO
1106
1107 fa_head = &li->falh;
1108 insert_leaf_info(&l->list, li);
1109
19baf839 1110 if (t->trie && n == NULL) {
91b9a277 1111 /* Case 2: n is NULL, and will just insert a new leaf */
19baf839 1112
b299e4f0 1113 node_set_parent((struct rt_trie_node *)l, tp);
19baf839 1114
91b9a277 1115 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
61648d91 1116 put_child(tp, cindex, (struct rt_trie_node *)l);
91b9a277
OJ
1117 } else {
1118 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
c877efb2
SH
1119 /*
1120 * Add a new tnode here
19baf839
RO
1121 * first tnode need some special handling
1122 */
1123
1124 if (tp)
91b9a277 1125 pos = tp->pos+tp->bits;
19baf839 1126 else
91b9a277
OJ
1127 pos = 0;
1128
c877efb2 1129 if (n) {
19baf839
RO
1130 newpos = tkey_mismatch(key, pos, n->key);
1131 tn = tnode_new(n->key, newpos, 1);
91b9a277 1132 } else {
19baf839 1133 newpos = 0;
c877efb2 1134 tn = tnode_new(key, newpos, 1); /* First tnode */
19baf839 1135 }
19baf839 1136
c877efb2 1137 if (!tn) {
f835e471 1138 free_leaf_info(li);
387a5487 1139 free_leaf(l);
fea86ad8 1140 return NULL;
91b9a277
OJ
1141 }
1142
b299e4f0 1143 node_set_parent((struct rt_trie_node *)tn, tp);
19baf839 1144
91b9a277 1145 missbit = tkey_extract_bits(key, newpos, 1);
61648d91
LM
1146 put_child(tn, missbit, (struct rt_trie_node *)l);
1147 put_child(tn, 1-missbit, n);
19baf839 1148
c877efb2 1149 if (tp) {
19baf839 1150 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
61648d91 1151 put_child(tp, cindex, (struct rt_trie_node *)tn);
91b9a277 1152 } else {
cf778b00 1153 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
19baf839
RO
1154 tp = tn;
1155 }
1156 }
91b9a277
OJ
1157
1158 if (tp && tp->pos + tp->bits > 32)
058bd4d2
JP
1159 pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1160 tp, tp->pos, tp->bits, key, plen);
91b9a277 1161
19baf839 1162 /* Rebalance the trie */
2373ce1c 1163
7b85576d 1164 trie_rebalance(t, tp);
f835e471 1165done:
19baf839
RO
1166 return fa_head;
1167}
1168
d562f1f8
RO
1169/*
1170 * Caller must hold RTNL.
1171 */
16c6cf8b 1172int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
19baf839
RO
1173{
1174 struct trie *t = (struct trie *) tb->tb_data;
1175 struct fib_alias *fa, *new_fa;
c877efb2 1176 struct list_head *fa_head = NULL;
19baf839 1177 struct fib_info *fi;
4e902c57
TG
1178 int plen = cfg->fc_dst_len;
1179 u8 tos = cfg->fc_tos;
19baf839
RO
1180 u32 key, mask;
1181 int err;
1182 struct leaf *l;
1183
1184 if (plen > 32)
1185 return -EINVAL;
1186
4e902c57 1187 key = ntohl(cfg->fc_dst);
19baf839 1188
2dfe55b4 1189 pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
19baf839 1190
91b9a277 1191 mask = ntohl(inet_make_mask(plen));
19baf839 1192
c877efb2 1193 if (key & ~mask)
19baf839
RO
1194 return -EINVAL;
1195
1196 key = key & mask;
1197
4e902c57
TG
1198 fi = fib_create_info(cfg);
1199 if (IS_ERR(fi)) {
1200 err = PTR_ERR(fi);
19baf839 1201 goto err;
4e902c57 1202 }
19baf839
RO
1203
1204 l = fib_find_node(t, key);
c877efb2 1205 fa = NULL;
19baf839 1206
c877efb2 1207 if (l) {
19baf839
RO
1208 fa_head = get_fa_head(l, plen);
1209 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1210 }
1211
1212 /* Now fa, if non-NULL, points to the first fib alias
1213 * with the same keys [prefix,tos,priority], if such key already
1214 * exists or to the node before which we will insert new one.
1215 *
1216 * If fa is NULL, we will need to allocate a new one and
1217 * insert to the head of f.
1218 *
1219 * If f is NULL, no fib node matched the destination key
1220 * and we need to allocate a new one of those as well.
1221 */
1222
936f6f8e
JA
1223 if (fa && fa->fa_tos == tos &&
1224 fa->fa_info->fib_priority == fi->fib_priority) {
1225 struct fib_alias *fa_first, *fa_match;
19baf839
RO
1226
1227 err = -EEXIST;
4e902c57 1228 if (cfg->fc_nlflags & NLM_F_EXCL)
19baf839
RO
1229 goto out;
1230
936f6f8e
JA
1231 /* We have 2 goals:
1232 * 1. Find exact match for type, scope, fib_info to avoid
1233 * duplicate routes
1234 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1235 */
1236 fa_match = NULL;
1237 fa_first = fa;
1238 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1239 list_for_each_entry_continue(fa, fa_head, fa_list) {
1240 if (fa->fa_tos != tos)
1241 break;
1242 if (fa->fa_info->fib_priority != fi->fib_priority)
1243 break;
1244 if (fa->fa_type == cfg->fc_type &&
936f6f8e
JA
1245 fa->fa_info == fi) {
1246 fa_match = fa;
1247 break;
1248 }
1249 }
1250
4e902c57 1251 if (cfg->fc_nlflags & NLM_F_REPLACE) {
19baf839
RO
1252 struct fib_info *fi_drop;
1253 u8 state;
1254
936f6f8e
JA
1255 fa = fa_first;
1256 if (fa_match) {
1257 if (fa == fa_match)
1258 err = 0;
6725033f 1259 goto out;
936f6f8e 1260 }
2373ce1c 1261 err = -ENOBUFS;
e94b1766 1262 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
2373ce1c
RO
1263 if (new_fa == NULL)
1264 goto out;
19baf839
RO
1265
1266 fi_drop = fa->fa_info;
2373ce1c
RO
1267 new_fa->fa_tos = fa->fa_tos;
1268 new_fa->fa_info = fi;
4e902c57 1269 new_fa->fa_type = cfg->fc_type;
19baf839 1270 state = fa->fa_state;
936f6f8e 1271 new_fa->fa_state = state & ~FA_S_ACCESSED;
19baf839 1272
2373ce1c
RO
1273 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1274 alias_free_mem_rcu(fa);
19baf839
RO
1275
1276 fib_release_info(fi_drop);
1277 if (state & FA_S_ACCESSED)
4ccfe6d4 1278 rt_cache_flush(cfg->fc_nlinfo.nl_net);
b8f55831
MK
1279 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1280 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
19baf839 1281
91b9a277 1282 goto succeeded;
19baf839
RO
1283 }
1284 /* Error if we find a perfect match which
1285 * uses the same scope, type, and nexthop
1286 * information.
1287 */
936f6f8e
JA
1288 if (fa_match)
1289 goto out;
a07f5f50 1290
4e902c57 1291 if (!(cfg->fc_nlflags & NLM_F_APPEND))
936f6f8e 1292 fa = fa_first;
19baf839
RO
1293 }
1294 err = -ENOENT;
4e902c57 1295 if (!(cfg->fc_nlflags & NLM_F_CREATE))
19baf839
RO
1296 goto out;
1297
1298 err = -ENOBUFS;
e94b1766 1299 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
19baf839
RO
1300 if (new_fa == NULL)
1301 goto out;
1302
1303 new_fa->fa_info = fi;
1304 new_fa->fa_tos = tos;
4e902c57 1305 new_fa->fa_type = cfg->fc_type;
19baf839 1306 new_fa->fa_state = 0;
19baf839
RO
1307 /*
1308 * Insert new entry to the list.
1309 */
1310
c877efb2 1311 if (!fa_head) {
fea86ad8
SH
1312 fa_head = fib_insert_node(t, key, plen);
1313 if (unlikely(!fa_head)) {
1314 err = -ENOMEM;
f835e471 1315 goto out_free_new_fa;
fea86ad8 1316 }
f835e471 1317 }
19baf839 1318
21d8c49e
DM
1319 if (!plen)
1320 tb->tb_num_default++;
1321
2373ce1c
RO
1322 list_add_tail_rcu(&new_fa->fa_list,
1323 (fa ? &fa->fa_list : fa_head));
19baf839 1324
4ccfe6d4 1325 rt_cache_flush(cfg->fc_nlinfo.nl_net);
4e902c57 1326 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
b8f55831 1327 &cfg->fc_nlinfo, 0);
19baf839
RO
1328succeeded:
1329 return 0;
f835e471
RO
1330
1331out_free_new_fa:
1332 kmem_cache_free(fn_alias_kmem, new_fa);
19baf839
RO
1333out:
1334 fib_release_info(fi);
91b9a277 1335err:
19baf839
RO
1336 return err;
1337}
1338
772cb712 1339/* should be called with rcu_read_lock */
5b470441 1340static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
22bd5b9b 1341 t_key key, const struct flowi4 *flp,
ebc0ffae 1342 struct fib_result *res, int fib_flags)
19baf839 1343{
19baf839
RO
1344 struct leaf_info *li;
1345 struct hlist_head *hhead = &l->list;
c877efb2 1346
b67bfe0d 1347 hlist_for_each_entry_rcu(li, hhead, hlist) {
3be0686b 1348 struct fib_alias *fa;
a07f5f50 1349
5c74501f 1350 if (l->key != (key & li->mask_plen))
19baf839
RO
1351 continue;
1352
3be0686b
DM
1353 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1354 struct fib_info *fi = fa->fa_info;
1355 int nhsel, err;
a07f5f50 1356
22bd5b9b 1357 if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
3be0686b 1358 continue;
dccd9ecc
DM
1359 if (fi->fib_dead)
1360 continue;
37e826c5 1361 if (fa->fa_info->fib_scope < flp->flowi4_scope)
3be0686b
DM
1362 continue;
1363 fib_alias_accessed(fa);
1364 err = fib_props[fa->fa_type].error;
1365 if (err) {
19baf839 1366#ifdef CONFIG_IP_FIB_TRIE_STATS
1fbc7843 1367 t->stats.semantic_match_passed++;
3be0686b 1368#endif
1fbc7843 1369 return err;
3be0686b
DM
1370 }
1371 if (fi->fib_flags & RTNH_F_DEAD)
1372 continue;
1373 for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
1374 const struct fib_nh *nh = &fi->fib_nh[nhsel];
1375
1376 if (nh->nh_flags & RTNH_F_DEAD)
1377 continue;
22bd5b9b 1378 if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
3be0686b
DM
1379 continue;
1380
1381#ifdef CONFIG_IP_FIB_TRIE_STATS
1382 t->stats.semantic_match_passed++;
1383#endif
5c74501f 1384 res->prefixlen = li->plen;
3be0686b
DM
1385 res->nh_sel = nhsel;
1386 res->type = fa->fa_type;
37e826c5 1387 res->scope = fa->fa_info->fib_scope;
3be0686b
DM
1388 res->fi = fi;
1389 res->table = tb;
1390 res->fa_head = &li->falh;
1391 if (!(fib_flags & FIB_LOOKUP_NOREF))
5c74501f 1392 atomic_inc(&fi->fib_clntref);
3be0686b
DM
1393 return 0;
1394 }
1395 }
1396
1397#ifdef CONFIG_IP_FIB_TRIE_STATS
1398 t->stats.semantic_match_miss++;
19baf839 1399#endif
19baf839 1400 }
a07f5f50 1401
2e655571 1402 return 1;
19baf839
RO
1403}
1404
22bd5b9b 1405int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
ebc0ffae 1406 struct fib_result *res, int fib_flags)
19baf839
RO
1407{
1408 struct trie *t = (struct trie *) tb->tb_data;
2e655571 1409 int ret;
b299e4f0 1410 struct rt_trie_node *n;
19baf839 1411 struct tnode *pn;
3b004569 1412 unsigned int pos, bits;
22bd5b9b 1413 t_key key = ntohl(flp->daddr);
3b004569 1414 unsigned int chopped_off;
19baf839 1415 t_key cindex = 0;
3b004569 1416 unsigned int current_prefix_length = KEYLENGTH;
91b9a277 1417 struct tnode *cn;
874ffa8f 1418 t_key pref_mismatch;
91b9a277 1419
2373ce1c 1420 rcu_read_lock();
91b9a277 1421
2373ce1c 1422 n = rcu_dereference(t->trie);
c877efb2 1423 if (!n)
19baf839
RO
1424 goto failed;
1425
1426#ifdef CONFIG_IP_FIB_TRIE_STATS
1427 t->stats.gets++;
1428#endif
1429
1430 /* Just a leaf? */
1431 if (IS_LEAF(n)) {
5b470441 1432 ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
a07f5f50 1433 goto found;
19baf839 1434 }
a07f5f50 1435
19baf839
RO
1436 pn = (struct tnode *) n;
1437 chopped_off = 0;
c877efb2 1438
91b9a277 1439 while (pn) {
19baf839
RO
1440 pos = pn->pos;
1441 bits = pn->bits;
1442
c877efb2 1443 if (!chopped_off)
ab66b4a7
SH
1444 cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1445 pos, bits);
19baf839 1446
b902e573 1447 n = tnode_get_child_rcu(pn, cindex);
19baf839
RO
1448
1449 if (n == NULL) {
1450#ifdef CONFIG_IP_FIB_TRIE_STATS
1451 t->stats.null_node_hit++;
1452#endif
1453 goto backtrace;
1454 }
1455
91b9a277 1456 if (IS_LEAF(n)) {
5b470441 1457 ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
2e655571 1458 if (ret > 0)
91b9a277 1459 goto backtrace;
a07f5f50 1460 goto found;
91b9a277
OJ
1461 }
1462
91b9a277 1463 cn = (struct tnode *)n;
19baf839 1464
91b9a277
OJ
1465 /*
1466 * It's a tnode, and we can do some extra checks here if we
1467 * like, to avoid descending into a dead-end branch.
1468 * This tnode is in the parent's child array at index
1469 * key[p_pos..p_pos+p_bits] but potentially with some bits
1470 * chopped off, so in reality the index may be just a
1471 * subprefix, padded with zero at the end.
1472 * We can also take a look at any skipped bits in this
1473 * tnode - everything up to p_pos is supposed to be ok,
1474 * and the non-chopped bits of the index (se previous
1475 * paragraph) are also guaranteed ok, but the rest is
1476 * considered unknown.
1477 *
1478 * The skipped bits are key[pos+bits..cn->pos].
1479 */
19baf839 1480
91b9a277
OJ
1481 /* If current_prefix_length < pos+bits, we are already doing
1482 * actual prefix matching, which means everything from
1483 * pos+(bits-chopped_off) onward must be zero along some
1484 * branch of this subtree - otherwise there is *no* valid
1485 * prefix present. Here we can only check the skipped
1486 * bits. Remember, since we have already indexed into the
1487 * parent's child array, we know that the bits we chopped of
1488 * *are* zero.
1489 */
19baf839 1490
a07f5f50
SH
1491 /* NOTA BENE: Checking only skipped bits
1492 for the new node here */
19baf839 1493
91b9a277
OJ
1494 if (current_prefix_length < pos+bits) {
1495 if (tkey_extract_bits(cn->key, current_prefix_length,
a07f5f50
SH
1496 cn->pos - current_prefix_length)
1497 || !(cn->child[0]))
91b9a277
OJ
1498 goto backtrace;
1499 }
19baf839 1500
91b9a277
OJ
1501 /*
1502 * If chopped_off=0, the index is fully validated and we
1503 * only need to look at the skipped bits for this, the new,
1504 * tnode. What we actually want to do is to find out if
1505 * these skipped bits match our key perfectly, or if we will
1506 * have to count on finding a matching prefix further down,
1507 * because if we do, we would like to have some way of
1508 * verifying the existence of such a prefix at this point.
1509 */
19baf839 1510
91b9a277
OJ
1511 /* The only thing we can do at this point is to verify that
1512 * any such matching prefix can indeed be a prefix to our
1513 * key, and if the bits in the node we are inspecting that
1514 * do not match our key are not ZERO, this cannot be true.
1515 * Thus, find out where there is a mismatch (before cn->pos)
1516 * and verify that all the mismatching bits are zero in the
1517 * new tnode's key.
1518 */
19baf839 1519
a07f5f50
SH
1520 /*
1521 * Note: We aren't very concerned about the piece of
1522 * the key that precede pn->pos+pn->bits, since these
1523 * have already been checked. The bits after cn->pos
1524 * aren't checked since these are by definition
1525 * "unknown" at this point. Thus, what we want to see
1526 * is if we are about to enter the "prefix matching"
1527 * state, and in that case verify that the skipped
1528 * bits that will prevail throughout this subtree are
1529 * zero, as they have to be if we are to find a
1530 * matching prefix.
91b9a277
OJ
1531 */
1532
874ffa8f 1533 pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
91b9a277 1534
a07f5f50
SH
1535 /*
1536 * In short: If skipped bits in this node do not match
1537 * the search key, enter the "prefix matching"
1538 * state.directly.
91b9a277
OJ
1539 */
1540 if (pref_mismatch) {
79cda75a
ED
1541 /* fls(x) = __fls(x) + 1 */
1542 int mp = KEYLENGTH - __fls(pref_mismatch) - 1;
91b9a277 1543
874ffa8f 1544 if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
91b9a277
OJ
1545 goto backtrace;
1546
1547 if (current_prefix_length >= cn->pos)
1548 current_prefix_length = mp;
c877efb2 1549 }
a07f5f50 1550
91b9a277
OJ
1551 pn = (struct tnode *)n; /* Descend */
1552 chopped_off = 0;
1553 continue;
1554
19baf839
RO
1555backtrace:
1556 chopped_off++;
1557
1558 /* As zero don't change the child key (cindex) */
a07f5f50
SH
1559 while ((chopped_off <= pn->bits)
1560 && !(cindex & (1<<(chopped_off-1))))
19baf839 1561 chopped_off++;
19baf839
RO
1562
1563 /* Decrease current_... with bits chopped off */
1564 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
a07f5f50
SH
1565 current_prefix_length = pn->pos + pn->bits
1566 - chopped_off;
91b9a277 1567
19baf839 1568 /*
c877efb2 1569 * Either we do the actual chop off according or if we have
19baf839
RO
1570 * chopped off all bits in this tnode walk up to our parent.
1571 */
1572
91b9a277 1573 if (chopped_off <= pn->bits) {
19baf839 1574 cindex &= ~(1 << (chopped_off-1));
91b9a277 1575 } else {
b299e4f0 1576 struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
06801916 1577 if (!parent)
19baf839 1578 goto failed;
91b9a277 1579
19baf839 1580 /* Get Child's index */
06801916
SH
1581 cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1582 pn = parent;
19baf839
RO
1583 chopped_off = 0;
1584
1585#ifdef CONFIG_IP_FIB_TRIE_STATS
1586 t->stats.backtrack++;
1587#endif
1588 goto backtrace;
c877efb2 1589 }
19baf839
RO
1590 }
1591failed:
c877efb2 1592 ret = 1;
19baf839 1593found:
2373ce1c 1594 rcu_read_unlock();
19baf839
RO
1595 return ret;
1596}
6fc01438 1597EXPORT_SYMBOL_GPL(fib_table_lookup);
19baf839 1598
9195bef7
SH
1599/*
1600 * Remove the leaf and return parent.
1601 */
1602static void trie_leaf_remove(struct trie *t, struct leaf *l)
19baf839 1603{
b299e4f0 1604 struct tnode *tp = node_parent((struct rt_trie_node *) l);
c877efb2 1605
9195bef7 1606 pr_debug("entering trie_leaf_remove(%p)\n", l);
19baf839 1607
c877efb2 1608 if (tp) {
9195bef7 1609 t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
61648d91 1610 put_child(tp, cindex, NULL);
7b85576d 1611 trie_rebalance(t, tp);
91b9a277 1612 } else
a9b3cd7f 1613 RCU_INIT_POINTER(t->trie, NULL);
19baf839 1614
387a5487 1615 free_leaf(l);
19baf839
RO
1616}
1617
d562f1f8
RO
1618/*
1619 * Caller must hold RTNL.
1620 */
16c6cf8b 1621int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
19baf839
RO
1622{
1623 struct trie *t = (struct trie *) tb->tb_data;
1624 u32 key, mask;
4e902c57
TG
1625 int plen = cfg->fc_dst_len;
1626 u8 tos = cfg->fc_tos;
19baf839
RO
1627 struct fib_alias *fa, *fa_to_delete;
1628 struct list_head *fa_head;
1629 struct leaf *l;
91b9a277
OJ
1630 struct leaf_info *li;
1631
c877efb2 1632 if (plen > 32)
19baf839
RO
1633 return -EINVAL;
1634
4e902c57 1635 key = ntohl(cfg->fc_dst);
91b9a277 1636 mask = ntohl(inet_make_mask(plen));
19baf839 1637
c877efb2 1638 if (key & ~mask)
19baf839
RO
1639 return -EINVAL;
1640
1641 key = key & mask;
1642 l = fib_find_node(t, key);
1643
c877efb2 1644 if (!l)
19baf839
RO
1645 return -ESRCH;
1646
ad5b3102
IM
1647 li = find_leaf_info(l, plen);
1648
1649 if (!li)
1650 return -ESRCH;
1651
1652 fa_head = &li->falh;
19baf839
RO
1653 fa = fib_find_alias(fa_head, tos, 0);
1654
1655 if (!fa)
1656 return -ESRCH;
1657
0c7770c7 1658 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
19baf839
RO
1659
1660 fa_to_delete = NULL;
936f6f8e
JA
1661 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1662 list_for_each_entry_continue(fa, fa_head, fa_list) {
19baf839
RO
1663 struct fib_info *fi = fa->fa_info;
1664
1665 if (fa->fa_tos != tos)
1666 break;
1667
4e902c57
TG
1668 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1669 (cfg->fc_scope == RT_SCOPE_NOWHERE ||
37e826c5 1670 fa->fa_info->fib_scope == cfg->fc_scope) &&
74cb3c10
JA
1671 (!cfg->fc_prefsrc ||
1672 fi->fib_prefsrc == cfg->fc_prefsrc) &&
4e902c57
TG
1673 (!cfg->fc_protocol ||
1674 fi->fib_protocol == cfg->fc_protocol) &&
1675 fib_nh_match(cfg, fi) == 0) {
19baf839
RO
1676 fa_to_delete = fa;
1677 break;
1678 }
1679 }
1680
91b9a277
OJ
1681 if (!fa_to_delete)
1682 return -ESRCH;
19baf839 1683
91b9a277 1684 fa = fa_to_delete;
4e902c57 1685 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
b8f55831 1686 &cfg->fc_nlinfo, 0);
91b9a277 1687
2373ce1c 1688 list_del_rcu(&fa->fa_list);
19baf839 1689
21d8c49e
DM
1690 if (!plen)
1691 tb->tb_num_default--;
1692
91b9a277 1693 if (list_empty(fa_head)) {
2373ce1c 1694 hlist_del_rcu(&li->hlist);
91b9a277 1695 free_leaf_info(li);
2373ce1c 1696 }
19baf839 1697
91b9a277 1698 if (hlist_empty(&l->list))
9195bef7 1699 trie_leaf_remove(t, l);
19baf839 1700
91b9a277 1701 if (fa->fa_state & FA_S_ACCESSED)
4ccfe6d4 1702 rt_cache_flush(cfg->fc_nlinfo.nl_net);
19baf839 1703
2373ce1c
RO
1704 fib_release_info(fa->fa_info);
1705 alias_free_mem_rcu(fa);
91b9a277 1706 return 0;
19baf839
RO
1707}
1708
ef3660ce 1709static int trie_flush_list(struct list_head *head)
19baf839
RO
1710{
1711 struct fib_alias *fa, *fa_node;
1712 int found = 0;
1713
1714 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1715 struct fib_info *fi = fa->fa_info;
19baf839 1716
2373ce1c
RO
1717 if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1718 list_del_rcu(&fa->fa_list);
1719 fib_release_info(fa->fa_info);
1720 alias_free_mem_rcu(fa);
19baf839
RO
1721 found++;
1722 }
1723 }
1724 return found;
1725}
1726
ef3660ce 1727static int trie_flush_leaf(struct leaf *l)
19baf839
RO
1728{
1729 int found = 0;
1730 struct hlist_head *lih = &l->list;
b67bfe0d 1731 struct hlist_node *tmp;
19baf839
RO
1732 struct leaf_info *li = NULL;
1733
b67bfe0d 1734 hlist_for_each_entry_safe(li, tmp, lih, hlist) {
ef3660ce 1735 found += trie_flush_list(&li->falh);
19baf839
RO
1736
1737 if (list_empty(&li->falh)) {
2373ce1c 1738 hlist_del_rcu(&li->hlist);
19baf839
RO
1739 free_leaf_info(li);
1740 }
1741 }
1742 return found;
1743}
1744
82cfbb00
SH
1745/*
1746 * Scan for the next right leaf starting at node p->child[idx]
1747 * Since we have back pointer, no recursion necessary.
1748 */
b299e4f0 1749static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
19baf839 1750{
82cfbb00
SH
1751 do {
1752 t_key idx;
c877efb2 1753
c877efb2 1754 if (c)
82cfbb00 1755 idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
c877efb2 1756 else
82cfbb00 1757 idx = 0;
2373ce1c 1758
82cfbb00
SH
1759 while (idx < 1u << p->bits) {
1760 c = tnode_get_child_rcu(p, idx++);
2373ce1c 1761 if (!c)
91b9a277
OJ
1762 continue;
1763
82cfbb00 1764 if (IS_LEAF(c)) {
0a5c0475 1765 prefetch(rcu_dereference_rtnl(p->child[idx]));
82cfbb00 1766 return (struct leaf *) c;
19baf839 1767 }
82cfbb00
SH
1768
1769 /* Rescan start scanning in new node */
1770 p = (struct tnode *) c;
1771 idx = 0;
19baf839 1772 }
82cfbb00
SH
1773
1774 /* Node empty, walk back up to parent */
b299e4f0 1775 c = (struct rt_trie_node *) p;
a034ee3c 1776 } while ((p = node_parent_rcu(c)) != NULL);
82cfbb00
SH
1777
1778 return NULL; /* Root of trie */
1779}
1780
82cfbb00
SH
1781static struct leaf *trie_firstleaf(struct trie *t)
1782{
a034ee3c 1783 struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);
82cfbb00
SH
1784
1785 if (!n)
1786 return NULL;
1787
1788 if (IS_LEAF(n)) /* trie is just a leaf */
1789 return (struct leaf *) n;
1790
1791 return leaf_walk_rcu(n, NULL);
1792}
1793
1794static struct leaf *trie_nextleaf(struct leaf *l)
1795{
b299e4f0 1796 struct rt_trie_node *c = (struct rt_trie_node *) l;
b902e573 1797 struct tnode *p = node_parent_rcu(c);
82cfbb00
SH
1798
1799 if (!p)
1800 return NULL; /* trie with just one leaf */
1801
1802 return leaf_walk_rcu(p, c);
19baf839
RO
1803}
1804
71d67e66
SH
1805static struct leaf *trie_leafindex(struct trie *t, int index)
1806{
1807 struct leaf *l = trie_firstleaf(t);
1808
ec28cf73 1809 while (l && index-- > 0)
71d67e66 1810 l = trie_nextleaf(l);
ec28cf73 1811
71d67e66
SH
1812 return l;
1813}
1814
1815
d562f1f8
RO
1816/*
1817 * Caller must hold RTNL.
1818 */
16c6cf8b 1819int fib_table_flush(struct fib_table *tb)
19baf839
RO
1820{
1821 struct trie *t = (struct trie *) tb->tb_data;
9195bef7 1822 struct leaf *l, *ll = NULL;
82cfbb00 1823 int found = 0;
19baf839 1824
82cfbb00 1825 for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
ef3660ce 1826 found += trie_flush_leaf(l);
19baf839
RO
1827
1828 if (ll && hlist_empty(&ll->list))
9195bef7 1829 trie_leaf_remove(t, ll);
19baf839
RO
1830 ll = l;
1831 }
1832
1833 if (ll && hlist_empty(&ll->list))
9195bef7 1834 trie_leaf_remove(t, ll);
19baf839 1835
0c7770c7 1836 pr_debug("trie_flush found=%d\n", found);
19baf839
RO
1837 return found;
1838}
1839
4aa2c466
PE
1840void fib_free_table(struct fib_table *tb)
1841{
1842 kfree(tb);
1843}
1844
a07f5f50
SH
1845static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1846 struct fib_table *tb,
19baf839
RO
1847 struct sk_buff *skb, struct netlink_callback *cb)
1848{
1849 int i, s_i;
1850 struct fib_alias *fa;
32ab5f80 1851 __be32 xkey = htonl(key);
19baf839 1852
71d67e66 1853 s_i = cb->args[5];
19baf839
RO
1854 i = 0;
1855
2373ce1c
RO
1856 /* rcu_read_lock is hold by caller */
1857
1858 list_for_each_entry_rcu(fa, fah, fa_list) {
19baf839
RO
1859 if (i < s_i) {
1860 i++;
1861 continue;
1862 }
19baf839 1863
15e47304 1864 if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
19baf839
RO
1865 cb->nlh->nlmsg_seq,
1866 RTM_NEWROUTE,
1867 tb->tb_id,
1868 fa->fa_type,
be403ea1 1869 xkey,
19baf839
RO
1870 plen,
1871 fa->fa_tos,
64347f78 1872 fa->fa_info, NLM_F_MULTI) < 0) {
71d67e66 1873 cb->args[5] = i;
19baf839 1874 return -1;
91b9a277 1875 }
19baf839
RO
1876 i++;
1877 }
71d67e66 1878 cb->args[5] = i;
19baf839
RO
1879 return skb->len;
1880}
1881
a88ee229
SH
1882static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1883 struct sk_buff *skb, struct netlink_callback *cb)
19baf839 1884{
a88ee229 1885 struct leaf_info *li;
a88ee229 1886 int i, s_i;
19baf839 1887
71d67e66 1888 s_i = cb->args[4];
a88ee229 1889 i = 0;
19baf839 1890
a88ee229 1891 /* rcu_read_lock is hold by caller */
b67bfe0d 1892 hlist_for_each_entry_rcu(li, &l->list, hlist) {
a88ee229
SH
1893 if (i < s_i) {
1894 i++;
19baf839 1895 continue;
a88ee229 1896 }
91b9a277 1897
a88ee229 1898 if (i > s_i)
71d67e66 1899 cb->args[5] = 0;
19baf839 1900
a88ee229 1901 if (list_empty(&li->falh))
19baf839
RO
1902 continue;
1903
a88ee229 1904 if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
71d67e66 1905 cb->args[4] = i;
19baf839
RO
1906 return -1;
1907 }
a88ee229 1908 i++;
19baf839 1909 }
a88ee229 1910
71d67e66 1911 cb->args[4] = i;
19baf839
RO
1912 return skb->len;
1913}
1914
16c6cf8b
SH
1915int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1916 struct netlink_callback *cb)
19baf839 1917{
a88ee229 1918 struct leaf *l;
19baf839 1919 struct trie *t = (struct trie *) tb->tb_data;
d5ce8a0e 1920 t_key key = cb->args[2];
71d67e66 1921 int count = cb->args[3];
19baf839 1922
2373ce1c 1923 rcu_read_lock();
d5ce8a0e
SH
1924 /* Dump starting at last key.
1925 * Note: 0.0.0.0/0 (ie default) is first key.
1926 */
71d67e66 1927 if (count == 0)
d5ce8a0e
SH
1928 l = trie_firstleaf(t);
1929 else {
71d67e66
SH
1930 /* Normally, continue from last key, but if that is missing
1931 * fallback to using slow rescan
1932 */
d5ce8a0e 1933 l = fib_find_node(t, key);
71d67e66
SH
1934 if (!l)
1935 l = trie_leafindex(t, count);
d5ce8a0e 1936 }
a88ee229 1937
d5ce8a0e
SH
1938 while (l) {
1939 cb->args[2] = l->key;
a88ee229 1940 if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
71d67e66 1941 cb->args[3] = count;
a88ee229 1942 rcu_read_unlock();
a88ee229 1943 return -1;
19baf839 1944 }
d5ce8a0e 1945
71d67e66 1946 ++count;
d5ce8a0e 1947 l = trie_nextleaf(l);
71d67e66
SH
1948 memset(&cb->args[4], 0,
1949 sizeof(cb->args) - 4*sizeof(cb->args[0]));
19baf839 1950 }
71d67e66 1951 cb->args[3] = count;
2373ce1c 1952 rcu_read_unlock();
a88ee229 1953
19baf839 1954 return skb->len;
19baf839
RO
1955}
1956
5348ba85 1957void __init fib_trie_init(void)
7f9b8052 1958{
a07f5f50
SH
1959 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1960 sizeof(struct fib_alias),
bc3c8c1e
SH
1961 0, SLAB_PANIC, NULL);
1962
1963 trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
1964 max(sizeof(struct leaf),
1965 sizeof(struct leaf_info)),
1966 0, SLAB_PANIC, NULL);
7f9b8052 1967}
19baf839 1968
7f9b8052 1969
5348ba85 1970struct fib_table *fib_trie_table(u32 id)
19baf839
RO
1971{
1972 struct fib_table *tb;
1973 struct trie *t;
1974
19baf839
RO
1975 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1976 GFP_KERNEL);
1977 if (tb == NULL)
1978 return NULL;
1979
1980 tb->tb_id = id;
971b893e 1981 tb->tb_default = -1;
21d8c49e 1982 tb->tb_num_default = 0;
19baf839
RO
1983
1984 t = (struct trie *) tb->tb_data;
c28a1cf4 1985 memset(t, 0, sizeof(*t));
19baf839 1986
19baf839
RO
1987 return tb;
1988}
1989
cb7b593c
SH
1990#ifdef CONFIG_PROC_FS
1991/* Depth first Trie walk iterator */
1992struct fib_trie_iter {
1c340b2f 1993 struct seq_net_private p;
3d3b2d25 1994 struct fib_table *tb;
cb7b593c 1995 struct tnode *tnode;
a034ee3c
ED
1996 unsigned int index;
1997 unsigned int depth;
cb7b593c 1998};
19baf839 1999
b299e4f0 2000static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
19baf839 2001{
cb7b593c 2002 struct tnode *tn = iter->tnode;
a034ee3c 2003 unsigned int cindex = iter->index;
cb7b593c 2004 struct tnode *p;
19baf839 2005
6640e697
EB
2006 /* A single entry routing table */
2007 if (!tn)
2008 return NULL;
2009
cb7b593c
SH
2010 pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2011 iter->tnode, iter->index, iter->depth);
2012rescan:
2013 while (cindex < (1<<tn->bits)) {
b299e4f0 2014 struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);
19baf839 2015
cb7b593c
SH
2016 if (n) {
2017 if (IS_LEAF(n)) {
2018 iter->tnode = tn;
2019 iter->index = cindex + 1;
2020 } else {
2021 /* push down one level */
2022 iter->tnode = (struct tnode *) n;
2023 iter->index = 0;
2024 ++iter->depth;
2025 }
2026 return n;
2027 }
19baf839 2028
cb7b593c
SH
2029 ++cindex;
2030 }
91b9a277 2031
cb7b593c 2032 /* Current node exhausted, pop back up */
b299e4f0 2033 p = node_parent_rcu((struct rt_trie_node *)tn);
cb7b593c
SH
2034 if (p) {
2035 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2036 tn = p;
2037 --iter->depth;
2038 goto rescan;
19baf839 2039 }
cb7b593c
SH
2040
2041 /* got root? */
2042 return NULL;
19baf839
RO
2043}
2044
b299e4f0 2045static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
cb7b593c 2046 struct trie *t)
19baf839 2047{
b299e4f0 2048 struct rt_trie_node *n;
5ddf0eb2 2049
132adf54 2050 if (!t)
5ddf0eb2
RO
2051 return NULL;
2052
2053 n = rcu_dereference(t->trie);
3d3b2d25 2054 if (!n)
5ddf0eb2 2055 return NULL;
19baf839 2056
3d3b2d25
SH
2057 if (IS_TNODE(n)) {
2058 iter->tnode = (struct tnode *) n;
2059 iter->index = 0;
2060 iter->depth = 1;
2061 } else {
2062 iter->tnode = NULL;
2063 iter->index = 0;
2064 iter->depth = 0;
91b9a277 2065 }
3d3b2d25
SH
2066
2067 return n;
cb7b593c 2068}
91b9a277 2069
cb7b593c
SH
2070static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2071{
b299e4f0 2072 struct rt_trie_node *n;
cb7b593c 2073 struct fib_trie_iter iter;
91b9a277 2074
cb7b593c 2075 memset(s, 0, sizeof(*s));
91b9a277 2076
cb7b593c 2077 rcu_read_lock();
3d3b2d25 2078 for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
cb7b593c 2079 if (IS_LEAF(n)) {
93672292
SH
2080 struct leaf *l = (struct leaf *)n;
2081 struct leaf_info *li;
93672292 2082
cb7b593c
SH
2083 s->leaves++;
2084 s->totdepth += iter.depth;
2085 if (iter.depth > s->maxdepth)
2086 s->maxdepth = iter.depth;
93672292 2087
b67bfe0d 2088 hlist_for_each_entry_rcu(li, &l->list, hlist)
93672292 2089 ++s->prefixes;
cb7b593c
SH
2090 } else {
2091 const struct tnode *tn = (const struct tnode *) n;
2092 int i;
2093
2094 s->tnodes++;
132adf54 2095 if (tn->bits < MAX_STAT_DEPTH)
06ef921d
RO
2096 s->nodesizes[tn->bits]++;
2097
cb7b593c
SH
2098 for (i = 0; i < (1<<tn->bits); i++)
2099 if (!tn->child[i])
2100 s->nullpointers++;
19baf839 2101 }
19baf839 2102 }
2373ce1c 2103 rcu_read_unlock();
19baf839
RO
2104}
2105
cb7b593c
SH
2106/*
2107 * This outputs /proc/net/fib_triestats
2108 */
2109static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
19baf839 2110{
a034ee3c 2111 unsigned int i, max, pointers, bytes, avdepth;
c877efb2 2112
cb7b593c
SH
2113 if (stat->leaves)
2114 avdepth = stat->totdepth*100 / stat->leaves;
2115 else
2116 avdepth = 0;
91b9a277 2117
a07f5f50
SH
2118 seq_printf(seq, "\tAver depth: %u.%02d\n",
2119 avdepth / 100, avdepth % 100);
cb7b593c 2120 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
91b9a277 2121
cb7b593c 2122 seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
cb7b593c 2123 bytes = sizeof(struct leaf) * stat->leaves;
93672292
SH
2124
2125 seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
2126 bytes += sizeof(struct leaf_info) * stat->prefixes;
2127
187b5188 2128 seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
cb7b593c 2129 bytes += sizeof(struct tnode) * stat->tnodes;
19baf839 2130
06ef921d
RO
2131 max = MAX_STAT_DEPTH;
2132 while (max > 0 && stat->nodesizes[max-1] == 0)
cb7b593c 2133 max--;
19baf839 2134
cb7b593c
SH
2135 pointers = 0;
2136 for (i = 1; i <= max; i++)
2137 if (stat->nodesizes[i] != 0) {
187b5188 2138 seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
cb7b593c
SH
2139 pointers += (1<<i) * stat->nodesizes[i];
2140 }
2141 seq_putc(seq, '\n');
187b5188 2142 seq_printf(seq, "\tPointers: %u\n", pointers);
2373ce1c 2143
b299e4f0 2144 bytes += sizeof(struct rt_trie_node *) * pointers;
187b5188
SH
2145 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2146 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
66a2f7fd 2147}
2373ce1c 2148
cb7b593c 2149#ifdef CONFIG_IP_FIB_TRIE_STATS
66a2f7fd
SH
2150static void trie_show_usage(struct seq_file *seq,
2151 const struct trie_use_stats *stats)
2152{
2153 seq_printf(seq, "\nCounters:\n---------\n");
a07f5f50
SH
2154 seq_printf(seq, "gets = %u\n", stats->gets);
2155 seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2156 seq_printf(seq, "semantic match passed = %u\n",
2157 stats->semantic_match_passed);
2158 seq_printf(seq, "semantic match miss = %u\n",
2159 stats->semantic_match_miss);
2160 seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2161 seq_printf(seq, "skipped node resize = %u\n\n",
2162 stats->resize_node_skipped);
cb7b593c 2163}
66a2f7fd
SH
2164#endif /* CONFIG_IP_FIB_TRIE_STATS */
2165
3d3b2d25 2166static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
d717a9a6 2167{
3d3b2d25
SH
2168 if (tb->tb_id == RT_TABLE_LOCAL)
2169 seq_puts(seq, "Local:\n");
2170 else if (tb->tb_id == RT_TABLE_MAIN)
2171 seq_puts(seq, "Main:\n");
2172 else
2173 seq_printf(seq, "Id %d:\n", tb->tb_id);
d717a9a6 2174}
19baf839 2175
3d3b2d25 2176
cb7b593c
SH
2177static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2178{
1c340b2f 2179 struct net *net = (struct net *)seq->private;
3d3b2d25 2180 unsigned int h;
877a9bff 2181
d717a9a6 2182 seq_printf(seq,
a07f5f50
SH
2183 "Basic info: size of leaf:"
2184 " %Zd bytes, size of tnode: %Zd bytes.\n",
d717a9a6
SH
2185 sizeof(struct leaf), sizeof(struct tnode));
2186
3d3b2d25
SH
2187 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2188 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
3d3b2d25
SH
2189 struct fib_table *tb;
2190
b67bfe0d 2191 hlist_for_each_entry_rcu(tb, head, tb_hlist) {
3d3b2d25
SH
2192 struct trie *t = (struct trie *) tb->tb_data;
2193 struct trie_stat stat;
877a9bff 2194
3d3b2d25
SH
2195 if (!t)
2196 continue;
2197
2198 fib_table_print(seq, tb);
2199
2200 trie_collect_stats(t, &stat);
2201 trie_show_stats(seq, &stat);
2202#ifdef CONFIG_IP_FIB_TRIE_STATS
2203 trie_show_usage(seq, &t->stats);
2204#endif
2205 }
2206 }
19baf839 2207
cb7b593c 2208 return 0;
19baf839
RO
2209}
2210
cb7b593c 2211static int fib_triestat_seq_open(struct inode *inode, struct file *file)
19baf839 2212{
de05c557 2213 return single_open_net(inode, file, fib_triestat_seq_show);
1c340b2f
DL
2214}
2215
9a32144e 2216static const struct file_operations fib_triestat_fops = {
cb7b593c
SH
2217 .owner = THIS_MODULE,
2218 .open = fib_triestat_seq_open,
2219 .read = seq_read,
2220 .llseek = seq_lseek,
b6fcbdb4 2221 .release = single_release_net,
cb7b593c
SH
2222};
2223
b299e4f0 2224static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
19baf839 2225{
1218854a
YH
2226 struct fib_trie_iter *iter = seq->private;
2227 struct net *net = seq_file_net(seq);
cb7b593c 2228 loff_t idx = 0;
3d3b2d25 2229 unsigned int h;
cb7b593c 2230
3d3b2d25
SH
2231 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2232 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
3d3b2d25 2233 struct fib_table *tb;
cb7b593c 2234
b67bfe0d 2235 hlist_for_each_entry_rcu(tb, head, tb_hlist) {
b299e4f0 2236 struct rt_trie_node *n;
3d3b2d25
SH
2237
2238 for (n = fib_trie_get_first(iter,
2239 (struct trie *) tb->tb_data);
2240 n; n = fib_trie_get_next(iter))
2241 if (pos == idx++) {
2242 iter->tb = tb;
2243 return n;
2244 }
2245 }
cb7b593c 2246 }
3d3b2d25 2247
19baf839
RO
2248 return NULL;
2249}
2250
cb7b593c 2251static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
c95aaf9a 2252 __acquires(RCU)
19baf839 2253{
cb7b593c 2254 rcu_read_lock();
1218854a 2255 return fib_trie_get_idx(seq, *pos);
19baf839
RO
2256}
2257
cb7b593c 2258static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
19baf839 2259{
cb7b593c 2260 struct fib_trie_iter *iter = seq->private;
1218854a 2261 struct net *net = seq_file_net(seq);
3d3b2d25
SH
2262 struct fib_table *tb = iter->tb;
2263 struct hlist_node *tb_node;
2264 unsigned int h;
b299e4f0 2265 struct rt_trie_node *n;
cb7b593c 2266
19baf839 2267 ++*pos;
3d3b2d25
SH
2268 /* next node in same table */
2269 n = fib_trie_get_next(iter);
2270 if (n)
2271 return n;
19baf839 2272
3d3b2d25
SH
2273 /* walk rest of this hash chain */
2274 h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
0a5c0475 2275 while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
3d3b2d25
SH
2276 tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2277 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2278 if (n)
2279 goto found;
2280 }
19baf839 2281
3d3b2d25
SH
2282 /* new hash chain */
2283 while (++h < FIB_TABLE_HASHSZ) {
2284 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
b67bfe0d 2285 hlist_for_each_entry_rcu(tb, head, tb_hlist) {
3d3b2d25
SH
2286 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2287 if (n)
2288 goto found;
2289 }
2290 }
cb7b593c 2291 return NULL;
3d3b2d25
SH
2292
2293found:
2294 iter->tb = tb;
2295 return n;
cb7b593c 2296}
19baf839 2297
cb7b593c 2298static void fib_trie_seq_stop(struct seq_file *seq, void *v)
c95aaf9a 2299 __releases(RCU)
19baf839 2300{
cb7b593c
SH
2301 rcu_read_unlock();
2302}
91b9a277 2303
cb7b593c
SH
2304static void seq_indent(struct seq_file *seq, int n)
2305{
a034ee3c
ED
2306 while (n-- > 0)
2307 seq_puts(seq, " ");
cb7b593c 2308}
19baf839 2309
28d36e37 2310static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
cb7b593c 2311{
132adf54 2312 switch (s) {
cb7b593c
SH
2313 case RT_SCOPE_UNIVERSE: return "universe";
2314 case RT_SCOPE_SITE: return "site";
2315 case RT_SCOPE_LINK: return "link";
2316 case RT_SCOPE_HOST: return "host";
2317 case RT_SCOPE_NOWHERE: return "nowhere";
2318 default:
28d36e37 2319 snprintf(buf, len, "scope=%d", s);
cb7b593c
SH
2320 return buf;
2321 }
2322}
19baf839 2323
36cbd3dc 2324static const char *const rtn_type_names[__RTN_MAX] = {
cb7b593c
SH
2325 [RTN_UNSPEC] = "UNSPEC",
2326 [RTN_UNICAST] = "UNICAST",
2327 [RTN_LOCAL] = "LOCAL",
2328 [RTN_BROADCAST] = "BROADCAST",
2329 [RTN_ANYCAST] = "ANYCAST",
2330 [RTN_MULTICAST] = "MULTICAST",
2331 [RTN_BLACKHOLE] = "BLACKHOLE",
2332 [RTN_UNREACHABLE] = "UNREACHABLE",
2333 [RTN_PROHIBIT] = "PROHIBIT",
2334 [RTN_THROW] = "THROW",
2335 [RTN_NAT] = "NAT",
2336 [RTN_XRESOLVE] = "XRESOLVE",
2337};
19baf839 2338
a034ee3c 2339static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
cb7b593c 2340{
cb7b593c
SH
2341 if (t < __RTN_MAX && rtn_type_names[t])
2342 return rtn_type_names[t];
28d36e37 2343 snprintf(buf, len, "type %u", t);
cb7b593c 2344 return buf;
19baf839
RO
2345}
2346
cb7b593c
SH
2347/* Pretty print the trie */
2348static int fib_trie_seq_show(struct seq_file *seq, void *v)
19baf839 2349{
cb7b593c 2350 const struct fib_trie_iter *iter = seq->private;
b299e4f0 2351 struct rt_trie_node *n = v;
c877efb2 2352
3d3b2d25
SH
2353 if (!node_parent_rcu(n))
2354 fib_table_print(seq, iter->tb);
095b8501 2355
cb7b593c
SH
2356 if (IS_TNODE(n)) {
2357 struct tnode *tn = (struct tnode *) n;
ab66b4a7 2358 __be32 prf = htonl(mask_pfx(tn->key, tn->pos));
91b9a277 2359
1d25cd6c 2360 seq_indent(seq, iter->depth-1);
673d57e7
HH
2361 seq_printf(seq, " +-- %pI4/%d %d %d %d\n",
2362 &prf, tn->pos, tn->bits, tn->full_children,
1d25cd6c 2363 tn->empty_children);
e905a9ed 2364
cb7b593c
SH
2365 } else {
2366 struct leaf *l = (struct leaf *) n;
1328042e 2367 struct leaf_info *li;
32ab5f80 2368 __be32 val = htonl(l->key);
cb7b593c
SH
2369
2370 seq_indent(seq, iter->depth);
673d57e7 2371 seq_printf(seq, " |-- %pI4\n", &val);
1328042e 2372
b67bfe0d 2373 hlist_for_each_entry_rcu(li, &l->list, hlist) {
1328042e
SH
2374 struct fib_alias *fa;
2375
2376 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2377 char buf1[32], buf2[32];
2378
2379 seq_indent(seq, iter->depth+1);
2380 seq_printf(seq, " /%d %s %s", li->plen,
2381 rtn_scope(buf1, sizeof(buf1),
37e826c5 2382 fa->fa_info->fib_scope),
1328042e
SH
2383 rtn_type(buf2, sizeof(buf2),
2384 fa->fa_type));
2385 if (fa->fa_tos)
b9c4d82a 2386 seq_printf(seq, " tos=%d", fa->fa_tos);
1328042e 2387 seq_putc(seq, '\n');
cb7b593c
SH
2388 }
2389 }
19baf839 2390 }
cb7b593c 2391
19baf839
RO
2392 return 0;
2393}
2394
f690808e 2395static const struct seq_operations fib_trie_seq_ops = {
cb7b593c
SH
2396 .start = fib_trie_seq_start,
2397 .next = fib_trie_seq_next,
2398 .stop = fib_trie_seq_stop,
2399 .show = fib_trie_seq_show,
19baf839
RO
2400};
2401
cb7b593c 2402static int fib_trie_seq_open(struct inode *inode, struct file *file)
19baf839 2403{
1c340b2f
DL
2404 return seq_open_net(inode, file, &fib_trie_seq_ops,
2405 sizeof(struct fib_trie_iter));
19baf839
RO
2406}
2407
9a32144e 2408static const struct file_operations fib_trie_fops = {
cb7b593c
SH
2409 .owner = THIS_MODULE,
2410 .open = fib_trie_seq_open,
2411 .read = seq_read,
2412 .llseek = seq_lseek,
1c340b2f 2413 .release = seq_release_net,
19baf839
RO
2414};
2415
8315f5d8
SH
2416struct fib_route_iter {
2417 struct seq_net_private p;
2418 struct trie *main_trie;
2419 loff_t pos;
2420 t_key key;
2421};
2422
2423static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2424{
2425 struct leaf *l = NULL;
2426 struct trie *t = iter->main_trie;
2427
2428 /* use cache location of last found key */
2429 if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2430 pos -= iter->pos;
2431 else {
2432 iter->pos = 0;
2433 l = trie_firstleaf(t);
2434 }
2435
2436 while (l && pos-- > 0) {
2437 iter->pos++;
2438 l = trie_nextleaf(l);
2439 }
2440
2441 if (l)
2442 iter->key = pos; /* remember it */
2443 else
2444 iter->pos = 0; /* forget it */
2445
2446 return l;
2447}
2448
2449static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2450 __acquires(RCU)
2451{
2452 struct fib_route_iter *iter = seq->private;
2453 struct fib_table *tb;
2454
2455 rcu_read_lock();
1218854a 2456 tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
8315f5d8
SH
2457 if (!tb)
2458 return NULL;
2459
2460 iter->main_trie = (struct trie *) tb->tb_data;
2461 if (*pos == 0)
2462 return SEQ_START_TOKEN;
2463 else
2464 return fib_route_get_idx(iter, *pos - 1);
2465}
2466
2467static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2468{
2469 struct fib_route_iter *iter = seq->private;
2470 struct leaf *l = v;
2471
2472 ++*pos;
2473 if (v == SEQ_START_TOKEN) {
2474 iter->pos = 0;
2475 l = trie_firstleaf(iter->main_trie);
2476 } else {
2477 iter->pos++;
2478 l = trie_nextleaf(l);
2479 }
2480
2481 if (l)
2482 iter->key = l->key;
2483 else
2484 iter->pos = 0;
2485 return l;
2486}
2487
2488static void fib_route_seq_stop(struct seq_file *seq, void *v)
2489 __releases(RCU)
2490{
2491 rcu_read_unlock();
2492}
2493
a034ee3c 2494static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
19baf839 2495{
a034ee3c 2496 unsigned int flags = 0;
19baf839 2497
a034ee3c
ED
2498 if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
2499 flags = RTF_REJECT;
cb7b593c
SH
2500 if (fi && fi->fib_nh->nh_gw)
2501 flags |= RTF_GATEWAY;
32ab5f80 2502 if (mask == htonl(0xFFFFFFFF))
cb7b593c
SH
2503 flags |= RTF_HOST;
2504 flags |= RTF_UP;
2505 return flags;
19baf839
RO
2506}
2507
cb7b593c
SH
2508/*
2509 * This outputs /proc/net/route.
2510 * The format of the file is not supposed to be changed
a034ee3c 2511 * and needs to be same as fib_hash output to avoid breaking
cb7b593c
SH
2512 * legacy utilities
2513 */
2514static int fib_route_seq_show(struct seq_file *seq, void *v)
19baf839 2515{
cb7b593c 2516 struct leaf *l = v;
1328042e 2517 struct leaf_info *li;
19baf839 2518
cb7b593c
SH
2519 if (v == SEQ_START_TOKEN) {
2520 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2521 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2522 "\tWindow\tIRTT");
2523 return 0;
2524 }
19baf839 2525
b67bfe0d 2526 hlist_for_each_entry_rcu(li, &l->list, hlist) {
cb7b593c 2527 struct fib_alias *fa;
32ab5f80 2528 __be32 mask, prefix;
91b9a277 2529
cb7b593c
SH
2530 mask = inet_make_mask(li->plen);
2531 prefix = htonl(l->key);
19baf839 2532
cb7b593c 2533 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1371e37d 2534 const struct fib_info *fi = fa->fa_info;
a034ee3c 2535 unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
5e659e4c 2536 int len;
19baf839 2537
cb7b593c
SH
2538 if (fa->fa_type == RTN_BROADCAST
2539 || fa->fa_type == RTN_MULTICAST)
2540 continue;
19baf839 2541
cb7b593c 2542 if (fi)
5e659e4c
PE
2543 seq_printf(seq,
2544 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2545 "%d\t%08X\t%d\t%u\t%u%n",
cb7b593c
SH
2546 fi->fib_dev ? fi->fib_dev->name : "*",
2547 prefix,
2548 fi->fib_nh->nh_gw, flags, 0, 0,
2549 fi->fib_priority,
2550 mask,
a07f5f50
SH
2551 (fi->fib_advmss ?
2552 fi->fib_advmss + 40 : 0),
cb7b593c 2553 fi->fib_window,
5e659e4c 2554 fi->fib_rtt >> 3, &len);
cb7b593c 2555 else
5e659e4c
PE
2556 seq_printf(seq,
2557 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2558 "%d\t%08X\t%d\t%u\t%u%n",
cb7b593c 2559 prefix, 0, flags, 0, 0, 0,
5e659e4c 2560 mask, 0, 0, 0, &len);
19baf839 2561
5e659e4c 2562 seq_printf(seq, "%*s\n", 127 - len, "");
cb7b593c 2563 }
19baf839
RO
2564 }
2565
2566 return 0;
2567}
2568
f690808e 2569static const struct seq_operations fib_route_seq_ops = {
8315f5d8
SH
2570 .start = fib_route_seq_start,
2571 .next = fib_route_seq_next,
2572 .stop = fib_route_seq_stop,
cb7b593c 2573 .show = fib_route_seq_show,
19baf839
RO
2574};
2575
cb7b593c 2576static int fib_route_seq_open(struct inode *inode, struct file *file)
19baf839 2577{
1c340b2f 2578 return seq_open_net(inode, file, &fib_route_seq_ops,
8315f5d8 2579 sizeof(struct fib_route_iter));
19baf839
RO
2580}
2581
9a32144e 2582static const struct file_operations fib_route_fops = {
cb7b593c
SH
2583 .owner = THIS_MODULE,
2584 .open = fib_route_seq_open,
2585 .read = seq_read,
2586 .llseek = seq_lseek,
1c340b2f 2587 .release = seq_release_net,
19baf839
RO
2588};
2589
61a02653 2590int __net_init fib_proc_init(struct net *net)
19baf839 2591{
d4beaa66 2592 if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
cb7b593c
SH
2593 goto out1;
2594
d4beaa66
G
2595 if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
2596 &fib_triestat_fops))
cb7b593c
SH
2597 goto out2;
2598
d4beaa66 2599 if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
cb7b593c
SH
2600 goto out3;
2601
19baf839 2602 return 0;
cb7b593c
SH
2603
2604out3:
ece31ffd 2605 remove_proc_entry("fib_triestat", net->proc_net);
cb7b593c 2606out2:
ece31ffd 2607 remove_proc_entry("fib_trie", net->proc_net);
cb7b593c
SH
2608out1:
2609 return -ENOMEM;
19baf839
RO
2610}
2611
61a02653 2612void __net_exit fib_proc_exit(struct net *net)
19baf839 2613{
ece31ffd
G
2614 remove_proc_entry("fib_trie", net->proc_net);
2615 remove_proc_entry("fib_triestat", net->proc_net);
2616 remove_proc_entry("route", net->proc_net);
19baf839
RO
2617}
2618
2619#endif /* CONFIG_PROC_FS */