]> git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blame - net/ipv4/fib_semantics.c
netlink: change nlmsg_notify() return value logic
[mirror_ubuntu-bionic-kernel.git] / net / ipv4 / fib_semantics.c
CommitLineData
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * IPv4 Forwarding Information Base: semantics.
7 *
1da177e4
LT
8 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 */
15
1da177e4
LT
16#include <asm/uaccess.h>
17#include <asm/system.h>
18#include <linux/bitops.h>
19#include <linux/types.h>
20#include <linux/kernel.h>
21#include <linux/jiffies.h>
22#include <linux/mm.h>
23#include <linux/string.h>
24#include <linux/socket.h>
25#include <linux/sockios.h>
26#include <linux/errno.h>
27#include <linux/in.h>
28#include <linux/inet.h>
14c85021 29#include <linux/inetdevice.h>
1da177e4
LT
30#include <linux/netdevice.h>
31#include <linux/if_arp.h>
32#include <linux/proc_fs.h>
33#include <linux/skbuff.h>
1da177e4
LT
34#include <linux/init.h>
35
14c85021 36#include <net/arp.h>
1da177e4
LT
37#include <net/ip.h>
38#include <net/protocol.h>
39#include <net/route.h>
40#include <net/tcp.h>
41#include <net/sock.h>
42#include <net/ip_fib.h>
f21c7bc5 43#include <net/netlink.h>
4e902c57 44#include <net/nexthop.h>
1da177e4
LT
45
46#include "fib_lookup.h"
47
832b4c5e 48static DEFINE_SPINLOCK(fib_info_lock);
1da177e4
LT
49static struct hlist_head *fib_info_hash;
50static struct hlist_head *fib_info_laddrhash;
51static unsigned int fib_hash_size;
52static unsigned int fib_info_cnt;
53
54#define DEVINDEX_HASHBITS 8
55#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
56static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
57
/*
 * Nexthop iteration helpers.
 *
 * for_nexthops(fi) / change_nexthops(fi) open a block that declares the
 * loop index "nhsel" and the cursor "nh" (const for for_nexthops, writable
 * for change_nexthops) and start a for loop over fi->fib_nh[].  The block
 * must be closed with endfor_nexthops(fi).  "nhsel" stays visible between
 * the end of the loop body and endfor_nexthops().
 */
#ifdef CONFIG_IP_ROUTE_MULTIPATH

/* Taken around updates of fib_power / nh_power in this file. */
static DEFINE_SPINLOCK(fib_multipath_lock);

#define for_nexthops(fi) {						\
	int nhsel; const struct fib_nh *nh;				\
	for (nhsel = 0, nh = (fi)->fib_nh;				\
	     nhsel < (fi)->fib_nhs;					\
	     nh++, nhsel++)

#define change_nexthops(fi) {						\
	int nhsel; struct fib_nh *nh;					\
	for (nhsel = 0, nh = (struct fib_nh *)((fi)->fib_nh);		\
	     nhsel < (fi)->fib_nhs;					\
	     nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/* Hope, that gcc will optimize it to get rid of dummy loop */

#define for_nexthops(fi) {						\
	int nhsel = 0; const struct fib_nh *nh = (fi)->fib_nh;		\
	for (nhsel = 0; nhsel < 1; nhsel++)

#define change_nexthops(fi) {						\
	int nhsel = 0; struct fib_nh *nh = (struct fib_nh *)((fi)->fib_nh); \
	for (nhsel = 0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

#define endfor_nexthops(fi) }
81
82
e905a9ed 83static const struct
1da177e4
LT
84{
85 int error;
86 u8 scope;
a0ee18b9 87} fib_props[RTN_MAX + 1] = {
e905a9ed 88 {
1da177e4
LT
89 .error = 0,
90 .scope = RT_SCOPE_NOWHERE,
91 }, /* RTN_UNSPEC */
92 {
93 .error = 0,
94 .scope = RT_SCOPE_UNIVERSE,
95 }, /* RTN_UNICAST */
96 {
97 .error = 0,
98 .scope = RT_SCOPE_HOST,
99 }, /* RTN_LOCAL */
100 {
101 .error = 0,
102 .scope = RT_SCOPE_LINK,
103 }, /* RTN_BROADCAST */
104 {
105 .error = 0,
106 .scope = RT_SCOPE_LINK,
107 }, /* RTN_ANYCAST */
108 {
109 .error = 0,
110 .scope = RT_SCOPE_UNIVERSE,
111 }, /* RTN_MULTICAST */
112 {
113 .error = -EINVAL,
114 .scope = RT_SCOPE_UNIVERSE,
115 }, /* RTN_BLACKHOLE */
116 {
117 .error = -EHOSTUNREACH,
118 .scope = RT_SCOPE_UNIVERSE,
119 }, /* RTN_UNREACHABLE */
120 {
121 .error = -EACCES,
122 .scope = RT_SCOPE_UNIVERSE,
123 }, /* RTN_PROHIBIT */
124 {
125 .error = -EAGAIN,
126 .scope = RT_SCOPE_UNIVERSE,
127 }, /* RTN_THROW */
128 {
129 .error = -EINVAL,
130 .scope = RT_SCOPE_NOWHERE,
131 }, /* RTN_NAT */
132 {
133 .error = -EINVAL,
134 .scope = RT_SCOPE_NOWHERE,
135 }, /* RTN_XRESOLVE */
136};
137
138
139/* Release a nexthop info record */
140
141void free_fib_info(struct fib_info *fi)
142{
143 if (fi->fib_dead == 0) {
a6db9010 144 printk(KERN_WARNING "Freeing alive fib_info %p\n", fi);
1da177e4
LT
145 return;
146 }
147 change_nexthops(fi) {
148 if (nh->nh_dev)
149 dev_put(nh->nh_dev);
150 nh->nh_dev = NULL;
151 } endfor_nexthops(fi);
152 fib_info_cnt--;
57d7a600 153 release_net(fi->fib_net);
1da177e4
LT
154 kfree(fi);
155}
156
157void fib_release_info(struct fib_info *fi)
158{
832b4c5e 159 spin_lock_bh(&fib_info_lock);
1da177e4
LT
160 if (fi && --fi->fib_treeref == 0) {
161 hlist_del(&fi->fib_hash);
162 if (fi->fib_prefsrc)
163 hlist_del(&fi->fib_lhash);
164 change_nexthops(fi) {
165 if (!nh->nh_dev)
166 continue;
167 hlist_del(&nh->nh_hash);
168 } endfor_nexthops(fi)
169 fi->fib_dead = 1;
170 fib_info_put(fi);
171 }
832b4c5e 172 spin_unlock_bh(&fib_info_lock);
1da177e4
LT
173}
174
175static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
176{
177 const struct fib_nh *onh = ofi->fib_nh;
178
179 for_nexthops(fi) {
180 if (nh->nh_oif != onh->nh_oif ||
181 nh->nh_gw != onh->nh_gw ||
182 nh->nh_scope != onh->nh_scope ||
183#ifdef CONFIG_IP_ROUTE_MULTIPATH
184 nh->nh_weight != onh->nh_weight ||
185#endif
186#ifdef CONFIG_NET_CLS_ROUTE
187 nh->nh_tclassid != onh->nh_tclassid ||
188#endif
189 ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
190 return -1;
191 onh++;
192 } endfor_nexthops(fi);
193 return 0;
194}
195
88ebc72f
DM
196static inline unsigned int fib_devindex_hashfn(unsigned int val)
197{
198 unsigned int mask = DEVINDEX_HASHSIZE - 1;
199
200 return (val ^
201 (val >> DEVINDEX_HASHBITS) ^
202 (val >> (DEVINDEX_HASHBITS * 2))) & mask;
203}
204
1da177e4
LT
205static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
206{
207 unsigned int mask = (fib_hash_size - 1);
208 unsigned int val = fi->fib_nhs;
209
210 val ^= fi->fib_protocol;
81f7bf6c 211 val ^= (__force u32)fi->fib_prefsrc;
1da177e4 212 val ^= fi->fib_priority;
88ebc72f
DM
213 for_nexthops(fi) {
214 val ^= fib_devindex_hashfn(nh->nh_oif);
215 } endfor_nexthops(fi)
1da177e4
LT
216
217 return (val ^ (val >> 7) ^ (val >> 12)) & mask;
218}
219
220static struct fib_info *fib_find_info(const struct fib_info *nfi)
221{
222 struct hlist_head *head;
223 struct hlist_node *node;
224 struct fib_info *fi;
225 unsigned int hash;
226
227 hash = fib_info_hashfn(nfi);
228 head = &fib_info_hash[hash];
229
230 hlist_for_each_entry(fi, node, head, fib_hash) {
4814bdbd
DL
231 if (fi->fib_net != nfi->fib_net)
232 continue;
1da177e4
LT
233 if (fi->fib_nhs != nfi->fib_nhs)
234 continue;
235 if (nfi->fib_protocol == fi->fib_protocol &&
236 nfi->fib_prefsrc == fi->fib_prefsrc &&
237 nfi->fib_priority == fi->fib_priority &&
238 memcmp(nfi->fib_metrics, fi->fib_metrics,
239 sizeof(fi->fib_metrics)) == 0 &&
240 ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
241 (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
242 return fi;
243 }
244
245 return NULL;
246}
247
1da177e4
LT
248/* Check, that the gateway is already configured.
249 Used only by redirect accept routine.
250 */
251
d878e72e 252int ip_fib_check_default(__be32 gw, struct net_device *dev)
1da177e4
LT
253{
254 struct hlist_head *head;
255 struct hlist_node *node;
256 struct fib_nh *nh;
257 unsigned int hash;
258
832b4c5e 259 spin_lock(&fib_info_lock);
1da177e4
LT
260
261 hash = fib_devindex_hashfn(dev->ifindex);
262 head = &fib_info_devhash[hash];
263 hlist_for_each_entry(nh, node, head, nh_hash) {
264 if (nh->nh_dev == dev &&
265 nh->nh_gw == gw &&
266 !(nh->nh_flags&RTNH_F_DEAD)) {
832b4c5e 267 spin_unlock(&fib_info_lock);
1da177e4
LT
268 return 0;
269 }
270 }
271
832b4c5e 272 spin_unlock(&fib_info_lock);
1da177e4
LT
273
274 return -1;
275}
276
339bf98f
TG
277static inline size_t fib_nlmsg_size(struct fib_info *fi)
278{
279 size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
280 + nla_total_size(4) /* RTA_TABLE */
281 + nla_total_size(4) /* RTA_DST */
282 + nla_total_size(4) /* RTA_PRIORITY */
283 + nla_total_size(4); /* RTA_PREFSRC */
284
285 /* space for nested metrics */
286 payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
287
288 if (fi->fib_nhs) {
289 /* Also handles the special case fib_nhs == 1 */
290
291 /* each nexthop is packed in an attribute */
292 size_t nhsize = nla_total_size(sizeof(struct rtnexthop));
293
294 /* may contain flow and gateway attribute */
295 nhsize += 2 * nla_total_size(4);
296
297 /* all nexthops are packed in a nested attribute */
298 payload += nla_total_size(fi->fib_nhs * nhsize);
299 }
300
301 return payload;
302}
303
81f7bf6c 304void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
b8f55831
MK
305 int dst_len, u32 tb_id, struct nl_info *info,
306 unsigned int nlm_flags)
1da177e4
LT
307{
308 struct sk_buff *skb;
4e902c57 309 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
f21c7bc5 310 int err = -ENOBUFS;
1da177e4 311
339bf98f 312 skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
f21c7bc5
TG
313 if (skb == NULL)
314 goto errout;
1da177e4 315
4e902c57 316 err = fib_dump_info(skb, info->pid, seq, event, tb_id,
be403ea1 317 fa->fa_type, fa->fa_scope, key, dst_len,
b8f55831 318 fa->fa_tos, fa->fa_info, nlm_flags);
26932566
PM
319 if (err < 0) {
320 /* -EMSGSIZE implies BUG in fib_nlmsg_size() */
321 WARN_ON(err == -EMSGSIZE);
322 kfree_skb(skb);
323 goto errout;
324 }
1ce85fe4
PNA
325 rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
326 info->nlh, GFP_KERNEL);
327 return;
f21c7bc5
TG
328errout:
329 if (err < 0)
4d1169c1 330 rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
1da177e4
LT
331}
332
333/* Return the first fib alias matching TOS with
334 * priority less than or equal to PRIO.
335 */
336struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
337{
338 if (fah) {
339 struct fib_alias *fa;
340 list_for_each_entry(fa, fah, fa_list) {
341 if (fa->fa_tos > tos)
342 continue;
343 if (fa->fa_info->fib_priority >= prio ||
344 fa->fa_tos < tos)
345 return fa;
346 }
347 }
348 return NULL;
349}
350
351int fib_detect_death(struct fib_info *fi, int order,
c17860a0 352 struct fib_info **last_resort, int *last_idx, int dflt)
1da177e4
LT
353{
354 struct neighbour *n;
355 int state = NUD_NONE;
356
357 n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
358 if (n) {
359 state = n->nud_state;
360 neigh_release(n);
361 }
d9319100 362 if (state == NUD_REACHABLE)
1da177e4 363 return 0;
c17860a0 364 if ((state&NUD_VALID) && order != dflt)
1da177e4
LT
365 return 0;
366 if ((state&NUD_VALID) ||
c17860a0 367 (*last_idx<0 && order > dflt)) {
1da177e4
LT
368 *last_resort = fi;
369 *last_idx = order;
370 }
371 return 1;
372}
373
374#ifdef CONFIG_IP_ROUTE_MULTIPATH
375
/*
 * Count the rtnexthop entries in a multipath attribute of @remaining
 * bytes.  Returns 0 when bytes are left over after the last well formed
 * entry.
 */
static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
{
	int nhs;

	for (nhs = 0; rtnh_ok(rtnh, remaining); nhs++)
		rtnh = rtnh_next(rtnh, &remaining);

	/* leftover implies invalid nexthop configuration, discard it */
	if (remaining > 0)
		return 0;

	return nhs;
}
388
4e902c57
TG
389static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
390 int remaining, struct fib_config *cfg)
1da177e4 391{
1da177e4 392 change_nexthops(fi) {
4e902c57
TG
393 int attrlen;
394
395 if (!rtnh_ok(rtnh, remaining))
1da177e4 396 return -EINVAL;
4e902c57
TG
397
398 nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
399 nh->nh_oif = rtnh->rtnh_ifindex;
400 nh->nh_weight = rtnh->rtnh_hops + 1;
401
402 attrlen = rtnh_attrlen(rtnh);
403 if (attrlen > 0) {
404 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
405
406 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
17fb2c64 407 nh->nh_gw = nla ? nla_get_be32(nla) : 0;
1da177e4 408#ifdef CONFIG_NET_CLS_ROUTE
4e902c57
TG
409 nla = nla_find(attrs, attrlen, RTA_FLOW);
410 nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
1da177e4
LT
411#endif
412 }
4e902c57
TG
413
414 rtnh = rtnh_next(rtnh, &remaining);
1da177e4 415 } endfor_nexthops(fi);
4e902c57 416
1da177e4
LT
417 return 0;
418}
419
420#endif
421
4e902c57 422int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
1da177e4
LT
423{
424#ifdef CONFIG_IP_ROUTE_MULTIPATH
4e902c57
TG
425 struct rtnexthop *rtnh;
426 int remaining;
1da177e4
LT
427#endif
428
4e902c57 429 if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
1da177e4
LT
430 return 1;
431
4e902c57
TG
432 if (cfg->fc_oif || cfg->fc_gw) {
433 if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
434 (!cfg->fc_gw || cfg->fc_gw == fi->fib_nh->nh_gw))
1da177e4
LT
435 return 0;
436 return 1;
437 }
438
439#ifdef CONFIG_IP_ROUTE_MULTIPATH
4e902c57 440 if (cfg->fc_mp == NULL)
1da177e4 441 return 0;
4e902c57
TG
442
443 rtnh = cfg->fc_mp;
444 remaining = cfg->fc_mp_len;
e905a9ed 445
1da177e4 446 for_nexthops(fi) {
4e902c57 447 int attrlen;
1da177e4 448
4e902c57 449 if (!rtnh_ok(rtnh, remaining))
1da177e4 450 return -EINVAL;
4e902c57
TG
451
452 if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
1da177e4 453 return 1;
4e902c57
TG
454
455 attrlen = rtnh_attrlen(rtnh);
456 if (attrlen < 0) {
457 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
458
459 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
17fb2c64 460 if (nla && nla_get_be32(nla) != nh->nh_gw)
1da177e4
LT
461 return 1;
462#ifdef CONFIG_NET_CLS_ROUTE
4e902c57
TG
463 nla = nla_find(attrs, attrlen, RTA_FLOW);
464 if (nla && nla_get_u32(nla) != nh->nh_tclassid)
1da177e4
LT
465 return 1;
466#endif
467 }
4e902c57
TG
468
469 rtnh = rtnh_next(rtnh, &remaining);
1da177e4
LT
470 } endfor_nexthops(fi);
471#endif
472 return 0;
473}
474
475
476/*
477 Picture
478 -------
479
480 Semantics of nexthop is very messy by historical reasons.
481 We have to take into account, that:
482 a) gateway can be actually local interface address,
483 so that gatewayed route is direct.
484 b) gateway must be on-link address, possibly
485 described not by an ifaddr, but also by a direct route.
486 c) If both gateway and interface are specified, they should not
487 contradict.
488 d) If we use tunnel routes, gateway could be not on-link.
489
490 Attempt to reconcile all of these (alas, self-contradictory) conditions
491 results in pretty ugly and hairy code with obscure logic.
492
493 I chose to generalize it instead, so that the size
494 of code does not increase practically, but it becomes
495 much more general.
496 Every prefix is assigned a "scope" value: "host" is local address,
497 "link" is direct route,
498 [ ... "site" ... "interior" ... ]
499 and "universe" is true gateway route with global meaning.
500
501 Every prefix refers to a set of "nexthop"s (gw, oif),
502 where gw must have narrower scope. This recursion stops
503 when gw has LOCAL scope or if "nexthop" is declared ONLINK,
504 which means that gw is forced to be on link.
505
506 Code is still hairy, but now it is apparently logically
507 consistent and very flexible. E.g., as a by-product it allows
508 independent exterior and interior routing processes to
509 co-exist in peace.
510
511 Normally it looks as following.
512
513 {universe prefix} -> (gw, oif) [scope link]
e905a9ed 514 |
1da177e4 515 |-> {link prefix} -> (gw, oif) [scope local]
e905a9ed 516 |
1da177e4
LT
517 |-> {local prefix} (terminal node)
518 */
519
4e902c57
TG
520static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
521 struct fib_nh *nh)
1da177e4
LT
522{
523 int err;
86167a37 524 struct net *net;
1da177e4 525
86167a37 526 net = cfg->fc_nlinfo.nl_net;
1da177e4
LT
527 if (nh->nh_gw) {
528 struct fib_result res;
529
530#ifdef CONFIG_IP_ROUTE_PERVASIVE
531 if (nh->nh_flags&RTNH_F_PERVASIVE)
532 return 0;
533#endif
534 if (nh->nh_flags&RTNH_F_ONLINK) {
535 struct net_device *dev;
536
4e902c57 537 if (cfg->fc_scope >= RT_SCOPE_LINK)
1da177e4 538 return -EINVAL;
86167a37 539 if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
1da177e4 540 return -EINVAL;
86167a37 541 if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL)
1da177e4
LT
542 return -ENODEV;
543 if (!(dev->flags&IFF_UP))
544 return -ENETDOWN;
545 nh->nh_dev = dev;
546 dev_hold(dev);
547 nh->nh_scope = RT_SCOPE_LINK;
548 return 0;
549 }
550 {
4e902c57
TG
551 struct flowi fl = {
552 .nl_u = {
553 .ip4_u = {
554 .daddr = nh->nh_gw,
555 .scope = cfg->fc_scope + 1,
556 },
557 },
558 .oif = nh->nh_oif,
559 };
1da177e4
LT
560
561 /* It is not necessary, but requires a bit of thinking */
562 if (fl.fl4_scope < RT_SCOPE_LINK)
563 fl.fl4_scope = RT_SCOPE_LINK;
86167a37 564 if ((err = fib_lookup(net, &fl, &res)) != 0)
1da177e4
LT
565 return err;
566 }
567 err = -EINVAL;
568 if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
569 goto out;
570 nh->nh_scope = res.scope;
571 nh->nh_oif = FIB_RES_OIF(res);
572 if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
573 goto out;
574 dev_hold(nh->nh_dev);
575 err = -ENETDOWN;
576 if (!(nh->nh_dev->flags & IFF_UP))
577 goto out;
578 err = 0;
579out:
580 fib_res_put(&res);
581 return err;
582 } else {
583 struct in_device *in_dev;
584
585 if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
586 return -EINVAL;
587
86167a37 588 in_dev = inetdev_by_index(net, nh->nh_oif);
1da177e4
LT
589 if (in_dev == NULL)
590 return -ENODEV;
591 if (!(in_dev->dev->flags&IFF_UP)) {
592 in_dev_put(in_dev);
593 return -ENETDOWN;
594 }
595 nh->nh_dev = in_dev->dev;
596 dev_hold(nh->nh_dev);
597 nh->nh_scope = RT_SCOPE_HOST;
598 in_dev_put(in_dev);
599 }
600 return 0;
601}
602
81f7bf6c 603static inline unsigned int fib_laddr_hashfn(__be32 val)
1da177e4
LT
604{
605 unsigned int mask = (fib_hash_size - 1);
606
81f7bf6c 607 return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask;
1da177e4
LT
608}
609
610static struct hlist_head *fib_hash_alloc(int bytes)
611{
612 if (bytes <= PAGE_SIZE)
88f83491 613 return kzalloc(bytes, GFP_KERNEL);
1da177e4
LT
614 else
615 return (struct hlist_head *)
88f83491 616 __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes));
1da177e4
LT
617}
618
619static void fib_hash_free(struct hlist_head *hash, int bytes)
620{
621 if (!hash)
622 return;
623
624 if (bytes <= PAGE_SIZE)
625 kfree(hash);
626 else
627 free_pages((unsigned long) hash, get_order(bytes));
628}
629
630static void fib_hash_move(struct hlist_head *new_info_hash,
631 struct hlist_head *new_laddrhash,
632 unsigned int new_size)
633{
b7656e7f 634 struct hlist_head *old_info_hash, *old_laddrhash;
1da177e4 635 unsigned int old_size = fib_hash_size;
b7656e7f 636 unsigned int i, bytes;
1da177e4 637
832b4c5e 638 spin_lock_bh(&fib_info_lock);
b7656e7f
DM
639 old_info_hash = fib_info_hash;
640 old_laddrhash = fib_info_laddrhash;
1da177e4
LT
641 fib_hash_size = new_size;
642
643 for (i = 0; i < old_size; i++) {
644 struct hlist_head *head = &fib_info_hash[i];
645 struct hlist_node *node, *n;
646 struct fib_info *fi;
647
648 hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
649 struct hlist_head *dest;
650 unsigned int new_hash;
651
652 hlist_del(&fi->fib_hash);
653
654 new_hash = fib_info_hashfn(fi);
655 dest = &new_info_hash[new_hash];
656 hlist_add_head(&fi->fib_hash, dest);
657 }
658 }
659 fib_info_hash = new_info_hash;
660
661 for (i = 0; i < old_size; i++) {
662 struct hlist_head *lhead = &fib_info_laddrhash[i];
663 struct hlist_node *node, *n;
664 struct fib_info *fi;
665
666 hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
667 struct hlist_head *ldest;
668 unsigned int new_hash;
669
670 hlist_del(&fi->fib_lhash);
671
672 new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
673 ldest = &new_laddrhash[new_hash];
674 hlist_add_head(&fi->fib_lhash, ldest);
675 }
676 }
677 fib_info_laddrhash = new_laddrhash;
678
832b4c5e 679 spin_unlock_bh(&fib_info_lock);
b7656e7f
DM
680
681 bytes = old_size * sizeof(struct hlist_head *);
682 fib_hash_free(old_info_hash, bytes);
683 fib_hash_free(old_laddrhash, bytes);
1da177e4
LT
684}
685
4e902c57 686struct fib_info *fib_create_info(struct fib_config *cfg)
1da177e4
LT
687{
688 int err;
689 struct fib_info *fi = NULL;
690 struct fib_info *ofi;
1da177e4 691 int nhs = 1;
7462bd74 692 struct net *net = cfg->fc_nlinfo.nl_net;
1da177e4
LT
693
694 /* Fast check to catch the most weird cases */
4e902c57 695 if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
1da177e4
LT
696 goto err_inval;
697
698#ifdef CONFIG_IP_ROUTE_MULTIPATH
4e902c57
TG
699 if (cfg->fc_mp) {
700 nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
1da177e4
LT
701 if (nhs == 0)
702 goto err_inval;
703 }
704#endif
1da177e4
LT
705
706 err = -ENOBUFS;
707 if (fib_info_cnt >= fib_hash_size) {
708 unsigned int new_size = fib_hash_size << 1;
709 struct hlist_head *new_info_hash;
710 struct hlist_head *new_laddrhash;
711 unsigned int bytes;
712
713 if (!new_size)
714 new_size = 1;
715 bytes = new_size * sizeof(struct hlist_head *);
716 new_info_hash = fib_hash_alloc(bytes);
717 new_laddrhash = fib_hash_alloc(bytes);
718 if (!new_info_hash || !new_laddrhash) {
719 fib_hash_free(new_info_hash, bytes);
720 fib_hash_free(new_laddrhash, bytes);
88f83491 721 } else
1da177e4 722 fib_hash_move(new_info_hash, new_laddrhash, new_size);
1da177e4
LT
723
724 if (!fib_hash_size)
725 goto failure;
726 }
727
0da974f4 728 fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
1da177e4
LT
729 if (fi == NULL)
730 goto failure;
731 fib_info_cnt++;
1da177e4 732
57d7a600 733 fi->fib_net = hold_net(net);
4e902c57
TG
734 fi->fib_protocol = cfg->fc_protocol;
735 fi->fib_flags = cfg->fc_flags;
736 fi->fib_priority = cfg->fc_priority;
737 fi->fib_prefsrc = cfg->fc_prefsrc;
1da177e4
LT
738
739 fi->fib_nhs = nhs;
740 change_nexthops(fi) {
741 nh->nh_parent = fi;
742 } endfor_nexthops(fi)
743
4e902c57
TG
744 if (cfg->fc_mx) {
745 struct nlattr *nla;
746 int remaining;
747
748 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
8f4c1f9b 749 int type = nla_type(nla);
4e902c57
TG
750
751 if (type) {
752 if (type > RTAX_MAX)
1da177e4 753 goto err_inval;
4e902c57 754 fi->fib_metrics[type - 1] = nla_get_u32(nla);
1da177e4 755 }
1da177e4
LT
756 }
757 }
1da177e4 758
4e902c57 759 if (cfg->fc_mp) {
1da177e4 760#ifdef CONFIG_IP_ROUTE_MULTIPATH
4e902c57
TG
761 err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
762 if (err != 0)
1da177e4 763 goto failure;
4e902c57 764 if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
1da177e4 765 goto err_inval;
4e902c57 766 if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
1da177e4
LT
767 goto err_inval;
768#ifdef CONFIG_NET_CLS_ROUTE
4e902c57 769 if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
1da177e4
LT
770 goto err_inval;
771#endif
772#else
773 goto err_inval;
774#endif
775 } else {
776 struct fib_nh *nh = fi->fib_nh;
4e902c57
TG
777
778 nh->nh_oif = cfg->fc_oif;
779 nh->nh_gw = cfg->fc_gw;
780 nh->nh_flags = cfg->fc_flags;
1da177e4 781#ifdef CONFIG_NET_CLS_ROUTE
4e902c57 782 nh->nh_tclassid = cfg->fc_flow;
1da177e4 783#endif
1da177e4
LT
784#ifdef CONFIG_IP_ROUTE_MULTIPATH
785 nh->nh_weight = 1;
786#endif
787 }
788
4e902c57
TG
789 if (fib_props[cfg->fc_type].error) {
790 if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
1da177e4
LT
791 goto err_inval;
792 goto link_it;
793 }
794
4e902c57 795 if (cfg->fc_scope > RT_SCOPE_HOST)
1da177e4
LT
796 goto err_inval;
797
4e902c57 798 if (cfg->fc_scope == RT_SCOPE_HOST) {
1da177e4
LT
799 struct fib_nh *nh = fi->fib_nh;
800
801 /* Local address is added. */
802 if (nhs != 1 || nh->nh_gw)
803 goto err_inval;
804 nh->nh_scope = RT_SCOPE_NOWHERE;
7462bd74 805 nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
1da177e4
LT
806 err = -ENODEV;
807 if (nh->nh_dev == NULL)
808 goto failure;
809 } else {
810 change_nexthops(fi) {
4e902c57 811 if ((err = fib_check_nh(cfg, fi, nh)) != 0)
1da177e4
LT
812 goto failure;
813 } endfor_nexthops(fi)
814 }
815
816 if (fi->fib_prefsrc) {
4e902c57
TG
817 if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
818 fi->fib_prefsrc != cfg->fc_dst)
7462bd74 819 if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
1da177e4
LT
820 goto err_inval;
821 }
822
823link_it:
824 if ((ofi = fib_find_info(fi)) != NULL) {
825 fi->fib_dead = 1;
826 free_fib_info(fi);
827 ofi->fib_treeref++;
828 return ofi;
829 }
830
831 fi->fib_treeref++;
832 atomic_inc(&fi->fib_clntref);
832b4c5e 833 spin_lock_bh(&fib_info_lock);
1da177e4
LT
834 hlist_add_head(&fi->fib_hash,
835 &fib_info_hash[fib_info_hashfn(fi)]);
836 if (fi->fib_prefsrc) {
837 struct hlist_head *head;
838
839 head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
840 hlist_add_head(&fi->fib_lhash, head);
841 }
842 change_nexthops(fi) {
843 struct hlist_head *head;
844 unsigned int hash;
845
846 if (!nh->nh_dev)
847 continue;
848 hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
849 head = &fib_info_devhash[hash];
850 hlist_add_head(&nh->nh_hash, head);
851 } endfor_nexthops(fi)
832b4c5e 852 spin_unlock_bh(&fib_info_lock);
1da177e4
LT
853 return fi;
854
855err_inval:
856 err = -EINVAL;
857
858failure:
e905a9ed 859 if (fi) {
1da177e4
LT
860 fi->fib_dead = 1;
861 free_fib_info(fi);
862 }
4e902c57
TG
863
864 return ERR_PTR(err);
1da177e4
LT
865}
866
e5b43760 867/* Note! fib_semantic_match intentionally uses RCU list functions. */
1da177e4 868int fib_semantic_match(struct list_head *head, const struct flowi *flp,
1ef1b8c8 869 struct fib_result *res, __be32 zone, __be32 mask,
1da177e4
LT
870 int prefixlen)
871{
872 struct fib_alias *fa;
873 int nh_sel = 0;
874
e5b43760 875 list_for_each_entry_rcu(fa, head, fa_list) {
1da177e4
LT
876 int err;
877
878 if (fa->fa_tos &&
879 fa->fa_tos != flp->fl4_tos)
880 continue;
881
882 if (fa->fa_scope < flp->fl4_scope)
883 continue;
884
885 fa->fa_state |= FA_S_ACCESSED;
886
887 err = fib_props[fa->fa_type].error;
888 if (err == 0) {
889 struct fib_info *fi = fa->fa_info;
890
891 if (fi->fib_flags & RTNH_F_DEAD)
892 continue;
893
894 switch (fa->fa_type) {
895 case RTN_UNICAST:
896 case RTN_LOCAL:
897 case RTN_BROADCAST:
898 case RTN_ANYCAST:
899 case RTN_MULTICAST:
900 for_nexthops(fi) {
901 if (nh->nh_flags&RTNH_F_DEAD)
902 continue;
903 if (!flp->oif || flp->oif == nh->nh_oif)
904 break;
905 }
906#ifdef CONFIG_IP_ROUTE_MULTIPATH
907 if (nhsel < fi->fib_nhs) {
908 nh_sel = nhsel;
909 goto out_fill_res;
910 }
911#else
912 if (nhsel < 1) {
913 goto out_fill_res;
914 }
915#endif
916 endfor_nexthops(fi);
917 continue;
918
919 default:
a6db9010
SH
920 printk(KERN_WARNING "fib_semantic_match bad type %#x\n",
921 fa->fa_type);
1da177e4 922 return -EINVAL;
3ff50b79 923 }
1da177e4
LT
924 }
925 return err;
926 }
927 return 1;
928
929out_fill_res:
930 res->prefixlen = prefixlen;
931 res->nh_sel = nh_sel;
932 res->type = fa->fa_type;
933 res->scope = fa->fa_scope;
934 res->fi = fa->fa_info;
1da177e4
LT
935 atomic_inc(&res->fi->fib_clntref);
936 return 0;
937}
938
939/* Find appropriate source address to this destination */
940
b83738ae 941__be32 __fib_res_prefsrc(struct fib_result *res)
1da177e4
LT
942{
943 return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
944}
945
be403ea1 946int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
81f7bf6c 947 u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos,
be403ea1 948 struct fib_info *fi, unsigned int flags)
1da177e4 949{
be403ea1 950 struct nlmsghdr *nlh;
1da177e4 951 struct rtmsg *rtm;
1da177e4 952
be403ea1
TG
953 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
954 if (nlh == NULL)
26932566 955 return -EMSGSIZE;
be403ea1
TG
956
957 rtm = nlmsg_data(nlh);
1da177e4
LT
958 rtm->rtm_family = AF_INET;
959 rtm->rtm_dst_len = dst_len;
960 rtm->rtm_src_len = 0;
961 rtm->rtm_tos = tos;
709772e6
KPO
962 if (tb_id < 256)
963 rtm->rtm_table = tb_id;
964 else
965 rtm->rtm_table = RT_TABLE_COMPAT;
be403ea1 966 NLA_PUT_U32(skb, RTA_TABLE, tb_id);
1da177e4
LT
967 rtm->rtm_type = type;
968 rtm->rtm_flags = fi->fib_flags;
969 rtm->rtm_scope = scope;
1da177e4 970 rtm->rtm_protocol = fi->fib_protocol;
be403ea1
TG
971
972 if (rtm->rtm_dst_len)
17fb2c64 973 NLA_PUT_BE32(skb, RTA_DST, dst);
be403ea1 974
1da177e4 975 if (fi->fib_priority)
be403ea1
TG
976 NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);
977
1da177e4 978 if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
be403ea1
TG
979 goto nla_put_failure;
980
1da177e4 981 if (fi->fib_prefsrc)
17fb2c64 982 NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);
be403ea1 983
1da177e4
LT
984 if (fi->fib_nhs == 1) {
985 if (fi->fib_nh->nh_gw)
17fb2c64 986 NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);
be403ea1 987
1da177e4 988 if (fi->fib_nh->nh_oif)
be403ea1 989 NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
8265abc0
PM
990#ifdef CONFIG_NET_CLS_ROUTE
991 if (fi->fib_nh[0].nh_tclassid)
be403ea1 992 NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
8265abc0 993#endif
1da177e4
LT
994 }
995#ifdef CONFIG_IP_ROUTE_MULTIPATH
996 if (fi->fib_nhs > 1) {
be403ea1
TG
997 struct rtnexthop *rtnh;
998 struct nlattr *mp;
999
1000 mp = nla_nest_start(skb, RTA_MULTIPATH);
1001 if (mp == NULL)
1002 goto nla_put_failure;
1da177e4
LT
1003
1004 for_nexthops(fi) {
be403ea1
TG
1005 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
1006 if (rtnh == NULL)
1007 goto nla_put_failure;
1008
1009 rtnh->rtnh_flags = nh->nh_flags & 0xFF;
1010 rtnh->rtnh_hops = nh->nh_weight - 1;
1011 rtnh->rtnh_ifindex = nh->nh_oif;
1012
1da177e4 1013 if (nh->nh_gw)
17fb2c64 1014 NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
8265abc0
PM
1015#ifdef CONFIG_NET_CLS_ROUTE
1016 if (nh->nh_tclassid)
be403ea1 1017 NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
8265abc0 1018#endif
be403ea1
TG
1019 /* length of rtnetlink header + attributes */
1020 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
1da177e4 1021 } endfor_nexthops(fi);
be403ea1
TG
1022
1023 nla_nest_end(skb, mp);
1da177e4
LT
1024 }
1025#endif
be403ea1 1026 return nlmsg_end(skb, nlh);
1da177e4 1027
be403ea1 1028nla_put_failure:
26932566
PM
1029 nlmsg_cancel(skb, nlh);
1030 return -EMSGSIZE;
1da177e4
LT
1031}
1032
1da177e4
LT
1033/*
1034 Update FIB if:
1035 - local address disappeared -> we must delete all the entries
1036 referring to it.
1037 - device went down -> we must shutdown all nexthops going via it.
1038 */
4814bdbd 1039int fib_sync_down_addr(struct net *net, __be32 local)
1da177e4
LT
1040{
1041 int ret = 0;
85326fa5
DL
1042 unsigned int hash = fib_laddr_hashfn(local);
1043 struct hlist_head *head = &fib_info_laddrhash[hash];
1044 struct hlist_node *node;
1045 struct fib_info *fi;
1da177e4 1046
85326fa5
DL
1047 if (fib_info_laddrhash == NULL || local == 0)
1048 return 0;
1da177e4 1049
85326fa5 1050 hlist_for_each_entry(fi, node, head, fib_lhash) {
4814bdbd
DL
1051 if (fi->fib_net != net)
1052 continue;
85326fa5
DL
1053 if (fi->fib_prefsrc == local) {
1054 fi->fib_flags |= RTNH_F_DEAD;
1055 ret++;
1da177e4
LT
1056 }
1057 }
85326fa5
DL
1058 return ret;
1059}
1060
1061int fib_sync_down_dev(struct net_device *dev, int force)
1062{
1063 int ret = 0;
1064 int scope = RT_SCOPE_NOWHERE;
1065 struct fib_info *prev_fi = NULL;
1066 unsigned int hash = fib_devindex_hashfn(dev->ifindex);
1067 struct hlist_head *head = &fib_info_devhash[hash];
1068 struct hlist_node *node;
1069 struct fib_nh *nh;
1da177e4 1070
85326fa5
DL
1071 if (force)
1072 scope = -1;
1da177e4 1073
85326fa5
DL
1074 hlist_for_each_entry(nh, node, head, nh_hash) {
1075 struct fib_info *fi = nh->nh_parent;
1076 int dead;
1da177e4 1077
85326fa5
DL
1078 BUG_ON(!fi->fib_nhs);
1079 if (nh->nh_dev != dev || fi == prev_fi)
1080 continue;
1081 prev_fi = fi;
1082 dead = 0;
1083 change_nexthops(fi) {
1084 if (nh->nh_flags&RTNH_F_DEAD)
1085 dead++;
1086 else if (nh->nh_dev == dev &&
1087 nh->nh_scope != scope) {
1088 nh->nh_flags |= RTNH_F_DEAD;
1da177e4 1089#ifdef CONFIG_IP_ROUTE_MULTIPATH
85326fa5
DL
1090 spin_lock_bh(&fib_multipath_lock);
1091 fi->fib_power -= nh->nh_power;
1092 nh->nh_power = 0;
1093 spin_unlock_bh(&fib_multipath_lock);
1da177e4 1094#endif
85326fa5
DL
1095 dead++;
1096 }
1da177e4 1097#ifdef CONFIG_IP_ROUTE_MULTIPATH
85326fa5
DL
1098 if (force > 1 && nh->nh_dev == dev) {
1099 dead = fi->fib_nhs;
1100 break;
1da177e4 1101 }
85326fa5
DL
1102#endif
1103 } endfor_nexthops(fi)
1104 if (dead == fi->fib_nhs) {
1105 fi->fib_flags |= RTNH_F_DEAD;
1106 ret++;
1da177e4
LT
1107 }
1108 }
1109
1110 return ret;
1111}
1112
1113#ifdef CONFIG_IP_ROUTE_MULTIPATH
1114
1115/*
1116 Dead device goes up. We wake up dead nexthops.
1117 It takes sense only on multipath routes.
1118 */
1119
1120int fib_sync_up(struct net_device *dev)
1121{
1122 struct fib_info *prev_fi;
1123 unsigned int hash;
1124 struct hlist_head *head;
1125 struct hlist_node *node;
1126 struct fib_nh *nh;
1127 int ret;
1128
1129 if (!(dev->flags&IFF_UP))
1130 return 0;
1131
1132 prev_fi = NULL;
1133 hash = fib_devindex_hashfn(dev->ifindex);
1134 head = &fib_info_devhash[hash];
1135 ret = 0;
1136
1137 hlist_for_each_entry(nh, node, head, nh_hash) {
1138 struct fib_info *fi = nh->nh_parent;
1139 int alive;
1140
1141 BUG_ON(!fi->fib_nhs);
1142 if (nh->nh_dev != dev || fi == prev_fi)
1143 continue;
1144
1145 prev_fi = fi;
1146 alive = 0;
1147 change_nexthops(fi) {
1148 if (!(nh->nh_flags&RTNH_F_DEAD)) {
1149 alive++;
1150 continue;
1151 }
1152 if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
1153 continue;
e5ed6399 1154 if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev))
1da177e4
LT
1155 continue;
1156 alive++;
1157 spin_lock_bh(&fib_multipath_lock);
1158 nh->nh_power = 0;
1159 nh->nh_flags &= ~RTNH_F_DEAD;
1160 spin_unlock_bh(&fib_multipath_lock);
1161 } endfor_nexthops(fi)
1162
1163 if (alive > 0) {
1164 fi->fib_flags &= ~RTNH_F_DEAD;
1165 ret++;
1166 }
1167 }
1168
1169 return ret;
1170}
1171
1172/*
1173 The algorithm is suboptimal, but it provides really
1174 fair weighted route distribution.
1175 */
1176
1177void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1178{
1179 struct fib_info *fi = res->fi;
1180 int w;
1181
1182 spin_lock_bh(&fib_multipath_lock);
1183 if (fi->fib_power <= 0) {
1184 int power = 0;
1185 change_nexthops(fi) {
1186 if (!(nh->nh_flags&RTNH_F_DEAD)) {
1187 power += nh->nh_weight;
1188 nh->nh_power = nh->nh_weight;
1189 }
1190 } endfor_nexthops(fi);
1191 fi->fib_power = power;
1192 if (power <= 0) {
1193 spin_unlock_bh(&fib_multipath_lock);
1194 /* Race condition: route has just become dead. */
1195 res->nh_sel = 0;
1196 return;
1197 }
1198 }
1199
1200
1201 /* w should be random number [0..fi->fib_power-1],
1202 it is pretty bad approximation.
1203 */
1204
1205 w = jiffies % fi->fib_power;
1206
1207 change_nexthops(fi) {
1208 if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
1209 if ((w -= nh->nh_power) <= 0) {
1210 nh->nh_power--;
1211 fi->fib_power--;
1212 res->nh_sel = nhsel;
1213 spin_unlock_bh(&fib_multipath_lock);
1214 return;
1215 }
1216 }
1217 } endfor_nexthops(fi);
1218
1219 /* Race condition: route has just become dead. */
1220 res->nh_sel = 0;
1221 spin_unlock_bh(&fib_multipath_lock);
1222}
1223#endif