// SPDX-License-Identifier: GPL-2.0
/* Generic nexthop implementation
 *
 * Copyright (c) 2017-19 Cumulus Networks
 * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com>
 */

#include <linux/nexthop.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
#include <net/arp.h>
#include <net/ipv6_stubs.h>
#include <net/lwtunnel.h>
#include <net/ndisc.h>
#include <net/nexthop.h>
#include <net/route.h>
#include <net/sock.h>

#define NH_RES_DEFAULT_IDLE_TIMER	(120 * HZ)
#define NH_RES_DEFAULT_UNBALANCED_TIMER	0	/* No forced rebalancing. */

static void remove_nexthop(struct net *net, struct nexthop *nh,
			   struct nl_info *nlinfo);

#define NH_DEV_HASHBITS  8
#define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS)
static const struct nla_policy rtm_nh_policy_new[] = {
	[NHA_ID]		= { .type = NLA_U32 },
	[NHA_GROUP]		= { .type = NLA_BINARY },
	[NHA_GROUP_TYPE]	= { .type = NLA_U16 },
	[NHA_BLACKHOLE]		= { .type = NLA_FLAG },
	[NHA_OIF]		= { .type = NLA_U32 },
	[NHA_GATEWAY]		= { .type = NLA_BINARY },
	[NHA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[NHA_ENCAP]		= { .type = NLA_NESTED },
	[NHA_FDB]		= { .type = NLA_FLAG },
	[NHA_RES_GROUP]		= { .type = NLA_NESTED },
};

static const struct nla_policy rtm_nh_policy_get[] = {
	[NHA_ID]		= { .type = NLA_U32 },
};

static const struct nla_policy rtm_nh_policy_dump[] = {
	[NHA_OIF]		= { .type = NLA_U32 },
	[NHA_GROUPS]		= { .type = NLA_FLAG },
	[NHA_MASTER]		= { .type = NLA_U32 },
	[NHA_FDB]		= { .type = NLA_FLAG },
};

static const struct nla_policy rtm_nh_res_policy_new[] = {
	[NHA_RES_GROUP_BUCKETS]			= { .type = NLA_U16 },
	[NHA_RES_GROUP_IDLE_TIMER]		= { .type = NLA_U32 },
	[NHA_RES_GROUP_UNBALANCED_TIMER]	= { .type = NLA_U32 },
};

static const struct nla_policy rtm_nh_policy_dump_bucket[] = {
	[NHA_ID]		= { .type = NLA_U32 },
	[NHA_OIF]		= { .type = NLA_U32 },
	[NHA_MASTER]		= { .type = NLA_U32 },
	[NHA_RES_BUCKET]	= { .type = NLA_NESTED },
};

static const struct nla_policy rtm_nh_res_bucket_policy_dump[] = {
	[NHA_RES_BUCKET_NH_ID]	= { .type = NLA_U32 },
};

static const struct nla_policy rtm_nh_policy_get_bucket[] = {
	[NHA_ID]		= { .type = NLA_U32 },
	[NHA_RES_BUCKET]	= { .type = NLA_NESTED },
};

static const struct nla_policy rtm_nh_res_bucket_policy_get[] = {
	[NHA_RES_BUCKET_INDEX]	= { .type = NLA_U16 },
};
static bool nexthop_notifiers_is_empty(struct net *net)
{
	return !net->nexthop.notifier_chain.head;
}
static void
__nh_notifier_single_info_init(struct nh_notifier_single_info *nh_info,
			       const struct nh_info *nhi)
{
	nh_info->dev = nhi->fib_nhc.nhc_dev;
	nh_info->gw_family = nhi->fib_nhc.nhc_gw_family;
	if (nh_info->gw_family == AF_INET)
		nh_info->ipv4 = nhi->fib_nhc.nhc_gw.ipv4;
	else if (nh_info->gw_family == AF_INET6)
		nh_info->ipv6 = nhi->fib_nhc.nhc_gw.ipv6;

	nh_info->is_reject = nhi->reject_nh;
	nh_info->is_fdb = nhi->fdb_nh;
	nh_info->has_encap = !!nhi->fib_nhc.nhc_lwtstate;
}

static int nh_notifier_single_info_init(struct nh_notifier_info *info,
					const struct nexthop *nh)
{
	struct nh_info *nhi = rtnl_dereference(nh->nh_info);

	info->type = NH_NOTIFIER_INFO_TYPE_SINGLE;
	info->nh = kzalloc(sizeof(*info->nh), GFP_KERNEL);
	if (!info->nh)
		return -ENOMEM;

	__nh_notifier_single_info_init(info->nh, nhi);

	return 0;
}
static void nh_notifier_single_info_fini(struct nh_notifier_info *info)
{
	kfree(info->nh);
}

static int nh_notifier_mpath_info_init(struct nh_notifier_info *info,
				       struct nh_group *nhg)
{
	u16 num_nh = nhg->num_nh;
	int i;

	info->type = NH_NOTIFIER_INFO_TYPE_GRP;
	info->nh_grp = kzalloc(struct_size(info->nh_grp, nh_entries, num_nh),
			       GFP_KERNEL);
	if (!info->nh_grp)
		return -ENOMEM;

	info->nh_grp->num_nh = num_nh;
	info->nh_grp->is_fdb = nhg->fdb_nh;

	for (i = 0; i < num_nh; i++) {
		struct nh_grp_entry *nhge = &nhg->nh_entries[i];
		struct nh_info *nhi;

		nhi = rtnl_dereference(nhge->nh->nh_info);
		info->nh_grp->nh_entries[i].id = nhge->nh->id;
		info->nh_grp->nh_entries[i].weight = nhge->weight;
		__nh_notifier_single_info_init(&info->nh_grp->nh_entries[i].nh,
					       nhi);
	}

	return 0;
}
static int nh_notifier_res_table_info_init(struct nh_notifier_info *info,
					   struct nh_group *nhg)
{
	struct nh_res_table *res_table = rtnl_dereference(nhg->res_table);
	u16 num_nh_buckets = res_table->num_nh_buckets;
	unsigned long size;
	u16 i;

	info->type = NH_NOTIFIER_INFO_TYPE_RES_TABLE;
	size = struct_size(info->nh_res_table, nhs, num_nh_buckets);
	info->nh_res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO |
				       __GFP_NOWARN);
	if (!info->nh_res_table)
		return -ENOMEM;

	info->nh_res_table->num_nh_buckets = num_nh_buckets;

	for (i = 0; i < num_nh_buckets; i++) {
		struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
		struct nh_grp_entry *nhge;
		struct nh_info *nhi;

		nhge = rtnl_dereference(bucket->nh_entry);
		nhi = rtnl_dereference(nhge->nh->nh_info);
		__nh_notifier_single_info_init(&info->nh_res_table->nhs[i],
					       nhi);
	}

	return 0;
}
static int nh_notifier_grp_info_init(struct nh_notifier_info *info,
				     const struct nexthop *nh)
{
	struct nh_group *nhg = rtnl_dereference(nh->nh_grp);

	if (nhg->hash_threshold)
		return nh_notifier_mpath_info_init(info, nhg);
	else if (nhg->resilient)
		return nh_notifier_res_table_info_init(info, nhg);
	return -EINVAL;
}

static void nh_notifier_grp_info_fini(struct nh_notifier_info *info,
				      const struct nexthop *nh)
{
	struct nh_group *nhg = rtnl_dereference(nh->nh_grp);

	if (nhg->hash_threshold)
		kfree(info->nh_grp);
	else if (nhg->resilient)
		vfree(info->nh_res_table);
}

static int nh_notifier_info_init(struct nh_notifier_info *info,
				 const struct nexthop *nh)
{
	info->id = nh->id;

	if (nh->is_group)
		return nh_notifier_grp_info_init(info, nh);
	else
		return nh_notifier_single_info_init(info, nh);
}

static void nh_notifier_info_fini(struct nh_notifier_info *info,
				  const struct nexthop *nh)
{
	if (nh->is_group)
		nh_notifier_grp_info_fini(info, nh);
	else
		nh_notifier_single_info_fini(info);
}
static int call_nexthop_notifiers(struct net *net,
				  enum nexthop_event_type event_type,
				  struct nexthop *nh,
				  struct netlink_ext_ack *extack)
{
	struct nh_notifier_info info = {
		.net = net,
		.extack = extack,
	};
	int err;

	ASSERT_RTNL();

	if (nexthop_notifiers_is_empty(net))
		return 0;

	err = nh_notifier_info_init(&info, nh);
	if (err) {
		NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info");
		return err;
	}

	err = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
					   event_type, &info);
	nh_notifier_info_fini(&info, nh);

	return notifier_to_errno(err);
}
static int
nh_notifier_res_bucket_idle_timer_get(const struct nh_notifier_info *info,
				      bool force, unsigned int *p_idle_timer_ms)
{
	struct nh_res_table *res_table;
	struct nh_group *nhg;
	struct nexthop *nh;
	int err = 0;

	/* When 'force' is false, nexthop bucket replacement is performed
	 * because the bucket was deemed to be idle. In this case, capable
	 * listeners can choose to perform an atomic replacement: The bucket is
	 * only replaced if it is inactive. However, if the idle timer interval
	 * is smaller than the interval in which a listener is querying
	 * buckets' activity from the device, then atomic replacement should
	 * not be tried. Pass the idle timer value to listeners, so that they
	 * could determine which type of replacement to perform.
	 */
	if (force) {
		*p_idle_timer_ms = 0;
		return 0;
	}

	rcu_read_lock();

	nh = nexthop_find_by_id(info->net, info->id);
	if (!nh) {
		err = -EINVAL;
		goto out;
	}

	nhg = rcu_dereference(nh->nh_grp);
	res_table = rcu_dereference(nhg->res_table);
	*p_idle_timer_ms = jiffies_to_msecs(res_table->idle_timer);

out:
	rcu_read_unlock();

	return err;
}

static int nh_notifier_res_bucket_info_init(struct nh_notifier_info *info,
					    u16 bucket_index, bool force,
					    struct nh_info *oldi,
					    struct nh_info *newi)
{
	unsigned int idle_timer_ms;
	int err;

	err = nh_notifier_res_bucket_idle_timer_get(info, force,
						    &idle_timer_ms);
	if (err)
		return err;

	info->type = NH_NOTIFIER_INFO_TYPE_RES_BUCKET;
	info->nh_res_bucket = kzalloc(sizeof(*info->nh_res_bucket),
				      GFP_KERNEL);
	if (!info->nh_res_bucket)
		return -ENOMEM;

	info->nh_res_bucket->bucket_index = bucket_index;
	info->nh_res_bucket->idle_timer_ms = idle_timer_ms;
	info->nh_res_bucket->force = force;
	__nh_notifier_single_info_init(&info->nh_res_bucket->old_nh, oldi);
	__nh_notifier_single_info_init(&info->nh_res_bucket->new_nh, newi);
	return 0;
}

static void nh_notifier_res_bucket_info_fini(struct nh_notifier_info *info)
{
	kfree(info->nh_res_bucket);
}
static int __call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id,
					       u16 bucket_index, bool force,
					       struct nh_info *oldi,
					       struct nh_info *newi,
					       struct netlink_ext_ack *extack)
{
	struct nh_notifier_info info = {
		.net = net,
		.extack = extack,
		.id = nhg_id,
	};
	int err;

	if (nexthop_notifiers_is_empty(net))
		return 0;

	err = nh_notifier_res_bucket_info_init(&info, bucket_index, force,
					       oldi, newi);
	if (err)
		return err;

	err = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
					   NEXTHOP_EVENT_BUCKET_REPLACE, &info);
	nh_notifier_res_bucket_info_fini(&info);

	return notifier_to_errno(err);
}
/* There are three users of RES_TABLE, and NHs etc. referenced from there:
 *
 * 1) a collection of callbacks for NH maintenance. This operates under
 *    RTNL,
 * 2) the delayed work that gradually balances the resilient table,
 * 3) and nexthop_select_path(), operating under RCU.
 *
 * Both the delayed work and the RTNL block are writers, and need to
 * maintain mutual exclusion. Since there are only two and well-known
 * writers for each table, the RTNL code can make sure it has exclusive
 * access thus:
 *
 * - Have the DW operate without locking;
 * - synchronously cancel the DW;
 * - do the writing;
 * - if the write was not actually a delete, call upkeep, which schedules
 *   DW again if necessary.
 *
 * The functions that are always called from the RTNL context use
 * rtnl_dereference(). The functions that can also be called from the DW do
 * a raw dereference and rely on the above mutual exclusion scheme.
 */
#define nh_res_dereference(p) (rcu_dereference_raw(p))
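
/* Illustrative sketch of the RTNL-side write sequence implied by the
 * scheme above; replace_nexthop_grp_res() below follows exactly this
 * shape:
 *
 *	nh_res_table_cancel_upkeep(res_table);   (synchronously cancel DW)
 *	...do the writing...                     (now exclusive vs. the DW)
 *	nh_res_table_upkeep(res_table, ...);     (reschedules DW if needed)
 */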
static int call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id,
					     u16 bucket_index, bool force,
					     struct nexthop *old_nh,
					     struct nexthop *new_nh,
					     struct netlink_ext_ack *extack)
{
	struct nh_info *oldi = nh_res_dereference(old_nh->nh_info);
	struct nh_info *newi = nh_res_dereference(new_nh->nh_info);

	return __call_nexthop_res_bucket_notifiers(net, nhg_id, bucket_index,
						   force, oldi, newi, extack);
}
static int call_nexthop_res_table_notifiers(struct net *net, struct nexthop *nh,
					    struct netlink_ext_ack *extack)
{
	struct nh_notifier_info info = {
		.net = net,
		.extack = extack,
	};
	struct nh_group *nhg;
	int err;

	ASSERT_RTNL();

	if (nexthop_notifiers_is_empty(net))
		return 0;

	/* At this point, the nexthop buckets are still not populated. Only
	 * emit a notification with the logical nexthops, so that a listener
	 * could potentially veto it in case of unsupported configuration.
	 */
	nhg = rtnl_dereference(nh->nh_grp);
	err = nh_notifier_mpath_info_init(&info, nhg);
	if (err) {
		NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info");
		return err;
	}

	err = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
					   NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE,
					   &info);
	kfree(info.nh_grp);

	return notifier_to_errno(err);
}
static int call_nexthop_notifier(struct notifier_block *nb, struct net *net,
				 enum nexthop_event_type event_type,
				 struct nexthop *nh,
				 struct netlink_ext_ack *extack)
{
	struct nh_notifier_info info = {
		.net = net,
		.extack = extack,
	};
	int err;

	err = nh_notifier_info_init(&info, nh);
	if (err)
		return err;

	err = nb->notifier_call(nb, event_type, &info);
	nh_notifier_info_fini(&info, nh);

	return notifier_to_errno(err);
}
static unsigned int nh_dev_hashfn(unsigned int val)
{
	unsigned int mask = NH_DEV_HASHSIZE - 1;

	return (val ^
		(val >> NH_DEV_HASHBITS) ^
		(val >> (NH_DEV_HASHBITS * 2))) & mask;
}
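
/* Worked example (illustrative, not from the original source): with
 * NH_DEV_HASHBITS == 8 the mask is 0xff. For ifindex 300 (0x12c):
 * 0x12c ^ (0x12c >> 8) ^ (0x12c >> 16) = 0x12c ^ 0x1 ^ 0x0 = 0x12d,
 * and 0x12d & 0xff = 0x2d, i.e. device-hash bucket 45.
 */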
static void nexthop_devhash_add(struct net *net, struct nh_info *nhi)
{
	struct net_device *dev = nhi->fib_nhc.nhc_dev;
	struct hlist_head *head;
	unsigned int hash;

	WARN_ON(!dev);

	hash = nh_dev_hashfn(dev->ifindex);
	head = &net->nexthop.devhash[hash];
	hlist_add_head(&nhi->dev_hash, head);
}
static void nexthop_free_group(struct nexthop *nh)
{
	struct nh_group *nhg;
	int i;

	nhg = rcu_dereference_raw(nh->nh_grp);
	for (i = 0; i < nhg->num_nh; ++i) {
		struct nh_grp_entry *nhge = &nhg->nh_entries[i];

		WARN_ON(!list_empty(&nhge->nh_list));
		nexthop_put(nhge->nh);
	}

	WARN_ON(nhg->spare == nhg);

	if (nhg->resilient)
		vfree(rcu_dereference_raw(nhg->res_table));

	kfree(nhg->spare);
	kfree(nhg);
}

static void nexthop_free_single(struct nexthop *nh)
{
	struct nh_info *nhi;

	nhi = rcu_dereference_raw(nh->nh_info);
	switch (nhi->family) {
	case AF_INET:
		fib_nh_release(nh->net, &nhi->fib_nh);
		break;
	case AF_INET6:
		ipv6_stub->fib6_nh_release(&nhi->fib6_nh);
		break;
	}
	kfree(nhi);
}

void nexthop_free_rcu(struct rcu_head *head)
{
	struct nexthop *nh = container_of(head, struct nexthop, rcu);

	if (nh->is_group)
		nexthop_free_group(nh);
	else
		nexthop_free_single(nh);

	kfree(nh);
}
EXPORT_SYMBOL_GPL(nexthop_free_rcu);
static struct nexthop *nexthop_alloc(void)
{
	struct nexthop *nh;

	nh = kzalloc(sizeof(struct nexthop), GFP_KERNEL);
	if (nh) {
		INIT_LIST_HEAD(&nh->fi_list);
		INIT_LIST_HEAD(&nh->f6i_list);
		INIT_LIST_HEAD(&nh->grp_list);
		INIT_LIST_HEAD(&nh->fdb_list);
	}
	return nh;
}

static struct nh_group *nexthop_grp_alloc(u16 num_nh)
{
	struct nh_group *nhg;

	nhg = kzalloc(struct_size(nhg, nh_entries, num_nh), GFP_KERNEL);
	if (nhg)
		nhg->num_nh = num_nh;

	return nhg;
}

static void nh_res_table_upkeep_dw(struct work_struct *work);
static struct nh_res_table *
nexthop_res_table_alloc(struct net *net, u32 nhg_id, struct nh_config *cfg)
{
	const u16 num_nh_buckets = cfg->nh_grp_res_num_buckets;
	struct nh_res_table *res_table;
	unsigned long size;

	size = struct_size(res_table, nh_buckets, num_nh_buckets);
	res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN);
	if (!res_table)
		return NULL;

	res_table->net = net;
	res_table->nhg_id = nhg_id;
	INIT_DELAYED_WORK(&res_table->upkeep_dw, &nh_res_table_upkeep_dw);
	INIT_LIST_HEAD(&res_table->uw_nh_entries);
	res_table->idle_timer = cfg->nh_grp_res_idle_timer;
	res_table->unbalanced_timer = cfg->nh_grp_res_unbalanced_timer;
	res_table->num_nh_buckets = num_nh_buckets;
	return res_table;
}

static void nh_base_seq_inc(struct net *net)
{
	while (++net->nexthop.seq == 0)
		;
}
/* no reference taken; rcu lock or rtnl must be held */
struct nexthop *nexthop_find_by_id(struct net *net, u32 id)
{
	struct rb_node **pp, *parent = NULL, *next;

	pp = &net->nexthop.rb_root.rb_node;
	while (1) {
		struct nexthop *nh;

		next = rcu_dereference_raw(*pp);
		if (!next)
			break;
		parent = next;

		nh = rb_entry(parent, struct nexthop, rb_node);
		if (id < nh->id)
			pp = &next->rb_left;
		else if (id > nh->id)
			pp = &next->rb_right;
		else
			return nh;
	}
	return NULL;
}
EXPORT_SYMBOL_GPL(nexthop_find_by_id);
/* used for auto id allocation; called with rtnl held */
static u32 nh_find_unused_id(struct net *net)
{
	u32 id_start = net->nexthop.last_id_allocated;

	while (1) {
		net->nexthop.last_id_allocated++;
		if (net->nexthop.last_id_allocated == id_start)
			break;

		if (!nexthop_find_by_id(net, net->nexthop.last_id_allocated))
			return net->nexthop.last_id_allocated;
	}
	return 0;
}
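
/* Example (illustrative): if IDs 1..3 exist and last_id_allocated is 1,
 * the loop probes 2, 3, then 4 and returns 4. The 0 return (not a valid
 * nexthop ID) only happens once the counter wraps all the way back to
 * id_start, i.e. the ID space is exhausted.
 */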
static void nh_res_time_set_deadline(unsigned long next_time,
				     unsigned long *deadline)
{
	if (time_before(next_time, *deadline))
		*deadline = next_time;
}

static clock_t nh_res_table_unbalanced_time(struct nh_res_table *res_table)
{
	if (list_empty(&res_table->uw_nh_entries))
		return 0;
	return jiffies_delta_to_clock_t(jiffies - res_table->unbalanced_since);
}
static int nla_put_nh_group_res(struct sk_buff *skb, struct nh_group *nhg)
{
	struct nh_res_table *res_table = rtnl_dereference(nhg->res_table);
	struct nlattr *nest;

	nest = nla_nest_start(skb, NHA_RES_GROUP);
	if (!nest)
		return -EMSGSIZE;

	if (nla_put_u16(skb, NHA_RES_GROUP_BUCKETS,
			res_table->num_nh_buckets) ||
	    nla_put_u32(skb, NHA_RES_GROUP_IDLE_TIMER,
			jiffies_to_clock_t(res_table->idle_timer)) ||
	    nla_put_u32(skb, NHA_RES_GROUP_UNBALANCED_TIMER,
			jiffies_to_clock_t(res_table->unbalanced_timer)) ||
	    nla_put_u64_64bit(skb, NHA_RES_GROUP_UNBALANCED_TIME,
			      nh_res_table_unbalanced_time(res_table),
			      NHA_RES_GROUP_PAD))
		goto nla_put_failure;

	nla_nest_end(skb, nest);
	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -EMSGSIZE;
}
static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg)
{
	struct nexthop_grp *p;
	size_t len = nhg->num_nh * sizeof(*p);
	struct nlattr *nla;
	u16 group_type = 0;
	int i;

	if (nhg->hash_threshold)
		group_type = NEXTHOP_GRP_TYPE_MPATH;
	else if (nhg->resilient)
		group_type = NEXTHOP_GRP_TYPE_RES;

	if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type))
		goto nla_put_failure;

	nla = nla_reserve(skb, NHA_GROUP, len);
	if (!nla)
		goto nla_put_failure;

	p = nla_data(nla);
	for (i = 0; i < nhg->num_nh; ++i) {
		p->id = nhg->nh_entries[i].nh->id;
		p->weight = nhg->nh_entries[i].weight - 1;
		p += 1;
	}

	if (nhg->resilient && nla_put_nh_group_res(skb, nhg))
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
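
/* Note on the "- 1" above: userspace encodes a weight of N as N - 1 in
 * struct nexthop_grp (0..254 on the wire, see the check in
 * nh_check_attr_group()), and nexthop_create_group() stores it internally
 * as weight + 1. A wire value of 0 thus means one share: for example,
 * entries with wire weights 0 and 1 split traffic 1:2.
 */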
static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
			int event, u32 portid, u32 seq, unsigned int nlflags)
{
	struct fib6_nh *fib6_nh;
	struct fib_nh *fib_nh;
	struct nlmsghdr *nlh;
	struct nh_info *nhi;
	struct nhmsg *nhm;

	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags);
	if (!nlh)
		return -EMSGSIZE;

	nhm = nlmsg_data(nlh);
	nhm->nh_family = AF_UNSPEC;
	nhm->nh_flags = nh->nh_flags;
	nhm->nh_protocol = nh->protocol;
	nhm->nh_scope = 0;
	nhm->resvd = 0;

	if (nla_put_u32(skb, NHA_ID, nh->id))
		goto nla_put_failure;

	if (nh->is_group) {
		struct nh_group *nhg = rtnl_dereference(nh->nh_grp);

		if (nhg->fdb_nh && nla_put_flag(skb, NHA_FDB))
			goto nla_put_failure;
		if (nla_put_nh_group(skb, nhg))
			goto nla_put_failure;
		goto out;
	}

	nhi = rtnl_dereference(nh->nh_info);
	nhm->nh_family = nhi->family;
	if (nhi->reject_nh) {
		if (nla_put_flag(skb, NHA_BLACKHOLE))
			goto nla_put_failure;
		goto out;
	} else if (nhi->fdb_nh) {
		if (nla_put_flag(skb, NHA_FDB))
			goto nla_put_failure;
	} else {
		const struct net_device *dev;

		dev = nhi->fib_nhc.nhc_dev;
		if (dev && nla_put_u32(skb, NHA_OIF, dev->ifindex))
			goto nla_put_failure;
	}

	nhm->nh_scope = nhi->fib_nhc.nhc_scope;
	switch (nhi->family) {
	case AF_INET:
		fib_nh = &nhi->fib_nh;
		if (fib_nh->fib_nh_gw_family &&
		    nla_put_be32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4))
			goto nla_put_failure;
		break;

	case AF_INET6:
		fib6_nh = &nhi->fib6_nh;
		if (fib6_nh->fib_nh_gw_family &&
		    nla_put_in6_addr(skb, NHA_GATEWAY, &fib6_nh->fib_nh_gw6))
			goto nla_put_failure;
		break;
	}

	if (nhi->fib_nhc.nhc_lwtstate &&
	    lwtunnel_fill_encap(skb, nhi->fib_nhc.nhc_lwtstate,
				NHA_ENCAP, NHA_ENCAP_TYPE) < 0)
		goto nla_put_failure;

out:
	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
static size_t nh_nlmsg_size_grp_res(struct nh_group *nhg)
{
	return nla_total_size(0) +	/* NHA_RES_GROUP */
		nla_total_size(2) +	/* NHA_RES_GROUP_BUCKETS */
		nla_total_size(4) +	/* NHA_RES_GROUP_IDLE_TIMER */
		nla_total_size(4) +	/* NHA_RES_GROUP_UNBALANCED_TIMER */
		nla_total_size_64bit(8);/* NHA_RES_GROUP_UNBALANCED_TIME */
}

static size_t nh_nlmsg_size_grp(struct nexthop *nh)
{
	struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
	size_t sz = sizeof(struct nexthop_grp) * nhg->num_nh;
	size_t tot = nla_total_size(sz) +
		nla_total_size(2); /* NHA_GROUP_TYPE */

	if (nhg->resilient)
		tot += nh_nlmsg_size_grp_res(nhg);

	return tot;
}

static size_t nh_nlmsg_size_single(struct nexthop *nh)
{
	struct nh_info *nhi = rtnl_dereference(nh->nh_info);
	size_t sz;

	/* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE
	 * are mutually exclusive
	 */
	sz = nla_total_size(4);  /* NHA_OIF */

	switch (nhi->family) {
	case AF_INET:
		if (nhi->fib_nh.fib_nh_gw_family)
			sz += nla_total_size(4);  /* NHA_GATEWAY */
		break;

	case AF_INET6:
		/* NHA_GATEWAY */
		if (nhi->fib6_nh.fib_nh_gw_family)
			sz += nla_total_size(sizeof(const struct in6_addr));
		break;
	}

	if (nhi->fib_nhc.nhc_lwtstate) {
		sz += lwtunnel_get_encap_size(nhi->fib_nhc.nhc_lwtstate);
		sz += nla_total_size(2);  /* NHA_ENCAP_TYPE */
	}

	return sz;
}

static size_t nh_nlmsg_size(struct nexthop *nh)
{
	size_t sz = NLMSG_ALIGN(sizeof(struct nhmsg));

	sz += nla_total_size(4); /* NHA_ID */

	if (nh->is_group)
		sz += nh_nlmsg_size_grp(nh);
	else
		sz += nh_nlmsg_size_single(nh);

	return sz;
}
static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info)
{
	unsigned int nlflags = info->nlh ? info->nlh->nlmsg_flags : 0;
	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
	struct sk_buff *skb;
	int err = -ENOBUFS;

	skb = nlmsg_new(nh_nlmsg_size(nh), gfp_any());
	if (!skb)
		goto errout;

	err = nh_fill_node(skb, nh, event, info->portid, seq, nlflags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in nh_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}

	rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_NEXTHOP,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err);
}
static unsigned long nh_res_bucket_used_time(const struct nh_res_bucket *bucket)
{
	return (unsigned long)atomic_long_read(&bucket->used_time);
}

static unsigned long
nh_res_bucket_idle_point(const struct nh_res_table *res_table,
			 const struct nh_res_bucket *bucket,
			 unsigned long now)
{
	unsigned long time = nh_res_bucket_used_time(bucket);

	/* Bucket was not used since it was migrated. The idle time is now. */
	if (time == bucket->migrated_time)
		return now;

	return time + res_table->idle_timer;
}
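
/* Example (illustrative): with idle_timer == 120 * HZ, a bucket last used
 * at jiffies T becomes eligible for migration at T + 120 * HZ; a bucket
 * untouched since its last migration is treated as idle right away.
 */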
static unsigned long
nh_res_table_unb_point(const struct nh_res_table *res_table)
{
	return res_table->unbalanced_since + res_table->unbalanced_timer;
}

static void nh_res_bucket_set_idle(const struct nh_res_table *res_table,
				   struct nh_res_bucket *bucket)
{
	unsigned long now = jiffies;

	atomic_long_set(&bucket->used_time, (long)now);
	bucket->migrated_time = now;
}

static void nh_res_bucket_set_busy(struct nh_res_bucket *bucket)
{
	atomic_long_set(&bucket->used_time, (long)jiffies);
}

static clock_t nh_res_bucket_idle_time(const struct nh_res_bucket *bucket)
{
	unsigned long used_time = nh_res_bucket_used_time(bucket);

	return jiffies_delta_to_clock_t(jiffies - used_time);
}
static int nh_fill_res_bucket(struct sk_buff *skb, struct nexthop *nh,
			      struct nh_res_bucket *bucket, u16 bucket_index,
			      int event, u32 portid, u32 seq,
			      unsigned int nlflags,
			      struct netlink_ext_ack *extack)
{
	struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry);
	struct nlmsghdr *nlh;
	struct nlattr *nest;
	struct nhmsg *nhm;

	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags);
	if (!nlh)
		return -EMSGSIZE;

	nhm = nlmsg_data(nlh);
	nhm->nh_family = AF_UNSPEC;
	nhm->nh_flags = bucket->nh_flags;
	nhm->nh_protocol = nh->protocol;
	nhm->nh_scope = 0;
	nhm->resvd = 0;

	if (nla_put_u32(skb, NHA_ID, nh->id))
		goto nla_put_failure;

	nest = nla_nest_start(skb, NHA_RES_BUCKET);
	if (!nest)
		goto nla_put_failure;

	if (nla_put_u16(skb, NHA_RES_BUCKET_INDEX, bucket_index) ||
	    nla_put_u32(skb, NHA_RES_BUCKET_NH_ID, nhge->nh->id) ||
	    nla_put_u64_64bit(skb, NHA_RES_BUCKET_IDLE_TIME,
			      nh_res_bucket_idle_time(bucket),
			      NHA_RES_BUCKET_PAD))
		goto nla_put_failure_nest;

	nla_nest_end(skb, nest);
	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure_nest:
	nla_nest_cancel(skb, nest);
nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
static void nexthop_bucket_notify(struct nh_res_table *res_table,
				  u16 bucket_index)
{
	struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index];
	struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry);
	struct nexthop *nh = nhge->nh_parent;
	struct sk_buff *skb;
	int err = -ENOBUFS;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		goto errout;

	err = nh_fill_res_bucket(skb, nh, bucket, bucket_index,
				 RTM_NEWNEXTHOPBUCKET, 0, 0, NLM_F_REPLACE,
				 NULL);
	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	rtnl_notify(skb, nh->net, 0, RTNLGRP_NEXTHOP, NULL, GFP_KERNEL);
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(nh->net, RTNLGRP_NEXTHOP, err);
}
static bool valid_group_nh(struct nexthop *nh, unsigned int npaths,
			   bool *is_fdb, struct netlink_ext_ack *extack)
{
	if (nh->is_group) {
		struct nh_group *nhg = rtnl_dereference(nh->nh_grp);

		/* Nesting groups within groups is not supported. */
		if (nhg->hash_threshold) {
			NL_SET_ERR_MSG(extack,
				       "Hash-threshold group can not be a nexthop within a group");
			return false;
		}
		if (nhg->resilient) {
			NL_SET_ERR_MSG(extack,
				       "Resilient group can not be a nexthop within a group");
			return false;
		}
		*is_fdb = nhg->fdb_nh;
	} else {
		struct nh_info *nhi = rtnl_dereference(nh->nh_info);

		if (nhi->reject_nh && npaths > 1) {
			NL_SET_ERR_MSG(extack,
				       "Blackhole nexthop can not be used in a group with more than 1 path");
			return false;
		}
		*is_fdb = nhi->fdb_nh;
	}

	return true;
}
static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family,
				   struct netlink_ext_ack *extack)
{
	struct nh_info *nhi;

	nhi = rtnl_dereference(nh->nh_info);

	if (!nhi->fdb_nh) {
		NL_SET_ERR_MSG(extack, "FDB nexthop group can only have fdb nexthops");
		return -EINVAL;
	}

	if (*nh_family == AF_UNSPEC) {
		*nh_family = nhi->family;
	} else if (*nh_family != nhi->family) {
		NL_SET_ERR_MSG(extack, "FDB nexthop group cannot have mixed family nexthops");
		return -EINVAL;
	}

	return 0;
}
static int nh_check_attr_group(struct net *net,
			       struct nlattr *tb[], size_t tb_size,
			       u16 nh_grp_type, struct netlink_ext_ack *extack)
{
	unsigned int len = nla_len(tb[NHA_GROUP]);
	u8 nh_family = AF_UNSPEC;
	struct nexthop_grp *nhg;
	unsigned int i, j;
	u8 nhg_fdb = 0;

	if (!len || len & (sizeof(struct nexthop_grp) - 1)) {
		NL_SET_ERR_MSG(extack,
			       "Invalid length for nexthop group attribute");
		return -EINVAL;
	}

	/* convert len to number of nexthop ids */
	len /= sizeof(*nhg);

	nhg = nla_data(tb[NHA_GROUP]);
	for (i = 0; i < len; ++i) {
		if (nhg[i].resvd1 || nhg[i].resvd2) {
			NL_SET_ERR_MSG(extack, "Reserved fields in nexthop_grp must be 0");
			return -EINVAL;
		}
		if (nhg[i].weight > 254) {
			NL_SET_ERR_MSG(extack, "Invalid value for weight");
			return -EINVAL;
		}
		for (j = i + 1; j < len; ++j) {
			if (nhg[i].id == nhg[j].id) {
				NL_SET_ERR_MSG(extack, "Nexthop id can not be used twice in a group");
				return -EINVAL;
			}
		}
	}

	if (tb[NHA_FDB])
		nhg_fdb = 1;
	nhg = nla_data(tb[NHA_GROUP]);
	for (i = 0; i < len; ++i) {
		struct nexthop *nh;
		bool is_fdb_nh;

		nh = nexthop_find_by_id(net, nhg[i].id);
		if (!nh) {
			NL_SET_ERR_MSG(extack, "Invalid nexthop id");
			return -EINVAL;
		}
		if (!valid_group_nh(nh, len, &is_fdb_nh, extack))
			return -EINVAL;

		if (nhg_fdb && nh_check_attr_fdb_group(nh, &nh_family, extack))
			return -EINVAL;

		if (!nhg_fdb && is_fdb_nh) {
			NL_SET_ERR_MSG(extack, "Non FDB nexthop group cannot have fdb nexthops");
			return -EINVAL;
		}
	}
	for (i = NHA_GROUP_TYPE + 1; i < tb_size; ++i) {
		if (!tb[i])
			continue;
		switch (i) {
		case NHA_FDB:
			continue;
		case NHA_RES_GROUP:
			if (nh_grp_type == NEXTHOP_GRP_TYPE_RES)
				continue;
			break;
		}
		NL_SET_ERR_MSG(extack,
			       "No other attributes can be set in nexthop groups");
		return -EINVAL;
	}

	return 0;
}
static bool ipv6_good_nh(const struct fib6_nh *nh)
{
	int state = NUD_REACHABLE;
	struct neighbour *n;

	rcu_read_lock_bh();

	n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, &nh->fib_nh_gw6);
	if (n)
		state = n->nud_state;

	rcu_read_unlock_bh();

	return !!(state & NUD_VALID);
}

static bool ipv4_good_nh(const struct fib_nh *nh)
{
	int state = NUD_REACHABLE;
	struct neighbour *n;

	rcu_read_lock_bh();

	n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev,
				      (__force u32)nh->fib_nh_gw4);
	if (n)
		state = n->nud_state;

	rcu_read_unlock_bh();

	return !!(state & NUD_VALID);
}
static struct nexthop *nexthop_select_path_hthr(struct nh_group *nhg, int hash)
{
	struct nexthop *rc = NULL;
	int i;

	for (i = 0; i < nhg->num_nh; ++i) {
		struct nh_grp_entry *nhge = &nhg->nh_entries[i];
		struct nh_info *nhi;

		if (hash > atomic_read(&nhge->hthr.upper_bound))
			continue;

		nhi = rcu_dereference(nhge->nh->nh_info);
		if (nhi->fdb_nh)
			return nhge->nh;

		/* Nexthops always check that the neighbour is good and do
		 * not rely on a sysctl for this behavior.
		 */
		switch (nhi->family) {
		case AF_INET:
			if (ipv4_good_nh(&nhi->fib_nh))
				return nhge->nh;
			break;
		case AF_INET6:
			if (ipv6_good_nh(&nhi->fib6_nh))
				return nhge->nh;
			break;
		}

		if (!rc)
			rc = nhge->nh;
	}

	return rc;
}
static struct nexthop *nexthop_select_path_res(struct nh_group *nhg, int hash)
{
	struct nh_res_table *res_table = rcu_dereference(nhg->res_table);
	u16 bucket_index = hash % res_table->num_nh_buckets;
	struct nh_res_bucket *bucket;
	struct nh_grp_entry *nhge;

	/* nexthop_select_path() is expected to return a non-NULL value, so
	 * skip protocol validation and just hand out whatever there is.
	 */
	bucket = &res_table->nh_buckets[bucket_index];
	nh_res_bucket_set_busy(bucket);
	nhge = rcu_dereference(bucket->nh_entry);
	return nhge->nh;
}
struct nexthop *nexthop_select_path(struct nexthop *nh, int hash)
{
	struct nh_group *nhg;

	if (!nh->is_group)
		return nh;

	nhg = rcu_dereference(nh->nh_grp);
	if (nhg->hash_threshold)
		return nexthop_select_path_hthr(nhg, hash);
	else if (nhg->resilient)
		return nexthop_select_path_res(nhg, hash);

	/* Unreachable. */
	return NULL;
}
EXPORT_SYMBOL_GPL(nexthop_select_path);
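
/* Usage sketch (illustrative; exact call sites vary by kernel version):
 * callers sit in the IPv4/IPv6 FIB lookup path, run under
 * rcu_read_lock(), and pass a flow hash, roughly:
 *
 *	hash = fib_multipath_hash(net, fl4, skb, NULL);
 *	nh = nexthop_select_path(nh, hash);
 *
 * For hash-threshold groups the hash is compared against per-entry upper
 * bounds; for resilient groups it indexes into the bucket table.
 */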
int nexthop_for_each_fib6_nh(struct nexthop *nh,
			     int (*cb)(struct fib6_nh *nh, void *arg),
			     void *arg)
{
	struct nh_info *nhi;
	int err;

	if (nh->is_group) {
		struct nh_group *nhg;
		int i;

		nhg = rcu_dereference_rtnl(nh->nh_grp);
		for (i = 0; i < nhg->num_nh; i++) {
			struct nh_grp_entry *nhge = &nhg->nh_entries[i];

			nhi = rcu_dereference_rtnl(nhge->nh->nh_info);
			err = cb(&nhi->fib6_nh, arg);
			if (err)
				return err;
		}
	} else {
		nhi = rcu_dereference_rtnl(nh->nh_info);
		err = cb(&nhi->fib6_nh, arg);
		if (err)
			return err;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(nexthop_for_each_fib6_nh);
static int check_src_addr(const struct in6_addr *saddr,
			  struct netlink_ext_ack *extack)
{
	if (!ipv6_addr_any(saddr)) {
		NL_SET_ERR_MSG(extack, "IPv6 routes using source address can not use nexthop objects");
		return -EINVAL;
	}
	return 0;
}

int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
		       struct netlink_ext_ack *extack)
{
	struct nh_info *nhi;
	bool is_fdb_nh;

	/* fib6_src is unique to a fib6_info and limits the ability to cache
	 * routes in fib6_nh within a nexthop that is potentially shared
	 * across multiple fib entries. If the config wants to use source
	 * routing it can not use nexthop objects. mlxsw also does not allow
	 * fib6_src on routes.
	 */
	if (cfg && check_src_addr(&cfg->fc_src, extack) < 0)
		return -EINVAL;

	if (nh->is_group) {
		struct nh_group *nhg;

		nhg = rtnl_dereference(nh->nh_grp);
		if (nhg->has_v4)
			goto no_v4_nh;
		is_fdb_nh = nhg->fdb_nh;
	} else {
		nhi = rtnl_dereference(nh->nh_info);
		if (nhi->family == AF_INET)
			goto no_v4_nh;
		is_fdb_nh = nhi->fdb_nh;
	}

	if (is_fdb_nh) {
		NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
		return -EINVAL;
	}

	return 0;
no_v4_nh:
	NL_SET_ERR_MSG(extack, "IPv6 routes can not use an IPv4 nexthop");
	return -EINVAL;
}
EXPORT_SYMBOL_GPL(fib6_check_nexthop);
/* if existing nexthop has ipv6 routes linked to it, need
 * to verify this new spec works with ipv6
 */
static int fib6_check_nh_list(struct nexthop *old, struct nexthop *new,
			      struct netlink_ext_ack *extack)
{
	struct fib6_info *f6i;

	if (list_empty(&old->f6i_list))
		return 0;

	list_for_each_entry(f6i, &old->f6i_list, nh_list) {
		if (check_src_addr(&f6i->fib6_src.addr, extack) < 0)
			return -EINVAL;
	}

	return fib6_check_nexthop(new, NULL, extack);
}
static int nexthop_check_scope(struct nh_info *nhi, u8 scope,
			       struct netlink_ext_ack *extack)
{
	if (scope == RT_SCOPE_HOST && nhi->fib_nhc.nhc_gw_family) {
		NL_SET_ERR_MSG(extack,
			       "Route with host scope can not have a gateway");
		return -EINVAL;
	}

	if (nhi->fib_nhc.nhc_flags & RTNH_F_ONLINK && scope >= RT_SCOPE_LINK) {
		NL_SET_ERR_MSG(extack, "Scope mismatch with nexthop");
		return -EINVAL;
	}

	return 0;
}

/* Invoked by fib add code to verify nexthop by id is ok with
 * config for prefix; parts of fib_check_nh not done when nexthop
 * object is used.
 */
int fib_check_nexthop(struct nexthop *nh, u8 scope,
		      struct netlink_ext_ack *extack)
{
	struct nh_info *nhi;
	int err = 0;

	if (nh->is_group) {
		struct nh_group *nhg;

		nhg = rtnl_dereference(nh->nh_grp);
		if (nhg->fdb_nh) {
			NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
			err = -EINVAL;
			goto out;
		}

		if (scope == RT_SCOPE_HOST) {
			NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops");
			err = -EINVAL;
			goto out;
		}

		/* all nexthops in a group have the same scope */
		nhi = rtnl_dereference(nhg->nh_entries[0].nh->nh_info);
		err = nexthop_check_scope(nhi, scope, extack);
	} else {
		nhi = rtnl_dereference(nh->nh_info);
		if (nhi->fdb_nh) {
			NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
			err = -EINVAL;
			goto out;
		}
		err = nexthop_check_scope(nhi, scope, extack);
	}

out:
	return err;
}
static int fib_check_nh_list(struct nexthop *old, struct nexthop *new,
			     struct netlink_ext_ack *extack)
{
	struct fib_info *fi;

	list_for_each_entry(fi, &old->fi_list, nh_list) {
		int err;

		err = fib_check_nexthop(new, fi->fib_scope, extack);
		if (err)
			return err;
	}
	return 0;
}
static bool nh_res_nhge_is_balanced(const struct nh_grp_entry *nhge)
{
	return nhge->res.count_buckets == nhge->res.wants_buckets;
}

static bool nh_res_nhge_is_ow(const struct nh_grp_entry *nhge)
{
	return nhge->res.count_buckets > nhge->res.wants_buckets;
}

static bool nh_res_nhge_is_uw(const struct nh_grp_entry *nhge)
{
	return nhge->res.count_buckets < nhge->res.wants_buckets;
}
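
/* Example (illustrative): an entry that wants 2 buckets but currently
 * holds 3 is overweight; holding 1 would make it underweight. Upkeep
 * migrates buckets away from overweight entries to entries on the
 * underweight list until count_buckets == wants_buckets everywhere.
 */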
static bool nh_res_table_is_balanced(const struct nh_res_table *res_table)
{
	return list_empty(&res_table->uw_nh_entries);
}

static void nh_res_bucket_unset_nh(struct nh_res_bucket *bucket)
{
	struct nh_grp_entry *nhge;

	if (bucket->occupied) {
		nhge = nh_res_dereference(bucket->nh_entry);
		nhge->res.count_buckets--;
		bucket->occupied = false;
	}
}

static void nh_res_bucket_set_nh(struct nh_res_bucket *bucket,
				 struct nh_grp_entry *nhge)
{
	nh_res_bucket_unset_nh(bucket);

	bucket->occupied = true;
	rcu_assign_pointer(bucket->nh_entry, nhge);
	nhge->res.count_buckets++;
}
static bool nh_res_bucket_should_migrate(struct nh_res_table *res_table,
					 struct nh_res_bucket *bucket,
					 unsigned long *deadline, bool *force)
{
	unsigned long now = jiffies;
	struct nh_grp_entry *nhge;
	unsigned long idle_point;

	if (!bucket->occupied) {
		/* The bucket is not occupied, its NHGE pointer is either
		 * NULL or obsolete. We _have to_ migrate: set force.
		 */
		*force = true;
		return true;
	}

	nhge = nh_res_dereference(bucket->nh_entry);

	/* If the bucket is populated by an underweight or balanced
	 * nexthop, do not migrate.
	 */
	if (!nh_res_nhge_is_ow(nhge))
		return false;

	/* At this point we know that the bucket is populated with an
	 * overweight nexthop. It needs to be migrated to a new nexthop if
	 * the idle timer or the unbalanced timer expired.
	 */

	idle_point = nh_res_bucket_idle_point(res_table, bucket, now);
	if (time_after_eq(now, idle_point)) {
		/* The bucket is idle. We _can_ migrate: unset force. */
		*force = false;
		return true;
	}

	/* Unbalanced timer of 0 means "never force". */
	if (res_table->unbalanced_timer) {
		unsigned long unb_point;

		unb_point = nh_res_table_unb_point(res_table);
		if (time_after(now, unb_point)) {
			/* The bucket is not idle, but the unbalanced timer
			 * expired. We _can_ migrate, but set force anyway,
			 * so that drivers know to ignore activity reports
			 * from the HW.
			 */
			*force = true;
			return true;
		}

		nh_res_time_set_deadline(unb_point, deadline);
	}

	nh_res_time_set_deadline(idle_point, deadline);
	return false;
}
static bool nh_res_bucket_migrate(struct nh_res_table *res_table,
				  u16 bucket_index, bool notify,
				  bool notify_nl, bool force)
{
	struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index];
	struct nh_grp_entry *new_nhge;
	struct netlink_ext_ack extack;
	int err;

	new_nhge = list_first_entry_or_null(&res_table->uw_nh_entries,
					    struct nh_grp_entry,
					    res.uw_nh_entry);
	if (WARN_ON_ONCE(!new_nhge))
		/* If this function is called, "bucket" is either not
		 * occupied, or it belongs to a next hop that is
		 * overweight. In either case, there ought to be a
		 * corresponding underweight next hop.
		 */
		return false;

	if (notify) {
		struct nh_grp_entry *old_nhge;

		old_nhge = nh_res_dereference(bucket->nh_entry);
		err = call_nexthop_res_bucket_notifiers(res_table->net,
							res_table->nhg_id,
							bucket_index, force,
							old_nhge->nh,
							new_nhge->nh, &extack);
		if (err) {
			pr_err_ratelimited("%s\n", extack._msg);
			if (!force)
				return false;
			/* It is not possible to veto a forced replacement, so
			 * just clear the hardware flags from the nexthop
			 * bucket to indicate to user space that this bucket is
			 * not correctly populated in hardware.
			 */
			bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP);
		}
	}

	nh_res_bucket_set_nh(bucket, new_nhge);
	nh_res_bucket_set_idle(res_table, bucket);

	if (notify_nl)
		nexthop_bucket_notify(res_table, bucket_index);

	if (nh_res_nhge_is_balanced(new_nhge))
		list_del(&new_nhge->res.uw_nh_entry);
	return true;
}
#define NH_RES_UPKEEP_DW_MINIMUM_INTERVAL (HZ / 2)

static void nh_res_table_upkeep(struct nh_res_table *res_table,
				bool notify, bool notify_nl)
{
	unsigned long now = jiffies;
	unsigned long deadline;
	u16 i;

	/* Deadline is the next time that upkeep should be run. It is the
	 * earliest time at which one of the buckets might be migrated.
	 * Start at the most pessimistic estimate: either unbalanced_timer
	 * from now, or if there is none, idle_timer from now. For each
	 * encountered time point, call nh_res_time_set_deadline() to
	 * refine the estimate.
	 */
	if (res_table->unbalanced_timer)
		deadline = now + res_table->unbalanced_timer;
	else
		deadline = now + res_table->idle_timer;

	for (i = 0; i < res_table->num_nh_buckets; i++) {
		struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
		bool force;

		if (nh_res_bucket_should_migrate(res_table, bucket,
						 &deadline, &force)) {
			if (!nh_res_bucket_migrate(res_table, i, notify,
						   notify_nl, force)) {
				unsigned long idle_point;

				/* A driver can override the migration
				 * decision if the HW reports that the
				 * bucket is actually not idle. Therefore
				 * remark the bucket as busy again and
				 * update the deadline.
				 */
				nh_res_bucket_set_busy(bucket);
				idle_point = nh_res_bucket_idle_point(res_table,
								      bucket,
								      now);
				nh_res_time_set_deadline(idle_point, &deadline);
			}
		}
	}

	/* If the group is still unbalanced, schedule the next upkeep to
	 * either the deadline computed above, or the minimum deadline,
	 * whichever comes later.
	 */
	if (!nh_res_table_is_balanced(res_table)) {
		unsigned long now = jiffies;
		unsigned long min_deadline;

		min_deadline = now + NH_RES_UPKEEP_DW_MINIMUM_INTERVAL;
		if (time_before(deadline, min_deadline))
			deadline = min_deadline;

		queue_delayed_work(system_power_efficient_wq,
				   &res_table->upkeep_dw, deadline - now);
	}
}
static void nh_res_table_upkeep_dw(struct work_struct *work)
{
	struct delayed_work *dw = to_delayed_work(work);
	struct nh_res_table *res_table;

	res_table = container_of(dw, struct nh_res_table, upkeep_dw);
	nh_res_table_upkeep(res_table, true, true);
}

static void nh_res_table_cancel_upkeep(struct nh_res_table *res_table)
{
	cancel_delayed_work_sync(&res_table->upkeep_dw);
}
static void nh_res_group_rebalance(struct nh_group *nhg,
				   struct nh_res_table *res_table)
{
	int prev_upper_bound = 0;
	int total = 0;
	int w = 0;
	int i;

	INIT_LIST_HEAD(&res_table->uw_nh_entries);

	for (i = 0; i < nhg->num_nh; ++i)
		total += nhg->nh_entries[i].weight;

	for (i = 0; i < nhg->num_nh; ++i) {
		struct nh_grp_entry *nhge = &nhg->nh_entries[i];
		int upper_bound;

		w += nhge->weight;
		upper_bound = DIV_ROUND_CLOSEST(res_table->num_nh_buckets * w,
						total);
		nhge->res.wants_buckets = upper_bound - prev_upper_bound;
		prev_upper_bound = upper_bound;

		if (nh_res_nhge_is_uw(nhge)) {
			if (list_empty(&res_table->uw_nh_entries))
				res_table->unbalanced_since = jiffies;
			list_add(&nhge->res.uw_nh_entry,
				 &res_table->uw_nh_entries);
		}
	}
}
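
/* Worked example (illustrative): 8 buckets and two entries with weights
 * 1 and 3 (total 4). The running upper bounds are
 * DIV_ROUND_CLOSEST(8 * 1, 4) = 2 and DIV_ROUND_CLOSEST(8 * 4, 4) = 8,
 * so wants_buckets comes out as 2 and 6 respectively.
 */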
/* Migrate buckets in res_table so that they reference NHGE's from NHG with
 * the right NH ID. Set those buckets that do not have a corresponding NHGE
 * entry in NHG as not occupied.
 */
static void nh_res_table_migrate_buckets(struct nh_res_table *res_table,
					 struct nh_group *nhg)
{
	u16 i;

	for (i = 0; i < res_table->num_nh_buckets; i++) {
		struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
		u32 id = rtnl_dereference(bucket->nh_entry)->nh->id;
		bool found = false;
		int j;

		for (j = 0; j < nhg->num_nh; j++) {
			struct nh_grp_entry *nhge = &nhg->nh_entries[j];

			if (nhge->nh->id == id) {
				nh_res_bucket_set_nh(bucket, nhge);
				found = true;
				break;
			}
		}

		if (!found)
			nh_res_bucket_unset_nh(bucket);
	}
}
static void replace_nexthop_grp_res(struct nh_group *oldg,
				    struct nh_group *newg)
{
	/* For NH group replacement, the new NHG might only have a stub
	 * hash table with 0 buckets, because the number of buckets was not
	 * specified. For NH removal, oldg and newg both reference the same
	 * res_table. So in any case, in the following, we want to work
	 * with oldg->res_table.
	 */
	struct nh_res_table *old_res_table = rtnl_dereference(oldg->res_table);
	unsigned long prev_unbalanced_since = old_res_table->unbalanced_since;
	bool prev_has_uw = !list_empty(&old_res_table->uw_nh_entries);

	nh_res_table_cancel_upkeep(old_res_table);
	nh_res_table_migrate_buckets(old_res_table, newg);
	nh_res_group_rebalance(newg, old_res_table);
	if (prev_has_uw && !list_empty(&old_res_table->uw_nh_entries))
		old_res_table->unbalanced_since = prev_unbalanced_since;
	nh_res_table_upkeep(old_res_table, true, false);
}
static void nh_hthr_group_rebalance(struct nh_group *nhg)
{
	int total = 0;
	int w = 0;
	int i;

	for (i = 0; i < nhg->num_nh; ++i)
		total += nhg->nh_entries[i].weight;

	for (i = 0; i < nhg->num_nh; ++i) {
		struct nh_grp_entry *nhge = &nhg->nh_entries[i];
		int upper_bound;

		w += nhge->weight;
		upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1;
		atomic_set(&nhge->hthr.upper_bound, upper_bound);
	}
}
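
/* Worked example (illustrative): weights 1 and 2 (total 3) yield upper
 * bounds DIV_ROUND_CLOSEST_ULL(1ULL << 31, 3) - 1 = 715827882 and
 * DIV_ROUND_CLOSEST_ULL(3ULL << 31, 3) - 1 = 2147483647 (INT_MAX), so
 * hashes in [0, 715827882] select the first entry, i.e. a 1:2 split of
 * the 31-bit hash space.
 */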
static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge,
				struct nl_info *nlinfo)
{
	struct nh_grp_entry *nhges, *new_nhges;
	struct nexthop *nhp = nhge->nh_parent;
	struct netlink_ext_ack extack;
	struct nexthop *nh = nhge->nh;
	struct nh_group *nhg, *newg;
	int i, j, err;

	WARN_ON(!nh);

	nhg = rtnl_dereference(nhp->nh_grp);
	newg = nhg->spare;

	/* last entry, keep it visible and remove the parent */
	if (nhg->num_nh == 1) {
		remove_nexthop(net, nhp, nlinfo);
		return;
	}

	newg->has_v4 = false;
	newg->is_multipath = nhg->is_multipath;
	newg->hash_threshold = nhg->hash_threshold;
	newg->resilient = nhg->resilient;
	newg->fdb_nh = nhg->fdb_nh;
	newg->num_nh = nhg->num_nh;

	/* copy old entries to new except the one getting removed */
	nhges = nhg->nh_entries;
	new_nhges = newg->nh_entries;
	for (i = 0, j = 0; i < nhg->num_nh; ++i) {
		struct nh_info *nhi;

		/* current nexthop getting removed */
		if (nhg->nh_entries[i].nh == nh) {
			newg->num_nh--;
			continue;
		}

		nhi = rtnl_dereference(nhges[i].nh->nh_info);
		if (nhi->family == AF_INET)
			newg->has_v4 = true;

		list_del(&nhges[i].nh_list);
		new_nhges[j].nh_parent = nhges[i].nh_parent;
		new_nhges[j].nh = nhges[i].nh;
		new_nhges[j].weight = nhges[i].weight;
		list_add(&new_nhges[j].nh_list, &new_nhges[j].nh->grp_list);
		j++;
	}

	if (newg->hash_threshold)
		nh_hthr_group_rebalance(newg);
	else if (newg->resilient)
		replace_nexthop_grp_res(nhg, newg);

	rcu_assign_pointer(nhp->nh_grp, newg);

	list_del(&nhge->nh_list);
	nexthop_put(nhge->nh);

	/* Removal of a NH from a resilient group is notified through
	 * bucket notifications.
	 */
	if (newg->hash_threshold) {
		err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp,
					     &extack);
		if (err)
			pr_err("%s\n", extack._msg);
	}

	if (nlinfo)
		nexthop_notify(RTM_NEWNEXTHOP, nhp, nlinfo);
}
static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh,
				       struct nl_info *nlinfo)
{
	struct nh_grp_entry *nhge, *tmp;

	list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list)
		remove_nh_grp_entry(net, nhge, nlinfo);

	/* make sure all see the newly published array before releasing rtnl */
	synchronize_net();
}

static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo)
{
	struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp);
	struct nh_res_table *res_table;
	int i, num_nh = nhg->num_nh;

	for (i = 0; i < num_nh; ++i) {
		struct nh_grp_entry *nhge = &nhg->nh_entries[i];

		if (WARN_ON(!nhge->nh))
			continue;

		list_del_init(&nhge->nh_list);
	}

	if (nhg->resilient) {
		res_table = rtnl_dereference(nhg->res_table);
		nh_res_table_cancel_upkeep(res_table);
	}
}
/* not called for nexthop replace */
static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
{
	struct fib6_info *f6i, *tmp;
	bool do_flush = false;
	struct fib_info *fi;

	list_for_each_entry(fi, &nh->fi_list, nh_list) {
		fi->fib_flags |= RTNH_F_DEAD;
		do_flush = true;
	}
	if (do_flush)
		fib_flush(net);

	/* ip6_del_rt removes the entry from this list hence the _safe */
	list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) {
		/* __ip6_del_rt does a release, so do a hold here */
		fib6_info_hold(f6i);
		ipv6_stub->ip6_del_rt(net, f6i,
				      !net->ipv4.sysctl_nexthop_compat_mode);
	}
}

static void __remove_nexthop(struct net *net, struct nexthop *nh,
			     struct nl_info *nlinfo)
{
	__remove_nexthop_fib(net, nh);

	if (nh->is_group) {
		remove_nexthop_group(nh, nlinfo);
	} else {
		struct nh_info *nhi;

		nhi = rtnl_dereference(nh->nh_info);
		if (nhi->fib_nhc.nhc_dev)
			hlist_del(&nhi->dev_hash);

		remove_nexthop_from_groups(net, nh, nlinfo);
	}
}
static void remove_nexthop(struct net *net, struct nexthop *nh,
			   struct nl_info *nlinfo)
{
	call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh, NULL);

	/* remove from the tree */
	rb_erase(&nh->rb_node, &net->nexthop.rb_root);

	if (nlinfo)
		nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo);

	__remove_nexthop(net, nh, nlinfo);
	nh_base_seq_inc(net);

	nexthop_put(nh);
}

/* if any FIB entries reference this nexthop, any dst entries
 * need to be regenerated
 */
static void nh_rt_cache_flush(struct net *net, struct nexthop *nh)
{
	struct fib6_info *f6i;

	if (!list_empty(&nh->fi_list))
		rt_cache_flush(net);

	list_for_each_entry(f6i, &nh->f6i_list, nh_list)
		ipv6_stub->fib6_update_sernum(net, f6i);
}
static int replace_nexthop_grp(struct net *net, struct nexthop *old,
			       struct nexthop *new, const struct nh_config *cfg,
			       struct netlink_ext_ack *extack)
{
	struct nh_res_table *tmp_table = NULL;
	struct nh_res_table *new_res_table;
	struct nh_res_table *old_res_table;
	struct nh_group *oldg, *newg;
	int i, err;

	if (!new->is_group) {
		NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with a nexthop.");
		return -EINVAL;
	}

	oldg = rtnl_dereference(old->nh_grp);
	newg = rtnl_dereference(new->nh_grp);

	if (newg->hash_threshold != oldg->hash_threshold) {
		NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with one of a different type.");
		return -EINVAL;
	}

	if (newg->hash_threshold) {
		err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new,
					     extack);
		if (err)
			return err;
	} else if (newg->resilient) {
		new_res_table = rtnl_dereference(newg->res_table);
		old_res_table = rtnl_dereference(oldg->res_table);

		/* Accept if num_nh_buckets was not given, but if it was
		 * given, demand that the value be correct.
		 */
		if (cfg->nh_grp_res_has_num_buckets &&
		    cfg->nh_grp_res_num_buckets !=
		    old_res_table->num_nh_buckets) {
			NL_SET_ERR_MSG(extack, "Can not change number of buckets of a resilient nexthop group.");
			return -EINVAL;
		}

		/* Emit a pre-replace notification so that listeners could veto
		 * a potentially unsupported configuration. Otherwise,
		 * individual bucket replacement notifications would need to be
		 * vetoed, which is something that should only happen if the
		 * bucket is currently active.
		 */
		err = call_nexthop_res_table_notifiers(net, new, extack);
		if (err)
			return err;

		if (cfg->nh_grp_res_has_idle_timer)
			old_res_table->idle_timer = cfg->nh_grp_res_idle_timer;
		if (cfg->nh_grp_res_has_unbalanced_timer)
			old_res_table->unbalanced_timer =
				cfg->nh_grp_res_unbalanced_timer;

		replace_nexthop_grp_res(oldg, newg);

		tmp_table = new_res_table;
		rcu_assign_pointer(newg->res_table, old_res_table);
		rcu_assign_pointer(newg->spare->res_table, old_res_table);
	}

	/* update parents - used by nexthop code for cleanup */
	for (i = 0; i < newg->num_nh; i++)
		newg->nh_entries[i].nh_parent = old;

	rcu_assign_pointer(old->nh_grp, newg);

	if (newg->resilient) {
		rcu_assign_pointer(oldg->res_table, tmp_table);
		rcu_assign_pointer(oldg->spare->res_table, tmp_table);
	}

	for (i = 0; i < oldg->num_nh; i++)
		oldg->nh_entries[i].nh_parent = new;

	rcu_assign_pointer(new->nh_grp, oldg);

	return 0;
}
static void nh_group_v4_update(struct nh_group *nhg)
{
	struct nh_grp_entry *nhges;
	bool has_v4 = false;
	int i;

	nhges = nhg->nh_entries;
	for (i = 0; i < nhg->num_nh; i++) {
		struct nh_info *nhi;

		nhi = rtnl_dereference(nhges[i].nh->nh_info);
		if (nhi->family == AF_INET)
			has_v4 = true;
	}
	nhg->has_v4 = has_v4;
}
static int replace_nexthop_single_notify_res(struct net *net,
					     struct nh_res_table *res_table,
					     struct nexthop *old,
					     struct nh_info *oldi,
					     struct nh_info *newi,
					     struct netlink_ext_ack *extack)
{
	u32 nhg_id = res_table->nhg_id;
	int err;
	u16 i;

	for (i = 0; i < res_table->num_nh_buckets; i++) {
		struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
		struct nh_grp_entry *nhge;

		nhge = rtnl_dereference(bucket->nh_entry);
		if (nhge->nh == old) {
			err = __call_nexthop_res_bucket_notifiers(net, nhg_id,
								  i, true,
								  oldi, newi,
								  extack);
			if (err)
				goto err_notify;
		}
	}

	return 0;

err_notify:
	while (i-- > 0) {
		struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
		struct nh_grp_entry *nhge;

		nhge = rtnl_dereference(bucket->nh_entry);
		if (nhge->nh == old)
			__call_nexthop_res_bucket_notifiers(net, nhg_id, i,
							    true, newi, oldi,
							    extack);
	}
	return err;
}
static int replace_nexthop_single_notify(struct net *net,
					 struct nexthop *group_nh,
					 struct nexthop *old,
					 struct nh_info *oldi,
					 struct nh_info *newi,
					 struct netlink_ext_ack *extack)
{
	struct nh_group *nhg = rtnl_dereference(group_nh->nh_grp);
	struct nh_res_table *res_table;

	if (nhg->hash_threshold) {
		return call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE,
					      group_nh, extack);
	} else if (nhg->resilient) {
		res_table = rtnl_dereference(nhg->res_table);
		return replace_nexthop_single_notify_res(net, res_table,
							 old, oldi, newi,
							 extack);
	}

	return -EINVAL;
}
static int replace_nexthop_single(struct net *net, struct nexthop *old,
				  struct nexthop *new,
				  struct netlink_ext_ack *extack)
{
	u8 old_protocol, old_nh_flags;
	struct nh_info *oldi, *newi;
	struct nh_grp_entry *nhge;
	int err;

	if (new->is_group) {
		NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop group.");
		return -EINVAL;
	}

	err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack);
	if (err)
		return err;

	/* Hardware flags were set on 'old' as 'new' is not in the red-black
	 * tree. Therefore, inherit the flags from 'old' to 'new'.
	 */
	new->nh_flags |= old->nh_flags & (RTNH_F_OFFLOAD | RTNH_F_TRAP);

	oldi = rtnl_dereference(old->nh_info);
	newi = rtnl_dereference(new->nh_info);

	newi->nh_parent = old;
	oldi->nh_parent = new;

	old_protocol = old->protocol;
	old_nh_flags = old->nh_flags;

	old->protocol = new->protocol;
	old->nh_flags = new->nh_flags;

	rcu_assign_pointer(old->nh_info, newi);
	rcu_assign_pointer(new->nh_info, oldi);

	/* Send a replace notification for all the groups using the nexthop. */
	list_for_each_entry(nhge, &old->grp_list, nh_list) {
		struct nexthop *nhp = nhge->nh_parent;

		err = replace_nexthop_single_notify(net, nhp, old, oldi, newi,
						    extack);
		if (err)
			goto err_notify;
	}

	/* When replacing an IPv4 nexthop with an IPv6 nexthop, potentially
	 * update IPv4 indication in all the groups using the nexthop.
	 */
	if (oldi->family == AF_INET && newi->family == AF_INET6) {
		list_for_each_entry(nhge, &old->grp_list, nh_list) {
			struct nexthop *nhp = nhge->nh_parent;
			struct nh_group *nhg;

			nhg = rtnl_dereference(nhp->nh_grp);
			nh_group_v4_update(nhg);
		}
	}

	return 0;

err_notify:
	rcu_assign_pointer(new->nh_info, newi);
	rcu_assign_pointer(old->nh_info, oldi);
	old->nh_flags = old_nh_flags;
	old->protocol = old_protocol;
	oldi->nh_parent = old;
	newi->nh_parent = new;
	list_for_each_entry_continue_reverse(nhge, &old->grp_list, nh_list) {
		struct nexthop *nhp = nhge->nh_parent;

		replace_nexthop_single_notify(net, nhp, old, newi, oldi, NULL);
	}
	call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, old, extack);
	return err;
}
static void __nexthop_replace_notify(struct net *net, struct nexthop *nh,
				     struct nl_info *info)
{
	struct fib6_info *f6i;

	if (!list_empty(&nh->fi_list)) {
		struct fib_info *fi;

		/* expectation is a few fib_info per nexthop and then
		 * a lot of routes per fib_info. So mark the fib_info
		 * and then walk the fib tables once
		 */
		list_for_each_entry(fi, &nh->fi_list, nh_list)
			fi->nh_updated = true;

		fib_info_notify_update(net, info);

		list_for_each_entry(fi, &nh->fi_list, nh_list)
			fi->nh_updated = false;
	}

	list_for_each_entry(f6i, &nh->f6i_list, nh_list)
		ipv6_stub->fib6_rt_update(net, f6i, info);
}

/* send RTM_NEWROUTE with REPLACE flag set for all FIB entries
 * linked to this nexthop and for all groups that the nexthop
 * is a member of
 */
static void nexthop_replace_notify(struct net *net, struct nexthop *nh,
				   struct nl_info *info)
{
	struct nh_grp_entry *nhge;

	__nexthop_replace_notify(net, nh, info);

	list_for_each_entry(nhge, &nh->grp_list, nh_list)
		__nexthop_replace_notify(net, nhge->nh_parent, info);
}
static int replace_nexthop(struct net *net, struct nexthop *old,
			   struct nexthop *new, const struct nh_config *cfg,
			   struct netlink_ext_ack *extack)
{
	bool new_is_reject = false;
	struct nh_grp_entry *nhge;
	int err;

	/* check that existing FIB entries are ok with the
	 * new nexthop definition
	 */
	err = fib_check_nh_list(old, new, extack);
	if (err)
		return err;

	err = fib6_check_nh_list(old, new, extack);
	if (err)
		return err;

	if (!new->is_group) {
		struct nh_info *nhi = rtnl_dereference(new->nh_info);

		new_is_reject = nhi->reject_nh;
	}

	list_for_each_entry(nhge, &old->grp_list, nh_list) {
		/* if new nexthop is a blackhole, any groups using this
		 * nexthop cannot have more than 1 path
		 */
		if (new_is_reject &&
		    nexthop_num_path(nhge->nh_parent) > 1) {
			NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be a member of a group with more than one path");
			return -EINVAL;
		}

		err = fib_check_nh_list(nhge->nh_parent, new, extack);
		if (err)
			return err;

		err = fib6_check_nh_list(nhge->nh_parent, new, extack);
		if (err)
			return err;
	}

	if (old->is_group)
		err = replace_nexthop_grp(net, old, new, cfg, extack);
	else
		err = replace_nexthop_single(net, old, new, extack);

	if (!err) {
		nh_rt_cache_flush(net, old);

		__remove_nexthop(net, new, NULL);
		nexthop_put(new);
	}

	return err;
}
/* called with rtnl_lock held */
static int insert_nexthop(struct net *net, struct nexthop *new_nh,
			  struct nh_config *cfg, struct netlink_ext_ack *extack)
{
	struct rb_node **pp, *parent = NULL, *next;
	struct rb_root *root = &net->nexthop.rb_root;
	bool replace = !!(cfg->nlflags & NLM_F_REPLACE);
	bool create = !!(cfg->nlflags & NLM_F_CREATE);
	u32 new_id = new_nh->id;
	int replace_notify = 0;
	int rc = -EEXIST;

	pp = &root->rb_node;
	while (1) {
		struct nexthop *nh;

		next = *pp;
		if (!next)
			break;

		parent = next;

		nh = rb_entry(parent, struct nexthop, rb_node);
		if (new_id < nh->id) {
			pp = &next->rb_left;
		} else if (new_id > nh->id) {
			pp = &next->rb_right;
		} else if (replace) {
			rc = replace_nexthop(net, nh, new_nh, cfg, extack);
			if (!rc) {
				new_nh = nh; /* send notification with old nh */
				replace_notify = 1;
			}
			goto out;
		} else {
			/* id already exists and not a replace */
			goto out;
		}
	}

	if (replace && !create) {
		NL_SET_ERR_MSG(extack, "Replace specified without create and no entry exists");
		rc = -ENOENT;
		goto out;
	}

	if (new_nh->is_group) {
		struct nh_group *nhg = rtnl_dereference(new_nh->nh_grp);
		struct nh_res_table *res_table;

		if (nhg->resilient) {
			res_table = rtnl_dereference(nhg->res_table);

			/* Not passing the number of buckets is OK when
			 * replacing, but not when creating a new group.
			 */
			if (!cfg->nh_grp_res_has_num_buckets) {
				NL_SET_ERR_MSG(extack, "Number of buckets not specified for nexthop group insertion");
				rc = -EINVAL;
				goto out;
			}

			nh_res_group_rebalance(nhg, res_table);

			/* Do not send bucket notifications, we do full
			 * notification below.
			 */
			nh_res_table_upkeep(res_table, false, false);
		}
	}

	rb_link_node_rcu(&new_nh->rb_node, parent, pp);
	rb_insert_color(&new_nh->rb_node, root);

	/* The initial insertion is a full notification for hash-threshold as
	 * well as resilient groups.
	 */
	rc = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new_nh, extack);
	if (rc) {
		rb_erase(&new_nh->rb_node, &net->nexthop.rb_root);
		goto out;
	}

	nh_base_seq_inc(net);
	nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo);
	if (replace_notify && net->ipv4.sysctl_nexthop_compat_mode)
		nexthop_replace_notify(net, new_nh, &cfg->nlinfo);

out:
	return rc;
}

/* remove all nexthops tied to a device being deleted */
static void nexthop_flush_dev(struct net_device *dev, unsigned long event)
{
	unsigned int hash = nh_dev_hashfn(dev->ifindex);
	struct net *net = dev_net(dev);
	struct hlist_head *head = &net->nexthop.devhash[hash];
	struct hlist_node *n;
	struct nh_info *nhi;

	hlist_for_each_entry_safe(nhi, n, head, dev_hash) {
		if (nhi->fib_nhc.nhc_dev != dev)
			continue;

		if (nhi->reject_nh &&
		    (event == NETDEV_DOWN || event == NETDEV_CHANGE))
			continue;

		remove_nexthop(net, nhi->nh_parent, NULL);
	}
}

/* rtnl; called when net namespace is deleted */
static void flush_all_nexthops(struct net *net)
{
	struct rb_root *root = &net->nexthop.rb_root;
	struct rb_node *node;
	struct nexthop *nh;

	while ((node = rb_first(root))) {
		nh = rb_entry(node, struct nexthop, rb_node);
		remove_nexthop(net, nh, NULL);
		cond_resched();
	}
}

static struct nexthop *nexthop_create_group(struct net *net,
					    struct nh_config *cfg)
{
	struct nlattr *grps_attr = cfg->nh_grp;
	struct nexthop_grp *entry = nla_data(grps_attr);
	u16 num_nh = nla_len(grps_attr) / sizeof(*entry);
	struct nh_group *nhg;
	struct nexthop *nh;
	int err;
	int i;

	if (WARN_ON(!num_nh))
		return ERR_PTR(-EINVAL);

	nh = nexthop_alloc();
	if (!nh)
		return ERR_PTR(-ENOMEM);

	nh->is_group = 1;

	nhg = nexthop_grp_alloc(num_nh);
	if (!nhg) {
		kfree(nh);
		return ERR_PTR(-ENOMEM);
	}

	/* spare group used for removals */
	nhg->spare = nexthop_grp_alloc(num_nh);
	if (!nhg->spare) {
		kfree(nhg);
		kfree(nh);
		return ERR_PTR(-ENOMEM);
	}
	nhg->spare->spare = nhg;

	for (i = 0; i < nhg->num_nh; ++i) {
		struct nexthop *nhe;
		struct nh_info *nhi;

		nhe = nexthop_find_by_id(net, entry[i].id);
		if (!nexthop_get(nhe)) {
			err = -ENOENT;
			goto out_no_nh;
		}

		nhi = rtnl_dereference(nhe->nh_info);
		if (nhi->family == AF_INET)
			nhg->has_v4 = true;

		nhg->nh_entries[i].nh = nhe;
		nhg->nh_entries[i].weight = entry[i].weight + 1;
		list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list);
		nhg->nh_entries[i].nh_parent = nh;
	}

	if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) {
		nhg->hash_threshold = 1;
		nhg->is_multipath = true;
	} else if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) {
		struct nh_res_table *res_table;

		res_table = nexthop_res_table_alloc(net, cfg->nh_id, cfg);
		if (!res_table) {
			err = -ENOMEM;
			goto out_no_nh;
		}

		rcu_assign_pointer(nhg->spare->res_table, res_table);
		rcu_assign_pointer(nhg->res_table, res_table);
		nhg->resilient = true;
		nhg->is_multipath = true;
	}

	WARN_ON_ONCE(nhg->hash_threshold + nhg->resilient != 1);

	if (nhg->hash_threshold)
		nh_hthr_group_rebalance(nhg);

	if (cfg->nh_fdb)
		nhg->fdb_nh = 1;

	rcu_assign_pointer(nh->nh_grp, nhg);

	return nh;

out_no_nh:
	for (i--; i >= 0; --i) {
		list_del(&nhg->nh_entries[i].nh_list);
		nexthop_put(nhg->nh_entries[i].nh);
	}

	kfree(nhg->spare);
	kfree(nhg);
	kfree(nh);

	return ERR_PTR(err);
}
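
/* For illustration: NHA_GROUP carries an array of struct nexthop_grp, whose
 * uapi weight field is biased by one; the "+ 1" above maps the wire values
 * 0..255 to effective weights of 1..256. For example (iproute2 syntax),
 *
 *	ip nexthop add id 10 group 1,2/2
 *
 * encodes weights 1 and 0 on the wire but installs group entries with
 * weights 2 and 1 respectively.
 */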

static int nh_create_ipv4(struct net *net, struct nexthop *nh,
			  struct nh_info *nhi, struct nh_config *cfg,
			  struct netlink_ext_ack *extack)
{
	struct fib_nh *fib_nh = &nhi->fib_nh;
	struct fib_config fib_cfg = {
		.fc_oif   = cfg->nh_ifindex,
		.fc_gw4   = cfg->gw.ipv4,
		.fc_gw_family = cfg->gw.ipv4 ? AF_INET : 0,
		.fc_flags = cfg->nh_flags,
		.fc_nlinfo = cfg->nlinfo,
		.fc_encap = cfg->nh_encap,
		.fc_encap_type = cfg->nh_encap_type,
	};
	u32 tb_id = (cfg->dev ? l3mdev_fib_table(cfg->dev) : RT_TABLE_MAIN);
	int err;

	err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack);
	if (err) {
		fib_nh_release(net, fib_nh);
		goto out;
	}

	if (nhi->fdb_nh)
		goto out;

	/* sets nh_dev if successful */
	err = fib_check_nh(net, fib_nh, tb_id, 0, extack);
	if (!err) {
		nh->nh_flags = fib_nh->fib_nh_flags;
		fib_info_update_nhc_saddr(net, &fib_nh->nh_common,
					  fib_nh->fib_nh_scope);
	} else {
		fib_nh_release(net, fib_nh);
	}
out:
	return err;
}
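
/* For illustration: a blackhole nexthop reaches this point with
 * cfg->nh_ifindex already forced to the loopback device by nexthop_create(),
 * so it validates like an ordinary device nexthop here while the forwarding
 * path keys on nhi->reject_nh to drop the traffic.
 */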

static int nh_create_ipv6(struct net *net,  struct nexthop *nh,
			  struct nh_info *nhi, struct nh_config *cfg,
			  struct netlink_ext_ack *extack)
{
	struct fib6_nh *fib6_nh = &nhi->fib6_nh;
	struct fib6_config fib6_cfg = {
		.fc_table = l3mdev_fib_table(cfg->dev),
		.fc_ifindex = cfg->nh_ifindex,
		.fc_gateway = cfg->gw.ipv6,
		.fc_flags = cfg->nh_flags,
		.fc_nlinfo = cfg->nlinfo,
		.fc_encap = cfg->nh_encap,
		.fc_encap_type = cfg->nh_encap_type,
		.fc_is_fdb = cfg->nh_fdb,
	};
	int err;

	if (!ipv6_addr_any(&cfg->gw.ipv6))
		fib6_cfg.fc_flags |= RTF_GATEWAY;

	/* sets nh_dev if successful */
	err = ipv6_stub->fib6_nh_init(net, fib6_nh, &fib6_cfg, GFP_KERNEL,
				      extack);
	if (err)
		ipv6_stub->fib6_nh_release(fib6_nh);
	else
		nh->nh_flags = fib6_nh->fib_nh_flags;

	return err;
}

static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg,
				      struct netlink_ext_ack *extack)
{
	struct nh_info *nhi;
	struct nexthop *nh;
	int err = 0;

	nh = nexthop_alloc();
	if (!nh)
		return ERR_PTR(-ENOMEM);

	nhi = kzalloc(sizeof(*nhi), GFP_KERNEL);
	if (!nhi) {
		kfree(nh);
		return ERR_PTR(-ENOMEM);
	}

	nh->nh_flags = cfg->nh_flags;
	nh->net = net;

	nhi->nh_parent = nh;
	nhi->family = cfg->nh_family;
	nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK;

	if (cfg->nh_fdb)
		nhi->fdb_nh = 1;

	if (cfg->nh_blackhole) {
		nhi->reject_nh = 1;
		cfg->nh_ifindex = net->loopback_dev->ifindex;
	}

	switch (cfg->nh_family) {
	case AF_INET:
		err = nh_create_ipv4(net, nh, nhi, cfg, extack);
		break;
	case AF_INET6:
		err = nh_create_ipv6(net, nh, nhi, cfg, extack);
		break;
	}

	if (err) {
		kfree(nhi);
		kfree(nh);
		return ERR_PTR(err);
	}

	/* add the entry to the device based hash */
	if (!nhi->fdb_nh)
		nexthop_devhash_add(net, nhi);

	rcu_assign_pointer(nh->nh_info, nhi);

	return nh;
}

/* called with rtnl lock held */
static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct nexthop *nh;
	int err;

	if (cfg->nlflags & NLM_F_REPLACE && !cfg->nh_id) {
		NL_SET_ERR_MSG(extack, "Replace requires nexthop id");
		return ERR_PTR(-EINVAL);
	}

	if (!cfg->nh_id) {
		cfg->nh_id = nh_find_unused_id(net);
		if (!cfg->nh_id) {
			NL_SET_ERR_MSG(extack, "No unused id");
			return ERR_PTR(-EINVAL);
		}
	}

	if (cfg->nh_grp)
		nh = nexthop_create_group(net, cfg);
	else
		nh = nexthop_create(net, cfg, extack);

	if (IS_ERR(nh))
		return nh;

	refcount_set(&nh->refcnt, 1);
	nh->id = cfg->nh_id;
	nh->protocol = cfg->nh_protocol;
	nh->net = net;

	err = insert_nexthop(net, nh, cfg, extack);
	if (err) {
		__remove_nexthop(net, nh, NULL);
		nexthop_put(nh);
		nh = ERR_PTR(err);
	}

	return nh;
}
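
/* Usage sketch (iproute2 syntax, for illustration): both flavors of
 * RTM_NEWNEXTHOP funnel in here, differing only in which constructor runs:
 *
 *	ip nexthop add id 1 via 192.0.2.1 dev eth0   -> nexthop_create()
 *	ip nexthop add id 10 group 1/2               -> nexthop_create_group()
 *
 * with insert_nexthop() providing the common ID collision handling and
 * notification for both paths.
 */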

static int rtm_nh_get_timer(struct nlattr *attr, unsigned long fallback,
			    unsigned long *timer_p, bool *has_p,
			    struct netlink_ext_ack *extack)
{
	unsigned long timer;
	u32 value;

	if (!attr) {
		*timer_p = fallback;
		*has_p = false;
		return 0;
	}

	value = nla_get_u32(attr);
	timer = clock_t_to_jiffies(value);
	if (timer == ~0UL) {
		NL_SET_ERR_MSG(extack, "Timer value too large");
		return -EINVAL;
	}

	*timer_p = timer;
	*has_p = true;
	return 0;
}

static int rtm_to_nh_config_grp_res(struct nlattr *res, struct nh_config *cfg,
				    struct netlink_ext_ack *extack)
{
	struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_policy_new)] = {};
	int err;

	if (res) {
		err = nla_parse_nested(tb,
				       ARRAY_SIZE(rtm_nh_res_policy_new) - 1,
				       res, rtm_nh_res_policy_new, extack);
		if (err < 0)
			return err;
	}

	if (tb[NHA_RES_GROUP_BUCKETS]) {
		cfg->nh_grp_res_num_buckets =
			nla_get_u16(tb[NHA_RES_GROUP_BUCKETS]);
		cfg->nh_grp_res_has_num_buckets = true;
		if (!cfg->nh_grp_res_num_buckets) {
			NL_SET_ERR_MSG(extack, "Number of buckets needs to be non-0");
			return -EINVAL;
		}
	}

	err = rtm_nh_get_timer(tb[NHA_RES_GROUP_IDLE_TIMER],
			       NH_RES_DEFAULT_IDLE_TIMER,
			       &cfg->nh_grp_res_idle_timer,
			       &cfg->nh_grp_res_has_idle_timer,
			       extack);
	if (err)
		return err;

	return rtm_nh_get_timer(tb[NHA_RES_GROUP_UNBALANCED_TIMER],
				NH_RES_DEFAULT_UNBALANCED_TIMER,
				&cfg->nh_grp_res_unbalanced_timer,
				&cfg->nh_grp_res_has_unbalanced_timer,
				extack);
}

static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
			    struct nlmsghdr *nlh, struct nh_config *cfg,
			    struct netlink_ext_ack *extack)
{
	struct nhmsg *nhm = nlmsg_data(nlh);
	struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_new)];
	int err;

	err = nlmsg_parse(nlh, sizeof(*nhm), tb,
			  ARRAY_SIZE(rtm_nh_policy_new) - 1,
			  rtm_nh_policy_new, extack);
	if (err < 0)
		return err;

	err = -EINVAL;
	if (nhm->resvd || nhm->nh_scope) {
		NL_SET_ERR_MSG(extack, "Invalid values in ancillary header");
		goto out;
	}
	if (nhm->nh_flags & ~NEXTHOP_VALID_USER_FLAGS) {
		NL_SET_ERR_MSG(extack, "Invalid nexthop flags in ancillary header");
		goto out;
	}

	switch (nhm->nh_family) {
	case AF_INET:
	case AF_INET6:
		break;
	case AF_UNSPEC:
		if (tb[NHA_GROUP])
			break;
		fallthrough;
	default:
		NL_SET_ERR_MSG(extack, "Invalid address family");
		goto out;
	}

	memset(cfg, 0, sizeof(*cfg));
	cfg->nlflags = nlh->nlmsg_flags;
	cfg->nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->nlinfo.nlh = nlh;
	cfg->nlinfo.nl_net = net;

	cfg->nh_family = nhm->nh_family;
	cfg->nh_protocol = nhm->nh_protocol;
	cfg->nh_flags = nhm->nh_flags;

	if (tb[NHA_ID])
		cfg->nh_id = nla_get_u32(tb[NHA_ID]);

	if (tb[NHA_FDB]) {
		if (tb[NHA_OIF] || tb[NHA_BLACKHOLE] ||
		    tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE]) {
			NL_SET_ERR_MSG(extack, "Fdb attribute can not be used with encap, oif or blackhole");
			goto out;
		}
		if (nhm->nh_flags) {
			NL_SET_ERR_MSG(extack, "Unsupported nexthop flags in ancillary header");
			goto out;
		}
		cfg->nh_fdb = nla_get_flag(tb[NHA_FDB]);
	}

	if (tb[NHA_GROUP]) {
		if (nhm->nh_family != AF_UNSPEC) {
			NL_SET_ERR_MSG(extack, "Invalid family for group");
			goto out;
		}
		cfg->nh_grp = tb[NHA_GROUP];

		cfg->nh_grp_type = NEXTHOP_GRP_TYPE_MPATH;
		if (tb[NHA_GROUP_TYPE])
			cfg->nh_grp_type = nla_get_u16(tb[NHA_GROUP_TYPE]);

		if (cfg->nh_grp_type > NEXTHOP_GRP_TYPE_MAX) {
			NL_SET_ERR_MSG(extack, "Invalid group type");
			goto out;
		}
		err = nh_check_attr_group(net, tb, ARRAY_SIZE(tb),
					  cfg->nh_grp_type, extack);
		if (err)
			goto out;

		if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES)
			err = rtm_to_nh_config_grp_res(tb[NHA_RES_GROUP],
						       cfg, extack);

		/* no other attributes should be set */
		goto out;
	}

	if (tb[NHA_BLACKHOLE]) {
		if (tb[NHA_GATEWAY] || tb[NHA_OIF] ||
		    tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE] || tb[NHA_FDB]) {
			NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway, oif, encap or fdb");
			goto out;
		}

		cfg->nh_blackhole = 1;
		err = 0;
		goto out;
	}

	if (!cfg->nh_fdb && !tb[NHA_OIF]) {
		NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole and non-fdb nexthops");
		goto out;
	}

	if (!cfg->nh_fdb && tb[NHA_OIF]) {
		cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]);
		if (cfg->nh_ifindex)
			cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex);

		if (!cfg->dev) {
			NL_SET_ERR_MSG(extack, "Invalid device index");
			goto out;
		} else if (!(cfg->dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		} else if (!netif_carrier_ok(cfg->dev)) {
			NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down");
			err = -ENETDOWN;
			goto out;
		}
	}

	err = -EINVAL;
	if (tb[NHA_GATEWAY]) {
		struct nlattr *gwa = tb[NHA_GATEWAY];

		switch (cfg->nh_family) {
		case AF_INET:
			if (nla_len(gwa) != sizeof(u32)) {
				NL_SET_ERR_MSG(extack, "Invalid gateway");
				goto out;
			}
			cfg->gw.ipv4 = nla_get_be32(gwa);
			break;
		case AF_INET6:
			if (nla_len(gwa) != sizeof(struct in6_addr)) {
				NL_SET_ERR_MSG(extack, "Invalid gateway");
				goto out;
			}
			cfg->gw.ipv6 = nla_get_in6_addr(gwa);
			break;
		default:
			NL_SET_ERR_MSG(extack,
				       "Unknown address family for gateway");
			goto out;
		}
	} else {
		/* device only nexthop (no gateway) */
		if (cfg->nh_flags & RTNH_F_ONLINK) {
			NL_SET_ERR_MSG(extack,
				       "ONLINK flag can not be set for nexthop without a gateway");
			goto out;
		}
	}

	if (tb[NHA_ENCAP]) {
		cfg->nh_encap = tb[NHA_ENCAP];

		if (!tb[NHA_ENCAP_TYPE]) {
			NL_SET_ERR_MSG(extack, "LWT encapsulation type is missing");
			goto out;
		}

		cfg->nh_encap_type = nla_get_u16(tb[NHA_ENCAP_TYPE]);
		err = lwtunnel_valid_encap_type(cfg->nh_encap_type, extack);
		if (err < 0)
			goto out;

	} else if (tb[NHA_ENCAP_TYPE]) {
		NL_SET_ERR_MSG(extack, "LWT encapsulation attribute is missing");
		goto out;
	}

	err = 0;
out:
	return err;
}

/* rtnl */
static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
			   struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct nh_config cfg;
	struct nexthop *nh;
	int err;

	err = rtm_to_nh_config(net, skb, nlh, &cfg, extack);
	if (!err) {
		nh = nexthop_add(net, &cfg, extack);
		if (IS_ERR(nh))
			err = PTR_ERR(nh);
	}

	return err;
}

static int __nh_valid_get_del_req(const struct nlmsghdr *nlh,
				  struct nlattr **tb, u32 *id,
				  struct netlink_ext_ack *extack)
{
	struct nhmsg *nhm = nlmsg_data(nlh);

	if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) {
		NL_SET_ERR_MSG(extack, "Invalid values in header");
		return -EINVAL;
	}

	if (!tb[NHA_ID]) {
		NL_SET_ERR_MSG(extack, "Nexthop id is missing");
		return -EINVAL;
	}

	*id = nla_get_u32(tb[NHA_ID]);
	if (!(*id)) {
		NL_SET_ERR_MSG(extack, "Invalid nexthop id");
		return -EINVAL;
	}

	return 0;
}

static int nh_valid_get_del_req(const struct nlmsghdr *nlh, u32 *id,
				struct netlink_ext_ack *extack)
{
	struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get)];
	int err;

	err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
			  ARRAY_SIZE(rtm_nh_policy_get) - 1,
			  rtm_nh_policy_get, extack);
	if (err < 0)
		return err;

	return __nh_valid_get_del_req(nlh, tb, id, extack);
}

/* rtnl */
static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
			   struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct nl_info nlinfo = {
		.nlh = nlh,
		.nl_net = net,
		.portid = NETLINK_CB(skb).portid,
	};
	struct nexthop *nh;
	int err;
	u32 id;

	err = nh_valid_get_del_req(nlh, &id, extack);
	if (err)
		return err;

	nh = nexthop_find_by_id(net, id);
	if (!nh)
		return -ENOENT;

	remove_nexthop(net, nh, &nlinfo);

	return 0;
}

/* rtnl */
static int rtm_get_nexthop(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			   struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct sk_buff *skb = NULL;
	struct nexthop *nh;
	int err;
	u32 id;

	err = nh_valid_get_del_req(nlh, &id, extack);
	if (err)
		return err;

	err = -ENOBUFS;
	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		goto out;

	err = -ENOENT;
	nh = nexthop_find_by_id(net, id);
	if (!nh)
		goto errout_free;

	err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(in_skb).portid,
			   nlh->nlmsg_seq, 0);
	if (err < 0) {
		WARN_ON(err == -EMSGSIZE);
		goto errout_free;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
out:
	return err;
errout_free:
	kfree_skb(skb);
	goto out;
}

struct nh_dump_filter {
	u32 nh_id;
	int dev_idx;
	int master_idx;
	bool group_filter;
	bool fdb_filter;
	u32 res_bucket_nh_id;
};

static bool nh_dump_filtered(struct nexthop *nh,
			     struct nh_dump_filter *filter, u8 family)
{
	const struct net_device *dev;
	const struct nh_info *nhi;

	if (filter->group_filter && !nh->is_group)
		return true;

	if (!filter->dev_idx && !filter->master_idx && !family)
		return false;

	if (nh->is_group)
		return true;

	nhi = rtnl_dereference(nh->nh_info);
	if (family && nhi->family != family)
		return true;

	dev = nhi->fib_nhc.nhc_dev;
	if (filter->dev_idx && (!dev || dev->ifindex != filter->dev_idx))
		return true;

	if (filter->master_idx) {
		struct net_device *master;

		if (!dev)
			return true;

		master = netdev_master_upper_dev_get((struct net_device *)dev);
		if (!master || master->ifindex != filter->master_idx)
			return true;
	}

	return false;
}
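
/* For illustration: the filter is intentionally conservative for groups.
 * Any device, master or family constraint filters a group out, since those
 * attributes exist only on the member nexthops, not on the group itself.
 */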

static int __nh_valid_dump_req(const struct nlmsghdr *nlh, struct nlattr **tb,
			       struct nh_dump_filter *filter,
			       struct netlink_ext_ack *extack)
{
	struct nhmsg *nhm;
	u32 idx;

	if (tb[NHA_OIF]) {
		idx = nla_get_u32(tb[NHA_OIF]);
		if (idx > INT_MAX) {
			NL_SET_ERR_MSG(extack, "Invalid device index");
			return -EINVAL;
		}
		filter->dev_idx = idx;
	}
	if (tb[NHA_MASTER]) {
		idx = nla_get_u32(tb[NHA_MASTER]);
		if (idx > INT_MAX) {
			NL_SET_ERR_MSG(extack, "Invalid master device index");
			return -EINVAL;
		}
		filter->master_idx = idx;
	}
	filter->group_filter = nla_get_flag(tb[NHA_GROUPS]);
	filter->fdb_filter = nla_get_flag(tb[NHA_FDB]);

	nhm = nlmsg_data(nlh);
	if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) {
		NL_SET_ERR_MSG(extack, "Invalid values in header for nexthop dump request");
		return -EINVAL;
	}

	return 0;
}

static int nh_valid_dump_req(const struct nlmsghdr *nlh,
			     struct nh_dump_filter *filter,
			     struct netlink_callback *cb)
{
	struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump)];
	int err;

	err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
			  ARRAY_SIZE(rtm_nh_policy_dump) - 1,
			  rtm_nh_policy_dump, cb->extack);
	if (err < 0)
		return err;

	return __nh_valid_dump_req(nlh, tb, filter, cb->extack);
}

struct rtm_dump_nh_ctx {
	u32 idx;
};

static struct rtm_dump_nh_ctx *
rtm_dump_nh_ctx(struct netlink_callback *cb)
{
	struct rtm_dump_nh_ctx *ctx = (void *)cb->ctx;

	BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
	return ctx;
}
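
/* For illustration: cb->ctx is a small per-dump scratch area (48 bytes at
 * the time of writing) that netlink preserves across dump callbacks; the
 * BUILD_BUG_ON() above proves at compile time that the resume cursor still
 * fits in it.
 */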

static int rtm_dump_walk_nexthops(struct sk_buff *skb,
				  struct netlink_callback *cb,
				  struct rb_root *root,
				  struct rtm_dump_nh_ctx *ctx,
				  int (*nh_cb)(struct sk_buff *skb,
					       struct netlink_callback *cb,
					       struct nexthop *nh, void *data),
				  void *data)
{
	struct rb_node *node;
	int s_idx;
	int err;

	s_idx = ctx->idx;
	for (node = rb_first(root); node; node = rb_next(node)) {
		struct nexthop *nh;

		nh = rb_entry(node, struct nexthop, rb_node);
		if (nh->id < s_idx)
			continue;

		ctx->idx = nh->id;
		err = nh_cb(skb, cb, nh, data);
		if (err)
			return err;
	}

	ctx->idx++;
	return 0;
}

static int rtm_dump_nexthop_cb(struct sk_buff *skb, struct netlink_callback *cb,
			       struct nexthop *nh, void *data)
{
	struct nhmsg *nhm = nlmsg_data(cb->nlh);
	struct nh_dump_filter *filter = data;

	if (nh_dump_filtered(nh, filter, nhm->nh_family))
		return 0;

	return nh_fill_node(skb, nh, RTM_NEWNEXTHOP,
			    NETLINK_CB(cb->skb).portid,
			    cb->nlh->nlmsg_seq, NLM_F_MULTI);
}

/* rtnl */
static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtm_dump_nh_ctx *ctx = rtm_dump_nh_ctx(cb);
	struct net *net = sock_net(skb->sk);
	struct rb_root *root = &net->nexthop.rb_root;
	struct nh_dump_filter filter = {};
	int err;

	err = nh_valid_dump_req(cb->nlh, &filter, cb);
	if (err < 0)
		return err;

	err = rtm_dump_walk_nexthops(skb, cb, root, ctx,
				     &rtm_dump_nexthop_cb, &filter);
	if (err < 0) {
		if (likely(skb->len))
			goto out;
		goto out_err;
	}

out:
	err = skb->len;
out_err:
	cb->seq = net->nexthop.seq;
	nl_dump_check_consistent(cb, nlmsg_hdr(skb));
	return err;
}

static struct nexthop *
nexthop_find_group_resilient(struct net *net, u32 id,
			     struct netlink_ext_ack *extack)
{
	struct nh_group *nhg;
	struct nexthop *nh;

	nh = nexthop_find_by_id(net, id);
	if (!nh)
		return ERR_PTR(-ENOENT);

	if (!nh->is_group) {
		NL_SET_ERR_MSG(extack, "Not a nexthop group");
		return ERR_PTR(-EINVAL);
	}

	nhg = rtnl_dereference(nh->nh_grp);
	if (!nhg->resilient) {
		NL_SET_ERR_MSG(extack, "Nexthop group not of type resilient");
		return ERR_PTR(-EINVAL);
	}

	return nh;
}

static int nh_valid_dump_nhid(struct nlattr *attr, u32 *nh_id_p,
			      struct netlink_ext_ack *extack)
{
	u32 idx;

	if (attr) {
		idx = nla_get_u32(attr);
		if (!idx) {
			NL_SET_ERR_MSG(extack, "Invalid nexthop id");
			return -EINVAL;
		}
	} else {
		idx = 0;
	}

	*nh_id_p = idx;

	return 0;
}

static int nh_valid_dump_bucket_req(const struct nlmsghdr *nlh,
				    struct nh_dump_filter *filter,
				    struct netlink_callback *cb)
{
	struct nlattr *res_tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_dump)];
	struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump_bucket)];
	int err;

	err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
			  ARRAY_SIZE(rtm_nh_policy_dump_bucket) - 1,
			  rtm_nh_policy_dump_bucket, NULL);
	if (err < 0)
		return err;

	err = nh_valid_dump_nhid(tb[NHA_ID], &filter->nh_id, cb->extack);
	if (err)
		return err;

	if (tb[NHA_RES_BUCKET]) {
		size_t max = ARRAY_SIZE(rtm_nh_res_bucket_policy_dump) - 1;

		err = nla_parse_nested(res_tb, max,
				       tb[NHA_RES_BUCKET],
				       rtm_nh_res_bucket_policy_dump,
				       cb->extack);
		if (err < 0)
			return err;

		err = nh_valid_dump_nhid(res_tb[NHA_RES_BUCKET_NH_ID],
					 &filter->res_bucket_nh_id,
					 cb->extack);
		if (err)
			return err;
	}

	return __nh_valid_dump_req(nlh, tb, filter, cb->extack);
}

struct rtm_dump_res_bucket_ctx {
	struct rtm_dump_nh_ctx nh;
	u16 bucket_index;
	u32 done_nh_idx; /* 1 + the index of the last fully processed NH. */
};

static struct rtm_dump_res_bucket_ctx *
rtm_dump_res_bucket_ctx(struct netlink_callback *cb)
{
	struct rtm_dump_res_bucket_ctx *ctx = (void *)cb->ctx;

	BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
	return ctx;
}

struct rtm_dump_nexthop_bucket_data {
	struct rtm_dump_res_bucket_ctx *ctx;
	struct nh_dump_filter filter;
};

static int rtm_dump_nexthop_bucket_nh(struct sk_buff *skb,
				      struct netlink_callback *cb,
				      struct nexthop *nh,
				      struct rtm_dump_nexthop_bucket_data *dd)
{
	u32 portid = NETLINK_CB(cb->skb).portid;
	struct nhmsg *nhm = nlmsg_data(cb->nlh);
	struct nh_res_table *res_table;
	struct nh_group *nhg;
	u16 bucket_index;
	int err;

	if (dd->ctx->nh.idx < dd->ctx->done_nh_idx)
		return 0;

	nhg = rtnl_dereference(nh->nh_grp);
	res_table = rtnl_dereference(nhg->res_table);
	for (bucket_index = dd->ctx->bucket_index;
	     bucket_index < res_table->num_nh_buckets;
	     bucket_index++) {
		struct nh_res_bucket *bucket;
		struct nh_grp_entry *nhge;

		bucket = &res_table->nh_buckets[bucket_index];
		nhge = rtnl_dereference(bucket->nh_entry);
		if (nh_dump_filtered(nhge->nh, &dd->filter, nhm->nh_family))
			continue;

		if (dd->filter.res_bucket_nh_id &&
		    dd->filter.res_bucket_nh_id != nhge->nh->id)
			continue;

		err = nh_fill_res_bucket(skb, nh, bucket, bucket_index,
					 RTM_NEWNEXTHOPBUCKET, portid,
					 cb->nlh->nlmsg_seq, NLM_F_MULTI,
					 cb->extack);
		if (err < 0) {
			if (likely(skb->len))
				goto out;
			goto out_err;
		}
	}

	dd->ctx->done_nh_idx = dd->ctx->nh.idx + 1;
	bucket_index = 0;

out:
	err = skb->len;
out_err:
	dd->ctx->bucket_index = bucket_index;
	return err;
}
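
/* For illustration: resuming a bucket dump needs a two-level cursor. The
 * embedded rtm_dump_nh_ctx tracks which group the walk stopped at, while
 * bucket_index and done_nh_idx above let a partially dumped group continue
 * from the exact bucket that overflowed the previous skb, without re-sending
 * buckets that already went out.
 */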

static int rtm_dump_nexthop_bucket_cb(struct sk_buff *skb,
				      struct netlink_callback *cb,
				      struct nexthop *nh, void *data)
{
	struct rtm_dump_nexthop_bucket_data *dd = data;
	struct nh_group *nhg;

	if (!nh->is_group)
		return 0;

	nhg = rtnl_dereference(nh->nh_grp);
	if (!nhg->resilient)
		return 0;

	return rtm_dump_nexthop_bucket_nh(skb, cb, nh, dd);
}

/* rtnl */
static int rtm_dump_nexthop_bucket(struct sk_buff *skb,
				   struct netlink_callback *cb)
{
	struct rtm_dump_res_bucket_ctx *ctx = rtm_dump_res_bucket_ctx(cb);
	struct rtm_dump_nexthop_bucket_data dd = { .ctx = ctx };
	struct net *net = sock_net(skb->sk);
	struct nexthop *nh;
	int err;

	err = nh_valid_dump_bucket_req(cb->nlh, &dd.filter, cb);
	if (err)
		return err;

	if (dd.filter.nh_id) {
		nh = nexthop_find_group_resilient(net, dd.filter.nh_id,
						  cb->extack);
		if (IS_ERR(nh))
			return PTR_ERR(nh);
		err = rtm_dump_nexthop_bucket_nh(skb, cb, nh, &dd);
	} else {
		struct rb_root *root = &net->nexthop.rb_root;

		err = rtm_dump_walk_nexthops(skb, cb, root, &ctx->nh,
					     &rtm_dump_nexthop_bucket_cb, &dd);
	}

	if (err < 0) {
		if (likely(skb->len))
			goto out;
		goto out_err;
	}

out:
	err = skb->len;
out_err:
	cb->seq = net->nexthop.seq;
	nl_dump_check_consistent(cb, nlmsg_hdr(skb));
	return err;
}

static int nh_valid_get_bucket_req_res_bucket(struct nlattr *res,
					      u16 *bucket_index,
					      struct netlink_ext_ack *extack)
{
	struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_get)];
	int err;

	err = nla_parse_nested(tb, ARRAY_SIZE(rtm_nh_res_bucket_policy_get) - 1,
			       res, rtm_nh_res_bucket_policy_get, extack);
	if (err < 0)
		return err;

	if (!tb[NHA_RES_BUCKET_INDEX]) {
		NL_SET_ERR_MSG(extack, "Bucket index is missing");
		return -EINVAL;
	}

	*bucket_index = nla_get_u16(tb[NHA_RES_BUCKET_INDEX]);
	return 0;
}

static int nh_valid_get_bucket_req(const struct nlmsghdr *nlh,
				   u32 *id, u16 *bucket_index,
				   struct netlink_ext_ack *extack)
{
	struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get_bucket)];
	int err;

	err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
			  ARRAY_SIZE(rtm_nh_policy_get_bucket) - 1,
			  rtm_nh_policy_get_bucket, extack);
	if (err < 0)
		return err;

	err = __nh_valid_get_del_req(nlh, tb, id, extack);
	if (err)
		return err;

	if (!tb[NHA_RES_BUCKET]) {
		NL_SET_ERR_MSG(extack, "Bucket information is missing");
		return -EINVAL;
	}

	return nh_valid_get_bucket_req_res_bucket(tb[NHA_RES_BUCKET],
						  bucket_index, extack);
}

/* rtnl */
static int rtm_get_nexthop_bucket(struct sk_buff *in_skb, struct nlmsghdr *nlh,
				  struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nh_res_table *res_table;
	struct sk_buff *skb = NULL;
	struct nh_group *nhg;
	struct nexthop *nh;
	u16 bucket_index;
	int err;
	u32 id;

	err = nh_valid_get_bucket_req(nlh, &id, &bucket_index, extack);
	if (err)
		return err;

	nh = nexthop_find_group_resilient(net, id, extack);
	if (IS_ERR(nh))
		return PTR_ERR(nh);

	nhg = rtnl_dereference(nh->nh_grp);
	res_table = rtnl_dereference(nhg->res_table);
	if (bucket_index >= res_table->num_nh_buckets) {
		NL_SET_ERR_MSG(extack, "Bucket index out of bounds");
		return -ENOENT;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	err = nh_fill_res_bucket(skb, nh, &res_table->nh_buckets[bucket_index],
				 bucket_index, RTM_NEWNEXTHOPBUCKET,
				 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
				 0, extack);
	if (err < 0) {
		WARN_ON(err == -EMSGSIZE);
		goto errout_free;
	}

	return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);

errout_free:
	kfree_skb(skb);
	return err;
}

static void nexthop_sync_mtu(struct net_device *dev, u32 orig_mtu)
{
	unsigned int hash = nh_dev_hashfn(dev->ifindex);
	struct net *net = dev_net(dev);
	struct hlist_head *head = &net->nexthop.devhash[hash];
	struct hlist_node *n;
	struct nh_info *nhi;

	hlist_for_each_entry_safe(nhi, n, head, dev_hash) {
		if (nhi->fib_nhc.nhc_dev == dev) {
			if (nhi->family == AF_INET)
				fib_nhc_update_mtu(&nhi->fib_nhc, dev->mtu,
						   orig_mtu);
		}
	}
}

static int nh_netdev_event(struct notifier_block *this,
			   unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct netdev_notifier_info_ext *info_ext;

	switch (event) {
	case NETDEV_DOWN:
	case NETDEV_UNREGISTER:
		nexthop_flush_dev(dev, event);
		break;
	case NETDEV_CHANGE:
		if (!(dev_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP)))
			nexthop_flush_dev(dev, event);
		break;
	case NETDEV_CHANGEMTU:
		info_ext = ptr;
		nexthop_sync_mtu(dev, info_ext->ext.mtu);
		rt_cache_flush(dev_net(dev));
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block nh_netdev_notifier = {
	.notifier_call = nh_netdev_event,
};

static int nexthops_dump(struct net *net, struct notifier_block *nb,
			 struct netlink_ext_ack *extack)
{
	struct rb_root *root = &net->nexthop.rb_root;
	struct rb_node *node;
	int err = 0;

	for (node = rb_first(root); node; node = rb_next(node)) {
		struct nexthop *nh;

		nh = rb_entry(node, struct nexthop, rb_node);
		err = call_nexthop_notifier(nb, net, NEXTHOP_EVENT_REPLACE, nh,
					    extack);
		if (err)
			break;
	}

	return err;
}

int register_nexthop_notifier(struct net *net, struct notifier_block *nb,
			      struct netlink_ext_ack *extack)
{
	int err;

	rtnl_lock();
	err = nexthops_dump(net, nb, extack);
	if (err)
		goto unlock;
	err = blocking_notifier_chain_register(&net->nexthop.notifier_chain,
					       nb);
unlock:
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(register_nexthop_notifier);
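
/* Usage sketch (hypothetical driver, for illustration):
 *
 *	static struct notifier_block foo_nexthop_nb = {
 *		.notifier_call = foo_nexthop_event,
 *	};
 *
 *	err = register_nexthop_notifier(net, &foo_nexthop_nb, extack);
 *
 * The nexthops_dump() replay above means the driver sees one
 * NEXTHOP_EVENT_REPLACE per pre-existing nexthop, so late registration
 * observes the same state as registration from the start.
 */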

int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb)
{
	return blocking_notifier_chain_unregister(&net->nexthop.notifier_chain,
						  nb);
}
EXPORT_SYMBOL(unregister_nexthop_notifier);

void nexthop_set_hw_flags(struct net *net, u32 id, bool offload, bool trap)
{
	struct nexthop *nexthop;

	rcu_read_lock();

	nexthop = nexthop_find_by_id(net, id);
	if (!nexthop)
		goto out;

	nexthop->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP);
	if (offload)
		nexthop->nh_flags |= RTNH_F_OFFLOAD;
	if (trap)
		nexthop->nh_flags |= RTNH_F_TRAP;

out:
	rcu_read_unlock();
}
EXPORT_SYMBOL(nexthop_set_hw_flags);
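
/* For illustration: this helper takes rcu_read_lock() rather than asserting
 * RTNL, so drivers may report offload or trap state asynchronously (e.g.
 * from delayed work); the bits then surface to userspace as the offload and
 * trap flags in RTM_NEWNEXTHOP dumps.
 */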

void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index,
				 bool offload, bool trap)
{
	struct nh_res_table *res_table;
	struct nh_res_bucket *bucket;
	struct nexthop *nexthop;
	struct nh_group *nhg;

	rcu_read_lock();

	nexthop = nexthop_find_by_id(net, id);
	if (!nexthop || !nexthop->is_group)
		goto out;

	nhg = rcu_dereference(nexthop->nh_grp);
	if (!nhg->resilient)
		goto out;

	if (bucket_index >= nhg->res_table->num_nh_buckets)
		goto out;

	res_table = rcu_dereference(nhg->res_table);
	bucket = &res_table->nh_buckets[bucket_index];
	bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP);
	if (offload)
		bucket->nh_flags |= RTNH_F_OFFLOAD;
	if (trap)
		bucket->nh_flags |= RTNH_F_TRAP;

out:
	rcu_read_unlock();
}
EXPORT_SYMBOL(nexthop_bucket_set_hw_flags);

void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets,
				     unsigned long *activity)
{
	struct nh_res_table *res_table;
	struct nexthop *nexthop;
	struct nh_group *nhg;
	u16 i;

	rcu_read_lock();

	nexthop = nexthop_find_by_id(net, id);
	if (!nexthop || !nexthop->is_group)
		goto out;

	nhg = rcu_dereference(nexthop->nh_grp);
	if (!nhg->resilient)
		goto out;

	/* Instead of silently ignoring some buckets, demand that the sizes
	 * be the same.
	 */
	res_table = rcu_dereference(nhg->res_table);
	if (num_buckets != res_table->num_nh_buckets)
		goto out;

	for (i = 0; i < num_buckets; i++) {
		if (test_bit(i, activity))
			nh_res_bucket_set_busy(&res_table->nh_buckets[i]);
	}

out:
	rcu_read_unlock();
}
EXPORT_SYMBOL(nexthop_res_grp_activity_update);
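
/* For illustration: a driver keeping hardware activity counters can pass a
 * bitmap with one bit per bucket, e.g. (hypothetical values)
 *
 *	DECLARE_BITMAP(activity, 256);
 *	// set bits for buckets that forwarded traffic since the last report
 *	nexthop_res_grp_activity_update(net, grp_id, 256, activity);
 *
 * so busy buckets are not reassigned at the next upkeep of the resilient
 * group; a num_buckets mismatch drops the whole report.
 */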

static void __net_exit nexthop_net_exit(struct net *net)
{
	rtnl_lock();
	flush_all_nexthops(net);
	rtnl_unlock();
	kfree(net->nexthop.devhash);
}

static int __net_init nexthop_net_init(struct net *net)
{
	size_t sz = sizeof(struct hlist_head) * NH_DEV_HASHSIZE;

	net->nexthop.rb_root = RB_ROOT;
	net->nexthop.devhash = kzalloc(sz, GFP_KERNEL);
	if (!net->nexthop.devhash)
		return -ENOMEM;
	BLOCKING_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain);

	return 0;
}

static struct pernet_operations nexthop_net_ops = {
	.init = nexthop_net_init,
	.exit = nexthop_net_exit,
};

static int __init nexthop_init(void)
{
	register_pernet_subsys(&nexthop_net_ops);

	register_netdevice_notifier(&nh_netdev_notifier);

	rtnl_register(PF_UNSPEC, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELNEXTHOP, rtm_del_nexthop, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETNEXTHOP, rtm_get_nexthop,
		      rtm_dump_nexthop, 0);

	rtnl_register(PF_INET, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
	rtnl_register(PF_INET, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0);

	rtnl_register(PF_INET6, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
	rtnl_register(PF_INET6, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0);

	rtnl_register(PF_UNSPEC, RTM_GETNEXTHOPBUCKET, rtm_get_nexthop_bucket,
		      rtm_dump_nexthop_bucket, 0);

	return 0;
}
subsys_initcall(nexthop_init);