1 /* Copyright (c) 2008, 2009 Nicira Networks
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at:
7 * http://www.apache.org/licenses/LICENSE-2.0
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
24 #include <linux/genetlink.h>
25 #include <linux/rtnetlink.h>
29 #include <sys/types.h>
35 #include "command-line.h"
40 #include "fatal-signal.h"
42 #include "leak-checker.h"
46 #include "openvswitch/brcompat-netlink.h"
47 #include "poll-loop.h"
56 #define THIS_MODULE VLM_brcompatd
59 /* XXX: brcompatd just hangs if the datapath module is removed and reinserted (rmmod/insmod). Should it learn to reconnect? */
61 /* Actions to modify bridge compatibility configuration. */
69 static void parse_options(int argc
, char *argv
[]);
70 static void usage(void) NO_RETURN
;
72 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(5, 60);
74 /* Maximum number of milliseconds to wait for the config file to be
75 * unlocked. If set to zero, no waiting will occur. */
76 static int lock_timeout
= 500;
78 /* Maximum number of milliseconds to wait before pruning port entries that
79 * no longer exist. If set to zero, ports are never pruned. */
80 static int prune_timeout
= 5000;
82 /* Config file shared with ovs-vswitchd (usually ovs-vswitchd.conf). */
83 static char *config_file
;
85 /* Command to run (via system()) to reload the ovs-vswitchd configuration. */
87 static char *reload_command
;
89 /* Netlink socket to listen for interface changes. */
90 static struct nl_sock
*rtnl_sock
;
92 /* Netlink socket to bridge compatibility kernel module. */
93 static struct nl_sock
*brc_sock
;
95 /* The Generic Netlink family number used for bridge compatibility. */
96 static int brc_family
;
/* Attribute policy for the reply to a BRC_GENL_C_QUERY_MC request: the
 * BRC_GENL_A_MC_GROUP attribute is declared as a u32.  It is used by
 * lookup_brc_multicast_group() when parsing that reply. */
98 static const struct nl_policy brc_multicast_policy
[] = {
99 [BRC_GENL_A_MC_GROUP
] = {.type
= NL_A_U32
}
/* Attribute policy for rtnetlink link messages, used by rtnl_recv_update():
 * IFLA_IFNAME is a required string and IFLA_MASTER is an optional u32. */
102 static const struct nl_policy rtnlgrp_link_policy
[] = {
103 [IFLA_IFNAME
] = { .type
= NL_A_STRING
, .optional
= false },
104 [IFLA_MASTER
] = { .type
= NL_A_U32
, .optional
= true },
/* Asks the brcompat kernel module which Netlink multicast group it uses and
 * stores the answer in '*multicast_group'.
 *
 * A temporary NETLINK_GENERIC socket is created, a BRC_GENL_C_QUERY_MC request
 * addressed to 'brc_family' is sent over it, and the reply is parsed with
 * 'brc_multicast_policy' to extract the u32 BRC_GENL_A_MC_GROUP attribute.
 * 'brc_family' is read here, so it has to be set first; brc_open() looks it
 * up before calling this function.
 *
 * NOTE(review): the return type, the declaration of 'retval' and the tests on
 * it are not visible in this listing; presumably this returns 0 on success and
 * a positive errno value otherwise -- confirm against the full file. */
108 lookup_brc_multicast_group(int *multicast_group
)
110 struct nl_sock
*sock
;
111 struct ofpbuf request
, *reply
;
112 struct nlattr
*attrs
[ARRAY_SIZE(brc_multicast_policy
)];
/* Socket used only for this one query; it is destroyed again below. */
115 retval
= nl_sock_create(NETLINK_GENERIC
, 0, 0, 0, &sock
);
/* Build the query, send it, and release the request buffer. */
119 ofpbuf_init(&request
, 0);
120 nl_msg_put_genlmsghdr(&request
, sock
, 0, brc_family
,
121 NLM_F_REQUEST
, BRC_GENL_C_QUERY_MC
, 1);
122 retval
= nl_sock_transact(sock
, &request
, &reply
);
123 ofpbuf_uninit(&request
);
/* NOTE(review): presumably the cleanup for a failed transaction; the test
 * that guards this call is not visible here. */
125 nl_sock_destroy(sock
);
128 if (!nl_policy_parse(reply
, NLMSG_HDRLEN
+ GENL_HDRLEN
,
129 brc_multicast_policy
, attrs
,
130 ARRAY_SIZE(brc_multicast_policy
))) {
/* The reply did not match the policy: release the socket and the reply. */
131 nl_sock_destroy(sock
);
132 ofpbuf_delete(reply
);
/* Reply parsed: hand the group number to the caller, then clean up. */
135 *multicast_group
= nl_attr_get_u32(attrs
[BRC_GENL_A_MC_GROUP
]);
136 nl_sock_destroy(sock
);
137 ofpbuf_delete(reply
);
142 /* Opens a socket for brcompat notifications. Returns 0 if successful,
143 * otherwise a positive errno value. */
/* In order of appearance below: nl_lookup_genl_family() resolves
 * BRC_GENL_FAMILY_NAME into the file-level 'brc_family',
 * lookup_brc_multicast_group() obtains the brcompat multicast group, and
 * nl_sock_create() is called with that group to produce '*sock'.
 *
 * NOTE(review): the tests on 'retval' between these steps are not visible in
 * this listing. */
145 brc_open(struct nl_sock
**sock
)
147 int multicast_group
= 0;
150 retval
= nl_lookup_genl_family(BRC_GENL_FAMILY_NAME
, &brc_family
);
155 retval
= lookup_brc_multicast_group(&multicast_group
);
160 retval
= nl_sock_create(NETLINK_GENERIC
, multicast_group
, 0, 0, sock
);
/* Attribute policy declaring BRC_GENL_A_DP_NAME as a string.
 * NOTE(review): no use of this table is visible in this listing;
 * parse_command() defines its own local policy. */
168 static const struct nl_policy brc_dp_policy
[] = {
169 [BRC_GENL_A_DP_NAME
] = { .type
= NL_A_STRING
},
/* Tests whether the configuration contains a section named "bridge.<name>".
 * The result of cfg_has_section() is returned unchanged. */
173 bridge_exists(const char *name
)
175 return cfg_has_section("bridge.%s", name
);
/* Writes out a modified configuration and asks ovs-vswitchd to reload it.
 *
 * When cfg_is_dirty() reports pending changes, the configuration is written
 * with cfg_write(), read back with cfg_read(), and 'reload_command' is run
 * through system().  The three calls are consecutive initializers, so the
 * reload command is run even if writing or reading failed.  The time taken by
 * the command is logged.
 *
 * The visible return statement yields the cfg_write() error if nonzero, else
 * the cfg_read() error if nonzero, else ECHILD if system() returned anything
 * other than 0, else 0.
 *
 * NOTE(review): the return type and the value returned when the configuration
 * is not dirty are not visible in this listing (presumably 0). */
179 rewrite_and_reload_config(void)
181 if (cfg_is_dirty()) {
182 int error1
= cfg_write();
183 int error2
= cfg_read();
184 long long int reload_start
= time_msec();
185 int error3
= system(reload_command
);
186 long long int elapsed
= time_msec() - reload_start
;
187 COVERAGE_INC(brcompatd_reload
);
189 VLOG_INFO("reload command executed in %lld ms", elapsed
);
/* NOTE(review): presumably reached when system() itself failed, since errno
 * is reported; the guarding test is not visible here. */
192 VLOG_ERR("failed to execute reload command: %s", strerror(errno
));
193 } else if (error3
!= 0) {
/* The command ran but reported a nonzero status; log a readable form. */
194 char *msg
= process_status_msg(error3
);
195 VLOG_ERR("reload command exited with error (%s)", msg
);
/* First failure wins: write error, then read error, then ECHILD. */
198 return error1
? error1
: error2
? error2
: error3
? ECHILD
: 0;
203 /* Go through the configuration file and remove any ports that no longer
204 * exist associated with a bridge. */
/* For every "bridge" subsection the code below collects the bridge's ports,
 * replacing a port that has a "bonding.<port>" section by the keys of
 * "bonding.<port>.slave".  Each resulting interface is checked with
 * netdev_nodev_get_flags(); those reported as ENODEV are queued in 'delete'.
 * Queued names are then removed from all "bridge.*.port" and
 * "bonding.*.slave" entries and rewrite_and_reload_config() is called.
 *
 * The configuration lock is requested with a timeout argument of 0.
 *
 * NOTE(review): the function header, the declarations of 'i', 'j', 'error'
 * and 'slaves', the svec initialization and the unlock call are not visible
 * in this listing. */
210 struct svec bridges
, delete;
212 if (cfg_lock(NULL
, 0)) {
213 /* Couldn't lock config file. */
219 cfg_get_subsections(&bridges
, "bridge");
220 for (i
=0; i
<bridges
.n
; i
++) {
221 const char *br_name
= bridges
.names
[i
];
222 struct svec ports
, ifaces
;
226 /* Get all the interfaces for the given bridge, breaking bonded
227 * interfaces down into their constituent parts. */
229 cfg_get_all_keys(&ports
, "bridge.%s.port", br_name
);
230 for (j
=0; j
<ports
.n
; j
++) {
231 const char *port_name
= ports
.names
[j
];
232 if (cfg_has_section("bonding.%s", port_name
)) {
/* Bonded port: its slaves stand in for the port itself. */
235 cfg_get_all_keys(&slaves
, "bonding.%s.slave", port_name
);
236 svec_append(&ifaces
, &slaves
);
237 svec_destroy(&slaves
);
/* NOTE(review): presumably the non-bonded branch; the "else" line is not
 * visible here. */
239 svec_add(&ifaces
, port_name
);
242 svec_destroy(&ports
);
244 /* Check that the interfaces exist. */
245 for (j
= 0; j
< ifaces
.n
; j
++) {
246 const char *iface_name
= ifaces
.names
[j
];
247 enum netdev_flags flags
;
249 /* The local port and internal ports are created and destroyed by
250 * ovs-vswitchd itself, so don't bother checking for them at all.
251 * In practice, they might not exist if ovs-vswitchd hasn't
252 * finished reloading since the configuration file was updated. */
253 if (!strcmp(iface_name
, br_name
)
254 || cfg_get_bool(0, "iface.%s.internal", iface_name
)) {
258 error
= netdev_nodev_get_flags(iface_name
, &flags
);
259 if (error
== ENODEV
) {
/* Only ENODEV queues the interface for removal. */
260 VLOG_INFO_RL(&rl
, "removing dead interface %s from %s",
261 iface_name
, br_name
);
262 svec_add(&delete, iface_name
);
/* NOTE(review): presumably reached for other nonzero errors, which are only
 * logged; the guarding test is not visible here. */
264 VLOG_INFO_RL(&rl
, "unknown error %d on interface %s from %s",
265 error
, iface_name
, br_name
);
268 svec_destroy(&ifaces
);
270 svec_destroy(&bridges
);
/* Drop every queued interface from all bridges and bonds, then reload. */
275 for (i
= 0; i
< delete.n
; i
++) {
276 cfg_del_match("bridge.*.port=%s", delete.names
[i
]);
277 cfg_del_match("bonding.*.slave=%s", delete.names
[i
]);
279 rewrite_and_reload_config();
284 svec_destroy(&delete);
288 /* Checks whether a network device named 'name' exists and returns true if so.
291 * XXX it is possible that this doesn't entirely accomplish what we want in
292 * context, since ovs-vswitchd.conf may cause vswitchd to create or destroy
293 * network devices based on iface.*.internal settings.
295 * XXX may want to move this to lib/netdev.
297 * XXX why not just use netdev_nodev_get_flags() or similar function? */
/* The check is made by calling stat() on "/sys/class/net/<name>".
 * NOTE(review): the declarations, the interpretation of 'error' and the
 * release of 'filename' are not visible in this listing. */
299 netdev_exists(const char *name
)
305 filename
= xasprintf("/sys/class/net/%s", name
);
306 error
= stat(filename
, &s
);
/* Handles an "addbr" request for 'br_name'.
 *
 * A warning is logged if a bridge of that name is already configured, or if a
 * network device of that name exists (with a distinct message when
 * "iface.<br_name>.fake-bridge" is set).  Otherwise the entry
 * "bridge.<br_name>.port=<br_name>" is added and success is logged.
 *
 * NOTE(review): the return type and the values returned on each path are not
 * visible in this listing; handle_bridge_cmd() stores the result in 'error'. */
312 add_bridge(const char *br_name
)
314 if (bridge_exists(br_name
)) {
315 VLOG_WARN("addbr %s: bridge %s exists", br_name
, br_name
);
317 } else if (netdev_exists(br_name
)) {
318 if (cfg_get_bool(0, "iface.%s.fake-bridge", br_name
)) {
319 VLOG_WARN("addbr %s: %s exists as a fake bridge",
323 VLOG_WARN("addbr %s: cannot create bridge %s because a network "
324 "device named %s already exists",
325 br_name
, br_name
, br_name
);
/* The new bridge gets a port named after the bridge itself. */
330 cfg_add_entry("bridge.%s.port=%s", br_name
, br_name
);
331 VLOG_INFO("addbr %s: success", br_name
);
/* Handles a "delbr" request for 'br_name'.
 *
 * A warning is logged if no bridge of that name is configured.  Otherwise the
 * whole "bridge.<br_name>" section is deleted and success is logged.
 *
 * NOTE(review): the return type and the values returned on each path are not
 * visible in this listing; handle_bridge_cmd() stores the result in 'error'. */
337 del_bridge(const char *br_name
)
339 if (!bridge_exists(br_name
)) {
340 VLOG_WARN("delbr %s: no bridge named %s", br_name
, br_name
);
344 cfg_del_section("bridge.%s", br_name
);
345 VLOG_INFO("delbr %s: success", br_name
);
/* Extracts the sequence number and names from the brcompat request in
 * 'buffer'.
 *
 * The attributes that follow the Netlink and Generic Netlink headers are
 * parsed with a local policy: BRC_GENL_A_DP_NAME is a required string and
 * BRC_GENL_A_PORT_NAME an optional string.  The failure branch is taken when
 * parsing fails, or when the caller passed a non-null 'port_name' but the
 * message carries no BRC_GENL_A_PORT_NAME attribute.
 *
 * Otherwise '*seq' receives the message's nlmsg_seq, '*br_name' the datapath
 * name and '*port_name' the port name.  handle_bridge_cmd() passes NULL for
 * 'port_name'.
 *
 * NOTE(review): the return values and the guard around the store to
 * '*port_name' are not visible in this listing.  The returned strings
 * presumably point into 'buffer' -- confirm before using them after the
 * buffer is deleted. */
351 parse_command(struct ofpbuf
*buffer
, uint32_t *seq
, const char **br_name
,
352 const char **port_name
)
354 static const struct nl_policy policy
[] = {
355 [BRC_GENL_A_DP_NAME
] = { .type
= NL_A_STRING
},
356 [BRC_GENL_A_PORT_NAME
] = { .type
= NL_A_STRING
, .optional
= true },
358 struct nlattr
*attrs
[ARRAY_SIZE(policy
)];
360 if (!nl_policy_parse(buffer
, NLMSG_HDRLEN
+ GENL_HDRLEN
, policy
,
361 attrs
, ARRAY_SIZE(policy
))
362 || (port_name
&& !attrs
[BRC_GENL_A_PORT_NAME
])) {
/* The sequence number is read directly from the Netlink header at the start
 * of the buffer's data. */
366 *seq
= ((struct nlmsghdr
*) buffer
->data
)->nlmsg_seq
;
367 *br_name
= nl_attr_get_string(attrs
[BRC_GENL_A_DP_NAME
]);
369 *port_name
= nl_attr_get_string(attrs
[BRC_GENL_A_PORT_NAME
]);
/* Reports the outcome of a brcompat request back over 'brc_sock'.
 *
 * A BRC_GENL_C_DP_RESULT Generic Netlink message for 'brc_family' is built,
 * its nlmsg_seq is overwritten with 'seq' (the sequence number of the request
 * being answered), and 'error' is attached as the u32 attribute
 * BRC_GENL_A_ERR_CODE.  The message is then sent with nl_sock_send().
 *
 * NOTE(review): the declarations, the test on 'retval' that guards the
 * warning, and the release of 'msg' are not visible in this listing. */
375 send_reply(uint32_t seq
, int error
)
381 ofpbuf_init(&msg
, 0);
382 nl_msg_put_genlmsghdr(&msg
, brc_sock
, 32, brc_family
, NLM_F_REQUEST
,
383 BRC_GENL_C_DP_RESULT
, 1);
/* Replace the header's sequence number with that of the request. */
384 ((struct nlmsghdr
*) msg
.data
)->nlmsg_seq
= seq
;
385 nl_msg_put_u32(&msg
, BRC_GENL_A_ERR_CODE
, error
);
388 retval
= nl_sock_send(brc_sock
, &msg
, false);
390 VLOG_WARN_RL(&rl
, "replying to brcompat request: %s",
/* Processes a bridge add or delete request held in 'buffer'.
 *
 * The request is parsed without asking for a port name, add_bridge() or
 * del_bridge() is called depending on 'add', the configuration is rewritten
 * and reloaded, and the resulting 'error' is sent back with the request's
 * sequence number.
 *
 * NOTE(review): the tests on 'error' that chain these steps, and the return
 * statement, are not visible in this listing. */
397 handle_bridge_cmd(struct ofpbuf
*buffer
, bool add
)
403 error
= parse_command(buffer
, &seq
, &br_name
, NULL
);
405 error
= add
? add_bridge(br_name
) : del_bridge(br_name
);
407 error
= rewrite_and_reload_config();
409 send_reply(seq
, error
);
/* Attribute policy declaring BRC_GENL_A_DP_NAME and BRC_GENL_A_PORT_NAME as
 * strings.
 * NOTE(review): no use of this table is visible in this listing;
 * parse_command() defines its own local policy. */
414 static const struct nl_policy brc_port_policy
[] = {
415 [BRC_GENL_A_DP_NAME
] = { .type
= NL_A_STRING
},
416 [BRC_GENL_A_PORT_NAME
] = { .type
= NL_A_STRING
},
/* Removes 'port_name' from the configuration: the entry
 * "bridge.<br_name>.port=<port_name>", every "bonding.*.slave=<port_name>"
 * match, and everything matching "vlan.<port_name>.*".  The configuration is
 * only edited here; callers follow up with rewrite_and_reload_config(). */
420 del_port(const char *br_name
, const char *port_name
)
422 cfg_del_entry("bridge.%s.port=%s", br_name
, port_name
);
423 cfg_del_match("bonding.*.slave=%s", port_name
);
424 cfg_del_match("vlan.%s.*", port_name
);
/* Processes a port add ("add-if") or delete ("del-if") request held in
 * 'buffer'.
 *
 * The bridge and port names are parsed from the request.  A warning is logged
 * if the bridge is not configured or if no network device named 'port_name'
 * exists.  Otherwise the port is added to the bridge with cfg_add_entry() or
 * removed with del_port(), success is logged, and the configuration is
 * rewritten and reloaded.  The resulting 'error' is sent back with the
 * request's sequence number.
 *
 * NOTE(review): the error values assigned on the warning paths, the test that
 * selects between adding and deleting, and the return statement are not
 * visible in this listing. */
428 handle_port_cmd(struct ofpbuf
*buffer
, bool add
)
430 const char *cmd_name
= add
? "add-if" : "del-if";
431 const char *br_name
, *port_name
;
435 error
= parse_command(buffer
, &seq
, &br_name
, &port_name
);
437 if (!bridge_exists(br_name
)) {
438 VLOG_WARN("%s %s %s: no bridge named %s",
439 cmd_name
, br_name
, port_name
, br_name
);
441 } else if (!netdev_exists(port_name
)) {
442 VLOG_WARN("%s %s %s: no network device named %s",
443 cmd_name
, br_name
, port_name
, port_name
);
447 cfg_add_entry("bridge.%s.port=%s", br_name
, port_name
);
449 del_port(br_name
, port_name
);
451 VLOG_INFO("%s %s %s: success", cmd_name
, br_name
, port_name
);
452 error
= rewrite_and_reload_config();
454 send_reply(seq
, error
);
/* Receives a request from the brcompat kernel module on 'brc_sock' and
 * dispatches it.
 *
 * Messages are read with nl_sock_recv() in a loop.  A receive error other
 * than EAGAIN is logged.  The message must carry a Generic Netlink header and
 * its nlmsg_type must equal 'brc_family'.  The configuration is then locked
 * with 'lock_timeout' and the Generic Netlink command selects the handler:
 * BRC_GENL_C_DP_ADD and BRC_GENL_C_DP_DEL go to handle_bridge_cmd(),
 * BRC_GENL_C_PORT_ADD and BRC_GENL_C_PORT_DEL go to handle_port_cmd().  The
 * received buffer is deleted at the end.
 *
 * NOTE(review): part of the loop condition, the early exits, the default
 * case and the unlock call are not visible in this listing. */
461 brc_recv_update(void)
464 struct ofpbuf
*buffer
;
465 struct genlmsghdr
*genlmsghdr
;
/* Each pass deletes the previously held buffer before receiving again.
 * NOTE(review): a line of the loop condition is missing from this listing;
 * the visible parts test for ENOBUFS and for Netlink error or NLMSG_DONE
 * messages. */
470 ofpbuf_delete(buffer
);
471 retval
= nl_sock_recv(brc_sock
, &buffer
, false);
472 } while (retval
== ENOBUFS
474 && (nl_msg_nlmsgerr(buffer
, NULL
)
475 || nl_msg_nlmsghdr(buffer
)->nlmsg_type
== NLMSG_DONE
)));
477 if (retval
!= EAGAIN
) {
478 VLOG_WARN_RL(&rl
, "brc_recv_update: %s", strerror(retval
));
483 genlmsghdr
= nl_msg_genlmsghdr(buffer
);
485 VLOG_WARN_RL(&rl
, "received packet too short for generic NetLink");
/* Ignore messages that do not belong to the brcompat family. */
489 if (nl_msg_nlmsghdr(buffer
)->nlmsg_type
!= brc_family
) {
490 VLOG_DBG_RL(&rl
, "received type (%"PRIu16
") != brcompat family (%d)",
491 nl_msg_nlmsghdr(buffer
)->nlmsg_type
, brc_family
);
495 if (cfg_lock(NULL
, lock_timeout
)) {
496 /* Couldn't lock config file. */
501 switch (genlmsghdr
->cmd
) {
502 case BRC_GENL_C_DP_ADD
:
503 retval
= handle_bridge_cmd(buffer
, true);
506 case BRC_GENL_C_DP_DEL
:
507 retval
= handle_bridge_cmd(buffer
, false);
510 case BRC_GENL_C_PORT_ADD
:
511 retval
= handle_port_cmd(buffer
, true);
514 case BRC_GENL_C_PORT_DEL
:
515 retval
= handle_port_cmd(buffer
, false);
525 ofpbuf_delete(buffer
);
529 /* Check for interface configuration changes announced through RTNL. */
/* One message is read from 'rtnl_sock'.  ENOBUFS and other receive errors are
 * logged.  A received message must contain an ifinfomsg after the Netlink
 * header and must parse against 'rtnlgrp_link_policy'.
 *
 * Only RTM_DELLINK messages that carry IFLA_MASTER are acted upon.  The
 * master's interface index is converted to a bridge name with
 * if_indextoname(), and the configuration is locked with 'lock_timeout'.  If
 * netdev_nodev_get_flags() reports ENODEV for the port and the port is listed
 * under "bridge.<br_name>.port", it is removed with del_port() and the
 * configuration is rewritten and reloaded.  If a device of that name still
 * exists, only a message is logged; the long comment below explains why.
 *
 * NOTE(review): the declarations of 'buf' and 'ports', the early exits, the
 * unlock call and the release of 'buf' and 'ports' are not visible in this
 * listing. */
531 rtnl_recv_update(void)
535 int error
= nl_sock_recv(rtnl_sock
, &buf
, false);
536 if (error
== EAGAIN
) {
538 } else if (error
== ENOBUFS
) {
539 VLOG_WARN_RL(&rl
, "network monitor socket overflowed");
541 VLOG_WARN_RL(&rl
, "error on network monitor socket: %s",
544 struct nlattr
*attrs
[ARRAY_SIZE(rtnlgrp_link_policy
)];
545 struct nlmsghdr
*nlh
;
546 struct ifinfomsg
*iim
;
/* Locate the Netlink header and the ifinfomsg that follows it. */
548 nlh
= ofpbuf_at(buf
, 0, NLMSG_HDRLEN
);
549 iim
= ofpbuf_at(buf
, NLMSG_HDRLEN
, sizeof *iim
);
551 VLOG_WARN_RL(&rl
, "received bad rtnl message (no ifinfomsg)");
556 if (!nl_policy_parse(buf
, NLMSG_HDRLEN
+ sizeof(struct ifinfomsg
),
558 attrs
, ARRAY_SIZE(rtnlgrp_link_policy
))) {
559 VLOG_WARN_RL(&rl
,"received bad rtnl message (policy)");
/* Only a deleted link that had a master device is of interest. */
563 if (nlh
->nlmsg_type
== RTM_DELLINK
&& attrs
[IFLA_MASTER
]) {
564 const char *port_name
= nl_attr_get_string(attrs
[IFLA_IFNAME
]);
565 char br_name
[IFNAMSIZ
];
566 uint32_t br_idx
= nl_attr_get_u32(attrs
[IFLA_MASTER
]);
568 enum netdev_flags flags
;
570 if (!if_indextoname(br_idx
, br_name
)) {
575 if (cfg_lock(NULL
, lock_timeout
)) {
576 /* Couldn't lock config file. */
577 /* xxx this should try again and print error msg. */
582 if (netdev_nodev_get_flags(port_name
, &flags
) == ENODEV
) {
583 /* Network device is really gone. */
584 VLOG_INFO("network device %s destroyed, "
585 "removing from bridge %s", port_name
, br_name
);
587 cfg_get_all_keys(&ports
, "bridge.%s.port", br_name
);
589 if (svec_contains(&ports
, port_name
)) {
590 del_port(br_name
, port_name
);
591 rewrite_and_reload_config();
594 /* A network device by that name exists even though the kernel
595 * told us it had disappeared. Probably, what happened was
598 * 1. Device destroyed.
599 * 2. Notification sent to us.
600 * 3. New device created with same name as old one.
601 * 4. ovs-brcompatd notified, removes device from bridge.
603 * There's no a priori reason that in this situation that the
604 * new device with the same name should remain in the bridge;
605 * on the contrary, that would be unexpected. *But* there is
606 * one important situation where, if we do this, bad things
607 * happen. This is the case of XenServer Tools version 5.0.0,
608 * which on boot of a Windows VM cause something like this to
609 * happen on the Xen host:
611 * i. Create tap1.0 and vif1.0.
613 * iii. Delete vif1.0.
614 * iv. Re-create vif1.0.
616 * (XenServer Tools 5.5.0 does not exhibit this behavior, and
617 * neither does a VM without Tools installed at all.@.)
619 * Steps iii and iv happen within a few seconds of each other.
620 * Step iv causes /etc/xensource/scripts/vif to run, which in
621 * turn calls ovs-cfg-mod to add the new device to the bridge.
622 * If step iv happens after step 4 (in our first list of
623 * steps), then all is well, but if it happens between 3 and 4
624 * (which can easily happen if ovs-brcompatd has to wait to
625 * lock the configuration file), then we will remove the new
626 * incarnation from the bridge instead of the old one!
628 * So, to avoid this problem, we do nothing here. This is
629 * strictly incorrect except for this one particular case, and
630 * perhaps that will bite us someday. If that happens, then we
631 * will have to somehow track network devices by ifindex, since
632 * a new device will have a new ifindex even if it has the same
633 * name as an old device.
635 VLOG_INFO("kernel reported network device %s removed but "
636 "a device by that name exists (XS Tools 5.0.0?)",
/* Entry point of the bridge compatibility daemon.
 *
 * Start-up, in the order visible below: set the program name, register fault
 * handlers, parse the command line, ignore SIGPIPE, refuse to run twice,
 * create the unixctl server, open the brcompat socket into 'brc_sock', and
 * create a NETLINK_ROUTE socket for RTNLGRP_LINK into 'rtnl_sock'.  A failure
 * to create the unixctl server or either socket is fatal.
 *
 * The remaining calls run the unixctl server and register what to wait for:
 * input on 'rtnl_sock' together with a 'prune_timeout' timer, input on
 * 'brc_sock', and the unixctl server.
 *
 * NOTE(review): the loop construct, the calls that receive and process
 * updates, and the conditions on 'prune_timeout' are not visible in this
 * listing. */
646 main(int argc
, char *argv
[])
648 struct unixctl_server
*unixctl
;
651 set_program_name(argv
[0]);
652 register_fault_handlers();
655 parse_options(argc
, argv
);
656 signal(SIGPIPE
, SIG_IGN
);
659 die_if_already_running();
662 retval
= unixctl_server_create(NULL
, &unixctl
);
664 ovs_fatal(retval
, "could not listen for vlog connections");
667 if (brc_open(&brc_sock
)) {
668 ovs_fatal(0, "could not open brcompat socket. Check "
669 "\"brcompat\" kernel module.");
673 if (nl_sock_create(NETLINK_ROUTE
, RTNLGRP_LINK
, 0, 0, &rtnl_sock
)) {
674 ovs_fatal(0, "could not create rtnetlink socket");
681 unixctl_server_run(unixctl
);
684 /* If 'prune_timeout' is non-zero, we actively prune from the
685 * config file any 'bridge.<br_name>.port' entries that are no
686 * longer valid. We use two methods:
688 * 1) The kernel explicitly notifies us of removed ports
689 * through the RTNL messages.
691 * 2) We periodically check all ports associated with bridges
692 * to see if they no longer exist.
698 nl_sock_wait(rtnl_sock
, POLLIN
);
699 poll_timer_wait(prune_timeout
);
702 nl_sock_wait(brc_sock
, POLLIN
);
703 unixctl_server_wait(unixctl
);
/* Parses the command line and records the configuration file.
 *
 * Before any option is read, 'reload_command' is set to a default shell
 * pipeline built from 'ovs_bindir' and 'ovs_rundir'.  It runs ovs-appctl
 * against the ovs-vswitchd control socket named after the pid in
 * ovs-vswitchd.pid and pipes the output to /usr/bin/logger.
 *
 * Options visible here: --help, --version, --lock-timeout (stored as given),
 * --prune-timeout (multiplied by 1000; the help text describes the argument
 * as seconds), --reload-command (replaces the default), plus whatever the
 * leak-checker and daemon option macros add.
 *
 * A fatal error is raised when the number of non-option arguments is wrong or
 * when cfg_set_file() fails for the configuration file.
 *
 * NOTE(review): the option loop, the enum and switch framing, and the
 * adjustment of 'argc' and 'argv' before 'config_file' is read are not
 * visible in this listing. */
711 parse_options(int argc
, char *argv
[])
/* Long-only options get codes above the range of any single character. */
714 OPT_LOCK_TIMEOUT
= UCHAR_MAX
+ 1,
718 LEAK_CHECKER_OPTION_ENUMS
720 static struct option long_options
[] = {
721 {"help", no_argument
, 0, 'h'},
722 {"version", no_argument
, 0, 'V'},
723 {"lock-timeout", required_argument
, 0, OPT_LOCK_TIMEOUT
},
724 {"prune-timeout", required_argument
, 0, OPT_PRUNE_TIMEOUT
},
725 {"reload-command", required_argument
, 0, OPT_RELOAD_COMMAND
},
728 LEAK_CHECKER_LONG_OPTIONS
,
731 char *short_options
= long_options_to_short_options(long_options
);
734 reload_command
= xasprintf("%s/ovs-appctl -t "
735 "%s/ovs-vswitchd.`cat %s/ovs-vswitchd.pid`.ctl "
736 "-e vswitchd/reload 2>&1 "
737 "| /usr/bin/logger -t brcompatd-reload",
738 ovs_bindir
, ovs_rundir
, ovs_rundir
);
742 c
= getopt_long(argc
, argv
, short_options
, long_options
, NULL
);
753 OVS_PRINT_VERSION(0, 0);
/* NOTE(review): atoi() cannot report malformed or out-of-range input, so a
 * non-numeric argument silently becomes 0 for both timeouts. */
756 case OPT_LOCK_TIMEOUT
:
757 lock_timeout
= atoi(optarg
);
760 case OPT_PRUNE_TIMEOUT
:
761 prune_timeout
= atoi(optarg
) * 1000;
/* NOTE(review): the default string built with xasprintf() above is replaced
 * here; no release of it is visible in this listing. */
764 case OPT_RELOAD_COMMAND
:
765 reload_command
= optarg
;
769 DAEMON_OPTION_HANDLERS
770 LEAK_CHECKER_OPTION_HANDLERS
785 ovs_fatal(0, "exactly one non-option argument required; "
786 "use --help for usage");
789 config_file
= argv
[0];
790 error
= cfg_set_file(config_file
);
792 ovs_fatal(error
, "failed to add configuration file \"%s\"",
/* Prints the help text: a summary and usage line built from 'program_name',
 * the configuration options, the generic options, the leak-checker options,
 * and finally the current value of 'reload_command'.
 *
 * NOTE(review): the function header is not visible in this listing;
 * presumably this is the body of usage(), which is declared NO_RETURN near
 * the top of the file.  The tail of the second printf() and the exit call are
 * also not visible.  The last line is labelled "default" but prints whatever
 * 'reload_command' holds at the time of the call. */
800 printf("%s: bridge compatibility front-end for ovs-vswitchd\n"
801 "usage: %s [OPTIONS] CONFIG\n"
802 "CONFIG is the configuration file used by ovs-vswitchd.\n",
803 program_name
, program_name
);
804 printf("\nConfiguration options:\n"
805 " --reload-command=COMMAND shell command to reload ovs-vswitchd\n"
806 " --prune-timeout=SECS wait at most SECS before pruning ports\n"
807 " --lock-timeout=MSECS wait at most MSECS for CONFIG to unlock\n"
811 printf("\nOther options:\n"
812 " -h, --help display this help message\n"
813 " -V, --version display version information\n");
814 leak_checker_usage();
815 printf("\nThe default reload command is:\n%s\n", reload_command
);