2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
20 #include "netdev-linux-private.h"
24 #include <sys/types.h>
25 #include <netinet/in.h>
26 #include <arpa/inet.h>
29 #include <linux/filter.h>
30 #include <linux/gen_stats.h>
31 #include <linux/if_ether.h>
32 #include <linux/if_tun.h>
33 #include <linux/types.h>
34 #include <linux/ethtool.h>
35 #include <linux/mii.h>
36 #include <linux/rtnetlink.h>
37 #include <linux/sockios.h>
38 #include <sys/ioctl.h>
39 #include <sys/socket.h>
40 #include <sys/utsname.h>
41 #include <netpacket/packet.h>
43 #include <net/if_arp.h>
44 #include <net/route.h>
51 #include "dp-packet.h"
52 #include "dpif-netlink.h"
53 #include "dpif-netdev.h"
54 #include "openvswitch/dynamic-string.h"
55 #include "fatal-signal.h"
57 #include "openvswitch/hmap.h"
58 #include "netdev-afxdp.h"
59 #include "netdev-provider.h"
60 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
65 #include "openvswitch/ofpbuf.h"
66 #include "openflow/openflow.h"
67 #include "ovs-atomic.h"
69 #include "openvswitch/poll-loop.h"
70 #include "rtnetlink.h"
71 #include "openvswitch/shash.h"
72 #include "socket-util.h"
76 #include "unaligned.h"
77 #include "openvswitch/vlog.h"
80 VLOG_DEFINE_THIS_MODULE(netdev_linux
);
82 COVERAGE_DEFINE(netdev_set_policing
);
83 COVERAGE_DEFINE(netdev_arp_lookup
);
84 COVERAGE_DEFINE(netdev_get_ifindex
);
85 COVERAGE_DEFINE(netdev_get_hwaddr
);
86 COVERAGE_DEFINE(netdev_set_hwaddr
);
87 COVERAGE_DEFINE(netdev_get_ethtool
);
88 COVERAGE_DEFINE(netdev_set_ethtool
);
91 #ifndef IFLA_IF_NETNSID
92 #define IFLA_IF_NETNSID 0x45
94 /* These were introduced in Linux 2.6.14, so they might be missing if we have
 * old headers. */
96 #ifndef ADVERTISED_Pause
97 #define ADVERTISED_Pause (1 << 13)
99 #ifndef ADVERTISED_Asym_Pause
100 #define ADVERTISED_Asym_Pause (1 << 14)
103 /* These were introduced in Linux 2.6.24, so they might be missing if we
104 * have old headers. */
105 #ifndef ETHTOOL_GFLAGS
106 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
108 #ifndef ETHTOOL_SFLAGS
109 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
112 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
 * headers. */
115 #define TC_RTAB_SIZE 1024
118 #ifndef TCM_IFINDEX_MAGIC_BLOCK
119 #define TCM_IFINDEX_MAGIC_BLOCK (0xFFFFFFFFU)
122 /* Linux 2.6.21 introduced struct tpacket_auxdata.
123 * Linux 2.6.27 added the tp_vlan_tci member.
124 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
125 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
126 * TP_STATUS_VLAN_TPID_VALID.
128 * With all this churn it's easiest to unconditionally define a replacement
129 * structure that has everything we want.
131 #ifndef PACKET_AUXDATA
132 #define PACKET_AUXDATA 8
134 #ifndef TP_STATUS_VLAN_VALID
135 #define TP_STATUS_VLAN_VALID (1 << 4)
137 #ifndef TP_STATUS_VLAN_TPID_VALID
138 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
140 #undef tpacket_auxdata
141 #define tpacket_auxdata rpl_tpacket_auxdata
142 struct tpacket_auxdata
{
148 uint16_t tp_vlan_tci
;
149 uint16_t tp_vlan_tpid
;
152 /* Linux 2.6.27 introduced ethtool_cmd_speed
154 * To avoid revisiting problems reported with using configure to detect
155 * compatibility (see report at
156 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html)
157 * unconditionally replace ethtool_cmd_speed. */
#define ethtool_cmd_speed rpl_ethtool_cmd_speed
/* Replacement for ethtool_cmd_speed(): returns the link speed in Mb/s by
 * joining the low 16 bits ('speed') with the high bits ('speed_hi'). */
static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *cmd)
{
    uint32_t hi = cmd->speed_hi;

    return (hi << 16) | cmd->speed;
}
164 /* Linux 2.6.30 introduced supported and advertised flags for
165 * 1G base KX, and 10G base KX4, KR and R. */
166 #ifndef SUPPORTED_1000baseKX_Full
167 #define SUPPORTED_1000baseKX_Full (1 << 17)
168 #define SUPPORTED_10000baseKX4_Full (1 << 18)
169 #define SUPPORTED_10000baseKR_Full (1 << 19)
170 #define SUPPORTED_10000baseR_FEC (1 << 20)
171 #define ADVERTISED_1000baseKX_Full (1 << 17)
172 #define ADVERTISED_10000baseKX4_Full (1 << 18)
173 #define ADVERTISED_10000baseKR_Full (1 << 19)
174 #define ADVERTISED_10000baseR_FEC (1 << 20)
177 /* Linux 3.5 introduced supported and advertised flags for
178 * 40G base KR4, CR4, SR4 and LR4. */
179 #ifndef SUPPORTED_40000baseKR4_Full
180 #define SUPPORTED_40000baseKR4_Full (1 << 23)
181 #define SUPPORTED_40000baseCR4_Full (1 << 24)
182 #define SUPPORTED_40000baseSR4_Full (1 << 25)
183 #define SUPPORTED_40000baseLR4_Full (1 << 26)
184 #define ADVERTISED_40000baseKR4_Full (1 << 23)
185 #define ADVERTISED_40000baseCR4_Full (1 << 24)
186 #define ADVERTISED_40000baseSR4_Full (1 << 25)
187 #define ADVERTISED_40000baseLR4_Full (1 << 26)
190 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
192 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
193 * 2.6.32-431.29.2.el6.x86_64 (see report at
194 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html).
195 * Maybe if_link.h is not self-contained on those kernels. It is easiest to
196 * unconditionally define a replacement. */
198 #define IFLA_STATS64 23
200 #define rtnl_link_stats64 rpl_rtnl_link_stats64
201 struct rtnl_link_stats64
{
213 uint64_t rx_length_errors
;
214 uint64_t rx_over_errors
;
215 uint64_t rx_crc_errors
;
216 uint64_t rx_frame_errors
;
217 uint64_t rx_fifo_errors
;
218 uint64_t rx_missed_errors
;
220 uint64_t tx_aborted_errors
;
221 uint64_t tx_carrier_errors
;
222 uint64_t tx_fifo_errors
;
223 uint64_t tx_heartbeat_errors
;
224 uint64_t tx_window_errors
;
226 uint64_t rx_compressed
;
227 uint64_t tx_compressed
;
231 VALID_IFINDEX
= 1 << 0,
232 VALID_ETHERADDR
= 1 << 1,
235 VALID_POLICING
= 1 << 4,
236 VALID_VPORT_STAT_ERROR
= 1 << 5,
237 VALID_DRVINFO
= 1 << 6,
238 VALID_FEATURES
= 1 << 7,
241 struct linux_lag_slave
{
243 struct shash_node
*node
;
246 /* Protects 'lag_shash' and the mutable members of struct linux_lag_slave. */
247 static struct ovs_mutex lag_mutex
= OVS_MUTEX_INITIALIZER
;
249 /* All slaves whose LAG masters are network devices in OvS. */
250 static struct shash lag_shash
OVS_GUARDED_BY(lag_mutex
)
251 = SHASH_INITIALIZER(&lag_shash
);
253 /* Traffic control. */
255 /* An instance of a traffic control class. Always associated with a particular
258 * Each TC implementation subclasses this with whatever additional data it
261 const struct tc_ops
*ops
;
262 struct hmap queues
; /* Contains "struct tc_queue"s.
263 * Read by generic TC layer.
264 * Written only by TC implementation. */
267 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
269 /* One traffic control queue.
271 * Each TC implementation subclasses this with whatever additional data it
274 struct hmap_node hmap_node
; /* In struct tc's "queues" hmap. */
275 unsigned int queue_id
; /* OpenFlow queue ID. */
276 long long int created
; /* Time queue was created, in msecs. */
279 /* A particular kind of traffic control. Each implementation generally maps to
280 * one particular Linux qdisc class.
282 * The functions below return 0 if successful or a positive errno value on
283 * failure, except where otherwise noted. All of them must be provided, except
284 * where otherwise noted. */
286 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
287 * This is null for tc_ops_default and tc_ops_other, for which there are no
288 * appropriate values. */
289 const char *linux_name
;
291 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
292 const char *ovs_name
;
294 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
295 * queues. The queues are numbered 0 through n_queues - 1. */
296 unsigned int n_queues
;
298 /* Called to install this TC class on 'netdev'. The implementation should
299 * make the Netlink calls required to set up 'netdev' with the right qdisc
300 * and configure it according to 'details'. The implementation may assume
301 * that the current qdisc is the default; that is, there is no need for it
302 * to delete the current qdisc before installing itself.
304 * The contents of 'details' should be documented as valid for 'ovs_name'
305 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
306 * (which is built as ovs-vswitchd.conf.db(8)).
308 * This function must return 0 if and only if it sets 'netdev->tc' to an
309 * initialized 'struct tc'.
311 * (This function is null for tc_ops_other, which cannot be installed. For
312 * other TC classes it should always be nonnull.) */
313 int (*tc_install
)(struct netdev
*netdev
, const struct smap
*details
);
315 /* Called when the netdev code determines (through a Netlink query) that
316 * this TC class's qdisc is installed on 'netdev', but we didn't install
317 * it ourselves and so don't know any of the details.
319 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
320 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
321 * implementation should parse the other attributes of 'nlmsg' as
322 * necessary to determine its configuration. If necessary it should also
323 * use Netlink queries to determine the configuration of queues on
326 * This function must return 0 if and only if it sets 'netdev->tc' to an
327 * initialized 'struct tc'. */
328 int (*tc_load
)(struct netdev
*netdev
, struct ofpbuf
*nlmsg
);
330 /* Destroys the data structures allocated by the implementation as part of
331 * 'tc'. (This includes destroying 'tc->queues' by calling
334 * The implementation should not need to perform any Netlink calls. If
335 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
336 * (But it may not be desirable.)
338 * This function may be null if 'tc' is trivial. */
339 void (*tc_destroy
)(struct tc
*tc
);
341 /* Retrieves details of 'netdev->tc' configuration into 'details'.
343 * The implementation should not need to perform any Netlink calls, because
344 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
345 * cached the configuration.
347 * The contents of 'details' should be documented as valid for 'ovs_name'
348 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
349 * (which is built as ovs-vswitchd.conf.db(8)).
351 * This function may be null if 'tc' is not configurable.
353 int (*qdisc_get
)(const struct netdev
*netdev
, struct smap
*details
);
355 /* Reconfigures 'netdev->tc' according to 'details', performing any
356 * required Netlink calls to complete the reconfiguration.
358 * The contents of 'details' should be documented as valid for 'ovs_name'
359 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
360 * (which is built as ovs-vswitchd.conf.db(8)).
362 * This function may be null if 'tc' is not configurable.
364 int (*qdisc_set
)(struct netdev
*, const struct smap
*details
);
366 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
367 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
369 * The contents of 'details' should be documented as valid for 'ovs_name'
370 * in the "other_config" column in the "Queue" table in
371 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
373 * The implementation should not need to perform any Netlink calls, because
374 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
375 * cached the queue configuration.
377 * This function may be null if 'tc' does not have queues ('n_queues' is
379 int (*class_get
)(const struct netdev
*netdev
, const struct tc_queue
*queue
,
380 struct smap
*details
);
382 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
383 * 'details', perfoming any required Netlink calls to complete the
384 * reconfiguration. The caller ensures that 'queue_id' is less than
387 * The contents of 'details' should be documented as valid for 'ovs_name'
388 * in the "other_config" column in the "Queue" table in
389 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
391 * This function may be null if 'tc' does not have queues or its queues are
392 * not configurable. */
393 int (*class_set
)(struct netdev
*, unsigned int queue_id
,
394 const struct smap
*details
);
396 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
397 * tc_queue's within 'netdev->tc->queues'.
399 * This function may be null if 'tc' does not have queues or its queues
400 * cannot be deleted. */
401 int (*class_delete
)(struct netdev
*, struct tc_queue
*queue
);
403 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
404 * 'struct tc_queue's within 'netdev->tc->queues'.
406 * On success, initializes '*stats'.
408 * This function may be null if 'tc' does not have queues or if it cannot
409 * report queue statistics. */
410 int (*class_get_stats
)(const struct netdev
*netdev
,
411 const struct tc_queue
*queue
,
412 struct netdev_queue_stats
*stats
);
414 /* Extracts queue stats from 'nlmsg', which is a response to a
415 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
417 * This function may be null if 'tc' does not have queues or if it cannot
418 * report queue statistics. */
419 int (*class_dump_stats
)(const struct netdev
*netdev
,
420 const struct ofpbuf
*nlmsg
,
421 netdev_dump_queue_stats_cb
*cb
, void *aux
);
/* Initializes 'tc' as an instance of the TC implementation 'ops' and sets up
 * its (initially empty) 'queues' hmap.
 * NOTE(review): extraction appears to have dropped a line here (original
 * line 427, presumably storing 'ops' into the structure) -- confirm against
 * upstream before relying on this fragment. */
425 tc_init(struct tc
*tc
, const struct tc_ops
*ops
)
428 hmap_init(&tc
->queues
);
/* Releases the generic part of 'tc' by destroying its 'queues' hmap.  Any
 * subclass data is the responsibility of the implementation's own
 * 'tc_destroy' hook (see struct tc_ops). */
432 tc_destroy(struct tc
*tc
)
434 hmap_destroy(&tc
->queues
);
437 static const struct tc_ops tc_ops_htb
;
438 static const struct tc_ops tc_ops_hfsc
;
439 static const struct tc_ops tc_ops_codel
;
440 static const struct tc_ops tc_ops_fqcodel
;
441 static const struct tc_ops tc_ops_sfq
;
442 static const struct tc_ops tc_ops_netem
;
443 static const struct tc_ops tc_ops_default
;
444 static const struct tc_ops tc_ops_noop
;
445 static const struct tc_ops tc_ops_other
;
447 static const struct tc_ops
*const tcs
[] = {
448 &tc_ops_htb
, /* Hierarchy token bucket (see tc-htb(8)). */
449 &tc_ops_hfsc
, /* Hierarchical fair service curve. */
450 &tc_ops_codel
, /* Controlled delay */
451 &tc_ops_fqcodel
, /* Fair queue controlled delay */
452 &tc_ops_sfq
, /* Stochastic fair queueing */
453 &tc_ops_netem
, /* Network Emulator */
454 &tc_ops_noop
, /* Non operating qos type. */
455 &tc_ops_default
, /* Default qdisc (see tc-pfifo_fast(8)). */
456 &tc_ops_other
, /* Some other qdisc. */
460 static unsigned int tc_ticks_to_bytes(unsigned int rate
, unsigned int ticks
);
461 static unsigned int tc_bytes_to_ticks(unsigned int rate
, unsigned int size
);
462 static unsigned int tc_buffer_per_jiffy(unsigned int rate
);
463 static uint32_t tc_time_to_ticks(uint32_t time
);
465 static struct tcmsg
*netdev_linux_tc_make_request(const struct netdev
*,
469 static int tc_add_policer(struct netdev
*,
470 uint32_t kbits_rate
, uint32_t kbits_burst
);
472 static int tc_parse_qdisc(const struct ofpbuf
*, const char **kind
,
473 struct nlattr
**options
);
474 static int tc_parse_class(const struct ofpbuf
*, unsigned int *queue_id
,
475 struct nlattr
**options
,
476 struct netdev_queue_stats
*);
477 static int tc_query_class(const struct netdev
*,
478 unsigned int handle
, unsigned int parent
,
479 struct ofpbuf
**replyp
);
480 static int tc_delete_class(const struct netdev
*, unsigned int handle
);
482 static int tc_del_qdisc(struct netdev
*netdev
);
483 static int tc_query_qdisc(const struct netdev
*netdev
);
486 tc_put_rtab(struct ofpbuf
*msg
, uint16_t type
, const struct tc_ratespec
*rate
);
487 static int tc_calc_cell_log(unsigned int mtu
);
488 static void tc_fill_rate(struct tc_ratespec
*rate
, uint64_t bps
, int mtu
);
489 static int tc_calc_buffer(unsigned int Bps
, int mtu
, uint64_t burst_bytes
);
492 /* This is set pretty low because we probably won't learn anything from the
493 * additional log messages. */
494 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(5, 20);
496 /* Polling miimon status for all ports causes performance degradation when
497 * handling a large number of ports. If there are no devices using miimon, then
498 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
500 * Readers do not depend on this variable synchronizing with the related
501 * changes in the device miimon status, so we can use atomic_count. */
502 static atomic_count miimon_cnt
= ATOMIC_COUNT_INIT(0);
504 static int netdev_linux_do_ethtool(const char *name
, struct ethtool_cmd
*,
505 int cmd
, const char *cmd_name
);
506 static int get_flags(const struct netdev
*, unsigned int *flags
);
507 static int set_flags(const char *, unsigned int flags
);
508 static int update_flags(struct netdev_linux
*netdev
, enum netdev_flags off
,
509 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
510 OVS_REQUIRES(netdev
->mutex
);
511 static int get_ifindex(const struct netdev
*, int *ifindexp
);
512 static int do_set_addr(struct netdev
*netdev
,
513 int ioctl_nr
, const char *ioctl_name
,
514 struct in_addr addr
);
515 static int get_etheraddr(const char *netdev_name
, struct eth_addr
*ea
);
516 static int set_etheraddr(const char *netdev_name
, const struct eth_addr
);
517 static int af_packet_sock(void);
518 static bool netdev_linux_miimon_enabled(void);
519 static void netdev_linux_miimon_run(void);
520 static void netdev_linux_miimon_wait(void);
521 static int netdev_linux_get_mtu__(struct netdev_linux
*netdev
, int *mtup
);
/* Returns true if 'netdev' belongs to the tap netdev class
 * (netdev_tap_class), false otherwise. */
524 is_tap_netdev(const struct netdev
*netdev
)
526 return netdev_get_class(netdev
) == &netdev_tap_class
;
530 netdev_linux_netnsid_update__(struct netdev_linux
*netdev
)
532 struct dpif_netlink_vport reply
;
536 error
= dpif_netlink_vport_get(netdev_get_name(&netdev
->up
), &reply
, &buf
);
538 if (error
== ENOENT
) {
539 /* Assume it is local if there is no API (e.g. if the openvswitch
540 * kernel module is not loaded). */
541 netnsid_set_local(&netdev
->netnsid
);
543 netnsid_unset(&netdev
->netnsid
);
548 netnsid_set(&netdev
->netnsid
, reply
.netnsid
);
554 netdev_linux_netnsid_update(struct netdev_linux
*netdev
)
556 if (netnsid_is_unset(netdev
->netnsid
)) {
557 if (netdev_get_class(&netdev
->up
) == &netdev_tap_class
) {
558 netnsid_set_local(&netdev
->netnsid
);
560 return netdev_linux_netnsid_update__(netdev
);
/* Returns true if 'netdev' lives in network namespace id 'nsid'.  Refreshes
 * the cached netnsid first if it is currently unset. */
568 netdev_linux_netnsid_is_eq(struct netdev_linux
*netdev
, int nsid
)
570 netdev_linux_netnsid_update(netdev
);
571 return netnsid_eq(netdev
->netnsid
, nsid
);
/* Returns true if 'netdev' lives in a network namespace other than the local
 * one.  Refreshes the cached netnsid first if it is currently unset. */
575 netdev_linux_netnsid_is_remote(struct netdev_linux
*netdev
)
577 netdev_linux_netnsid_update(netdev
);
578 return netnsid_is_remote(netdev
->netnsid
);
581 static int netdev_linux_update_via_netlink(struct netdev_linux
*);
582 static void netdev_linux_update(struct netdev_linux
*netdev
, int,
583 const struct rtnetlink_change
*)
584 OVS_REQUIRES(netdev
->mutex
);
585 static void netdev_linux_changed(struct netdev_linux
*netdev
,
586 unsigned int ifi_flags
, unsigned int mask
)
587 OVS_REQUIRES(netdev
->mutex
);
589 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
590 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
591 * if no such socket could be created. */
592 static struct nl_sock
*
593 netdev_linux_notify_sock(void)
595 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
596 static struct nl_sock
*sock
;
597 unsigned int mcgroups
[] = {RTNLGRP_LINK
, RTNLGRP_IPV4_IFADDR
,
598 RTNLGRP_IPV6_IFADDR
, RTNLGRP_IPV6_IFINFO
};
600 if (ovsthread_once_start(&once
)) {
603 error
= nl_sock_create(NETLINK_ROUTE
, &sock
);
607 for (i
= 0; i
< ARRAY_SIZE(mcgroups
); i
++) {
608 error
= nl_sock_join_mcgroup(sock
, mcgroups
[i
]);
610 nl_sock_destroy(sock
);
616 nl_sock_listen_all_nsid(sock
, true);
617 ovsthread_once_done(&once
);
/* Returns true if at least one device is using miimon link monitoring (see
 * the comment above 'miimon_cnt').  Readers do not depend on synchronizing
 * with miimon status changes, so an unlocked atomic read suffices. */
624 netdev_linux_miimon_enabled(void)
626 return atomic_count_get(&miimon_cnt
) > 0;
/* Returns true if 'kind' names a Linux link-aggregation (LAG) master device
 * type, i.e. a bonding or team device; false for any other kind string. */
static bool
netdev_linux_kind_is_lag(const char *kind)
{
    return !strcmp(kind, "bond") || !strcmp(kind, "team");
}
640 netdev_linux_update_lag(struct rtnetlink_change
*change
)
641 OVS_REQUIRES(lag_mutex
)
643 struct linux_lag_slave
*lag
;
645 if (!rtnetlink_type_is_rtnlgrp_link(change
->nlmsg_type
)) {
649 if (change
->slave
&& netdev_linux_kind_is_lag(change
->slave
)) {
650 lag
= shash_find_data(&lag_shash
, change
->ifname
);
653 struct netdev
*master_netdev
;
654 char master_name
[IFNAMSIZ
];
658 if_indextoname(change
->master_ifindex
, master_name
);
659 master_netdev
= netdev_from_name(master_name
);
660 if (!master_netdev
) {
664 if (is_netdev_linux_class(master_netdev
->netdev_class
)) {
665 block_id
= netdev_get_block_id(master_netdev
);
667 netdev_close(master_netdev
);
671 lag
= xmalloc(sizeof *lag
);
672 lag
->block_id
= block_id
;
673 lag
->node
= shash_add(&lag_shash
, change
->ifname
, lag
);
675 /* delete ingress block in case it exists */
676 tc_add_del_qdisc(change
->if_index
, false, 0, TC_INGRESS
);
677 /* LAG master is linux netdev so add slave to same block. */
678 error
= tc_add_del_qdisc(change
->if_index
, true, block_id
,
681 VLOG_WARN("failed to bind LAG slave %s to master's block",
683 shash_delete(&lag_shash
, lag
->node
);
688 netdev_close(master_netdev
);
690 } else if (change
->master_ifindex
== 0) {
691 /* Check if this was a lag slave that has been freed. */
692 lag
= shash_find_data(&lag_shash
, change
->ifname
);
695 tc_add_del_qdisc(change
->if_index
, false, lag
->block_id
,
697 shash_delete(&lag_shash
, lag
->node
);
704 netdev_linux_run(const struct netdev_class
*netdev_class OVS_UNUSED
)
706 struct nl_sock
*sock
;
709 if (netdev_linux_miimon_enabled()) {
710 netdev_linux_miimon_run();
713 sock
= netdev_linux_notify_sock();
719 uint64_t buf_stub
[4096 / 8];
723 ofpbuf_use_stub(&buf
, buf_stub
, sizeof buf_stub
);
724 error
= nl_sock_recv(sock
, &buf
, &nsid
, false);
726 struct rtnetlink_change change
;
728 if (rtnetlink_parse(&buf
, &change
)) {
729 struct netdev
*netdev_
= NULL
;
730 char dev_name
[IFNAMSIZ
];
732 if (!change
.ifname
) {
733 change
.ifname
= if_indextoname(change
.if_index
, dev_name
);
737 netdev_
= netdev_from_name(change
.ifname
);
739 if (netdev_
&& is_netdev_linux_class(netdev_
->netdev_class
)) {
740 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
742 ovs_mutex_lock(&netdev
->mutex
);
743 netdev_linux_update(netdev
, nsid
, &change
);
744 ovs_mutex_unlock(&netdev
->mutex
);
746 else if (!netdev_
&& change
.ifname
) {
747 /* Netdev is not present in OvS but its master could be. */
748 ovs_mutex_lock(&lag_mutex
);
749 netdev_linux_update_lag(&change
);
750 ovs_mutex_unlock(&lag_mutex
);
752 netdev_close(netdev_
);
754 } else if (error
== ENOBUFS
) {
755 struct shash device_shash
;
756 struct shash_node
*node
;
760 shash_init(&device_shash
);
761 netdev_get_devices(&netdev_linux_class
, &device_shash
);
762 SHASH_FOR_EACH (node
, &device_shash
) {
763 struct netdev
*netdev_
= node
->data
;
764 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
767 ovs_mutex_lock(&netdev
->mutex
);
768 get_flags(netdev_
, &flags
);
769 netdev_linux_changed(netdev
, flags
, 0);
770 ovs_mutex_unlock(&netdev
->mutex
);
772 netdev_close(netdev_
);
774 shash_destroy(&device_shash
);
775 } else if (error
!= EAGAIN
) {
776 static struct vlog_rate_limit rll
= VLOG_RATE_LIMIT_INIT(1, 5);
777 VLOG_WARN_RL(&rll
, "error reading or parsing netlink (%s)",
778 ovs_strerror(error
));
785 netdev_linux_wait(const struct netdev_class
*netdev_class OVS_UNUSED
)
787 struct nl_sock
*sock
;
789 if (netdev_linux_miimon_enabled()) {
790 netdev_linux_miimon_wait();
792 sock
= netdev_linux_notify_sock();
794 nl_sock_wait(sock
, POLLIN
);
799 netdev_linux_changed(struct netdev_linux
*dev
,
800 unsigned int ifi_flags
, unsigned int mask
)
801 OVS_REQUIRES(dev
->mutex
)
803 netdev_change_seq_changed(&dev
->up
);
805 if ((dev
->ifi_flags
^ ifi_flags
) & IFF_RUNNING
) {
806 dev
->carrier_resets
++;
808 dev
->ifi_flags
= ifi_flags
;
810 dev
->cache_valid
&= mask
;
811 if (!(mask
& VALID_IN
)) {
812 netdev_get_addrs_list_flush();
817 netdev_linux_update__(struct netdev_linux
*dev
,
818 const struct rtnetlink_change
*change
)
819 OVS_REQUIRES(dev
->mutex
)
821 if (rtnetlink_type_is_rtnlgrp_link(change
->nlmsg_type
)) {
822 if (change
->nlmsg_type
== RTM_NEWLINK
) {
823 /* Keep drv-info, and ip addresses. */
824 netdev_linux_changed(dev
, change
->ifi_flags
,
825 VALID_DRVINFO
| VALID_IN
);
827 /* Update netdev from rtnl-change msg. */
829 dev
->mtu
= change
->mtu
;
830 dev
->cache_valid
|= VALID_MTU
;
831 dev
->netdev_mtu_error
= 0;
834 if (!eth_addr_is_zero(change
->mac
)) {
835 dev
->etheraddr
= change
->mac
;
836 dev
->cache_valid
|= VALID_ETHERADDR
;
837 dev
->ether_addr_error
= 0;
839 /* The mac addr has been changed, report it now. */
840 rtnetlink_report_link();
843 if (change
->master
&& netdev_linux_kind_is_lag(change
->master
)) {
844 dev
->is_lag_master
= true;
847 dev
->ifindex
= change
->if_index
;
848 dev
->cache_valid
|= VALID_IFINDEX
;
849 dev
->get_ifindex_error
= 0;
853 netdev_linux_changed(dev
, change
->ifi_flags
, 0);
854 dev
->present
= false;
855 netnsid_unset(&dev
->netnsid
);
857 } else if (rtnetlink_type_is_rtnlgrp_addr(change
->nlmsg_type
)) {
858 /* Invalidates in4, in6. */
859 netdev_linux_changed(dev
, dev
->ifi_flags
, ~VALID_IN
);
866 netdev_linux_update(struct netdev_linux
*dev
, int nsid
,
867 const struct rtnetlink_change
*change
)
868 OVS_REQUIRES(dev
->mutex
)
870 if (netdev_linux_netnsid_is_eq(dev
, nsid
)) {
871 netdev_linux_update__(dev
, change
);
875 static struct netdev
*
876 netdev_linux_alloc(void)
878 struct netdev_linux
*netdev
= xzalloc(sizeof *netdev
);
883 netdev_linux_common_construct(struct netdev
*netdev_
)
885 /* Prevent any attempt to create (or open) a network device named "default"
886 * or "all". These device names are effectively reserved on Linux because
887 * /proc/sys/net/ipv4/conf/ always contains directories by these names. By
888 * itself this wouldn't call for any special treatment, but in practice if
889 * a program tries to create devices with these names, it causes the kernel
890 * to fire a "new device" notification event even though creation failed,
891 * and in turn that causes OVS to wake up and try to create them again,
892 * which ends up as a 100% CPU loop. */
893 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
894 const char *name
= netdev_
->name
;
895 if (!strcmp(name
, "default") || !strcmp(name
, "all")) {
896 static struct vlog_rate_limit rll
= VLOG_RATE_LIMIT_INIT(1, 1);
897 VLOG_WARN_RL(&rll
, "%s: Linux forbids network device with this name",
902 /* The device could be in the same network namespace or in another one. */
903 netnsid_unset(&netdev
->netnsid
);
904 ovs_mutex_init(&netdev
->mutex
);
908 /* Creates system and internal devices. */
910 netdev_linux_construct(struct netdev
*netdev_
)
912 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
913 int error
= netdev_linux_common_construct(netdev_
);
918 error
= get_flags(&netdev
->up
, &netdev
->ifi_flags
);
919 if (error
== ENODEV
) {
920 if (netdev
->up
.netdev_class
!= &netdev_internal_class
) {
921 /* The device does not exist, so don't allow it to be opened. */
924 /* "Internal" netdevs have to be created as netdev objects before
925 * they exist in the kernel, because creating them in the kernel
926 * happens by passing a netdev object to dpif_port_add().
927 * Therefore, ignore the error. */
934 /* For most types of netdevs we open the device for each call of
935 * netdev_open(). However, this is not the case with tap devices,
936 * since it is only possible to open the device once. In this
937 * situation we share a single file descriptor, and consequently
938 * buffers, across all readers. Therefore once data is read it will
939 * be unavailable to other reads for tap devices. */
941 netdev_linux_construct_tap(struct netdev
*netdev_
)
943 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
944 static const char tap_dev
[] = "/dev/net/tun";
945 const char *name
= netdev_
->name
;
948 int error
= netdev_linux_common_construct(netdev_
);
953 /* Open tap device. */
954 netdev
->tap_fd
= open(tap_dev
, O_RDWR
);
955 if (netdev
->tap_fd
< 0) {
957 VLOG_WARN("opening \"%s\" failed: %s", tap_dev
, ovs_strerror(error
));
961 /* Create tap device. */
962 get_flags(&netdev
->up
, &netdev
->ifi_flags
);
963 ifr
.ifr_flags
= IFF_TAP
| IFF_NO_PI
;
964 ovs_strzcpy(ifr
.ifr_name
, name
, sizeof ifr
.ifr_name
);
965 if (ioctl(netdev
->tap_fd
, TUNSETIFF
, &ifr
) == -1) {
966 VLOG_WARN("%s: creating tap device failed: %s", name
,
967 ovs_strerror(errno
));
972 /* Make non-blocking. */
973 error
= set_nonblocking(netdev
->tap_fd
);
978 if (ioctl(netdev
->tap_fd
, TUNSETPERSIST
, 1)) {
979 VLOG_WARN("%s: creating tap device failed (persist): %s", name
,
980 ovs_strerror(errno
));
985 netdev
->present
= true;
989 close(netdev
->tap_fd
);
994 netdev_linux_destruct(struct netdev
*netdev_
)
996 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
998 if (netdev
->tc
&& netdev
->tc
->ops
->tc_destroy
) {
999 netdev
->tc
->ops
->tc_destroy(netdev
->tc
);
1002 if (netdev_get_class(netdev_
) == &netdev_tap_class
1003 && netdev
->tap_fd
>= 0)
1005 ioctl(netdev
->tap_fd
, TUNSETPERSIST
, 0);
1006 close(netdev
->tap_fd
);
1009 if (netdev
->miimon_interval
> 0) {
1010 atomic_count_dec(&miimon_cnt
);
1013 ovs_mutex_destroy(&netdev
->mutex
);
1017 netdev_linux_dealloc(struct netdev
*netdev_
)
1019 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1023 static struct netdev_rxq
*
1024 netdev_linux_rxq_alloc(void)
1026 struct netdev_rxq_linux
*rx
= xzalloc(sizeof *rx
);
1031 netdev_linux_rxq_construct(struct netdev_rxq
*rxq_
)
1033 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1034 struct netdev
*netdev_
= rx
->up
.netdev
;
1035 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1038 ovs_mutex_lock(&netdev
->mutex
);
1039 rx
->is_tap
= is_tap_netdev(netdev_
);
1041 rx
->fd
= netdev
->tap_fd
;
1043 struct sockaddr_ll sll
;
1045 /* Result of tcpdump -dd inbound */
1046 static const struct sock_filter filt
[] = {
1047 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
1048 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
1049 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
1050 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
1052 static const struct sock_fprog fprog
= {
1053 ARRAY_SIZE(filt
), (struct sock_filter
*) filt
1056 /* Create file descriptor. */
1057 rx
->fd
= socket(PF_PACKET
, SOCK_RAW
, 0);
1060 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error
));
1065 if (setsockopt(rx
->fd
, SOL_PACKET
, PACKET_AUXDATA
, &val
, sizeof val
)) {
1067 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
1068 netdev_get_name(netdev_
), ovs_strerror(error
));
1072 /* Set non-blocking mode. */
1073 error
= set_nonblocking(rx
->fd
);
1078 /* Get ethernet device index. */
1079 error
= get_ifindex(&netdev
->up
, &ifindex
);
1084 /* Bind to specific ethernet device. */
1085 memset(&sll
, 0, sizeof sll
);
1086 sll
.sll_family
= AF_PACKET
;
1087 sll
.sll_ifindex
= ifindex
;
1088 sll
.sll_protocol
= htons(ETH_P_ALL
);
1089 if (bind(rx
->fd
, (struct sockaddr
*) &sll
, sizeof sll
) < 0) {
1091 VLOG_ERR("%s: failed to bind raw socket (%s)",
1092 netdev_get_name(netdev_
), ovs_strerror(error
));
1096 /* Filter for only inbound packets. */
1097 error
= setsockopt(rx
->fd
, SOL_SOCKET
, SO_ATTACH_FILTER
, &fprog
,
1101 VLOG_ERR("%s: failed to attach filter (%s)",
1102 netdev_get_name(netdev_
), ovs_strerror(error
));
1106 ovs_mutex_unlock(&netdev
->mutex
);
1114 ovs_mutex_unlock(&netdev
->mutex
);
1119 netdev_linux_rxq_destruct(struct netdev_rxq
*rxq_
)
1121 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1129 netdev_linux_rxq_dealloc(struct netdev_rxq
*rxq_
)
1131 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1137 auxdata_to_vlan_tpid(const struct tpacket_auxdata
*aux
, bool double_tagged
)
1139 if (aux
->tp_status
& TP_STATUS_VLAN_TPID_VALID
) {
1140 return htons(aux
->tp_vlan_tpid
);
1141 } else if (double_tagged
) {
1142 return htons(ETH_TYPE_VLAN_8021AD
);
1144 return htons(ETH_TYPE_VLAN_8021Q
);
1149 auxdata_has_vlan_tci(const struct tpacket_auxdata
*aux
)
1151 return aux
->tp_vlan_tci
|| aux
->tp_status
& TP_STATUS_VLAN_VALID
;
1155 netdev_linux_rxq_recv_sock(int fd
, struct dp_packet
*buffer
)
1160 struct cmsghdr
*cmsg
;
1162 struct cmsghdr cmsg
;
1163 char buffer
[CMSG_SPACE(sizeof(struct tpacket_auxdata
))];
1167 /* Reserve headroom for a single VLAN tag */
1168 dp_packet_reserve(buffer
, VLAN_HEADER_LEN
);
1169 size
= dp_packet_tailroom(buffer
);
1171 iov
.iov_base
= dp_packet_data(buffer
);
1173 msgh
.msg_name
= NULL
;
1174 msgh
.msg_namelen
= 0;
1175 msgh
.msg_iov
= &iov
;
1176 msgh
.msg_iovlen
= 1;
1177 msgh
.msg_control
= &cmsg_buffer
;
1178 msgh
.msg_controllen
= sizeof cmsg_buffer
;
1182 retval
= recvmsg(fd
, &msgh
, MSG_TRUNC
);
1183 } while (retval
< 0 && errno
== EINTR
);
1187 } else if (retval
> size
) {
1191 dp_packet_set_size(buffer
, dp_packet_size(buffer
) + retval
);
1193 for (cmsg
= CMSG_FIRSTHDR(&msgh
); cmsg
; cmsg
= CMSG_NXTHDR(&msgh
, cmsg
)) {
1194 const struct tpacket_auxdata
*aux
;
1196 if (cmsg
->cmsg_level
!= SOL_PACKET
1197 || cmsg
->cmsg_type
!= PACKET_AUXDATA
1198 || cmsg
->cmsg_len
< CMSG_LEN(sizeof(struct tpacket_auxdata
))) {
1202 aux
= ALIGNED_CAST(struct tpacket_auxdata
*, CMSG_DATA(cmsg
));
1203 if (auxdata_has_vlan_tci(aux
)) {
1204 struct eth_header
*eth
;
1207 if (retval
< ETH_HEADER_LEN
) {
1211 eth
= dp_packet_data(buffer
);
1212 double_tagged
= eth
->eth_type
== htons(ETH_TYPE_VLAN_8021Q
);
1214 eth_push_vlan(buffer
, auxdata_to_vlan_tpid(aux
, double_tagged
),
1215 htons(aux
->tp_vlan_tci
));
1224 netdev_linux_rxq_recv_tap(int fd
, struct dp_packet
*buffer
)
1227 size_t size
= dp_packet_tailroom(buffer
);
1230 retval
= read(fd
, dp_packet_data(buffer
), size
);
1231 } while (retval
< 0 && errno
== EINTR
);
1237 dp_packet_set_size(buffer
, dp_packet_size(buffer
) + retval
);
1242 netdev_linux_rxq_recv(struct netdev_rxq
*rxq_
, struct dp_packet_batch
*batch
,
1245 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1246 struct netdev
*netdev
= rx
->up
.netdev
;
1247 struct dp_packet
*buffer
;
1251 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
)) {
1252 mtu
= ETH_PAYLOAD_MAX
;
1255 /* Assume Ethernet port. No need to set packet_type. */
1256 buffer
= dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN
+ mtu
,
1257 DP_NETDEV_HEADROOM
);
1258 retval
= (rx
->is_tap
1259 ? netdev_linux_rxq_recv_tap(rx
->fd
, buffer
)
1260 : netdev_linux_rxq_recv_sock(rx
->fd
, buffer
));
1263 if (retval
!= EAGAIN
&& retval
!= EMSGSIZE
) {
1264 VLOG_WARN_RL(&rl
, "error receiving Ethernet packet on %s: %s",
1265 netdev_rxq_get_name(rxq_
), ovs_strerror(errno
));
1267 dp_packet_delete(buffer
);
1269 dp_packet_batch_init_packet(batch
, buffer
);
1280 netdev_linux_rxq_wait(struct netdev_rxq
*rxq_
)
1282 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1283 poll_fd_wait(rx
->fd
, POLLIN
);
1287 netdev_linux_rxq_drain(struct netdev_rxq
*rxq_
)
1289 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1292 int error
= af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_
), &ifr
,
1293 SIOCGIFTXQLEN
, "SIOCGIFTXQLEN");
1297 drain_fd(rx
->fd
, ifr
.ifr_qlen
);
1300 return drain_rcvbuf(rx
->fd
);
1305 netdev_linux_sock_batch_send(int sock
, int ifindex
,
1306 struct dp_packet_batch
*batch
)
1308 const size_t size
= dp_packet_batch_size(batch
);
1309 /* We don't bother setting most fields in sockaddr_ll because the
1310 * kernel ignores them for SOCK_RAW. */
1311 struct sockaddr_ll sll
= { .sll_family
= AF_PACKET
,
1312 .sll_ifindex
= ifindex
};
1314 struct mmsghdr
*mmsg
= xmalloc(sizeof(*mmsg
) * size
);
1315 struct iovec
*iov
= xmalloc(sizeof(*iov
) * size
);
1317 struct dp_packet
*packet
;
1318 DP_PACKET_BATCH_FOR_EACH (i
, packet
, batch
) {
1319 iov
[i
].iov_base
= dp_packet_data(packet
);
1320 iov
[i
].iov_len
= dp_packet_size(packet
);
1321 mmsg
[i
].msg_hdr
= (struct msghdr
) { .msg_name
= &sll
,
1322 .msg_namelen
= sizeof sll
,
1328 for (uint32_t ofs
= 0; ofs
< size
; ) {
1331 retval
= sendmmsg(sock
, mmsg
+ ofs
, size
- ofs
, 0);
1332 error
= retval
< 0 ? errno
: 0;
1333 } while (error
== EINTR
);
1345 /* Use the tap fd to send 'batch' to tap device 'netdev'. Using the tap fd is
1346 * essential, because packets sent to a tap device with an AF_PACKET socket
1347 * will loop back to be *received* again on the tap device. This doesn't occur
1348 * on other interface types because we attach a socket filter to the rx
1351 netdev_linux_tap_batch_send(struct netdev
*netdev_
,
1352 struct dp_packet_batch
*batch
)
1354 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1355 struct dp_packet
*packet
;
1357 /* The Linux tap driver returns EIO if the device is not up,
1358 * so if the device is not up, don't waste time sending it.
1359 * However, if the device is in another network namespace
1360 * then OVS can't retrieve the state. In that case, send the
1361 * packets anyway. */
1362 if (netdev
->present
&& !(netdev
->ifi_flags
& IFF_UP
)) {
1363 netdev
->tx_dropped
+= dp_packet_batch_size(batch
);
1367 DP_PACKET_BATCH_FOR_EACH (i
, packet
, batch
) {
1368 size_t size
= dp_packet_size(packet
);
1373 retval
= write(netdev
->tap_fd
, dp_packet_data(packet
), size
);
1374 error
= retval
< 0 ? errno
: 0;
1375 } while (error
== EINTR
);
1378 /* The Linux tap driver returns EIO if the device is not up. From
1379 * the OVS side this is not an error, so we ignore it; otherwise,
1380 * return the erro. */
1384 } else if (retval
!= size
) {
1385 VLOG_WARN_RL(&rl
, "sent partial Ethernet packet (%"PRIuSIZE
" "
1386 "bytes of %"PRIuSIZE
") on %s",
1387 retval
, size
, netdev_get_name(netdev_
));
1394 /* Sends 'batch' on 'netdev'. Returns 0 if successful, otherwise a positive
1395 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1396 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1397 * the packet is too big or too small to transmit on the device.
1399 * The kernel maintains a packet transmission queue, so the caller is not
1400 * expected to do additional queuing of packets. */
1402 netdev_linux_send(struct netdev
*netdev_
, int qid OVS_UNUSED
,
1403 struct dp_packet_batch
*batch
,
1404 bool concurrent_txq OVS_UNUSED
)
1409 if (!is_tap_netdev(netdev_
)) {
1410 if (netdev_linux_netnsid_is_remote(netdev_linux_cast(netdev_
))) {
1415 sock
= af_packet_sock();
1421 int ifindex
= netdev_get_ifindex(netdev_
);
1427 error
= netdev_linux_sock_batch_send(sock
, ifindex
, batch
);
1429 error
= netdev_linux_tap_batch_send(netdev_
, batch
);
1432 if (error
== ENOBUFS
) {
1433 /* The Linux AF_PACKET implementation never blocks waiting
1434 * for room for packets, instead returning ENOBUFS.
1435 * Translate this into EAGAIN for the caller. */
1438 VLOG_WARN_RL(&rl
, "error sending Ethernet packet on %s: %s",
1439 netdev_get_name(netdev_
), ovs_strerror(error
));
1444 dp_packet_delete_batch(batch
, true);
1448 /* Registers with the poll loop to wake up from the next call to poll_block()
1449 * when the packet transmission queue has sufficient room to transmit a packet
1450 * with netdev_send().
1452 * The kernel maintains a packet transmission queue, so the client is not
1453 * expected to do additional queuing of packets. Thus, this function is
1454 * unlikely to ever be used. It is included for completeness. */
1456 netdev_linux_send_wait(struct netdev
*netdev
, int qid OVS_UNUSED
)
1458 if (is_tap_netdev(netdev
)) {
1459 /* TAP device always accepts packets.*/
1460 poll_immediate_wake();
1464 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1465 * otherwise a positive errno value. */
1467 netdev_linux_set_etheraddr(struct netdev
*netdev_
, const struct eth_addr mac
)
1469 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1470 enum netdev_flags old_flags
= 0;
1473 ovs_mutex_lock(&netdev
->mutex
);
1474 if (netdev_linux_netnsid_is_remote(netdev
)) {
1479 if (netdev
->cache_valid
& VALID_ETHERADDR
) {
1480 error
= netdev
->ether_addr_error
;
1481 if (error
|| eth_addr_equals(netdev
->etheraddr
, mac
)) {
1484 netdev
->cache_valid
&= ~VALID_ETHERADDR
;
1487 /* Tap devices must be brought down before setting the address. */
1488 if (is_tap_netdev(netdev_
)) {
1489 update_flags(netdev
, NETDEV_UP
, 0, &old_flags
);
1491 error
= set_etheraddr(netdev_get_name(netdev_
), mac
);
1492 if (!error
|| error
== ENODEV
) {
1493 netdev
->ether_addr_error
= error
;
1494 netdev
->cache_valid
|= VALID_ETHERADDR
;
1496 netdev
->etheraddr
= mac
;
1500 if (is_tap_netdev(netdev_
) && old_flags
& NETDEV_UP
) {
1501 update_flags(netdev
, 0, NETDEV_UP
, &old_flags
);
1505 ovs_mutex_unlock(&netdev
->mutex
);
1509 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1511 netdev_linux_get_etheraddr(const struct netdev
*netdev_
, struct eth_addr
*mac
)
1513 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1516 ovs_mutex_lock(&netdev
->mutex
);
1517 if (!(netdev
->cache_valid
& VALID_ETHERADDR
)) {
1518 netdev_linux_update_via_netlink(netdev
);
1521 if (!(netdev
->cache_valid
& VALID_ETHERADDR
)) {
1522 /* Fall back to ioctl if netlink fails */
1523 netdev
->ether_addr_error
= get_etheraddr(netdev_get_name(netdev_
),
1524 &netdev
->etheraddr
);
1525 netdev
->cache_valid
|= VALID_ETHERADDR
;
1528 error
= netdev
->ether_addr_error
;
1530 *mac
= netdev
->etheraddr
;
1532 ovs_mutex_unlock(&netdev
->mutex
);
1538 netdev_linux_get_mtu__(struct netdev_linux
*netdev
, int *mtup
)
1542 if (!(netdev
->cache_valid
& VALID_MTU
)) {
1543 netdev_linux_update_via_netlink(netdev
);
1546 if (!(netdev
->cache_valid
& VALID_MTU
)) {
1547 /* Fall back to ioctl if netlink fails */
1550 netdev
->netdev_mtu_error
= af_inet_ifreq_ioctl(
1551 netdev_get_name(&netdev
->up
), &ifr
, SIOCGIFMTU
, "SIOCGIFMTU");
1552 netdev
->mtu
= ifr
.ifr_mtu
;
1553 netdev
->cache_valid
|= VALID_MTU
;
1556 error
= netdev
->netdev_mtu_error
;
1558 *mtup
= netdev
->mtu
;
1564 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1565 * in bytes, not including the hardware header; thus, this is typically 1500
1566 * bytes for Ethernet devices. */
1568 netdev_linux_get_mtu(const struct netdev
*netdev_
, int *mtup
)
1570 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1573 ovs_mutex_lock(&netdev
->mutex
);
1574 error
= netdev_linux_get_mtu__(netdev
, mtup
);
1575 ovs_mutex_unlock(&netdev
->mutex
);
1580 /* Sets the maximum size of transmitted (MTU) for given device using linux
1581 * networking ioctl interface.
1584 netdev_linux_set_mtu(struct netdev
*netdev_
, int mtu
)
1586 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1590 ovs_mutex_lock(&netdev
->mutex
);
1591 if (netdev_linux_netnsid_is_remote(netdev
)) {
1596 if (netdev
->cache_valid
& VALID_MTU
) {
1597 error
= netdev
->netdev_mtu_error
;
1598 if (error
|| netdev
->mtu
== mtu
) {
1601 netdev
->cache_valid
&= ~VALID_MTU
;
1604 error
= af_inet_ifreq_ioctl(netdev_get_name(netdev_
), &ifr
,
1605 SIOCSIFMTU
, "SIOCSIFMTU");
1606 if (!error
|| error
== ENODEV
) {
1607 netdev
->netdev_mtu_error
= error
;
1608 netdev
->mtu
= ifr
.ifr_mtu
;
1609 netdev
->cache_valid
|= VALID_MTU
;
1612 ovs_mutex_unlock(&netdev
->mutex
);
1616 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1617 * On failure, returns a negative errno value. */
1619 netdev_linux_get_ifindex(const struct netdev
*netdev_
)
1621 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1624 ovs_mutex_lock(&netdev
->mutex
);
1625 if (netdev_linux_netnsid_is_remote(netdev
)) {
1629 error
= get_ifindex(netdev_
, &ifindex
);
1632 ovs_mutex_unlock(&netdev
->mutex
);
1633 return error
? -error
: ifindex
;
1637 netdev_linux_get_carrier(const struct netdev
*netdev_
, bool *carrier
)
1639 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1641 ovs_mutex_lock(&netdev
->mutex
);
1642 if (netdev
->miimon_interval
> 0) {
1643 *carrier
= netdev
->miimon
;
1645 *carrier
= (netdev
->ifi_flags
& IFF_RUNNING
) != 0;
1647 ovs_mutex_unlock(&netdev
->mutex
);
1652 static long long int
1653 netdev_linux_get_carrier_resets(const struct netdev
*netdev_
)
1655 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1656 long long int carrier_resets
;
1658 ovs_mutex_lock(&netdev
->mutex
);
1659 carrier_resets
= netdev
->carrier_resets
;
1660 ovs_mutex_unlock(&netdev
->mutex
);
1662 return carrier_resets
;
1666 netdev_linux_do_miimon(const char *name
, int cmd
, const char *cmd_name
,
1667 struct mii_ioctl_data
*data
)
1672 memset(&ifr
, 0, sizeof ifr
);
1673 memcpy(&ifr
.ifr_data
, data
, sizeof *data
);
1674 error
= af_inet_ifreq_ioctl(name
, &ifr
, cmd
, cmd_name
);
1675 memcpy(data
, &ifr
.ifr_data
, sizeof *data
);
1681 netdev_linux_get_miimon(const char *name
, bool *miimon
)
1683 struct mii_ioctl_data data
;
1688 memset(&data
, 0, sizeof data
);
1689 error
= netdev_linux_do_miimon(name
, SIOCGMIIPHY
, "SIOCGMIIPHY", &data
);
1691 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1692 data
.reg_num
= MII_BMSR
;
1693 error
= netdev_linux_do_miimon(name
, SIOCGMIIREG
, "SIOCGMIIREG",
1697 *miimon
= !!(data
.val_out
& BMSR_LSTATUS
);
1701 struct ethtool_cmd ecmd
;
1703 VLOG_DBG_RL(&rl
, "%s: failed to query MII, falling back to ethtool",
1706 COVERAGE_INC(netdev_get_ethtool
);
1707 memset(&ecmd
, 0, sizeof ecmd
);
1708 error
= netdev_linux_do_ethtool(name
, &ecmd
, ETHTOOL_GLINK
,
1711 struct ethtool_value eval
;
1713 memcpy(&eval
, &ecmd
, sizeof eval
);
1714 *miimon
= !!eval
.data
;
1716 VLOG_WARN_RL(&rl
, "%s: ethtool link status failed", name
);
1724 netdev_linux_set_miimon_interval(struct netdev
*netdev_
,
1725 long long int interval
)
1727 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1729 ovs_mutex_lock(&netdev
->mutex
);
1730 interval
= interval
> 0 ? MAX(interval
, 100) : 0;
1731 if (netdev
->miimon_interval
!= interval
) {
1732 if (interval
&& !netdev
->miimon_interval
) {
1733 atomic_count_inc(&miimon_cnt
);
1734 } else if (!interval
&& netdev
->miimon_interval
) {
1735 atomic_count_dec(&miimon_cnt
);
1738 netdev
->miimon_interval
= interval
;
1739 timer_set_expired(&netdev
->miimon_timer
);
1741 ovs_mutex_unlock(&netdev
->mutex
);
1747 netdev_linux_miimon_run(void)
1749 struct shash device_shash
;
1750 struct shash_node
*node
;
1752 shash_init(&device_shash
);
1753 netdev_get_devices(&netdev_linux_class
, &device_shash
);
1754 SHASH_FOR_EACH (node
, &device_shash
) {
1755 struct netdev
*netdev
= node
->data
;
1756 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
1759 ovs_mutex_lock(&dev
->mutex
);
1760 if (dev
->miimon_interval
> 0 && timer_expired(&dev
->miimon_timer
)) {
1761 netdev_linux_get_miimon(dev
->up
.name
, &miimon
);
1762 if (miimon
!= dev
->miimon
) {
1763 dev
->miimon
= miimon
;
1764 netdev_linux_changed(dev
, dev
->ifi_flags
, 0);
1767 timer_set_duration(&dev
->miimon_timer
, dev
->miimon_interval
);
1769 ovs_mutex_unlock(&dev
->mutex
);
1770 netdev_close(netdev
);
1773 shash_destroy(&device_shash
);
1777 netdev_linux_miimon_wait(void)
1779 struct shash device_shash
;
1780 struct shash_node
*node
;
1782 shash_init(&device_shash
);
1783 netdev_get_devices(&netdev_linux_class
, &device_shash
);
1784 SHASH_FOR_EACH (node
, &device_shash
) {
1785 struct netdev
*netdev
= node
->data
;
1786 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
1788 ovs_mutex_lock(&dev
->mutex
);
1789 if (dev
->miimon_interval
> 0) {
1790 timer_wait(&dev
->miimon_timer
);
1792 ovs_mutex_unlock(&dev
->mutex
);
1793 netdev_close(netdev
);
1795 shash_destroy(&device_shash
);
1799 swap_uint64(uint64_t *a
, uint64_t *b
)
1806 /* Copies 'src' into 'dst', performing format conversion in the process.
1808 * 'src' is allowed to be misaligned. */
1810 netdev_stats_from_ovs_vport_stats(struct netdev_stats
*dst
,
1811 const struct ovs_vport_stats
*src
)
1813 dst
->rx_packets
= get_32aligned_u64(&src
->rx_packets
);
1814 dst
->tx_packets
= get_32aligned_u64(&src
->tx_packets
);
1815 dst
->rx_bytes
= get_32aligned_u64(&src
->rx_bytes
);
1816 dst
->tx_bytes
= get_32aligned_u64(&src
->tx_bytes
);
1817 dst
->rx_errors
= get_32aligned_u64(&src
->rx_errors
);
1818 dst
->tx_errors
= get_32aligned_u64(&src
->tx_errors
);
1819 dst
->rx_dropped
= get_32aligned_u64(&src
->rx_dropped
);
1820 dst
->tx_dropped
= get_32aligned_u64(&src
->tx_dropped
);
1822 dst
->collisions
= 0;
1823 dst
->rx_length_errors
= 0;
1824 dst
->rx_over_errors
= 0;
1825 dst
->rx_crc_errors
= 0;
1826 dst
->rx_frame_errors
= 0;
1827 dst
->rx_fifo_errors
= 0;
1828 dst
->rx_missed_errors
= 0;
1829 dst
->tx_aborted_errors
= 0;
1830 dst
->tx_carrier_errors
= 0;
1831 dst
->tx_fifo_errors
= 0;
1832 dst
->tx_heartbeat_errors
= 0;
1833 dst
->tx_window_errors
= 0;
1837 get_stats_via_vport__(const struct netdev
*netdev
, struct netdev_stats
*stats
)
1839 struct dpif_netlink_vport reply
;
1843 error
= dpif_netlink_vport_get(netdev_get_name(netdev
), &reply
, &buf
);
1846 } else if (!reply
.stats
) {
1851 netdev_stats_from_ovs_vport_stats(stats
, reply
.stats
);
1859 get_stats_via_vport(const struct netdev
*netdev_
,
1860 struct netdev_stats
*stats
)
1862 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1864 if (!netdev
->vport_stats_error
||
1865 !(netdev
->cache_valid
& VALID_VPORT_STAT_ERROR
)) {
1868 error
= get_stats_via_vport__(netdev_
, stats
);
1869 if (error
&& error
!= ENOENT
&& error
!= ENODEV
) {
1870 VLOG_WARN_RL(&rl
, "%s: obtaining netdev stats via vport failed "
1872 netdev_get_name(netdev_
), ovs_strerror(error
));
1874 netdev
->vport_stats_error
= error
;
1875 netdev
->cache_valid
|= VALID_VPORT_STAT_ERROR
;
1879 /* Retrieves current device stats for 'netdev-linux'. */
1881 netdev_linux_get_stats(const struct netdev
*netdev_
,
1882 struct netdev_stats
*stats
)
1884 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1885 struct netdev_stats dev_stats
;
1888 ovs_mutex_lock(&netdev
->mutex
);
1889 get_stats_via_vport(netdev_
, stats
);
1890 error
= get_stats_via_netlink(netdev_
, &dev_stats
);
1892 if (!netdev
->vport_stats_error
) {
1895 } else if (netdev
->vport_stats_error
) {
1896 /* stats not available from OVS then use netdev stats. */
1899 /* Use kernel netdev's packet and byte counts since vport's counters
1900 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1902 stats
->rx_packets
= dev_stats
.rx_packets
;
1903 stats
->rx_bytes
= dev_stats
.rx_bytes
;
1904 stats
->tx_packets
= dev_stats
.tx_packets
;
1905 stats
->tx_bytes
= dev_stats
.tx_bytes
;
1907 stats
->rx_errors
+= dev_stats
.rx_errors
;
1908 stats
->tx_errors
+= dev_stats
.tx_errors
;
1909 stats
->rx_dropped
+= dev_stats
.rx_dropped
;
1910 stats
->tx_dropped
+= dev_stats
.tx_dropped
;
1911 stats
->multicast
+= dev_stats
.multicast
;
1912 stats
->collisions
+= dev_stats
.collisions
;
1913 stats
->rx_length_errors
+= dev_stats
.rx_length_errors
;
1914 stats
->rx_over_errors
+= dev_stats
.rx_over_errors
;
1915 stats
->rx_crc_errors
+= dev_stats
.rx_crc_errors
;
1916 stats
->rx_frame_errors
+= dev_stats
.rx_frame_errors
;
1917 stats
->rx_fifo_errors
+= dev_stats
.rx_fifo_errors
;
1918 stats
->rx_missed_errors
+= dev_stats
.rx_missed_errors
;
1919 stats
->tx_aborted_errors
+= dev_stats
.tx_aborted_errors
;
1920 stats
->tx_carrier_errors
+= dev_stats
.tx_carrier_errors
;
1921 stats
->tx_fifo_errors
+= dev_stats
.tx_fifo_errors
;
1922 stats
->tx_heartbeat_errors
+= dev_stats
.tx_heartbeat_errors
;
1923 stats
->tx_window_errors
+= dev_stats
.tx_window_errors
;
1925 ovs_mutex_unlock(&netdev
->mutex
);
1930 /* Retrieves current device stats for 'netdev-tap' netdev or
1931 * netdev-internal. */
1933 netdev_tap_get_stats(const struct netdev
*netdev_
, struct netdev_stats
*stats
)
1935 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1936 struct netdev_stats dev_stats
;
1939 ovs_mutex_lock(&netdev
->mutex
);
1940 get_stats_via_vport(netdev_
, stats
);
1941 error
= get_stats_via_netlink(netdev_
, &dev_stats
);
1943 if (!netdev
->vport_stats_error
) {
1946 } else if (netdev
->vport_stats_error
) {
1947 /* Transmit and receive stats will appear to be swapped relative to the
1948 * other ports since we are the one sending the data, not a remote
1949 * computer. For consistency, we swap them back here. This does not
1950 * apply if we are getting stats from the vport layer because it always
1951 * tracks stats from the perspective of the switch. */
1954 swap_uint64(&stats
->rx_packets
, &stats
->tx_packets
);
1955 swap_uint64(&stats
->rx_bytes
, &stats
->tx_bytes
);
1956 swap_uint64(&stats
->rx_errors
, &stats
->tx_errors
);
1957 swap_uint64(&stats
->rx_dropped
, &stats
->tx_dropped
);
1958 stats
->rx_length_errors
= 0;
1959 stats
->rx_over_errors
= 0;
1960 stats
->rx_crc_errors
= 0;
1961 stats
->rx_frame_errors
= 0;
1962 stats
->rx_fifo_errors
= 0;
1963 stats
->rx_missed_errors
= 0;
1964 stats
->tx_aborted_errors
= 0;
1965 stats
->tx_carrier_errors
= 0;
1966 stats
->tx_fifo_errors
= 0;
1967 stats
->tx_heartbeat_errors
= 0;
1968 stats
->tx_window_errors
= 0;
1970 /* Use kernel netdev's packet and byte counts since vport counters
1971 * do not reflect packet counts on the wire when GSO, TSO or GRO
1973 stats
->rx_packets
= dev_stats
.tx_packets
;
1974 stats
->rx_bytes
= dev_stats
.tx_bytes
;
1975 stats
->tx_packets
= dev_stats
.rx_packets
;
1976 stats
->tx_bytes
= dev_stats
.rx_bytes
;
1978 stats
->rx_dropped
+= dev_stats
.tx_dropped
;
1979 stats
->tx_dropped
+= dev_stats
.rx_dropped
;
1981 stats
->rx_errors
+= dev_stats
.tx_errors
;
1982 stats
->tx_errors
+= dev_stats
.rx_errors
;
1984 stats
->multicast
+= dev_stats
.multicast
;
1985 stats
->collisions
+= dev_stats
.collisions
;
1987 stats
->tx_dropped
+= netdev
->tx_dropped
;
1988 ovs_mutex_unlock(&netdev
->mutex
);
1994 netdev_internal_get_stats(const struct netdev
*netdev_
,
1995 struct netdev_stats
*stats
)
1997 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2000 ovs_mutex_lock(&netdev
->mutex
);
2001 get_stats_via_vport(netdev_
, stats
);
2002 error
= netdev
->vport_stats_error
;
2003 ovs_mutex_unlock(&netdev
->mutex
);
2009 netdev_linux_read_features(struct netdev_linux
*netdev
)
2011 struct ethtool_cmd ecmd
;
2015 if (netdev
->cache_valid
& VALID_FEATURES
) {
2019 COVERAGE_INC(netdev_get_ethtool
);
2020 memset(&ecmd
, 0, sizeof ecmd
);
2021 error
= netdev_linux_do_ethtool(netdev
->up
.name
, &ecmd
,
2022 ETHTOOL_GSET
, "ETHTOOL_GSET");
2027 /* Supported features. */
2028 netdev
->supported
= 0;
2029 if (ecmd
.supported
& SUPPORTED_10baseT_Half
) {
2030 netdev
->supported
|= NETDEV_F_10MB_HD
;
2032 if (ecmd
.supported
& SUPPORTED_10baseT_Full
) {
2033 netdev
->supported
|= NETDEV_F_10MB_FD
;
2035 if (ecmd
.supported
& SUPPORTED_100baseT_Half
) {
2036 netdev
->supported
|= NETDEV_F_100MB_HD
;
2038 if (ecmd
.supported
& SUPPORTED_100baseT_Full
) {
2039 netdev
->supported
|= NETDEV_F_100MB_FD
;
2041 if (ecmd
.supported
& SUPPORTED_1000baseT_Half
) {
2042 netdev
->supported
|= NETDEV_F_1GB_HD
;
2044 if ((ecmd
.supported
& SUPPORTED_1000baseT_Full
) ||
2045 (ecmd
.supported
& SUPPORTED_1000baseKX_Full
)) {
2046 netdev
->supported
|= NETDEV_F_1GB_FD
;
2048 if ((ecmd
.supported
& SUPPORTED_10000baseT_Full
) ||
2049 (ecmd
.supported
& SUPPORTED_10000baseKX4_Full
) ||
2050 (ecmd
.supported
& SUPPORTED_10000baseKR_Full
) ||
2051 (ecmd
.supported
& SUPPORTED_10000baseR_FEC
)) {
2052 netdev
->supported
|= NETDEV_F_10GB_FD
;
2054 if ((ecmd
.supported
& SUPPORTED_40000baseKR4_Full
) ||
2055 (ecmd
.supported
& SUPPORTED_40000baseCR4_Full
) ||
2056 (ecmd
.supported
& SUPPORTED_40000baseSR4_Full
) ||
2057 (ecmd
.supported
& SUPPORTED_40000baseLR4_Full
)) {
2058 netdev
->supported
|= NETDEV_F_40GB_FD
;
2060 if (ecmd
.supported
& SUPPORTED_TP
) {
2061 netdev
->supported
|= NETDEV_F_COPPER
;
2063 if (ecmd
.supported
& SUPPORTED_FIBRE
) {
2064 netdev
->supported
|= NETDEV_F_FIBER
;
2066 if (ecmd
.supported
& SUPPORTED_Autoneg
) {
2067 netdev
->supported
|= NETDEV_F_AUTONEG
;
2069 if (ecmd
.supported
& SUPPORTED_Pause
) {
2070 netdev
->supported
|= NETDEV_F_PAUSE
;
2072 if (ecmd
.supported
& SUPPORTED_Asym_Pause
) {
2073 netdev
->supported
|= NETDEV_F_PAUSE_ASYM
;
2076 /* Advertised features. */
2077 netdev
->advertised
= 0;
2078 if (ecmd
.advertising
& ADVERTISED_10baseT_Half
) {
2079 netdev
->advertised
|= NETDEV_F_10MB_HD
;
2081 if (ecmd
.advertising
& ADVERTISED_10baseT_Full
) {
2082 netdev
->advertised
|= NETDEV_F_10MB_FD
;
2084 if (ecmd
.advertising
& ADVERTISED_100baseT_Half
) {
2085 netdev
->advertised
|= NETDEV_F_100MB_HD
;
2087 if (ecmd
.advertising
& ADVERTISED_100baseT_Full
) {
2088 netdev
->advertised
|= NETDEV_F_100MB_FD
;
2090 if (ecmd
.advertising
& ADVERTISED_1000baseT_Half
) {
2091 netdev
->advertised
|= NETDEV_F_1GB_HD
;
2093 if ((ecmd
.advertising
& ADVERTISED_1000baseT_Full
) ||
2094 (ecmd
.advertising
& ADVERTISED_1000baseKX_Full
)) {
2095 netdev
->advertised
|= NETDEV_F_1GB_FD
;
2097 if ((ecmd
.advertising
& ADVERTISED_10000baseT_Full
) ||
2098 (ecmd
.advertising
& ADVERTISED_10000baseKX4_Full
) ||
2099 (ecmd
.advertising
& ADVERTISED_10000baseKR_Full
) ||
2100 (ecmd
.advertising
& ADVERTISED_10000baseR_FEC
)) {
2101 netdev
->advertised
|= NETDEV_F_10GB_FD
;
2103 if ((ecmd
.advertising
& ADVERTISED_40000baseKR4_Full
) ||
2104 (ecmd
.advertising
& ADVERTISED_40000baseCR4_Full
) ||
2105 (ecmd
.advertising
& ADVERTISED_40000baseSR4_Full
) ||
2106 (ecmd
.advertising
& ADVERTISED_40000baseLR4_Full
)) {
2107 netdev
->advertised
|= NETDEV_F_40GB_FD
;
2109 if (ecmd
.advertising
& ADVERTISED_TP
) {
2110 netdev
->advertised
|= NETDEV_F_COPPER
;
2112 if (ecmd
.advertising
& ADVERTISED_FIBRE
) {
2113 netdev
->advertised
|= NETDEV_F_FIBER
;
2115 if (ecmd
.advertising
& ADVERTISED_Autoneg
) {
2116 netdev
->advertised
|= NETDEV_F_AUTONEG
;
2118 if (ecmd
.advertising
& ADVERTISED_Pause
) {
2119 netdev
->advertised
|= NETDEV_F_PAUSE
;
2121 if (ecmd
.advertising
& ADVERTISED_Asym_Pause
) {
2122 netdev
->advertised
|= NETDEV_F_PAUSE_ASYM
;
2125 /* Current settings. */
2126 speed
= ethtool_cmd_speed(&ecmd
);
2127 if (speed
== SPEED_10
) {
2128 netdev
->current
= ecmd
.duplex
? NETDEV_F_10MB_FD
: NETDEV_F_10MB_HD
;
2129 } else if (speed
== SPEED_100
) {
2130 netdev
->current
= ecmd
.duplex
? NETDEV_F_100MB_FD
: NETDEV_F_100MB_HD
;
2131 } else if (speed
== SPEED_1000
) {
2132 netdev
->current
= ecmd
.duplex
? NETDEV_F_1GB_FD
: NETDEV_F_1GB_HD
;
2133 } else if (speed
== SPEED_10000
) {
2134 netdev
->current
= NETDEV_F_10GB_FD
;
2135 } else if (speed
== 40000) {
2136 netdev
->current
= NETDEV_F_40GB_FD
;
2137 } else if (speed
== 100000) {
2138 netdev
->current
= NETDEV_F_100GB_FD
;
2139 } else if (speed
== 1000000) {
2140 netdev
->current
= NETDEV_F_1TB_FD
;
2142 netdev
->current
= 0;
2145 if (ecmd
.port
== PORT_TP
) {
2146 netdev
->current
|= NETDEV_F_COPPER
;
2147 } else if (ecmd
.port
== PORT_FIBRE
) {
2148 netdev
->current
|= NETDEV_F_FIBER
;
2152 netdev
->current
|= NETDEV_F_AUTONEG
;
2156 netdev
->cache_valid
|= VALID_FEATURES
;
2157 netdev
->get_features_error
= error
;
2160 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
2161 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
2162 * Returns 0 if successful, otherwise a positive errno value. */
2164 netdev_linux_get_features(const struct netdev
*netdev_
,
2165 enum netdev_features
*current
,
2166 enum netdev_features
*advertised
,
2167 enum netdev_features
*supported
,
2168 enum netdev_features
*peer
)
2170 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2173 ovs_mutex_lock(&netdev
->mutex
);
2174 if (netdev_linux_netnsid_is_remote(netdev
)) {
2179 netdev_linux_read_features(netdev
);
2180 if (!netdev
->get_features_error
) {
2181 *current
= netdev
->current
;
2182 *advertised
= netdev
->advertised
;
2183 *supported
= netdev
->supported
;
2184 *peer
= 0; /* XXX */
2186 error
= netdev
->get_features_error
;
2189 ovs_mutex_unlock(&netdev
->mutex
);
2193 /* Set the features advertised by 'netdev' to 'advertise'. */
2195 netdev_linux_set_advertisements(struct netdev
*netdev_
,
2196 enum netdev_features advertise
)
2198 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2199 struct ethtool_cmd ecmd
;
2202 ovs_mutex_lock(&netdev
->mutex
);
2204 COVERAGE_INC(netdev_get_ethtool
);
2206 if (netdev_linux_netnsid_is_remote(netdev
)) {
2211 memset(&ecmd
, 0, sizeof ecmd
);
2212 error
= netdev_linux_do_ethtool(netdev_get_name(netdev_
), &ecmd
,
2213 ETHTOOL_GSET
, "ETHTOOL_GSET");
2218 ecmd
.advertising
= 0;
2219 if (advertise
& NETDEV_F_10MB_HD
) {
2220 ecmd
.advertising
|= ADVERTISED_10baseT_Half
;
2222 if (advertise
& NETDEV_F_10MB_FD
) {
2223 ecmd
.advertising
|= ADVERTISED_10baseT_Full
;
2225 if (advertise
& NETDEV_F_100MB_HD
) {
2226 ecmd
.advertising
|= ADVERTISED_100baseT_Half
;
2228 if (advertise
& NETDEV_F_100MB_FD
) {
2229 ecmd
.advertising
|= ADVERTISED_100baseT_Full
;
2231 if (advertise
& NETDEV_F_1GB_HD
) {
2232 ecmd
.advertising
|= ADVERTISED_1000baseT_Half
;
2234 if (advertise
& NETDEV_F_1GB_FD
) {
2235 ecmd
.advertising
|= ADVERTISED_1000baseT_Full
;
2237 if (advertise
& NETDEV_F_10GB_FD
) {
2238 ecmd
.advertising
|= ADVERTISED_10000baseT_Full
;
2240 if (advertise
& NETDEV_F_COPPER
) {
2241 ecmd
.advertising
|= ADVERTISED_TP
;
2243 if (advertise
& NETDEV_F_FIBER
) {
2244 ecmd
.advertising
|= ADVERTISED_FIBRE
;
2246 if (advertise
& NETDEV_F_AUTONEG
) {
2247 ecmd
.advertising
|= ADVERTISED_Autoneg
;
2249 if (advertise
& NETDEV_F_PAUSE
) {
2250 ecmd
.advertising
|= ADVERTISED_Pause
;
2252 if (advertise
& NETDEV_F_PAUSE_ASYM
) {
2253 ecmd
.advertising
|= ADVERTISED_Asym_Pause
;
2255 COVERAGE_INC(netdev_set_ethtool
);
2256 error
= netdev_linux_do_ethtool(netdev_get_name(netdev_
), &ecmd
,
2257 ETHTOOL_SSET
, "ETHTOOL_SSET");
2260 ovs_mutex_unlock(&netdev
->mutex
);
2264 static struct tc_police
2265 tc_matchall_fill_police(uint32_t kbits_rate
, uint32_t kbits_burst
)
2267 unsigned int bsize
= MIN(UINT32_MAX
/ 1024, kbits_burst
) * 1024 / 64;
2268 unsigned int bps
= ((uint64_t) kbits_rate
* 1000) / 8;
2269 struct tc_police police
;
2270 struct tc_ratespec rate
;
2273 memset(&rate
, 0, sizeof rate
);
2275 rate
.cell_log
= tc_calc_cell_log(mtu
);
2276 rate
.mpu
= ETH_TOTAL_MIN
;
2278 memset(&police
, 0, sizeof police
);
2279 police
.burst
= tc_bytes_to_ticks(bps
, bsize
);
2280 police
.action
= TC_POLICE_SHOT
;
2288 nl_msg_put_act_police(struct ofpbuf
*request
, struct tc_police police
)
2292 nl_msg_put_string(request
, TCA_ACT_KIND
, "police");
2293 offset
= nl_msg_start_nested(request
, TCA_ACT_OPTIONS
);
2294 nl_msg_put_unspec(request
, TCA_POLICE_TBF
, &police
, sizeof police
);
2295 tc_put_rtab(request
, TCA_POLICE_RATE
, &police
.rate
);
2296 nl_msg_put_u32(request
, TCA_POLICE_RESULT
, TC_ACT_UNSPEC
);
2297 nl_msg_end_nested(request
, offset
);
2301 tc_add_matchall_policer(struct netdev
*netdev
, uint32_t kbits_rate
,
2302 uint32_t kbits_burst
)
2304 uint16_t eth_type
= (OVS_FORCE
uint16_t) htons(ETH_P_ALL
);
2305 size_t basic_offset
, action_offset
, inner_offset
;
2306 uint16_t prio
= TC_RESERVED_PRIORITY_POLICE
;
2307 int ifindex
, index
, err
= 0;
2308 struct tc_police pol_act
;
2309 uint32_t block_id
= 0;
2310 struct ofpbuf request
;
2311 struct ofpbuf
*reply
;
2312 struct tcmsg
*tcmsg
;
2313 uint32_t handle
= 1;
2315 err
= get_ifindex(netdev
, &ifindex
);
2320 index
= block_id
? TCM_IFINDEX_MAGIC_BLOCK
: ifindex
;
2321 tcmsg
= tc_make_request(index
, RTM_NEWTFILTER
, NLM_F_CREATE
| NLM_F_ECHO
,
2323 tcmsg
->tcm_parent
= block_id
? : TC_INGRESS_PARENT
;
2324 tcmsg
->tcm_info
= tc_make_handle(prio
, eth_type
);
2325 tcmsg
->tcm_handle
= handle
;
2327 pol_act
= tc_matchall_fill_police(kbits_rate
, kbits_burst
);
2328 nl_msg_put_string(&request
, TCA_KIND
, "matchall");
2329 basic_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
2330 action_offset
= nl_msg_start_nested(&request
, TCA_MATCHALL_ACT
);
2331 inner_offset
= nl_msg_start_nested(&request
, 1);
2332 nl_msg_put_act_police(&request
, pol_act
);
2333 nl_msg_end_nested(&request
, inner_offset
);
2334 nl_msg_end_nested(&request
, action_offset
);
2335 nl_msg_end_nested(&request
, basic_offset
);
2337 err
= tc_transact(&request
, &reply
);
2340 ofpbuf_at_assert(reply
, NLMSG_HDRLEN
, sizeof *tc
);
2341 ofpbuf_delete(reply
);
2348 tc_del_matchall_policer(struct netdev
*netdev
)
2350 uint32_t block_id
= 0;
2354 err
= get_ifindex(netdev
, &ifindex
);
2359 err
= tc_del_filter(ifindex
, TC_RESERVED_PRIORITY_POLICE
, 1, block_id
,
2368 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
2369 * successful, otherwise a positive errno value. */
2371 netdev_linux_set_policing(struct netdev
*netdev_
,
2372 uint32_t kbits_rate
, uint32_t kbits_burst
)
2374 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2375 const char *netdev_name
= netdev_get_name(netdev_
);
2379 kbits_burst
= (!kbits_rate
? 0 /* Force to 0 if no rate specified. */
2380 : !kbits_burst
? 8000 /* Default to 8000 kbits if 0. */
2381 : kbits_burst
); /* Stick with user-specified value. */
2383 ovs_mutex_lock(&netdev
->mutex
);
2384 if (netdev_linux_netnsid_is_remote(netdev
)) {
2389 if (netdev
->cache_valid
& VALID_POLICING
) {
2390 error
= netdev
->netdev_policing_error
;
2391 if (error
|| (netdev
->kbits_rate
== kbits_rate
&&
2392 netdev
->kbits_burst
== kbits_burst
)) {
2393 /* Assume that settings haven't changed since we last set them. */
2396 netdev
->cache_valid
&= ~VALID_POLICING
;
2399 COVERAGE_INC(netdev_set_policing
);
2401 /* Use matchall for policing when offloading ovs with tc-flower. */
2402 if (netdev_is_flow_api_enabled()) {
2403 error
= tc_del_matchall_policer(netdev_
);
2405 error
= tc_add_matchall_policer(netdev_
, kbits_rate
, kbits_burst
);
2407 ovs_mutex_unlock(&netdev
->mutex
);
2411 error
= get_ifindex(netdev_
, &ifindex
);
2416 /* Remove any existing ingress qdisc. */
2417 error
= tc_add_del_qdisc(ifindex
, false, 0, TC_INGRESS
);
2419 VLOG_WARN_RL(&rl
, "%s: removing policing failed: %s",
2420 netdev_name
, ovs_strerror(error
));
2425 error
= tc_add_del_qdisc(ifindex
, true, 0, TC_INGRESS
);
2427 VLOG_WARN_RL(&rl
, "%s: adding policing qdisc failed: %s",
2428 netdev_name
, ovs_strerror(error
));
2432 error
= tc_add_policer(netdev_
, kbits_rate
, kbits_burst
);
2434 VLOG_WARN_RL(&rl
, "%s: adding policing action failed: %s",
2435 netdev_name
, ovs_strerror(error
));
2440 netdev
->kbits_rate
= kbits_rate
;
2441 netdev
->kbits_burst
= kbits_burst
;
2444 if (!error
|| error
== ENODEV
) {
2445 netdev
->netdev_policing_error
= error
;
2446 netdev
->cache_valid
|= VALID_POLICING
;
2448 ovs_mutex_unlock(&netdev
->mutex
);
2453 netdev_linux_get_qos_types(const struct netdev
*netdev OVS_UNUSED
,
2456 const struct tc_ops
*const *opsp
;
2457 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2458 const struct tc_ops
*ops
= *opsp
;
2459 if (ops
->tc_install
&& ops
->ovs_name
[0] != '\0') {
2460 sset_add(types
, ops
->ovs_name
);
2466 static const struct tc_ops
*
2467 tc_lookup_ovs_name(const char *name
)
2469 const struct tc_ops
*const *opsp
;
2471 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2472 const struct tc_ops
*ops
= *opsp
;
2473 if (!strcmp(name
, ops
->ovs_name
)) {
2480 static const struct tc_ops
*
2481 tc_lookup_linux_name(const char *name
)
2483 const struct tc_ops
*const *opsp
;
2485 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2486 const struct tc_ops
*ops
= *opsp
;
2487 if (ops
->linux_name
&& !strcmp(name
, ops
->linux_name
)) {
2494 static struct tc_queue
*
2495 tc_find_queue__(const struct netdev
*netdev_
, unsigned int queue_id
,
2498 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2499 struct tc_queue
*queue
;
2501 HMAP_FOR_EACH_IN_BUCKET (queue
, hmap_node
, hash
, &netdev
->tc
->queues
) {
2502 if (queue
->queue_id
== queue_id
) {
/* Convenience wrapper around tc_find_queue__() that hashes 'queue_id'
 * itself.  Returns the queue or NULL. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
}
2516 netdev_linux_get_qos_capabilities(const struct netdev
*netdev OVS_UNUSED
,
2518 struct netdev_qos_capabilities
*caps
)
2520 const struct tc_ops
*ops
= tc_lookup_ovs_name(type
);
2524 caps
->n_queues
= ops
->n_queues
;
2529 netdev_linux_get_qos(const struct netdev
*netdev_
,
2530 const char **typep
, struct smap
*details
)
2532 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2535 ovs_mutex_lock(&netdev
->mutex
);
2536 if (netdev_linux_netnsid_is_remote(netdev
)) {
2541 error
= tc_query_qdisc(netdev_
);
2543 *typep
= netdev
->tc
->ops
->ovs_name
;
2544 error
= (netdev
->tc
->ops
->qdisc_get
2545 ? netdev
->tc
->ops
->qdisc_get(netdev_
, details
)
2550 ovs_mutex_unlock(&netdev
->mutex
);
2555 netdev_linux_set_qos(struct netdev
*netdev_
,
2556 const char *type
, const struct smap
*details
)
2558 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2559 const struct tc_ops
*new_ops
;
2562 new_ops
= tc_lookup_ovs_name(type
);
2563 if (!new_ops
|| !new_ops
->tc_install
) {
2567 if (new_ops
== &tc_ops_noop
) {
2568 return new_ops
->tc_install(netdev_
, details
);
2571 ovs_mutex_lock(&netdev
->mutex
);
2572 if (netdev_linux_netnsid_is_remote(netdev
)) {
2577 error
= tc_query_qdisc(netdev_
);
2582 if (new_ops
== netdev
->tc
->ops
) {
2583 error
= new_ops
->qdisc_set
? new_ops
->qdisc_set(netdev_
, details
) : 0;
2585 /* Delete existing qdisc. */
2586 error
= tc_del_qdisc(netdev_
);
2590 ovs_assert(netdev
->tc
== NULL
);
2592 /* Install new qdisc. */
2593 error
= new_ops
->tc_install(netdev_
, details
);
2594 ovs_assert((error
== 0) == (netdev
->tc
!= NULL
));
2598 ovs_mutex_unlock(&netdev
->mutex
);
2603 netdev_linux_get_queue(const struct netdev
*netdev_
,
2604 unsigned int queue_id
, struct smap
*details
)
2606 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2609 ovs_mutex_lock(&netdev
->mutex
);
2610 if (netdev_linux_netnsid_is_remote(netdev
)) {
2615 error
= tc_query_qdisc(netdev_
);
2617 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2619 ? netdev
->tc
->ops
->class_get(netdev_
, queue
, details
)
2624 ovs_mutex_unlock(&netdev
->mutex
);
2629 netdev_linux_set_queue(struct netdev
*netdev_
,
2630 unsigned int queue_id
, const struct smap
*details
)
2632 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2635 ovs_mutex_lock(&netdev
->mutex
);
2636 if (netdev_linux_netnsid_is_remote(netdev
)) {
2641 error
= tc_query_qdisc(netdev_
);
2643 error
= (queue_id
< netdev
->tc
->ops
->n_queues
2644 && netdev
->tc
->ops
->class_set
2645 ? netdev
->tc
->ops
->class_set(netdev_
, queue_id
, details
)
2650 ovs_mutex_unlock(&netdev
->mutex
);
2655 netdev_linux_delete_queue(struct netdev
*netdev_
, unsigned int queue_id
)
2657 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2660 ovs_mutex_lock(&netdev
->mutex
);
2661 if (netdev_linux_netnsid_is_remote(netdev
)) {
2666 error
= tc_query_qdisc(netdev_
);
2668 if (netdev
->tc
->ops
->class_delete
) {
2669 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2671 ? netdev
->tc
->ops
->class_delete(netdev_
, queue
)
2679 ovs_mutex_unlock(&netdev
->mutex
);
2684 netdev_linux_get_queue_stats(const struct netdev
*netdev_
,
2685 unsigned int queue_id
,
2686 struct netdev_queue_stats
*stats
)
2688 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2691 ovs_mutex_lock(&netdev
->mutex
);
2692 if (netdev_linux_netnsid_is_remote(netdev
)) {
2697 error
= tc_query_qdisc(netdev_
);
2699 if (netdev
->tc
->ops
->class_get_stats
) {
2700 const struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2702 stats
->created
= queue
->created
;
2703 error
= netdev
->tc
->ops
->class_get_stats(netdev_
, queue
,
2714 ovs_mutex_unlock(&netdev
->mutex
);
2718 struct queue_dump_state
{
2719 struct nl_dump dump
;
2724 start_queue_dump(const struct netdev
*netdev
, struct queue_dump_state
*state
)
2726 struct ofpbuf request
;
2727 struct tcmsg
*tcmsg
;
2729 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_GETTCLASS
, 0, &request
);
2733 tcmsg
->tcm_parent
= 0;
2734 nl_dump_start(&state
->dump
, NETLINK_ROUTE
, &request
);
2735 ofpbuf_uninit(&request
);
2737 ofpbuf_init(&state
->buf
, NL_DUMP_BUFSIZE
);
2742 finish_queue_dump(struct queue_dump_state
*state
)
2744 ofpbuf_uninit(&state
->buf
);
2745 return nl_dump_done(&state
->dump
);
2748 struct netdev_linux_queue_state
{
2749 unsigned int *queues
;
2755 netdev_linux_queue_dump_start(const struct netdev
*netdev_
, void **statep
)
2757 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2760 ovs_mutex_lock(&netdev
->mutex
);
2761 if (netdev_linux_netnsid_is_remote(netdev
)) {
2766 error
= tc_query_qdisc(netdev_
);
2768 if (netdev
->tc
->ops
->class_get
) {
2769 struct netdev_linux_queue_state
*state
;
2770 struct tc_queue
*queue
;
2773 *statep
= state
= xmalloc(sizeof *state
);
2774 state
->n_queues
= hmap_count(&netdev
->tc
->queues
);
2775 state
->cur_queue
= 0;
2776 state
->queues
= xmalloc(state
->n_queues
* sizeof *state
->queues
);
2779 HMAP_FOR_EACH (queue
, hmap_node
, &netdev
->tc
->queues
) {
2780 state
->queues
[i
++] = queue
->queue_id
;
2788 ovs_mutex_unlock(&netdev
->mutex
);
2793 netdev_linux_queue_dump_next(const struct netdev
*netdev_
, void *state_
,
2794 unsigned int *queue_idp
, struct smap
*details
)
2796 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2797 struct netdev_linux_queue_state
*state
= state_
;
2800 ovs_mutex_lock(&netdev
->mutex
);
2801 if (netdev_linux_netnsid_is_remote(netdev
)) {
2806 while (state
->cur_queue
< state
->n_queues
) {
2807 unsigned int queue_id
= state
->queues
[state
->cur_queue
++];
2808 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2811 *queue_idp
= queue_id
;
2812 error
= netdev
->tc
->ops
->class_get(netdev_
, queue
, details
);
2818 ovs_mutex_unlock(&netdev
->mutex
);
2823 netdev_linux_queue_dump_done(const struct netdev
*netdev OVS_UNUSED
,
2826 struct netdev_linux_queue_state
*state
= state_
;
2828 free(state
->queues
);
2834 netdev_linux_dump_queue_stats(const struct netdev
*netdev_
,
2835 netdev_dump_queue_stats_cb
*cb
, void *aux
)
2837 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2840 ovs_mutex_lock(&netdev
->mutex
);
2841 if (netdev_linux_netnsid_is_remote(netdev
)) {
2846 error
= tc_query_qdisc(netdev_
);
2848 struct queue_dump_state state
;
2850 if (!netdev
->tc
->ops
->class_dump_stats
) {
2852 } else if (!start_queue_dump(netdev_
, &state
)) {
2858 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
2859 retval
= netdev
->tc
->ops
->class_dump_stats(netdev_
, &msg
,
2866 retval
= finish_queue_dump(&state
);
2874 ovs_mutex_unlock(&netdev
->mutex
);
2879 netdev_linux_set_in4(struct netdev
*netdev_
, struct in_addr address
,
2880 struct in_addr netmask
)
2882 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2885 ovs_mutex_lock(&netdev
->mutex
);
2886 if (netdev_linux_netnsid_is_remote(netdev
)) {
2891 error
= do_set_addr(netdev_
, SIOCSIFADDR
, "SIOCSIFADDR", address
);
2893 if (address
.s_addr
!= INADDR_ANY
) {
2894 error
= do_set_addr(netdev_
, SIOCSIFNETMASK
,
2895 "SIOCSIFNETMASK", netmask
);
2900 ovs_mutex_unlock(&netdev
->mutex
);
2904 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address.
2905 * Otherwise, sets '*in6' to 'in6addr_any' and returns the corresponding
2908 netdev_linux_get_addr_list(const struct netdev
*netdev_
,
2909 struct in6_addr
**addr
, struct in6_addr
**mask
, int *n_cnt
)
2911 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2914 ovs_mutex_lock(&netdev
->mutex
);
2915 if (netdev_linux_netnsid_is_remote(netdev
)) {
2920 error
= netdev_get_addrs(netdev_get_name(netdev_
), addr
, mask
, n_cnt
);
2923 ovs_mutex_unlock(&netdev
->mutex
);
/* Encodes IPv4 address 'addr' as an AF_INET 'struct sockaddr' in '*sa'.
 * The port and all trailing bytes of '*sa' are zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin;
    memset(&sin, 0, sizeof sin);    /* Also zeroes sin_port. */
    sin.sin_family = AF_INET;
    sin.sin_addr = addr;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
2941 do_set_addr(struct netdev
*netdev
,
2942 int ioctl_nr
, const char *ioctl_name
, struct in_addr addr
)
2946 make_in4_sockaddr(&ifr
.ifr_addr
, addr
);
2947 return af_inet_ifreq_ioctl(netdev_get_name(netdev
), &ifr
, ioctl_nr
,
2951 /* Adds 'router' as a default IP gateway. */
2953 netdev_linux_add_router(struct netdev
*netdev OVS_UNUSED
, struct in_addr router
)
2955 struct in_addr any
= { INADDR_ANY
};
2959 memset(&rt
, 0, sizeof rt
);
2960 make_in4_sockaddr(&rt
.rt_dst
, any
);
2961 make_in4_sockaddr(&rt
.rt_gateway
, router
);
2962 make_in4_sockaddr(&rt
.rt_genmask
, any
);
2963 rt
.rt_flags
= RTF_UP
| RTF_GATEWAY
;
2964 error
= af_inet_ioctl(SIOCADDRT
, &rt
);
2966 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error
));
2972 netdev_linux_get_next_hop(const struct in_addr
*host
, struct in_addr
*next_hop
,
2975 static const char fn
[] = "/proc/net/route";
2980 *netdev_name
= NULL
;
2981 stream
= fopen(fn
, "r");
2982 if (stream
== NULL
) {
2983 VLOG_WARN_RL(&rl
, "%s: open failed: %s", fn
, ovs_strerror(errno
));
2988 while (fgets(line
, sizeof line
, stream
)) {
2991 ovs_be32 dest
, gateway
, mask
;
2992 int refcnt
, metric
, mtu
;
2993 unsigned int flags
, use
, window
, irtt
;
2996 "%16s %"SCNx32
" %"SCNx32
" %04X %d %u %d %"SCNx32
2998 iface
, &dest
, &gateway
, &flags
, &refcnt
,
2999 &use
, &metric
, &mask
, &mtu
, &window
, &irtt
)) {
3000 VLOG_WARN_RL(&rl
, "%s: could not parse line %d: %s",
3004 if (!(flags
& RTF_UP
)) {
3005 /* Skip routes that aren't up. */
3009 /* The output of 'dest', 'mask', and 'gateway' were given in
3010 * network byte order, so we don't need any endian
3011 * conversions here. */
3012 if ((dest
& mask
) == (host
->s_addr
& mask
)) {
3014 /* The host is directly reachable. */
3015 next_hop
->s_addr
= 0;
3017 /* To reach the host, we must go through a gateway. */
3018 next_hop
->s_addr
= gateway
;
3020 *netdev_name
= xstrdup(iface
);
3032 netdev_linux_get_status(const struct netdev
*netdev_
, struct smap
*smap
)
3034 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3037 ovs_mutex_lock(&netdev
->mutex
);
3038 if (!(netdev
->cache_valid
& VALID_DRVINFO
)) {
3039 struct ethtool_cmd
*cmd
= (struct ethtool_cmd
*) &netdev
->drvinfo
;
3041 COVERAGE_INC(netdev_get_ethtool
);
3042 memset(&netdev
->drvinfo
, 0, sizeof netdev
->drvinfo
);
3043 error
= netdev_linux_do_ethtool(netdev
->up
.name
,
3046 "ETHTOOL_GDRVINFO");
3048 netdev
->cache_valid
|= VALID_DRVINFO
;
3053 smap_add(smap
, "driver_name", netdev
->drvinfo
.driver
);
3054 smap_add(smap
, "driver_version", netdev
->drvinfo
.version
);
3055 smap_add(smap
, "firmware_version", netdev
->drvinfo
.fw_version
);
3057 ovs_mutex_unlock(&netdev
->mutex
);
3063 netdev_internal_get_status(const struct netdev
*netdev OVS_UNUSED
,
3066 smap_add(smap
, "driver_name", "openvswitch");
3071 netdev_linux_get_block_id(struct netdev
*netdev_
)
3073 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3074 uint32_t block_id
= 0;
3076 ovs_mutex_lock(&netdev
->mutex
);
3077 /* Ensure the linux netdev has had its fields populated. */
3078 if (!(netdev
->cache_valid
& VALID_IFINDEX
)) {
3079 netdev_linux_update_via_netlink(netdev
);
3082 /* Only assigning block ids to linux netdevs that are LAG masters. */
3083 if (netdev
->is_lag_master
) {
3084 block_id
= netdev
->ifindex
;
3086 ovs_mutex_unlock(&netdev
->mutex
);
3091 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
3092 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
3093 * returns 0. Otherwise, it returns a positive errno value; in particular,
3094 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
3096 netdev_linux_arp_lookup(const struct netdev
*netdev
,
3097 ovs_be32 ip
, struct eth_addr
*mac
)
3100 struct sockaddr_in sin
;
3103 memset(&r
, 0, sizeof r
);
3104 memset(&sin
, 0, sizeof sin
);
3105 sin
.sin_family
= AF_INET
;
3106 sin
.sin_addr
.s_addr
= ip
;
3108 memcpy(&r
.arp_pa
, &sin
, sizeof sin
);
3109 r
.arp_ha
.sa_family
= ARPHRD_ETHER
;
3111 ovs_strzcpy(r
.arp_dev
, netdev_get_name(netdev
), sizeof r
.arp_dev
);
3112 COVERAGE_INC(netdev_arp_lookup
);
3113 retval
= af_inet_ioctl(SIOCGARP
, &r
);
3115 memcpy(mac
, r
.arp_ha
.sa_data
, ETH_ADDR_LEN
);
3116 } else if (retval
!= ENXIO
) {
3117 VLOG_WARN_RL(&rl
, "%s: could not look up ARP entry for "IP_FMT
": %s",
3118 netdev_get_name(netdev
), IP_ARGS(ip
),
3119 ovs_strerror(retval
));
3125 nd_to_iff_flags(enum netdev_flags nd
)
3127 unsigned int iff
= 0;
3128 if (nd
& NETDEV_UP
) {
3131 if (nd
& NETDEV_PROMISC
) {
3134 if (nd
& NETDEV_LOOPBACK
) {
3135 iff
|= IFF_LOOPBACK
;
3141 iff_to_nd_flags(unsigned int iff
)
3143 enum netdev_flags nd
= 0;
3147 if (iff
& IFF_PROMISC
) {
3148 nd
|= NETDEV_PROMISC
;
3150 if (iff
& IFF_LOOPBACK
) {
3151 nd
|= NETDEV_LOOPBACK
;
3157 update_flags(struct netdev_linux
*netdev
, enum netdev_flags off
,
3158 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
3159 OVS_REQUIRES(netdev
->mutex
)
3161 unsigned int old_flags
, new_flags
;
3164 old_flags
= netdev
->ifi_flags
;
3165 *old_flagsp
= iff_to_nd_flags(old_flags
);
3166 new_flags
= (old_flags
& ~nd_to_iff_flags(off
)) | nd_to_iff_flags(on
);
3167 if (new_flags
!= old_flags
) {
3168 error
= set_flags(netdev_get_name(&netdev
->up
), new_flags
);
3169 get_flags(&netdev
->up
, &netdev
->ifi_flags
);
3176 netdev_linux_update_flags(struct netdev
*netdev_
, enum netdev_flags off
,
3177 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
3179 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3182 ovs_mutex_lock(&netdev
->mutex
);
3184 /* Changing flags over netlink isn't supported yet. */
3185 if (netdev_linux_netnsid_is_remote(netdev
)) {
3189 error
= update_flags(netdev
, off
, on
, old_flagsp
);
3191 /* Try reading flags over netlink, or fall back to ioctl. */
3192 if (!netdev_linux_update_via_netlink(netdev
)) {
3193 *old_flagsp
= iff_to_nd_flags(netdev
->ifi_flags
);
3195 error
= update_flags(netdev
, off
, on
, old_flagsp
);
3200 ovs_mutex_unlock(&netdev
->mutex
);
3204 #define NETDEV_LINUX_CLASS_COMMON \
3205 .run = netdev_linux_run, \
3206 .wait = netdev_linux_wait, \
3207 .alloc = netdev_linux_alloc, \
3208 .dealloc = netdev_linux_dealloc, \
3209 .send_wait = netdev_linux_send_wait, \
3210 .set_etheraddr = netdev_linux_set_etheraddr, \
3211 .get_etheraddr = netdev_linux_get_etheraddr, \
3212 .get_mtu = netdev_linux_get_mtu, \
3213 .set_mtu = netdev_linux_set_mtu, \
3214 .get_ifindex = netdev_linux_get_ifindex, \
3215 .get_carrier = netdev_linux_get_carrier, \
3216 .get_carrier_resets = netdev_linux_get_carrier_resets, \
3217 .set_miimon_interval = netdev_linux_set_miimon_interval, \
3218 .set_advertisements = netdev_linux_set_advertisements, \
3219 .set_policing = netdev_linux_set_policing, \
3220 .get_qos_types = netdev_linux_get_qos_types, \
3221 .get_qos_capabilities = netdev_linux_get_qos_capabilities, \
3222 .get_qos = netdev_linux_get_qos, \
3223 .set_qos = netdev_linux_set_qos, \
3224 .get_queue = netdev_linux_get_queue, \
3225 .set_queue = netdev_linux_set_queue, \
3226 .delete_queue = netdev_linux_delete_queue, \
3227 .get_queue_stats = netdev_linux_get_queue_stats, \
3228 .queue_dump_start = netdev_linux_queue_dump_start, \
3229 .queue_dump_next = netdev_linux_queue_dump_next, \
3230 .queue_dump_done = netdev_linux_queue_dump_done, \
3231 .dump_queue_stats = netdev_linux_dump_queue_stats, \
3232 .set_in4 = netdev_linux_set_in4, \
3233 .get_addr_list = netdev_linux_get_addr_list, \
3234 .add_router = netdev_linux_add_router, \
3235 .get_next_hop = netdev_linux_get_next_hop, \
3236 .arp_lookup = netdev_linux_arp_lookup, \
3237 .update_flags = netdev_linux_update_flags, \
3238 .rxq_alloc = netdev_linux_rxq_alloc, \
3239 .rxq_dealloc = netdev_linux_rxq_dealloc, \
3240 .rxq_wait = netdev_linux_rxq_wait, \
3241 .rxq_drain = netdev_linux_rxq_drain
3243 const struct netdev_class netdev_linux_class
= {
3244 NETDEV_LINUX_CLASS_COMMON
,
3247 .construct
= netdev_linux_construct
,
3248 .destruct
= netdev_linux_destruct
,
3249 .get_stats
= netdev_linux_get_stats
,
3250 .get_features
= netdev_linux_get_features
,
3251 .get_status
= netdev_linux_get_status
,
3252 .get_block_id
= netdev_linux_get_block_id
,
3253 .send
= netdev_linux_send
,
3254 .rxq_construct
= netdev_linux_rxq_construct
,
3255 .rxq_destruct
= netdev_linux_rxq_destruct
,
3256 .rxq_recv
= netdev_linux_rxq_recv
,
3259 const struct netdev_class netdev_tap_class
= {
3260 NETDEV_LINUX_CLASS_COMMON
,
3263 .construct
= netdev_linux_construct_tap
,
3264 .destruct
= netdev_linux_destruct
,
3265 .get_stats
= netdev_tap_get_stats
,
3266 .get_features
= netdev_linux_get_features
,
3267 .get_status
= netdev_linux_get_status
,
3268 .send
= netdev_linux_send
,
3269 .rxq_construct
= netdev_linux_rxq_construct
,
3270 .rxq_destruct
= netdev_linux_rxq_destruct
,
3271 .rxq_recv
= netdev_linux_rxq_recv
,
3274 const struct netdev_class netdev_internal_class
= {
3275 NETDEV_LINUX_CLASS_COMMON
,
3278 .construct
= netdev_linux_construct
,
3279 .destruct
= netdev_linux_destruct
,
3280 .get_stats
= netdev_internal_get_stats
,
3281 .get_status
= netdev_internal_get_status
,
3282 .send
= netdev_linux_send
,
3283 .rxq_construct
= netdev_linux_rxq_construct
,
3284 .rxq_destruct
= netdev_linux_rxq_destruct
,
3285 .rxq_recv
= netdev_linux_rxq_recv
,
3289 const struct netdev_class netdev_afxdp_class
= {
3290 NETDEV_LINUX_CLASS_COMMON
,
3293 .construct
= netdev_linux_construct
,
3294 .destruct
= netdev_afxdp_destruct
,
3295 .get_stats
= netdev_afxdp_get_stats
,
3296 .get_status
= netdev_linux_get_status
,
3297 .set_config
= netdev_afxdp_set_config
,
3298 .get_config
= netdev_afxdp_get_config
,
3299 .reconfigure
= netdev_afxdp_reconfigure
,
3300 .get_numa_id
= netdev_afxdp_get_numa_id
,
3301 .send
= netdev_afxdp_batch_send
,
3302 .rxq_construct
= netdev_afxdp_rxq_construct
,
3303 .rxq_destruct
= netdev_afxdp_rxq_destruct
,
3304 .rxq_recv
= netdev_afxdp_rxq_recv
,
3309 #define CODEL_N_QUEUES 0x0000
3311 /* In sufficiently new kernel headers these are defined as enums in
3312 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3313 * kernels. (This overrides any enum definition in the header file but that's
3315 #define TCA_CODEL_TARGET 1
3316 #define TCA_CODEL_LIMIT 2
3317 #define TCA_CODEL_INTERVAL 3
3326 static struct codel
*
3327 codel_get__(const struct netdev
*netdev_
)
3329 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3330 return CONTAINER_OF(netdev
->tc
, struct codel
, tc
);
3334 codel_install__(struct netdev
*netdev_
, uint32_t target
, uint32_t limit
,
3337 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3338 struct codel
*codel
;
3340 codel
= xmalloc(sizeof *codel
);
3341 tc_init(&codel
->tc
, &tc_ops_codel
);
3342 codel
->target
= target
;
3343 codel
->limit
= limit
;
3344 codel
->interval
= interval
;
3346 netdev
->tc
= &codel
->tc
;
3350 codel_setup_qdisc__(struct netdev
*netdev
, uint32_t target
, uint32_t limit
,
3354 struct ofpbuf request
;
3355 struct tcmsg
*tcmsg
;
3356 uint32_t otarget
, olimit
, ointerval
;
3359 tc_del_qdisc(netdev
);
3361 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
3362 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3366 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3367 tcmsg
->tcm_parent
= TC_H_ROOT
;
3369 otarget
= target
? target
: 5000;
3370 olimit
= limit
? limit
: 10240;
3371 ointerval
= interval
? interval
: 100000;
3373 nl_msg_put_string(&request
, TCA_KIND
, "codel");
3374 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
3375 nl_msg_put_u32(&request
, TCA_CODEL_TARGET
, otarget
);
3376 nl_msg_put_u32(&request
, TCA_CODEL_LIMIT
, olimit
);
3377 nl_msg_put_u32(&request
, TCA_CODEL_INTERVAL
, ointerval
);
3378 nl_msg_end_nested(&request
, opt_offset
);
3380 error
= tc_transact(&request
, NULL
);
3382 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
3383 "target %u, limit %u, interval %u error %d(%s)",
3384 netdev_get_name(netdev
),
3385 otarget
, olimit
, ointerval
,
3386 error
, ovs_strerror(error
));
3392 codel_parse_qdisc_details__(struct netdev
*netdev OVS_UNUSED
,
3393 const struct smap
*details
, struct codel
*codel
)
3395 codel
->target
= smap_get_ullong(details
, "target", 0);
3396 codel
->limit
= smap_get_ullong(details
, "limit", 0);
3397 codel
->interval
= smap_get_ullong(details
, "interval", 0);
3399 if (!codel
->target
) {
3400 codel
->target
= 5000;
3402 if (!codel
->limit
) {
3403 codel
->limit
= 10240;
3405 if (!codel
->interval
) {
3406 codel
->interval
= 100000;
3411 codel_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3416 codel_parse_qdisc_details__(netdev
, details
, &codel
);
3417 error
= codel_setup_qdisc__(netdev
, codel
.target
, codel
.limit
,
3420 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
3426 codel_parse_tca_options__(struct nlattr
*nl_options
, struct codel
*codel
)
3428 static const struct nl_policy tca_codel_policy
[] = {
3429 [TCA_CODEL_TARGET
] = { .type
= NL_A_U32
},
3430 [TCA_CODEL_LIMIT
] = { .type
= NL_A_U32
},
3431 [TCA_CODEL_INTERVAL
] = { .type
= NL_A_U32
}
3434 struct nlattr
*attrs
[ARRAY_SIZE(tca_codel_policy
)];
3436 if (!nl_parse_nested(nl_options
, tca_codel_policy
,
3437 attrs
, ARRAY_SIZE(tca_codel_policy
))) {
3438 VLOG_WARN_RL(&rl
, "failed to parse CoDel class options");
3442 codel
->target
= nl_attr_get_u32(attrs
[TCA_CODEL_TARGET
]);
3443 codel
->limit
= nl_attr_get_u32(attrs
[TCA_CODEL_LIMIT
]);
3444 codel
->interval
= nl_attr_get_u32(attrs
[TCA_CODEL_INTERVAL
]);
3449 codel_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
3451 struct nlattr
*nlattr
;
3456 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
3461 error
= codel_parse_tca_options__(nlattr
, &codel
);
3466 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
3472 codel_tc_destroy(struct tc
*tc
)
3474 struct codel
*codel
= CONTAINER_OF(tc
, struct codel
, tc
);
3480 codel_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
3482 const struct codel
*codel
= codel_get__(netdev
);
3483 smap_add_format(details
, "target", "%u", codel
->target
);
3484 smap_add_format(details
, "limit", "%u", codel
->limit
);
3485 smap_add_format(details
, "interval", "%u", codel
->interval
);
3490 codel_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
3494 codel_parse_qdisc_details__(netdev
, details
, &codel
);
3495 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
3496 codel_get__(netdev
)->target
= codel
.target
;
3497 codel_get__(netdev
)->limit
= codel
.limit
;
3498 codel_get__(netdev
)->interval
= codel
.interval
;
3502 static const struct tc_ops tc_ops_codel
= {
3503 .linux_name
= "codel",
3504 .ovs_name
= "linux-codel",
3505 .n_queues
= CODEL_N_QUEUES
,
3506 .tc_install
= codel_tc_install
,
3507 .tc_load
= codel_tc_load
,
3508 .tc_destroy
= codel_tc_destroy
,
3509 .qdisc_get
= codel_qdisc_get
,
3510 .qdisc_set
= codel_qdisc_set
,
3513 /* FQ-CoDel traffic control class. */
3515 #define FQCODEL_N_QUEUES 0x0000
3517 /* In sufficiently new kernel headers these are defined as enums in
3518 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3519 * kernels. (This overrides any enum definition in the header file but that's
3521 #define TCA_FQ_CODEL_TARGET 1
3522 #define TCA_FQ_CODEL_LIMIT 2
3523 #define TCA_FQ_CODEL_INTERVAL 3
3524 #define TCA_FQ_CODEL_ECN 4
3525 #define TCA_FQ_CODEL_FLOWS 5
3526 #define TCA_FQ_CODEL_QUANTUM 6
3537 static struct fqcodel
*
3538 fqcodel_get__(const struct netdev
*netdev_
)
3540 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3541 return CONTAINER_OF(netdev
->tc
, struct fqcodel
, tc
);
3545 fqcodel_install__(struct netdev
*netdev_
, uint32_t target
, uint32_t limit
,
3546 uint32_t interval
, uint32_t flows
, uint32_t quantum
)
3548 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3549 struct fqcodel
*fqcodel
;
3551 fqcodel
= xmalloc(sizeof *fqcodel
);
3552 tc_init(&fqcodel
->tc
, &tc_ops_fqcodel
);
3553 fqcodel
->target
= target
;
3554 fqcodel
->limit
= limit
;
3555 fqcodel
->interval
= interval
;
3556 fqcodel
->flows
= flows
;
3557 fqcodel
->quantum
= quantum
;
3559 netdev
->tc
= &fqcodel
->tc
;
3563 fqcodel_setup_qdisc__(struct netdev
*netdev
, uint32_t target
, uint32_t limit
,
3564 uint32_t interval
, uint32_t flows
, uint32_t quantum
)
3567 struct ofpbuf request
;
3568 struct tcmsg
*tcmsg
;
3569 uint32_t otarget
, olimit
, ointerval
, oflows
, oquantum
;
3572 tc_del_qdisc(netdev
);
3574 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
3575 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3579 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3580 tcmsg
->tcm_parent
= TC_H_ROOT
;
3582 otarget
= target
? target
: 5000;
3583 olimit
= limit
? limit
: 10240;
3584 ointerval
= interval
? interval
: 100000;
3585 oflows
= flows
? flows
: 1024;
3586 oquantum
= quantum
? quantum
: 1514; /* fq_codel default quantum is 1514
3589 nl_msg_put_string(&request
, TCA_KIND
, "fq_codel");
3590 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
3591 nl_msg_put_u32(&request
, TCA_FQ_CODEL_TARGET
, otarget
);
3592 nl_msg_put_u32(&request
, TCA_FQ_CODEL_LIMIT
, olimit
);
3593 nl_msg_put_u32(&request
, TCA_FQ_CODEL_INTERVAL
, ointerval
);
3594 nl_msg_put_u32(&request
, TCA_FQ_CODEL_FLOWS
, oflows
);
3595 nl_msg_put_u32(&request
, TCA_FQ_CODEL_QUANTUM
, oquantum
);
3596 nl_msg_end_nested(&request
, opt_offset
);
3598 error
= tc_transact(&request
, NULL
);
3600 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
3601 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3602 netdev_get_name(netdev
),
3603 otarget
, olimit
, ointerval
, oflows
, oquantum
,
3604 error
, ovs_strerror(error
));
3610 fqcodel_parse_qdisc_details__(struct netdev
*netdev OVS_UNUSED
,
3611 const struct smap
*details
, struct fqcodel
*fqcodel
)
3613 fqcodel
->target
= smap_get_ullong(details
, "target", 0);
3614 fqcodel
->limit
= smap_get_ullong(details
, "limit", 0);
3615 fqcodel
->interval
= smap_get_ullong(details
, "interval", 0);
3616 fqcodel
->flows
= smap_get_ullong(details
, "flows", 0);
3617 fqcodel
->quantum
= smap_get_ullong(details
, "quantum", 0);
3619 if (!fqcodel
->target
) {
3620 fqcodel
->target
= 5000;
3622 if (!fqcodel
->limit
) {
3623 fqcodel
->limit
= 10240;
3625 if (!fqcodel
->interval
) {
3626 fqcodel
->interval
= 1000000;
3628 if (!fqcodel
->flows
) {
3629 fqcodel
->flows
= 1024;
3631 if (!fqcodel
->quantum
) {
3632 fqcodel
->quantum
= 1514;
3637 fqcodel_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3640 struct fqcodel fqcodel
;
3642 fqcodel_parse_qdisc_details__(netdev
, details
, &fqcodel
);
3643 error
= fqcodel_setup_qdisc__(netdev
, fqcodel
.target
, fqcodel
.limit
,
3644 fqcodel
.interval
, fqcodel
.flows
,
3647 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
,
3648 fqcodel
.interval
, fqcodel
.flows
, fqcodel
.quantum
);
3654 fqcodel_parse_tca_options__(struct nlattr
*nl_options
, struct fqcodel
*fqcodel
)
3656 static const struct nl_policy tca_fqcodel_policy
[] = {
3657 [TCA_FQ_CODEL_TARGET
] = { .type
= NL_A_U32
},
3658 [TCA_FQ_CODEL_LIMIT
] = { .type
= NL_A_U32
},
3659 [TCA_FQ_CODEL_INTERVAL
] = { .type
= NL_A_U32
},
3660 [TCA_FQ_CODEL_FLOWS
] = { .type
= NL_A_U32
},
3661 [TCA_FQ_CODEL_QUANTUM
] = { .type
= NL_A_U32
}
3664 struct nlattr
*attrs
[ARRAY_SIZE(tca_fqcodel_policy
)];
3666 if (!nl_parse_nested(nl_options
, tca_fqcodel_policy
,
3667 attrs
, ARRAY_SIZE(tca_fqcodel_policy
))) {
3668 VLOG_WARN_RL(&rl
, "failed to parse FQ_CoDel class options");
3672 fqcodel
->target
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_TARGET
]);
3673 fqcodel
->limit
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_LIMIT
]);
3674 fqcodel
->interval
=nl_attr_get_u32(attrs
[TCA_FQ_CODEL_INTERVAL
]);
3675 fqcodel
->flows
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_FLOWS
]);
3676 fqcodel
->quantum
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_QUANTUM
]);
3681 fqcodel_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
3683 struct nlattr
*nlattr
;
3686 struct fqcodel fqcodel
;
3688 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
3693 error
= fqcodel_parse_tca_options__(nlattr
, &fqcodel
);
3698 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
, fqcodel
.interval
,
3699 fqcodel
.flows
, fqcodel
.quantum
);
3704 fqcodel_tc_destroy(struct tc
*tc
)
3706 struct fqcodel
*fqcodel
= CONTAINER_OF(tc
, struct fqcodel
, tc
);
3712 fqcodel_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
3714 const struct fqcodel
*fqcodel
= fqcodel_get__(netdev
);
3715 smap_add_format(details
, "target", "%u", fqcodel
->target
);
3716 smap_add_format(details
, "limit", "%u", fqcodel
->limit
);
3717 smap_add_format(details
, "interval", "%u", fqcodel
->interval
);
3718 smap_add_format(details
, "flows", "%u", fqcodel
->flows
);
3719 smap_add_format(details
, "quantum", "%u", fqcodel
->quantum
);
3724 fqcodel_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
3726 struct fqcodel fqcodel
;
3728 fqcodel_parse_qdisc_details__(netdev
, details
, &fqcodel
);
3729 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
, fqcodel
.interval
,
3730 fqcodel
.flows
, fqcodel
.quantum
);
3731 fqcodel_get__(netdev
)->target
= fqcodel
.target
;
3732 fqcodel_get__(netdev
)->limit
= fqcodel
.limit
;
3733 fqcodel_get__(netdev
)->interval
= fqcodel
.interval
;
3734 fqcodel_get__(netdev
)->flows
= fqcodel
.flows
;
3735 fqcodel_get__(netdev
)->quantum
= fqcodel
.quantum
;
3739 static const struct tc_ops tc_ops_fqcodel
= {
3740 .linux_name
= "fq_codel",
3741 .ovs_name
= "linux-fq_codel",
3742 .n_queues
= FQCODEL_N_QUEUES
,
3743 .tc_install
= fqcodel_tc_install
,
3744 .tc_load
= fqcodel_tc_load
,
3745 .tc_destroy
= fqcodel_tc_destroy
,
3746 .qdisc_get
= fqcodel_qdisc_get
,
3747 .qdisc_set
= fqcodel_qdisc_set
,
3750 /* SFQ traffic control class. */
3752 #define SFQ_N_QUEUES 0x0000
3761 sfq_get__(const struct netdev
*netdev_
)
3763 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3764 return CONTAINER_OF(netdev
->tc
, struct sfq
, tc
);
3768 sfq_install__(struct netdev
*netdev_
, uint32_t quantum
, uint32_t perturb
)
3770 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3773 sfq
= xmalloc(sizeof *sfq
);
3774 tc_init(&sfq
->tc
, &tc_ops_sfq
);
3775 sfq
->perturb
= perturb
;
3776 sfq
->quantum
= quantum
;
3778 netdev
->tc
= &sfq
->tc
;
3782 sfq_setup_qdisc__(struct netdev
*netdev
, uint32_t quantum
, uint32_t perturb
)
3784 struct tc_sfq_qopt opt
;
3785 struct ofpbuf request
;
3786 struct tcmsg
*tcmsg
;
3788 int mtu_error
, error
;
3789 mtu_error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
3791 tc_del_qdisc(netdev
);
3793 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
3794 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3798 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3799 tcmsg
->tcm_parent
= TC_H_ROOT
;
3801 memset(&opt
, 0, sizeof opt
);
3804 opt
.quantum
= mtu
; /* if we cannot find mtu, use default */
3807 opt
.quantum
= quantum
;
3811 opt
.perturb_period
= 10;
3813 opt
.perturb_period
= perturb
;
3816 nl_msg_put_string(&request
, TCA_KIND
, "sfq");
3817 nl_msg_put_unspec(&request
, TCA_OPTIONS
, &opt
, sizeof opt
);
3819 error
= tc_transact(&request
, NULL
);
3821 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
3822 "quantum %u, perturb %u error %d(%s)",
3823 netdev_get_name(netdev
),
3824 opt
.quantum
, opt
.perturb_period
,
3825 error
, ovs_strerror(error
));
3831 sfq_parse_qdisc_details__(struct netdev
*netdev
,
3832 const struct smap
*details
, struct sfq
*sfq
)
3834 sfq
->perturb
= smap_get_ullong(details
, "perturb", 0);
3835 sfq
->quantum
= smap_get_ullong(details
, "quantum", 0);
3837 if (!sfq
->perturb
) {
3841 if (!sfq
->quantum
) {
3843 if (!netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
)) {
3846 VLOG_WARN_RL(&rl
, "when using SFQ, you must specify quantum on a "
3847 "device without mtu");
3853 sfq_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3858 sfq_parse_qdisc_details__(netdev
, details
, &sfq
);
3859 error
= sfq_setup_qdisc__(netdev
, sfq
.quantum
, sfq
.perturb
);
3861 sfq_install__(netdev
, sfq
.quantum
, sfq
.perturb
);
3867 sfq_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
3869 const struct tc_sfq_qopt
*sfq
;
3870 struct nlattr
*nlattr
;
3874 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
3876 sfq
= nl_attr_get(nlattr
);
3877 sfq_install__(netdev
, sfq
->quantum
, sfq
->perturb_period
);
3885 sfq_tc_destroy(struct tc
*tc
)
3887 struct sfq
*sfq
= CONTAINER_OF(tc
, struct sfq
, tc
);
3893 sfq_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
3895 const struct sfq
*sfq
= sfq_get__(netdev
);
3896 smap_add_format(details
, "quantum", "%u", sfq
->quantum
);
3897 smap_add_format(details
, "perturb", "%u", sfq
->perturb
);
3902 sfq_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
3906 sfq_parse_qdisc_details__(netdev
, details
, &sfq
);
3907 sfq_install__(netdev
, sfq
.quantum
, sfq
.perturb
);
3908 sfq_get__(netdev
)->quantum
= sfq
.quantum
;
3909 sfq_get__(netdev
)->perturb
= sfq
.perturb
;
3913 static const struct tc_ops tc_ops_sfq
= {
3914 .linux_name
= "sfq",
3915 .ovs_name
= "linux-sfq",
3916 .n_queues
= SFQ_N_QUEUES
,
3917 .tc_install
= sfq_tc_install
,
3918 .tc_load
= sfq_tc_load
,
3919 .tc_destroy
= sfq_tc_destroy
,
3920 .qdisc_get
= sfq_qdisc_get
,
3921 .qdisc_set
= sfq_qdisc_set
,
3924 /* netem traffic control class. */
3933 static struct netem
*
3934 netem_get__(const struct netdev
*netdev_
)
3936 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3937 return CONTAINER_OF(netdev
->tc
, struct netem
, tc
);
3941 netem_install__(struct netdev
*netdev_
, uint32_t latency
,
3942 uint32_t limit
, uint32_t loss
)
3944 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3945 struct netem
*netem
;
3947 netem
= xmalloc(sizeof *netem
);
3948 tc_init(&netem
->tc
, &tc_ops_netem
);
3949 netem
->latency
= latency
;
3950 netem
->limit
= limit
;
3953 netdev
->tc
= &netem
->tc
;
3957 netem_setup_qdisc__(struct netdev
*netdev
, uint32_t latency
,
3958 uint32_t limit
, uint32_t loss
)
3960 struct tc_netem_qopt opt
;
3961 struct ofpbuf request
;
3962 struct tcmsg
*tcmsg
;
3965 tc_del_qdisc(netdev
);
3967 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
3968 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3972 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3973 tcmsg
->tcm_parent
= TC_H_ROOT
;
3975 memset(&opt
, 0, sizeof opt
);
3986 "loss should be a percentage value between 0 to 100, "
3987 "loss was %u", loss
);
3990 opt
.loss
= floor(UINT32_MAX
* (loss
/ 100.0));
3993 opt
.latency
= tc_time_to_ticks(latency
);
3995 nl_msg_put_string(&request
, TCA_KIND
, "netem");
3996 nl_msg_put_unspec(&request
, TCA_OPTIONS
, &opt
, sizeof opt
);
3998 error
= tc_transact(&request
, NULL
);
4000 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
4001 "latency %u, limit %u, loss %u error %d(%s)",
4002 netdev_get_name(netdev
),
4003 opt
.latency
, opt
.limit
, opt
.loss
,
4004 error
, ovs_strerror(error
));
4010 netem_parse_qdisc_details__(struct netdev
*netdev OVS_UNUSED
,
4011 const struct smap
*details
, struct netem
*netem
)
4013 netem
->latency
= smap_get_ullong(details
, "latency", 0);
4014 netem
->limit
= smap_get_ullong(details
, "limit", 0);
4015 netem
->loss
= smap_get_ullong(details
, "loss", 0);
4017 if (!netem
->limit
) {
4018 netem
->limit
= 1000;
4023 netem_tc_install(struct netdev
*netdev
, const struct smap
*details
)
4028 netem_parse_qdisc_details__(netdev
, details
, &netem
);
4029 error
= netem_setup_qdisc__(netdev
, netem
.latency
,
4030 netem
.limit
, netem
.loss
);
4032 netem_install__(netdev
, netem
.latency
, netem
.limit
, netem
.loss
);
4038 netem_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
4040 const struct tc_netem_qopt
*netem
;
4041 struct nlattr
*nlattr
;
4045 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
4047 netem
= nl_attr_get(nlattr
);
4048 netem_install__(netdev
, netem
->latency
, netem
->limit
, netem
->loss
);
4056 netem_tc_destroy(struct tc
*tc
)
4058 struct netem
*netem
= CONTAINER_OF(tc
, struct netem
, tc
);
4064 netem_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
4066 const struct netem
*netem
= netem_get__(netdev
);
4067 smap_add_format(details
, "latency", "%u", netem
->latency
);
4068 smap_add_format(details
, "limit", "%u", netem
->limit
);
4069 smap_add_format(details
, "loss", "%u", netem
->loss
);
4074 netem_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
4078 netem_parse_qdisc_details__(netdev
, details
, &netem
);
4079 netem_install__(netdev
, netem
.latency
, netem
.limit
, netem
.loss
);
4080 netem_get__(netdev
)->latency
= netem
.latency
;
4081 netem_get__(netdev
)->limit
= netem
.limit
;
4082 netem_get__(netdev
)->loss
= netem
.loss
;
4086 static const struct tc_ops tc_ops_netem
= {
4087 .linux_name
= "netem",
4088 .ovs_name
= "linux-netem",
4090 .tc_install
= netem_tc_install
,
4091 .tc_load
= netem_tc_load
,
4092 .tc_destroy
= netem_tc_destroy
,
4093 .qdisc_get
= netem_qdisc_get
,
4094 .qdisc_set
= netem_qdisc_set
,
4097 /* HTB traffic control class. */
4099 #define HTB_N_QUEUES 0xf000
4100 #define HTB_RATE2QUANTUM 10
4104 unsigned int max_rate
; /* In bytes/s. */
4108 struct tc_queue tc_queue
;
4109 unsigned int min_rate
; /* In bytes/s. */
4110 unsigned int max_rate
; /* In bytes/s. */
4111 unsigned int burst
; /* In bytes. */
4112 unsigned int priority
; /* Lower values are higher priorities. */
4116 htb_get__(const struct netdev
*netdev_
)
4118 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4119 return CONTAINER_OF(netdev
->tc
, struct htb
, tc
);
4123 htb_install__(struct netdev
*netdev_
, uint64_t max_rate
)
4125 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4128 htb
= xmalloc(sizeof *htb
);
4129 tc_init(&htb
->tc
, &tc_ops_htb
);
4130 htb
->max_rate
= max_rate
;
4132 netdev
->tc
= &htb
->tc
;
4135 /* Create an HTB qdisc.
4137 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
4139 htb_setup_qdisc__(struct netdev
*netdev
)
4142 struct tc_htb_glob opt
;
4143 struct ofpbuf request
;
4144 struct tcmsg
*tcmsg
;
4146 tc_del_qdisc(netdev
);
4148 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
4149 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
4153 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
4154 tcmsg
->tcm_parent
= TC_H_ROOT
;
4156 nl_msg_put_string(&request
, TCA_KIND
, "htb");
4158 memset(&opt
, 0, sizeof opt
);
4159 opt
.rate2quantum
= HTB_RATE2QUANTUM
;
4163 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
4164 nl_msg_put_unspec(&request
, TCA_HTB_INIT
, &opt
, sizeof opt
);
4165 nl_msg_end_nested(&request
, opt_offset
);
4167 return tc_transact(&request
, NULL
);
4170 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
4171 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
4173 htb_setup_class__(struct netdev
*netdev
, unsigned int handle
,
4174 unsigned int parent
, struct htb_class
*class)
4177 struct tc_htb_opt opt
;
4178 struct ofpbuf request
;
4179 struct tcmsg
*tcmsg
;
4183 error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
4185 VLOG_WARN_RL(&rl
, "cannot set up HTB on device %s that lacks MTU",
4186 netdev_get_name(netdev
));
4190 memset(&opt
, 0, sizeof opt
);
4191 tc_fill_rate(&opt
.rate
, class->min_rate
, mtu
);
4192 tc_fill_rate(&opt
.ceil
, class->max_rate
, mtu
);
4193 /* Makes sure the quantum is at least MTU. Setting quantum will
4194 * make htb ignore the r2q for this class. */
4195 if ((class->min_rate
/ HTB_RATE2QUANTUM
) < mtu
) {
4198 opt
.buffer
= tc_calc_buffer(opt
.rate
.rate
, mtu
, class->burst
);
4199 opt
.cbuffer
= tc_calc_buffer(opt
.ceil
.rate
, mtu
, class->burst
);
4200 opt
.prio
= class->priority
;
4202 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWTCLASS
, NLM_F_CREATE
,
4207 tcmsg
->tcm_handle
= handle
;
4208 tcmsg
->tcm_parent
= parent
;
4210 nl_msg_put_string(&request
, TCA_KIND
, "htb");
4211 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
4212 nl_msg_put_unspec(&request
, TCA_HTB_PARMS
, &opt
, sizeof opt
);
4213 tc_put_rtab(&request
, TCA_HTB_RTAB
, &opt
.rate
);
4214 tc_put_rtab(&request
, TCA_HTB_CTAB
, &opt
.ceil
);
4215 nl_msg_end_nested(&request
, opt_offset
);
4217 error
= tc_transact(&request
, NULL
);
4219 VLOG_WARN_RL(&rl
, "failed to replace %s class %u:%u, parent %u:%u, "
4220 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
4221 netdev_get_name(netdev
),
4222 tc_get_major(handle
), tc_get_minor(handle
),
4223 tc_get_major(parent
), tc_get_minor(parent
),
4224 class->min_rate
, class->max_rate
,
4225 class->burst
, class->priority
, ovs_strerror(error
));
4230 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
4231 * description of them into 'details'. The description complies with the
4232 * specification given in the vswitch database documentation for linux-htb
4235 htb_parse_tca_options__(struct nlattr
*nl_options
, struct htb_class
*class)
4237 static const struct nl_policy tca_htb_policy
[] = {
4238 [TCA_HTB_PARMS
] = { .type
= NL_A_UNSPEC
, .optional
= false,
4239 .min_len
= sizeof(struct tc_htb_opt
) },
4242 struct nlattr
*attrs
[ARRAY_SIZE(tca_htb_policy
)];
4243 const struct tc_htb_opt
*htb
;
4245 if (!nl_parse_nested(nl_options
, tca_htb_policy
,
4246 attrs
, ARRAY_SIZE(tca_htb_policy
))) {
4247 VLOG_WARN_RL(&rl
, "failed to parse HTB class options");
4251 htb
= nl_attr_get(attrs
[TCA_HTB_PARMS
]);
4252 class->min_rate
= htb
->rate
.rate
;
4253 class->max_rate
= htb
->ceil
.rate
;
4254 class->burst
= tc_ticks_to_bytes(htb
->rate
.rate
, htb
->buffer
);
4255 class->priority
= htb
->prio
;
4260 htb_parse_tcmsg__(struct ofpbuf
*tcmsg
, unsigned int *queue_id
,
4261 struct htb_class
*options
,
4262 struct netdev_queue_stats
*stats
)
4264 struct nlattr
*nl_options
;
4265 unsigned int handle
;
4268 error
= tc_parse_class(tcmsg
, &handle
, &nl_options
, stats
);
4269 if (!error
&& queue_id
) {
4270 unsigned int major
= tc_get_major(handle
);
4271 unsigned int minor
= tc_get_minor(handle
);
4272 if (major
== 1 && minor
> 0 && minor
<= HTB_N_QUEUES
) {
4273 *queue_id
= minor
- 1;
4278 if (!error
&& options
) {
4279 error
= htb_parse_tca_options__(nl_options
, options
);
4285 htb_parse_qdisc_details__(struct netdev
*netdev_
,
4286 const struct smap
*details
, struct htb_class
*hc
)
4288 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4290 hc
->max_rate
= smap_get_ullong(details
, "max-rate", 0) / 8;
4291 if (!hc
->max_rate
) {
4292 enum netdev_features current
;
4294 netdev_linux_read_features(netdev
);
4295 current
= !netdev
->get_features_error
? netdev
->current
: 0;
4296 hc
->max_rate
= netdev_features_to_bps(current
, 100 * 1000 * 1000) / 8;
4298 hc
->min_rate
= hc
->max_rate
;
4304 htb_parse_class_details__(struct netdev
*netdev
,
4305 const struct smap
*details
, struct htb_class
*hc
)
4307 const struct htb
*htb
= htb_get__(netdev
);
4309 unsigned long long int max_rate_bit
;
4311 error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
4313 VLOG_WARN_RL(&rl
, "cannot parse HTB class on device %s that lacks MTU",
4314 netdev_get_name(netdev
));
4318 /* HTB requires at least an mtu sized min-rate to send any traffic even
4319 * on uncongested links. */
4320 hc
->min_rate
= smap_get_ullong(details
, "min-rate", 0) / 8;
4321 hc
->min_rate
= MAX(hc
->min_rate
, mtu
);
4322 hc
->min_rate
= MIN(hc
->min_rate
, htb
->max_rate
);
4325 max_rate_bit
= smap_get_ullong(details
, "max-rate", 0);
4326 hc
->max_rate
= max_rate_bit
? max_rate_bit
/ 8 : htb
->max_rate
;
4327 hc
->max_rate
= MAX(hc
->max_rate
, hc
->min_rate
);
4328 hc
->max_rate
= MIN(hc
->max_rate
, htb
->max_rate
);
4332 * According to hints in the documentation that I've read, it is important
4333 * that 'burst' be at least as big as the largest frame that might be
4334 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
4335 * but having it a bit too small is a problem. Since netdev_get_mtu()
4336 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
4337 * the MTU. We actually add 64, instead of 14, as a guard against
4338 * additional headers get tacked on somewhere that we're not aware of. */
4339 hc
->burst
= smap_get_ullong(details
, "burst", 0) / 8;
4340 hc
->burst
= MAX(hc
->burst
, mtu
+ 64);
4343 hc
->priority
= smap_get_ullong(details
, "priority", 0);
4349 htb_query_class__(const struct netdev
*netdev
, unsigned int handle
,
4350 unsigned int parent
, struct htb_class
*options
,
4351 struct netdev_queue_stats
*stats
)
4353 struct ofpbuf
*reply
;
4356 error
= tc_query_class(netdev
, handle
, parent
, &reply
);
4358 error
= htb_parse_tcmsg__(reply
, NULL
, options
, stats
);
4359 ofpbuf_delete(reply
);
4365 htb_tc_install(struct netdev
*netdev
, const struct smap
*details
)
4369 error
= htb_setup_qdisc__(netdev
);
4371 struct htb_class hc
;
4373 htb_parse_qdisc_details__(netdev
, details
, &hc
);
4374 error
= htb_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
4375 tc_make_handle(1, 0), &hc
);
4377 htb_install__(netdev
, hc
.max_rate
);
4383 static struct htb_class
*
4384 htb_class_cast__(const struct tc_queue
*queue
)
4386 return CONTAINER_OF(queue
, struct htb_class
, tc_queue
);
4390 htb_update_queue__(struct netdev
*netdev
, unsigned int queue_id
,
4391 const struct htb_class
*hc
)
4393 struct htb
*htb
= htb_get__(netdev
);
4394 size_t hash
= hash_int(queue_id
, 0);
4395 struct tc_queue
*queue
;
4396 struct htb_class
*hcp
;
4398 queue
= tc_find_queue__(netdev
, queue_id
, hash
);
4400 hcp
= htb_class_cast__(queue
);
4402 hcp
= xmalloc(sizeof *hcp
);
4403 queue
= &hcp
->tc_queue
;
4404 queue
->queue_id
= queue_id
;
4405 queue
->created
= time_msec();
4406 hmap_insert(&htb
->tc
.queues
, &queue
->hmap_node
, hash
);
4409 hcp
->min_rate
= hc
->min_rate
;
4410 hcp
->max_rate
= hc
->max_rate
;
4411 hcp
->burst
= hc
->burst
;
4412 hcp
->priority
= hc
->priority
;
4416 htb_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4419 struct queue_dump_state state
;
4420 struct htb_class hc
;
4422 /* Get qdisc options. */
4424 htb_query_class__(netdev
, tc_make_handle(1, 0xfffe), 0, &hc
, NULL
);
4425 htb_install__(netdev
, hc
.max_rate
);
4428 if (!start_queue_dump(netdev
, &state
)) {
4431 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
4432 unsigned int queue_id
;
4434 if (!htb_parse_tcmsg__(&msg
, &queue_id
, &hc
, NULL
)) {
4435 htb_update_queue__(netdev
, queue_id
, &hc
);
4438 finish_queue_dump(&state
);
4444 htb_tc_destroy(struct tc
*tc
)
4446 struct htb
*htb
= CONTAINER_OF(tc
, struct htb
, tc
);
4447 struct htb_class
*hc
;
4449 HMAP_FOR_EACH_POP (hc
, tc_queue
.hmap_node
, &htb
->tc
.queues
) {
4457 htb_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
4459 const struct htb
*htb
= htb_get__(netdev
);
4460 smap_add_format(details
, "max-rate", "%llu", 8ULL * htb
->max_rate
);
4465 htb_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
4467 struct htb_class hc
;
4470 htb_parse_qdisc_details__(netdev
, details
, &hc
);
4471 error
= htb_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
4472 tc_make_handle(1, 0), &hc
);
4474 htb_get__(netdev
)->max_rate
= hc
.max_rate
;
4480 htb_class_get(const struct netdev
*netdev OVS_UNUSED
,
4481 const struct tc_queue
*queue
, struct smap
*details
)
4483 const struct htb_class
*hc
= htb_class_cast__(queue
);
4485 smap_add_format(details
, "min-rate", "%llu", 8ULL * hc
->min_rate
);
4486 if (hc
->min_rate
!= hc
->max_rate
) {
4487 smap_add_format(details
, "max-rate", "%llu", 8ULL * hc
->max_rate
);
4489 smap_add_format(details
, "burst", "%llu", 8ULL * hc
->burst
);
4491 smap_add_format(details
, "priority", "%u", hc
->priority
);
4497 htb_class_set(struct netdev
*netdev
, unsigned int queue_id
,
4498 const struct smap
*details
)
4500 struct htb_class hc
;
4503 error
= htb_parse_class_details__(netdev
, details
, &hc
);
4508 error
= htb_setup_class__(netdev
, tc_make_handle(1, queue_id
+ 1),
4509 tc_make_handle(1, 0xfffe), &hc
);
4514 htb_update_queue__(netdev
, queue_id
, &hc
);
4519 htb_class_delete(struct netdev
*netdev
, struct tc_queue
*queue
)
4521 struct htb_class
*hc
= htb_class_cast__(queue
);
4522 struct htb
*htb
= htb_get__(netdev
);
4525 error
= tc_delete_class(netdev
, tc_make_handle(1, queue
->queue_id
+ 1));
4527 hmap_remove(&htb
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
4534 htb_class_get_stats(const struct netdev
*netdev
, const struct tc_queue
*queue
,
4535 struct netdev_queue_stats
*stats
)
4537 return htb_query_class__(netdev
, tc_make_handle(1, queue
->queue_id
+ 1),
4538 tc_make_handle(1, 0xfffe), NULL
, stats
);
4542 htb_class_dump_stats(const struct netdev
*netdev OVS_UNUSED
,
4543 const struct ofpbuf
*nlmsg
,
4544 netdev_dump_queue_stats_cb
*cb
, void *aux
)
4546 struct netdev_queue_stats stats
;
4547 unsigned int handle
, major
, minor
;
4550 error
= tc_parse_class(nlmsg
, &handle
, NULL
, &stats
);
4555 major
= tc_get_major(handle
);
4556 minor
= tc_get_minor(handle
);
4557 if (major
== 1 && minor
> 0 && minor
<= HTB_N_QUEUES
) {
4558 (*cb
)(minor
- 1, &stats
, aux
);
4563 static const struct tc_ops tc_ops_htb
= {
4564 .linux_name
= "htb",
4565 .ovs_name
= "linux-htb",
4566 .n_queues
= HTB_N_QUEUES
,
4567 .tc_install
= htb_tc_install
,
4568 .tc_load
= htb_tc_load
,
4569 .tc_destroy
= htb_tc_destroy
,
4570 .qdisc_get
= htb_qdisc_get
,
4571 .qdisc_set
= htb_qdisc_set
,
4572 .class_get
= htb_class_get
,
4573 .class_set
= htb_class_set
,
4574 .class_delete
= htb_class_delete
,
4575 .class_get_stats
= htb_class_get_stats
,
4576 .class_dump_stats
= htb_class_dump_stats
4579 /* "linux-hfsc" traffic control class. */
4581 #define HFSC_N_QUEUES 0xf000
4589 struct tc_queue tc_queue
;
4594 static struct hfsc
*
4595 hfsc_get__(const struct netdev
*netdev_
)
4597 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4598 return CONTAINER_OF(netdev
->tc
, struct hfsc
, tc
);
4601 static struct hfsc_class
*
4602 hfsc_class_cast__(const struct tc_queue
*queue
)
4604 return CONTAINER_OF(queue
, struct hfsc_class
, tc_queue
);
4608 hfsc_install__(struct netdev
*netdev_
, uint32_t max_rate
)
4610 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4613 hfsc
= xmalloc(sizeof *hfsc
);
4614 tc_init(&hfsc
->tc
, &tc_ops_hfsc
);
4615 hfsc
->max_rate
= max_rate
;
4616 netdev
->tc
= &hfsc
->tc
;
4620 hfsc_update_queue__(struct netdev
*netdev
, unsigned int queue_id
,
4621 const struct hfsc_class
*hc
)
4625 struct hfsc_class
*hcp
;
4626 struct tc_queue
*queue
;
4628 hfsc
= hfsc_get__(netdev
);
4629 hash
= hash_int(queue_id
, 0);
4631 queue
= tc_find_queue__(netdev
, queue_id
, hash
);
4633 hcp
= hfsc_class_cast__(queue
);
4635 hcp
= xmalloc(sizeof *hcp
);
4636 queue
= &hcp
->tc_queue
;
4637 queue
->queue_id
= queue_id
;
4638 queue
->created
= time_msec();
4639 hmap_insert(&hfsc
->tc
.queues
, &queue
->hmap_node
, hash
);
4642 hcp
->min_rate
= hc
->min_rate
;
4643 hcp
->max_rate
= hc
->max_rate
;
4647 hfsc_parse_tca_options__(struct nlattr
*nl_options
, struct hfsc_class
*class)
4649 const struct tc_service_curve
*rsc
, *fsc
, *usc
;
4650 static const struct nl_policy tca_hfsc_policy
[] = {
4652 .type
= NL_A_UNSPEC
,
4654 .min_len
= sizeof(struct tc_service_curve
),
4657 .type
= NL_A_UNSPEC
,
4659 .min_len
= sizeof(struct tc_service_curve
),
4662 .type
= NL_A_UNSPEC
,
4664 .min_len
= sizeof(struct tc_service_curve
),
4667 struct nlattr
*attrs
[ARRAY_SIZE(tca_hfsc_policy
)];
4669 if (!nl_parse_nested(nl_options
, tca_hfsc_policy
,
4670 attrs
, ARRAY_SIZE(tca_hfsc_policy
))) {
4671 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options");
4675 rsc
= nl_attr_get(attrs
[TCA_HFSC_RSC
]);
4676 fsc
= nl_attr_get(attrs
[TCA_HFSC_FSC
]);
4677 usc
= nl_attr_get(attrs
[TCA_HFSC_USC
]);
4679 if (rsc
->m1
!= 0 || rsc
->d
!= 0 ||
4680 fsc
->m1
!= 0 || fsc
->d
!= 0 ||
4681 usc
->m1
!= 0 || usc
->d
!= 0) {
4682 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
4683 "Non-linear service curves are not supported.");
4687 if (rsc
->m2
!= fsc
->m2
) {
4688 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
4689 "Real-time service curves are not supported ");
4693 if (rsc
->m2
> usc
->m2
) {
4694 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
4695 "Min-rate service curve is greater than "
4696 "the max-rate service curve.");
4700 class->min_rate
= fsc
->m2
;
4701 class->max_rate
= usc
->m2
;
4706 hfsc_parse_tcmsg__(struct ofpbuf
*tcmsg
, unsigned int *queue_id
,
4707 struct hfsc_class
*options
,
4708 struct netdev_queue_stats
*stats
)
4711 unsigned int handle
;
4712 struct nlattr
*nl_options
;
4714 error
= tc_parse_class(tcmsg
, &handle
, &nl_options
, stats
);
4720 unsigned int major
, minor
;
4722 major
= tc_get_major(handle
);
4723 minor
= tc_get_minor(handle
);
4724 if (major
== 1 && minor
> 0 && minor
<= HFSC_N_QUEUES
) {
4725 *queue_id
= minor
- 1;
4732 error
= hfsc_parse_tca_options__(nl_options
, options
);
4739 hfsc_query_class__(const struct netdev
*netdev
, unsigned int handle
,
4740 unsigned int parent
, struct hfsc_class
*options
,
4741 struct netdev_queue_stats
*stats
)
4744 struct ofpbuf
*reply
;
4746 error
= tc_query_class(netdev
, handle
, parent
, &reply
);
4751 error
= hfsc_parse_tcmsg__(reply
, NULL
, options
, stats
);
4752 ofpbuf_delete(reply
);
4757 hfsc_parse_qdisc_details__(struct netdev
*netdev_
, const struct smap
*details
,
4758 struct hfsc_class
*class)
4760 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4762 uint32_t max_rate
= smap_get_ullong(details
, "max-rate", 0) / 8;
4764 enum netdev_features current
;
4766 netdev_linux_read_features(netdev
);
4767 current
= !netdev
->get_features_error
? netdev
->current
: 0;
4768 max_rate
= netdev_features_to_bps(current
, 100 * 1000 * 1000) / 8;
4771 class->min_rate
= max_rate
;
4772 class->max_rate
= max_rate
;
4776 hfsc_parse_class_details__(struct netdev
*netdev
,
4777 const struct smap
*details
,
4778 struct hfsc_class
* class)
4780 const struct hfsc
*hfsc
;
4781 uint32_t min_rate
, max_rate
;
4783 hfsc
= hfsc_get__(netdev
);
4785 min_rate
= smap_get_ullong(details
, "min-rate", 0) / 8;
4786 min_rate
= MAX(min_rate
, 1);
4787 min_rate
= MIN(min_rate
, hfsc
->max_rate
);
4789 max_rate
= smap_get_ullong(details
, "max-rate", hfsc
->max_rate
* 8) / 8;
4790 max_rate
= MAX(max_rate
, min_rate
);
4791 max_rate
= MIN(max_rate
, hfsc
->max_rate
);
4793 class->min_rate
= min_rate
;
4794 class->max_rate
= max_rate
;
4799 /* Create an HFSC qdisc.
4801 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
4803 hfsc_setup_qdisc__(struct netdev
* netdev
)
4805 struct tcmsg
*tcmsg
;
4806 struct ofpbuf request
;
4807 struct tc_hfsc_qopt opt
;
4809 tc_del_qdisc(netdev
);
4811 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
4812 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
4818 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
4819 tcmsg
->tcm_parent
= TC_H_ROOT
;
4821 memset(&opt
, 0, sizeof opt
);
4824 nl_msg_put_string(&request
, TCA_KIND
, "hfsc");
4825 nl_msg_put_unspec(&request
, TCA_OPTIONS
, &opt
, sizeof opt
);
4827 return tc_transact(&request
, NULL
);
4830 /* Create an HFSC class.
4832 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
4833 * sc rate <min_rate> ul rate <max_rate>" */
4835 hfsc_setup_class__(struct netdev
*netdev
, unsigned int handle
,
4836 unsigned int parent
, struct hfsc_class
*class)
4840 struct tcmsg
*tcmsg
;
4841 struct ofpbuf request
;
4842 struct tc_service_curve min
, max
;
4844 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWTCLASS
, NLM_F_CREATE
,
4851 tcmsg
->tcm_handle
= handle
;
4852 tcmsg
->tcm_parent
= parent
;
4856 min
.m2
= class->min_rate
;
4860 max
.m2
= class->max_rate
;
4862 nl_msg_put_string(&request
, TCA_KIND
, "hfsc");
4863 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
4864 nl_msg_put_unspec(&request
, TCA_HFSC_RSC
, &min
, sizeof min
);
4865 nl_msg_put_unspec(&request
, TCA_HFSC_FSC
, &min
, sizeof min
);
4866 nl_msg_put_unspec(&request
, TCA_HFSC_USC
, &max
, sizeof max
);
4867 nl_msg_end_nested(&request
, opt_offset
);
4869 error
= tc_transact(&request
, NULL
);
4871 VLOG_WARN_RL(&rl
, "failed to replace %s class %u:%u, parent %u:%u, "
4872 "min-rate %ubps, max-rate %ubps (%s)",
4873 netdev_get_name(netdev
),
4874 tc_get_major(handle
), tc_get_minor(handle
),
4875 tc_get_major(parent
), tc_get_minor(parent
),
4876 class->min_rate
, class->max_rate
, ovs_strerror(error
));
4883 hfsc_tc_install(struct netdev
*netdev
, const struct smap
*details
)
4886 struct hfsc_class
class;
4888 error
= hfsc_setup_qdisc__(netdev
);
4894 hfsc_parse_qdisc_details__(netdev
, details
, &class);
4895 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
4896 tc_make_handle(1, 0), &class);
4902 hfsc_install__(netdev
, class.max_rate
);
4907 hfsc_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4910 struct queue_dump_state state
;
4911 struct hfsc_class hc
;
4914 hfsc_query_class__(netdev
, tc_make_handle(1, 0xfffe), 0, &hc
, NULL
);
4915 hfsc_install__(netdev
, hc
.max_rate
);
4917 if (!start_queue_dump(netdev
, &state
)) {
4921 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
4922 unsigned int queue_id
;
4924 if (!hfsc_parse_tcmsg__(&msg
, &queue_id
, &hc
, NULL
)) {
4925 hfsc_update_queue__(netdev
, queue_id
, &hc
);
4929 finish_queue_dump(&state
);
4934 hfsc_tc_destroy(struct tc
*tc
)
4937 struct hfsc_class
*hc
, *next
;
4939 hfsc
= CONTAINER_OF(tc
, struct hfsc
, tc
);
4941 HMAP_FOR_EACH_SAFE (hc
, next
, tc_queue
.hmap_node
, &hfsc
->tc
.queues
) {
4942 hmap_remove(&hfsc
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
4951 hfsc_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
4953 const struct hfsc
*hfsc
;
4954 hfsc
= hfsc_get__(netdev
);
4955 smap_add_format(details
, "max-rate", "%llu", 8ULL * hfsc
->max_rate
);
4960 hfsc_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
4963 struct hfsc_class
class;
4965 hfsc_parse_qdisc_details__(netdev
, details
, &class);
4966 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
4967 tc_make_handle(1, 0), &class);
4970 hfsc_get__(netdev
)->max_rate
= class.max_rate
;
4977 hfsc_class_get(const struct netdev
*netdev OVS_UNUSED
,
4978 const struct tc_queue
*queue
, struct smap
*details
)
4980 const struct hfsc_class
*hc
;
4982 hc
= hfsc_class_cast__(queue
);
4983 smap_add_format(details
, "min-rate", "%llu", 8ULL * hc
->min_rate
);
4984 if (hc
->min_rate
!= hc
->max_rate
) {
4985 smap_add_format(details
, "max-rate", "%llu", 8ULL * hc
->max_rate
);
4991 hfsc_class_set(struct netdev
*netdev
, unsigned int queue_id
,
4992 const struct smap
*details
)
4995 struct hfsc_class
class;
4997 error
= hfsc_parse_class_details__(netdev
, details
, &class);
5002 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, queue_id
+ 1),
5003 tc_make_handle(1, 0xfffe), &class);
5008 hfsc_update_queue__(netdev
, queue_id
, &class);
5013 hfsc_class_delete(struct netdev
*netdev
, struct tc_queue
*queue
)
5017 struct hfsc_class
*hc
;
5019 hc
= hfsc_class_cast__(queue
);
5020 hfsc
= hfsc_get__(netdev
);
5022 error
= tc_delete_class(netdev
, tc_make_handle(1, queue
->queue_id
+ 1));
5024 hmap_remove(&hfsc
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
5031 hfsc_class_get_stats(const struct netdev
*netdev
, const struct tc_queue
*queue
,
5032 struct netdev_queue_stats
*stats
)
5034 return hfsc_query_class__(netdev
, tc_make_handle(1, queue
->queue_id
+ 1),
5035 tc_make_handle(1, 0xfffe), NULL
, stats
);
5039 hfsc_class_dump_stats(const struct netdev
*netdev OVS_UNUSED
,
5040 const struct ofpbuf
*nlmsg
,
5041 netdev_dump_queue_stats_cb
*cb
, void *aux
)
5043 struct netdev_queue_stats stats
;
5044 unsigned int handle
, major
, minor
;
5047 error
= tc_parse_class(nlmsg
, &handle
, NULL
, &stats
);
5052 major
= tc_get_major(handle
);
5053 minor
= tc_get_minor(handle
);
5054 if (major
== 1 && minor
> 0 && minor
<= HFSC_N_QUEUES
) {
5055 (*cb
)(minor
- 1, &stats
, aux
);
5060 static const struct tc_ops tc_ops_hfsc
= {
5061 .linux_name
= "hfsc",
5062 .ovs_name
= "linux-hfsc",
5063 .n_queues
= HFSC_N_QUEUES
, /* n_queues */
5064 .tc_install
= hfsc_tc_install
,
5065 .tc_load
= hfsc_tc_load
,
5066 .tc_destroy
= hfsc_tc_destroy
,
5067 .qdisc_get
= hfsc_qdisc_get
,
5068 .qdisc_set
= hfsc_qdisc_set
,
5069 .class_get
= hfsc_class_get
,
5070 .class_set
= hfsc_class_set
,
5071 .class_delete
= hfsc_class_delete
,
5072 .class_get_stats
= hfsc_class_get_stats
,
5073 .class_dump_stats
= hfsc_class_dump_stats
,
5076 /* "linux-noop" traffic control class. */
5079 noop_install__(struct netdev
*netdev_
)
5081 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5082 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_default
);
5084 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
5088 noop_tc_install(struct netdev
*netdev
,
5089 const struct smap
*details OVS_UNUSED
)
5091 noop_install__(netdev
);
5096 noop_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
5098 noop_install__(netdev
);
5102 static const struct tc_ops tc_ops_noop
= {
5103 .ovs_name
= "linux-noop", /* ovs_name */
5104 .tc_install
= noop_tc_install
,
5105 .tc_load
= noop_tc_load
,
5108 /* "linux-default" traffic control class.
5110 * This class represents the default, unnamed Linux qdisc. It corresponds to
5111 * the "" (empty string) QoS type in the OVS database. */
5114 default_install__(struct netdev
*netdev_
)
5116 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5117 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_default
);
5119 /* Nothing but a tc class implementation is allowed to write to a tc. This
5120 * class never does that, so we can legitimately use a const tc object. */
5121 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
5125 default_tc_install(struct netdev
*netdev
,
5126 const struct smap
*details OVS_UNUSED
)
5128 default_install__(netdev
);
5133 default_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
5135 default_install__(netdev
);
5139 static const struct tc_ops tc_ops_default
= {
5140 .ovs_name
= "", /* ovs_name */
5141 .tc_install
= default_tc_install
,
5142 .tc_load
= default_tc_load
,
5145 /* "linux-other" traffic control class.
5150 other_tc_load(struct netdev
*netdev_
, struct ofpbuf
*nlmsg OVS_UNUSED
)
5152 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5153 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_other
);
5155 /* Nothing but a tc class implementation is allowed to write to a tc. This
5156 * class never does that, so we can legitimately use a const tc object. */
5157 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
5161 static const struct tc_ops tc_ops_other
= {
5162 .ovs_name
= "linux-other",
5163 .tc_load
= other_tc_load
,
5166 /* Traffic control. */
5168 /* Number of kernel "tc" ticks per second. */
5169 static double ticks_per_s
;
5171 /* Number of kernel "jiffies" per second. This is used for the purpose of
5172 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
5173 * one jiffy's worth of data.
5175 * There are two possibilities here:
5177 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
5178 * approximate range of 100 to 1024. That means that we really need to
5179 * make sure that the qdisc can buffer that much data.
5181 * - 'buffer_hz' is an absurdly large number. That means that the kernel
5182 * has finely granular timers and there's no need to fudge additional room
5183 * for buffers. (There's no extra effort needed to implement that: the
5184 * large 'buffer_hz' is used as a divisor, so practically any number will
5185 * come out as 0 in the division. Small integer results in the case of
5186 * really high dividends won't have any real effect anyhow.)
5188 static unsigned int buffer_hz
;
5190 static struct tcmsg
*
5191 netdev_linux_tc_make_request(const struct netdev
*netdev
, int type
,
5192 unsigned int flags
, struct ofpbuf
*request
)
5197 error
= get_ifindex(netdev
, &ifindex
);
5202 return tc_make_request(ifindex
, type
, flags
, request
);
5205 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
5208 * This function is equivalent to running:
5209 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
5210 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
5213 * The configuration and stats may be seen with the following command:
5214 * /sbin/tc -s filter show dev <devname> parent ffff:
5216 * Returns 0 if successful, otherwise a positive errno value.
5219 tc_add_policer(struct netdev
*netdev
,
5220 uint32_t kbits_rate
, uint32_t kbits_burst
)
5222 struct tc_police tc_police
;
5223 struct ofpbuf request
;
5224 struct tcmsg
*tcmsg
;
5225 size_t basic_offset
;
5226 size_t police_offset
;
5230 memset(&tc_police
, 0, sizeof tc_police
);
5231 tc_police
.action
= TC_POLICE_SHOT
;
5232 tc_police
.mtu
= mtu
;
5233 tc_fill_rate(&tc_police
.rate
, ((uint64_t) kbits_rate
* 1000)/8, mtu
);
5235 /* The following appears wrong in one way: In networking a kilobit is
5236 * usually 1000 bits but this uses 1024 bits.
5238 * However if you "fix" those problems then "tc filter show ..." shows
5239 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
5240 * 1,000,000 bits, whereas this actually ends up doing the right thing from
5241 * tc's point of view. Whatever. */
5242 tc_police
.burst
= tc_bytes_to_ticks(
5243 tc_police
.rate
.rate
, MIN(UINT32_MAX
/ 1024, kbits_burst
) * 1024 / 8);
5245 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWTFILTER
,
5246 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
5250 tcmsg
->tcm_parent
= tc_make_handle(0xffff, 0);
5251 tcmsg
->tcm_info
= tc_make_handle(49,
5252 (OVS_FORCE
uint16_t) htons(ETH_P_ALL
));
5254 nl_msg_put_string(&request
, TCA_KIND
, "basic");
5255 basic_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
5256 police_offset
= nl_msg_start_nested(&request
, TCA_BASIC_POLICE
);
5257 nl_msg_put_unspec(&request
, TCA_POLICE_TBF
, &tc_police
, sizeof tc_police
);
5258 tc_put_rtab(&request
, TCA_POLICE_RATE
, &tc_police
.rate
);
5259 nl_msg_end_nested(&request
, police_offset
);
5260 nl_msg_end_nested(&request
, basic_offset
);
5262 error
= tc_transact(&request
, NULL
);
5273 /* The values in psched are not individually very meaningful, but they are
5274 * important. The tables below show some values seen in the wild.
5278 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
5279 * (Before that, there are hints that it was 1000000000.)
5281 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
5285 * -----------------------------------
5286 * [1] 000c8000 000f4240 000f4240 00000064
5287 * [2] 000003e8 00000400 000f4240 3b9aca00
5288 * [3] 000003e8 00000400 000f4240 3b9aca00
5289 * [4] 000003e8 00000400 000f4240 00000064
5290 * [5] 000003e8 00000040 000f4240 3b9aca00
5291 * [6] 000003e8 00000040 000f4240 000000f9
5293 * a b c d ticks_per_s buffer_hz
5294 * ------- --------- ---------- ------------- ----------- -------------
5295 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
5296 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
5297 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
5298 * [4] 1,000 1,024 1,000,000 100 976,562 100
5299 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
5300 * [6] 1,000 64 1,000,000 249 15,625,000 249
5302 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
5303 * [2] 2.6.26-1-686-bigmem from Debian lenny
5304 * [3] 2.6.26-2-sparc64 from Debian lenny
5305 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
5306 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
5307 * [6] 2.6.34 from kernel.org on KVM
5309 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
5310 static const char fn
[] = "/proc/net/psched";
5311 unsigned int a
, b
, c
, d
;
5314 if (!ovsthread_once_start(&once
)) {
5321 stream
= fopen(fn
, "r");
5323 VLOG_WARN("%s: open failed: %s", fn
, ovs_strerror(errno
));
5327 if (fscanf(stream
, "%x %x %x %x", &a
, &b
, &c
, &d
) != 4) {
5328 VLOG_WARN("%s: read failed", fn
);
5332 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn
, a
, b
, c
, d
);
5335 if (!a
|| !b
|| !c
) {
5336 VLOG_WARN("%s: invalid scheduler parameters", fn
);
5340 ticks_per_s
= (double) a
* c
/ b
;
5344 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
5347 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn
, ticks_per_s
, buffer_hz
);
5350 ovsthread_once_done(&once
);
5353 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
5354 * rate of 'rate' bytes per second. */
5356 tc_ticks_to_bytes(unsigned int rate
, unsigned int ticks
)
5359 return (rate
* ticks
) / ticks_per_s
;
5362 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
5363 * rate of 'rate' bytes per second. */
5365 tc_bytes_to_ticks(unsigned int rate
, unsigned int size
)
5368 return rate
? ((unsigned long long int) ticks_per_s
* size
) / rate
: 0;
5371 /* Returns the number of bytes that need to be reserved for qdisc buffering at
5372 * a transmission rate of 'rate' bytes per second. */
5374 tc_buffer_per_jiffy(unsigned int rate
)
5377 return rate
/ buffer_hz
;
5381 tc_time_to_ticks(uint32_t time
) {
5383 return time
* (ticks_per_s
/ 1000000);
5386 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
5387 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
5388 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
5389 * stores NULL into it if it is absent.
5391 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
5394 * Returns 0 if successful, otherwise a positive errno value. */
5396 tc_parse_qdisc(const struct ofpbuf
*msg
, const char **kind
,
5397 struct nlattr
**options
)
5399 static const struct nl_policy tca_policy
[] = {
5400 [TCA_KIND
] = { .type
= NL_A_STRING
, .optional
= false },
5401 [TCA_OPTIONS
] = { .type
= NL_A_NESTED
, .optional
= true },
5403 struct nlattr
*ta
[ARRAY_SIZE(tca_policy
)];
5405 if (!nl_policy_parse(msg
, NLMSG_HDRLEN
+ sizeof(struct tcmsg
),
5406 tca_policy
, ta
, ARRAY_SIZE(ta
))) {
5407 VLOG_WARN_RL(&rl
, "failed to parse qdisc message");
5412 *kind
= nl_attr_get_string(ta
[TCA_KIND
]);
5416 *options
= ta
[TCA_OPTIONS
];
5431 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
5432 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
5433 * into '*options', and its queue statistics into '*stats'. Any of the output
5434 * arguments may be null.
5436 * Returns 0 if successful, otherwise a positive errno value. */
5438 tc_parse_class(const struct ofpbuf
*msg
, unsigned int *handlep
,
5439 struct nlattr
**options
, struct netdev_queue_stats
*stats
)
5441 static const struct nl_policy tca_policy
[] = {
5442 [TCA_OPTIONS
] = { .type
= NL_A_NESTED
, .optional
= false },
5443 [TCA_STATS2
] = { .type
= NL_A_NESTED
, .optional
= false },
5445 struct nlattr
*ta
[ARRAY_SIZE(tca_policy
)];
5447 if (!nl_policy_parse(msg
, NLMSG_HDRLEN
+ sizeof(struct tcmsg
),
5448 tca_policy
, ta
, ARRAY_SIZE(ta
))) {
5449 VLOG_WARN_RL(&rl
, "failed to parse class message");
5454 struct tcmsg
*tc
= ofpbuf_at_assert(msg
, NLMSG_HDRLEN
, sizeof *tc
);
5455 *handlep
= tc
->tcm_handle
;
5459 *options
= ta
[TCA_OPTIONS
];
5463 const struct gnet_stats_queue
*gsq
;
5464 struct gnet_stats_basic gsb
;
5466 static const struct nl_policy stats_policy
[] = {
5467 [TCA_STATS_BASIC
] = { .type
= NL_A_UNSPEC
, .optional
= false,
5468 .min_len
= sizeof gsb
},
5469 [TCA_STATS_QUEUE
] = { .type
= NL_A_UNSPEC
, .optional
= false,
5470 .min_len
= sizeof *gsq
},
5472 struct nlattr
*sa
[ARRAY_SIZE(stats_policy
)];
5474 if (!nl_parse_nested(ta
[TCA_STATS2
], stats_policy
,
5475 sa
, ARRAY_SIZE(sa
))) {
5476 VLOG_WARN_RL(&rl
, "failed to parse class stats");
5480 /* Alignment issues screw up the length of struct gnet_stats_basic on
5481 * some arch/bitsize combinations. Newer versions of Linux have a
5482 * struct gnet_stats_basic_packed, but we can't depend on that. The
5483 * easiest thing to do is just to make a copy. */
5484 memset(&gsb
, 0, sizeof gsb
);
5485 memcpy(&gsb
, nl_attr_get(sa
[TCA_STATS_BASIC
]),
5486 MIN(nl_attr_get_size(sa
[TCA_STATS_BASIC
]), sizeof gsb
));
5487 stats
->tx_bytes
= gsb
.bytes
;
5488 stats
->tx_packets
= gsb
.packets
;
5490 gsq
= nl_attr_get(sa
[TCA_STATS_QUEUE
]);
5491 stats
->tx_errors
= gsq
->drops
;
5501 memset(stats
, 0, sizeof *stats
);
5506 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
5509 tc_query_class(const struct netdev
*netdev
,
5510 unsigned int handle
, unsigned int parent
,
5511 struct ofpbuf
**replyp
)
5513 struct ofpbuf request
;
5514 struct tcmsg
*tcmsg
;
5517 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_GETTCLASS
, NLM_F_ECHO
,
5522 tcmsg
->tcm_handle
= handle
;
5523 tcmsg
->tcm_parent
= parent
;
5525 error
= tc_transact(&request
, replyp
);
5527 VLOG_WARN_RL(&rl
, "query %s class %u:%u (parent %u:%u) failed (%s)",
5528 netdev_get_name(netdev
),
5529 tc_get_major(handle
), tc_get_minor(handle
),
5530 tc_get_major(parent
), tc_get_minor(parent
),
5531 ovs_strerror(error
));
5536 /* Equivalent to "tc class del dev <name> handle <handle>". */
5538 tc_delete_class(const struct netdev
*netdev
, unsigned int handle
)
5540 struct ofpbuf request
;
5541 struct tcmsg
*tcmsg
;
5544 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_DELTCLASS
, 0, &request
);
5548 tcmsg
->tcm_handle
= handle
;
5549 tcmsg
->tcm_parent
= 0;
5551 error
= tc_transact(&request
, NULL
);
5553 VLOG_WARN_RL(&rl
, "delete %s class %u:%u failed (%s)",
5554 netdev_get_name(netdev
),
5555 tc_get_major(handle
), tc_get_minor(handle
),
5556 ovs_strerror(error
));
5561 /* Equivalent to "tc qdisc del dev <name> root". */
5563 tc_del_qdisc(struct netdev
*netdev_
)
5565 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5566 struct ofpbuf request
;
5567 struct tcmsg
*tcmsg
;
5570 tcmsg
= netdev_linux_tc_make_request(netdev_
, RTM_DELQDISC
, 0, &request
);
5574 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
5575 tcmsg
->tcm_parent
= TC_H_ROOT
;
5577 error
= tc_transact(&request
, NULL
);
5578 if (error
== EINVAL
) {
5579 /* EINVAL probably means that the default qdisc was in use, in which
5580 * case we've accomplished our purpose. */
5583 if (!error
&& netdev
->tc
) {
5584 if (netdev
->tc
->ops
->tc_destroy
) {
5585 netdev
->tc
->ops
->tc_destroy(netdev
->tc
);
5593 getqdisc_is_safe(void)
5595 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
5596 static bool safe
= false;
5598 if (ovsthread_once_start(&once
)) {
5599 struct utsname utsname
;
5602 if (uname(&utsname
) == -1) {
5603 VLOG_WARN("uname failed (%s)", ovs_strerror(errno
));
5604 } else if (!ovs_scan(utsname
.release
, "%d.%d", &major
, &minor
)) {
5605 VLOG_WARN("uname reported bad OS release (%s)", utsname
.release
);
5606 } else if (major
< 2 || (major
== 2 && minor
< 35)) {
5607 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5612 ovsthread_once_done(&once
);
5617 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
5618 * kernel to determine what they are. Returns 0 if successful, otherwise a
5619 * positive errno value. */
5621 tc_query_qdisc(const struct netdev
*netdev_
)
5623 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5624 struct ofpbuf request
, *qdisc
;
5625 const struct tc_ops
*ops
;
5626 struct tcmsg
*tcmsg
;
5634 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5635 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5636 * 2.6.35 without that fix backported to it.
5638 * To avoid the OOPS, we must not make a request that would attempt to dump
5639 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5640 * few others. There are a few ways that I can see to do this, but most of
5641 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5642 * technique chosen here is to assume that any non-default qdisc that we
5643 * create will have a class with handle 1:0. The built-in qdiscs only have
5644 * a class with handle 0:0.
5646 * On Linux 2.6.35+ we use the straightforward method because it allows us
5647 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5648 * in such a case we get no response at all from the kernel (!) if a
5649 * builtin qdisc is in use (which is later caught by "!error &&
5650 * !qdisc->size"). */
5651 tcmsg
= netdev_linux_tc_make_request(netdev_
, RTM_GETQDISC
, NLM_F_ECHO
,
5656 tcmsg
->tcm_handle
= tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5657 tcmsg
->tcm_parent
= getqdisc_is_safe() ? TC_H_ROOT
: 0;
5659 /* Figure out what tc class to instantiate. */
5660 error
= tc_transact(&request
, &qdisc
);
5661 if (!error
&& qdisc
->size
) {
5664 error
= tc_parse_qdisc(qdisc
, &kind
, NULL
);
5666 ops
= &tc_ops_other
;
5668 ops
= tc_lookup_linux_name(kind
);
5670 static struct vlog_rate_limit rl2
= VLOG_RATE_LIMIT_INIT(1, 1);
5671 VLOG_DBG_RL(&rl2
, "unknown qdisc \"%s\"", kind
);
5673 ops
= &tc_ops_other
;
5676 } else if ((!error
&& !qdisc
->size
) || error
== ENOENT
) {
5677 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5678 * set up by some other entity that doesn't have a handle 1:0. We will
5679 * assume that it's the system default qdisc. */
5680 ops
= &tc_ops_default
;
5683 /* Who knows? Maybe the device got deleted. */
5684 VLOG_WARN_RL(&rl
, "query %s qdisc failed (%s)",
5685 netdev_get_name(netdev_
), ovs_strerror(error
));
5686 ops
= &tc_ops_other
;
5689 /* Instantiate it. */
5690 load_error
= ops
->tc_load(CONST_CAST(struct netdev
*, netdev_
), qdisc
);
5691 ovs_assert((load_error
== 0) == (netdev
->tc
!= NULL
));
5692 ofpbuf_delete(qdisc
);
5694 return error
? error
: load_error
;
5697 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5698 approximate the time to transmit packets of various lengths. For an MTU of
5699 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5700 represents two possible packet lengths; for a MTU of 513 through 1024, four
5701 possible lengths; and so on.
5703 Returns, for the specified 'mtu', the number of bits that packet lengths
5704 need to be shifted right to fit within such a 256-entry table. */
5706 tc_calc_cell_log(unsigned int mtu
)
5711 mtu
= ETH_PAYLOAD_MAX
;
5713 mtu
+= ETH_HEADER_LEN
+ VLAN_HEADER_LEN
;
5715 for (cell_log
= 0; mtu
>= 256; cell_log
++) {
5722 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5725 tc_fill_rate(struct tc_ratespec
*rate
, uint64_t Bps
, int mtu
)
5727 memset(rate
, 0, sizeof *rate
);
5728 rate
->cell_log
= tc_calc_cell_log(mtu
);
5729 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5730 /* rate->cell_align = 0; */ /* distro headers. */
5731 rate
->mpu
= ETH_TOTAL_MIN
;
5735 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
5736 * attribute of the specified "type".
5738 * See tc_calc_cell_log() above for a description of "rtab"s. */
5740 tc_put_rtab(struct ofpbuf
*msg
, uint16_t type
, const struct tc_ratespec
*rate
)
5745 rtab
= nl_msg_put_unspec_uninit(msg
, type
, TC_RTAB_SIZE
);
5746 for (i
= 0; i
< TC_RTAB_SIZE
/ sizeof *rtab
; i
++) {
5747 unsigned packet_size
= (i
+ 1) << rate
->cell_log
;
5748 if (packet_size
< rate
->mpu
) {
5749 packet_size
= rate
->mpu
;
5751 rtab
[i
] = tc_bytes_to_ticks(rate
->rate
, packet_size
);
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes'
 * of 0 yields at least one jiffy's worth of buffering plus one MTU.) */
static unsigned int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
}
5766 /* Linux-only functions declared in netdev-linux.h */
5768 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5769 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5771 netdev_linux_ethtool_set_flag(struct netdev
*netdev
, uint32_t flag
,
5772 const char *flag_name
, bool enable
)
5774 const char *netdev_name
= netdev_get_name(netdev
);
5775 struct ethtool_value evalue
;
5779 COVERAGE_INC(netdev_get_ethtool
);
5780 memset(&evalue
, 0, sizeof evalue
);
5781 error
= netdev_linux_do_ethtool(netdev_name
,
5782 (struct ethtool_cmd
*)&evalue
,
5783 ETHTOOL_GFLAGS
, "ETHTOOL_GFLAGS");
5788 COVERAGE_INC(netdev_set_ethtool
);
5789 new_flags
= (evalue
.data
& ~flag
) | (enable
? flag
: 0);
5790 if (new_flags
== evalue
.data
) {
5793 evalue
.data
= new_flags
;
5794 error
= netdev_linux_do_ethtool(netdev_name
,
5795 (struct ethtool_cmd
*)&evalue
,
5796 ETHTOOL_SFLAGS
, "ETHTOOL_SFLAGS");
5801 COVERAGE_INC(netdev_get_ethtool
);
5802 memset(&evalue
, 0, sizeof evalue
);
5803 error
= netdev_linux_do_ethtool(netdev_name
,
5804 (struct ethtool_cmd
*)&evalue
,
5805 ETHTOOL_GFLAGS
, "ETHTOOL_GFLAGS");
5810 if (new_flags
!= evalue
.data
) {
5811 VLOG_WARN_RL(&rl
, "attempt to %s ethtool %s flag on network "
5812 "device %s failed", enable
? "enable" : "disable",
5813 flag_name
, netdev_name
);
5820 /* Utility functions. */
5822 /* Copies 'src' into 'dst', performing format conversion in the process. */
5824 netdev_stats_from_rtnl_link_stats(struct netdev_stats
*dst
,
5825 const struct rtnl_link_stats
*src
)
5827 dst
->rx_packets
= src
->rx_packets
;
5828 dst
->tx_packets
= src
->tx_packets
;
5829 dst
->rx_bytes
= src
->rx_bytes
;
5830 dst
->tx_bytes
= src
->tx_bytes
;
5831 dst
->rx_errors
= src
->rx_errors
;
5832 dst
->tx_errors
= src
->tx_errors
;
5833 dst
->rx_dropped
= src
->rx_dropped
;
5834 dst
->tx_dropped
= src
->tx_dropped
;
5835 dst
->multicast
= src
->multicast
;
5836 dst
->collisions
= src
->collisions
;
5837 dst
->rx_length_errors
= src
->rx_length_errors
;
5838 dst
->rx_over_errors
= src
->rx_over_errors
;
5839 dst
->rx_crc_errors
= src
->rx_crc_errors
;
5840 dst
->rx_frame_errors
= src
->rx_frame_errors
;
5841 dst
->rx_fifo_errors
= src
->rx_fifo_errors
;
5842 dst
->rx_missed_errors
= src
->rx_missed_errors
;
5843 dst
->tx_aborted_errors
= src
->tx_aborted_errors
;
5844 dst
->tx_carrier_errors
= src
->tx_carrier_errors
;
5845 dst
->tx_fifo_errors
= src
->tx_fifo_errors
;
5846 dst
->tx_heartbeat_errors
= src
->tx_heartbeat_errors
;
5847 dst
->tx_window_errors
= src
->tx_window_errors
;
5850 /* Copies 'src' into 'dst', performing format conversion in the process. */
5852 netdev_stats_from_rtnl_link_stats64(struct netdev_stats
*dst
,
5853 const struct rtnl_link_stats64
*src
)
5855 dst
->rx_packets
= src
->rx_packets
;
5856 dst
->tx_packets
= src
->tx_packets
;
5857 dst
->rx_bytes
= src
->rx_bytes
;
5858 dst
->tx_bytes
= src
->tx_bytes
;
5859 dst
->rx_errors
= src
->rx_errors
;
5860 dst
->tx_errors
= src
->tx_errors
;
5861 dst
->rx_dropped
= src
->rx_dropped
;
5862 dst
->tx_dropped
= src
->tx_dropped
;
5863 dst
->multicast
= src
->multicast
;
5864 dst
->collisions
= src
->collisions
;
5865 dst
->rx_length_errors
= src
->rx_length_errors
;
5866 dst
->rx_over_errors
= src
->rx_over_errors
;
5867 dst
->rx_crc_errors
= src
->rx_crc_errors
;
5868 dst
->rx_frame_errors
= src
->rx_frame_errors
;
5869 dst
->rx_fifo_errors
= src
->rx_fifo_errors
;
5870 dst
->rx_missed_errors
= src
->rx_missed_errors
;
5871 dst
->tx_aborted_errors
= src
->tx_aborted_errors
;
5872 dst
->tx_carrier_errors
= src
->tx_carrier_errors
;
5873 dst
->tx_fifo_errors
= src
->tx_fifo_errors
;
5874 dst
->tx_heartbeat_errors
= src
->tx_heartbeat_errors
;
5875 dst
->tx_window_errors
= src
->tx_window_errors
;
5879 get_stats_via_netlink(const struct netdev
*netdev_
, struct netdev_stats
*stats
)
5881 struct ofpbuf request
;
5882 struct ofpbuf
*reply
;
5885 /* Filtering all counters by default */
5886 memset(stats
, 0xFF, sizeof(struct netdev_stats
));
5888 ofpbuf_init(&request
, 0);
5889 nl_msg_put_nlmsghdr(&request
,
5890 sizeof(struct ifinfomsg
) + NL_ATTR_SIZE(IFNAMSIZ
),
5891 RTM_GETLINK
, NLM_F_REQUEST
);
5892 ofpbuf_put_zeros(&request
, sizeof(struct ifinfomsg
));
5893 nl_msg_put_string(&request
, IFLA_IFNAME
, netdev_get_name(netdev_
));
5894 error
= nl_transact(NETLINK_ROUTE
, &request
, &reply
);
5895 ofpbuf_uninit(&request
);
5900 if (ofpbuf_try_pull(reply
, NLMSG_HDRLEN
+ sizeof(struct ifinfomsg
))) {
5901 const struct nlattr
*a
= nl_attr_find(reply
, 0, IFLA_STATS64
);
5902 if (a
&& nl_attr_get_size(a
) >= sizeof(struct rtnl_link_stats64
)) {
5903 netdev_stats_from_rtnl_link_stats64(stats
, nl_attr_get(a
));
5906 a
= nl_attr_find(reply
, 0, IFLA_STATS
);
5907 if (a
&& nl_attr_get_size(a
) >= sizeof(struct rtnl_link_stats
)) {
5908 netdev_stats_from_rtnl_link_stats(stats
, nl_attr_get(a
));
5911 VLOG_WARN_RL(&rl
, "RTM_GETLINK reply lacks stats");
5916 VLOG_WARN_RL(&rl
, "short RTM_GETLINK reply");
5921 ofpbuf_delete(reply
);
5926 get_flags(const struct netdev
*dev
, unsigned int *flags
)
5932 error
= af_inet_ifreq_ioctl(dev
->name
, &ifr
, SIOCGIFFLAGS
, "SIOCGIFFLAGS");
5934 *flags
= ifr
.ifr_flags
;
5940 set_flags(const char *name
, unsigned int flags
)
5944 ifr
.ifr_flags
= flags
;
5945 return af_inet_ifreq_ioctl(name
, &ifr
, SIOCSIFFLAGS
, "SIOCSIFFLAGS");
5949 linux_get_ifindex(const char *netdev_name
)
5954 ovs_strzcpy(ifr
.ifr_name
, netdev_name
, sizeof ifr
.ifr_name
);
5955 COVERAGE_INC(netdev_get_ifindex
);
5957 error
= af_inet_ioctl(SIOCGIFINDEX
, &ifr
);
5959 /* ENODEV probably means that a vif disappeared asynchronously and
5960 * hasn't been removed from the database yet, so reduce the log level
5961 * to INFO for that case. */
5962 VLOG_RL(&rl
, error
== ENODEV
? VLL_INFO
: VLL_ERR
,
5963 "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5964 netdev_name
, ovs_strerror(error
));
5967 return ifr
.ifr_ifindex
;
5971 get_ifindex(const struct netdev
*netdev_
, int *ifindexp
)
5973 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5975 if (!(netdev
->cache_valid
& VALID_IFINDEX
)) {
5976 netdev_linux_update_via_netlink(netdev
);
5979 if (!(netdev
->cache_valid
& VALID_IFINDEX
)) {
5980 /* Fall back to ioctl if netlink fails */
5981 int ifindex
= linux_get_ifindex(netdev_get_name(netdev_
));
5984 netdev
->get_ifindex_error
= -ifindex
;
5985 netdev
->ifindex
= 0;
5987 netdev
->get_ifindex_error
= 0;
5988 netdev
->ifindex
= ifindex
;
5990 netdev
->cache_valid
|= VALID_IFINDEX
;
5993 *ifindexp
= netdev
->ifindex
;
5994 return netdev
->get_ifindex_error
;
5998 netdev_linux_update_via_netlink(struct netdev_linux
*netdev
)
6000 struct ofpbuf request
;
6001 struct ofpbuf
*reply
;
6002 struct rtnetlink_change chg
;
6003 struct rtnetlink_change
*change
= &chg
;
6006 ofpbuf_init(&request
, 0);
6007 nl_msg_put_nlmsghdr(&request
,
6008 sizeof(struct ifinfomsg
) + NL_ATTR_SIZE(IFNAMSIZ
) +
6009 NL_A_U32_SIZE
, RTM_GETLINK
, NLM_F_REQUEST
);
6010 ofpbuf_put_zeros(&request
, sizeof(struct ifinfomsg
));
6012 /* The correct identifiers for a Linux device are netnsid and ifindex,
6013 * but ifindex changes as the port is moved to another network namespace
6014 * and the interface name statically stored in ovsdb. */
6015 nl_msg_put_string(&request
, IFLA_IFNAME
, netdev_get_name(&netdev
->up
));
6016 if (netdev_linux_netnsid_is_remote(netdev
)) {
6017 nl_msg_put_u32(&request
, IFLA_IF_NETNSID
, netdev
->netnsid
);
6019 error
= nl_transact(NETLINK_ROUTE
, &request
, &reply
);
6020 ofpbuf_uninit(&request
);
6022 ofpbuf_delete(reply
);
6026 if (rtnetlink_parse(reply
, change
)
6027 && change
->nlmsg_type
== RTM_NEWLINK
) {
6028 bool changed
= false;
6031 /* Update netdev from rtnl msg and increment its seq if needed. */
6032 if ((change
->ifi_flags
^ netdev
->ifi_flags
) & IFF_RUNNING
) {
6033 netdev
->carrier_resets
++;
6036 if (change
->ifi_flags
!= netdev
->ifi_flags
) {
6037 netdev
->ifi_flags
= change
->ifi_flags
;
6040 if (change
->mtu
&& change
->mtu
!= netdev
->mtu
) {
6041 netdev
->mtu
= change
->mtu
;
6042 netdev
->cache_valid
|= VALID_MTU
;
6043 netdev
->netdev_mtu_error
= 0;
6046 if (!eth_addr_is_zero(change
->mac
)
6047 && !eth_addr_equals(change
->mac
, netdev
->etheraddr
)) {
6048 netdev
->etheraddr
= change
->mac
;
6049 netdev
->cache_valid
|= VALID_ETHERADDR
;
6050 netdev
->ether_addr_error
= 0;
6053 if (change
->if_index
!= netdev
->ifindex
) {
6054 netdev
->ifindex
= change
->if_index
;
6055 netdev
->cache_valid
|= VALID_IFINDEX
;
6056 netdev
->get_ifindex_error
= 0;
6059 if (change
->master
&& netdev_linux_kind_is_lag(change
->master
)) {
6060 netdev
->is_lag_master
= true;
6063 netdev_change_seq_changed(&netdev
->up
);
6069 ofpbuf_delete(reply
);
6074 get_etheraddr(const char *netdev_name
, struct eth_addr
*ea
)
6080 memset(&ifr
, 0, sizeof ifr
);
6081 ovs_strzcpy(ifr
.ifr_name
, netdev_name
, sizeof ifr
.ifr_name
);
6082 COVERAGE_INC(netdev_get_hwaddr
);
6083 error
= af_inet_ioctl(SIOCGIFHWADDR
, &ifr
);
6085 /* ENODEV probably means that a vif disappeared asynchronously and
6086 * hasn't been removed from the database yet, so reduce the log level
6087 * to INFO for that case. */
6088 VLOG(error
== ENODEV
? VLL_INFO
: VLL_ERR
,
6089 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
6090 netdev_name
, ovs_strerror(error
));
6093 hwaddr_family
= ifr
.ifr_hwaddr
.sa_family
;
6094 if (hwaddr_family
!= AF_UNSPEC
&& hwaddr_family
!= ARPHRD_ETHER
&&
6095 hwaddr_family
!= ARPHRD_NONE
) {
6096 VLOG_INFO("%s device has unknown hardware address family %d",
6097 netdev_name
, hwaddr_family
);
6100 memcpy(ea
, ifr
.ifr_hwaddr
.sa_data
, ETH_ADDR_LEN
);
6105 set_etheraddr(const char *netdev_name
, const struct eth_addr mac
)
6110 memset(&ifr
, 0, sizeof ifr
);
6111 ovs_strzcpy(ifr
.ifr_name
, netdev_name
, sizeof ifr
.ifr_name
);
6112 ifr
.ifr_hwaddr
.sa_family
= ARPHRD_ETHER
;
6113 memcpy(ifr
.ifr_hwaddr
.sa_data
, &mac
, ETH_ADDR_LEN
);
6114 COVERAGE_INC(netdev_set_hwaddr
);
6115 error
= af_inet_ioctl(SIOCSIFHWADDR
, &ifr
);
6117 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
6118 netdev_name
, ovs_strerror(error
));
6124 netdev_linux_do_ethtool(const char *name
, struct ethtool_cmd
*ecmd
,
6125 int cmd
, const char *cmd_name
)
6130 memset(&ifr
, 0, sizeof ifr
);
6131 ovs_strzcpy(ifr
.ifr_name
, name
, sizeof ifr
.ifr_name
);
6132 ifr
.ifr_data
= (caddr_t
) ecmd
;
6135 error
= af_inet_ioctl(SIOCETHTOOL
, &ifr
);
6137 if (error
!= EOPNOTSUPP
) {
6138 VLOG_WARN_RL(&rl
, "ethtool command %s on network device %s "
6139 "failed: %s", cmd_name
, name
, ovs_strerror(error
));
6141 /* The device doesn't support this operation. That's pretty
6142 * common, so there's no point in logging anything. */
6148 /* Returns an AF_PACKET raw socket or a negative errno value. */
6150 af_packet_sock(void)
6152 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
6155 if (ovsthread_once_start(&once
)) {
6156 sock
= socket(AF_PACKET
, SOCK_RAW
, 0);
6158 int error
= set_nonblocking(sock
);
6165 VLOG_ERR("failed to create packet socket: %s",
6166 ovs_strerror(errno
));
6168 ovsthread_once_done(&once
);