2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <sys/types.h>
24 #include <netinet/in.h>
25 #include <arpa/inet.h>
27 #include <linux/filter.h>
28 #include <linux/gen_stats.h>
29 #include <linux/if_ether.h>
30 #include <linux/if_tun.h>
31 #include <linux/types.h>
32 #include <linux/ethtool.h>
33 #include <linux/mii.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/ioctl.h>
37 #include <sys/socket.h>
38 #include <sys/utsname.h>
39 #include <netpacket/packet.h>
41 #include <net/if_arp.h>
42 #include <net/if_packet.h>
43 #include <net/route.h>
50 #include "dp-packet.h"
51 #include "dpif-netlink.h"
52 #include "dpif-netdev.h"
53 #include "openvswitch/dynamic-string.h"
54 #include "fatal-signal.h"
56 #include "openvswitch/hmap.h"
57 #include "netdev-provider.h"
58 #include "netdev-tc-offloads.h"
59 #include "netdev-vport.h"
60 #include "netlink-notifier.h"
61 #include "netlink-socket.h"
64 #include "openvswitch/ofpbuf.h"
65 #include "openflow/openflow.h"
66 #include "ovs-atomic.h"
68 #include "openvswitch/poll-loop.h"
69 #include "rtnetlink.h"
70 #include "openvswitch/shash.h"
71 #include "socket-util.h"
75 #include "unaligned.h"
76 #include "openvswitch/vlog.h"
79 VLOG_DEFINE_THIS_MODULE(netdev_linux
);
81 COVERAGE_DEFINE(netdev_set_policing
);
82 COVERAGE_DEFINE(netdev_arp_lookup
);
83 COVERAGE_DEFINE(netdev_get_ifindex
);
84 COVERAGE_DEFINE(netdev_get_hwaddr
);
85 COVERAGE_DEFINE(netdev_set_hwaddr
);
86 COVERAGE_DEFINE(netdev_get_ethtool
);
87 COVERAGE_DEFINE(netdev_set_ethtool
);
90 #ifndef IFLA_IF_NETNSID
91 #define IFLA_IF_NETNSID 0x45
93 /* These were introduced in Linux 2.6.14, so they might be missing if we have
95 #ifndef ADVERTISED_Pause
96 #define ADVERTISED_Pause (1 << 13)
98 #ifndef ADVERTISED_Asym_Pause
99 #define ADVERTISED_Asym_Pause (1 << 14)
102 /* These were introduced in Linux 2.6.24, so they might be missing if we
103 * have old headers. */
104 #ifndef ETHTOOL_GFLAGS
105 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
107 #ifndef ETHTOOL_SFLAGS
108 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
111 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
114 #define TC_RTAB_SIZE 1024
117 /* Linux 2.6.21 introduced struct tpacket_auxdata.
118 * Linux 2.6.27 added the tp_vlan_tci member.
119 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
120 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
121 * TP_STATUS_VLAN_TPID_VALID.
123 * With all this churn it's easiest to unconditionally define a replacement
124 * structure that has everything we want.
126 #ifndef PACKET_AUXDATA
127 #define PACKET_AUXDATA 8
129 #ifndef TP_STATUS_VLAN_VALID
130 #define TP_STATUS_VLAN_VALID (1 << 4)
132 #ifndef TP_STATUS_VLAN_TPID_VALID
133 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
135 #undef tpacket_auxdata
136 #define tpacket_auxdata rpl_tpacket_auxdata
137 struct tpacket_auxdata
{
143 uint16_t tp_vlan_tci
;
144 uint16_t tp_vlan_tpid
;
147 /* Linux 2.6.27 introduced ethtool_cmd_speed
149 * To avoid revisiting problems reported with using configure to detect
150 * compatibility (see report at
151 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html)
152 * unconditionally replace ethtool_cmd_speed. */
#define ethtool_cmd_speed rpl_ethtool_cmd_speed

/* Returns the link speed encoded in 'ep', reassembling the low 16 bits from
 * 'ep->speed' and the high 16 bits from 'ep->speed_hi'. */
static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
{
    /* Widen before shifting: 'speed_hi' promotes to signed int, so shifting
     * a value with bit 15 set left by 16 would overflow the int and be
     * undefined behavior.  Casting to uint32_t makes the shift well defined
     * for every possible 'speed_hi'. */
    return ep->speed | ((uint32_t) ep->speed_hi << 16);
}
159 /* Linux 2.6.30 introduced supported and advertised flags for
160 * 1G base KX, and 10G base KX4, KR and R. */
161 #ifndef SUPPORTED_1000baseKX_Full
162 #define SUPPORTED_1000baseKX_Full (1 << 17)
163 #define SUPPORTED_10000baseKX4_Full (1 << 18)
164 #define SUPPORTED_10000baseKR_Full (1 << 19)
165 #define SUPPORTED_10000baseR_FEC (1 << 20)
166 #define ADVERTISED_1000baseKX_Full (1 << 17)
167 #define ADVERTISED_10000baseKX4_Full (1 << 18)
168 #define ADVERTISED_10000baseKR_Full (1 << 19)
169 #define ADVERTISED_10000baseR_FEC (1 << 20)
172 /* Linux 3.5 introduced supported and advertised flags for
173 * 40G base KR4, CR4, SR4 and LR4. */
174 #ifndef SUPPORTED_40000baseKR4_Full
175 #define SUPPORTED_40000baseKR4_Full (1 << 23)
176 #define SUPPORTED_40000baseCR4_Full (1 << 24)
177 #define SUPPORTED_40000baseSR4_Full (1 << 25)
178 #define SUPPORTED_40000baseLR4_Full (1 << 26)
179 #define ADVERTISED_40000baseKR4_Full (1 << 23)
180 #define ADVERTISED_40000baseCR4_Full (1 << 24)
181 #define ADVERTISED_40000baseSR4_Full (1 << 25)
182 #define ADVERTISED_40000baseLR4_Full (1 << 26)
185 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
187 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
188 * 2.6.32-431.29.2.el6.x86_64 (see report at
189 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html).
190 * Maybe if_link.h is not self-contained on those kernels. It is easiest to
191 * unconditionally define a replacement. */
193 #define IFLA_STATS64 23
195 #define rtnl_link_stats64 rpl_rtnl_link_stats64
196 struct rtnl_link_stats64
{
208 uint64_t rx_length_errors
;
209 uint64_t rx_over_errors
;
210 uint64_t rx_crc_errors
;
211 uint64_t rx_frame_errors
;
212 uint64_t rx_fifo_errors
;
213 uint64_t rx_missed_errors
;
215 uint64_t tx_aborted_errors
;
216 uint64_t tx_carrier_errors
;
217 uint64_t tx_fifo_errors
;
218 uint64_t tx_heartbeat_errors
;
219 uint64_t tx_window_errors
;
221 uint64_t rx_compressed
;
222 uint64_t tx_compressed
;
226 VALID_IFINDEX
= 1 << 0,
227 VALID_ETHERADDR
= 1 << 1,
230 VALID_POLICING
= 1 << 4,
231 VALID_VPORT_STAT_ERROR
= 1 << 5,
232 VALID_DRVINFO
= 1 << 6,
233 VALID_FEATURES
= 1 << 7,
236 /* Traffic control. */
238 /* An instance of a traffic control class. Always associated with a particular
241 * Each TC implementation subclasses this with whatever additional data it
244 const struct tc_ops
*ops
;
245 struct hmap queues
; /* Contains "struct tc_queue"s.
246 * Read by generic TC layer.
247 * Written only by TC implementation. */
250 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
252 /* One traffic control queue.
254 * Each TC implementation subclasses this with whatever additional data it
257 struct hmap_node hmap_node
; /* In struct tc's "queues" hmap. */
258 unsigned int queue_id
; /* OpenFlow queue ID. */
259 long long int created
; /* Time queue was created, in msecs. */
262 /* A particular kind of traffic control. Each implementation generally maps to
263 * one particular Linux qdisc class.
265 * The functions below return 0 if successful or a positive errno value on
266 * failure, except where otherwise noted. All of them must be provided, except
267 * where otherwise noted. */
269 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
270 * This is null for tc_ops_default and tc_ops_other, for which there are no
271 * appropriate values. */
272 const char *linux_name
;
274 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
275 const char *ovs_name
;
277 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
278 * queues. The queues are numbered 0 through n_queues - 1. */
279 unsigned int n_queues
;
281 /* Called to install this TC class on 'netdev'. The implementation should
282 * make the Netlink calls required to set up 'netdev' with the right qdisc
283 * and configure it according to 'details'. The implementation may assume
284 * that the current qdisc is the default; that is, there is no need for it
285 * to delete the current qdisc before installing itself.
287 * The contents of 'details' should be documented as valid for 'ovs_name'
288 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
289 * (which is built as ovs-vswitchd.conf.db(8)).
291 * This function must return 0 if and only if it sets 'netdev->tc' to an
292 * initialized 'struct tc'.
294 * (This function is null for tc_ops_other, which cannot be installed. For
295 * other TC classes it should always be nonnull.) */
296 int (*tc_install
)(struct netdev
*netdev
, const struct smap
*details
);
298 /* Called when the netdev code determines (through a Netlink query) that
299 * this TC class's qdisc is installed on 'netdev', but we didn't install
300 * it ourselves and so don't know any of the details.
302 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
303 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
304 * implementation should parse the other attributes of 'nlmsg' as
305 * necessary to determine its configuration. If necessary it should also
306 * use Netlink queries to determine the configuration of queues on
309 * This function must return 0 if and only if it sets 'netdev->tc' to an
310 * initialized 'struct tc'. */
311 int (*tc_load
)(struct netdev
*netdev
, struct ofpbuf
*nlmsg
);
313 /* Destroys the data structures allocated by the implementation as part of
314 * 'tc'. (This includes destroying 'tc->queues' by calling
317 * The implementation should not need to perform any Netlink calls. If
318 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
319 * (But it may not be desirable.)
321 * This function may be null if 'tc' is trivial. */
322 void (*tc_destroy
)(struct tc
*tc
);
324 /* Retrieves details of 'netdev->tc' configuration into 'details'.
326 * The implementation should not need to perform any Netlink calls, because
327 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
328 * cached the configuration.
330 * The contents of 'details' should be documented as valid for 'ovs_name'
331 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
332 * (which is built as ovs-vswitchd.conf.db(8)).
334 * This function may be null if 'tc' is not configurable.
336 int (*qdisc_get
)(const struct netdev
*netdev
, struct smap
*details
);
338 /* Reconfigures 'netdev->tc' according to 'details', performing any
339 * required Netlink calls to complete the reconfiguration.
341 * The contents of 'details' should be documented as valid for 'ovs_name'
342 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
343 * (which is built as ovs-vswitchd.conf.db(8)).
345 * This function may be null if 'tc' is not configurable.
347 int (*qdisc_set
)(struct netdev
*, const struct smap
*details
);
349 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
350 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
352 * The contents of 'details' should be documented as valid for 'ovs_name'
353 * in the "other_config" column in the "Queue" table in
354 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
356 * The implementation should not need to perform any Netlink calls, because
357 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
358 * cached the queue configuration.
360 * This function may be null if 'tc' does not have queues ('n_queues' is
362 int (*class_get
)(const struct netdev
*netdev
, const struct tc_queue
*queue
,
363 struct smap
*details
);
365 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
366 * 'details', perfoming any required Netlink calls to complete the
367 * reconfiguration. The caller ensures that 'queue_id' is less than
370 * The contents of 'details' should be documented as valid for 'ovs_name'
371 * in the "other_config" column in the "Queue" table in
372 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
374 * This function may be null if 'tc' does not have queues or its queues are
375 * not configurable. */
376 int (*class_set
)(struct netdev
*, unsigned int queue_id
,
377 const struct smap
*details
);
379 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
380 * tc_queue's within 'netdev->tc->queues'.
382 * This function may be null if 'tc' does not have queues or its queues
383 * cannot be deleted. */
384 int (*class_delete
)(struct netdev
*, struct tc_queue
*queue
);
386 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
387 * 'struct tc_queue's within 'netdev->tc->queues'.
389 * On success, initializes '*stats'.
391 * This function may be null if 'tc' does not have queues or if it cannot
392 * report queue statistics. */
393 int (*class_get_stats
)(const struct netdev
*netdev
,
394 const struct tc_queue
*queue
,
395 struct netdev_queue_stats
*stats
);
397 /* Extracts queue stats from 'nlmsg', which is a response to a
398 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
400 * This function may be null if 'tc' does not have queues or if it cannot
401 * report queue statistics. */
402 int (*class_dump_stats
)(const struct netdev
*netdev
,
403 const struct ofpbuf
*nlmsg
,
404 netdev_dump_queue_stats_cb
*cb
, void *aux
);
408 tc_init(struct tc
*tc
, const struct tc_ops
*ops
)
411 hmap_init(&tc
->queues
);
415 tc_destroy(struct tc
*tc
)
417 hmap_destroy(&tc
->queues
);
420 static const struct tc_ops tc_ops_htb
;
421 static const struct tc_ops tc_ops_hfsc
;
422 static const struct tc_ops tc_ops_codel
;
423 static const struct tc_ops tc_ops_fqcodel
;
424 static const struct tc_ops tc_ops_sfq
;
425 static const struct tc_ops tc_ops_default
;
426 static const struct tc_ops tc_ops_noop
;
427 static const struct tc_ops tc_ops_other
;
429 static const struct tc_ops
*const tcs
[] = {
430 &tc_ops_htb
, /* Hierarchy token bucket (see tc-htb(8)). */
431 &tc_ops_hfsc
, /* Hierarchical fair service curve. */
432 &tc_ops_codel
, /* Controlled delay */
433 &tc_ops_fqcodel
, /* Fair queue controlled delay */
434 &tc_ops_sfq
, /* Stochastic fair queueing */
435 &tc_ops_noop
, /* Non operating qos type. */
436 &tc_ops_default
, /* Default qdisc (see tc-pfifo_fast(8)). */
437 &tc_ops_other
, /* Some other qdisc. */
441 static unsigned int tc_ticks_to_bytes(unsigned int rate
, unsigned int ticks
);
442 static unsigned int tc_bytes_to_ticks(unsigned int rate
, unsigned int size
);
443 static unsigned int tc_buffer_per_jiffy(unsigned int rate
);
445 static struct tcmsg
*netdev_linux_tc_make_request(const struct netdev
*,
449 static int tc_add_policer(struct netdev
*,
450 uint32_t kbits_rate
, uint32_t kbits_burst
);
452 static int tc_parse_qdisc(const struct ofpbuf
*, const char **kind
,
453 struct nlattr
**options
);
454 static int tc_parse_class(const struct ofpbuf
*, unsigned int *queue_id
,
455 struct nlattr
**options
,
456 struct netdev_queue_stats
*);
457 static int tc_query_class(const struct netdev
*,
458 unsigned int handle
, unsigned int parent
,
459 struct ofpbuf
**replyp
);
460 static int tc_delete_class(const struct netdev
*, unsigned int handle
);
462 static int tc_del_qdisc(struct netdev
*netdev
);
463 static int tc_query_qdisc(const struct netdev
*netdev
);
465 static int tc_calc_cell_log(unsigned int mtu
);
466 static void tc_fill_rate(struct tc_ratespec
*rate
, uint64_t bps
, int mtu
);
467 static void tc_put_rtab(struct ofpbuf
*, uint16_t type
,
468 const struct tc_ratespec
*rate
);
469 static int tc_calc_buffer(unsigned int Bps
, int mtu
, uint64_t burst_bytes
);
471 struct netdev_linux
{
474 /* Protects all members below. */
475 struct ovs_mutex mutex
;
477 unsigned int cache_valid
;
479 bool miimon
; /* Link status of last poll. */
480 long long int miimon_interval
; /* Miimon Poll rate. Disabled if <= 0. */
481 struct timer miimon_timer
;
483 int netnsid
; /* Network namespace ID. */
484 /* The following are figured out "on demand" only. They are only valid
485 * when the corresponding VALID_* bit in 'cache_valid' is set. */
487 struct eth_addr etheraddr
;
489 unsigned int ifi_flags
;
490 long long int carrier_resets
;
491 uint32_t kbits_rate
; /* Policing data. */
492 uint32_t kbits_burst
;
493 int vport_stats_error
; /* Cached error code from vport_get_stats().
494 0 or an errno value. */
495 int netdev_mtu_error
; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
496 int ether_addr_error
; /* Cached error code from set/get etheraddr. */
497 int netdev_policing_error
; /* Cached error code from set policing. */
498 int get_features_error
; /* Cached error code from ETHTOOL_GSET. */
499 int get_ifindex_error
; /* Cached error code from SIOCGIFINDEX. */
501 enum netdev_features current
; /* Cached from ETHTOOL_GSET. */
502 enum netdev_features advertised
; /* Cached from ETHTOOL_GSET. */
503 enum netdev_features supported
; /* Cached from ETHTOOL_GSET. */
505 struct ethtool_drvinfo drvinfo
; /* Cached from ETHTOOL_GDRVINFO. */
508 /* For devices of class netdev_tap_class only. */
510 bool present
; /* If the device is present in the namespace */
511 uint64_t tx_dropped
; /* tap device can drop if the iface is down */
514 struct netdev_rxq_linux
{
515 struct netdev_rxq up
;
520 /* This is set pretty low because we probably won't learn anything from the
521 * additional log messages. */
522 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(5, 20);
524 /* Polling miimon status for all ports causes performance degradation when
525 * handling a large number of ports. If there are no devices using miimon, then
526 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
528 * Readers do not depend on this variable synchronizing with the related
529 * changes in the device miimon status, so we can use atomic_count. */
530 static atomic_count miimon_cnt
= ATOMIC_COUNT_INIT(0);
532 static void netdev_linux_run(const struct netdev_class
*);
534 static int netdev_linux_do_ethtool(const char *name
, struct ethtool_cmd
*,
535 int cmd
, const char *cmd_name
);
536 static int get_flags(const struct netdev
*, unsigned int *flags
);
537 static int set_flags(const char *, unsigned int flags
);
538 static int update_flags(struct netdev_linux
*netdev
, enum netdev_flags off
,
539 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
540 OVS_REQUIRES(netdev
->mutex
);
541 static int get_ifindex(const struct netdev
*, int *ifindexp
);
542 static int do_set_addr(struct netdev
*netdev
,
543 int ioctl_nr
, const char *ioctl_name
,
544 struct in_addr addr
);
545 static int get_etheraddr(const char *netdev_name
, struct eth_addr
*ea
);
546 static int set_etheraddr(const char *netdev_name
, const struct eth_addr
);
547 static int get_stats_via_netlink(const struct netdev
*, struct netdev_stats
*);
548 static int af_packet_sock(void);
549 static bool netdev_linux_miimon_enabled(void);
550 static void netdev_linux_miimon_run(void);
551 static void netdev_linux_miimon_wait(void);
552 static int netdev_linux_get_mtu__(struct netdev_linux
*netdev
, int *mtup
);
555 is_netdev_linux_class(const struct netdev_class
*netdev_class
)
557 return netdev_class
->run
== netdev_linux_run
;
561 is_tap_netdev(const struct netdev
*netdev
)
563 return netdev_get_class(netdev
) == &netdev_tap_class
;
566 static struct netdev_linux
*
567 netdev_linux_cast(const struct netdev
*netdev
)
569 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev
)));
571 return CONTAINER_OF(netdev
, struct netdev_linux
, up
);
574 static struct netdev_rxq_linux
*
575 netdev_rxq_linux_cast(const struct netdev_rxq
*rx
)
577 ovs_assert(is_netdev_linux_class(netdev_get_class(rx
->netdev
)));
578 return CONTAINER_OF(rx
, struct netdev_rxq_linux
, up
);
582 netdev_linux_netnsid_update__(struct netdev_linux
*netdev
)
584 struct dpif_netlink_vport reply
;
588 error
= dpif_netlink_vport_get(netdev_get_name(&netdev
->up
), &reply
, &buf
);
590 netnsid_unset(&netdev
->netnsid
);
594 netnsid_set(&netdev
->netnsid
, reply
.netnsid
);
600 netdev_linux_netnsid_update(struct netdev_linux
*netdev
)
602 if (netnsid_is_unset(netdev
->netnsid
)) {
603 return netdev_linux_netnsid_update__(netdev
);
610 netdev_linux_netnsid_is_eq(struct netdev_linux
*netdev
, int nsid
)
612 netdev_linux_netnsid_update(netdev
);
613 return netnsid_eq(netdev
->netnsid
, nsid
);
617 netdev_linux_netnsid_is_remote(struct netdev_linux
*netdev
)
619 netdev_linux_netnsid_update(netdev
);
620 return netnsid_is_remote(netdev
->netnsid
);
623 static int netdev_linux_update_via_netlink(struct netdev_linux
*);
624 static void netdev_linux_update(struct netdev_linux
*netdev
, int,
625 const struct rtnetlink_change
*)
626 OVS_REQUIRES(netdev
->mutex
);
627 static void netdev_linux_changed(struct netdev_linux
*netdev
,
628 unsigned int ifi_flags
, unsigned int mask
)
629 OVS_REQUIRES(netdev
->mutex
);
631 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
632 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
633 * if no such socket could be created. */
634 static struct nl_sock
*
635 netdev_linux_notify_sock(void)
637 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
638 static struct nl_sock
*sock
;
639 unsigned int mcgroups
[] = {RTNLGRP_LINK
, RTNLGRP_IPV4_IFADDR
,
640 RTNLGRP_IPV6_IFADDR
, RTNLGRP_IPV6_IFINFO
};
642 if (ovsthread_once_start(&once
)) {
645 error
= nl_sock_create(NETLINK_ROUTE
, &sock
);
649 for (i
= 0; i
< ARRAY_SIZE(mcgroups
); i
++) {
650 error
= nl_sock_join_mcgroup(sock
, mcgroups
[i
]);
652 nl_sock_destroy(sock
);
658 nl_sock_listen_all_nsid(sock
, true);
659 ovsthread_once_done(&once
);
666 netdev_linux_miimon_enabled(void)
668 return atomic_count_get(&miimon_cnt
) > 0;
672 netdev_linux_run(const struct netdev_class
*netdev_class OVS_UNUSED
)
674 struct nl_sock
*sock
;
677 if (netdev_linux_miimon_enabled()) {
678 netdev_linux_miimon_run();
681 sock
= netdev_linux_notify_sock();
687 uint64_t buf_stub
[4096 / 8];
691 ofpbuf_use_stub(&buf
, buf_stub
, sizeof buf_stub
);
692 error
= nl_sock_recv(sock
, &buf
, &nsid
, false);
694 struct rtnetlink_change change
;
696 if (rtnetlink_parse(&buf
, &change
)) {
697 struct netdev
*netdev_
= NULL
;
698 char dev_name
[IFNAMSIZ
];
700 if (!change
.ifname
) {
701 change
.ifname
= if_indextoname(change
.if_index
, dev_name
);
705 netdev_
= netdev_from_name(change
.ifname
);
707 if (netdev_
&& is_netdev_linux_class(netdev_
->netdev_class
)) {
708 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
710 ovs_mutex_lock(&netdev
->mutex
);
711 netdev_linux_update(netdev
, nsid
, &change
);
712 ovs_mutex_unlock(&netdev
->mutex
);
714 netdev_close(netdev_
);
716 } else if (error
== ENOBUFS
) {
717 struct shash device_shash
;
718 struct shash_node
*node
;
722 shash_init(&device_shash
);
723 netdev_get_devices(&netdev_linux_class
, &device_shash
);
724 SHASH_FOR_EACH (node
, &device_shash
) {
725 struct netdev
*netdev_
= node
->data
;
726 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
729 ovs_mutex_lock(&netdev
->mutex
);
730 get_flags(netdev_
, &flags
);
731 netdev_linux_changed(netdev
, flags
, 0);
732 ovs_mutex_unlock(&netdev
->mutex
);
734 netdev_close(netdev_
);
736 shash_destroy(&device_shash
);
737 } else if (error
!= EAGAIN
) {
738 static struct vlog_rate_limit rll
= VLOG_RATE_LIMIT_INIT(1, 5);
739 VLOG_WARN_RL(&rll
, "error reading or parsing netlink (%s)",
740 ovs_strerror(error
));
747 netdev_linux_wait(const struct netdev_class
*netdev_class OVS_UNUSED
)
749 struct nl_sock
*sock
;
751 if (netdev_linux_miimon_enabled()) {
752 netdev_linux_miimon_wait();
754 sock
= netdev_linux_notify_sock();
756 nl_sock_wait(sock
, POLLIN
);
761 netdev_linux_changed(struct netdev_linux
*dev
,
762 unsigned int ifi_flags
, unsigned int mask
)
763 OVS_REQUIRES(dev
->mutex
)
765 netdev_change_seq_changed(&dev
->up
);
767 if ((dev
->ifi_flags
^ ifi_flags
) & IFF_RUNNING
) {
768 dev
->carrier_resets
++;
770 dev
->ifi_flags
= ifi_flags
;
772 dev
->cache_valid
&= mask
;
773 if (!(mask
& VALID_IN
)) {
774 netdev_get_addrs_list_flush();
779 netdev_linux_update__(struct netdev_linux
*dev
,
780 const struct rtnetlink_change
*change
)
781 OVS_REQUIRES(dev
->mutex
)
783 if (rtnetlink_type_is_rtnlgrp_link(change
->nlmsg_type
)) {
784 if (change
->nlmsg_type
== RTM_NEWLINK
) {
785 /* Keep drv-info, and ip addresses. */
786 netdev_linux_changed(dev
, change
->ifi_flags
,
787 VALID_DRVINFO
| VALID_IN
);
789 /* Update netdev from rtnl-change msg. */
791 dev
->mtu
= change
->mtu
;
792 dev
->cache_valid
|= VALID_MTU
;
793 dev
->netdev_mtu_error
= 0;
796 if (!eth_addr_is_zero(change
->mac
)) {
797 dev
->etheraddr
= change
->mac
;
798 dev
->cache_valid
|= VALID_ETHERADDR
;
799 dev
->ether_addr_error
= 0;
801 /* The mac addr has been changed, report it now. */
802 rtnetlink_report_link();
805 dev
->ifindex
= change
->if_index
;
806 dev
->cache_valid
|= VALID_IFINDEX
;
807 dev
->get_ifindex_error
= 0;
811 netdev_linux_changed(dev
, change
->ifi_flags
, 0);
812 dev
->present
= false;
813 netnsid_unset(&dev
->netnsid
);
815 } else if (rtnetlink_type_is_rtnlgrp_addr(change
->nlmsg_type
)) {
816 /* Invalidates in4, in6. */
817 netdev_linux_changed(dev
, dev
->ifi_flags
, ~VALID_IN
);
824 netdev_linux_update(struct netdev_linux
*dev
, int nsid
,
825 const struct rtnetlink_change
*change
)
826 OVS_REQUIRES(dev
->mutex
)
828 if (netdev_linux_netnsid_is_eq(dev
, nsid
)) {
829 netdev_linux_update__(dev
, change
);
833 static struct netdev
*
834 netdev_linux_alloc(void)
836 struct netdev_linux
*netdev
= xzalloc(sizeof *netdev
);
841 netdev_linux_common_construct(struct netdev
*netdev_
)
843 /* Prevent any attempt to create (or open) a network device named "default"
844 * or "all". These device names are effectively reserved on Linux because
845 * /proc/sys/net/ipv4/conf/ always contains directories by these names. By
846 * itself this wouldn't call for any special treatment, but in practice if
847 * a program tries to create devices with these names, it causes the kernel
848 * to fire a "new device" notification event even though creation failed,
849 * and in turn that causes OVS to wake up and try to create them again,
850 * which ends up as a 100% CPU loop. */
851 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
852 const char *name
= netdev_
->name
;
853 if (!strcmp(name
, "default") || !strcmp(name
, "all")) {
854 static struct vlog_rate_limit rll
= VLOG_RATE_LIMIT_INIT(1, 1);
855 VLOG_WARN_RL(&rll
, "%s: Linux forbids network device with this name",
860 /* The device could be in the same network namespace or in another one. */
861 netnsid_unset(&netdev
->netnsid
);
862 ovs_mutex_init(&netdev
->mutex
);
866 /* Creates system and internal devices. */
868 netdev_linux_construct(struct netdev
*netdev_
)
870 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
871 int error
= netdev_linux_common_construct(netdev_
);
876 error
= get_flags(&netdev
->up
, &netdev
->ifi_flags
);
877 if (error
== ENODEV
) {
878 if (netdev
->up
.netdev_class
!= &netdev_internal_class
) {
879 /* The device does not exist, so don't allow it to be opened. */
882 /* "Internal" netdevs have to be created as netdev objects before
883 * they exist in the kernel, because creating them in the kernel
884 * happens by passing a netdev object to dpif_port_add().
885 * Therefore, ignore the error. */
892 /* For most types of netdevs we open the device for each call of
893 * netdev_open(). However, this is not the case with tap devices,
894 * since it is only possible to open the device once. In this
895 * situation we share a single file descriptor, and consequently
896 * buffers, across all readers. Therefore once data is read it will
897 * be unavailable to other reads for tap devices. */
899 netdev_linux_construct_tap(struct netdev
*netdev_
)
901 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
902 static const char tap_dev
[] = "/dev/net/tun";
903 const char *name
= netdev_
->name
;
906 int error
= netdev_linux_common_construct(netdev_
);
911 /* Open tap device. */
912 netdev
->tap_fd
= open(tap_dev
, O_RDWR
);
913 if (netdev
->tap_fd
< 0) {
915 VLOG_WARN("opening \"%s\" failed: %s", tap_dev
, ovs_strerror(error
));
919 /* Create tap device. */
920 get_flags(&netdev
->up
, &netdev
->ifi_flags
);
921 ifr
.ifr_flags
= IFF_TAP
| IFF_NO_PI
;
922 ovs_strzcpy(ifr
.ifr_name
, name
, sizeof ifr
.ifr_name
);
923 if (ioctl(netdev
->tap_fd
, TUNSETIFF
, &ifr
) == -1) {
924 VLOG_WARN("%s: creating tap device failed: %s", name
,
925 ovs_strerror(errno
));
930 /* Make non-blocking. */
931 error
= set_nonblocking(netdev
->tap_fd
);
936 if (ioctl(netdev
->tap_fd
, TUNSETPERSIST
, 1)) {
937 VLOG_WARN("%s: creating tap device failed (persist): %s", name
,
938 ovs_strerror(errno
));
946 close(netdev
->tap_fd
);
951 netdev_linux_destruct(struct netdev
*netdev_
)
953 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
955 if (netdev
->tc
&& netdev
->tc
->ops
->tc_destroy
) {
956 netdev
->tc
->ops
->tc_destroy(netdev
->tc
);
959 if (netdev_get_class(netdev_
) == &netdev_tap_class
960 && netdev
->tap_fd
>= 0)
962 ioctl(netdev
->tap_fd
, TUNSETPERSIST
, 0);
963 close(netdev
->tap_fd
);
966 if (netdev
->miimon_interval
> 0) {
967 atomic_count_dec(&miimon_cnt
);
970 ovs_mutex_destroy(&netdev
->mutex
);
974 netdev_linux_dealloc(struct netdev
*netdev_
)
976 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
980 static struct netdev_rxq
*
981 netdev_linux_rxq_alloc(void)
983 struct netdev_rxq_linux
*rx
= xzalloc(sizeof *rx
);
988 netdev_linux_rxq_construct(struct netdev_rxq
*rxq_
)
990 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
991 struct netdev
*netdev_
= rx
->up
.netdev
;
992 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
995 ovs_mutex_lock(&netdev
->mutex
);
996 rx
->is_tap
= is_tap_netdev(netdev_
);
998 rx
->fd
= netdev
->tap_fd
;
1000 struct sockaddr_ll sll
;
1002 /* Result of tcpdump -dd inbound */
1003 static const struct sock_filter filt
[] = {
1004 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
1005 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
1006 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
1007 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
1009 static const struct sock_fprog fprog
= {
1010 ARRAY_SIZE(filt
), (struct sock_filter
*) filt
1013 /* Create file descriptor. */
1014 rx
->fd
= socket(PF_PACKET
, SOCK_RAW
, 0);
1017 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error
));
1022 if (setsockopt(rx
->fd
, SOL_PACKET
, PACKET_AUXDATA
, &val
, sizeof val
)) {
1024 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
1025 netdev_get_name(netdev_
), ovs_strerror(error
));
1029 /* Set non-blocking mode. */
1030 error
= set_nonblocking(rx
->fd
);
1035 /* Get ethernet device index. */
1036 error
= get_ifindex(&netdev
->up
, &ifindex
);
1041 /* Bind to specific ethernet device. */
1042 memset(&sll
, 0, sizeof sll
);
1043 sll
.sll_family
= AF_PACKET
;
1044 sll
.sll_ifindex
= ifindex
;
1045 sll
.sll_protocol
= htons(ETH_P_ALL
);
1046 if (bind(rx
->fd
, (struct sockaddr
*) &sll
, sizeof sll
) < 0) {
1048 VLOG_ERR("%s: failed to bind raw socket (%s)",
1049 netdev_get_name(netdev_
), ovs_strerror(error
));
1053 /* Filter for only inbound packets. */
1054 error
= setsockopt(rx
->fd
, SOL_SOCKET
, SO_ATTACH_FILTER
, &fprog
,
1058 VLOG_ERR("%s: failed to attach filter (%s)",
1059 netdev_get_name(netdev_
), ovs_strerror(error
));
1063 ovs_mutex_unlock(&netdev
->mutex
);
1071 ovs_mutex_unlock(&netdev
->mutex
);
1076 netdev_linux_rxq_destruct(struct netdev_rxq
*rxq_
)
1078 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1086 netdev_linux_rxq_dealloc(struct netdev_rxq
*rxq_
)
1088 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1094 auxdata_to_vlan_tpid(const struct tpacket_auxdata
*aux
, bool double_tagged
)
1096 if (aux
->tp_status
& TP_STATUS_VLAN_TPID_VALID
) {
1097 return htons(aux
->tp_vlan_tpid
);
1098 } else if (double_tagged
) {
1099 return htons(ETH_TYPE_VLAN_8021AD
);
1101 return htons(ETH_TYPE_VLAN_8021Q
);
/* Returns true if 'aux' carries a VLAN tag worth restoring: either a nonzero
 * TCI, or the kernel explicitly flagged the (possibly zero) TCI as valid. */
static bool
auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
{
    return aux->tp_vlan_tci || (aux->tp_status & TP_STATUS_VLAN_VALID);
}
1112 netdev_linux_rxq_recv_sock(int fd
, struct dp_packet
*buffer
)
1117 struct cmsghdr
*cmsg
;
1119 struct cmsghdr cmsg
;
1120 char buffer
[CMSG_SPACE(sizeof(struct tpacket_auxdata
))];
1124 /* Reserve headroom for a single VLAN tag */
1125 dp_packet_reserve(buffer
, VLAN_HEADER_LEN
);
1126 size
= dp_packet_tailroom(buffer
);
1128 iov
.iov_base
= dp_packet_data(buffer
);
1130 msgh
.msg_name
= NULL
;
1131 msgh
.msg_namelen
= 0;
1132 msgh
.msg_iov
= &iov
;
1133 msgh
.msg_iovlen
= 1;
1134 msgh
.msg_control
= &cmsg_buffer
;
1135 msgh
.msg_controllen
= sizeof cmsg_buffer
;
1139 retval
= recvmsg(fd
, &msgh
, MSG_TRUNC
);
1140 } while (retval
< 0 && errno
== EINTR
);
1144 } else if (retval
> size
) {
1148 dp_packet_set_size(buffer
, dp_packet_size(buffer
) + retval
);
1150 for (cmsg
= CMSG_FIRSTHDR(&msgh
); cmsg
; cmsg
= CMSG_NXTHDR(&msgh
, cmsg
)) {
1151 const struct tpacket_auxdata
*aux
;
1153 if (cmsg
->cmsg_level
!= SOL_PACKET
1154 || cmsg
->cmsg_type
!= PACKET_AUXDATA
1155 || cmsg
->cmsg_len
< CMSG_LEN(sizeof(struct tpacket_auxdata
))) {
1159 aux
= ALIGNED_CAST(struct tpacket_auxdata
*, CMSG_DATA(cmsg
));
1160 if (auxdata_has_vlan_tci(aux
)) {
1161 struct eth_header
*eth
;
1164 if (retval
< ETH_HEADER_LEN
) {
1168 eth
= dp_packet_data(buffer
);
1169 double_tagged
= eth
->eth_type
== htons(ETH_TYPE_VLAN_8021Q
);
1171 eth_push_vlan(buffer
, auxdata_to_vlan_tpid(aux
, double_tagged
),
1172 htons(aux
->tp_vlan_tci
));
/* Receives one packet from tap device fd 'fd' into 'buffer', appending the
 * bytes after any data already in 'buffer'.  Returns 0 on success, otherwise
 * the errno from read().  Retries on EINTR. */
static int
netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
{
    ssize_t retval;
    size_t size = dp_packet_tailroom(buffer);

    do {
        retval = read(fd, dp_packet_data(buffer), size);
    } while (retval < 0 && errno == EINTR);

    if (retval < 0) {
        return errno;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
    return 0;
}
1199 netdev_linux_rxq_recv(struct netdev_rxq
*rxq_
, struct dp_packet_batch
*batch
)
1201 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1202 struct netdev
*netdev
= rx
->up
.netdev
;
1203 struct dp_packet
*buffer
;
1207 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
)) {
1208 mtu
= ETH_PAYLOAD_MAX
;
1211 /* Assume Ethernet port. No need to set packet_type. */
1212 buffer
= dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN
+ mtu
,
1213 DP_NETDEV_HEADROOM
);
1214 retval
= (rx
->is_tap
1215 ? netdev_linux_rxq_recv_tap(rx
->fd
, buffer
)
1216 : netdev_linux_rxq_recv_sock(rx
->fd
, buffer
));
1219 if (retval
!= EAGAIN
&& retval
!= EMSGSIZE
) {
1220 VLOG_WARN_RL(&rl
, "error receiving Ethernet packet on %s: %s",
1221 netdev_rxq_get_name(rxq_
), ovs_strerror(errno
));
1223 dp_packet_delete(buffer
);
1225 dp_packet_batch_init_packet(batch
, buffer
);
1232 netdev_linux_rxq_wait(struct netdev_rxq
*rxq_
)
1234 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1235 poll_fd_wait(rx
->fd
, POLLIN
);
1239 netdev_linux_rxq_drain(struct netdev_rxq
*rxq_
)
1241 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1244 int error
= af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_
), &ifr
,
1245 SIOCGIFTXQLEN
, "SIOCGIFTXQLEN");
1249 drain_fd(rx
->fd
, ifr
.ifr_qlen
);
1252 return drain_rcvbuf(rx
->fd
);
1257 netdev_linux_sock_batch_send(int sock
, int ifindex
,
1258 struct dp_packet_batch
*batch
)
1260 const size_t size
= dp_packet_batch_size(batch
);
1261 /* We don't bother setting most fields in sockaddr_ll because the
1262 * kernel ignores them for SOCK_RAW. */
1263 struct sockaddr_ll sll
= { .sll_family
= AF_PACKET
,
1264 .sll_ifindex
= ifindex
};
1266 struct mmsghdr
*mmsg
= xmalloc(sizeof(*mmsg
) * size
);
1267 struct iovec
*iov
= xmalloc(sizeof(*iov
) * size
);
1269 struct dp_packet
*packet
;
1270 DP_PACKET_BATCH_FOR_EACH (i
, packet
, batch
) {
1271 iov
[i
].iov_base
= dp_packet_data(packet
);
1272 iov
[i
].iov_len
= dp_packet_size(packet
);
1273 mmsg
[i
].msg_hdr
= (struct msghdr
) { .msg_name
= &sll
,
1274 .msg_namelen
= sizeof sll
,
1280 for (uint32_t ofs
= 0; ofs
< size
; ) {
1283 retval
= sendmmsg(sock
, mmsg
+ ofs
, size
- ofs
, 0);
1284 error
= retval
< 0 ? errno
: 0;
1285 } while (error
== EINTR
);
1297 /* Use the tap fd to send 'batch' to tap device 'netdev'. Using the tap fd is
1298 * essential, because packets sent to a tap device with an AF_PACKET socket
1299 * will loop back to be *received* again on the tap device. This doesn't occur
1300 * on other interface types because we attach a socket filter to the rx
1303 netdev_linux_tap_batch_send(struct netdev
*netdev_
,
1304 struct dp_packet_batch
*batch
)
1306 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1307 struct dp_packet
*packet
;
1309 /* The Linux tap driver returns EIO if the device is not up,
1310 * so if the device is not up, don't waste time sending it.
1311 * However, if the device is in another network namespace
1312 * then OVS can't retrieve the state. In that case, send the
1313 * packets anyway. */
1314 if (netdev
->present
&& !(netdev
->ifi_flags
& IFF_UP
)) {
1315 netdev
->tx_dropped
+= dp_packet_batch_size(batch
);
1319 DP_PACKET_BATCH_FOR_EACH (i
, packet
, batch
) {
1320 size_t size
= dp_packet_size(packet
);
1325 retval
= write(netdev
->tap_fd
, dp_packet_data(packet
), size
);
1326 error
= retval
< 0 ? errno
: 0;
1327 } while (error
== EINTR
);
1330 /* The Linux tap driver returns EIO if the device is not up. From
1331 * the OVS side this is not an error, so we ignore it; otherwise,
1332      * return the error. */
1336 } else if (retval
!= size
) {
1337 VLOG_WARN_RL(&rl
, "sent partial Ethernet packet (%"PRIuSIZE
" "
1338 "bytes of %"PRIuSIZE
") on %s",
1339 retval
, size
, netdev_get_name(netdev_
));
1346 /* Sends 'batch' on 'netdev'. Returns 0 if successful, otherwise a positive
1347 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1348 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1349 * the packet is too big or too small to transmit on the device.
1351 * The kernel maintains a packet transmission queue, so the caller is not
1352 * expected to do additional queuing of packets. */
1354 netdev_linux_send(struct netdev
*netdev_
, int qid OVS_UNUSED
,
1355 struct dp_packet_batch
*batch
,
1356 bool concurrent_txq OVS_UNUSED
)
1361 if (!is_tap_netdev(netdev_
)) {
1362 sock
= af_packet_sock();
1368 int ifindex
= netdev_get_ifindex(netdev_
);
1374 error
= netdev_linux_sock_batch_send(sock
, ifindex
, batch
);
1376 error
= netdev_linux_tap_batch_send(netdev_
, batch
);
1379 if (error
== ENOBUFS
) {
1380 /* The Linux AF_PACKET implementation never blocks waiting
1381 * for room for packets, instead returning ENOBUFS.
1382 * Translate this into EAGAIN for the caller. */
1385 VLOG_WARN_RL(&rl
, "error sending Ethernet packet on %s: %s",
1386 netdev_get_name(netdev_
), ovs_strerror(error
));
1391 dp_packet_delete_batch(batch
, true);
1395 /* Registers with the poll loop to wake up from the next call to poll_block()
1396 * when the packet transmission queue has sufficient room to transmit a packet
1397 * with netdev_send().
1399 * The kernel maintains a packet transmission queue, so the client is not
1400 * expected to do additional queuing of packets. Thus, this function is
1401 * unlikely to ever be used. It is included for completeness. */
1403 netdev_linux_send_wait(struct netdev
*netdev
, int qid OVS_UNUSED
)
1405 if (is_tap_netdev(netdev
)) {
1406 /* TAP device always accepts packets.*/
1407 poll_immediate_wake();
1411 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1412 * otherwise a positive errno value. */
1414 netdev_linux_set_etheraddr(struct netdev
*netdev_
, const struct eth_addr mac
)
1416 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1417 enum netdev_flags old_flags
= 0;
1420 ovs_mutex_lock(&netdev
->mutex
);
1422 if (netdev
->cache_valid
& VALID_ETHERADDR
) {
1423 error
= netdev
->ether_addr_error
;
1424 if (error
|| eth_addr_equals(netdev
->etheraddr
, mac
)) {
1427 netdev
->cache_valid
&= ~VALID_ETHERADDR
;
1430 /* Tap devices must be brought down before setting the address. */
1431 if (is_tap_netdev(netdev_
)) {
1432 update_flags(netdev
, NETDEV_UP
, 0, &old_flags
);
1434 error
= set_etheraddr(netdev_get_name(netdev_
), mac
);
1435 if (!error
|| error
== ENODEV
) {
1436 netdev
->ether_addr_error
= error
;
1437 netdev
->cache_valid
|= VALID_ETHERADDR
;
1439 netdev
->etheraddr
= mac
;
1443 if (is_tap_netdev(netdev_
) && old_flags
& NETDEV_UP
) {
1444 update_flags(netdev
, 0, NETDEV_UP
, &old_flags
);
1448 ovs_mutex_unlock(&netdev
->mutex
);
1452 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1454 netdev_linux_get_etheraddr(const struct netdev
*netdev_
, struct eth_addr
*mac
)
1456 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1459 ovs_mutex_lock(&netdev
->mutex
);
1460 if (!(netdev
->cache_valid
& VALID_ETHERADDR
)) {
1461 netdev_linux_update_via_netlink(netdev
);
1464 if (!(netdev
->cache_valid
& VALID_ETHERADDR
)) {
1465 /* Fall back to ioctl if netlink fails */
1466 netdev
->ether_addr_error
= get_etheraddr(netdev_get_name(netdev_
),
1467 &netdev
->etheraddr
);
1468 netdev
->cache_valid
|= VALID_ETHERADDR
;
1471 error
= netdev
->ether_addr_error
;
1473 *mac
= netdev
->etheraddr
;
1475 ovs_mutex_unlock(&netdev
->mutex
);
1481 netdev_linux_get_mtu__(struct netdev_linux
*netdev
, int *mtup
)
1485 if (!(netdev
->cache_valid
& VALID_MTU
)) {
1486 netdev_linux_update_via_netlink(netdev
);
1489 if (!(netdev
->cache_valid
& VALID_MTU
)) {
1490 /* Fall back to ioctl if netlink fails */
1493 netdev
->netdev_mtu_error
= af_inet_ifreq_ioctl(
1494 netdev_get_name(&netdev
->up
), &ifr
, SIOCGIFMTU
, "SIOCGIFMTU");
1495 netdev
->mtu
= ifr
.ifr_mtu
;
1496 netdev
->cache_valid
|= VALID_MTU
;
1499 error
= netdev
->netdev_mtu_error
;
1501 *mtup
= netdev
->mtu
;
1507 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1508 * in bytes, not including the hardware header; thus, this is typically 1500
1509 * bytes for Ethernet devices. */
1511 netdev_linux_get_mtu(const struct netdev
*netdev_
, int *mtup
)
1513 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1516 ovs_mutex_lock(&netdev
->mutex
);
1517 error
= netdev_linux_get_mtu__(netdev
, mtup
);
1518 ovs_mutex_unlock(&netdev
->mutex
);
1523 /* Sets the maximum size of transmitted (MTU) for given device using linux
1524 * networking ioctl interface.
1527 netdev_linux_set_mtu(struct netdev
*netdev_
, int mtu
)
1529 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1533 ovs_mutex_lock(&netdev
->mutex
);
1534 if (netdev
->cache_valid
& VALID_MTU
) {
1535 error
= netdev
->netdev_mtu_error
;
1536 if (error
|| netdev
->mtu
== mtu
) {
1539 netdev
->cache_valid
&= ~VALID_MTU
;
1542 error
= af_inet_ifreq_ioctl(netdev_get_name(netdev_
), &ifr
,
1543 SIOCSIFMTU
, "SIOCSIFMTU");
1544 if (!error
|| error
== ENODEV
) {
1545 netdev
->netdev_mtu_error
= error
;
1546 netdev
->mtu
= ifr
.ifr_mtu
;
1547 netdev
->cache_valid
|= VALID_MTU
;
1550 ovs_mutex_unlock(&netdev
->mutex
);
1554 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1555 * On failure, returns a negative errno value. */
1557 netdev_linux_get_ifindex(const struct netdev
*netdev_
)
1559 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1562 ovs_mutex_lock(&netdev
->mutex
);
1563 error
= get_ifindex(netdev_
, &ifindex
);
1564 ovs_mutex_unlock(&netdev
->mutex
);
1566 return error
? -error
: ifindex
;
1570 netdev_linux_get_carrier(const struct netdev
*netdev_
, bool *carrier
)
1572 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1574 ovs_mutex_lock(&netdev
->mutex
);
1575 if (netdev
->miimon_interval
> 0) {
1576 *carrier
= netdev
->miimon
;
1578 *carrier
= (netdev
->ifi_flags
& IFF_RUNNING
) != 0;
1580 ovs_mutex_unlock(&netdev
->mutex
);
1585 static long long int
1586 netdev_linux_get_carrier_resets(const struct netdev
*netdev_
)
1588 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1589 long long int carrier_resets
;
1591 ovs_mutex_lock(&netdev
->mutex
);
1592 carrier_resets
= netdev
->carrier_resets
;
1593 ovs_mutex_unlock(&netdev
->mutex
);
1595 return carrier_resets
;
1599 netdev_linux_do_miimon(const char *name
, int cmd
, const char *cmd_name
,
1600 struct mii_ioctl_data
*data
)
1605 memset(&ifr
, 0, sizeof ifr
);
1606 memcpy(&ifr
.ifr_data
, data
, sizeof *data
);
1607 error
= af_inet_ifreq_ioctl(name
, &ifr
, cmd
, cmd_name
);
1608 memcpy(data
, &ifr
.ifr_data
, sizeof *data
);
1614 netdev_linux_get_miimon(const char *name
, bool *miimon
)
1616 struct mii_ioctl_data data
;
1621 memset(&data
, 0, sizeof data
);
1622 error
= netdev_linux_do_miimon(name
, SIOCGMIIPHY
, "SIOCGMIIPHY", &data
);
1624 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1625 data
.reg_num
= MII_BMSR
;
1626 error
= netdev_linux_do_miimon(name
, SIOCGMIIREG
, "SIOCGMIIREG",
1630 *miimon
= !!(data
.val_out
& BMSR_LSTATUS
);
1634 struct ethtool_cmd ecmd
;
1636 VLOG_DBG_RL(&rl
, "%s: failed to query MII, falling back to ethtool",
1639 COVERAGE_INC(netdev_get_ethtool
);
1640 memset(&ecmd
, 0, sizeof ecmd
);
1641 error
= netdev_linux_do_ethtool(name
, &ecmd
, ETHTOOL_GLINK
,
1644 struct ethtool_value eval
;
1646 memcpy(&eval
, &ecmd
, sizeof eval
);
1647 *miimon
= !!eval
.data
;
1649 VLOG_WARN_RL(&rl
, "%s: ethtool link status failed", name
);
1657 netdev_linux_set_miimon_interval(struct netdev
*netdev_
,
1658 long long int interval
)
1660 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1662 ovs_mutex_lock(&netdev
->mutex
);
1663 interval
= interval
> 0 ? MAX(interval
, 100) : 0;
1664 if (netdev
->miimon_interval
!= interval
) {
1665 if (interval
&& !netdev
->miimon_interval
) {
1666 atomic_count_inc(&miimon_cnt
);
1667 } else if (!interval
&& netdev
->miimon_interval
) {
1668 atomic_count_dec(&miimon_cnt
);
1671 netdev
->miimon_interval
= interval
;
1672 timer_set_expired(&netdev
->miimon_timer
);
1674 ovs_mutex_unlock(&netdev
->mutex
);
1680 netdev_linux_miimon_run(void)
1682 struct shash device_shash
;
1683 struct shash_node
*node
;
1685 shash_init(&device_shash
);
1686 netdev_get_devices(&netdev_linux_class
, &device_shash
);
1687 SHASH_FOR_EACH (node
, &device_shash
) {
1688 struct netdev
*netdev
= node
->data
;
1689 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
1692 ovs_mutex_lock(&dev
->mutex
);
1693 if (dev
->miimon_interval
> 0 && timer_expired(&dev
->miimon_timer
)) {
1694 netdev_linux_get_miimon(dev
->up
.name
, &miimon
);
1695 if (miimon
!= dev
->miimon
) {
1696 dev
->miimon
= miimon
;
1697 netdev_linux_changed(dev
, dev
->ifi_flags
, 0);
1700 timer_set_duration(&dev
->miimon_timer
, dev
->miimon_interval
);
1702 ovs_mutex_unlock(&dev
->mutex
);
1703 netdev_close(netdev
);
1706 shash_destroy(&device_shash
);
1710 netdev_linux_miimon_wait(void)
1712 struct shash device_shash
;
1713 struct shash_node
*node
;
1715 shash_init(&device_shash
);
1716 netdev_get_devices(&netdev_linux_class
, &device_shash
);
1717 SHASH_FOR_EACH (node
, &device_shash
) {
1718 struct netdev
*netdev
= node
->data
;
1719 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
1721 ovs_mutex_lock(&dev
->mutex
);
1722 if (dev
->miimon_interval
> 0) {
1723 timer_wait(&dev
->miimon_timer
);
1725 ovs_mutex_unlock(&dev
->mutex
);
1726 netdev_close(netdev
);
1728 shash_destroy(&device_shash
);
/* Exchanges the values pointed to by 'a' and 'b'.
 * NOTE(review): the body was dropped by the extraction; this is the obvious
 * three-assignment swap — verify against upstream. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t tmp = *a;
    *a = *b;
    *b = tmp;
}
1739 /* Copies 'src' into 'dst', performing format conversion in the process.
1741 * 'src' is allowed to be misaligned. */
1743 netdev_stats_from_ovs_vport_stats(struct netdev_stats
*dst
,
1744 const struct ovs_vport_stats
*src
)
1746 dst
->rx_packets
= get_32aligned_u64(&src
->rx_packets
);
1747 dst
->tx_packets
= get_32aligned_u64(&src
->tx_packets
);
1748 dst
->rx_bytes
= get_32aligned_u64(&src
->rx_bytes
);
1749 dst
->tx_bytes
= get_32aligned_u64(&src
->tx_bytes
);
1750 dst
->rx_errors
= get_32aligned_u64(&src
->rx_errors
);
1751 dst
->tx_errors
= get_32aligned_u64(&src
->tx_errors
);
1752 dst
->rx_dropped
= get_32aligned_u64(&src
->rx_dropped
);
1753 dst
->tx_dropped
= get_32aligned_u64(&src
->tx_dropped
);
1755 dst
->collisions
= 0;
1756 dst
->rx_length_errors
= 0;
1757 dst
->rx_over_errors
= 0;
1758 dst
->rx_crc_errors
= 0;
1759 dst
->rx_frame_errors
= 0;
1760 dst
->rx_fifo_errors
= 0;
1761 dst
->rx_missed_errors
= 0;
1762 dst
->tx_aborted_errors
= 0;
1763 dst
->tx_carrier_errors
= 0;
1764 dst
->tx_fifo_errors
= 0;
1765 dst
->tx_heartbeat_errors
= 0;
1766 dst
->tx_window_errors
= 0;
1770 get_stats_via_vport__(const struct netdev
*netdev
, struct netdev_stats
*stats
)
1772 struct dpif_netlink_vport reply
;
1776 error
= dpif_netlink_vport_get(netdev_get_name(netdev
), &reply
, &buf
);
1779 } else if (!reply
.stats
) {
1784 netdev_stats_from_ovs_vport_stats(stats
, reply
.stats
);
1792 get_stats_via_vport(const struct netdev
*netdev_
,
1793 struct netdev_stats
*stats
)
1795 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1797 if (!netdev
->vport_stats_error
||
1798 !(netdev
->cache_valid
& VALID_VPORT_STAT_ERROR
)) {
1801 error
= get_stats_via_vport__(netdev_
, stats
);
1802 if (error
&& error
!= ENOENT
&& error
!= ENODEV
) {
1803 VLOG_WARN_RL(&rl
, "%s: obtaining netdev stats via vport failed "
1805 netdev_get_name(netdev_
), ovs_strerror(error
));
1807 netdev
->vport_stats_error
= error
;
1808 netdev
->cache_valid
|= VALID_VPORT_STAT_ERROR
;
1812 /* Retrieves current device stats for 'netdev-linux'. */
1814 netdev_linux_get_stats(const struct netdev
*netdev_
,
1815 struct netdev_stats
*stats
)
1817 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1818 struct netdev_stats dev_stats
;
1821 ovs_mutex_lock(&netdev
->mutex
);
1822 get_stats_via_vport(netdev_
, stats
);
1823 error
= get_stats_via_netlink(netdev_
, &dev_stats
);
1825 if (!netdev
->vport_stats_error
) {
1828 } else if (netdev
->vport_stats_error
) {
1829 /* stats not available from OVS then use netdev stats. */
1832 /* Use kernel netdev's packet and byte counts since vport's counters
1833 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1835 stats
->rx_packets
= dev_stats
.rx_packets
;
1836 stats
->rx_bytes
= dev_stats
.rx_bytes
;
1837 stats
->tx_packets
= dev_stats
.tx_packets
;
1838 stats
->tx_bytes
= dev_stats
.tx_bytes
;
1840 stats
->rx_errors
+= dev_stats
.rx_errors
;
1841 stats
->tx_errors
+= dev_stats
.tx_errors
;
1842 stats
->rx_dropped
+= dev_stats
.rx_dropped
;
1843 stats
->tx_dropped
+= dev_stats
.tx_dropped
;
1844 stats
->multicast
+= dev_stats
.multicast
;
1845 stats
->collisions
+= dev_stats
.collisions
;
1846 stats
->rx_length_errors
+= dev_stats
.rx_length_errors
;
1847 stats
->rx_over_errors
+= dev_stats
.rx_over_errors
;
1848 stats
->rx_crc_errors
+= dev_stats
.rx_crc_errors
;
1849 stats
->rx_frame_errors
+= dev_stats
.rx_frame_errors
;
1850 stats
->rx_fifo_errors
+= dev_stats
.rx_fifo_errors
;
1851 stats
->rx_missed_errors
+= dev_stats
.rx_missed_errors
;
1852 stats
->tx_aborted_errors
+= dev_stats
.tx_aborted_errors
;
1853 stats
->tx_carrier_errors
+= dev_stats
.tx_carrier_errors
;
1854 stats
->tx_fifo_errors
+= dev_stats
.tx_fifo_errors
;
1855 stats
->tx_heartbeat_errors
+= dev_stats
.tx_heartbeat_errors
;
1856 stats
->tx_window_errors
+= dev_stats
.tx_window_errors
;
1858 ovs_mutex_unlock(&netdev
->mutex
);
1863 /* Retrieves current device stats for 'netdev-tap' netdev or
1864 * netdev-internal. */
1866 netdev_tap_get_stats(const struct netdev
*netdev_
, struct netdev_stats
*stats
)
1868 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1869 struct netdev_stats dev_stats
;
1872 ovs_mutex_lock(&netdev
->mutex
);
1873 get_stats_via_vport(netdev_
, stats
);
1874 error
= get_stats_via_netlink(netdev_
, &dev_stats
);
1876 if (!netdev
->vport_stats_error
) {
1879 } else if (netdev
->vport_stats_error
) {
1880 /* Transmit and receive stats will appear to be swapped relative to the
1881 * other ports since we are the one sending the data, not a remote
1882 * computer. For consistency, we swap them back here. This does not
1883 * apply if we are getting stats from the vport layer because it always
1884 * tracks stats from the perspective of the switch. */
1887 swap_uint64(&stats
->rx_packets
, &stats
->tx_packets
);
1888 swap_uint64(&stats
->rx_bytes
, &stats
->tx_bytes
);
1889 swap_uint64(&stats
->rx_errors
, &stats
->tx_errors
);
1890 swap_uint64(&stats
->rx_dropped
, &stats
->tx_dropped
);
1891 stats
->rx_length_errors
= 0;
1892 stats
->rx_over_errors
= 0;
1893 stats
->rx_crc_errors
= 0;
1894 stats
->rx_frame_errors
= 0;
1895 stats
->rx_fifo_errors
= 0;
1896 stats
->rx_missed_errors
= 0;
1897 stats
->tx_aborted_errors
= 0;
1898 stats
->tx_carrier_errors
= 0;
1899 stats
->tx_fifo_errors
= 0;
1900 stats
->tx_heartbeat_errors
= 0;
1901 stats
->tx_window_errors
= 0;
1903 /* Use kernel netdev's packet and byte counts since vport counters
1904 * do not reflect packet counts on the wire when GSO, TSO or GRO
1906 stats
->rx_packets
= dev_stats
.tx_packets
;
1907 stats
->rx_bytes
= dev_stats
.tx_bytes
;
1908 stats
->tx_packets
= dev_stats
.rx_packets
;
1909 stats
->tx_bytes
= dev_stats
.rx_bytes
;
1911 stats
->rx_dropped
+= dev_stats
.tx_dropped
;
1912 stats
->tx_dropped
+= dev_stats
.rx_dropped
;
1914 stats
->rx_errors
+= dev_stats
.tx_errors
;
1915 stats
->tx_errors
+= dev_stats
.rx_errors
;
1917 stats
->multicast
+= dev_stats
.multicast
;
1918 stats
->collisions
+= dev_stats
.collisions
;
1920 stats
->tx_dropped
+= netdev
->tx_dropped
;
1921 ovs_mutex_unlock(&netdev
->mutex
);
1927 netdev_internal_get_stats(const struct netdev
*netdev_
,
1928 struct netdev_stats
*stats
)
1930 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1933 ovs_mutex_lock(&netdev
->mutex
);
1934 get_stats_via_vport(netdev_
, stats
);
1935 error
= netdev
->vport_stats_error
;
1936 ovs_mutex_unlock(&netdev
->mutex
);
1942 netdev_linux_read_features(struct netdev_linux
*netdev
)
1944 struct ethtool_cmd ecmd
;
1948 if (netdev
->cache_valid
& VALID_FEATURES
) {
1952 COVERAGE_INC(netdev_get_ethtool
);
1953 memset(&ecmd
, 0, sizeof ecmd
);
1954 error
= netdev_linux_do_ethtool(netdev
->up
.name
, &ecmd
,
1955 ETHTOOL_GSET
, "ETHTOOL_GSET");
1960 /* Supported features. */
1961 netdev
->supported
= 0;
1962 if (ecmd
.supported
& SUPPORTED_10baseT_Half
) {
1963 netdev
->supported
|= NETDEV_F_10MB_HD
;
1965 if (ecmd
.supported
& SUPPORTED_10baseT_Full
) {
1966 netdev
->supported
|= NETDEV_F_10MB_FD
;
1968 if (ecmd
.supported
& SUPPORTED_100baseT_Half
) {
1969 netdev
->supported
|= NETDEV_F_100MB_HD
;
1971 if (ecmd
.supported
& SUPPORTED_100baseT_Full
) {
1972 netdev
->supported
|= NETDEV_F_100MB_FD
;
1974 if (ecmd
.supported
& SUPPORTED_1000baseT_Half
) {
1975 netdev
->supported
|= NETDEV_F_1GB_HD
;
1977 if ((ecmd
.supported
& SUPPORTED_1000baseT_Full
) ||
1978 (ecmd
.supported
& SUPPORTED_1000baseKX_Full
)) {
1979 netdev
->supported
|= NETDEV_F_1GB_FD
;
1981 if ((ecmd
.supported
& SUPPORTED_10000baseT_Full
) ||
1982 (ecmd
.supported
& SUPPORTED_10000baseKX4_Full
) ||
1983 (ecmd
.supported
& SUPPORTED_10000baseKR_Full
) ||
1984 (ecmd
.supported
& SUPPORTED_10000baseR_FEC
)) {
1985 netdev
->supported
|= NETDEV_F_10GB_FD
;
1987 if ((ecmd
.supported
& SUPPORTED_40000baseKR4_Full
) ||
1988 (ecmd
.supported
& SUPPORTED_40000baseCR4_Full
) ||
1989 (ecmd
.supported
& SUPPORTED_40000baseSR4_Full
) ||
1990 (ecmd
.supported
& SUPPORTED_40000baseLR4_Full
)) {
1991 netdev
->supported
|= NETDEV_F_40GB_FD
;
1993 if (ecmd
.supported
& SUPPORTED_TP
) {
1994 netdev
->supported
|= NETDEV_F_COPPER
;
1996 if (ecmd
.supported
& SUPPORTED_FIBRE
) {
1997 netdev
->supported
|= NETDEV_F_FIBER
;
1999 if (ecmd
.supported
& SUPPORTED_Autoneg
) {
2000 netdev
->supported
|= NETDEV_F_AUTONEG
;
2002 if (ecmd
.supported
& SUPPORTED_Pause
) {
2003 netdev
->supported
|= NETDEV_F_PAUSE
;
2005 if (ecmd
.supported
& SUPPORTED_Asym_Pause
) {
2006 netdev
->supported
|= NETDEV_F_PAUSE_ASYM
;
2009 /* Advertised features. */
2010 netdev
->advertised
= 0;
2011 if (ecmd
.advertising
& ADVERTISED_10baseT_Half
) {
2012 netdev
->advertised
|= NETDEV_F_10MB_HD
;
2014 if (ecmd
.advertising
& ADVERTISED_10baseT_Full
) {
2015 netdev
->advertised
|= NETDEV_F_10MB_FD
;
2017 if (ecmd
.advertising
& ADVERTISED_100baseT_Half
) {
2018 netdev
->advertised
|= NETDEV_F_100MB_HD
;
2020 if (ecmd
.advertising
& ADVERTISED_100baseT_Full
) {
2021 netdev
->advertised
|= NETDEV_F_100MB_FD
;
2023 if (ecmd
.advertising
& ADVERTISED_1000baseT_Half
) {
2024 netdev
->advertised
|= NETDEV_F_1GB_HD
;
2026 if ((ecmd
.advertising
& ADVERTISED_1000baseT_Full
) ||
2027 (ecmd
.advertising
& ADVERTISED_1000baseKX_Full
)) {
2028 netdev
->advertised
|= NETDEV_F_1GB_FD
;
2030 if ((ecmd
.advertising
& ADVERTISED_10000baseT_Full
) ||
2031 (ecmd
.advertising
& ADVERTISED_10000baseKX4_Full
) ||
2032 (ecmd
.advertising
& ADVERTISED_10000baseKR_Full
) ||
2033 (ecmd
.advertising
& ADVERTISED_10000baseR_FEC
)) {
2034 netdev
->advertised
|= NETDEV_F_10GB_FD
;
2036 if ((ecmd
.advertising
& ADVERTISED_40000baseKR4_Full
) ||
2037 (ecmd
.advertising
& ADVERTISED_40000baseCR4_Full
) ||
2038 (ecmd
.advertising
& ADVERTISED_40000baseSR4_Full
) ||
2039 (ecmd
.advertising
& ADVERTISED_40000baseLR4_Full
)) {
2040 netdev
->advertised
|= NETDEV_F_40GB_FD
;
2042 if (ecmd
.advertising
& ADVERTISED_TP
) {
2043 netdev
->advertised
|= NETDEV_F_COPPER
;
2045 if (ecmd
.advertising
& ADVERTISED_FIBRE
) {
2046 netdev
->advertised
|= NETDEV_F_FIBER
;
2048 if (ecmd
.advertising
& ADVERTISED_Autoneg
) {
2049 netdev
->advertised
|= NETDEV_F_AUTONEG
;
2051 if (ecmd
.advertising
& ADVERTISED_Pause
) {
2052 netdev
->advertised
|= NETDEV_F_PAUSE
;
2054 if (ecmd
.advertising
& ADVERTISED_Asym_Pause
) {
2055 netdev
->advertised
|= NETDEV_F_PAUSE_ASYM
;
2058 /* Current settings. */
2059 speed
= ethtool_cmd_speed(&ecmd
);
2060 if (speed
== SPEED_10
) {
2061 netdev
->current
= ecmd
.duplex
? NETDEV_F_10MB_FD
: NETDEV_F_10MB_HD
;
2062 } else if (speed
== SPEED_100
) {
2063 netdev
->current
= ecmd
.duplex
? NETDEV_F_100MB_FD
: NETDEV_F_100MB_HD
;
2064 } else if (speed
== SPEED_1000
) {
2065 netdev
->current
= ecmd
.duplex
? NETDEV_F_1GB_FD
: NETDEV_F_1GB_HD
;
2066 } else if (speed
== SPEED_10000
) {
2067 netdev
->current
= NETDEV_F_10GB_FD
;
2068 } else if (speed
== 40000) {
2069 netdev
->current
= NETDEV_F_40GB_FD
;
2070 } else if (speed
== 100000) {
2071 netdev
->current
= NETDEV_F_100GB_FD
;
2072 } else if (speed
== 1000000) {
2073 netdev
->current
= NETDEV_F_1TB_FD
;
2075 netdev
->current
= 0;
2078 if (ecmd
.port
== PORT_TP
) {
2079 netdev
->current
|= NETDEV_F_COPPER
;
2080 } else if (ecmd
.port
== PORT_FIBRE
) {
2081 netdev
->current
|= NETDEV_F_FIBER
;
2085 netdev
->current
|= NETDEV_F_AUTONEG
;
2089 netdev
->cache_valid
|= VALID_FEATURES
;
2090 netdev
->get_features_error
= error
;
2093 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
2094 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
2095 * Returns 0 if successful, otherwise a positive errno value. */
2097 netdev_linux_get_features(const struct netdev
*netdev_
,
2098 enum netdev_features
*current
,
2099 enum netdev_features
*advertised
,
2100 enum netdev_features
*supported
,
2101 enum netdev_features
*peer
)
2103 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2106 ovs_mutex_lock(&netdev
->mutex
);
2107 netdev_linux_read_features(netdev
);
2108 if (!netdev
->get_features_error
) {
2109 *current
= netdev
->current
;
2110 *advertised
= netdev
->advertised
;
2111 *supported
= netdev
->supported
;
2112 *peer
= 0; /* XXX */
2114 error
= netdev
->get_features_error
;
2115 ovs_mutex_unlock(&netdev
->mutex
);
2120 /* Set the features advertised by 'netdev' to 'advertise'. */
2122 netdev_linux_set_advertisements(struct netdev
*netdev_
,
2123 enum netdev_features advertise
)
2125 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2126 struct ethtool_cmd ecmd
;
2129 ovs_mutex_lock(&netdev
->mutex
);
2131 COVERAGE_INC(netdev_get_ethtool
);
2132 memset(&ecmd
, 0, sizeof ecmd
);
2133 error
= netdev_linux_do_ethtool(netdev_get_name(netdev_
), &ecmd
,
2134 ETHTOOL_GSET
, "ETHTOOL_GSET");
2139 ecmd
.advertising
= 0;
2140 if (advertise
& NETDEV_F_10MB_HD
) {
2141 ecmd
.advertising
|= ADVERTISED_10baseT_Half
;
2143 if (advertise
& NETDEV_F_10MB_FD
) {
2144 ecmd
.advertising
|= ADVERTISED_10baseT_Full
;
2146 if (advertise
& NETDEV_F_100MB_HD
) {
2147 ecmd
.advertising
|= ADVERTISED_100baseT_Half
;
2149 if (advertise
& NETDEV_F_100MB_FD
) {
2150 ecmd
.advertising
|= ADVERTISED_100baseT_Full
;
2152 if (advertise
& NETDEV_F_1GB_HD
) {
2153 ecmd
.advertising
|= ADVERTISED_1000baseT_Half
;
2155 if (advertise
& NETDEV_F_1GB_FD
) {
2156 ecmd
.advertising
|= ADVERTISED_1000baseT_Full
;
2158 if (advertise
& NETDEV_F_10GB_FD
) {
2159 ecmd
.advertising
|= ADVERTISED_10000baseT_Full
;
2161 if (advertise
& NETDEV_F_COPPER
) {
2162 ecmd
.advertising
|= ADVERTISED_TP
;
2164 if (advertise
& NETDEV_F_FIBER
) {
2165 ecmd
.advertising
|= ADVERTISED_FIBRE
;
2167 if (advertise
& NETDEV_F_AUTONEG
) {
2168 ecmd
.advertising
|= ADVERTISED_Autoneg
;
2170 if (advertise
& NETDEV_F_PAUSE
) {
2171 ecmd
.advertising
|= ADVERTISED_Pause
;
2173 if (advertise
& NETDEV_F_PAUSE_ASYM
) {
2174 ecmd
.advertising
|= ADVERTISED_Asym_Pause
;
2176 COVERAGE_INC(netdev_set_ethtool
);
2177 error
= netdev_linux_do_ethtool(netdev_get_name(netdev_
), &ecmd
,
2178 ETHTOOL_SSET
, "ETHTOOL_SSET");
2181 ovs_mutex_unlock(&netdev
->mutex
);
2185 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
2186 * successful, otherwise a positive errno value. */
2188 netdev_linux_set_policing(struct netdev
*netdev_
,
2189 uint32_t kbits_rate
, uint32_t kbits_burst
)
2191 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2192 const char *netdev_name
= netdev_get_name(netdev_
);
2196 if (netdev_is_flow_api_enabled()) {
2198 VLOG_WARN_RL(&rl
, "%s: policing with offload isn't supported",
2204 kbits_burst
= (!kbits_rate
? 0 /* Force to 0 if no rate specified. */
2205 : !kbits_burst
? 8000 /* Default to 8000 kbits if 0. */
2206 : kbits_burst
); /* Stick with user-specified value. */
2208 ovs_mutex_lock(&netdev
->mutex
);
2209 if (netdev
->cache_valid
& VALID_POLICING
) {
2210 error
= netdev
->netdev_policing_error
;
2211 if (error
|| (netdev
->kbits_rate
== kbits_rate
&&
2212 netdev
->kbits_burst
== kbits_burst
)) {
2213 /* Assume that settings haven't changed since we last set them. */
2216 netdev
->cache_valid
&= ~VALID_POLICING
;
2219 error
= get_ifindex(netdev_
, &ifindex
);
2224 COVERAGE_INC(netdev_set_policing
);
2225 /* Remove any existing ingress qdisc. */
2226 error
= tc_add_del_ingress_qdisc(ifindex
, false);
2228 VLOG_WARN_RL(&rl
, "%s: removing policing failed: %s",
2229 netdev_name
, ovs_strerror(error
));
2234 error
= tc_add_del_ingress_qdisc(ifindex
, true);
2236 VLOG_WARN_RL(&rl
, "%s: adding policing qdisc failed: %s",
2237 netdev_name
, ovs_strerror(error
));
2241 error
= tc_add_policer(netdev_
, kbits_rate
, kbits_burst
);
2243 VLOG_WARN_RL(&rl
, "%s: adding policing action failed: %s",
2244 netdev_name
, ovs_strerror(error
));
2249 netdev
->kbits_rate
= kbits_rate
;
2250 netdev
->kbits_burst
= kbits_burst
;
2253 if (!error
|| error
== ENODEV
) {
2254 netdev
->netdev_policing_error
= error
;
2255 netdev
->cache_valid
|= VALID_POLICING
;
2257 ovs_mutex_unlock(&netdev
->mutex
);
2262 netdev_linux_get_qos_types(const struct netdev
*netdev OVS_UNUSED
,
2265 const struct tc_ops
*const *opsp
;
2266 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2267 const struct tc_ops
*ops
= *opsp
;
2268 if (ops
->tc_install
&& ops
->ovs_name
[0] != '\0') {
2269 sset_add(types
, ops
->ovs_name
);
2275 static const struct tc_ops
*
2276 tc_lookup_ovs_name(const char *name
)
2278 const struct tc_ops
*const *opsp
;
2280 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2281 const struct tc_ops
*ops
= *opsp
;
2282 if (!strcmp(name
, ops
->ovs_name
)) {
2289 static const struct tc_ops
*
2290 tc_lookup_linux_name(const char *name
)
2292 const struct tc_ops
*const *opsp
;
2294 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2295 const struct tc_ops
*ops
= *opsp
;
2296 if (ops
->linux_name
&& !strcmp(name
, ops
->linux_name
)) {
2303 static struct tc_queue
*
2304 tc_find_queue__(const struct netdev
*netdev_
, unsigned int queue_id
,
2307 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2308 struct tc_queue
*queue
;
2310 HMAP_FOR_EACH_IN_BUCKET (queue
, hmap_node
, hash
, &netdev
->tc
->queues
) {
2311 if (queue
->queue_id
== queue_id
) {
/* Convenience wrapper: finds the queue with 'queue_id' on 'netdev', hashing
 * the id itself.  Returns a null pointer if no such queue exists. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
}
2325 netdev_linux_get_qos_capabilities(const struct netdev
*netdev OVS_UNUSED
,
2327 struct netdev_qos_capabilities
*caps
)
2329 const struct tc_ops
*ops
= tc_lookup_ovs_name(type
);
2333 caps
->n_queues
= ops
->n_queues
;
2338 netdev_linux_get_qos(const struct netdev
*netdev_
,
2339 const char **typep
, struct smap
*details
)
2341 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2344 ovs_mutex_lock(&netdev
->mutex
);
2345 error
= tc_query_qdisc(netdev_
);
2347 *typep
= netdev
->tc
->ops
->ovs_name
;
2348 error
= (netdev
->tc
->ops
->qdisc_get
2349 ? netdev
->tc
->ops
->qdisc_get(netdev_
, details
)
2352 ovs_mutex_unlock(&netdev
->mutex
);
2358 netdev_linux_set_qos(struct netdev
*netdev_
,
2359 const char *type
, const struct smap
*details
)
2361 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2362 const struct tc_ops
*new_ops
;
2365 new_ops
= tc_lookup_ovs_name(type
);
2366 if (!new_ops
|| !new_ops
->tc_install
) {
2370 if (new_ops
== &tc_ops_noop
) {
2371 return new_ops
->tc_install(netdev_
, details
);
2374 ovs_mutex_lock(&netdev
->mutex
);
2375 error
= tc_query_qdisc(netdev_
);
2380 if (new_ops
== netdev
->tc
->ops
) {
2381 error
= new_ops
->qdisc_set
? new_ops
->qdisc_set(netdev_
, details
) : 0;
2383 /* Delete existing qdisc. */
2384 error
= tc_del_qdisc(netdev_
);
2388 ovs_assert(netdev
->tc
== NULL
);
2390 /* Install new qdisc. */
2391 error
= new_ops
->tc_install(netdev_
, details
);
2392 ovs_assert((error
== 0) == (netdev
->tc
!= NULL
));
2396 ovs_mutex_unlock(&netdev
->mutex
);
2401 netdev_linux_get_queue(const struct netdev
*netdev_
,
2402 unsigned int queue_id
, struct smap
*details
)
2404 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2407 ovs_mutex_lock(&netdev
->mutex
);
2408 error
= tc_query_qdisc(netdev_
);
2410 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2412 ? netdev
->tc
->ops
->class_get(netdev_
, queue
, details
)
2415 ovs_mutex_unlock(&netdev
->mutex
);
2421 netdev_linux_set_queue(struct netdev
*netdev_
,
2422 unsigned int queue_id
, const struct smap
*details
)
2424 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2427 ovs_mutex_lock(&netdev
->mutex
);
2428 error
= tc_query_qdisc(netdev_
);
2430 error
= (queue_id
< netdev
->tc
->ops
->n_queues
2431 && netdev
->tc
->ops
->class_set
2432 ? netdev
->tc
->ops
->class_set(netdev_
, queue_id
, details
)
2435 ovs_mutex_unlock(&netdev
->mutex
);
2441 netdev_linux_delete_queue(struct netdev
*netdev_
, unsigned int queue_id
)
2443 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2446 ovs_mutex_lock(&netdev
->mutex
);
2447 error
= tc_query_qdisc(netdev_
);
2449 if (netdev
->tc
->ops
->class_delete
) {
2450 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2452 ? netdev
->tc
->ops
->class_delete(netdev_
, queue
)
2458 ovs_mutex_unlock(&netdev
->mutex
);
2464 netdev_linux_get_queue_stats(const struct netdev
*netdev_
,
2465 unsigned int queue_id
,
2466 struct netdev_queue_stats
*stats
)
2468 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2471 ovs_mutex_lock(&netdev
->mutex
);
2472 error
= tc_query_qdisc(netdev_
);
2474 if (netdev
->tc
->ops
->class_get_stats
) {
2475 const struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2477 stats
->created
= queue
->created
;
2478 error
= netdev
->tc
->ops
->class_get_stats(netdev_
, queue
,
2487 ovs_mutex_unlock(&netdev
->mutex
);
2492 struct queue_dump_state
{
2493 struct nl_dump dump
;
2498 start_queue_dump(const struct netdev
*netdev
, struct queue_dump_state
*state
)
2500 struct ofpbuf request
;
2501 struct tcmsg
*tcmsg
;
2503 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_GETTCLASS
, 0, &request
);
2507 tcmsg
->tcm_parent
= 0;
2508 nl_dump_start(&state
->dump
, NETLINK_ROUTE
, &request
);
2509 ofpbuf_uninit(&request
);
2511 ofpbuf_init(&state
->buf
, NL_DUMP_BUFSIZE
);
2516 finish_queue_dump(struct queue_dump_state
*state
)
2518 ofpbuf_uninit(&state
->buf
);
2519 return nl_dump_done(&state
->dump
);
2522 struct netdev_linux_queue_state
{
2523 unsigned int *queues
;
2529 netdev_linux_queue_dump_start(const struct netdev
*netdev_
, void **statep
)
2531 const struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2534 ovs_mutex_lock(&netdev
->mutex
);
2535 error
= tc_query_qdisc(netdev_
);
2537 if (netdev
->tc
->ops
->class_get
) {
2538 struct netdev_linux_queue_state
*state
;
2539 struct tc_queue
*queue
;
2542 *statep
= state
= xmalloc(sizeof *state
);
2543 state
->n_queues
= hmap_count(&netdev
->tc
->queues
);
2544 state
->cur_queue
= 0;
2545 state
->queues
= xmalloc(state
->n_queues
* sizeof *state
->queues
);
2548 HMAP_FOR_EACH (queue
, hmap_node
, &netdev
->tc
->queues
) {
2549 state
->queues
[i
++] = queue
->queue_id
;
2555 ovs_mutex_unlock(&netdev
->mutex
);
2561 netdev_linux_queue_dump_next(const struct netdev
*netdev_
, void *state_
,
2562 unsigned int *queue_idp
, struct smap
*details
)
2564 const struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2565 struct netdev_linux_queue_state
*state
= state_
;
2568 ovs_mutex_lock(&netdev
->mutex
);
2569 while (state
->cur_queue
< state
->n_queues
) {
2570 unsigned int queue_id
= state
->queues
[state
->cur_queue
++];
2571 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2574 *queue_idp
= queue_id
;
2575 error
= netdev
->tc
->ops
->class_get(netdev_
, queue
, details
);
2579 ovs_mutex_unlock(&netdev
->mutex
);
2585 netdev_linux_queue_dump_done(const struct netdev
*netdev OVS_UNUSED
,
2588 struct netdev_linux_queue_state
*state
= state_
;
2590 free(state
->queues
);
2596 netdev_linux_dump_queue_stats(const struct netdev
*netdev_
,
2597 netdev_dump_queue_stats_cb
*cb
, void *aux
)
2599 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2602 ovs_mutex_lock(&netdev
->mutex
);
2603 error
= tc_query_qdisc(netdev_
);
2605 struct queue_dump_state state
;
2607 if (!netdev
->tc
->ops
->class_dump_stats
) {
2609 } else if (!start_queue_dump(netdev_
, &state
)) {
2615 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
2616 retval
= netdev
->tc
->ops
->class_dump_stats(netdev_
, &msg
,
2623 retval
= finish_queue_dump(&state
);
2629 ovs_mutex_unlock(&netdev
->mutex
);
2635 netdev_linux_set_in4(struct netdev
*netdev_
, struct in_addr address
,
2636 struct in_addr netmask
)
2638 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2641 ovs_mutex_lock(&netdev
->mutex
);
2642 error
= do_set_addr(netdev_
, SIOCSIFADDR
, "SIOCSIFADDR", address
);
2644 if (address
.s_addr
!= INADDR_ANY
) {
2645 error
= do_set_addr(netdev_
, SIOCSIFNETMASK
,
2646 "SIOCSIFNETMASK", netmask
);
2650 ovs_mutex_unlock(&netdev
->mutex
);
2655 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address.
2656 * Otherwise, sets '*in6' to 'in6addr_any' and returns the corresponding
2659 netdev_linux_get_addr_list(const struct netdev
*netdev_
,
2660 struct in6_addr
**addr
, struct in6_addr
**mask
, int *n_cnt
)
2662 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2665 ovs_mutex_lock(&netdev
->mutex
);
2666 error
= netdev_get_addrs(netdev_get_name(netdev_
), addr
, mask
, n_cnt
);
2667 ovs_mutex_unlock(&netdev
->mutex
);
/* Encodes IPv4 address 'addr' into 'sa' as an AF_INET struct sockaddr_in. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin;
    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr = addr;
    sin.sin_port = 0;

    /* Clear all of '*sa' first so that any bytes beyond 'sin' do not carry
     * stale data into the ioctl argument. */
    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
2686 do_set_addr(struct netdev
*netdev
,
2687 int ioctl_nr
, const char *ioctl_name
, struct in_addr addr
)
2691 make_in4_sockaddr(&ifr
.ifr_addr
, addr
);
2692 return af_inet_ifreq_ioctl(netdev_get_name(netdev
), &ifr
, ioctl_nr
,
2696 /* Adds 'router' as a default IP gateway. */
2698 netdev_linux_add_router(struct netdev
*netdev OVS_UNUSED
, struct in_addr router
)
2700 struct in_addr any
= { INADDR_ANY
};
2704 memset(&rt
, 0, sizeof rt
);
2705 make_in4_sockaddr(&rt
.rt_dst
, any
);
2706 make_in4_sockaddr(&rt
.rt_gateway
, router
);
2707 make_in4_sockaddr(&rt
.rt_genmask
, any
);
2708 rt
.rt_flags
= RTF_UP
| RTF_GATEWAY
;
2709 error
= af_inet_ioctl(SIOCADDRT
, &rt
);
2711 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error
));
2717 netdev_linux_get_next_hop(const struct in_addr
*host
, struct in_addr
*next_hop
,
2720 static const char fn
[] = "/proc/net/route";
2725 *netdev_name
= NULL
;
2726 stream
= fopen(fn
, "r");
2727 if (stream
== NULL
) {
2728 VLOG_WARN_RL(&rl
, "%s: open failed: %s", fn
, ovs_strerror(errno
));
2733 while (fgets(line
, sizeof line
, stream
)) {
2736 ovs_be32 dest
, gateway
, mask
;
2737 int refcnt
, metric
, mtu
;
2738 unsigned int flags
, use
, window
, irtt
;
2741 "%16s %"SCNx32
" %"SCNx32
" %04X %d %u %d %"SCNx32
2743 iface
, &dest
, &gateway
, &flags
, &refcnt
,
2744 &use
, &metric
, &mask
, &mtu
, &window
, &irtt
)) {
2745 VLOG_WARN_RL(&rl
, "%s: could not parse line %d: %s",
2749 if (!(flags
& RTF_UP
)) {
2750 /* Skip routes that aren't up. */
2754 /* The output of 'dest', 'mask', and 'gateway' were given in
2755 * network byte order, so we don't need need any endian
2756 * conversions here. */
2757 if ((dest
& mask
) == (host
->s_addr
& mask
)) {
2759 /* The host is directly reachable. */
2760 next_hop
->s_addr
= 0;
2762 /* To reach the host, we must go through a gateway. */
2763 next_hop
->s_addr
= gateway
;
2765 *netdev_name
= xstrdup(iface
);
2777 netdev_linux_get_status(const struct netdev
*netdev_
, struct smap
*smap
)
2779 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2782 ovs_mutex_lock(&netdev
->mutex
);
2783 if (!(netdev
->cache_valid
& VALID_DRVINFO
)) {
2784 struct ethtool_cmd
*cmd
= (struct ethtool_cmd
*) &netdev
->drvinfo
;
2786 COVERAGE_INC(netdev_get_ethtool
);
2787 memset(&netdev
->drvinfo
, 0, sizeof netdev
->drvinfo
);
2788 error
= netdev_linux_do_ethtool(netdev
->up
.name
,
2791 "ETHTOOL_GDRVINFO");
2793 netdev
->cache_valid
|= VALID_DRVINFO
;
2798 smap_add(smap
, "driver_name", netdev
->drvinfo
.driver
);
2799 smap_add(smap
, "driver_version", netdev
->drvinfo
.version
);
2800 smap_add(smap
, "firmware_version", netdev
->drvinfo
.fw_version
);
2802 ovs_mutex_unlock(&netdev
->mutex
);
2808 netdev_internal_get_status(const struct netdev
*netdev OVS_UNUSED
,
2811 smap_add(smap
, "driver_name", "openvswitch");
2815 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2816 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2817 * returns 0. Otherwise, it returns a positive errno value; in particular,
2818 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2820 netdev_linux_arp_lookup(const struct netdev
*netdev
,
2821 ovs_be32 ip
, struct eth_addr
*mac
)
2824 struct sockaddr_in sin
;
2827 memset(&r
, 0, sizeof r
);
2828 memset(&sin
, 0, sizeof sin
);
2829 sin
.sin_family
= AF_INET
;
2830 sin
.sin_addr
.s_addr
= ip
;
2832 memcpy(&r
.arp_pa
, &sin
, sizeof sin
);
2833 r
.arp_ha
.sa_family
= ARPHRD_ETHER
;
2835 ovs_strzcpy(r
.arp_dev
, netdev_get_name(netdev
), sizeof r
.arp_dev
);
2836 COVERAGE_INC(netdev_arp_lookup
);
2837 retval
= af_inet_ioctl(SIOCGARP
, &r
);
2839 memcpy(mac
, r
.arp_ha
.sa_data
, ETH_ADDR_LEN
);
2840 } else if (retval
!= ENXIO
) {
2841 VLOG_WARN_RL(&rl
, "%s: could not look up ARP entry for "IP_FMT
": %s",
2842 netdev_get_name(netdev
), IP_ARGS(ip
),
2843 ovs_strerror(retval
));
2849 nd_to_iff_flags(enum netdev_flags nd
)
2852 if (nd
& NETDEV_UP
) {
2855 if (nd
& NETDEV_PROMISC
) {
2858 if (nd
& NETDEV_LOOPBACK
) {
2859 iff
|= IFF_LOOPBACK
;
2865 iff_to_nd_flags(int iff
)
2867 enum netdev_flags nd
= 0;
2871 if (iff
& IFF_PROMISC
) {
2872 nd
|= NETDEV_PROMISC
;
2874 if (iff
& IFF_LOOPBACK
) {
2875 nd
|= NETDEV_LOOPBACK
;
2881 update_flags(struct netdev_linux
*netdev
, enum netdev_flags off
,
2882 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
2883 OVS_REQUIRES(netdev
->mutex
)
2885 int old_flags
, new_flags
;
2888 old_flags
= netdev
->ifi_flags
;
2889 *old_flagsp
= iff_to_nd_flags(old_flags
);
2890 new_flags
= (old_flags
& ~nd_to_iff_flags(off
)) | nd_to_iff_flags(on
);
2891 if (new_flags
!= old_flags
) {
2892 error
= set_flags(netdev_get_name(&netdev
->up
), new_flags
);
2893 get_flags(&netdev
->up
, &netdev
->ifi_flags
);
2900 netdev_linux_update_flags(struct netdev
*netdev_
, enum netdev_flags off
,
2901 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
2903 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2906 ovs_mutex_lock(&netdev
->mutex
);
2908 /* Changing flags over netlink isn't support yet. */
2909 error
= update_flags(netdev
, off
, on
, old_flagsp
);
2911 /* Try reading flags over netlink, or fall back to ioctl. */
2912 if (!netdev_linux_update_via_netlink(netdev
)) {
2913 *old_flagsp
= iff_to_nd_flags(netdev
->ifi_flags
);
2915 error
= update_flags(netdev
, off
, on
, old_flagsp
);
2918 ovs_mutex_unlock(&netdev
->mutex
);
2922 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, \
2923 GET_FEATURES, GET_STATUS, \
2927 false, /* is_pmd */ \
2931 netdev_linux_wait, \
2933 netdev_linux_alloc, \
2935 netdev_linux_destruct, \
2936 netdev_linux_dealloc, \
2937 NULL, /* get_config */ \
2938 NULL, /* set_config */ \
2939 NULL, /* get_tunnel_config */ \
2940 NULL, /* build header */ \
2941 NULL, /* push header */ \
2942 NULL, /* pop header */ \
2943 NULL, /* get_numa_id */ \
2944 NULL, /* set_tx_multiq */ \
2946 netdev_linux_send, \
2947 netdev_linux_send_wait, \
2949 netdev_linux_set_etheraddr, \
2950 netdev_linux_get_etheraddr, \
2951 netdev_linux_get_mtu, \
2952 netdev_linux_set_mtu, \
2953 netdev_linux_get_ifindex, \
2954 netdev_linux_get_carrier, \
2955 netdev_linux_get_carrier_resets, \
2956 netdev_linux_set_miimon_interval, \
2961 netdev_linux_set_advertisements, \
2962 NULL, /* get_pt_mode */ \
2964 netdev_linux_set_policing, \
2965 netdev_linux_get_qos_types, \
2966 netdev_linux_get_qos_capabilities, \
2967 netdev_linux_get_qos, \
2968 netdev_linux_set_qos, \
2969 netdev_linux_get_queue, \
2970 netdev_linux_set_queue, \
2971 netdev_linux_delete_queue, \
2972 netdev_linux_get_queue_stats, \
2973 netdev_linux_queue_dump_start, \
2974 netdev_linux_queue_dump_next, \
2975 netdev_linux_queue_dump_done, \
2976 netdev_linux_dump_queue_stats, \
2978 netdev_linux_set_in4, \
2979 netdev_linux_get_addr_list, \
2980 netdev_linux_add_router, \
2981 netdev_linux_get_next_hop, \
2983 netdev_linux_arp_lookup, \
2985 netdev_linux_update_flags, \
2986 NULL, /* reconfigure */ \
2988 netdev_linux_rxq_alloc, \
2989 netdev_linux_rxq_construct, \
2990 netdev_linux_rxq_destruct, \
2991 netdev_linux_rxq_dealloc, \
2992 netdev_linux_rxq_recv, \
2993 netdev_linux_rxq_wait, \
2994 netdev_linux_rxq_drain, \
2999 const struct netdev_class netdev_linux_class
=
3002 netdev_linux_construct
,
3003 netdev_linux_get_stats
,
3004 netdev_linux_get_features
,
3005 netdev_linux_get_status
,
3006 LINUX_FLOW_OFFLOAD_API
);
3008 const struct netdev_class netdev_tap_class
=
3011 netdev_linux_construct_tap
,
3012 netdev_tap_get_stats
,
3013 netdev_linux_get_features
,
3014 netdev_linux_get_status
,
3017 const struct netdev_class netdev_internal_class
=
3020 netdev_linux_construct
,
3021 netdev_internal_get_stats
,
3022 NULL
, /* get_features */
3023 netdev_internal_get_status
,
3027 #define CODEL_N_QUEUES 0x0000
3029 /* In sufficiently new kernel headers these are defined as enums in
3030 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3031 * kernels. (This overrides any enum definition in the header file but that's
3033 #define TCA_CODEL_TARGET 1
3034 #define TCA_CODEL_LIMIT 2
3035 #define TCA_CODEL_INTERVAL 3
3044 static struct codel
*
3045 codel_get__(const struct netdev
*netdev_
)
3047 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3048 return CONTAINER_OF(netdev
->tc
, struct codel
, tc
);
3052 codel_install__(struct netdev
*netdev_
, uint32_t target
, uint32_t limit
,
3055 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3056 struct codel
*codel
;
3058 codel
= xmalloc(sizeof *codel
);
3059 tc_init(&codel
->tc
, &tc_ops_codel
);
3060 codel
->target
= target
;
3061 codel
->limit
= limit
;
3062 codel
->interval
= interval
;
3064 netdev
->tc
= &codel
->tc
;
3068 codel_setup_qdisc__(struct netdev
*netdev
, uint32_t target
, uint32_t limit
,
3072 struct ofpbuf request
;
3073 struct tcmsg
*tcmsg
;
3074 uint32_t otarget
, olimit
, ointerval
;
3077 tc_del_qdisc(netdev
);
3079 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
3080 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3084 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3085 tcmsg
->tcm_parent
= TC_H_ROOT
;
3087 otarget
= target
? target
: 5000;
3088 olimit
= limit
? limit
: 10240;
3089 ointerval
= interval
? interval
: 100000;
3091 nl_msg_put_string(&request
, TCA_KIND
, "codel");
3092 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
3093 nl_msg_put_u32(&request
, TCA_CODEL_TARGET
, otarget
);
3094 nl_msg_put_u32(&request
, TCA_CODEL_LIMIT
, olimit
);
3095 nl_msg_put_u32(&request
, TCA_CODEL_INTERVAL
, ointerval
);
3096 nl_msg_end_nested(&request
, opt_offset
);
3098 error
= tc_transact(&request
, NULL
);
3100 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
3101 "target %u, limit %u, interval %u error %d(%s)",
3102 netdev_get_name(netdev
),
3103 otarget
, olimit
, ointerval
,
3104 error
, ovs_strerror(error
));
3110 codel_parse_qdisc_details__(struct netdev
*netdev OVS_UNUSED
,
3111 const struct smap
*details
, struct codel
*codel
)
3113 codel
->target
= smap_get_ullong(details
, "target", 0);
3114 codel
->limit
= smap_get_ullong(details
, "limit", 0);
3115 codel
->interval
= smap_get_ullong(details
, "interval", 0);
3117 if (!codel
->target
) {
3118 codel
->target
= 5000;
3120 if (!codel
->limit
) {
3121 codel
->limit
= 10240;
3123 if (!codel
->interval
) {
3124 codel
->interval
= 100000;
3129 codel_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3134 codel_parse_qdisc_details__(netdev
, details
, &codel
);
3135 error
= codel_setup_qdisc__(netdev
, codel
.target
, codel
.limit
,
3138 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
3144 codel_parse_tca_options__(struct nlattr
*nl_options
, struct codel
*codel
)
3146 static const struct nl_policy tca_codel_policy
[] = {
3147 [TCA_CODEL_TARGET
] = { .type
= NL_A_U32
},
3148 [TCA_CODEL_LIMIT
] = { .type
= NL_A_U32
},
3149 [TCA_CODEL_INTERVAL
] = { .type
= NL_A_U32
}
3152 struct nlattr
*attrs
[ARRAY_SIZE(tca_codel_policy
)];
3154 if (!nl_parse_nested(nl_options
, tca_codel_policy
,
3155 attrs
, ARRAY_SIZE(tca_codel_policy
))) {
3156 VLOG_WARN_RL(&rl
, "failed to parse CoDel class options");
3160 codel
->target
= nl_attr_get_u32(attrs
[TCA_CODEL_TARGET
]);
3161 codel
->limit
= nl_attr_get_u32(attrs
[TCA_CODEL_LIMIT
]);
3162 codel
->interval
= nl_attr_get_u32(attrs
[TCA_CODEL_INTERVAL
]);
3167 codel_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
3169 struct nlattr
*nlattr
;
3174 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
3179 error
= codel_parse_tca_options__(nlattr
, &codel
);
3184 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
3190 codel_tc_destroy(struct tc
*tc
)
3192 struct codel
*codel
= CONTAINER_OF(tc
, struct codel
, tc
);
3198 codel_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
3200 const struct codel
*codel
= codel_get__(netdev
);
3201 smap_add_format(details
, "target", "%u", codel
->target
);
3202 smap_add_format(details
, "limit", "%u", codel
->limit
);
3203 smap_add_format(details
, "interval", "%u", codel
->interval
);
3208 codel_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
3212 codel_parse_qdisc_details__(netdev
, details
, &codel
);
3213 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
3214 codel_get__(netdev
)->target
= codel
.target
;
3215 codel_get__(netdev
)->limit
= codel
.limit
;
3216 codel_get__(netdev
)->interval
= codel
.interval
;
3220 static const struct tc_ops tc_ops_codel
= {
3221 "codel", /* linux_name */
3222 "linux-codel", /* ovs_name */
3223 CODEL_N_QUEUES
, /* n_queues */
3236 /* FQ-CoDel traffic control class. */
3238 #define FQCODEL_N_QUEUES 0x0000
3240 /* In sufficiently new kernel headers these are defined as enums in
3241 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3242 * kernels. (This overrides any enum definition in the header file but that's
3244 #define TCA_FQ_CODEL_TARGET 1
3245 #define TCA_FQ_CODEL_LIMIT 2
3246 #define TCA_FQ_CODEL_INTERVAL 3
3247 #define TCA_FQ_CODEL_ECN 4
3248 #define TCA_FQ_CODEL_FLOWS 5
3249 #define TCA_FQ_CODEL_QUANTUM 6
3260 static struct fqcodel
*
3261 fqcodel_get__(const struct netdev
*netdev_
)
3263 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3264 return CONTAINER_OF(netdev
->tc
, struct fqcodel
, tc
);
3268 fqcodel_install__(struct netdev
*netdev_
, uint32_t target
, uint32_t limit
,
3269 uint32_t interval
, uint32_t flows
, uint32_t quantum
)
3271 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3272 struct fqcodel
*fqcodel
;
3274 fqcodel
= xmalloc(sizeof *fqcodel
);
3275 tc_init(&fqcodel
->tc
, &tc_ops_fqcodel
);
3276 fqcodel
->target
= target
;
3277 fqcodel
->limit
= limit
;
3278 fqcodel
->interval
= interval
;
3279 fqcodel
->flows
= flows
;
3280 fqcodel
->quantum
= quantum
;
3282 netdev
->tc
= &fqcodel
->tc
;
3286 fqcodel_setup_qdisc__(struct netdev
*netdev
, uint32_t target
, uint32_t limit
,
3287 uint32_t interval
, uint32_t flows
, uint32_t quantum
)
3290 struct ofpbuf request
;
3291 struct tcmsg
*tcmsg
;
3292 uint32_t otarget
, olimit
, ointerval
, oflows
, oquantum
;
3295 tc_del_qdisc(netdev
);
3297 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
3298 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3302 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3303 tcmsg
->tcm_parent
= TC_H_ROOT
;
3305 otarget
= target
? target
: 5000;
3306 olimit
= limit
? limit
: 10240;
3307 ointerval
= interval
? interval
: 100000;
3308 oflows
= flows
? flows
: 1024;
3309 oquantum
= quantum
? quantum
: 1514; /* fq_codel default quantum is 1514
3312 nl_msg_put_string(&request
, TCA_KIND
, "fq_codel");
3313 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
3314 nl_msg_put_u32(&request
, TCA_FQ_CODEL_TARGET
, otarget
);
3315 nl_msg_put_u32(&request
, TCA_FQ_CODEL_LIMIT
, olimit
);
3316 nl_msg_put_u32(&request
, TCA_FQ_CODEL_INTERVAL
, ointerval
);
3317 nl_msg_put_u32(&request
, TCA_FQ_CODEL_FLOWS
, oflows
);
3318 nl_msg_put_u32(&request
, TCA_FQ_CODEL_QUANTUM
, oquantum
);
3319 nl_msg_end_nested(&request
, opt_offset
);
3321 error
= tc_transact(&request
, NULL
);
3323 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
3324 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3325 netdev_get_name(netdev
),
3326 otarget
, olimit
, ointerval
, oflows
, oquantum
,
3327 error
, ovs_strerror(error
));
3333 fqcodel_parse_qdisc_details__(struct netdev
*netdev OVS_UNUSED
,
3334 const struct smap
*details
, struct fqcodel
*fqcodel
)
3336 fqcodel
->target
= smap_get_ullong(details
, "target", 0);
3337 fqcodel
->limit
= smap_get_ullong(details
, "limit", 0);
3338 fqcodel
->interval
= smap_get_ullong(details
, "interval", 0);
3339 fqcodel
->flows
= smap_get_ullong(details
, "flows", 0);
3340 fqcodel
->quantum
= smap_get_ullong(details
, "quantum", 0);
3342 if (!fqcodel
->target
) {
3343 fqcodel
->target
= 5000;
3345 if (!fqcodel
->limit
) {
3346 fqcodel
->limit
= 10240;
3348 if (!fqcodel
->interval
) {
3349 fqcodel
->interval
= 1000000;
3351 if (!fqcodel
->flows
) {
3352 fqcodel
->flows
= 1024;
3354 if (!fqcodel
->quantum
) {
3355 fqcodel
->quantum
= 1514;
3360 fqcodel_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3363 struct fqcodel fqcodel
;
3365 fqcodel_parse_qdisc_details__(netdev
, details
, &fqcodel
);
3366 error
= fqcodel_setup_qdisc__(netdev
, fqcodel
.target
, fqcodel
.limit
,
3367 fqcodel
.interval
, fqcodel
.flows
,
3370 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
,
3371 fqcodel
.interval
, fqcodel
.flows
, fqcodel
.quantum
);
3377 fqcodel_parse_tca_options__(struct nlattr
*nl_options
, struct fqcodel
*fqcodel
)
3379 static const struct nl_policy tca_fqcodel_policy
[] = {
3380 [TCA_FQ_CODEL_TARGET
] = { .type
= NL_A_U32
},
3381 [TCA_FQ_CODEL_LIMIT
] = { .type
= NL_A_U32
},
3382 [TCA_FQ_CODEL_INTERVAL
] = { .type
= NL_A_U32
},
3383 [TCA_FQ_CODEL_FLOWS
] = { .type
= NL_A_U32
},
3384 [TCA_FQ_CODEL_QUANTUM
] = { .type
= NL_A_U32
}
3387 struct nlattr
*attrs
[ARRAY_SIZE(tca_fqcodel_policy
)];
3389 if (!nl_parse_nested(nl_options
, tca_fqcodel_policy
,
3390 attrs
, ARRAY_SIZE(tca_fqcodel_policy
))) {
3391 VLOG_WARN_RL(&rl
, "failed to parse FQ_CoDel class options");
3395 fqcodel
->target
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_TARGET
]);
3396 fqcodel
->limit
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_LIMIT
]);
3397 fqcodel
->interval
=nl_attr_get_u32(attrs
[TCA_FQ_CODEL_INTERVAL
]);
3398 fqcodel
->flows
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_FLOWS
]);
3399 fqcodel
->quantum
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_QUANTUM
]);
3404 fqcodel_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
3406 struct nlattr
*nlattr
;
3409 struct fqcodel fqcodel
;
3411 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
3416 error
= fqcodel_parse_tca_options__(nlattr
, &fqcodel
);
3421 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
, fqcodel
.interval
,
3422 fqcodel
.flows
, fqcodel
.quantum
);
3427 fqcodel_tc_destroy(struct tc
*tc
)
3429 struct fqcodel
*fqcodel
= CONTAINER_OF(tc
, struct fqcodel
, tc
);
3435 fqcodel_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
3437 const struct fqcodel
*fqcodel
= fqcodel_get__(netdev
);
3438 smap_add_format(details
, "target", "%u", fqcodel
->target
);
3439 smap_add_format(details
, "limit", "%u", fqcodel
->limit
);
3440 smap_add_format(details
, "interval", "%u", fqcodel
->interval
);
3441 smap_add_format(details
, "flows", "%u", fqcodel
->flows
);
3442 smap_add_format(details
, "quantum", "%u", fqcodel
->quantum
);
3447 fqcodel_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
3449 struct fqcodel fqcodel
;
3451 fqcodel_parse_qdisc_details__(netdev
, details
, &fqcodel
);
3452 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
, fqcodel
.interval
,
3453 fqcodel
.flows
, fqcodel
.quantum
);
3454 fqcodel_get__(netdev
)->target
= fqcodel
.target
;
3455 fqcodel_get__(netdev
)->limit
= fqcodel
.limit
;
3456 fqcodel_get__(netdev
)->interval
= fqcodel
.interval
;
3457 fqcodel_get__(netdev
)->flows
= fqcodel
.flows
;
3458 fqcodel_get__(netdev
)->quantum
= fqcodel
.quantum
;
3462 static const struct tc_ops tc_ops_fqcodel
= {
3463 "fq_codel", /* linux_name */
3464 "linux-fq_codel", /* ovs_name */
3465 FQCODEL_N_QUEUES
, /* n_queues */
3478 /* SFQ traffic control class. */
3480 #define SFQ_N_QUEUES 0x0000
3489 sfq_get__(const struct netdev
*netdev_
)
3491 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3492 return CONTAINER_OF(netdev
->tc
, struct sfq
, tc
);
3496 sfq_install__(struct netdev
*netdev_
, uint32_t quantum
, uint32_t perturb
)
3498 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3501 sfq
= xmalloc(sizeof *sfq
);
3502 tc_init(&sfq
->tc
, &tc_ops_sfq
);
3503 sfq
->perturb
= perturb
;
3504 sfq
->quantum
= quantum
;
3506 netdev
->tc
= &sfq
->tc
;
3510 sfq_setup_qdisc__(struct netdev
*netdev
, uint32_t quantum
, uint32_t perturb
)
3512 struct tc_sfq_qopt opt
;
3513 struct ofpbuf request
;
3514 struct tcmsg
*tcmsg
;
3516 int mtu_error
, error
;
3517 mtu_error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
3519 tc_del_qdisc(netdev
);
3521 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
3522 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3526 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3527 tcmsg
->tcm_parent
= TC_H_ROOT
;
3529 memset(&opt
, 0, sizeof opt
);
3532 opt
.quantum
= mtu
; /* if we cannot find mtu, use default */
3535 opt
.quantum
= quantum
;
3539 opt
.perturb_period
= 10;
3541 opt
.perturb_period
= perturb
;
3544 nl_msg_put_string(&request
, TCA_KIND
, "sfq");
3545 nl_msg_put_unspec(&request
, TCA_OPTIONS
, &opt
, sizeof opt
);
3547 error
= tc_transact(&request
, NULL
);
3549 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
3550 "quantum %u, perturb %u error %d(%s)",
3551 netdev_get_name(netdev
),
3552 opt
.quantum
, opt
.perturb_period
,
3553 error
, ovs_strerror(error
));
3559 sfq_parse_qdisc_details__(struct netdev
*netdev
,
3560 const struct smap
*details
, struct sfq
*sfq
)
3562 sfq
->perturb
= smap_get_ullong(details
, "perturb", 0);
3563 sfq
->quantum
= smap_get_ullong(details
, "quantum", 0);
3565 if (!sfq
->perturb
) {
3569 if (!sfq
->quantum
) {
3571 if (!netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
)) {
3574 VLOG_WARN_RL(&rl
, "when using SFQ, you must specify quantum on a "
3575 "device without mtu");
3581 sfq_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3586 sfq_parse_qdisc_details__(netdev
, details
, &sfq
);
3587 error
= sfq_setup_qdisc__(netdev
, sfq
.quantum
, sfq
.perturb
);
3589 sfq_install__(netdev
, sfq
.quantum
, sfq
.perturb
);
3595 sfq_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
3597 const struct tc_sfq_qopt
*sfq
;
3598 struct nlattr
*nlattr
;
3602 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
3604 sfq
= nl_attr_get(nlattr
);
3605 sfq_install__(netdev
, sfq
->perturb_period
, sfq
->quantum
);
3613 sfq_tc_destroy(struct tc
*tc
)
3615 struct sfq
*sfq
= CONTAINER_OF(tc
, struct sfq
, tc
);
3621 sfq_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
3623 const struct sfq
*sfq
= sfq_get__(netdev
);
3624 smap_add_format(details
, "quantum", "%u", sfq
->quantum
);
3625 smap_add_format(details
, "perturb", "%u", sfq
->perturb
);
3630 sfq_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
3634 sfq_parse_qdisc_details__(netdev
, details
, &sfq
);
3635 sfq_install__(netdev
, sfq
.quantum
, sfq
.perturb
);
3636 sfq_get__(netdev
)->quantum
= sfq
.quantum
;
3637 sfq_get__(netdev
)->perturb
= sfq
.perturb
;
3641 static const struct tc_ops tc_ops_sfq
= {
3642 "sfq", /* linux_name */
3643 "linux-sfq", /* ovs_name */
3644 SFQ_N_QUEUES
, /* n_queues */
3657 /* HTB traffic control class. */
3659 #define HTB_N_QUEUES 0xf000
3660 #define HTB_RATE2QUANTUM 10
3664 unsigned int max_rate
; /* In bytes/s. */
3668 struct tc_queue tc_queue
;
3669 unsigned int min_rate
; /* In bytes/s. */
3670 unsigned int max_rate
; /* In bytes/s. */
3671 unsigned int burst
; /* In bytes. */
3672 unsigned int priority
; /* Lower values are higher priorities. */
3676 htb_get__(const struct netdev
*netdev_
)
3678 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3679 return CONTAINER_OF(netdev
->tc
, struct htb
, tc
);
3683 htb_install__(struct netdev
*netdev_
, uint64_t max_rate
)
3685 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3688 htb
= xmalloc(sizeof *htb
);
3689 tc_init(&htb
->tc
, &tc_ops_htb
);
3690 htb
->max_rate
= max_rate
;
3692 netdev
->tc
= &htb
->tc
;
3695 /* Create an HTB qdisc.
3697 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
3699 htb_setup_qdisc__(struct netdev
*netdev
)
3702 struct tc_htb_glob opt
;
3703 struct ofpbuf request
;
3704 struct tcmsg
*tcmsg
;
3706 tc_del_qdisc(netdev
);
3708 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
3709 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3713 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3714 tcmsg
->tcm_parent
= TC_H_ROOT
;
3716 nl_msg_put_string(&request
, TCA_KIND
, "htb");
3718 memset(&opt
, 0, sizeof opt
);
3719 opt
.rate2quantum
= HTB_RATE2QUANTUM
;
3723 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
3724 nl_msg_put_unspec(&request
, TCA_HTB_INIT
, &opt
, sizeof opt
);
3725 nl_msg_end_nested(&request
, opt_offset
);
3727 return tc_transact(&request
, NULL
);
3730 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
3731 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
3733 htb_setup_class__(struct netdev
*netdev
, unsigned int handle
,
3734 unsigned int parent
, struct htb_class
*class)
3737 struct tc_htb_opt opt
;
3738 struct ofpbuf request
;
3739 struct tcmsg
*tcmsg
;
3743 error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
3745 VLOG_WARN_RL(&rl
, "cannot set up HTB on device %s that lacks MTU",
3746 netdev_get_name(netdev
));
3750 memset(&opt
, 0, sizeof opt
);
3751 tc_fill_rate(&opt
.rate
, class->min_rate
, mtu
);
3752 tc_fill_rate(&opt
.ceil
, class->max_rate
, mtu
);
3753 /* Makes sure the quantum is at least MTU. Setting quantum will
3754 * make htb ignore the r2q for this class. */
3755 if ((class->min_rate
/ HTB_RATE2QUANTUM
) < mtu
) {
3758 opt
.buffer
= tc_calc_buffer(opt
.rate
.rate
, mtu
, class->burst
);
3759 opt
.cbuffer
= tc_calc_buffer(opt
.ceil
.rate
, mtu
, class->burst
);
3760 opt
.prio
= class->priority
;
3762 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWTCLASS
, NLM_F_CREATE
,
3767 tcmsg
->tcm_handle
= handle
;
3768 tcmsg
->tcm_parent
= parent
;
3770 nl_msg_put_string(&request
, TCA_KIND
, "htb");
3771 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
3772 nl_msg_put_unspec(&request
, TCA_HTB_PARMS
, &opt
, sizeof opt
);
3773 tc_put_rtab(&request
, TCA_HTB_RTAB
, &opt
.rate
);
3774 tc_put_rtab(&request
, TCA_HTB_CTAB
, &opt
.ceil
);
3775 nl_msg_end_nested(&request
, opt_offset
);
3777 error
= tc_transact(&request
, NULL
);
3779 VLOG_WARN_RL(&rl
, "failed to replace %s class %u:%u, parent %u:%u, "
3780 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
3781 netdev_get_name(netdev
),
3782 tc_get_major(handle
), tc_get_minor(handle
),
3783 tc_get_major(parent
), tc_get_minor(parent
),
3784 class->min_rate
, class->max_rate
,
3785 class->burst
, class->priority
, ovs_strerror(error
));
3790 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
3791 * description of them into 'details'. The description complies with the
3792 * specification given in the vswitch database documentation for linux-htb
3795 htb_parse_tca_options__(struct nlattr
*nl_options
, struct htb_class
*class)
3797 static const struct nl_policy tca_htb_policy
[] = {
3798 [TCA_HTB_PARMS
] = { .type
= NL_A_UNSPEC
, .optional
= false,
3799 .min_len
= sizeof(struct tc_htb_opt
) },
3802 struct nlattr
*attrs
[ARRAY_SIZE(tca_htb_policy
)];
3803 const struct tc_htb_opt
*htb
;
3805 if (!nl_parse_nested(nl_options
, tca_htb_policy
,
3806 attrs
, ARRAY_SIZE(tca_htb_policy
))) {
3807 VLOG_WARN_RL(&rl
, "failed to parse HTB class options");
3811 htb
= nl_attr_get(attrs
[TCA_HTB_PARMS
]);
3812 class->min_rate
= htb
->rate
.rate
;
3813 class->max_rate
= htb
->ceil
.rate
;
3814 class->burst
= tc_ticks_to_bytes(htb
->rate
.rate
, htb
->buffer
);
3815 class->priority
= htb
->prio
;
3820 htb_parse_tcmsg__(struct ofpbuf
*tcmsg
, unsigned int *queue_id
,
3821 struct htb_class
*options
,
3822 struct netdev_queue_stats
*stats
)
3824 struct nlattr
*nl_options
;
3825 unsigned int handle
;
3828 error
= tc_parse_class(tcmsg
, &handle
, &nl_options
, stats
);
3829 if (!error
&& queue_id
) {
3830 unsigned int major
= tc_get_major(handle
);
3831 unsigned int minor
= tc_get_minor(handle
);
3832 if (major
== 1 && minor
> 0 && minor
<= HTB_N_QUEUES
) {
3833 *queue_id
= minor
- 1;
3838 if (!error
&& options
) {
3839 error
= htb_parse_tca_options__(nl_options
, options
);
3845 htb_parse_qdisc_details__(struct netdev
*netdev_
,
3846 const struct smap
*details
, struct htb_class
*hc
)
3848 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3850 hc
->max_rate
= smap_get_ullong(details
, "max-rate", 0) / 8;
3851 if (!hc
->max_rate
) {
3852 enum netdev_features current
;
3854 netdev_linux_read_features(netdev
);
3855 current
= !netdev
->get_features_error
? netdev
->current
: 0;
3856 hc
->max_rate
= netdev_features_to_bps(current
, 100 * 1000 * 1000) / 8;
3858 hc
->min_rate
= hc
->max_rate
;
3864 htb_parse_class_details__(struct netdev
*netdev
,
3865 const struct smap
*details
, struct htb_class
*hc
)
3867 const struct htb
*htb
= htb_get__(netdev
);
3869 unsigned long long int max_rate_bit
;
3871 error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
3873 VLOG_WARN_RL(&rl
, "cannot parse HTB class on device %s that lacks MTU",
3874 netdev_get_name(netdev
));
3878 /* HTB requires at least an mtu sized min-rate to send any traffic even
3879 * on uncongested links. */
3880 hc
->min_rate
= smap_get_ullong(details
, "min-rate", 0) / 8;
3881 hc
->min_rate
= MAX(hc
->min_rate
, mtu
);
3882 hc
->min_rate
= MIN(hc
->min_rate
, htb
->max_rate
);
3885 max_rate_bit
= smap_get_ullong(details
, "max-rate", 0);
3886 hc
->max_rate
= max_rate_bit
? max_rate_bit
/ 8 : htb
->max_rate
;
3887 hc
->max_rate
= MAX(hc
->max_rate
, hc
->min_rate
);
3888 hc
->max_rate
= MIN(hc
->max_rate
, htb
->max_rate
);
3892 * According to hints in the documentation that I've read, it is important
3893 * that 'burst' be at least as big as the largest frame that might be
3894 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3895 * but having it a bit too small is a problem. Since netdev_get_mtu()
3896 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3897 * the MTU. We actually add 64, instead of 14, as a guard against
3898 * additional headers get tacked on somewhere that we're not aware of. */
3899 hc
->burst
= smap_get_ullong(details
, "burst", 0) / 8;
3900 hc
->burst
= MAX(hc
->burst
, mtu
+ 64);
3903 hc
->priority
= smap_get_ullong(details
, "priority", 0);
3909 htb_query_class__(const struct netdev
*netdev
, unsigned int handle
,
3910 unsigned int parent
, struct htb_class
*options
,
3911 struct netdev_queue_stats
*stats
)
3913 struct ofpbuf
*reply
;
3916 error
= tc_query_class(netdev
, handle
, parent
, &reply
);
3918 error
= htb_parse_tcmsg__(reply
, NULL
, options
, stats
);
3919 ofpbuf_delete(reply
);
3925 htb_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3929 error
= htb_setup_qdisc__(netdev
);
3931 struct htb_class hc
;
3933 htb_parse_qdisc_details__(netdev
, details
, &hc
);
3934 error
= htb_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
3935 tc_make_handle(1, 0), &hc
);
3937 htb_install__(netdev
, hc
.max_rate
);
3943 static struct htb_class
*
3944 htb_class_cast__(const struct tc_queue
*queue
)
3946 return CONTAINER_OF(queue
, struct htb_class
, tc_queue
);
3950 htb_update_queue__(struct netdev
*netdev
, unsigned int queue_id
,
3951 const struct htb_class
*hc
)
3953 struct htb
*htb
= htb_get__(netdev
);
3954 size_t hash
= hash_int(queue_id
, 0);
3955 struct tc_queue
*queue
;
3956 struct htb_class
*hcp
;
3958 queue
= tc_find_queue__(netdev
, queue_id
, hash
);
3960 hcp
= htb_class_cast__(queue
);
3962 hcp
= xmalloc(sizeof *hcp
);
3963 queue
= &hcp
->tc_queue
;
3964 queue
->queue_id
= queue_id
;
3965 queue
->created
= time_msec();
3966 hmap_insert(&htb
->tc
.queues
, &queue
->hmap_node
, hash
);
3969 hcp
->min_rate
= hc
->min_rate
;
3970 hcp
->max_rate
= hc
->max_rate
;
3971 hcp
->burst
= hc
->burst
;
3972 hcp
->priority
= hc
->priority
;
3976 htb_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
3979 struct queue_dump_state state
;
3980 struct htb_class hc
;
3982 /* Get qdisc options. */
3984 htb_query_class__(netdev
, tc_make_handle(1, 0xfffe), 0, &hc
, NULL
);
3985 htb_install__(netdev
, hc
.max_rate
);
3988 if (!start_queue_dump(netdev
, &state
)) {
3991 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
3992 unsigned int queue_id
;
3994 if (!htb_parse_tcmsg__(&msg
, &queue_id
, &hc
, NULL
)) {
3995 htb_update_queue__(netdev
, queue_id
, &hc
);
3998 finish_queue_dump(&state
);
4004 htb_tc_destroy(struct tc
*tc
)
4006 struct htb
*htb
= CONTAINER_OF(tc
, struct htb
, tc
);
4007 struct htb_class
*hc
;
4009 HMAP_FOR_EACH_POP (hc
, tc_queue
.hmap_node
, &htb
->tc
.queues
) {
4017 htb_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
4019 const struct htb
*htb
= htb_get__(netdev
);
4020 smap_add_format(details
, "max-rate", "%llu", 8ULL * htb
->max_rate
);
4025 htb_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
4027 struct htb_class hc
;
4030 htb_parse_qdisc_details__(netdev
, details
, &hc
);
4031 error
= htb_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
4032 tc_make_handle(1, 0), &hc
);
4034 htb_get__(netdev
)->max_rate
= hc
.max_rate
;
4040 htb_class_get(const struct netdev
*netdev OVS_UNUSED
,
4041 const struct tc_queue
*queue
, struct smap
*details
)
4043 const struct htb_class
*hc
= htb_class_cast__(queue
);
4045 smap_add_format(details
, "min-rate", "%llu", 8ULL * hc
->min_rate
);
4046 if (hc
->min_rate
!= hc
->max_rate
) {
4047 smap_add_format(details
, "max-rate", "%llu", 8ULL * hc
->max_rate
);
4049 smap_add_format(details
, "burst", "%llu", 8ULL * hc
->burst
);
4051 smap_add_format(details
, "priority", "%u", hc
->priority
);
4057 htb_class_set(struct netdev
*netdev
, unsigned int queue_id
,
4058 const struct smap
*details
)
4060 struct htb_class hc
;
4063 error
= htb_parse_class_details__(netdev
, details
, &hc
);
4068 error
= htb_setup_class__(netdev
, tc_make_handle(1, queue_id
+ 1),
4069 tc_make_handle(1, 0xfffe), &hc
);
4074 htb_update_queue__(netdev
, queue_id
, &hc
);
4079 htb_class_delete(struct netdev
*netdev
, struct tc_queue
*queue
)
4081 struct htb_class
*hc
= htb_class_cast__(queue
);
4082 struct htb
*htb
= htb_get__(netdev
);
4085 error
= tc_delete_class(netdev
, tc_make_handle(1, queue
->queue_id
+ 1));
4087 hmap_remove(&htb
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
4094 htb_class_get_stats(const struct netdev
*netdev
, const struct tc_queue
*queue
,
4095 struct netdev_queue_stats
*stats
)
4097 return htb_query_class__(netdev
, tc_make_handle(1, queue
->queue_id
+ 1),
4098 tc_make_handle(1, 0xfffe), NULL
, stats
);
4102 htb_class_dump_stats(const struct netdev
*netdev OVS_UNUSED
,
4103 const struct ofpbuf
*nlmsg
,
4104 netdev_dump_queue_stats_cb
*cb
, void *aux
)
4106 struct netdev_queue_stats stats
;
4107 unsigned int handle
, major
, minor
;
4110 error
= tc_parse_class(nlmsg
, &handle
, NULL
, &stats
);
4115 major
= tc_get_major(handle
);
4116 minor
= tc_get_minor(handle
);
4117 if (major
== 1 && minor
> 0 && minor
<= HTB_N_QUEUES
) {
4118 (*cb
)(minor
- 1, &stats
, aux
);
4123 static const struct tc_ops tc_ops_htb
= {
4124 "htb", /* linux_name */
4125 "linux-htb", /* ovs_name */
4126 HTB_N_QUEUES
, /* n_queues */
4135 htb_class_get_stats
,
4136 htb_class_dump_stats
4139 /* "linux-hfsc" traffic control class. */
4141 #define HFSC_N_QUEUES 0xf000
4149 struct tc_queue tc_queue
;
4154 static struct hfsc
*
4155 hfsc_get__(const struct netdev
*netdev_
)
4157 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4158 return CONTAINER_OF(netdev
->tc
, struct hfsc
, tc
);
4161 static struct hfsc_class
*
4162 hfsc_class_cast__(const struct tc_queue
*queue
)
4164 return CONTAINER_OF(queue
, struct hfsc_class
, tc_queue
);
4168 hfsc_install__(struct netdev
*netdev_
, uint32_t max_rate
)
4170 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4173 hfsc
= xmalloc(sizeof *hfsc
);
4174 tc_init(&hfsc
->tc
, &tc_ops_hfsc
);
4175 hfsc
->max_rate
= max_rate
;
4176 netdev
->tc
= &hfsc
->tc
;
4180 hfsc_update_queue__(struct netdev
*netdev
, unsigned int queue_id
,
4181 const struct hfsc_class
*hc
)
4185 struct hfsc_class
*hcp
;
4186 struct tc_queue
*queue
;
4188 hfsc
= hfsc_get__(netdev
);
4189 hash
= hash_int(queue_id
, 0);
4191 queue
= tc_find_queue__(netdev
, queue_id
, hash
);
4193 hcp
= hfsc_class_cast__(queue
);
4195 hcp
= xmalloc(sizeof *hcp
);
4196 queue
= &hcp
->tc_queue
;
4197 queue
->queue_id
= queue_id
;
4198 queue
->created
= time_msec();
4199 hmap_insert(&hfsc
->tc
.queues
, &queue
->hmap_node
, hash
);
4202 hcp
->min_rate
= hc
->min_rate
;
4203 hcp
->max_rate
= hc
->max_rate
;
4207 hfsc_parse_tca_options__(struct nlattr
*nl_options
, struct hfsc_class
*class)
4209 const struct tc_service_curve
*rsc
, *fsc
, *usc
;
4210 static const struct nl_policy tca_hfsc_policy
[] = {
4212 .type
= NL_A_UNSPEC
,
4214 .min_len
= sizeof(struct tc_service_curve
),
4217 .type
= NL_A_UNSPEC
,
4219 .min_len
= sizeof(struct tc_service_curve
),
4222 .type
= NL_A_UNSPEC
,
4224 .min_len
= sizeof(struct tc_service_curve
),
4227 struct nlattr
*attrs
[ARRAY_SIZE(tca_hfsc_policy
)];
4229 if (!nl_parse_nested(nl_options
, tca_hfsc_policy
,
4230 attrs
, ARRAY_SIZE(tca_hfsc_policy
))) {
4231 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options");
4235 rsc
= nl_attr_get(attrs
[TCA_HFSC_RSC
]);
4236 fsc
= nl_attr_get(attrs
[TCA_HFSC_FSC
]);
4237 usc
= nl_attr_get(attrs
[TCA_HFSC_USC
]);
4239 if (rsc
->m1
!= 0 || rsc
->d
!= 0 ||
4240 fsc
->m1
!= 0 || fsc
->d
!= 0 ||
4241 usc
->m1
!= 0 || usc
->d
!= 0) {
4242 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
4243 "Non-linear service curves are not supported.");
4247 if (rsc
->m2
!= fsc
->m2
) {
4248 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
4249 "Real-time service curves are not supported ");
4253 if (rsc
->m2
> usc
->m2
) {
4254 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
4255 "Min-rate service curve is greater than "
4256 "the max-rate service curve.");
4260 class->min_rate
= fsc
->m2
;
4261 class->max_rate
= usc
->m2
;
4266 hfsc_parse_tcmsg__(struct ofpbuf
*tcmsg
, unsigned int *queue_id
,
4267 struct hfsc_class
*options
,
4268 struct netdev_queue_stats
*stats
)
4271 unsigned int handle
;
4272 struct nlattr
*nl_options
;
4274 error
= tc_parse_class(tcmsg
, &handle
, &nl_options
, stats
);
4280 unsigned int major
, minor
;
4282 major
= tc_get_major(handle
);
4283 minor
= tc_get_minor(handle
);
4284 if (major
== 1 && minor
> 0 && minor
<= HFSC_N_QUEUES
) {
4285 *queue_id
= minor
- 1;
4292 error
= hfsc_parse_tca_options__(nl_options
, options
);
4299 hfsc_query_class__(const struct netdev
*netdev
, unsigned int handle
,
4300 unsigned int parent
, struct hfsc_class
*options
,
4301 struct netdev_queue_stats
*stats
)
4304 struct ofpbuf
*reply
;
4306 error
= tc_query_class(netdev
, handle
, parent
, &reply
);
4311 error
= hfsc_parse_tcmsg__(reply
, NULL
, options
, stats
);
4312 ofpbuf_delete(reply
);
4317 hfsc_parse_qdisc_details__(struct netdev
*netdev_
, const struct smap
*details
,
4318 struct hfsc_class
*class)
4320 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4322 uint32_t max_rate
= smap_get_ullong(details
, "max-rate", 0) / 8;
4324 enum netdev_features current
;
4326 netdev_linux_read_features(netdev
);
4327 current
= !netdev
->get_features_error
? netdev
->current
: 0;
4328 max_rate
= netdev_features_to_bps(current
, 100 * 1000 * 1000) / 8;
4331 class->min_rate
= max_rate
;
4332 class->max_rate
= max_rate
;
4336 hfsc_parse_class_details__(struct netdev
*netdev
,
4337 const struct smap
*details
,
4338 struct hfsc_class
* class)
4340 const struct hfsc
*hfsc
;
4341 uint32_t min_rate
, max_rate
;
4343 hfsc
= hfsc_get__(netdev
);
4345 min_rate
= smap_get_ullong(details
, "min-rate", 0) / 8;
4346 min_rate
= MAX(min_rate
, 1);
4347 min_rate
= MIN(min_rate
, hfsc
->max_rate
);
4349 max_rate
= smap_get_ullong(details
, "max-rate", hfsc
->max_rate
* 8) / 8;
4350 max_rate
= MAX(max_rate
, min_rate
);
4351 max_rate
= MIN(max_rate
, hfsc
->max_rate
);
4353 class->min_rate
= min_rate
;
4354 class->max_rate
= max_rate
;
4359 /* Create an HFSC qdisc.
4361 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
4363 hfsc_setup_qdisc__(struct netdev
* netdev
)
4365 struct tcmsg
*tcmsg
;
4366 struct ofpbuf request
;
4367 struct tc_hfsc_qopt opt
;
4369 tc_del_qdisc(netdev
);
4371 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
4372 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
4378 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
4379 tcmsg
->tcm_parent
= TC_H_ROOT
;
4381 memset(&opt
, 0, sizeof opt
);
4384 nl_msg_put_string(&request
, TCA_KIND
, "hfsc");
4385 nl_msg_put_unspec(&request
, TCA_OPTIONS
, &opt
, sizeof opt
);
4387 return tc_transact(&request
, NULL
);
4390 /* Create an HFSC class.
4392 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
4393 * sc rate <min_rate> ul rate <max_rate>" */
4395 hfsc_setup_class__(struct netdev
*netdev
, unsigned int handle
,
4396 unsigned int parent
, struct hfsc_class
*class)
4400 struct tcmsg
*tcmsg
;
4401 struct ofpbuf request
;
4402 struct tc_service_curve min
, max
;
4404 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWTCLASS
, NLM_F_CREATE
,
4411 tcmsg
->tcm_handle
= handle
;
4412 tcmsg
->tcm_parent
= parent
;
4416 min
.m2
= class->min_rate
;
4420 max
.m2
= class->max_rate
;
4422 nl_msg_put_string(&request
, TCA_KIND
, "hfsc");
4423 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
4424 nl_msg_put_unspec(&request
, TCA_HFSC_RSC
, &min
, sizeof min
);
4425 nl_msg_put_unspec(&request
, TCA_HFSC_FSC
, &min
, sizeof min
);
4426 nl_msg_put_unspec(&request
, TCA_HFSC_USC
, &max
, sizeof max
);
4427 nl_msg_end_nested(&request
, opt_offset
);
4429 error
= tc_transact(&request
, NULL
);
4431 VLOG_WARN_RL(&rl
, "failed to replace %s class %u:%u, parent %u:%u, "
4432 "min-rate %ubps, max-rate %ubps (%s)",
4433 netdev_get_name(netdev
),
4434 tc_get_major(handle
), tc_get_minor(handle
),
4435 tc_get_major(parent
), tc_get_minor(parent
),
4436 class->min_rate
, class->max_rate
, ovs_strerror(error
));
4443 hfsc_tc_install(struct netdev
*netdev
, const struct smap
*details
)
4446 struct hfsc_class
class;
4448 error
= hfsc_setup_qdisc__(netdev
);
4454 hfsc_parse_qdisc_details__(netdev
, details
, &class);
4455 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
4456 tc_make_handle(1, 0), &class);
4462 hfsc_install__(netdev
, class.max_rate
);
4467 hfsc_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4470 struct queue_dump_state state
;
4471 struct hfsc_class hc
;
4474 hfsc_query_class__(netdev
, tc_make_handle(1, 0xfffe), 0, &hc
, NULL
);
4475 hfsc_install__(netdev
, hc
.max_rate
);
4477 if (!start_queue_dump(netdev
, &state
)) {
4481 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
4482 unsigned int queue_id
;
4484 if (!hfsc_parse_tcmsg__(&msg
, &queue_id
, &hc
, NULL
)) {
4485 hfsc_update_queue__(netdev
, queue_id
, &hc
);
4489 finish_queue_dump(&state
);
4494 hfsc_tc_destroy(struct tc
*tc
)
4497 struct hfsc_class
*hc
, *next
;
4499 hfsc
= CONTAINER_OF(tc
, struct hfsc
, tc
);
4501 HMAP_FOR_EACH_SAFE (hc
, next
, tc_queue
.hmap_node
, &hfsc
->tc
.queues
) {
4502 hmap_remove(&hfsc
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
4511 hfsc_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
4513 const struct hfsc
*hfsc
;
4514 hfsc
= hfsc_get__(netdev
);
4515 smap_add_format(details
, "max-rate", "%llu", 8ULL * hfsc
->max_rate
);
4520 hfsc_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
4523 struct hfsc_class
class;
4525 hfsc_parse_qdisc_details__(netdev
, details
, &class);
4526 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
4527 tc_make_handle(1, 0), &class);
4530 hfsc_get__(netdev
)->max_rate
= class.max_rate
;
4537 hfsc_class_get(const struct netdev
*netdev OVS_UNUSED
,
4538 const struct tc_queue
*queue
, struct smap
*details
)
4540 const struct hfsc_class
*hc
;
4542 hc
= hfsc_class_cast__(queue
);
4543 smap_add_format(details
, "min-rate", "%llu", 8ULL * hc
->min_rate
);
4544 if (hc
->min_rate
!= hc
->max_rate
) {
4545 smap_add_format(details
, "max-rate", "%llu", 8ULL * hc
->max_rate
);
4551 hfsc_class_set(struct netdev
*netdev
, unsigned int queue_id
,
4552 const struct smap
*details
)
4555 struct hfsc_class
class;
4557 error
= hfsc_parse_class_details__(netdev
, details
, &class);
4562 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, queue_id
+ 1),
4563 tc_make_handle(1, 0xfffe), &class);
4568 hfsc_update_queue__(netdev
, queue_id
, &class);
4573 hfsc_class_delete(struct netdev
*netdev
, struct tc_queue
*queue
)
4577 struct hfsc_class
*hc
;
4579 hc
= hfsc_class_cast__(queue
);
4580 hfsc
= hfsc_get__(netdev
);
4582 error
= tc_delete_class(netdev
, tc_make_handle(1, queue
->queue_id
+ 1));
4584 hmap_remove(&hfsc
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
4591 hfsc_class_get_stats(const struct netdev
*netdev
, const struct tc_queue
*queue
,
4592 struct netdev_queue_stats
*stats
)
4594 return hfsc_query_class__(netdev
, tc_make_handle(1, queue
->queue_id
+ 1),
4595 tc_make_handle(1, 0xfffe), NULL
, stats
);
4599 hfsc_class_dump_stats(const struct netdev
*netdev OVS_UNUSED
,
4600 const struct ofpbuf
*nlmsg
,
4601 netdev_dump_queue_stats_cb
*cb
, void *aux
)
4603 struct netdev_queue_stats stats
;
4604 unsigned int handle
, major
, minor
;
4607 error
= tc_parse_class(nlmsg
, &handle
, NULL
, &stats
);
4612 major
= tc_get_major(handle
);
4613 minor
= tc_get_minor(handle
);
4614 if (major
== 1 && minor
> 0 && minor
<= HFSC_N_QUEUES
) {
4615 (*cb
)(minor
- 1, &stats
, aux
);
4620 static const struct tc_ops tc_ops_hfsc
= {
4621 "hfsc", /* linux_name */
4622 "linux-hfsc", /* ovs_name */
4623 HFSC_N_QUEUES
, /* n_queues */
4624 hfsc_tc_install
, /* tc_install */
4625 hfsc_tc_load
, /* tc_load */
4626 hfsc_tc_destroy
, /* tc_destroy */
4627 hfsc_qdisc_get
, /* qdisc_get */
4628 hfsc_qdisc_set
, /* qdisc_set */
4629 hfsc_class_get
, /* class_get */
4630 hfsc_class_set
, /* class_set */
4631 hfsc_class_delete
, /* class_delete */
4632 hfsc_class_get_stats
, /* class_get_stats */
4633 hfsc_class_dump_stats
/* class_dump_stats */
4636 /* "linux-noop" traffic control class. */
4639 noop_install__(struct netdev
*netdev_
)
4641 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4642 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_default
);
4644 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
4648 noop_tc_install(struct netdev
*netdev
,
4649 const struct smap
*details OVS_UNUSED
)
4651 noop_install__(netdev
);
4656 noop_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4658 noop_install__(netdev
);
4662 static const struct tc_ops tc_ops_noop
= {
4663 NULL
, /* linux_name */
4664 "linux-noop", /* ovs_name */
4668 NULL
, /* tc_destroy */
4669 NULL
, /* qdisc_get */
4670 NULL
, /* qdisc_set */
4671 NULL
, /* class_get */
4672 NULL
, /* class_set */
4673 NULL
, /* class_delete */
4674 NULL
, /* class_get_stats */
4675 NULL
/* class_dump_stats */
4678 /* "linux-default" traffic control class.
4680 * This class represents the default, unnamed Linux qdisc. It corresponds to
4681 * the "" (empty string) QoS type in the OVS database. */
4684 default_install__(struct netdev
*netdev_
)
4686 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4687 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_default
);
4689 /* Nothing but a tc class implementation is allowed to write to a tc. This
4690 * class never does that, so we can legitimately use a const tc object. */
4691 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
4695 default_tc_install(struct netdev
*netdev
,
4696 const struct smap
*details OVS_UNUSED
)
4698 default_install__(netdev
);
4703 default_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4705 default_install__(netdev
);
4709 static const struct tc_ops tc_ops_default
= {
4710 NULL
, /* linux_name */
4715 NULL
, /* tc_destroy */
4716 NULL
, /* qdisc_get */
4717 NULL
, /* qdisc_set */
4718 NULL
, /* class_get */
4719 NULL
, /* class_set */
4720 NULL
, /* class_delete */
4721 NULL
, /* class_get_stats */
4722 NULL
/* class_dump_stats */
4725 /* "linux-other" traffic control class.
4730 other_tc_load(struct netdev
*netdev_
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4732 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4733 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_other
);
4735 /* Nothing but a tc class implementation is allowed to write to a tc. This
4736 * class never does that, so we can legitimately use a const tc object. */
4737 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
4741 static const struct tc_ops tc_ops_other
= {
4742 NULL
, /* linux_name */
4743 "linux-other", /* ovs_name */
4745 NULL
, /* tc_install */
4747 NULL
, /* tc_destroy */
4748 NULL
, /* qdisc_get */
4749 NULL
, /* qdisc_set */
4750 NULL
, /* class_get */
4751 NULL
, /* class_set */
4752 NULL
, /* class_delete */
4753 NULL
, /* class_get_stats */
4754 NULL
/* class_dump_stats */
/* Traffic control. */

/* Number of kernel "tc" ticks per second. */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 */
static unsigned int buffer_hz;
4781 static struct tcmsg
*
4782 netdev_linux_tc_make_request(const struct netdev
*netdev
, int type
,
4783 unsigned int flags
, struct ofpbuf
*request
)
4788 error
= get_ifindex(netdev
, &ifindex
);
4793 return tc_make_request(ifindex
, type
, flags
, request
);
4796 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4799 * This function is equivalent to running:
4800 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4801 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4804 * The configuration and stats may be seen with the following command:
4805 * /sbin/tc -s filter show dev <devname> parent ffff:
4807 * Returns 0 if successful, otherwise a positive errno value.
4810 tc_add_policer(struct netdev
*netdev
,
4811 uint32_t kbits_rate
, uint32_t kbits_burst
)
4813 struct tc_police tc_police
;
4814 struct ofpbuf request
;
4815 struct tcmsg
*tcmsg
;
4816 size_t basic_offset
;
4817 size_t police_offset
;
4821 memset(&tc_police
, 0, sizeof tc_police
);
4822 tc_police
.action
= TC_POLICE_SHOT
;
4823 tc_police
.mtu
= mtu
;
4824 tc_fill_rate(&tc_police
.rate
, ((uint64_t) kbits_rate
* 1000)/8, mtu
);
4826 /* The following appears wrong in one way: In networking a kilobit is
4827 * usually 1000 bits but this uses 1024 bits.
4829 * However if you "fix" those problems then "tc filter show ..." shows
4830 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
4831 * 1,000,000 bits, whereas this actually ends up doing the right thing from
4832 * tc's point of view. Whatever. */
4833 tc_police
.burst
= tc_bytes_to_ticks(
4834 tc_police
.rate
.rate
, MIN(UINT32_MAX
/ 1024, kbits_burst
) * 1024 / 8);
4836 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWTFILTER
,
4837 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
4841 tcmsg
->tcm_parent
= tc_make_handle(0xffff, 0);
4842 tcmsg
->tcm_info
= tc_make_handle(49,
4843 (OVS_FORCE
uint16_t) htons(ETH_P_ALL
));
4845 nl_msg_put_string(&request
, TCA_KIND
, "basic");
4846 basic_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
4847 police_offset
= nl_msg_start_nested(&request
, TCA_BASIC_POLICE
);
4848 nl_msg_put_unspec(&request
, TCA_POLICE_TBF
, &tc_police
, sizeof tc_police
);
4849 tc_put_rtab(&request
, TCA_POLICE_RATE
, &tc_police
.rate
);
4850 nl_msg_end_nested(&request
, police_offset
);
4851 nl_msg_end_nested(&request
, basic_offset
);
4853 error
= tc_transact(&request
, NULL
);
4864 /* The values in psched are not individually very meaningful, but they are
4865 * important. The tables below show some values seen in the wild.
4869 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4870 * (Before that, there are hints that it was 1000000000.)
4872 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4876 * -----------------------------------
4877 * [1] 000c8000 000f4240 000f4240 00000064
4878 * [2] 000003e8 00000400 000f4240 3b9aca00
4879 * [3] 000003e8 00000400 000f4240 3b9aca00
4880 * [4] 000003e8 00000400 000f4240 00000064
4881 * [5] 000003e8 00000040 000f4240 3b9aca00
4882 * [6] 000003e8 00000040 000f4240 000000f9
4884 * a b c d ticks_per_s buffer_hz
4885 * ------- --------- ---------- ------------- ----------- -------------
4886 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4887 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4888 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4889 * [4] 1,000 1,024 1,000,000 100 976,562 100
4890 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4891 * [6] 1,000 64 1,000,000 249 15,625,000 249
4893 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4894 * [2] 2.6.26-1-686-bigmem from Debian lenny
4895 * [3] 2.6.26-2-sparc64 from Debian lenny
4896 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4897 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4898 * [6] 2.6.34 from kernel.org on KVM
4900 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
4901 static const char fn
[] = "/proc/net/psched";
4902 unsigned int a
, b
, c
, d
;
4905 if (!ovsthread_once_start(&once
)) {
4912 stream
= fopen(fn
, "r");
4914 VLOG_WARN("%s: open failed: %s", fn
, ovs_strerror(errno
));
4918 if (fscanf(stream
, "%x %x %x %x", &a
, &b
, &c
, &d
) != 4) {
4919 VLOG_WARN("%s: read failed", fn
);
4923 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn
, a
, b
, c
, d
);
4927 VLOG_WARN("%s: invalid scheduler parameters", fn
);
4931 ticks_per_s
= (double) a
* c
/ b
;
4935 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4938 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn
, ticks_per_s
, buffer_hz
);
4941 ovsthread_once_done(&once
);
4944 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4945 * rate of 'rate' bytes per second. */
4947 tc_ticks_to_bytes(unsigned int rate
, unsigned int ticks
)
4950 return (rate
* ticks
) / ticks_per_s
;
4953 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4954 * rate of 'rate' bytes per second. */
4956 tc_bytes_to_ticks(unsigned int rate
, unsigned int size
)
4959 return rate
? ((unsigned long long int) ticks_per_s
* size
) / rate
: 0;
4962 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4963 * a transmission rate of 'rate' bytes per second. */
4965 tc_buffer_per_jiffy(unsigned int rate
)
4968 return rate
/ buffer_hz
;
4971 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4972 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4973 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4974 * stores NULL into it if it is absent.
4976 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4979 * Returns 0 if successful, otherwise a positive errno value. */
4981 tc_parse_qdisc(const struct ofpbuf
*msg
, const char **kind
,
4982 struct nlattr
**options
)
4984 static const struct nl_policy tca_policy
[] = {
4985 [TCA_KIND
] = { .type
= NL_A_STRING
, .optional
= false },
4986 [TCA_OPTIONS
] = { .type
= NL_A_NESTED
, .optional
= true },
4988 struct nlattr
*ta
[ARRAY_SIZE(tca_policy
)];
4990 if (!nl_policy_parse(msg
, NLMSG_HDRLEN
+ sizeof(struct tcmsg
),
4991 tca_policy
, ta
, ARRAY_SIZE(ta
))) {
4992 VLOG_WARN_RL(&rl
, "failed to parse qdisc message");
4997 *kind
= nl_attr_get_string(ta
[TCA_KIND
]);
5001 *options
= ta
[TCA_OPTIONS
];
5016 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
5017 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
5018 * into '*options', and its queue statistics into '*stats'. Any of the output
5019 * arguments may be null.
5021 * Returns 0 if successful, otherwise a positive errno value. */
5023 tc_parse_class(const struct ofpbuf
*msg
, unsigned int *handlep
,
5024 struct nlattr
**options
, struct netdev_queue_stats
*stats
)
5026 static const struct nl_policy tca_policy
[] = {
5027 [TCA_OPTIONS
] = { .type
= NL_A_NESTED
, .optional
= false },
5028 [TCA_STATS2
] = { .type
= NL_A_NESTED
, .optional
= false },
5030 struct nlattr
*ta
[ARRAY_SIZE(tca_policy
)];
5032 if (!nl_policy_parse(msg
, NLMSG_HDRLEN
+ sizeof(struct tcmsg
),
5033 tca_policy
, ta
, ARRAY_SIZE(ta
))) {
5034 VLOG_WARN_RL(&rl
, "failed to parse class message");
5039 struct tcmsg
*tc
= ofpbuf_at_assert(msg
, NLMSG_HDRLEN
, sizeof *tc
);
5040 *handlep
= tc
->tcm_handle
;
5044 *options
= ta
[TCA_OPTIONS
];
5048 const struct gnet_stats_queue
*gsq
;
5049 struct gnet_stats_basic gsb
;
5051 static const struct nl_policy stats_policy
[] = {
5052 [TCA_STATS_BASIC
] = { .type
= NL_A_UNSPEC
, .optional
= false,
5053 .min_len
= sizeof gsb
},
5054 [TCA_STATS_QUEUE
] = { .type
= NL_A_UNSPEC
, .optional
= false,
5055 .min_len
= sizeof *gsq
},
5057 struct nlattr
*sa
[ARRAY_SIZE(stats_policy
)];
5059 if (!nl_parse_nested(ta
[TCA_STATS2
], stats_policy
,
5060 sa
, ARRAY_SIZE(sa
))) {
5061 VLOG_WARN_RL(&rl
, "failed to parse class stats");
5065 /* Alignment issues screw up the length of struct gnet_stats_basic on
5066 * some arch/bitsize combinations. Newer versions of Linux have a
5067 * struct gnet_stats_basic_packed, but we can't depend on that. The
5068 * easiest thing to do is just to make a copy. */
5069 memset(&gsb
, 0, sizeof gsb
);
5070 memcpy(&gsb
, nl_attr_get(sa
[TCA_STATS_BASIC
]),
5071 MIN(nl_attr_get_size(sa
[TCA_STATS_BASIC
]), sizeof gsb
));
5072 stats
->tx_bytes
= gsb
.bytes
;
5073 stats
->tx_packets
= gsb
.packets
;
5075 gsq
= nl_attr_get(sa
[TCA_STATS_QUEUE
]);
5076 stats
->tx_errors
= gsq
->drops
;
5086 memset(stats
, 0, sizeof *stats
);
5091 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
5094 tc_query_class(const struct netdev
*netdev
,
5095 unsigned int handle
, unsigned int parent
,
5096 struct ofpbuf
**replyp
)
5098 struct ofpbuf request
;
5099 struct tcmsg
*tcmsg
;
5102 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_GETTCLASS
, NLM_F_ECHO
,
5107 tcmsg
->tcm_handle
= handle
;
5108 tcmsg
->tcm_parent
= parent
;
5110 error
= tc_transact(&request
, replyp
);
5112 VLOG_WARN_RL(&rl
, "query %s class %u:%u (parent %u:%u) failed (%s)",
5113 netdev_get_name(netdev
),
5114 tc_get_major(handle
), tc_get_minor(handle
),
5115 tc_get_major(parent
), tc_get_minor(parent
),
5116 ovs_strerror(error
));
5121 /* Equivalent to "tc class del dev <name> handle <handle>". */
5123 tc_delete_class(const struct netdev
*netdev
, unsigned int handle
)
5125 struct ofpbuf request
;
5126 struct tcmsg
*tcmsg
;
5129 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_DELTCLASS
, 0, &request
);
5133 tcmsg
->tcm_handle
= handle
;
5134 tcmsg
->tcm_parent
= 0;
5136 error
= tc_transact(&request
, NULL
);
5138 VLOG_WARN_RL(&rl
, "delete %s class %u:%u failed (%s)",
5139 netdev_get_name(netdev
),
5140 tc_get_major(handle
), tc_get_minor(handle
),
5141 ovs_strerror(error
));
5146 /* Equivalent to "tc qdisc del dev <name> root". */
5148 tc_del_qdisc(struct netdev
*netdev_
)
5150 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5151 struct ofpbuf request
;
5152 struct tcmsg
*tcmsg
;
5155 tcmsg
= netdev_linux_tc_make_request(netdev_
, RTM_DELQDISC
, 0, &request
);
5159 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
5160 tcmsg
->tcm_parent
= TC_H_ROOT
;
5162 error
= tc_transact(&request
, NULL
);
5163 if (error
== EINVAL
) {
5164 /* EINVAL probably means that the default qdisc was in use, in which
5165 * case we've accomplished our purpose. */
5168 if (!error
&& netdev
->tc
) {
5169 if (netdev
->tc
->ops
->tc_destroy
) {
5170 netdev
->tc
->ops
->tc_destroy(netdev
->tc
);
5178 getqdisc_is_safe(void)
5180 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
5181 static bool safe
= false;
5183 if (ovsthread_once_start(&once
)) {
5184 struct utsname utsname
;
5187 if (uname(&utsname
) == -1) {
5188 VLOG_WARN("uname failed (%s)", ovs_strerror(errno
));
5189 } else if (!ovs_scan(utsname
.release
, "%d.%d", &major
, &minor
)) {
5190 VLOG_WARN("uname reported bad OS release (%s)", utsname
.release
);
5191 } else if (major
< 2 || (major
== 2 && minor
< 35)) {
5192 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5197 ovsthread_once_done(&once
);
5202 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
5203 * kernel to determine what they are. Returns 0 if successful, otherwise a
5204 * positive errno value. */
5206 tc_query_qdisc(const struct netdev
*netdev_
)
5208 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5209 struct ofpbuf request
, *qdisc
;
5210 const struct tc_ops
*ops
;
5211 struct tcmsg
*tcmsg
;
5219 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5220 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5221 * 2.6.35 without that fix backported to it.
5223 * To avoid the OOPS, we must not make a request that would attempt to dump
5224 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5225 * few others. There are a few ways that I can see to do this, but most of
5226 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5227 * technique chosen here is to assume that any non-default qdisc that we
5228 * create will have a class with handle 1:0. The built-in qdiscs only have
5229 * a class with handle 0:0.
5231 * On Linux 2.6.35+ we use the straightforward method because it allows us
5232 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5233 * in such a case we get no response at all from the kernel (!) if a
5234 * builtin qdisc is in use (which is later caught by "!error &&
5235 * !qdisc->size"). */
5236 tcmsg
= netdev_linux_tc_make_request(netdev_
, RTM_GETQDISC
, NLM_F_ECHO
,
5241 tcmsg
->tcm_handle
= tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5242 tcmsg
->tcm_parent
= getqdisc_is_safe() ? TC_H_ROOT
: 0;
5244 /* Figure out what tc class to instantiate. */
5245 error
= tc_transact(&request
, &qdisc
);
5246 if (!error
&& qdisc
->size
) {
5249 error
= tc_parse_qdisc(qdisc
, &kind
, NULL
);
5251 ops
= &tc_ops_other
;
5253 ops
= tc_lookup_linux_name(kind
);
5255 static struct vlog_rate_limit rl2
= VLOG_RATE_LIMIT_INIT(1, 1);
5256 VLOG_DBG_RL(&rl2
, "unknown qdisc \"%s\"", kind
);
5258 ops
= &tc_ops_other
;
5261 } else if ((!error
&& !qdisc
->size
) || error
== ENOENT
) {
5262 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5263 * set up by some other entity that doesn't have a handle 1:0. We will
5264 * assume that it's the system default qdisc. */
5265 ops
= &tc_ops_default
;
5268 /* Who knows? Maybe the device got deleted. */
5269 VLOG_WARN_RL(&rl
, "query %s qdisc failed (%s)",
5270 netdev_get_name(netdev_
), ovs_strerror(error
));
5271 ops
= &tc_ops_other
;
5274 /* Instantiate it. */
5275 load_error
= ops
->tc_load(CONST_CAST(struct netdev
*, netdev_
), qdisc
);
5276 ovs_assert((load_error
== 0) == (netdev
->tc
!= NULL
));
5277 ofpbuf_delete(qdisc
);
5279 return error
? error
: load_error
;
5282 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5283 approximate the time to transmit packets of various lengths. For an MTU of
5284 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5285 represents two possible packet lengths; for a MTU of 513 through 1024, four
5286 possible lengths; and so on.
5288 Returns, for the specified 'mtu', the number of bits that packet lengths
5289 need to be shifted right to fit within such a 256-entry table. */
5291 tc_calc_cell_log(unsigned int mtu
)
5296 mtu
= ETH_PAYLOAD_MAX
;
5298 mtu
+= ETH_HEADER_LEN
+ VLAN_HEADER_LEN
;
5300 for (cell_log
= 0; mtu
>= 256; cell_log
++) {
5307 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5310 tc_fill_rate(struct tc_ratespec
*rate
, uint64_t Bps
, int mtu
)
5312 memset(rate
, 0, sizeof *rate
);
5313 rate
->cell_log
= tc_calc_cell_log(mtu
);
5314 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5315 /* rate->cell_align = 0; */ /* distro headers. */
5316 rate
->mpu
= ETH_TOTAL_MIN
;
5320 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
5321 * attribute of the specified "type".
5323 * See tc_calc_cell_log() above for a description of "rtab"s. */
5325 tc_put_rtab(struct ofpbuf
*msg
, uint16_t type
, const struct tc_ratespec
*rate
)
5330 rtab
= nl_msg_put_unspec_uninit(msg
, type
, TC_RTAB_SIZE
);
5331 for (i
= 0; i
< TC_RTAB_SIZE
/ sizeof *rtab
; i
++) {
5332 unsigned packet_size
= (i
+ 1) << rate
->cell_log
;
5333 if (packet_size
< rate
->mpu
) {
5334 packet_size
= rate
->mpu
;
5336 rtab
[i
] = tc_bytes_to_ticks(rate
->rate
, packet_size
);
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.) */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    /* Never buffer less than one jiffy's worth of traffic plus one MTU,
     * otherwise the rate cannot actually be sustained. */
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
}
5351 /* Linux-only functions declared in netdev-linux.h */
5353 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5354 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5356 netdev_linux_ethtool_set_flag(struct netdev
*netdev
, uint32_t flag
,
5357 const char *flag_name
, bool enable
)
5359 const char *netdev_name
= netdev_get_name(netdev
);
5360 struct ethtool_value evalue
;
5364 COVERAGE_INC(netdev_get_ethtool
);
5365 memset(&evalue
, 0, sizeof evalue
);
5366 error
= netdev_linux_do_ethtool(netdev_name
,
5367 (struct ethtool_cmd
*)&evalue
,
5368 ETHTOOL_GFLAGS
, "ETHTOOL_GFLAGS");
5373 COVERAGE_INC(netdev_set_ethtool
);
5374 new_flags
= (evalue
.data
& ~flag
) | (enable
? flag
: 0);
5375 if (new_flags
== evalue
.data
) {
5378 evalue
.data
= new_flags
;
5379 error
= netdev_linux_do_ethtool(netdev_name
,
5380 (struct ethtool_cmd
*)&evalue
,
5381 ETHTOOL_SFLAGS
, "ETHTOOL_SFLAGS");
5386 COVERAGE_INC(netdev_get_ethtool
);
5387 memset(&evalue
, 0, sizeof evalue
);
5388 error
= netdev_linux_do_ethtool(netdev_name
,
5389 (struct ethtool_cmd
*)&evalue
,
5390 ETHTOOL_GFLAGS
, "ETHTOOL_GFLAGS");
5395 if (new_flags
!= evalue
.data
) {
5396 VLOG_WARN_RL(&rl
, "attempt to %s ethtool %s flag on network "
5397 "device %s failed", enable
? "enable" : "disable",
5398 flag_name
, netdev_name
);
5405 /* Utility functions. */
5407 /* Copies 'src' into 'dst', performing format conversion in the process. */
5409 netdev_stats_from_rtnl_link_stats(struct netdev_stats
*dst
,
5410 const struct rtnl_link_stats
*src
)
5412 dst
->rx_packets
= src
->rx_packets
;
5413 dst
->tx_packets
= src
->tx_packets
;
5414 dst
->rx_bytes
= src
->rx_bytes
;
5415 dst
->tx_bytes
= src
->tx_bytes
;
5416 dst
->rx_errors
= src
->rx_errors
;
5417 dst
->tx_errors
= src
->tx_errors
;
5418 dst
->rx_dropped
= src
->rx_dropped
;
5419 dst
->tx_dropped
= src
->tx_dropped
;
5420 dst
->multicast
= src
->multicast
;
5421 dst
->collisions
= src
->collisions
;
5422 dst
->rx_length_errors
= src
->rx_length_errors
;
5423 dst
->rx_over_errors
= src
->rx_over_errors
;
5424 dst
->rx_crc_errors
= src
->rx_crc_errors
;
5425 dst
->rx_frame_errors
= src
->rx_frame_errors
;
5426 dst
->rx_fifo_errors
= src
->rx_fifo_errors
;
5427 dst
->rx_missed_errors
= src
->rx_missed_errors
;
5428 dst
->tx_aborted_errors
= src
->tx_aborted_errors
;
5429 dst
->tx_carrier_errors
= src
->tx_carrier_errors
;
5430 dst
->tx_fifo_errors
= src
->tx_fifo_errors
;
5431 dst
->tx_heartbeat_errors
= src
->tx_heartbeat_errors
;
5432 dst
->tx_window_errors
= src
->tx_window_errors
;
5435 /* Copies 'src' into 'dst', performing format conversion in the process. */
5437 netdev_stats_from_rtnl_link_stats64(struct netdev_stats
*dst
,
5438 const struct rtnl_link_stats64
*src
)
5440 dst
->rx_packets
= src
->rx_packets
;
5441 dst
->tx_packets
= src
->tx_packets
;
5442 dst
->rx_bytes
= src
->rx_bytes
;
5443 dst
->tx_bytes
= src
->tx_bytes
;
5444 dst
->rx_errors
= src
->rx_errors
;
5445 dst
->tx_errors
= src
->tx_errors
;
5446 dst
->rx_dropped
= src
->rx_dropped
;
5447 dst
->tx_dropped
= src
->tx_dropped
;
5448 dst
->multicast
= src
->multicast
;
5449 dst
->collisions
= src
->collisions
;
5450 dst
->rx_length_errors
= src
->rx_length_errors
;
5451 dst
->rx_over_errors
= src
->rx_over_errors
;
5452 dst
->rx_crc_errors
= src
->rx_crc_errors
;
5453 dst
->rx_frame_errors
= src
->rx_frame_errors
;
5454 dst
->rx_fifo_errors
= src
->rx_fifo_errors
;
5455 dst
->rx_missed_errors
= src
->rx_missed_errors
;
5456 dst
->tx_aborted_errors
= src
->tx_aborted_errors
;
5457 dst
->tx_carrier_errors
= src
->tx_carrier_errors
;
5458 dst
->tx_fifo_errors
= src
->tx_fifo_errors
;
5459 dst
->tx_heartbeat_errors
= src
->tx_heartbeat_errors
;
5460 dst
->tx_window_errors
= src
->tx_window_errors
;
5464 get_stats_via_netlink(const struct netdev
*netdev_
, struct netdev_stats
*stats
)
5466 struct ofpbuf request
;
5467 struct ofpbuf
*reply
;
5470 /* Filtering all counters by default */
5471 memset(stats
, 0xFF, sizeof(struct netdev_stats
));
5473 ofpbuf_init(&request
, 0);
5474 nl_msg_put_nlmsghdr(&request
,
5475 sizeof(struct ifinfomsg
) + NL_ATTR_SIZE(IFNAMSIZ
),
5476 RTM_GETLINK
, NLM_F_REQUEST
);
5477 ofpbuf_put_zeros(&request
, sizeof(struct ifinfomsg
));
5478 nl_msg_put_string(&request
, IFLA_IFNAME
, netdev_get_name(netdev_
));
5479 error
= nl_transact(NETLINK_ROUTE
, &request
, &reply
);
5480 ofpbuf_uninit(&request
);
5485 if (ofpbuf_try_pull(reply
, NLMSG_HDRLEN
+ sizeof(struct ifinfomsg
))) {
5486 const struct nlattr
*a
= nl_attr_find(reply
, 0, IFLA_STATS64
);
5487 if (a
&& nl_attr_get_size(a
) >= sizeof(struct rtnl_link_stats64
)) {
5488 netdev_stats_from_rtnl_link_stats64(stats
, nl_attr_get(a
));
5491 a
= nl_attr_find(reply
, 0, IFLA_STATS
);
5492 if (a
&& nl_attr_get_size(a
) >= sizeof(struct rtnl_link_stats
)) {
5493 netdev_stats_from_rtnl_link_stats(stats
, nl_attr_get(a
));
5496 VLOG_WARN_RL(&rl
, "RTM_GETLINK reply lacks stats");
5501 VLOG_WARN_RL(&rl
, "short RTM_GETLINK reply");
5506 ofpbuf_delete(reply
);
5511 get_flags(const struct netdev
*dev
, unsigned int *flags
)
5517 error
= af_inet_ifreq_ioctl(dev
->name
, &ifr
, SIOCGIFFLAGS
, "SIOCGIFFLAGS");
5519 *flags
= ifr
.ifr_flags
;
5525 set_flags(const char *name
, unsigned int flags
)
5529 ifr
.ifr_flags
= flags
;
5530 return af_inet_ifreq_ioctl(name
, &ifr
, SIOCSIFFLAGS
, "SIOCSIFFLAGS");
5534 linux_get_ifindex(const char *netdev_name
)
5539 ovs_strzcpy(ifr
.ifr_name
, netdev_name
, sizeof ifr
.ifr_name
);
5540 COVERAGE_INC(netdev_get_ifindex
);
5542 error
= af_inet_ioctl(SIOCGIFINDEX
, &ifr
);
5544 /* ENODEV probably means that a vif disappeared asynchronously and
5545 * hasn't been removed from the database yet, so reduce the log level
5546 * to INFO for that case. */
5547 VLOG_RL(&rl
, error
== ENODEV
? VLL_INFO
: VLL_ERR
,
5548 "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5549 netdev_name
, ovs_strerror(error
));
5552 return ifr
.ifr_ifindex
;
5556 get_ifindex(const struct netdev
*netdev_
, int *ifindexp
)
5558 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5560 if (!(netdev
->cache_valid
& VALID_IFINDEX
)) {
5561 netdev_linux_update_via_netlink(netdev
);
5564 if (!(netdev
->cache_valid
& VALID_IFINDEX
)) {
5565 /* Fall back to ioctl if netlink fails */
5566 int ifindex
= linux_get_ifindex(netdev_get_name(netdev_
));
5569 netdev
->get_ifindex_error
= -ifindex
;
5570 netdev
->ifindex
= 0;
5572 netdev
->get_ifindex_error
= 0;
5573 netdev
->ifindex
= ifindex
;
5575 netdev
->cache_valid
|= VALID_IFINDEX
;
5578 *ifindexp
= netdev
->ifindex
;
5579 return netdev
->get_ifindex_error
;
5583 netdev_linux_update_via_netlink(struct netdev_linux
*netdev
)
5585 struct ofpbuf request
;
5586 struct ofpbuf
*reply
;
5587 struct rtnetlink_change chg
;
5588 struct rtnetlink_change
*change
= &chg
;
5591 ofpbuf_init(&request
, 0);
5592 nl_msg_put_nlmsghdr(&request
,
5593 sizeof(struct ifinfomsg
) + NL_ATTR_SIZE(IFNAMSIZ
),
5594 RTM_GETLINK
, NLM_F_REQUEST
);
5595 ofpbuf_put_zeros(&request
, sizeof(struct ifinfomsg
));
5597 /* The correct identifiers for a Linux device are netnsid and ifindex,
5598 * but ifindex changes as the port is moved to another network namespace
5599 * and the interface name statically stored in ovsdb. */
5600 nl_msg_put_string(&request
, IFLA_IFNAME
, netdev_get_name(&netdev
->up
));
5601 if (netdev_linux_netnsid_is_remote(netdev
)) {
5602 nl_msg_push_u32(&request
, IFLA_IF_NETNSID
, netdev
->netnsid
);
5604 error
= nl_transact(NETLINK_ROUTE
, &request
, &reply
);
5605 ofpbuf_uninit(&request
);
5607 ofpbuf_delete(reply
);
5611 if (rtnetlink_parse(reply
, change
)
5612 && change
->nlmsg_type
== RTM_NEWLINK
) {
5613 bool changed
= false;
5616 /* Update netdev from rtnl msg and increment its seq if needed. */
5617 if ((change
->ifi_flags
^ netdev
->ifi_flags
) & IFF_RUNNING
) {
5618 netdev
->carrier_resets
++;
5621 if (change
->ifi_flags
!= netdev
->ifi_flags
) {
5622 netdev
->ifi_flags
= change
->ifi_flags
;
5625 if (change
->mtu
&& change
->mtu
!= netdev
->mtu
) {
5626 netdev
->mtu
= change
->mtu
;
5627 netdev
->cache_valid
|= VALID_MTU
;
5628 netdev
->netdev_mtu_error
= 0;
5631 if (!eth_addr_is_zero(change
->mac
)
5632 && !eth_addr_equals(change
->mac
, netdev
->etheraddr
)) {
5633 netdev
->etheraddr
= change
->mac
;
5634 netdev
->cache_valid
|= VALID_ETHERADDR
;
5635 netdev
->ether_addr_error
= 0;
5638 if (change
->if_index
!= netdev
->ifindex
) {
5639 netdev
->ifindex
= change
->if_index
;
5640 netdev
->cache_valid
|= VALID_IFINDEX
;
5641 netdev
->get_ifindex_error
= 0;
5645 netdev_change_seq_changed(&netdev
->up
);
5651 ofpbuf_delete(reply
);
5656 get_etheraddr(const char *netdev_name
, struct eth_addr
*ea
)
5662 memset(&ifr
, 0, sizeof ifr
);
5663 ovs_strzcpy(ifr
.ifr_name
, netdev_name
, sizeof ifr
.ifr_name
);
5664 COVERAGE_INC(netdev_get_hwaddr
);
5665 error
= af_inet_ioctl(SIOCGIFHWADDR
, &ifr
);
5667 /* ENODEV probably means that a vif disappeared asynchronously and
5668 * hasn't been removed from the database yet, so reduce the log level
5669 * to INFO for that case. */
5670 VLOG(error
== ENODEV
? VLL_INFO
: VLL_ERR
,
5671 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
5672 netdev_name
, ovs_strerror(error
));
5675 hwaddr_family
= ifr
.ifr_hwaddr
.sa_family
;
5676 if (hwaddr_family
!= AF_UNSPEC
&& hwaddr_family
!= ARPHRD_ETHER
&&
5677 hwaddr_family
!= ARPHRD_NONE
) {
5678 VLOG_INFO("%s device has unknown hardware address family %d",
5679 netdev_name
, hwaddr_family
);
5682 memcpy(ea
, ifr
.ifr_hwaddr
.sa_data
, ETH_ADDR_LEN
);
5687 set_etheraddr(const char *netdev_name
, const struct eth_addr mac
)
5692 memset(&ifr
, 0, sizeof ifr
);
5693 ovs_strzcpy(ifr
.ifr_name
, netdev_name
, sizeof ifr
.ifr_name
);
5694 ifr
.ifr_hwaddr
.sa_family
= ARPHRD_ETHER
;
5695 memcpy(ifr
.ifr_hwaddr
.sa_data
, &mac
, ETH_ADDR_LEN
);
5696 COVERAGE_INC(netdev_set_hwaddr
);
5697 error
= af_inet_ioctl(SIOCSIFHWADDR
, &ifr
);
5699 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
5700 netdev_name
, ovs_strerror(error
));
5706 netdev_linux_do_ethtool(const char *name
, struct ethtool_cmd
*ecmd
,
5707 int cmd
, const char *cmd_name
)
5712 memset(&ifr
, 0, sizeof ifr
);
5713 ovs_strzcpy(ifr
.ifr_name
, name
, sizeof ifr
.ifr_name
);
5714 ifr
.ifr_data
= (caddr_t
) ecmd
;
5717 error
= af_inet_ioctl(SIOCETHTOOL
, &ifr
);
5719 if (error
!= EOPNOTSUPP
) {
5720 VLOG_WARN_RL(&rl
, "ethtool command %s on network device %s "
5721 "failed: %s", cmd_name
, name
, ovs_strerror(error
));
5723 /* The device doesn't support this operation. That's pretty
5724 * common, so there's no point in logging anything. */
5730 /* Returns an AF_PACKET raw socket or a negative errno value. */
5732 af_packet_sock(void)
5734 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
5737 if (ovsthread_once_start(&once
)) {
5738 sock
= socket(AF_PACKET
, SOCK_RAW
, 0);
5740 int error
= set_nonblocking(sock
);
5747 VLOG_ERR("failed to create packet socket: %s",
5748 ovs_strerror(errno
));
5750 ovsthread_once_done(&once
);