/*
 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
19 #include "netdev-linux.h"
23 #include <sys/types.h>
24 #include <netinet/in.h>
25 #include <arpa/inet.h>
27 #include <linux/filter.h>
28 #include <linux/gen_stats.h>
29 #include <linux/if_ether.h>
30 #include <linux/if_tun.h>
31 #include <linux/types.h>
32 #include <linux/ethtool.h>
33 #include <linux/mii.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/ioctl.h>
37 #include <sys/socket.h>
38 #include <sys/utsname.h>
39 #include <netpacket/packet.h>
41 #include <net/if_arp.h>
42 #include <net/if_packet.h>
43 #include <net/route.h>
50 #include "dp-packet.h"
51 #include "dpif-netlink.h"
52 #include "dpif-netdev.h"
53 #include "openvswitch/dynamic-string.h"
54 #include "fatal-signal.h"
56 #include "openvswitch/hmap.h"
57 #include "netdev-provider.h"
58 #include "netdev-tc-offloads.h"
59 #include "netdev-vport.h"
60 #include "netlink-notifier.h"
61 #include "netlink-socket.h"
64 #include "openvswitch/ofpbuf.h"
65 #include "openflow/openflow.h"
66 #include "ovs-atomic.h"
68 #include "openvswitch/poll-loop.h"
69 #include "rtnetlink.h"
70 #include "openvswitch/shash.h"
71 #include "socket-util.h"
75 #include "unaligned.h"
76 #include "openvswitch/vlog.h"
79 VLOG_DEFINE_THIS_MODULE(netdev_linux
);
81 COVERAGE_DEFINE(netdev_set_policing
);
82 COVERAGE_DEFINE(netdev_arp_lookup
);
83 COVERAGE_DEFINE(netdev_get_ifindex
);
84 COVERAGE_DEFINE(netdev_get_hwaddr
);
85 COVERAGE_DEFINE(netdev_set_hwaddr
);
86 COVERAGE_DEFINE(netdev_get_ethtool
);
87 COVERAGE_DEFINE(netdev_set_ethtool
);
90 #ifndef IFLA_IF_NETNSID
91 #define IFLA_IF_NETNSID 0x45
93 /* These were introduced in Linux 2.6.14, so they might be missing if we have
95 #ifndef ADVERTISED_Pause
96 #define ADVERTISED_Pause (1 << 13)
98 #ifndef ADVERTISED_Asym_Pause
99 #define ADVERTISED_Asym_Pause (1 << 14)
102 /* These were introduced in Linux 2.6.24, so they might be missing if we
103 * have old headers. */
104 #ifndef ETHTOOL_GFLAGS
105 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
107 #ifndef ETHTOOL_SFLAGS
108 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
111 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
114 #define TC_RTAB_SIZE 1024
117 /* Linux 2.6.21 introduced struct tpacket_auxdata.
118 * Linux 2.6.27 added the tp_vlan_tci member.
119 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
120 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
121 * TP_STATUS_VLAN_TPID_VALID.
123 * With all this churn it's easiest to unconditionally define a replacement
124 * structure that has everything we want.
126 #ifndef PACKET_AUXDATA
127 #define PACKET_AUXDATA 8
129 #ifndef TP_STATUS_VLAN_VALID
130 #define TP_STATUS_VLAN_VALID (1 << 4)
132 #ifndef TP_STATUS_VLAN_TPID_VALID
133 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
135 #undef tpacket_auxdata
136 #define tpacket_auxdata rpl_tpacket_auxdata
137 struct tpacket_auxdata
{
143 uint16_t tp_vlan_tci
;
144 uint16_t tp_vlan_tpid
;
147 /* Linux 2.6.27 introduced ethtool_cmd_speed
149 * To avoid revisiting problems reported with using configure to detect
150 * compatibility (see report at
151 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html)
152 * unconditionally replace ethtool_cmd_speed. */
153 #define ethtool_cmd_speed rpl_ethtool_cmd_speed
/* Returns the link speed, in Mbps, encoded in 'ep'.  Newer kernels split the
 * speed across two 16-bit fields ('speed' holds the low bits, 'speed_hi' the
 * high bits), so recombine them here. */
static inline uint32_t
rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
{
    uint32_t hi = ep->speed_hi;
    uint32_t lo = ep->speed;

    return (hi << 16) | lo;
}
159 /* Linux 2.6.30 introduced supported and advertised flags for
160 * 1G base KX, and 10G base KX4, KR and R. */
161 #ifndef SUPPORTED_1000baseKX_Full
162 #define SUPPORTED_1000baseKX_Full (1 << 17)
163 #define SUPPORTED_10000baseKX4_Full (1 << 18)
164 #define SUPPORTED_10000baseKR_Full (1 << 19)
165 #define SUPPORTED_10000baseR_FEC (1 << 20)
166 #define ADVERTISED_1000baseKX_Full (1 << 17)
167 #define ADVERTISED_10000baseKX4_Full (1 << 18)
168 #define ADVERTISED_10000baseKR_Full (1 << 19)
169 #define ADVERTISED_10000baseR_FEC (1 << 20)
172 /* Linux 3.5 introduced supported and advertised flags for
173 * 40G base KR4, CR4, SR4 and LR4. */
174 #ifndef SUPPORTED_40000baseKR4_Full
175 #define SUPPORTED_40000baseKR4_Full (1 << 23)
176 #define SUPPORTED_40000baseCR4_Full (1 << 24)
177 #define SUPPORTED_40000baseSR4_Full (1 << 25)
178 #define SUPPORTED_40000baseLR4_Full (1 << 26)
179 #define ADVERTISED_40000baseKR4_Full (1 << 23)
180 #define ADVERTISED_40000baseCR4_Full (1 << 24)
181 #define ADVERTISED_40000baseSR4_Full (1 << 25)
182 #define ADVERTISED_40000baseLR4_Full (1 << 26)
185 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
187 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
188 * 2.6.32-431.29.2.el6.x86_64 (see report at
189 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html).
190 * Maybe if_link.h is not self-contained on those kernels. It is easiest to
191 * unconditionally define a replacement. */
193 #define IFLA_STATS64 23
195 #define rtnl_link_stats64 rpl_rtnl_link_stats64
196 struct rtnl_link_stats64
{
208 uint64_t rx_length_errors
;
209 uint64_t rx_over_errors
;
210 uint64_t rx_crc_errors
;
211 uint64_t rx_frame_errors
;
212 uint64_t rx_fifo_errors
;
213 uint64_t rx_missed_errors
;
215 uint64_t tx_aborted_errors
;
216 uint64_t tx_carrier_errors
;
217 uint64_t tx_fifo_errors
;
218 uint64_t tx_heartbeat_errors
;
219 uint64_t tx_window_errors
;
221 uint64_t rx_compressed
;
222 uint64_t tx_compressed
;
226 VALID_IFINDEX
= 1 << 0,
227 VALID_ETHERADDR
= 1 << 1,
230 VALID_POLICING
= 1 << 4,
231 VALID_VPORT_STAT_ERROR
= 1 << 5,
232 VALID_DRVINFO
= 1 << 6,
233 VALID_FEATURES
= 1 << 7,
236 struct linux_lag_slave
{
238 struct shash_node
*node
;
241 /* Protects 'lag_shash' and the mutable members of struct linux_lag_slave. */
242 static struct ovs_mutex lag_mutex
= OVS_MUTEX_INITIALIZER
;
244 /* All slaves whose LAG masters are network devices in OvS. */
245 static struct shash lag_shash
OVS_GUARDED_BY(lag_mutex
)
246 = SHASH_INITIALIZER(&lag_shash
);
248 /* Traffic control. */
250 /* An instance of a traffic control class. Always associated with a particular
253 * Each TC implementation subclasses this with whatever additional data it
256 const struct tc_ops
*ops
;
257 struct hmap queues
; /* Contains "struct tc_queue"s.
258 * Read by generic TC layer.
259 * Written only by TC implementation. */
262 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
264 /* One traffic control queue.
266 * Each TC implementation subclasses this with whatever additional data it
269 struct hmap_node hmap_node
; /* In struct tc's "queues" hmap. */
270 unsigned int queue_id
; /* OpenFlow queue ID. */
271 long long int created
; /* Time queue was created, in msecs. */
274 /* A particular kind of traffic control. Each implementation generally maps to
275 * one particular Linux qdisc class.
277 * The functions below return 0 if successful or a positive errno value on
278 * failure, except where otherwise noted. All of them must be provided, except
279 * where otherwise noted. */
281 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
282 * This is null for tc_ops_default and tc_ops_other, for which there are no
283 * appropriate values. */
284 const char *linux_name
;
286 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
287 const char *ovs_name
;
289 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
290 * queues. The queues are numbered 0 through n_queues - 1. */
291 unsigned int n_queues
;
293 /* Called to install this TC class on 'netdev'. The implementation should
294 * make the Netlink calls required to set up 'netdev' with the right qdisc
295 * and configure it according to 'details'. The implementation may assume
296 * that the current qdisc is the default; that is, there is no need for it
297 * to delete the current qdisc before installing itself.
299 * The contents of 'details' should be documented as valid for 'ovs_name'
300 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
301 * (which is built as ovs-vswitchd.conf.db(8)).
303 * This function must return 0 if and only if it sets 'netdev->tc' to an
304 * initialized 'struct tc'.
306 * (This function is null for tc_ops_other, which cannot be installed. For
307 * other TC classes it should always be nonnull.) */
308 int (*tc_install
)(struct netdev
*netdev
, const struct smap
*details
);
310 /* Called when the netdev code determines (through a Netlink query) that
311 * this TC class's qdisc is installed on 'netdev', but we didn't install
312 * it ourselves and so don't know any of the details.
314 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
315 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
316 * implementation should parse the other attributes of 'nlmsg' as
317 * necessary to determine its configuration. If necessary it should also
318 * use Netlink queries to determine the configuration of queues on
321 * This function must return 0 if and only if it sets 'netdev->tc' to an
322 * initialized 'struct tc'. */
323 int (*tc_load
)(struct netdev
*netdev
, struct ofpbuf
*nlmsg
);
325 /* Destroys the data structures allocated by the implementation as part of
326 * 'tc'. (This includes destroying 'tc->queues' by calling
329 * The implementation should not need to perform any Netlink calls. If
330 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
331 * (But it may not be desirable.)
333 * This function may be null if 'tc' is trivial. */
334 void (*tc_destroy
)(struct tc
*tc
);
336 /* Retrieves details of 'netdev->tc' configuration into 'details'.
338 * The implementation should not need to perform any Netlink calls, because
339 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
340 * cached the configuration.
342 * The contents of 'details' should be documented as valid for 'ovs_name'
343 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
344 * (which is built as ovs-vswitchd.conf.db(8)).
346 * This function may be null if 'tc' is not configurable.
348 int (*qdisc_get
)(const struct netdev
*netdev
, struct smap
*details
);
350 /* Reconfigures 'netdev->tc' according to 'details', performing any
351 * required Netlink calls to complete the reconfiguration.
353 * The contents of 'details' should be documented as valid for 'ovs_name'
354 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
355 * (which is built as ovs-vswitchd.conf.db(8)).
357 * This function may be null if 'tc' is not configurable.
359 int (*qdisc_set
)(struct netdev
*, const struct smap
*details
);
361 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
362 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
364 * The contents of 'details' should be documented as valid for 'ovs_name'
365 * in the "other_config" column in the "Queue" table in
366 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
368 * The implementation should not need to perform any Netlink calls, because
369 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
370 * cached the queue configuration.
372 * This function may be null if 'tc' does not have queues ('n_queues' is
374 int (*class_get
)(const struct netdev
*netdev
, const struct tc_queue
*queue
,
375 struct smap
*details
);
377 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
378 * 'details', perfoming any required Netlink calls to complete the
379 * reconfiguration. The caller ensures that 'queue_id' is less than
382 * The contents of 'details' should be documented as valid for 'ovs_name'
383 * in the "other_config" column in the "Queue" table in
384 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
386 * This function may be null if 'tc' does not have queues or its queues are
387 * not configurable. */
388 int (*class_set
)(struct netdev
*, unsigned int queue_id
,
389 const struct smap
*details
);
391 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
392 * tc_queue's within 'netdev->tc->queues'.
394 * This function may be null if 'tc' does not have queues or its queues
395 * cannot be deleted. */
396 int (*class_delete
)(struct netdev
*, struct tc_queue
*queue
);
398 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
399 * 'struct tc_queue's within 'netdev->tc->queues'.
401 * On success, initializes '*stats'.
403 * This function may be null if 'tc' does not have queues or if it cannot
404 * report queue statistics. */
405 int (*class_get_stats
)(const struct netdev
*netdev
,
406 const struct tc_queue
*queue
,
407 struct netdev_queue_stats
*stats
);
409 /* Extracts queue stats from 'nlmsg', which is a response to a
410 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
412 * This function may be null if 'tc' does not have queues or if it cannot
413 * report queue statistics. */
414 int (*class_dump_stats
)(const struct netdev
*netdev
,
415 const struct ofpbuf
*nlmsg
,
416 netdev_dump_queue_stats_cb
*cb
, void *aux
);
420 tc_init(struct tc
*tc
, const struct tc_ops
*ops
)
423 hmap_init(&tc
->queues
);
427 tc_destroy(struct tc
*tc
)
429 hmap_destroy(&tc
->queues
);
432 static const struct tc_ops tc_ops_htb
;
433 static const struct tc_ops tc_ops_hfsc
;
434 static const struct tc_ops tc_ops_codel
;
435 static const struct tc_ops tc_ops_fqcodel
;
436 static const struct tc_ops tc_ops_sfq
;
437 static const struct tc_ops tc_ops_default
;
438 static const struct tc_ops tc_ops_noop
;
439 static const struct tc_ops tc_ops_other
;
441 static const struct tc_ops
*const tcs
[] = {
442 &tc_ops_htb
, /* Hierarchy token bucket (see tc-htb(8)). */
443 &tc_ops_hfsc
, /* Hierarchical fair service curve. */
444 &tc_ops_codel
, /* Controlled delay */
445 &tc_ops_fqcodel
, /* Fair queue controlled delay */
446 &tc_ops_sfq
, /* Stochastic fair queueing */
447 &tc_ops_noop
, /* Non operating qos type. */
448 &tc_ops_default
, /* Default qdisc (see tc-pfifo_fast(8)). */
449 &tc_ops_other
, /* Some other qdisc. */
453 static unsigned int tc_ticks_to_bytes(unsigned int rate
, unsigned int ticks
);
454 static unsigned int tc_bytes_to_ticks(unsigned int rate
, unsigned int size
);
455 static unsigned int tc_buffer_per_jiffy(unsigned int rate
);
457 static struct tcmsg
*netdev_linux_tc_make_request(const struct netdev
*,
461 static int tc_add_policer(struct netdev
*,
462 uint32_t kbits_rate
, uint32_t kbits_burst
);
464 static int tc_parse_qdisc(const struct ofpbuf
*, const char **kind
,
465 struct nlattr
**options
);
466 static int tc_parse_class(const struct ofpbuf
*, unsigned int *queue_id
,
467 struct nlattr
**options
,
468 struct netdev_queue_stats
*);
469 static int tc_query_class(const struct netdev
*,
470 unsigned int handle
, unsigned int parent
,
471 struct ofpbuf
**replyp
);
472 static int tc_delete_class(const struct netdev
*, unsigned int handle
);
474 static int tc_del_qdisc(struct netdev
*netdev
);
475 static int tc_query_qdisc(const struct netdev
*netdev
);
477 static int tc_calc_cell_log(unsigned int mtu
);
478 static void tc_fill_rate(struct tc_ratespec
*rate
, uint64_t bps
, int mtu
);
479 static void tc_put_rtab(struct ofpbuf
*, uint16_t type
,
480 const struct tc_ratespec
*rate
);
481 static int tc_calc_buffer(unsigned int Bps
, int mtu
, uint64_t burst_bytes
);
483 struct netdev_linux
{
486 /* Protects all members below. */
487 struct ovs_mutex mutex
;
489 unsigned int cache_valid
;
491 bool miimon
; /* Link status of last poll. */
492 long long int miimon_interval
; /* Miimon Poll rate. Disabled if <= 0. */
493 struct timer miimon_timer
;
495 int netnsid
; /* Network namespace ID. */
496 /* The following are figured out "on demand" only. They are only valid
497 * when the corresponding VALID_* bit in 'cache_valid' is set. */
499 struct eth_addr etheraddr
;
501 unsigned int ifi_flags
;
502 long long int carrier_resets
;
503 uint32_t kbits_rate
; /* Policing data. */
504 uint32_t kbits_burst
;
505 int vport_stats_error
; /* Cached error code from vport_get_stats().
506 0 or an errno value. */
507 int netdev_mtu_error
; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
508 int ether_addr_error
; /* Cached error code from set/get etheraddr. */
509 int netdev_policing_error
; /* Cached error code from set policing. */
510 int get_features_error
; /* Cached error code from ETHTOOL_GSET. */
511 int get_ifindex_error
; /* Cached error code from SIOCGIFINDEX. */
513 enum netdev_features current
; /* Cached from ETHTOOL_GSET. */
514 enum netdev_features advertised
; /* Cached from ETHTOOL_GSET. */
515 enum netdev_features supported
; /* Cached from ETHTOOL_GSET. */
517 struct ethtool_drvinfo drvinfo
; /* Cached from ETHTOOL_GDRVINFO. */
520 /* For devices of class netdev_tap_class only. */
522 bool present
; /* If the device is present in the namespace */
523 uint64_t tx_dropped
; /* tap device can drop if the iface is down */
525 /* LAG information. */
526 bool is_lag_master
; /* True if the netdev is a LAG master. */
529 struct netdev_rxq_linux
{
530 struct netdev_rxq up
;
535 /* This is set pretty low because we probably won't learn anything from the
536 * additional log messages. */
537 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(5, 20);
539 /* Polling miimon status for all ports causes performance degradation when
540 * handling a large number of ports. If there are no devices using miimon, then
541 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
543 * Readers do not depend on this variable synchronizing with the related
544 * changes in the device miimon status, so we can use atomic_count. */
545 static atomic_count miimon_cnt
= ATOMIC_COUNT_INIT(0);
547 static void netdev_linux_run(const struct netdev_class
*);
549 static int netdev_linux_do_ethtool(const char *name
, struct ethtool_cmd
*,
550 int cmd
, const char *cmd_name
);
551 static int get_flags(const struct netdev
*, unsigned int *flags
);
552 static int set_flags(const char *, unsigned int flags
);
553 static int update_flags(struct netdev_linux
*netdev
, enum netdev_flags off
,
554 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
555 OVS_REQUIRES(netdev
->mutex
);
556 static int get_ifindex(const struct netdev
*, int *ifindexp
);
557 static int do_set_addr(struct netdev
*netdev
,
558 int ioctl_nr
, const char *ioctl_name
,
559 struct in_addr addr
);
560 static int get_etheraddr(const char *netdev_name
, struct eth_addr
*ea
);
561 static int set_etheraddr(const char *netdev_name
, const struct eth_addr
);
562 static int get_stats_via_netlink(const struct netdev
*, struct netdev_stats
*);
563 static int af_packet_sock(void);
564 static bool netdev_linux_miimon_enabled(void);
565 static void netdev_linux_miimon_run(void);
566 static void netdev_linux_miimon_wait(void);
567 static int netdev_linux_get_mtu__(struct netdev_linux
*netdev
, int *mtup
);
570 is_netdev_linux_class(const struct netdev_class
*netdev_class
)
572 return netdev_class
->run
== netdev_linux_run
;
576 is_tap_netdev(const struct netdev
*netdev
)
578 return netdev_get_class(netdev
) == &netdev_tap_class
;
581 static struct netdev_linux
*
582 netdev_linux_cast(const struct netdev
*netdev
)
584 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev
)));
586 return CONTAINER_OF(netdev
, struct netdev_linux
, up
);
589 static struct netdev_rxq_linux
*
590 netdev_rxq_linux_cast(const struct netdev_rxq
*rx
)
592 ovs_assert(is_netdev_linux_class(netdev_get_class(rx
->netdev
)));
593 return CONTAINER_OF(rx
, struct netdev_rxq_linux
, up
);
597 netdev_linux_netnsid_update__(struct netdev_linux
*netdev
)
599 struct dpif_netlink_vport reply
;
603 error
= dpif_netlink_vport_get(netdev_get_name(&netdev
->up
), &reply
, &buf
);
605 if (error
== ENOENT
) {
606 /* Assume it is local if there is no API (e.g. if the openvswitch
607 * kernel module is not loaded). */
608 netnsid_set_local(&netdev
->netnsid
);
610 netnsid_unset(&netdev
->netnsid
);
615 netnsid_set(&netdev
->netnsid
, reply
.netnsid
);
621 netdev_linux_netnsid_update(struct netdev_linux
*netdev
)
623 if (netnsid_is_unset(netdev
->netnsid
)) {
624 if (netdev_get_class(&netdev
->up
) == &netdev_tap_class
) {
625 netnsid_set_local(&netdev
->netnsid
);
627 return netdev_linux_netnsid_update__(netdev
);
635 netdev_linux_netnsid_is_eq(struct netdev_linux
*netdev
, int nsid
)
637 netdev_linux_netnsid_update(netdev
);
638 return netnsid_eq(netdev
->netnsid
, nsid
);
642 netdev_linux_netnsid_is_remote(struct netdev_linux
*netdev
)
644 netdev_linux_netnsid_update(netdev
);
645 return netnsid_is_remote(netdev
->netnsid
);
648 static int netdev_linux_update_via_netlink(struct netdev_linux
*);
649 static void netdev_linux_update(struct netdev_linux
*netdev
, int,
650 const struct rtnetlink_change
*)
651 OVS_REQUIRES(netdev
->mutex
);
652 static void netdev_linux_changed(struct netdev_linux
*netdev
,
653 unsigned int ifi_flags
, unsigned int mask
)
654 OVS_REQUIRES(netdev
->mutex
);
656 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
657 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
658 * if no such socket could be created. */
659 static struct nl_sock
*
660 netdev_linux_notify_sock(void)
662 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
663 static struct nl_sock
*sock
;
664 unsigned int mcgroups
[] = {RTNLGRP_LINK
, RTNLGRP_IPV4_IFADDR
,
665 RTNLGRP_IPV6_IFADDR
, RTNLGRP_IPV6_IFINFO
};
667 if (ovsthread_once_start(&once
)) {
670 error
= nl_sock_create(NETLINK_ROUTE
, &sock
);
674 for (i
= 0; i
< ARRAY_SIZE(mcgroups
); i
++) {
675 error
= nl_sock_join_mcgroup(sock
, mcgroups
[i
]);
677 nl_sock_destroy(sock
);
683 nl_sock_listen_all_nsid(sock
, true);
684 ovsthread_once_done(&once
);
691 netdev_linux_miimon_enabled(void)
693 return atomic_count_get(&miimon_cnt
) > 0;
/* Returns true if rtnetlink device kind 'kind' names a Linux link
 * aggregation (LAG) master type, i.e. "bond" or "team"; false otherwise.
 *
 * (NOTE(review): the fragment under review had lost the return statements;
 * they are restored here so the function yields a value on every path.) */
static bool
netdev_linux_kind_is_lag(const char *kind)
{
    return !strcmp(kind, "bond") || !strcmp(kind, "team");
}
707 netdev_linux_update_lag(struct rtnetlink_change
*change
)
708 OVS_REQUIRES(lag_mutex
)
710 struct linux_lag_slave
*lag
;
712 if (!rtnetlink_type_is_rtnlgrp_link(change
->nlmsg_type
)) {
716 if (change
->slave
&& netdev_linux_kind_is_lag(change
->slave
)) {
717 lag
= shash_find_data(&lag_shash
, change
->ifname
);
720 struct netdev
*master_netdev
;
721 char master_name
[IFNAMSIZ
];
725 if_indextoname(change
->master_ifindex
, master_name
);
726 master_netdev
= netdev_from_name(master_name
);
727 if (!master_netdev
) {
731 if (is_netdev_linux_class(master_netdev
->netdev_class
)) {
732 block_id
= netdev_get_block_id(master_netdev
);
734 netdev_close(master_netdev
);
738 lag
= xmalloc(sizeof *lag
);
739 lag
->block_id
= block_id
;
740 lag
->node
= shash_add(&lag_shash
, change
->ifname
, lag
);
742 /* LAG master is linux netdev so add slave to same block. */
743 error
= tc_add_del_ingress_qdisc(change
->if_index
, true,
746 VLOG_WARN("failed to bind LAG slave to master's block");
747 shash_delete(&lag_shash
, lag
->node
);
752 netdev_close(master_netdev
);
754 } else if (change
->master_ifindex
== 0) {
755 /* Check if this was a lag slave that has been freed. */
756 lag
= shash_find_data(&lag_shash
, change
->ifname
);
759 tc_add_del_ingress_qdisc(change
->if_index
, false,
761 shash_delete(&lag_shash
, lag
->node
);
768 netdev_linux_run(const struct netdev_class
*netdev_class OVS_UNUSED
)
770 struct nl_sock
*sock
;
773 if (netdev_linux_miimon_enabled()) {
774 netdev_linux_miimon_run();
777 sock
= netdev_linux_notify_sock();
783 uint64_t buf_stub
[4096 / 8];
787 ofpbuf_use_stub(&buf
, buf_stub
, sizeof buf_stub
);
788 error
= nl_sock_recv(sock
, &buf
, &nsid
, false);
790 struct rtnetlink_change change
;
792 if (rtnetlink_parse(&buf
, &change
)) {
793 struct netdev
*netdev_
= NULL
;
794 char dev_name
[IFNAMSIZ
];
796 if (!change
.ifname
) {
797 change
.ifname
= if_indextoname(change
.if_index
, dev_name
);
801 netdev_
= netdev_from_name(change
.ifname
);
803 if (netdev_
&& is_netdev_linux_class(netdev_
->netdev_class
)) {
804 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
806 ovs_mutex_lock(&netdev
->mutex
);
807 netdev_linux_update(netdev
, nsid
, &change
);
808 ovs_mutex_unlock(&netdev
->mutex
);
810 else if (!netdev_
&& change
.ifname
) {
811 /* Netdev is not present in OvS but its master could be. */
812 ovs_mutex_lock(&lag_mutex
);
813 netdev_linux_update_lag(&change
);
814 ovs_mutex_unlock(&lag_mutex
);
816 netdev_close(netdev_
);
818 } else if (error
== ENOBUFS
) {
819 struct shash device_shash
;
820 struct shash_node
*node
;
824 shash_init(&device_shash
);
825 netdev_get_devices(&netdev_linux_class
, &device_shash
);
826 SHASH_FOR_EACH (node
, &device_shash
) {
827 struct netdev
*netdev_
= node
->data
;
828 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
831 ovs_mutex_lock(&netdev
->mutex
);
832 get_flags(netdev_
, &flags
);
833 netdev_linux_changed(netdev
, flags
, 0);
834 ovs_mutex_unlock(&netdev
->mutex
);
836 netdev_close(netdev_
);
838 shash_destroy(&device_shash
);
839 } else if (error
!= EAGAIN
) {
840 static struct vlog_rate_limit rll
= VLOG_RATE_LIMIT_INIT(1, 5);
841 VLOG_WARN_RL(&rll
, "error reading or parsing netlink (%s)",
842 ovs_strerror(error
));
849 netdev_linux_wait(const struct netdev_class
*netdev_class OVS_UNUSED
)
851 struct nl_sock
*sock
;
853 if (netdev_linux_miimon_enabled()) {
854 netdev_linux_miimon_wait();
856 sock
= netdev_linux_notify_sock();
858 nl_sock_wait(sock
, POLLIN
);
863 netdev_linux_changed(struct netdev_linux
*dev
,
864 unsigned int ifi_flags
, unsigned int mask
)
865 OVS_REQUIRES(dev
->mutex
)
867 netdev_change_seq_changed(&dev
->up
);
869 if ((dev
->ifi_flags
^ ifi_flags
) & IFF_RUNNING
) {
870 dev
->carrier_resets
++;
872 dev
->ifi_flags
= ifi_flags
;
874 dev
->cache_valid
&= mask
;
875 if (!(mask
& VALID_IN
)) {
876 netdev_get_addrs_list_flush();
881 netdev_linux_update__(struct netdev_linux
*dev
,
882 const struct rtnetlink_change
*change
)
883 OVS_REQUIRES(dev
->mutex
)
885 if (rtnetlink_type_is_rtnlgrp_link(change
->nlmsg_type
)) {
886 if (change
->nlmsg_type
== RTM_NEWLINK
) {
887 /* Keep drv-info, and ip addresses. */
888 netdev_linux_changed(dev
, change
->ifi_flags
,
889 VALID_DRVINFO
| VALID_IN
);
891 /* Update netdev from rtnl-change msg. */
893 dev
->mtu
= change
->mtu
;
894 dev
->cache_valid
|= VALID_MTU
;
895 dev
->netdev_mtu_error
= 0;
898 if (!eth_addr_is_zero(change
->mac
)) {
899 dev
->etheraddr
= change
->mac
;
900 dev
->cache_valid
|= VALID_ETHERADDR
;
901 dev
->ether_addr_error
= 0;
903 /* The mac addr has been changed, report it now. */
904 rtnetlink_report_link();
907 if (change
->master
&& netdev_linux_kind_is_lag(change
->master
)) {
908 dev
->is_lag_master
= true;
911 dev
->ifindex
= change
->if_index
;
912 dev
->cache_valid
|= VALID_IFINDEX
;
913 dev
->get_ifindex_error
= 0;
917 netdev_linux_changed(dev
, change
->ifi_flags
, 0);
918 dev
->present
= false;
919 netnsid_unset(&dev
->netnsid
);
921 } else if (rtnetlink_type_is_rtnlgrp_addr(change
->nlmsg_type
)) {
922 /* Invalidates in4, in6. */
923 netdev_linux_changed(dev
, dev
->ifi_flags
, ~VALID_IN
);
930 netdev_linux_update(struct netdev_linux
*dev
, int nsid
,
931 const struct rtnetlink_change
*change
)
932 OVS_REQUIRES(dev
->mutex
)
934 if (netdev_linux_netnsid_is_eq(dev
, nsid
)) {
935 netdev_linux_update__(dev
, change
);
939 static struct netdev
*
940 netdev_linux_alloc(void)
942 struct netdev_linux
*netdev
= xzalloc(sizeof *netdev
);
947 netdev_linux_common_construct(struct netdev
*netdev_
)
949 /* Prevent any attempt to create (or open) a network device named "default"
950 * or "all". These device names are effectively reserved on Linux because
951 * /proc/sys/net/ipv4/conf/ always contains directories by these names. By
952 * itself this wouldn't call for any special treatment, but in practice if
953 * a program tries to create devices with these names, it causes the kernel
954 * to fire a "new device" notification event even though creation failed,
955 * and in turn that causes OVS to wake up and try to create them again,
956 * which ends up as a 100% CPU loop. */
957 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
958 const char *name
= netdev_
->name
;
959 if (!strcmp(name
, "default") || !strcmp(name
, "all")) {
960 static struct vlog_rate_limit rll
= VLOG_RATE_LIMIT_INIT(1, 1);
961 VLOG_WARN_RL(&rll
, "%s: Linux forbids network device with this name",
966 /* The device could be in the same network namespace or in another one. */
967 netnsid_unset(&netdev
->netnsid
);
968 ovs_mutex_init(&netdev
->mutex
);
972 /* Creates system and internal devices. */
974 netdev_linux_construct(struct netdev
*netdev_
)
976 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
977 int error
= netdev_linux_common_construct(netdev_
);
982 error
= get_flags(&netdev
->up
, &netdev
->ifi_flags
);
983 if (error
== ENODEV
) {
984 if (netdev
->up
.netdev_class
!= &netdev_internal_class
) {
985 /* The device does not exist, so don't allow it to be opened. */
988 /* "Internal" netdevs have to be created as netdev objects before
989 * they exist in the kernel, because creating them in the kernel
990 * happens by passing a netdev object to dpif_port_add().
991 * Therefore, ignore the error. */
998 /* For most types of netdevs we open the device for each call of
999 * netdev_open(). However, this is not the case with tap devices,
1000 * since it is only possible to open the device once. In this
1001 * situation we share a single file descriptor, and consequently
1002 * buffers, across all readers. Therefore once data is read it will
1003 * be unavailable to other reads for tap devices. */
1005 netdev_linux_construct_tap(struct netdev
*netdev_
)
1007 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1008 static const char tap_dev
[] = "/dev/net/tun";
1009 const char *name
= netdev_
->name
;
1012 int error
= netdev_linux_common_construct(netdev_
);
1017 /* Open tap device. */
1018 netdev
->tap_fd
= open(tap_dev
, O_RDWR
);
1019 if (netdev
->tap_fd
< 0) {
1021 VLOG_WARN("opening \"%s\" failed: %s", tap_dev
, ovs_strerror(error
));
1025 /* Create tap device. */
1026 get_flags(&netdev
->up
, &netdev
->ifi_flags
);
1027 ifr
.ifr_flags
= IFF_TAP
| IFF_NO_PI
;
1028 ovs_strzcpy(ifr
.ifr_name
, name
, sizeof ifr
.ifr_name
);
1029 if (ioctl(netdev
->tap_fd
, TUNSETIFF
, &ifr
) == -1) {
1030 VLOG_WARN("%s: creating tap device failed: %s", name
,
1031 ovs_strerror(errno
));
1036 /* Make non-blocking. */
1037 error
= set_nonblocking(netdev
->tap_fd
);
1042 if (ioctl(netdev
->tap_fd
, TUNSETPERSIST
, 1)) {
1043 VLOG_WARN("%s: creating tap device failed (persist): %s", name
,
1044 ovs_strerror(errno
));
1049 netdev
->present
= true;
1053 close(netdev
->tap_fd
);
1058 netdev_linux_destruct(struct netdev
*netdev_
)
1060 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1062 if (netdev
->tc
&& netdev
->tc
->ops
->tc_destroy
) {
1063 netdev
->tc
->ops
->tc_destroy(netdev
->tc
);
1066 if (netdev_get_class(netdev_
) == &netdev_tap_class
1067 && netdev
->tap_fd
>= 0)
1069 ioctl(netdev
->tap_fd
, TUNSETPERSIST
, 0);
1070 close(netdev
->tap_fd
);
1073 if (netdev
->miimon_interval
> 0) {
1074 atomic_count_dec(&miimon_cnt
);
1077 ovs_mutex_destroy(&netdev
->mutex
);
/* Frees the memory backing 'netdev_'. */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    free(netdev);
}
1087 static struct netdev_rxq
*
1088 netdev_linux_rxq_alloc(void)
1090 struct netdev_rxq_linux
*rx
= xzalloc(sizeof *rx
);
1095 netdev_linux_rxq_construct(struct netdev_rxq
*rxq_
)
1097 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1098 struct netdev
*netdev_
= rx
->up
.netdev
;
1099 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1102 ovs_mutex_lock(&netdev
->mutex
);
1103 rx
->is_tap
= is_tap_netdev(netdev_
);
1105 rx
->fd
= netdev
->tap_fd
;
1107 struct sockaddr_ll sll
;
1109 /* Result of tcpdump -dd inbound */
1110 static const struct sock_filter filt
[] = {
1111 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
1112 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
1113 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
1114 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
1116 static const struct sock_fprog fprog
= {
1117 ARRAY_SIZE(filt
), (struct sock_filter
*) filt
1120 /* Create file descriptor. */
1121 rx
->fd
= socket(PF_PACKET
, SOCK_RAW
, 0);
1124 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error
));
1129 if (setsockopt(rx
->fd
, SOL_PACKET
, PACKET_AUXDATA
, &val
, sizeof val
)) {
1131 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
1132 netdev_get_name(netdev_
), ovs_strerror(error
));
1136 /* Set non-blocking mode. */
1137 error
= set_nonblocking(rx
->fd
);
1142 /* Get ethernet device index. */
1143 error
= get_ifindex(&netdev
->up
, &ifindex
);
1148 /* Bind to specific ethernet device. */
1149 memset(&sll
, 0, sizeof sll
);
1150 sll
.sll_family
= AF_PACKET
;
1151 sll
.sll_ifindex
= ifindex
;
1152 sll
.sll_protocol
= htons(ETH_P_ALL
);
1153 if (bind(rx
->fd
, (struct sockaddr
*) &sll
, sizeof sll
) < 0) {
1155 VLOG_ERR("%s: failed to bind raw socket (%s)",
1156 netdev_get_name(netdev_
), ovs_strerror(error
));
1160 /* Filter for only inbound packets. */
1161 error
= setsockopt(rx
->fd
, SOL_SOCKET
, SO_ATTACH_FILTER
, &fprog
,
1165 VLOG_ERR("%s: failed to attach filter (%s)",
1166 netdev_get_name(netdev_
), ovs_strerror(error
));
1170 ovs_mutex_unlock(&netdev
->mutex
);
1178 ovs_mutex_unlock(&netdev
->mutex
);
1183 netdev_linux_rxq_destruct(struct netdev_rxq
*rxq_
)
1185 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
/* Frees the memory backing 'rxq_'. */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);

    free(rx);
}
1201 auxdata_to_vlan_tpid(const struct tpacket_auxdata
*aux
, bool double_tagged
)
1203 if (aux
->tp_status
& TP_STATUS_VLAN_TPID_VALID
) {
1204 return htons(aux
->tp_vlan_tpid
);
1205 } else if (double_tagged
) {
1206 return htons(ETH_TYPE_VLAN_8021AD
);
1208 return htons(ETH_TYPE_VLAN_8021Q
);
/* Returns true if 'aux' carries a VLAN TCI stripped by the kernel: either a
 * nonzero TCI, or a zero TCI explicitly flagged valid (priority-tagged). */
static bool
auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
{
    if (aux->tp_vlan_tci) {
        return true;
    }
    return (aux->tp_status & TP_STATUS_VLAN_VALID) != 0;
}
1219 netdev_linux_rxq_recv_sock(int fd
, struct dp_packet
*buffer
)
1224 struct cmsghdr
*cmsg
;
1226 struct cmsghdr cmsg
;
1227 char buffer
[CMSG_SPACE(sizeof(struct tpacket_auxdata
))];
1231 /* Reserve headroom for a single VLAN tag */
1232 dp_packet_reserve(buffer
, VLAN_HEADER_LEN
);
1233 size
= dp_packet_tailroom(buffer
);
1235 iov
.iov_base
= dp_packet_data(buffer
);
1237 msgh
.msg_name
= NULL
;
1238 msgh
.msg_namelen
= 0;
1239 msgh
.msg_iov
= &iov
;
1240 msgh
.msg_iovlen
= 1;
1241 msgh
.msg_control
= &cmsg_buffer
;
1242 msgh
.msg_controllen
= sizeof cmsg_buffer
;
1246 retval
= recvmsg(fd
, &msgh
, MSG_TRUNC
);
1247 } while (retval
< 0 && errno
== EINTR
);
1251 } else if (retval
> size
) {
1255 dp_packet_set_size(buffer
, dp_packet_size(buffer
) + retval
);
1257 for (cmsg
= CMSG_FIRSTHDR(&msgh
); cmsg
; cmsg
= CMSG_NXTHDR(&msgh
, cmsg
)) {
1258 const struct tpacket_auxdata
*aux
;
1260 if (cmsg
->cmsg_level
!= SOL_PACKET
1261 || cmsg
->cmsg_type
!= PACKET_AUXDATA
1262 || cmsg
->cmsg_len
< CMSG_LEN(sizeof(struct tpacket_auxdata
))) {
1266 aux
= ALIGNED_CAST(struct tpacket_auxdata
*, CMSG_DATA(cmsg
));
1267 if (auxdata_has_vlan_tci(aux
)) {
1268 struct eth_header
*eth
;
1271 if (retval
< ETH_HEADER_LEN
) {
1275 eth
= dp_packet_data(buffer
);
1276 double_tagged
= eth
->eth_type
== htons(ETH_TYPE_VLAN_8021Q
);
1278 eth_push_vlan(buffer
, auxdata_to_vlan_tpid(aux
, double_tagged
),
1279 htons(aux
->tp_vlan_tci
));
/* Reads one frame from tap fd 'fd' into the tailroom of 'buffer'.
 * Returns 0 on success, otherwise a positive errno from read(). */
static int
netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
{
    ssize_t retval;
    size_t size = dp_packet_tailroom(buffer);

    do {
        retval = read(fd, dp_packet_data(buffer), size);
    } while (retval < 0 && errno == EINTR);

    if (retval < 0) {
        return errno;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
    return 0;
}
/* Receives one packet from 'rxq_' into a freshly allocated dp_packet and, on
 * success, initializes 'batch' with it.  Sizes the buffer from the device MTU
 * (falling back to ETH_PAYLOAD_MAX when the MTU cannot be read) and reads via
 * the tap fd or the raw socket depending on rx->is_tap.
 * NOTE(review): this extract is line-mangled and drops some original lines,
 * including the tail of the parameter list after 'batch' (upstream adds an
 * 'int *qfill' out-parameter here) and parts of the error plumbing -- confirm
 * against the complete file before editing. */
1306 netdev_linux_rxq_recv(struct netdev_rxq
*rxq_
, struct dp_packet_batch
*batch
,
1309 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1310 struct netdev
*netdev
= rx
->up
.netdev
;
1311 struct dp_packet
*buffer
;
/* MTU lookup failure falls back to the standard Ethernet payload size. */
1315 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
)) {
1316 mtu
= ETH_PAYLOAD_MAX
;
1319 /* Assume Ethernet port. No need to set packet_type. */
1320 buffer
= dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN
+ mtu
,
1321 DP_NETDEV_HEADROOM
);
1322 retval
= (rx
->is_tap
1323 ? netdev_linux_rxq_recv_tap(rx
->fd
, buffer
)
1324 : netdev_linux_rxq_recv_sock(rx
->fd
, buffer
));
/* EAGAIN (no data) and EMSGSIZE (truncated frame) are expected; only other
 * errors are rate-limit logged.  The buffer is freed on any failure. */
1327 if (retval
!= EAGAIN
&& retval
!= EMSGSIZE
) {
1328 VLOG_WARN_RL(&rl
, "error receiving Ethernet packet on %s: %s",
1329 netdev_rxq_get_name(rxq_
), ovs_strerror(errno
));
1331 dp_packet_delete(buffer
);
1333 dp_packet_batch_init_packet(batch
, buffer
);
1344 netdev_linux_rxq_wait(struct netdev_rxq
*rxq_
)
1346 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1347 poll_fd_wait(rx
->fd
, POLLIN
);
1351 netdev_linux_rxq_drain(struct netdev_rxq
*rxq_
)
1353 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1356 int error
= af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_
), &ifr
,
1357 SIOCGIFTXQLEN
, "SIOCGIFTXQLEN");
1361 drain_fd(rx
->fd
, ifr
.ifr_qlen
);
1364 return drain_rcvbuf(rx
->fd
);
1369 netdev_linux_sock_batch_send(int sock
, int ifindex
,
1370 struct dp_packet_batch
*batch
)
1372 const size_t size
= dp_packet_batch_size(batch
);
1373 /* We don't bother setting most fields in sockaddr_ll because the
1374 * kernel ignores them for SOCK_RAW. */
1375 struct sockaddr_ll sll
= { .sll_family
= AF_PACKET
,
1376 .sll_ifindex
= ifindex
};
1378 struct mmsghdr
*mmsg
= xmalloc(sizeof(*mmsg
) * size
);
1379 struct iovec
*iov
= xmalloc(sizeof(*iov
) * size
);
1381 struct dp_packet
*packet
;
1382 DP_PACKET_BATCH_FOR_EACH (i
, packet
, batch
) {
1383 iov
[i
].iov_base
= dp_packet_data(packet
);
1384 iov
[i
].iov_len
= dp_packet_size(packet
);
1385 mmsg
[i
].msg_hdr
= (struct msghdr
) { .msg_name
= &sll
,
1386 .msg_namelen
= sizeof sll
,
1392 for (uint32_t ofs
= 0; ofs
< size
; ) {
1395 retval
= sendmmsg(sock
, mmsg
+ ofs
, size
- ofs
, 0);
1396 error
= retval
< 0 ? errno
: 0;
1397 } while (error
== EINTR
);
1409 /* Use the tap fd to send 'batch' to tap device 'netdev'. Using the tap fd is
1410 * essential, because packets sent to a tap device with an AF_PACKET socket
1411 * will loop back to be *received* again on the tap device. This doesn't occur
1412 * on other interface types because we attach a socket filter to the rx
1415 netdev_linux_tap_batch_send(struct netdev
*netdev_
,
1416 struct dp_packet_batch
*batch
)
1418 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1419 struct dp_packet
*packet
;
1421 /* The Linux tap driver returns EIO if the device is not up,
1422 * so if the device is not up, don't waste time sending it.
1423 * However, if the device is in another network namespace
1424 * then OVS can't retrieve the state. In that case, send the
1425 * packets anyway. */
1426 if (netdev
->present
&& !(netdev
->ifi_flags
& IFF_UP
)) {
1427 netdev
->tx_dropped
+= dp_packet_batch_size(batch
);
1431 DP_PACKET_BATCH_FOR_EACH (i
, packet
, batch
) {
1432 size_t size
= dp_packet_size(packet
);
1437 retval
= write(netdev
->tap_fd
, dp_packet_data(packet
), size
);
1438 error
= retval
< 0 ? errno
: 0;
1439 } while (error
== EINTR
);
1442 /* The Linux tap driver returns EIO if the device is not up. From
1443 * the OVS side this is not an error, so we ignore it; otherwise,
1444 * return the erro. */
1448 } else if (retval
!= size
) {
1449 VLOG_WARN_RL(&rl
, "sent partial Ethernet packet (%"PRIuSIZE
" "
1450 "bytes of %"PRIuSIZE
") on %s",
1451 retval
, size
, netdev_get_name(netdev_
));
1458 /* Sends 'batch' on 'netdev'. Returns 0 if successful, otherwise a positive
1459 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1460 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1461 * the packet is too big or too small to transmit on the device.
1463 * The kernel maintains a packet transmission queue, so the caller is not
1464 * expected to do additional queuing of packets. */
1466 netdev_linux_send(struct netdev
*netdev_
, int qid OVS_UNUSED
,
1467 struct dp_packet_batch
*batch
,
1468 bool concurrent_txq OVS_UNUSED
)
1473 if (!is_tap_netdev(netdev_
)) {
1474 if (netdev_linux_netnsid_is_remote(netdev_linux_cast(netdev_
))) {
1479 sock
= af_packet_sock();
1485 int ifindex
= netdev_get_ifindex(netdev_
);
1491 error
= netdev_linux_sock_batch_send(sock
, ifindex
, batch
);
1493 error
= netdev_linux_tap_batch_send(netdev_
, batch
);
1496 if (error
== ENOBUFS
) {
1497 /* The Linux AF_PACKET implementation never blocks waiting
1498 * for room for packets, instead returning ENOBUFS.
1499 * Translate this into EAGAIN for the caller. */
1502 VLOG_WARN_RL(&rl
, "error sending Ethernet packet on %s: %s",
1503 netdev_get_name(netdev_
), ovs_strerror(error
));
1508 dp_packet_delete_batch(batch
, true);
1512 /* Registers with the poll loop to wake up from the next call to poll_block()
1513 * when the packet transmission queue has sufficient room to transmit a packet
1514 * with netdev_send().
1516 * The kernel maintains a packet transmission queue, so the client is not
1517 * expected to do additional queuing of packets. Thus, this function is
1518 * unlikely to ever be used. It is included for completeness. */
1520 netdev_linux_send_wait(struct netdev
*netdev
, int qid OVS_UNUSED
)
1522 if (is_tap_netdev(netdev
)) {
1523 /* TAP device always accepts packets.*/
1524 poll_immediate_wake();
1528 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1529 * otherwise a positive errno value. */
1531 netdev_linux_set_etheraddr(struct netdev
*netdev_
, const struct eth_addr mac
)
1533 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1534 enum netdev_flags old_flags
= 0;
1537 ovs_mutex_lock(&netdev
->mutex
);
1538 if (netdev_linux_netnsid_is_remote(netdev
)) {
1543 if (netdev
->cache_valid
& VALID_ETHERADDR
) {
1544 error
= netdev
->ether_addr_error
;
1545 if (error
|| eth_addr_equals(netdev
->etheraddr
, mac
)) {
1548 netdev
->cache_valid
&= ~VALID_ETHERADDR
;
1551 /* Tap devices must be brought down before setting the address. */
1552 if (is_tap_netdev(netdev_
)) {
1553 update_flags(netdev
, NETDEV_UP
, 0, &old_flags
);
1555 error
= set_etheraddr(netdev_get_name(netdev_
), mac
);
1556 if (!error
|| error
== ENODEV
) {
1557 netdev
->ether_addr_error
= error
;
1558 netdev
->cache_valid
|= VALID_ETHERADDR
;
1560 netdev
->etheraddr
= mac
;
1564 if (is_tap_netdev(netdev_
) && old_flags
& NETDEV_UP
) {
1565 update_flags(netdev
, 0, NETDEV_UP
, &old_flags
);
1569 ovs_mutex_unlock(&netdev
->mutex
);
1573 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1575 netdev_linux_get_etheraddr(const struct netdev
*netdev_
, struct eth_addr
*mac
)
1577 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1580 ovs_mutex_lock(&netdev
->mutex
);
1581 if (!(netdev
->cache_valid
& VALID_ETHERADDR
)) {
1582 netdev_linux_update_via_netlink(netdev
);
1585 if (!(netdev
->cache_valid
& VALID_ETHERADDR
)) {
1586 /* Fall back to ioctl if netlink fails */
1587 netdev
->ether_addr_error
= get_etheraddr(netdev_get_name(netdev_
),
1588 &netdev
->etheraddr
);
1589 netdev
->cache_valid
|= VALID_ETHERADDR
;
1592 error
= netdev
->ether_addr_error
;
1594 *mac
= netdev
->etheraddr
;
1596 ovs_mutex_unlock(&netdev
->mutex
);
1602 netdev_linux_get_mtu__(struct netdev_linux
*netdev
, int *mtup
)
1606 if (!(netdev
->cache_valid
& VALID_MTU
)) {
1607 netdev_linux_update_via_netlink(netdev
);
1610 if (!(netdev
->cache_valid
& VALID_MTU
)) {
1611 /* Fall back to ioctl if netlink fails */
1614 netdev
->netdev_mtu_error
= af_inet_ifreq_ioctl(
1615 netdev_get_name(&netdev
->up
), &ifr
, SIOCGIFMTU
, "SIOCGIFMTU");
1616 netdev
->mtu
= ifr
.ifr_mtu
;
1617 netdev
->cache_valid
|= VALID_MTU
;
1620 error
= netdev
->netdev_mtu_error
;
1622 *mtup
= netdev
->mtu
;
1628 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1629 * in bytes, not including the hardware header; thus, this is typically 1500
1630 * bytes for Ethernet devices. */
1632 netdev_linux_get_mtu(const struct netdev
*netdev_
, int *mtup
)
1634 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1637 ovs_mutex_lock(&netdev
->mutex
);
1638 error
= netdev_linux_get_mtu__(netdev
, mtup
);
1639 ovs_mutex_unlock(&netdev
->mutex
);
1644 /* Sets the maximum size of transmitted (MTU) for given device using linux
1645 * networking ioctl interface.
1648 netdev_linux_set_mtu(struct netdev
*netdev_
, int mtu
)
1650 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1654 ovs_mutex_lock(&netdev
->mutex
);
1655 if (netdev_linux_netnsid_is_remote(netdev
)) {
1660 if (netdev
->cache_valid
& VALID_MTU
) {
1661 error
= netdev
->netdev_mtu_error
;
1662 if (error
|| netdev
->mtu
== mtu
) {
1665 netdev
->cache_valid
&= ~VALID_MTU
;
1668 error
= af_inet_ifreq_ioctl(netdev_get_name(netdev_
), &ifr
,
1669 SIOCSIFMTU
, "SIOCSIFMTU");
1670 if (!error
|| error
== ENODEV
) {
1671 netdev
->netdev_mtu_error
= error
;
1672 netdev
->mtu
= ifr
.ifr_mtu
;
1673 netdev
->cache_valid
|= VALID_MTU
;
1676 ovs_mutex_unlock(&netdev
->mutex
);
1680 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1681 * On failure, returns a negative errno value. */
1683 netdev_linux_get_ifindex(const struct netdev
*netdev_
)
1685 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1688 ovs_mutex_lock(&netdev
->mutex
);
1689 if (netdev_linux_netnsid_is_remote(netdev
)) {
1693 error
= get_ifindex(netdev_
, &ifindex
);
1696 ovs_mutex_unlock(&netdev
->mutex
);
1697 return error
? -error
: ifindex
;
1701 netdev_linux_get_carrier(const struct netdev
*netdev_
, bool *carrier
)
1703 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1705 ovs_mutex_lock(&netdev
->mutex
);
1706 if (netdev
->miimon_interval
> 0) {
1707 *carrier
= netdev
->miimon
;
1709 *carrier
= (netdev
->ifi_flags
& IFF_RUNNING
) != 0;
1711 ovs_mutex_unlock(&netdev
->mutex
);
1716 static long long int
1717 netdev_linux_get_carrier_resets(const struct netdev
*netdev_
)
1719 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1720 long long int carrier_resets
;
1722 ovs_mutex_lock(&netdev
->mutex
);
1723 carrier_resets
= netdev
->carrier_resets
;
1724 ovs_mutex_unlock(&netdev
->mutex
);
1726 return carrier_resets
;
1730 netdev_linux_do_miimon(const char *name
, int cmd
, const char *cmd_name
,
1731 struct mii_ioctl_data
*data
)
1736 memset(&ifr
, 0, sizeof ifr
);
1737 memcpy(&ifr
.ifr_data
, data
, sizeof *data
);
1738 error
= af_inet_ifreq_ioctl(name
, &ifr
, cmd
, cmd_name
);
1739 memcpy(data
, &ifr
.ifr_data
, sizeof *data
);
1745 netdev_linux_get_miimon(const char *name
, bool *miimon
)
1747 struct mii_ioctl_data data
;
1752 memset(&data
, 0, sizeof data
);
1753 error
= netdev_linux_do_miimon(name
, SIOCGMIIPHY
, "SIOCGMIIPHY", &data
);
1755 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1756 data
.reg_num
= MII_BMSR
;
1757 error
= netdev_linux_do_miimon(name
, SIOCGMIIREG
, "SIOCGMIIREG",
1761 *miimon
= !!(data
.val_out
& BMSR_LSTATUS
);
1765 struct ethtool_cmd ecmd
;
1767 VLOG_DBG_RL(&rl
, "%s: failed to query MII, falling back to ethtool",
1770 COVERAGE_INC(netdev_get_ethtool
);
1771 memset(&ecmd
, 0, sizeof ecmd
);
1772 error
= netdev_linux_do_ethtool(name
, &ecmd
, ETHTOOL_GLINK
,
1775 struct ethtool_value eval
;
1777 memcpy(&eval
, &ecmd
, sizeof eval
);
1778 *miimon
= !!eval
.data
;
1780 VLOG_WARN_RL(&rl
, "%s: ethtool link status failed", name
);
1788 netdev_linux_set_miimon_interval(struct netdev
*netdev_
,
1789 long long int interval
)
1791 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1793 ovs_mutex_lock(&netdev
->mutex
);
1794 interval
= interval
> 0 ? MAX(interval
, 100) : 0;
1795 if (netdev
->miimon_interval
!= interval
) {
1796 if (interval
&& !netdev
->miimon_interval
) {
1797 atomic_count_inc(&miimon_cnt
);
1798 } else if (!interval
&& netdev
->miimon_interval
) {
1799 atomic_count_dec(&miimon_cnt
);
1802 netdev
->miimon_interval
= interval
;
1803 timer_set_expired(&netdev
->miimon_timer
);
1805 ovs_mutex_unlock(&netdev
->mutex
);
1811 netdev_linux_miimon_run(void)
1813 struct shash device_shash
;
1814 struct shash_node
*node
;
1816 shash_init(&device_shash
);
1817 netdev_get_devices(&netdev_linux_class
, &device_shash
);
1818 SHASH_FOR_EACH (node
, &device_shash
) {
1819 struct netdev
*netdev
= node
->data
;
1820 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
1823 ovs_mutex_lock(&dev
->mutex
);
1824 if (dev
->miimon_interval
> 0 && timer_expired(&dev
->miimon_timer
)) {
1825 netdev_linux_get_miimon(dev
->up
.name
, &miimon
);
1826 if (miimon
!= dev
->miimon
) {
1827 dev
->miimon
= miimon
;
1828 netdev_linux_changed(dev
, dev
->ifi_flags
, 0);
1831 timer_set_duration(&dev
->miimon_timer
, dev
->miimon_interval
);
1833 ovs_mutex_unlock(&dev
->mutex
);
1834 netdev_close(netdev
);
1837 shash_destroy(&device_shash
);
1841 netdev_linux_miimon_wait(void)
1843 struct shash device_shash
;
1844 struct shash_node
*node
;
1846 shash_init(&device_shash
);
1847 netdev_get_devices(&netdev_linux_class
, &device_shash
);
1848 SHASH_FOR_EACH (node
, &device_shash
) {
1849 struct netdev
*netdev
= node
->data
;
1850 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
1852 ovs_mutex_lock(&dev
->mutex
);
1853 if (dev
->miimon_interval
> 0) {
1854 timer_wait(&dev
->miimon_timer
);
1856 ovs_mutex_unlock(&dev
->mutex
);
1857 netdev_close(netdev
);
1859 shash_destroy(&device_shash
);
/* Exchanges the values of '*a' and '*b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t tmp = *a;
    *a = *b;
    *b = tmp;
}
1870 /* Copies 'src' into 'dst', performing format conversion in the process.
1872 * 'src' is allowed to be misaligned. */
1874 netdev_stats_from_ovs_vport_stats(struct netdev_stats
*dst
,
1875 const struct ovs_vport_stats
*src
)
1877 dst
->rx_packets
= get_32aligned_u64(&src
->rx_packets
);
1878 dst
->tx_packets
= get_32aligned_u64(&src
->tx_packets
);
1879 dst
->rx_bytes
= get_32aligned_u64(&src
->rx_bytes
);
1880 dst
->tx_bytes
= get_32aligned_u64(&src
->tx_bytes
);
1881 dst
->rx_errors
= get_32aligned_u64(&src
->rx_errors
);
1882 dst
->tx_errors
= get_32aligned_u64(&src
->tx_errors
);
1883 dst
->rx_dropped
= get_32aligned_u64(&src
->rx_dropped
);
1884 dst
->tx_dropped
= get_32aligned_u64(&src
->tx_dropped
);
1886 dst
->collisions
= 0;
1887 dst
->rx_length_errors
= 0;
1888 dst
->rx_over_errors
= 0;
1889 dst
->rx_crc_errors
= 0;
1890 dst
->rx_frame_errors
= 0;
1891 dst
->rx_fifo_errors
= 0;
1892 dst
->rx_missed_errors
= 0;
1893 dst
->tx_aborted_errors
= 0;
1894 dst
->tx_carrier_errors
= 0;
1895 dst
->tx_fifo_errors
= 0;
1896 dst
->tx_heartbeat_errors
= 0;
1897 dst
->tx_window_errors
= 0;
1901 get_stats_via_vport__(const struct netdev
*netdev
, struct netdev_stats
*stats
)
1903 struct dpif_netlink_vport reply
;
1907 error
= dpif_netlink_vport_get(netdev_get_name(netdev
), &reply
, &buf
);
1910 } else if (!reply
.stats
) {
1915 netdev_stats_from_ovs_vport_stats(stats
, reply
.stats
);
1923 get_stats_via_vport(const struct netdev
*netdev_
,
1924 struct netdev_stats
*stats
)
1926 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1928 if (!netdev
->vport_stats_error
||
1929 !(netdev
->cache_valid
& VALID_VPORT_STAT_ERROR
)) {
1932 error
= get_stats_via_vport__(netdev_
, stats
);
1933 if (error
&& error
!= ENOENT
&& error
!= ENODEV
) {
1934 VLOG_WARN_RL(&rl
, "%s: obtaining netdev stats via vport failed "
1936 netdev_get_name(netdev_
), ovs_strerror(error
));
1938 netdev
->vport_stats_error
= error
;
1939 netdev
->cache_valid
|= VALID_VPORT_STAT_ERROR
;
1943 /* Retrieves current device stats for 'netdev-linux'. */
1945 netdev_linux_get_stats(const struct netdev
*netdev_
,
1946 struct netdev_stats
*stats
)
1948 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1949 struct netdev_stats dev_stats
;
1952 ovs_mutex_lock(&netdev
->mutex
);
1953 get_stats_via_vport(netdev_
, stats
);
1954 error
= get_stats_via_netlink(netdev_
, &dev_stats
);
1956 if (!netdev
->vport_stats_error
) {
1959 } else if (netdev
->vport_stats_error
) {
1960 /* stats not available from OVS then use netdev stats. */
1963 /* Use kernel netdev's packet and byte counts since vport's counters
1964 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1966 stats
->rx_packets
= dev_stats
.rx_packets
;
1967 stats
->rx_bytes
= dev_stats
.rx_bytes
;
1968 stats
->tx_packets
= dev_stats
.tx_packets
;
1969 stats
->tx_bytes
= dev_stats
.tx_bytes
;
1971 stats
->rx_errors
+= dev_stats
.rx_errors
;
1972 stats
->tx_errors
+= dev_stats
.tx_errors
;
1973 stats
->rx_dropped
+= dev_stats
.rx_dropped
;
1974 stats
->tx_dropped
+= dev_stats
.tx_dropped
;
1975 stats
->multicast
+= dev_stats
.multicast
;
1976 stats
->collisions
+= dev_stats
.collisions
;
1977 stats
->rx_length_errors
+= dev_stats
.rx_length_errors
;
1978 stats
->rx_over_errors
+= dev_stats
.rx_over_errors
;
1979 stats
->rx_crc_errors
+= dev_stats
.rx_crc_errors
;
1980 stats
->rx_frame_errors
+= dev_stats
.rx_frame_errors
;
1981 stats
->rx_fifo_errors
+= dev_stats
.rx_fifo_errors
;
1982 stats
->rx_missed_errors
+= dev_stats
.rx_missed_errors
;
1983 stats
->tx_aborted_errors
+= dev_stats
.tx_aborted_errors
;
1984 stats
->tx_carrier_errors
+= dev_stats
.tx_carrier_errors
;
1985 stats
->tx_fifo_errors
+= dev_stats
.tx_fifo_errors
;
1986 stats
->tx_heartbeat_errors
+= dev_stats
.tx_heartbeat_errors
;
1987 stats
->tx_window_errors
+= dev_stats
.tx_window_errors
;
1989 ovs_mutex_unlock(&netdev
->mutex
);
1994 /* Retrieves current device stats for 'netdev-tap' netdev or
1995 * netdev-internal. */
1997 netdev_tap_get_stats(const struct netdev
*netdev_
, struct netdev_stats
*stats
)
1999 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2000 struct netdev_stats dev_stats
;
2003 ovs_mutex_lock(&netdev
->mutex
);
2004 get_stats_via_vport(netdev_
, stats
);
2005 error
= get_stats_via_netlink(netdev_
, &dev_stats
);
2007 if (!netdev
->vport_stats_error
) {
2010 } else if (netdev
->vport_stats_error
) {
2011 /* Transmit and receive stats will appear to be swapped relative to the
2012 * other ports since we are the one sending the data, not a remote
2013 * computer. For consistency, we swap them back here. This does not
2014 * apply if we are getting stats from the vport layer because it always
2015 * tracks stats from the perspective of the switch. */
2018 swap_uint64(&stats
->rx_packets
, &stats
->tx_packets
);
2019 swap_uint64(&stats
->rx_bytes
, &stats
->tx_bytes
);
2020 swap_uint64(&stats
->rx_errors
, &stats
->tx_errors
);
2021 swap_uint64(&stats
->rx_dropped
, &stats
->tx_dropped
);
2022 stats
->rx_length_errors
= 0;
2023 stats
->rx_over_errors
= 0;
2024 stats
->rx_crc_errors
= 0;
2025 stats
->rx_frame_errors
= 0;
2026 stats
->rx_fifo_errors
= 0;
2027 stats
->rx_missed_errors
= 0;
2028 stats
->tx_aborted_errors
= 0;
2029 stats
->tx_carrier_errors
= 0;
2030 stats
->tx_fifo_errors
= 0;
2031 stats
->tx_heartbeat_errors
= 0;
2032 stats
->tx_window_errors
= 0;
2034 /* Use kernel netdev's packet and byte counts since vport counters
2035 * do not reflect packet counts on the wire when GSO, TSO or GRO
2037 stats
->rx_packets
= dev_stats
.tx_packets
;
2038 stats
->rx_bytes
= dev_stats
.tx_bytes
;
2039 stats
->tx_packets
= dev_stats
.rx_packets
;
2040 stats
->tx_bytes
= dev_stats
.rx_bytes
;
2042 stats
->rx_dropped
+= dev_stats
.tx_dropped
;
2043 stats
->tx_dropped
+= dev_stats
.rx_dropped
;
2045 stats
->rx_errors
+= dev_stats
.tx_errors
;
2046 stats
->tx_errors
+= dev_stats
.rx_errors
;
2048 stats
->multicast
+= dev_stats
.multicast
;
2049 stats
->collisions
+= dev_stats
.collisions
;
2051 stats
->tx_dropped
+= netdev
->tx_dropped
;
2052 ovs_mutex_unlock(&netdev
->mutex
);
2058 netdev_internal_get_stats(const struct netdev
*netdev_
,
2059 struct netdev_stats
*stats
)
2061 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2064 ovs_mutex_lock(&netdev
->mutex
);
2065 get_stats_via_vport(netdev_
, stats
);
2066 error
= netdev
->vport_stats_error
;
2067 ovs_mutex_unlock(&netdev
->mutex
);
2073 netdev_linux_read_features(struct netdev_linux
*netdev
)
2075 struct ethtool_cmd ecmd
;
2079 if (netdev
->cache_valid
& VALID_FEATURES
) {
2083 COVERAGE_INC(netdev_get_ethtool
);
2084 memset(&ecmd
, 0, sizeof ecmd
);
2085 error
= netdev_linux_do_ethtool(netdev
->up
.name
, &ecmd
,
2086 ETHTOOL_GSET
, "ETHTOOL_GSET");
2091 /* Supported features. */
2092 netdev
->supported
= 0;
2093 if (ecmd
.supported
& SUPPORTED_10baseT_Half
) {
2094 netdev
->supported
|= NETDEV_F_10MB_HD
;
2096 if (ecmd
.supported
& SUPPORTED_10baseT_Full
) {
2097 netdev
->supported
|= NETDEV_F_10MB_FD
;
2099 if (ecmd
.supported
& SUPPORTED_100baseT_Half
) {
2100 netdev
->supported
|= NETDEV_F_100MB_HD
;
2102 if (ecmd
.supported
& SUPPORTED_100baseT_Full
) {
2103 netdev
->supported
|= NETDEV_F_100MB_FD
;
2105 if (ecmd
.supported
& SUPPORTED_1000baseT_Half
) {
2106 netdev
->supported
|= NETDEV_F_1GB_HD
;
2108 if ((ecmd
.supported
& SUPPORTED_1000baseT_Full
) ||
2109 (ecmd
.supported
& SUPPORTED_1000baseKX_Full
)) {
2110 netdev
->supported
|= NETDEV_F_1GB_FD
;
2112 if ((ecmd
.supported
& SUPPORTED_10000baseT_Full
) ||
2113 (ecmd
.supported
& SUPPORTED_10000baseKX4_Full
) ||
2114 (ecmd
.supported
& SUPPORTED_10000baseKR_Full
) ||
2115 (ecmd
.supported
& SUPPORTED_10000baseR_FEC
)) {
2116 netdev
->supported
|= NETDEV_F_10GB_FD
;
2118 if ((ecmd
.supported
& SUPPORTED_40000baseKR4_Full
) ||
2119 (ecmd
.supported
& SUPPORTED_40000baseCR4_Full
) ||
2120 (ecmd
.supported
& SUPPORTED_40000baseSR4_Full
) ||
2121 (ecmd
.supported
& SUPPORTED_40000baseLR4_Full
)) {
2122 netdev
->supported
|= NETDEV_F_40GB_FD
;
2124 if (ecmd
.supported
& SUPPORTED_TP
) {
2125 netdev
->supported
|= NETDEV_F_COPPER
;
2127 if (ecmd
.supported
& SUPPORTED_FIBRE
) {
2128 netdev
->supported
|= NETDEV_F_FIBER
;
2130 if (ecmd
.supported
& SUPPORTED_Autoneg
) {
2131 netdev
->supported
|= NETDEV_F_AUTONEG
;
2133 if (ecmd
.supported
& SUPPORTED_Pause
) {
2134 netdev
->supported
|= NETDEV_F_PAUSE
;
2136 if (ecmd
.supported
& SUPPORTED_Asym_Pause
) {
2137 netdev
->supported
|= NETDEV_F_PAUSE_ASYM
;
2140 /* Advertised features. */
2141 netdev
->advertised
= 0;
2142 if (ecmd
.advertising
& ADVERTISED_10baseT_Half
) {
2143 netdev
->advertised
|= NETDEV_F_10MB_HD
;
2145 if (ecmd
.advertising
& ADVERTISED_10baseT_Full
) {
2146 netdev
->advertised
|= NETDEV_F_10MB_FD
;
2148 if (ecmd
.advertising
& ADVERTISED_100baseT_Half
) {
2149 netdev
->advertised
|= NETDEV_F_100MB_HD
;
2151 if (ecmd
.advertising
& ADVERTISED_100baseT_Full
) {
2152 netdev
->advertised
|= NETDEV_F_100MB_FD
;
2154 if (ecmd
.advertising
& ADVERTISED_1000baseT_Half
) {
2155 netdev
->advertised
|= NETDEV_F_1GB_HD
;
2157 if ((ecmd
.advertising
& ADVERTISED_1000baseT_Full
) ||
2158 (ecmd
.advertising
& ADVERTISED_1000baseKX_Full
)) {
2159 netdev
->advertised
|= NETDEV_F_1GB_FD
;
2161 if ((ecmd
.advertising
& ADVERTISED_10000baseT_Full
) ||
2162 (ecmd
.advertising
& ADVERTISED_10000baseKX4_Full
) ||
2163 (ecmd
.advertising
& ADVERTISED_10000baseKR_Full
) ||
2164 (ecmd
.advertising
& ADVERTISED_10000baseR_FEC
)) {
2165 netdev
->advertised
|= NETDEV_F_10GB_FD
;
2167 if ((ecmd
.advertising
& ADVERTISED_40000baseKR4_Full
) ||
2168 (ecmd
.advertising
& ADVERTISED_40000baseCR4_Full
) ||
2169 (ecmd
.advertising
& ADVERTISED_40000baseSR4_Full
) ||
2170 (ecmd
.advertising
& ADVERTISED_40000baseLR4_Full
)) {
2171 netdev
->advertised
|= NETDEV_F_40GB_FD
;
2173 if (ecmd
.advertising
& ADVERTISED_TP
) {
2174 netdev
->advertised
|= NETDEV_F_COPPER
;
2176 if (ecmd
.advertising
& ADVERTISED_FIBRE
) {
2177 netdev
->advertised
|= NETDEV_F_FIBER
;
2179 if (ecmd
.advertising
& ADVERTISED_Autoneg
) {
2180 netdev
->advertised
|= NETDEV_F_AUTONEG
;
2182 if (ecmd
.advertising
& ADVERTISED_Pause
) {
2183 netdev
->advertised
|= NETDEV_F_PAUSE
;
2185 if (ecmd
.advertising
& ADVERTISED_Asym_Pause
) {
2186 netdev
->advertised
|= NETDEV_F_PAUSE_ASYM
;
2189 /* Current settings. */
2190 speed
= ethtool_cmd_speed(&ecmd
);
2191 if (speed
== SPEED_10
) {
2192 netdev
->current
= ecmd
.duplex
? NETDEV_F_10MB_FD
: NETDEV_F_10MB_HD
;
2193 } else if (speed
== SPEED_100
) {
2194 netdev
->current
= ecmd
.duplex
? NETDEV_F_100MB_FD
: NETDEV_F_100MB_HD
;
2195 } else if (speed
== SPEED_1000
) {
2196 netdev
->current
= ecmd
.duplex
? NETDEV_F_1GB_FD
: NETDEV_F_1GB_HD
;
2197 } else if (speed
== SPEED_10000
) {
2198 netdev
->current
= NETDEV_F_10GB_FD
;
2199 } else if (speed
== 40000) {
2200 netdev
->current
= NETDEV_F_40GB_FD
;
2201 } else if (speed
== 100000) {
2202 netdev
->current
= NETDEV_F_100GB_FD
;
2203 } else if (speed
== 1000000) {
2204 netdev
->current
= NETDEV_F_1TB_FD
;
2206 netdev
->current
= 0;
2209 if (ecmd
.port
== PORT_TP
) {
2210 netdev
->current
|= NETDEV_F_COPPER
;
2211 } else if (ecmd
.port
== PORT_FIBRE
) {
2212 netdev
->current
|= NETDEV_F_FIBER
;
2216 netdev
->current
|= NETDEV_F_AUTONEG
;
2220 netdev
->cache_valid
|= VALID_FEATURES
;
2221 netdev
->get_features_error
= error
;
2224 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
2225 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
2226 * Returns 0 if successful, otherwise a positive errno value. */
2228 netdev_linux_get_features(const struct netdev
*netdev_
,
2229 enum netdev_features
*current
,
2230 enum netdev_features
*advertised
,
2231 enum netdev_features
*supported
,
2232 enum netdev_features
*peer
)
2234 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2237 ovs_mutex_lock(&netdev
->mutex
);
2238 if (netdev_linux_netnsid_is_remote(netdev
)) {
2243 netdev_linux_read_features(netdev
);
2244 if (!netdev
->get_features_error
) {
2245 *current
= netdev
->current
;
2246 *advertised
= netdev
->advertised
;
2247 *supported
= netdev
->supported
;
2248 *peer
= 0; /* XXX */
2250 error
= netdev
->get_features_error
;
2253 ovs_mutex_unlock(&netdev
->mutex
);
2257 /* Set the features advertised by 'netdev' to 'advertise'. */
2259 netdev_linux_set_advertisements(struct netdev
*netdev_
,
2260 enum netdev_features advertise
)
2262 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2263 struct ethtool_cmd ecmd
;
2266 ovs_mutex_lock(&netdev
->mutex
);
2268 COVERAGE_INC(netdev_get_ethtool
);
2270 if (netdev_linux_netnsid_is_remote(netdev
)) {
2275 memset(&ecmd
, 0, sizeof ecmd
);
2276 error
= netdev_linux_do_ethtool(netdev_get_name(netdev_
), &ecmd
,
2277 ETHTOOL_GSET
, "ETHTOOL_GSET");
2282 ecmd
.advertising
= 0;
2283 if (advertise
& NETDEV_F_10MB_HD
) {
2284 ecmd
.advertising
|= ADVERTISED_10baseT_Half
;
2286 if (advertise
& NETDEV_F_10MB_FD
) {
2287 ecmd
.advertising
|= ADVERTISED_10baseT_Full
;
2289 if (advertise
& NETDEV_F_100MB_HD
) {
2290 ecmd
.advertising
|= ADVERTISED_100baseT_Half
;
2292 if (advertise
& NETDEV_F_100MB_FD
) {
2293 ecmd
.advertising
|= ADVERTISED_100baseT_Full
;
2295 if (advertise
& NETDEV_F_1GB_HD
) {
2296 ecmd
.advertising
|= ADVERTISED_1000baseT_Half
;
2298 if (advertise
& NETDEV_F_1GB_FD
) {
2299 ecmd
.advertising
|= ADVERTISED_1000baseT_Full
;
2301 if (advertise
& NETDEV_F_10GB_FD
) {
2302 ecmd
.advertising
|= ADVERTISED_10000baseT_Full
;
2304 if (advertise
& NETDEV_F_COPPER
) {
2305 ecmd
.advertising
|= ADVERTISED_TP
;
2307 if (advertise
& NETDEV_F_FIBER
) {
2308 ecmd
.advertising
|= ADVERTISED_FIBRE
;
2310 if (advertise
& NETDEV_F_AUTONEG
) {
2311 ecmd
.advertising
|= ADVERTISED_Autoneg
;
2313 if (advertise
& NETDEV_F_PAUSE
) {
2314 ecmd
.advertising
|= ADVERTISED_Pause
;
2316 if (advertise
& NETDEV_F_PAUSE_ASYM
) {
2317 ecmd
.advertising
|= ADVERTISED_Asym_Pause
;
2319 COVERAGE_INC(netdev_set_ethtool
);
2320 error
= netdev_linux_do_ethtool(netdev_get_name(netdev_
), &ecmd
,
2321 ETHTOOL_SSET
, "ETHTOOL_SSET");
2324 ovs_mutex_unlock(&netdev
->mutex
);
2328 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
2329 * successful, otherwise a positive errno value. */
2331 netdev_linux_set_policing(struct netdev
*netdev_
,
2332 uint32_t kbits_rate
, uint32_t kbits_burst
)
2334 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2335 const char *netdev_name
= netdev_get_name(netdev_
);
2339 if (netdev_is_flow_api_enabled()) {
2341 VLOG_WARN_RL(&rl
, "%s: policing with offload isn't supported",
2347 kbits_burst
= (!kbits_rate
? 0 /* Force to 0 if no rate specified. */
2348 : !kbits_burst
? 8000 /* Default to 8000 kbits if 0. */
2349 : kbits_burst
); /* Stick with user-specified value. */
2351 ovs_mutex_lock(&netdev
->mutex
);
2352 if (netdev_linux_netnsid_is_remote(netdev
)) {
2357 if (netdev
->cache_valid
& VALID_POLICING
) {
2358 error
= netdev
->netdev_policing_error
;
2359 if (error
|| (netdev
->kbits_rate
== kbits_rate
&&
2360 netdev
->kbits_burst
== kbits_burst
)) {
2361 /* Assume that settings haven't changed since we last set them. */
2364 netdev
->cache_valid
&= ~VALID_POLICING
;
2367 error
= get_ifindex(netdev_
, &ifindex
);
2372 COVERAGE_INC(netdev_set_policing
);
2373 /* Remove any existing ingress qdisc. */
2374 error
= tc_add_del_ingress_qdisc(ifindex
, false, 0);
2376 VLOG_WARN_RL(&rl
, "%s: removing policing failed: %s",
2377 netdev_name
, ovs_strerror(error
));
2382 error
= tc_add_del_ingress_qdisc(ifindex
, true, 0);
2384 VLOG_WARN_RL(&rl
, "%s: adding policing qdisc failed: %s",
2385 netdev_name
, ovs_strerror(error
));
2389 error
= tc_add_policer(netdev_
, kbits_rate
, kbits_burst
);
2391 VLOG_WARN_RL(&rl
, "%s: adding policing action failed: %s",
2392 netdev_name
, ovs_strerror(error
));
2397 netdev
->kbits_rate
= kbits_rate
;
2398 netdev
->kbits_burst
= kbits_burst
;
2401 if (!error
|| error
== ENODEV
) {
2402 netdev
->netdev_policing_error
= error
;
2403 netdev
->cache_valid
|= VALID_POLICING
;
2405 ovs_mutex_unlock(&netdev
->mutex
);
2410 netdev_linux_get_qos_types(const struct netdev
*netdev OVS_UNUSED
,
2413 const struct tc_ops
*const *opsp
;
2414 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2415 const struct tc_ops
*ops
= *opsp
;
2416 if (ops
->tc_install
&& ops
->ovs_name
[0] != '\0') {
2417 sset_add(types
, ops
->ovs_name
);
2423 static const struct tc_ops
*
2424 tc_lookup_ovs_name(const char *name
)
2426 const struct tc_ops
*const *opsp
;
2428 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2429 const struct tc_ops
*ops
= *opsp
;
2430 if (!strcmp(name
, ops
->ovs_name
)) {
2437 static const struct tc_ops
*
2438 tc_lookup_linux_name(const char *name
)
2440 const struct tc_ops
*const *opsp
;
2442 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2443 const struct tc_ops
*ops
= *opsp
;
2444 if (ops
->linux_name
&& !strcmp(name
, ops
->linux_name
)) {
2451 static struct tc_queue
*
2452 tc_find_queue__(const struct netdev
*netdev_
, unsigned int queue_id
,
2455 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2456 struct tc_queue
*queue
;
2458 HMAP_FOR_EACH_IN_BUCKET (queue
, hmap_node
, hash
, &netdev
->tc
->queues
) {
2459 if (queue
->queue_id
== queue_id
) {
/* Convenience wrapper around tc_find_queue__() that computes the hash for
 * 'queue_id' itself.  Returns the queue, or NULL if it does not exist. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
}
2473 netdev_linux_get_qos_capabilities(const struct netdev
*netdev OVS_UNUSED
,
2475 struct netdev_qos_capabilities
*caps
)
2477 const struct tc_ops
*ops
= tc_lookup_ovs_name(type
);
2481 caps
->n_queues
= ops
->n_queues
;
2486 netdev_linux_get_qos(const struct netdev
*netdev_
,
2487 const char **typep
, struct smap
*details
)
2489 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2492 ovs_mutex_lock(&netdev
->mutex
);
2493 if (netdev_linux_netnsid_is_remote(netdev
)) {
2498 error
= tc_query_qdisc(netdev_
);
2500 *typep
= netdev
->tc
->ops
->ovs_name
;
2501 error
= (netdev
->tc
->ops
->qdisc_get
2502 ? netdev
->tc
->ops
->qdisc_get(netdev_
, details
)
2507 ovs_mutex_unlock(&netdev
->mutex
);
2512 netdev_linux_set_qos(struct netdev
*netdev_
,
2513 const char *type
, const struct smap
*details
)
2515 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2516 const struct tc_ops
*new_ops
;
2519 new_ops
= tc_lookup_ovs_name(type
);
2520 if (!new_ops
|| !new_ops
->tc_install
) {
2524 if (new_ops
== &tc_ops_noop
) {
2525 return new_ops
->tc_install(netdev_
, details
);
2528 ovs_mutex_lock(&netdev
->mutex
);
2529 if (netdev_linux_netnsid_is_remote(netdev
)) {
2534 error
= tc_query_qdisc(netdev_
);
2539 if (new_ops
== netdev
->tc
->ops
) {
2540 error
= new_ops
->qdisc_set
? new_ops
->qdisc_set(netdev_
, details
) : 0;
2542 /* Delete existing qdisc. */
2543 error
= tc_del_qdisc(netdev_
);
2547 ovs_assert(netdev
->tc
== NULL
);
2549 /* Install new qdisc. */
2550 error
= new_ops
->tc_install(netdev_
, details
);
2551 ovs_assert((error
== 0) == (netdev
->tc
!= NULL
));
2555 ovs_mutex_unlock(&netdev
->mutex
);
2560 netdev_linux_get_queue(const struct netdev
*netdev_
,
2561 unsigned int queue_id
, struct smap
*details
)
2563 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2566 ovs_mutex_lock(&netdev
->mutex
);
2567 if (netdev_linux_netnsid_is_remote(netdev
)) {
2572 error
= tc_query_qdisc(netdev_
);
2574 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2576 ? netdev
->tc
->ops
->class_get(netdev_
, queue
, details
)
2581 ovs_mutex_unlock(&netdev
->mutex
);
2586 netdev_linux_set_queue(struct netdev
*netdev_
,
2587 unsigned int queue_id
, const struct smap
*details
)
2589 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2592 ovs_mutex_lock(&netdev
->mutex
);
2593 if (netdev_linux_netnsid_is_remote(netdev
)) {
2598 error
= tc_query_qdisc(netdev_
);
2600 error
= (queue_id
< netdev
->tc
->ops
->n_queues
2601 && netdev
->tc
->ops
->class_set
2602 ? netdev
->tc
->ops
->class_set(netdev_
, queue_id
, details
)
2607 ovs_mutex_unlock(&netdev
->mutex
);
2612 netdev_linux_delete_queue(struct netdev
*netdev_
, unsigned int queue_id
)
2614 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2617 ovs_mutex_lock(&netdev
->mutex
);
2618 if (netdev_linux_netnsid_is_remote(netdev
)) {
2623 error
= tc_query_qdisc(netdev_
);
2625 if (netdev
->tc
->ops
->class_delete
) {
2626 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2628 ? netdev
->tc
->ops
->class_delete(netdev_
, queue
)
2636 ovs_mutex_unlock(&netdev
->mutex
);
2641 netdev_linux_get_queue_stats(const struct netdev
*netdev_
,
2642 unsigned int queue_id
,
2643 struct netdev_queue_stats
*stats
)
2645 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2648 ovs_mutex_lock(&netdev
->mutex
);
2649 if (netdev_linux_netnsid_is_remote(netdev
)) {
2654 error
= tc_query_qdisc(netdev_
);
2656 if (netdev
->tc
->ops
->class_get_stats
) {
2657 const struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2659 stats
->created
= queue
->created
;
2660 error
= netdev
->tc
->ops
->class_get_stats(netdev_
, queue
,
2671 ovs_mutex_unlock(&netdev
->mutex
);
2675 struct queue_dump_state
{
2676 struct nl_dump dump
;
2681 start_queue_dump(const struct netdev
*netdev
, struct queue_dump_state
*state
)
2683 struct ofpbuf request
;
2684 struct tcmsg
*tcmsg
;
2686 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_GETTCLASS
, 0, &request
);
2690 tcmsg
->tcm_parent
= 0;
2691 nl_dump_start(&state
->dump
, NETLINK_ROUTE
, &request
);
2692 ofpbuf_uninit(&request
);
2694 ofpbuf_init(&state
->buf
, NL_DUMP_BUFSIZE
);
2699 finish_queue_dump(struct queue_dump_state
*state
)
2701 ofpbuf_uninit(&state
->buf
);
2702 return nl_dump_done(&state
->dump
);
2705 struct netdev_linux_queue_state
{
2706 unsigned int *queues
;
2712 netdev_linux_queue_dump_start(const struct netdev
*netdev_
, void **statep
)
2714 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2717 ovs_mutex_lock(&netdev
->mutex
);
2718 if (netdev_linux_netnsid_is_remote(netdev
)) {
2723 error
= tc_query_qdisc(netdev_
);
2725 if (netdev
->tc
->ops
->class_get
) {
2726 struct netdev_linux_queue_state
*state
;
2727 struct tc_queue
*queue
;
2730 *statep
= state
= xmalloc(sizeof *state
);
2731 state
->n_queues
= hmap_count(&netdev
->tc
->queues
);
2732 state
->cur_queue
= 0;
2733 state
->queues
= xmalloc(state
->n_queues
* sizeof *state
->queues
);
2736 HMAP_FOR_EACH (queue
, hmap_node
, &netdev
->tc
->queues
) {
2737 state
->queues
[i
++] = queue
->queue_id
;
2745 ovs_mutex_unlock(&netdev
->mutex
);
2750 netdev_linux_queue_dump_next(const struct netdev
*netdev_
, void *state_
,
2751 unsigned int *queue_idp
, struct smap
*details
)
2753 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2754 struct netdev_linux_queue_state
*state
= state_
;
2757 ovs_mutex_lock(&netdev
->mutex
);
2758 if (netdev_linux_netnsid_is_remote(netdev
)) {
2763 while (state
->cur_queue
< state
->n_queues
) {
2764 unsigned int queue_id
= state
->queues
[state
->cur_queue
++];
2765 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2768 *queue_idp
= queue_id
;
2769 error
= netdev
->tc
->ops
->class_get(netdev_
, queue
, details
);
2775 ovs_mutex_unlock(&netdev
->mutex
);
2780 netdev_linux_queue_dump_done(const struct netdev
*netdev OVS_UNUSED
,
2783 struct netdev_linux_queue_state
*state
= state_
;
2785 free(state
->queues
);
2791 netdev_linux_dump_queue_stats(const struct netdev
*netdev_
,
2792 netdev_dump_queue_stats_cb
*cb
, void *aux
)
2794 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2797 ovs_mutex_lock(&netdev
->mutex
);
2798 if (netdev_linux_netnsid_is_remote(netdev
)) {
2803 error
= tc_query_qdisc(netdev_
);
2805 struct queue_dump_state state
;
2807 if (!netdev
->tc
->ops
->class_dump_stats
) {
2809 } else if (!start_queue_dump(netdev_
, &state
)) {
2815 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
2816 retval
= netdev
->tc
->ops
->class_dump_stats(netdev_
, &msg
,
2823 retval
= finish_queue_dump(&state
);
2831 ovs_mutex_unlock(&netdev
->mutex
);
2836 netdev_linux_set_in4(struct netdev
*netdev_
, struct in_addr address
,
2837 struct in_addr netmask
)
2839 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2842 ovs_mutex_lock(&netdev
->mutex
);
2843 if (netdev_linux_netnsid_is_remote(netdev
)) {
2848 error
= do_set_addr(netdev_
, SIOCSIFADDR
, "SIOCSIFADDR", address
);
2850 if (address
.s_addr
!= INADDR_ANY
) {
2851 error
= do_set_addr(netdev_
, SIOCSIFNETMASK
,
2852 "SIOCSIFNETMASK", netmask
);
2857 ovs_mutex_unlock(&netdev
->mutex
);
2861 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address.
2862 * Otherwise, sets '*in6' to 'in6addr_any' and returns the corresponding
2865 netdev_linux_get_addr_list(const struct netdev
*netdev_
,
2866 struct in6_addr
**addr
, struct in6_addr
**mask
, int *n_cnt
)
2868 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2871 ovs_mutex_lock(&netdev
->mutex
);
2872 if (netdev_linux_netnsid_is_remote(netdev
)) {
2877 error
= netdev_get_addrs(netdev_get_name(netdev_
), addr
, mask
, n_cnt
);
2880 ovs_mutex_unlock(&netdev
->mutex
);
/* Fills '*sa' with an AF_INET sockaddr carrying IPv4 address 'addr' and a
 * zero port, zeroing any trailing bytes of the generic sockaddr first. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin;

    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr = addr;
    sin.sin_port = 0;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
2898 do_set_addr(struct netdev
*netdev
,
2899 int ioctl_nr
, const char *ioctl_name
, struct in_addr addr
)
2903 make_in4_sockaddr(&ifr
.ifr_addr
, addr
);
2904 return af_inet_ifreq_ioctl(netdev_get_name(netdev
), &ifr
, ioctl_nr
,
2908 /* Adds 'router' as a default IP gateway. */
2910 netdev_linux_add_router(struct netdev
*netdev OVS_UNUSED
, struct in_addr router
)
2912 struct in_addr any
= { INADDR_ANY
};
2916 memset(&rt
, 0, sizeof rt
);
2917 make_in4_sockaddr(&rt
.rt_dst
, any
);
2918 make_in4_sockaddr(&rt
.rt_gateway
, router
);
2919 make_in4_sockaddr(&rt
.rt_genmask
, any
);
2920 rt
.rt_flags
= RTF_UP
| RTF_GATEWAY
;
2921 error
= af_inet_ioctl(SIOCADDRT
, &rt
);
2923 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error
));
2929 netdev_linux_get_next_hop(const struct in_addr
*host
, struct in_addr
*next_hop
,
2932 static const char fn
[] = "/proc/net/route";
2937 *netdev_name
= NULL
;
2938 stream
= fopen(fn
, "r");
2939 if (stream
== NULL
) {
2940 VLOG_WARN_RL(&rl
, "%s: open failed: %s", fn
, ovs_strerror(errno
));
2945 while (fgets(line
, sizeof line
, stream
)) {
2948 ovs_be32 dest
, gateway
, mask
;
2949 int refcnt
, metric
, mtu
;
2950 unsigned int flags
, use
, window
, irtt
;
2953 "%16s %"SCNx32
" %"SCNx32
" %04X %d %u %d %"SCNx32
2955 iface
, &dest
, &gateway
, &flags
, &refcnt
,
2956 &use
, &metric
, &mask
, &mtu
, &window
, &irtt
)) {
2957 VLOG_WARN_RL(&rl
, "%s: could not parse line %d: %s",
2961 if (!(flags
& RTF_UP
)) {
2962 /* Skip routes that aren't up. */
2966 /* The output of 'dest', 'mask', and 'gateway' were given in
2967 * network byte order, so we don't need need any endian
2968 * conversions here. */
2969 if ((dest
& mask
) == (host
->s_addr
& mask
)) {
2971 /* The host is directly reachable. */
2972 next_hop
->s_addr
= 0;
2974 /* To reach the host, we must go through a gateway. */
2975 next_hop
->s_addr
= gateway
;
2977 *netdev_name
= xstrdup(iface
);
2989 netdev_linux_get_status(const struct netdev
*netdev_
, struct smap
*smap
)
2991 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2994 ovs_mutex_lock(&netdev
->mutex
);
2995 if (!(netdev
->cache_valid
& VALID_DRVINFO
)) {
2996 struct ethtool_cmd
*cmd
= (struct ethtool_cmd
*) &netdev
->drvinfo
;
2998 COVERAGE_INC(netdev_get_ethtool
);
2999 memset(&netdev
->drvinfo
, 0, sizeof netdev
->drvinfo
);
3000 error
= netdev_linux_do_ethtool(netdev
->up
.name
,
3003 "ETHTOOL_GDRVINFO");
3005 netdev
->cache_valid
|= VALID_DRVINFO
;
3010 smap_add(smap
, "driver_name", netdev
->drvinfo
.driver
);
3011 smap_add(smap
, "driver_version", netdev
->drvinfo
.version
);
3012 smap_add(smap
, "firmware_version", netdev
->drvinfo
.fw_version
);
3014 ovs_mutex_unlock(&netdev
->mutex
);
3020 netdev_internal_get_status(const struct netdev
*netdev OVS_UNUSED
,
3023 smap_add(smap
, "driver_name", "openvswitch");
3028 netdev_linux_get_block_id(struct netdev
*netdev_
)
3030 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3031 uint32_t block_id
= 0;
3033 ovs_mutex_lock(&netdev
->mutex
);
3034 /* Ensure the linux netdev has had its fields populated. */
3035 if (!(netdev
->cache_valid
& VALID_IFINDEX
)) {
3036 netdev_linux_update_via_netlink(netdev
);
3039 /* Only assigning block ids to linux netdevs that are LAG masters. */
3040 if (netdev
->is_lag_master
) {
3041 block_id
= netdev
->ifindex
;
3043 ovs_mutex_unlock(&netdev
->mutex
);
3048 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
3049 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
3050 * returns 0. Otherwise, it returns a positive errno value; in particular,
3051 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
3053 netdev_linux_arp_lookup(const struct netdev
*netdev
,
3054 ovs_be32 ip
, struct eth_addr
*mac
)
3057 struct sockaddr_in sin
;
3060 memset(&r
, 0, sizeof r
);
3061 memset(&sin
, 0, sizeof sin
);
3062 sin
.sin_family
= AF_INET
;
3063 sin
.sin_addr
.s_addr
= ip
;
3065 memcpy(&r
.arp_pa
, &sin
, sizeof sin
);
3066 r
.arp_ha
.sa_family
= ARPHRD_ETHER
;
3068 ovs_strzcpy(r
.arp_dev
, netdev_get_name(netdev
), sizeof r
.arp_dev
);
3069 COVERAGE_INC(netdev_arp_lookup
);
3070 retval
= af_inet_ioctl(SIOCGARP
, &r
);
3072 memcpy(mac
, r
.arp_ha
.sa_data
, ETH_ADDR_LEN
);
3073 } else if (retval
!= ENXIO
) {
3074 VLOG_WARN_RL(&rl
, "%s: could not look up ARP entry for "IP_FMT
": %s",
3075 netdev_get_name(netdev
), IP_ARGS(ip
),
3076 ovs_strerror(retval
));
3082 nd_to_iff_flags(enum netdev_flags nd
)
3085 if (nd
& NETDEV_UP
) {
3088 if (nd
& NETDEV_PROMISC
) {
3091 if (nd
& NETDEV_LOOPBACK
) {
3092 iff
|= IFF_LOOPBACK
;
3098 iff_to_nd_flags(int iff
)
3100 enum netdev_flags nd
= 0;
3104 if (iff
& IFF_PROMISC
) {
3105 nd
|= NETDEV_PROMISC
;
3107 if (iff
& IFF_LOOPBACK
) {
3108 nd
|= NETDEV_LOOPBACK
;
3114 update_flags(struct netdev_linux
*netdev
, enum netdev_flags off
,
3115 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
3116 OVS_REQUIRES(netdev
->mutex
)
3118 int old_flags
, new_flags
;
3121 old_flags
= netdev
->ifi_flags
;
3122 *old_flagsp
= iff_to_nd_flags(old_flags
);
3123 new_flags
= (old_flags
& ~nd_to_iff_flags(off
)) | nd_to_iff_flags(on
);
3124 if (new_flags
!= old_flags
) {
3125 error
= set_flags(netdev_get_name(&netdev
->up
), new_flags
);
3126 get_flags(&netdev
->up
, &netdev
->ifi_flags
);
3133 netdev_linux_update_flags(struct netdev
*netdev_
, enum netdev_flags off
,
3134 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
3136 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3139 ovs_mutex_lock(&netdev
->mutex
);
3141 /* Changing flags over netlink isn't support yet. */
3142 if (netdev_linux_netnsid_is_remote(netdev
)) {
3146 error
= update_flags(netdev
, off
, on
, old_flagsp
);
3148 /* Try reading flags over netlink, or fall back to ioctl. */
3149 if (!netdev_linux_update_via_netlink(netdev
)) {
3150 *old_flagsp
= iff_to_nd_flags(netdev
->ifi_flags
);
3152 error
= update_flags(netdev
, off
, on
, old_flagsp
);
3157 ovs_mutex_unlock(&netdev
->mutex
);
3161 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, \
3162 GET_FEATURES, GET_STATUS, \
3163 FLOW_OFFLOAD_API, GET_BLOCK_ID) \
3166 false, /* is_pmd */ \
3170 netdev_linux_wait, \
3172 netdev_linux_alloc, \
3174 netdev_linux_destruct, \
3175 netdev_linux_dealloc, \
3176 NULL, /* get_config */ \
3177 NULL, /* set_config */ \
3178 NULL, /* get_tunnel_config */ \
3179 NULL, /* build header */ \
3180 NULL, /* push header */ \
3181 NULL, /* pop header */ \
3182 NULL, /* get_numa_id */ \
3183 NULL, /* set_tx_multiq */ \
3185 netdev_linux_send, \
3186 netdev_linux_send_wait, \
3188 netdev_linux_set_etheraddr, \
3189 netdev_linux_get_etheraddr, \
3190 netdev_linux_get_mtu, \
3191 netdev_linux_set_mtu, \
3192 netdev_linux_get_ifindex, \
3193 netdev_linux_get_carrier, \
3194 netdev_linux_get_carrier_resets, \
3195 netdev_linux_set_miimon_interval, \
3200 netdev_linux_set_advertisements, \
3201 NULL, /* get_pt_mode */ \
3203 netdev_linux_set_policing, \
3204 netdev_linux_get_qos_types, \
3205 netdev_linux_get_qos_capabilities, \
3206 netdev_linux_get_qos, \
3207 netdev_linux_set_qos, \
3208 netdev_linux_get_queue, \
3209 netdev_linux_set_queue, \
3210 netdev_linux_delete_queue, \
3211 netdev_linux_get_queue_stats, \
3212 netdev_linux_queue_dump_start, \
3213 netdev_linux_queue_dump_next, \
3214 netdev_linux_queue_dump_done, \
3215 netdev_linux_dump_queue_stats, \
3217 netdev_linux_set_in4, \
3218 netdev_linux_get_addr_list, \
3219 netdev_linux_add_router, \
3220 netdev_linux_get_next_hop, \
3222 netdev_linux_arp_lookup, \
3224 netdev_linux_update_flags, \
3225 NULL, /* reconfigure */ \
3227 netdev_linux_rxq_alloc, \
3228 netdev_linux_rxq_construct, \
3229 netdev_linux_rxq_destruct, \
3230 netdev_linux_rxq_dealloc, \
3231 netdev_linux_rxq_recv, \
3232 netdev_linux_rxq_wait, \
3233 netdev_linux_rxq_drain, \
3239 const struct netdev_class netdev_linux_class
=
3242 netdev_linux_construct
,
3243 netdev_linux_get_stats
,
3244 netdev_linux_get_features
,
3245 netdev_linux_get_status
,
3246 LINUX_FLOW_OFFLOAD_API
,
3247 netdev_linux_get_block_id
);
3249 const struct netdev_class netdev_tap_class
=
3252 netdev_linux_construct_tap
,
3253 netdev_tap_get_stats
,
3254 netdev_linux_get_features
,
3255 netdev_linux_get_status
,
3259 const struct netdev_class netdev_internal_class
=
3262 netdev_linux_construct
,
3263 netdev_internal_get_stats
,
3264 NULL
, /* get_features */
3265 netdev_internal_get_status
,
3270 #define CODEL_N_QUEUES 0x0000
3272 /* In sufficiently new kernel headers these are defined as enums in
3273 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3274 * kernels. (This overrides any enum definition in the header file but that's
3276 #define TCA_CODEL_TARGET 1
3277 #define TCA_CODEL_LIMIT 2
3278 #define TCA_CODEL_INTERVAL 3
3287 static struct codel
*
3288 codel_get__(const struct netdev
*netdev_
)
3290 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3291 return CONTAINER_OF(netdev
->tc
, struct codel
, tc
);
3295 codel_install__(struct netdev
*netdev_
, uint32_t target
, uint32_t limit
,
3298 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3299 struct codel
*codel
;
3301 codel
= xmalloc(sizeof *codel
);
3302 tc_init(&codel
->tc
, &tc_ops_codel
);
3303 codel
->target
= target
;
3304 codel
->limit
= limit
;
3305 codel
->interval
= interval
;
3307 netdev
->tc
= &codel
->tc
;
3311 codel_setup_qdisc__(struct netdev
*netdev
, uint32_t target
, uint32_t limit
,
3315 struct ofpbuf request
;
3316 struct tcmsg
*tcmsg
;
3317 uint32_t otarget
, olimit
, ointerval
;
3320 tc_del_qdisc(netdev
);
3322 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
3323 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3327 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3328 tcmsg
->tcm_parent
= TC_H_ROOT
;
3330 otarget
= target
? target
: 5000;
3331 olimit
= limit
? limit
: 10240;
3332 ointerval
= interval
? interval
: 100000;
3334 nl_msg_put_string(&request
, TCA_KIND
, "codel");
3335 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
3336 nl_msg_put_u32(&request
, TCA_CODEL_TARGET
, otarget
);
3337 nl_msg_put_u32(&request
, TCA_CODEL_LIMIT
, olimit
);
3338 nl_msg_put_u32(&request
, TCA_CODEL_INTERVAL
, ointerval
);
3339 nl_msg_end_nested(&request
, opt_offset
);
3341 error
= tc_transact(&request
, NULL
);
3343 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
3344 "target %u, limit %u, interval %u error %d(%s)",
3345 netdev_get_name(netdev
),
3346 otarget
, olimit
, ointerval
,
3347 error
, ovs_strerror(error
));
3353 codel_parse_qdisc_details__(struct netdev
*netdev OVS_UNUSED
,
3354 const struct smap
*details
, struct codel
*codel
)
3356 codel
->target
= smap_get_ullong(details
, "target", 0);
3357 codel
->limit
= smap_get_ullong(details
, "limit", 0);
3358 codel
->interval
= smap_get_ullong(details
, "interval", 0);
3360 if (!codel
->target
) {
3361 codel
->target
= 5000;
3363 if (!codel
->limit
) {
3364 codel
->limit
= 10240;
3366 if (!codel
->interval
) {
3367 codel
->interval
= 100000;
3372 codel_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3377 codel_parse_qdisc_details__(netdev
, details
, &codel
);
3378 error
= codel_setup_qdisc__(netdev
, codel
.target
, codel
.limit
,
3381 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
3387 codel_parse_tca_options__(struct nlattr
*nl_options
, struct codel
*codel
)
3389 static const struct nl_policy tca_codel_policy
[] = {
3390 [TCA_CODEL_TARGET
] = { .type
= NL_A_U32
},
3391 [TCA_CODEL_LIMIT
] = { .type
= NL_A_U32
},
3392 [TCA_CODEL_INTERVAL
] = { .type
= NL_A_U32
}
3395 struct nlattr
*attrs
[ARRAY_SIZE(tca_codel_policy
)];
3397 if (!nl_parse_nested(nl_options
, tca_codel_policy
,
3398 attrs
, ARRAY_SIZE(tca_codel_policy
))) {
3399 VLOG_WARN_RL(&rl
, "failed to parse CoDel class options");
3403 codel
->target
= nl_attr_get_u32(attrs
[TCA_CODEL_TARGET
]);
3404 codel
->limit
= nl_attr_get_u32(attrs
[TCA_CODEL_LIMIT
]);
3405 codel
->interval
= nl_attr_get_u32(attrs
[TCA_CODEL_INTERVAL
]);
3410 codel_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
3412 struct nlattr
*nlattr
;
3417 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
3422 error
= codel_parse_tca_options__(nlattr
, &codel
);
3427 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
3433 codel_tc_destroy(struct tc
*tc
)
3435 struct codel
*codel
= CONTAINER_OF(tc
, struct codel
, tc
);
3441 codel_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
3443 const struct codel
*codel
= codel_get__(netdev
);
3444 smap_add_format(details
, "target", "%u", codel
->target
);
3445 smap_add_format(details
, "limit", "%u", codel
->limit
);
3446 smap_add_format(details
, "interval", "%u", codel
->interval
);
3451 codel_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
3455 codel_parse_qdisc_details__(netdev
, details
, &codel
);
3456 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
3457 codel_get__(netdev
)->target
= codel
.target
;
3458 codel_get__(netdev
)->limit
= codel
.limit
;
3459 codel_get__(netdev
)->interval
= codel
.interval
;
3463 static const struct tc_ops tc_ops_codel
= {
3464 "codel", /* linux_name */
3465 "linux-codel", /* ovs_name */
3466 CODEL_N_QUEUES
, /* n_queues */
3479 /* FQ-CoDel traffic control class. */
3481 #define FQCODEL_N_QUEUES 0x0000
3483 /* In sufficiently new kernel headers these are defined as enums in
3484 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3485 * kernels. (This overrides any enum definition in the header file but that's
3487 #define TCA_FQ_CODEL_TARGET 1
3488 #define TCA_FQ_CODEL_LIMIT 2
3489 #define TCA_FQ_CODEL_INTERVAL 3
3490 #define TCA_FQ_CODEL_ECN 4
3491 #define TCA_FQ_CODEL_FLOWS 5
3492 #define TCA_FQ_CODEL_QUANTUM 6
3503 static struct fqcodel
*
3504 fqcodel_get__(const struct netdev
*netdev_
)
3506 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3507 return CONTAINER_OF(netdev
->tc
, struct fqcodel
, tc
);
3511 fqcodel_install__(struct netdev
*netdev_
, uint32_t target
, uint32_t limit
,
3512 uint32_t interval
, uint32_t flows
, uint32_t quantum
)
3514 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3515 struct fqcodel
*fqcodel
;
3517 fqcodel
= xmalloc(sizeof *fqcodel
);
3518 tc_init(&fqcodel
->tc
, &tc_ops_fqcodel
);
3519 fqcodel
->target
= target
;
3520 fqcodel
->limit
= limit
;
3521 fqcodel
->interval
= interval
;
3522 fqcodel
->flows
= flows
;
3523 fqcodel
->quantum
= quantum
;
3525 netdev
->tc
= &fqcodel
->tc
;
3529 fqcodel_setup_qdisc__(struct netdev
*netdev
, uint32_t target
, uint32_t limit
,
3530 uint32_t interval
, uint32_t flows
, uint32_t quantum
)
3533 struct ofpbuf request
;
3534 struct tcmsg
*tcmsg
;
3535 uint32_t otarget
, olimit
, ointerval
, oflows
, oquantum
;
3538 tc_del_qdisc(netdev
);
3540 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
3541 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3545 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3546 tcmsg
->tcm_parent
= TC_H_ROOT
;
3548 otarget
= target
? target
: 5000;
3549 olimit
= limit
? limit
: 10240;
3550 ointerval
= interval
? interval
: 100000;
3551 oflows
= flows
? flows
: 1024;
3552 oquantum
= quantum
? quantum
: 1514; /* fq_codel default quantum is 1514
3555 nl_msg_put_string(&request
, TCA_KIND
, "fq_codel");
3556 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
3557 nl_msg_put_u32(&request
, TCA_FQ_CODEL_TARGET
, otarget
);
3558 nl_msg_put_u32(&request
, TCA_FQ_CODEL_LIMIT
, olimit
);
3559 nl_msg_put_u32(&request
, TCA_FQ_CODEL_INTERVAL
, ointerval
);
3560 nl_msg_put_u32(&request
, TCA_FQ_CODEL_FLOWS
, oflows
);
3561 nl_msg_put_u32(&request
, TCA_FQ_CODEL_QUANTUM
, oquantum
);
3562 nl_msg_end_nested(&request
, opt_offset
);
3564 error
= tc_transact(&request
, NULL
);
3566 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
3567 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3568 netdev_get_name(netdev
),
3569 otarget
, olimit
, ointerval
, oflows
, oquantum
,
3570 error
, ovs_strerror(error
));
3576 fqcodel_parse_qdisc_details__(struct netdev
*netdev OVS_UNUSED
,
3577 const struct smap
*details
, struct fqcodel
*fqcodel
)
3579 fqcodel
->target
= smap_get_ullong(details
, "target", 0);
3580 fqcodel
->limit
= smap_get_ullong(details
, "limit", 0);
3581 fqcodel
->interval
= smap_get_ullong(details
, "interval", 0);
3582 fqcodel
->flows
= smap_get_ullong(details
, "flows", 0);
3583 fqcodel
->quantum
= smap_get_ullong(details
, "quantum", 0);
3585 if (!fqcodel
->target
) {
3586 fqcodel
->target
= 5000;
3588 if (!fqcodel
->limit
) {
3589 fqcodel
->limit
= 10240;
3591 if (!fqcodel
->interval
) {
3592 fqcodel
->interval
= 1000000;
3594 if (!fqcodel
->flows
) {
3595 fqcodel
->flows
= 1024;
3597 if (!fqcodel
->quantum
) {
3598 fqcodel
->quantum
= 1514;
3603 fqcodel_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3606 struct fqcodel fqcodel
;
3608 fqcodel_parse_qdisc_details__(netdev
, details
, &fqcodel
);
3609 error
= fqcodel_setup_qdisc__(netdev
, fqcodel
.target
, fqcodel
.limit
,
3610 fqcodel
.interval
, fqcodel
.flows
,
3613 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
,
3614 fqcodel
.interval
, fqcodel
.flows
, fqcodel
.quantum
);
3620 fqcodel_parse_tca_options__(struct nlattr
*nl_options
, struct fqcodel
*fqcodel
)
3622 static const struct nl_policy tca_fqcodel_policy
[] = {
3623 [TCA_FQ_CODEL_TARGET
] = { .type
= NL_A_U32
},
3624 [TCA_FQ_CODEL_LIMIT
] = { .type
= NL_A_U32
},
3625 [TCA_FQ_CODEL_INTERVAL
] = { .type
= NL_A_U32
},
3626 [TCA_FQ_CODEL_FLOWS
] = { .type
= NL_A_U32
},
3627 [TCA_FQ_CODEL_QUANTUM
] = { .type
= NL_A_U32
}
3630 struct nlattr
*attrs
[ARRAY_SIZE(tca_fqcodel_policy
)];
3632 if (!nl_parse_nested(nl_options
, tca_fqcodel_policy
,
3633 attrs
, ARRAY_SIZE(tca_fqcodel_policy
))) {
3634 VLOG_WARN_RL(&rl
, "failed to parse FQ_CoDel class options");
3638 fqcodel
->target
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_TARGET
]);
3639 fqcodel
->limit
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_LIMIT
]);
3640 fqcodel
->interval
=nl_attr_get_u32(attrs
[TCA_FQ_CODEL_INTERVAL
]);
3641 fqcodel
->flows
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_FLOWS
]);
3642 fqcodel
->quantum
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_QUANTUM
]);
3647 fqcodel_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
3649 struct nlattr
*nlattr
;
3652 struct fqcodel fqcodel
;
3654 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
3659 error
= fqcodel_parse_tca_options__(nlattr
, &fqcodel
);
3664 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
, fqcodel
.interval
,
3665 fqcodel
.flows
, fqcodel
.quantum
);
3670 fqcodel_tc_destroy(struct tc
*tc
)
3672 struct fqcodel
*fqcodel
= CONTAINER_OF(tc
, struct fqcodel
, tc
);
3678 fqcodel_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
3680 const struct fqcodel
*fqcodel
= fqcodel_get__(netdev
);
3681 smap_add_format(details
, "target", "%u", fqcodel
->target
);
3682 smap_add_format(details
, "limit", "%u", fqcodel
->limit
);
3683 smap_add_format(details
, "interval", "%u", fqcodel
->interval
);
3684 smap_add_format(details
, "flows", "%u", fqcodel
->flows
);
3685 smap_add_format(details
, "quantum", "%u", fqcodel
->quantum
);
3690 fqcodel_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
3692 struct fqcodel fqcodel
;
3694 fqcodel_parse_qdisc_details__(netdev
, details
, &fqcodel
);
3695 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
, fqcodel
.interval
,
3696 fqcodel
.flows
, fqcodel
.quantum
);
3697 fqcodel_get__(netdev
)->target
= fqcodel
.target
;
3698 fqcodel_get__(netdev
)->limit
= fqcodel
.limit
;
3699 fqcodel_get__(netdev
)->interval
= fqcodel
.interval
;
3700 fqcodel_get__(netdev
)->flows
= fqcodel
.flows
;
3701 fqcodel_get__(netdev
)->quantum
= fqcodel
.quantum
;
3705 static const struct tc_ops tc_ops_fqcodel
= {
3706 "fq_codel", /* linux_name */
3707 "linux-fq_codel", /* ovs_name */
3708 FQCODEL_N_QUEUES
, /* n_queues */
3721 /* SFQ traffic control class. */
3723 #define SFQ_N_QUEUES 0x0000
3732 sfq_get__(const struct netdev
*netdev_
)
3734 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3735 return CONTAINER_OF(netdev
->tc
, struct sfq
, tc
);
3739 sfq_install__(struct netdev
*netdev_
, uint32_t quantum
, uint32_t perturb
)
3741 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3744 sfq
= xmalloc(sizeof *sfq
);
3745 tc_init(&sfq
->tc
, &tc_ops_sfq
);
3746 sfq
->perturb
= perturb
;
3747 sfq
->quantum
= quantum
;
3749 netdev
->tc
= &sfq
->tc
;
3753 sfq_setup_qdisc__(struct netdev
*netdev
, uint32_t quantum
, uint32_t perturb
)
3755 struct tc_sfq_qopt opt
;
3756 struct ofpbuf request
;
3757 struct tcmsg
*tcmsg
;
3759 int mtu_error
, error
;
3760 mtu_error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
3762 tc_del_qdisc(netdev
);
3764 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
3765 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3769 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3770 tcmsg
->tcm_parent
= TC_H_ROOT
;
3772 memset(&opt
, 0, sizeof opt
);
3775 opt
.quantum
= mtu
; /* if we cannot find mtu, use default */
3778 opt
.quantum
= quantum
;
3782 opt
.perturb_period
= 10;
3784 opt
.perturb_period
= perturb
;
3787 nl_msg_put_string(&request
, TCA_KIND
, "sfq");
3788 nl_msg_put_unspec(&request
, TCA_OPTIONS
, &opt
, sizeof opt
);
3790 error
= tc_transact(&request
, NULL
);
3792 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
3793 "quantum %u, perturb %u error %d(%s)",
3794 netdev_get_name(netdev
),
3795 opt
.quantum
, opt
.perturb_period
,
3796 error
, ovs_strerror(error
));
3802 sfq_parse_qdisc_details__(struct netdev
*netdev
,
3803 const struct smap
*details
, struct sfq
*sfq
)
3805 sfq
->perturb
= smap_get_ullong(details
, "perturb", 0);
3806 sfq
->quantum
= smap_get_ullong(details
, "quantum", 0);
3808 if (!sfq
->perturb
) {
3812 if (!sfq
->quantum
) {
3814 if (!netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
)) {
3817 VLOG_WARN_RL(&rl
, "when using SFQ, you must specify quantum on a "
3818 "device without mtu");
3824 sfq_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3829 sfq_parse_qdisc_details__(netdev
, details
, &sfq
);
3830 error
= sfq_setup_qdisc__(netdev
, sfq
.quantum
, sfq
.perturb
);
3832 sfq_install__(netdev
, sfq
.quantum
, sfq
.perturb
);
3838 sfq_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
3840 const struct tc_sfq_qopt
*sfq
;
3841 struct nlattr
*nlattr
;
3845 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
3847 sfq
= nl_attr_get(nlattr
);
3848 sfq_install__(netdev
, sfq
->perturb_period
, sfq
->quantum
);
3856 sfq_tc_destroy(struct tc
*tc
)
3858 struct sfq
*sfq
= CONTAINER_OF(tc
, struct sfq
, tc
);
3864 sfq_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
3866 const struct sfq
*sfq
= sfq_get__(netdev
);
3867 smap_add_format(details
, "quantum", "%u", sfq
->quantum
);
3868 smap_add_format(details
, "perturb", "%u", sfq
->perturb
);
3873 sfq_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
3877 sfq_parse_qdisc_details__(netdev
, details
, &sfq
);
3878 sfq_install__(netdev
, sfq
.quantum
, sfq
.perturb
);
3879 sfq_get__(netdev
)->quantum
= sfq
.quantum
;
3880 sfq_get__(netdev
)->perturb
= sfq
.perturb
;
3884 static const struct tc_ops tc_ops_sfq
= {
3885 "sfq", /* linux_name */
3886 "linux-sfq", /* ovs_name */
3887 SFQ_N_QUEUES
, /* n_queues */
3900 /* HTB traffic control class. */
3902 #define HTB_N_QUEUES 0xf000
3903 #define HTB_RATE2QUANTUM 10
3907 unsigned int max_rate
; /* In bytes/s. */
3911 struct tc_queue tc_queue
;
3912 unsigned int min_rate
; /* In bytes/s. */
3913 unsigned int max_rate
; /* In bytes/s. */
3914 unsigned int burst
; /* In bytes. */
3915 unsigned int priority
; /* Lower values are higher priorities. */
3919 htb_get__(const struct netdev
*netdev_
)
3921 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3922 return CONTAINER_OF(netdev
->tc
, struct htb
, tc
);
3926 htb_install__(struct netdev
*netdev_
, uint64_t max_rate
)
3928 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3931 htb
= xmalloc(sizeof *htb
);
3932 tc_init(&htb
->tc
, &tc_ops_htb
);
3933 htb
->max_rate
= max_rate
;
3935 netdev
->tc
= &htb
->tc
;
3938 /* Create an HTB qdisc.
3940 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
3942 htb_setup_qdisc__(struct netdev
*netdev
)
3945 struct tc_htb_glob opt
;
3946 struct ofpbuf request
;
3947 struct tcmsg
*tcmsg
;
3949 tc_del_qdisc(netdev
);
3951 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
3952 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3956 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3957 tcmsg
->tcm_parent
= TC_H_ROOT
;
3959 nl_msg_put_string(&request
, TCA_KIND
, "htb");
3961 memset(&opt
, 0, sizeof opt
);
3962 opt
.rate2quantum
= HTB_RATE2QUANTUM
;
3966 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
3967 nl_msg_put_unspec(&request
, TCA_HTB_INIT
, &opt
, sizeof opt
);
3968 nl_msg_end_nested(&request
, opt_offset
);
3970 return tc_transact(&request
, NULL
);
3973 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
3974 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
3976 htb_setup_class__(struct netdev
*netdev
, unsigned int handle
,
3977 unsigned int parent
, struct htb_class
*class)
3980 struct tc_htb_opt opt
;
3981 struct ofpbuf request
;
3982 struct tcmsg
*tcmsg
;
3986 error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
3988 VLOG_WARN_RL(&rl
, "cannot set up HTB on device %s that lacks MTU",
3989 netdev_get_name(netdev
));
3993 memset(&opt
, 0, sizeof opt
);
3994 tc_fill_rate(&opt
.rate
, class->min_rate
, mtu
);
3995 tc_fill_rate(&opt
.ceil
, class->max_rate
, mtu
);
3996 /* Makes sure the quantum is at least MTU. Setting quantum will
3997 * make htb ignore the r2q for this class. */
3998 if ((class->min_rate
/ HTB_RATE2QUANTUM
) < mtu
) {
4001 opt
.buffer
= tc_calc_buffer(opt
.rate
.rate
, mtu
, class->burst
);
4002 opt
.cbuffer
= tc_calc_buffer(opt
.ceil
.rate
, mtu
, class->burst
);
4003 opt
.prio
= class->priority
;
4005 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWTCLASS
, NLM_F_CREATE
,
4010 tcmsg
->tcm_handle
= handle
;
4011 tcmsg
->tcm_parent
= parent
;
4013 nl_msg_put_string(&request
, TCA_KIND
, "htb");
4014 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
4015 nl_msg_put_unspec(&request
, TCA_HTB_PARMS
, &opt
, sizeof opt
);
4016 tc_put_rtab(&request
, TCA_HTB_RTAB
, &opt
.rate
);
4017 tc_put_rtab(&request
, TCA_HTB_CTAB
, &opt
.ceil
);
4018 nl_msg_end_nested(&request
, opt_offset
);
4020 error
= tc_transact(&request
, NULL
);
4022 VLOG_WARN_RL(&rl
, "failed to replace %s class %u:%u, parent %u:%u, "
4023 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
4024 netdev_get_name(netdev
),
4025 tc_get_major(handle
), tc_get_minor(handle
),
4026 tc_get_major(parent
), tc_get_minor(parent
),
4027 class->min_rate
, class->max_rate
,
4028 class->burst
, class->priority
, ovs_strerror(error
));
4033 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
4034 * description of them into 'details'. The description complies with the
4035 * specification given in the vswitch database documentation for linux-htb
4038 htb_parse_tca_options__(struct nlattr
*nl_options
, struct htb_class
*class)
4040 static const struct nl_policy tca_htb_policy
[] = {
4041 [TCA_HTB_PARMS
] = { .type
= NL_A_UNSPEC
, .optional
= false,
4042 .min_len
= sizeof(struct tc_htb_opt
) },
4045 struct nlattr
*attrs
[ARRAY_SIZE(tca_htb_policy
)];
4046 const struct tc_htb_opt
*htb
;
4048 if (!nl_parse_nested(nl_options
, tca_htb_policy
,
4049 attrs
, ARRAY_SIZE(tca_htb_policy
))) {
4050 VLOG_WARN_RL(&rl
, "failed to parse HTB class options");
4054 htb
= nl_attr_get(attrs
[TCA_HTB_PARMS
]);
4055 class->min_rate
= htb
->rate
.rate
;
4056 class->max_rate
= htb
->ceil
.rate
;
4057 class->burst
= tc_ticks_to_bytes(htb
->rate
.rate
, htb
->buffer
);
4058 class->priority
= htb
->prio
;
4063 htb_parse_tcmsg__(struct ofpbuf
*tcmsg
, unsigned int *queue_id
,
4064 struct htb_class
*options
,
4065 struct netdev_queue_stats
*stats
)
4067 struct nlattr
*nl_options
;
4068 unsigned int handle
;
4071 error
= tc_parse_class(tcmsg
, &handle
, &nl_options
, stats
);
4072 if (!error
&& queue_id
) {
4073 unsigned int major
= tc_get_major(handle
);
4074 unsigned int minor
= tc_get_minor(handle
);
4075 if (major
== 1 && minor
> 0 && minor
<= HTB_N_QUEUES
) {
4076 *queue_id
= minor
- 1;
4081 if (!error
&& options
) {
4082 error
= htb_parse_tca_options__(nl_options
, options
);
4088 htb_parse_qdisc_details__(struct netdev
*netdev_
,
4089 const struct smap
*details
, struct htb_class
*hc
)
4091 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4093 hc
->max_rate
= smap_get_ullong(details
, "max-rate", 0) / 8;
4094 if (!hc
->max_rate
) {
4095 enum netdev_features current
;
4097 netdev_linux_read_features(netdev
);
4098 current
= !netdev
->get_features_error
? netdev
->current
: 0;
4099 hc
->max_rate
= netdev_features_to_bps(current
, 100 * 1000 * 1000) / 8;
4101 hc
->min_rate
= hc
->max_rate
;
4107 htb_parse_class_details__(struct netdev
*netdev
,
4108 const struct smap
*details
, struct htb_class
*hc
)
4110 const struct htb
*htb
= htb_get__(netdev
);
4112 unsigned long long int max_rate_bit
;
4114 error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
4116 VLOG_WARN_RL(&rl
, "cannot parse HTB class on device %s that lacks MTU",
4117 netdev_get_name(netdev
));
4121 /* HTB requires at least an mtu sized min-rate to send any traffic even
4122 * on uncongested links. */
4123 hc
->min_rate
= smap_get_ullong(details
, "min-rate", 0) / 8;
4124 hc
->min_rate
= MAX(hc
->min_rate
, mtu
);
4125 hc
->min_rate
= MIN(hc
->min_rate
, htb
->max_rate
);
4128 max_rate_bit
= smap_get_ullong(details
, "max-rate", 0);
4129 hc
->max_rate
= max_rate_bit
? max_rate_bit
/ 8 : htb
->max_rate
;
4130 hc
->max_rate
= MAX(hc
->max_rate
, hc
->min_rate
);
4131 hc
->max_rate
= MIN(hc
->max_rate
, htb
->max_rate
);
4135 * According to hints in the documentation that I've read, it is important
4136 * that 'burst' be at least as big as the largest frame that might be
4137 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
4138 * but having it a bit too small is a problem. Since netdev_get_mtu()
4139 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
4140 * the MTU. We actually add 64, instead of 14, as a guard against
4141 * additional headers get tacked on somewhere that we're not aware of. */
4142 hc
->burst
= smap_get_ullong(details
, "burst", 0) / 8;
4143 hc
->burst
= MAX(hc
->burst
, mtu
+ 64);
4146 hc
->priority
= smap_get_ullong(details
, "priority", 0);
4152 htb_query_class__(const struct netdev
*netdev
, unsigned int handle
,
4153 unsigned int parent
, struct htb_class
*options
,
4154 struct netdev_queue_stats
*stats
)
4156 struct ofpbuf
*reply
;
4159 error
= tc_query_class(netdev
, handle
, parent
, &reply
);
4161 error
= htb_parse_tcmsg__(reply
, NULL
, options
, stats
);
4162 ofpbuf_delete(reply
);
4168 htb_tc_install(struct netdev
*netdev
, const struct smap
*details
)
4172 error
= htb_setup_qdisc__(netdev
);
4174 struct htb_class hc
;
4176 htb_parse_qdisc_details__(netdev
, details
, &hc
);
4177 error
= htb_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
4178 tc_make_handle(1, 0), &hc
);
4180 htb_install__(netdev
, hc
.max_rate
);
4186 static struct htb_class
*
4187 htb_class_cast__(const struct tc_queue
*queue
)
4189 return CONTAINER_OF(queue
, struct htb_class
, tc_queue
);
4193 htb_update_queue__(struct netdev
*netdev
, unsigned int queue_id
,
4194 const struct htb_class
*hc
)
4196 struct htb
*htb
= htb_get__(netdev
);
4197 size_t hash
= hash_int(queue_id
, 0);
4198 struct tc_queue
*queue
;
4199 struct htb_class
*hcp
;
4201 queue
= tc_find_queue__(netdev
, queue_id
, hash
);
4203 hcp
= htb_class_cast__(queue
);
4205 hcp
= xmalloc(sizeof *hcp
);
4206 queue
= &hcp
->tc_queue
;
4207 queue
->queue_id
= queue_id
;
4208 queue
->created
= time_msec();
4209 hmap_insert(&htb
->tc
.queues
, &queue
->hmap_node
, hash
);
4212 hcp
->min_rate
= hc
->min_rate
;
4213 hcp
->max_rate
= hc
->max_rate
;
4214 hcp
->burst
= hc
->burst
;
4215 hcp
->priority
= hc
->priority
;
4219 htb_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4222 struct queue_dump_state state
;
4223 struct htb_class hc
;
4225 /* Get qdisc options. */
4227 htb_query_class__(netdev
, tc_make_handle(1, 0xfffe), 0, &hc
, NULL
);
4228 htb_install__(netdev
, hc
.max_rate
);
4231 if (!start_queue_dump(netdev
, &state
)) {
4234 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
4235 unsigned int queue_id
;
4237 if (!htb_parse_tcmsg__(&msg
, &queue_id
, &hc
, NULL
)) {
4238 htb_update_queue__(netdev
, queue_id
, &hc
);
4241 finish_queue_dump(&state
);
4247 htb_tc_destroy(struct tc
*tc
)
4249 struct htb
*htb
= CONTAINER_OF(tc
, struct htb
, tc
);
4250 struct htb_class
*hc
;
4252 HMAP_FOR_EACH_POP (hc
, tc_queue
.hmap_node
, &htb
->tc
.queues
) {
4260 htb_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
4262 const struct htb
*htb
= htb_get__(netdev
);
4263 smap_add_format(details
, "max-rate", "%llu", 8ULL * htb
->max_rate
);
4268 htb_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
4270 struct htb_class hc
;
4273 htb_parse_qdisc_details__(netdev
, details
, &hc
);
4274 error
= htb_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
4275 tc_make_handle(1, 0), &hc
);
4277 htb_get__(netdev
)->max_rate
= hc
.max_rate
;
4283 htb_class_get(const struct netdev
*netdev OVS_UNUSED
,
4284 const struct tc_queue
*queue
, struct smap
*details
)
4286 const struct htb_class
*hc
= htb_class_cast__(queue
);
4288 smap_add_format(details
, "min-rate", "%llu", 8ULL * hc
->min_rate
);
4289 if (hc
->min_rate
!= hc
->max_rate
) {
4290 smap_add_format(details
, "max-rate", "%llu", 8ULL * hc
->max_rate
);
4292 smap_add_format(details
, "burst", "%llu", 8ULL * hc
->burst
);
4294 smap_add_format(details
, "priority", "%u", hc
->priority
);
4300 htb_class_set(struct netdev
*netdev
, unsigned int queue_id
,
4301 const struct smap
*details
)
4303 struct htb_class hc
;
4306 error
= htb_parse_class_details__(netdev
, details
, &hc
);
4311 error
= htb_setup_class__(netdev
, tc_make_handle(1, queue_id
+ 1),
4312 tc_make_handle(1, 0xfffe), &hc
);
4317 htb_update_queue__(netdev
, queue_id
, &hc
);
4322 htb_class_delete(struct netdev
*netdev
, struct tc_queue
*queue
)
4324 struct htb_class
*hc
= htb_class_cast__(queue
);
4325 struct htb
*htb
= htb_get__(netdev
);
4328 error
= tc_delete_class(netdev
, tc_make_handle(1, queue
->queue_id
+ 1));
4330 hmap_remove(&htb
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
4337 htb_class_get_stats(const struct netdev
*netdev
, const struct tc_queue
*queue
,
4338 struct netdev_queue_stats
*stats
)
4340 return htb_query_class__(netdev
, tc_make_handle(1, queue
->queue_id
+ 1),
4341 tc_make_handle(1, 0xfffe), NULL
, stats
);
4345 htb_class_dump_stats(const struct netdev
*netdev OVS_UNUSED
,
4346 const struct ofpbuf
*nlmsg
,
4347 netdev_dump_queue_stats_cb
*cb
, void *aux
)
4349 struct netdev_queue_stats stats
;
4350 unsigned int handle
, major
, minor
;
4353 error
= tc_parse_class(nlmsg
, &handle
, NULL
, &stats
);
4358 major
= tc_get_major(handle
);
4359 minor
= tc_get_minor(handle
);
4360 if (major
== 1 && minor
> 0 && minor
<= HTB_N_QUEUES
) {
4361 (*cb
)(minor
- 1, &stats
, aux
);
4366 static const struct tc_ops tc_ops_htb
= {
4367 "htb", /* linux_name */
4368 "linux-htb", /* ovs_name */
4369 HTB_N_QUEUES
, /* n_queues */
4378 htb_class_get_stats
,
4379 htb_class_dump_stats
4382 /* "linux-hfsc" traffic control class. */
4384 #define HFSC_N_QUEUES 0xf000
4392 struct tc_queue tc_queue
;
4397 static struct hfsc
*
4398 hfsc_get__(const struct netdev
*netdev_
)
4400 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4401 return CONTAINER_OF(netdev
->tc
, struct hfsc
, tc
);
4404 static struct hfsc_class
*
4405 hfsc_class_cast__(const struct tc_queue
*queue
)
4407 return CONTAINER_OF(queue
, struct hfsc_class
, tc_queue
);
4411 hfsc_install__(struct netdev
*netdev_
, uint32_t max_rate
)
4413 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4416 hfsc
= xmalloc(sizeof *hfsc
);
4417 tc_init(&hfsc
->tc
, &tc_ops_hfsc
);
4418 hfsc
->max_rate
= max_rate
;
4419 netdev
->tc
= &hfsc
->tc
;
4423 hfsc_update_queue__(struct netdev
*netdev
, unsigned int queue_id
,
4424 const struct hfsc_class
*hc
)
4428 struct hfsc_class
*hcp
;
4429 struct tc_queue
*queue
;
4431 hfsc
= hfsc_get__(netdev
);
4432 hash
= hash_int(queue_id
, 0);
4434 queue
= tc_find_queue__(netdev
, queue_id
, hash
);
4436 hcp
= hfsc_class_cast__(queue
);
4438 hcp
= xmalloc(sizeof *hcp
);
4439 queue
= &hcp
->tc_queue
;
4440 queue
->queue_id
= queue_id
;
4441 queue
->created
= time_msec();
4442 hmap_insert(&hfsc
->tc
.queues
, &queue
->hmap_node
, hash
);
4445 hcp
->min_rate
= hc
->min_rate
;
4446 hcp
->max_rate
= hc
->max_rate
;
4450 hfsc_parse_tca_options__(struct nlattr
*nl_options
, struct hfsc_class
*class)
4452 const struct tc_service_curve
*rsc
, *fsc
, *usc
;
4453 static const struct nl_policy tca_hfsc_policy
[] = {
4455 .type
= NL_A_UNSPEC
,
4457 .min_len
= sizeof(struct tc_service_curve
),
4460 .type
= NL_A_UNSPEC
,
4462 .min_len
= sizeof(struct tc_service_curve
),
4465 .type
= NL_A_UNSPEC
,
4467 .min_len
= sizeof(struct tc_service_curve
),
4470 struct nlattr
*attrs
[ARRAY_SIZE(tca_hfsc_policy
)];
4472 if (!nl_parse_nested(nl_options
, tca_hfsc_policy
,
4473 attrs
, ARRAY_SIZE(tca_hfsc_policy
))) {
4474 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options");
4478 rsc
= nl_attr_get(attrs
[TCA_HFSC_RSC
]);
4479 fsc
= nl_attr_get(attrs
[TCA_HFSC_FSC
]);
4480 usc
= nl_attr_get(attrs
[TCA_HFSC_USC
]);
4482 if (rsc
->m1
!= 0 || rsc
->d
!= 0 ||
4483 fsc
->m1
!= 0 || fsc
->d
!= 0 ||
4484 usc
->m1
!= 0 || usc
->d
!= 0) {
4485 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
4486 "Non-linear service curves are not supported.");
4490 if (rsc
->m2
!= fsc
->m2
) {
4491 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
4492 "Real-time service curves are not supported ");
4496 if (rsc
->m2
> usc
->m2
) {
4497 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
4498 "Min-rate service curve is greater than "
4499 "the max-rate service curve.");
4503 class->min_rate
= fsc
->m2
;
4504 class->max_rate
= usc
->m2
;
4509 hfsc_parse_tcmsg__(struct ofpbuf
*tcmsg
, unsigned int *queue_id
,
4510 struct hfsc_class
*options
,
4511 struct netdev_queue_stats
*stats
)
4514 unsigned int handle
;
4515 struct nlattr
*nl_options
;
4517 error
= tc_parse_class(tcmsg
, &handle
, &nl_options
, stats
);
4523 unsigned int major
, minor
;
4525 major
= tc_get_major(handle
);
4526 minor
= tc_get_minor(handle
);
4527 if (major
== 1 && minor
> 0 && minor
<= HFSC_N_QUEUES
) {
4528 *queue_id
= minor
- 1;
4535 error
= hfsc_parse_tca_options__(nl_options
, options
);
4542 hfsc_query_class__(const struct netdev
*netdev
, unsigned int handle
,
4543 unsigned int parent
, struct hfsc_class
*options
,
4544 struct netdev_queue_stats
*stats
)
4547 struct ofpbuf
*reply
;
4549 error
= tc_query_class(netdev
, handle
, parent
, &reply
);
4554 error
= hfsc_parse_tcmsg__(reply
, NULL
, options
, stats
);
4555 ofpbuf_delete(reply
);
4560 hfsc_parse_qdisc_details__(struct netdev
*netdev_
, const struct smap
*details
,
4561 struct hfsc_class
*class)
4563 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4565 uint32_t max_rate
= smap_get_ullong(details
, "max-rate", 0) / 8;
4567 enum netdev_features current
;
4569 netdev_linux_read_features(netdev
);
4570 current
= !netdev
->get_features_error
? netdev
->current
: 0;
4571 max_rate
= netdev_features_to_bps(current
, 100 * 1000 * 1000) / 8;
4574 class->min_rate
= max_rate
;
4575 class->max_rate
= max_rate
;
4579 hfsc_parse_class_details__(struct netdev
*netdev
,
4580 const struct smap
*details
,
4581 struct hfsc_class
* class)
4583 const struct hfsc
*hfsc
;
4584 uint32_t min_rate
, max_rate
;
4586 hfsc
= hfsc_get__(netdev
);
4588 min_rate
= smap_get_ullong(details
, "min-rate", 0) / 8;
4589 min_rate
= MAX(min_rate
, 1);
4590 min_rate
= MIN(min_rate
, hfsc
->max_rate
);
4592 max_rate
= smap_get_ullong(details
, "max-rate", hfsc
->max_rate
* 8) / 8;
4593 max_rate
= MAX(max_rate
, min_rate
);
4594 max_rate
= MIN(max_rate
, hfsc
->max_rate
);
4596 class->min_rate
= min_rate
;
4597 class->max_rate
= max_rate
;
4602 /* Create an HFSC qdisc.
4604 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
4606 hfsc_setup_qdisc__(struct netdev
* netdev
)
4608 struct tcmsg
*tcmsg
;
4609 struct ofpbuf request
;
4610 struct tc_hfsc_qopt opt
;
4612 tc_del_qdisc(netdev
);
4614 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
4615 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
4621 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
4622 tcmsg
->tcm_parent
= TC_H_ROOT
;
4624 memset(&opt
, 0, sizeof opt
);
4627 nl_msg_put_string(&request
, TCA_KIND
, "hfsc");
4628 nl_msg_put_unspec(&request
, TCA_OPTIONS
, &opt
, sizeof opt
);
4630 return tc_transact(&request
, NULL
);
4633 /* Create an HFSC class.
4635 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
4636 * sc rate <min_rate> ul rate <max_rate>" */
4638 hfsc_setup_class__(struct netdev
*netdev
, unsigned int handle
,
4639 unsigned int parent
, struct hfsc_class
*class)
4643 struct tcmsg
*tcmsg
;
4644 struct ofpbuf request
;
4645 struct tc_service_curve min
, max
;
4647 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWTCLASS
, NLM_F_CREATE
,
4654 tcmsg
->tcm_handle
= handle
;
4655 tcmsg
->tcm_parent
= parent
;
4659 min
.m2
= class->min_rate
;
4663 max
.m2
= class->max_rate
;
4665 nl_msg_put_string(&request
, TCA_KIND
, "hfsc");
4666 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
4667 nl_msg_put_unspec(&request
, TCA_HFSC_RSC
, &min
, sizeof min
);
4668 nl_msg_put_unspec(&request
, TCA_HFSC_FSC
, &min
, sizeof min
);
4669 nl_msg_put_unspec(&request
, TCA_HFSC_USC
, &max
, sizeof max
);
4670 nl_msg_end_nested(&request
, opt_offset
);
4672 error
= tc_transact(&request
, NULL
);
4674 VLOG_WARN_RL(&rl
, "failed to replace %s class %u:%u, parent %u:%u, "
4675 "min-rate %ubps, max-rate %ubps (%s)",
4676 netdev_get_name(netdev
),
4677 tc_get_major(handle
), tc_get_minor(handle
),
4678 tc_get_major(parent
), tc_get_minor(parent
),
4679 class->min_rate
, class->max_rate
, ovs_strerror(error
));
4686 hfsc_tc_install(struct netdev
*netdev
, const struct smap
*details
)
4689 struct hfsc_class
class;
4691 error
= hfsc_setup_qdisc__(netdev
);
4697 hfsc_parse_qdisc_details__(netdev
, details
, &class);
4698 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
4699 tc_make_handle(1, 0), &class);
4705 hfsc_install__(netdev
, class.max_rate
);
4710 hfsc_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4713 struct queue_dump_state state
;
4714 struct hfsc_class hc
;
4717 hfsc_query_class__(netdev
, tc_make_handle(1, 0xfffe), 0, &hc
, NULL
);
4718 hfsc_install__(netdev
, hc
.max_rate
);
4720 if (!start_queue_dump(netdev
, &state
)) {
4724 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
4725 unsigned int queue_id
;
4727 if (!hfsc_parse_tcmsg__(&msg
, &queue_id
, &hc
, NULL
)) {
4728 hfsc_update_queue__(netdev
, queue_id
, &hc
);
4732 finish_queue_dump(&state
);
4737 hfsc_tc_destroy(struct tc
*tc
)
4740 struct hfsc_class
*hc
, *next
;
4742 hfsc
= CONTAINER_OF(tc
, struct hfsc
, tc
);
4744 HMAP_FOR_EACH_SAFE (hc
, next
, tc_queue
.hmap_node
, &hfsc
->tc
.queues
) {
4745 hmap_remove(&hfsc
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
4754 hfsc_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
4756 const struct hfsc
*hfsc
;
4757 hfsc
= hfsc_get__(netdev
);
4758 smap_add_format(details
, "max-rate", "%llu", 8ULL * hfsc
->max_rate
);
4763 hfsc_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
4766 struct hfsc_class
class;
4768 hfsc_parse_qdisc_details__(netdev
, details
, &class);
4769 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
4770 tc_make_handle(1, 0), &class);
4773 hfsc_get__(netdev
)->max_rate
= class.max_rate
;
4780 hfsc_class_get(const struct netdev
*netdev OVS_UNUSED
,
4781 const struct tc_queue
*queue
, struct smap
*details
)
4783 const struct hfsc_class
*hc
;
4785 hc
= hfsc_class_cast__(queue
);
4786 smap_add_format(details
, "min-rate", "%llu", 8ULL * hc
->min_rate
);
4787 if (hc
->min_rate
!= hc
->max_rate
) {
4788 smap_add_format(details
, "max-rate", "%llu", 8ULL * hc
->max_rate
);
4794 hfsc_class_set(struct netdev
*netdev
, unsigned int queue_id
,
4795 const struct smap
*details
)
4798 struct hfsc_class
class;
4800 error
= hfsc_parse_class_details__(netdev
, details
, &class);
4805 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, queue_id
+ 1),
4806 tc_make_handle(1, 0xfffe), &class);
4811 hfsc_update_queue__(netdev
, queue_id
, &class);
4816 hfsc_class_delete(struct netdev
*netdev
, struct tc_queue
*queue
)
4820 struct hfsc_class
*hc
;
4822 hc
= hfsc_class_cast__(queue
);
4823 hfsc
= hfsc_get__(netdev
);
4825 error
= tc_delete_class(netdev
, tc_make_handle(1, queue
->queue_id
+ 1));
4827 hmap_remove(&hfsc
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
4834 hfsc_class_get_stats(const struct netdev
*netdev
, const struct tc_queue
*queue
,
4835 struct netdev_queue_stats
*stats
)
4837 return hfsc_query_class__(netdev
, tc_make_handle(1, queue
->queue_id
+ 1),
4838 tc_make_handle(1, 0xfffe), NULL
, stats
);
4842 hfsc_class_dump_stats(const struct netdev
*netdev OVS_UNUSED
,
4843 const struct ofpbuf
*nlmsg
,
4844 netdev_dump_queue_stats_cb
*cb
, void *aux
)
4846 struct netdev_queue_stats stats
;
4847 unsigned int handle
, major
, minor
;
4850 error
= tc_parse_class(nlmsg
, &handle
, NULL
, &stats
);
4855 major
= tc_get_major(handle
);
4856 minor
= tc_get_minor(handle
);
4857 if (major
== 1 && minor
> 0 && minor
<= HFSC_N_QUEUES
) {
4858 (*cb
)(minor
- 1, &stats
, aux
);
4863 static const struct tc_ops tc_ops_hfsc
= {
4864 "hfsc", /* linux_name */
4865 "linux-hfsc", /* ovs_name */
4866 HFSC_N_QUEUES
, /* n_queues */
4867 hfsc_tc_install
, /* tc_install */
4868 hfsc_tc_load
, /* tc_load */
4869 hfsc_tc_destroy
, /* tc_destroy */
4870 hfsc_qdisc_get
, /* qdisc_get */
4871 hfsc_qdisc_set
, /* qdisc_set */
4872 hfsc_class_get
, /* class_get */
4873 hfsc_class_set
, /* class_set */
4874 hfsc_class_delete
, /* class_delete */
4875 hfsc_class_get_stats
, /* class_get_stats */
4876 hfsc_class_dump_stats
/* class_dump_stats */
4879 /* "linux-noop" traffic control class. */
4882 noop_install__(struct netdev
*netdev_
)
4884 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4885 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_default
);
4887 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
4891 noop_tc_install(struct netdev
*netdev
,
4892 const struct smap
*details OVS_UNUSED
)
4894 noop_install__(netdev
);
4899 noop_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4901 noop_install__(netdev
);
4905 static const struct tc_ops tc_ops_noop
= {
4906 NULL
, /* linux_name */
4907 "linux-noop", /* ovs_name */
4911 NULL
, /* tc_destroy */
4912 NULL
, /* qdisc_get */
4913 NULL
, /* qdisc_set */
4914 NULL
, /* class_get */
4915 NULL
, /* class_set */
4916 NULL
, /* class_delete */
4917 NULL
, /* class_get_stats */
4918 NULL
/* class_dump_stats */
4921 /* "linux-default" traffic control class.
4923 * This class represents the default, unnamed Linux qdisc. It corresponds to
4924 * the "" (empty string) QoS type in the OVS database. */
4927 default_install__(struct netdev
*netdev_
)
4929 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4930 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_default
);
4932 /* Nothing but a tc class implementation is allowed to write to a tc. This
4933 * class never does that, so we can legitimately use a const tc object. */
4934 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
4938 default_tc_install(struct netdev
*netdev
,
4939 const struct smap
*details OVS_UNUSED
)
4941 default_install__(netdev
);
4946 default_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4948 default_install__(netdev
);
4952 static const struct tc_ops tc_ops_default
= {
4953 NULL
, /* linux_name */
4958 NULL
, /* tc_destroy */
4959 NULL
, /* qdisc_get */
4960 NULL
, /* qdisc_set */
4961 NULL
, /* class_get */
4962 NULL
, /* class_set */
4963 NULL
, /* class_delete */
4964 NULL
, /* class_get_stats */
4965 NULL
/* class_dump_stats */
4968 /* "linux-other" traffic control class.
4973 other_tc_load(struct netdev
*netdev_
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4975 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4976 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_other
);
4978 /* Nothing but a tc class implementation is allowed to write to a tc. This
4979 * class never does that, so we can legitimately use a const tc object. */
4980 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
4984 static const struct tc_ops tc_ops_other
= {
4985 NULL
, /* linux_name */
4986 "linux-other", /* ovs_name */
4988 NULL
, /* tc_install */
4990 NULL
, /* tc_destroy */
4991 NULL
, /* qdisc_get */
4992 NULL
, /* qdisc_set */
4993 NULL
, /* class_get */
4994 NULL
, /* class_set */
4995 NULL
, /* class_delete */
4996 NULL
, /* class_get_stats */
4997 NULL
/* class_dump_stats */
5000 /* Traffic control. */
5002 /* Number of kernel "tc" ticks per second. */
5003 static double ticks_per_s
;
5005 /* Number of kernel "jiffies" per second. This is used for the purpose of
5006 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
5007 * one jiffy's worth of data.
5009 * There are two possibilities here:
5011 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
5012 * approximate range of 100 to 1024. That means that we really need to
5013 * make sure that the qdisc can buffer that much data.
5015 * - 'buffer_hz' is an absurdly large number. That means that the kernel
5016 * has finely granular timers and there's no need to fudge additional room
5017 * for buffers. (There's no extra effort needed to implement that: the
5018 * large 'buffer_hz' is used as a divisor, so practically any number will
5019 * come out as 0 in the division. Small integer results in the case of
5020 * really high dividends won't have any real effect anyhow.)
5022 static unsigned int buffer_hz
;
5024 static struct tcmsg
*
5025 netdev_linux_tc_make_request(const struct netdev
*netdev
, int type
,
5026 unsigned int flags
, struct ofpbuf
*request
)
5031 error
= get_ifindex(netdev
, &ifindex
);
5036 return tc_make_request(ifindex
, type
, flags
, request
);
5039 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
5042 * This function is equivalent to running:
5043 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
5044 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
5047 * The configuration and stats may be seen with the following command:
5048 * /sbin/tc -s filter show dev <devname> parent ffff:
5050 * Returns 0 if successful, otherwise a positive errno value.
5053 tc_add_policer(struct netdev
*netdev
,
5054 uint32_t kbits_rate
, uint32_t kbits_burst
)
5056 struct tc_police tc_police
;
5057 struct ofpbuf request
;
5058 struct tcmsg
*tcmsg
;
5059 size_t basic_offset
;
5060 size_t police_offset
;
5064 memset(&tc_police
, 0, sizeof tc_police
);
5065 tc_police
.action
= TC_POLICE_SHOT
;
5066 tc_police
.mtu
= mtu
;
5067 tc_fill_rate(&tc_police
.rate
, ((uint64_t) kbits_rate
* 1000)/8, mtu
);
5069 /* The following appears wrong in one way: In networking a kilobit is
5070 * usually 1000 bits but this uses 1024 bits.
5072 * However if you "fix" those problems then "tc filter show ..." shows
5073 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
5074 * 1,000,000 bits, whereas this actually ends up doing the right thing from
5075 * tc's point of view. Whatever. */
5076 tc_police
.burst
= tc_bytes_to_ticks(
5077 tc_police
.rate
.rate
, MIN(UINT32_MAX
/ 1024, kbits_burst
) * 1024 / 8);
5079 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWTFILTER
,
5080 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
5084 tcmsg
->tcm_parent
= tc_make_handle(0xffff, 0);
5085 tcmsg
->tcm_info
= tc_make_handle(49,
5086 (OVS_FORCE
uint16_t) htons(ETH_P_ALL
));
5088 nl_msg_put_string(&request
, TCA_KIND
, "basic");
5089 basic_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
5090 police_offset
= nl_msg_start_nested(&request
, TCA_BASIC_POLICE
);
5091 nl_msg_put_unspec(&request
, TCA_POLICE_TBF
, &tc_police
, sizeof tc_police
);
5092 tc_put_rtab(&request
, TCA_POLICE_RATE
, &tc_police
.rate
);
5093 nl_msg_end_nested(&request
, police_offset
);
5094 nl_msg_end_nested(&request
, basic_offset
);
5096 error
= tc_transact(&request
, NULL
);
5107 /* The values in psched are not individually very meaningful, but they are
5108 * important. The tables below show some values seen in the wild.
5112 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
5113 * (Before that, there are hints that it was 1000000000.)
5115 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
5119 * -----------------------------------
5120 * [1] 000c8000 000f4240 000f4240 00000064
5121 * [2] 000003e8 00000400 000f4240 3b9aca00
5122 * [3] 000003e8 00000400 000f4240 3b9aca00
5123 * [4] 000003e8 00000400 000f4240 00000064
5124 * [5] 000003e8 00000040 000f4240 3b9aca00
5125 * [6] 000003e8 00000040 000f4240 000000f9
5127 * a b c d ticks_per_s buffer_hz
5128 * ------- --------- ---------- ------------- ----------- -------------
5129 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
5130 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
5131 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
5132 * [4] 1,000 1,024 1,000,000 100 976,562 100
5133 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
5134 * [6] 1,000 64 1,000,000 249 15,625,000 249
5136 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
5137 * [2] 2.6.26-1-686-bigmem from Debian lenny
5138 * [3] 2.6.26-2-sparc64 from Debian lenny
5139 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
5140 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
5141 * [6] 2.6.34 from kernel.org on KVM
5143 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
5144 static const char fn
[] = "/proc/net/psched";
5145 unsigned int a
, b
, c
, d
;
5148 if (!ovsthread_once_start(&once
)) {
5155 stream
= fopen(fn
, "r");
5157 VLOG_WARN("%s: open failed: %s", fn
, ovs_strerror(errno
));
5161 if (fscanf(stream
, "%x %x %x %x", &a
, &b
, &c
, &d
) != 4) {
5162 VLOG_WARN("%s: read failed", fn
);
5166 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn
, a
, b
, c
, d
);
5170 VLOG_WARN("%s: invalid scheduler parameters", fn
);
5174 ticks_per_s
= (double) a
* c
/ b
;
5178 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
5181 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn
, ticks_per_s
, buffer_hz
);
5184 ovsthread_once_done(&once
);
5187 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
5188 * rate of 'rate' bytes per second. */
5190 tc_ticks_to_bytes(unsigned int rate
, unsigned int ticks
)
5193 return (rate
* ticks
) / ticks_per_s
;
5196 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
5197 * rate of 'rate' bytes per second. */
5199 tc_bytes_to_ticks(unsigned int rate
, unsigned int size
)
5202 return rate
? ((unsigned long long int) ticks_per_s
* size
) / rate
: 0;
5205 /* Returns the number of bytes that need to be reserved for qdisc buffering at
5206 * a transmission rate of 'rate' bytes per second. */
5208 tc_buffer_per_jiffy(unsigned int rate
)
5211 return rate
/ buffer_hz
;
5214 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
5215 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
5216 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
5217 * stores NULL into it if it is absent.
5219 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
5222 * Returns 0 if successful, otherwise a positive errno value. */
5224 tc_parse_qdisc(const struct ofpbuf
*msg
, const char **kind
,
5225 struct nlattr
**options
)
5227 static const struct nl_policy tca_policy
[] = {
5228 [TCA_KIND
] = { .type
= NL_A_STRING
, .optional
= false },
5229 [TCA_OPTIONS
] = { .type
= NL_A_NESTED
, .optional
= true },
5231 struct nlattr
*ta
[ARRAY_SIZE(tca_policy
)];
5233 if (!nl_policy_parse(msg
, NLMSG_HDRLEN
+ sizeof(struct tcmsg
),
5234 tca_policy
, ta
, ARRAY_SIZE(ta
))) {
5235 VLOG_WARN_RL(&rl
, "failed to parse qdisc message");
5240 *kind
= nl_attr_get_string(ta
[TCA_KIND
]);
5244 *options
= ta
[TCA_OPTIONS
];
5259 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
5260 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
5261 * into '*options', and its queue statistics into '*stats'. Any of the output
5262 * arguments may be null.
5264 * Returns 0 if successful, otherwise a positive errno value. */
5266 tc_parse_class(const struct ofpbuf
*msg
, unsigned int *handlep
,
5267 struct nlattr
**options
, struct netdev_queue_stats
*stats
)
5269 static const struct nl_policy tca_policy
[] = {
5270 [TCA_OPTIONS
] = { .type
= NL_A_NESTED
, .optional
= false },
5271 [TCA_STATS2
] = { .type
= NL_A_NESTED
, .optional
= false },
5273 struct nlattr
*ta
[ARRAY_SIZE(tca_policy
)];
5275 if (!nl_policy_parse(msg
, NLMSG_HDRLEN
+ sizeof(struct tcmsg
),
5276 tca_policy
, ta
, ARRAY_SIZE(ta
))) {
5277 VLOG_WARN_RL(&rl
, "failed to parse class message");
5282 struct tcmsg
*tc
= ofpbuf_at_assert(msg
, NLMSG_HDRLEN
, sizeof *tc
);
5283 *handlep
= tc
->tcm_handle
;
5287 *options
= ta
[TCA_OPTIONS
];
5291 const struct gnet_stats_queue
*gsq
;
5292 struct gnet_stats_basic gsb
;
5294 static const struct nl_policy stats_policy
[] = {
5295 [TCA_STATS_BASIC
] = { .type
= NL_A_UNSPEC
, .optional
= false,
5296 .min_len
= sizeof gsb
},
5297 [TCA_STATS_QUEUE
] = { .type
= NL_A_UNSPEC
, .optional
= false,
5298 .min_len
= sizeof *gsq
},
5300 struct nlattr
*sa
[ARRAY_SIZE(stats_policy
)];
5302 if (!nl_parse_nested(ta
[TCA_STATS2
], stats_policy
,
5303 sa
, ARRAY_SIZE(sa
))) {
5304 VLOG_WARN_RL(&rl
, "failed to parse class stats");
5308 /* Alignment issues screw up the length of struct gnet_stats_basic on
5309 * some arch/bitsize combinations. Newer versions of Linux have a
5310 * struct gnet_stats_basic_packed, but we can't depend on that. The
5311 * easiest thing to do is just to make a copy. */
5312 memset(&gsb
, 0, sizeof gsb
);
5313 memcpy(&gsb
, nl_attr_get(sa
[TCA_STATS_BASIC
]),
5314 MIN(nl_attr_get_size(sa
[TCA_STATS_BASIC
]), sizeof gsb
));
5315 stats
->tx_bytes
= gsb
.bytes
;
5316 stats
->tx_packets
= gsb
.packets
;
5318 gsq
= nl_attr_get(sa
[TCA_STATS_QUEUE
]);
5319 stats
->tx_errors
= gsq
->drops
;
5329 memset(stats
, 0, sizeof *stats
);
5334 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
5337 tc_query_class(const struct netdev
*netdev
,
5338 unsigned int handle
, unsigned int parent
,
5339 struct ofpbuf
**replyp
)
5341 struct ofpbuf request
;
5342 struct tcmsg
*tcmsg
;
5345 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_GETTCLASS
, NLM_F_ECHO
,
5350 tcmsg
->tcm_handle
= handle
;
5351 tcmsg
->tcm_parent
= parent
;
5353 error
= tc_transact(&request
, replyp
);
5355 VLOG_WARN_RL(&rl
, "query %s class %u:%u (parent %u:%u) failed (%s)",
5356 netdev_get_name(netdev
),
5357 tc_get_major(handle
), tc_get_minor(handle
),
5358 tc_get_major(parent
), tc_get_minor(parent
),
5359 ovs_strerror(error
));
5364 /* Equivalent to "tc class del dev <name> handle <handle>". */
5366 tc_delete_class(const struct netdev
*netdev
, unsigned int handle
)
5368 struct ofpbuf request
;
5369 struct tcmsg
*tcmsg
;
5372 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_DELTCLASS
, 0, &request
);
5376 tcmsg
->tcm_handle
= handle
;
5377 tcmsg
->tcm_parent
= 0;
5379 error
= tc_transact(&request
, NULL
);
5381 VLOG_WARN_RL(&rl
, "delete %s class %u:%u failed (%s)",
5382 netdev_get_name(netdev
),
5383 tc_get_major(handle
), tc_get_minor(handle
),
5384 ovs_strerror(error
));
5389 /* Equivalent to "tc qdisc del dev <name> root". */
5391 tc_del_qdisc(struct netdev
*netdev_
)
5393 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5394 struct ofpbuf request
;
5395 struct tcmsg
*tcmsg
;
5398 tcmsg
= netdev_linux_tc_make_request(netdev_
, RTM_DELQDISC
, 0, &request
);
5402 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
5403 tcmsg
->tcm_parent
= TC_H_ROOT
;
5405 error
= tc_transact(&request
, NULL
);
5406 if (error
== EINVAL
) {
5407 /* EINVAL probably means that the default qdisc was in use, in which
5408 * case we've accomplished our purpose. */
5411 if (!error
&& netdev
->tc
) {
5412 if (netdev
->tc
->ops
->tc_destroy
) {
5413 netdev
->tc
->ops
->tc_destroy(netdev
->tc
);
5421 getqdisc_is_safe(void)
5423 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
5424 static bool safe
= false;
5426 if (ovsthread_once_start(&once
)) {
5427 struct utsname utsname
;
5430 if (uname(&utsname
) == -1) {
5431 VLOG_WARN("uname failed (%s)", ovs_strerror(errno
));
5432 } else if (!ovs_scan(utsname
.release
, "%d.%d", &major
, &minor
)) {
5433 VLOG_WARN("uname reported bad OS release (%s)", utsname
.release
);
5434 } else if (major
< 2 || (major
== 2 && minor
< 35)) {
5435 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5440 ovsthread_once_done(&once
);
5445 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
5446 * kernel to determine what they are. Returns 0 if successful, otherwise a
5447 * positive errno value. */
5449 tc_query_qdisc(const struct netdev
*netdev_
)
5451 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5452 struct ofpbuf request
, *qdisc
;
5453 const struct tc_ops
*ops
;
5454 struct tcmsg
*tcmsg
;
5462 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5463 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5464 * 2.6.35 without that fix backported to it.
5466 * To avoid the OOPS, we must not make a request that would attempt to dump
5467 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5468 * few others. There are a few ways that I can see to do this, but most of
5469 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5470 * technique chosen here is to assume that any non-default qdisc that we
5471 * create will have a class with handle 1:0. The built-in qdiscs only have
5472 * a class with handle 0:0.
5474 * On Linux 2.6.35+ we use the straightforward method because it allows us
5475 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5476 * in such a case we get no response at all from the kernel (!) if a
5477 * builtin qdisc is in use (which is later caught by "!error &&
5478 * !qdisc->size"). */
5479 tcmsg
= netdev_linux_tc_make_request(netdev_
, RTM_GETQDISC
, NLM_F_ECHO
,
5484 tcmsg
->tcm_handle
= tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5485 tcmsg
->tcm_parent
= getqdisc_is_safe() ? TC_H_ROOT
: 0;
5487 /* Figure out what tc class to instantiate. */
5488 error
= tc_transact(&request
, &qdisc
);
5489 if (!error
&& qdisc
->size
) {
5492 error
= tc_parse_qdisc(qdisc
, &kind
, NULL
);
5494 ops
= &tc_ops_other
;
5496 ops
= tc_lookup_linux_name(kind
);
5498 static struct vlog_rate_limit rl2
= VLOG_RATE_LIMIT_INIT(1, 1);
5499 VLOG_DBG_RL(&rl2
, "unknown qdisc \"%s\"", kind
);
5501 ops
= &tc_ops_other
;
5504 } else if ((!error
&& !qdisc
->size
) || error
== ENOENT
) {
5505 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5506 * set up by some other entity that doesn't have a handle 1:0. We will
5507 * assume that it's the system default qdisc. */
5508 ops
= &tc_ops_default
;
5511 /* Who knows? Maybe the device got deleted. */
5512 VLOG_WARN_RL(&rl
, "query %s qdisc failed (%s)",
5513 netdev_get_name(netdev_
), ovs_strerror(error
));
5514 ops
= &tc_ops_other
;
5517 /* Instantiate it. */
5518 load_error
= ops
->tc_load(CONST_CAST(struct netdev
*, netdev_
), qdisc
);
5519 ovs_assert((load_error
== 0) == (netdev
->tc
!= NULL
));
5520 ofpbuf_delete(qdisc
);
5522 return error
? error
: load_error
;
5525 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5526 approximate the time to transmit packets of various lengths. For an MTU of
5527 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5528 represents two possible packet lengths; for a MTU of 513 through 1024, four
5529 possible lengths; and so on.
5531 Returns, for the specified 'mtu', the number of bits that packet lengths
5532 need to be shifted right to fit within such a 256-entry table. */
5534 tc_calc_cell_log(unsigned int mtu
)
5539 mtu
= ETH_PAYLOAD_MAX
;
5541 mtu
+= ETH_HEADER_LEN
+ VLAN_HEADER_LEN
;
5543 for (cell_log
= 0; mtu
>= 256; cell_log
++) {
5550 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5553 tc_fill_rate(struct tc_ratespec
*rate
, uint64_t Bps
, int mtu
)
5555 memset(rate
, 0, sizeof *rate
);
5556 rate
->cell_log
= tc_calc_cell_log(mtu
);
5557 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5558 /* rate->cell_align = 0; */ /* distro headers. */
5559 rate
->mpu
= ETH_TOTAL_MIN
;
5563 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
5564 * attribute of the specified "type".
5566 * See tc_calc_cell_log() above for a description of "rtab"s. */
5568 tc_put_rtab(struct ofpbuf
*msg
, uint16_t type
, const struct tc_ratespec
*rate
)
5573 rtab
= nl_msg_put_unspec_uninit(msg
, type
, TC_RTAB_SIZE
);
5574 for (i
= 0; i
< TC_RTAB_SIZE
/ sizeof *rtab
; i
++) {
5575 unsigned packet_size
= (i
+ 1) << rate
->cell_log
;
5576 if (packet_size
< rate
->mpu
) {
5577 packet_size
= rate
->mpu
;
5579 rtab
[i
] = tc_bytes_to_ticks(rate
->rate
, packet_size
);
5583 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
5584 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
5585 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
5588 tc_calc_buffer(unsigned int Bps
, int mtu
, uint64_t burst_bytes
)
5590 unsigned int min_burst
= tc_buffer_per_jiffy(Bps
) + mtu
;
5591 return tc_bytes_to_ticks(Bps
, MAX(burst_bytes
, min_burst
));
5594 /* Linux-only functions declared in netdev-linux.h */
5596 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5597 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5599 netdev_linux_ethtool_set_flag(struct netdev
*netdev
, uint32_t flag
,
5600 const char *flag_name
, bool enable
)
5602 const char *netdev_name
= netdev_get_name(netdev
);
5603 struct ethtool_value evalue
;
5607 COVERAGE_INC(netdev_get_ethtool
);
5608 memset(&evalue
, 0, sizeof evalue
);
5609 error
= netdev_linux_do_ethtool(netdev_name
,
5610 (struct ethtool_cmd
*)&evalue
,
5611 ETHTOOL_GFLAGS
, "ETHTOOL_GFLAGS");
5616 COVERAGE_INC(netdev_set_ethtool
);
5617 new_flags
= (evalue
.data
& ~flag
) | (enable
? flag
: 0);
5618 if (new_flags
== evalue
.data
) {
5621 evalue
.data
= new_flags
;
5622 error
= netdev_linux_do_ethtool(netdev_name
,
5623 (struct ethtool_cmd
*)&evalue
,
5624 ETHTOOL_SFLAGS
, "ETHTOOL_SFLAGS");
5629 COVERAGE_INC(netdev_get_ethtool
);
5630 memset(&evalue
, 0, sizeof evalue
);
5631 error
= netdev_linux_do_ethtool(netdev_name
,
5632 (struct ethtool_cmd
*)&evalue
,
5633 ETHTOOL_GFLAGS
, "ETHTOOL_GFLAGS");
5638 if (new_flags
!= evalue
.data
) {
5639 VLOG_WARN_RL(&rl
, "attempt to %s ethtool %s flag on network "
5640 "device %s failed", enable
? "enable" : "disable",
5641 flag_name
, netdev_name
);
5648 /* Utility functions. */
5650 /* Copies 'src' into 'dst', performing format conversion in the process. */
5652 netdev_stats_from_rtnl_link_stats(struct netdev_stats
*dst
,
5653 const struct rtnl_link_stats
*src
)
5655 dst
->rx_packets
= src
->rx_packets
;
5656 dst
->tx_packets
= src
->tx_packets
;
5657 dst
->rx_bytes
= src
->rx_bytes
;
5658 dst
->tx_bytes
= src
->tx_bytes
;
5659 dst
->rx_errors
= src
->rx_errors
;
5660 dst
->tx_errors
= src
->tx_errors
;
5661 dst
->rx_dropped
= src
->rx_dropped
;
5662 dst
->tx_dropped
= src
->tx_dropped
;
5663 dst
->multicast
= src
->multicast
;
5664 dst
->collisions
= src
->collisions
;
5665 dst
->rx_length_errors
= src
->rx_length_errors
;
5666 dst
->rx_over_errors
= src
->rx_over_errors
;
5667 dst
->rx_crc_errors
= src
->rx_crc_errors
;
5668 dst
->rx_frame_errors
= src
->rx_frame_errors
;
5669 dst
->rx_fifo_errors
= src
->rx_fifo_errors
;
5670 dst
->rx_missed_errors
= src
->rx_missed_errors
;
5671 dst
->tx_aborted_errors
= src
->tx_aborted_errors
;
5672 dst
->tx_carrier_errors
= src
->tx_carrier_errors
;
5673 dst
->tx_fifo_errors
= src
->tx_fifo_errors
;
5674 dst
->tx_heartbeat_errors
= src
->tx_heartbeat_errors
;
5675 dst
->tx_window_errors
= src
->tx_window_errors
;
5678 /* Copies 'src' into 'dst', performing format conversion in the process. */
5680 netdev_stats_from_rtnl_link_stats64(struct netdev_stats
*dst
,
5681 const struct rtnl_link_stats64
*src
)
5683 dst
->rx_packets
= src
->rx_packets
;
5684 dst
->tx_packets
= src
->tx_packets
;
5685 dst
->rx_bytes
= src
->rx_bytes
;
5686 dst
->tx_bytes
= src
->tx_bytes
;
5687 dst
->rx_errors
= src
->rx_errors
;
5688 dst
->tx_errors
= src
->tx_errors
;
5689 dst
->rx_dropped
= src
->rx_dropped
;
5690 dst
->tx_dropped
= src
->tx_dropped
;
5691 dst
->multicast
= src
->multicast
;
5692 dst
->collisions
= src
->collisions
;
5693 dst
->rx_length_errors
= src
->rx_length_errors
;
5694 dst
->rx_over_errors
= src
->rx_over_errors
;
5695 dst
->rx_crc_errors
= src
->rx_crc_errors
;
5696 dst
->rx_frame_errors
= src
->rx_frame_errors
;
5697 dst
->rx_fifo_errors
= src
->rx_fifo_errors
;
5698 dst
->rx_missed_errors
= src
->rx_missed_errors
;
5699 dst
->tx_aborted_errors
= src
->tx_aborted_errors
;
5700 dst
->tx_carrier_errors
= src
->tx_carrier_errors
;
5701 dst
->tx_fifo_errors
= src
->tx_fifo_errors
;
5702 dst
->tx_heartbeat_errors
= src
->tx_heartbeat_errors
;
5703 dst
->tx_window_errors
= src
->tx_window_errors
;
5707 get_stats_via_netlink(const struct netdev
*netdev_
, struct netdev_stats
*stats
)
5709 struct ofpbuf request
;
5710 struct ofpbuf
*reply
;
5713 /* Filtering all counters by default */
5714 memset(stats
, 0xFF, sizeof(struct netdev_stats
));
5716 ofpbuf_init(&request
, 0);
5717 nl_msg_put_nlmsghdr(&request
,
5718 sizeof(struct ifinfomsg
) + NL_ATTR_SIZE(IFNAMSIZ
),
5719 RTM_GETLINK
, NLM_F_REQUEST
);
5720 ofpbuf_put_zeros(&request
, sizeof(struct ifinfomsg
));
5721 nl_msg_put_string(&request
, IFLA_IFNAME
, netdev_get_name(netdev_
));
5722 error
= nl_transact(NETLINK_ROUTE
, &request
, &reply
);
5723 ofpbuf_uninit(&request
);
5728 if (ofpbuf_try_pull(reply
, NLMSG_HDRLEN
+ sizeof(struct ifinfomsg
))) {
5729 const struct nlattr
*a
= nl_attr_find(reply
, 0, IFLA_STATS64
);
5730 if (a
&& nl_attr_get_size(a
) >= sizeof(struct rtnl_link_stats64
)) {
5731 netdev_stats_from_rtnl_link_stats64(stats
, nl_attr_get(a
));
5734 a
= nl_attr_find(reply
, 0, IFLA_STATS
);
5735 if (a
&& nl_attr_get_size(a
) >= sizeof(struct rtnl_link_stats
)) {
5736 netdev_stats_from_rtnl_link_stats(stats
, nl_attr_get(a
));
5739 VLOG_WARN_RL(&rl
, "RTM_GETLINK reply lacks stats");
5744 VLOG_WARN_RL(&rl
, "short RTM_GETLINK reply");
5749 ofpbuf_delete(reply
);
5754 get_flags(const struct netdev
*dev
, unsigned int *flags
)
5760 error
= af_inet_ifreq_ioctl(dev
->name
, &ifr
, SIOCGIFFLAGS
, "SIOCGIFFLAGS");
5762 *flags
= ifr
.ifr_flags
;
5768 set_flags(const char *name
, unsigned int flags
)
5772 ifr
.ifr_flags
= flags
;
5773 return af_inet_ifreq_ioctl(name
, &ifr
, SIOCSIFFLAGS
, "SIOCSIFFLAGS");
5777 linux_get_ifindex(const char *netdev_name
)
5782 ovs_strzcpy(ifr
.ifr_name
, netdev_name
, sizeof ifr
.ifr_name
);
5783 COVERAGE_INC(netdev_get_ifindex
);
5785 error
= af_inet_ioctl(SIOCGIFINDEX
, &ifr
);
5787 /* ENODEV probably means that a vif disappeared asynchronously and
5788 * hasn't been removed from the database yet, so reduce the log level
5789 * to INFO for that case. */
5790 VLOG_RL(&rl
, error
== ENODEV
? VLL_INFO
: VLL_ERR
,
5791 "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5792 netdev_name
, ovs_strerror(error
));
5795 return ifr
.ifr_ifindex
;
5799 get_ifindex(const struct netdev
*netdev_
, int *ifindexp
)
5801 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5803 if (!(netdev
->cache_valid
& VALID_IFINDEX
)) {
5804 netdev_linux_update_via_netlink(netdev
);
5807 if (!(netdev
->cache_valid
& VALID_IFINDEX
)) {
5808 /* Fall back to ioctl if netlink fails */
5809 int ifindex
= linux_get_ifindex(netdev_get_name(netdev_
));
5812 netdev
->get_ifindex_error
= -ifindex
;
5813 netdev
->ifindex
= 0;
5815 netdev
->get_ifindex_error
= 0;
5816 netdev
->ifindex
= ifindex
;
5818 netdev
->cache_valid
|= VALID_IFINDEX
;
5821 *ifindexp
= netdev
->ifindex
;
5822 return netdev
->get_ifindex_error
;
5826 netdev_linux_update_via_netlink(struct netdev_linux
*netdev
)
5828 struct ofpbuf request
;
5829 struct ofpbuf
*reply
;
5830 struct rtnetlink_change chg
;
5831 struct rtnetlink_change
*change
= &chg
;
5834 ofpbuf_init(&request
, 0);
5835 nl_msg_put_nlmsghdr(&request
,
5836 sizeof(struct ifinfomsg
) + NL_ATTR_SIZE(IFNAMSIZ
),
5837 RTM_GETLINK
, NLM_F_REQUEST
);
5838 ofpbuf_put_zeros(&request
, sizeof(struct ifinfomsg
));
5840 /* The correct identifiers for a Linux device are netnsid and ifindex,
5841 * but ifindex changes as the port is moved to another network namespace
5842 * and the interface name statically stored in ovsdb. */
5843 nl_msg_put_string(&request
, IFLA_IFNAME
, netdev_get_name(&netdev
->up
));
5844 if (netdev_linux_netnsid_is_remote(netdev
)) {
5845 nl_msg_push_u32(&request
, IFLA_IF_NETNSID
, netdev
->netnsid
);
5847 error
= nl_transact(NETLINK_ROUTE
, &request
, &reply
);
5848 ofpbuf_uninit(&request
);
5850 ofpbuf_delete(reply
);
5854 if (rtnetlink_parse(reply
, change
)
5855 && change
->nlmsg_type
== RTM_NEWLINK
) {
5856 bool changed
= false;
5859 /* Update netdev from rtnl msg and increment its seq if needed. */
5860 if ((change
->ifi_flags
^ netdev
->ifi_flags
) & IFF_RUNNING
) {
5861 netdev
->carrier_resets
++;
5864 if (change
->ifi_flags
!= netdev
->ifi_flags
) {
5865 netdev
->ifi_flags
= change
->ifi_flags
;
5868 if (change
->mtu
&& change
->mtu
!= netdev
->mtu
) {
5869 netdev
->mtu
= change
->mtu
;
5870 netdev
->cache_valid
|= VALID_MTU
;
5871 netdev
->netdev_mtu_error
= 0;
5874 if (!eth_addr_is_zero(change
->mac
)
5875 && !eth_addr_equals(change
->mac
, netdev
->etheraddr
)) {
5876 netdev
->etheraddr
= change
->mac
;
5877 netdev
->cache_valid
|= VALID_ETHERADDR
;
5878 netdev
->ether_addr_error
= 0;
5881 if (change
->if_index
!= netdev
->ifindex
) {
5882 netdev
->ifindex
= change
->if_index
;
5883 netdev
->cache_valid
|= VALID_IFINDEX
;
5884 netdev
->get_ifindex_error
= 0;
5887 if (change
->master
&& netdev_linux_kind_is_lag(change
->master
)) {
5888 netdev
->is_lag_master
= true;
5891 netdev_change_seq_changed(&netdev
->up
);
5897 ofpbuf_delete(reply
);
5902 get_etheraddr(const char *netdev_name
, struct eth_addr
*ea
)
5908 memset(&ifr
, 0, sizeof ifr
);
5909 ovs_strzcpy(ifr
.ifr_name
, netdev_name
, sizeof ifr
.ifr_name
);
5910 COVERAGE_INC(netdev_get_hwaddr
);
5911 error
= af_inet_ioctl(SIOCGIFHWADDR
, &ifr
);
5913 /* ENODEV probably means that a vif disappeared asynchronously and
5914 * hasn't been removed from the database yet, so reduce the log level
5915 * to INFO for that case. */
5916 VLOG(error
== ENODEV
? VLL_INFO
: VLL_ERR
,
5917 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
5918 netdev_name
, ovs_strerror(error
));
5921 hwaddr_family
= ifr
.ifr_hwaddr
.sa_family
;
5922 if (hwaddr_family
!= AF_UNSPEC
&& hwaddr_family
!= ARPHRD_ETHER
&&
5923 hwaddr_family
!= ARPHRD_NONE
) {
5924 VLOG_INFO("%s device has unknown hardware address family %d",
5925 netdev_name
, hwaddr_family
);
5928 memcpy(ea
, ifr
.ifr_hwaddr
.sa_data
, ETH_ADDR_LEN
);
5933 set_etheraddr(const char *netdev_name
, const struct eth_addr mac
)
5938 memset(&ifr
, 0, sizeof ifr
);
5939 ovs_strzcpy(ifr
.ifr_name
, netdev_name
, sizeof ifr
.ifr_name
);
5940 ifr
.ifr_hwaddr
.sa_family
= ARPHRD_ETHER
;
5941 memcpy(ifr
.ifr_hwaddr
.sa_data
, &mac
, ETH_ADDR_LEN
);
5942 COVERAGE_INC(netdev_set_hwaddr
);
5943 error
= af_inet_ioctl(SIOCSIFHWADDR
, &ifr
);
5945 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
5946 netdev_name
, ovs_strerror(error
));
5952 netdev_linux_do_ethtool(const char *name
, struct ethtool_cmd
*ecmd
,
5953 int cmd
, const char *cmd_name
)
5958 memset(&ifr
, 0, sizeof ifr
);
5959 ovs_strzcpy(ifr
.ifr_name
, name
, sizeof ifr
.ifr_name
);
5960 ifr
.ifr_data
= (caddr_t
) ecmd
;
5963 error
= af_inet_ioctl(SIOCETHTOOL
, &ifr
);
5965 if (error
!= EOPNOTSUPP
) {
5966 VLOG_WARN_RL(&rl
, "ethtool command %s on network device %s "
5967 "failed: %s", cmd_name
, name
, ovs_strerror(error
));
5969 /* The device doesn't support this operation. That's pretty
5970 * common, so there's no point in logging anything. */
5976 /* Returns an AF_PACKET raw socket or a negative errno value. */
5978 af_packet_sock(void)
5980 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
5983 if (ovsthread_once_start(&once
)) {
5984 sock
= socket(AF_PACKET
, SOCK_RAW
, 0);
5986 int error
= set_nonblocking(sock
);
5993 VLOG_ERR("failed to create packet socket: %s",
5994 ovs_strerror(errno
));
5996 ovsthread_once_done(&once
);