2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <sys/types.h>
24 #include <netinet/in.h>
25 #include <arpa/inet.h>
27 #include <linux/filter.h>
28 #include <linux/gen_stats.h>
29 #include <linux/if_ether.h>
30 #include <linux/if_tun.h>
31 #include <linux/types.h>
32 #include <linux/ethtool.h>
33 #include <linux/mii.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/ioctl.h>
37 #include <sys/socket.h>
38 #include <sys/utsname.h>
39 #include <netpacket/packet.h>
41 #include <net/if_arp.h>
42 #include <net/if_packet.h>
43 #include <net/route.h>
50 #include "dp-packet.h"
51 #include "dpif-netlink.h"
52 #include "dpif-netdev.h"
53 #include "openvswitch/dynamic-string.h"
54 #include "fatal-signal.h"
56 #include "openvswitch/hmap.h"
57 #include "netdev-provider.h"
58 #include "netdev-tc-offloads.h"
59 #include "netdev-vport.h"
60 #include "netlink-notifier.h"
61 #include "netlink-socket.h"
64 #include "openvswitch/ofpbuf.h"
65 #include "openflow/openflow.h"
66 #include "ovs-atomic.h"
68 #include "openvswitch/poll-loop.h"
69 #include "rtnetlink.h"
70 #include "openvswitch/shash.h"
71 #include "socket-util.h"
75 #include "unaligned.h"
76 #include "openvswitch/vlog.h"
79 VLOG_DEFINE_THIS_MODULE(netdev_linux
);
81 COVERAGE_DEFINE(netdev_set_policing
);
82 COVERAGE_DEFINE(netdev_arp_lookup
);
83 COVERAGE_DEFINE(netdev_get_ifindex
);
84 COVERAGE_DEFINE(netdev_get_hwaddr
);
85 COVERAGE_DEFINE(netdev_set_hwaddr
);
86 COVERAGE_DEFINE(netdev_get_ethtool
);
87 COVERAGE_DEFINE(netdev_set_ethtool
);
90 #ifndef IFLA_IF_NETNSID
91 #define IFLA_IF_NETNSID 0x45
93 /* These were introduced in Linux 2.6.14, so they might be missing if we have
95 #ifndef ADVERTISED_Pause
96 #define ADVERTISED_Pause (1 << 13)
98 #ifndef ADVERTISED_Asym_Pause
99 #define ADVERTISED_Asym_Pause (1 << 14)
102 /* These were introduced in Linux 2.6.24, so they might be missing if we
103 * have old headers. */
104 #ifndef ETHTOOL_GFLAGS
105 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
107 #ifndef ETHTOOL_SFLAGS
108 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
111 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
114 #define TC_RTAB_SIZE 1024
117 /* Linux 2.6.21 introduced struct tpacket_auxdata.
118 * Linux 2.6.27 added the tp_vlan_tci member.
119 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
120 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
121 * TP_STATUS_VLAN_TPID_VALID.
123 * With all this churn it's easiest to unconditionally define a replacement
124 * structure that has everything we want.
126 #ifndef PACKET_AUXDATA
127 #define PACKET_AUXDATA 8
129 #ifndef TP_STATUS_VLAN_VALID
130 #define TP_STATUS_VLAN_VALID (1 << 4)
132 #ifndef TP_STATUS_VLAN_TPID_VALID
133 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
135 #undef tpacket_auxdata
136 #define tpacket_auxdata rpl_tpacket_auxdata
137 struct tpacket_auxdata
{
143 uint16_t tp_vlan_tci
;
144 uint16_t tp_vlan_tpid
;
147 /* Linux 2.6.27 introduced ethtool_cmd_speed
149 * To avoid revisiting problems reported with using configure to detect
150 * compatibility (see report at
151 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html)
152 * unconditionally replace ethtool_cmd_speed. */
153 #define ethtool_cmd_speed rpl_ethtool_cmd_speed
/* Replacement for the kernel's ethtool_cmd_speed() (see comment above):
 * combines the low ('speed') and high ('speed_hi') halves of the link
 * speed reported in 'ep' into one 32-bit value. */
static inline uint32_t
rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
{
    uint32_t hi = ep->speed_hi;

    return (hi << 16) | ep->speed;
}
159 /* Linux 2.6.30 introduced supported and advertised flags for
160 * 1G base KX, and 10G base KX4, KR and R. */
161 #ifndef SUPPORTED_1000baseKX_Full
162 #define SUPPORTED_1000baseKX_Full (1 << 17)
163 #define SUPPORTED_10000baseKX4_Full (1 << 18)
164 #define SUPPORTED_10000baseKR_Full (1 << 19)
165 #define SUPPORTED_10000baseR_FEC (1 << 20)
166 #define ADVERTISED_1000baseKX_Full (1 << 17)
167 #define ADVERTISED_10000baseKX4_Full (1 << 18)
168 #define ADVERTISED_10000baseKR_Full (1 << 19)
169 #define ADVERTISED_10000baseR_FEC (1 << 20)
172 /* Linux 3.5 introduced supported and advertised flags for
173 * 40G base KR4, CR4, SR4 and LR4. */
174 #ifndef SUPPORTED_40000baseKR4_Full
175 #define SUPPORTED_40000baseKR4_Full (1 << 23)
176 #define SUPPORTED_40000baseCR4_Full (1 << 24)
177 #define SUPPORTED_40000baseSR4_Full (1 << 25)
178 #define SUPPORTED_40000baseLR4_Full (1 << 26)
179 #define ADVERTISED_40000baseKR4_Full (1 << 23)
180 #define ADVERTISED_40000baseCR4_Full (1 << 24)
181 #define ADVERTISED_40000baseSR4_Full (1 << 25)
182 #define ADVERTISED_40000baseLR4_Full (1 << 26)
185 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
187 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
188 * 2.6.32-431.29.2.el6.x86_64 (see report at
189 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html).
190 * Maybe if_link.h is not self-contained on those kernels. It is easiest to
191 * unconditionally define a replacement. */
193 #define IFLA_STATS64 23
195 #define rtnl_link_stats64 rpl_rtnl_link_stats64
196 struct rtnl_link_stats64
{
208 uint64_t rx_length_errors
;
209 uint64_t rx_over_errors
;
210 uint64_t rx_crc_errors
;
211 uint64_t rx_frame_errors
;
212 uint64_t rx_fifo_errors
;
213 uint64_t rx_missed_errors
;
215 uint64_t tx_aborted_errors
;
216 uint64_t tx_carrier_errors
;
217 uint64_t tx_fifo_errors
;
218 uint64_t tx_heartbeat_errors
;
219 uint64_t tx_window_errors
;
221 uint64_t rx_compressed
;
222 uint64_t tx_compressed
;
226 VALID_IFINDEX
= 1 << 0,
227 VALID_ETHERADDR
= 1 << 1,
230 VALID_POLICING
= 1 << 4,
231 VALID_VPORT_STAT_ERROR
= 1 << 5,
232 VALID_DRVINFO
= 1 << 6,
233 VALID_FEATURES
= 1 << 7,
236 struct linux_lag_slave
{
238 struct shash_node
*node
;
241 /* Protects 'lag_shash' and the mutable members of struct linux_lag_slave. */
242 static struct ovs_mutex lag_mutex
= OVS_MUTEX_INITIALIZER
;
244 /* All slaves whose LAG masters are network devices in OvS. */
245 static struct shash lag_shash
OVS_GUARDED_BY(lag_mutex
)
246 = SHASH_INITIALIZER(&lag_shash
);
248 /* Traffic control. */
250 /* An instance of a traffic control class. Always associated with a particular
253 * Each TC implementation subclasses this with whatever additional data it
256 const struct tc_ops
*ops
;
257 struct hmap queues
; /* Contains "struct tc_queue"s.
258 * Read by generic TC layer.
259 * Written only by TC implementation. */
262 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
264 /* One traffic control queue.
266 * Each TC implementation subclasses this with whatever additional data it
269 struct hmap_node hmap_node
; /* In struct tc's "queues" hmap. */
270 unsigned int queue_id
; /* OpenFlow queue ID. */
271 long long int created
; /* Time queue was created, in msecs. */
274 /* A particular kind of traffic control. Each implementation generally maps to
275 * one particular Linux qdisc class.
277 * The functions below return 0 if successful or a positive errno value on
278 * failure, except where otherwise noted. All of them must be provided, except
279 * where otherwise noted. */
281 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
282 * This is null for tc_ops_default and tc_ops_other, for which there are no
283 * appropriate values. */
284 const char *linux_name
;
286 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
287 const char *ovs_name
;
289 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
290 * queues. The queues are numbered 0 through n_queues - 1. */
291 unsigned int n_queues
;
293 /* Called to install this TC class on 'netdev'. The implementation should
294 * make the Netlink calls required to set up 'netdev' with the right qdisc
295 * and configure it according to 'details'. The implementation may assume
296 * that the current qdisc is the default; that is, there is no need for it
297 * to delete the current qdisc before installing itself.
299 * The contents of 'details' should be documented as valid for 'ovs_name'
300 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
301 * (which is built as ovs-vswitchd.conf.db(8)).
303 * This function must return 0 if and only if it sets 'netdev->tc' to an
304 * initialized 'struct tc'.
306 * (This function is null for tc_ops_other, which cannot be installed. For
307 * other TC classes it should always be nonnull.) */
308 int (*tc_install
)(struct netdev
*netdev
, const struct smap
*details
);
310 /* Called when the netdev code determines (through a Netlink query) that
311 * this TC class's qdisc is installed on 'netdev', but we didn't install
312 * it ourselves and so don't know any of the details.
314 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
315 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
316 * implementation should parse the other attributes of 'nlmsg' as
317 * necessary to determine its configuration. If necessary it should also
318 * use Netlink queries to determine the configuration of queues on
321 * This function must return 0 if and only if it sets 'netdev->tc' to an
322 * initialized 'struct tc'. */
323 int (*tc_load
)(struct netdev
*netdev
, struct ofpbuf
*nlmsg
);
325 /* Destroys the data structures allocated by the implementation as part of
326 * 'tc'. (This includes destroying 'tc->queues' by calling
329 * The implementation should not need to perform any Netlink calls. If
330 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
331 * (But it may not be desirable.)
333 * This function may be null if 'tc' is trivial. */
334 void (*tc_destroy
)(struct tc
*tc
);
336 /* Retrieves details of 'netdev->tc' configuration into 'details'.
338 * The implementation should not need to perform any Netlink calls, because
339 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
340 * cached the configuration.
342 * The contents of 'details' should be documented as valid for 'ovs_name'
343 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
344 * (which is built as ovs-vswitchd.conf.db(8)).
346 * This function may be null if 'tc' is not configurable.
348 int (*qdisc_get
)(const struct netdev
*netdev
, struct smap
*details
);
350 /* Reconfigures 'netdev->tc' according to 'details', performing any
351 * required Netlink calls to complete the reconfiguration.
353 * The contents of 'details' should be documented as valid for 'ovs_name'
354 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
355 * (which is built as ovs-vswitchd.conf.db(8)).
357 * This function may be null if 'tc' is not configurable.
359 int (*qdisc_set
)(struct netdev
*, const struct smap
*details
);
361 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
362 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
364 * The contents of 'details' should be documented as valid for 'ovs_name'
365 * in the "other_config" column in the "Queue" table in
366 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
368 * The implementation should not need to perform any Netlink calls, because
369 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
370 * cached the queue configuration.
372 * This function may be null if 'tc' does not have queues ('n_queues' is
374 int (*class_get
)(const struct netdev
*netdev
, const struct tc_queue
*queue
,
375 struct smap
*details
);
377 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
378 * 'details', perfoming any required Netlink calls to complete the
379 * reconfiguration. The caller ensures that 'queue_id' is less than
382 * The contents of 'details' should be documented as valid for 'ovs_name'
383 * in the "other_config" column in the "Queue" table in
384 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
386 * This function may be null if 'tc' does not have queues or its queues are
387 * not configurable. */
388 int (*class_set
)(struct netdev
*, unsigned int queue_id
,
389 const struct smap
*details
);
391 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
392 * tc_queue's within 'netdev->tc->queues'.
394 * This function may be null if 'tc' does not have queues or its queues
395 * cannot be deleted. */
396 int (*class_delete
)(struct netdev
*, struct tc_queue
*queue
);
398 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
399 * 'struct tc_queue's within 'netdev->tc->queues'.
401 * On success, initializes '*stats'.
403 * This function may be null if 'tc' does not have queues or if it cannot
404 * report queue statistics. */
405 int (*class_get_stats
)(const struct netdev
*netdev
,
406 const struct tc_queue
*queue
,
407 struct netdev_queue_stats
*stats
);
409 /* Extracts queue stats from 'nlmsg', which is a response to a
410 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
412 * This function may be null if 'tc' does not have queues or if it cannot
413 * report queue statistics. */
414 int (*class_dump_stats
)(const struct netdev
*netdev
,
415 const struct ofpbuf
*nlmsg
,
416 netdev_dump_queue_stats_cb
*cb
, void *aux
);
420 tc_init(struct tc
*tc
, const struct tc_ops
*ops
)
423 hmap_init(&tc
->queues
);
427 tc_destroy(struct tc
*tc
)
429 hmap_destroy(&tc
->queues
);
432 static const struct tc_ops tc_ops_htb
;
433 static const struct tc_ops tc_ops_hfsc
;
434 static const struct tc_ops tc_ops_codel
;
435 static const struct tc_ops tc_ops_fqcodel
;
436 static const struct tc_ops tc_ops_sfq
;
437 static const struct tc_ops tc_ops_default
;
438 static const struct tc_ops tc_ops_noop
;
439 static const struct tc_ops tc_ops_other
;
441 static const struct tc_ops
*const tcs
[] = {
442 &tc_ops_htb
, /* Hierarchy token bucket (see tc-htb(8)). */
443 &tc_ops_hfsc
, /* Hierarchical fair service curve. */
444 &tc_ops_codel
, /* Controlled delay */
445 &tc_ops_fqcodel
, /* Fair queue controlled delay */
446 &tc_ops_sfq
, /* Stochastic fair queueing */
447 &tc_ops_noop
, /* Non operating qos type. */
448 &tc_ops_default
, /* Default qdisc (see tc-pfifo_fast(8)). */
449 &tc_ops_other
, /* Some other qdisc. */
453 static unsigned int tc_ticks_to_bytes(unsigned int rate
, unsigned int ticks
);
454 static unsigned int tc_bytes_to_ticks(unsigned int rate
, unsigned int size
);
455 static unsigned int tc_buffer_per_jiffy(unsigned int rate
);
457 static struct tcmsg
*netdev_linux_tc_make_request(const struct netdev
*,
461 static int tc_add_policer(struct netdev
*,
462 uint32_t kbits_rate
, uint32_t kbits_burst
);
464 static int tc_parse_qdisc(const struct ofpbuf
*, const char **kind
,
465 struct nlattr
**options
);
466 static int tc_parse_class(const struct ofpbuf
*, unsigned int *queue_id
,
467 struct nlattr
**options
,
468 struct netdev_queue_stats
*);
469 static int tc_query_class(const struct netdev
*,
470 unsigned int handle
, unsigned int parent
,
471 struct ofpbuf
**replyp
);
472 static int tc_delete_class(const struct netdev
*, unsigned int handle
);
474 static int tc_del_qdisc(struct netdev
*netdev
);
475 static int tc_query_qdisc(const struct netdev
*netdev
);
477 static int tc_calc_cell_log(unsigned int mtu
);
478 static void tc_fill_rate(struct tc_ratespec
*rate
, uint64_t bps
, int mtu
);
479 static void tc_put_rtab(struct ofpbuf
*, uint16_t type
,
480 const struct tc_ratespec
*rate
);
481 static int tc_calc_buffer(unsigned int Bps
, int mtu
, uint64_t burst_bytes
);
483 struct netdev_linux
{
486 /* Protects all members below. */
487 struct ovs_mutex mutex
;
489 unsigned int cache_valid
;
491 bool miimon
; /* Link status of last poll. */
492 long long int miimon_interval
; /* Miimon Poll rate. Disabled if <= 0. */
493 struct timer miimon_timer
;
495 int netnsid
; /* Network namespace ID. */
496 /* The following are figured out "on demand" only. They are only valid
497 * when the corresponding VALID_* bit in 'cache_valid' is set. */
499 struct eth_addr etheraddr
;
501 unsigned int ifi_flags
;
502 long long int carrier_resets
;
503 uint32_t kbits_rate
; /* Policing data. */
504 uint32_t kbits_burst
;
505 int vport_stats_error
; /* Cached error code from vport_get_stats().
506 0 or an errno value. */
507 int netdev_mtu_error
; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
508 int ether_addr_error
; /* Cached error code from set/get etheraddr. */
509 int netdev_policing_error
; /* Cached error code from set policing. */
510 int get_features_error
; /* Cached error code from ETHTOOL_GSET. */
511 int get_ifindex_error
; /* Cached error code from SIOCGIFINDEX. */
513 enum netdev_features current
; /* Cached from ETHTOOL_GSET. */
514 enum netdev_features advertised
; /* Cached from ETHTOOL_GSET. */
515 enum netdev_features supported
; /* Cached from ETHTOOL_GSET. */
517 struct ethtool_drvinfo drvinfo
; /* Cached from ETHTOOL_GDRVINFO. */
520 /* For devices of class netdev_tap_class only. */
522 bool present
; /* If the device is present in the namespace */
523 uint64_t tx_dropped
; /* tap device can drop if the iface is down */
525 /* LAG information. */
526 bool is_lag_master
; /* True if the netdev is a LAG master. */
529 struct netdev_rxq_linux
{
530 struct netdev_rxq up
;
535 /* This is set pretty low because we probably won't learn anything from the
536 * additional log messages. */
537 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(5, 20);
539 /* Polling miimon status for all ports causes performance degradation when
540 * handling a large number of ports. If there are no devices using miimon, then
541 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
543 * Readers do not depend on this variable synchronizing with the related
544 * changes in the device miimon status, so we can use atomic_count. */
545 static atomic_count miimon_cnt
= ATOMIC_COUNT_INIT(0);
547 static void netdev_linux_run(const struct netdev_class
*);
549 static int netdev_linux_do_ethtool(const char *name
, struct ethtool_cmd
*,
550 int cmd
, const char *cmd_name
);
551 static int get_flags(const struct netdev
*, unsigned int *flags
);
552 static int set_flags(const char *, unsigned int flags
);
553 static int update_flags(struct netdev_linux
*netdev
, enum netdev_flags off
,
554 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
555 OVS_REQUIRES(netdev
->mutex
);
556 static int get_ifindex(const struct netdev
*, int *ifindexp
);
557 static int do_set_addr(struct netdev
*netdev
,
558 int ioctl_nr
, const char *ioctl_name
,
559 struct in_addr addr
);
560 static int get_etheraddr(const char *netdev_name
, struct eth_addr
*ea
);
561 static int set_etheraddr(const char *netdev_name
, const struct eth_addr
);
562 static int get_stats_via_netlink(const struct netdev
*, struct netdev_stats
*);
563 static int af_packet_sock(void);
564 static bool netdev_linux_miimon_enabled(void);
565 static void netdev_linux_miimon_run(void);
566 static void netdev_linux_miimon_wait(void);
567 static int netdev_linux_get_mtu__(struct netdev_linux
*netdev
, int *mtup
);
570 is_netdev_linux_class(const struct netdev_class
*netdev_class
)
572 return netdev_class
->run
== netdev_linux_run
;
576 is_tap_netdev(const struct netdev
*netdev
)
578 return netdev_get_class(netdev
) == &netdev_tap_class
;
581 static struct netdev_linux
*
582 netdev_linux_cast(const struct netdev
*netdev
)
584 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev
)));
586 return CONTAINER_OF(netdev
, struct netdev_linux
, up
);
589 static struct netdev_rxq_linux
*
590 netdev_rxq_linux_cast(const struct netdev_rxq
*rx
)
592 ovs_assert(is_netdev_linux_class(netdev_get_class(rx
->netdev
)));
593 return CONTAINER_OF(rx
, struct netdev_rxq_linux
, up
);
597 netdev_linux_netnsid_update__(struct netdev_linux
*netdev
)
599 struct dpif_netlink_vport reply
;
603 error
= dpif_netlink_vport_get(netdev_get_name(&netdev
->up
), &reply
, &buf
);
605 if (error
== ENOENT
) {
606 /* Assume it is local if there is no API (e.g. if the openvswitch
607 * kernel module is not loaded). */
608 netnsid_set_local(&netdev
->netnsid
);
610 netnsid_unset(&netdev
->netnsid
);
615 netnsid_set(&netdev
->netnsid
, reply
.netnsid
);
621 netdev_linux_netnsid_update(struct netdev_linux
*netdev
)
623 if (netnsid_is_unset(netdev
->netnsid
)) {
624 if (netdev_get_class(&netdev
->up
) == &netdev_tap_class
) {
625 netnsid_set_local(&netdev
->netnsid
);
627 return netdev_linux_netnsid_update__(netdev
);
635 netdev_linux_netnsid_is_eq(struct netdev_linux
*netdev
, int nsid
)
637 netdev_linux_netnsid_update(netdev
);
638 return netnsid_eq(netdev
->netnsid
, nsid
);
642 netdev_linux_netnsid_is_remote(struct netdev_linux
*netdev
)
644 netdev_linux_netnsid_update(netdev
);
645 return netnsid_is_remote(netdev
->netnsid
);
648 static int netdev_linux_update_via_netlink(struct netdev_linux
*);
649 static void netdev_linux_update(struct netdev_linux
*netdev
, int,
650 const struct rtnetlink_change
*)
651 OVS_REQUIRES(netdev
->mutex
);
652 static void netdev_linux_changed(struct netdev_linux
*netdev
,
653 unsigned int ifi_flags
, unsigned int mask
)
654 OVS_REQUIRES(netdev
->mutex
);
656 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
657 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
658 * if no such socket could be created. */
659 static struct nl_sock
*
660 netdev_linux_notify_sock(void)
662 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
663 static struct nl_sock
*sock
;
664 unsigned int mcgroups
[] = {RTNLGRP_LINK
, RTNLGRP_IPV4_IFADDR
,
665 RTNLGRP_IPV6_IFADDR
, RTNLGRP_IPV6_IFINFO
};
667 if (ovsthread_once_start(&once
)) {
670 error
= nl_sock_create(NETLINK_ROUTE
, &sock
);
674 for (i
= 0; i
< ARRAY_SIZE(mcgroups
); i
++) {
675 error
= nl_sock_join_mcgroup(sock
, mcgroups
[i
]);
677 nl_sock_destroy(sock
);
683 nl_sock_listen_all_nsid(sock
, true);
684 ovsthread_once_done(&once
);
691 netdev_linux_miimon_enabled(void)
693 return atomic_count_get(&miimon_cnt
) > 0;
/* Returns true if 'kind' names a Linux link-aggregation device type
 * ("bond" or "team"). */
static bool
netdev_linux_kind_is_lag(const char *kind)
{
    return !strcmp(kind, "bond") || !strcmp(kind, "team");
}
707 netdev_linux_update_lag(struct rtnetlink_change
*change
)
708 OVS_REQUIRES(lag_mutex
)
710 struct linux_lag_slave
*lag
;
712 if (!rtnetlink_type_is_rtnlgrp_link(change
->nlmsg_type
)) {
716 if (change
->slave
&& netdev_linux_kind_is_lag(change
->slave
)) {
717 lag
= shash_find_data(&lag_shash
, change
->ifname
);
720 struct netdev
*master_netdev
;
721 char master_name
[IFNAMSIZ
];
725 if_indextoname(change
->master_ifindex
, master_name
);
726 master_netdev
= netdev_from_name(master_name
);
727 if (!master_netdev
) {
731 if (is_netdev_linux_class(master_netdev
->netdev_class
)) {
732 block_id
= netdev_get_block_id(master_netdev
);
734 netdev_close(master_netdev
);
738 lag
= xmalloc(sizeof *lag
);
739 lag
->block_id
= block_id
;
740 lag
->node
= shash_add(&lag_shash
, change
->ifname
, lag
);
742 /* LAG master is linux netdev so add slave to same block. */
743 error
= tc_add_del_ingress_qdisc(change
->if_index
, true,
746 VLOG_WARN("failed to bind LAG slave to master's block");
747 shash_delete(&lag_shash
, lag
->node
);
752 netdev_close(master_netdev
);
754 } else if (change
->master_ifindex
== 0) {
755 /* Check if this was a lag slave that has been freed. */
756 lag
= shash_find_data(&lag_shash
, change
->ifname
);
759 tc_add_del_ingress_qdisc(change
->if_index
, false,
761 shash_delete(&lag_shash
, lag
->node
);
768 netdev_linux_run(const struct netdev_class
*netdev_class OVS_UNUSED
)
770 struct nl_sock
*sock
;
773 if (netdev_linux_miimon_enabled()) {
774 netdev_linux_miimon_run();
777 sock
= netdev_linux_notify_sock();
783 uint64_t buf_stub
[4096 / 8];
787 ofpbuf_use_stub(&buf
, buf_stub
, sizeof buf_stub
);
788 error
= nl_sock_recv(sock
, &buf
, &nsid
, false);
790 struct rtnetlink_change change
;
792 if (rtnetlink_parse(&buf
, &change
)) {
793 struct netdev
*netdev_
= NULL
;
794 char dev_name
[IFNAMSIZ
];
796 if (!change
.ifname
) {
797 change
.ifname
= if_indextoname(change
.if_index
, dev_name
);
801 netdev_
= netdev_from_name(change
.ifname
);
803 if (netdev_
&& is_netdev_linux_class(netdev_
->netdev_class
)) {
804 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
806 ovs_mutex_lock(&netdev
->mutex
);
807 netdev_linux_update(netdev
, nsid
, &change
);
808 ovs_mutex_unlock(&netdev
->mutex
);
810 else if (!netdev_
&& change
.ifname
) {
811 /* Netdev is not present in OvS but its master could be. */
812 ovs_mutex_lock(&lag_mutex
);
813 netdev_linux_update_lag(&change
);
814 ovs_mutex_unlock(&lag_mutex
);
816 netdev_close(netdev_
);
818 } else if (error
== ENOBUFS
) {
819 struct shash device_shash
;
820 struct shash_node
*node
;
824 shash_init(&device_shash
);
825 netdev_get_devices(&netdev_linux_class
, &device_shash
);
826 SHASH_FOR_EACH (node
, &device_shash
) {
827 struct netdev
*netdev_
= node
->data
;
828 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
831 ovs_mutex_lock(&netdev
->mutex
);
832 get_flags(netdev_
, &flags
);
833 netdev_linux_changed(netdev
, flags
, 0);
834 ovs_mutex_unlock(&netdev
->mutex
);
836 netdev_close(netdev_
);
838 shash_destroy(&device_shash
);
839 } else if (error
!= EAGAIN
) {
840 static struct vlog_rate_limit rll
= VLOG_RATE_LIMIT_INIT(1, 5);
841 VLOG_WARN_RL(&rll
, "error reading or parsing netlink (%s)",
842 ovs_strerror(error
));
849 netdev_linux_wait(const struct netdev_class
*netdev_class OVS_UNUSED
)
851 struct nl_sock
*sock
;
853 if (netdev_linux_miimon_enabled()) {
854 netdev_linux_miimon_wait();
856 sock
= netdev_linux_notify_sock();
858 nl_sock_wait(sock
, POLLIN
);
863 netdev_linux_changed(struct netdev_linux
*dev
,
864 unsigned int ifi_flags
, unsigned int mask
)
865 OVS_REQUIRES(dev
->mutex
)
867 netdev_change_seq_changed(&dev
->up
);
869 if ((dev
->ifi_flags
^ ifi_flags
) & IFF_RUNNING
) {
870 dev
->carrier_resets
++;
872 dev
->ifi_flags
= ifi_flags
;
874 dev
->cache_valid
&= mask
;
875 if (!(mask
& VALID_IN
)) {
876 netdev_get_addrs_list_flush();
881 netdev_linux_update__(struct netdev_linux
*dev
,
882 const struct rtnetlink_change
*change
)
883 OVS_REQUIRES(dev
->mutex
)
885 if (rtnetlink_type_is_rtnlgrp_link(change
->nlmsg_type
)) {
886 if (change
->nlmsg_type
== RTM_NEWLINK
) {
887 /* Keep drv-info, and ip addresses. */
888 netdev_linux_changed(dev
, change
->ifi_flags
,
889 VALID_DRVINFO
| VALID_IN
);
891 /* Update netdev from rtnl-change msg. */
893 dev
->mtu
= change
->mtu
;
894 dev
->cache_valid
|= VALID_MTU
;
895 dev
->netdev_mtu_error
= 0;
898 if (!eth_addr_is_zero(change
->mac
)) {
899 dev
->etheraddr
= change
->mac
;
900 dev
->cache_valid
|= VALID_ETHERADDR
;
901 dev
->ether_addr_error
= 0;
903 /* The mac addr has been changed, report it now. */
904 rtnetlink_report_link();
907 if (change
->master
&& netdev_linux_kind_is_lag(change
->master
)) {
908 dev
->is_lag_master
= true;
911 dev
->ifindex
= change
->if_index
;
912 dev
->cache_valid
|= VALID_IFINDEX
;
913 dev
->get_ifindex_error
= 0;
917 netdev_linux_changed(dev
, change
->ifi_flags
, 0);
918 dev
->present
= false;
919 netnsid_unset(&dev
->netnsid
);
921 } else if (rtnetlink_type_is_rtnlgrp_addr(change
->nlmsg_type
)) {
922 /* Invalidates in4, in6. */
923 netdev_linux_changed(dev
, dev
->ifi_flags
, ~VALID_IN
);
930 netdev_linux_update(struct netdev_linux
*dev
, int nsid
,
931 const struct rtnetlink_change
*change
)
932 OVS_REQUIRES(dev
->mutex
)
934 if (netdev_linux_netnsid_is_eq(dev
, nsid
)) {
935 netdev_linux_update__(dev
, change
);
939 static struct netdev
*
940 netdev_linux_alloc(void)
942 struct netdev_linux
*netdev
= xzalloc(sizeof *netdev
);
947 netdev_linux_common_construct(struct netdev
*netdev_
)
949 /* Prevent any attempt to create (or open) a network device named "default"
950 * or "all". These device names are effectively reserved on Linux because
951 * /proc/sys/net/ipv4/conf/ always contains directories by these names. By
952 * itself this wouldn't call for any special treatment, but in practice if
953 * a program tries to create devices with these names, it causes the kernel
954 * to fire a "new device" notification event even though creation failed,
955 * and in turn that causes OVS to wake up and try to create them again,
956 * which ends up as a 100% CPU loop. */
957 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
958 const char *name
= netdev_
->name
;
959 if (!strcmp(name
, "default") || !strcmp(name
, "all")) {
960 static struct vlog_rate_limit rll
= VLOG_RATE_LIMIT_INIT(1, 1);
961 VLOG_WARN_RL(&rll
, "%s: Linux forbids network device with this name",
966 /* The device could be in the same network namespace or in another one. */
967 netnsid_unset(&netdev
->netnsid
);
968 ovs_mutex_init(&netdev
->mutex
);
972 /* Creates system and internal devices. */
974 netdev_linux_construct(struct netdev
*netdev_
)
976 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
977 int error
= netdev_linux_common_construct(netdev_
);
982 error
= get_flags(&netdev
->up
, &netdev
->ifi_flags
);
983 if (error
== ENODEV
) {
984 if (netdev
->up
.netdev_class
!= &netdev_internal_class
) {
985 /* The device does not exist, so don't allow it to be opened. */
988 /* "Internal" netdevs have to be created as netdev objects before
989 * they exist in the kernel, because creating them in the kernel
990 * happens by passing a netdev object to dpif_port_add().
991 * Therefore, ignore the error. */
998 /* For most types of netdevs we open the device for each call of
999 * netdev_open(). However, this is not the case with tap devices,
1000 * since it is only possible to open the device once. In this
1001 * situation we share a single file descriptor, and consequently
1002 * buffers, across all readers. Therefore once data is read it will
1003 * be unavailable to other reads for tap devices. */
1005 netdev_linux_construct_tap(struct netdev
*netdev_
)
1007 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1008 static const char tap_dev
[] = "/dev/net/tun";
1009 const char *name
= netdev_
->name
;
1012 int error
= netdev_linux_common_construct(netdev_
);
1017 /* Open tap device. */
1018 netdev
->tap_fd
= open(tap_dev
, O_RDWR
);
1019 if (netdev
->tap_fd
< 0) {
1021 VLOG_WARN("opening \"%s\" failed: %s", tap_dev
, ovs_strerror(error
));
1025 /* Create tap device. */
1026 get_flags(&netdev
->up
, &netdev
->ifi_flags
);
1027 ifr
.ifr_flags
= IFF_TAP
| IFF_NO_PI
;
1028 ovs_strzcpy(ifr
.ifr_name
, name
, sizeof ifr
.ifr_name
);
1029 if (ioctl(netdev
->tap_fd
, TUNSETIFF
, &ifr
) == -1) {
1030 VLOG_WARN("%s: creating tap device failed: %s", name
,
1031 ovs_strerror(errno
));
1036 /* Make non-blocking. */
1037 error
= set_nonblocking(netdev
->tap_fd
);
1042 if (ioctl(netdev
->tap_fd
, TUNSETPERSIST
, 1)) {
1043 VLOG_WARN("%s: creating tap device failed (persist): %s", name
,
1044 ovs_strerror(errno
));
1049 netdev
->present
= true;
1053 close(netdev
->tap_fd
);
1058 netdev_linux_destruct(struct netdev
*netdev_
)
1060 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1062 if (netdev
->tc
&& netdev
->tc
->ops
->tc_destroy
) {
1063 netdev
->tc
->ops
->tc_destroy(netdev
->tc
);
1066 if (netdev_get_class(netdev_
) == &netdev_tap_class
1067 && netdev
->tap_fd
>= 0)
1069 ioctl(netdev
->tap_fd
, TUNSETPERSIST
, 0);
1070 close(netdev
->tap_fd
);
1073 if (netdev
->miimon_interval
> 0) {
1074 atomic_count_dec(&miimon_cnt
);
1077 ovs_mutex_destroy(&netdev
->mutex
);
1081 netdev_linux_dealloc(struct netdev
*netdev_
)
1083 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1087 static struct netdev_rxq
*
1088 netdev_linux_rxq_alloc(void)
1090 struct netdev_rxq_linux
*rx
= xzalloc(sizeof *rx
);
1095 netdev_linux_rxq_construct(struct netdev_rxq
*rxq_
)
1097 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1098 struct netdev
*netdev_
= rx
->up
.netdev
;
1099 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1102 ovs_mutex_lock(&netdev
->mutex
);
1103 rx
->is_tap
= is_tap_netdev(netdev_
);
1105 rx
->fd
= netdev
->tap_fd
;
1107 struct sockaddr_ll sll
;
1109 /* Result of tcpdump -dd inbound */
1110 static const struct sock_filter filt
[] = {
1111 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
1112 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
1113 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
1114 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
1116 static const struct sock_fprog fprog
= {
1117 ARRAY_SIZE(filt
), (struct sock_filter
*) filt
1120 /* Create file descriptor. */
1121 rx
->fd
= socket(PF_PACKET
, SOCK_RAW
, 0);
1124 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error
));
1129 if (setsockopt(rx
->fd
, SOL_PACKET
, PACKET_AUXDATA
, &val
, sizeof val
)) {
1131 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
1132 netdev_get_name(netdev_
), ovs_strerror(error
));
1136 /* Set non-blocking mode. */
1137 error
= set_nonblocking(rx
->fd
);
1142 /* Get ethernet device index. */
1143 error
= get_ifindex(&netdev
->up
, &ifindex
);
1148 /* Bind to specific ethernet device. */
1149 memset(&sll
, 0, sizeof sll
);
1150 sll
.sll_family
= AF_PACKET
;
1151 sll
.sll_ifindex
= ifindex
;
1152 sll
.sll_protocol
= htons(ETH_P_ALL
);
1153 if (bind(rx
->fd
, (struct sockaddr
*) &sll
, sizeof sll
) < 0) {
1155 VLOG_ERR("%s: failed to bind raw socket (%s)",
1156 netdev_get_name(netdev_
), ovs_strerror(error
));
1160 /* Filter for only inbound packets. */
1161 error
= setsockopt(rx
->fd
, SOL_SOCKET
, SO_ATTACH_FILTER
, &fprog
,
1165 VLOG_ERR("%s: failed to attach filter (%s)",
1166 netdev_get_name(netdev_
), ovs_strerror(error
));
1170 ovs_mutex_unlock(&netdev
->mutex
);
1178 ovs_mutex_unlock(&netdev
->mutex
);
1183 netdev_linux_rxq_destruct(struct netdev_rxq
*rxq_
)
1185 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1193 netdev_linux_rxq_dealloc(struct netdev_rxq
*rxq_
)
1195 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1201 auxdata_to_vlan_tpid(const struct tpacket_auxdata
*aux
, bool double_tagged
)
1203 if (aux
->tp_status
& TP_STATUS_VLAN_TPID_VALID
) {
1204 return htons(aux
->tp_vlan_tpid
);
1205 } else if (double_tagged
) {
1206 return htons(ETH_TYPE_VLAN_8021AD
);
1208 return htons(ETH_TYPE_VLAN_8021Q
);
/* Returns true if kernel auxdata 'aux' describes a packet carrying a VLAN
 * tag: either the TCI itself is nonzero or the kernel explicitly flagged the
 * VLAN information as valid (needed because a TCI of 0 is legal). */
static bool
auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
{
    return aux->tp_vlan_tci || (aux->tp_status & TP_STATUS_VLAN_VALID);
}
1219 netdev_linux_rxq_recv_sock(int fd
, struct dp_packet
*buffer
)
1224 struct cmsghdr
*cmsg
;
1226 struct cmsghdr cmsg
;
1227 char buffer
[CMSG_SPACE(sizeof(struct tpacket_auxdata
))];
1231 /* Reserve headroom for a single VLAN tag */
1232 dp_packet_reserve(buffer
, VLAN_HEADER_LEN
);
1233 size
= dp_packet_tailroom(buffer
);
1235 iov
.iov_base
= dp_packet_data(buffer
);
1237 msgh
.msg_name
= NULL
;
1238 msgh
.msg_namelen
= 0;
1239 msgh
.msg_iov
= &iov
;
1240 msgh
.msg_iovlen
= 1;
1241 msgh
.msg_control
= &cmsg_buffer
;
1242 msgh
.msg_controllen
= sizeof cmsg_buffer
;
1246 retval
= recvmsg(fd
, &msgh
, MSG_TRUNC
);
1247 } while (retval
< 0 && errno
== EINTR
);
1251 } else if (retval
> size
) {
1255 dp_packet_set_size(buffer
, dp_packet_size(buffer
) + retval
);
1257 for (cmsg
= CMSG_FIRSTHDR(&msgh
); cmsg
; cmsg
= CMSG_NXTHDR(&msgh
, cmsg
)) {
1258 const struct tpacket_auxdata
*aux
;
1260 if (cmsg
->cmsg_level
!= SOL_PACKET
1261 || cmsg
->cmsg_type
!= PACKET_AUXDATA
1262 || cmsg
->cmsg_len
< CMSG_LEN(sizeof(struct tpacket_auxdata
))) {
1266 aux
= ALIGNED_CAST(struct tpacket_auxdata
*, CMSG_DATA(cmsg
));
1267 if (auxdata_has_vlan_tci(aux
)) {
1268 struct eth_header
*eth
;
1271 if (retval
< ETH_HEADER_LEN
) {
1275 eth
= dp_packet_data(buffer
);
1276 double_tagged
= eth
->eth_type
== htons(ETH_TYPE_VLAN_8021Q
);
1278 eth_push_vlan(buffer
, auxdata_to_vlan_tpid(aux
, double_tagged
),
1279 htons(aux
->tp_vlan_tci
));
1288 netdev_linux_rxq_recv_tap(int fd
, struct dp_packet
*buffer
)
1291 size_t size
= dp_packet_tailroom(buffer
);
1294 retval
= read(fd
, dp_packet_data(buffer
), size
);
1295 } while (retval
< 0 && errno
== EINTR
);
1301 dp_packet_set_size(buffer
, dp_packet_size(buffer
) + retval
);
1306 netdev_linux_rxq_recv(struct netdev_rxq
*rxq_
, struct dp_packet_batch
*batch
,
1309 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1310 struct netdev
*netdev
= rx
->up
.netdev
;
1311 struct dp_packet
*buffer
;
1315 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
)) {
1316 mtu
= ETH_PAYLOAD_MAX
;
1319 /* Assume Ethernet port. No need to set packet_type. */
1320 buffer
= dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN
+ mtu
,
1321 DP_NETDEV_HEADROOM
);
1322 retval
= (rx
->is_tap
1323 ? netdev_linux_rxq_recv_tap(rx
->fd
, buffer
)
1324 : netdev_linux_rxq_recv_sock(rx
->fd
, buffer
));
1327 if (retval
!= EAGAIN
&& retval
!= EMSGSIZE
) {
1328 VLOG_WARN_RL(&rl
, "error receiving Ethernet packet on %s: %s",
1329 netdev_rxq_get_name(rxq_
), ovs_strerror(errno
));
1331 dp_packet_delete(buffer
);
1333 dp_packet_batch_init_packet(batch
, buffer
);
1344 netdev_linux_rxq_wait(struct netdev_rxq
*rxq_
)
1346 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1347 poll_fd_wait(rx
->fd
, POLLIN
);
1351 netdev_linux_rxq_drain(struct netdev_rxq
*rxq_
)
1353 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1356 int error
= af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_
), &ifr
,
1357 SIOCGIFTXQLEN
, "SIOCGIFTXQLEN");
1361 drain_fd(rx
->fd
, ifr
.ifr_qlen
);
1364 return drain_rcvbuf(rx
->fd
);
1369 netdev_linux_sock_batch_send(int sock
, int ifindex
,
1370 struct dp_packet_batch
*batch
)
1372 const size_t size
= dp_packet_batch_size(batch
);
1373 /* We don't bother setting most fields in sockaddr_ll because the
1374 * kernel ignores them for SOCK_RAW. */
1375 struct sockaddr_ll sll
= { .sll_family
= AF_PACKET
,
1376 .sll_ifindex
= ifindex
};
1378 struct mmsghdr
*mmsg
= xmalloc(sizeof(*mmsg
) * size
);
1379 struct iovec
*iov
= xmalloc(sizeof(*iov
) * size
);
1381 struct dp_packet
*packet
;
1382 DP_PACKET_BATCH_FOR_EACH (i
, packet
, batch
) {
1383 iov
[i
].iov_base
= dp_packet_data(packet
);
1384 iov
[i
].iov_len
= dp_packet_size(packet
);
1385 mmsg
[i
].msg_hdr
= (struct msghdr
) { .msg_name
= &sll
,
1386 .msg_namelen
= sizeof sll
,
1392 for (uint32_t ofs
= 0; ofs
< size
; ) {
1395 retval
= sendmmsg(sock
, mmsg
+ ofs
, size
- ofs
, 0);
1396 error
= retval
< 0 ? errno
: 0;
1397 } while (error
== EINTR
);
1409 /* Use the tap fd to send 'batch' to tap device 'netdev'. Using the tap fd is
1410 * essential, because packets sent to a tap device with an AF_PACKET socket
1411 * will loop back to be *received* again on the tap device. This doesn't occur
1412 * on other interface types because we attach a socket filter to the rx
1415 netdev_linux_tap_batch_send(struct netdev
*netdev_
,
1416 struct dp_packet_batch
*batch
)
1418 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1419 struct dp_packet
*packet
;
1421 /* The Linux tap driver returns EIO if the device is not up,
1422 * so if the device is not up, don't waste time sending it.
1423 * However, if the device is in another network namespace
1424 * then OVS can't retrieve the state. In that case, send the
1425 * packets anyway. */
1426 if (netdev
->present
&& !(netdev
->ifi_flags
& IFF_UP
)) {
1427 netdev
->tx_dropped
+= dp_packet_batch_size(batch
);
1431 DP_PACKET_BATCH_FOR_EACH (i
, packet
, batch
) {
1432 size_t size
= dp_packet_size(packet
);
1437 retval
= write(netdev
->tap_fd
, dp_packet_data(packet
), size
);
1438 error
= retval
< 0 ? errno
: 0;
1439 } while (error
== EINTR
);
1442 /* The Linux tap driver returns EIO if the device is not up. From
1443 * the OVS side this is not an error, so we ignore it; otherwise,
1444 * return the error. */
1448 } else if (retval
!= size
) {
1449 VLOG_WARN_RL(&rl
, "sent partial Ethernet packet (%"PRIuSIZE
" "
1450 "bytes of %"PRIuSIZE
") on %s",
1451 retval
, size
, netdev_get_name(netdev_
));
1458 /* Sends 'batch' on 'netdev'. Returns 0 if successful, otherwise a positive
1459 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1460 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1461 * the packet is too big or too small to transmit on the device.
1463 * The kernel maintains a packet transmission queue, so the caller is not
1464 * expected to do additional queuing of packets. */
1466 netdev_linux_send(struct netdev
*netdev_
, int qid OVS_UNUSED
,
1467 struct dp_packet_batch
*batch
,
1468 bool concurrent_txq OVS_UNUSED
)
1473 if (!is_tap_netdev(netdev_
)) {
1474 if (netdev_linux_netnsid_is_remote(netdev_linux_cast(netdev_
))) {
1479 sock
= af_packet_sock();
1485 int ifindex
= netdev_get_ifindex(netdev_
);
1491 error
= netdev_linux_sock_batch_send(sock
, ifindex
, batch
);
1493 error
= netdev_linux_tap_batch_send(netdev_
, batch
);
1496 if (error
== ENOBUFS
) {
1497 /* The Linux AF_PACKET implementation never blocks waiting
1498 * for room for packets, instead returning ENOBUFS.
1499 * Translate this into EAGAIN for the caller. */
1502 VLOG_WARN_RL(&rl
, "error sending Ethernet packet on %s: %s",
1503 netdev_get_name(netdev_
), ovs_strerror(error
));
1508 dp_packet_delete_batch(batch
, true);
1512 /* Registers with the poll loop to wake up from the next call to poll_block()
1513 * when the packet transmission queue has sufficient room to transmit a packet
1514 * with netdev_send().
1516 * The kernel maintains a packet transmission queue, so the client is not
1517 * expected to do additional queuing of packets. Thus, this function is
1518 * unlikely to ever be used. It is included for completeness. */
1520 netdev_linux_send_wait(struct netdev
*netdev
, int qid OVS_UNUSED
)
1522 if (is_tap_netdev(netdev
)) {
1523 /* TAP device always accepts packets.*/
1524 poll_immediate_wake();
1528 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1529 * otherwise a positive errno value. */
1531 netdev_linux_set_etheraddr(struct netdev
*netdev_
, const struct eth_addr mac
)
1533 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1534 enum netdev_flags old_flags
= 0;
1537 ovs_mutex_lock(&netdev
->mutex
);
1538 if (netdev_linux_netnsid_is_remote(netdev
)) {
1543 if (netdev
->cache_valid
& VALID_ETHERADDR
) {
1544 error
= netdev
->ether_addr_error
;
1545 if (error
|| eth_addr_equals(netdev
->etheraddr
, mac
)) {
1548 netdev
->cache_valid
&= ~VALID_ETHERADDR
;
1551 /* Tap devices must be brought down before setting the address. */
1552 if (is_tap_netdev(netdev_
)) {
1553 update_flags(netdev
, NETDEV_UP
, 0, &old_flags
);
1555 error
= set_etheraddr(netdev_get_name(netdev_
), mac
);
1556 if (!error
|| error
== ENODEV
) {
1557 netdev
->ether_addr_error
= error
;
1558 netdev
->cache_valid
|= VALID_ETHERADDR
;
1560 netdev
->etheraddr
= mac
;
1564 if (is_tap_netdev(netdev_
) && old_flags
& NETDEV_UP
) {
1565 update_flags(netdev
, 0, NETDEV_UP
, &old_flags
);
1569 ovs_mutex_unlock(&netdev
->mutex
);
1573 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1575 netdev_linux_get_etheraddr(const struct netdev
*netdev_
, struct eth_addr
*mac
)
1577 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1580 ovs_mutex_lock(&netdev
->mutex
);
1581 if (!(netdev
->cache_valid
& VALID_ETHERADDR
)) {
1582 netdev_linux_update_via_netlink(netdev
);
1585 if (!(netdev
->cache_valid
& VALID_ETHERADDR
)) {
1586 /* Fall back to ioctl if netlink fails */
1587 netdev
->ether_addr_error
= get_etheraddr(netdev_get_name(netdev_
),
1588 &netdev
->etheraddr
);
1589 netdev
->cache_valid
|= VALID_ETHERADDR
;
1592 error
= netdev
->ether_addr_error
;
1594 *mac
= netdev
->etheraddr
;
1596 ovs_mutex_unlock(&netdev
->mutex
);
1602 netdev_linux_get_mtu__(struct netdev_linux
*netdev
, int *mtup
)
1606 if (!(netdev
->cache_valid
& VALID_MTU
)) {
1607 netdev_linux_update_via_netlink(netdev
);
1610 if (!(netdev
->cache_valid
& VALID_MTU
)) {
1611 /* Fall back to ioctl if netlink fails */
1614 netdev
->netdev_mtu_error
= af_inet_ifreq_ioctl(
1615 netdev_get_name(&netdev
->up
), &ifr
, SIOCGIFMTU
, "SIOCGIFMTU");
1616 netdev
->mtu
= ifr
.ifr_mtu
;
1617 netdev
->cache_valid
|= VALID_MTU
;
1620 error
= netdev
->netdev_mtu_error
;
1622 *mtup
= netdev
->mtu
;
1628 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1629 * in bytes, not including the hardware header; thus, this is typically 1500
1630 * bytes for Ethernet devices. */
1632 netdev_linux_get_mtu(const struct netdev
*netdev_
, int *mtup
)
1634 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1637 ovs_mutex_lock(&netdev
->mutex
);
1638 error
= netdev_linux_get_mtu__(netdev
, mtup
);
1639 ovs_mutex_unlock(&netdev
->mutex
);
1644 /* Sets the maximum size of transmitted (MTU) for given device using linux
1645 * networking ioctl interface.
1648 netdev_linux_set_mtu(struct netdev
*netdev_
, int mtu
)
1650 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1654 ovs_mutex_lock(&netdev
->mutex
);
1655 if (netdev_linux_netnsid_is_remote(netdev
)) {
1660 if (netdev
->cache_valid
& VALID_MTU
) {
1661 error
= netdev
->netdev_mtu_error
;
1662 if (error
|| netdev
->mtu
== mtu
) {
1665 netdev
->cache_valid
&= ~VALID_MTU
;
1668 error
= af_inet_ifreq_ioctl(netdev_get_name(netdev_
), &ifr
,
1669 SIOCSIFMTU
, "SIOCSIFMTU");
1670 if (!error
|| error
== ENODEV
) {
1671 netdev
->netdev_mtu_error
= error
;
1672 netdev
->mtu
= ifr
.ifr_mtu
;
1673 netdev
->cache_valid
|= VALID_MTU
;
1676 ovs_mutex_unlock(&netdev
->mutex
);
1680 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1681 * On failure, returns a negative errno value. */
1683 netdev_linux_get_ifindex(const struct netdev
*netdev_
)
1685 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1688 ovs_mutex_lock(&netdev
->mutex
);
1689 if (netdev_linux_netnsid_is_remote(netdev
)) {
1693 error
= get_ifindex(netdev_
, &ifindex
);
1696 ovs_mutex_unlock(&netdev
->mutex
);
1697 return error
? -error
: ifindex
;
1701 netdev_linux_get_carrier(const struct netdev
*netdev_
, bool *carrier
)
1703 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1705 ovs_mutex_lock(&netdev
->mutex
);
1706 if (netdev
->miimon_interval
> 0) {
1707 *carrier
= netdev
->miimon
;
1709 *carrier
= (netdev
->ifi_flags
& IFF_RUNNING
) != 0;
1711 ovs_mutex_unlock(&netdev
->mutex
);
1716 static long long int
1717 netdev_linux_get_carrier_resets(const struct netdev
*netdev_
)
1719 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1720 long long int carrier_resets
;
1722 ovs_mutex_lock(&netdev
->mutex
);
1723 carrier_resets
= netdev
->carrier_resets
;
1724 ovs_mutex_unlock(&netdev
->mutex
);
1726 return carrier_resets
;
1730 netdev_linux_do_miimon(const char *name
, int cmd
, const char *cmd_name
,
1731 struct mii_ioctl_data
*data
)
1736 memset(&ifr
, 0, sizeof ifr
);
1737 memcpy(&ifr
.ifr_data
, data
, sizeof *data
);
1738 error
= af_inet_ifreq_ioctl(name
, &ifr
, cmd
, cmd_name
);
1739 memcpy(data
, &ifr
.ifr_data
, sizeof *data
);
1745 netdev_linux_get_miimon(const char *name
, bool *miimon
)
1747 struct mii_ioctl_data data
;
1752 memset(&data
, 0, sizeof data
);
1753 error
= netdev_linux_do_miimon(name
, SIOCGMIIPHY
, "SIOCGMIIPHY", &data
);
1755 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1756 data
.reg_num
= MII_BMSR
;
1757 error
= netdev_linux_do_miimon(name
, SIOCGMIIREG
, "SIOCGMIIREG",
1761 *miimon
= !!(data
.val_out
& BMSR_LSTATUS
);
1765 struct ethtool_cmd ecmd
;
1767 VLOG_DBG_RL(&rl
, "%s: failed to query MII, falling back to ethtool",
1770 COVERAGE_INC(netdev_get_ethtool
);
1771 memset(&ecmd
, 0, sizeof ecmd
);
1772 error
= netdev_linux_do_ethtool(name
, &ecmd
, ETHTOOL_GLINK
,
1775 struct ethtool_value eval
;
1777 memcpy(&eval
, &ecmd
, sizeof eval
);
1778 *miimon
= !!eval
.data
;
1780 VLOG_WARN_RL(&rl
, "%s: ethtool link status failed", name
);
1788 netdev_linux_set_miimon_interval(struct netdev
*netdev_
,
1789 long long int interval
)
1791 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1793 ovs_mutex_lock(&netdev
->mutex
);
1794 interval
= interval
> 0 ? MAX(interval
, 100) : 0;
1795 if (netdev
->miimon_interval
!= interval
) {
1796 if (interval
&& !netdev
->miimon_interval
) {
1797 atomic_count_inc(&miimon_cnt
);
1798 } else if (!interval
&& netdev
->miimon_interval
) {
1799 atomic_count_dec(&miimon_cnt
);
1802 netdev
->miimon_interval
= interval
;
1803 timer_set_expired(&netdev
->miimon_timer
);
1805 ovs_mutex_unlock(&netdev
->mutex
);
1811 netdev_linux_miimon_run(void)
1813 struct shash device_shash
;
1814 struct shash_node
*node
;
1816 shash_init(&device_shash
);
1817 netdev_get_devices(&netdev_linux_class
, &device_shash
);
1818 SHASH_FOR_EACH (node
, &device_shash
) {
1819 struct netdev
*netdev
= node
->data
;
1820 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
1823 ovs_mutex_lock(&dev
->mutex
);
1824 if (dev
->miimon_interval
> 0 && timer_expired(&dev
->miimon_timer
)) {
1825 netdev_linux_get_miimon(dev
->up
.name
, &miimon
);
1826 if (miimon
!= dev
->miimon
) {
1827 dev
->miimon
= miimon
;
1828 netdev_linux_changed(dev
, dev
->ifi_flags
, 0);
1831 timer_set_duration(&dev
->miimon_timer
, dev
->miimon_interval
);
1833 ovs_mutex_unlock(&dev
->mutex
);
1834 netdev_close(netdev
);
1837 shash_destroy(&device_shash
);
1841 netdev_linux_miimon_wait(void)
1843 struct shash device_shash
;
1844 struct shash_node
*node
;
1846 shash_init(&device_shash
);
1847 netdev_get_devices(&netdev_linux_class
, &device_shash
);
1848 SHASH_FOR_EACH (node
, &device_shash
) {
1849 struct netdev
*netdev
= node
->data
;
1850 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
1852 ovs_mutex_lock(&dev
->mutex
);
1853 if (dev
->miimon_interval
> 0) {
1854 timer_wait(&dev
->miimon_timer
);
1856 ovs_mutex_unlock(&dev
->mutex
);
1857 netdev_close(netdev
);
1859 shash_destroy(&device_shash
);
1863 swap_uint64(uint64_t *a
, uint64_t *b
)
1870 /* Copies 'src' into 'dst', performing format conversion in the process.
1872 * 'src' is allowed to be misaligned. */
1874 netdev_stats_from_ovs_vport_stats(struct netdev_stats
*dst
,
1875 const struct ovs_vport_stats
*src
)
1877 dst
->rx_packets
= get_32aligned_u64(&src
->rx_packets
);
1878 dst
->tx_packets
= get_32aligned_u64(&src
->tx_packets
);
1879 dst
->rx_bytes
= get_32aligned_u64(&src
->rx_bytes
);
1880 dst
->tx_bytes
= get_32aligned_u64(&src
->tx_bytes
);
1881 dst
->rx_errors
= get_32aligned_u64(&src
->rx_errors
);
1882 dst
->tx_errors
= get_32aligned_u64(&src
->tx_errors
);
1883 dst
->rx_dropped
= get_32aligned_u64(&src
->rx_dropped
);
1884 dst
->tx_dropped
= get_32aligned_u64(&src
->tx_dropped
);
1886 dst
->collisions
= 0;
1887 dst
->rx_length_errors
= 0;
1888 dst
->rx_over_errors
= 0;
1889 dst
->rx_crc_errors
= 0;
1890 dst
->rx_frame_errors
= 0;
1891 dst
->rx_fifo_errors
= 0;
1892 dst
->rx_missed_errors
= 0;
1893 dst
->tx_aborted_errors
= 0;
1894 dst
->tx_carrier_errors
= 0;
1895 dst
->tx_fifo_errors
= 0;
1896 dst
->tx_heartbeat_errors
= 0;
1897 dst
->tx_window_errors
= 0;
1901 get_stats_via_vport__(const struct netdev
*netdev
, struct netdev_stats
*stats
)
1903 struct dpif_netlink_vport reply
;
1907 error
= dpif_netlink_vport_get(netdev_get_name(netdev
), &reply
, &buf
);
1910 } else if (!reply
.stats
) {
1915 netdev_stats_from_ovs_vport_stats(stats
, reply
.stats
);
1923 get_stats_via_vport(const struct netdev
*netdev_
,
1924 struct netdev_stats
*stats
)
1926 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1928 if (!netdev
->vport_stats_error
||
1929 !(netdev
->cache_valid
& VALID_VPORT_STAT_ERROR
)) {
1932 error
= get_stats_via_vport__(netdev_
, stats
);
1933 if (error
&& error
!= ENOENT
&& error
!= ENODEV
) {
1934 VLOG_WARN_RL(&rl
, "%s: obtaining netdev stats via vport failed "
1936 netdev_get_name(netdev_
), ovs_strerror(error
));
1938 netdev
->vport_stats_error
= error
;
1939 netdev
->cache_valid
|= VALID_VPORT_STAT_ERROR
;
1943 /* Retrieves current device stats for 'netdev-linux'. */
1945 netdev_linux_get_stats(const struct netdev
*netdev_
,
1946 struct netdev_stats
*stats
)
1948 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1949 struct netdev_stats dev_stats
;
1952 ovs_mutex_lock(&netdev
->mutex
);
1953 get_stats_via_vport(netdev_
, stats
);
1954 error
= get_stats_via_netlink(netdev_
, &dev_stats
);
1956 if (!netdev
->vport_stats_error
) {
1959 } else if (netdev
->vport_stats_error
) {
1960 /* stats not available from OVS then use netdev stats. */
1963 /* Use kernel netdev's packet and byte counts since vport's counters
1964 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1966 stats
->rx_packets
= dev_stats
.rx_packets
;
1967 stats
->rx_bytes
= dev_stats
.rx_bytes
;
1968 stats
->tx_packets
= dev_stats
.tx_packets
;
1969 stats
->tx_bytes
= dev_stats
.tx_bytes
;
1971 stats
->rx_errors
+= dev_stats
.rx_errors
;
1972 stats
->tx_errors
+= dev_stats
.tx_errors
;
1973 stats
->rx_dropped
+= dev_stats
.rx_dropped
;
1974 stats
->tx_dropped
+= dev_stats
.tx_dropped
;
1975 stats
->multicast
+= dev_stats
.multicast
;
1976 stats
->collisions
+= dev_stats
.collisions
;
1977 stats
->rx_length_errors
+= dev_stats
.rx_length_errors
;
1978 stats
->rx_over_errors
+= dev_stats
.rx_over_errors
;
1979 stats
->rx_crc_errors
+= dev_stats
.rx_crc_errors
;
1980 stats
->rx_frame_errors
+= dev_stats
.rx_frame_errors
;
1981 stats
->rx_fifo_errors
+= dev_stats
.rx_fifo_errors
;
1982 stats
->rx_missed_errors
+= dev_stats
.rx_missed_errors
;
1983 stats
->tx_aborted_errors
+= dev_stats
.tx_aborted_errors
;
1984 stats
->tx_carrier_errors
+= dev_stats
.tx_carrier_errors
;
1985 stats
->tx_fifo_errors
+= dev_stats
.tx_fifo_errors
;
1986 stats
->tx_heartbeat_errors
+= dev_stats
.tx_heartbeat_errors
;
1987 stats
->tx_window_errors
+= dev_stats
.tx_window_errors
;
1989 ovs_mutex_unlock(&netdev
->mutex
);
1994 /* Retrieves current device stats for 'netdev-tap' netdev or
1995 * netdev-internal. */
1997 netdev_tap_get_stats(const struct netdev
*netdev_
, struct netdev_stats
*stats
)
1999 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2000 struct netdev_stats dev_stats
;
2003 ovs_mutex_lock(&netdev
->mutex
);
2004 get_stats_via_vport(netdev_
, stats
);
2005 error
= get_stats_via_netlink(netdev_
, &dev_stats
);
2007 if (!netdev
->vport_stats_error
) {
2010 } else if (netdev
->vport_stats_error
) {
2011 /* Transmit and receive stats will appear to be swapped relative to the
2012 * other ports since we are the one sending the data, not a remote
2013 * computer. For consistency, we swap them back here. This does not
2014 * apply if we are getting stats from the vport layer because it always
2015 * tracks stats from the perspective of the switch. */
2018 swap_uint64(&stats
->rx_packets
, &stats
->tx_packets
);
2019 swap_uint64(&stats
->rx_bytes
, &stats
->tx_bytes
);
2020 swap_uint64(&stats
->rx_errors
, &stats
->tx_errors
);
2021 swap_uint64(&stats
->rx_dropped
, &stats
->tx_dropped
);
2022 stats
->rx_length_errors
= 0;
2023 stats
->rx_over_errors
= 0;
2024 stats
->rx_crc_errors
= 0;
2025 stats
->rx_frame_errors
= 0;
2026 stats
->rx_fifo_errors
= 0;
2027 stats
->rx_missed_errors
= 0;
2028 stats
->tx_aborted_errors
= 0;
2029 stats
->tx_carrier_errors
= 0;
2030 stats
->tx_fifo_errors
= 0;
2031 stats
->tx_heartbeat_errors
= 0;
2032 stats
->tx_window_errors
= 0;
2034 /* Use kernel netdev's packet and byte counts since vport counters
2035 * do not reflect packet counts on the wire when GSO, TSO or GRO
2037 stats
->rx_packets
= dev_stats
.tx_packets
;
2038 stats
->rx_bytes
= dev_stats
.tx_bytes
;
2039 stats
->tx_packets
= dev_stats
.rx_packets
;
2040 stats
->tx_bytes
= dev_stats
.rx_bytes
;
2042 stats
->rx_dropped
+= dev_stats
.tx_dropped
;
2043 stats
->tx_dropped
+= dev_stats
.rx_dropped
;
2045 stats
->rx_errors
+= dev_stats
.tx_errors
;
2046 stats
->tx_errors
+= dev_stats
.rx_errors
;
2048 stats
->multicast
+= dev_stats
.multicast
;
2049 stats
->collisions
+= dev_stats
.collisions
;
2051 stats
->tx_dropped
+= netdev
->tx_dropped
;
2052 ovs_mutex_unlock(&netdev
->mutex
);
2058 netdev_internal_get_stats(const struct netdev
*netdev_
,
2059 struct netdev_stats
*stats
)
2061 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2064 ovs_mutex_lock(&netdev
->mutex
);
2065 get_stats_via_vport(netdev_
, stats
);
2066 error
= netdev
->vport_stats_error
;
2067 ovs_mutex_unlock(&netdev
->mutex
);
2073 netdev_linux_read_features(struct netdev_linux
*netdev
)
2075 struct ethtool_cmd ecmd
;
2079 if (netdev
->cache_valid
& VALID_FEATURES
) {
2083 COVERAGE_INC(netdev_get_ethtool
);
2084 memset(&ecmd
, 0, sizeof ecmd
);
2085 error
= netdev_linux_do_ethtool(netdev
->up
.name
, &ecmd
,
2086 ETHTOOL_GSET
, "ETHTOOL_GSET");
2091 /* Supported features. */
2092 netdev
->supported
= 0;
2093 if (ecmd
.supported
& SUPPORTED_10baseT_Half
) {
2094 netdev
->supported
|= NETDEV_F_10MB_HD
;
2096 if (ecmd
.supported
& SUPPORTED_10baseT_Full
) {
2097 netdev
->supported
|= NETDEV_F_10MB_FD
;
2099 if (ecmd
.supported
& SUPPORTED_100baseT_Half
) {
2100 netdev
->supported
|= NETDEV_F_100MB_HD
;
2102 if (ecmd
.supported
& SUPPORTED_100baseT_Full
) {
2103 netdev
->supported
|= NETDEV_F_100MB_FD
;
2105 if (ecmd
.supported
& SUPPORTED_1000baseT_Half
) {
2106 netdev
->supported
|= NETDEV_F_1GB_HD
;
2108 if ((ecmd
.supported
& SUPPORTED_1000baseT_Full
) ||
2109 (ecmd
.supported
& SUPPORTED_1000baseKX_Full
)) {
2110 netdev
->supported
|= NETDEV_F_1GB_FD
;
2112 if ((ecmd
.supported
& SUPPORTED_10000baseT_Full
) ||
2113 (ecmd
.supported
& SUPPORTED_10000baseKX4_Full
) ||
2114 (ecmd
.supported
& SUPPORTED_10000baseKR_Full
) ||
2115 (ecmd
.supported
& SUPPORTED_10000baseR_FEC
)) {
2116 netdev
->supported
|= NETDEV_F_10GB_FD
;
2118 if ((ecmd
.supported
& SUPPORTED_40000baseKR4_Full
) ||
2119 (ecmd
.supported
& SUPPORTED_40000baseCR4_Full
) ||
2120 (ecmd
.supported
& SUPPORTED_40000baseSR4_Full
) ||
2121 (ecmd
.supported
& SUPPORTED_40000baseLR4_Full
)) {
2122 netdev
->supported
|= NETDEV_F_40GB_FD
;
2124 if (ecmd
.supported
& SUPPORTED_TP
) {
2125 netdev
->supported
|= NETDEV_F_COPPER
;
2127 if (ecmd
.supported
& SUPPORTED_FIBRE
) {
2128 netdev
->supported
|= NETDEV_F_FIBER
;
2130 if (ecmd
.supported
& SUPPORTED_Autoneg
) {
2131 netdev
->supported
|= NETDEV_F_AUTONEG
;
2133 if (ecmd
.supported
& SUPPORTED_Pause
) {
2134 netdev
->supported
|= NETDEV_F_PAUSE
;
2136 if (ecmd
.supported
& SUPPORTED_Asym_Pause
) {
2137 netdev
->supported
|= NETDEV_F_PAUSE_ASYM
;
2140 /* Advertised features. */
2141 netdev
->advertised
= 0;
2142 if (ecmd
.advertising
& ADVERTISED_10baseT_Half
) {
2143 netdev
->advertised
|= NETDEV_F_10MB_HD
;
2145 if (ecmd
.advertising
& ADVERTISED_10baseT_Full
) {
2146 netdev
->advertised
|= NETDEV_F_10MB_FD
;
2148 if (ecmd
.advertising
& ADVERTISED_100baseT_Half
) {
2149 netdev
->advertised
|= NETDEV_F_100MB_HD
;
2151 if (ecmd
.advertising
& ADVERTISED_100baseT_Full
) {
2152 netdev
->advertised
|= NETDEV_F_100MB_FD
;
2154 if (ecmd
.advertising
& ADVERTISED_1000baseT_Half
) {
2155 netdev
->advertised
|= NETDEV_F_1GB_HD
;
2157 if ((ecmd
.advertising
& ADVERTISED_1000baseT_Full
) ||
2158 (ecmd
.advertising
& ADVERTISED_1000baseKX_Full
)) {
2159 netdev
->advertised
|= NETDEV_F_1GB_FD
;
2161 if ((ecmd
.advertising
& ADVERTISED_10000baseT_Full
) ||
2162 (ecmd
.advertising
& ADVERTISED_10000baseKX4_Full
) ||
2163 (ecmd
.advertising
& ADVERTISED_10000baseKR_Full
) ||
2164 (ecmd
.advertising
& ADVERTISED_10000baseR_FEC
)) {
2165 netdev
->advertised
|= NETDEV_F_10GB_FD
;
2167 if ((ecmd
.advertising
& ADVERTISED_40000baseKR4_Full
) ||
2168 (ecmd
.advertising
& ADVERTISED_40000baseCR4_Full
) ||
2169 (ecmd
.advertising
& ADVERTISED_40000baseSR4_Full
) ||
2170 (ecmd
.advertising
& ADVERTISED_40000baseLR4_Full
)) {
2171 netdev
->advertised
|= NETDEV_F_40GB_FD
;
2173 if (ecmd
.advertising
& ADVERTISED_TP
) {
2174 netdev
->advertised
|= NETDEV_F_COPPER
;
2176 if (ecmd
.advertising
& ADVERTISED_FIBRE
) {
2177 netdev
->advertised
|= NETDEV_F_FIBER
;
2179 if (ecmd
.advertising
& ADVERTISED_Autoneg
) {
2180 netdev
->advertised
|= NETDEV_F_AUTONEG
;
2182 if (ecmd
.advertising
& ADVERTISED_Pause
) {
2183 netdev
->advertised
|= NETDEV_F_PAUSE
;
2185 if (ecmd
.advertising
& ADVERTISED_Asym_Pause
) {
2186 netdev
->advertised
|= NETDEV_F_PAUSE_ASYM
;
2189 /* Current settings. */
2190 speed
= ethtool_cmd_speed(&ecmd
);
2191 if (speed
== SPEED_10
) {
2192 netdev
->current
= ecmd
.duplex
? NETDEV_F_10MB_FD
: NETDEV_F_10MB_HD
;
2193 } else if (speed
== SPEED_100
) {
2194 netdev
->current
= ecmd
.duplex
? NETDEV_F_100MB_FD
: NETDEV_F_100MB_HD
;
2195 } else if (speed
== SPEED_1000
) {
2196 netdev
->current
= ecmd
.duplex
? NETDEV_F_1GB_FD
: NETDEV_F_1GB_HD
;
2197 } else if (speed
== SPEED_10000
) {
2198 netdev
->current
= NETDEV_F_10GB_FD
;
2199 } else if (speed
== 40000) {
2200 netdev
->current
= NETDEV_F_40GB_FD
;
2201 } else if (speed
== 100000) {
2202 netdev
->current
= NETDEV_F_100GB_FD
;
2203 } else if (speed
== 1000000) {
2204 netdev
->current
= NETDEV_F_1TB_FD
;
2206 netdev
->current
= 0;
2209 if (ecmd
.port
== PORT_TP
) {
2210 netdev
->current
|= NETDEV_F_COPPER
;
2211 } else if (ecmd
.port
== PORT_FIBRE
) {
2212 netdev
->current
|= NETDEV_F_FIBER
;
2216 netdev
->current
|= NETDEV_F_AUTONEG
;
2220 netdev
->cache_valid
|= VALID_FEATURES
;
2221 netdev
->get_features_error
= error
;
2224 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
2225 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
2226 * Returns 0 if successful, otherwise a positive errno value. */
2228 netdev_linux_get_features(const struct netdev
*netdev_
,
2229 enum netdev_features
*current
,
2230 enum netdev_features
*advertised
,
2231 enum netdev_features
*supported
,
2232 enum netdev_features
*peer
)
2234 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2237 ovs_mutex_lock(&netdev
->mutex
);
2238 if (netdev_linux_netnsid_is_remote(netdev
)) {
2243 netdev_linux_read_features(netdev
);
2244 if (!netdev
->get_features_error
) {
2245 *current
= netdev
->current
;
2246 *advertised
= netdev
->advertised
;
2247 *supported
= netdev
->supported
;
2248 *peer
= 0; /* XXX */
2250 error
= netdev
->get_features_error
;
2253 ovs_mutex_unlock(&netdev
->mutex
);
2257 /* Set the features advertised by 'netdev' to 'advertise'. */
2259 netdev_linux_set_advertisements(struct netdev
*netdev_
,
2260 enum netdev_features advertise
)
2262 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2263 struct ethtool_cmd ecmd
;
2266 ovs_mutex_lock(&netdev
->mutex
);
2268 COVERAGE_INC(netdev_get_ethtool
);
2270 if (netdev_linux_netnsid_is_remote(netdev
)) {
2275 memset(&ecmd
, 0, sizeof ecmd
);
2276 error
= netdev_linux_do_ethtool(netdev_get_name(netdev_
), &ecmd
,
2277 ETHTOOL_GSET
, "ETHTOOL_GSET");
2282 ecmd
.advertising
= 0;
2283 if (advertise
& NETDEV_F_10MB_HD
) {
2284 ecmd
.advertising
|= ADVERTISED_10baseT_Half
;
2286 if (advertise
& NETDEV_F_10MB_FD
) {
2287 ecmd
.advertising
|= ADVERTISED_10baseT_Full
;
2289 if (advertise
& NETDEV_F_100MB_HD
) {
2290 ecmd
.advertising
|= ADVERTISED_100baseT_Half
;
2292 if (advertise
& NETDEV_F_100MB_FD
) {
2293 ecmd
.advertising
|= ADVERTISED_100baseT_Full
;
2295 if (advertise
& NETDEV_F_1GB_HD
) {
2296 ecmd
.advertising
|= ADVERTISED_1000baseT_Half
;
2298 if (advertise
& NETDEV_F_1GB_FD
) {
2299 ecmd
.advertising
|= ADVERTISED_1000baseT_Full
;
2301 if (advertise
& NETDEV_F_10GB_FD
) {
2302 ecmd
.advertising
|= ADVERTISED_10000baseT_Full
;
2304 if (advertise
& NETDEV_F_COPPER
) {
2305 ecmd
.advertising
|= ADVERTISED_TP
;
2307 if (advertise
& NETDEV_F_FIBER
) {
2308 ecmd
.advertising
|= ADVERTISED_FIBRE
;
2310 if (advertise
& NETDEV_F_AUTONEG
) {
2311 ecmd
.advertising
|= ADVERTISED_Autoneg
;
2313 if (advertise
& NETDEV_F_PAUSE
) {
2314 ecmd
.advertising
|= ADVERTISED_Pause
;
2316 if (advertise
& NETDEV_F_PAUSE_ASYM
) {
2317 ecmd
.advertising
|= ADVERTISED_Asym_Pause
;
2319 COVERAGE_INC(netdev_set_ethtool
);
2320 error
= netdev_linux_do_ethtool(netdev_get_name(netdev_
), &ecmd
,
2321 ETHTOOL_SSET
, "ETHTOOL_SSET");
2324 ovs_mutex_unlock(&netdev
->mutex
);
2328 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
2329 * successful, otherwise a positive errno value. */
2331 netdev_linux_set_policing(struct netdev
*netdev_
,
2332 uint32_t kbits_rate
, uint32_t kbits_burst
)
2334 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2335 const char *netdev_name
= netdev_get_name(netdev_
);
2339 if (netdev_is_flow_api_enabled()) {
2341 VLOG_WARN_RL(&rl
, "%s: policing with offload isn't supported",
2347 kbits_burst
= (!kbits_rate
? 0 /* Force to 0 if no rate specified. */
2348 : !kbits_burst
? 8000 /* Default to 8000 kbits if 0. */
2349 : kbits_burst
); /* Stick with user-specified value. */
2351 ovs_mutex_lock(&netdev
->mutex
);
2352 if (netdev_linux_netnsid_is_remote(netdev
)) {
2357 if (netdev
->cache_valid
& VALID_POLICING
) {
2358 error
= netdev
->netdev_policing_error
;
2359 if (error
|| (netdev
->kbits_rate
== kbits_rate
&&
2360 netdev
->kbits_burst
== kbits_burst
)) {
2361 /* Assume that settings haven't changed since we last set them. */
2364 netdev
->cache_valid
&= ~VALID_POLICING
;
2367 error
= get_ifindex(netdev_
, &ifindex
);
2372 COVERAGE_INC(netdev_set_policing
);
2373 /* Remove any existing ingress qdisc. */
2374 error
= tc_add_del_ingress_qdisc(ifindex
, false, 0);
2376 VLOG_WARN_RL(&rl
, "%s: removing policing failed: %s",
2377 netdev_name
, ovs_strerror(error
));
2382 error
= tc_add_del_ingress_qdisc(ifindex
, true, 0);
2384 VLOG_WARN_RL(&rl
, "%s: adding policing qdisc failed: %s",
2385 netdev_name
, ovs_strerror(error
));
2389 error
= tc_add_policer(netdev_
, kbits_rate
, kbits_burst
);
2391 VLOG_WARN_RL(&rl
, "%s: adding policing action failed: %s",
2392 netdev_name
, ovs_strerror(error
));
2397 netdev
->kbits_rate
= kbits_rate
;
2398 netdev
->kbits_burst
= kbits_burst
;
2401 if (!error
|| error
== ENODEV
) {
2402 netdev
->netdev_policing_error
= error
;
2403 netdev
->cache_valid
|= VALID_POLICING
;
2405 ovs_mutex_unlock(&netdev
->mutex
);
2410 netdev_linux_get_qos_types(const struct netdev
*netdev OVS_UNUSED
,
2413 const struct tc_ops
*const *opsp
;
2414 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2415 const struct tc_ops
*ops
= *opsp
;
2416 if (ops
->tc_install
&& ops
->ovs_name
[0] != '\0') {
2417 sset_add(types
, ops
->ovs_name
);
2423 static const struct tc_ops
*
2424 tc_lookup_ovs_name(const char *name
)
2426 const struct tc_ops
*const *opsp
;
2428 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2429 const struct tc_ops
*ops
= *opsp
;
2430 if (!strcmp(name
, ops
->ovs_name
)) {
2437 static const struct tc_ops
*
2438 tc_lookup_linux_name(const char *name
)
2440 const struct tc_ops
*const *opsp
;
2442 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2443 const struct tc_ops
*ops
= *opsp
;
2444 if (ops
->linux_name
&& !strcmp(name
, ops
->linux_name
)) {
2451 static struct tc_queue
*
2452 tc_find_queue__(const struct netdev
*netdev_
, unsigned int queue_id
,
2455 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2456 struct tc_queue
*queue
;
2458 HMAP_FOR_EACH_IN_BUCKET (queue
, hmap_node
, hash
, &netdev
->tc
->queues
) {
2459 if (queue
->queue_id
== queue_id
) {
2466 static struct tc_queue
*
2467 tc_find_queue(const struct netdev
*netdev
, unsigned int queue_id
)
2469 return tc_find_queue__(netdev
, queue_id
, hash_int(queue_id
, 0));
2473 netdev_linux_get_qos_capabilities(const struct netdev
*netdev OVS_UNUSED
,
2475 struct netdev_qos_capabilities
*caps
)
2477 const struct tc_ops
*ops
= tc_lookup_ovs_name(type
);
2481 caps
->n_queues
= ops
->n_queues
;
2486 netdev_linux_get_qos(const struct netdev
*netdev_
,
2487 const char **typep
, struct smap
*details
)
2489 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2492 ovs_mutex_lock(&netdev
->mutex
);
2493 if (netdev_linux_netnsid_is_remote(netdev
)) {
2498 error
= tc_query_qdisc(netdev_
);
2500 *typep
= netdev
->tc
->ops
->ovs_name
;
2501 error
= (netdev
->tc
->ops
->qdisc_get
2502 ? netdev
->tc
->ops
->qdisc_get(netdev_
, details
)
2507 ovs_mutex_unlock(&netdev
->mutex
);
2512 netdev_linux_set_qos(struct netdev
*netdev_
,
2513 const char *type
, const struct smap
*details
)
2515 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2516 const struct tc_ops
*new_ops
;
2519 new_ops
= tc_lookup_ovs_name(type
);
2520 if (!new_ops
|| !new_ops
->tc_install
) {
2524 if (new_ops
== &tc_ops_noop
) {
2525 return new_ops
->tc_install(netdev_
, details
);
2528 ovs_mutex_lock(&netdev
->mutex
);
2529 if (netdev_linux_netnsid_is_remote(netdev
)) {
2534 error
= tc_query_qdisc(netdev_
);
2539 if (new_ops
== netdev
->tc
->ops
) {
2540 error
= new_ops
->qdisc_set
? new_ops
->qdisc_set(netdev_
, details
) : 0;
2542 /* Delete existing qdisc. */
2543 error
= tc_del_qdisc(netdev_
);
2547 ovs_assert(netdev
->tc
== NULL
);
2549 /* Install new qdisc. */
2550 error
= new_ops
->tc_install(netdev_
, details
);
2551 ovs_assert((error
== 0) == (netdev
->tc
!= NULL
));
2555 ovs_mutex_unlock(&netdev
->mutex
);
2560 netdev_linux_get_queue(const struct netdev
*netdev_
,
2561 unsigned int queue_id
, struct smap
*details
)
2563 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2566 ovs_mutex_lock(&netdev
->mutex
);
2567 if (netdev_linux_netnsid_is_remote(netdev
)) {
2572 error
= tc_query_qdisc(netdev_
);
2574 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2576 ? netdev
->tc
->ops
->class_get(netdev_
, queue
, details
)
2581 ovs_mutex_unlock(&netdev
->mutex
);
2586 netdev_linux_set_queue(struct netdev
*netdev_
,
2587 unsigned int queue_id
, const struct smap
*details
)
2589 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2592 ovs_mutex_lock(&netdev
->mutex
);
2593 if (netdev_linux_netnsid_is_remote(netdev
)) {
2598 error
= tc_query_qdisc(netdev_
);
2600 error
= (queue_id
< netdev
->tc
->ops
->n_queues
2601 && netdev
->tc
->ops
->class_set
2602 ? netdev
->tc
->ops
->class_set(netdev_
, queue_id
, details
)
2607 ovs_mutex_unlock(&netdev
->mutex
);
2612 netdev_linux_delete_queue(struct netdev
*netdev_
, unsigned int queue_id
)
2614 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2617 ovs_mutex_lock(&netdev
->mutex
);
2618 if (netdev_linux_netnsid_is_remote(netdev
)) {
2623 error
= tc_query_qdisc(netdev_
);
2625 if (netdev
->tc
->ops
->class_delete
) {
2626 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2628 ? netdev
->tc
->ops
->class_delete(netdev_
, queue
)
2636 ovs_mutex_unlock(&netdev
->mutex
);
2641 netdev_linux_get_queue_stats(const struct netdev
*netdev_
,
2642 unsigned int queue_id
,
2643 struct netdev_queue_stats
*stats
)
2645 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2648 ovs_mutex_lock(&netdev
->mutex
);
2649 if (netdev_linux_netnsid_is_remote(netdev
)) {
2654 error
= tc_query_qdisc(netdev_
);
2656 if (netdev
->tc
->ops
->class_get_stats
) {
2657 const struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2659 stats
->created
= queue
->created
;
2660 error
= netdev
->tc
->ops
->class_get_stats(netdev_
, queue
,
2671 ovs_mutex_unlock(&netdev
->mutex
);
2675 struct queue_dump_state
{
2676 struct nl_dump dump
;
2681 start_queue_dump(const struct netdev
*netdev
, struct queue_dump_state
*state
)
2683 struct ofpbuf request
;
2684 struct tcmsg
*tcmsg
;
2686 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_GETTCLASS
, 0, &request
);
2690 tcmsg
->tcm_parent
= 0;
2691 nl_dump_start(&state
->dump
, NETLINK_ROUTE
, &request
);
2692 ofpbuf_uninit(&request
);
2694 ofpbuf_init(&state
->buf
, NL_DUMP_BUFSIZE
);
2699 finish_queue_dump(struct queue_dump_state
*state
)
2701 ofpbuf_uninit(&state
->buf
);
2702 return nl_dump_done(&state
->dump
);
2705 struct netdev_linux_queue_state
{
2706 unsigned int *queues
;
2712 netdev_linux_queue_dump_start(const struct netdev
*netdev_
, void **statep
)
2714 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2717 ovs_mutex_lock(&netdev
->mutex
);
2718 if (netdev_linux_netnsid_is_remote(netdev
)) {
2723 error
= tc_query_qdisc(netdev_
);
2725 if (netdev
->tc
->ops
->class_get
) {
2726 struct netdev_linux_queue_state
*state
;
2727 struct tc_queue
*queue
;
2730 *statep
= state
= xmalloc(sizeof *state
);
2731 state
->n_queues
= hmap_count(&netdev
->tc
->queues
);
2732 state
->cur_queue
= 0;
2733 state
->queues
= xmalloc(state
->n_queues
* sizeof *state
->queues
);
2736 HMAP_FOR_EACH (queue
, hmap_node
, &netdev
->tc
->queues
) {
2737 state
->queues
[i
++] = queue
->queue_id
;
2745 ovs_mutex_unlock(&netdev
->mutex
);
2750 netdev_linux_queue_dump_next(const struct netdev
*netdev_
, void *state_
,
2751 unsigned int *queue_idp
, struct smap
*details
)
2753 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2754 struct netdev_linux_queue_state
*state
= state_
;
2757 ovs_mutex_lock(&netdev
->mutex
);
2758 if (netdev_linux_netnsid_is_remote(netdev
)) {
2763 while (state
->cur_queue
< state
->n_queues
) {
2764 unsigned int queue_id
= state
->queues
[state
->cur_queue
++];
2765 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2768 *queue_idp
= queue_id
;
2769 error
= netdev
->tc
->ops
->class_get(netdev_
, queue
, details
);
2775 ovs_mutex_unlock(&netdev
->mutex
);
2780 netdev_linux_queue_dump_done(const struct netdev
*netdev OVS_UNUSED
,
2783 struct netdev_linux_queue_state
*state
= state_
;
2785 free(state
->queues
);
2791 netdev_linux_dump_queue_stats(const struct netdev
*netdev_
,
2792 netdev_dump_queue_stats_cb
*cb
, void *aux
)
2794 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2797 ovs_mutex_lock(&netdev
->mutex
);
2798 if (netdev_linux_netnsid_is_remote(netdev
)) {
2803 error
= tc_query_qdisc(netdev_
);
2805 struct queue_dump_state state
;
2807 if (!netdev
->tc
->ops
->class_dump_stats
) {
2809 } else if (!start_queue_dump(netdev_
, &state
)) {
2815 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
2816 retval
= netdev
->tc
->ops
->class_dump_stats(netdev_
, &msg
,
2823 retval
= finish_queue_dump(&state
);
2831 ovs_mutex_unlock(&netdev
->mutex
);
2836 netdev_linux_set_in4(struct netdev
*netdev_
, struct in_addr address
,
2837 struct in_addr netmask
)
2839 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2842 ovs_mutex_lock(&netdev
->mutex
);
2843 if (netdev_linux_netnsid_is_remote(netdev
)) {
2848 error
= do_set_addr(netdev_
, SIOCSIFADDR
, "SIOCSIFADDR", address
);
2850 if (address
.s_addr
!= INADDR_ANY
) {
2851 error
= do_set_addr(netdev_
, SIOCSIFNETMASK
,
2852 "SIOCSIFNETMASK", netmask
);
2857 ovs_mutex_unlock(&netdev
->mutex
);
2861 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address.
2862 * Otherwise, sets '*in6' to 'in6addr_any' and returns the corresponding
2865 netdev_linux_get_addr_list(const struct netdev
*netdev_
,
2866 struct in6_addr
**addr
, struct in6_addr
**mask
, int *n_cnt
)
2868 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2871 ovs_mutex_lock(&netdev
->mutex
);
2872 if (netdev_linux_netnsid_is_remote(netdev
)) {
2877 error
= netdev_get_addrs(netdev_get_name(netdev_
), addr
, mask
, n_cnt
);
2880 ovs_mutex_unlock(&netdev
->mutex
);
2885 make_in4_sockaddr(struct sockaddr
*sa
, struct in_addr addr
)
2887 struct sockaddr_in sin
;
2888 memset(&sin
, 0, sizeof sin
);
2889 sin
.sin_family
= AF_INET
;
2890 sin
.sin_addr
= addr
;
2893 memset(sa
, 0, sizeof *sa
);
2894 memcpy(sa
, &sin
, sizeof sin
);
2898 do_set_addr(struct netdev
*netdev
,
2899 int ioctl_nr
, const char *ioctl_name
, struct in_addr addr
)
2903 make_in4_sockaddr(&ifr
.ifr_addr
, addr
);
2904 return af_inet_ifreq_ioctl(netdev_get_name(netdev
), &ifr
, ioctl_nr
,
2908 /* Adds 'router' as a default IP gateway. */
2910 netdev_linux_add_router(struct netdev
*netdev OVS_UNUSED
, struct in_addr router
)
2912 struct in_addr any
= { INADDR_ANY
};
2916 memset(&rt
, 0, sizeof rt
);
2917 make_in4_sockaddr(&rt
.rt_dst
, any
);
2918 make_in4_sockaddr(&rt
.rt_gateway
, router
);
2919 make_in4_sockaddr(&rt
.rt_genmask
, any
);
2920 rt
.rt_flags
= RTF_UP
| RTF_GATEWAY
;
2921 error
= af_inet_ioctl(SIOCADDRT
, &rt
);
2923 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error
));
2929 netdev_linux_get_next_hop(const struct in_addr
*host
, struct in_addr
*next_hop
,
2932 static const char fn
[] = "/proc/net/route";
2937 *netdev_name
= NULL
;
2938 stream
= fopen(fn
, "r");
2939 if (stream
== NULL
) {
2940 VLOG_WARN_RL(&rl
, "%s: open failed: %s", fn
, ovs_strerror(errno
));
2945 while (fgets(line
, sizeof line
, stream
)) {
2948 ovs_be32 dest
, gateway
, mask
;
2949 int refcnt
, metric
, mtu
;
2950 unsigned int flags
, use
, window
, irtt
;
2953 "%16s %"SCNx32
" %"SCNx32
" %04X %d %u %d %"SCNx32
2955 iface
, &dest
, &gateway
, &flags
, &refcnt
,
2956 &use
, &metric
, &mask
, &mtu
, &window
, &irtt
)) {
2957 VLOG_WARN_RL(&rl
, "%s: could not parse line %d: %s",
2961 if (!(flags
& RTF_UP
)) {
2962 /* Skip routes that aren't up. */
2966 /* The output of 'dest', 'mask', and 'gateway' were given in
2967 * network byte order, so we don't need need any endian
2968 * conversions here. */
2969 if ((dest
& mask
) == (host
->s_addr
& mask
)) {
2971 /* The host is directly reachable. */
2972 next_hop
->s_addr
= 0;
2974 /* To reach the host, we must go through a gateway. */
2975 next_hop
->s_addr
= gateway
;
2977 *netdev_name
= xstrdup(iface
);
2989 netdev_linux_get_status(const struct netdev
*netdev_
, struct smap
*smap
)
2991 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2994 ovs_mutex_lock(&netdev
->mutex
);
2995 if (!(netdev
->cache_valid
& VALID_DRVINFO
)) {
2996 struct ethtool_cmd
*cmd
= (struct ethtool_cmd
*) &netdev
->drvinfo
;
2998 COVERAGE_INC(netdev_get_ethtool
);
2999 memset(&netdev
->drvinfo
, 0, sizeof netdev
->drvinfo
);
3000 error
= netdev_linux_do_ethtool(netdev
->up
.name
,
3003 "ETHTOOL_GDRVINFO");
3005 netdev
->cache_valid
|= VALID_DRVINFO
;
3010 smap_add(smap
, "driver_name", netdev
->drvinfo
.driver
);
3011 smap_add(smap
, "driver_version", netdev
->drvinfo
.version
);
3012 smap_add(smap
, "firmware_version", netdev
->drvinfo
.fw_version
);
3014 ovs_mutex_unlock(&netdev
->mutex
);
3020 netdev_internal_get_status(const struct netdev
*netdev OVS_UNUSED
,
3023 smap_add(smap
, "driver_name", "openvswitch");
3028 netdev_linux_get_block_id(struct netdev
*netdev_
)
3030 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3031 uint32_t block_id
= 0;
3033 ovs_mutex_lock(&netdev
->mutex
);
3034 /* Ensure the linux netdev has had its fields populated. */
3035 if (!(netdev
->cache_valid
& VALID_IFINDEX
)) {
3036 netdev_linux_update_via_netlink(netdev
);
3039 /* Only assigning block ids to linux netdevs that are LAG masters. */
3040 if (netdev
->is_lag_master
) {
3041 block_id
= netdev
->ifindex
;
3043 ovs_mutex_unlock(&netdev
->mutex
);
3048 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
3049 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
3050 * returns 0. Otherwise, it returns a positive errno value; in particular,
3051 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
3053 netdev_linux_arp_lookup(const struct netdev
*netdev
,
3054 ovs_be32 ip
, struct eth_addr
*mac
)
3057 struct sockaddr_in sin
;
3060 memset(&r
, 0, sizeof r
);
3061 memset(&sin
, 0, sizeof sin
);
3062 sin
.sin_family
= AF_INET
;
3063 sin
.sin_addr
.s_addr
= ip
;
3065 memcpy(&r
.arp_pa
, &sin
, sizeof sin
);
3066 r
.arp_ha
.sa_family
= ARPHRD_ETHER
;
3068 ovs_strzcpy(r
.arp_dev
, netdev_get_name(netdev
), sizeof r
.arp_dev
);
3069 COVERAGE_INC(netdev_arp_lookup
);
3070 retval
= af_inet_ioctl(SIOCGARP
, &r
);
3072 memcpy(mac
, r
.arp_ha
.sa_data
, ETH_ADDR_LEN
);
3073 } else if (retval
!= ENXIO
) {
3074 VLOG_WARN_RL(&rl
, "%s: could not look up ARP entry for "IP_FMT
": %s",
3075 netdev_get_name(netdev
), IP_ARGS(ip
),
3076 ovs_strerror(retval
));
3082 nd_to_iff_flags(enum netdev_flags nd
)
3085 if (nd
& NETDEV_UP
) {
3088 if (nd
& NETDEV_PROMISC
) {
3091 if (nd
& NETDEV_LOOPBACK
) {
3092 iff
|= IFF_LOOPBACK
;
3098 iff_to_nd_flags(int iff
)
3100 enum netdev_flags nd
= 0;
3104 if (iff
& IFF_PROMISC
) {
3105 nd
|= NETDEV_PROMISC
;
3107 if (iff
& IFF_LOOPBACK
) {
3108 nd
|= NETDEV_LOOPBACK
;
3114 update_flags(struct netdev_linux
*netdev
, enum netdev_flags off
,
3115 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
3116 OVS_REQUIRES(netdev
->mutex
)
3118 int old_flags
, new_flags
;
3121 old_flags
= netdev
->ifi_flags
;
3122 *old_flagsp
= iff_to_nd_flags(old_flags
);
3123 new_flags
= (old_flags
& ~nd_to_iff_flags(off
)) | nd_to_iff_flags(on
);
3124 if (new_flags
!= old_flags
) {
3125 error
= set_flags(netdev_get_name(&netdev
->up
), new_flags
);
3126 get_flags(&netdev
->up
, &netdev
->ifi_flags
);
3133 netdev_linux_update_flags(struct netdev
*netdev_
, enum netdev_flags off
,
3134 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
3136 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3139 ovs_mutex_lock(&netdev
->mutex
);
3141 /* Changing flags over netlink isn't support yet. */
3142 if (netdev_linux_netnsid_is_remote(netdev
)) {
3146 error
= update_flags(netdev
, off
, on
, old_flagsp
);
3148 /* Try reading flags over netlink, or fall back to ioctl. */
3149 if (!netdev_linux_update_via_netlink(netdev
)) {
3150 *old_flagsp
= iff_to_nd_flags(netdev
->ifi_flags
);
3152 error
= update_flags(netdev
, off
, on
, old_flagsp
);
3157 ovs_mutex_unlock(&netdev
->mutex
);
3161 #define NETDEV_LINUX_CLASS_COMMON \
3162 .run = netdev_linux_run, \
3163 .wait = netdev_linux_wait, \
3164 .alloc = netdev_linux_alloc, \
3165 .destruct = netdev_linux_destruct, \
3166 .dealloc = netdev_linux_dealloc, \
3167 .send = netdev_linux_send, \
3168 .send_wait = netdev_linux_send_wait, \
3169 .set_etheraddr = netdev_linux_set_etheraddr, \
3170 .get_etheraddr = netdev_linux_get_etheraddr, \
3171 .get_mtu = netdev_linux_get_mtu, \
3172 .set_mtu = netdev_linux_set_mtu, \
3173 .get_ifindex = netdev_linux_get_ifindex, \
3174 .get_carrier = netdev_linux_get_carrier, \
3175 .get_carrier_resets = netdev_linux_get_carrier_resets, \
3176 .set_miimon_interval = netdev_linux_set_miimon_interval, \
3177 .set_advertisements = netdev_linux_set_advertisements, \
3178 .set_policing = netdev_linux_set_policing, \
3179 .get_qos_types = netdev_linux_get_qos_types, \
3180 .get_qos_capabilities = netdev_linux_get_qos_capabilities, \
3181 .get_qos = netdev_linux_get_qos, \
3182 .set_qos = netdev_linux_set_qos, \
3183 .get_queue = netdev_linux_get_queue, \
3184 .set_queue = netdev_linux_set_queue, \
3185 .delete_queue = netdev_linux_delete_queue, \
3186 .get_queue_stats = netdev_linux_get_queue_stats, \
3187 .queue_dump_start = netdev_linux_queue_dump_start, \
3188 .queue_dump_next = netdev_linux_queue_dump_next, \
3189 .queue_dump_done = netdev_linux_queue_dump_done, \
3190 .dump_queue_stats = netdev_linux_dump_queue_stats, \
3191 .set_in4 = netdev_linux_set_in4, \
3192 .get_addr_list = netdev_linux_get_addr_list, \
3193 .add_router = netdev_linux_add_router, \
3194 .get_next_hop = netdev_linux_get_next_hop, \
3195 .arp_lookup = netdev_linux_arp_lookup, \
3196 .update_flags = netdev_linux_update_flags, \
3197 .rxq_alloc = netdev_linux_rxq_alloc, \
3198 .rxq_construct = netdev_linux_rxq_construct, \
3199 .rxq_destruct = netdev_linux_rxq_destruct, \
3200 .rxq_dealloc = netdev_linux_rxq_dealloc, \
3201 .rxq_recv = netdev_linux_rxq_recv, \
3202 .rxq_wait = netdev_linux_rxq_wait, \
3203 .rxq_drain = netdev_linux_rxq_drain
3205 const struct netdev_class netdev_linux_class
= {
3206 NETDEV_LINUX_CLASS_COMMON
,
3207 LINUX_FLOW_OFFLOAD_API
,
3209 .construct
= netdev_linux_construct
,
3210 .get_stats
= netdev_linux_get_stats
,
3211 .get_features
= netdev_linux_get_features
,
3212 .get_status
= netdev_linux_get_status
,
3213 .get_block_id
= netdev_linux_get_block_id
3216 const struct netdev_class netdev_tap_class
= {
3217 NETDEV_LINUX_CLASS_COMMON
,
3219 .construct
= netdev_linux_construct_tap
,
3220 .get_stats
= netdev_tap_get_stats
,
3221 .get_features
= netdev_linux_get_features
,
3222 .get_status
= netdev_linux_get_status
,
3225 const struct netdev_class netdev_internal_class
= {
3226 NETDEV_LINUX_CLASS_COMMON
,
3228 .construct
= netdev_linux_construct
,
3229 .get_stats
= netdev_internal_get_stats
,
3230 .get_status
= netdev_internal_get_status
,
3234 #define CODEL_N_QUEUES 0x0000
3236 /* In sufficiently new kernel headers these are defined as enums in
3237 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3238 * kernels. (This overrides any enum definition in the header file but that's
3240 #define TCA_CODEL_TARGET 1
3241 #define TCA_CODEL_LIMIT 2
3242 #define TCA_CODEL_INTERVAL 3
3251 static struct codel
*
3252 codel_get__(const struct netdev
*netdev_
)
3254 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3255 return CONTAINER_OF(netdev
->tc
, struct codel
, tc
);
3259 codel_install__(struct netdev
*netdev_
, uint32_t target
, uint32_t limit
,
3262 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3263 struct codel
*codel
;
3265 codel
= xmalloc(sizeof *codel
);
3266 tc_init(&codel
->tc
, &tc_ops_codel
);
3267 codel
->target
= target
;
3268 codel
->limit
= limit
;
3269 codel
->interval
= interval
;
3271 netdev
->tc
= &codel
->tc
;
3275 codel_setup_qdisc__(struct netdev
*netdev
, uint32_t target
, uint32_t limit
,
3279 struct ofpbuf request
;
3280 struct tcmsg
*tcmsg
;
3281 uint32_t otarget
, olimit
, ointerval
;
3284 tc_del_qdisc(netdev
);
3286 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
3287 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3291 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3292 tcmsg
->tcm_parent
= TC_H_ROOT
;
3294 otarget
= target
? target
: 5000;
3295 olimit
= limit
? limit
: 10240;
3296 ointerval
= interval
? interval
: 100000;
3298 nl_msg_put_string(&request
, TCA_KIND
, "codel");
3299 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
3300 nl_msg_put_u32(&request
, TCA_CODEL_TARGET
, otarget
);
3301 nl_msg_put_u32(&request
, TCA_CODEL_LIMIT
, olimit
);
3302 nl_msg_put_u32(&request
, TCA_CODEL_INTERVAL
, ointerval
);
3303 nl_msg_end_nested(&request
, opt_offset
);
3305 error
= tc_transact(&request
, NULL
);
3307 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
3308 "target %u, limit %u, interval %u error %d(%s)",
3309 netdev_get_name(netdev
),
3310 otarget
, olimit
, ointerval
,
3311 error
, ovs_strerror(error
));
3317 codel_parse_qdisc_details__(struct netdev
*netdev OVS_UNUSED
,
3318 const struct smap
*details
, struct codel
*codel
)
3320 codel
->target
= smap_get_ullong(details
, "target", 0);
3321 codel
->limit
= smap_get_ullong(details
, "limit", 0);
3322 codel
->interval
= smap_get_ullong(details
, "interval", 0);
3324 if (!codel
->target
) {
3325 codel
->target
= 5000;
3327 if (!codel
->limit
) {
3328 codel
->limit
= 10240;
3330 if (!codel
->interval
) {
3331 codel
->interval
= 100000;
3336 codel_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3341 codel_parse_qdisc_details__(netdev
, details
, &codel
);
3342 error
= codel_setup_qdisc__(netdev
, codel
.target
, codel
.limit
,
3345 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
3351 codel_parse_tca_options__(struct nlattr
*nl_options
, struct codel
*codel
)
3353 static const struct nl_policy tca_codel_policy
[] = {
3354 [TCA_CODEL_TARGET
] = { .type
= NL_A_U32
},
3355 [TCA_CODEL_LIMIT
] = { .type
= NL_A_U32
},
3356 [TCA_CODEL_INTERVAL
] = { .type
= NL_A_U32
}
3359 struct nlattr
*attrs
[ARRAY_SIZE(tca_codel_policy
)];
3361 if (!nl_parse_nested(nl_options
, tca_codel_policy
,
3362 attrs
, ARRAY_SIZE(tca_codel_policy
))) {
3363 VLOG_WARN_RL(&rl
, "failed to parse CoDel class options");
3367 codel
->target
= nl_attr_get_u32(attrs
[TCA_CODEL_TARGET
]);
3368 codel
->limit
= nl_attr_get_u32(attrs
[TCA_CODEL_LIMIT
]);
3369 codel
->interval
= nl_attr_get_u32(attrs
[TCA_CODEL_INTERVAL
]);
3374 codel_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
3376 struct nlattr
*nlattr
;
3381 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
3386 error
= codel_parse_tca_options__(nlattr
, &codel
);
3391 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
3397 codel_tc_destroy(struct tc
*tc
)
3399 struct codel
*codel
= CONTAINER_OF(tc
, struct codel
, tc
);
3405 codel_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
3407 const struct codel
*codel
= codel_get__(netdev
);
3408 smap_add_format(details
, "target", "%u", codel
->target
);
3409 smap_add_format(details
, "limit", "%u", codel
->limit
);
3410 smap_add_format(details
, "interval", "%u", codel
->interval
);
3415 codel_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
3419 codel_parse_qdisc_details__(netdev
, details
, &codel
);
3420 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
3421 codel_get__(netdev
)->target
= codel
.target
;
3422 codel_get__(netdev
)->limit
= codel
.limit
;
3423 codel_get__(netdev
)->interval
= codel
.interval
;
3427 static const struct tc_ops tc_ops_codel
= {
3428 .linux_name
= "codel",
3429 .ovs_name
= "linux-codel",
3430 .n_queues
= CODEL_N_QUEUES
,
3431 .tc_install
= codel_tc_install
,
3432 .tc_load
= codel_tc_load
,
3433 .tc_destroy
= codel_tc_destroy
,
3434 .qdisc_get
= codel_qdisc_get
,
3435 .qdisc_set
= codel_qdisc_set
,
3438 /* FQ-CoDel traffic control class. */
3440 #define FQCODEL_N_QUEUES 0x0000
3442 /* In sufficiently new kernel headers these are defined as enums in
3443 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3444 * kernels. (This overrides any enum definition in the header file but that's
3446 #define TCA_FQ_CODEL_TARGET 1
3447 #define TCA_FQ_CODEL_LIMIT 2
3448 #define TCA_FQ_CODEL_INTERVAL 3
3449 #define TCA_FQ_CODEL_ECN 4
3450 #define TCA_FQ_CODEL_FLOWS 5
3451 #define TCA_FQ_CODEL_QUANTUM 6
3462 static struct fqcodel
*
3463 fqcodel_get__(const struct netdev
*netdev_
)
3465 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3466 return CONTAINER_OF(netdev
->tc
, struct fqcodel
, tc
);
3470 fqcodel_install__(struct netdev
*netdev_
, uint32_t target
, uint32_t limit
,
3471 uint32_t interval
, uint32_t flows
, uint32_t quantum
)
3473 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3474 struct fqcodel
*fqcodel
;
3476 fqcodel
= xmalloc(sizeof *fqcodel
);
3477 tc_init(&fqcodel
->tc
, &tc_ops_fqcodel
);
3478 fqcodel
->target
= target
;
3479 fqcodel
->limit
= limit
;
3480 fqcodel
->interval
= interval
;
3481 fqcodel
->flows
= flows
;
3482 fqcodel
->quantum
= quantum
;
3484 netdev
->tc
= &fqcodel
->tc
;
3488 fqcodel_setup_qdisc__(struct netdev
*netdev
, uint32_t target
, uint32_t limit
,
3489 uint32_t interval
, uint32_t flows
, uint32_t quantum
)
3492 struct ofpbuf request
;
3493 struct tcmsg
*tcmsg
;
3494 uint32_t otarget
, olimit
, ointerval
, oflows
, oquantum
;
3497 tc_del_qdisc(netdev
);
3499 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
3500 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3504 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3505 tcmsg
->tcm_parent
= TC_H_ROOT
;
3507 otarget
= target
? target
: 5000;
3508 olimit
= limit
? limit
: 10240;
3509 ointerval
= interval
? interval
: 100000;
3510 oflows
= flows
? flows
: 1024;
3511 oquantum
= quantum
? quantum
: 1514; /* fq_codel default quantum is 1514
3514 nl_msg_put_string(&request
, TCA_KIND
, "fq_codel");
3515 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
3516 nl_msg_put_u32(&request
, TCA_FQ_CODEL_TARGET
, otarget
);
3517 nl_msg_put_u32(&request
, TCA_FQ_CODEL_LIMIT
, olimit
);
3518 nl_msg_put_u32(&request
, TCA_FQ_CODEL_INTERVAL
, ointerval
);
3519 nl_msg_put_u32(&request
, TCA_FQ_CODEL_FLOWS
, oflows
);
3520 nl_msg_put_u32(&request
, TCA_FQ_CODEL_QUANTUM
, oquantum
);
3521 nl_msg_end_nested(&request
, opt_offset
);
3523 error
= tc_transact(&request
, NULL
);
3525 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
3526 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3527 netdev_get_name(netdev
),
3528 otarget
, olimit
, ointerval
, oflows
, oquantum
,
3529 error
, ovs_strerror(error
));
3535 fqcodel_parse_qdisc_details__(struct netdev
*netdev OVS_UNUSED
,
3536 const struct smap
*details
, struct fqcodel
*fqcodel
)
3538 fqcodel
->target
= smap_get_ullong(details
, "target", 0);
3539 fqcodel
->limit
= smap_get_ullong(details
, "limit", 0);
3540 fqcodel
->interval
= smap_get_ullong(details
, "interval", 0);
3541 fqcodel
->flows
= smap_get_ullong(details
, "flows", 0);
3542 fqcodel
->quantum
= smap_get_ullong(details
, "quantum", 0);
3544 if (!fqcodel
->target
) {
3545 fqcodel
->target
= 5000;
3547 if (!fqcodel
->limit
) {
3548 fqcodel
->limit
= 10240;
3550 if (!fqcodel
->interval
) {
3551 fqcodel
->interval
= 1000000;
3553 if (!fqcodel
->flows
) {
3554 fqcodel
->flows
= 1024;
3556 if (!fqcodel
->quantum
) {
3557 fqcodel
->quantum
= 1514;
3562 fqcodel_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3565 struct fqcodel fqcodel
;
3567 fqcodel_parse_qdisc_details__(netdev
, details
, &fqcodel
);
3568 error
= fqcodel_setup_qdisc__(netdev
, fqcodel
.target
, fqcodel
.limit
,
3569 fqcodel
.interval
, fqcodel
.flows
,
3572 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
,
3573 fqcodel
.interval
, fqcodel
.flows
, fqcodel
.quantum
);
3579 fqcodel_parse_tca_options__(struct nlattr
*nl_options
, struct fqcodel
*fqcodel
)
3581 static const struct nl_policy tca_fqcodel_policy
[] = {
3582 [TCA_FQ_CODEL_TARGET
] = { .type
= NL_A_U32
},
3583 [TCA_FQ_CODEL_LIMIT
] = { .type
= NL_A_U32
},
3584 [TCA_FQ_CODEL_INTERVAL
] = { .type
= NL_A_U32
},
3585 [TCA_FQ_CODEL_FLOWS
] = { .type
= NL_A_U32
},
3586 [TCA_FQ_CODEL_QUANTUM
] = { .type
= NL_A_U32
}
3589 struct nlattr
*attrs
[ARRAY_SIZE(tca_fqcodel_policy
)];
3591 if (!nl_parse_nested(nl_options
, tca_fqcodel_policy
,
3592 attrs
, ARRAY_SIZE(tca_fqcodel_policy
))) {
3593 VLOG_WARN_RL(&rl
, "failed to parse FQ_CoDel class options");
3597 fqcodel
->target
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_TARGET
]);
3598 fqcodel
->limit
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_LIMIT
]);
3599 fqcodel
->interval
=nl_attr_get_u32(attrs
[TCA_FQ_CODEL_INTERVAL
]);
3600 fqcodel
->flows
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_FLOWS
]);
3601 fqcodel
->quantum
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_QUANTUM
]);
3606 fqcodel_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
3608 struct nlattr
*nlattr
;
3611 struct fqcodel fqcodel
;
3613 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
3618 error
= fqcodel_parse_tca_options__(nlattr
, &fqcodel
);
3623 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
, fqcodel
.interval
,
3624 fqcodel
.flows
, fqcodel
.quantum
);
3629 fqcodel_tc_destroy(struct tc
*tc
)
3631 struct fqcodel
*fqcodel
= CONTAINER_OF(tc
, struct fqcodel
, tc
);
3637 fqcodel_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
3639 const struct fqcodel
*fqcodel
= fqcodel_get__(netdev
);
3640 smap_add_format(details
, "target", "%u", fqcodel
->target
);
3641 smap_add_format(details
, "limit", "%u", fqcodel
->limit
);
3642 smap_add_format(details
, "interval", "%u", fqcodel
->interval
);
3643 smap_add_format(details
, "flows", "%u", fqcodel
->flows
);
3644 smap_add_format(details
, "quantum", "%u", fqcodel
->quantum
);
3649 fqcodel_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
3651 struct fqcodel fqcodel
;
3653 fqcodel_parse_qdisc_details__(netdev
, details
, &fqcodel
);
3654 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
, fqcodel
.interval
,
3655 fqcodel
.flows
, fqcodel
.quantum
);
3656 fqcodel_get__(netdev
)->target
= fqcodel
.target
;
3657 fqcodel_get__(netdev
)->limit
= fqcodel
.limit
;
3658 fqcodel_get__(netdev
)->interval
= fqcodel
.interval
;
3659 fqcodel_get__(netdev
)->flows
= fqcodel
.flows
;
3660 fqcodel_get__(netdev
)->quantum
= fqcodel
.quantum
;
3664 static const struct tc_ops tc_ops_fqcodel
= {
3665 .linux_name
= "fq_codel",
3666 .ovs_name
= "linux-fq_codel",
3667 .n_queues
= FQCODEL_N_QUEUES
,
3668 .tc_install
= fqcodel_tc_install
,
3669 .tc_load
= fqcodel_tc_load
,
3670 .tc_destroy
= fqcodel_tc_destroy
,
3671 .qdisc_get
= fqcodel_qdisc_get
,
3672 .qdisc_set
= fqcodel_qdisc_set
,
3675 /* SFQ traffic control class. */
3677 #define SFQ_N_QUEUES 0x0000
3686 sfq_get__(const struct netdev
*netdev_
)
3688 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3689 return CONTAINER_OF(netdev
->tc
, struct sfq
, tc
);
3693 sfq_install__(struct netdev
*netdev_
, uint32_t quantum
, uint32_t perturb
)
3695 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3698 sfq
= xmalloc(sizeof *sfq
);
3699 tc_init(&sfq
->tc
, &tc_ops_sfq
);
3700 sfq
->perturb
= perturb
;
3701 sfq
->quantum
= quantum
;
3703 netdev
->tc
= &sfq
->tc
;
3707 sfq_setup_qdisc__(struct netdev
*netdev
, uint32_t quantum
, uint32_t perturb
)
3709 struct tc_sfq_qopt opt
;
3710 struct ofpbuf request
;
3711 struct tcmsg
*tcmsg
;
3713 int mtu_error
, error
;
3714 mtu_error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
3716 tc_del_qdisc(netdev
);
3718 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
3719 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3723 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3724 tcmsg
->tcm_parent
= TC_H_ROOT
;
3726 memset(&opt
, 0, sizeof opt
);
3729 opt
.quantum
= mtu
; /* if we cannot find mtu, use default */
3732 opt
.quantum
= quantum
;
3736 opt
.perturb_period
= 10;
3738 opt
.perturb_period
= perturb
;
3741 nl_msg_put_string(&request
, TCA_KIND
, "sfq");
3742 nl_msg_put_unspec(&request
, TCA_OPTIONS
, &opt
, sizeof opt
);
3744 error
= tc_transact(&request
, NULL
);
3746 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
3747 "quantum %u, perturb %u error %d(%s)",
3748 netdev_get_name(netdev
),
3749 opt
.quantum
, opt
.perturb_period
,
3750 error
, ovs_strerror(error
));
3756 sfq_parse_qdisc_details__(struct netdev
*netdev
,
3757 const struct smap
*details
, struct sfq
*sfq
)
3759 sfq
->perturb
= smap_get_ullong(details
, "perturb", 0);
3760 sfq
->quantum
= smap_get_ullong(details
, "quantum", 0);
3762 if (!sfq
->perturb
) {
3766 if (!sfq
->quantum
) {
3768 if (!netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
)) {
3771 VLOG_WARN_RL(&rl
, "when using SFQ, you must specify quantum on a "
3772 "device without mtu");
3778 sfq_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3783 sfq_parse_qdisc_details__(netdev
, details
, &sfq
);
3784 error
= sfq_setup_qdisc__(netdev
, sfq
.quantum
, sfq
.perturb
);
3786 sfq_install__(netdev
, sfq
.quantum
, sfq
.perturb
);
3792 sfq_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
3794 const struct tc_sfq_qopt
*sfq
;
3795 struct nlattr
*nlattr
;
3799 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
3801 sfq
= nl_attr_get(nlattr
);
3802 sfq_install__(netdev
, sfq
->perturb_period
, sfq
->quantum
);
3810 sfq_tc_destroy(struct tc
*tc
)
3812 struct sfq
*sfq
= CONTAINER_OF(tc
, struct sfq
, tc
);
3818 sfq_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
3820 const struct sfq
*sfq
= sfq_get__(netdev
);
3821 smap_add_format(details
, "quantum", "%u", sfq
->quantum
);
3822 smap_add_format(details
, "perturb", "%u", sfq
->perturb
);
3827 sfq_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
3831 sfq_parse_qdisc_details__(netdev
, details
, &sfq
);
3832 sfq_install__(netdev
, sfq
.quantum
, sfq
.perturb
);
3833 sfq_get__(netdev
)->quantum
= sfq
.quantum
;
3834 sfq_get__(netdev
)->perturb
= sfq
.perturb
;
3838 static const struct tc_ops tc_ops_sfq
= {
3839 .linux_name
= "sfq",
3840 .ovs_name
= "linux-sfq",
3841 .n_queues
= SFQ_N_QUEUES
,
3842 .tc_install
= sfq_tc_install
,
3843 .tc_load
= sfq_tc_load
,
3844 .tc_destroy
= sfq_tc_destroy
,
3845 .qdisc_get
= sfq_qdisc_get
,
3846 .qdisc_set
= sfq_qdisc_set
,
3849 /* HTB traffic control class. */
3851 #define HTB_N_QUEUES 0xf000
3852 #define HTB_RATE2QUANTUM 10
3856 unsigned int max_rate
; /* In bytes/s. */
3860 struct tc_queue tc_queue
;
3861 unsigned int min_rate
; /* In bytes/s. */
3862 unsigned int max_rate
; /* In bytes/s. */
3863 unsigned int burst
; /* In bytes. */
3864 unsigned int priority
; /* Lower values are higher priorities. */
3868 htb_get__(const struct netdev
*netdev_
)
3870 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3871 return CONTAINER_OF(netdev
->tc
, struct htb
, tc
);
3875 htb_install__(struct netdev
*netdev_
, uint64_t max_rate
)
3877 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3880 htb
= xmalloc(sizeof *htb
);
3881 tc_init(&htb
->tc
, &tc_ops_htb
);
3882 htb
->max_rate
= max_rate
;
3884 netdev
->tc
= &htb
->tc
;
3887 /* Create an HTB qdisc.
3889 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
3891 htb_setup_qdisc__(struct netdev
*netdev
)
3894 struct tc_htb_glob opt
;
3895 struct ofpbuf request
;
3896 struct tcmsg
*tcmsg
;
3898 tc_del_qdisc(netdev
);
3900 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
3901 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3905 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3906 tcmsg
->tcm_parent
= TC_H_ROOT
;
3908 nl_msg_put_string(&request
, TCA_KIND
, "htb");
3910 memset(&opt
, 0, sizeof opt
);
3911 opt
.rate2quantum
= HTB_RATE2QUANTUM
;
3915 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
3916 nl_msg_put_unspec(&request
, TCA_HTB_INIT
, &opt
, sizeof opt
);
3917 nl_msg_end_nested(&request
, opt_offset
);
3919 return tc_transact(&request
, NULL
);
3922 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
3923 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
3925 htb_setup_class__(struct netdev
*netdev
, unsigned int handle
,
3926 unsigned int parent
, struct htb_class
*class)
3929 struct tc_htb_opt opt
;
3930 struct ofpbuf request
;
3931 struct tcmsg
*tcmsg
;
3935 error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
3937 VLOG_WARN_RL(&rl
, "cannot set up HTB on device %s that lacks MTU",
3938 netdev_get_name(netdev
));
3942 memset(&opt
, 0, sizeof opt
);
3943 tc_fill_rate(&opt
.rate
, class->min_rate
, mtu
);
3944 tc_fill_rate(&opt
.ceil
, class->max_rate
, mtu
);
3945 /* Makes sure the quantum is at least MTU. Setting quantum will
3946 * make htb ignore the r2q for this class. */
3947 if ((class->min_rate
/ HTB_RATE2QUANTUM
) < mtu
) {
3950 opt
.buffer
= tc_calc_buffer(opt
.rate
.rate
, mtu
, class->burst
);
3951 opt
.cbuffer
= tc_calc_buffer(opt
.ceil
.rate
, mtu
, class->burst
);
3952 opt
.prio
= class->priority
;
3954 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWTCLASS
, NLM_F_CREATE
,
3959 tcmsg
->tcm_handle
= handle
;
3960 tcmsg
->tcm_parent
= parent
;
3962 nl_msg_put_string(&request
, TCA_KIND
, "htb");
3963 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
3964 nl_msg_put_unspec(&request
, TCA_HTB_PARMS
, &opt
, sizeof opt
);
3965 tc_put_rtab(&request
, TCA_HTB_RTAB
, &opt
.rate
);
3966 tc_put_rtab(&request
, TCA_HTB_CTAB
, &opt
.ceil
);
3967 nl_msg_end_nested(&request
, opt_offset
);
3969 error
= tc_transact(&request
, NULL
);
3971 VLOG_WARN_RL(&rl
, "failed to replace %s class %u:%u, parent %u:%u, "
3972 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
3973 netdev_get_name(netdev
),
3974 tc_get_major(handle
), tc_get_minor(handle
),
3975 tc_get_major(parent
), tc_get_minor(parent
),
3976 class->min_rate
, class->max_rate
,
3977 class->burst
, class->priority
, ovs_strerror(error
));
3982 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
3983 * description of them into 'details'. The description complies with the
3984 * specification given in the vswitch database documentation for linux-htb
3987 htb_parse_tca_options__(struct nlattr
*nl_options
, struct htb_class
*class)
3989 static const struct nl_policy tca_htb_policy
[] = {
3990 [TCA_HTB_PARMS
] = { .type
= NL_A_UNSPEC
, .optional
= false,
3991 .min_len
= sizeof(struct tc_htb_opt
) },
3994 struct nlattr
*attrs
[ARRAY_SIZE(tca_htb_policy
)];
3995 const struct tc_htb_opt
*htb
;
3997 if (!nl_parse_nested(nl_options
, tca_htb_policy
,
3998 attrs
, ARRAY_SIZE(tca_htb_policy
))) {
3999 VLOG_WARN_RL(&rl
, "failed to parse HTB class options");
4003 htb
= nl_attr_get(attrs
[TCA_HTB_PARMS
]);
4004 class->min_rate
= htb
->rate
.rate
;
4005 class->max_rate
= htb
->ceil
.rate
;
4006 class->burst
= tc_ticks_to_bytes(htb
->rate
.rate
, htb
->buffer
);
4007 class->priority
= htb
->prio
;
4012 htb_parse_tcmsg__(struct ofpbuf
*tcmsg
, unsigned int *queue_id
,
4013 struct htb_class
*options
,
4014 struct netdev_queue_stats
*stats
)
4016 struct nlattr
*nl_options
;
4017 unsigned int handle
;
4020 error
= tc_parse_class(tcmsg
, &handle
, &nl_options
, stats
);
4021 if (!error
&& queue_id
) {
4022 unsigned int major
= tc_get_major(handle
);
4023 unsigned int minor
= tc_get_minor(handle
);
4024 if (major
== 1 && minor
> 0 && minor
<= HTB_N_QUEUES
) {
4025 *queue_id
= minor
- 1;
4030 if (!error
&& options
) {
4031 error
= htb_parse_tca_options__(nl_options
, options
);
4037 htb_parse_qdisc_details__(struct netdev
*netdev_
,
4038 const struct smap
*details
, struct htb_class
*hc
)
4040 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4042 hc
->max_rate
= smap_get_ullong(details
, "max-rate", 0) / 8;
4043 if (!hc
->max_rate
) {
4044 enum netdev_features current
;
4046 netdev_linux_read_features(netdev
);
4047 current
= !netdev
->get_features_error
? netdev
->current
: 0;
4048 hc
->max_rate
= netdev_features_to_bps(current
, 100 * 1000 * 1000) / 8;
4050 hc
->min_rate
= hc
->max_rate
;
4056 htb_parse_class_details__(struct netdev
*netdev
,
4057 const struct smap
*details
, struct htb_class
*hc
)
4059 const struct htb
*htb
= htb_get__(netdev
);
4061 unsigned long long int max_rate_bit
;
4063 error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
4065 VLOG_WARN_RL(&rl
, "cannot parse HTB class on device %s that lacks MTU",
4066 netdev_get_name(netdev
));
4070 /* HTB requires at least an mtu sized min-rate to send any traffic even
4071 * on uncongested links. */
4072 hc
->min_rate
= smap_get_ullong(details
, "min-rate", 0) / 8;
4073 hc
->min_rate
= MAX(hc
->min_rate
, mtu
);
4074 hc
->min_rate
= MIN(hc
->min_rate
, htb
->max_rate
);
4077 max_rate_bit
= smap_get_ullong(details
, "max-rate", 0);
4078 hc
->max_rate
= max_rate_bit
? max_rate_bit
/ 8 : htb
->max_rate
;
4079 hc
->max_rate
= MAX(hc
->max_rate
, hc
->min_rate
);
4080 hc
->max_rate
= MIN(hc
->max_rate
, htb
->max_rate
);
4084 * According to hints in the documentation that I've read, it is important
4085 * that 'burst' be at least as big as the largest frame that might be
4086 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
4087 * but having it a bit too small is a problem. Since netdev_get_mtu()
4088 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
4089 * the MTU. We actually add 64, instead of 14, as a guard against
4090 * additional headers get tacked on somewhere that we're not aware of. */
4091 hc
->burst
= smap_get_ullong(details
, "burst", 0) / 8;
4092 hc
->burst
= MAX(hc
->burst
, mtu
+ 64);
4095 hc
->priority
= smap_get_ullong(details
, "priority", 0);
4101 htb_query_class__(const struct netdev
*netdev
, unsigned int handle
,
4102 unsigned int parent
, struct htb_class
*options
,
4103 struct netdev_queue_stats
*stats
)
4105 struct ofpbuf
*reply
;
4108 error
= tc_query_class(netdev
, handle
, parent
, &reply
);
4110 error
= htb_parse_tcmsg__(reply
, NULL
, options
, stats
);
4111 ofpbuf_delete(reply
);
4117 htb_tc_install(struct netdev
*netdev
, const struct smap
*details
)
4121 error
= htb_setup_qdisc__(netdev
);
4123 struct htb_class hc
;
4125 htb_parse_qdisc_details__(netdev
, details
, &hc
);
4126 error
= htb_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
4127 tc_make_handle(1, 0), &hc
);
4129 htb_install__(netdev
, hc
.max_rate
);
4135 static struct htb_class
*
4136 htb_class_cast__(const struct tc_queue
*queue
)
4138 return CONTAINER_OF(queue
, struct htb_class
, tc_queue
);
4142 htb_update_queue__(struct netdev
*netdev
, unsigned int queue_id
,
4143 const struct htb_class
*hc
)
4145 struct htb
*htb
= htb_get__(netdev
);
4146 size_t hash
= hash_int(queue_id
, 0);
4147 struct tc_queue
*queue
;
4148 struct htb_class
*hcp
;
4150 queue
= tc_find_queue__(netdev
, queue_id
, hash
);
4152 hcp
= htb_class_cast__(queue
);
4154 hcp
= xmalloc(sizeof *hcp
);
4155 queue
= &hcp
->tc_queue
;
4156 queue
->queue_id
= queue_id
;
4157 queue
->created
= time_msec();
4158 hmap_insert(&htb
->tc
.queues
, &queue
->hmap_node
, hash
);
4161 hcp
->min_rate
= hc
->min_rate
;
4162 hcp
->max_rate
= hc
->max_rate
;
4163 hcp
->burst
= hc
->burst
;
4164 hcp
->priority
= hc
->priority
;
4168 htb_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4171 struct queue_dump_state state
;
4172 struct htb_class hc
;
4174 /* Get qdisc options. */
4176 htb_query_class__(netdev
, tc_make_handle(1, 0xfffe), 0, &hc
, NULL
);
4177 htb_install__(netdev
, hc
.max_rate
);
4180 if (!start_queue_dump(netdev
, &state
)) {
4183 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
4184 unsigned int queue_id
;
4186 if (!htb_parse_tcmsg__(&msg
, &queue_id
, &hc
, NULL
)) {
4187 htb_update_queue__(netdev
, queue_id
, &hc
);
4190 finish_queue_dump(&state
);
4196 htb_tc_destroy(struct tc
*tc
)
4198 struct htb
*htb
= CONTAINER_OF(tc
, struct htb
, tc
);
4199 struct htb_class
*hc
;
4201 HMAP_FOR_EACH_POP (hc
, tc_queue
.hmap_node
, &htb
->tc
.queues
) {
4209 htb_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
4211 const struct htb
*htb
= htb_get__(netdev
);
4212 smap_add_format(details
, "max-rate", "%llu", 8ULL * htb
->max_rate
);
4217 htb_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
4219 struct htb_class hc
;
4222 htb_parse_qdisc_details__(netdev
, details
, &hc
);
4223 error
= htb_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
4224 tc_make_handle(1, 0), &hc
);
4226 htb_get__(netdev
)->max_rate
= hc
.max_rate
;
4232 htb_class_get(const struct netdev
*netdev OVS_UNUSED
,
4233 const struct tc_queue
*queue
, struct smap
*details
)
4235 const struct htb_class
*hc
= htb_class_cast__(queue
);
4237 smap_add_format(details
, "min-rate", "%llu", 8ULL * hc
->min_rate
);
4238 if (hc
->min_rate
!= hc
->max_rate
) {
4239 smap_add_format(details
, "max-rate", "%llu", 8ULL * hc
->max_rate
);
4241 smap_add_format(details
, "burst", "%llu", 8ULL * hc
->burst
);
4243 smap_add_format(details
, "priority", "%u", hc
->priority
);
4249 htb_class_set(struct netdev
*netdev
, unsigned int queue_id
,
4250 const struct smap
*details
)
4252 struct htb_class hc
;
4255 error
= htb_parse_class_details__(netdev
, details
, &hc
);
4260 error
= htb_setup_class__(netdev
, tc_make_handle(1, queue_id
+ 1),
4261 tc_make_handle(1, 0xfffe), &hc
);
4266 htb_update_queue__(netdev
, queue_id
, &hc
);
4271 htb_class_delete(struct netdev
*netdev
, struct tc_queue
*queue
)
4273 struct htb_class
*hc
= htb_class_cast__(queue
);
4274 struct htb
*htb
= htb_get__(netdev
);
4277 error
= tc_delete_class(netdev
, tc_make_handle(1, queue
->queue_id
+ 1));
4279 hmap_remove(&htb
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
4286 htb_class_get_stats(const struct netdev
*netdev
, const struct tc_queue
*queue
,
4287 struct netdev_queue_stats
*stats
)
4289 return htb_query_class__(netdev
, tc_make_handle(1, queue
->queue_id
+ 1),
4290 tc_make_handle(1, 0xfffe), NULL
, stats
);
4294 htb_class_dump_stats(const struct netdev
*netdev OVS_UNUSED
,
4295 const struct ofpbuf
*nlmsg
,
4296 netdev_dump_queue_stats_cb
*cb
, void *aux
)
4298 struct netdev_queue_stats stats
;
4299 unsigned int handle
, major
, minor
;
4302 error
= tc_parse_class(nlmsg
, &handle
, NULL
, &stats
);
4307 major
= tc_get_major(handle
);
4308 minor
= tc_get_minor(handle
);
4309 if (major
== 1 && minor
> 0 && minor
<= HTB_N_QUEUES
) {
4310 (*cb
)(minor
- 1, &stats
, aux
);
4315 static const struct tc_ops tc_ops_htb
= {
4316 .linux_name
= "htb",
4317 .ovs_name
= "linux-htb",
4318 .n_queues
= HTB_N_QUEUES
,
4319 .tc_install
= htb_tc_install
,
4320 .tc_load
= htb_tc_load
,
4321 .tc_destroy
= htb_tc_destroy
,
4322 .qdisc_get
= htb_qdisc_get
,
4323 .qdisc_set
= htb_qdisc_set
,
4324 .class_get
= htb_class_get
,
4325 .class_set
= htb_class_set
,
4326 .class_delete
= htb_class_delete
,
4327 .class_get_stats
= htb_class_get_stats
,
4328 .class_dump_stats
= htb_class_dump_stats
4331 /* "linux-hfsc" traffic control class. */
4333 #define HFSC_N_QUEUES 0xf000
4341 struct tc_queue tc_queue
;
4346 static struct hfsc
*
4347 hfsc_get__(const struct netdev
*netdev_
)
4349 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4350 return CONTAINER_OF(netdev
->tc
, struct hfsc
, tc
);
4353 static struct hfsc_class
*
4354 hfsc_class_cast__(const struct tc_queue
*queue
)
4356 return CONTAINER_OF(queue
, struct hfsc_class
, tc_queue
);
4360 hfsc_install__(struct netdev
*netdev_
, uint32_t max_rate
)
4362 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4365 hfsc
= xmalloc(sizeof *hfsc
);
4366 tc_init(&hfsc
->tc
, &tc_ops_hfsc
);
4367 hfsc
->max_rate
= max_rate
;
4368 netdev
->tc
= &hfsc
->tc
;
4372 hfsc_update_queue__(struct netdev
*netdev
, unsigned int queue_id
,
4373 const struct hfsc_class
*hc
)
4377 struct hfsc_class
*hcp
;
4378 struct tc_queue
*queue
;
4380 hfsc
= hfsc_get__(netdev
);
4381 hash
= hash_int(queue_id
, 0);
4383 queue
= tc_find_queue__(netdev
, queue_id
, hash
);
4385 hcp
= hfsc_class_cast__(queue
);
4387 hcp
= xmalloc(sizeof *hcp
);
4388 queue
= &hcp
->tc_queue
;
4389 queue
->queue_id
= queue_id
;
4390 queue
->created
= time_msec();
4391 hmap_insert(&hfsc
->tc
.queues
, &queue
->hmap_node
, hash
);
4394 hcp
->min_rate
= hc
->min_rate
;
4395 hcp
->max_rate
= hc
->max_rate
;
4399 hfsc_parse_tca_options__(struct nlattr
*nl_options
, struct hfsc_class
*class)
4401 const struct tc_service_curve
*rsc
, *fsc
, *usc
;
4402 static const struct nl_policy tca_hfsc_policy
[] = {
4404 .type
= NL_A_UNSPEC
,
4406 .min_len
= sizeof(struct tc_service_curve
),
4409 .type
= NL_A_UNSPEC
,
4411 .min_len
= sizeof(struct tc_service_curve
),
4414 .type
= NL_A_UNSPEC
,
4416 .min_len
= sizeof(struct tc_service_curve
),
4419 struct nlattr
*attrs
[ARRAY_SIZE(tca_hfsc_policy
)];
4421 if (!nl_parse_nested(nl_options
, tca_hfsc_policy
,
4422 attrs
, ARRAY_SIZE(tca_hfsc_policy
))) {
4423 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options");
4427 rsc
= nl_attr_get(attrs
[TCA_HFSC_RSC
]);
4428 fsc
= nl_attr_get(attrs
[TCA_HFSC_FSC
]);
4429 usc
= nl_attr_get(attrs
[TCA_HFSC_USC
]);
4431 if (rsc
->m1
!= 0 || rsc
->d
!= 0 ||
4432 fsc
->m1
!= 0 || fsc
->d
!= 0 ||
4433 usc
->m1
!= 0 || usc
->d
!= 0) {
4434 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
4435 "Non-linear service curves are not supported.");
4439 if (rsc
->m2
!= fsc
->m2
) {
4440 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
4441 "Real-time service curves are not supported ");
4445 if (rsc
->m2
> usc
->m2
) {
4446 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
4447 "Min-rate service curve is greater than "
4448 "the max-rate service curve.");
4452 class->min_rate
= fsc
->m2
;
4453 class->max_rate
= usc
->m2
;
4458 hfsc_parse_tcmsg__(struct ofpbuf
*tcmsg
, unsigned int *queue_id
,
4459 struct hfsc_class
*options
,
4460 struct netdev_queue_stats
*stats
)
4463 unsigned int handle
;
4464 struct nlattr
*nl_options
;
4466 error
= tc_parse_class(tcmsg
, &handle
, &nl_options
, stats
);
4472 unsigned int major
, minor
;
4474 major
= tc_get_major(handle
);
4475 minor
= tc_get_minor(handle
);
4476 if (major
== 1 && minor
> 0 && minor
<= HFSC_N_QUEUES
) {
4477 *queue_id
= minor
- 1;
4484 error
= hfsc_parse_tca_options__(nl_options
, options
);
4491 hfsc_query_class__(const struct netdev
*netdev
, unsigned int handle
,
4492 unsigned int parent
, struct hfsc_class
*options
,
4493 struct netdev_queue_stats
*stats
)
4496 struct ofpbuf
*reply
;
4498 error
= tc_query_class(netdev
, handle
, parent
, &reply
);
4503 error
= hfsc_parse_tcmsg__(reply
, NULL
, options
, stats
);
4504 ofpbuf_delete(reply
);
4509 hfsc_parse_qdisc_details__(struct netdev
*netdev_
, const struct smap
*details
,
4510 struct hfsc_class
*class)
4512 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4514 uint32_t max_rate
= smap_get_ullong(details
, "max-rate", 0) / 8;
4516 enum netdev_features current
;
4518 netdev_linux_read_features(netdev
);
4519 current
= !netdev
->get_features_error
? netdev
->current
: 0;
4520 max_rate
= netdev_features_to_bps(current
, 100 * 1000 * 1000) / 8;
4523 class->min_rate
= max_rate
;
4524 class->max_rate
= max_rate
;
4528 hfsc_parse_class_details__(struct netdev
*netdev
,
4529 const struct smap
*details
,
4530 struct hfsc_class
* class)
4532 const struct hfsc
*hfsc
;
4533 uint32_t min_rate
, max_rate
;
4535 hfsc
= hfsc_get__(netdev
);
4537 min_rate
= smap_get_ullong(details
, "min-rate", 0) / 8;
4538 min_rate
= MAX(min_rate
, 1);
4539 min_rate
= MIN(min_rate
, hfsc
->max_rate
);
4541 max_rate
= smap_get_ullong(details
, "max-rate", hfsc
->max_rate
* 8) / 8;
4542 max_rate
= MAX(max_rate
, min_rate
);
4543 max_rate
= MIN(max_rate
, hfsc
->max_rate
);
4545 class->min_rate
= min_rate
;
4546 class->max_rate
= max_rate
;
4551 /* Create an HFSC qdisc.
4553 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
4555 hfsc_setup_qdisc__(struct netdev
* netdev
)
4557 struct tcmsg
*tcmsg
;
4558 struct ofpbuf request
;
4559 struct tc_hfsc_qopt opt
;
4561 tc_del_qdisc(netdev
);
4563 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
4564 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
4570 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
4571 tcmsg
->tcm_parent
= TC_H_ROOT
;
4573 memset(&opt
, 0, sizeof opt
);
4576 nl_msg_put_string(&request
, TCA_KIND
, "hfsc");
4577 nl_msg_put_unspec(&request
, TCA_OPTIONS
, &opt
, sizeof opt
);
4579 return tc_transact(&request
, NULL
);
4582 /* Create an HFSC class.
4584 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
4585 * sc rate <min_rate> ul rate <max_rate>" */
4587 hfsc_setup_class__(struct netdev
*netdev
, unsigned int handle
,
4588 unsigned int parent
, struct hfsc_class
*class)
4592 struct tcmsg
*tcmsg
;
4593 struct ofpbuf request
;
4594 struct tc_service_curve min
, max
;
4596 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWTCLASS
, NLM_F_CREATE
,
4603 tcmsg
->tcm_handle
= handle
;
4604 tcmsg
->tcm_parent
= parent
;
4608 min
.m2
= class->min_rate
;
4612 max
.m2
= class->max_rate
;
4614 nl_msg_put_string(&request
, TCA_KIND
, "hfsc");
4615 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
4616 nl_msg_put_unspec(&request
, TCA_HFSC_RSC
, &min
, sizeof min
);
4617 nl_msg_put_unspec(&request
, TCA_HFSC_FSC
, &min
, sizeof min
);
4618 nl_msg_put_unspec(&request
, TCA_HFSC_USC
, &max
, sizeof max
);
4619 nl_msg_end_nested(&request
, opt_offset
);
4621 error
= tc_transact(&request
, NULL
);
4623 VLOG_WARN_RL(&rl
, "failed to replace %s class %u:%u, parent %u:%u, "
4624 "min-rate %ubps, max-rate %ubps (%s)",
4625 netdev_get_name(netdev
),
4626 tc_get_major(handle
), tc_get_minor(handle
),
4627 tc_get_major(parent
), tc_get_minor(parent
),
4628 class->min_rate
, class->max_rate
, ovs_strerror(error
));
4635 hfsc_tc_install(struct netdev
*netdev
, const struct smap
*details
)
4638 struct hfsc_class
class;
4640 error
= hfsc_setup_qdisc__(netdev
);
4646 hfsc_parse_qdisc_details__(netdev
, details
, &class);
4647 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
4648 tc_make_handle(1, 0), &class);
4654 hfsc_install__(netdev
, class.max_rate
);
4659 hfsc_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4662 struct queue_dump_state state
;
4663 struct hfsc_class hc
;
4666 hfsc_query_class__(netdev
, tc_make_handle(1, 0xfffe), 0, &hc
, NULL
);
4667 hfsc_install__(netdev
, hc
.max_rate
);
4669 if (!start_queue_dump(netdev
, &state
)) {
4673 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
4674 unsigned int queue_id
;
4676 if (!hfsc_parse_tcmsg__(&msg
, &queue_id
, &hc
, NULL
)) {
4677 hfsc_update_queue__(netdev
, queue_id
, &hc
);
4681 finish_queue_dump(&state
);
4686 hfsc_tc_destroy(struct tc
*tc
)
4689 struct hfsc_class
*hc
, *next
;
4691 hfsc
= CONTAINER_OF(tc
, struct hfsc
, tc
);
4693 HMAP_FOR_EACH_SAFE (hc
, next
, tc_queue
.hmap_node
, &hfsc
->tc
.queues
) {
4694 hmap_remove(&hfsc
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
4703 hfsc_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
4705 const struct hfsc
*hfsc
;
4706 hfsc
= hfsc_get__(netdev
);
4707 smap_add_format(details
, "max-rate", "%llu", 8ULL * hfsc
->max_rate
);
4712 hfsc_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
4715 struct hfsc_class
class;
4717 hfsc_parse_qdisc_details__(netdev
, details
, &class);
4718 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
4719 tc_make_handle(1, 0), &class);
4722 hfsc_get__(netdev
)->max_rate
= class.max_rate
;
4729 hfsc_class_get(const struct netdev
*netdev OVS_UNUSED
,
4730 const struct tc_queue
*queue
, struct smap
*details
)
4732 const struct hfsc_class
*hc
;
4734 hc
= hfsc_class_cast__(queue
);
4735 smap_add_format(details
, "min-rate", "%llu", 8ULL * hc
->min_rate
);
4736 if (hc
->min_rate
!= hc
->max_rate
) {
4737 smap_add_format(details
, "max-rate", "%llu", 8ULL * hc
->max_rate
);
4743 hfsc_class_set(struct netdev
*netdev
, unsigned int queue_id
,
4744 const struct smap
*details
)
4747 struct hfsc_class
class;
4749 error
= hfsc_parse_class_details__(netdev
, details
, &class);
4754 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, queue_id
+ 1),
4755 tc_make_handle(1, 0xfffe), &class);
4760 hfsc_update_queue__(netdev
, queue_id
, &class);
4765 hfsc_class_delete(struct netdev
*netdev
, struct tc_queue
*queue
)
4769 struct hfsc_class
*hc
;
4771 hc
= hfsc_class_cast__(queue
);
4772 hfsc
= hfsc_get__(netdev
);
4774 error
= tc_delete_class(netdev
, tc_make_handle(1, queue
->queue_id
+ 1));
4776 hmap_remove(&hfsc
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
4783 hfsc_class_get_stats(const struct netdev
*netdev
, const struct tc_queue
*queue
,
4784 struct netdev_queue_stats
*stats
)
4786 return hfsc_query_class__(netdev
, tc_make_handle(1, queue
->queue_id
+ 1),
4787 tc_make_handle(1, 0xfffe), NULL
, stats
);
4791 hfsc_class_dump_stats(const struct netdev
*netdev OVS_UNUSED
,
4792 const struct ofpbuf
*nlmsg
,
4793 netdev_dump_queue_stats_cb
*cb
, void *aux
)
4795 struct netdev_queue_stats stats
;
4796 unsigned int handle
, major
, minor
;
4799 error
= tc_parse_class(nlmsg
, &handle
, NULL
, &stats
);
4804 major
= tc_get_major(handle
);
4805 minor
= tc_get_minor(handle
);
4806 if (major
== 1 && minor
> 0 && minor
<= HFSC_N_QUEUES
) {
4807 (*cb
)(minor
- 1, &stats
, aux
);
4812 static const struct tc_ops tc_ops_hfsc
= {
4813 .linux_name
= "hfsc",
4814 .ovs_name
= "linux-hfsc",
4815 .n_queues
= HFSC_N_QUEUES
, /* n_queues */
4816 .tc_install
= hfsc_tc_install
,
4817 .tc_load
= hfsc_tc_load
,
4818 .tc_destroy
= hfsc_tc_destroy
,
4819 .qdisc_get
= hfsc_qdisc_get
,
4820 .qdisc_set
= hfsc_qdisc_set
,
4821 .class_get
= hfsc_class_get
,
4822 .class_set
= hfsc_class_set
,
4823 .class_delete
= hfsc_class_delete
,
4824 .class_get_stats
= hfsc_class_get_stats
,
4825 .class_dump_stats
= hfsc_class_dump_stats
,
4828 /* "linux-noop" traffic control class. */
4831 noop_install__(struct netdev
*netdev_
)
4833 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4834 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_default
);
4836 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
4840 noop_tc_install(struct netdev
*netdev
,
4841 const struct smap
*details OVS_UNUSED
)
4843 noop_install__(netdev
);
4848 noop_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4850 noop_install__(netdev
);
4854 static const struct tc_ops tc_ops_noop
= {
4855 .ovs_name
= "linux-noop", /* ovs_name */
4856 .tc_install
= noop_tc_install
,
4857 .tc_load
= noop_tc_load
,
4860 /* "linux-default" traffic control class.
4862 * This class represents the default, unnamed Linux qdisc. It corresponds to
4863 * the "" (empty string) QoS type in the OVS database. */
4866 default_install__(struct netdev
*netdev_
)
4868 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4869 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_default
);
4871 /* Nothing but a tc class implementation is allowed to write to a tc. This
4872 * class never does that, so we can legitimately use a const tc object. */
4873 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
4877 default_tc_install(struct netdev
*netdev
,
4878 const struct smap
*details OVS_UNUSED
)
4880 default_install__(netdev
);
4885 default_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4887 default_install__(netdev
);
4891 static const struct tc_ops tc_ops_default
= {
4892 .ovs_name
= "", /* ovs_name */
4893 .tc_install
= default_tc_install
,
4894 .tc_load
= default_tc_load
,
4897 /* "linux-other" traffic control class.
4902 other_tc_load(struct netdev
*netdev_
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4904 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4905 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_other
);
4907 /* Nothing but a tc class implementation is allowed to write to a tc. This
4908 * class never does that, so we can legitimately use a const tc object. */
4909 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
4913 static const struct tc_ops tc_ops_other
= {
4914 .ovs_name
= "linux-other",
4915 .tc_load
= other_tc_load
,
4918 /* Traffic control. */
4920 /* Number of kernel "tc" ticks per second. */
4921 static double ticks_per_s
;
4923 /* Number of kernel "jiffies" per second. This is used for the purpose of
4924 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
4925 * one jiffy's worth of data.
4927 * There are two possibilities here:
4929 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
4930 * approximate range of 100 to 1024. That means that we really need to
4931 * make sure that the qdisc can buffer that much data.
4933 * - 'buffer_hz' is an absurdly large number. That means that the kernel
4934 * has finely granular timers and there's no need to fudge additional room
4935 * for buffers. (There's no extra effort needed to implement that: the
4936 * large 'buffer_hz' is used as a divisor, so practically any number will
4937 * come out as 0 in the division. Small integer results in the case of
4938 * really high dividends won't have any real effect anyhow.)
4940 static unsigned int buffer_hz
;
4942 static struct tcmsg
*
4943 netdev_linux_tc_make_request(const struct netdev
*netdev
, int type
,
4944 unsigned int flags
, struct ofpbuf
*request
)
4949 error
= get_ifindex(netdev
, &ifindex
);
4954 return tc_make_request(ifindex
, type
, flags
, request
);
4957 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4960 * This function is equivalent to running:
4961 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4962 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4965 * The configuration and stats may be seen with the following command:
4966 * /sbin/tc -s filter show dev <devname> parent ffff:
4968 * Returns 0 if successful, otherwise a positive errno value.
4971 tc_add_policer(struct netdev
*netdev
,
4972 uint32_t kbits_rate
, uint32_t kbits_burst
)
4974 struct tc_police tc_police
;
4975 struct ofpbuf request
;
4976 struct tcmsg
*tcmsg
;
4977 size_t basic_offset
;
4978 size_t police_offset
;
/* Packets exceeding the policed rate are dropped (TC_POLICE_SHOT).
 * NOTE(review): 'mtu' and 'error' are used below but their declarations
 * are not visible in this excerpt. */
4982 memset(&tc_police
, 0, sizeof tc_police
);
4983 tc_police
.action
= TC_POLICE_SHOT
;
4984 tc_police
.mtu
= mtu
;
/* kbits_rate is in kilobits/s; convert to bytes/s for tc_fill_rate(). */
4985 tc_fill_rate(&tc_police
.rate
, ((uint64_t) kbits_rate
* 1000)/8, mtu
);
4987 /* The following appears wrong in one way: In networking a kilobit is
4988 * usually 1000 bits but this uses 1024 bits.
4990 * However if you "fix" those problems then "tc filter show ..." shows
4991 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
4992 * 1,000,000 bits, whereas this actually ends up doing the right thing from
4993 * tc's point of view. Whatever. */
/* MIN(...) clamps kbits_burst so the "* 1024" cannot overflow 32 bits. */
4994 tc_police
.burst
= tc_bytes_to_ticks(
4995 tc_police
.rate
.rate
, MIN(UINT32_MAX
/ 1024, kbits_burst
) * 1024 / 8);
4997 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWTFILTER
,
4998 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
/* Attach the filter to the ingress qdisc (parent ffff:) at priority 49,
 * matching all protocols (ETH_P_ALL). */
5002 tcmsg
->tcm_parent
= tc_make_handle(0xffff, 0);
5003 tcmsg
->tcm_info
= tc_make_handle(49,
5004 (OVS_FORCE
uint16_t) htons(ETH_P_ALL
));
/* Nested attribute layout: TCA_OPTIONS > TCA_BASIC_POLICE >
 * { TCA_POLICE_TBF, TCA_POLICE_RATE rtab }. */
5006 nl_msg_put_string(&request
, TCA_KIND
, "basic");
5007 basic_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
5008 police_offset
= nl_msg_start_nested(&request
, TCA_BASIC_POLICE
);
5009 nl_msg_put_unspec(&request
, TCA_POLICE_TBF
, &tc_police
, sizeof tc_police
);
5010 tc_put_rtab(&request
, TCA_POLICE_RATE
, &tc_police
.rate
);
5011 nl_msg_end_nested(&request
, police_offset
);
5012 nl_msg_end_nested(&request
, basic_offset
);
5014 error
= tc_transact(&request
, NULL
);
/* NOTE(review): the enclosing function's signature is not visible in this
 * excerpt; from the body it is a once-guarded initializer that reads the
 * four hex words of /proc/net/psched and derives 'ticks_per_s' (and,
 * per the trailing log line, 'buffer_hz') from them. */
5025 /* The values in psched are not individually very meaningful, but they are
5026 * important. The tables below show some values seen in the wild.
5030 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
5031 * (Before that, there are hints that it was 1000000000.)
5033 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
5037 * -----------------------------------
5038 * [1] 000c8000 000f4240 000f4240 00000064
5039 * [2] 000003e8 00000400 000f4240 3b9aca00
5040 * [3] 000003e8 00000400 000f4240 3b9aca00
5041 * [4] 000003e8 00000400 000f4240 00000064
5042 * [5] 000003e8 00000040 000f4240 3b9aca00
5043 * [6] 000003e8 00000040 000f4240 000000f9
5045 * a b c d ticks_per_s buffer_hz
5046 * ------- --------- ---------- ------------- ----------- -------------
5047 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
5048 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
5049 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
5050 * [4] 1,000 1,024 1,000,000 100 976,562 100
5051 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
5052 * [6] 1,000 64 1,000,000 249 15,625,000 249
5054 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
5055 * [2] 2.6.26-1-686-bigmem from Debian lenny
5056 * [3] 2.6.26-2-sparc64 from Debian lenny
5057 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
5058 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
5059 * [6] 2.6.34 from kernel.org on KVM
5061 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
5062 static const char fn
[] = "/proc/net/psched";
5063 unsigned int a
, b
, c
, d
;
/* Run the body at most once per process. */
5066 if (!ovsthread_once_start(&once
)) {
5073 stream
= fopen(fn
, "r");
5075 VLOG_WARN("%s: open failed: %s", fn
, ovs_strerror(errno
));
5079 if (fscanf(stream
, "%x %x %x %x", &a
, &b
, &c
, &d
) != 4) {
5080 VLOG_WARN("%s: read failed", fn
);
5084 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn
, a
, b
, c
, d
);
/* Any zero among a, b, c would make the derived rate meaningless
 * (division by zero or a zero tick rate). */
5087 if (!a
|| !b
|| !c
) {
5088 VLOG_WARN("%s: invalid scheduler parameters", fn
);
5092 ticks_per_s
= (double) a
* c
/ b
;
5096 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
5099 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn
, ticks_per_s
, buffer_hz
);
5102 ovsthread_once_done(&once
);
5105 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
5106 * rate of 'rate' bytes per second. */
5108 tc_ticks_to_bytes(unsigned int rate
, unsigned int ticks
)
/* ticks_per_s is the file-scope value derived from /proc/net/psched. */
5111 return (rate
* ticks
) / ticks_per_s
;
5114 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
5115 * rate of 'rate' bytes per second. */
5117 tc_bytes_to_ticks(unsigned int rate
, unsigned int size
)
/* Guard against division by zero when 'rate' is 0; the widening cast
 * avoids 32-bit overflow in the multiplication. */
5120 return rate
? ((unsigned long long int) ticks_per_s
* size
) / rate
: 0;
5123 /* Returns the number of bytes that need to be reserved for qdisc buffering at
5124 * a transmission rate of 'rate' bytes per second. */
5126 tc_buffer_per_jiffy(unsigned int rate
)
/* 'buffer_hz' is huge on fine-grained-timer kernels, making this ~0;
 * see the comment at its declaration. */
5129 return rate
/ buffer_hz
;
5132 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
5133 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
5134 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
5135 * stores NULL into it if it is absent.
5137 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
5140 * Returns 0 if successful, otherwise a positive errno value. */
5142 tc_parse_qdisc(const struct ofpbuf
*msg
, const char **kind
,
5143 struct nlattr
**options
)
/* TCA_KIND is mandatory; TCA_OPTIONS may legitimately be absent. */
5145 static const struct nl_policy tca_policy
[] = {
5146 [TCA_KIND
] = { .type
= NL_A_STRING
, .optional
= false },
5147 [TCA_OPTIONS
] = { .type
= NL_A_NESTED
, .optional
= true },
5149 struct nlattr
*ta
[ARRAY_SIZE(tca_policy
)];
/* Attributes start after the netlink header plus the fixed tcmsg. */
5151 if (!nl_policy_parse(msg
, NLMSG_HDRLEN
+ sizeof(struct tcmsg
),
5152 tca_policy
, ta
, ARRAY_SIZE(ta
))) {
5153 VLOG_WARN_RL(&rl
, "failed to parse qdisc message");
5158 *kind
= nl_attr_get_string(ta
[TCA_KIND
]);
5162 *options
= ta
[TCA_OPTIONS
];
5177 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
5178 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
5179 * into '*options', and its queue statistics into '*stats'. Any of the output
5180 * arguments may be null.
5182 * Returns 0 if successful, otherwise a positive errno value. */
5184 tc_parse_class(const struct ofpbuf
*msg
, unsigned int *handlep
,
5185 struct nlattr
**options
, struct netdev_queue_stats
*stats
)
5187 static const struct nl_policy tca_policy
[] = {
5188 [TCA_OPTIONS
] = { .type
= NL_A_NESTED
, .optional
= false },
5189 [TCA_STATS2
] = { .type
= NL_A_NESTED
, .optional
= false },
5191 struct nlattr
*ta
[ARRAY_SIZE(tca_policy
)];
5193 if (!nl_policy_parse(msg
, NLMSG_HDRLEN
+ sizeof(struct tcmsg
),
5194 tca_policy
, ta
, ARRAY_SIZE(ta
))) {
5195 VLOG_WARN_RL(&rl
, "failed to parse class message");
/* The class handle comes from the fixed tcmsg header, not an attribute. */
5200 struct tcmsg
*tc
= ofpbuf_at_assert(msg
, NLMSG_HDRLEN
, sizeof *tc
);
5201 *handlep
= tc
->tcm_handle
;
5205 *options
= ta
[TCA_OPTIONS
];
/* Statistics live inside the nested TCA_STATS2 attribute as
 * TCA_STATS_BASIC (byte/packet counters) and TCA_STATS_QUEUE (drops). */
5209 const struct gnet_stats_queue
*gsq
;
5210 struct gnet_stats_basic gsb
;
5212 static const struct nl_policy stats_policy
[] = {
5213 [TCA_STATS_BASIC
] = { .type
= NL_A_UNSPEC
, .optional
= false,
5214 .min_len
= sizeof gsb
},
5215 [TCA_STATS_QUEUE
] = { .type
= NL_A_UNSPEC
, .optional
= false,
5216 .min_len
= sizeof *gsq
},
5218 struct nlattr
*sa
[ARRAY_SIZE(stats_policy
)];
5220 if (!nl_parse_nested(ta
[TCA_STATS2
], stats_policy
,
5221 sa
, ARRAY_SIZE(sa
))) {
5222 VLOG_WARN_RL(&rl
, "failed to parse class stats");
5226 /* Alignment issues screw up the length of struct gnet_stats_basic on
5227 * some arch/bitsize combinations. Newer versions of Linux have a
5228 * struct gnet_stats_basic_packed, but we can't depend on that. The
5229 * easiest thing to do is just to make a copy. */
5230 memset(&gsb
, 0, sizeof gsb
);
5231 memcpy(&gsb
, nl_attr_get(sa
[TCA_STATS_BASIC
]),
5232 MIN(nl_attr_get_size(sa
[TCA_STATS_BASIC
]), sizeof gsb
));
5233 stats
->tx_bytes
= gsb
.bytes
;
5234 stats
->tx_packets
= gsb
.packets
;
5236 gsq
= nl_attr_get(sa
[TCA_STATS_QUEUE
]);
5237 stats
->tx_errors
= gsq
->drops
;
/* On the error path the caller still gets deterministic zeroed stats. */
5247 memset(stats
, 0, sizeof *stats
);
5252 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
5255 tc_query_class(const struct netdev
*netdev
,
5256 unsigned int handle
, unsigned int parent
,
5257 struct ofpbuf
**replyp
)
5259 struct ofpbuf request
;
5260 struct tcmsg
*tcmsg
;
/* NLM_F_ECHO asks the kernel to send the matched class back in the reply,
 * which tc_transact() stores in '*replyp'. */
5263 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_GETTCLASS
, NLM_F_ECHO
,
5268 tcmsg
->tcm_handle
= handle
;
5269 tcmsg
->tcm_parent
= parent
;
5271 error
= tc_transact(&request
, replyp
);
5273 VLOG_WARN_RL(&rl
, "query %s class %u:%u (parent %u:%u) failed (%s)",
5274 netdev_get_name(netdev
),
5275 tc_get_major(handle
), tc_get_minor(handle
),
5276 tc_get_major(parent
), tc_get_minor(parent
),
5277 ovs_strerror(error
));
5282 /* Equivalent to "tc class del dev <name> handle <handle>". */
5284 tc_delete_class(const struct netdev
*netdev
, unsigned int handle
)
5286 struct ofpbuf request
;
5287 struct tcmsg
*tcmsg
;
5290 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_DELTCLASS
, 0, &request
);
5294 tcmsg
->tcm_handle
= handle
;
5295 tcmsg
->tcm_parent
= 0;
/* No reply requested; only the transaction status matters. */
5297 error
= tc_transact(&request
, NULL
);
5299 VLOG_WARN_RL(&rl
, "delete %s class %u:%u failed (%s)",
5300 netdev_get_name(netdev
),
5301 tc_get_major(handle
), tc_get_minor(handle
),
5302 ovs_strerror(error
));
5307 /* Equivalent to "tc qdisc del dev <name> root". */
5309 tc_del_qdisc(struct netdev
*netdev_
)
5311 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5312 struct ofpbuf request
;
5313 struct tcmsg
*tcmsg
;
5316 tcmsg
= netdev_linux_tc_make_request(netdev_
, RTM_DELQDISC
, 0, &request
);
/* Handle 1:0 under TC_H_ROOT targets the root qdisc that OVS installs. */
5320 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
5321 tcmsg
->tcm_parent
= TC_H_ROOT
;
5323 error
= tc_transact(&request
, NULL
);
5324 if (error
== EINVAL
) {
5325 /* EINVAL probably means that the default qdisc was in use, in which
5326 * case we've accomplished our purpose. */
/* On success, also tear down the cached tc state for this netdev. */
5329 if (!error
&& netdev
->tc
) {
5330 if (netdev
->tc
->ops
->tc_destroy
) {
5331 netdev
->tc
->ops
->tc_destroy(netdev
->tc
);
/* Returns whether RTM_GETQDISC can be used without risk of an OOPS on this
 * kernel: true for Linux 2.6.35 and later, false otherwise (see the long
 * comment in tc_query_qdisc()).  The probe runs once and caches 'safe'. */
5339 getqdisc_is_safe(void)
5341 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
5342 static bool safe
= false;
5344 if (ovsthread_once_start(&once
)) {
5345 struct utsname utsname
;
/* Parse "major.minor" out of the kernel release string. */
5348 if (uname(&utsname
) == -1) {
5349 VLOG_WARN("uname failed (%s)", ovs_strerror(errno
));
5350 } else if (!ovs_scan(utsname
.release
, "%d.%d", &major
, &minor
)) {
5351 VLOG_WARN("uname reported bad OS release (%s)", utsname
.release
);
5352 } else if (major
< 2 || (major
== 2 && minor
< 35)) {
5353 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5358 ovsthread_once_done(&once
);
5363 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
5364 * kernel to determine what they are. Returns 0 if successful, otherwise a
5365 * positive errno value. */
5367 tc_query_qdisc(const struct netdev
*netdev_
)
5369 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5370 struct ofpbuf request
, *qdisc
;
5371 const struct tc_ops
*ops
;
5372 struct tcmsg
*tcmsg
;
5380 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5381 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5382 * 2.6.35 without that fix backported to it.
5384 * To avoid the OOPS, we must not make a request that would attempt to dump
5385 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5386 * few others. There are a few ways that I can see to do this, but most of
5387 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5388 * technique chosen here is to assume that any non-default qdisc that we
5389 * create will have a class with handle 1:0. The built-in qdiscs only have
5390 * a class with handle 0:0.
5392 * On Linux 2.6.35+ we use the straightforward method because it allows us
5393 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5394 * in such a case we get no response at all from the kernel (!) if a
5395 * builtin qdisc is in use (which is later caught by "!error &&
5396 * !qdisc->size"). */
5397 tcmsg
= netdev_linux_tc_make_request(netdev_
, RTM_GETQDISC
, NLM_F_ECHO
,
5402 tcmsg
->tcm_handle
= tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5403 tcmsg
->tcm_parent
= getqdisc_is_safe() ? TC_H_ROOT
: 0;
5405 /* Figure out what tc class to instantiate. */
5406 error
= tc_transact(&request
, &qdisc
);
5407 if (!error
&& qdisc
->size
) {
/* A non-empty reply names the installed qdisc; map it to our ops
 * table, falling back to tc_ops_other for kinds we don't model. */
5410 error
= tc_parse_qdisc(qdisc
, &kind
, NULL
);
5412 ops
= &tc_ops_other
;
5414 ops
= tc_lookup_linux_name(kind
);
5416 static struct vlog_rate_limit rl2
= VLOG_RATE_LIMIT_INIT(1, 1);
5417 VLOG_DBG_RL(&rl2
, "unknown qdisc \"%s\"", kind
);
5419 ops
= &tc_ops_other
;
5422 } else if ((!error
&& !qdisc
->size
) || error
== ENOENT
) {
5423 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5424 * set up by some other entity that doesn't have a handle 1:0. We will
5425 * assume that it's the system default qdisc. */
5426 ops
= &tc_ops_default
;
5429 /* Who knows? Maybe the device got deleted. */
5430 VLOG_WARN_RL(&rl
, "query %s qdisc failed (%s)",
5431 netdev_get_name(netdev_
), ovs_strerror(error
));
5432 ops
= &tc_ops_other
;
5435 /* Instantiate it. */
5436 load_error
= ops
->tc_load(CONST_CAST(struct netdev
*, netdev_
), qdisc
);
5437 ovs_assert((load_error
== 0) == (netdev
->tc
!= NULL
));
5438 ofpbuf_delete(qdisc
);
5440 return error
? error
: load_error
;
5443 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5444 approximate the time to transmit packets of various lengths. For an MTU of
5445 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5446 represents two possible packet lengths; for a MTU of 513 through 1024, four
5447 possible lengths; and so on.
5449 Returns, for the specified 'mtu', the number of bits that packet lengths
5450 need to be shifted right to fit within such a 256-entry table. */
5452 tc_calc_cell_log(unsigned int mtu
)
/* A zero 'mtu' falls back to the standard Ethernet payload size; the
 * guarding condition is not visible in this excerpt. */
5457 mtu
= ETH_PAYLOAD_MAX
;
/* Account for the L2 header and a VLAN tag on top of the payload. */
5459 mtu
+= ETH_HEADER_LEN
+ VLAN_HEADER_LEN
;
/* Count right-shifts until the (shifted) MTU fits the 256-entry table;
 * the loop body and return are not visible in this excerpt. */
5461 for (cell_log
= 0; mtu
>= 256; cell_log
++) {
5468 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5471 tc_fill_rate(struct tc_ratespec
*rate
, uint64_t Bps
, int mtu
)
5473 memset(rate
, 0, sizeof *rate
);
5474 rate
->cell_log
= tc_calc_cell_log(mtu
);
5475 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5476 /* rate->cell_align = 0; */ /* distro headers. */
/* Minimum policed unit: never bill a packet below the Ethernet minimum. */
5477 rate
->mpu
= ETH_TOTAL_MIN
;
5481 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
5482 * attribute of the specified "type".
5484 * See tc_calc_cell_log() above for a description of "rtab"s. */
5486 tc_put_rtab(struct ofpbuf
*msg
, uint16_t type
, const struct tc_ratespec
*rate
)
5491 rtab
= nl_msg_put_unspec_uninit(msg
, type
, TC_RTAB_SIZE
);
/* Entry i covers packet sizes up to (i+1) << cell_log; store the
 * transmit time in ticks for each, clamped below at the MPU. */
5492 for (i
= 0; i
< TC_RTAB_SIZE
/ sizeof *rtab
; i
++) {
5493 unsigned packet_size
= (i
+ 1) << rate
->cell_log
;
5494 if (packet_size
< rate
->mpu
) {
5495 packet_size
= rate
->mpu
;
5497 rtab
[i
] = tc_bytes_to_ticks(rate
->rate
, packet_size
);
5501 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
5502 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
5503 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
5506 tc_calc_buffer(unsigned int Bps
, int mtu
, uint64_t burst_bytes
)
/* The buffer must hold at least one timer tick's worth of data plus one
 * full MTU, regardless of what the user asked for. */
5508 unsigned int min_burst
= tc_buffer_per_jiffy(Bps
) + mtu
;
5509 return tc_bytes_to_ticks(Bps
, MAX(burst_bytes
, min_burst
));
5512 /* Linux-only functions declared in netdev-linux.h */
5514 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5515 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5517 netdev_linux_ethtool_set_flag(struct netdev
*netdev
, uint32_t flag
,
5518 const char *flag_name
, bool enable
)
5520 const char *netdev_name
= netdev_get_name(netdev
);
5521 struct ethtool_value evalue
;
/* Read-modify-write sequence: GFLAGS, adjust the single bit, SFLAGS,
 * then GFLAGS again to verify the driver actually accepted the change. */
5525 COVERAGE_INC(netdev_get_ethtool
);
5526 memset(&evalue
, 0, sizeof evalue
);
5527 error
= netdev_linux_do_ethtool(netdev_name
,
5528 (struct ethtool_cmd
*)&evalue
,
5529 ETHTOOL_GFLAGS
, "ETHTOOL_GFLAGS");
5534 COVERAGE_INC(netdev_set_ethtool
);
5535 new_flags
= (evalue
.data
& ~flag
) | (enable
? flag
: 0);
/* Bit already in the desired state: nothing to write. */
5536 if (new_flags
== evalue
.data
) {
5539 evalue
.data
= new_flags
;
5540 error
= netdev_linux_do_ethtool(netdev_name
,
5541 (struct ethtool_cmd
*)&evalue
,
5542 ETHTOOL_SFLAGS
, "ETHTOOL_SFLAGS");
5547 COVERAGE_INC(netdev_get_ethtool
);
5548 memset(&evalue
, 0, sizeof evalue
);
5549 error
= netdev_linux_do_ethtool(netdev_name
,
5550 (struct ethtool_cmd
*)&evalue
,
5551 ETHTOOL_GFLAGS
, "ETHTOOL_GFLAGS");
/* Some drivers report success on SFLAGS yet silently ignore it; catch
 * that case by comparing against the re-read value. */
5556 if (new_flags
!= evalue
.data
) {
5557 VLOG_WARN_RL(&rl
, "attempt to %s ethtool %s flag on network "
5558 "device %s failed", enable
? "enable" : "disable",
5559 flag_name
, netdev_name
);
5566 /* Utility functions. */
5568 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* Field-by-field widening copy from the kernel's 32-bit rtnl_link_stats
 * into OVS's netdev_stats. */
5570 netdev_stats_from_rtnl_link_stats(struct netdev_stats
*dst
,
5571 const struct rtnl_link_stats
*src
)
5573 dst
->rx_packets
= src
->rx_packets
;
5574 dst
->tx_packets
= src
->tx_packets
;
5575 dst
->rx_bytes
= src
->rx_bytes
;
5576 dst
->tx_bytes
= src
->tx_bytes
;
5577 dst
->rx_errors
= src
->rx_errors
;
5578 dst
->tx_errors
= src
->tx_errors
;
5579 dst
->rx_dropped
= src
->rx_dropped
;
5580 dst
->tx_dropped
= src
->tx_dropped
;
5581 dst
->multicast
= src
->multicast
;
5582 dst
->collisions
= src
->collisions
;
5583 dst
->rx_length_errors
= src
->rx_length_errors
;
5584 dst
->rx_over_errors
= src
->rx_over_errors
;
5585 dst
->rx_crc_errors
= src
->rx_crc_errors
;
5586 dst
->rx_frame_errors
= src
->rx_frame_errors
;
5587 dst
->rx_fifo_errors
= src
->rx_fifo_errors
;
5588 dst
->rx_missed_errors
= src
->rx_missed_errors
;
5589 dst
->tx_aborted_errors
= src
->tx_aborted_errors
;
5590 dst
->tx_carrier_errors
= src
->tx_carrier_errors
;
5591 dst
->tx_fifo_errors
= src
->tx_fifo_errors
;
5592 dst
->tx_heartbeat_errors
= src
->tx_heartbeat_errors
;
5593 dst
->tx_window_errors
= src
->tx_window_errors
;
5596 /* Copies 'src' into 'dst', performing format conversion in the process. */
/* Same field-by-field copy as above, but from the 64-bit
 * rtnl_link_stats64 variant (preferred when the kernel provides it). */
5598 netdev_stats_from_rtnl_link_stats64(struct netdev_stats
*dst
,
5599 const struct rtnl_link_stats64
*src
)
5601 dst
->rx_packets
= src
->rx_packets
;
5602 dst
->tx_packets
= src
->tx_packets
;
5603 dst
->rx_bytes
= src
->rx_bytes
;
5604 dst
->tx_bytes
= src
->tx_bytes
;
5605 dst
->rx_errors
= src
->rx_errors
;
5606 dst
->tx_errors
= src
->tx_errors
;
5607 dst
->rx_dropped
= src
->rx_dropped
;
5608 dst
->tx_dropped
= src
->tx_dropped
;
5609 dst
->multicast
= src
->multicast
;
5610 dst
->collisions
= src
->collisions
;
5611 dst
->rx_length_errors
= src
->rx_length_errors
;
5612 dst
->rx_over_errors
= src
->rx_over_errors
;
5613 dst
->rx_crc_errors
= src
->rx_crc_errors
;
5614 dst
->rx_frame_errors
= src
->rx_frame_errors
;
5615 dst
->rx_fifo_errors
= src
->rx_fifo_errors
;
5616 dst
->rx_missed_errors
= src
->rx_missed_errors
;
5617 dst
->tx_aborted_errors
= src
->tx_aborted_errors
;
5618 dst
->tx_carrier_errors
= src
->tx_carrier_errors
;
5619 dst
->tx_fifo_errors
= src
->tx_fifo_errors
;
5620 dst
->tx_heartbeat_errors
= src
->tx_heartbeat_errors
;
5621 dst
->tx_window_errors
= src
->tx_window_errors
;
/* Fetches interface statistics for 'netdev_' via an RTM_GETLINK request,
 * preferring the 64-bit IFLA_STATS64 attribute and falling back to the
 * 32-bit IFLA_STATS.  Returns 0 on success, otherwise presumably a positive
 * errno value (the tail of the function is not visible here; confirm). */
5625 get_stats_via_netlink(const struct netdev
*netdev_
, struct netdev_stats
*stats
)
5627 struct ofpbuf request
;
5628 struct ofpbuf
*reply
;
5631 /* Filtering all counters by default */
5632 memset(stats
, 0xFF, sizeof(struct netdev_stats
));
5634 ofpbuf_init(&request
, 0);
5635 nl_msg_put_nlmsghdr(&request
,
5636 sizeof(struct ifinfomsg
) + NL_ATTR_SIZE(IFNAMSIZ
),
5637 RTM_GETLINK
, NLM_F_REQUEST
);
5638 ofpbuf_put_zeros(&request
, sizeof(struct ifinfomsg
));
5639 nl_msg_put_string(&request
, IFLA_IFNAME
, netdev_get_name(netdev_
));
5640 error
= nl_transact(NETLINK_ROUTE
, &request
, &reply
);
5641 ofpbuf_uninit(&request
);
/* Skip netlink + ifinfomsg headers to reach the attribute area. */
5646 if (ofpbuf_try_pull(reply
, NLMSG_HDRLEN
+ sizeof(struct ifinfomsg
))) {
5647 const struct nlattr
*a
= nl_attr_find(reply
, 0, IFLA_STATS64
);
5648 if (a
&& nl_attr_get_size(a
) >= sizeof(struct rtnl_link_stats64
)) {
5649 netdev_stats_from_rtnl_link_stats64(stats
, nl_attr_get(a
));
5652 a
= nl_attr_find(reply
, 0, IFLA_STATS
);
5653 if (a
&& nl_attr_get_size(a
) >= sizeof(struct rtnl_link_stats
)) {
5654 netdev_stats_from_rtnl_link_stats(stats
, nl_attr_get(a
));
5657 VLOG_WARN_RL(&rl
, "RTM_GETLINK reply lacks stats");
5662 VLOG_WARN_RL(&rl
, "short RTM_GETLINK reply");
5667 ofpbuf_delete(reply
);
/* Reads the IFF_* interface flags of 'dev' via SIOCGIFFLAGS into '*flags'.
 * Returns the ioctl's error status (0 on success). */
5672 get_flags(const struct netdev
*dev
, unsigned int *flags
)
5678 error
= af_inet_ifreq_ioctl(dev
->name
, &ifr
, SIOCGIFFLAGS
, "SIOCGIFFLAGS");
5680 *flags
= ifr
.ifr_flags
;
/* Sets the IFF_* interface flags of device 'name' to 'flags' via
 * SIOCSIFFLAGS.  Returns 0 on success, otherwise a positive errno value. */
5686 set_flags(const char *name
, unsigned int flags
)
5690 ifr
.ifr_flags
= flags
;
5691 return af_inet_ifreq_ioctl(name
, &ifr
, SIOCSIFFLAGS
, "SIOCSIFFLAGS");
/* Looks up the kernel ifindex of 'netdev_name' with SIOCGIFINDEX.
 * Returns the ifindex on success; on failure logs and (per the negated use
 * in get_ifindex()) presumably returns a negative errno value -- the
 * failure-path return line is not visible here; confirm. */
5695 linux_get_ifindex(const char *netdev_name
)
5700 ovs_strzcpy(ifr
.ifr_name
, netdev_name
, sizeof ifr
.ifr_name
);
5701 COVERAGE_INC(netdev_get_ifindex
);
5703 error
= af_inet_ioctl(SIOCGIFINDEX
, &ifr
);
5705 /* ENODEV probably means that a vif disappeared asynchronously and
5706 * hasn't been removed from the database yet, so reduce the log level
5707 * to INFO for that case. */
5708 VLOG_RL(&rl
, error
== ENODEV
? VLL_INFO
: VLL_ERR
,
5709 "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5710 netdev_name
, ovs_strerror(error
));
5713 return ifr
.ifr_ifindex
;
/* Stores the cached ifindex of 'netdev_' in '*ifindexp', refreshing the
 * cache first via netlink and, if that fails, via the SIOCGIFINDEX ioctl.
 * Returns 0 on success, otherwise the positive errno recorded in
 * 'get_ifindex_error'. */
5717 get_ifindex(const struct netdev
*netdev_
, int *ifindexp
)
5719 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5721 if (!(netdev
->cache_valid
& VALID_IFINDEX
)) {
5722 netdev_linux_update_via_netlink(netdev
);
5725 if (!(netdev
->cache_valid
& VALID_IFINDEX
)) {
5726 /* Fall back to ioctl if netlink fails */
5727 int ifindex
= linux_get_ifindex(netdev_get_name(netdev_
));
/* linux_get_ifindex() reports failure as a negative errno value;
 * store its positive form and a zero ifindex. */
5730 netdev
->get_ifindex_error
= -ifindex
;
5731 netdev
->ifindex
= 0;
5733 netdev
->get_ifindex_error
= 0;
5734 netdev
->ifindex
= ifindex
;
5736 netdev
->cache_valid
|= VALID_IFINDEX
;
5739 *ifindexp
= netdev
->ifindex
;
5740 return netdev
->get_ifindex_error
;
/* Refreshes 'netdev''s cached link state (flags, MTU, MAC, ifindex, LAG
 * membership) from a fresh RTM_GETLINK query, bumping the netdev change
 * sequence when anything observable changed.  Return value and some
 * error-path lines are not visible in this excerpt. */
5744 netdev_linux_update_via_netlink(struct netdev_linux
*netdev
)
5746 struct ofpbuf request
;
5747 struct ofpbuf
*reply
;
5748 struct rtnetlink_change chg
;
5749 struct rtnetlink_change
*change
= &chg
;
5752 ofpbuf_init(&request
, 0);
5753 nl_msg_put_nlmsghdr(&request
,
5754 sizeof(struct ifinfomsg
) + NL_ATTR_SIZE(IFNAMSIZ
),
5755 RTM_GETLINK
, NLM_F_REQUEST
);
5756 ofpbuf_put_zeros(&request
, sizeof(struct ifinfomsg
));
5758 /* The correct identifiers for a Linux device are netnsid and ifindex,
5759 * but ifindex changes as the port is moved to another network namespace
5760 * and the interface name statically stored in ovsdb. */
5761 nl_msg_put_string(&request
, IFLA_IFNAME
, netdev_get_name(&netdev
->up
));
5762 if (netdev_linux_netnsid_is_remote(netdev
)) {
5763 nl_msg_push_u32(&request
, IFLA_IF_NETNSID
, netdev
->netnsid
);
5765 error
= nl_transact(NETLINK_ROUTE
, &request
, &reply
);
5766 ofpbuf_uninit(&request
);
5768 ofpbuf_delete(reply
);
5772 if (rtnetlink_parse(reply
, change
)
5773 && change
->nlmsg_type
== RTM_NEWLINK
) {
5774 bool changed
= false;
5777 /* Update netdev from rtnl msg and increment its seq if needed. */
/* A toggle of IFF_RUNNING counts as a carrier reset. */
5778 if ((change
->ifi_flags
^ netdev
->ifi_flags
) & IFF_RUNNING
) {
5779 netdev
->carrier_resets
++;
5782 if (change
->ifi_flags
!= netdev
->ifi_flags
) {
5783 netdev
->ifi_flags
= change
->ifi_flags
;
5786 if (change
->mtu
&& change
->mtu
!= netdev
->mtu
) {
5787 netdev
->mtu
= change
->mtu
;
5788 netdev
->cache_valid
|= VALID_MTU
;
5789 netdev
->netdev_mtu_error
= 0;
5792 if (!eth_addr_is_zero(change
->mac
)
5793 && !eth_addr_equals(change
->mac
, netdev
->etheraddr
)) {
5794 netdev
->etheraddr
= change
->mac
;
5795 netdev
->cache_valid
|= VALID_ETHERADDR
;
5796 netdev
->ether_addr_error
= 0;
5799 if (change
->if_index
!= netdev
->ifindex
) {
5800 netdev
->ifindex
= change
->if_index
;
5801 netdev
->cache_valid
|= VALID_IFINDEX
;
5802 netdev
->get_ifindex_error
= 0;
5805 if (change
->master
&& netdev_linux_kind_is_lag(change
->master
)) {
5806 netdev
->is_lag_master
= true;
5809 netdev_change_seq_changed(&netdev
->up
);
5815 ofpbuf_delete(reply
);
/* Reads the hardware (MAC) address of 'netdev_name' via SIOCGIFHWADDR into
 * '*ea'.  Rejects hardware address families other than AF_UNSPEC,
 * ARPHRD_ETHER, and ARPHRD_NONE. */
5820 get_etheraddr(const char *netdev_name
, struct eth_addr
*ea
)
5826 memset(&ifr
, 0, sizeof ifr
);
5827 ovs_strzcpy(ifr
.ifr_name
, netdev_name
, sizeof ifr
.ifr_name
);
5828 COVERAGE_INC(netdev_get_hwaddr
);
5829 error
= af_inet_ioctl(SIOCGIFHWADDR
, &ifr
);
5831 /* ENODEV probably means that a vif disappeared asynchronously and
5832 * hasn't been removed from the database yet, so reduce the log level
5833 * to INFO for that case. */
5834 VLOG(error
== ENODEV
? VLL_INFO
: VLL_ERR
,
5835 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
5836 netdev_name
, ovs_strerror(error
));
5839 hwaddr_family
= ifr
.ifr_hwaddr
.sa_family
;
5840 if (hwaddr_family
!= AF_UNSPEC
&& hwaddr_family
!= ARPHRD_ETHER
&&
5841 hwaddr_family
!= ARPHRD_NONE
) {
5842 VLOG_INFO("%s device has unknown hardware address family %d",
5843 netdev_name
, hwaddr_family
);
5846 memcpy(ea
, ifr
.ifr_hwaddr
.sa_data
, ETH_ADDR_LEN
);
/* Sets the hardware (MAC) address of 'netdev_name' to 'mac' via
 * SIOCSIFHWADDR, declaring the address family as ARPHRD_ETHER. */
5851 set_etheraddr(const char *netdev_name
, const struct eth_addr mac
)
5856 memset(&ifr
, 0, sizeof ifr
);
5857 ovs_strzcpy(ifr
.ifr_name
, netdev_name
, sizeof ifr
.ifr_name
);
5858 ifr
.ifr_hwaddr
.sa_family
= ARPHRD_ETHER
;
5859 memcpy(ifr
.ifr_hwaddr
.sa_data
, &mac
, ETH_ADDR_LEN
);
5860 COVERAGE_INC(netdev_set_hwaddr
);
5861 error
= af_inet_ioctl(SIOCSIFHWADDR
, &ifr
);
5863 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
5864 netdev_name
, ovs_strerror(error
));
/* Issues the ethtool command 'cmd' (e.g. ETHTOOL_GFLAGS) for device 'name'
 * through the SIOCETHTOOL ioctl, with 'ecmd' as the in/out command buffer.
 * 'cmd_name' is used only for logging.  EOPNOTSUPP is expected from many
 * drivers and deliberately not logged. */
5870 netdev_linux_do_ethtool(const char *name
, struct ethtool_cmd
*ecmd
,
5871 int cmd
, const char *cmd_name
)
5876 memset(&ifr
, 0, sizeof ifr
);
5877 ovs_strzcpy(ifr
.ifr_name
, name
, sizeof ifr
.ifr_name
);
5878 ifr
.ifr_data
= (caddr_t
) ecmd
;
5881 error
= af_inet_ioctl(SIOCETHTOOL
, &ifr
);
5883 if (error
!= EOPNOTSUPP
) {
5884 VLOG_WARN_RL(&rl
, "ethtool command %s on network device %s "
5885 "failed: %s", cmd_name
, name
, ovs_strerror(error
));
5887 /* The device doesn't support this operation. That's pretty
5888 * common, so there's no point in logging anything. */
5894 /* Returns an AF_PACKET raw socket or a negative errno value. */
5896 af_packet_sock(void)
5898 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
5901 if (ovsthread_once_start(&once
)) {
5902 sock
= socket(AF_PACKET
, SOCK_RAW
, 0);
5904 int error
= set_nonblocking(sock
);
5911 VLOG_ERR("failed to create packet socket: %s",
5912 ovs_strerror(errno
));
5914 ovsthread_once_done(&once
);