2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <sys/types.h>
24 #include <netinet/in.h>
25 #include <arpa/inet.h>
27 #include <linux/filter.h>
28 #include <linux/gen_stats.h>
29 #include <linux/if_ether.h>
30 #include <linux/if_tun.h>
31 #include <linux/types.h>
32 #include <linux/ethtool.h>
33 #include <linux/mii.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/ioctl.h>
37 #include <sys/socket.h>
38 #include <sys/utsname.h>
39 #include <netpacket/packet.h>
41 #include <net/if_arp.h>
42 #include <net/route.h>
49 #include "dp-packet.h"
50 #include "dpif-netlink.h"
51 #include "dpif-netdev.h"
52 #include "openvswitch/dynamic-string.h"
53 #include "fatal-signal.h"
55 #include "openvswitch/hmap.h"
56 #include "netdev-provider.h"
57 #include "netdev-tc-offloads.h"
58 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
63 #include "openvswitch/ofpbuf.h"
64 #include "openflow/openflow.h"
65 #include "ovs-atomic.h"
67 #include "openvswitch/poll-loop.h"
68 #include "rtnetlink.h"
69 #include "openvswitch/shash.h"
70 #include "socket-util.h"
74 #include "unaligned.h"
75 #include "openvswitch/vlog.h"
78 VLOG_DEFINE_THIS_MODULE(netdev_linux
);
80 COVERAGE_DEFINE(netdev_set_policing
);
81 COVERAGE_DEFINE(netdev_arp_lookup
);
82 COVERAGE_DEFINE(netdev_get_ifindex
);
83 COVERAGE_DEFINE(netdev_get_hwaddr
);
84 COVERAGE_DEFINE(netdev_set_hwaddr
);
85 COVERAGE_DEFINE(netdev_get_ethtool
);
86 COVERAGE_DEFINE(netdev_set_ethtool
);
89 #ifndef IFLA_IF_NETNSID
90 #define IFLA_IF_NETNSID 0x45
92 /* These were introduced in Linux 2.6.14, so they might be missing if we have
94 #ifndef ADVERTISED_Pause
95 #define ADVERTISED_Pause (1 << 13)
97 #ifndef ADVERTISED_Asym_Pause
98 #define ADVERTISED_Asym_Pause (1 << 14)
101 /* These were introduced in Linux 2.6.24, so they might be missing if we
102 * have old headers. */
103 #ifndef ETHTOOL_GFLAGS
104 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
106 #ifndef ETHTOOL_SFLAGS
107 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
110 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
113 #define TC_RTAB_SIZE 1024
116 /* Linux 2.6.21 introduced struct tpacket_auxdata.
117 * Linux 2.6.27 added the tp_vlan_tci member.
118 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
119 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
120 * TP_STATUS_VLAN_TPID_VALID.
122 * With all this churn it's easiest to unconditionally define a replacement
123 * structure that has everything we want.
125 #ifndef PACKET_AUXDATA
126 #define PACKET_AUXDATA 8
128 #ifndef TP_STATUS_VLAN_VALID
129 #define TP_STATUS_VLAN_VALID (1 << 4)
131 #ifndef TP_STATUS_VLAN_TPID_VALID
132 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
134 #undef tpacket_auxdata
135 #define tpacket_auxdata rpl_tpacket_auxdata
136 struct tpacket_auxdata
{
142 uint16_t tp_vlan_tci
;
143 uint16_t tp_vlan_tpid
;
146 /* Linux 2.6.27 introduced ethtool_cmd_speed
148 * To avoid revisiting problems reported with using configure to detect
149 * compatibility (see report at
150 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html)
151 * unconditionally replace ethtool_cmd_speed. */
#define ethtool_cmd_speed rpl_ethtool_cmd_speed
/* Returns the link speed encoded in 'ep', in Mbps.  The kernel splits the
 * speed across two fields for compatibility: 'speed' holds the low 16 bits
 * and 'speed_hi' the high 16 bits. */
static inline uint32_t
rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
{
    uint32_t speed_lo = ep->speed;
    uint32_t speed_hi = ep->speed_hi;

    return (speed_hi << 16) | speed_lo;
}
158 /* Linux 2.6.30 introduced supported and advertised flags for
159 * 1G base KX, and 10G base KX4, KR and R. */
160 #ifndef SUPPORTED_1000baseKX_Full
161 #define SUPPORTED_1000baseKX_Full (1 << 17)
162 #define SUPPORTED_10000baseKX4_Full (1 << 18)
163 #define SUPPORTED_10000baseKR_Full (1 << 19)
164 #define SUPPORTED_10000baseR_FEC (1 << 20)
165 #define ADVERTISED_1000baseKX_Full (1 << 17)
166 #define ADVERTISED_10000baseKX4_Full (1 << 18)
167 #define ADVERTISED_10000baseKR_Full (1 << 19)
168 #define ADVERTISED_10000baseR_FEC (1 << 20)
171 /* Linux 3.5 introduced supported and advertised flags for
172 * 40G base KR4, CR4, SR4 and LR4. */
173 #ifndef SUPPORTED_40000baseKR4_Full
174 #define SUPPORTED_40000baseKR4_Full (1 << 23)
175 #define SUPPORTED_40000baseCR4_Full (1 << 24)
176 #define SUPPORTED_40000baseSR4_Full (1 << 25)
177 #define SUPPORTED_40000baseLR4_Full (1 << 26)
178 #define ADVERTISED_40000baseKR4_Full (1 << 23)
179 #define ADVERTISED_40000baseCR4_Full (1 << 24)
180 #define ADVERTISED_40000baseSR4_Full (1 << 25)
181 #define ADVERTISED_40000baseLR4_Full (1 << 26)
184 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
186 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
187 * 2.6.32-431.29.2.el6.x86_64 (see report at
188 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html).
189 * Maybe if_link.h is not self-contained on those kernels. It is easiest to
190 * unconditionally define a replacement. */
192 #define IFLA_STATS64 23
194 #define rtnl_link_stats64 rpl_rtnl_link_stats64
195 struct rtnl_link_stats64
{
207 uint64_t rx_length_errors
;
208 uint64_t rx_over_errors
;
209 uint64_t rx_crc_errors
;
210 uint64_t rx_frame_errors
;
211 uint64_t rx_fifo_errors
;
212 uint64_t rx_missed_errors
;
214 uint64_t tx_aborted_errors
;
215 uint64_t tx_carrier_errors
;
216 uint64_t tx_fifo_errors
;
217 uint64_t tx_heartbeat_errors
;
218 uint64_t tx_window_errors
;
220 uint64_t rx_compressed
;
221 uint64_t tx_compressed
;
225 VALID_IFINDEX
= 1 << 0,
226 VALID_ETHERADDR
= 1 << 1,
229 VALID_POLICING
= 1 << 4,
230 VALID_VPORT_STAT_ERROR
= 1 << 5,
231 VALID_DRVINFO
= 1 << 6,
232 VALID_FEATURES
= 1 << 7,
235 struct linux_lag_slave
{
237 struct shash_node
*node
;
240 /* Protects 'lag_shash' and the mutable members of struct linux_lag_slave. */
241 static struct ovs_mutex lag_mutex
= OVS_MUTEX_INITIALIZER
;
243 /* All slaves whose LAG masters are network devices in OvS. */
244 static struct shash lag_shash
OVS_GUARDED_BY(lag_mutex
)
245 = SHASH_INITIALIZER(&lag_shash
);
247 /* Traffic control. */
249 /* An instance of a traffic control class. Always associated with a particular
252 * Each TC implementation subclasses this with whatever additional data it
255 const struct tc_ops
*ops
;
256 struct hmap queues
; /* Contains "struct tc_queue"s.
257 * Read by generic TC layer.
258 * Written only by TC implementation. */
261 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
263 /* One traffic control queue.
265 * Each TC implementation subclasses this with whatever additional data it
268 struct hmap_node hmap_node
; /* In struct tc's "queues" hmap. */
269 unsigned int queue_id
; /* OpenFlow queue ID. */
270 long long int created
; /* Time queue was created, in msecs. */
273 /* A particular kind of traffic control. Each implementation generally maps to
274 * one particular Linux qdisc class.
276 * The functions below return 0 if successful or a positive errno value on
277 * failure, except where otherwise noted. All of them must be provided, except
278 * where otherwise noted. */
280 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
281 * This is null for tc_ops_default and tc_ops_other, for which there are no
282 * appropriate values. */
283 const char *linux_name
;
285 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
286 const char *ovs_name
;
288 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
289 * queues. The queues are numbered 0 through n_queues - 1. */
290 unsigned int n_queues
;
292 /* Called to install this TC class on 'netdev'. The implementation should
293 * make the Netlink calls required to set up 'netdev' with the right qdisc
294 * and configure it according to 'details'. The implementation may assume
295 * that the current qdisc is the default; that is, there is no need for it
296 * to delete the current qdisc before installing itself.
298 * The contents of 'details' should be documented as valid for 'ovs_name'
299 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
300 * (which is built as ovs-vswitchd.conf.db(8)).
302 * This function must return 0 if and only if it sets 'netdev->tc' to an
303 * initialized 'struct tc'.
305 * (This function is null for tc_ops_other, which cannot be installed. For
306 * other TC classes it should always be nonnull.) */
307 int (*tc_install
)(struct netdev
*netdev
, const struct smap
*details
);
309 /* Called when the netdev code determines (through a Netlink query) that
310 * this TC class's qdisc is installed on 'netdev', but we didn't install
311 * it ourselves and so don't know any of the details.
313 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
314 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
315 * implementation should parse the other attributes of 'nlmsg' as
316 * necessary to determine its configuration. If necessary it should also
317 * use Netlink queries to determine the configuration of queues on
320 * This function must return 0 if and only if it sets 'netdev->tc' to an
321 * initialized 'struct tc'. */
322 int (*tc_load
)(struct netdev
*netdev
, struct ofpbuf
*nlmsg
);
324 /* Destroys the data structures allocated by the implementation as part of
325 * 'tc'. (This includes destroying 'tc->queues' by calling
328 * The implementation should not need to perform any Netlink calls. If
329 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
330 * (But it may not be desirable.)
332 * This function may be null if 'tc' is trivial. */
333 void (*tc_destroy
)(struct tc
*tc
);
335 /* Retrieves details of 'netdev->tc' configuration into 'details'.
337 * The implementation should not need to perform any Netlink calls, because
338 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
339 * cached the configuration.
341 * The contents of 'details' should be documented as valid for 'ovs_name'
342 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
343 * (which is built as ovs-vswitchd.conf.db(8)).
345 * This function may be null if 'tc' is not configurable.
347 int (*qdisc_get
)(const struct netdev
*netdev
, struct smap
*details
);
349 /* Reconfigures 'netdev->tc' according to 'details', performing any
350 * required Netlink calls to complete the reconfiguration.
352 * The contents of 'details' should be documented as valid for 'ovs_name'
353 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
354 * (which is built as ovs-vswitchd.conf.db(8)).
356 * This function may be null if 'tc' is not configurable.
358 int (*qdisc_set
)(struct netdev
*, const struct smap
*details
);
360 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
361 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
363 * The contents of 'details' should be documented as valid for 'ovs_name'
364 * in the "other_config" column in the "Queue" table in
365 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
367 * The implementation should not need to perform any Netlink calls, because
368 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
369 * cached the queue configuration.
371 * This function may be null if 'tc' does not have queues ('n_queues' is
373 int (*class_get
)(const struct netdev
*netdev
, const struct tc_queue
*queue
,
374 struct smap
*details
);
376 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
377 * 'details', perfoming any required Netlink calls to complete the
378 * reconfiguration. The caller ensures that 'queue_id' is less than
381 * The contents of 'details' should be documented as valid for 'ovs_name'
382 * in the "other_config" column in the "Queue" table in
383 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
385 * This function may be null if 'tc' does not have queues or its queues are
386 * not configurable. */
387 int (*class_set
)(struct netdev
*, unsigned int queue_id
,
388 const struct smap
*details
);
390 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
391 * tc_queue's within 'netdev->tc->queues'.
393 * This function may be null if 'tc' does not have queues or its queues
394 * cannot be deleted. */
395 int (*class_delete
)(struct netdev
*, struct tc_queue
*queue
);
397 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
398 * 'struct tc_queue's within 'netdev->tc->queues'.
400 * On success, initializes '*stats'.
402 * This function may be null if 'tc' does not have queues or if it cannot
403 * report queue statistics. */
404 int (*class_get_stats
)(const struct netdev
*netdev
,
405 const struct tc_queue
*queue
,
406 struct netdev_queue_stats
*stats
);
408 /* Extracts queue stats from 'nlmsg', which is a response to a
409 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
411 * This function may be null if 'tc' does not have queues or if it cannot
412 * report queue statistics. */
413 int (*class_dump_stats
)(const struct netdev
*netdev
,
414 const struct ofpbuf
*nlmsg
,
415 netdev_dump_queue_stats_cb
*cb
, void *aux
);
419 tc_init(struct tc
*tc
, const struct tc_ops
*ops
)
422 hmap_init(&tc
->queues
);
/* Releases the generic resources of 'tc' (the internal storage of its queue
 * map).  Note this does not free the queue elements themselves; per the
 * tc_ops contract, the TC implementation's 'tc_destroy' callback is
 * responsible for those before calling this. */
static void
tc_destroy(struct tc *tc)
{
    hmap_destroy(&tc->queues);
}
431 static const struct tc_ops tc_ops_htb
;
432 static const struct tc_ops tc_ops_hfsc
;
433 static const struct tc_ops tc_ops_codel
;
434 static const struct tc_ops tc_ops_fqcodel
;
435 static const struct tc_ops tc_ops_sfq
;
436 static const struct tc_ops tc_ops_default
;
437 static const struct tc_ops tc_ops_noop
;
438 static const struct tc_ops tc_ops_other
;
440 static const struct tc_ops
*const tcs
[] = {
441 &tc_ops_htb
, /* Hierarchy token bucket (see tc-htb(8)). */
442 &tc_ops_hfsc
, /* Hierarchical fair service curve. */
443 &tc_ops_codel
, /* Controlled delay */
444 &tc_ops_fqcodel
, /* Fair queue controlled delay */
445 &tc_ops_sfq
, /* Stochastic fair queueing */
446 &tc_ops_noop
, /* Non operating qos type. */
447 &tc_ops_default
, /* Default qdisc (see tc-pfifo_fast(8)). */
448 &tc_ops_other
, /* Some other qdisc. */
452 static unsigned int tc_ticks_to_bytes(unsigned int rate
, unsigned int ticks
);
453 static unsigned int tc_bytes_to_ticks(unsigned int rate
, unsigned int size
);
454 static unsigned int tc_buffer_per_jiffy(unsigned int rate
);
456 static struct tcmsg
*netdev_linux_tc_make_request(const struct netdev
*,
460 static int tc_add_policer(struct netdev
*,
461 uint32_t kbits_rate
, uint32_t kbits_burst
);
463 static int tc_parse_qdisc(const struct ofpbuf
*, const char **kind
,
464 struct nlattr
**options
);
465 static int tc_parse_class(const struct ofpbuf
*, unsigned int *queue_id
,
466 struct nlattr
**options
,
467 struct netdev_queue_stats
*);
468 static int tc_query_class(const struct netdev
*,
469 unsigned int handle
, unsigned int parent
,
470 struct ofpbuf
**replyp
);
471 static int tc_delete_class(const struct netdev
*, unsigned int handle
);
473 static int tc_del_qdisc(struct netdev
*netdev
);
474 static int tc_query_qdisc(const struct netdev
*netdev
);
476 static int tc_calc_cell_log(unsigned int mtu
);
477 static void tc_fill_rate(struct tc_ratespec
*rate
, uint64_t bps
, int mtu
);
478 static void tc_put_rtab(struct ofpbuf
*, uint16_t type
,
479 const struct tc_ratespec
*rate
);
480 static int tc_calc_buffer(unsigned int Bps
, int mtu
, uint64_t burst_bytes
);
482 struct netdev_linux
{
485 /* Protects all members below. */
486 struct ovs_mutex mutex
;
488 unsigned int cache_valid
;
490 bool miimon
; /* Link status of last poll. */
491 long long int miimon_interval
; /* Miimon Poll rate. Disabled if <= 0. */
492 struct timer miimon_timer
;
494 int netnsid
; /* Network namespace ID. */
495 /* The following are figured out "on demand" only. They are only valid
496 * when the corresponding VALID_* bit in 'cache_valid' is set. */
498 struct eth_addr etheraddr
;
500 unsigned int ifi_flags
;
501 long long int carrier_resets
;
502 uint32_t kbits_rate
; /* Policing data. */
503 uint32_t kbits_burst
;
504 int vport_stats_error
; /* Cached error code from vport_get_stats().
505 0 or an errno value. */
506 int netdev_mtu_error
; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
507 int ether_addr_error
; /* Cached error code from set/get etheraddr. */
508 int netdev_policing_error
; /* Cached error code from set policing. */
509 int get_features_error
; /* Cached error code from ETHTOOL_GSET. */
510 int get_ifindex_error
; /* Cached error code from SIOCGIFINDEX. */
512 enum netdev_features current
; /* Cached from ETHTOOL_GSET. */
513 enum netdev_features advertised
; /* Cached from ETHTOOL_GSET. */
514 enum netdev_features supported
; /* Cached from ETHTOOL_GSET. */
516 struct ethtool_drvinfo drvinfo
; /* Cached from ETHTOOL_GDRVINFO. */
519 /* For devices of class netdev_tap_class only. */
521 bool present
; /* If the device is present in the namespace */
522 uint64_t tx_dropped
; /* tap device can drop if the iface is down */
524 /* LAG information. */
525 bool is_lag_master
; /* True if the netdev is a LAG master. */
528 struct netdev_rxq_linux
{
529 struct netdev_rxq up
;
534 /* This is set pretty low because we probably won't learn anything from the
535 * additional log messages. */
536 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(5, 20);
538 /* Polling miimon status for all ports causes performance degradation when
539 * handling a large number of ports. If there are no devices using miimon, then
540 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
542 * Readers do not depend on this variable synchronizing with the related
543 * changes in the device miimon status, so we can use atomic_count. */
544 static atomic_count miimon_cnt
= ATOMIC_COUNT_INIT(0);
546 static void netdev_linux_run(const struct netdev_class
*);
548 static int netdev_linux_do_ethtool(const char *name
, struct ethtool_cmd
*,
549 int cmd
, const char *cmd_name
);
550 static int get_flags(const struct netdev
*, unsigned int *flags
);
551 static int set_flags(const char *, unsigned int flags
);
552 static int update_flags(struct netdev_linux
*netdev
, enum netdev_flags off
,
553 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
554 OVS_REQUIRES(netdev
->mutex
);
555 static int get_ifindex(const struct netdev
*, int *ifindexp
);
556 static int do_set_addr(struct netdev
*netdev
,
557 int ioctl_nr
, const char *ioctl_name
,
558 struct in_addr addr
);
559 static int get_etheraddr(const char *netdev_name
, struct eth_addr
*ea
);
560 static int set_etheraddr(const char *netdev_name
, const struct eth_addr
);
561 static int get_stats_via_netlink(const struct netdev
*, struct netdev_stats
*);
562 static int af_packet_sock(void);
563 static bool netdev_linux_miimon_enabled(void);
564 static void netdev_linux_miimon_run(void);
565 static void netdev_linux_miimon_wait(void);
566 static int netdev_linux_get_mtu__(struct netdev_linux
*netdev
, int *mtup
);
569 is_netdev_linux_class(const struct netdev_class
*netdev_class
)
571 return netdev_class
->run
== netdev_linux_run
;
575 is_tap_netdev(const struct netdev
*netdev
)
577 return netdev_get_class(netdev
) == &netdev_tap_class
;
580 static struct netdev_linux
*
581 netdev_linux_cast(const struct netdev
*netdev
)
583 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev
)));
585 return CONTAINER_OF(netdev
, struct netdev_linux
, up
);
588 static struct netdev_rxq_linux
*
589 netdev_rxq_linux_cast(const struct netdev_rxq
*rx
)
591 ovs_assert(is_netdev_linux_class(netdev_get_class(rx
->netdev
)));
592 return CONTAINER_OF(rx
, struct netdev_rxq_linux
, up
);
596 netdev_linux_netnsid_update__(struct netdev_linux
*netdev
)
598 struct dpif_netlink_vport reply
;
602 error
= dpif_netlink_vport_get(netdev_get_name(&netdev
->up
), &reply
, &buf
);
604 if (error
== ENOENT
) {
605 /* Assume it is local if there is no API (e.g. if the openvswitch
606 * kernel module is not loaded). */
607 netnsid_set_local(&netdev
->netnsid
);
609 netnsid_unset(&netdev
->netnsid
);
614 netnsid_set(&netdev
->netnsid
, reply
.netnsid
);
620 netdev_linux_netnsid_update(struct netdev_linux
*netdev
)
622 if (netnsid_is_unset(netdev
->netnsid
)) {
623 if (netdev_get_class(&netdev
->up
) == &netdev_tap_class
) {
624 netnsid_set_local(&netdev
->netnsid
);
626 return netdev_linux_netnsid_update__(netdev
);
/* Returns true if 'netdev' lives in the network namespace identified by
 * 'nsid', refreshing the device's cached namespace ID first if it is not
 * yet known. */
static bool
netdev_linux_netnsid_is_eq(struct netdev_linux *netdev, int nsid)
{
    netdev_linux_netnsid_update(netdev);
    return netnsid_eq(netdev->netnsid, nsid);
}
/* Returns true if 'netdev' lives in a network namespace other than the one
 * this process runs in, refreshing the device's cached namespace ID first
 * if needed. */
static bool
netdev_linux_netnsid_is_remote(struct netdev_linux *netdev)
{
    netdev_linux_netnsid_update(netdev);
    return netnsid_is_remote(netdev->netnsid);
}
647 static int netdev_linux_update_via_netlink(struct netdev_linux
*);
648 static void netdev_linux_update(struct netdev_linux
*netdev
, int,
649 const struct rtnetlink_change
*)
650 OVS_REQUIRES(netdev
->mutex
);
651 static void netdev_linux_changed(struct netdev_linux
*netdev
,
652 unsigned int ifi_flags
, unsigned int mask
)
653 OVS_REQUIRES(netdev
->mutex
);
655 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
656 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
657 * if no such socket could be created. */
658 static struct nl_sock
*
659 netdev_linux_notify_sock(void)
661 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
662 static struct nl_sock
*sock
;
663 unsigned int mcgroups
[] = {RTNLGRP_LINK
, RTNLGRP_IPV4_IFADDR
,
664 RTNLGRP_IPV6_IFADDR
, RTNLGRP_IPV6_IFINFO
};
666 if (ovsthread_once_start(&once
)) {
669 error
= nl_sock_create(NETLINK_ROUTE
, &sock
);
673 for (i
= 0; i
< ARRAY_SIZE(mcgroups
); i
++) {
674 error
= nl_sock_join_mcgroup(sock
, mcgroups
[i
]);
676 nl_sock_destroy(sock
);
682 nl_sock_listen_all_nsid(sock
, true);
683 ovsthread_once_done(&once
);
/* Returns true if at least one netdev currently uses miimon link monitoring
 * ('miimon_cnt' > 0), letting run/wait skip all miimon work otherwise.
 * Reads an atomic counter, so no lock is needed. */
static bool
netdev_linux_miimon_enabled(void)
{
    return atomic_count_get(&miimon_cnt) > 0;
}
/* Returns true if 'kind' names a Linux link-aggregation (LAG) device type,
 * i.e. "bond" or "team"; false for any other kind string.
 *
 * As visible, the function fell off the end without returning a value when
 * the condition held (undefined behavior for a non-void function); the
 * return is made explicit here. */
static bool
netdev_linux_kind_is_lag(const char *kind)
{
    return !strcmp(kind, "bond") || !strcmp(kind, "team");
}
706 netdev_linux_update_lag(struct rtnetlink_change
*change
)
707 OVS_REQUIRES(lag_mutex
)
709 struct linux_lag_slave
*lag
;
711 if (!rtnetlink_type_is_rtnlgrp_link(change
->nlmsg_type
)) {
715 if (change
->slave
&& netdev_linux_kind_is_lag(change
->slave
)) {
716 lag
= shash_find_data(&lag_shash
, change
->ifname
);
719 struct netdev
*master_netdev
;
720 char master_name
[IFNAMSIZ
];
724 if_indextoname(change
->master_ifindex
, master_name
);
725 master_netdev
= netdev_from_name(master_name
);
726 if (!master_netdev
) {
730 if (is_netdev_linux_class(master_netdev
->netdev_class
)) {
731 block_id
= netdev_get_block_id(master_netdev
);
733 netdev_close(master_netdev
);
737 lag
= xmalloc(sizeof *lag
);
738 lag
->block_id
= block_id
;
739 lag
->node
= shash_add(&lag_shash
, change
->ifname
, lag
);
741 /* LAG master is linux netdev so add slave to same block. */
742 error
= tc_add_del_ingress_qdisc(change
->if_index
, true,
745 VLOG_WARN("failed to bind LAG slave to master's block");
746 shash_delete(&lag_shash
, lag
->node
);
751 netdev_close(master_netdev
);
753 } else if (change
->master_ifindex
== 0) {
754 /* Check if this was a lag slave that has been freed. */
755 lag
= shash_find_data(&lag_shash
, change
->ifname
);
758 tc_add_del_ingress_qdisc(change
->if_index
, false,
760 shash_delete(&lag_shash
, lag
->node
);
767 netdev_linux_run(const struct netdev_class
*netdev_class OVS_UNUSED
)
769 struct nl_sock
*sock
;
772 if (netdev_linux_miimon_enabled()) {
773 netdev_linux_miimon_run();
776 sock
= netdev_linux_notify_sock();
782 uint64_t buf_stub
[4096 / 8];
786 ofpbuf_use_stub(&buf
, buf_stub
, sizeof buf_stub
);
787 error
= nl_sock_recv(sock
, &buf
, &nsid
, false);
789 struct rtnetlink_change change
;
791 if (rtnetlink_parse(&buf
, &change
)) {
792 struct netdev
*netdev_
= NULL
;
793 char dev_name
[IFNAMSIZ
];
795 if (!change
.ifname
) {
796 change
.ifname
= if_indextoname(change
.if_index
, dev_name
);
800 netdev_
= netdev_from_name(change
.ifname
);
802 if (netdev_
&& is_netdev_linux_class(netdev_
->netdev_class
)) {
803 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
805 ovs_mutex_lock(&netdev
->mutex
);
806 netdev_linux_update(netdev
, nsid
, &change
);
807 ovs_mutex_unlock(&netdev
->mutex
);
809 else if (!netdev_
&& change
.ifname
) {
810 /* Netdev is not present in OvS but its master could be. */
811 ovs_mutex_lock(&lag_mutex
);
812 netdev_linux_update_lag(&change
);
813 ovs_mutex_unlock(&lag_mutex
);
815 netdev_close(netdev_
);
817 } else if (error
== ENOBUFS
) {
818 struct shash device_shash
;
819 struct shash_node
*node
;
823 shash_init(&device_shash
);
824 netdev_get_devices(&netdev_linux_class
, &device_shash
);
825 SHASH_FOR_EACH (node
, &device_shash
) {
826 struct netdev
*netdev_
= node
->data
;
827 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
830 ovs_mutex_lock(&netdev
->mutex
);
831 get_flags(netdev_
, &flags
);
832 netdev_linux_changed(netdev
, flags
, 0);
833 ovs_mutex_unlock(&netdev
->mutex
);
835 netdev_close(netdev_
);
837 shash_destroy(&device_shash
);
838 } else if (error
!= EAGAIN
) {
839 static struct vlog_rate_limit rll
= VLOG_RATE_LIMIT_INIT(1, 5);
840 VLOG_WARN_RL(&rll
, "error reading or parsing netlink (%s)",
841 ovs_strerror(error
));
848 netdev_linux_wait(const struct netdev_class
*netdev_class OVS_UNUSED
)
850 struct nl_sock
*sock
;
852 if (netdev_linux_miimon_enabled()) {
853 netdev_linux_miimon_wait();
855 sock
= netdev_linux_notify_sock();
857 nl_sock_wait(sock
, POLLIN
);
/* Records that 'dev' has changed: bumps the netdev change sequence, tracks
 * carrier resets, stores the new interface flags, and invalidates cached
 * state.
 *
 * 'mask' is the set of VALID_* bits in 'cache_valid' that remain valid;
 * everything else is marked stale and will be re-fetched on demand.
 * Caller must hold 'dev->mutex'. */
static void
netdev_linux_changed(struct netdev_linux *dev,
                     unsigned int ifi_flags, unsigned int mask)
    OVS_REQUIRES(dev->mutex)
{
    netdev_change_seq_changed(&dev->up);

    /* A toggle of IFF_RUNNING means the link went down and/or came up. */
    if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
        dev->carrier_resets++;
    }
    dev->ifi_flags = ifi_flags;

    dev->cache_valid &= mask;
    /* If the cached IP addresses were invalidated, flush the global
     * address list as well. */
    if (!(mask & VALID_IN)) {
        netdev_get_addrs_list_flush();
    }
}
880 netdev_linux_update__(struct netdev_linux
*dev
,
881 const struct rtnetlink_change
*change
)
882 OVS_REQUIRES(dev
->mutex
)
884 if (rtnetlink_type_is_rtnlgrp_link(change
->nlmsg_type
)) {
885 if (change
->nlmsg_type
== RTM_NEWLINK
) {
886 /* Keep drv-info, and ip addresses. */
887 netdev_linux_changed(dev
, change
->ifi_flags
,
888 VALID_DRVINFO
| VALID_IN
);
890 /* Update netdev from rtnl-change msg. */
892 dev
->mtu
= change
->mtu
;
893 dev
->cache_valid
|= VALID_MTU
;
894 dev
->netdev_mtu_error
= 0;
897 if (!eth_addr_is_zero(change
->mac
)) {
898 dev
->etheraddr
= change
->mac
;
899 dev
->cache_valid
|= VALID_ETHERADDR
;
900 dev
->ether_addr_error
= 0;
902 /* The mac addr has been changed, report it now. */
903 rtnetlink_report_link();
906 if (change
->master
&& netdev_linux_kind_is_lag(change
->master
)) {
907 dev
->is_lag_master
= true;
910 dev
->ifindex
= change
->if_index
;
911 dev
->cache_valid
|= VALID_IFINDEX
;
912 dev
->get_ifindex_error
= 0;
916 netdev_linux_changed(dev
, change
->ifi_flags
, 0);
917 dev
->present
= false;
918 netnsid_unset(&dev
->netnsid
);
920 } else if (rtnetlink_type_is_rtnlgrp_addr(change
->nlmsg_type
)) {
921 /* Invalidates in4, in6. */
922 netdev_linux_changed(dev
, dev
->ifi_flags
, ~VALID_IN
);
/* Applies rtnetlink 'change' to 'dev', but only when the notification
 * originated in the same network namespace as the device ('nsid');
 * notifications from other namespaces are ignored.
 * Caller must hold 'dev->mutex'. */
static void
netdev_linux_update(struct netdev_linux *dev, int nsid,
                    const struct rtnetlink_change *change)
    OVS_REQUIRES(dev->mutex)
{
    if (netdev_linux_netnsid_is_eq(dev, nsid)) {
        netdev_linux_update__(dev, change);
    }
}
938 static struct netdev
*
939 netdev_linux_alloc(void)
941 struct netdev_linux
*netdev
= xzalloc(sizeof *netdev
);
946 netdev_linux_common_construct(struct netdev
*netdev_
)
948 /* Prevent any attempt to create (or open) a network device named "default"
949 * or "all". These device names are effectively reserved on Linux because
950 * /proc/sys/net/ipv4/conf/ always contains directories by these names. By
951 * itself this wouldn't call for any special treatment, but in practice if
952 * a program tries to create devices with these names, it causes the kernel
953 * to fire a "new device" notification event even though creation failed,
954 * and in turn that causes OVS to wake up and try to create them again,
955 * which ends up as a 100% CPU loop. */
956 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
957 const char *name
= netdev_
->name
;
958 if (!strcmp(name
, "default") || !strcmp(name
, "all")) {
959 static struct vlog_rate_limit rll
= VLOG_RATE_LIMIT_INIT(1, 1);
960 VLOG_WARN_RL(&rll
, "%s: Linux forbids network device with this name",
965 /* The device could be in the same network namespace or in another one. */
966 netnsid_unset(&netdev
->netnsid
);
967 ovs_mutex_init(&netdev
->mutex
);
971 /* Creates system and internal devices. */
973 netdev_linux_construct(struct netdev
*netdev_
)
975 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
976 int error
= netdev_linux_common_construct(netdev_
);
981 error
= get_flags(&netdev
->up
, &netdev
->ifi_flags
);
982 if (error
== ENODEV
) {
983 if (netdev
->up
.netdev_class
!= &netdev_internal_class
) {
984 /* The device does not exist, so don't allow it to be opened. */
987 /* "Internal" netdevs have to be created as netdev objects before
988 * they exist in the kernel, because creating them in the kernel
989 * happens by passing a netdev object to dpif_port_add().
990 * Therefore, ignore the error. */
997 /* For most types of netdevs we open the device for each call of
998 * netdev_open(). However, this is not the case with tap devices,
999 * since it is only possible to open the device once. In this
1000 * situation we share a single file descriptor, and consequently
1001 * buffers, across all readers. Therefore once data is read it will
1002 * be unavailable to other reads for tap devices. */
1004 netdev_linux_construct_tap(struct netdev
*netdev_
)
1006 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1007 static const char tap_dev
[] = "/dev/net/tun";
1008 const char *name
= netdev_
->name
;
1011 int error
= netdev_linux_common_construct(netdev_
);
1016 /* Open tap device. */
1017 netdev
->tap_fd
= open(tap_dev
, O_RDWR
);
1018 if (netdev
->tap_fd
< 0) {
1020 VLOG_WARN("opening \"%s\" failed: %s", tap_dev
, ovs_strerror(error
));
1024 /* Create tap device. */
1025 get_flags(&netdev
->up
, &netdev
->ifi_flags
);
1026 ifr
.ifr_flags
= IFF_TAP
| IFF_NO_PI
;
1027 ovs_strzcpy(ifr
.ifr_name
, name
, sizeof ifr
.ifr_name
);
1028 if (ioctl(netdev
->tap_fd
, TUNSETIFF
, &ifr
) == -1) {
1029 VLOG_WARN("%s: creating tap device failed: %s", name
,
1030 ovs_strerror(errno
));
1035 /* Make non-blocking. */
1036 error
= set_nonblocking(netdev
->tap_fd
);
1041 if (ioctl(netdev
->tap_fd
, TUNSETPERSIST
, 1)) {
1042 VLOG_WARN("%s: creating tap device failed (persist): %s", name
,
1043 ovs_strerror(errno
));
1048 netdev
->present
= true;
1052 close(netdev
->tap_fd
);
1057 netdev_linux_destruct(struct netdev
*netdev_
)
1059 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1061 if (netdev
->tc
&& netdev
->tc
->ops
->tc_destroy
) {
1062 netdev
->tc
->ops
->tc_destroy(netdev
->tc
);
1065 if (netdev_get_class(netdev_
) == &netdev_tap_class
1066 && netdev
->tap_fd
>= 0)
1068 ioctl(netdev
->tap_fd
, TUNSETPERSIST
, 0);
1069 close(netdev
->tap_fd
);
1072 if (netdev
->miimon_interval
> 0) {
1073 atomic_count_dec(&miimon_cnt
);
1076 ovs_mutex_destroy(&netdev
->mutex
);
/* netdev_class 'dealloc' callback: frees the 'struct netdev_linux' that was
 * allocated by netdev_linux_alloc().  Counterpart of that function.
 *
 * As visible, the downcast pointer was computed but never freed, leaking the
 * structure on every device close; the free() is made explicit. */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    free(netdev);
}
1086 static struct netdev_rxq
*
1087 netdev_linux_rxq_alloc(void)
1089 struct netdev_rxq_linux
*rx
= xzalloc(sizeof *rx
);
1094 netdev_linux_rxq_construct(struct netdev_rxq
*rxq_
)
1096 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1097 struct netdev
*netdev_
= rx
->up
.netdev
;
1098 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1101 ovs_mutex_lock(&netdev
->mutex
);
1102 rx
->is_tap
= is_tap_netdev(netdev_
);
1104 rx
->fd
= netdev
->tap_fd
;
1106 struct sockaddr_ll sll
;
1108 /* Result of tcpdump -dd inbound */
1109 static const struct sock_filter filt
[] = {
1110 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
1111 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
1112 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
1113 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
1115 static const struct sock_fprog fprog
= {
1116 ARRAY_SIZE(filt
), (struct sock_filter
*) filt
1119 /* Create file descriptor. */
1120 rx
->fd
= socket(PF_PACKET
, SOCK_RAW
, 0);
1123 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error
));
1128 if (setsockopt(rx
->fd
, SOL_PACKET
, PACKET_AUXDATA
, &val
, sizeof val
)) {
1130 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
1131 netdev_get_name(netdev_
), ovs_strerror(error
));
1135 /* Set non-blocking mode. */
1136 error
= set_nonblocking(rx
->fd
);
1141 /* Get ethernet device index. */
1142 error
= get_ifindex(&netdev
->up
, &ifindex
);
1147 /* Bind to specific ethernet device. */
1148 memset(&sll
, 0, sizeof sll
);
1149 sll
.sll_family
= AF_PACKET
;
1150 sll
.sll_ifindex
= ifindex
;
1151 sll
.sll_protocol
= htons(ETH_P_ALL
);
1152 if (bind(rx
->fd
, (struct sockaddr
*) &sll
, sizeof sll
) < 0) {
1154 VLOG_ERR("%s: failed to bind raw socket (%s)",
1155 netdev_get_name(netdev_
), ovs_strerror(error
));
1159 /* Filter for only inbound packets. */
1160 error
= setsockopt(rx
->fd
, SOL_SOCKET
, SO_ATTACH_FILTER
, &fprog
,
1164 VLOG_ERR("%s: failed to attach filter (%s)",
1165 netdev_get_name(netdev_
), ovs_strerror(error
));
1169 ovs_mutex_unlock(&netdev
->mutex
);
1177 ovs_mutex_unlock(&netdev
->mutex
);
1182 netdev_linux_rxq_destruct(struct netdev_rxq
*rxq_
)
1184 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    /* Counterpart of netdev_linux_rxq_alloc(). */
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);

    free(rx);
}
1200 auxdata_to_vlan_tpid(const struct tpacket_auxdata
*aux
, bool double_tagged
)
1202 if (aux
->tp_status
& TP_STATUS_VLAN_TPID_VALID
) {
1203 return htons(aux
->tp_vlan_tpid
);
1204 } else if (double_tagged
) {
1205 return htons(ETH_TYPE_VLAN_8021AD
);
1207 return htons(ETH_TYPE_VLAN_8021Q
);
/* Returns true if 'aux' carries a VLAN TCI that should be re-inserted into
 * the packet.  A nonzero TCI is accepted even without the status bit because
 * older kernels do not set TP_STATUS_VLAN_VALID. */
static bool
auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
{
    return aux->tp_vlan_tci || (aux->tp_status & TP_STATUS_VLAN_VALID);
}
1218 netdev_linux_rxq_recv_sock(int fd
, struct dp_packet
*buffer
)
1223 struct cmsghdr
*cmsg
;
1225 struct cmsghdr cmsg
;
1226 char buffer
[CMSG_SPACE(sizeof(struct tpacket_auxdata
))];
1230 /* Reserve headroom for a single VLAN tag */
1231 dp_packet_reserve(buffer
, VLAN_HEADER_LEN
);
1232 size
= dp_packet_tailroom(buffer
);
1234 iov
.iov_base
= dp_packet_data(buffer
);
1236 msgh
.msg_name
= NULL
;
1237 msgh
.msg_namelen
= 0;
1238 msgh
.msg_iov
= &iov
;
1239 msgh
.msg_iovlen
= 1;
1240 msgh
.msg_control
= &cmsg_buffer
;
1241 msgh
.msg_controllen
= sizeof cmsg_buffer
;
1245 retval
= recvmsg(fd
, &msgh
, MSG_TRUNC
);
1246 } while (retval
< 0 && errno
== EINTR
);
1250 } else if (retval
> size
) {
1254 dp_packet_set_size(buffer
, dp_packet_size(buffer
) + retval
);
1256 for (cmsg
= CMSG_FIRSTHDR(&msgh
); cmsg
; cmsg
= CMSG_NXTHDR(&msgh
, cmsg
)) {
1257 const struct tpacket_auxdata
*aux
;
1259 if (cmsg
->cmsg_level
!= SOL_PACKET
1260 || cmsg
->cmsg_type
!= PACKET_AUXDATA
1261 || cmsg
->cmsg_len
< CMSG_LEN(sizeof(struct tpacket_auxdata
))) {
1265 aux
= ALIGNED_CAST(struct tpacket_auxdata
*, CMSG_DATA(cmsg
));
1266 if (auxdata_has_vlan_tci(aux
)) {
1267 struct eth_header
*eth
;
1270 if (retval
< ETH_HEADER_LEN
) {
1274 eth
= dp_packet_data(buffer
);
1275 double_tagged
= eth
->eth_type
== htons(ETH_TYPE_VLAN_8021Q
);
1277 eth_push_vlan(buffer
, auxdata_to_vlan_tpid(aux
, double_tagged
),
1278 htons(aux
->tp_vlan_tci
));
1287 netdev_linux_rxq_recv_tap(int fd
, struct dp_packet
*buffer
)
1290 size_t size
= dp_packet_tailroom(buffer
);
1293 retval
= read(fd
, dp_packet_data(buffer
), size
);
1294 } while (retval
< 0 && errno
== EINTR
);
1300 dp_packet_set_size(buffer
, dp_packet_size(buffer
) + retval
);
1305 netdev_linux_rxq_recv(struct netdev_rxq
*rxq_
, struct dp_packet_batch
*batch
,
1308 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1309 struct netdev
*netdev
= rx
->up
.netdev
;
1310 struct dp_packet
*buffer
;
1314 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
)) {
1315 mtu
= ETH_PAYLOAD_MAX
;
1318 /* Assume Ethernet port. No need to set packet_type. */
1319 buffer
= dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN
+ mtu
,
1320 DP_NETDEV_HEADROOM
);
1321 retval
= (rx
->is_tap
1322 ? netdev_linux_rxq_recv_tap(rx
->fd
, buffer
)
1323 : netdev_linux_rxq_recv_sock(rx
->fd
, buffer
));
1326 if (retval
!= EAGAIN
&& retval
!= EMSGSIZE
) {
1327 VLOG_WARN_RL(&rl
, "error receiving Ethernet packet on %s: %s",
1328 netdev_rxq_get_name(rxq_
), ovs_strerror(errno
));
1330 dp_packet_delete(buffer
);
1332 dp_packet_batch_init_packet(batch
, buffer
);
1343 netdev_linux_rxq_wait(struct netdev_rxq
*rxq_
)
1345 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1346 poll_fd_wait(rx
->fd
, POLLIN
);
1350 netdev_linux_rxq_drain(struct netdev_rxq
*rxq_
)
1352 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1355 int error
= af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_
), &ifr
,
1356 SIOCGIFTXQLEN
, "SIOCGIFTXQLEN");
1360 drain_fd(rx
->fd
, ifr
.ifr_qlen
);
1363 return drain_rcvbuf(rx
->fd
);
1368 netdev_linux_sock_batch_send(int sock
, int ifindex
,
1369 struct dp_packet_batch
*batch
)
1371 const size_t size
= dp_packet_batch_size(batch
);
1372 /* We don't bother setting most fields in sockaddr_ll because the
1373 * kernel ignores them for SOCK_RAW. */
1374 struct sockaddr_ll sll
= { .sll_family
= AF_PACKET
,
1375 .sll_ifindex
= ifindex
};
1377 struct mmsghdr
*mmsg
= xmalloc(sizeof(*mmsg
) * size
);
1378 struct iovec
*iov
= xmalloc(sizeof(*iov
) * size
);
1380 struct dp_packet
*packet
;
1381 DP_PACKET_BATCH_FOR_EACH (i
, packet
, batch
) {
1382 iov
[i
].iov_base
= dp_packet_data(packet
);
1383 iov
[i
].iov_len
= dp_packet_size(packet
);
1384 mmsg
[i
].msg_hdr
= (struct msghdr
) { .msg_name
= &sll
,
1385 .msg_namelen
= sizeof sll
,
1391 for (uint32_t ofs
= 0; ofs
< size
; ) {
1394 retval
= sendmmsg(sock
, mmsg
+ ofs
, size
- ofs
, 0);
1395 error
= retval
< 0 ? errno
: 0;
1396 } while (error
== EINTR
);
1408 /* Use the tap fd to send 'batch' to tap device 'netdev'. Using the tap fd is
1409 * essential, because packets sent to a tap device with an AF_PACKET socket
1410 * will loop back to be *received* again on the tap device. This doesn't occur
1411 * on other interface types because we attach a socket filter to the rx
1414 netdev_linux_tap_batch_send(struct netdev
*netdev_
,
1415 struct dp_packet_batch
*batch
)
1417 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1418 struct dp_packet
*packet
;
1420 /* The Linux tap driver returns EIO if the device is not up,
1421 * so if the device is not up, don't waste time sending it.
1422 * However, if the device is in another network namespace
1423 * then OVS can't retrieve the state. In that case, send the
1424 * packets anyway. */
1425 if (netdev
->present
&& !(netdev
->ifi_flags
& IFF_UP
)) {
1426 netdev
->tx_dropped
+= dp_packet_batch_size(batch
);
1430 DP_PACKET_BATCH_FOR_EACH (i
, packet
, batch
) {
1431 size_t size
= dp_packet_size(packet
);
1436 retval
= write(netdev
->tap_fd
, dp_packet_data(packet
), size
);
1437 error
= retval
< 0 ? errno
: 0;
1438 } while (error
== EINTR
);
1441 /* The Linux tap driver returns EIO if the device is not up. From
1442 * the OVS side this is not an error, so we ignore it; otherwise,
1443 * return the error. */
1447 } else if (retval
!= size
) {
1448 VLOG_WARN_RL(&rl
, "sent partial Ethernet packet (%"PRIuSIZE
" "
1449 "bytes of %"PRIuSIZE
") on %s",
1450 retval
, size
, netdev_get_name(netdev_
));
1457 /* Sends 'batch' on 'netdev'. Returns 0 if successful, otherwise a positive
1458 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1459 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1460 * the packet is too big or too small to transmit on the device.
1462 * The kernel maintains a packet transmission queue, so the caller is not
1463 * expected to do additional queuing of packets. */
1465 netdev_linux_send(struct netdev
*netdev_
, int qid OVS_UNUSED
,
1466 struct dp_packet_batch
*batch
,
1467 bool concurrent_txq OVS_UNUSED
)
1472 if (!is_tap_netdev(netdev_
)) {
1473 if (netdev_linux_netnsid_is_remote(netdev_linux_cast(netdev_
))) {
1478 sock
= af_packet_sock();
1484 int ifindex
= netdev_get_ifindex(netdev_
);
1490 error
= netdev_linux_sock_batch_send(sock
, ifindex
, batch
);
1492 error
= netdev_linux_tap_batch_send(netdev_
, batch
);
1495 if (error
== ENOBUFS
) {
1496 /* The Linux AF_PACKET implementation never blocks waiting
1497 * for room for packets, instead returning ENOBUFS.
1498 * Translate this into EAGAIN for the caller. */
1501 VLOG_WARN_RL(&rl
, "error sending Ethernet packet on %s: %s",
1502 netdev_get_name(netdev_
), ovs_strerror(error
));
1507 dp_packet_delete_batch(batch
, true);
1511 /* Registers with the poll loop to wake up from the next call to poll_block()
1512 * when the packet transmission queue has sufficient room to transmit a packet
1513 * with netdev_send().
1515 * The kernel maintains a packet transmission queue, so the client is not
1516 * expected to do additional queuing of packets. Thus, this function is
1517 * unlikely to ever be used. It is included for completeness. */
1519 netdev_linux_send_wait(struct netdev
*netdev
, int qid OVS_UNUSED
)
1521 if (is_tap_netdev(netdev
)) {
1522 /* TAP device always accepts packets.*/
1523 poll_immediate_wake();
1527 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1528 * otherwise a positive errno value. */
1530 netdev_linux_set_etheraddr(struct netdev
*netdev_
, const struct eth_addr mac
)
1532 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1533 enum netdev_flags old_flags
= 0;
1536 ovs_mutex_lock(&netdev
->mutex
);
1537 if (netdev_linux_netnsid_is_remote(netdev
)) {
1542 if (netdev
->cache_valid
& VALID_ETHERADDR
) {
1543 error
= netdev
->ether_addr_error
;
1544 if (error
|| eth_addr_equals(netdev
->etheraddr
, mac
)) {
1547 netdev
->cache_valid
&= ~VALID_ETHERADDR
;
1550 /* Tap devices must be brought down before setting the address. */
1551 if (is_tap_netdev(netdev_
)) {
1552 update_flags(netdev
, NETDEV_UP
, 0, &old_flags
);
1554 error
= set_etheraddr(netdev_get_name(netdev_
), mac
);
1555 if (!error
|| error
== ENODEV
) {
1556 netdev
->ether_addr_error
= error
;
1557 netdev
->cache_valid
|= VALID_ETHERADDR
;
1559 netdev
->etheraddr
= mac
;
1563 if (is_tap_netdev(netdev_
) && old_flags
& NETDEV_UP
) {
1564 update_flags(netdev
, 0, NETDEV_UP
, &old_flags
);
1568 ovs_mutex_unlock(&netdev
->mutex
);
1572 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1574 netdev_linux_get_etheraddr(const struct netdev
*netdev_
, struct eth_addr
*mac
)
1576 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1579 ovs_mutex_lock(&netdev
->mutex
);
1580 if (!(netdev
->cache_valid
& VALID_ETHERADDR
)) {
1581 netdev_linux_update_via_netlink(netdev
);
1584 if (!(netdev
->cache_valid
& VALID_ETHERADDR
)) {
1585 /* Fall back to ioctl if netlink fails */
1586 netdev
->ether_addr_error
= get_etheraddr(netdev_get_name(netdev_
),
1587 &netdev
->etheraddr
);
1588 netdev
->cache_valid
|= VALID_ETHERADDR
;
1591 error
= netdev
->ether_addr_error
;
1593 *mac
= netdev
->etheraddr
;
1595 ovs_mutex_unlock(&netdev
->mutex
);
1601 netdev_linux_get_mtu__(struct netdev_linux
*netdev
, int *mtup
)
1605 if (!(netdev
->cache_valid
& VALID_MTU
)) {
1606 netdev_linux_update_via_netlink(netdev
);
1609 if (!(netdev
->cache_valid
& VALID_MTU
)) {
1610 /* Fall back to ioctl if netlink fails */
1613 netdev
->netdev_mtu_error
= af_inet_ifreq_ioctl(
1614 netdev_get_name(&netdev
->up
), &ifr
, SIOCGIFMTU
, "SIOCGIFMTU");
1615 netdev
->mtu
= ifr
.ifr_mtu
;
1616 netdev
->cache_valid
|= VALID_MTU
;
1619 error
= netdev
->netdev_mtu_error
;
1621 *mtup
= netdev
->mtu
;
1627 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1628 * in bytes, not including the hardware header; thus, this is typically 1500
1629 * bytes for Ethernet devices. */
1631 netdev_linux_get_mtu(const struct netdev
*netdev_
, int *mtup
)
1633 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1636 ovs_mutex_lock(&netdev
->mutex
);
1637 error
= netdev_linux_get_mtu__(netdev
, mtup
);
1638 ovs_mutex_unlock(&netdev
->mutex
);
1643 /* Sets the maximum size of transmitted (MTU) for given device using linux
1644 * networking ioctl interface.
1647 netdev_linux_set_mtu(struct netdev
*netdev_
, int mtu
)
1649 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1653 ovs_mutex_lock(&netdev
->mutex
);
1654 if (netdev_linux_netnsid_is_remote(netdev
)) {
1659 if (netdev
->cache_valid
& VALID_MTU
) {
1660 error
= netdev
->netdev_mtu_error
;
1661 if (error
|| netdev
->mtu
== mtu
) {
1664 netdev
->cache_valid
&= ~VALID_MTU
;
1667 error
= af_inet_ifreq_ioctl(netdev_get_name(netdev_
), &ifr
,
1668 SIOCSIFMTU
, "SIOCSIFMTU");
1669 if (!error
|| error
== ENODEV
) {
1670 netdev
->netdev_mtu_error
= error
;
1671 netdev
->mtu
= ifr
.ifr_mtu
;
1672 netdev
->cache_valid
|= VALID_MTU
;
1675 ovs_mutex_unlock(&netdev
->mutex
);
1679 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1680 * On failure, returns a negative errno value. */
1682 netdev_linux_get_ifindex(const struct netdev
*netdev_
)
1684 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1687 ovs_mutex_lock(&netdev
->mutex
);
1688 if (netdev_linux_netnsid_is_remote(netdev
)) {
1692 error
= get_ifindex(netdev_
, &ifindex
);
1695 ovs_mutex_unlock(&netdev
->mutex
);
1696 return error
? -error
: ifindex
;
1700 netdev_linux_get_carrier(const struct netdev
*netdev_
, bool *carrier
)
1702 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1704 ovs_mutex_lock(&netdev
->mutex
);
1705 if (netdev
->miimon_interval
> 0) {
1706 *carrier
= netdev
->miimon
;
1708 *carrier
= (netdev
->ifi_flags
& IFF_RUNNING
) != 0;
1710 ovs_mutex_unlock(&netdev
->mutex
);
1715 static long long int
1716 netdev_linux_get_carrier_resets(const struct netdev
*netdev_
)
1718 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1719 long long int carrier_resets
;
1721 ovs_mutex_lock(&netdev
->mutex
);
1722 carrier_resets
= netdev
->carrier_resets
;
1723 ovs_mutex_unlock(&netdev
->mutex
);
1725 return carrier_resets
;
1729 netdev_linux_do_miimon(const char *name
, int cmd
, const char *cmd_name
,
1730 struct mii_ioctl_data
*data
)
1735 memset(&ifr
, 0, sizeof ifr
);
1736 memcpy(&ifr
.ifr_data
, data
, sizeof *data
);
1737 error
= af_inet_ifreq_ioctl(name
, &ifr
, cmd
, cmd_name
);
1738 memcpy(data
, &ifr
.ifr_data
, sizeof *data
);
1744 netdev_linux_get_miimon(const char *name
, bool *miimon
)
1746 struct mii_ioctl_data data
;
1751 memset(&data
, 0, sizeof data
);
1752 error
= netdev_linux_do_miimon(name
, SIOCGMIIPHY
, "SIOCGMIIPHY", &data
);
1754 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1755 data
.reg_num
= MII_BMSR
;
1756 error
= netdev_linux_do_miimon(name
, SIOCGMIIREG
, "SIOCGMIIREG",
1760 *miimon
= !!(data
.val_out
& BMSR_LSTATUS
);
1764 struct ethtool_cmd ecmd
;
1766 VLOG_DBG_RL(&rl
, "%s: failed to query MII, falling back to ethtool",
1769 COVERAGE_INC(netdev_get_ethtool
);
1770 memset(&ecmd
, 0, sizeof ecmd
);
1771 error
= netdev_linux_do_ethtool(name
, &ecmd
, ETHTOOL_GLINK
,
1774 struct ethtool_value eval
;
1776 memcpy(&eval
, &ecmd
, sizeof eval
);
1777 *miimon
= !!eval
.data
;
1779 VLOG_WARN_RL(&rl
, "%s: ethtool link status failed", name
);
1787 netdev_linux_set_miimon_interval(struct netdev
*netdev_
,
1788 long long int interval
)
1790 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1792 ovs_mutex_lock(&netdev
->mutex
);
1793 interval
= interval
> 0 ? MAX(interval
, 100) : 0;
1794 if (netdev
->miimon_interval
!= interval
) {
1795 if (interval
&& !netdev
->miimon_interval
) {
1796 atomic_count_inc(&miimon_cnt
);
1797 } else if (!interval
&& netdev
->miimon_interval
) {
1798 atomic_count_dec(&miimon_cnt
);
1801 netdev
->miimon_interval
= interval
;
1802 timer_set_expired(&netdev
->miimon_timer
);
1804 ovs_mutex_unlock(&netdev
->mutex
);
1810 netdev_linux_miimon_run(void)
1812 struct shash device_shash
;
1813 struct shash_node
*node
;
1815 shash_init(&device_shash
);
1816 netdev_get_devices(&netdev_linux_class
, &device_shash
);
1817 SHASH_FOR_EACH (node
, &device_shash
) {
1818 struct netdev
*netdev
= node
->data
;
1819 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
1822 ovs_mutex_lock(&dev
->mutex
);
1823 if (dev
->miimon_interval
> 0 && timer_expired(&dev
->miimon_timer
)) {
1824 netdev_linux_get_miimon(dev
->up
.name
, &miimon
);
1825 if (miimon
!= dev
->miimon
) {
1826 dev
->miimon
= miimon
;
1827 netdev_linux_changed(dev
, dev
->ifi_flags
, 0);
1830 timer_set_duration(&dev
->miimon_timer
, dev
->miimon_interval
);
1832 ovs_mutex_unlock(&dev
->mutex
);
1833 netdev_close(netdev
);
1836 shash_destroy(&device_shash
);
1840 netdev_linux_miimon_wait(void)
1842 struct shash device_shash
;
1843 struct shash_node
*node
;
1845 shash_init(&device_shash
);
1846 netdev_get_devices(&netdev_linux_class
, &device_shash
);
1847 SHASH_FOR_EACH (node
, &device_shash
) {
1848 struct netdev
*netdev
= node
->data
;
1849 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
1851 ovs_mutex_lock(&dev
->mutex
);
1852 if (dev
->miimon_interval
> 0) {
1853 timer_wait(&dev
->miimon_timer
);
1855 ovs_mutex_unlock(&dev
->mutex
);
1856 netdev_close(netdev
);
1858 shash_destroy(&device_shash
);
/* Exchanges the values pointed to by 'a' and 'b'.  Body reconstructed: the
 * original text was garbled here, but the callers (e.g. swapping rx/tx
 * counters in netdev_tap_get_stats()) fix the semantics unambiguously. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t tmp = *a;
    *a = *b;
    *b = tmp;
}
1869 /* Copies 'src' into 'dst', performing format conversion in the process.
1871 * 'src' is allowed to be misaligned. */
1873 netdev_stats_from_ovs_vport_stats(struct netdev_stats
*dst
,
1874 const struct ovs_vport_stats
*src
)
1876 dst
->rx_packets
= get_32aligned_u64(&src
->rx_packets
);
1877 dst
->tx_packets
= get_32aligned_u64(&src
->tx_packets
);
1878 dst
->rx_bytes
= get_32aligned_u64(&src
->rx_bytes
);
1879 dst
->tx_bytes
= get_32aligned_u64(&src
->tx_bytes
);
1880 dst
->rx_errors
= get_32aligned_u64(&src
->rx_errors
);
1881 dst
->tx_errors
= get_32aligned_u64(&src
->tx_errors
);
1882 dst
->rx_dropped
= get_32aligned_u64(&src
->rx_dropped
);
1883 dst
->tx_dropped
= get_32aligned_u64(&src
->tx_dropped
);
1885 dst
->collisions
= 0;
1886 dst
->rx_length_errors
= 0;
1887 dst
->rx_over_errors
= 0;
1888 dst
->rx_crc_errors
= 0;
1889 dst
->rx_frame_errors
= 0;
1890 dst
->rx_fifo_errors
= 0;
1891 dst
->rx_missed_errors
= 0;
1892 dst
->tx_aborted_errors
= 0;
1893 dst
->tx_carrier_errors
= 0;
1894 dst
->tx_fifo_errors
= 0;
1895 dst
->tx_heartbeat_errors
= 0;
1896 dst
->tx_window_errors
= 0;
1900 get_stats_via_vport__(const struct netdev
*netdev
, struct netdev_stats
*stats
)
1902 struct dpif_netlink_vport reply
;
1906 error
= dpif_netlink_vport_get(netdev_get_name(netdev
), &reply
, &buf
);
1909 } else if (!reply
.stats
) {
1914 netdev_stats_from_ovs_vport_stats(stats
, reply
.stats
);
1922 get_stats_via_vport(const struct netdev
*netdev_
,
1923 struct netdev_stats
*stats
)
1925 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1927 if (!netdev
->vport_stats_error
||
1928 !(netdev
->cache_valid
& VALID_VPORT_STAT_ERROR
)) {
1931 error
= get_stats_via_vport__(netdev_
, stats
);
1932 if (error
&& error
!= ENOENT
&& error
!= ENODEV
) {
1933 VLOG_WARN_RL(&rl
, "%s: obtaining netdev stats via vport failed "
1935 netdev_get_name(netdev_
), ovs_strerror(error
));
1937 netdev
->vport_stats_error
= error
;
1938 netdev
->cache_valid
|= VALID_VPORT_STAT_ERROR
;
1942 /* Retrieves current device stats for 'netdev-linux'. */
1944 netdev_linux_get_stats(const struct netdev
*netdev_
,
1945 struct netdev_stats
*stats
)
1947 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1948 struct netdev_stats dev_stats
;
1951 ovs_mutex_lock(&netdev
->mutex
);
1952 get_stats_via_vport(netdev_
, stats
);
1953 error
= get_stats_via_netlink(netdev_
, &dev_stats
);
1955 if (!netdev
->vport_stats_error
) {
1958 } else if (netdev
->vport_stats_error
) {
1959 /* stats not available from OVS then use netdev stats. */
1962 /* Use kernel netdev's packet and byte counts since vport's counters
1963 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1965 stats
->rx_packets
= dev_stats
.rx_packets
;
1966 stats
->rx_bytes
= dev_stats
.rx_bytes
;
1967 stats
->tx_packets
= dev_stats
.tx_packets
;
1968 stats
->tx_bytes
= dev_stats
.tx_bytes
;
1970 stats
->rx_errors
+= dev_stats
.rx_errors
;
1971 stats
->tx_errors
+= dev_stats
.tx_errors
;
1972 stats
->rx_dropped
+= dev_stats
.rx_dropped
;
1973 stats
->tx_dropped
+= dev_stats
.tx_dropped
;
1974 stats
->multicast
+= dev_stats
.multicast
;
1975 stats
->collisions
+= dev_stats
.collisions
;
1976 stats
->rx_length_errors
+= dev_stats
.rx_length_errors
;
1977 stats
->rx_over_errors
+= dev_stats
.rx_over_errors
;
1978 stats
->rx_crc_errors
+= dev_stats
.rx_crc_errors
;
1979 stats
->rx_frame_errors
+= dev_stats
.rx_frame_errors
;
1980 stats
->rx_fifo_errors
+= dev_stats
.rx_fifo_errors
;
1981 stats
->rx_missed_errors
+= dev_stats
.rx_missed_errors
;
1982 stats
->tx_aborted_errors
+= dev_stats
.tx_aborted_errors
;
1983 stats
->tx_carrier_errors
+= dev_stats
.tx_carrier_errors
;
1984 stats
->tx_fifo_errors
+= dev_stats
.tx_fifo_errors
;
1985 stats
->tx_heartbeat_errors
+= dev_stats
.tx_heartbeat_errors
;
1986 stats
->tx_window_errors
+= dev_stats
.tx_window_errors
;
1988 ovs_mutex_unlock(&netdev
->mutex
);
1993 /* Retrieves current device stats for 'netdev-tap' netdev or
1994 * netdev-internal. */
1996 netdev_tap_get_stats(const struct netdev
*netdev_
, struct netdev_stats
*stats
)
1998 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1999 struct netdev_stats dev_stats
;
2002 ovs_mutex_lock(&netdev
->mutex
);
2003 get_stats_via_vport(netdev_
, stats
);
2004 error
= get_stats_via_netlink(netdev_
, &dev_stats
);
2006 if (!netdev
->vport_stats_error
) {
2009 } else if (netdev
->vport_stats_error
) {
2010 /* Transmit and receive stats will appear to be swapped relative to the
2011 * other ports since we are the one sending the data, not a remote
2012 * computer. For consistency, we swap them back here. This does not
2013 * apply if we are getting stats from the vport layer because it always
2014 * tracks stats from the perspective of the switch. */
2017 swap_uint64(&stats
->rx_packets
, &stats
->tx_packets
);
2018 swap_uint64(&stats
->rx_bytes
, &stats
->tx_bytes
);
2019 swap_uint64(&stats
->rx_errors
, &stats
->tx_errors
);
2020 swap_uint64(&stats
->rx_dropped
, &stats
->tx_dropped
);
2021 stats
->rx_length_errors
= 0;
2022 stats
->rx_over_errors
= 0;
2023 stats
->rx_crc_errors
= 0;
2024 stats
->rx_frame_errors
= 0;
2025 stats
->rx_fifo_errors
= 0;
2026 stats
->rx_missed_errors
= 0;
2027 stats
->tx_aborted_errors
= 0;
2028 stats
->tx_carrier_errors
= 0;
2029 stats
->tx_fifo_errors
= 0;
2030 stats
->tx_heartbeat_errors
= 0;
2031 stats
->tx_window_errors
= 0;
2033 /* Use kernel netdev's packet and byte counts since vport counters
2034 * do not reflect packet counts on the wire when GSO, TSO or GRO
2036 stats
->rx_packets
= dev_stats
.tx_packets
;
2037 stats
->rx_bytes
= dev_stats
.tx_bytes
;
2038 stats
->tx_packets
= dev_stats
.rx_packets
;
2039 stats
->tx_bytes
= dev_stats
.rx_bytes
;
2041 stats
->rx_dropped
+= dev_stats
.tx_dropped
;
2042 stats
->tx_dropped
+= dev_stats
.rx_dropped
;
2044 stats
->rx_errors
+= dev_stats
.tx_errors
;
2045 stats
->tx_errors
+= dev_stats
.rx_errors
;
2047 stats
->multicast
+= dev_stats
.multicast
;
2048 stats
->collisions
+= dev_stats
.collisions
;
2050 stats
->tx_dropped
+= netdev
->tx_dropped
;
2051 ovs_mutex_unlock(&netdev
->mutex
);
2057 netdev_internal_get_stats(const struct netdev
*netdev_
,
2058 struct netdev_stats
*stats
)
2060 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2063 ovs_mutex_lock(&netdev
->mutex
);
2064 get_stats_via_vport(netdev_
, stats
);
2065 error
= netdev
->vport_stats_error
;
2066 ovs_mutex_unlock(&netdev
->mutex
);
2072 netdev_linux_read_features(struct netdev_linux
*netdev
)
2074 struct ethtool_cmd ecmd
;
2078 if (netdev
->cache_valid
& VALID_FEATURES
) {
2082 COVERAGE_INC(netdev_get_ethtool
);
2083 memset(&ecmd
, 0, sizeof ecmd
);
2084 error
= netdev_linux_do_ethtool(netdev
->up
.name
, &ecmd
,
2085 ETHTOOL_GSET
, "ETHTOOL_GSET");
2090 /* Supported features. */
2091 netdev
->supported
= 0;
2092 if (ecmd
.supported
& SUPPORTED_10baseT_Half
) {
2093 netdev
->supported
|= NETDEV_F_10MB_HD
;
2095 if (ecmd
.supported
& SUPPORTED_10baseT_Full
) {
2096 netdev
->supported
|= NETDEV_F_10MB_FD
;
2098 if (ecmd
.supported
& SUPPORTED_100baseT_Half
) {
2099 netdev
->supported
|= NETDEV_F_100MB_HD
;
2101 if (ecmd
.supported
& SUPPORTED_100baseT_Full
) {
2102 netdev
->supported
|= NETDEV_F_100MB_FD
;
2104 if (ecmd
.supported
& SUPPORTED_1000baseT_Half
) {
2105 netdev
->supported
|= NETDEV_F_1GB_HD
;
2107 if ((ecmd
.supported
& SUPPORTED_1000baseT_Full
) ||
2108 (ecmd
.supported
& SUPPORTED_1000baseKX_Full
)) {
2109 netdev
->supported
|= NETDEV_F_1GB_FD
;
2111 if ((ecmd
.supported
& SUPPORTED_10000baseT_Full
) ||
2112 (ecmd
.supported
& SUPPORTED_10000baseKX4_Full
) ||
2113 (ecmd
.supported
& SUPPORTED_10000baseKR_Full
) ||
2114 (ecmd
.supported
& SUPPORTED_10000baseR_FEC
)) {
2115 netdev
->supported
|= NETDEV_F_10GB_FD
;
2117 if ((ecmd
.supported
& SUPPORTED_40000baseKR4_Full
) ||
2118 (ecmd
.supported
& SUPPORTED_40000baseCR4_Full
) ||
2119 (ecmd
.supported
& SUPPORTED_40000baseSR4_Full
) ||
2120 (ecmd
.supported
& SUPPORTED_40000baseLR4_Full
)) {
2121 netdev
->supported
|= NETDEV_F_40GB_FD
;
2123 if (ecmd
.supported
& SUPPORTED_TP
) {
2124 netdev
->supported
|= NETDEV_F_COPPER
;
2126 if (ecmd
.supported
& SUPPORTED_FIBRE
) {
2127 netdev
->supported
|= NETDEV_F_FIBER
;
2129 if (ecmd
.supported
& SUPPORTED_Autoneg
) {
2130 netdev
->supported
|= NETDEV_F_AUTONEG
;
2132 if (ecmd
.supported
& SUPPORTED_Pause
) {
2133 netdev
->supported
|= NETDEV_F_PAUSE
;
2135 if (ecmd
.supported
& SUPPORTED_Asym_Pause
) {
2136 netdev
->supported
|= NETDEV_F_PAUSE_ASYM
;
2139 /* Advertised features. */
2140 netdev
->advertised
= 0;
2141 if (ecmd
.advertising
& ADVERTISED_10baseT_Half
) {
2142 netdev
->advertised
|= NETDEV_F_10MB_HD
;
2144 if (ecmd
.advertising
& ADVERTISED_10baseT_Full
) {
2145 netdev
->advertised
|= NETDEV_F_10MB_FD
;
2147 if (ecmd
.advertising
& ADVERTISED_100baseT_Half
) {
2148 netdev
->advertised
|= NETDEV_F_100MB_HD
;
2150 if (ecmd
.advertising
& ADVERTISED_100baseT_Full
) {
2151 netdev
->advertised
|= NETDEV_F_100MB_FD
;
2153 if (ecmd
.advertising
& ADVERTISED_1000baseT_Half
) {
2154 netdev
->advertised
|= NETDEV_F_1GB_HD
;
2156 if ((ecmd
.advertising
& ADVERTISED_1000baseT_Full
) ||
2157 (ecmd
.advertising
& ADVERTISED_1000baseKX_Full
)) {
2158 netdev
->advertised
|= NETDEV_F_1GB_FD
;
2160 if ((ecmd
.advertising
& ADVERTISED_10000baseT_Full
) ||
2161 (ecmd
.advertising
& ADVERTISED_10000baseKX4_Full
) ||
2162 (ecmd
.advertising
& ADVERTISED_10000baseKR_Full
) ||
2163 (ecmd
.advertising
& ADVERTISED_10000baseR_FEC
)) {
2164 netdev
->advertised
|= NETDEV_F_10GB_FD
;
2166 if ((ecmd
.advertising
& ADVERTISED_40000baseKR4_Full
) ||
2167 (ecmd
.advertising
& ADVERTISED_40000baseCR4_Full
) ||
2168 (ecmd
.advertising
& ADVERTISED_40000baseSR4_Full
) ||
2169 (ecmd
.advertising
& ADVERTISED_40000baseLR4_Full
)) {
2170 netdev
->advertised
|= NETDEV_F_40GB_FD
;
2172 if (ecmd
.advertising
& ADVERTISED_TP
) {
2173 netdev
->advertised
|= NETDEV_F_COPPER
;
2175 if (ecmd
.advertising
& ADVERTISED_FIBRE
) {
2176 netdev
->advertised
|= NETDEV_F_FIBER
;
2178 if (ecmd
.advertising
& ADVERTISED_Autoneg
) {
2179 netdev
->advertised
|= NETDEV_F_AUTONEG
;
2181 if (ecmd
.advertising
& ADVERTISED_Pause
) {
2182 netdev
->advertised
|= NETDEV_F_PAUSE
;
2184 if (ecmd
.advertising
& ADVERTISED_Asym_Pause
) {
2185 netdev
->advertised
|= NETDEV_F_PAUSE_ASYM
;
2188 /* Current settings. */
2189 speed
= ethtool_cmd_speed(&ecmd
);
2190 if (speed
== SPEED_10
) {
2191 netdev
->current
= ecmd
.duplex
? NETDEV_F_10MB_FD
: NETDEV_F_10MB_HD
;
2192 } else if (speed
== SPEED_100
) {
2193 netdev
->current
= ecmd
.duplex
? NETDEV_F_100MB_FD
: NETDEV_F_100MB_HD
;
2194 } else if (speed
== SPEED_1000
) {
2195 netdev
->current
= ecmd
.duplex
? NETDEV_F_1GB_FD
: NETDEV_F_1GB_HD
;
2196 } else if (speed
== SPEED_10000
) {
2197 netdev
->current
= NETDEV_F_10GB_FD
;
2198 } else if (speed
== 40000) {
2199 netdev
->current
= NETDEV_F_40GB_FD
;
2200 } else if (speed
== 100000) {
2201 netdev
->current
= NETDEV_F_100GB_FD
;
2202 } else if (speed
== 1000000) {
2203 netdev
->current
= NETDEV_F_1TB_FD
;
2205 netdev
->current
= 0;
2208 if (ecmd
.port
== PORT_TP
) {
2209 netdev
->current
|= NETDEV_F_COPPER
;
2210 } else if (ecmd
.port
== PORT_FIBRE
) {
2211 netdev
->current
|= NETDEV_F_FIBER
;
2215 netdev
->current
|= NETDEV_F_AUTONEG
;
2219 netdev
->cache_valid
|= VALID_FEATURES
;
2220 netdev
->get_features_error
= error
;
2223 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
2224 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
2225 * Returns 0 if successful, otherwise a positive errno value. */
2227 netdev_linux_get_features(const struct netdev
*netdev_
,
2228 enum netdev_features
*current
,
2229 enum netdev_features
*advertised
,
2230 enum netdev_features
*supported
,
2231 enum netdev_features
*peer
)
2233 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2236 ovs_mutex_lock(&netdev
->mutex
);
2237 if (netdev_linux_netnsid_is_remote(netdev
)) {
2242 netdev_linux_read_features(netdev
);
2243 if (!netdev
->get_features_error
) {
2244 *current
= netdev
->current
;
2245 *advertised
= netdev
->advertised
;
2246 *supported
= netdev
->supported
;
2247 *peer
= 0; /* XXX */
2249 error
= netdev
->get_features_error
;
2252 ovs_mutex_unlock(&netdev
->mutex
);
2256 /* Set the features advertised by 'netdev' to 'advertise'. */
2258 netdev_linux_set_advertisements(struct netdev
*netdev_
,
2259 enum netdev_features advertise
)
2261 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2262 struct ethtool_cmd ecmd
;
2265 ovs_mutex_lock(&netdev
->mutex
);
2267 COVERAGE_INC(netdev_get_ethtool
);
2269 if (netdev_linux_netnsid_is_remote(netdev
)) {
2274 memset(&ecmd
, 0, sizeof ecmd
);
2275 error
= netdev_linux_do_ethtool(netdev_get_name(netdev_
), &ecmd
,
2276 ETHTOOL_GSET
, "ETHTOOL_GSET");
2281 ecmd
.advertising
= 0;
2282 if (advertise
& NETDEV_F_10MB_HD
) {
2283 ecmd
.advertising
|= ADVERTISED_10baseT_Half
;
2285 if (advertise
& NETDEV_F_10MB_FD
) {
2286 ecmd
.advertising
|= ADVERTISED_10baseT_Full
;
2288 if (advertise
& NETDEV_F_100MB_HD
) {
2289 ecmd
.advertising
|= ADVERTISED_100baseT_Half
;
2291 if (advertise
& NETDEV_F_100MB_FD
) {
2292 ecmd
.advertising
|= ADVERTISED_100baseT_Full
;
2294 if (advertise
& NETDEV_F_1GB_HD
) {
2295 ecmd
.advertising
|= ADVERTISED_1000baseT_Half
;
2297 if (advertise
& NETDEV_F_1GB_FD
) {
2298 ecmd
.advertising
|= ADVERTISED_1000baseT_Full
;
2300 if (advertise
& NETDEV_F_10GB_FD
) {
2301 ecmd
.advertising
|= ADVERTISED_10000baseT_Full
;
2303 if (advertise
& NETDEV_F_COPPER
) {
2304 ecmd
.advertising
|= ADVERTISED_TP
;
2306 if (advertise
& NETDEV_F_FIBER
) {
2307 ecmd
.advertising
|= ADVERTISED_FIBRE
;
2309 if (advertise
& NETDEV_F_AUTONEG
) {
2310 ecmd
.advertising
|= ADVERTISED_Autoneg
;
2312 if (advertise
& NETDEV_F_PAUSE
) {
2313 ecmd
.advertising
|= ADVERTISED_Pause
;
2315 if (advertise
& NETDEV_F_PAUSE_ASYM
) {
2316 ecmd
.advertising
|= ADVERTISED_Asym_Pause
;
2318 COVERAGE_INC(netdev_set_ethtool
);
2319 error
= netdev_linux_do_ethtool(netdev_get_name(netdev_
), &ecmd
,
2320 ETHTOOL_SSET
, "ETHTOOL_SSET");
2323 ovs_mutex_unlock(&netdev
->mutex
);
2327 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
2328 * successful, otherwise a positive errno value. */
2330 netdev_linux_set_policing(struct netdev
*netdev_
,
2331 uint32_t kbits_rate
, uint32_t kbits_burst
)
2333 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2334 const char *netdev_name
= netdev_get_name(netdev_
);
2338 if (netdev_is_flow_api_enabled()) {
2340 VLOG_WARN_RL(&rl
, "%s: policing with offload isn't supported",
2346 kbits_burst
= (!kbits_rate
? 0 /* Force to 0 if no rate specified. */
2347 : !kbits_burst
? 8000 /* Default to 8000 kbits if 0. */
2348 : kbits_burst
); /* Stick with user-specified value. */
2350 ovs_mutex_lock(&netdev
->mutex
);
2351 if (netdev_linux_netnsid_is_remote(netdev
)) {
2356 if (netdev
->cache_valid
& VALID_POLICING
) {
2357 error
= netdev
->netdev_policing_error
;
2358 if (error
|| (netdev
->kbits_rate
== kbits_rate
&&
2359 netdev
->kbits_burst
== kbits_burst
)) {
2360 /* Assume that settings haven't changed since we last set them. */
2363 netdev
->cache_valid
&= ~VALID_POLICING
;
2366 error
= get_ifindex(netdev_
, &ifindex
);
2371 COVERAGE_INC(netdev_set_policing
);
2372 /* Remove any existing ingress qdisc. */
2373 error
= tc_add_del_ingress_qdisc(ifindex
, false, 0);
2375 VLOG_WARN_RL(&rl
, "%s: removing policing failed: %s",
2376 netdev_name
, ovs_strerror(error
));
2381 error
= tc_add_del_ingress_qdisc(ifindex
, true, 0);
2383 VLOG_WARN_RL(&rl
, "%s: adding policing qdisc failed: %s",
2384 netdev_name
, ovs_strerror(error
));
2388 error
= tc_add_policer(netdev_
, kbits_rate
, kbits_burst
);
2390 VLOG_WARN_RL(&rl
, "%s: adding policing action failed: %s",
2391 netdev_name
, ovs_strerror(error
));
2396 netdev
->kbits_rate
= kbits_rate
;
2397 netdev
->kbits_burst
= kbits_burst
;
2400 if (!error
|| error
== ENODEV
) {
2401 netdev
->netdev_policing_error
= error
;
2402 netdev
->cache_valid
|= VALID_POLICING
;
2404 ovs_mutex_unlock(&netdev
->mutex
);
2409 netdev_linux_get_qos_types(const struct netdev
*netdev OVS_UNUSED
,
2412 const struct tc_ops
*const *opsp
;
2413 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2414 const struct tc_ops
*ops
= *opsp
;
2415 if (ops
->tc_install
&& ops
->ovs_name
[0] != '\0') {
2416 sset_add(types
, ops
->ovs_name
);
2422 static const struct tc_ops
*
2423 tc_lookup_ovs_name(const char *name
)
2425 const struct tc_ops
*const *opsp
;
2427 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2428 const struct tc_ops
*ops
= *opsp
;
2429 if (!strcmp(name
, ops
->ovs_name
)) {
2436 static const struct tc_ops
*
2437 tc_lookup_linux_name(const char *name
)
2439 const struct tc_ops
*const *opsp
;
2441 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2442 const struct tc_ops
*ops
= *opsp
;
2443 if (ops
->linux_name
&& !strcmp(name
, ops
->linux_name
)) {
2450 static struct tc_queue
*
2451 tc_find_queue__(const struct netdev
*netdev_
, unsigned int queue_id
,
2454 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2455 struct tc_queue
*queue
;
2457 HMAP_FOR_EACH_IN_BUCKET (queue
, hmap_node
, hash
, &netdev
->tc
->queues
) {
2458 if (queue
->queue_id
== queue_id
) {
/* Convenience wrapper around tc_find_queue__() that hashes 'queue_id'
 * itself. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
}
2472 netdev_linux_get_qos_capabilities(const struct netdev
*netdev OVS_UNUSED
,
2474 struct netdev_qos_capabilities
*caps
)
2476 const struct tc_ops
*ops
= tc_lookup_ovs_name(type
);
2480 caps
->n_queues
= ops
->n_queues
;
2485 netdev_linux_get_qos(const struct netdev
*netdev_
,
2486 const char **typep
, struct smap
*details
)
2488 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2491 ovs_mutex_lock(&netdev
->mutex
);
2492 if (netdev_linux_netnsid_is_remote(netdev
)) {
2497 error
= tc_query_qdisc(netdev_
);
2499 *typep
= netdev
->tc
->ops
->ovs_name
;
2500 error
= (netdev
->tc
->ops
->qdisc_get
2501 ? netdev
->tc
->ops
->qdisc_get(netdev_
, details
)
2506 ovs_mutex_unlock(&netdev
->mutex
);
2511 netdev_linux_set_qos(struct netdev
*netdev_
,
2512 const char *type
, const struct smap
*details
)
2514 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2515 const struct tc_ops
*new_ops
;
2518 new_ops
= tc_lookup_ovs_name(type
);
2519 if (!new_ops
|| !new_ops
->tc_install
) {
2523 if (new_ops
== &tc_ops_noop
) {
2524 return new_ops
->tc_install(netdev_
, details
);
2527 ovs_mutex_lock(&netdev
->mutex
);
2528 if (netdev_linux_netnsid_is_remote(netdev
)) {
2533 error
= tc_query_qdisc(netdev_
);
2538 if (new_ops
== netdev
->tc
->ops
) {
2539 error
= new_ops
->qdisc_set
? new_ops
->qdisc_set(netdev_
, details
) : 0;
2541 /* Delete existing qdisc. */
2542 error
= tc_del_qdisc(netdev_
);
2546 ovs_assert(netdev
->tc
== NULL
);
2548 /* Install new qdisc. */
2549 error
= new_ops
->tc_install(netdev_
, details
);
2550 ovs_assert((error
== 0) == (netdev
->tc
!= NULL
));
2554 ovs_mutex_unlock(&netdev
->mutex
);
2559 netdev_linux_get_queue(const struct netdev
*netdev_
,
2560 unsigned int queue_id
, struct smap
*details
)
2562 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2565 ovs_mutex_lock(&netdev
->mutex
);
2566 if (netdev_linux_netnsid_is_remote(netdev
)) {
2571 error
= tc_query_qdisc(netdev_
);
2573 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2575 ? netdev
->tc
->ops
->class_get(netdev_
, queue
, details
)
2580 ovs_mutex_unlock(&netdev
->mutex
);
2585 netdev_linux_set_queue(struct netdev
*netdev_
,
2586 unsigned int queue_id
, const struct smap
*details
)
2588 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2591 ovs_mutex_lock(&netdev
->mutex
);
2592 if (netdev_linux_netnsid_is_remote(netdev
)) {
2597 error
= tc_query_qdisc(netdev_
);
2599 error
= (queue_id
< netdev
->tc
->ops
->n_queues
2600 && netdev
->tc
->ops
->class_set
2601 ? netdev
->tc
->ops
->class_set(netdev_
, queue_id
, details
)
2606 ovs_mutex_unlock(&netdev
->mutex
);
2611 netdev_linux_delete_queue(struct netdev
*netdev_
, unsigned int queue_id
)
2613 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2616 ovs_mutex_lock(&netdev
->mutex
);
2617 if (netdev_linux_netnsid_is_remote(netdev
)) {
2622 error
= tc_query_qdisc(netdev_
);
2624 if (netdev
->tc
->ops
->class_delete
) {
2625 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2627 ? netdev
->tc
->ops
->class_delete(netdev_
, queue
)
2635 ovs_mutex_unlock(&netdev
->mutex
);
2640 netdev_linux_get_queue_stats(const struct netdev
*netdev_
,
2641 unsigned int queue_id
,
2642 struct netdev_queue_stats
*stats
)
2644 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2647 ovs_mutex_lock(&netdev
->mutex
);
2648 if (netdev_linux_netnsid_is_remote(netdev
)) {
2653 error
= tc_query_qdisc(netdev_
);
2655 if (netdev
->tc
->ops
->class_get_stats
) {
2656 const struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2658 stats
->created
= queue
->created
;
2659 error
= netdev
->tc
->ops
->class_get_stats(netdev_
, queue
,
2670 ovs_mutex_unlock(&netdev
->mutex
);
2674 struct queue_dump_state
{
2675 struct nl_dump dump
;
2680 start_queue_dump(const struct netdev
*netdev
, struct queue_dump_state
*state
)
2682 struct ofpbuf request
;
2683 struct tcmsg
*tcmsg
;
2685 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_GETTCLASS
, 0, &request
);
2689 tcmsg
->tcm_parent
= 0;
2690 nl_dump_start(&state
->dump
, NETLINK_ROUTE
, &request
);
2691 ofpbuf_uninit(&request
);
2693 ofpbuf_init(&state
->buf
, NL_DUMP_BUFSIZE
);
2698 finish_queue_dump(struct queue_dump_state
*state
)
2700 ofpbuf_uninit(&state
->buf
);
2701 return nl_dump_done(&state
->dump
);
/* Iteration state for netdev_linux_queue_dump_{start,next,done}(): a
 * snapshot of the queue ids taken at dump start, plus a cursor. */
struct netdev_linux_queue_state {
    unsigned int *queues;       /* Snapshot of queue ids (owned, freed in
                                 * queue_dump_done). */
    size_t cur_queue;           /* Next index into 'queues' to visit. */
    size_t n_queues;            /* Number of elements in 'queues'. */
};
2711 netdev_linux_queue_dump_start(const struct netdev
*netdev_
, void **statep
)
2713 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2716 ovs_mutex_lock(&netdev
->mutex
);
2717 if (netdev_linux_netnsid_is_remote(netdev
)) {
2722 error
= tc_query_qdisc(netdev_
);
2724 if (netdev
->tc
->ops
->class_get
) {
2725 struct netdev_linux_queue_state
*state
;
2726 struct tc_queue
*queue
;
2729 *statep
= state
= xmalloc(sizeof *state
);
2730 state
->n_queues
= hmap_count(&netdev
->tc
->queues
);
2731 state
->cur_queue
= 0;
2732 state
->queues
= xmalloc(state
->n_queues
* sizeof *state
->queues
);
2735 HMAP_FOR_EACH (queue
, hmap_node
, &netdev
->tc
->queues
) {
2736 state
->queues
[i
++] = queue
->queue_id
;
2744 ovs_mutex_unlock(&netdev
->mutex
);
2749 netdev_linux_queue_dump_next(const struct netdev
*netdev_
, void *state_
,
2750 unsigned int *queue_idp
, struct smap
*details
)
2752 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2753 struct netdev_linux_queue_state
*state
= state_
;
2756 ovs_mutex_lock(&netdev
->mutex
);
2757 if (netdev_linux_netnsid_is_remote(netdev
)) {
2762 while (state
->cur_queue
< state
->n_queues
) {
2763 unsigned int queue_id
= state
->queues
[state
->cur_queue
++];
2764 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2767 *queue_idp
= queue_id
;
2768 error
= netdev
->tc
->ops
->class_get(netdev_
, queue
, details
);
2774 ovs_mutex_unlock(&netdev
->mutex
);
2779 netdev_linux_queue_dump_done(const struct netdev
*netdev OVS_UNUSED
,
2782 struct netdev_linux_queue_state
*state
= state_
;
2784 free(state
->queues
);
2790 netdev_linux_dump_queue_stats(const struct netdev
*netdev_
,
2791 netdev_dump_queue_stats_cb
*cb
, void *aux
)
2793 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2796 ovs_mutex_lock(&netdev
->mutex
);
2797 if (netdev_linux_netnsid_is_remote(netdev
)) {
2802 error
= tc_query_qdisc(netdev_
);
2804 struct queue_dump_state state
;
2806 if (!netdev
->tc
->ops
->class_dump_stats
) {
2808 } else if (!start_queue_dump(netdev_
, &state
)) {
2814 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
2815 retval
= netdev
->tc
->ops
->class_dump_stats(netdev_
, &msg
,
2822 retval
= finish_queue_dump(&state
);
2830 ovs_mutex_unlock(&netdev
->mutex
);
2835 netdev_linux_set_in4(struct netdev
*netdev_
, struct in_addr address
,
2836 struct in_addr netmask
)
2838 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2841 ovs_mutex_lock(&netdev
->mutex
);
2842 if (netdev_linux_netnsid_is_remote(netdev
)) {
2847 error
= do_set_addr(netdev_
, SIOCSIFADDR
, "SIOCSIFADDR", address
);
2849 if (address
.s_addr
!= INADDR_ANY
) {
2850 error
= do_set_addr(netdev_
, SIOCSIFNETMASK
,
2851 "SIOCSIFNETMASK", netmask
);
2856 ovs_mutex_unlock(&netdev
->mutex
);
2860 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address.
2861 * Otherwise, sets '*in6' to 'in6addr_any' and returns the corresponding
2864 netdev_linux_get_addr_list(const struct netdev
*netdev_
,
2865 struct in6_addr
**addr
, struct in6_addr
**mask
, int *n_cnt
)
2867 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2870 ovs_mutex_lock(&netdev
->mutex
);
2871 if (netdev_linux_netnsid_is_remote(netdev
)) {
2876 error
= netdev_get_addrs(netdev_get_name(netdev_
), addr
, mask
, n_cnt
);
2879 ovs_mutex_unlock(&netdev
->mutex
);
/* Writes an AF_INET sockaddr carrying 'addr' into '*sa'.  The remainder of
 * '*sa' beyond sizeof(struct sockaddr_in) is zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin;
    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr = addr;
    sin.sin_port = 0;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
2897 do_set_addr(struct netdev
*netdev
,
2898 int ioctl_nr
, const char *ioctl_name
, struct in_addr addr
)
2902 make_in4_sockaddr(&ifr
.ifr_addr
, addr
);
2903 return af_inet_ifreq_ioctl(netdev_get_name(netdev
), &ifr
, ioctl_nr
,
2907 /* Adds 'router' as a default IP gateway. */
2909 netdev_linux_add_router(struct netdev
*netdev OVS_UNUSED
, struct in_addr router
)
2911 struct in_addr any
= { INADDR_ANY
};
2915 memset(&rt
, 0, sizeof rt
);
2916 make_in4_sockaddr(&rt
.rt_dst
, any
);
2917 make_in4_sockaddr(&rt
.rt_gateway
, router
);
2918 make_in4_sockaddr(&rt
.rt_genmask
, any
);
2919 rt
.rt_flags
= RTF_UP
| RTF_GATEWAY
;
2920 error
= af_inet_ioctl(SIOCADDRT
, &rt
);
2922 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error
));
2928 netdev_linux_get_next_hop(const struct in_addr
*host
, struct in_addr
*next_hop
,
2931 static const char fn
[] = "/proc/net/route";
2936 *netdev_name
= NULL
;
2937 stream
= fopen(fn
, "r");
2938 if (stream
== NULL
) {
2939 VLOG_WARN_RL(&rl
, "%s: open failed: %s", fn
, ovs_strerror(errno
));
2944 while (fgets(line
, sizeof line
, stream
)) {
2947 ovs_be32 dest
, gateway
, mask
;
2948 int refcnt
, metric
, mtu
;
2949 unsigned int flags
, use
, window
, irtt
;
2952 "%16s %"SCNx32
" %"SCNx32
" %04X %d %u %d %"SCNx32
2954 iface
, &dest
, &gateway
, &flags
, &refcnt
,
2955 &use
, &metric
, &mask
, &mtu
, &window
, &irtt
)) {
2956 VLOG_WARN_RL(&rl
, "%s: could not parse line %d: %s",
2960 if (!(flags
& RTF_UP
)) {
2961 /* Skip routes that aren't up. */
2965 /* The output of 'dest', 'mask', and 'gateway' were given in
2966 * network byte order, so we don't need need any endian
2967 * conversions here. */
2968 if ((dest
& mask
) == (host
->s_addr
& mask
)) {
2970 /* The host is directly reachable. */
2971 next_hop
->s_addr
= 0;
2973 /* To reach the host, we must go through a gateway. */
2974 next_hop
->s_addr
= gateway
;
2976 *netdev_name
= xstrdup(iface
);
2988 netdev_linux_get_status(const struct netdev
*netdev_
, struct smap
*smap
)
2990 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2993 ovs_mutex_lock(&netdev
->mutex
);
2994 if (!(netdev
->cache_valid
& VALID_DRVINFO
)) {
2995 struct ethtool_cmd
*cmd
= (struct ethtool_cmd
*) &netdev
->drvinfo
;
2997 COVERAGE_INC(netdev_get_ethtool
);
2998 memset(&netdev
->drvinfo
, 0, sizeof netdev
->drvinfo
);
2999 error
= netdev_linux_do_ethtool(netdev
->up
.name
,
3002 "ETHTOOL_GDRVINFO");
3004 netdev
->cache_valid
|= VALID_DRVINFO
;
3009 smap_add(smap
, "driver_name", netdev
->drvinfo
.driver
);
3010 smap_add(smap
, "driver_version", netdev
->drvinfo
.version
);
3011 smap_add(smap
, "firmware_version", netdev
->drvinfo
.fw_version
);
3013 ovs_mutex_unlock(&netdev
->mutex
);
3019 netdev_internal_get_status(const struct netdev
*netdev OVS_UNUSED
,
3022 smap_add(smap
, "driver_name", "openvswitch");
3027 netdev_linux_get_block_id(struct netdev
*netdev_
)
3029 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3030 uint32_t block_id
= 0;
3032 ovs_mutex_lock(&netdev
->mutex
);
3033 /* Ensure the linux netdev has had its fields populated. */
3034 if (!(netdev
->cache_valid
& VALID_IFINDEX
)) {
3035 netdev_linux_update_via_netlink(netdev
);
3038 /* Only assigning block ids to linux netdevs that are LAG masters. */
3039 if (netdev
->is_lag_master
) {
3040 block_id
= netdev
->ifindex
;
3042 ovs_mutex_unlock(&netdev
->mutex
);
3047 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
3048 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
3049 * returns 0. Otherwise, it returns a positive errno value; in particular,
3050 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
3052 netdev_linux_arp_lookup(const struct netdev
*netdev
,
3053 ovs_be32 ip
, struct eth_addr
*mac
)
3056 struct sockaddr_in sin
;
3059 memset(&r
, 0, sizeof r
);
3060 memset(&sin
, 0, sizeof sin
);
3061 sin
.sin_family
= AF_INET
;
3062 sin
.sin_addr
.s_addr
= ip
;
3064 memcpy(&r
.arp_pa
, &sin
, sizeof sin
);
3065 r
.arp_ha
.sa_family
= ARPHRD_ETHER
;
3067 ovs_strzcpy(r
.arp_dev
, netdev_get_name(netdev
), sizeof r
.arp_dev
);
3068 COVERAGE_INC(netdev_arp_lookup
);
3069 retval
= af_inet_ioctl(SIOCGARP
, &r
);
3071 memcpy(mac
, r
.arp_ha
.sa_data
, ETH_ADDR_LEN
);
3072 } else if (retval
!= ENXIO
) {
3073 VLOG_WARN_RL(&rl
, "%s: could not look up ARP entry for "IP_FMT
": %s",
3074 netdev_get_name(netdev
), IP_ARGS(ip
),
3075 ovs_strerror(retval
));
3081 nd_to_iff_flags(enum netdev_flags nd
)
3083 unsigned int iff
= 0;
3084 if (nd
& NETDEV_UP
) {
3087 if (nd
& NETDEV_PROMISC
) {
3090 if (nd
& NETDEV_LOOPBACK
) {
3091 iff
|= IFF_LOOPBACK
;
3097 iff_to_nd_flags(unsigned int iff
)
3099 enum netdev_flags nd
= 0;
3103 if (iff
& IFF_PROMISC
) {
3104 nd
|= NETDEV_PROMISC
;
3106 if (iff
& IFF_LOOPBACK
) {
3107 nd
|= NETDEV_LOOPBACK
;
3113 update_flags(struct netdev_linux
*netdev
, enum netdev_flags off
,
3114 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
3115 OVS_REQUIRES(netdev
->mutex
)
3117 unsigned int old_flags
, new_flags
;
3120 old_flags
= netdev
->ifi_flags
;
3121 *old_flagsp
= iff_to_nd_flags(old_flags
);
3122 new_flags
= (old_flags
& ~nd_to_iff_flags(off
)) | nd_to_iff_flags(on
);
3123 if (new_flags
!= old_flags
) {
3124 error
= set_flags(netdev_get_name(&netdev
->up
), new_flags
);
3125 get_flags(&netdev
->up
, &netdev
->ifi_flags
);
3132 netdev_linux_update_flags(struct netdev
*netdev_
, enum netdev_flags off
,
3133 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
3135 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3138 ovs_mutex_lock(&netdev
->mutex
);
3140 /* Changing flags over netlink isn't support yet. */
3141 if (netdev_linux_netnsid_is_remote(netdev
)) {
3145 error
= update_flags(netdev
, off
, on
, old_flagsp
);
3147 /* Try reading flags over netlink, or fall back to ioctl. */
3148 if (!netdev_linux_update_via_netlink(netdev
)) {
3149 *old_flagsp
= iff_to_nd_flags(netdev
->ifi_flags
);
3151 error
= update_flags(netdev
, off
, on
, old_flagsp
);
3156 ovs_mutex_unlock(&netdev
->mutex
);
/* Designated initializers shared by all three Linux-backed netdev classes
 * (system, tap, internal). */
#define NETDEV_LINUX_CLASS_COMMON                               \
    .run = netdev_linux_run,                                    \
    .wait = netdev_linux_wait,                                  \
    .alloc = netdev_linux_alloc,                                \
    .destruct = netdev_linux_destruct,                          \
    .dealloc = netdev_linux_dealloc,                            \
    .send = netdev_linux_send,                                  \
    .send_wait = netdev_linux_send_wait,                        \
    .set_etheraddr = netdev_linux_set_etheraddr,                \
    .get_etheraddr = netdev_linux_get_etheraddr,                \
    .get_mtu = netdev_linux_get_mtu,                            \
    .set_mtu = netdev_linux_set_mtu,                            \
    .get_ifindex = netdev_linux_get_ifindex,                    \
    .get_carrier = netdev_linux_get_carrier,                    \
    .get_carrier_resets = netdev_linux_get_carrier_resets,      \
    .set_miimon_interval = netdev_linux_set_miimon_interval,    \
    .set_advertisements = netdev_linux_set_advertisements,      \
    .set_policing = netdev_linux_set_policing,                  \
    .get_qos_types = netdev_linux_get_qos_types,                \
    .get_qos_capabilities = netdev_linux_get_qos_capabilities,  \
    .get_qos = netdev_linux_get_qos,                            \
    .set_qos = netdev_linux_set_qos,                            \
    .get_queue = netdev_linux_get_queue,                        \
    .set_queue = netdev_linux_set_queue,                        \
    .delete_queue = netdev_linux_delete_queue,                  \
    .get_queue_stats = netdev_linux_get_queue_stats,            \
    .queue_dump_start = netdev_linux_queue_dump_start,          \
    .queue_dump_next = netdev_linux_queue_dump_next,            \
    .queue_dump_done = netdev_linux_queue_dump_done,            \
    .dump_queue_stats = netdev_linux_dump_queue_stats,          \
    .set_in4 = netdev_linux_set_in4,                            \
    .get_addr_list = netdev_linux_get_addr_list,                \
    .add_router = netdev_linux_add_router,                      \
    .get_next_hop = netdev_linux_get_next_hop,                  \
    .arp_lookup = netdev_linux_arp_lookup,                      \
    .update_flags = netdev_linux_update_flags,                  \
    .rxq_alloc = netdev_linux_rxq_alloc,                        \
    .rxq_construct = netdev_linux_rxq_construct,                \
    .rxq_destruct = netdev_linux_rxq_destruct,                  \
    .rxq_dealloc = netdev_linux_rxq_dealloc,                    \
    .rxq_recv = netdev_linux_rxq_recv,                          \
    .rxq_wait = netdev_linux_rxq_wait,                          \
    .rxq_drain = netdev_linux_rxq_drain
3204 const struct netdev_class netdev_linux_class
= {
3205 NETDEV_LINUX_CLASS_COMMON
,
3206 LINUX_FLOW_OFFLOAD_API
,
3208 .construct
= netdev_linux_construct
,
3209 .get_stats
= netdev_linux_get_stats
,
3210 .get_features
= netdev_linux_get_features
,
3211 .get_status
= netdev_linux_get_status
,
3212 .get_block_id
= netdev_linux_get_block_id
3215 const struct netdev_class netdev_tap_class
= {
3216 NETDEV_LINUX_CLASS_COMMON
,
3218 .construct
= netdev_linux_construct_tap
,
3219 .get_stats
= netdev_tap_get_stats
,
3220 .get_features
= netdev_linux_get_features
,
3221 .get_status
= netdev_linux_get_status
,
3224 const struct netdev_class netdev_internal_class
= {
3225 NETDEV_LINUX_CLASS_COMMON
,
3227 .construct
= netdev_linux_construct
,
3228 .get_stats
= netdev_internal_get_stats
,
3229 .get_status
= netdev_internal_get_status
,
3233 #define CODEL_N_QUEUES 0x0000
3235 /* In sufficiently new kernel headers these are defined as enums in
3236 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3237 * kernels. (This overrides any enum definition in the header file but that's
3239 #define TCA_CODEL_TARGET 1
3240 #define TCA_CODEL_LIMIT 2
3241 #define TCA_CODEL_INTERVAL 3
3250 static struct codel
*
3251 codel_get__(const struct netdev
*netdev_
)
3253 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3254 return CONTAINER_OF(netdev
->tc
, struct codel
, tc
);
3258 codel_install__(struct netdev
*netdev_
, uint32_t target
, uint32_t limit
,
3261 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3262 struct codel
*codel
;
3264 codel
= xmalloc(sizeof *codel
);
3265 tc_init(&codel
->tc
, &tc_ops_codel
);
3266 codel
->target
= target
;
3267 codel
->limit
= limit
;
3268 codel
->interval
= interval
;
3270 netdev
->tc
= &codel
->tc
;
3274 codel_setup_qdisc__(struct netdev
*netdev
, uint32_t target
, uint32_t limit
,
3278 struct ofpbuf request
;
3279 struct tcmsg
*tcmsg
;
3280 uint32_t otarget
, olimit
, ointerval
;
3283 tc_del_qdisc(netdev
);
3285 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
3286 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3290 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3291 tcmsg
->tcm_parent
= TC_H_ROOT
;
3293 otarget
= target
? target
: 5000;
3294 olimit
= limit
? limit
: 10240;
3295 ointerval
= interval
? interval
: 100000;
3297 nl_msg_put_string(&request
, TCA_KIND
, "codel");
3298 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
3299 nl_msg_put_u32(&request
, TCA_CODEL_TARGET
, otarget
);
3300 nl_msg_put_u32(&request
, TCA_CODEL_LIMIT
, olimit
);
3301 nl_msg_put_u32(&request
, TCA_CODEL_INTERVAL
, ointerval
);
3302 nl_msg_end_nested(&request
, opt_offset
);
3304 error
= tc_transact(&request
, NULL
);
3306 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
3307 "target %u, limit %u, interval %u error %d(%s)",
3308 netdev_get_name(netdev
),
3309 otarget
, olimit
, ointerval
,
3310 error
, ovs_strerror(error
));
3316 codel_parse_qdisc_details__(struct netdev
*netdev OVS_UNUSED
,
3317 const struct smap
*details
, struct codel
*codel
)
3319 codel
->target
= smap_get_ullong(details
, "target", 0);
3320 codel
->limit
= smap_get_ullong(details
, "limit", 0);
3321 codel
->interval
= smap_get_ullong(details
, "interval", 0);
3323 if (!codel
->target
) {
3324 codel
->target
= 5000;
3326 if (!codel
->limit
) {
3327 codel
->limit
= 10240;
3329 if (!codel
->interval
) {
3330 codel
->interval
= 100000;
3335 codel_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3340 codel_parse_qdisc_details__(netdev
, details
, &codel
);
3341 error
= codel_setup_qdisc__(netdev
, codel
.target
, codel
.limit
,
3344 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
3350 codel_parse_tca_options__(struct nlattr
*nl_options
, struct codel
*codel
)
3352 static const struct nl_policy tca_codel_policy
[] = {
3353 [TCA_CODEL_TARGET
] = { .type
= NL_A_U32
},
3354 [TCA_CODEL_LIMIT
] = { .type
= NL_A_U32
},
3355 [TCA_CODEL_INTERVAL
] = { .type
= NL_A_U32
}
3358 struct nlattr
*attrs
[ARRAY_SIZE(tca_codel_policy
)];
3360 if (!nl_parse_nested(nl_options
, tca_codel_policy
,
3361 attrs
, ARRAY_SIZE(tca_codel_policy
))) {
3362 VLOG_WARN_RL(&rl
, "failed to parse CoDel class options");
3366 codel
->target
= nl_attr_get_u32(attrs
[TCA_CODEL_TARGET
]);
3367 codel
->limit
= nl_attr_get_u32(attrs
[TCA_CODEL_LIMIT
]);
3368 codel
->interval
= nl_attr_get_u32(attrs
[TCA_CODEL_INTERVAL
]);
3373 codel_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
3375 struct nlattr
*nlattr
;
3380 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
3385 error
= codel_parse_tca_options__(nlattr
, &codel
);
3390 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
3396 codel_tc_destroy(struct tc
*tc
)
3398 struct codel
*codel
= CONTAINER_OF(tc
, struct codel
, tc
);
3404 codel_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
3406 const struct codel
*codel
= codel_get__(netdev
);
3407 smap_add_format(details
, "target", "%u", codel
->target
);
3408 smap_add_format(details
, "limit", "%u", codel
->limit
);
3409 smap_add_format(details
, "interval", "%u", codel
->interval
);
3414 codel_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
3418 codel_parse_qdisc_details__(netdev
, details
, &codel
);
3419 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
3420 codel_get__(netdev
)->target
= codel
.target
;
3421 codel_get__(netdev
)->limit
= codel
.limit
;
3422 codel_get__(netdev
)->interval
= codel
.interval
;
3426 static const struct tc_ops tc_ops_codel
= {
3427 .linux_name
= "codel",
3428 .ovs_name
= "linux-codel",
3429 .n_queues
= CODEL_N_QUEUES
,
3430 .tc_install
= codel_tc_install
,
3431 .tc_load
= codel_tc_load
,
3432 .tc_destroy
= codel_tc_destroy
,
3433 .qdisc_get
= codel_qdisc_get
,
3434 .qdisc_set
= codel_qdisc_set
,
3437 /* FQ-CoDel traffic control class. */
3439 #define FQCODEL_N_QUEUES 0x0000
3441 /* In sufficiently new kernel headers these are defined as enums in
3442 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3443 * kernels. (This overrides any enum definition in the header file but that's
3445 #define TCA_FQ_CODEL_TARGET 1
3446 #define TCA_FQ_CODEL_LIMIT 2
3447 #define TCA_FQ_CODEL_INTERVAL 3
3448 #define TCA_FQ_CODEL_ECN 4
3449 #define TCA_FQ_CODEL_FLOWS 5
3450 #define TCA_FQ_CODEL_QUANTUM 6
3461 static struct fqcodel
*
3462 fqcodel_get__(const struct netdev
*netdev_
)
3464 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3465 return CONTAINER_OF(netdev
->tc
, struct fqcodel
, tc
);
3469 fqcodel_install__(struct netdev
*netdev_
, uint32_t target
, uint32_t limit
,
3470 uint32_t interval
, uint32_t flows
, uint32_t quantum
)
3472 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3473 struct fqcodel
*fqcodel
;
3475 fqcodel
= xmalloc(sizeof *fqcodel
);
3476 tc_init(&fqcodel
->tc
, &tc_ops_fqcodel
);
3477 fqcodel
->target
= target
;
3478 fqcodel
->limit
= limit
;
3479 fqcodel
->interval
= interval
;
3480 fqcodel
->flows
= flows
;
3481 fqcodel
->quantum
= quantum
;
3483 netdev
->tc
= &fqcodel
->tc
;
3487 fqcodel_setup_qdisc__(struct netdev
*netdev
, uint32_t target
, uint32_t limit
,
3488 uint32_t interval
, uint32_t flows
, uint32_t quantum
)
3491 struct ofpbuf request
;
3492 struct tcmsg
*tcmsg
;
3493 uint32_t otarget
, olimit
, ointerval
, oflows
, oquantum
;
3496 tc_del_qdisc(netdev
);
3498 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
3499 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3503 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3504 tcmsg
->tcm_parent
= TC_H_ROOT
;
3506 otarget
= target
? target
: 5000;
3507 olimit
= limit
? limit
: 10240;
3508 ointerval
= interval
? interval
: 100000;
3509 oflows
= flows
? flows
: 1024;
3510 oquantum
= quantum
? quantum
: 1514; /* fq_codel default quantum is 1514
3513 nl_msg_put_string(&request
, TCA_KIND
, "fq_codel");
3514 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
3515 nl_msg_put_u32(&request
, TCA_FQ_CODEL_TARGET
, otarget
);
3516 nl_msg_put_u32(&request
, TCA_FQ_CODEL_LIMIT
, olimit
);
3517 nl_msg_put_u32(&request
, TCA_FQ_CODEL_INTERVAL
, ointerval
);
3518 nl_msg_put_u32(&request
, TCA_FQ_CODEL_FLOWS
, oflows
);
3519 nl_msg_put_u32(&request
, TCA_FQ_CODEL_QUANTUM
, oquantum
);
3520 nl_msg_end_nested(&request
, opt_offset
);
3522 error
= tc_transact(&request
, NULL
);
3524 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
3525 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3526 netdev_get_name(netdev
),
3527 otarget
, olimit
, ointerval
, oflows
, oquantum
,
3528 error
, ovs_strerror(error
));
3534 fqcodel_parse_qdisc_details__(struct netdev
*netdev OVS_UNUSED
,
3535 const struct smap
*details
, struct fqcodel
*fqcodel
)
3537 fqcodel
->target
= smap_get_ullong(details
, "target", 0);
3538 fqcodel
->limit
= smap_get_ullong(details
, "limit", 0);
3539 fqcodel
->interval
= smap_get_ullong(details
, "interval", 0);
3540 fqcodel
->flows
= smap_get_ullong(details
, "flows", 0);
3541 fqcodel
->quantum
= smap_get_ullong(details
, "quantum", 0);
3543 if (!fqcodel
->target
) {
3544 fqcodel
->target
= 5000;
3546 if (!fqcodel
->limit
) {
3547 fqcodel
->limit
= 10240;
3549 if (!fqcodel
->interval
) {
3550 fqcodel
->interval
= 1000000;
3552 if (!fqcodel
->flows
) {
3553 fqcodel
->flows
= 1024;
3555 if (!fqcodel
->quantum
) {
3556 fqcodel
->quantum
= 1514;
3561 fqcodel_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3564 struct fqcodel fqcodel
;
3566 fqcodel_parse_qdisc_details__(netdev
, details
, &fqcodel
);
3567 error
= fqcodel_setup_qdisc__(netdev
, fqcodel
.target
, fqcodel
.limit
,
3568 fqcodel
.interval
, fqcodel
.flows
,
3571 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
,
3572 fqcodel
.interval
, fqcodel
.flows
, fqcodel
.quantum
);
3578 fqcodel_parse_tca_options__(struct nlattr
*nl_options
, struct fqcodel
*fqcodel
)
3580 static const struct nl_policy tca_fqcodel_policy
[] = {
3581 [TCA_FQ_CODEL_TARGET
] = { .type
= NL_A_U32
},
3582 [TCA_FQ_CODEL_LIMIT
] = { .type
= NL_A_U32
},
3583 [TCA_FQ_CODEL_INTERVAL
] = { .type
= NL_A_U32
},
3584 [TCA_FQ_CODEL_FLOWS
] = { .type
= NL_A_U32
},
3585 [TCA_FQ_CODEL_QUANTUM
] = { .type
= NL_A_U32
}
3588 struct nlattr
*attrs
[ARRAY_SIZE(tca_fqcodel_policy
)];
3590 if (!nl_parse_nested(nl_options
, tca_fqcodel_policy
,
3591 attrs
, ARRAY_SIZE(tca_fqcodel_policy
))) {
3592 VLOG_WARN_RL(&rl
, "failed to parse FQ_CoDel class options");
3596 fqcodel
->target
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_TARGET
]);
3597 fqcodel
->limit
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_LIMIT
]);
3598 fqcodel
->interval
=nl_attr_get_u32(attrs
[TCA_FQ_CODEL_INTERVAL
]);
3599 fqcodel
->flows
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_FLOWS
]);
3600 fqcodel
->quantum
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_QUANTUM
]);
3605 fqcodel_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
3607 struct nlattr
*nlattr
;
3610 struct fqcodel fqcodel
;
3612 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
3617 error
= fqcodel_parse_tca_options__(nlattr
, &fqcodel
);
3622 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
, fqcodel
.interval
,
3623 fqcodel
.flows
, fqcodel
.quantum
);
3628 fqcodel_tc_destroy(struct tc
*tc
)
3630 struct fqcodel
*fqcodel
= CONTAINER_OF(tc
, struct fqcodel
, tc
);
3636 fqcodel_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
3638 const struct fqcodel
*fqcodel
= fqcodel_get__(netdev
);
3639 smap_add_format(details
, "target", "%u", fqcodel
->target
);
3640 smap_add_format(details
, "limit", "%u", fqcodel
->limit
);
3641 smap_add_format(details
, "interval", "%u", fqcodel
->interval
);
3642 smap_add_format(details
, "flows", "%u", fqcodel
->flows
);
3643 smap_add_format(details
, "quantum", "%u", fqcodel
->quantum
);
3648 fqcodel_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
3650 struct fqcodel fqcodel
;
3652 fqcodel_parse_qdisc_details__(netdev
, details
, &fqcodel
);
3653 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
, fqcodel
.interval
,
3654 fqcodel
.flows
, fqcodel
.quantum
);
3655 fqcodel_get__(netdev
)->target
= fqcodel
.target
;
3656 fqcodel_get__(netdev
)->limit
= fqcodel
.limit
;
3657 fqcodel_get__(netdev
)->interval
= fqcodel
.interval
;
3658 fqcodel_get__(netdev
)->flows
= fqcodel
.flows
;
3659 fqcodel_get__(netdev
)->quantum
= fqcodel
.quantum
;
3663 static const struct tc_ops tc_ops_fqcodel
= {
3664 .linux_name
= "fq_codel",
3665 .ovs_name
= "linux-fq_codel",
3666 .n_queues
= FQCODEL_N_QUEUES
,
3667 .tc_install
= fqcodel_tc_install
,
3668 .tc_load
= fqcodel_tc_load
,
3669 .tc_destroy
= fqcodel_tc_destroy
,
3670 .qdisc_get
= fqcodel_qdisc_get
,
3671 .qdisc_set
= fqcodel_qdisc_set
,
/* SFQ traffic control class. */

#define SFQ_N_QUEUES 0x0000

/* NOTE(review): the struct definition below is not visible in the extracted
 * source; it is reconstructed from the fields used by this section — verify
 * against the original file. */
struct sfq {
    struct tc tc;
    uint32_t quantum;           /* In bytes. */
    uint32_t perturb;           /* Hash perturbation period, in seconds. */
};

/* Returns the SFQ state attached to 'netdev_'. */
static struct sfq *
sfq_get__(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    return CONTAINER_OF(netdev->tc, struct sfq, tc);
}

/* Attaches a freshly allocated in-memory SFQ record to 'netdev_'.  Does not
 * talk to the kernel; see sfq_setup_qdisc__() for that. */
static void
sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct sfq *sfq;

    sfq = xmalloc(sizeof *sfq);
    tc_init(&sfq->tc, &tc_ops_sfq);
    sfq->perturb = perturb;
    sfq->quantum = quantum;

    netdev->tc = &sfq->tc;
}

/* Replaces the root qdisc on 'netdev' with an SFQ qdisc configured with
 * 'quantum' and 'perturb'.  A zero 'quantum' falls back to the device MTU
 * (when it can be read); a zero 'perturb' falls back to 10 seconds.
 * Returns 0 on success, a positive errno value otherwise. */
static int
sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
{
    struct tc_sfq_qopt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int mtu;
    int mtu_error, error;
    mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);

    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    memset(&opt, 0, sizeof opt);
    if (!quantum) {
        if (!mtu_error) {
            opt.quantum = mtu; /* if we cannot find mtu, use default */
        }
    } else {
        opt.quantum = quantum;
    }

    if (!perturb) {
        opt.perturb_period = 10;
    } else {
        opt.perturb_period = perturb;
    }

    nl_msg_put_string(&request, TCA_KIND, "sfq");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "quantum %u, perturb %u error %d(%s)",
                     netdev_get_name(netdev),
                     opt.quantum, opt.perturb_period,
                     error, ovs_strerror(error));
    }
    return error;
}

/* Parses "perturb" and "quantum" from 'details' into 'sfq', substituting
 * defaults (perturb: 10 s, quantum: device MTU) where unset. */
static void
sfq_parse_qdisc_details__(struct netdev *netdev,
                          const struct smap *details, struct sfq *sfq)
{
    sfq->perturb = smap_get_ullong(details, "perturb", 0);
    sfq->quantum = smap_get_ullong(details, "quantum", 0);

    if (!sfq->perturb) {
        sfq->perturb = 10;
    }

    if (!sfq->quantum) {
        int mtu;
        if (!netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
            sfq->quantum = mtu;
        } else {
            VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
                         "device without mtu");
        }
    }
}

/* Installs an SFQ root qdisc on 'netdev' per 'details', then mirrors the
 * configuration in memory.  Returns 0 on success, positive errno on error. */
static int
sfq_tc_install(struct netdev *netdev, const struct smap *details)
{
    int error;
    struct sfq sfq;

    sfq_parse_qdisc_details__(netdev, details, &sfq);
    error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
    if (!error) {
        sfq_install__(netdev, sfq.quantum, sfq.perturb);
    }
    return error;
}

/* Loads existing kernel SFQ configuration from 'nlmsg' into memory. */
static int
sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
{
    const struct tc_sfq_qopt *sfq;
    struct nlattr *nlattr;
    const char *kind;
    int error;

    error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
    if (error == 0) {
        sfq = nl_attr_get(nlattr);
        sfq_install__(netdev, sfq->quantum, sfq->perturb_period);
        return 0;
    }

    return error;
}

/* Frees the in-memory SFQ state attached to 'tc'. */
static void
sfq_tc_destroy(struct tc *tc)
{
    struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
    tc_destroy(tc);
    free(sfq);
}

/* Reports the cached SFQ parameters into 'details'. */
static int
sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
{
    const struct sfq *sfq = sfq_get__(netdev);
    smap_add_format(details, "quantum", "%u", sfq->quantum);
    smap_add_format(details, "perturb", "%u", sfq->perturb);
    return 0;
}

/* Reconfigures the in-memory SFQ state from 'details' and installs it. */
static int
sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
{
    struct sfq sfq;

    sfq_parse_qdisc_details__(netdev, details, &sfq);
    sfq_install__(netdev, sfq.quantum, sfq.perturb);
    sfq_get__(netdev)->quantum = sfq.quantum;
    sfq_get__(netdev)->perturb = sfq.perturb;
    return 0;
}

static const struct tc_ops tc_ops_sfq = {
    .linux_name = "sfq",
    .ovs_name = "linux-sfq",
    .n_queues = SFQ_N_QUEUES,
    .tc_install = sfq_tc_install,
    .tc_load = sfq_tc_load,
    .tc_destroy = sfq_tc_destroy,
    .qdisc_get = sfq_qdisc_get,
    .qdisc_set = sfq_qdisc_set,
};
/* HTB traffic control class. */

#define HTB_N_QUEUES 0xf000
#define HTB_RATE2QUANTUM 10

struct htb {
    struct tc tc;
    unsigned int max_rate;      /* In bytes/s. */
};

struct htb_class {
    struct tc_queue tc_queue;
    unsigned int min_rate;      /* In bytes/s. */
    unsigned int max_rate;      /* In bytes/s. */
    unsigned int burst;         /* In bytes. */
    unsigned int priority;      /* Lower values are higher priorities. */
};

/* Returns the HTB state attached to 'netdev_'. */
static struct htb *
htb_get__(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    return CONTAINER_OF(netdev->tc, struct htb, tc);
}

/* Attaches a freshly allocated in-memory HTB record (link cap 'max_rate'
 * bytes/s) to 'netdev_'.  Does not talk to the kernel. */
static void
htb_install__(struct netdev *netdev_, uint64_t max_rate)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct htb *htb;

    htb = xmalloc(sizeof *htb);
    tc_init(&htb->tc, &tc_ops_htb);
    htb->max_rate = max_rate;

    netdev->tc = &htb->tc;
}

/* Create an HTB qdisc.
 *
 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
static int
htb_setup_qdisc__(struct netdev *netdev)
{
    size_t opt_offset;
    struct tc_htb_glob opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    nl_msg_put_string(&request, TCA_KIND, "htb");

    memset(&opt, 0, sizeof opt);
    opt.rate2quantum = HTB_RATE2QUANTUM;
    /* NOTE(review): the two assignments below are not visible in the
     * extracted source; restored from upstream — verify. */
    opt.version = 3;
    opt.defcls = 1;

    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
    nl_msg_end_nested(&request, opt_offset);

    return tc_transact(&request, NULL);
}

/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
static int
htb_setup_class__(struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *class)
{
    size_t opt_offset;
    struct tc_htb_opt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;
    int mtu;

    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
    if (error) {
        VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
                     netdev_get_name(netdev));
        return error;
    }

    memset(&opt, 0, sizeof opt);
    tc_fill_rate(&opt.rate, class->min_rate, mtu);
    tc_fill_rate(&opt.ceil, class->max_rate, mtu);
    /* Makes sure the quantum is at least MTU.  Setting quantum will
     * make htb ignore the r2q for this class. */
    if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
        opt.quantum = mtu;
    }
    opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
    opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
    opt.prio = class->priority;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
                                         &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    nl_msg_put_string(&request, TCA_KIND, "htb");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
    tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
    tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate,
                     class->burst, class->priority, ovs_strerror(error));
    }
    return error;
}

/* Parses Netlink attributes in 'options' for HTB parameters and stores a
 * description of them into 'details'.  The description complies with the
 * specification given in the vswitch database documentation for linux-htb
 * queue details. */
static int
htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
{
    static const struct nl_policy tca_htb_policy[] = {
        [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
                            .min_len = sizeof(struct tc_htb_opt) },
    };

    struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
    const struct tc_htb_opt *htb;

    if (!nl_parse_nested(nl_options, tca_htb_policy,
                         attrs, ARRAY_SIZE(tca_htb_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse HTB class options");
        return EPROTO;
    }

    htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
    class->min_rate = htb->rate.rate;
    class->max_rate = htb->ceil.rate;
    class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
    class->priority = htb->prio;
    return 0;
}

/* Parses a class netlink message 'tcmsg', producing the queue id (if
 * 'queue_id' is nonnull), class options (if 'options' is nonnull), and
 * statistics (if 'stats' is nonnull). */
static int
htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
                  struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct nlattr *nl_options;
    unsigned int handle;
    int error;

    error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
    if (!error && queue_id) {
        unsigned int major = tc_get_major(handle);
        unsigned int minor = tc_get_minor(handle);
        if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
            *queue_id = minor - 1;
        } else {
            error = EPROTO;
        }
    }
    if (!error && options) {
        error = htb_parse_tca_options__(nl_options, options);
    }
    return error;
}

/* Extracts qdisc-level HTB configuration from 'details'.  A missing
 * "max-rate" falls back to the current link speed (or 100 Mbps). */
static void
htb_parse_qdisc_details__(struct netdev *netdev_,
                          const struct smap *details, struct htb_class *hc)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    hc->max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
    if (!hc->max_rate) {
        enum netdev_features current;

        netdev_linux_read_features(netdev);
        current = !netdev->get_features_error ? netdev->current : 0;
        hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
    }
    hc->min_rate = hc->max_rate;
    hc->burst = 0;
    hc->priority = 0;
}

/* Extracts per-class HTB configuration from 'details' into 'hc', clamping
 * the rates against the MTU and the qdisc-level cap. */
static int
htb_parse_class_details__(struct netdev *netdev,
                          const struct smap *details, struct htb_class *hc)
{
    const struct htb *htb = htb_get__(netdev);
    int mtu, error;
    unsigned long long int max_rate_bit;

    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
    if (error) {
        VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
                     netdev_get_name(netdev));
        return error;
    }

    /* HTB requires at least an mtu sized min-rate to send any traffic even
     * on uncongested links. */
    hc->min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
    hc->min_rate = MAX(hc->min_rate, mtu);
    hc->min_rate = MIN(hc->min_rate, htb->max_rate);

    /* max-rate */
    max_rate_bit = smap_get_ullong(details, "max-rate", 0);
    hc->max_rate = max_rate_bit ? max_rate_bit / 8 : htb->max_rate;
    hc->max_rate = MAX(hc->max_rate, hc->min_rate);
    hc->max_rate = MIN(hc->max_rate, htb->max_rate);

    /* burst
     *
     * According to hints in the documentation that I've read, it is important
     * that 'burst' be at least as big as the largest frame that might be
     * transmitted.  Also, making 'burst' a bit bigger than necessary is OK,
     * but having it a bit too small is a problem.  Since netdev_get_mtu()
     * doesn't include the Ethernet header, we need to add at least 14 (18?) to
     * the MTU.  We actually add 64, instead of 14, as a guard against
     * additional headers get tacked on somewhere that we're not aware of. */
    hc->burst = smap_get_ullong(details, "burst", 0) / 8;
    hc->burst = MAX(hc->burst, mtu + 64);

    /* priority */
    hc->priority = smap_get_ullong(details, "priority", 0);

    return 0;
}

/* Queries the kernel for the HTB class 'handle'/'parent', filling 'options'
 * and/or 'stats' when nonnull.  Returns 0 or a positive errno value. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (!error) {
        error = htb_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
    }
    return error;
}

/* Installs an HTB root qdisc plus its default class on 'netdev'. */
static int
htb_tc_install(struct netdev *netdev, const struct smap *details)
{
    int error;

    error = htb_setup_qdisc__(netdev);
    if (!error) {
        struct htb_class hc;

        htb_parse_qdisc_details__(netdev, details, &hc);
        error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
                                  tc_make_handle(1, 0), &hc);
        if (!error) {
            htb_install__(netdev, hc.max_rate);
        }
    }
    return error;
}

static struct htb_class *
htb_class_cast__(const struct tc_queue *queue)
{
    return CONTAINER_OF(queue, struct htb_class, tc_queue);
}

/* Creates or updates the in-memory record for queue 'queue_id'. */
static void
htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
                   const struct htb_class *hc)
{
    struct htb *htb = htb_get__(netdev);
    size_t hash = hash_int(queue_id, 0);
    struct tc_queue *queue;
    struct htb_class *hcp;

    queue = tc_find_queue__(netdev, queue_id, hash);
    if (queue) {
        hcp = htb_class_cast__(queue);
    } else {
        hcp = xmalloc(sizeof *hcp);
        queue = &hcp->tc_queue;
        queue->queue_id = queue_id;
        queue->created = time_msec();
        hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
    }

    hcp->min_rate = hc->min_rate;
    hcp->max_rate = hc->max_rate;
    hcp->burst = hc->burst;
    hcp->priority = hc->priority;
}

/* Loads the kernel's current HTB configuration (qdisc and all classes)
 * into in-memory state on 'netdev'. */
static int
htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct ofpbuf msg;
    struct queue_dump_state state;
    struct htb_class hc;

    /* Get qdisc options. */
    hc.max_rate = 0;
    htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
    htb_install__(netdev, hc.max_rate);

    /* Get queues. */
    if (!start_queue_dump(netdev, &state)) {
        return ENODEV;
    }
    while (nl_dump_next(&state.dump, &msg, &state.buf)) {
        unsigned int queue_id;

        if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
            htb_update_queue__(netdev, queue_id, &hc);
        }
    }
    finish_queue_dump(&state);

    return 0;
}

/* Frees all in-memory HTB state attached to 'tc'. */
static void
htb_tc_destroy(struct tc *tc)
{
    struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
    struct htb_class *hc;

    HMAP_FOR_EACH_POP (hc, tc_queue.hmap_node, &htb->tc.queues) {
        free(hc);
    }
    tc_destroy(tc);
    free(htb);
}

/* Reports the qdisc-level HTB configuration into 'details'. */
static int
htb_qdisc_get(const struct netdev *netdev, struct smap *details)
{
    const struct htb *htb = htb_get__(netdev);
    smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
    return 0;
}

/* Re-applies qdisc-level HTB configuration from 'details'. */
static int
htb_qdisc_set(struct netdev *netdev, const struct smap *details)
{
    struct htb_class hc;
    int error;

    htb_parse_qdisc_details__(netdev, details, &hc);
    error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
                              tc_make_handle(1, 0), &hc);
    if (!error) {
        htb_get__(netdev)->max_rate = hc.max_rate;
    }
    return error;
}

/* Reports the configuration of 'queue' into 'details'. */
static int
htb_class_get(const struct netdev *netdev OVS_UNUSED,
              const struct tc_queue *queue, struct smap *details)
{
    const struct htb_class *hc = htb_class_cast__(queue);

    smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
    if (hc->min_rate != hc->max_rate) {
        smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
    }
    smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
    smap_add_format(details, "priority", "%u", hc->priority);
    return 0;
}

/* Creates or reconfigures HTB class 'queue_id' from 'details'. */
static int
htb_class_set(struct netdev *netdev, unsigned int queue_id,
              const struct smap *details)
{
    struct htb_class hc;
    int error;

    error = htb_parse_class_details__(netdev, details, &hc);
    if (error) {
        return error;
    }

    error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
                              tc_make_handle(1, 0xfffe), &hc);
    if (error) {
        return error;
    }

    htb_update_queue__(netdev, queue_id, &hc);
    return 0;
}

/* Deletes HTB class 'queue' from the kernel and from memory. */
static int
htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
{
    struct htb_class *hc = htb_class_cast__(queue);
    struct htb *htb = htb_get__(netdev);
    int error;

    error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
    if (!error) {
        hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
        free(hc);
    }
    return error;
}

/* Retrieves kernel statistics for HTB class 'queue'. */
static int
htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
                    struct netdev_queue_stats *stats)
{
    return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
                             tc_make_handle(1, 0xfffe), NULL, stats);
}

/* Parses one dumped class message and reports its stats through 'cb'. */
static int
htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
                     const struct ofpbuf *nlmsg,
                     netdev_dump_queue_stats_cb *cb, void *aux)
{
    struct netdev_queue_stats stats;
    unsigned int handle, major, minor;
    int error;

    error = tc_parse_class(nlmsg, &handle, NULL, &stats);
    if (error) {
        return error;
    }

    major = tc_get_major(handle);
    minor = tc_get_minor(handle);
    if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
        (*cb)(minor - 1, &stats, aux);
    }
    return 0;
}

static const struct tc_ops tc_ops_htb = {
    .linux_name = "htb",
    .ovs_name = "linux-htb",
    .n_queues = HTB_N_QUEUES,
    .tc_install = htb_tc_install,
    .tc_load = htb_tc_load,
    .tc_destroy = htb_tc_destroy,
    .qdisc_get = htb_qdisc_get,
    .qdisc_set = htb_qdisc_set,
    .class_get = htb_class_get,
    .class_set = htb_class_set,
    .class_delete = htb_class_delete,
    .class_get_stats = htb_class_get_stats,
    .class_dump_stats = htb_class_dump_stats
};
4330 /* "linux-hfsc" traffic control class. */
4332 #define HFSC_N_QUEUES 0xf000
4340 struct tc_queue tc_queue
;
4345 static struct hfsc
*
4346 hfsc_get__(const struct netdev
*netdev_
)
4348 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4349 return CONTAINER_OF(netdev
->tc
, struct hfsc
, tc
);
4352 static struct hfsc_class
*
4353 hfsc_class_cast__(const struct tc_queue
*queue
)
4355 return CONTAINER_OF(queue
, struct hfsc_class
, tc_queue
);
4359 hfsc_install__(struct netdev
*netdev_
, uint32_t max_rate
)
4361 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4364 hfsc
= xmalloc(sizeof *hfsc
);
4365 tc_init(&hfsc
->tc
, &tc_ops_hfsc
);
4366 hfsc
->max_rate
= max_rate
;
4367 netdev
->tc
= &hfsc
->tc
;
4371 hfsc_update_queue__(struct netdev
*netdev
, unsigned int queue_id
,
4372 const struct hfsc_class
*hc
)
4376 struct hfsc_class
*hcp
;
4377 struct tc_queue
*queue
;
4379 hfsc
= hfsc_get__(netdev
);
4380 hash
= hash_int(queue_id
, 0);
4382 queue
= tc_find_queue__(netdev
, queue_id
, hash
);
4384 hcp
= hfsc_class_cast__(queue
);
4386 hcp
= xmalloc(sizeof *hcp
);
4387 queue
= &hcp
->tc_queue
;
4388 queue
->queue_id
= queue_id
;
4389 queue
->created
= time_msec();
4390 hmap_insert(&hfsc
->tc
.queues
, &queue
->hmap_node
, hash
);
4393 hcp
->min_rate
= hc
->min_rate
;
4394 hcp
->max_rate
= hc
->max_rate
;
4398 hfsc_parse_tca_options__(struct nlattr
*nl_options
, struct hfsc_class
*class)
4400 const struct tc_service_curve
*rsc
, *fsc
, *usc
;
4401 static const struct nl_policy tca_hfsc_policy
[] = {
4403 .type
= NL_A_UNSPEC
,
4405 .min_len
= sizeof(struct tc_service_curve
),
4408 .type
= NL_A_UNSPEC
,
4410 .min_len
= sizeof(struct tc_service_curve
),
4413 .type
= NL_A_UNSPEC
,
4415 .min_len
= sizeof(struct tc_service_curve
),
4418 struct nlattr
*attrs
[ARRAY_SIZE(tca_hfsc_policy
)];
4420 if (!nl_parse_nested(nl_options
, tca_hfsc_policy
,
4421 attrs
, ARRAY_SIZE(tca_hfsc_policy
))) {
4422 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options");
4426 rsc
= nl_attr_get(attrs
[TCA_HFSC_RSC
]);
4427 fsc
= nl_attr_get(attrs
[TCA_HFSC_FSC
]);
4428 usc
= nl_attr_get(attrs
[TCA_HFSC_USC
]);
4430 if (rsc
->m1
!= 0 || rsc
->d
!= 0 ||
4431 fsc
->m1
!= 0 || fsc
->d
!= 0 ||
4432 usc
->m1
!= 0 || usc
->d
!= 0) {
4433 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
4434 "Non-linear service curves are not supported.");
4438 if (rsc
->m2
!= fsc
->m2
) {
4439 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
4440 "Real-time service curves are not supported ");
4444 if (rsc
->m2
> usc
->m2
) {
4445 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
4446 "Min-rate service curve is greater than "
4447 "the max-rate service curve.");
4451 class->min_rate
= fsc
->m2
;
4452 class->max_rate
= usc
->m2
;
4457 hfsc_parse_tcmsg__(struct ofpbuf
*tcmsg
, unsigned int *queue_id
,
4458 struct hfsc_class
*options
,
4459 struct netdev_queue_stats
*stats
)
4462 unsigned int handle
;
4463 struct nlattr
*nl_options
;
4465 error
= tc_parse_class(tcmsg
, &handle
, &nl_options
, stats
);
4471 unsigned int major
, minor
;
4473 major
= tc_get_major(handle
);
4474 minor
= tc_get_minor(handle
);
4475 if (major
== 1 && minor
> 0 && minor
<= HFSC_N_QUEUES
) {
4476 *queue_id
= minor
- 1;
4483 error
= hfsc_parse_tca_options__(nl_options
, options
);
4490 hfsc_query_class__(const struct netdev
*netdev
, unsigned int handle
,
4491 unsigned int parent
, struct hfsc_class
*options
,
4492 struct netdev_queue_stats
*stats
)
4495 struct ofpbuf
*reply
;
4497 error
= tc_query_class(netdev
, handle
, parent
, &reply
);
4502 error
= hfsc_parse_tcmsg__(reply
, NULL
, options
, stats
);
4503 ofpbuf_delete(reply
);
4508 hfsc_parse_qdisc_details__(struct netdev
*netdev_
, const struct smap
*details
,
4509 struct hfsc_class
*class)
4511 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4513 uint32_t max_rate
= smap_get_ullong(details
, "max-rate", 0) / 8;
4515 enum netdev_features current
;
4517 netdev_linux_read_features(netdev
);
4518 current
= !netdev
->get_features_error
? netdev
->current
: 0;
4519 max_rate
= netdev_features_to_bps(current
, 100 * 1000 * 1000) / 8;
4522 class->min_rate
= max_rate
;
4523 class->max_rate
= max_rate
;
4527 hfsc_parse_class_details__(struct netdev
*netdev
,
4528 const struct smap
*details
,
4529 struct hfsc_class
* class)
4531 const struct hfsc
*hfsc
;
4532 uint32_t min_rate
, max_rate
;
4534 hfsc
= hfsc_get__(netdev
);
4536 min_rate
= smap_get_ullong(details
, "min-rate", 0) / 8;
4537 min_rate
= MAX(min_rate
, 1);
4538 min_rate
= MIN(min_rate
, hfsc
->max_rate
);
4540 max_rate
= smap_get_ullong(details
, "max-rate", hfsc
->max_rate
* 8) / 8;
4541 max_rate
= MAX(max_rate
, min_rate
);
4542 max_rate
= MIN(max_rate
, hfsc
->max_rate
);
4544 class->min_rate
= min_rate
;
4545 class->max_rate
= max_rate
;
4550 /* Create an HFSC qdisc.
4552 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
4554 hfsc_setup_qdisc__(struct netdev
* netdev
)
4556 struct tcmsg
*tcmsg
;
4557 struct ofpbuf request
;
4558 struct tc_hfsc_qopt opt
;
4560 tc_del_qdisc(netdev
);
4562 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
4563 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
4569 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
4570 tcmsg
->tcm_parent
= TC_H_ROOT
;
4572 memset(&opt
, 0, sizeof opt
);
4575 nl_msg_put_string(&request
, TCA_KIND
, "hfsc");
4576 nl_msg_put_unspec(&request
, TCA_OPTIONS
, &opt
, sizeof opt
);
4578 return tc_transact(&request
, NULL
);
4581 /* Create an HFSC class.
4583 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
4584 * sc rate <min_rate> ul rate <max_rate>" */
4586 hfsc_setup_class__(struct netdev
*netdev
, unsigned int handle
,
4587 unsigned int parent
, struct hfsc_class
*class)
4591 struct tcmsg
*tcmsg
;
4592 struct ofpbuf request
;
4593 struct tc_service_curve min
, max
;
4595 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWTCLASS
, NLM_F_CREATE
,
4602 tcmsg
->tcm_handle
= handle
;
4603 tcmsg
->tcm_parent
= parent
;
4607 min
.m2
= class->min_rate
;
4611 max
.m2
= class->max_rate
;
4613 nl_msg_put_string(&request
, TCA_KIND
, "hfsc");
4614 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
4615 nl_msg_put_unspec(&request
, TCA_HFSC_RSC
, &min
, sizeof min
);
4616 nl_msg_put_unspec(&request
, TCA_HFSC_FSC
, &min
, sizeof min
);
4617 nl_msg_put_unspec(&request
, TCA_HFSC_USC
, &max
, sizeof max
);
4618 nl_msg_end_nested(&request
, opt_offset
);
4620 error
= tc_transact(&request
, NULL
);
4622 VLOG_WARN_RL(&rl
, "failed to replace %s class %u:%u, parent %u:%u, "
4623 "min-rate %ubps, max-rate %ubps (%s)",
4624 netdev_get_name(netdev
),
4625 tc_get_major(handle
), tc_get_minor(handle
),
4626 tc_get_major(parent
), tc_get_minor(parent
),
4627 class->min_rate
, class->max_rate
, ovs_strerror(error
));
4634 hfsc_tc_install(struct netdev
*netdev
, const struct smap
*details
)
4637 struct hfsc_class
class;
4639 error
= hfsc_setup_qdisc__(netdev
);
4645 hfsc_parse_qdisc_details__(netdev
, details
, &class);
4646 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
4647 tc_make_handle(1, 0), &class);
4653 hfsc_install__(netdev
, class.max_rate
);
4658 hfsc_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4661 struct queue_dump_state state
;
4662 struct hfsc_class hc
;
4665 hfsc_query_class__(netdev
, tc_make_handle(1, 0xfffe), 0, &hc
, NULL
);
4666 hfsc_install__(netdev
, hc
.max_rate
);
4668 if (!start_queue_dump(netdev
, &state
)) {
4672 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
4673 unsigned int queue_id
;
4675 if (!hfsc_parse_tcmsg__(&msg
, &queue_id
, &hc
, NULL
)) {
4676 hfsc_update_queue__(netdev
, queue_id
, &hc
);
4680 finish_queue_dump(&state
);
4685 hfsc_tc_destroy(struct tc
*tc
)
4688 struct hfsc_class
*hc
, *next
;
4690 hfsc
= CONTAINER_OF(tc
, struct hfsc
, tc
);
4692 HMAP_FOR_EACH_SAFE (hc
, next
, tc_queue
.hmap_node
, &hfsc
->tc
.queues
) {
4693 hmap_remove(&hfsc
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
4702 hfsc_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
4704 const struct hfsc
*hfsc
;
4705 hfsc
= hfsc_get__(netdev
);
4706 smap_add_format(details
, "max-rate", "%llu", 8ULL * hfsc
->max_rate
);
4711 hfsc_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
4714 struct hfsc_class
class;
4716 hfsc_parse_qdisc_details__(netdev
, details
, &class);
4717 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
4718 tc_make_handle(1, 0), &class);
4721 hfsc_get__(netdev
)->max_rate
= class.max_rate
;
4728 hfsc_class_get(const struct netdev
*netdev OVS_UNUSED
,
4729 const struct tc_queue
*queue
, struct smap
*details
)
4731 const struct hfsc_class
*hc
;
4733 hc
= hfsc_class_cast__(queue
);
4734 smap_add_format(details
, "min-rate", "%llu", 8ULL * hc
->min_rate
);
4735 if (hc
->min_rate
!= hc
->max_rate
) {
4736 smap_add_format(details
, "max-rate", "%llu", 8ULL * hc
->max_rate
);
4742 hfsc_class_set(struct netdev
*netdev
, unsigned int queue_id
,
4743 const struct smap
*details
)
4746 struct hfsc_class
class;
4748 error
= hfsc_parse_class_details__(netdev
, details
, &class);
4753 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, queue_id
+ 1),
4754 tc_make_handle(1, 0xfffe), &class);
4759 hfsc_update_queue__(netdev
, queue_id
, &class);
4764 hfsc_class_delete(struct netdev
*netdev
, struct tc_queue
*queue
)
4768 struct hfsc_class
*hc
;
4770 hc
= hfsc_class_cast__(queue
);
4771 hfsc
= hfsc_get__(netdev
);
4773 error
= tc_delete_class(netdev
, tc_make_handle(1, queue
->queue_id
+ 1));
4775 hmap_remove(&hfsc
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
4782 hfsc_class_get_stats(const struct netdev
*netdev
, const struct tc_queue
*queue
,
4783 struct netdev_queue_stats
*stats
)
4785 return hfsc_query_class__(netdev
, tc_make_handle(1, queue
->queue_id
+ 1),
4786 tc_make_handle(1, 0xfffe), NULL
, stats
);
4790 hfsc_class_dump_stats(const struct netdev
*netdev OVS_UNUSED
,
4791 const struct ofpbuf
*nlmsg
,
4792 netdev_dump_queue_stats_cb
*cb
, void *aux
)
4794 struct netdev_queue_stats stats
;
4795 unsigned int handle
, major
, minor
;
4798 error
= tc_parse_class(nlmsg
, &handle
, NULL
, &stats
);
4803 major
= tc_get_major(handle
);
4804 minor
= tc_get_minor(handle
);
4805 if (major
== 1 && minor
> 0 && minor
<= HFSC_N_QUEUES
) {
4806 (*cb
)(minor
- 1, &stats
, aux
);
4811 static const struct tc_ops tc_ops_hfsc
= {
4812 .linux_name
= "hfsc",
4813 .ovs_name
= "linux-hfsc",
4814 .n_queues
= HFSC_N_QUEUES
, /* n_queues */
4815 .tc_install
= hfsc_tc_install
,
4816 .tc_load
= hfsc_tc_load
,
4817 .tc_destroy
= hfsc_tc_destroy
,
4818 .qdisc_get
= hfsc_qdisc_get
,
4819 .qdisc_set
= hfsc_qdisc_set
,
4820 .class_get
= hfsc_class_get
,
4821 .class_set
= hfsc_class_set
,
4822 .class_delete
= hfsc_class_delete
,
4823 .class_get_stats
= hfsc_class_get_stats
,
4824 .class_dump_stats
= hfsc_class_dump_stats
,
4827 /* "linux-noop" traffic control class. */
4830 noop_install__(struct netdev
*netdev_
)
4832 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4833 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_default
);
4835 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
4839 noop_tc_install(struct netdev
*netdev
,
4840 const struct smap
*details OVS_UNUSED
)
4842 noop_install__(netdev
);
4847 noop_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4849 noop_install__(netdev
);
4853 static const struct tc_ops tc_ops_noop
= {
4854 .ovs_name
= "linux-noop", /* ovs_name */
4855 .tc_install
= noop_tc_install
,
4856 .tc_load
= noop_tc_load
,
4859 /* "linux-default" traffic control class.
4861 * This class represents the default, unnamed Linux qdisc. It corresponds to
4862 * the "" (empty string) QoS type in the OVS database. */
4865 default_install__(struct netdev
*netdev_
)
4867 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4868 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_default
);
4870 /* Nothing but a tc class implementation is allowed to write to a tc. This
4871 * class never does that, so we can legitimately use a const tc object. */
4872 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
4876 default_tc_install(struct netdev
*netdev
,
4877 const struct smap
*details OVS_UNUSED
)
4879 default_install__(netdev
);
4884 default_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4886 default_install__(netdev
);
4890 static const struct tc_ops tc_ops_default
= {
4891 .ovs_name
= "", /* ovs_name */
4892 .tc_install
= default_tc_install
,
4893 .tc_load
= default_tc_load
,
4896 /* "linux-other" traffic control class.
4901 other_tc_load(struct netdev
*netdev_
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4903 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4904 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_other
);
4906 /* Nothing but a tc class implementation is allowed to write to a tc. This
4907 * class never does that, so we can legitimately use a const tc object. */
4908 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
4912 static const struct tc_ops tc_ops_other
= {
4913 .ovs_name
= "linux-other",
4914 .tc_load
= other_tc_load
,
/* Traffic control. */

/* Number of kernel "tc" ticks per second. */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 */
static unsigned int buffer_hz;
/* Starts a tc netlink request of 'type' (e.g. RTM_NEWQDISC) with 'flags'
 * for 'netdev', storing the message in 'request'.  Returns the embedded
 * tcmsg header, or NULL if the device's ifindex cannot be obtained. */
static struct tcmsg *
netdev_linux_tc_make_request(const struct netdev *netdev, int type,
                             unsigned int flags, struct ofpbuf *request)
{
    int ifindex;
    int error;

    error = get_ifindex(netdev, &ifindex);
    if (error) {
        return NULL;
    }

    return tc_make_request(ifindex, type, flags, request);
}
/* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
 * of 'kbits_burst'.
 *
 * This function is equivalent to running:
 *     /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
 *              basic police rate <kbits_rate>kbit burst <kbits_burst>k
 *              mtu 65535 drop
 *
 * The configuration and stats may be seen with the following command:
 *     /sbin/tc -s filter show dev <devname> parent ffff:
 *
 * Returns 0 if successful, otherwise a positive errno value.
 */
static int
tc_add_policer(struct netdev *netdev,
               uint32_t kbits_rate, uint32_t kbits_burst)
{
    struct tc_police tc_police;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    size_t basic_offset;
    size_t police_offset;
    int error;
    /* NOTE(review): the mtu value below is not visible in the extracted
     * source; restored from upstream (65535) — verify. */
    int mtu = 65535;

    memset(&tc_police, 0, sizeof tc_police);
    tc_police.action = TC_POLICE_SHOT;
    tc_police.mtu = mtu;
    tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);

    /* The following appears wrong in one way: In networking a kilobit is
     * usually 1000 bits but this uses 1024 bits.
     *
     * However if you "fix" those problems then "tc filter show ..." shows
     * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
     * 1,000,000 bits, whereas this actually ends up doing the right thing from
     * tc's point of view.  Whatever. */
    tc_police.burst = tc_bytes_to_ticks(
        tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 8);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTFILTER,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
    tcmsg->tcm_info = tc_make_handle(49,
                                     (OVS_FORCE uint16_t) htons(ETH_P_ALL));

    nl_msg_put_string(&request, TCA_KIND, "basic");
    basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
    nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
    tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
    nl_msg_end_nested(&request, police_offset);
    nl_msg_end_nested(&request, basic_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        return error;
    }

    return 0;
}
5024 /* The values in psched are not individually very meaningful, but they are
5025 * important. The tables below show some values seen in the wild.
5029 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
5030 * (Before that, there are hints that it was 1000000000.)
5032 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
5036 * -----------------------------------
5037 * [1] 000c8000 000f4240 000f4240 00000064
5038 * [2] 000003e8 00000400 000f4240 3b9aca00
5039 * [3] 000003e8 00000400 000f4240 3b9aca00
5040 * [4] 000003e8 00000400 000f4240 00000064
5041 * [5] 000003e8 00000040 000f4240 3b9aca00
5042 * [6] 000003e8 00000040 000f4240 000000f9
5044 * a b c d ticks_per_s buffer_hz
5045 * ------- --------- ---------- ------------- ----------- -------------
5046 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
5047 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
5048 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
5049 * [4] 1,000 1,024 1,000,000 100 976,562 100
5050 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
5051 * [6] 1,000 64 1,000,000 249 15,625,000 249
5053 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
5054 * [2] 2.6.26-1-686-bigmem from Debian lenny
5055 * [3] 2.6.26-2-sparc64 from Debian lenny
5056 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
5057 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
5058 * [6] 2.6.34 from kernel.org on KVM
5060 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
5061 static const char fn
[] = "/proc/net/psched";
5062 unsigned int a
, b
, c
, d
;
5065 if (!ovsthread_once_start(&once
)) {
5072 stream
= fopen(fn
, "r");
5074 VLOG_WARN("%s: open failed: %s", fn
, ovs_strerror(errno
));
5078 if (fscanf(stream
, "%x %x %x %x", &a
, &b
, &c
, &d
) != 4) {
5079 VLOG_WARN("%s: read failed", fn
);
5083 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn
, a
, b
, c
, d
);
5086 if (!a
|| !b
|| !c
) {
5087 VLOG_WARN("%s: invalid scheduler parameters", fn
);
5091 ticks_per_s
= (double) a
* c
/ b
;
5095 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
5098 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn
, ticks_per_s
, buffer_hz
);
5101 ovsthread_once_done(&once
);
5104 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
5105 * rate of 'rate' bytes per second. */
5107 tc_ticks_to_bytes(unsigned int rate
, unsigned int ticks
)
5110 return (rate
* ticks
) / ticks_per_s
;
5113 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
5114 * rate of 'rate' bytes per second. */
5116 tc_bytes_to_ticks(unsigned int rate
, unsigned int size
)
5119 return rate
? ((unsigned long long int) ticks_per_s
* size
) / rate
: 0;
5122 /* Returns the number of bytes that need to be reserved for qdisc buffering at
5123 * a transmission rate of 'rate' bytes per second. */
5125 tc_buffer_per_jiffy(unsigned int rate
)
5128 return rate
/ buffer_hz
;
5131 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
5132 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
5133 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
5134 * stores NULL into it if it is absent.
5136 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
5139 * Returns 0 if successful, otherwise a positive errno value. */
5141 tc_parse_qdisc(const struct ofpbuf
*msg
, const char **kind
,
5142 struct nlattr
**options
)
5144 static const struct nl_policy tca_policy
[] = {
5145 [TCA_KIND
] = { .type
= NL_A_STRING
, .optional
= false },
5146 [TCA_OPTIONS
] = { .type
= NL_A_NESTED
, .optional
= true },
5148 struct nlattr
*ta
[ARRAY_SIZE(tca_policy
)];
5150 if (!nl_policy_parse(msg
, NLMSG_HDRLEN
+ sizeof(struct tcmsg
),
5151 tca_policy
, ta
, ARRAY_SIZE(ta
))) {
5152 VLOG_WARN_RL(&rl
, "failed to parse qdisc message");
5157 *kind
= nl_attr_get_string(ta
[TCA_KIND
]);
5161 *options
= ta
[TCA_OPTIONS
];
5176 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
5177 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
5178 * into '*options', and its queue statistics into '*stats'. Any of the output
5179 * arguments may be null.
5181 * Returns 0 if successful, otherwise a positive errno value. */
5183 tc_parse_class(const struct ofpbuf
*msg
, unsigned int *handlep
,
5184 struct nlattr
**options
, struct netdev_queue_stats
*stats
)
5186 static const struct nl_policy tca_policy
[] = {
5187 [TCA_OPTIONS
] = { .type
= NL_A_NESTED
, .optional
= false },
5188 [TCA_STATS2
] = { .type
= NL_A_NESTED
, .optional
= false },
5190 struct nlattr
*ta
[ARRAY_SIZE(tca_policy
)];
5192 if (!nl_policy_parse(msg
, NLMSG_HDRLEN
+ sizeof(struct tcmsg
),
5193 tca_policy
, ta
, ARRAY_SIZE(ta
))) {
5194 VLOG_WARN_RL(&rl
, "failed to parse class message");
5199 struct tcmsg
*tc
= ofpbuf_at_assert(msg
, NLMSG_HDRLEN
, sizeof *tc
);
5200 *handlep
= tc
->tcm_handle
;
5204 *options
= ta
[TCA_OPTIONS
];
5208 const struct gnet_stats_queue
*gsq
;
5209 struct gnet_stats_basic gsb
;
5211 static const struct nl_policy stats_policy
[] = {
5212 [TCA_STATS_BASIC
] = { .type
= NL_A_UNSPEC
, .optional
= false,
5213 .min_len
= sizeof gsb
},
5214 [TCA_STATS_QUEUE
] = { .type
= NL_A_UNSPEC
, .optional
= false,
5215 .min_len
= sizeof *gsq
},
5217 struct nlattr
*sa
[ARRAY_SIZE(stats_policy
)];
5219 if (!nl_parse_nested(ta
[TCA_STATS2
], stats_policy
,
5220 sa
, ARRAY_SIZE(sa
))) {
5221 VLOG_WARN_RL(&rl
, "failed to parse class stats");
5225 /* Alignment issues screw up the length of struct gnet_stats_basic on
5226 * some arch/bitsize combinations. Newer versions of Linux have a
5227 * struct gnet_stats_basic_packed, but we can't depend on that. The
5228 * easiest thing to do is just to make a copy. */
5229 memset(&gsb
, 0, sizeof gsb
);
5230 memcpy(&gsb
, nl_attr_get(sa
[TCA_STATS_BASIC
]),
5231 MIN(nl_attr_get_size(sa
[TCA_STATS_BASIC
]), sizeof gsb
));
5232 stats
->tx_bytes
= gsb
.bytes
;
5233 stats
->tx_packets
= gsb
.packets
;
5235 gsq
= nl_attr_get(sa
[TCA_STATS_QUEUE
]);
5236 stats
->tx_errors
= gsq
->drops
;
5246 memset(stats
, 0, sizeof *stats
);
5251 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
5254 tc_query_class(const struct netdev
*netdev
,
5255 unsigned int handle
, unsigned int parent
,
5256 struct ofpbuf
**replyp
)
5258 struct ofpbuf request
;
5259 struct tcmsg
*tcmsg
;
5262 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_GETTCLASS
, NLM_F_ECHO
,
5267 tcmsg
->tcm_handle
= handle
;
5268 tcmsg
->tcm_parent
= parent
;
5270 error
= tc_transact(&request
, replyp
);
5272 VLOG_WARN_RL(&rl
, "query %s class %u:%u (parent %u:%u) failed (%s)",
5273 netdev_get_name(netdev
),
5274 tc_get_major(handle
), tc_get_minor(handle
),
5275 tc_get_major(parent
), tc_get_minor(parent
),
5276 ovs_strerror(error
));
5281 /* Equivalent to "tc class del dev <name> handle <handle>". */
5283 tc_delete_class(const struct netdev
*netdev
, unsigned int handle
)
5285 struct ofpbuf request
;
5286 struct tcmsg
*tcmsg
;
5289 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_DELTCLASS
, 0, &request
);
5293 tcmsg
->tcm_handle
= handle
;
5294 tcmsg
->tcm_parent
= 0;
5296 error
= tc_transact(&request
, NULL
);
5298 VLOG_WARN_RL(&rl
, "delete %s class %u:%u failed (%s)",
5299 netdev_get_name(netdev
),
5300 tc_get_major(handle
), tc_get_minor(handle
),
5301 ovs_strerror(error
));
5306 /* Equivalent to "tc qdisc del dev <name> root". */
5308 tc_del_qdisc(struct netdev
*netdev_
)
5310 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5311 struct ofpbuf request
;
5312 struct tcmsg
*tcmsg
;
5315 tcmsg
= netdev_linux_tc_make_request(netdev_
, RTM_DELQDISC
, 0, &request
);
5319 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
5320 tcmsg
->tcm_parent
= TC_H_ROOT
;
5322 error
= tc_transact(&request
, NULL
);
5323 if (error
== EINVAL
) {
5324 /* EINVAL probably means that the default qdisc was in use, in which
5325 * case we've accomplished our purpose. */
5328 if (!error
&& netdev
->tc
) {
5329 if (netdev
->tc
->ops
->tc_destroy
) {
5330 netdev
->tc
->ops
->tc_destroy(netdev
->tc
);
5338 getqdisc_is_safe(void)
5340 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
5341 static bool safe
= false;
5343 if (ovsthread_once_start(&once
)) {
5344 struct utsname utsname
;
5347 if (uname(&utsname
) == -1) {
5348 VLOG_WARN("uname failed (%s)", ovs_strerror(errno
));
5349 } else if (!ovs_scan(utsname
.release
, "%d.%d", &major
, &minor
)) {
5350 VLOG_WARN("uname reported bad OS release (%s)", utsname
.release
);
5351 } else if (major
< 2 || (major
== 2 && minor
< 35)) {
5352 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5357 ovsthread_once_done(&once
);
5362 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
5363 * kernel to determine what they are. Returns 0 if successful, otherwise a
5364 * positive errno value. */
5366 tc_query_qdisc(const struct netdev
*netdev_
)
5368 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5369 struct ofpbuf request
, *qdisc
;
5370 const struct tc_ops
*ops
;
5371 struct tcmsg
*tcmsg
;
5379 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5380 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5381 * 2.6.35 without that fix backported to it.
5383 * To avoid the OOPS, we must not make a request that would attempt to dump
5384 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5385 * few others. There are a few ways that I can see to do this, but most of
5386 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5387 * technique chosen here is to assume that any non-default qdisc that we
5388 * create will have a class with handle 1:0. The built-in qdiscs only have
5389 * a class with handle 0:0.
5391 * On Linux 2.6.35+ we use the straightforward method because it allows us
5392 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5393 * in such a case we get no response at all from the kernel (!) if a
5394 * builtin qdisc is in use (which is later caught by "!error &&
5395 * !qdisc->size"). */
5396 tcmsg
= netdev_linux_tc_make_request(netdev_
, RTM_GETQDISC
, NLM_F_ECHO
,
5401 tcmsg
->tcm_handle
= tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5402 tcmsg
->tcm_parent
= getqdisc_is_safe() ? TC_H_ROOT
: 0;
5404 /* Figure out what tc class to instantiate. */
5405 error
= tc_transact(&request
, &qdisc
);
5406 if (!error
&& qdisc
->size
) {
5409 error
= tc_parse_qdisc(qdisc
, &kind
, NULL
);
5411 ops
= &tc_ops_other
;
5413 ops
= tc_lookup_linux_name(kind
);
5415 static struct vlog_rate_limit rl2
= VLOG_RATE_LIMIT_INIT(1, 1);
5416 VLOG_DBG_RL(&rl2
, "unknown qdisc \"%s\"", kind
);
5418 ops
= &tc_ops_other
;
5421 } else if ((!error
&& !qdisc
->size
) || error
== ENOENT
) {
5422 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5423 * set up by some other entity that doesn't have a handle 1:0. We will
5424 * assume that it's the system default qdisc. */
5425 ops
= &tc_ops_default
;
5428 /* Who knows? Maybe the device got deleted. */
5429 VLOG_WARN_RL(&rl
, "query %s qdisc failed (%s)",
5430 netdev_get_name(netdev_
), ovs_strerror(error
));
5431 ops
= &tc_ops_other
;
5434 /* Instantiate it. */
5435 load_error
= ops
->tc_load(CONST_CAST(struct netdev
*, netdev_
), qdisc
);
5436 ovs_assert((load_error
== 0) == (netdev
->tc
!= NULL
));
5437 ofpbuf_delete(qdisc
);
5439 return error
? error
: load_error
;
5442 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5443 approximate the time to transmit packets of various lengths. For an MTU of
5444 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5445 represents two possible packet lengths; for a MTU of 513 through 1024, four
5446 possible lengths; and so on.
5448 Returns, for the specified 'mtu', the number of bits that packet lengths
5449 need to be shifted right to fit within such a 256-entry table. */
5451 tc_calc_cell_log(unsigned int mtu
)
5456 mtu
= ETH_PAYLOAD_MAX
;
5458 mtu
+= ETH_HEADER_LEN
+ VLAN_HEADER_LEN
;
5460 for (cell_log
= 0; mtu
>= 256; cell_log
++) {
5467 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5470 tc_fill_rate(struct tc_ratespec
*rate
, uint64_t Bps
, int mtu
)
5472 memset(rate
, 0, sizeof *rate
);
5473 rate
->cell_log
= tc_calc_cell_log(mtu
);
5474 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5475 /* rate->cell_align = 0; */ /* distro headers. */
5476 rate
->mpu
= ETH_TOTAL_MIN
;
5480 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
5481 * attribute of the specified "type".
5483 * See tc_calc_cell_log() above for a description of "rtab"s. */
5485 tc_put_rtab(struct ofpbuf
*msg
, uint16_t type
, const struct tc_ratespec
*rate
)
5490 rtab
= nl_msg_put_unspec_uninit(msg
, type
, TC_RTAB_SIZE
);
5491 for (i
= 0; i
< TC_RTAB_SIZE
/ sizeof *rtab
; i
++) {
5492 unsigned packet_size
= (i
+ 1) << rate
->cell_log
;
5493 if (packet_size
< rate
->mpu
) {
5494 packet_size
= rate
->mpu
;
5496 rtab
[i
] = tc_bytes_to_ticks(rate
->rate
, packet_size
);
5500 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
5501 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
5502 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
static uint32_t
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    /* Never buffer less than one timer tick's worth of data plus one MTU,
     * or the qdisc cannot sustain the configured rate. */
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
}
5511 /* Linux-only functions declared in netdev-linux.h */
5513 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5514 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5516 netdev_linux_ethtool_set_flag(struct netdev
*netdev
, uint32_t flag
,
5517 const char *flag_name
, bool enable
)
5519 const char *netdev_name
= netdev_get_name(netdev
);
5520 struct ethtool_value evalue
;
5524 COVERAGE_INC(netdev_get_ethtool
);
5525 memset(&evalue
, 0, sizeof evalue
);
5526 error
= netdev_linux_do_ethtool(netdev_name
,
5527 (struct ethtool_cmd
*)&evalue
,
5528 ETHTOOL_GFLAGS
, "ETHTOOL_GFLAGS");
5533 COVERAGE_INC(netdev_set_ethtool
);
5534 new_flags
= (evalue
.data
& ~flag
) | (enable
? flag
: 0);
5535 if (new_flags
== evalue
.data
) {
5538 evalue
.data
= new_flags
;
5539 error
= netdev_linux_do_ethtool(netdev_name
,
5540 (struct ethtool_cmd
*)&evalue
,
5541 ETHTOOL_SFLAGS
, "ETHTOOL_SFLAGS");
5546 COVERAGE_INC(netdev_get_ethtool
);
5547 memset(&evalue
, 0, sizeof evalue
);
5548 error
= netdev_linux_do_ethtool(netdev_name
,
5549 (struct ethtool_cmd
*)&evalue
,
5550 ETHTOOL_GFLAGS
, "ETHTOOL_GFLAGS");
5555 if (new_flags
!= evalue
.data
) {
5556 VLOG_WARN_RL(&rl
, "attempt to %s ethtool %s flag on network "
5557 "device %s failed", enable
? "enable" : "disable",
5558 flag_name
, netdev_name
);
5565 /* Utility functions. */
5567 /* Copies 'src' into 'dst', performing format conversion in the process. */
5569 netdev_stats_from_rtnl_link_stats(struct netdev_stats
*dst
,
5570 const struct rtnl_link_stats
*src
)
5572 dst
->rx_packets
= src
->rx_packets
;
5573 dst
->tx_packets
= src
->tx_packets
;
5574 dst
->rx_bytes
= src
->rx_bytes
;
5575 dst
->tx_bytes
= src
->tx_bytes
;
5576 dst
->rx_errors
= src
->rx_errors
;
5577 dst
->tx_errors
= src
->tx_errors
;
5578 dst
->rx_dropped
= src
->rx_dropped
;
5579 dst
->tx_dropped
= src
->tx_dropped
;
5580 dst
->multicast
= src
->multicast
;
5581 dst
->collisions
= src
->collisions
;
5582 dst
->rx_length_errors
= src
->rx_length_errors
;
5583 dst
->rx_over_errors
= src
->rx_over_errors
;
5584 dst
->rx_crc_errors
= src
->rx_crc_errors
;
5585 dst
->rx_frame_errors
= src
->rx_frame_errors
;
5586 dst
->rx_fifo_errors
= src
->rx_fifo_errors
;
5587 dst
->rx_missed_errors
= src
->rx_missed_errors
;
5588 dst
->tx_aborted_errors
= src
->tx_aborted_errors
;
5589 dst
->tx_carrier_errors
= src
->tx_carrier_errors
;
5590 dst
->tx_fifo_errors
= src
->tx_fifo_errors
;
5591 dst
->tx_heartbeat_errors
= src
->tx_heartbeat_errors
;
5592 dst
->tx_window_errors
= src
->tx_window_errors
;
5595 /* Copies 'src' into 'dst', performing format conversion in the process. */
5597 netdev_stats_from_rtnl_link_stats64(struct netdev_stats
*dst
,
5598 const struct rtnl_link_stats64
*src
)
5600 dst
->rx_packets
= src
->rx_packets
;
5601 dst
->tx_packets
= src
->tx_packets
;
5602 dst
->rx_bytes
= src
->rx_bytes
;
5603 dst
->tx_bytes
= src
->tx_bytes
;
5604 dst
->rx_errors
= src
->rx_errors
;
5605 dst
->tx_errors
= src
->tx_errors
;
5606 dst
->rx_dropped
= src
->rx_dropped
;
5607 dst
->tx_dropped
= src
->tx_dropped
;
5608 dst
->multicast
= src
->multicast
;
5609 dst
->collisions
= src
->collisions
;
5610 dst
->rx_length_errors
= src
->rx_length_errors
;
5611 dst
->rx_over_errors
= src
->rx_over_errors
;
5612 dst
->rx_crc_errors
= src
->rx_crc_errors
;
5613 dst
->rx_frame_errors
= src
->rx_frame_errors
;
5614 dst
->rx_fifo_errors
= src
->rx_fifo_errors
;
5615 dst
->rx_missed_errors
= src
->rx_missed_errors
;
5616 dst
->tx_aborted_errors
= src
->tx_aborted_errors
;
5617 dst
->tx_carrier_errors
= src
->tx_carrier_errors
;
5618 dst
->tx_fifo_errors
= src
->tx_fifo_errors
;
5619 dst
->tx_heartbeat_errors
= src
->tx_heartbeat_errors
;
5620 dst
->tx_window_errors
= src
->tx_window_errors
;
5624 get_stats_via_netlink(const struct netdev
*netdev_
, struct netdev_stats
*stats
)
5626 struct ofpbuf request
;
5627 struct ofpbuf
*reply
;
5630 /* Filtering all counters by default */
5631 memset(stats
, 0xFF, sizeof(struct netdev_stats
));
5633 ofpbuf_init(&request
, 0);
5634 nl_msg_put_nlmsghdr(&request
,
5635 sizeof(struct ifinfomsg
) + NL_ATTR_SIZE(IFNAMSIZ
),
5636 RTM_GETLINK
, NLM_F_REQUEST
);
5637 ofpbuf_put_zeros(&request
, sizeof(struct ifinfomsg
));
5638 nl_msg_put_string(&request
, IFLA_IFNAME
, netdev_get_name(netdev_
));
5639 error
= nl_transact(NETLINK_ROUTE
, &request
, &reply
);
5640 ofpbuf_uninit(&request
);
5645 if (ofpbuf_try_pull(reply
, NLMSG_HDRLEN
+ sizeof(struct ifinfomsg
))) {
5646 const struct nlattr
*a
= nl_attr_find(reply
, 0, IFLA_STATS64
);
5647 if (a
&& nl_attr_get_size(a
) >= sizeof(struct rtnl_link_stats64
)) {
5648 netdev_stats_from_rtnl_link_stats64(stats
, nl_attr_get(a
));
5651 a
= nl_attr_find(reply
, 0, IFLA_STATS
);
5652 if (a
&& nl_attr_get_size(a
) >= sizeof(struct rtnl_link_stats
)) {
5653 netdev_stats_from_rtnl_link_stats(stats
, nl_attr_get(a
));
5656 VLOG_WARN_RL(&rl
, "RTM_GETLINK reply lacks stats");
5661 VLOG_WARN_RL(&rl
, "short RTM_GETLINK reply");
5666 ofpbuf_delete(reply
);
5671 get_flags(const struct netdev
*dev
, unsigned int *flags
)
5677 error
= af_inet_ifreq_ioctl(dev
->name
, &ifr
, SIOCGIFFLAGS
, "SIOCGIFFLAGS");
5679 *flags
= ifr
.ifr_flags
;
5685 set_flags(const char *name
, unsigned int flags
)
5689 ifr
.ifr_flags
= flags
;
5690 return af_inet_ifreq_ioctl(name
, &ifr
, SIOCSIFFLAGS
, "SIOCSIFFLAGS");
5694 linux_get_ifindex(const char *netdev_name
)
5699 ovs_strzcpy(ifr
.ifr_name
, netdev_name
, sizeof ifr
.ifr_name
);
5700 COVERAGE_INC(netdev_get_ifindex
);
5702 error
= af_inet_ioctl(SIOCGIFINDEX
, &ifr
);
5704 /* ENODEV probably means that a vif disappeared asynchronously and
5705 * hasn't been removed from the database yet, so reduce the log level
5706 * to INFO for that case. */
5707 VLOG_RL(&rl
, error
== ENODEV
? VLL_INFO
: VLL_ERR
,
5708 "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5709 netdev_name
, ovs_strerror(error
));
5712 return ifr
.ifr_ifindex
;
5716 get_ifindex(const struct netdev
*netdev_
, int *ifindexp
)
5718 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5720 if (!(netdev
->cache_valid
& VALID_IFINDEX
)) {
5721 netdev_linux_update_via_netlink(netdev
);
5724 if (!(netdev
->cache_valid
& VALID_IFINDEX
)) {
5725 /* Fall back to ioctl if netlink fails */
5726 int ifindex
= linux_get_ifindex(netdev_get_name(netdev_
));
5729 netdev
->get_ifindex_error
= -ifindex
;
5730 netdev
->ifindex
= 0;
5732 netdev
->get_ifindex_error
= 0;
5733 netdev
->ifindex
= ifindex
;
5735 netdev
->cache_valid
|= VALID_IFINDEX
;
5738 *ifindexp
= netdev
->ifindex
;
5739 return netdev
->get_ifindex_error
;
5743 netdev_linux_update_via_netlink(struct netdev_linux
*netdev
)
5745 struct ofpbuf request
;
5746 struct ofpbuf
*reply
;
5747 struct rtnetlink_change chg
;
5748 struct rtnetlink_change
*change
= &chg
;
5751 ofpbuf_init(&request
, 0);
5752 nl_msg_put_nlmsghdr(&request
,
5753 sizeof(struct ifinfomsg
) + NL_ATTR_SIZE(IFNAMSIZ
),
5754 RTM_GETLINK
, NLM_F_REQUEST
);
5755 ofpbuf_put_zeros(&request
, sizeof(struct ifinfomsg
));
5757 /* The correct identifiers for a Linux device are netnsid and ifindex,
5758 * but ifindex changes as the port is moved to another network namespace
5759 * and the interface name statically stored in ovsdb. */
5760 nl_msg_put_string(&request
, IFLA_IFNAME
, netdev_get_name(&netdev
->up
));
5761 if (netdev_linux_netnsid_is_remote(netdev
)) {
5762 nl_msg_push_u32(&request
, IFLA_IF_NETNSID
, netdev
->netnsid
);
5764 error
= nl_transact(NETLINK_ROUTE
, &request
, &reply
);
5765 ofpbuf_uninit(&request
);
5767 ofpbuf_delete(reply
);
5771 if (rtnetlink_parse(reply
, change
)
5772 && change
->nlmsg_type
== RTM_NEWLINK
) {
5773 bool changed
= false;
5776 /* Update netdev from rtnl msg and increment its seq if needed. */
5777 if ((change
->ifi_flags
^ netdev
->ifi_flags
) & IFF_RUNNING
) {
5778 netdev
->carrier_resets
++;
5781 if (change
->ifi_flags
!= netdev
->ifi_flags
) {
5782 netdev
->ifi_flags
= change
->ifi_flags
;
5785 if (change
->mtu
&& change
->mtu
!= netdev
->mtu
) {
5786 netdev
->mtu
= change
->mtu
;
5787 netdev
->cache_valid
|= VALID_MTU
;
5788 netdev
->netdev_mtu_error
= 0;
5791 if (!eth_addr_is_zero(change
->mac
)
5792 && !eth_addr_equals(change
->mac
, netdev
->etheraddr
)) {
5793 netdev
->etheraddr
= change
->mac
;
5794 netdev
->cache_valid
|= VALID_ETHERADDR
;
5795 netdev
->ether_addr_error
= 0;
5798 if (change
->if_index
!= netdev
->ifindex
) {
5799 netdev
->ifindex
= change
->if_index
;
5800 netdev
->cache_valid
|= VALID_IFINDEX
;
5801 netdev
->get_ifindex_error
= 0;
5804 if (change
->master
&& netdev_linux_kind_is_lag(change
->master
)) {
5805 netdev
->is_lag_master
= true;
5808 netdev_change_seq_changed(&netdev
->up
);
5814 ofpbuf_delete(reply
);
5819 get_etheraddr(const char *netdev_name
, struct eth_addr
*ea
)
5825 memset(&ifr
, 0, sizeof ifr
);
5826 ovs_strzcpy(ifr
.ifr_name
, netdev_name
, sizeof ifr
.ifr_name
);
5827 COVERAGE_INC(netdev_get_hwaddr
);
5828 error
= af_inet_ioctl(SIOCGIFHWADDR
, &ifr
);
5830 /* ENODEV probably means that a vif disappeared asynchronously and
5831 * hasn't been removed from the database yet, so reduce the log level
5832 * to INFO for that case. */
5833 VLOG(error
== ENODEV
? VLL_INFO
: VLL_ERR
,
5834 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
5835 netdev_name
, ovs_strerror(error
));
5838 hwaddr_family
= ifr
.ifr_hwaddr
.sa_family
;
5839 if (hwaddr_family
!= AF_UNSPEC
&& hwaddr_family
!= ARPHRD_ETHER
&&
5840 hwaddr_family
!= ARPHRD_NONE
) {
5841 VLOG_INFO("%s device has unknown hardware address family %d",
5842 netdev_name
, hwaddr_family
);
5845 memcpy(ea
, ifr
.ifr_hwaddr
.sa_data
, ETH_ADDR_LEN
);
5850 set_etheraddr(const char *netdev_name
, const struct eth_addr mac
)
5855 memset(&ifr
, 0, sizeof ifr
);
5856 ovs_strzcpy(ifr
.ifr_name
, netdev_name
, sizeof ifr
.ifr_name
);
5857 ifr
.ifr_hwaddr
.sa_family
= ARPHRD_ETHER
;
5858 memcpy(ifr
.ifr_hwaddr
.sa_data
, &mac
, ETH_ADDR_LEN
);
5859 COVERAGE_INC(netdev_set_hwaddr
);
5860 error
= af_inet_ioctl(SIOCSIFHWADDR
, &ifr
);
5862 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
5863 netdev_name
, ovs_strerror(error
));
5869 netdev_linux_do_ethtool(const char *name
, struct ethtool_cmd
*ecmd
,
5870 int cmd
, const char *cmd_name
)
5875 memset(&ifr
, 0, sizeof ifr
);
5876 ovs_strzcpy(ifr
.ifr_name
, name
, sizeof ifr
.ifr_name
);
5877 ifr
.ifr_data
= (caddr_t
) ecmd
;
5880 error
= af_inet_ioctl(SIOCETHTOOL
, &ifr
);
5882 if (error
!= EOPNOTSUPP
) {
5883 VLOG_WARN_RL(&rl
, "ethtool command %s on network device %s "
5884 "failed: %s", cmd_name
, name
, ovs_strerror(error
));
5886 /* The device doesn't support this operation. That's pretty
5887 * common, so there's no point in logging anything. */
5893 /* Returns an AF_PACKET raw socket or a negative errno value. */
5895 af_packet_sock(void)
5897 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
5900 if (ovsthread_once_start(&once
)) {
5901 sock
= socket(AF_PACKET
, SOCK_RAW
, 0);
5903 int error
= set_nonblocking(sock
);
5910 VLOG_ERR("failed to create packet socket: %s",
5911 ovs_strerror(errno
));
5913 ovsthread_once_done(&once
);