/*
 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <sys/utsname.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dp-packet.h"
53 #include "dpif-netlink.h"
54 #include "dpif-netdev.h"
55 #include "openvswitch/dynamic-string.h"
56 #include "fatal-signal.h"
58 #include "openvswitch/hmap.h"
59 #include "netdev-provider.h"
60 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
64 #include "openvswitch/ofpbuf.h"
65 #include "openflow/openflow.h"
66 #include "ovs-atomic.h"
68 #include "poll-loop.h"
69 #include "rtnetlink.h"
70 #include "openvswitch/shash.h"
71 #include "socket-util.h"
74 #include "unaligned.h"
75 #include "openvswitch/vlog.h"
VLOG_DEFINE_THIS_MODULE(netdev_linux);

/* Coverage counters for the ioctl/netlink operations this module performs. */
COVERAGE_DEFINE(netdev_set_policing);
COVERAGE_DEFINE(netdev_arp_lookup);
COVERAGE_DEFINE(netdev_get_ifindex);
COVERAGE_DEFINE(netdev_get_hwaddr);
COVERAGE_DEFINE(netdev_set_hwaddr);
COVERAGE_DEFINE(netdev_get_ethtool);
COVERAGE_DEFINE(netdev_set_ethtool);
/* These were introduced in Linux 2.6.14, so they might be missing if we have
 * old headers. */
#ifndef ADVERTISED_Pause
#define ADVERTISED_Pause                (1 << 13)
#endif
#ifndef ADVERTISED_Asym_Pause
#define ADVERTISED_Asym_Pause           (1 << 14)
#endif

/* These were introduced in Linux 2.6.24, so they might be missing if we
 * have old headers. */
#ifndef ETHTOOL_GFLAGS
#define ETHTOOL_GFLAGS       0x00000025 /* Get flags bitmap(ethtool_value) */
#endif
#ifndef ETHTOOL_SFLAGS
#define ETHTOOL_SFLAGS       0x00000026 /* Set flags bitmap(ethtool_value) */
#endif

/* This was introduced in Linux 2.6.25, so it might be missing if we have old
 * headers. */
#ifndef TC_RTAB_SIZE
#define TC_RTAB_SIZE 1024
#endif
/* Linux 2.6.21 introduced struct tpacket_auxdata.
 * Linux 2.6.27 added the tp_vlan_tci member.
 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
 * TP_STATUS_VLAN_TPID_VALID.
 *
 * With all this churn it's easiest to unconditionally define a replacement
 * structure that has everything we want. */
#ifndef PACKET_AUXDATA
#define PACKET_AUXDATA                  8
#endif
#ifndef TP_STATUS_VLAN_VALID
#define TP_STATUS_VLAN_VALID            (1 << 4)
#endif
#ifndef TP_STATUS_VLAN_TPID_VALID
#define TP_STATUS_VLAN_TPID_VALID       (1 << 6)
#endif
#undef tpacket_auxdata
#define tpacket_auxdata rpl_tpacket_auxdata
/* Replacement for the kernel's struct tpacket_auxdata.  Layout must match the
 * kernel UAPI exactly, since the kernel fills it in via a PACKET_AUXDATA
 * control message (see cmsg_len check in netdev_linux_rxq_recv_sock()). */
struct tpacket_auxdata {
    uint32_t tp_status;
    uint32_t tp_len;
    uint32_t tp_snaplen;
    uint32_t tp_mac;
    uint32_t tp_net;
    uint16_t tp_vlan_tci;
    uint16_t tp_vlan_tpid;
};
/* Linux 2.6.27 introduced ethtool_cmd_speed.
 *
 * To avoid revisiting problems reported with using configure to detect
 * compatibility (see report at
 * http://openvswitch.org/pipermail/dev/2014-October/047978.html)
 * unconditionally replace ethtool_cmd_speed. */
#define ethtool_cmd_speed rpl_ethtool_cmd_speed
/* Returns the link speed in Mbps, combining the low 16 bits ('speed') with
 * the high 16 bits ('speed_hi') of 'ep'. */
static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
{
    return ep->speed | (ep->speed_hi << 16);
}
/* Linux 2.6.30 introduced supported and advertised flags for
 * 1G base KX, and 10G base KX4, KR and R. */
#ifndef SUPPORTED_1000baseKX_Full
#define SUPPORTED_1000baseKX_Full      (1 << 17)
#define SUPPORTED_10000baseKX4_Full    (1 << 18)
#define SUPPORTED_10000baseKR_Full     (1 << 19)
#define SUPPORTED_10000baseR_FEC       (1 << 20)
#define ADVERTISED_1000baseKX_Full     (1 << 17)
#define ADVERTISED_10000baseKX4_Full   (1 << 18)
#define ADVERTISED_10000baseKR_Full    (1 << 19)
#define ADVERTISED_10000baseR_FEC      (1 << 20)
#endif

/* Linux 3.5 introduced supported and advertised flags for
 * 40G base KR4, CR4, SR4 and LR4. */
#ifndef SUPPORTED_40000baseKR4_Full
#define SUPPORTED_40000baseKR4_Full    (1 << 23)
#define SUPPORTED_40000baseCR4_Full    (1 << 24)
#define SUPPORTED_40000baseSR4_Full    (1 << 25)
#define SUPPORTED_40000baseLR4_Full    (1 << 26)
#define ADVERTISED_40000baseKR4_Full   (1 << 23)
#define ADVERTISED_40000baseCR4_Full   (1 << 24)
#define ADVERTISED_40000baseSR4_Full   (1 << 25)
#define ADVERTISED_40000baseLR4_Full   (1 << 26)
#endif
181 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
183 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
184 * 2.6.32-431.29.2.el6.x86_64 (see report at
185 * http://openvswitch.org/pipermail/dev/2014-October/047978.html). Maybe
186 * if_link.h is not self-contained on those kernels. It is easiest to
187 * unconditionally define a replacement. */
189 #define IFLA_STATS64 23
191 #define rtnl_link_stats64 rpl_rtnl_link_stats64
192 struct rtnl_link_stats64
{
204 uint64_t rx_length_errors
;
205 uint64_t rx_over_errors
;
206 uint64_t rx_crc_errors
;
207 uint64_t rx_frame_errors
;
208 uint64_t rx_fifo_errors
;
209 uint64_t rx_missed_errors
;
211 uint64_t tx_aborted_errors
;
212 uint64_t tx_carrier_errors
;
213 uint64_t tx_fifo_errors
;
214 uint64_t tx_heartbeat_errors
;
215 uint64_t tx_window_errors
;
217 uint64_t rx_compressed
;
218 uint64_t tx_compressed
;
222 VALID_IFINDEX
= 1 << 0,
223 VALID_ETHERADDR
= 1 << 1,
226 VALID_POLICING
= 1 << 4,
227 VALID_VPORT_STAT_ERROR
= 1 << 5,
228 VALID_DRVINFO
= 1 << 6,
229 VALID_FEATURES
= 1 << 7,
232 /* Traffic control. */
234 /* An instance of a traffic control class. Always associated with a particular
237 * Each TC implementation subclasses this with whatever additional data it
240 const struct tc_ops
*ops
;
241 struct hmap queues
; /* Contains "struct tc_queue"s.
242 * Read by generic TC layer.
243 * Written only by TC implementation. */
246 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
248 /* One traffic control queue.
250 * Each TC implementation subclasses this with whatever additional data it
253 struct hmap_node hmap_node
; /* In struct tc's "queues" hmap. */
254 unsigned int queue_id
; /* OpenFlow queue ID. */
255 long long int created
; /* Time queue was created, in msecs. */
258 /* A particular kind of traffic control. Each implementation generally maps to
259 * one particular Linux qdisc class.
261 * The functions below return 0 if successful or a positive errno value on
262 * failure, except where otherwise noted. All of them must be provided, except
263 * where otherwise noted. */
265 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
266 * This is null for tc_ops_default and tc_ops_other, for which there are no
267 * appropriate values. */
268 const char *linux_name
;
270 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
271 const char *ovs_name
;
273 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
274 * queues. The queues are numbered 0 through n_queues - 1. */
275 unsigned int n_queues
;
277 /* Called to install this TC class on 'netdev'. The implementation should
278 * make the Netlink calls required to set up 'netdev' with the right qdisc
279 * and configure it according to 'details'. The implementation may assume
280 * that the current qdisc is the default; that is, there is no need for it
281 * to delete the current qdisc before installing itself.
283 * The contents of 'details' should be documented as valid for 'ovs_name'
284 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
285 * (which is built as ovs-vswitchd.conf.db(8)).
287 * This function must return 0 if and only if it sets 'netdev->tc' to an
288 * initialized 'struct tc'.
290 * (This function is null for tc_ops_other, which cannot be installed. For
291 * other TC classes it should always be nonnull.) */
292 int (*tc_install
)(struct netdev
*netdev
, const struct smap
*details
);
294 /* Called when the netdev code determines (through a Netlink query) that
295 * this TC class's qdisc is installed on 'netdev', but we didn't install
296 * it ourselves and so don't know any of the details.
298 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
299 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
300 * implementation should parse the other attributes of 'nlmsg' as
301 * necessary to determine its configuration. If necessary it should also
302 * use Netlink queries to determine the configuration of queues on
305 * This function must return 0 if and only if it sets 'netdev->tc' to an
306 * initialized 'struct tc'. */
307 int (*tc_load
)(struct netdev
*netdev
, struct ofpbuf
*nlmsg
);
309 /* Destroys the data structures allocated by the implementation as part of
310 * 'tc'. (This includes destroying 'tc->queues' by calling
313 * The implementation should not need to perform any Netlink calls. If
314 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
315 * (But it may not be desirable.)
317 * This function may be null if 'tc' is trivial. */
318 void (*tc_destroy
)(struct tc
*tc
);
320 /* Retrieves details of 'netdev->tc' configuration into 'details'.
322 * The implementation should not need to perform any Netlink calls, because
323 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
324 * cached the configuration.
326 * The contents of 'details' should be documented as valid for 'ovs_name'
327 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
328 * (which is built as ovs-vswitchd.conf.db(8)).
330 * This function may be null if 'tc' is not configurable.
332 int (*qdisc_get
)(const struct netdev
*netdev
, struct smap
*details
);
334 /* Reconfigures 'netdev->tc' according to 'details', performing any
335 * required Netlink calls to complete the reconfiguration.
337 * The contents of 'details' should be documented as valid for 'ovs_name'
338 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
339 * (which is built as ovs-vswitchd.conf.db(8)).
341 * This function may be null if 'tc' is not configurable.
343 int (*qdisc_set
)(struct netdev
*, const struct smap
*details
);
345 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
346 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
348 * The contents of 'details' should be documented as valid for 'ovs_name'
349 * in the "other_config" column in the "Queue" table in
350 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
352 * The implementation should not need to perform any Netlink calls, because
353 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
354 * cached the queue configuration.
356 * This function may be null if 'tc' does not have queues ('n_queues' is
358 int (*class_get
)(const struct netdev
*netdev
, const struct tc_queue
*queue
,
359 struct smap
*details
);
361 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
362 * 'details', perfoming any required Netlink calls to complete the
363 * reconfiguration. The caller ensures that 'queue_id' is less than
366 * The contents of 'details' should be documented as valid for 'ovs_name'
367 * in the "other_config" column in the "Queue" table in
368 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
370 * This function may be null if 'tc' does not have queues or its queues are
371 * not configurable. */
372 int (*class_set
)(struct netdev
*, unsigned int queue_id
,
373 const struct smap
*details
);
375 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
376 * tc_queue's within 'netdev->tc->queues'.
378 * This function may be null if 'tc' does not have queues or its queues
379 * cannot be deleted. */
380 int (*class_delete
)(struct netdev
*, struct tc_queue
*queue
);
382 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
383 * 'struct tc_queue's within 'netdev->tc->queues'.
385 * On success, initializes '*stats'.
387 * This function may be null if 'tc' does not have queues or if it cannot
388 * report queue statistics. */
389 int (*class_get_stats
)(const struct netdev
*netdev
,
390 const struct tc_queue
*queue
,
391 struct netdev_queue_stats
*stats
);
393 /* Extracts queue stats from 'nlmsg', which is a response to a
394 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
396 * This function may be null if 'tc' does not have queues or if it cannot
397 * report queue statistics. */
398 int (*class_dump_stats
)(const struct netdev
*netdev
,
399 const struct ofpbuf
*nlmsg
,
400 netdev_dump_queue_stats_cb
*cb
, void *aux
);
404 tc_init(struct tc
*tc
, const struct tc_ops
*ops
)
407 hmap_init(&tc
->queues
);
411 tc_destroy(struct tc
*tc
)
413 hmap_destroy(&tc
->queues
);
416 static const struct tc_ops tc_ops_htb
;
417 static const struct tc_ops tc_ops_hfsc
;
418 static const struct tc_ops tc_ops_codel
;
419 static const struct tc_ops tc_ops_fqcodel
;
420 static const struct tc_ops tc_ops_sfq
;
421 static const struct tc_ops tc_ops_default
;
422 static const struct tc_ops tc_ops_noop
;
423 static const struct tc_ops tc_ops_other
;
425 static const struct tc_ops
*const tcs
[] = {
426 &tc_ops_htb
, /* Hierarchy token bucket (see tc-htb(8)). */
427 &tc_ops_hfsc
, /* Hierarchical fair service curve. */
428 &tc_ops_codel
, /* Controlled delay */
429 &tc_ops_fqcodel
, /* Fair queue controlled delay */
430 &tc_ops_sfq
, /* Stochastic fair queueing */
431 &tc_ops_noop
, /* Non operating qos type. */
432 &tc_ops_default
, /* Default qdisc (see tc-pfifo_fast(8)). */
433 &tc_ops_other
, /* Some other qdisc. */
437 static unsigned int tc_make_handle(unsigned int major
, unsigned int minor
);
438 static unsigned int tc_get_major(unsigned int handle
);
439 static unsigned int tc_get_minor(unsigned int handle
);
441 static unsigned int tc_ticks_to_bytes(unsigned int rate
, unsigned int ticks
);
442 static unsigned int tc_bytes_to_ticks(unsigned int rate
, unsigned int size
);
443 static unsigned int tc_buffer_per_jiffy(unsigned int rate
);
445 static struct tcmsg
*tc_make_request(const struct netdev
*, int type
,
446 unsigned int flags
, struct ofpbuf
*);
447 static int tc_transact(struct ofpbuf
*request
, struct ofpbuf
**replyp
);
448 static int tc_add_del_ingress_qdisc(struct netdev
*netdev
, bool add
);
449 static int tc_add_policer(struct netdev
*,
450 uint32_t kbits_rate
, uint32_t kbits_burst
);
452 static int tc_parse_qdisc(const struct ofpbuf
*, const char **kind
,
453 struct nlattr
**options
);
454 static int tc_parse_class(const struct ofpbuf
*, unsigned int *queue_id
,
455 struct nlattr
**options
,
456 struct netdev_queue_stats
*);
457 static int tc_query_class(const struct netdev
*,
458 unsigned int handle
, unsigned int parent
,
459 struct ofpbuf
**replyp
);
460 static int tc_delete_class(const struct netdev
*, unsigned int handle
);
462 static int tc_del_qdisc(struct netdev
*netdev
);
463 static int tc_query_qdisc(const struct netdev
*netdev
);
465 static int tc_calc_cell_log(unsigned int mtu
);
466 static void tc_fill_rate(struct tc_ratespec
*rate
, uint64_t bps
, int mtu
);
467 static void tc_put_rtab(struct ofpbuf
*, uint16_t type
,
468 const struct tc_ratespec
*rate
);
469 static int tc_calc_buffer(unsigned int Bps
, int mtu
, uint64_t burst_bytes
);
471 struct netdev_linux
{
474 /* Protects all members below. */
475 struct ovs_mutex mutex
;
477 unsigned int cache_valid
;
479 bool miimon
; /* Link status of last poll. */
480 long long int miimon_interval
; /* Miimon Poll rate. Disabled if <= 0. */
481 struct timer miimon_timer
;
483 /* The following are figured out "on demand" only. They are only valid
484 * when the corresponding VALID_* bit in 'cache_valid' is set. */
486 struct eth_addr etheraddr
;
488 unsigned int ifi_flags
;
489 long long int carrier_resets
;
490 uint32_t kbits_rate
; /* Policing data. */
491 uint32_t kbits_burst
;
492 int vport_stats_error
; /* Cached error code from vport_get_stats().
493 0 or an errno value. */
494 int netdev_mtu_error
; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
495 int ether_addr_error
; /* Cached error code from set/get etheraddr. */
496 int netdev_policing_error
; /* Cached error code from set policing. */
497 int get_features_error
; /* Cached error code from ETHTOOL_GSET. */
498 int get_ifindex_error
; /* Cached error code from SIOCGIFINDEX. */
500 enum netdev_features current
; /* Cached from ETHTOOL_GSET. */
501 enum netdev_features advertised
; /* Cached from ETHTOOL_GSET. */
502 enum netdev_features supported
; /* Cached from ETHTOOL_GSET. */
504 struct ethtool_drvinfo drvinfo
; /* Cached from ETHTOOL_GDRVINFO. */
507 /* For devices of class netdev_tap_class only. */
511 struct netdev_rxq_linux
{
512 struct netdev_rxq up
;
517 /* This is set pretty low because we probably won't learn anything from the
518 * additional log messages. */
519 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(5, 20);
521 /* Polling miimon status for all ports causes performance degradation when
522 * handling a large number of ports. If there are no devices using miimon, then
523 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
525 * Readers do not depend on this variable synchronizing with the related
526 * changes in the device miimon status, so we can use atomic_count. */
527 static atomic_count miimon_cnt
= ATOMIC_COUNT_INIT(0);
529 static void netdev_linux_run(void);
531 static int netdev_linux_do_ethtool(const char *name
, struct ethtool_cmd
*,
532 int cmd
, const char *cmd_name
);
533 static int get_flags(const struct netdev
*, unsigned int *flags
);
534 static int set_flags(const char *, unsigned int flags
);
535 static int update_flags(struct netdev_linux
*netdev
, enum netdev_flags off
,
536 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
537 OVS_REQUIRES(netdev
->mutex
);
538 static int do_get_ifindex(const char *netdev_name
);
539 static int get_ifindex(const struct netdev
*, int *ifindexp
);
540 static int do_set_addr(struct netdev
*netdev
,
541 int ioctl_nr
, const char *ioctl_name
,
542 struct in_addr addr
);
543 static int get_etheraddr(const char *netdev_name
, struct eth_addr
*ea
);
544 static int set_etheraddr(const char *netdev_name
, const struct eth_addr
);
545 static int get_stats_via_netlink(const struct netdev
*, struct netdev_stats
*);
546 static int af_packet_sock(void);
547 static bool netdev_linux_miimon_enabled(void);
548 static void netdev_linux_miimon_run(void);
549 static void netdev_linux_miimon_wait(void);
550 static int netdev_linux_get_mtu__(struct netdev_linux
*netdev
, int *mtup
);
553 is_netdev_linux_class(const struct netdev_class
*netdev_class
)
555 return netdev_class
->run
== netdev_linux_run
;
559 is_tap_netdev(const struct netdev
*netdev
)
561 return netdev_get_class(netdev
) == &netdev_tap_class
;
564 static struct netdev_linux
*
565 netdev_linux_cast(const struct netdev
*netdev
)
567 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev
)));
569 return CONTAINER_OF(netdev
, struct netdev_linux
, up
);
572 static struct netdev_rxq_linux
*
573 netdev_rxq_linux_cast(const struct netdev_rxq
*rx
)
575 ovs_assert(is_netdev_linux_class(netdev_get_class(rx
->netdev
)));
576 return CONTAINER_OF(rx
, struct netdev_rxq_linux
, up
);
579 static void netdev_linux_update(struct netdev_linux
*netdev
,
580 const struct rtnetlink_change
*)
581 OVS_REQUIRES(netdev
->mutex
);
582 static void netdev_linux_changed(struct netdev_linux
*netdev
,
583 unsigned int ifi_flags
, unsigned int mask
)
584 OVS_REQUIRES(netdev
->mutex
);
586 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
587 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
588 * if no such socket could be created. */
589 static struct nl_sock
*
590 netdev_linux_notify_sock(void)
592 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
593 static struct nl_sock
*sock
;
594 unsigned int mcgroups
[] = {RTNLGRP_LINK
, RTNLGRP_IPV4_IFADDR
,
595 RTNLGRP_IPV6_IFADDR
, RTNLGRP_IPV6_IFINFO
};
597 if (ovsthread_once_start(&once
)) {
600 error
= nl_sock_create(NETLINK_ROUTE
, &sock
);
604 for (i
= 0; i
< ARRAY_SIZE(mcgroups
); i
++) {
605 error
= nl_sock_join_mcgroup(sock
, mcgroups
[i
]);
607 nl_sock_destroy(sock
);
613 ovsthread_once_done(&once
);
620 netdev_linux_miimon_enabled(void)
622 return atomic_count_get(&miimon_cnt
) > 0;
626 netdev_linux_run(void)
628 struct nl_sock
*sock
;
631 if (netdev_linux_miimon_enabled()) {
632 netdev_linux_miimon_run();
635 sock
= netdev_linux_notify_sock();
641 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(1, 5);
642 uint64_t buf_stub
[4096 / 8];
645 ofpbuf_use_stub(&buf
, buf_stub
, sizeof buf_stub
);
646 error
= nl_sock_recv(sock
, &buf
, false);
648 struct rtnetlink_change change
;
650 if (rtnetlink_parse(&buf
, &change
)) {
651 struct netdev
*netdev_
= NULL
;
652 char dev_name
[IFNAMSIZ
];
654 if (!change
.ifname
) {
655 change
.ifname
= if_indextoname(change
.if_index
, dev_name
);
659 netdev_
= netdev_from_name(change
.ifname
);
661 if (netdev_
&& is_netdev_linux_class(netdev_
->netdev_class
)) {
662 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
664 ovs_mutex_lock(&netdev
->mutex
);
665 netdev_linux_update(netdev
, &change
);
666 ovs_mutex_unlock(&netdev
->mutex
);
668 netdev_close(netdev_
);
670 } else if (error
== ENOBUFS
) {
671 struct shash device_shash
;
672 struct shash_node
*node
;
676 shash_init(&device_shash
);
677 netdev_get_devices(&netdev_linux_class
, &device_shash
);
678 SHASH_FOR_EACH (node
, &device_shash
) {
679 struct netdev
*netdev_
= node
->data
;
680 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
683 ovs_mutex_lock(&netdev
->mutex
);
684 get_flags(netdev_
, &flags
);
685 netdev_linux_changed(netdev
, flags
, 0);
686 ovs_mutex_unlock(&netdev
->mutex
);
688 netdev_close(netdev_
);
690 shash_destroy(&device_shash
);
691 } else if (error
!= EAGAIN
) {
692 VLOG_WARN_RL(&rl
, "error reading or parsing netlink (%s)",
693 ovs_strerror(error
));
700 netdev_linux_wait(void)
702 struct nl_sock
*sock
;
704 if (netdev_linux_miimon_enabled()) {
705 netdev_linux_miimon_wait();
707 sock
= netdev_linux_notify_sock();
709 nl_sock_wait(sock
, POLLIN
);
714 netdev_linux_changed(struct netdev_linux
*dev
,
715 unsigned int ifi_flags
, unsigned int mask
)
716 OVS_REQUIRES(dev
->mutex
)
718 netdev_change_seq_changed(&dev
->up
);
720 if ((dev
->ifi_flags
^ ifi_flags
) & IFF_RUNNING
) {
721 dev
->carrier_resets
++;
723 dev
->ifi_flags
= ifi_flags
;
725 dev
->cache_valid
&= mask
;
726 if (!(mask
& VALID_IN
)) {
727 netdev_get_addrs_list_flush();
732 netdev_linux_update(struct netdev_linux
*dev
,
733 const struct rtnetlink_change
*change
)
734 OVS_REQUIRES(dev
->mutex
)
736 if (rtnetlink_type_is_rtnlgrp_link(change
->nlmsg_type
)){
737 if (change
->nlmsg_type
== RTM_NEWLINK
) {
738 /* Keep drv-info, and ip addresses. */
739 netdev_linux_changed(dev
, change
->ifi_flags
,
740 VALID_DRVINFO
| VALID_IN
);
742 /* Update netdev from rtnl-change msg. */
744 dev
->mtu
= change
->mtu
;
745 dev
->cache_valid
|= VALID_MTU
;
746 dev
->netdev_mtu_error
= 0;
749 if (!eth_addr_is_zero(change
->mac
)) {
750 dev
->etheraddr
= change
->mac
;
751 dev
->cache_valid
|= VALID_ETHERADDR
;
752 dev
->ether_addr_error
= 0;
755 dev
->ifindex
= change
->if_index
;
756 dev
->cache_valid
|= VALID_IFINDEX
;
757 dev
->get_ifindex_error
= 0;
759 netdev_linux_changed(dev
, change
->ifi_flags
, 0);
761 } else if (rtnetlink_type_is_rtnlgrp_addr(change
->nlmsg_type
)) {
762 /* Invalidates in4, in6. */
763 netdev_linux_changed(dev
, dev
->ifi_flags
, ~VALID_IN
);
769 static struct netdev
*
770 netdev_linux_alloc(void)
772 struct netdev_linux
*netdev
= xzalloc(sizeof *netdev
);
777 netdev_linux_common_construct(struct netdev_linux
*netdev
)
779 ovs_mutex_init(&netdev
->mutex
);
782 /* Creates system and internal devices. */
784 netdev_linux_construct(struct netdev
*netdev_
)
786 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
789 netdev_linux_common_construct(netdev
);
791 error
= get_flags(&netdev
->up
, &netdev
->ifi_flags
);
792 if (error
== ENODEV
) {
793 if (netdev
->up
.netdev_class
!= &netdev_internal_class
) {
794 /* The device does not exist, so don't allow it to be opened. */
797 /* "Internal" netdevs have to be created as netdev objects before
798 * they exist in the kernel, because creating them in the kernel
799 * happens by passing a netdev object to dpif_port_add().
800 * Therefore, ignore the error. */
807 /* For most types of netdevs we open the device for each call of
808 * netdev_open(). However, this is not the case with tap devices,
809 * since it is only possible to open the device once. In this
810 * situation we share a single file descriptor, and consequently
811 * buffers, across all readers. Therefore once data is read it will
812 * be unavailable to other reads for tap devices. */
814 netdev_linux_construct_tap(struct netdev
*netdev_
)
816 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
817 static const char tap_dev
[] = "/dev/net/tun";
818 const char *name
= netdev_
->name
;
822 netdev_linux_common_construct(netdev
);
824 /* Open tap device. */
825 netdev
->tap_fd
= open(tap_dev
, O_RDWR
);
826 if (netdev
->tap_fd
< 0) {
828 VLOG_WARN("opening \"%s\" failed: %s", tap_dev
, ovs_strerror(error
));
832 /* Create tap device. */
833 ifr
.ifr_flags
= IFF_TAP
| IFF_NO_PI
;
834 ovs_strzcpy(ifr
.ifr_name
, name
, sizeof ifr
.ifr_name
);
835 if (ioctl(netdev
->tap_fd
, TUNSETIFF
, &ifr
) == -1) {
836 VLOG_WARN("%s: creating tap device failed: %s", name
,
837 ovs_strerror(errno
));
842 /* Make non-blocking. */
843 error
= set_nonblocking(netdev
->tap_fd
);
851 close(netdev
->tap_fd
);
856 netdev_linux_destruct(struct netdev
*netdev_
)
858 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
860 if (netdev
->tc
&& netdev
->tc
->ops
->tc_destroy
) {
861 netdev
->tc
->ops
->tc_destroy(netdev
->tc
);
864 if (netdev_get_class(netdev_
) == &netdev_tap_class
865 && netdev
->tap_fd
>= 0)
867 close(netdev
->tap_fd
);
870 if (netdev
->miimon_interval
> 0) {
871 atomic_count_dec(&miimon_cnt
);
874 ovs_mutex_destroy(&netdev
->mutex
);
/* netdev_class 'dealloc' callback: frees the storage obtained in
 * netdev_linux_alloc(). */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    free(netdev);
}
884 static struct netdev_rxq
*
885 netdev_linux_rxq_alloc(void)
887 struct netdev_rxq_linux
*rx
= xzalloc(sizeof *rx
);
892 netdev_linux_rxq_construct(struct netdev_rxq
*rxq_
)
894 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
895 struct netdev
*netdev_
= rx
->up
.netdev
;
896 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
899 ovs_mutex_lock(&netdev
->mutex
);
900 rx
->is_tap
= is_tap_netdev(netdev_
);
902 rx
->fd
= netdev
->tap_fd
;
904 struct sockaddr_ll sll
;
906 /* Result of tcpdump -dd inbound */
907 static const struct sock_filter filt
[] = {
908 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
909 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
910 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
911 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
913 static const struct sock_fprog fprog
= {
914 ARRAY_SIZE(filt
), (struct sock_filter
*) filt
917 /* Create file descriptor. */
918 rx
->fd
= socket(PF_PACKET
, SOCK_RAW
, 0);
921 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error
));
926 if (setsockopt(rx
->fd
, SOL_PACKET
, PACKET_AUXDATA
, &val
, sizeof val
)) {
928 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
929 netdev_get_name(netdev_
), ovs_strerror(error
));
933 /* Set non-blocking mode. */
934 error
= set_nonblocking(rx
->fd
);
939 /* Get ethernet device index. */
940 error
= get_ifindex(&netdev
->up
, &ifindex
);
945 /* Bind to specific ethernet device. */
946 memset(&sll
, 0, sizeof sll
);
947 sll
.sll_family
= AF_PACKET
;
948 sll
.sll_ifindex
= ifindex
;
949 sll
.sll_protocol
= htons(ETH_P_ALL
);
950 if (bind(rx
->fd
, (struct sockaddr
*) &sll
, sizeof sll
) < 0) {
952 VLOG_ERR("%s: failed to bind raw socket (%s)",
953 netdev_get_name(netdev_
), ovs_strerror(error
));
957 /* Filter for only inbound packets. */
958 error
= setsockopt(rx
->fd
, SOL_SOCKET
, SO_ATTACH_FILTER
, &fprog
,
962 VLOG_ERR("%s: failed to attach filter (%s)",
963 netdev_get_name(netdev_
), ovs_strerror(error
));
967 ovs_mutex_unlock(&netdev
->mutex
);
975 ovs_mutex_unlock(&netdev
->mutex
);
980 netdev_linux_rxq_destruct(struct netdev_rxq
*rxq_
)
982 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
/* netdev_class 'rxq_dealloc' callback: frees the storage obtained in
 * netdev_linux_rxq_alloc(). */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
    free(rx);
}
998 auxdata_to_vlan_tpid(const struct tpacket_auxdata
*aux
)
1000 if (aux
->tp_status
& TP_STATUS_VLAN_TPID_VALID
) {
1001 return htons(aux
->tp_vlan_tpid
);
1003 return htons(ETH_TYPE_VLAN
);
1008 auxdata_has_vlan_tci(const struct tpacket_auxdata
*aux
)
1010 return aux
->tp_vlan_tci
|| aux
->tp_status
& TP_STATUS_VLAN_VALID
;
1014 netdev_linux_rxq_recv_sock(int fd
, struct dp_packet
*buffer
)
1019 struct cmsghdr
*cmsg
;
1021 struct cmsghdr cmsg
;
1022 char buffer
[CMSG_SPACE(sizeof(struct tpacket_auxdata
))];
1026 /* Reserve headroom for a single VLAN tag */
1027 dp_packet_reserve(buffer
, VLAN_HEADER_LEN
);
1028 size
= dp_packet_tailroom(buffer
);
1030 iov
.iov_base
= dp_packet_data(buffer
);
1032 msgh
.msg_name
= NULL
;
1033 msgh
.msg_namelen
= 0;
1034 msgh
.msg_iov
= &iov
;
1035 msgh
.msg_iovlen
= 1;
1036 msgh
.msg_control
= &cmsg_buffer
;
1037 msgh
.msg_controllen
= sizeof cmsg_buffer
;
1041 retval
= recvmsg(fd
, &msgh
, MSG_TRUNC
);
1042 } while (retval
< 0 && errno
== EINTR
);
1046 } else if (retval
> size
) {
1050 dp_packet_set_size(buffer
, dp_packet_size(buffer
) + retval
);
1052 for (cmsg
= CMSG_FIRSTHDR(&msgh
); cmsg
; cmsg
= CMSG_NXTHDR(&msgh
, cmsg
)) {
1053 const struct tpacket_auxdata
*aux
;
1055 if (cmsg
->cmsg_level
!= SOL_PACKET
1056 || cmsg
->cmsg_type
!= PACKET_AUXDATA
1057 || cmsg
->cmsg_len
< CMSG_LEN(sizeof(struct tpacket_auxdata
))) {
1061 aux
= ALIGNED_CAST(struct tpacket_auxdata
*, CMSG_DATA(cmsg
));
1062 if (auxdata_has_vlan_tci(aux
)) {
1063 if (retval
< ETH_HEADER_LEN
) {
1067 eth_push_vlan(buffer
, auxdata_to_vlan_tpid(aux
),
1068 htons(aux
->tp_vlan_tci
));
1077 netdev_linux_rxq_recv_tap(int fd
, struct dp_packet
*buffer
)
1080 size_t size
= dp_packet_tailroom(buffer
);
1083 retval
= read(fd
, dp_packet_data(buffer
), size
);
1084 } while (retval
< 0 && errno
== EINTR
);
1090 dp_packet_set_size(buffer
, dp_packet_size(buffer
) + retval
);
1095 netdev_linux_rxq_recv(struct netdev_rxq
*rxq_
, struct dp_packet_batch
*batch
)
1097 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1098 struct netdev
*netdev
= rx
->up
.netdev
;
1099 struct dp_packet
*buffer
;
1103 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
)) {
1104 mtu
= ETH_PAYLOAD_MAX
;
1107 buffer
= dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN
+ mtu
,
1108 DP_NETDEV_HEADROOM
);
1109 retval
= (rx
->is_tap
1110 ? netdev_linux_rxq_recv_tap(rx
->fd
, buffer
)
1111 : netdev_linux_rxq_recv_sock(rx
->fd
, buffer
));
1114 if (retval
!= EAGAIN
&& retval
!= EMSGSIZE
) {
1115 VLOG_WARN_RL(&rl
, "error receiving Ethernet packet on %s: %s",
1116 netdev_rxq_get_name(rxq_
), ovs_strerror(errno
));
1118 dp_packet_delete(buffer
);
1120 batch
->packets
[0] = buffer
;
1128 netdev_linux_rxq_wait(struct netdev_rxq
*rxq_
)
1130 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1131 poll_fd_wait(rx
->fd
, POLLIN
);
1135 netdev_linux_rxq_drain(struct netdev_rxq
*rxq_
)
1137 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1140 int error
= af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_
), &ifr
,
1141 SIOCGIFTXQLEN
, "SIOCGIFTXQLEN");
1145 drain_fd(rx
->fd
, ifr
.ifr_qlen
);
1148 return drain_rcvbuf(rx
->fd
);
1152 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1153 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1154 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1155 * the packet is too big or too small to transmit on the device.
1157 * The caller retains ownership of 'buffer' in all cases.
1159 * The kernel maintains a packet transmission queue, so the caller is not
1160 * expected to do additional queuing of packets. */
1162 netdev_linux_send(struct netdev
*netdev_
, int qid OVS_UNUSED
,
1163 struct dp_packet_batch
*batch
, bool may_steal
,
1164 bool concurrent_txq OVS_UNUSED
)
1169 /* 'i' is incremented only if there's no error */
1170 for (i
= 0; i
< batch
->count
;) {
1171 const void *data
= dp_packet_data(batch
->packets
[i
]);
1172 size_t size
= dp_packet_size(batch
->packets
[i
]);
1175 /* Truncate the packet if it is configured. */
1176 size
-= dp_packet_get_cutlen(batch
->packets
[i
]);
1178 if (!is_tap_netdev(netdev_
)) {
1179 /* Use our AF_PACKET socket to send to this device. */
1180 struct sockaddr_ll sll
;
1186 sock
= af_packet_sock();
1191 ifindex
= netdev_get_ifindex(netdev_
);
1196 /* We don't bother setting most fields in sockaddr_ll because the
1197 * kernel ignores them for SOCK_RAW. */
1198 memset(&sll
, 0, sizeof sll
);
1199 sll
.sll_family
= AF_PACKET
;
1200 sll
.sll_ifindex
= ifindex
;
1202 iov
.iov_base
= CONST_CAST(void *, data
);
1205 msg
.msg_name
= &sll
;
1206 msg
.msg_namelen
= sizeof sll
;
1209 msg
.msg_control
= NULL
;
1210 msg
.msg_controllen
= 0;
1213 retval
= sendmsg(sock
, &msg
, 0);
1215 /* Use the tap fd to send to this device. This is essential for
1216 * tap devices, because packets sent to a tap device with an
1217 * AF_PACKET socket will loop back to be *received* again on the
1218 * tap device. This doesn't occur on other interface types
1219 * because we attach a socket filter to the rx socket. */
1220 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1222 retval
= write(netdev
->tap_fd
, data
, size
);
1226 if (errno
== EINTR
) {
1227 /* The send was interrupted by a signal. Retry the packet by
1228 * continuing without incrementing 'i'.*/
1230 } else if (errno
== EIO
&& is_tap_netdev(netdev_
)) {
1231 /* The Linux tap driver returns EIO if the device is not up.
1232 * From the OVS side this is not an error, so ignore it. */
1234 /* The Linux AF_PACKET implementation never blocks waiting for
1235 * room for packets, instead returning ENOBUFS. Translate this
1236 * into EAGAIN for the caller. */
1237 error
= errno
== ENOBUFS
? EAGAIN
: errno
;
1240 } else if (retval
!= size
) {
1241 VLOG_WARN_RL(&rl
, "sent partial Ethernet packet (%"PRIuSIZE
" bytes"
1242 " of %"PRIuSIZE
") on %s", retval
, size
,
1243 netdev_get_name(netdev_
));
1248 /* Process the next packet in the batch */
1252 dp_packet_delete_batch(batch
, may_steal
);
1254 if (error
&& error
!= EAGAIN
) {
1255 VLOG_WARN_RL(&rl
, "error sending Ethernet packet on %s: %s",
1256 netdev_get_name(netdev_
), ovs_strerror(error
));
1263 /* Registers with the poll loop to wake up from the next call to poll_block()
1264 * when the packet transmission queue has sufficient room to transmit a packet
1265 * with netdev_send().
1267 * The kernel maintains a packet transmission queue, so the client is not
1268 * expected to do additional queuing of packets. Thus, this function is
1269 * unlikely to ever be used. It is included for completeness. */
1271 netdev_linux_send_wait(struct netdev
*netdev
, int qid OVS_UNUSED
)
1273 if (is_tap_netdev(netdev
)) {
1274 /* TAP device always accepts packets.*/
1275 poll_immediate_wake();
1279 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1280 * otherwise a positive errno value. */
1282 netdev_linux_set_etheraddr(struct netdev
*netdev_
, const struct eth_addr mac
)
1284 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1285 enum netdev_flags old_flags
= 0;
1288 ovs_mutex_lock(&netdev
->mutex
);
1290 if (netdev
->cache_valid
& VALID_ETHERADDR
) {
1291 error
= netdev
->ether_addr_error
;
1292 if (error
|| eth_addr_equals(netdev
->etheraddr
, mac
)) {
1295 netdev
->cache_valid
&= ~VALID_ETHERADDR
;
1298 /* Tap devices must be brought down before setting the address. */
1299 if (is_tap_netdev(netdev_
)) {
1300 update_flags(netdev
, NETDEV_UP
, 0, &old_flags
);
1302 error
= set_etheraddr(netdev_get_name(netdev_
), mac
);
1303 if (!error
|| error
== ENODEV
) {
1304 netdev
->ether_addr_error
= error
;
1305 netdev
->cache_valid
|= VALID_ETHERADDR
;
1307 netdev
->etheraddr
= mac
;
1311 if (is_tap_netdev(netdev_
) && old_flags
& NETDEV_UP
) {
1312 update_flags(netdev
, 0, NETDEV_UP
, &old_flags
);
1316 ovs_mutex_unlock(&netdev
->mutex
);
1320 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1322 netdev_linux_get_etheraddr(const struct netdev
*netdev_
, struct eth_addr
*mac
)
1324 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1327 ovs_mutex_lock(&netdev
->mutex
);
1328 if (!(netdev
->cache_valid
& VALID_ETHERADDR
)) {
1329 netdev
->ether_addr_error
= get_etheraddr(netdev_get_name(netdev_
),
1330 &netdev
->etheraddr
);
1331 netdev
->cache_valid
|= VALID_ETHERADDR
;
1334 error
= netdev
->ether_addr_error
;
1336 *mac
= netdev
->etheraddr
;
1338 ovs_mutex_unlock(&netdev
->mutex
);
1344 netdev_linux_get_mtu__(struct netdev_linux
*netdev
, int *mtup
)
1348 if (!(netdev
->cache_valid
& VALID_MTU
)) {
1351 netdev
->netdev_mtu_error
= af_inet_ifreq_ioctl(
1352 netdev_get_name(&netdev
->up
), &ifr
, SIOCGIFMTU
, "SIOCGIFMTU");
1353 netdev
->mtu
= ifr
.ifr_mtu
;
1354 netdev
->cache_valid
|= VALID_MTU
;
1357 error
= netdev
->netdev_mtu_error
;
1359 *mtup
= netdev
->mtu
;
1365 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1366 * in bytes, not including the hardware header; thus, this is typically 1500
1367 * bytes for Ethernet devices. */
1369 netdev_linux_get_mtu(const struct netdev
*netdev_
, int *mtup
)
1371 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1374 ovs_mutex_lock(&netdev
->mutex
);
1375 error
= netdev_linux_get_mtu__(netdev
, mtup
);
1376 ovs_mutex_unlock(&netdev
->mutex
);
1381 /* Sets the maximum size of transmitted (MTU) for given device using linux
1382 * networking ioctl interface.
1385 netdev_linux_set_mtu(struct netdev
*netdev_
, int mtu
)
1387 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1391 ovs_mutex_lock(&netdev
->mutex
);
1392 if (netdev
->cache_valid
& VALID_MTU
) {
1393 error
= netdev
->netdev_mtu_error
;
1394 if (error
|| netdev
->mtu
== mtu
) {
1397 netdev
->cache_valid
&= ~VALID_MTU
;
1400 error
= af_inet_ifreq_ioctl(netdev_get_name(netdev_
), &ifr
,
1401 SIOCSIFMTU
, "SIOCSIFMTU");
1402 if (!error
|| error
== ENODEV
) {
1403 netdev
->netdev_mtu_error
= error
;
1404 netdev
->mtu
= ifr
.ifr_mtu
;
1405 netdev
->cache_valid
|= VALID_MTU
;
1408 ovs_mutex_unlock(&netdev
->mutex
);
1412 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1413 * On failure, returns a negative errno value. */
1415 netdev_linux_get_ifindex(const struct netdev
*netdev_
)
1417 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1420 ovs_mutex_lock(&netdev
->mutex
);
1421 error
= get_ifindex(netdev_
, &ifindex
);
1422 ovs_mutex_unlock(&netdev
->mutex
);
1424 return error
? -error
: ifindex
;
1428 netdev_linux_get_carrier(const struct netdev
*netdev_
, bool *carrier
)
1430 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1432 ovs_mutex_lock(&netdev
->mutex
);
1433 if (netdev
->miimon_interval
> 0) {
1434 *carrier
= netdev
->miimon
;
1436 *carrier
= (netdev
->ifi_flags
& IFF_RUNNING
) != 0;
1438 ovs_mutex_unlock(&netdev
->mutex
);
1443 static long long int
1444 netdev_linux_get_carrier_resets(const struct netdev
*netdev_
)
1446 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1447 long long int carrier_resets
;
1449 ovs_mutex_lock(&netdev
->mutex
);
1450 carrier_resets
= netdev
->carrier_resets
;
1451 ovs_mutex_unlock(&netdev
->mutex
);
1453 return carrier_resets
;
1457 netdev_linux_do_miimon(const char *name
, int cmd
, const char *cmd_name
,
1458 struct mii_ioctl_data
*data
)
1463 memset(&ifr
, 0, sizeof ifr
);
1464 memcpy(&ifr
.ifr_data
, data
, sizeof *data
);
1465 error
= af_inet_ifreq_ioctl(name
, &ifr
, cmd
, cmd_name
);
1466 memcpy(data
, &ifr
.ifr_data
, sizeof *data
);
1472 netdev_linux_get_miimon(const char *name
, bool *miimon
)
1474 struct mii_ioctl_data data
;
1479 memset(&data
, 0, sizeof data
);
1480 error
= netdev_linux_do_miimon(name
, SIOCGMIIPHY
, "SIOCGMIIPHY", &data
);
1482 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1483 data
.reg_num
= MII_BMSR
;
1484 error
= netdev_linux_do_miimon(name
, SIOCGMIIREG
, "SIOCGMIIREG",
1488 *miimon
= !!(data
.val_out
& BMSR_LSTATUS
);
1490 VLOG_WARN_RL(&rl
, "%s: failed to query MII", name
);
1493 struct ethtool_cmd ecmd
;
1495 VLOG_DBG_RL(&rl
, "%s: failed to query MII, falling back to ethtool",
1498 COVERAGE_INC(netdev_get_ethtool
);
1499 memset(&ecmd
, 0, sizeof ecmd
);
1500 error
= netdev_linux_do_ethtool(name
, &ecmd
, ETHTOOL_GLINK
,
1503 struct ethtool_value eval
;
1505 memcpy(&eval
, &ecmd
, sizeof eval
);
1506 *miimon
= !!eval
.data
;
1508 VLOG_WARN_RL(&rl
, "%s: ethtool link status failed", name
);
1516 netdev_linux_set_miimon_interval(struct netdev
*netdev_
,
1517 long long int interval
)
1519 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1521 ovs_mutex_lock(&netdev
->mutex
);
1522 interval
= interval
> 0 ? MAX(interval
, 100) : 0;
1523 if (netdev
->miimon_interval
!= interval
) {
1524 if (interval
&& !netdev
->miimon_interval
) {
1525 atomic_count_inc(&miimon_cnt
);
1526 } else if (!interval
&& netdev
->miimon_interval
) {
1527 atomic_count_dec(&miimon_cnt
);
1530 netdev
->miimon_interval
= interval
;
1531 timer_set_expired(&netdev
->miimon_timer
);
1533 ovs_mutex_unlock(&netdev
->mutex
);
1539 netdev_linux_miimon_run(void)
1541 struct shash device_shash
;
1542 struct shash_node
*node
;
1544 shash_init(&device_shash
);
1545 netdev_get_devices(&netdev_linux_class
, &device_shash
);
1546 SHASH_FOR_EACH (node
, &device_shash
) {
1547 struct netdev
*netdev
= node
->data
;
1548 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
1551 ovs_mutex_lock(&dev
->mutex
);
1552 if (dev
->miimon_interval
> 0 && timer_expired(&dev
->miimon_timer
)) {
1553 netdev_linux_get_miimon(dev
->up
.name
, &miimon
);
1554 if (miimon
!= dev
->miimon
) {
1555 dev
->miimon
= miimon
;
1556 netdev_linux_changed(dev
, dev
->ifi_flags
, 0);
1559 timer_set_duration(&dev
->miimon_timer
, dev
->miimon_interval
);
1561 ovs_mutex_unlock(&dev
->mutex
);
1562 netdev_close(netdev
);
1565 shash_destroy(&device_shash
);
1569 netdev_linux_miimon_wait(void)
1571 struct shash device_shash
;
1572 struct shash_node
*node
;
1574 shash_init(&device_shash
);
1575 netdev_get_devices(&netdev_linux_class
, &device_shash
);
1576 SHASH_FOR_EACH (node
, &device_shash
) {
1577 struct netdev
*netdev
= node
->data
;
1578 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
1580 ovs_mutex_lock(&dev
->mutex
);
1581 if (dev
->miimon_interval
> 0) {
1582 timer_wait(&dev
->miimon_timer
);
1584 ovs_mutex_unlock(&dev
->mutex
);
1585 netdev_close(netdev
);
1587 shash_destroy(&device_shash
);
/* Exchanges the values of '*a' and '*b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t tmp = *a;
    *a = *b;
    *b = tmp;
}
1598 /* Copies 'src' into 'dst', performing format conversion in the process.
1600 * 'src' is allowed to be misaligned. */
1602 netdev_stats_from_ovs_vport_stats(struct netdev_stats
*dst
,
1603 const struct ovs_vport_stats
*src
)
1605 dst
->rx_packets
= get_32aligned_u64(&src
->rx_packets
);
1606 dst
->tx_packets
= get_32aligned_u64(&src
->tx_packets
);
1607 dst
->rx_bytes
= get_32aligned_u64(&src
->rx_bytes
);
1608 dst
->tx_bytes
= get_32aligned_u64(&src
->tx_bytes
);
1609 dst
->rx_errors
= get_32aligned_u64(&src
->rx_errors
);
1610 dst
->tx_errors
= get_32aligned_u64(&src
->tx_errors
);
1611 dst
->rx_dropped
= get_32aligned_u64(&src
->rx_dropped
);
1612 dst
->tx_dropped
= get_32aligned_u64(&src
->tx_dropped
);
1614 dst
->collisions
= 0;
1615 dst
->rx_length_errors
= 0;
1616 dst
->rx_over_errors
= 0;
1617 dst
->rx_crc_errors
= 0;
1618 dst
->rx_frame_errors
= 0;
1619 dst
->rx_fifo_errors
= 0;
1620 dst
->rx_missed_errors
= 0;
1621 dst
->tx_aborted_errors
= 0;
1622 dst
->tx_carrier_errors
= 0;
1623 dst
->tx_fifo_errors
= 0;
1624 dst
->tx_heartbeat_errors
= 0;
1625 dst
->tx_window_errors
= 0;
1629 get_stats_via_vport__(const struct netdev
*netdev
, struct netdev_stats
*stats
)
1631 struct dpif_netlink_vport reply
;
1635 error
= dpif_netlink_vport_get(netdev_get_name(netdev
), &reply
, &buf
);
1638 } else if (!reply
.stats
) {
1643 netdev_stats_from_ovs_vport_stats(stats
, reply
.stats
);
1651 get_stats_via_vport(const struct netdev
*netdev_
,
1652 struct netdev_stats
*stats
)
1654 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1656 if (!netdev
->vport_stats_error
||
1657 !(netdev
->cache_valid
& VALID_VPORT_STAT_ERROR
)) {
1660 error
= get_stats_via_vport__(netdev_
, stats
);
1661 if (error
&& error
!= ENOENT
&& error
!= ENODEV
) {
1662 VLOG_WARN_RL(&rl
, "%s: obtaining netdev stats via vport failed "
1664 netdev_get_name(netdev_
), ovs_strerror(error
));
1666 netdev
->vport_stats_error
= error
;
1667 netdev
->cache_valid
|= VALID_VPORT_STAT_ERROR
;
1671 /* Retrieves current device stats for 'netdev-linux'. */
1673 netdev_linux_get_stats(const struct netdev
*netdev_
,
1674 struct netdev_stats
*stats
)
1676 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1677 struct netdev_stats dev_stats
;
1680 ovs_mutex_lock(&netdev
->mutex
);
1681 get_stats_via_vport(netdev_
, stats
);
1682 error
= get_stats_via_netlink(netdev_
, &dev_stats
);
1684 if (!netdev
->vport_stats_error
) {
1687 } else if (netdev
->vport_stats_error
) {
1688 /* stats not available from OVS then use netdev stats. */
1691 /* Use kernel netdev's packet and byte counts since vport's counters
1692 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1694 stats
->rx_packets
= dev_stats
.rx_packets
;
1695 stats
->rx_bytes
= dev_stats
.rx_bytes
;
1696 stats
->tx_packets
= dev_stats
.tx_packets
;
1697 stats
->tx_bytes
= dev_stats
.tx_bytes
;
1699 stats
->rx_errors
+= dev_stats
.rx_errors
;
1700 stats
->tx_errors
+= dev_stats
.tx_errors
;
1701 stats
->rx_dropped
+= dev_stats
.rx_dropped
;
1702 stats
->tx_dropped
+= dev_stats
.tx_dropped
;
1703 stats
->multicast
+= dev_stats
.multicast
;
1704 stats
->collisions
+= dev_stats
.collisions
;
1705 stats
->rx_length_errors
+= dev_stats
.rx_length_errors
;
1706 stats
->rx_over_errors
+= dev_stats
.rx_over_errors
;
1707 stats
->rx_crc_errors
+= dev_stats
.rx_crc_errors
;
1708 stats
->rx_frame_errors
+= dev_stats
.rx_frame_errors
;
1709 stats
->rx_fifo_errors
+= dev_stats
.rx_fifo_errors
;
1710 stats
->rx_missed_errors
+= dev_stats
.rx_missed_errors
;
1711 stats
->tx_aborted_errors
+= dev_stats
.tx_aborted_errors
;
1712 stats
->tx_carrier_errors
+= dev_stats
.tx_carrier_errors
;
1713 stats
->tx_fifo_errors
+= dev_stats
.tx_fifo_errors
;
1714 stats
->tx_heartbeat_errors
+= dev_stats
.tx_heartbeat_errors
;
1715 stats
->tx_window_errors
+= dev_stats
.tx_window_errors
;
1717 ovs_mutex_unlock(&netdev
->mutex
);
1722 /* Retrieves current device stats for 'netdev-tap' netdev or
1723 * netdev-internal. */
1725 netdev_tap_get_stats(const struct netdev
*netdev_
, struct netdev_stats
*stats
)
1727 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1728 struct netdev_stats dev_stats
;
1731 ovs_mutex_lock(&netdev
->mutex
);
1732 get_stats_via_vport(netdev_
, stats
);
1733 error
= get_stats_via_netlink(netdev_
, &dev_stats
);
1735 if (!netdev
->vport_stats_error
) {
1738 } else if (netdev
->vport_stats_error
) {
1739 /* Transmit and receive stats will appear to be swapped relative to the
1740 * other ports since we are the one sending the data, not a remote
1741 * computer. For consistency, we swap them back here. This does not
1742 * apply if we are getting stats from the vport layer because it always
1743 * tracks stats from the perspective of the switch. */
1746 swap_uint64(&stats
->rx_packets
, &stats
->tx_packets
);
1747 swap_uint64(&stats
->rx_bytes
, &stats
->tx_bytes
);
1748 swap_uint64(&stats
->rx_errors
, &stats
->tx_errors
);
1749 swap_uint64(&stats
->rx_dropped
, &stats
->tx_dropped
);
1750 stats
->rx_length_errors
= 0;
1751 stats
->rx_over_errors
= 0;
1752 stats
->rx_crc_errors
= 0;
1753 stats
->rx_frame_errors
= 0;
1754 stats
->rx_fifo_errors
= 0;
1755 stats
->rx_missed_errors
= 0;
1756 stats
->tx_aborted_errors
= 0;
1757 stats
->tx_carrier_errors
= 0;
1758 stats
->tx_fifo_errors
= 0;
1759 stats
->tx_heartbeat_errors
= 0;
1760 stats
->tx_window_errors
= 0;
1762 /* Use kernel netdev's packet and byte counts since vport counters
1763 * do not reflect packet counts on the wire when GSO, TSO or GRO
1765 stats
->rx_packets
= dev_stats
.tx_packets
;
1766 stats
->rx_bytes
= dev_stats
.tx_bytes
;
1767 stats
->tx_packets
= dev_stats
.rx_packets
;
1768 stats
->tx_bytes
= dev_stats
.rx_bytes
;
1770 stats
->rx_dropped
+= dev_stats
.tx_dropped
;
1771 stats
->tx_dropped
+= dev_stats
.rx_dropped
;
1773 stats
->rx_errors
+= dev_stats
.tx_errors
;
1774 stats
->tx_errors
+= dev_stats
.rx_errors
;
1776 stats
->multicast
+= dev_stats
.multicast
;
1777 stats
->collisions
+= dev_stats
.collisions
;
1779 ovs_mutex_unlock(&netdev
->mutex
);
1785 netdev_internal_get_stats(const struct netdev
*netdev_
,
1786 struct netdev_stats
*stats
)
1788 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1791 ovs_mutex_lock(&netdev
->mutex
);
1792 get_stats_via_vport(netdev_
, stats
);
1793 error
= netdev
->vport_stats_error
;
1794 ovs_mutex_unlock(&netdev
->mutex
);
1800 netdev_linux_read_features(struct netdev_linux
*netdev
)
1802 struct ethtool_cmd ecmd
;
1806 if (netdev
->cache_valid
& VALID_FEATURES
) {
1810 COVERAGE_INC(netdev_get_ethtool
);
1811 memset(&ecmd
, 0, sizeof ecmd
);
1812 error
= netdev_linux_do_ethtool(netdev
->up
.name
, &ecmd
,
1813 ETHTOOL_GSET
, "ETHTOOL_GSET");
1818 /* Supported features. */
1819 netdev
->supported
= 0;
1820 if (ecmd
.supported
& SUPPORTED_10baseT_Half
) {
1821 netdev
->supported
|= NETDEV_F_10MB_HD
;
1823 if (ecmd
.supported
& SUPPORTED_10baseT_Full
) {
1824 netdev
->supported
|= NETDEV_F_10MB_FD
;
1826 if (ecmd
.supported
& SUPPORTED_100baseT_Half
) {
1827 netdev
->supported
|= NETDEV_F_100MB_HD
;
1829 if (ecmd
.supported
& SUPPORTED_100baseT_Full
) {
1830 netdev
->supported
|= NETDEV_F_100MB_FD
;
1832 if (ecmd
.supported
& SUPPORTED_1000baseT_Half
) {
1833 netdev
->supported
|= NETDEV_F_1GB_HD
;
1835 if ((ecmd
.supported
& SUPPORTED_1000baseT_Full
) ||
1836 (ecmd
.supported
& SUPPORTED_1000baseKX_Full
)) {
1837 netdev
->supported
|= NETDEV_F_1GB_FD
;
1839 if ((ecmd
.supported
& SUPPORTED_10000baseT_Full
) ||
1840 (ecmd
.supported
& SUPPORTED_10000baseKX4_Full
) ||
1841 (ecmd
.supported
& SUPPORTED_10000baseKR_Full
) ||
1842 (ecmd
.supported
& SUPPORTED_10000baseR_FEC
)) {
1843 netdev
->supported
|= NETDEV_F_10GB_FD
;
1845 if ((ecmd
.supported
& SUPPORTED_40000baseKR4_Full
) ||
1846 (ecmd
.supported
& SUPPORTED_40000baseCR4_Full
) ||
1847 (ecmd
.supported
& SUPPORTED_40000baseSR4_Full
) ||
1848 (ecmd
.supported
& SUPPORTED_40000baseLR4_Full
)) {
1849 netdev
->supported
|= NETDEV_F_40GB_FD
;
1851 if (ecmd
.supported
& SUPPORTED_TP
) {
1852 netdev
->supported
|= NETDEV_F_COPPER
;
1854 if (ecmd
.supported
& SUPPORTED_FIBRE
) {
1855 netdev
->supported
|= NETDEV_F_FIBER
;
1857 if (ecmd
.supported
& SUPPORTED_Autoneg
) {
1858 netdev
->supported
|= NETDEV_F_AUTONEG
;
1860 if (ecmd
.supported
& SUPPORTED_Pause
) {
1861 netdev
->supported
|= NETDEV_F_PAUSE
;
1863 if (ecmd
.supported
& SUPPORTED_Asym_Pause
) {
1864 netdev
->supported
|= NETDEV_F_PAUSE_ASYM
;
1867 /* Advertised features. */
1868 netdev
->advertised
= 0;
1869 if (ecmd
.advertising
& ADVERTISED_10baseT_Half
) {
1870 netdev
->advertised
|= NETDEV_F_10MB_HD
;
1872 if (ecmd
.advertising
& ADVERTISED_10baseT_Full
) {
1873 netdev
->advertised
|= NETDEV_F_10MB_FD
;
1875 if (ecmd
.advertising
& ADVERTISED_100baseT_Half
) {
1876 netdev
->advertised
|= NETDEV_F_100MB_HD
;
1878 if (ecmd
.advertising
& ADVERTISED_100baseT_Full
) {
1879 netdev
->advertised
|= NETDEV_F_100MB_FD
;
1881 if (ecmd
.advertising
& ADVERTISED_1000baseT_Half
) {
1882 netdev
->advertised
|= NETDEV_F_1GB_HD
;
1884 if ((ecmd
.advertising
& ADVERTISED_1000baseT_Full
) ||
1885 (ecmd
.advertising
& ADVERTISED_1000baseKX_Full
)) {
1886 netdev
->advertised
|= NETDEV_F_1GB_FD
;
1888 if ((ecmd
.advertising
& ADVERTISED_10000baseT_Full
) ||
1889 (ecmd
.advertising
& ADVERTISED_10000baseKX4_Full
) ||
1890 (ecmd
.advertising
& ADVERTISED_10000baseKR_Full
) ||
1891 (ecmd
.advertising
& ADVERTISED_10000baseR_FEC
)) {
1892 netdev
->advertised
|= NETDEV_F_10GB_FD
;
1894 if ((ecmd
.advertising
& ADVERTISED_40000baseKR4_Full
) ||
1895 (ecmd
.advertising
& ADVERTISED_40000baseCR4_Full
) ||
1896 (ecmd
.advertising
& ADVERTISED_40000baseSR4_Full
) ||
1897 (ecmd
.advertising
& ADVERTISED_40000baseLR4_Full
)) {
1898 netdev
->advertised
|= NETDEV_F_40GB_FD
;
1900 if (ecmd
.advertising
& ADVERTISED_TP
) {
1901 netdev
->advertised
|= NETDEV_F_COPPER
;
1903 if (ecmd
.advertising
& ADVERTISED_FIBRE
) {
1904 netdev
->advertised
|= NETDEV_F_FIBER
;
1906 if (ecmd
.advertising
& ADVERTISED_Autoneg
) {
1907 netdev
->advertised
|= NETDEV_F_AUTONEG
;
1909 if (ecmd
.advertising
& ADVERTISED_Pause
) {
1910 netdev
->advertised
|= NETDEV_F_PAUSE
;
1912 if (ecmd
.advertising
& ADVERTISED_Asym_Pause
) {
1913 netdev
->advertised
|= NETDEV_F_PAUSE_ASYM
;
1916 /* Current settings. */
1917 speed
= ethtool_cmd_speed(&ecmd
);
1918 if (speed
== SPEED_10
) {
1919 netdev
->current
= ecmd
.duplex
? NETDEV_F_10MB_FD
: NETDEV_F_10MB_HD
;
1920 } else if (speed
== SPEED_100
) {
1921 netdev
->current
= ecmd
.duplex
? NETDEV_F_100MB_FD
: NETDEV_F_100MB_HD
;
1922 } else if (speed
== SPEED_1000
) {
1923 netdev
->current
= ecmd
.duplex
? NETDEV_F_1GB_FD
: NETDEV_F_1GB_HD
;
1924 } else if (speed
== SPEED_10000
) {
1925 netdev
->current
= NETDEV_F_10GB_FD
;
1926 } else if (speed
== 40000) {
1927 netdev
->current
= NETDEV_F_40GB_FD
;
1928 } else if (speed
== 100000) {
1929 netdev
->current
= NETDEV_F_100GB_FD
;
1930 } else if (speed
== 1000000) {
1931 netdev
->current
= NETDEV_F_1TB_FD
;
1933 netdev
->current
= 0;
1936 if (ecmd
.port
== PORT_TP
) {
1937 netdev
->current
|= NETDEV_F_COPPER
;
1938 } else if (ecmd
.port
== PORT_FIBRE
) {
1939 netdev
->current
|= NETDEV_F_FIBER
;
1943 netdev
->current
|= NETDEV_F_AUTONEG
;
1947 netdev
->cache_valid
|= VALID_FEATURES
;
1948 netdev
->get_features_error
= error
;
1951 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1952 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1953 * Returns 0 if successful, otherwise a positive errno value. */
1955 netdev_linux_get_features(const struct netdev
*netdev_
,
1956 enum netdev_features
*current
,
1957 enum netdev_features
*advertised
,
1958 enum netdev_features
*supported
,
1959 enum netdev_features
*peer
)
1961 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1964 ovs_mutex_lock(&netdev
->mutex
);
1965 netdev_linux_read_features(netdev
);
1966 if (!netdev
->get_features_error
) {
1967 *current
= netdev
->current
;
1968 *advertised
= netdev
->advertised
;
1969 *supported
= netdev
->supported
;
1970 *peer
= 0; /* XXX */
1972 error
= netdev
->get_features_error
;
1973 ovs_mutex_unlock(&netdev
->mutex
);
1978 /* Set the features advertised by 'netdev' to 'advertise'. */
1980 netdev_linux_set_advertisements(struct netdev
*netdev_
,
1981 enum netdev_features advertise
)
1983 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1984 struct ethtool_cmd ecmd
;
1987 ovs_mutex_lock(&netdev
->mutex
);
1989 COVERAGE_INC(netdev_get_ethtool
);
1990 memset(&ecmd
, 0, sizeof ecmd
);
1991 error
= netdev_linux_do_ethtool(netdev_get_name(netdev_
), &ecmd
,
1992 ETHTOOL_GSET
, "ETHTOOL_GSET");
1997 ecmd
.advertising
= 0;
1998 if (advertise
& NETDEV_F_10MB_HD
) {
1999 ecmd
.advertising
|= ADVERTISED_10baseT_Half
;
2001 if (advertise
& NETDEV_F_10MB_FD
) {
2002 ecmd
.advertising
|= ADVERTISED_10baseT_Full
;
2004 if (advertise
& NETDEV_F_100MB_HD
) {
2005 ecmd
.advertising
|= ADVERTISED_100baseT_Half
;
2007 if (advertise
& NETDEV_F_100MB_FD
) {
2008 ecmd
.advertising
|= ADVERTISED_100baseT_Full
;
2010 if (advertise
& NETDEV_F_1GB_HD
) {
2011 ecmd
.advertising
|= ADVERTISED_1000baseT_Half
;
2013 if (advertise
& NETDEV_F_1GB_FD
) {
2014 ecmd
.advertising
|= ADVERTISED_1000baseT_Full
;
2016 if (advertise
& NETDEV_F_10GB_FD
) {
2017 ecmd
.advertising
|= ADVERTISED_10000baseT_Full
;
2019 if (advertise
& NETDEV_F_COPPER
) {
2020 ecmd
.advertising
|= ADVERTISED_TP
;
2022 if (advertise
& NETDEV_F_FIBER
) {
2023 ecmd
.advertising
|= ADVERTISED_FIBRE
;
2025 if (advertise
& NETDEV_F_AUTONEG
) {
2026 ecmd
.advertising
|= ADVERTISED_Autoneg
;
2028 if (advertise
& NETDEV_F_PAUSE
) {
2029 ecmd
.advertising
|= ADVERTISED_Pause
;
2031 if (advertise
& NETDEV_F_PAUSE_ASYM
) {
2032 ecmd
.advertising
|= ADVERTISED_Asym_Pause
;
2034 COVERAGE_INC(netdev_set_ethtool
);
2035 error
= netdev_linux_do_ethtool(netdev_get_name(netdev_
), &ecmd
,
2036 ETHTOOL_SSET
, "ETHTOOL_SSET");
2039 ovs_mutex_unlock(&netdev
->mutex
);
2043 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
2044 * successful, otherwise a positive errno value. */
2046 netdev_linux_set_policing(struct netdev
*netdev_
,
2047 uint32_t kbits_rate
, uint32_t kbits_burst
)
2049 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2050 const char *netdev_name
= netdev_get_name(netdev_
);
2053 kbits_burst
= (!kbits_rate
? 0 /* Force to 0 if no rate specified. */
2054 : !kbits_burst
? 8000 /* Default to 8000 kbits if 0. */
2055 : kbits_burst
); /* Stick with user-specified value. */
2057 ovs_mutex_lock(&netdev
->mutex
);
2058 if (netdev
->cache_valid
& VALID_POLICING
) {
2059 error
= netdev
->netdev_policing_error
;
2060 if (error
|| (netdev
->kbits_rate
== kbits_rate
&&
2061 netdev
->kbits_burst
== kbits_burst
)) {
2062 /* Assume that settings haven't changed since we last set them. */
2065 netdev
->cache_valid
&= ~VALID_POLICING
;
2068 COVERAGE_INC(netdev_set_policing
);
2069 /* Remove any existing ingress qdisc. */
2070 error
= tc_add_del_ingress_qdisc(netdev_
, false);
2072 VLOG_WARN_RL(&rl
, "%s: removing policing failed: %s",
2073 netdev_name
, ovs_strerror(error
));
2078 error
= tc_add_del_ingress_qdisc(netdev_
, true);
2080 VLOG_WARN_RL(&rl
, "%s: adding policing qdisc failed: %s",
2081 netdev_name
, ovs_strerror(error
));
2085 error
= tc_add_policer(netdev_
, kbits_rate
, kbits_burst
);
2087 VLOG_WARN_RL(&rl
, "%s: adding policing action failed: %s",
2088 netdev_name
, ovs_strerror(error
));
2093 netdev
->kbits_rate
= kbits_rate
;
2094 netdev
->kbits_burst
= kbits_burst
;
2097 if (!error
|| error
== ENODEV
) {
2098 netdev
->netdev_policing_error
= error
;
2099 netdev
->cache_valid
|= VALID_POLICING
;
2101 ovs_mutex_unlock(&netdev
->mutex
);
2106 netdev_linux_get_qos_types(const struct netdev
*netdev OVS_UNUSED
,
2109 const struct tc_ops
*const *opsp
;
2110 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2111 const struct tc_ops
*ops
= *opsp
;
2112 if (ops
->tc_install
&& ops
->ovs_name
[0] != '\0') {
2113 sset_add(types
, ops
->ovs_name
);
2119 static const struct tc_ops
*
2120 tc_lookup_ovs_name(const char *name
)
2122 const struct tc_ops
*const *opsp
;
2124 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2125 const struct tc_ops
*ops
= *opsp
;
2126 if (!strcmp(name
, ops
->ovs_name
)) {
2133 static const struct tc_ops
*
2134 tc_lookup_linux_name(const char *name
)
2136 const struct tc_ops
*const *opsp
;
2138 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2139 const struct tc_ops
*ops
= *opsp
;
2140 if (ops
->linux_name
&& !strcmp(name
, ops
->linux_name
)) {
2147 static struct tc_queue
*
2148 tc_find_queue__(const struct netdev
*netdev_
, unsigned int queue_id
,
2151 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2152 struct tc_queue
*queue
;
2154 HMAP_FOR_EACH_IN_BUCKET (queue
, hmap_node
, hash
, &netdev
->tc
->queues
) {
2155 if (queue
->queue_id
== queue_id
) {
/* Convenience wrapper around tc_find_queue__() that hashes 'queue_id'. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
}
2169 netdev_linux_get_qos_capabilities(const struct netdev
*netdev OVS_UNUSED
,
2171 struct netdev_qos_capabilities
*caps
)
2173 const struct tc_ops
*ops
= tc_lookup_ovs_name(type
);
2177 caps
->n_queues
= ops
->n_queues
;
2182 netdev_linux_get_qos(const struct netdev
*netdev_
,
2183 const char **typep
, struct smap
*details
)
2185 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2188 ovs_mutex_lock(&netdev
->mutex
);
2189 error
= tc_query_qdisc(netdev_
);
2191 *typep
= netdev
->tc
->ops
->ovs_name
;
2192 error
= (netdev
->tc
->ops
->qdisc_get
2193 ? netdev
->tc
->ops
->qdisc_get(netdev_
, details
)
2196 ovs_mutex_unlock(&netdev
->mutex
);
2202 netdev_linux_set_qos(struct netdev
*netdev_
,
2203 const char *type
, const struct smap
*details
)
2205 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2206 const struct tc_ops
*new_ops
;
2209 new_ops
= tc_lookup_ovs_name(type
);
2210 if (!new_ops
|| !new_ops
->tc_install
) {
2214 if (new_ops
== &tc_ops_noop
) {
2215 return new_ops
->tc_install(netdev_
, details
);
2218 ovs_mutex_lock(&netdev
->mutex
);
2219 error
= tc_query_qdisc(netdev_
);
2224 if (new_ops
== netdev
->tc
->ops
) {
2225 error
= new_ops
->qdisc_set
? new_ops
->qdisc_set(netdev_
, details
) : 0;
2227 /* Delete existing qdisc. */
2228 error
= tc_del_qdisc(netdev_
);
2232 ovs_assert(netdev
->tc
== NULL
);
2234 /* Install new qdisc. */
2235 error
= new_ops
->tc_install(netdev_
, details
);
2236 ovs_assert((error
== 0) == (netdev
->tc
!= NULL
));
2240 ovs_mutex_unlock(&netdev
->mutex
);
2245 netdev_linux_get_queue(const struct netdev
*netdev_
,
2246 unsigned int queue_id
, struct smap
*details
)
2248 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2251 ovs_mutex_lock(&netdev
->mutex
);
2252 error
= tc_query_qdisc(netdev_
);
2254 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2256 ? netdev
->tc
->ops
->class_get(netdev_
, queue
, details
)
2259 ovs_mutex_unlock(&netdev
->mutex
);
2265 netdev_linux_set_queue(struct netdev
*netdev_
,
2266 unsigned int queue_id
, const struct smap
*details
)
2268 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2271 ovs_mutex_lock(&netdev
->mutex
);
2272 error
= tc_query_qdisc(netdev_
);
2274 error
= (queue_id
< netdev
->tc
->ops
->n_queues
2275 && netdev
->tc
->ops
->class_set
2276 ? netdev
->tc
->ops
->class_set(netdev_
, queue_id
, details
)
2279 ovs_mutex_unlock(&netdev
->mutex
);
2285 netdev_linux_delete_queue(struct netdev
*netdev_
, unsigned int queue_id
)
2287 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2290 ovs_mutex_lock(&netdev
->mutex
);
2291 error
= tc_query_qdisc(netdev_
);
2293 if (netdev
->tc
->ops
->class_delete
) {
2294 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2296 ? netdev
->tc
->ops
->class_delete(netdev_
, queue
)
2302 ovs_mutex_unlock(&netdev
->mutex
);
2308 netdev_linux_get_queue_stats(const struct netdev
*netdev_
,
2309 unsigned int queue_id
,
2310 struct netdev_queue_stats
*stats
)
2312 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2315 ovs_mutex_lock(&netdev
->mutex
);
2316 error
= tc_query_qdisc(netdev_
);
2318 if (netdev
->tc
->ops
->class_get_stats
) {
2319 const struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2321 stats
->created
= queue
->created
;
2322 error
= netdev
->tc
->ops
->class_get_stats(netdev_
, queue
,
2331 ovs_mutex_unlock(&netdev
->mutex
);
2336 struct queue_dump_state
{
2337 struct nl_dump dump
;
2342 start_queue_dump(const struct netdev
*netdev
, struct queue_dump_state
*state
)
2344 struct ofpbuf request
;
2345 struct tcmsg
*tcmsg
;
2347 tcmsg
= tc_make_request(netdev
, RTM_GETTCLASS
, 0, &request
);
2351 tcmsg
->tcm_parent
= 0;
2352 nl_dump_start(&state
->dump
, NETLINK_ROUTE
, &request
);
2353 ofpbuf_uninit(&request
);
2355 ofpbuf_init(&state
->buf
, NL_DUMP_BUFSIZE
);
2360 finish_queue_dump(struct queue_dump_state
*state
)
2362 ofpbuf_uninit(&state
->buf
);
2363 return nl_dump_done(&state
->dump
);
2366 struct netdev_linux_queue_state
{
2367 unsigned int *queues
;
2373 netdev_linux_queue_dump_start(const struct netdev
*netdev_
, void **statep
)
2375 const struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2378 ovs_mutex_lock(&netdev
->mutex
);
2379 error
= tc_query_qdisc(netdev_
);
2381 if (netdev
->tc
->ops
->class_get
) {
2382 struct netdev_linux_queue_state
*state
;
2383 struct tc_queue
*queue
;
2386 *statep
= state
= xmalloc(sizeof *state
);
2387 state
->n_queues
= hmap_count(&netdev
->tc
->queues
);
2388 state
->cur_queue
= 0;
2389 state
->queues
= xmalloc(state
->n_queues
* sizeof *state
->queues
);
2392 HMAP_FOR_EACH (queue
, hmap_node
, &netdev
->tc
->queues
) {
2393 state
->queues
[i
++] = queue
->queue_id
;
2399 ovs_mutex_unlock(&netdev
->mutex
);
2405 netdev_linux_queue_dump_next(const struct netdev
*netdev_
, void *state_
,
2406 unsigned int *queue_idp
, struct smap
*details
)
2408 const struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2409 struct netdev_linux_queue_state
*state
= state_
;
2412 ovs_mutex_lock(&netdev
->mutex
);
2413 while (state
->cur_queue
< state
->n_queues
) {
2414 unsigned int queue_id
= state
->queues
[state
->cur_queue
++];
2415 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2418 *queue_idp
= queue_id
;
2419 error
= netdev
->tc
->ops
->class_get(netdev_
, queue
, details
);
2423 ovs_mutex_unlock(&netdev
->mutex
);
2429 netdev_linux_queue_dump_done(const struct netdev
*netdev OVS_UNUSED
,
2432 struct netdev_linux_queue_state
*state
= state_
;
2434 free(state
->queues
);
2440 netdev_linux_dump_queue_stats(const struct netdev
*netdev_
,
2441 netdev_dump_queue_stats_cb
*cb
, void *aux
)
2443 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2446 ovs_mutex_lock(&netdev
->mutex
);
2447 error
= tc_query_qdisc(netdev_
);
2449 struct queue_dump_state state
;
2451 if (!netdev
->tc
->ops
->class_dump_stats
) {
2453 } else if (!start_queue_dump(netdev_
, &state
)) {
2459 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
2460 retval
= netdev
->tc
->ops
->class_dump_stats(netdev_
, &msg
,
2467 retval
= finish_queue_dump(&state
);
2473 ovs_mutex_unlock(&netdev
->mutex
);
2479 netdev_linux_set_in4(struct netdev
*netdev_
, struct in_addr address
,
2480 struct in_addr netmask
)
2482 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2485 ovs_mutex_lock(&netdev
->mutex
);
2486 error
= do_set_addr(netdev_
, SIOCSIFADDR
, "SIOCSIFADDR", address
);
2488 if (address
.s_addr
!= INADDR_ANY
) {
2489 error
= do_set_addr(netdev_
, SIOCSIFNETMASK
,
2490 "SIOCSIFNETMASK", netmask
);
2494 ovs_mutex_unlock(&netdev
->mutex
);
2499 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address.
2500 * Otherwise, sets '*in6' to 'in6addr_any' and returns the corresponding
2503 netdev_linux_get_addr_list(const struct netdev
*netdev_
,
2504 struct in6_addr
**addr
, struct in6_addr
**mask
, int *n_cnt
)
2506 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2509 ovs_mutex_lock(&netdev
->mutex
);
2510 error
= netdev_get_addrs(netdev_get_name(netdev_
), addr
, mask
, n_cnt
);
2511 ovs_mutex_unlock(&netdev
->mutex
);
2517 make_in4_sockaddr(struct sockaddr
*sa
, struct in_addr addr
)
2519 struct sockaddr_in sin
;
2520 memset(&sin
, 0, sizeof sin
);
2521 sin
.sin_family
= AF_INET
;
2522 sin
.sin_addr
= addr
;
2525 memset(sa
, 0, sizeof *sa
);
2526 memcpy(sa
, &sin
, sizeof sin
);
2530 do_set_addr(struct netdev
*netdev
,
2531 int ioctl_nr
, const char *ioctl_name
, struct in_addr addr
)
2535 make_in4_sockaddr(&ifr
.ifr_addr
, addr
);
2536 return af_inet_ifreq_ioctl(netdev_get_name(netdev
), &ifr
, ioctl_nr
,
2540 /* Adds 'router' as a default IP gateway. */
2542 netdev_linux_add_router(struct netdev
*netdev OVS_UNUSED
, struct in_addr router
)
2544 struct in_addr any
= { INADDR_ANY
};
2548 memset(&rt
, 0, sizeof rt
);
2549 make_in4_sockaddr(&rt
.rt_dst
, any
);
2550 make_in4_sockaddr(&rt
.rt_gateway
, router
);
2551 make_in4_sockaddr(&rt
.rt_genmask
, any
);
2552 rt
.rt_flags
= RTF_UP
| RTF_GATEWAY
;
2553 error
= af_inet_ioctl(SIOCADDRT
, &rt
);
2555 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error
));
2561 netdev_linux_get_next_hop(const struct in_addr
*host
, struct in_addr
*next_hop
,
2564 static const char fn
[] = "/proc/net/route";
2569 *netdev_name
= NULL
;
2570 stream
= fopen(fn
, "r");
2571 if (stream
== NULL
) {
2572 VLOG_WARN_RL(&rl
, "%s: open failed: %s", fn
, ovs_strerror(errno
));
2577 while (fgets(line
, sizeof line
, stream
)) {
2580 ovs_be32 dest
, gateway
, mask
;
2581 int refcnt
, metric
, mtu
;
2582 unsigned int flags
, use
, window
, irtt
;
2585 "%16s %"SCNx32
" %"SCNx32
" %04X %d %u %d %"SCNx32
2587 iface
, &dest
, &gateway
, &flags
, &refcnt
,
2588 &use
, &metric
, &mask
, &mtu
, &window
, &irtt
)) {
2589 VLOG_WARN_RL(&rl
, "%s: could not parse line %d: %s",
2593 if (!(flags
& RTF_UP
)) {
2594 /* Skip routes that aren't up. */
2598 /* The output of 'dest', 'mask', and 'gateway' were given in
2599 * network byte order, so we don't need need any endian
2600 * conversions here. */
2601 if ((dest
& mask
) == (host
->s_addr
& mask
)) {
2603 /* The host is directly reachable. */
2604 next_hop
->s_addr
= 0;
2606 /* To reach the host, we must go through a gateway. */
2607 next_hop
->s_addr
= gateway
;
2609 *netdev_name
= xstrdup(iface
);
2621 netdev_linux_get_status(const struct netdev
*netdev_
, struct smap
*smap
)
2623 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2626 ovs_mutex_lock(&netdev
->mutex
);
2627 if (!(netdev
->cache_valid
& VALID_DRVINFO
)) {
2628 struct ethtool_cmd
*cmd
= (struct ethtool_cmd
*) &netdev
->drvinfo
;
2630 COVERAGE_INC(netdev_get_ethtool
);
2631 memset(&netdev
->drvinfo
, 0, sizeof netdev
->drvinfo
);
2632 error
= netdev_linux_do_ethtool(netdev
->up
.name
,
2635 "ETHTOOL_GDRVINFO");
2637 netdev
->cache_valid
|= VALID_DRVINFO
;
2642 smap_add(smap
, "driver_name", netdev
->drvinfo
.driver
);
2643 smap_add(smap
, "driver_version", netdev
->drvinfo
.version
);
2644 smap_add(smap
, "firmware_version", netdev
->drvinfo
.fw_version
);
2646 ovs_mutex_unlock(&netdev
->mutex
);
2652 netdev_internal_get_status(const struct netdev
*netdev OVS_UNUSED
,
2655 smap_add(smap
, "driver_name", "openvswitch");
2659 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2660 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2661 * returns 0. Otherwise, it returns a positive errno value; in particular,
2662 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2664 netdev_linux_arp_lookup(const struct netdev
*netdev
,
2665 ovs_be32 ip
, struct eth_addr
*mac
)
2668 struct sockaddr_in sin
;
2671 memset(&r
, 0, sizeof r
);
2672 memset(&sin
, 0, sizeof sin
);
2673 sin
.sin_family
= AF_INET
;
2674 sin
.sin_addr
.s_addr
= ip
;
2676 memcpy(&r
.arp_pa
, &sin
, sizeof sin
);
2677 r
.arp_ha
.sa_family
= ARPHRD_ETHER
;
2679 ovs_strzcpy(r
.arp_dev
, netdev_get_name(netdev
), sizeof r
.arp_dev
);
2680 COVERAGE_INC(netdev_arp_lookup
);
2681 retval
= af_inet_ioctl(SIOCGARP
, &r
);
2683 memcpy(mac
, r
.arp_ha
.sa_data
, ETH_ADDR_LEN
);
2684 } else if (retval
!= ENXIO
) {
2685 VLOG_WARN_RL(&rl
, "%s: could not look up ARP entry for "IP_FMT
": %s",
2686 netdev_get_name(netdev
), IP_ARGS(ip
),
2687 ovs_strerror(retval
));
2693 nd_to_iff_flags(enum netdev_flags nd
)
2696 if (nd
& NETDEV_UP
) {
2699 if (nd
& NETDEV_PROMISC
) {
2702 if (nd
& NETDEV_LOOPBACK
) {
2703 iff
|= IFF_LOOPBACK
;
2709 iff_to_nd_flags(int iff
)
2711 enum netdev_flags nd
= 0;
2715 if (iff
& IFF_PROMISC
) {
2716 nd
|= NETDEV_PROMISC
;
2718 if (iff
& IFF_LOOPBACK
) {
2719 nd
|= NETDEV_LOOPBACK
;
2725 update_flags(struct netdev_linux
*netdev
, enum netdev_flags off
,
2726 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
2727 OVS_REQUIRES(netdev
->mutex
)
2729 int old_flags
, new_flags
;
2732 old_flags
= netdev
->ifi_flags
;
2733 *old_flagsp
= iff_to_nd_flags(old_flags
);
2734 new_flags
= (old_flags
& ~nd_to_iff_flags(off
)) | nd_to_iff_flags(on
);
2735 if (new_flags
!= old_flags
) {
2736 error
= set_flags(netdev_get_name(&netdev
->up
), new_flags
);
2737 get_flags(&netdev
->up
, &netdev
->ifi_flags
);
2744 netdev_linux_update_flags(struct netdev
*netdev_
, enum netdev_flags off
,
2745 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
2747 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2750 ovs_mutex_lock(&netdev
->mutex
);
2751 error
= update_flags(netdev
, off
, on
, old_flagsp
);
2752 ovs_mutex_unlock(&netdev
->mutex
);
/* NOTE(review): this is the positional netdev_class vtable macro and the
 * three class instantiations ("system", "tap", "internal").  The text below
 * is extraction-damaged: many initializer entries were dropped and the
 * remaining ones are fragmented.  Because the struct is initialized
 * positionally, any reconstruction risks silently wiring the wrong function
 * into an ops slot, so the text is left untouched here — restore it verbatim
 * from the upstream Open vSwitch netdev-linux.c of the matching version
 * before building.  Assumed layout (TODO confirm against netdev-provider.h):
 * name/is_pmd, lifecycle hooks, send/recv, ethaddr/mtu/ifindex/carrier,
 * stats/features, QoS and queue ops, IPv4/routing/ARP helpers, flags, and
 * the rxq ops. */
2757 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, \
2758 GET_FEATURES, GET_STATUS) \
2761 false, /* is_pmd */ \
2765 netdev_linux_wait, \
2767 netdev_linux_alloc, \
2769 netdev_linux_destruct, \
2770 netdev_linux_dealloc, \
2771 NULL, /* get_config */ \
2772 NULL, /* set_config */ \
2773 NULL, /* get_tunnel_config */ \
2774 NULL, /* build header */ \
2775 NULL, /* push header */ \
2776 NULL, /* pop header */ \
2777 NULL, /* get_numa_id */ \
2778 NULL, /* set_tx_multiq */ \
2780 netdev_linux_send, \
2781 netdev_linux_send_wait, \
2783 netdev_linux_set_etheraddr, \
2784 netdev_linux_get_etheraddr, \
2785 netdev_linux_get_mtu, \
2786 netdev_linux_set_mtu, \
2787 netdev_linux_get_ifindex, \
2788 netdev_linux_get_carrier, \
2789 netdev_linux_get_carrier_resets, \
2790 netdev_linux_set_miimon_interval, \
2794 netdev_linux_set_advertisements, \
2796 netdev_linux_set_policing, \
2797 netdev_linux_get_qos_types, \
2798 netdev_linux_get_qos_capabilities, \
2799 netdev_linux_get_qos, \
2800 netdev_linux_set_qos, \
2801 netdev_linux_get_queue, \
2802 netdev_linux_set_queue, \
2803 netdev_linux_delete_queue, \
2804 netdev_linux_get_queue_stats, \
2805 netdev_linux_queue_dump_start, \
2806 netdev_linux_queue_dump_next, \
2807 netdev_linux_queue_dump_done, \
2808 netdev_linux_dump_queue_stats, \
2810 netdev_linux_set_in4, \
2811 netdev_linux_get_addr_list, \
2812 netdev_linux_add_router, \
2813 netdev_linux_get_next_hop, \
2815 netdev_linux_arp_lookup, \
2817 netdev_linux_update_flags, \
2818 NULL, /* reconfigure */ \
2820 netdev_linux_rxq_alloc, \
2821 netdev_linux_rxq_construct, \
2822 netdev_linux_rxq_destruct, \
2823 netdev_linux_rxq_dealloc, \
2824 netdev_linux_rxq_recv, \
2825 netdev_linux_rxq_wait, \
2826 netdev_linux_rxq_drain, \
2829 const struct netdev_class netdev_linux_class
=
2832 netdev_linux_construct
,
2833 netdev_linux_get_stats
,
2834 netdev_linux_get_features
,
2835 netdev_linux_get_status
);
2837 const struct netdev_class netdev_tap_class
=
2840 netdev_linux_construct_tap
,
2841 netdev_tap_get_stats
,
2842 netdev_linux_get_features
,
2843 netdev_linux_get_status
);
2845 const struct netdev_class netdev_internal_class
=
2848 netdev_linux_construct
,
2849 netdev_internal_get_stats
,
2850 NULL
, /* get_features */
2851 netdev_internal_get_status
);
2854 #define CODEL_N_QUEUES 0x0000
2856 /* In sufficiently new kernel headers these are defined as enums in
2857 * <linux/pkt_sched.h>. Define them here as macros to help out with older
2858 * kernels. (This overrides any enum definition in the header file but that's
2860 #define TCA_CODEL_TARGET 1
2861 #define TCA_CODEL_LIMIT 2
2862 #define TCA_CODEL_INTERVAL 3
2871 static struct codel
*
2872 codel_get__(const struct netdev
*netdev_
)
2874 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2875 return CONTAINER_OF(netdev
->tc
, struct codel
, tc
);
2879 codel_install__(struct netdev
*netdev_
, uint32_t target
, uint32_t limit
,
2882 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2883 struct codel
*codel
;
2885 codel
= xmalloc(sizeof *codel
);
2886 tc_init(&codel
->tc
, &tc_ops_codel
);
2887 codel
->target
= target
;
2888 codel
->limit
= limit
;
2889 codel
->interval
= interval
;
2891 netdev
->tc
= &codel
->tc
;
2895 codel_setup_qdisc__(struct netdev
*netdev
, uint32_t target
, uint32_t limit
,
2899 struct ofpbuf request
;
2900 struct tcmsg
*tcmsg
;
2901 uint32_t otarget
, olimit
, ointerval
;
2904 tc_del_qdisc(netdev
);
2906 tcmsg
= tc_make_request(netdev
, RTM_NEWQDISC
,
2907 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
2911 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
2912 tcmsg
->tcm_parent
= TC_H_ROOT
;
2914 otarget
= target
? target
: 5000;
2915 olimit
= limit
? limit
: 10240;
2916 ointerval
= interval
? interval
: 100000;
2918 nl_msg_put_string(&request
, TCA_KIND
, "codel");
2919 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
2920 nl_msg_put_u32(&request
, TCA_CODEL_TARGET
, otarget
);
2921 nl_msg_put_u32(&request
, TCA_CODEL_LIMIT
, olimit
);
2922 nl_msg_put_u32(&request
, TCA_CODEL_INTERVAL
, ointerval
);
2923 nl_msg_end_nested(&request
, opt_offset
);
2925 error
= tc_transact(&request
, NULL
);
2927 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
2928 "target %u, limit %u, interval %u error %d(%s)",
2929 netdev_get_name(netdev
),
2930 otarget
, olimit
, ointerval
,
2931 error
, ovs_strerror(error
));
2937 codel_parse_qdisc_details__(struct netdev
*netdev OVS_UNUSED
,
2938 const struct smap
*details
, struct codel
*codel
)
2940 codel
->target
= smap_get_ullong(details
, "target", 0);
2941 codel
->limit
= smap_get_ullong(details
, "limit", 0);
2942 codel
->interval
= smap_get_ullong(details
, "interval", 0);
2944 if (!codel
->target
) {
2945 codel
->target
= 5000;
2947 if (!codel
->limit
) {
2948 codel
->limit
= 10240;
2950 if (!codel
->interval
) {
2951 codel
->interval
= 100000;
2956 codel_tc_install(struct netdev
*netdev
, const struct smap
*details
)
2961 codel_parse_qdisc_details__(netdev
, details
, &codel
);
2962 error
= codel_setup_qdisc__(netdev
, codel
.target
, codel
.limit
,
2965 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
2971 codel_parse_tca_options__(struct nlattr
*nl_options
, struct codel
*codel
)
2973 static const struct nl_policy tca_codel_policy
[] = {
2974 [TCA_CODEL_TARGET
] = { .type
= NL_A_U32
},
2975 [TCA_CODEL_LIMIT
] = { .type
= NL_A_U32
},
2976 [TCA_CODEL_INTERVAL
] = { .type
= NL_A_U32
}
2979 struct nlattr
*attrs
[ARRAY_SIZE(tca_codel_policy
)];
2981 if (!nl_parse_nested(nl_options
, tca_codel_policy
,
2982 attrs
, ARRAY_SIZE(tca_codel_policy
))) {
2983 VLOG_WARN_RL(&rl
, "failed to parse CoDel class options");
2987 codel
->target
= nl_attr_get_u32(attrs
[TCA_CODEL_TARGET
]);
2988 codel
->limit
= nl_attr_get_u32(attrs
[TCA_CODEL_LIMIT
]);
2989 codel
->interval
= nl_attr_get_u32(attrs
[TCA_CODEL_INTERVAL
]);
2994 codel_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
2996 struct nlattr
*nlattr
;
3001 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
3006 error
= codel_parse_tca_options__(nlattr
, &codel
);
3011 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
3017 codel_tc_destroy(struct tc
*tc
)
3019 struct codel
*codel
= CONTAINER_OF(tc
, struct codel
, tc
);
3025 codel_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
3027 const struct codel
*codel
= codel_get__(netdev
);
3028 smap_add_format(details
, "target", "%u", codel
->target
);
3029 smap_add_format(details
, "limit", "%u", codel
->limit
);
3030 smap_add_format(details
, "interval", "%u", codel
->interval
);
3035 codel_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
3039 codel_parse_qdisc_details__(netdev
, details
, &codel
);
3040 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
3041 codel_get__(netdev
)->target
= codel
.target
;
3042 codel_get__(netdev
)->limit
= codel
.limit
;
3043 codel_get__(netdev
)->interval
= codel
.interval
;
3047 static const struct tc_ops tc_ops_codel
= {
3048 "codel", /* linux_name */
3049 "linux-codel", /* ovs_name */
3050 CODEL_N_QUEUES
, /* n_queues */
3063 /* FQ-CoDel traffic control class. */
3065 #define FQCODEL_N_QUEUES 0x0000
3067 /* In sufficiently new kernel headers these are defined as enums in
3068 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3069 * kernels. (This overrides any enum definition in the header file but that's
3071 #define TCA_FQ_CODEL_TARGET 1
3072 #define TCA_FQ_CODEL_LIMIT 2
3073 #define TCA_FQ_CODEL_INTERVAL 3
3074 #define TCA_FQ_CODEL_ECN 4
3075 #define TCA_FQ_CODEL_FLOWS 5
3076 #define TCA_FQ_CODEL_QUANTUM 6
3087 static struct fqcodel
*
3088 fqcodel_get__(const struct netdev
*netdev_
)
3090 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3091 return CONTAINER_OF(netdev
->tc
, struct fqcodel
, tc
);
3095 fqcodel_install__(struct netdev
*netdev_
, uint32_t target
, uint32_t limit
,
3096 uint32_t interval
, uint32_t flows
, uint32_t quantum
)
3098 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3099 struct fqcodel
*fqcodel
;
3101 fqcodel
= xmalloc(sizeof *fqcodel
);
3102 tc_init(&fqcodel
->tc
, &tc_ops_fqcodel
);
3103 fqcodel
->target
= target
;
3104 fqcodel
->limit
= limit
;
3105 fqcodel
->interval
= interval
;
3106 fqcodel
->flows
= flows
;
3107 fqcodel
->quantum
= quantum
;
3109 netdev
->tc
= &fqcodel
->tc
;
3113 fqcodel_setup_qdisc__(struct netdev
*netdev
, uint32_t target
, uint32_t limit
,
3114 uint32_t interval
, uint32_t flows
, uint32_t quantum
)
3117 struct ofpbuf request
;
3118 struct tcmsg
*tcmsg
;
3119 uint32_t otarget
, olimit
, ointerval
, oflows
, oquantum
;
3122 tc_del_qdisc(netdev
);
3124 tcmsg
= tc_make_request(netdev
, RTM_NEWQDISC
,
3125 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3129 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3130 tcmsg
->tcm_parent
= TC_H_ROOT
;
3132 otarget
= target
? target
: 5000;
3133 olimit
= limit
? limit
: 10240;
3134 ointerval
= interval
? interval
: 100000;
3135 oflows
= flows
? flows
: 1024;
3136 oquantum
= quantum
? quantum
: 1514; /* fq_codel default quantum is 1514
3139 nl_msg_put_string(&request
, TCA_KIND
, "fq_codel");
3140 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
3141 nl_msg_put_u32(&request
, TCA_FQ_CODEL_TARGET
, otarget
);
3142 nl_msg_put_u32(&request
, TCA_FQ_CODEL_LIMIT
, olimit
);
3143 nl_msg_put_u32(&request
, TCA_FQ_CODEL_INTERVAL
, ointerval
);
3144 nl_msg_put_u32(&request
, TCA_FQ_CODEL_FLOWS
, oflows
);
3145 nl_msg_put_u32(&request
, TCA_FQ_CODEL_QUANTUM
, oquantum
);
3146 nl_msg_end_nested(&request
, opt_offset
);
3148 error
= tc_transact(&request
, NULL
);
3150 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
3151 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3152 netdev_get_name(netdev
),
3153 otarget
, olimit
, ointerval
, oflows
, oquantum
,
3154 error
, ovs_strerror(error
));
3160 fqcodel_parse_qdisc_details__(struct netdev
*netdev OVS_UNUSED
,
3161 const struct smap
*details
, struct fqcodel
*fqcodel
)
3163 fqcodel
->target
= smap_get_ullong(details
, "target", 0);
3164 fqcodel
->limit
= smap_get_ullong(details
, "limit", 0);
3165 fqcodel
->interval
= smap_get_ullong(details
, "interval", 0);
3166 fqcodel
->flows
= smap_get_ullong(details
, "flows", 0);
3167 fqcodel
->quantum
= smap_get_ullong(details
, "quantum", 0);
3169 if (!fqcodel
->target
) {
3170 fqcodel
->target
= 5000;
3172 if (!fqcodel
->limit
) {
3173 fqcodel
->limit
= 10240;
3175 if (!fqcodel
->interval
) {
3176 fqcodel
->interval
= 1000000;
3178 if (!fqcodel
->flows
) {
3179 fqcodel
->flows
= 1024;
3181 if (!fqcodel
->quantum
) {
3182 fqcodel
->quantum
= 1514;
3187 fqcodel_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3190 struct fqcodel fqcodel
;
3192 fqcodel_parse_qdisc_details__(netdev
, details
, &fqcodel
);
3193 error
= fqcodel_setup_qdisc__(netdev
, fqcodel
.target
, fqcodel
.limit
,
3194 fqcodel
.interval
, fqcodel
.flows
,
3197 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
,
3198 fqcodel
.interval
, fqcodel
.flows
, fqcodel
.quantum
);
3204 fqcodel_parse_tca_options__(struct nlattr
*nl_options
, struct fqcodel
*fqcodel
)
3206 static const struct nl_policy tca_fqcodel_policy
[] = {
3207 [TCA_FQ_CODEL_TARGET
] = { .type
= NL_A_U32
},
3208 [TCA_FQ_CODEL_LIMIT
] = { .type
= NL_A_U32
},
3209 [TCA_FQ_CODEL_INTERVAL
] = { .type
= NL_A_U32
},
3210 [TCA_FQ_CODEL_FLOWS
] = { .type
= NL_A_U32
},
3211 [TCA_FQ_CODEL_QUANTUM
] = { .type
= NL_A_U32
}
3214 struct nlattr
*attrs
[ARRAY_SIZE(tca_fqcodel_policy
)];
3216 if (!nl_parse_nested(nl_options
, tca_fqcodel_policy
,
3217 attrs
, ARRAY_SIZE(tca_fqcodel_policy
))) {
3218 VLOG_WARN_RL(&rl
, "failed to parse FQ_CoDel class options");
3222 fqcodel
->target
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_TARGET
]);
3223 fqcodel
->limit
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_LIMIT
]);
3224 fqcodel
->interval
=nl_attr_get_u32(attrs
[TCA_FQ_CODEL_INTERVAL
]);
3225 fqcodel
->flows
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_FLOWS
]);
3226 fqcodel
->quantum
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_QUANTUM
]);
3231 fqcodel_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
3233 struct nlattr
*nlattr
;
3236 struct fqcodel fqcodel
;
3238 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
3243 error
= fqcodel_parse_tca_options__(nlattr
, &fqcodel
);
3248 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
, fqcodel
.interval
,
3249 fqcodel
.flows
, fqcodel
.quantum
);
3254 fqcodel_tc_destroy(struct tc
*tc
)
3256 struct fqcodel
*fqcodel
= CONTAINER_OF(tc
, struct fqcodel
, tc
);
3262 fqcodel_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
3264 const struct fqcodel
*fqcodel
= fqcodel_get__(netdev
);
3265 smap_add_format(details
, "target", "%u", fqcodel
->target
);
3266 smap_add_format(details
, "limit", "%u", fqcodel
->limit
);
3267 smap_add_format(details
, "interval", "%u", fqcodel
->interval
);
3268 smap_add_format(details
, "flows", "%u", fqcodel
->flows
);
3269 smap_add_format(details
, "quantum", "%u", fqcodel
->quantum
);
3274 fqcodel_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
3276 struct fqcodel fqcodel
;
3278 fqcodel_parse_qdisc_details__(netdev
, details
, &fqcodel
);
3279 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
, fqcodel
.interval
,
3280 fqcodel
.flows
, fqcodel
.quantum
);
3281 fqcodel_get__(netdev
)->target
= fqcodel
.target
;
3282 fqcodel_get__(netdev
)->limit
= fqcodel
.limit
;
3283 fqcodel_get__(netdev
)->interval
= fqcodel
.interval
;
3284 fqcodel_get__(netdev
)->flows
= fqcodel
.flows
;
3285 fqcodel_get__(netdev
)->quantum
= fqcodel
.quantum
;
3289 static const struct tc_ops tc_ops_fqcodel
= {
3290 "fq_codel", /* linux_name */
3291 "linux-fq_codel", /* ovs_name */
3292 FQCODEL_N_QUEUES
, /* n_queues */
3305 /* SFQ traffic control class. */
3307 #define SFQ_N_QUEUES 0x0000
3316 sfq_get__(const struct netdev
*netdev_
)
3318 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3319 return CONTAINER_OF(netdev
->tc
, struct sfq
, tc
);
3323 sfq_install__(struct netdev
*netdev_
, uint32_t quantum
, uint32_t perturb
)
3325 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3328 sfq
= xmalloc(sizeof *sfq
);
3329 tc_init(&sfq
->tc
, &tc_ops_sfq
);
3330 sfq
->perturb
= perturb
;
3331 sfq
->quantum
= quantum
;
3333 netdev
->tc
= &sfq
->tc
;
3337 sfq_setup_qdisc__(struct netdev
*netdev
, uint32_t quantum
, uint32_t perturb
)
3339 struct tc_sfq_qopt opt
;
3340 struct ofpbuf request
;
3341 struct tcmsg
*tcmsg
;
3343 int mtu_error
, error
;
3344 mtu_error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
3346 tc_del_qdisc(netdev
);
3348 tcmsg
= tc_make_request(netdev
, RTM_NEWQDISC
,
3349 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3353 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3354 tcmsg
->tcm_parent
= TC_H_ROOT
;
3356 memset(&opt
, 0, sizeof opt
);
3359 opt
.quantum
= mtu
; /* if we cannot find mtu, use default */
3362 opt
.quantum
= quantum
;
3366 opt
.perturb_period
= 10;
3368 opt
.perturb_period
= perturb
;
3371 nl_msg_put_string(&request
, TCA_KIND
, "sfq");
3372 nl_msg_put_unspec(&request
, TCA_OPTIONS
, &opt
, sizeof opt
);
3374 error
= tc_transact(&request
, NULL
);
3376 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
3377 "quantum %u, perturb %u error %d(%s)",
3378 netdev_get_name(netdev
),
3379 opt
.quantum
, opt
.perturb_period
,
3380 error
, ovs_strerror(error
));
3386 sfq_parse_qdisc_details__(struct netdev
*netdev
,
3387 const struct smap
*details
, struct sfq
*sfq
)
3389 sfq
->perturb
= smap_get_ullong(details
, "perturb", 0);
3390 sfq
->quantum
= smap_get_ullong(details
, "quantum", 0);
3392 if (!sfq
->perturb
) {
3396 if (!sfq
->quantum
) {
3398 if (!netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
)) {
3401 VLOG_WARN_RL(&rl
, "when using SFQ, you must specify quantum on a "
3402 "device without mtu");
3408 sfq_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3413 sfq_parse_qdisc_details__(netdev
, details
, &sfq
);
3414 error
= sfq_setup_qdisc__(netdev
, sfq
.quantum
, sfq
.perturb
);
3416 sfq_install__(netdev
, sfq
.quantum
, sfq
.perturb
);
3422 sfq_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
3424 const struct tc_sfq_qopt
*sfq
;
3425 struct nlattr
*nlattr
;
3429 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
3431 sfq
= nl_attr_get(nlattr
);
3432 sfq_install__(netdev
, sfq
->perturb_period
, sfq
->quantum
);
3440 sfq_tc_destroy(struct tc
*tc
)
3442 struct sfq
*sfq
= CONTAINER_OF(tc
, struct sfq
, tc
);
3448 sfq_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
3450 const struct sfq
*sfq
= sfq_get__(netdev
);
3451 smap_add_format(details
, "quantum", "%u", sfq
->quantum
);
3452 smap_add_format(details
, "perturb", "%u", sfq
->perturb
);
3457 sfq_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
3461 sfq_parse_qdisc_details__(netdev
, details
, &sfq
);
3462 sfq_install__(netdev
, sfq
.quantum
, sfq
.perturb
);
3463 sfq_get__(netdev
)->quantum
= sfq
.quantum
;
3464 sfq_get__(netdev
)->perturb
= sfq
.perturb
;
3468 static const struct tc_ops tc_ops_sfq
= {
3469 "sfq", /* linux_name */
3470 "linux-sfq", /* ovs_name */
3471 SFQ_N_QUEUES
, /* n_queues */
3484 /* HTB traffic control class. */
3486 #define HTB_N_QUEUES 0xf000
3487 #define HTB_RATE2QUANTUM 10
3491 unsigned int max_rate
; /* In bytes/s. */
3495 struct tc_queue tc_queue
;
3496 unsigned int min_rate
; /* In bytes/s. */
3497 unsigned int max_rate
; /* In bytes/s. */
3498 unsigned int burst
; /* In bytes. */
3499 unsigned int priority
; /* Lower values are higher priorities. */
3503 htb_get__(const struct netdev
*netdev_
)
3505 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3506 return CONTAINER_OF(netdev
->tc
, struct htb
, tc
);
3510 htb_install__(struct netdev
*netdev_
, uint64_t max_rate
)
3512 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3515 htb
= xmalloc(sizeof *htb
);
3516 tc_init(&htb
->tc
, &tc_ops_htb
);
3517 htb
->max_rate
= max_rate
;
3519 netdev
->tc
= &htb
->tc
;
3522 /* Create an HTB qdisc.
3524 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
3526 htb_setup_qdisc__(struct netdev
*netdev
)
3529 struct tc_htb_glob opt
;
3530 struct ofpbuf request
;
3531 struct tcmsg
*tcmsg
;
3533 tc_del_qdisc(netdev
);
3535 tcmsg
= tc_make_request(netdev
, RTM_NEWQDISC
,
3536 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3540 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3541 tcmsg
->tcm_parent
= TC_H_ROOT
;
3543 nl_msg_put_string(&request
, TCA_KIND
, "htb");
3545 memset(&opt
, 0, sizeof opt
);
3546 opt
.rate2quantum
= HTB_RATE2QUANTUM
;
3550 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
3551 nl_msg_put_unspec(&request
, TCA_HTB_INIT
, &opt
, sizeof opt
);
3552 nl_msg_end_nested(&request
, opt_offset
);
3554 return tc_transact(&request
, NULL
);
3557 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
3558 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
3560 htb_setup_class__(struct netdev
*netdev
, unsigned int handle
,
3561 unsigned int parent
, struct htb_class
*class)
3564 struct tc_htb_opt opt
;
3565 struct ofpbuf request
;
3566 struct tcmsg
*tcmsg
;
3570 error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
3572 VLOG_WARN_RL(&rl
, "cannot set up HTB on device %s that lacks MTU",
3573 netdev_get_name(netdev
));
3577 memset(&opt
, 0, sizeof opt
);
3578 tc_fill_rate(&opt
.rate
, class->min_rate
, mtu
);
3579 tc_fill_rate(&opt
.ceil
, class->max_rate
, mtu
);
3580 /* Makes sure the quantum is at least MTU. Setting quantum will
3581 * make htb ignore the r2q for this class. */
3582 if ((class->min_rate
/ HTB_RATE2QUANTUM
) < mtu
) {
3585 opt
.buffer
= tc_calc_buffer(opt
.rate
.rate
, mtu
, class->burst
);
3586 opt
.cbuffer
= tc_calc_buffer(opt
.ceil
.rate
, mtu
, class->burst
);
3587 opt
.prio
= class->priority
;
3589 tcmsg
= tc_make_request(netdev
, RTM_NEWTCLASS
, NLM_F_CREATE
, &request
);
3593 tcmsg
->tcm_handle
= handle
;
3594 tcmsg
->tcm_parent
= parent
;
3596 nl_msg_put_string(&request
, TCA_KIND
, "htb");
3597 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
3598 nl_msg_put_unspec(&request
, TCA_HTB_PARMS
, &opt
, sizeof opt
);
3599 tc_put_rtab(&request
, TCA_HTB_RTAB
, &opt
.rate
);
3600 tc_put_rtab(&request
, TCA_HTB_CTAB
, &opt
.ceil
);
3601 nl_msg_end_nested(&request
, opt_offset
);
3603 error
= tc_transact(&request
, NULL
);
3605 VLOG_WARN_RL(&rl
, "failed to replace %s class %u:%u, parent %u:%u, "
3606 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
3607 netdev_get_name(netdev
),
3608 tc_get_major(handle
), tc_get_minor(handle
),
3609 tc_get_major(parent
), tc_get_minor(parent
),
3610 class->min_rate
, class->max_rate
,
3611 class->burst
, class->priority
, ovs_strerror(error
));
3616 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
3617 * description of them into 'details'. The description complies with the
3618 * specification given in the vswitch database documentation for linux-htb
3621 htb_parse_tca_options__(struct nlattr
*nl_options
, struct htb_class
*class)
3623 static const struct nl_policy tca_htb_policy
[] = {
3624 [TCA_HTB_PARMS
] = { .type
= NL_A_UNSPEC
, .optional
= false,
3625 .min_len
= sizeof(struct tc_htb_opt
) },
3628 struct nlattr
*attrs
[ARRAY_SIZE(tca_htb_policy
)];
3629 const struct tc_htb_opt
*htb
;
3631 if (!nl_parse_nested(nl_options
, tca_htb_policy
,
3632 attrs
, ARRAY_SIZE(tca_htb_policy
))) {
3633 VLOG_WARN_RL(&rl
, "failed to parse HTB class options");
3637 htb
= nl_attr_get(attrs
[TCA_HTB_PARMS
]);
3638 class->min_rate
= htb
->rate
.rate
;
3639 class->max_rate
= htb
->ceil
.rate
;
3640 class->burst
= tc_ticks_to_bytes(htb
->rate
.rate
, htb
->buffer
);
3641 class->priority
= htb
->prio
;
3646 htb_parse_tcmsg__(struct ofpbuf
*tcmsg
, unsigned int *queue_id
,
3647 struct htb_class
*options
,
3648 struct netdev_queue_stats
*stats
)
3650 struct nlattr
*nl_options
;
3651 unsigned int handle
;
3654 error
= tc_parse_class(tcmsg
, &handle
, &nl_options
, stats
);
3655 if (!error
&& queue_id
) {
3656 unsigned int major
= tc_get_major(handle
);
3657 unsigned int minor
= tc_get_minor(handle
);
3658 if (major
== 1 && minor
> 0 && minor
<= HTB_N_QUEUES
) {
3659 *queue_id
= minor
- 1;
3664 if (!error
&& options
) {
3665 error
= htb_parse_tca_options__(nl_options
, options
);
3671 htb_parse_qdisc_details__(struct netdev
*netdev_
,
3672 const struct smap
*details
, struct htb_class
*hc
)
3674 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3676 hc
->max_rate
= smap_get_ullong(details
, "max-rate", 0) / 8;
3677 if (!hc
->max_rate
) {
3678 enum netdev_features current
;
3680 netdev_linux_read_features(netdev
);
3681 current
= !netdev
->get_features_error
? netdev
->current
: 0;
3682 hc
->max_rate
= netdev_features_to_bps(current
, 100 * 1000 * 1000) / 8;
3684 hc
->min_rate
= hc
->max_rate
;
3690 htb_parse_class_details__(struct netdev
*netdev
,
3691 const struct smap
*details
, struct htb_class
*hc
)
3693 const struct htb
*htb
= htb_get__(netdev
);
3696 error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
3698 VLOG_WARN_RL(&rl
, "cannot parse HTB class on device %s that lacks MTU",
3699 netdev_get_name(netdev
));
3703 /* HTB requires at least an mtu sized min-rate to send any traffic even
3704 * on uncongested links. */
3705 hc
->min_rate
= smap_get_ullong(details
, "min-rate", 0) / 8;
3706 hc
->min_rate
= MAX(hc
->min_rate
, mtu
);
3707 hc
->min_rate
= MIN(hc
->min_rate
, htb
->max_rate
);
3710 hc
->max_rate
= smap_get_ullong(details
, "max-rate", 0) / 8;
3711 if (!hc
->max_rate
) {
3712 hc
->max_rate
= htb
->max_rate
;
3714 hc
->max_rate
= MAX(hc
->max_rate
, hc
->min_rate
);
3715 hc
->max_rate
= MIN(hc
->max_rate
, htb
->max_rate
);
3719 * According to hints in the documentation that I've read, it is important
3720 * that 'burst' be at least as big as the largest frame that might be
3721 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3722 * but having it a bit too small is a problem. Since netdev_get_mtu()
3723 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3724 * the MTU. We actually add 64, instead of 14, as a guard against
3725 * additional headers get tacked on somewhere that we're not aware of. */
3726 hc
->burst
= smap_get_ullong(details
, "burst", 0) / 8;
3727 hc
->burst
= MAX(hc
->burst
, mtu
+ 64);
3730 hc
->priority
= smap_get_ullong(details
, "priority", 0);
3736 htb_query_class__(const struct netdev
*netdev
, unsigned int handle
,
3737 unsigned int parent
, struct htb_class
*options
,
3738 struct netdev_queue_stats
*stats
)
3740 struct ofpbuf
*reply
;
3743 error
= tc_query_class(netdev
, handle
, parent
, &reply
);
3745 error
= htb_parse_tcmsg__(reply
, NULL
, options
, stats
);
3746 ofpbuf_delete(reply
);
3752 htb_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3756 error
= htb_setup_qdisc__(netdev
);
3758 struct htb_class hc
;
3760 htb_parse_qdisc_details__(netdev
, details
, &hc
);
3761 error
= htb_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
3762 tc_make_handle(1, 0), &hc
);
3764 htb_install__(netdev
, hc
.max_rate
);
3770 static struct htb_class
*
3771 htb_class_cast__(const struct tc_queue
*queue
)
3773 return CONTAINER_OF(queue
, struct htb_class
, tc_queue
);
3777 htb_update_queue__(struct netdev
*netdev
, unsigned int queue_id
,
3778 const struct htb_class
*hc
)
3780 struct htb
*htb
= htb_get__(netdev
);
3781 size_t hash
= hash_int(queue_id
, 0);
3782 struct tc_queue
*queue
;
3783 struct htb_class
*hcp
;
3785 queue
= tc_find_queue__(netdev
, queue_id
, hash
);
3787 hcp
= htb_class_cast__(queue
);
3789 hcp
= xmalloc(sizeof *hcp
);
3790 queue
= &hcp
->tc_queue
;
3791 queue
->queue_id
= queue_id
;
3792 queue
->created
= time_msec();
3793 hmap_insert(&htb
->tc
.queues
, &queue
->hmap_node
, hash
);
3796 hcp
->min_rate
= hc
->min_rate
;
3797 hcp
->max_rate
= hc
->max_rate
;
3798 hcp
->burst
= hc
->burst
;
3799 hcp
->priority
= hc
->priority
;
3803 htb_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
3806 struct queue_dump_state state
;
3807 struct htb_class hc
;
3809 /* Get qdisc options. */
3811 htb_query_class__(netdev
, tc_make_handle(1, 0xfffe), 0, &hc
, NULL
);
3812 htb_install__(netdev
, hc
.max_rate
);
3815 if (!start_queue_dump(netdev
, &state
)) {
3818 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
3819 unsigned int queue_id
;
3821 if (!htb_parse_tcmsg__(&msg
, &queue_id
, &hc
, NULL
)) {
3822 htb_update_queue__(netdev
, queue_id
, &hc
);
3825 finish_queue_dump(&state
);
3831 htb_tc_destroy(struct tc
*tc
)
3833 struct htb
*htb
= CONTAINER_OF(tc
, struct htb
, tc
);
3834 struct htb_class
*hc
;
3836 HMAP_FOR_EACH_POP (hc
, tc_queue
.hmap_node
, &htb
->tc
.queues
) {
3844 htb_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
3846 const struct htb
*htb
= htb_get__(netdev
);
3847 smap_add_format(details
, "max-rate", "%llu", 8ULL * htb
->max_rate
);
3852 htb_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
3854 struct htb_class hc
;
3857 htb_parse_qdisc_details__(netdev
, details
, &hc
);
3858 error
= htb_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
3859 tc_make_handle(1, 0), &hc
);
3861 htb_get__(netdev
)->max_rate
= hc
.max_rate
;
3867 htb_class_get(const struct netdev
*netdev OVS_UNUSED
,
3868 const struct tc_queue
*queue
, struct smap
*details
)
3870 const struct htb_class
*hc
= htb_class_cast__(queue
);
3872 smap_add_format(details
, "min-rate", "%llu", 8ULL * hc
->min_rate
);
3873 if (hc
->min_rate
!= hc
->max_rate
) {
3874 smap_add_format(details
, "max-rate", "%llu", 8ULL * hc
->max_rate
);
3876 smap_add_format(details
, "burst", "%llu", 8ULL * hc
->burst
);
3878 smap_add_format(details
, "priority", "%u", hc
->priority
);
3884 htb_class_set(struct netdev
*netdev
, unsigned int queue_id
,
3885 const struct smap
*details
)
3887 struct htb_class hc
;
3890 error
= htb_parse_class_details__(netdev
, details
, &hc
);
3895 error
= htb_setup_class__(netdev
, tc_make_handle(1, queue_id
+ 1),
3896 tc_make_handle(1, 0xfffe), &hc
);
3901 htb_update_queue__(netdev
, queue_id
, &hc
);
3906 htb_class_delete(struct netdev
*netdev
, struct tc_queue
*queue
)
3908 struct htb_class
*hc
= htb_class_cast__(queue
);
3909 struct htb
*htb
= htb_get__(netdev
);
3912 error
= tc_delete_class(netdev
, tc_make_handle(1, queue
->queue_id
+ 1));
3914 hmap_remove(&htb
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
3921 htb_class_get_stats(const struct netdev
*netdev
, const struct tc_queue
*queue
,
3922 struct netdev_queue_stats
*stats
)
3924 return htb_query_class__(netdev
, tc_make_handle(1, queue
->queue_id
+ 1),
3925 tc_make_handle(1, 0xfffe), NULL
, stats
);
3929 htb_class_dump_stats(const struct netdev
*netdev OVS_UNUSED
,
3930 const struct ofpbuf
*nlmsg
,
3931 netdev_dump_queue_stats_cb
*cb
, void *aux
)
3933 struct netdev_queue_stats stats
;
3934 unsigned int handle
, major
, minor
;
3937 error
= tc_parse_class(nlmsg
, &handle
, NULL
, &stats
);
3942 major
= tc_get_major(handle
);
3943 minor
= tc_get_minor(handle
);
3944 if (major
== 1 && minor
> 0 && minor
<= HTB_N_QUEUES
) {
3945 (*cb
)(minor
- 1, &stats
, aux
);
3950 static const struct tc_ops tc_ops_htb
= {
3951 "htb", /* linux_name */
3952 "linux-htb", /* ovs_name */
3953 HTB_N_QUEUES
, /* n_queues */
3962 htb_class_get_stats
,
3963 htb_class_dump_stats
3966 /* "linux-hfsc" traffic control class. */
3968 #define HFSC_N_QUEUES 0xf000
3976 struct tc_queue tc_queue
;
3981 static struct hfsc
*
3982 hfsc_get__(const struct netdev
*netdev_
)
3984 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3985 return CONTAINER_OF(netdev
->tc
, struct hfsc
, tc
);
3988 static struct hfsc_class
*
3989 hfsc_class_cast__(const struct tc_queue
*queue
)
3991 return CONTAINER_OF(queue
, struct hfsc_class
, tc_queue
);
3995 hfsc_install__(struct netdev
*netdev_
, uint32_t max_rate
)
3997 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4000 hfsc
= xmalloc(sizeof *hfsc
);
4001 tc_init(&hfsc
->tc
, &tc_ops_hfsc
);
4002 hfsc
->max_rate
= max_rate
;
4003 netdev
->tc
= &hfsc
->tc
;
4007 hfsc_update_queue__(struct netdev
*netdev
, unsigned int queue_id
,
4008 const struct hfsc_class
*hc
)
4012 struct hfsc_class
*hcp
;
4013 struct tc_queue
*queue
;
4015 hfsc
= hfsc_get__(netdev
);
4016 hash
= hash_int(queue_id
, 0);
4018 queue
= tc_find_queue__(netdev
, queue_id
, hash
);
4020 hcp
= hfsc_class_cast__(queue
);
4022 hcp
= xmalloc(sizeof *hcp
);
4023 queue
= &hcp
->tc_queue
;
4024 queue
->queue_id
= queue_id
;
4025 queue
->created
= time_msec();
4026 hmap_insert(&hfsc
->tc
.queues
, &queue
->hmap_node
, hash
);
4029 hcp
->min_rate
= hc
->min_rate
;
4030 hcp
->max_rate
= hc
->max_rate
;
4034 hfsc_parse_tca_options__(struct nlattr
*nl_options
, struct hfsc_class
*class)
4036 const struct tc_service_curve
*rsc
, *fsc
, *usc
;
4037 static const struct nl_policy tca_hfsc_policy
[] = {
4039 .type
= NL_A_UNSPEC
,
4041 .min_len
= sizeof(struct tc_service_curve
),
4044 .type
= NL_A_UNSPEC
,
4046 .min_len
= sizeof(struct tc_service_curve
),
4049 .type
= NL_A_UNSPEC
,
4051 .min_len
= sizeof(struct tc_service_curve
),
4054 struct nlattr
*attrs
[ARRAY_SIZE(tca_hfsc_policy
)];
4056 if (!nl_parse_nested(nl_options
, tca_hfsc_policy
,
4057 attrs
, ARRAY_SIZE(tca_hfsc_policy
))) {
4058 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options");
4062 rsc
= nl_attr_get(attrs
[TCA_HFSC_RSC
]);
4063 fsc
= nl_attr_get(attrs
[TCA_HFSC_FSC
]);
4064 usc
= nl_attr_get(attrs
[TCA_HFSC_USC
]);
4066 if (rsc
->m1
!= 0 || rsc
->d
!= 0 ||
4067 fsc
->m1
!= 0 || fsc
->d
!= 0 ||
4068 usc
->m1
!= 0 || usc
->d
!= 0) {
4069 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
4070 "Non-linear service curves are not supported.");
4074 if (rsc
->m2
!= fsc
->m2
) {
4075 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
4076 "Real-time service curves are not supported ");
4080 if (rsc
->m2
> usc
->m2
) {
4081 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
4082 "Min-rate service curve is greater than "
4083 "the max-rate service curve.");
4087 class->min_rate
= fsc
->m2
;
4088 class->max_rate
= usc
->m2
;
4093 hfsc_parse_tcmsg__(struct ofpbuf
*tcmsg
, unsigned int *queue_id
,
4094 struct hfsc_class
*options
,
4095 struct netdev_queue_stats
*stats
)
4098 unsigned int handle
;
4099 struct nlattr
*nl_options
;
4101 error
= tc_parse_class(tcmsg
, &handle
, &nl_options
, stats
);
4107 unsigned int major
, minor
;
4109 major
= tc_get_major(handle
);
4110 minor
= tc_get_minor(handle
);
4111 if (major
== 1 && minor
> 0 && minor
<= HFSC_N_QUEUES
) {
4112 *queue_id
= minor
- 1;
4119 error
= hfsc_parse_tca_options__(nl_options
, options
);
4126 hfsc_query_class__(const struct netdev
*netdev
, unsigned int handle
,
4127 unsigned int parent
, struct hfsc_class
*options
,
4128 struct netdev_queue_stats
*stats
)
4131 struct ofpbuf
*reply
;
4133 error
= tc_query_class(netdev
, handle
, parent
, &reply
);
4138 error
= hfsc_parse_tcmsg__(reply
, NULL
, options
, stats
);
4139 ofpbuf_delete(reply
);
4144 hfsc_parse_qdisc_details__(struct netdev
*netdev_
, const struct smap
*details
,
4145 struct hfsc_class
*class)
4147 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4149 uint32_t max_rate
= smap_get_ullong(details
, "max-rate", 0) / 8;
4151 enum netdev_features current
;
4153 netdev_linux_read_features(netdev
);
4154 current
= !netdev
->get_features_error
? netdev
->current
: 0;
4155 max_rate
= netdev_features_to_bps(current
, 100 * 1000 * 1000) / 8;
4158 class->min_rate
= max_rate
;
4159 class->max_rate
= max_rate
;
4163 hfsc_parse_class_details__(struct netdev
*netdev
,
4164 const struct smap
*details
,
4165 struct hfsc_class
* class)
4167 const struct hfsc
*hfsc
;
4168 uint32_t min_rate
, max_rate
;
4170 hfsc
= hfsc_get__(netdev
);
4172 min_rate
= smap_get_ullong(details
, "min-rate", 0) / 8;
4173 min_rate
= MAX(min_rate
, 1);
4174 min_rate
= MIN(min_rate
, hfsc
->max_rate
);
4176 max_rate
= smap_get_ullong(details
, "max-rate", hfsc
->max_rate
* 8) / 8;
4177 max_rate
= MAX(max_rate
, min_rate
);
4178 max_rate
= MIN(max_rate
, hfsc
->max_rate
);
4180 class->min_rate
= min_rate
;
4181 class->max_rate
= max_rate
;
4186 /* Create an HFSC qdisc.
4188 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
4190 hfsc_setup_qdisc__(struct netdev
* netdev
)
4192 struct tcmsg
*tcmsg
;
4193 struct ofpbuf request
;
4194 struct tc_hfsc_qopt opt
;
4196 tc_del_qdisc(netdev
);
4198 tcmsg
= tc_make_request(netdev
, RTM_NEWQDISC
,
4199 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
4205 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
4206 tcmsg
->tcm_parent
= TC_H_ROOT
;
4208 memset(&opt
, 0, sizeof opt
);
4211 nl_msg_put_string(&request
, TCA_KIND
, "hfsc");
4212 nl_msg_put_unspec(&request
, TCA_OPTIONS
, &opt
, sizeof opt
);
4214 return tc_transact(&request
, NULL
);
4217 /* Create an HFSC class.
4219 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
4220 * sc rate <min_rate> ul rate <max_rate>" */
4222 hfsc_setup_class__(struct netdev
*netdev
, unsigned int handle
,
4223 unsigned int parent
, struct hfsc_class
*class)
4227 struct tcmsg
*tcmsg
;
4228 struct ofpbuf request
;
4229 struct tc_service_curve min
, max
;
4231 tcmsg
= tc_make_request(netdev
, RTM_NEWTCLASS
, NLM_F_CREATE
, &request
);
4237 tcmsg
->tcm_handle
= handle
;
4238 tcmsg
->tcm_parent
= parent
;
4242 min
.m2
= class->min_rate
;
4246 max
.m2
= class->max_rate
;
4248 nl_msg_put_string(&request
, TCA_KIND
, "hfsc");
4249 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
4250 nl_msg_put_unspec(&request
, TCA_HFSC_RSC
, &min
, sizeof min
);
4251 nl_msg_put_unspec(&request
, TCA_HFSC_FSC
, &min
, sizeof min
);
4252 nl_msg_put_unspec(&request
, TCA_HFSC_USC
, &max
, sizeof max
);
4253 nl_msg_end_nested(&request
, opt_offset
);
4255 error
= tc_transact(&request
, NULL
);
4257 VLOG_WARN_RL(&rl
, "failed to replace %s class %u:%u, parent %u:%u, "
4258 "min-rate %ubps, max-rate %ubps (%s)",
4259 netdev_get_name(netdev
),
4260 tc_get_major(handle
), tc_get_minor(handle
),
4261 tc_get_major(parent
), tc_get_minor(parent
),
4262 class->min_rate
, class->max_rate
, ovs_strerror(error
));
4269 hfsc_tc_install(struct netdev
*netdev
, const struct smap
*details
)
4272 struct hfsc_class
class;
4274 error
= hfsc_setup_qdisc__(netdev
);
4280 hfsc_parse_qdisc_details__(netdev
, details
, &class);
4281 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
4282 tc_make_handle(1, 0), &class);
4288 hfsc_install__(netdev
, class.max_rate
);
4293 hfsc_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4296 struct queue_dump_state state
;
4297 struct hfsc_class hc
;
4300 hfsc_query_class__(netdev
, tc_make_handle(1, 0xfffe), 0, &hc
, NULL
);
4301 hfsc_install__(netdev
, hc
.max_rate
);
4303 if (!start_queue_dump(netdev
, &state
)) {
4307 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
4308 unsigned int queue_id
;
4310 if (!hfsc_parse_tcmsg__(&msg
, &queue_id
, &hc
, NULL
)) {
4311 hfsc_update_queue__(netdev
, queue_id
, &hc
);
4315 finish_queue_dump(&state
);
4320 hfsc_tc_destroy(struct tc
*tc
)
4323 struct hfsc_class
*hc
, *next
;
4325 hfsc
= CONTAINER_OF(tc
, struct hfsc
, tc
);
4327 HMAP_FOR_EACH_SAFE (hc
, next
, tc_queue
.hmap_node
, &hfsc
->tc
.queues
) {
4328 hmap_remove(&hfsc
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
4337 hfsc_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
4339 const struct hfsc
*hfsc
;
4340 hfsc
= hfsc_get__(netdev
);
4341 smap_add_format(details
, "max-rate", "%llu", 8ULL * hfsc
->max_rate
);
4346 hfsc_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
4349 struct hfsc_class
class;
4351 hfsc_parse_qdisc_details__(netdev
, details
, &class);
4352 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
4353 tc_make_handle(1, 0), &class);
4356 hfsc_get__(netdev
)->max_rate
= class.max_rate
;
4363 hfsc_class_get(const struct netdev
*netdev OVS_UNUSED
,
4364 const struct tc_queue
*queue
, struct smap
*details
)
4366 const struct hfsc_class
*hc
;
4368 hc
= hfsc_class_cast__(queue
);
4369 smap_add_format(details
, "min-rate", "%llu", 8ULL * hc
->min_rate
);
4370 if (hc
->min_rate
!= hc
->max_rate
) {
4371 smap_add_format(details
, "max-rate", "%llu", 8ULL * hc
->max_rate
);
4377 hfsc_class_set(struct netdev
*netdev
, unsigned int queue_id
,
4378 const struct smap
*details
)
4381 struct hfsc_class
class;
4383 error
= hfsc_parse_class_details__(netdev
, details
, &class);
4388 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, queue_id
+ 1),
4389 tc_make_handle(1, 0xfffe), &class);
4394 hfsc_update_queue__(netdev
, queue_id
, &class);
4399 hfsc_class_delete(struct netdev
*netdev
, struct tc_queue
*queue
)
4403 struct hfsc_class
*hc
;
4405 hc
= hfsc_class_cast__(queue
);
4406 hfsc
= hfsc_get__(netdev
);
4408 error
= tc_delete_class(netdev
, tc_make_handle(1, queue
->queue_id
+ 1));
4410 hmap_remove(&hfsc
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
4417 hfsc_class_get_stats(const struct netdev
*netdev
, const struct tc_queue
*queue
,
4418 struct netdev_queue_stats
*stats
)
4420 return hfsc_query_class__(netdev
, tc_make_handle(1, queue
->queue_id
+ 1),
4421 tc_make_handle(1, 0xfffe), NULL
, stats
);
4425 hfsc_class_dump_stats(const struct netdev
*netdev OVS_UNUSED
,
4426 const struct ofpbuf
*nlmsg
,
4427 netdev_dump_queue_stats_cb
*cb
, void *aux
)
4429 struct netdev_queue_stats stats
;
4430 unsigned int handle
, major
, minor
;
4433 error
= tc_parse_class(nlmsg
, &handle
, NULL
, &stats
);
4438 major
= tc_get_major(handle
);
4439 minor
= tc_get_minor(handle
);
4440 if (major
== 1 && minor
> 0 && minor
<= HFSC_N_QUEUES
) {
4441 (*cb
)(minor
- 1, &stats
, aux
);
4446 static const struct tc_ops tc_ops_hfsc
= {
4447 "hfsc", /* linux_name */
4448 "linux-hfsc", /* ovs_name */
4449 HFSC_N_QUEUES
, /* n_queues */
4450 hfsc_tc_install
, /* tc_install */
4451 hfsc_tc_load
, /* tc_load */
4452 hfsc_tc_destroy
, /* tc_destroy */
4453 hfsc_qdisc_get
, /* qdisc_get */
4454 hfsc_qdisc_set
, /* qdisc_set */
4455 hfsc_class_get
, /* class_get */
4456 hfsc_class_set
, /* class_set */
4457 hfsc_class_delete
, /* class_delete */
4458 hfsc_class_get_stats
, /* class_get_stats */
4459 hfsc_class_dump_stats
/* class_dump_stats */
4462 /* "linux-noop" traffic control class. */
4465 noop_install__(struct netdev
*netdev_
)
4467 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4468 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_default
);
4470 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
4474 noop_tc_install(struct netdev
*netdev
,
4475 const struct smap
*details OVS_UNUSED
)
4477 noop_install__(netdev
);
4482 noop_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4484 noop_install__(netdev
);
4488 static const struct tc_ops tc_ops_noop
= {
4489 NULL
, /* linux_name */
4490 "linux-noop", /* ovs_name */
4494 NULL
, /* tc_destroy */
4495 NULL
, /* qdisc_get */
4496 NULL
, /* qdisc_set */
4497 NULL
, /* class_get */
4498 NULL
, /* class_set */
4499 NULL
, /* class_delete */
4500 NULL
, /* class_get_stats */
4501 NULL
/* class_dump_stats */
4504 /* "linux-default" traffic control class.
4506 * This class represents the default, unnamed Linux qdisc. It corresponds to
4507 * the "" (empty string) QoS type in the OVS database. */
4510 default_install__(struct netdev
*netdev_
)
4512 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4513 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_default
);
4515 /* Nothing but a tc class implementation is allowed to write to a tc. This
4516 * class never does that, so we can legitimately use a const tc object. */
4517 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
4521 default_tc_install(struct netdev
*netdev
,
4522 const struct smap
*details OVS_UNUSED
)
4524 default_install__(netdev
);
4529 default_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4531 default_install__(netdev
);
4535 static const struct tc_ops tc_ops_default
= {
4536 NULL
, /* linux_name */
4541 NULL
, /* tc_destroy */
4542 NULL
, /* qdisc_get */
4543 NULL
, /* qdisc_set */
4544 NULL
, /* class_get */
4545 NULL
, /* class_set */
4546 NULL
, /* class_delete */
4547 NULL
, /* class_get_stats */
4548 NULL
/* class_dump_stats */
4551 /* "linux-other" traffic control class.
4556 other_tc_load(struct netdev
*netdev_
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4558 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4559 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_other
);
4561 /* Nothing but a tc class implementation is allowed to write to a tc. This
4562 * class never does that, so we can legitimately use a const tc object. */
4563 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
4567 static const struct tc_ops tc_ops_other
= {
4568 NULL
, /* linux_name */
4569 "linux-other", /* ovs_name */
4571 NULL
, /* tc_install */
4573 NULL
, /* tc_destroy */
4574 NULL
, /* qdisc_get */
4575 NULL
, /* qdisc_set */
4576 NULL
, /* class_get */
4577 NULL
, /* class_set */
4578 NULL
, /* class_delete */
4579 NULL
, /* class_get_stats */
4580 NULL
/* class_dump_stats */
4583 /* Traffic control. */
4585 /* Number of kernel "tc" ticks per second. */
4586 static double ticks_per_s
;
4588 /* Number of kernel "jiffies" per second. This is used for the purpose of
4589 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
4590 * one jiffy's worth of data.
4592 * There are two possibilities here:
4594 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
4595 * approximate range of 100 to 1024. That means that we really need to
4596 * make sure that the qdisc can buffer that much data.
4598 * - 'buffer_hz' is an absurdly large number. That means that the kernel
4599 * has finely granular timers and there's no need to fudge additional room
4600 * for buffers. (There's no extra effort needed to implement that: the
4601 * large 'buffer_hz' is used as a divisor, so practically any number will
4602 * come out as 0 in the division. Small integer results in the case of
4603 * really high dividends won't have any real effect anyhow.)
4605 static unsigned int buffer_hz
;
/* Composes and returns the tc handle 'major':'minor'. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int handle = TC_H_MAKE(major << 16, minor);

    return handle;
}
/* Extracts and returns the major number from tc handle 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major = TC_H_MAJ(handle) >> 16;

    return major;
}
/* Extracts and returns the minor number from tc handle 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor = TC_H_MIN(handle);

    return minor;
}
4628 static struct tcmsg
*
4629 tc_make_request(const struct netdev
*netdev
, int type
, unsigned int flags
,
4630 struct ofpbuf
*request
)
4632 struct tcmsg
*tcmsg
;
4636 error
= get_ifindex(netdev
, &ifindex
);
4641 ofpbuf_init(request
, 512);
4642 nl_msg_put_nlmsghdr(request
, sizeof *tcmsg
, type
, NLM_F_REQUEST
| flags
);
4643 tcmsg
= ofpbuf_put_zeros(request
, sizeof *tcmsg
);
4644 tcmsg
->tcm_family
= AF_UNSPEC
;
4645 tcmsg
->tcm_ifindex
= ifindex
;
4646 /* Caller should fill in tcmsg->tcm_handle. */
4647 /* Caller should fill in tcmsg->tcm_parent. */
4653 tc_transact(struct ofpbuf
*request
, struct ofpbuf
**replyp
)
4655 int error
= nl_transact(NETLINK_ROUTE
, request
, replyp
);
4656 ofpbuf_uninit(request
);
4660 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
4661 * policing configuration.
4663 * This function is equivalent to running the following when 'add' is true:
4664 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
4666 * This function is equivalent to running the following when 'add' is false:
4667 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
4669 * The configuration and stats may be seen with the following command:
4670 * /sbin/tc -s qdisc show dev <devname>
4672 * Returns 0 if successful, otherwise a positive errno value.
4675 tc_add_del_ingress_qdisc(struct netdev
*netdev
, bool add
)
4677 struct ofpbuf request
;
4678 struct tcmsg
*tcmsg
;
4680 int type
= add
? RTM_NEWQDISC
: RTM_DELQDISC
;
4681 int flags
= add
? NLM_F_EXCL
| NLM_F_CREATE
: 0;
4683 tcmsg
= tc_make_request(netdev
, type
, flags
, &request
);
4687 tcmsg
->tcm_handle
= tc_make_handle(0xffff, 0);
4688 tcmsg
->tcm_parent
= TC_H_INGRESS
;
4689 nl_msg_put_string(&request
, TCA_KIND
, "ingress");
4690 nl_msg_put_unspec(&request
, TCA_OPTIONS
, NULL
, 0);
4692 error
= tc_transact(&request
, NULL
);
4694 /* If we're deleting the qdisc, don't worry about some of the
4695 * error conditions. */
4696 if (!add
&& (error
== ENOENT
|| error
== EINVAL
)) {
4705 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4708 * This function is equivalent to running:
4709 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4710 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4713 * The configuration and stats may be seen with the following command:
4714 * /sbin/tc -s filter show dev <devname> parent ffff:
4716 * Returns 0 if successful, otherwise a positive errno value.
4719 tc_add_policer(struct netdev
*netdev
,
4720 uint32_t kbits_rate
, uint32_t kbits_burst
)
4722 struct tc_police tc_police
;
4723 struct ofpbuf request
;
4724 struct tcmsg
*tcmsg
;
4725 size_t basic_offset
;
4726 size_t police_offset
;
4730 memset(&tc_police
, 0, sizeof tc_police
);
4731 tc_police
.action
= TC_POLICE_SHOT
;
4732 tc_police
.mtu
= mtu
;
4733 tc_fill_rate(&tc_police
.rate
, ((uint64_t) kbits_rate
* 1000)/8, mtu
);
4735 /* The following appears wrong in one way: In networking a kilobit is
4736 * usually 1000 bits but this uses 1024 bits.
4738 * However if you "fix" those problems then "tc filter show ..." shows
4739 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
4740 * 1,000,000 bits, whereas this actually ends up doing the right thing from
4741 * tc's point of view. Whatever. */
4742 tc_police
.burst
= tc_bytes_to_ticks(
4743 tc_police
.rate
.rate
, MIN(UINT32_MAX
/ 1024, kbits_burst
) * 1024 / 8);
4745 tcmsg
= tc_make_request(netdev
, RTM_NEWTFILTER
,
4746 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
4750 tcmsg
->tcm_parent
= tc_make_handle(0xffff, 0);
4751 tcmsg
->tcm_info
= tc_make_handle(49,
4752 (OVS_FORCE
uint16_t) htons(ETH_P_ALL
));
4754 nl_msg_put_string(&request
, TCA_KIND
, "basic");
4755 basic_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
4756 police_offset
= nl_msg_start_nested(&request
, TCA_BASIC_POLICE
);
4757 nl_msg_put_unspec(&request
, TCA_POLICE_TBF
, &tc_police
, sizeof tc_police
);
4758 tc_put_rtab(&request
, TCA_POLICE_RATE
, &tc_police
.rate
);
4759 nl_msg_end_nested(&request
, police_offset
);
4760 nl_msg_end_nested(&request
, basic_offset
);
4762 error
= tc_transact(&request
, NULL
);
4773 /* The values in psched are not individually very meaningful, but they are
4774 * important. The tables below show some values seen in the wild.
4778 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4779 * (Before that, there are hints that it was 1000000000.)
4781 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4785 * -----------------------------------
4786 * [1] 000c8000 000f4240 000f4240 00000064
4787 * [2] 000003e8 00000400 000f4240 3b9aca00
4788 * [3] 000003e8 00000400 000f4240 3b9aca00
4789 * [4] 000003e8 00000400 000f4240 00000064
4790 * [5] 000003e8 00000040 000f4240 3b9aca00
4791 * [6] 000003e8 00000040 000f4240 000000f9
4793 * a b c d ticks_per_s buffer_hz
4794 * ------- --------- ---------- ------------- ----------- -------------
4795 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4796 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4797 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4798 * [4] 1,000 1,024 1,000,000 100 976,562 100
4799 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4800 * [6] 1,000 64 1,000,000 249 15,625,000 249
4802 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4803 * [2] 2.6.26-1-686-bigmem from Debian lenny
4804 * [3] 2.6.26-2-sparc64 from Debian lenny
4805 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4806 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4807 * [6] 2.6.34 from kernel.org on KVM
4809 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
4810 static const char fn
[] = "/proc/net/psched";
4811 unsigned int a
, b
, c
, d
;
4814 if (!ovsthread_once_start(&once
)) {
4821 stream
= fopen(fn
, "r");
4823 VLOG_WARN("%s: open failed: %s", fn
, ovs_strerror(errno
));
4827 if (fscanf(stream
, "%x %x %x %x", &a
, &b
, &c
, &d
) != 4) {
4828 VLOG_WARN("%s: read failed", fn
);
4832 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn
, a
, b
, c
, d
);
4836 VLOG_WARN("%s: invalid scheduler parameters", fn
);
4840 ticks_per_s
= (double) a
* c
/ b
;
4844 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4847 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn
, ticks_per_s
, buffer_hz
);
4850 ovsthread_once_done(&once
);
4853 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4854 * rate of 'rate' bytes per second. */
4856 tc_ticks_to_bytes(unsigned int rate
, unsigned int ticks
)
4859 return (rate
* ticks
) / ticks_per_s
;
4862 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4863 * rate of 'rate' bytes per second. */
4865 tc_bytes_to_ticks(unsigned int rate
, unsigned int size
)
4868 return rate
? ((unsigned long long int) ticks_per_s
* size
) / rate
: 0;
4871 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4872 * a transmission rate of 'rate' bytes per second. */
4874 tc_buffer_per_jiffy(unsigned int rate
)
4877 return rate
/ buffer_hz
;
4880 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4881 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4882 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4883 * stores NULL into it if it is absent.
4885 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4888 * Returns 0 if successful, otherwise a positive errno value. */
4890 tc_parse_qdisc(const struct ofpbuf
*msg
, const char **kind
,
4891 struct nlattr
**options
)
4893 static const struct nl_policy tca_policy
[] = {
4894 [TCA_KIND
] = { .type
= NL_A_STRING
, .optional
= false },
4895 [TCA_OPTIONS
] = { .type
= NL_A_NESTED
, .optional
= true },
4897 struct nlattr
*ta
[ARRAY_SIZE(tca_policy
)];
4899 if (!nl_policy_parse(msg
, NLMSG_HDRLEN
+ sizeof(struct tcmsg
),
4900 tca_policy
, ta
, ARRAY_SIZE(ta
))) {
4901 VLOG_WARN_RL(&rl
, "failed to parse qdisc message");
4906 *kind
= nl_attr_get_string(ta
[TCA_KIND
]);
4910 *options
= ta
[TCA_OPTIONS
];
4925 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4926 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4927 * into '*options', and its queue statistics into '*stats'. Any of the output
4928 * arguments may be null.
4930 * Returns 0 if successful, otherwise a positive errno value. */
4932 tc_parse_class(const struct ofpbuf
*msg
, unsigned int *handlep
,
4933 struct nlattr
**options
, struct netdev_queue_stats
*stats
)
4935 static const struct nl_policy tca_policy
[] = {
4936 [TCA_OPTIONS
] = { .type
= NL_A_NESTED
, .optional
= false },
4937 [TCA_STATS2
] = { .type
= NL_A_NESTED
, .optional
= false },
4939 struct nlattr
*ta
[ARRAY_SIZE(tca_policy
)];
4941 if (!nl_policy_parse(msg
, NLMSG_HDRLEN
+ sizeof(struct tcmsg
),
4942 tca_policy
, ta
, ARRAY_SIZE(ta
))) {
4943 VLOG_WARN_RL(&rl
, "failed to parse class message");
4948 struct tcmsg
*tc
= ofpbuf_at_assert(msg
, NLMSG_HDRLEN
, sizeof *tc
);
4949 *handlep
= tc
->tcm_handle
;
4953 *options
= ta
[TCA_OPTIONS
];
4957 const struct gnet_stats_queue
*gsq
;
4958 struct gnet_stats_basic gsb
;
4960 static const struct nl_policy stats_policy
[] = {
4961 [TCA_STATS_BASIC
] = { .type
= NL_A_UNSPEC
, .optional
= false,
4962 .min_len
= sizeof gsb
},
4963 [TCA_STATS_QUEUE
] = { .type
= NL_A_UNSPEC
, .optional
= false,
4964 .min_len
= sizeof *gsq
},
4966 struct nlattr
*sa
[ARRAY_SIZE(stats_policy
)];
4968 if (!nl_parse_nested(ta
[TCA_STATS2
], stats_policy
,
4969 sa
, ARRAY_SIZE(sa
))) {
4970 VLOG_WARN_RL(&rl
, "failed to parse class stats");
4974 /* Alignment issues screw up the length of struct gnet_stats_basic on
4975 * some arch/bitsize combinations. Newer versions of Linux have a
4976 * struct gnet_stats_basic_packed, but we can't depend on that. The
4977 * easiest thing to do is just to make a copy. */
4978 memset(&gsb
, 0, sizeof gsb
);
4979 memcpy(&gsb
, nl_attr_get(sa
[TCA_STATS_BASIC
]),
4980 MIN(nl_attr_get_size(sa
[TCA_STATS_BASIC
]), sizeof gsb
));
4981 stats
->tx_bytes
= gsb
.bytes
;
4982 stats
->tx_packets
= gsb
.packets
;
4984 gsq
= nl_attr_get(sa
[TCA_STATS_QUEUE
]);
4985 stats
->tx_errors
= gsq
->drops
;
4995 memset(stats
, 0, sizeof *stats
);
5000 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
5003 tc_query_class(const struct netdev
*netdev
,
5004 unsigned int handle
, unsigned int parent
,
5005 struct ofpbuf
**replyp
)
5007 struct ofpbuf request
;
5008 struct tcmsg
*tcmsg
;
5011 tcmsg
= tc_make_request(netdev
, RTM_GETTCLASS
, NLM_F_ECHO
, &request
);
5015 tcmsg
->tcm_handle
= handle
;
5016 tcmsg
->tcm_parent
= parent
;
5018 error
= tc_transact(&request
, replyp
);
5020 VLOG_WARN_RL(&rl
, "query %s class %u:%u (parent %u:%u) failed (%s)",
5021 netdev_get_name(netdev
),
5022 tc_get_major(handle
), tc_get_minor(handle
),
5023 tc_get_major(parent
), tc_get_minor(parent
),
5024 ovs_strerror(error
));
5029 /* Equivalent to "tc class del dev <name> handle <handle>". */
5031 tc_delete_class(const struct netdev
*netdev
, unsigned int handle
)
5033 struct ofpbuf request
;
5034 struct tcmsg
*tcmsg
;
5037 tcmsg
= tc_make_request(netdev
, RTM_DELTCLASS
, 0, &request
);
5041 tcmsg
->tcm_handle
= handle
;
5042 tcmsg
->tcm_parent
= 0;
5044 error
= tc_transact(&request
, NULL
);
5046 VLOG_WARN_RL(&rl
, "delete %s class %u:%u failed (%s)",
5047 netdev_get_name(netdev
),
5048 tc_get_major(handle
), tc_get_minor(handle
),
5049 ovs_strerror(error
));
5054 /* Equivalent to "tc qdisc del dev <name> root". */
5056 tc_del_qdisc(struct netdev
*netdev_
)
5058 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5059 struct ofpbuf request
;
5060 struct tcmsg
*tcmsg
;
5063 tcmsg
= tc_make_request(netdev_
, RTM_DELQDISC
, 0, &request
);
5067 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
5068 tcmsg
->tcm_parent
= TC_H_ROOT
;
5070 error
= tc_transact(&request
, NULL
);
5071 if (error
== EINVAL
) {
5072 /* EINVAL probably means that the default qdisc was in use, in which
5073 * case we've accomplished our purpose. */
5076 if (!error
&& netdev
->tc
) {
5077 if (netdev
->tc
->ops
->tc_destroy
) {
5078 netdev
->tc
->ops
->tc_destroy(netdev
->tc
);
5086 getqdisc_is_safe(void)
5088 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
5089 static bool safe
= false;
5091 if (ovsthread_once_start(&once
)) {
5092 struct utsname utsname
;
5095 if (uname(&utsname
) == -1) {
5096 VLOG_WARN("uname failed (%s)", ovs_strerror(errno
));
5097 } else if (!ovs_scan(utsname
.release
, "%d.%d", &major
, &minor
)) {
5098 VLOG_WARN("uname reported bad OS release (%s)", utsname
.release
);
5099 } else if (major
< 2 || (major
== 2 && minor
< 35)) {
5100 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5105 ovsthread_once_done(&once
);
5110 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
5111 * kernel to determine what they are. Returns 0 if successful, otherwise a
5112 * positive errno value. */
5114 tc_query_qdisc(const struct netdev
*netdev_
)
5116 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5117 struct ofpbuf request
, *qdisc
;
5118 const struct tc_ops
*ops
;
5119 struct tcmsg
*tcmsg
;
5127 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5128 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5129 * 2.6.35 without that fix backported to it.
5131 * To avoid the OOPS, we must not make a request that would attempt to dump
5132 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5133 * few others. There are a few ways that I can see to do this, but most of
5134 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5135 * technique chosen here is to assume that any non-default qdisc that we
5136 * create will have a class with handle 1:0. The built-in qdiscs only have
5137 * a class with handle 0:0.
5139 * On Linux 2.6.35+ we use the straightforward method because it allows us
5140 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5141 * in such a case we get no response at all from the kernel (!) if a
5142 * builtin qdisc is in use (which is later caught by "!error &&
5143 * !qdisc->size"). */
5144 tcmsg
= tc_make_request(netdev_
, RTM_GETQDISC
, NLM_F_ECHO
, &request
);
5148 tcmsg
->tcm_handle
= tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5149 tcmsg
->tcm_parent
= getqdisc_is_safe() ? TC_H_ROOT
: 0;
5151 /* Figure out what tc class to instantiate. */
5152 error
= tc_transact(&request
, &qdisc
);
5153 if (!error
&& qdisc
->size
) {
5156 error
= tc_parse_qdisc(qdisc
, &kind
, NULL
);
5158 ops
= &tc_ops_other
;
5160 ops
= tc_lookup_linux_name(kind
);
5162 static struct vlog_rate_limit rl2
= VLOG_RATE_LIMIT_INIT(1, 1);
5163 VLOG_DBG_RL(&rl2
, "unknown qdisc \"%s\"", kind
);
5165 ops
= &tc_ops_other
;
5168 } else if ((!error
&& !qdisc
->size
) || error
== ENOENT
) {
5169 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5170 * set up by some other entity that doesn't have a handle 1:0. We will
5171 * assume that it's the system default qdisc. */
5172 ops
= &tc_ops_default
;
5175 /* Who knows? Maybe the device got deleted. */
5176 VLOG_WARN_RL(&rl
, "query %s qdisc failed (%s)",
5177 netdev_get_name(netdev_
), ovs_strerror(error
));
5178 ops
= &tc_ops_other
;
5181 /* Instantiate it. */
5182 load_error
= ops
->tc_load(CONST_CAST(struct netdev
*, netdev_
), qdisc
);
5183 ovs_assert((load_error
== 0) == (netdev
->tc
!= NULL
));
5184 ofpbuf_delete(qdisc
);
5186 return error
? error
: load_error
;
5189 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5190 approximate the time to transmit packets of various lengths. For an MTU of
5191 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5192 represents two possible packet lengths; for a MTU of 513 through 1024, four
5193 possible lengths; and so on.
5195 Returns, for the specified 'mtu', the number of bits that packet lengths
5196 need to be shifted right to fit within such a 256-entry table. */
5198 tc_calc_cell_log(unsigned int mtu
)
5203 mtu
= ETH_PAYLOAD_MAX
;
5205 mtu
+= ETH_HEADER_LEN
+ VLAN_HEADER_LEN
;
5207 for (cell_log
= 0; mtu
>= 256; cell_log
++) {
5214 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5217 tc_fill_rate(struct tc_ratespec
*rate
, uint64_t Bps
, int mtu
)
5219 memset(rate
, 0, sizeof *rate
);
5220 rate
->cell_log
= tc_calc_cell_log(mtu
);
5221 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5222 /* rate->cell_align = 0; */ /* distro headers. */
5223 rate
->mpu
= ETH_TOTAL_MIN
;
/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
 * attribute of the specified "type".
 *
 * See tc_calc_cell_log() above for a description of "rtab"s. */
static void
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
{
    uint32_t *rtab;
    unsigned int i;

    rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
    for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
        /* Entry i covers packets up to (i + 1) << cell_log bytes, but never
         * less than the minimum packet unit. */
        unsigned packet_size = (i + 1) << rate->cell_log;
        if (packet_size < rate->mpu) {
            packet_size = rate->mpu;
        }
        rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
    }
}
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.) */
static uint32_t
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    /* Never allow a burst smaller than what can accumulate in one scheduler
     * tick plus one full packet, or the shaper would stall. */
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
}
5258 /* Linux-only functions declared in netdev-linux.h */
5260 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5261 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5263 netdev_linux_ethtool_set_flag(struct netdev
*netdev
, uint32_t flag
,
5264 const char *flag_name
, bool enable
)
5266 const char *netdev_name
= netdev_get_name(netdev
);
5267 struct ethtool_value evalue
;
5271 COVERAGE_INC(netdev_get_ethtool
);
5272 memset(&evalue
, 0, sizeof evalue
);
5273 error
= netdev_linux_do_ethtool(netdev_name
,
5274 (struct ethtool_cmd
*)&evalue
,
5275 ETHTOOL_GFLAGS
, "ETHTOOL_GFLAGS");
5280 COVERAGE_INC(netdev_set_ethtool
);
5281 new_flags
= (evalue
.data
& ~flag
) | (enable
? flag
: 0);
5282 if (new_flags
== evalue
.data
) {
5285 evalue
.data
= new_flags
;
5286 error
= netdev_linux_do_ethtool(netdev_name
,
5287 (struct ethtool_cmd
*)&evalue
,
5288 ETHTOOL_SFLAGS
, "ETHTOOL_SFLAGS");
5293 COVERAGE_INC(netdev_get_ethtool
);
5294 memset(&evalue
, 0, sizeof evalue
);
5295 error
= netdev_linux_do_ethtool(netdev_name
,
5296 (struct ethtool_cmd
*)&evalue
,
5297 ETHTOOL_GFLAGS
, "ETHTOOL_GFLAGS");
5302 if (new_flags
!= evalue
.data
) {
5303 VLOG_WARN_RL(&rl
, "attempt to %s ethtool %s flag on network "
5304 "device %s failed", enable
? "enable" : "disable",
5305 flag_name
, netdev_name
);
5312 /* Utility functions. */
5314 /* Copies 'src' into 'dst', performing format conversion in the process. */
5316 netdev_stats_from_rtnl_link_stats(struct netdev_stats
*dst
,
5317 const struct rtnl_link_stats
*src
)
5319 dst
->rx_packets
= src
->rx_packets
;
5320 dst
->tx_packets
= src
->tx_packets
;
5321 dst
->rx_bytes
= src
->rx_bytes
;
5322 dst
->tx_bytes
= src
->tx_bytes
;
5323 dst
->rx_errors
= src
->rx_errors
;
5324 dst
->tx_errors
= src
->tx_errors
;
5325 dst
->rx_dropped
= src
->rx_dropped
;
5326 dst
->tx_dropped
= src
->tx_dropped
;
5327 dst
->multicast
= src
->multicast
;
5328 dst
->collisions
= src
->collisions
;
5329 dst
->rx_length_errors
= src
->rx_length_errors
;
5330 dst
->rx_over_errors
= src
->rx_over_errors
;
5331 dst
->rx_crc_errors
= src
->rx_crc_errors
;
5332 dst
->rx_frame_errors
= src
->rx_frame_errors
;
5333 dst
->rx_fifo_errors
= src
->rx_fifo_errors
;
5334 dst
->rx_missed_errors
= src
->rx_missed_errors
;
5335 dst
->tx_aborted_errors
= src
->tx_aborted_errors
;
5336 dst
->tx_carrier_errors
= src
->tx_carrier_errors
;
5337 dst
->tx_fifo_errors
= src
->tx_fifo_errors
;
5338 dst
->tx_heartbeat_errors
= src
->tx_heartbeat_errors
;
5339 dst
->tx_window_errors
= src
->tx_window_errors
;
5342 /* Copies 'src' into 'dst', performing format conversion in the process. */
5344 netdev_stats_from_rtnl_link_stats64(struct netdev_stats
*dst
,
5345 const struct rtnl_link_stats64
*src
)
5347 dst
->rx_packets
= src
->rx_packets
;
5348 dst
->tx_packets
= src
->tx_packets
;
5349 dst
->rx_bytes
= src
->rx_bytes
;
5350 dst
->tx_bytes
= src
->tx_bytes
;
5351 dst
->rx_errors
= src
->rx_errors
;
5352 dst
->tx_errors
= src
->tx_errors
;
5353 dst
->rx_dropped
= src
->rx_dropped
;
5354 dst
->tx_dropped
= src
->tx_dropped
;
5355 dst
->multicast
= src
->multicast
;
5356 dst
->collisions
= src
->collisions
;
5357 dst
->rx_length_errors
= src
->rx_length_errors
;
5358 dst
->rx_over_errors
= src
->rx_over_errors
;
5359 dst
->rx_crc_errors
= src
->rx_crc_errors
;
5360 dst
->rx_frame_errors
= src
->rx_frame_errors
;
5361 dst
->rx_fifo_errors
= src
->rx_fifo_errors
;
5362 dst
->rx_missed_errors
= src
->rx_missed_errors
;
5363 dst
->tx_aborted_errors
= src
->tx_aborted_errors
;
5364 dst
->tx_carrier_errors
= src
->tx_carrier_errors
;
5365 dst
->tx_fifo_errors
= src
->tx_fifo_errors
;
5366 dst
->tx_heartbeat_errors
= src
->tx_heartbeat_errors
;
5367 dst
->tx_window_errors
= src
->tx_window_errors
;
5371 get_stats_via_netlink(const struct netdev
*netdev_
, struct netdev_stats
*stats
)
5373 struct ofpbuf request
;
5374 struct ofpbuf
*reply
;
5377 /* Filtering all counters by default */
5378 memset(stats
, 0xFF, sizeof(struct netdev_stats
));
5380 ofpbuf_init(&request
, 0);
5381 nl_msg_put_nlmsghdr(&request
,
5382 sizeof(struct ifinfomsg
) + NL_ATTR_SIZE(IFNAMSIZ
),
5383 RTM_GETLINK
, NLM_F_REQUEST
);
5384 ofpbuf_put_zeros(&request
, sizeof(struct ifinfomsg
));
5385 nl_msg_put_string(&request
, IFLA_IFNAME
, netdev_get_name(netdev_
));
5386 error
= nl_transact(NETLINK_ROUTE
, &request
, &reply
);
5387 ofpbuf_uninit(&request
);
5392 if (ofpbuf_try_pull(reply
, NLMSG_HDRLEN
+ sizeof(struct ifinfomsg
))) {
5393 const struct nlattr
*a
= nl_attr_find(reply
, 0, IFLA_STATS64
);
5394 if (a
&& nl_attr_get_size(a
) >= sizeof(struct rtnl_link_stats64
)) {
5395 netdev_stats_from_rtnl_link_stats64(stats
, nl_attr_get(a
));
5398 const struct nlattr
*a
= nl_attr_find(reply
, 0, IFLA_STATS
);
5399 if (a
&& nl_attr_get_size(a
) >= sizeof(struct rtnl_link_stats
)) {
5400 netdev_stats_from_rtnl_link_stats(stats
, nl_attr_get(a
));
5403 VLOG_WARN_RL(&rl
, "RTM_GETLINK reply lacks stats");
5408 VLOG_WARN_RL(&rl
, "short RTM_GETLINK reply");
5413 ofpbuf_delete(reply
);
5418 get_flags(const struct netdev
*dev
, unsigned int *flags
)
5424 error
= af_inet_ifreq_ioctl(dev
->name
, &ifr
, SIOCGIFFLAGS
, "SIOCGIFFLAGS");
5426 *flags
= ifr
.ifr_flags
;
5432 set_flags(const char *name
, unsigned int flags
)
5436 ifr
.ifr_flags
= flags
;
5437 return af_inet_ifreq_ioctl(name
, &ifr
, SIOCSIFFLAGS
, "SIOCSIFFLAGS");
5441 do_get_ifindex(const char *netdev_name
)
5446 ovs_strzcpy(ifr
.ifr_name
, netdev_name
, sizeof ifr
.ifr_name
);
5447 COVERAGE_INC(netdev_get_ifindex
);
5449 error
= af_inet_ioctl(SIOCGIFINDEX
, &ifr
);
5451 VLOG_WARN_RL(&rl
, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5452 netdev_name
, ovs_strerror(error
));
5455 return ifr
.ifr_ifindex
;
5459 get_ifindex(const struct netdev
*netdev_
, int *ifindexp
)
5461 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5463 if (!(netdev
->cache_valid
& VALID_IFINDEX
)) {
5464 int ifindex
= do_get_ifindex(netdev_get_name(netdev_
));
5467 netdev
->get_ifindex_error
= -ifindex
;
5468 netdev
->ifindex
= 0;
5470 netdev
->get_ifindex_error
= 0;
5471 netdev
->ifindex
= ifindex
;
5473 netdev
->cache_valid
|= VALID_IFINDEX
;
5476 *ifindexp
= netdev
->ifindex
;
5477 return netdev
->get_ifindex_error
;
5481 get_etheraddr(const char *netdev_name
, struct eth_addr
*ea
)
5487 memset(&ifr
, 0, sizeof ifr
);
5488 ovs_strzcpy(ifr
.ifr_name
, netdev_name
, sizeof ifr
.ifr_name
);
5489 COVERAGE_INC(netdev_get_hwaddr
);
5490 error
= af_inet_ioctl(SIOCGIFHWADDR
, &ifr
);
5492 /* ENODEV probably means that a vif disappeared asynchronously and
5493 * hasn't been removed from the database yet, so reduce the log level
5494 * to INFO for that case. */
5495 VLOG(error
== ENODEV
? VLL_INFO
: VLL_ERR
,
5496 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
5497 netdev_name
, ovs_strerror(error
));
5500 hwaddr_family
= ifr
.ifr_hwaddr
.sa_family
;
5501 if (hwaddr_family
!= AF_UNSPEC
&& hwaddr_family
!= ARPHRD_ETHER
) {
5502 VLOG_INFO("%s device has unknown hardware address family %d",
5503 netdev_name
, hwaddr_family
);
5506 memcpy(ea
, ifr
.ifr_hwaddr
.sa_data
, ETH_ADDR_LEN
);
5511 set_etheraddr(const char *netdev_name
, const struct eth_addr mac
)
5516 memset(&ifr
, 0, sizeof ifr
);
5517 ovs_strzcpy(ifr
.ifr_name
, netdev_name
, sizeof ifr
.ifr_name
);
5518 ifr
.ifr_hwaddr
.sa_family
= ARPHRD_ETHER
;
5519 memcpy(ifr
.ifr_hwaddr
.sa_data
, &mac
, ETH_ADDR_LEN
);
5520 COVERAGE_INC(netdev_set_hwaddr
);
5521 error
= af_inet_ioctl(SIOCSIFHWADDR
, &ifr
);
5523 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
5524 netdev_name
, ovs_strerror(error
));
5530 netdev_linux_do_ethtool(const char *name
, struct ethtool_cmd
*ecmd
,
5531 int cmd
, const char *cmd_name
)
5536 memset(&ifr
, 0, sizeof ifr
);
5537 ovs_strzcpy(ifr
.ifr_name
, name
, sizeof ifr
.ifr_name
);
5538 ifr
.ifr_data
= (caddr_t
) ecmd
;
5541 error
= af_inet_ioctl(SIOCETHTOOL
, &ifr
);
5543 if (error
!= EOPNOTSUPP
) {
5544 VLOG_WARN_RL(&rl
, "ethtool command %s on network device %s "
5545 "failed: %s", cmd_name
, name
, ovs_strerror(error
));
5547 /* The device doesn't support this operation. That's pretty
5548 * common, so there's no point in logging anything. */
5554 /* Returns an AF_PACKET raw socket or a negative errno value. */
5556 af_packet_sock(void)
5558 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
5561 if (ovsthread_once_start(&once
)) {
5562 sock
= socket(AF_PACKET
, SOCK_RAW
, 0);
5564 int error
= set_nonblocking(sock
);
5571 VLOG_ERR("failed to create packet socket: %s",
5572 ovs_strerror(errno
));
5574 ovsthread_once_done(&once
);