2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <sys/utsname.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dp-packet.h"
53 #include "dpif-netlink.h"
54 #include "dpif-netdev.h"
55 #include "openvswitch/dynamic-string.h"
56 #include "fatal-signal.h"
59 #include "netdev-provider.h"
60 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
64 #include "openvswitch/ofpbuf.h"
65 #include "openflow/openflow.h"
66 #include "ovs-atomic.h"
68 #include "poll-loop.h"
69 #include "rtnetlink.h"
71 #include "socket-util.h"
74 #include "unaligned.h"
75 #include "openvswitch/vlog.h"
VLOG_DEFINE_THIS_MODULE(netdev_linux);
COVERAGE_DEFINE(netdev_set_policing);
COVERAGE_DEFINE(netdev_arp_lookup);
COVERAGE_DEFINE(netdev_get_ifindex);
COVERAGE_DEFINE(netdev_get_hwaddr);
COVERAGE_DEFINE(netdev_set_hwaddr);
COVERAGE_DEFINE(netdev_get_ethtool);
COVERAGE_DEFINE(netdev_set_ethtool);
88 /* These were introduced in Linux 2.6.14, so they might be missing if we have
90 #ifndef ADVERTISED_Pause
91 #define ADVERTISED_Pause (1 << 13)
93 #ifndef ADVERTISED_Asym_Pause
94 #define ADVERTISED_Asym_Pause (1 << 14)
97 /* These were introduced in Linux 2.6.24, so they might be missing if we
98 * have old headers. */
99 #ifndef ETHTOOL_GFLAGS
100 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
102 #ifndef ETHTOOL_SFLAGS
103 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
106 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
109 #define TC_RTAB_SIZE 1024
112 /* Linux 2.6.21 introduced struct tpacket_auxdata.
113 * Linux 2.6.27 added the tp_vlan_tci member.
114 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
115 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
116 * TP_STATUS_VLAN_TPID_VALID.
118 * With all this churn it's easiest to unconditionally define a replacement
119 * structure that has everything we want.
121 #ifndef PACKET_AUXDATA
122 #define PACKET_AUXDATA 8
124 #ifndef TP_STATUS_VLAN_VALID
125 #define TP_STATUS_VLAN_VALID (1 << 4)
127 #ifndef TP_STATUS_VLAN_TPID_VALID
128 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
130 #undef tpacket_auxdata
131 #define tpacket_auxdata rpl_tpacket_auxdata
132 struct tpacket_auxdata
{
138 uint16_t tp_vlan_tci
;
139 uint16_t tp_vlan_tpid
;
142 /* Linux 2.6.27 introduced ethtool_cmd_speed
144 * To avoid revisiting problems reported with using configure to detect
145 * compatibility (see report at
146 * http://openvswitch.org/pipermail/dev/2014-October/047978.html)
147 * unconditionally replace ethtool_cmd_speed. */
148 #define ethtool_cmd_speed rpl_ethtool_cmd_speed
/* Returns the link speed in Mbps encoded in 'ep'.  The kernel splits the
 * 32-bit speed across two 16-bit fields for ABI-compatibility reasons:
 * 'speed' carries the low half and 'speed_hi' the high half. */
static inline uint32_t
rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
{
    uint32_t hi = ep->speed_hi;

    return (hi << 16) | ep->speed;
}
154 /* Linux 2.6.30 introduced supported and advertised flags for
155 * 1G base KX, and 10G base KX4, KR and R. */
156 #ifndef SUPPORTED_1000baseKX_Full
157 #define SUPPORTED_1000baseKX_Full (1 << 17)
158 #define SUPPORTED_10000baseKX4_Full (1 << 18)
159 #define SUPPORTED_10000baseKR_Full (1 << 19)
160 #define SUPPORTED_10000baseR_FEC (1 << 20)
161 #define ADVERTISED_1000baseKX_Full (1 << 17)
162 #define ADVERTISED_10000baseKX4_Full (1 << 18)
163 #define ADVERTISED_10000baseKR_Full (1 << 19)
164 #define ADVERTISED_10000baseR_FEC (1 << 20)
167 /* Linux 3.5 introduced supported and advertised flags for
168 * 40G base KR4, CR4, SR4 and LR4. */
169 #ifndef SUPPORTED_40000baseKR4_Full
170 #define SUPPORTED_40000baseKR4_Full (1 << 23)
171 #define SUPPORTED_40000baseCR4_Full (1 << 24)
172 #define SUPPORTED_40000baseSR4_Full (1 << 25)
173 #define SUPPORTED_40000baseLR4_Full (1 << 26)
174 #define ADVERTISED_40000baseKR4_Full (1 << 23)
175 #define ADVERTISED_40000baseCR4_Full (1 << 24)
176 #define ADVERTISED_40000baseSR4_Full (1 << 25)
177 #define ADVERTISED_40000baseLR4_Full (1 << 26)
180 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
182 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
183 * 2.6.32-431.29.2.el6.x86_64 (see report at
184 * http://openvswitch.org/pipermail/dev/2014-October/047978.html). Maybe
185 * if_link.h is not self-contained on those kernels. It is easiest to
186 * unconditionally define a replacement. */
188 #define IFLA_STATS64 23
190 #define rtnl_link_stats64 rpl_rtnl_link_stats64
191 struct rtnl_link_stats64
{
203 uint64_t rx_length_errors
;
204 uint64_t rx_over_errors
;
205 uint64_t rx_crc_errors
;
206 uint64_t rx_frame_errors
;
207 uint64_t rx_fifo_errors
;
208 uint64_t rx_missed_errors
;
210 uint64_t tx_aborted_errors
;
211 uint64_t tx_carrier_errors
;
212 uint64_t tx_fifo_errors
;
213 uint64_t tx_heartbeat_errors
;
214 uint64_t tx_window_errors
;
216 uint64_t rx_compressed
;
217 uint64_t tx_compressed
;
221 VALID_IFINDEX
= 1 << 0,
222 VALID_ETHERADDR
= 1 << 1,
225 VALID_POLICING
= 1 << 4,
226 VALID_VPORT_STAT_ERROR
= 1 << 5,
227 VALID_DRVINFO
= 1 << 6,
228 VALID_FEATURES
= 1 << 7,
231 /* Traffic control. */
233 /* An instance of a traffic control class. Always associated with a particular
236 * Each TC implementation subclasses this with whatever additional data it
239 const struct tc_ops
*ops
;
240 struct hmap queues
; /* Contains "struct tc_queue"s.
241 * Read by generic TC layer.
242 * Written only by TC implementation. */
245 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
247 /* One traffic control queue.
249 * Each TC implementation subclasses this with whatever additional data it
252 struct hmap_node hmap_node
; /* In struct tc's "queues" hmap. */
253 unsigned int queue_id
; /* OpenFlow queue ID. */
254 long long int created
; /* Time queue was created, in msecs. */
257 /* A particular kind of traffic control. Each implementation generally maps to
258 * one particular Linux qdisc class.
260 * The functions below return 0 if successful or a positive errno value on
261 * failure, except where otherwise noted. All of them must be provided, except
262 * where otherwise noted. */
264 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
265 * This is null for tc_ops_default and tc_ops_other, for which there are no
266 * appropriate values. */
267 const char *linux_name
;
269 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
270 const char *ovs_name
;
272 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
273 * queues. The queues are numbered 0 through n_queues - 1. */
274 unsigned int n_queues
;
276 /* Called to install this TC class on 'netdev'. The implementation should
277 * make the Netlink calls required to set up 'netdev' with the right qdisc
278 * and configure it according to 'details'. The implementation may assume
279 * that the current qdisc is the default; that is, there is no need for it
280 * to delete the current qdisc before installing itself.
282 * The contents of 'details' should be documented as valid for 'ovs_name'
283 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
284 * (which is built as ovs-vswitchd.conf.db(8)).
286 * This function must return 0 if and only if it sets 'netdev->tc' to an
287 * initialized 'struct tc'.
289 * (This function is null for tc_ops_other, which cannot be installed. For
290 * other TC classes it should always be nonnull.) */
291 int (*tc_install
)(struct netdev
*netdev
, const struct smap
*details
);
293 /* Called when the netdev code determines (through a Netlink query) that
294 * this TC class's qdisc is installed on 'netdev', but we didn't install
295 * it ourselves and so don't know any of the details.
297 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
298 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
299 * implementation should parse the other attributes of 'nlmsg' as
300 * necessary to determine its configuration. If necessary it should also
301 * use Netlink queries to determine the configuration of queues on
304 * This function must return 0 if and only if it sets 'netdev->tc' to an
305 * initialized 'struct tc'. */
306 int (*tc_load
)(struct netdev
*netdev
, struct ofpbuf
*nlmsg
);
308 /* Destroys the data structures allocated by the implementation as part of
309 * 'tc'. (This includes destroying 'tc->queues' by calling
312 * The implementation should not need to perform any Netlink calls. If
313 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
314 * (But it may not be desirable.)
316 * This function may be null if 'tc' is trivial. */
317 void (*tc_destroy
)(struct tc
*tc
);
319 /* Retrieves details of 'netdev->tc' configuration into 'details'.
321 * The implementation should not need to perform any Netlink calls, because
322 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
323 * cached the configuration.
325 * The contents of 'details' should be documented as valid for 'ovs_name'
326 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
327 * (which is built as ovs-vswitchd.conf.db(8)).
329 * This function may be null if 'tc' is not configurable.
331 int (*qdisc_get
)(const struct netdev
*netdev
, struct smap
*details
);
333 /* Reconfigures 'netdev->tc' according to 'details', performing any
334 * required Netlink calls to complete the reconfiguration.
336 * The contents of 'details' should be documented as valid for 'ovs_name'
337 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
338 * (which is built as ovs-vswitchd.conf.db(8)).
340 * This function may be null if 'tc' is not configurable.
342 int (*qdisc_set
)(struct netdev
*, const struct smap
*details
);
344 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
345 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
347 * The contents of 'details' should be documented as valid for 'ovs_name'
348 * in the "other_config" column in the "Queue" table in
349 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
351 * The implementation should not need to perform any Netlink calls, because
352 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
353 * cached the queue configuration.
355 * This function may be null if 'tc' does not have queues ('n_queues' is
357 int (*class_get
)(const struct netdev
*netdev
, const struct tc_queue
*queue
,
358 struct smap
*details
);
360 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
361  * 'details', performing any required Netlink calls to complete the
362 * reconfiguration. The caller ensures that 'queue_id' is less than
365 * The contents of 'details' should be documented as valid for 'ovs_name'
366 * in the "other_config" column in the "Queue" table in
367 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
369 * This function may be null if 'tc' does not have queues or its queues are
370 * not configurable. */
371 int (*class_set
)(struct netdev
*, unsigned int queue_id
,
372 const struct smap
*details
);
374 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
375 * tc_queue's within 'netdev->tc->queues'.
377 * This function may be null if 'tc' does not have queues or its queues
378 * cannot be deleted. */
379 int (*class_delete
)(struct netdev
*, struct tc_queue
*queue
);
381 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
382 * 'struct tc_queue's within 'netdev->tc->queues'.
384 * On success, initializes '*stats'.
386 * This function may be null if 'tc' does not have queues or if it cannot
387 * report queue statistics. */
388 int (*class_get_stats
)(const struct netdev
*netdev
,
389 const struct tc_queue
*queue
,
390 struct netdev_queue_stats
*stats
);
392 /* Extracts queue stats from 'nlmsg', which is a response to a
393 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
395 * This function may be null if 'tc' does not have queues or if it cannot
396 * report queue statistics. */
397 int (*class_dump_stats
)(const struct netdev
*netdev
,
398 const struct ofpbuf
*nlmsg
,
399 netdev_dump_queue_stats_cb
*cb
, void *aux
);
403 tc_init(struct tc
*tc
, const struct tc_ops
*ops
)
406 hmap_init(&tc
->queues
);
410 tc_destroy(struct tc
*tc
)
412 hmap_destroy(&tc
->queues
);
415 static const struct tc_ops tc_ops_htb
;
416 static const struct tc_ops tc_ops_hfsc
;
417 static const struct tc_ops tc_ops_codel
;
418 static const struct tc_ops tc_ops_fqcodel
;
419 static const struct tc_ops tc_ops_sfq
;
420 static const struct tc_ops tc_ops_default
;
421 static const struct tc_ops tc_ops_other
;
423 static const struct tc_ops
*const tcs
[] = {
424 &tc_ops_htb
, /* Hierarchy token bucket (see tc-htb(8)). */
425 &tc_ops_hfsc
, /* Hierarchical fair service curve. */
426 &tc_ops_codel
, /* Controlled delay */
427 &tc_ops_fqcodel
, /* Fair queue controlled delay */
428 &tc_ops_sfq
, /* Stochastic fair queueing */
429 &tc_ops_default
, /* Default qdisc (see tc-pfifo_fast(8)). */
430 &tc_ops_other
, /* Some other qdisc. */
434 static unsigned int tc_make_handle(unsigned int major
, unsigned int minor
);
435 static unsigned int tc_get_major(unsigned int handle
);
436 static unsigned int tc_get_minor(unsigned int handle
);
438 static unsigned int tc_ticks_to_bytes(unsigned int rate
, unsigned int ticks
);
439 static unsigned int tc_bytes_to_ticks(unsigned int rate
, unsigned int size
);
440 static unsigned int tc_buffer_per_jiffy(unsigned int rate
);
442 static struct tcmsg
*tc_make_request(const struct netdev
*, int type
,
443 unsigned int flags
, struct ofpbuf
*);
444 static int tc_transact(struct ofpbuf
*request
, struct ofpbuf
**replyp
);
445 static int tc_add_del_ingress_qdisc(struct netdev
*netdev
, bool add
);
446 static int tc_add_policer(struct netdev
*,
447 uint32_t kbits_rate
, uint32_t kbits_burst
);
449 static int tc_parse_qdisc(const struct ofpbuf
*, const char **kind
,
450 struct nlattr
**options
);
451 static int tc_parse_class(const struct ofpbuf
*, unsigned int *queue_id
,
452 struct nlattr
**options
,
453 struct netdev_queue_stats
*);
454 static int tc_query_class(const struct netdev
*,
455 unsigned int handle
, unsigned int parent
,
456 struct ofpbuf
**replyp
);
457 static int tc_delete_class(const struct netdev
*, unsigned int handle
);
459 static int tc_del_qdisc(struct netdev
*netdev
);
460 static int tc_query_qdisc(const struct netdev
*netdev
);
462 static int tc_calc_cell_log(unsigned int mtu
);
463 static void tc_fill_rate(struct tc_ratespec
*rate
, uint64_t bps
, int mtu
);
464 static void tc_put_rtab(struct ofpbuf
*, uint16_t type
,
465 const struct tc_ratespec
*rate
);
466 static int tc_calc_buffer(unsigned int Bps
, int mtu
, uint64_t burst_bytes
);
468 struct netdev_linux
{
471 /* Protects all members below. */
472 struct ovs_mutex mutex
;
474 unsigned int cache_valid
;
476 bool miimon
; /* Link status of last poll. */
477 long long int miimon_interval
; /* Miimon Poll rate. Disabled if <= 0. */
478 struct timer miimon_timer
;
480 /* The following are figured out "on demand" only. They are only valid
481 * when the corresponding VALID_* bit in 'cache_valid' is set. */
483 struct eth_addr etheraddr
;
485 unsigned int ifi_flags
;
486 long long int carrier_resets
;
487 uint32_t kbits_rate
; /* Policing data. */
488 uint32_t kbits_burst
;
489 int vport_stats_error
; /* Cached error code from vport_get_stats().
490 0 or an errno value. */
491 int netdev_mtu_error
; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
492 int ether_addr_error
; /* Cached error code from set/get etheraddr. */
493 int netdev_policing_error
; /* Cached error code from set policing. */
494 int get_features_error
; /* Cached error code from ETHTOOL_GSET. */
495 int get_ifindex_error
; /* Cached error code from SIOCGIFINDEX. */
497 enum netdev_features current
; /* Cached from ETHTOOL_GSET. */
498 enum netdev_features advertised
; /* Cached from ETHTOOL_GSET. */
499 enum netdev_features supported
; /* Cached from ETHTOOL_GSET. */
501 struct ethtool_drvinfo drvinfo
; /* Cached from ETHTOOL_GDRVINFO. */
504 /* For devices of class netdev_tap_class only. */
508 struct netdev_rxq_linux
{
509 struct netdev_rxq up
;
514 /* This is set pretty low because we probably won't learn anything from the
515 * additional log messages. */
516 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(5, 20);
518 /* Polling miimon status for all ports causes performance degradation when
519 * handling a large number of ports. If there are no devices using miimon, then
520 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
522 * Readers do not depend on this variable synchronizing with the related
523 * changes in the device miimon status, so we can use atomic_count. */
524 static atomic_count miimon_cnt
= ATOMIC_COUNT_INIT(0);
526 static void netdev_linux_run(void);
528 static int netdev_linux_do_ethtool(const char *name
, struct ethtool_cmd
*,
529 int cmd
, const char *cmd_name
);
530 static int get_flags(const struct netdev
*, unsigned int *flags
);
531 static int set_flags(const char *, unsigned int flags
);
532 static int update_flags(struct netdev_linux
*netdev
, enum netdev_flags off
,
533 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
534 OVS_REQUIRES(netdev
->mutex
);
535 static int do_get_ifindex(const char *netdev_name
);
536 static int get_ifindex(const struct netdev
*, int *ifindexp
);
537 static int do_set_addr(struct netdev
*netdev
,
538 int ioctl_nr
, const char *ioctl_name
,
539 struct in_addr addr
);
540 static int get_etheraddr(const char *netdev_name
, struct eth_addr
*ea
);
541 static int set_etheraddr(const char *netdev_name
, const struct eth_addr
);
542 static int get_stats_via_netlink(const struct netdev
*, struct netdev_stats
*);
543 static int af_packet_sock(void);
544 static bool netdev_linux_miimon_enabled(void);
545 static void netdev_linux_miimon_run(void);
546 static void netdev_linux_miimon_wait(void);
547 static int netdev_linux_get_mtu__(struct netdev_linux
*netdev
, int *mtup
);
550 is_netdev_linux_class(const struct netdev_class
*netdev_class
)
552 return netdev_class
->run
== netdev_linux_run
;
556 is_tap_netdev(const struct netdev
*netdev
)
558 return netdev_get_class(netdev
) == &netdev_tap_class
;
561 static struct netdev_linux
*
562 netdev_linux_cast(const struct netdev
*netdev
)
564 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev
)));
566 return CONTAINER_OF(netdev
, struct netdev_linux
, up
);
569 static struct netdev_rxq_linux
*
570 netdev_rxq_linux_cast(const struct netdev_rxq
*rx
)
572 ovs_assert(is_netdev_linux_class(netdev_get_class(rx
->netdev
)));
573 return CONTAINER_OF(rx
, struct netdev_rxq_linux
, up
);
576 static void netdev_linux_update(struct netdev_linux
*netdev
,
577 const struct rtnetlink_change
*)
578 OVS_REQUIRES(netdev
->mutex
);
579 static void netdev_linux_changed(struct netdev_linux
*netdev
,
580 unsigned int ifi_flags
, unsigned int mask
)
581 OVS_REQUIRES(netdev
->mutex
);
583 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
584 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
585 * if no such socket could be created. */
586 static struct nl_sock
*
587 netdev_linux_notify_sock(void)
589 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
590 static struct nl_sock
*sock
;
591 unsigned int mcgroups
[] = {RTNLGRP_LINK
, RTNLGRP_IPV4_IFADDR
,
592 RTNLGRP_IPV6_IFADDR
, RTNLGRP_IPV6_IFINFO
};
594 if (ovsthread_once_start(&once
)) {
597 error
= nl_sock_create(NETLINK_ROUTE
, &sock
);
601 for (i
= 0; i
< ARRAY_SIZE(mcgroups
); i
++) {
602 error
= nl_sock_join_mcgroup(sock
, mcgroups
[i
]);
604 nl_sock_destroy(sock
);
610 ovsthread_once_done(&once
);
617 netdev_linux_miimon_enabled(void)
619 return atomic_count_get(&miimon_cnt
) > 0;
623 netdev_linux_run(void)
625 struct nl_sock
*sock
;
628 if (netdev_linux_miimon_enabled()) {
629 netdev_linux_miimon_run();
632 sock
= netdev_linux_notify_sock();
638 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(1, 5);
639 uint64_t buf_stub
[4096 / 8];
642 ofpbuf_use_stub(&buf
, buf_stub
, sizeof buf_stub
);
643 error
= nl_sock_recv(sock
, &buf
, false);
645 struct rtnetlink_change change
;
647 if (rtnetlink_parse(&buf
, &change
)) {
648 struct netdev
*netdev_
= NULL
;
649 char dev_name
[IFNAMSIZ
];
651 if (!change
.ifname
) {
652 change
.ifname
= if_indextoname(change
.if_index
, dev_name
);
656 netdev_
= netdev_from_name(change
.ifname
);
658 if (netdev_
&& is_netdev_linux_class(netdev_
->netdev_class
)) {
659 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
661 ovs_mutex_lock(&netdev
->mutex
);
662 netdev_linux_update(netdev
, &change
);
663 ovs_mutex_unlock(&netdev
->mutex
);
665 netdev_close(netdev_
);
667 } else if (error
== ENOBUFS
) {
668 struct shash device_shash
;
669 struct shash_node
*node
;
673 shash_init(&device_shash
);
674 netdev_get_devices(&netdev_linux_class
, &device_shash
);
675 SHASH_FOR_EACH (node
, &device_shash
) {
676 struct netdev
*netdev_
= node
->data
;
677 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
680 ovs_mutex_lock(&netdev
->mutex
);
681 get_flags(netdev_
, &flags
);
682 netdev_linux_changed(netdev
, flags
, 0);
683 ovs_mutex_unlock(&netdev
->mutex
);
685 netdev_close(netdev_
);
687 shash_destroy(&device_shash
);
688 } else if (error
!= EAGAIN
) {
689 VLOG_WARN_RL(&rl
, "error reading or parsing netlink (%s)",
690 ovs_strerror(error
));
697 netdev_linux_wait(void)
699 struct nl_sock
*sock
;
701 if (netdev_linux_miimon_enabled()) {
702 netdev_linux_miimon_wait();
704 sock
= netdev_linux_notify_sock();
706 nl_sock_wait(sock
, POLLIN
);
711 netdev_linux_changed(struct netdev_linux
*dev
,
712 unsigned int ifi_flags
, unsigned int mask
)
713 OVS_REQUIRES(dev
->mutex
)
715 netdev_change_seq_changed(&dev
->up
);
717 if ((dev
->ifi_flags
^ ifi_flags
) & IFF_RUNNING
) {
718 dev
->carrier_resets
++;
720 dev
->ifi_flags
= ifi_flags
;
722 dev
->cache_valid
&= mask
;
723 if (!(mask
& VALID_IN
)) {
724 netdev_get_addrs_list_flush();
729 netdev_linux_update(struct netdev_linux
*dev
,
730 const struct rtnetlink_change
*change
)
731 OVS_REQUIRES(dev
->mutex
)
733 if (rtnetlink_type_is_rtnlgrp_link(change
->nlmsg_type
)){
734 if (change
->nlmsg_type
== RTM_NEWLINK
) {
735 /* Keep drv-info, and ip addresses. */
736 netdev_linux_changed(dev
, change
->ifi_flags
,
737 VALID_DRVINFO
| VALID_IN
);
739 /* Update netdev from rtnl-change msg. */
741 dev
->mtu
= change
->mtu
;
742 dev
->cache_valid
|= VALID_MTU
;
743 dev
->netdev_mtu_error
= 0;
746 if (!eth_addr_is_zero(change
->mac
)) {
747 dev
->etheraddr
= change
->mac
;
748 dev
->cache_valid
|= VALID_ETHERADDR
;
749 dev
->ether_addr_error
= 0;
752 dev
->ifindex
= change
->if_index
;
753 dev
->cache_valid
|= VALID_IFINDEX
;
754 dev
->get_ifindex_error
= 0;
756 netdev_linux_changed(dev
, change
->ifi_flags
, 0);
758 } else if (rtnetlink_type_is_rtnlgrp_addr(change
->nlmsg_type
)) {
759 /* Invalidates in4, in6. */
760 netdev_linux_changed(dev
, dev
->ifi_flags
, ~VALID_IN
);
766 static struct netdev
*
767 netdev_linux_alloc(void)
769 struct netdev_linux
*netdev
= xzalloc(sizeof *netdev
);
774 netdev_linux_common_construct(struct netdev_linux
*netdev
)
776 ovs_mutex_init(&netdev
->mutex
);
779 /* Creates system and internal devices. */
781 netdev_linux_construct(struct netdev
*netdev_
)
783 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
786 netdev_linux_common_construct(netdev
);
788 error
= get_flags(&netdev
->up
, &netdev
->ifi_flags
);
789 if (error
== ENODEV
) {
790 if (netdev
->up
.netdev_class
!= &netdev_internal_class
) {
791 /* The device does not exist, so don't allow it to be opened. */
794 /* "Internal" netdevs have to be created as netdev objects before
795 * they exist in the kernel, because creating them in the kernel
796 * happens by passing a netdev object to dpif_port_add().
797 * Therefore, ignore the error. */
804 /* For most types of netdevs we open the device for each call of
805 * netdev_open(). However, this is not the case with tap devices,
806 * since it is only possible to open the device once. In this
807 * situation we share a single file descriptor, and consequently
808 * buffers, across all readers. Therefore once data is read it will
809 * be unavailable to other reads for tap devices. */
811 netdev_linux_construct_tap(struct netdev
*netdev_
)
813 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
814 static const char tap_dev
[] = "/dev/net/tun";
815 const char *name
= netdev_
->name
;
819 netdev_linux_common_construct(netdev
);
821 /* Open tap device. */
822 netdev
->tap_fd
= open(tap_dev
, O_RDWR
);
823 if (netdev
->tap_fd
< 0) {
825 VLOG_WARN("opening \"%s\" failed: %s", tap_dev
, ovs_strerror(error
));
829 /* Create tap device. */
830 ifr
.ifr_flags
= IFF_TAP
| IFF_NO_PI
;
831 ovs_strzcpy(ifr
.ifr_name
, name
, sizeof ifr
.ifr_name
);
832 if (ioctl(netdev
->tap_fd
, TUNSETIFF
, &ifr
) == -1) {
833 VLOG_WARN("%s: creating tap device failed: %s", name
,
834 ovs_strerror(errno
));
839 /* Make non-blocking. */
840 error
= set_nonblocking(netdev
->tap_fd
);
848 close(netdev
->tap_fd
);
853 netdev_linux_destruct(struct netdev
*netdev_
)
855 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
857 if (netdev
->tc
&& netdev
->tc
->ops
->tc_destroy
) {
858 netdev
->tc
->ops
->tc_destroy(netdev
->tc
);
861 if (netdev_get_class(netdev_
) == &netdev_tap_class
862 && netdev
->tap_fd
>= 0)
864 close(netdev
->tap_fd
);
867 if (netdev
->miimon_interval
> 0) {
868 atomic_count_dec(&miimon_cnt
);
871 ovs_mutex_destroy(&netdev
->mutex
);
875 netdev_linux_dealloc(struct netdev
*netdev_
)
877 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
881 static struct netdev_rxq
*
882 netdev_linux_rxq_alloc(void)
884 struct netdev_rxq_linux
*rx
= xzalloc(sizeof *rx
);
889 netdev_linux_rxq_construct(struct netdev_rxq
*rxq_
)
891 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
892 struct netdev
*netdev_
= rx
->up
.netdev
;
893 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
896 ovs_mutex_lock(&netdev
->mutex
);
897 rx
->is_tap
= is_tap_netdev(netdev_
);
899 rx
->fd
= netdev
->tap_fd
;
901 struct sockaddr_ll sll
;
903 /* Result of tcpdump -dd inbound */
904 static const struct sock_filter filt
[] = {
905 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
906 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
907 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
908 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
910 static const struct sock_fprog fprog
= {
911 ARRAY_SIZE(filt
), (struct sock_filter
*) filt
914 /* Create file descriptor. */
915 rx
->fd
= socket(PF_PACKET
, SOCK_RAW
, 0);
918 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error
));
923 if (setsockopt(rx
->fd
, SOL_PACKET
, PACKET_AUXDATA
, &val
, sizeof val
)) {
925 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
926 netdev_get_name(netdev_
), ovs_strerror(error
));
930 /* Set non-blocking mode. */
931 error
= set_nonblocking(rx
->fd
);
936 /* Get ethernet device index. */
937 error
= get_ifindex(&netdev
->up
, &ifindex
);
942 /* Bind to specific ethernet device. */
943 memset(&sll
, 0, sizeof sll
);
944 sll
.sll_family
= AF_PACKET
;
945 sll
.sll_ifindex
= ifindex
;
946 sll
.sll_protocol
= htons(ETH_P_ALL
);
947 if (bind(rx
->fd
, (struct sockaddr
*) &sll
, sizeof sll
) < 0) {
949 VLOG_ERR("%s: failed to bind raw socket (%s)",
950 netdev_get_name(netdev_
), ovs_strerror(error
));
954 /* Filter for only inbound packets. */
955 error
= setsockopt(rx
->fd
, SOL_SOCKET
, SO_ATTACH_FILTER
, &fprog
,
959 VLOG_ERR("%s: failed to attach filter (%s)",
960 netdev_get_name(netdev_
), ovs_strerror(error
));
964 ovs_mutex_unlock(&netdev
->mutex
);
972 ovs_mutex_unlock(&netdev
->mutex
);
977 netdev_linux_rxq_destruct(struct netdev_rxq
*rxq_
)
979 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
987 netdev_linux_rxq_dealloc(struct netdev_rxq
*rxq_
)
989 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
995 auxdata_to_vlan_tpid(const struct tpacket_auxdata
*aux
)
997 if (aux
->tp_status
& TP_STATUS_VLAN_TPID_VALID
) {
998 return htons(aux
->tp_vlan_tpid
);
1000 return htons(ETH_TYPE_VLAN
);
/* Returns true if 'aux' carries a VLAN TCI.  Kernels before 3.0 lack
 * TP_STATUS_VLAN_VALID, so a nonzero TCI is also accepted as evidence (a
 * zero TCI on such kernels is indistinguishable from "no VLAN"). */
static bool
auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
{
    return aux->tp_vlan_tci || (aux->tp_status & TP_STATUS_VLAN_VALID);
}
1011 netdev_linux_rxq_recv_sock(int fd
, struct dp_packet
*buffer
)
1016 struct cmsghdr
*cmsg
;
1018 struct cmsghdr cmsg
;
1019 char buffer
[CMSG_SPACE(sizeof(struct tpacket_auxdata
))];
1023 /* Reserve headroom for a single VLAN tag */
1024 dp_packet_reserve(buffer
, VLAN_HEADER_LEN
);
1025 size
= dp_packet_tailroom(buffer
);
1027 iov
.iov_base
= dp_packet_data(buffer
);
1029 msgh
.msg_name
= NULL
;
1030 msgh
.msg_namelen
= 0;
1031 msgh
.msg_iov
= &iov
;
1032 msgh
.msg_iovlen
= 1;
1033 msgh
.msg_control
= &cmsg_buffer
;
1034 msgh
.msg_controllen
= sizeof cmsg_buffer
;
1038 retval
= recvmsg(fd
, &msgh
, MSG_TRUNC
);
1039 } while (retval
< 0 && errno
== EINTR
);
1043 } else if (retval
> size
) {
1047 dp_packet_set_size(buffer
, dp_packet_size(buffer
) + retval
);
1049 for (cmsg
= CMSG_FIRSTHDR(&msgh
); cmsg
; cmsg
= CMSG_NXTHDR(&msgh
, cmsg
)) {
1050 const struct tpacket_auxdata
*aux
;
1052 if (cmsg
->cmsg_level
!= SOL_PACKET
1053 || cmsg
->cmsg_type
!= PACKET_AUXDATA
1054 || cmsg
->cmsg_len
< CMSG_LEN(sizeof(struct tpacket_auxdata
))) {
1058 aux
= ALIGNED_CAST(struct tpacket_auxdata
*, CMSG_DATA(cmsg
));
1059 if (auxdata_has_vlan_tci(aux
)) {
1060 if (retval
< ETH_HEADER_LEN
) {
1064 eth_push_vlan(buffer
, auxdata_to_vlan_tpid(aux
),
1065 htons(aux
->tp_vlan_tci
));
1074 netdev_linux_rxq_recv_tap(int fd
, struct dp_packet
*buffer
)
1077 size_t size
= dp_packet_tailroom(buffer
);
1080 retval
= read(fd
, dp_packet_data(buffer
), size
);
1081 } while (retval
< 0 && errno
== EINTR
);
1087 dp_packet_set_size(buffer
, dp_packet_size(buffer
) + retval
);
1092 netdev_linux_rxq_recv(struct netdev_rxq
*rxq_
, struct dp_packet
**packets
,
1095 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1096 struct netdev
*netdev
= rx
->up
.netdev
;
1097 struct dp_packet
*buffer
;
1101 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
)) {
1102 mtu
= ETH_PAYLOAD_MAX
;
1105 buffer
= dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN
+ mtu
,
1106 DP_NETDEV_HEADROOM
);
1107 retval
= (rx
->is_tap
1108 ? netdev_linux_rxq_recv_tap(rx
->fd
, buffer
)
1109 : netdev_linux_rxq_recv_sock(rx
->fd
, buffer
));
1112 if (retval
!= EAGAIN
&& retval
!= EMSGSIZE
) {
1113 VLOG_WARN_RL(&rl
, "error receiving Ethernet packet on %s: %s",
1114 netdev_rxq_get_name(rxq_
), ovs_strerror(errno
));
1116 dp_packet_delete(buffer
);
1118 dp_packet_pad(buffer
);
1119 packets
[0] = buffer
;
1127 netdev_linux_rxq_wait(struct netdev_rxq
*rxq_
)
1129 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1130 poll_fd_wait(rx
->fd
, POLLIN
);
1134 netdev_linux_rxq_drain(struct netdev_rxq
*rxq_
)
1136 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1139 int error
= af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_
), &ifr
,
1140 SIOCGIFTXQLEN
, "SIOCGIFTXQLEN");
1144 drain_fd(rx
->fd
, ifr
.ifr_qlen
);
1147 return drain_rcvbuf(rx
->fd
);
1151 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1152 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1153 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1154 * the packet is too big or too small to transmit on the device.
1156 * The caller retains ownership of 'buffer' in all cases.
1158 * The kernel maintains a packet transmission queue, so the caller is not
1159 * expected to do additional queuing of packets. */
1161 netdev_linux_send(struct netdev
*netdev_
, int qid OVS_UNUSED
,
1162 struct dp_packet
**pkts
, int cnt
, bool may_steal
)
1167 /* 'i' is incremented only if there's no error */
1168 for (i
= 0; i
< cnt
;) {
1169 const void *data
= dp_packet_data(pkts
[i
]);
1170 size_t size
= dp_packet_size(pkts
[i
]);
1173 if (!is_tap_netdev(netdev_
)) {
1174 /* Use our AF_PACKET socket to send to this device. */
1175 struct sockaddr_ll sll
;
1181 sock
= af_packet_sock();
1186 ifindex
= netdev_get_ifindex(netdev_
);
1191 /* We don't bother setting most fields in sockaddr_ll because the
1192 * kernel ignores them for SOCK_RAW. */
1193 memset(&sll
, 0, sizeof sll
);
1194 sll
.sll_family
= AF_PACKET
;
1195 sll
.sll_ifindex
= ifindex
;
1197 iov
.iov_base
= CONST_CAST(void *, data
);
1200 msg
.msg_name
= &sll
;
1201 msg
.msg_namelen
= sizeof sll
;
1204 msg
.msg_control
= NULL
;
1205 msg
.msg_controllen
= 0;
1208 retval
= sendmsg(sock
, &msg
, 0);
1210 /* Use the tap fd to send to this device. This is essential for
1211 * tap devices, because packets sent to a tap device with an
1212 * AF_PACKET socket will loop back to be *received* again on the
1213 * tap device. This doesn't occur on other interface types
1214 * because we attach a socket filter to the rx socket. */
1215 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1217 retval
= write(netdev
->tap_fd
, data
, size
);
1221 /* The Linux AF_PACKET implementation never blocks waiting for room
1222 * for packets, instead returning ENOBUFS. Translate this into
1223 * EAGAIN for the caller. */
1224 error
= errno
== ENOBUFS
? EAGAIN
: errno
;
1225 if (error
== EINTR
) {
1226 /* continue without incrementing 'i', i.e. retry this packet */
1230 } else if (retval
!= size
) {
1231 VLOG_WARN_RL(&rl
, "sent partial Ethernet packet (%"PRIuSIZE
" bytes"
1232 " of %"PRIuSIZE
") on %s", retval
, size
,
1233 netdev_get_name(netdev_
));
1238 /* Process the next packet in the batch */
1243 for (i
= 0; i
< cnt
; i
++) {
1244 dp_packet_delete(pkts
[i
]);
1248 if (error
&& error
!= EAGAIN
) {
1249 VLOG_WARN_RL(&rl
, "error sending Ethernet packet on %s: %s",
1250 netdev_get_name(netdev_
), ovs_strerror(error
));
1257 /* Registers with the poll loop to wake up from the next call to poll_block()
1258 * when the packet transmission queue has sufficient room to transmit a packet
1259 * with netdev_send().
1261 * The kernel maintains a packet transmission queue, so the client is not
1262 * expected to do additional queuing of packets. Thus, this function is
1263 * unlikely to ever be used. It is included for completeness. */
1265 netdev_linux_send_wait(struct netdev
*netdev
, int qid OVS_UNUSED
)
1267 if (is_tap_netdev(netdev
)) {
1268 /* TAP device always accepts packets.*/
1269 poll_immediate_wake();
1273 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1274 * otherwise a positive errno value. */
1276 netdev_linux_set_etheraddr(struct netdev
*netdev_
, const struct eth_addr mac
)
1278 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1279 enum netdev_flags old_flags
= 0;
1282 ovs_mutex_lock(&netdev
->mutex
);
1284 if (netdev
->cache_valid
& VALID_ETHERADDR
) {
1285 error
= netdev
->ether_addr_error
;
1286 if (error
|| eth_addr_equals(netdev
->etheraddr
, mac
)) {
1289 netdev
->cache_valid
&= ~VALID_ETHERADDR
;
1292 /* Tap devices must be brought down before setting the address. */
1293 if (is_tap_netdev(netdev_
)) {
1294 update_flags(netdev
, NETDEV_UP
, 0, &old_flags
);
1296 error
= set_etheraddr(netdev_get_name(netdev_
), mac
);
1297 if (!error
|| error
== ENODEV
) {
1298 netdev
->ether_addr_error
= error
;
1299 netdev
->cache_valid
|= VALID_ETHERADDR
;
1301 netdev
->etheraddr
= mac
;
1305 if (is_tap_netdev(netdev_
) && old_flags
& NETDEV_UP
) {
1306 update_flags(netdev
, 0, NETDEV_UP
, &old_flags
);
1310 ovs_mutex_unlock(&netdev
->mutex
);
1314 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1316 netdev_linux_get_etheraddr(const struct netdev
*netdev_
, struct eth_addr
*mac
)
1318 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1321 ovs_mutex_lock(&netdev
->mutex
);
1322 if (!(netdev
->cache_valid
& VALID_ETHERADDR
)) {
1323 netdev
->ether_addr_error
= get_etheraddr(netdev_get_name(netdev_
),
1324 &netdev
->etheraddr
);
1325 netdev
->cache_valid
|= VALID_ETHERADDR
;
1328 error
= netdev
->ether_addr_error
;
1330 *mac
= netdev
->etheraddr
;
1332 ovs_mutex_unlock(&netdev
->mutex
);
1338 netdev_linux_get_mtu__(struct netdev_linux
*netdev
, int *mtup
)
1342 if (!(netdev
->cache_valid
& VALID_MTU
)) {
1345 netdev
->netdev_mtu_error
= af_inet_ifreq_ioctl(
1346 netdev_get_name(&netdev
->up
), &ifr
, SIOCGIFMTU
, "SIOCGIFMTU");
1347 netdev
->mtu
= ifr
.ifr_mtu
;
1348 netdev
->cache_valid
|= VALID_MTU
;
1351 error
= netdev
->netdev_mtu_error
;
1353 *mtup
= netdev
->mtu
;
1359 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1360 * in bytes, not including the hardware header; thus, this is typically 1500
1361 * bytes for Ethernet devices. */
1363 netdev_linux_get_mtu(const struct netdev
*netdev_
, int *mtup
)
1365 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1368 ovs_mutex_lock(&netdev
->mutex
);
1369 error
= netdev_linux_get_mtu__(netdev
, mtup
);
1370 ovs_mutex_unlock(&netdev
->mutex
);
1375 /* Sets the maximum size of transmitted (MTU) for given device using linux
1376 * networking ioctl interface.
1379 netdev_linux_set_mtu(const struct netdev
*netdev_
, int mtu
)
1381 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1385 ovs_mutex_lock(&netdev
->mutex
);
1386 if (netdev
->cache_valid
& VALID_MTU
) {
1387 error
= netdev
->netdev_mtu_error
;
1388 if (error
|| netdev
->mtu
== mtu
) {
1391 netdev
->cache_valid
&= ~VALID_MTU
;
1394 error
= af_inet_ifreq_ioctl(netdev_get_name(netdev_
), &ifr
,
1395 SIOCSIFMTU
, "SIOCSIFMTU");
1396 if (!error
|| error
== ENODEV
) {
1397 netdev
->netdev_mtu_error
= error
;
1398 netdev
->mtu
= ifr
.ifr_mtu
;
1399 netdev
->cache_valid
|= VALID_MTU
;
1402 ovs_mutex_unlock(&netdev
->mutex
);
1406 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1407 * On failure, returns a negative errno value. */
1409 netdev_linux_get_ifindex(const struct netdev
*netdev_
)
1411 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1414 ovs_mutex_lock(&netdev
->mutex
);
1415 error
= get_ifindex(netdev_
, &ifindex
);
1416 ovs_mutex_unlock(&netdev
->mutex
);
1418 return error
? -error
: ifindex
;
1422 netdev_linux_get_carrier(const struct netdev
*netdev_
, bool *carrier
)
1424 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1426 ovs_mutex_lock(&netdev
->mutex
);
1427 if (netdev
->miimon_interval
> 0) {
1428 *carrier
= netdev
->miimon
;
1430 *carrier
= (netdev
->ifi_flags
& IFF_RUNNING
) != 0;
1432 ovs_mutex_unlock(&netdev
->mutex
);
1437 static long long int
1438 netdev_linux_get_carrier_resets(const struct netdev
*netdev_
)
1440 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1441 long long int carrier_resets
;
1443 ovs_mutex_lock(&netdev
->mutex
);
1444 carrier_resets
= netdev
->carrier_resets
;
1445 ovs_mutex_unlock(&netdev
->mutex
);
1447 return carrier_resets
;
1451 netdev_linux_do_miimon(const char *name
, int cmd
, const char *cmd_name
,
1452 struct mii_ioctl_data
*data
)
1457 memset(&ifr
, 0, sizeof ifr
);
1458 memcpy(&ifr
.ifr_data
, data
, sizeof *data
);
1459 error
= af_inet_ifreq_ioctl(name
, &ifr
, cmd
, cmd_name
);
1460 memcpy(data
, &ifr
.ifr_data
, sizeof *data
);
1466 netdev_linux_get_miimon(const char *name
, bool *miimon
)
1468 struct mii_ioctl_data data
;
1473 memset(&data
, 0, sizeof data
);
1474 error
= netdev_linux_do_miimon(name
, SIOCGMIIPHY
, "SIOCGMIIPHY", &data
);
1476 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1477 data
.reg_num
= MII_BMSR
;
1478 error
= netdev_linux_do_miimon(name
, SIOCGMIIREG
, "SIOCGMIIREG",
1482 *miimon
= !!(data
.val_out
& BMSR_LSTATUS
);
1484 VLOG_WARN_RL(&rl
, "%s: failed to query MII", name
);
1487 struct ethtool_cmd ecmd
;
1489 VLOG_DBG_RL(&rl
, "%s: failed to query MII, falling back to ethtool",
1492 COVERAGE_INC(netdev_get_ethtool
);
1493 memset(&ecmd
, 0, sizeof ecmd
);
1494 error
= netdev_linux_do_ethtool(name
, &ecmd
, ETHTOOL_GLINK
,
1497 struct ethtool_value eval
;
1499 memcpy(&eval
, &ecmd
, sizeof eval
);
1500 *miimon
= !!eval
.data
;
1502 VLOG_WARN_RL(&rl
, "%s: ethtool link status failed", name
);
1510 netdev_linux_set_miimon_interval(struct netdev
*netdev_
,
1511 long long int interval
)
1513 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1515 ovs_mutex_lock(&netdev
->mutex
);
1516 interval
= interval
> 0 ? MAX(interval
, 100) : 0;
1517 if (netdev
->miimon_interval
!= interval
) {
1518 if (interval
&& !netdev
->miimon_interval
) {
1519 atomic_count_inc(&miimon_cnt
);
1520 } else if (!interval
&& netdev
->miimon_interval
) {
1521 atomic_count_dec(&miimon_cnt
);
1524 netdev
->miimon_interval
= interval
;
1525 timer_set_expired(&netdev
->miimon_timer
);
1527 ovs_mutex_unlock(&netdev
->mutex
);
1533 netdev_linux_miimon_run(void)
1535 struct shash device_shash
;
1536 struct shash_node
*node
;
1538 shash_init(&device_shash
);
1539 netdev_get_devices(&netdev_linux_class
, &device_shash
);
1540 SHASH_FOR_EACH (node
, &device_shash
) {
1541 struct netdev
*netdev
= node
->data
;
1542 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
1545 ovs_mutex_lock(&dev
->mutex
);
1546 if (dev
->miimon_interval
> 0 && timer_expired(&dev
->miimon_timer
)) {
1547 netdev_linux_get_miimon(dev
->up
.name
, &miimon
);
1548 if (miimon
!= dev
->miimon
) {
1549 dev
->miimon
= miimon
;
1550 netdev_linux_changed(dev
, dev
->ifi_flags
, 0);
1553 timer_set_duration(&dev
->miimon_timer
, dev
->miimon_interval
);
1555 ovs_mutex_unlock(&dev
->mutex
);
1556 netdev_close(netdev
);
1559 shash_destroy(&device_shash
);
1563 netdev_linux_miimon_wait(void)
1565 struct shash device_shash
;
1566 struct shash_node
*node
;
1568 shash_init(&device_shash
);
1569 netdev_get_devices(&netdev_linux_class
, &device_shash
);
1570 SHASH_FOR_EACH (node
, &device_shash
) {
1571 struct netdev
*netdev
= node
->data
;
1572 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
1574 ovs_mutex_lock(&dev
->mutex
);
1575 if (dev
->miimon_interval
> 0) {
1576 timer_wait(&dev
->miimon_timer
);
1578 ovs_mutex_unlock(&dev
->mutex
);
1579 netdev_close(netdev
);
1581 shash_destroy(&device_shash
);
/* Exchanges the values pointed to by 'a' and 'b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t tmp = *a;
    *a = *b;
    *b = tmp;
}
1592 /* Copies 'src' into 'dst', performing format conversion in the process.
1594 * 'src' is allowed to be misaligned. */
1596 netdev_stats_from_ovs_vport_stats(struct netdev_stats
*dst
,
1597 const struct ovs_vport_stats
*src
)
1599 dst
->rx_packets
= get_32aligned_u64(&src
->rx_packets
);
1600 dst
->tx_packets
= get_32aligned_u64(&src
->tx_packets
);
1601 dst
->rx_bytes
= get_32aligned_u64(&src
->rx_bytes
);
1602 dst
->tx_bytes
= get_32aligned_u64(&src
->tx_bytes
);
1603 dst
->rx_errors
= get_32aligned_u64(&src
->rx_errors
);
1604 dst
->tx_errors
= get_32aligned_u64(&src
->tx_errors
);
1605 dst
->rx_dropped
= get_32aligned_u64(&src
->rx_dropped
);
1606 dst
->tx_dropped
= get_32aligned_u64(&src
->tx_dropped
);
1608 dst
->collisions
= 0;
1609 dst
->rx_length_errors
= 0;
1610 dst
->rx_over_errors
= 0;
1611 dst
->rx_crc_errors
= 0;
1612 dst
->rx_frame_errors
= 0;
1613 dst
->rx_fifo_errors
= 0;
1614 dst
->rx_missed_errors
= 0;
1615 dst
->tx_aborted_errors
= 0;
1616 dst
->tx_carrier_errors
= 0;
1617 dst
->tx_fifo_errors
= 0;
1618 dst
->tx_heartbeat_errors
= 0;
1619 dst
->tx_window_errors
= 0;
1623 get_stats_via_vport__(const struct netdev
*netdev
, struct netdev_stats
*stats
)
1625 struct dpif_netlink_vport reply
;
1629 error
= dpif_netlink_vport_get(netdev_get_name(netdev
), &reply
, &buf
);
1632 } else if (!reply
.stats
) {
1637 netdev_stats_from_ovs_vport_stats(stats
, reply
.stats
);
1645 get_stats_via_vport(const struct netdev
*netdev_
,
1646 struct netdev_stats
*stats
)
1648 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1650 if (!netdev
->vport_stats_error
||
1651 !(netdev
->cache_valid
& VALID_VPORT_STAT_ERROR
)) {
1654 error
= get_stats_via_vport__(netdev_
, stats
);
1655 if (error
&& error
!= ENOENT
&& error
!= ENODEV
) {
1656 VLOG_WARN_RL(&rl
, "%s: obtaining netdev stats via vport failed "
1658 netdev_get_name(netdev_
), ovs_strerror(error
));
1660 netdev
->vport_stats_error
= error
;
1661 netdev
->cache_valid
|= VALID_VPORT_STAT_ERROR
;
1665 /* Retrieves current device stats for 'netdev-linux'. */
1667 netdev_linux_get_stats(const struct netdev
*netdev_
,
1668 struct netdev_stats
*stats
)
1670 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1671 struct netdev_stats dev_stats
;
1674 ovs_mutex_lock(&netdev
->mutex
);
1675 get_stats_via_vport(netdev_
, stats
);
1676 error
= get_stats_via_netlink(netdev_
, &dev_stats
);
1678 if (!netdev
->vport_stats_error
) {
1681 } else if (netdev
->vport_stats_error
) {
1682 /* stats not available from OVS then use netdev stats. */
1685 /* Use kernel netdev's packet and byte counts since vport's counters
1686 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1688 stats
->rx_packets
= dev_stats
.rx_packets
;
1689 stats
->rx_bytes
= dev_stats
.rx_bytes
;
1690 stats
->tx_packets
= dev_stats
.tx_packets
;
1691 stats
->tx_bytes
= dev_stats
.tx_bytes
;
1693 stats
->rx_errors
+= dev_stats
.rx_errors
;
1694 stats
->tx_errors
+= dev_stats
.tx_errors
;
1695 stats
->rx_dropped
+= dev_stats
.rx_dropped
;
1696 stats
->tx_dropped
+= dev_stats
.tx_dropped
;
1697 stats
->multicast
+= dev_stats
.multicast
;
1698 stats
->collisions
+= dev_stats
.collisions
;
1699 stats
->rx_length_errors
+= dev_stats
.rx_length_errors
;
1700 stats
->rx_over_errors
+= dev_stats
.rx_over_errors
;
1701 stats
->rx_crc_errors
+= dev_stats
.rx_crc_errors
;
1702 stats
->rx_frame_errors
+= dev_stats
.rx_frame_errors
;
1703 stats
->rx_fifo_errors
+= dev_stats
.rx_fifo_errors
;
1704 stats
->rx_missed_errors
+= dev_stats
.rx_missed_errors
;
1705 stats
->tx_aborted_errors
+= dev_stats
.tx_aborted_errors
;
1706 stats
->tx_carrier_errors
+= dev_stats
.tx_carrier_errors
;
1707 stats
->tx_fifo_errors
+= dev_stats
.tx_fifo_errors
;
1708 stats
->tx_heartbeat_errors
+= dev_stats
.tx_heartbeat_errors
;
1709 stats
->tx_window_errors
+= dev_stats
.tx_window_errors
;
1711 ovs_mutex_unlock(&netdev
->mutex
);
1716 /* Retrieves current device stats for 'netdev-tap' netdev or
1717 * netdev-internal. */
1719 netdev_tap_get_stats(const struct netdev
*netdev_
, struct netdev_stats
*stats
)
1721 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1722 struct netdev_stats dev_stats
;
1725 ovs_mutex_lock(&netdev
->mutex
);
1726 get_stats_via_vport(netdev_
, stats
);
1727 error
= get_stats_via_netlink(netdev_
, &dev_stats
);
1729 if (!netdev
->vport_stats_error
) {
1732 } else if (netdev
->vport_stats_error
) {
1733 /* Transmit and receive stats will appear to be swapped relative to the
1734 * other ports since we are the one sending the data, not a remote
1735 * computer. For consistency, we swap them back here. This does not
1736 * apply if we are getting stats from the vport layer because it always
1737 * tracks stats from the perspective of the switch. */
1740 swap_uint64(&stats
->rx_packets
, &stats
->tx_packets
);
1741 swap_uint64(&stats
->rx_bytes
, &stats
->tx_bytes
);
1742 swap_uint64(&stats
->rx_errors
, &stats
->tx_errors
);
1743 swap_uint64(&stats
->rx_dropped
, &stats
->tx_dropped
);
1744 stats
->rx_length_errors
= 0;
1745 stats
->rx_over_errors
= 0;
1746 stats
->rx_crc_errors
= 0;
1747 stats
->rx_frame_errors
= 0;
1748 stats
->rx_fifo_errors
= 0;
1749 stats
->rx_missed_errors
= 0;
1750 stats
->tx_aborted_errors
= 0;
1751 stats
->tx_carrier_errors
= 0;
1752 stats
->tx_fifo_errors
= 0;
1753 stats
->tx_heartbeat_errors
= 0;
1754 stats
->tx_window_errors
= 0;
1756 /* Use kernel netdev's packet and byte counts since vport counters
1757 * do not reflect packet counts on the wire when GSO, TSO or GRO
1759 stats
->rx_packets
= dev_stats
.tx_packets
;
1760 stats
->rx_bytes
= dev_stats
.tx_bytes
;
1761 stats
->tx_packets
= dev_stats
.rx_packets
;
1762 stats
->tx_bytes
= dev_stats
.rx_bytes
;
1764 stats
->rx_dropped
+= dev_stats
.tx_dropped
;
1765 stats
->tx_dropped
+= dev_stats
.rx_dropped
;
1767 stats
->rx_errors
+= dev_stats
.tx_errors
;
1768 stats
->tx_errors
+= dev_stats
.rx_errors
;
1770 stats
->multicast
+= dev_stats
.multicast
;
1771 stats
->collisions
+= dev_stats
.collisions
;
1773 ovs_mutex_unlock(&netdev
->mutex
);
1779 netdev_internal_get_stats(const struct netdev
*netdev_
,
1780 struct netdev_stats
*stats
)
1782 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1785 ovs_mutex_lock(&netdev
->mutex
);
1786 get_stats_via_vport(netdev_
, stats
);
1787 error
= netdev
->vport_stats_error
;
1788 ovs_mutex_unlock(&netdev
->mutex
);
1794 netdev_linux_read_features(struct netdev_linux
*netdev
)
1796 struct ethtool_cmd ecmd
;
1800 if (netdev
->cache_valid
& VALID_FEATURES
) {
1804 COVERAGE_INC(netdev_get_ethtool
);
1805 memset(&ecmd
, 0, sizeof ecmd
);
1806 error
= netdev_linux_do_ethtool(netdev
->up
.name
, &ecmd
,
1807 ETHTOOL_GSET
, "ETHTOOL_GSET");
1812 /* Supported features. */
1813 netdev
->supported
= 0;
1814 if (ecmd
.supported
& SUPPORTED_10baseT_Half
) {
1815 netdev
->supported
|= NETDEV_F_10MB_HD
;
1817 if (ecmd
.supported
& SUPPORTED_10baseT_Full
) {
1818 netdev
->supported
|= NETDEV_F_10MB_FD
;
1820 if (ecmd
.supported
& SUPPORTED_100baseT_Half
) {
1821 netdev
->supported
|= NETDEV_F_100MB_HD
;
1823 if (ecmd
.supported
& SUPPORTED_100baseT_Full
) {
1824 netdev
->supported
|= NETDEV_F_100MB_FD
;
1826 if (ecmd
.supported
& SUPPORTED_1000baseT_Half
) {
1827 netdev
->supported
|= NETDEV_F_1GB_HD
;
1829 if ((ecmd
.supported
& SUPPORTED_1000baseT_Full
) ||
1830 (ecmd
.supported
& SUPPORTED_1000baseKX_Full
)) {
1831 netdev
->supported
|= NETDEV_F_1GB_FD
;
1833 if ((ecmd
.supported
& SUPPORTED_10000baseT_Full
) ||
1834 (ecmd
.supported
& SUPPORTED_10000baseKX4_Full
) ||
1835 (ecmd
.supported
& SUPPORTED_10000baseKR_Full
) ||
1836 (ecmd
.supported
& SUPPORTED_10000baseR_FEC
)) {
1837 netdev
->supported
|= NETDEV_F_10GB_FD
;
1839 if ((ecmd
.supported
& SUPPORTED_40000baseKR4_Full
) ||
1840 (ecmd
.supported
& SUPPORTED_40000baseCR4_Full
) ||
1841 (ecmd
.supported
& SUPPORTED_40000baseSR4_Full
) ||
1842 (ecmd
.supported
& SUPPORTED_40000baseLR4_Full
)) {
1843 netdev
->supported
|= NETDEV_F_40GB_FD
;
1845 if (ecmd
.supported
& SUPPORTED_TP
) {
1846 netdev
->supported
|= NETDEV_F_COPPER
;
1848 if (ecmd
.supported
& SUPPORTED_FIBRE
) {
1849 netdev
->supported
|= NETDEV_F_FIBER
;
1851 if (ecmd
.supported
& SUPPORTED_Autoneg
) {
1852 netdev
->supported
|= NETDEV_F_AUTONEG
;
1854 if (ecmd
.supported
& SUPPORTED_Pause
) {
1855 netdev
->supported
|= NETDEV_F_PAUSE
;
1857 if (ecmd
.supported
& SUPPORTED_Asym_Pause
) {
1858 netdev
->supported
|= NETDEV_F_PAUSE_ASYM
;
1861 /* Advertised features. */
1862 netdev
->advertised
= 0;
1863 if (ecmd
.advertising
& ADVERTISED_10baseT_Half
) {
1864 netdev
->advertised
|= NETDEV_F_10MB_HD
;
1866 if (ecmd
.advertising
& ADVERTISED_10baseT_Full
) {
1867 netdev
->advertised
|= NETDEV_F_10MB_FD
;
1869 if (ecmd
.advertising
& ADVERTISED_100baseT_Half
) {
1870 netdev
->advertised
|= NETDEV_F_100MB_HD
;
1872 if (ecmd
.advertising
& ADVERTISED_100baseT_Full
) {
1873 netdev
->advertised
|= NETDEV_F_100MB_FD
;
1875 if (ecmd
.advertising
& ADVERTISED_1000baseT_Half
) {
1876 netdev
->advertised
|= NETDEV_F_1GB_HD
;
1878 if ((ecmd
.advertising
& ADVERTISED_1000baseT_Full
) ||
1879 (ecmd
.advertising
& ADVERTISED_1000baseKX_Full
)) {
1880 netdev
->advertised
|= NETDEV_F_1GB_FD
;
1882 if ((ecmd
.advertising
& ADVERTISED_10000baseT_Full
) ||
1883 (ecmd
.advertising
& ADVERTISED_10000baseKX4_Full
) ||
1884 (ecmd
.advertising
& ADVERTISED_10000baseKR_Full
) ||
1885 (ecmd
.advertising
& ADVERTISED_10000baseR_FEC
)) {
1886 netdev
->advertised
|= NETDEV_F_10GB_FD
;
1888 if ((ecmd
.advertising
& ADVERTISED_40000baseKR4_Full
) ||
1889 (ecmd
.advertising
& ADVERTISED_40000baseCR4_Full
) ||
1890 (ecmd
.advertising
& ADVERTISED_40000baseSR4_Full
) ||
1891 (ecmd
.advertising
& ADVERTISED_40000baseLR4_Full
)) {
1892 netdev
->advertised
|= NETDEV_F_40GB_FD
;
1894 if (ecmd
.advertising
& ADVERTISED_TP
) {
1895 netdev
->advertised
|= NETDEV_F_COPPER
;
1897 if (ecmd
.advertising
& ADVERTISED_FIBRE
) {
1898 netdev
->advertised
|= NETDEV_F_FIBER
;
1900 if (ecmd
.advertising
& ADVERTISED_Autoneg
) {
1901 netdev
->advertised
|= NETDEV_F_AUTONEG
;
1903 if (ecmd
.advertising
& ADVERTISED_Pause
) {
1904 netdev
->advertised
|= NETDEV_F_PAUSE
;
1906 if (ecmd
.advertising
& ADVERTISED_Asym_Pause
) {
1907 netdev
->advertised
|= NETDEV_F_PAUSE_ASYM
;
1910 /* Current settings. */
1911 speed
= ethtool_cmd_speed(&ecmd
);
1912 if (speed
== SPEED_10
) {
1913 netdev
->current
= ecmd
.duplex
? NETDEV_F_10MB_FD
: NETDEV_F_10MB_HD
;
1914 } else if (speed
== SPEED_100
) {
1915 netdev
->current
= ecmd
.duplex
? NETDEV_F_100MB_FD
: NETDEV_F_100MB_HD
;
1916 } else if (speed
== SPEED_1000
) {
1917 netdev
->current
= ecmd
.duplex
? NETDEV_F_1GB_FD
: NETDEV_F_1GB_HD
;
1918 } else if (speed
== SPEED_10000
) {
1919 netdev
->current
= NETDEV_F_10GB_FD
;
1920 } else if (speed
== 40000) {
1921 netdev
->current
= NETDEV_F_40GB_FD
;
1922 } else if (speed
== 100000) {
1923 netdev
->current
= NETDEV_F_100GB_FD
;
1924 } else if (speed
== 1000000) {
1925 netdev
->current
= NETDEV_F_1TB_FD
;
1927 netdev
->current
= 0;
1930 if (ecmd
.port
== PORT_TP
) {
1931 netdev
->current
|= NETDEV_F_COPPER
;
1932 } else if (ecmd
.port
== PORT_FIBRE
) {
1933 netdev
->current
|= NETDEV_F_FIBER
;
1937 netdev
->current
|= NETDEV_F_AUTONEG
;
1941 netdev
->cache_valid
|= VALID_FEATURES
;
1942 netdev
->get_features_error
= error
;
1945 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1946 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1947 * Returns 0 if successful, otherwise a positive errno value. */
1949 netdev_linux_get_features(const struct netdev
*netdev_
,
1950 enum netdev_features
*current
,
1951 enum netdev_features
*advertised
,
1952 enum netdev_features
*supported
,
1953 enum netdev_features
*peer
)
1955 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1958 ovs_mutex_lock(&netdev
->mutex
);
1959 netdev_linux_read_features(netdev
);
1960 if (!netdev
->get_features_error
) {
1961 *current
= netdev
->current
;
1962 *advertised
= netdev
->advertised
;
1963 *supported
= netdev
->supported
;
1964 *peer
= 0; /* XXX */
1966 error
= netdev
->get_features_error
;
1967 ovs_mutex_unlock(&netdev
->mutex
);
1972 /* Set the features advertised by 'netdev' to 'advertise'. */
1974 netdev_linux_set_advertisements(struct netdev
*netdev_
,
1975 enum netdev_features advertise
)
1977 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1978 struct ethtool_cmd ecmd
;
1981 ovs_mutex_lock(&netdev
->mutex
);
1983 COVERAGE_INC(netdev_get_ethtool
);
1984 memset(&ecmd
, 0, sizeof ecmd
);
1985 error
= netdev_linux_do_ethtool(netdev_get_name(netdev_
), &ecmd
,
1986 ETHTOOL_GSET
, "ETHTOOL_GSET");
1991 ecmd
.advertising
= 0;
1992 if (advertise
& NETDEV_F_10MB_HD
) {
1993 ecmd
.advertising
|= ADVERTISED_10baseT_Half
;
1995 if (advertise
& NETDEV_F_10MB_FD
) {
1996 ecmd
.advertising
|= ADVERTISED_10baseT_Full
;
1998 if (advertise
& NETDEV_F_100MB_HD
) {
1999 ecmd
.advertising
|= ADVERTISED_100baseT_Half
;
2001 if (advertise
& NETDEV_F_100MB_FD
) {
2002 ecmd
.advertising
|= ADVERTISED_100baseT_Full
;
2004 if (advertise
& NETDEV_F_1GB_HD
) {
2005 ecmd
.advertising
|= ADVERTISED_1000baseT_Half
;
2007 if (advertise
& NETDEV_F_1GB_FD
) {
2008 ecmd
.advertising
|= ADVERTISED_1000baseT_Full
;
2010 if (advertise
& NETDEV_F_10GB_FD
) {
2011 ecmd
.advertising
|= ADVERTISED_10000baseT_Full
;
2013 if (advertise
& NETDEV_F_COPPER
) {
2014 ecmd
.advertising
|= ADVERTISED_TP
;
2016 if (advertise
& NETDEV_F_FIBER
) {
2017 ecmd
.advertising
|= ADVERTISED_FIBRE
;
2019 if (advertise
& NETDEV_F_AUTONEG
) {
2020 ecmd
.advertising
|= ADVERTISED_Autoneg
;
2022 if (advertise
& NETDEV_F_PAUSE
) {
2023 ecmd
.advertising
|= ADVERTISED_Pause
;
2025 if (advertise
& NETDEV_F_PAUSE_ASYM
) {
2026 ecmd
.advertising
|= ADVERTISED_Asym_Pause
;
2028 COVERAGE_INC(netdev_set_ethtool
);
2029 error
= netdev_linux_do_ethtool(netdev_get_name(netdev_
), &ecmd
,
2030 ETHTOOL_SSET
, "ETHTOOL_SSET");
2033 ovs_mutex_unlock(&netdev
->mutex
);
2037 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
2038 * successful, otherwise a positive errno value. */
2040 netdev_linux_set_policing(struct netdev
*netdev_
,
2041 uint32_t kbits_rate
, uint32_t kbits_burst
)
2043 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2044 const char *netdev_name
= netdev_get_name(netdev_
);
2047 kbits_burst
= (!kbits_rate
? 0 /* Force to 0 if no rate specified. */
2048 : !kbits_burst
? 1000 /* Default to 1000 kbits if 0. */
2049 : kbits_burst
); /* Stick with user-specified value. */
2051 ovs_mutex_lock(&netdev
->mutex
);
2052 if (netdev
->cache_valid
& VALID_POLICING
) {
2053 error
= netdev
->netdev_policing_error
;
2054 if (error
|| (netdev
->kbits_rate
== kbits_rate
&&
2055 netdev
->kbits_burst
== kbits_burst
)) {
2056 /* Assume that settings haven't changed since we last set them. */
2059 netdev
->cache_valid
&= ~VALID_POLICING
;
2062 COVERAGE_INC(netdev_set_policing
);
2063 /* Remove any existing ingress qdisc. */
2064 error
= tc_add_del_ingress_qdisc(netdev_
, false);
2066 VLOG_WARN_RL(&rl
, "%s: removing policing failed: %s",
2067 netdev_name
, ovs_strerror(error
));
2072 error
= tc_add_del_ingress_qdisc(netdev_
, true);
2074 VLOG_WARN_RL(&rl
, "%s: adding policing qdisc failed: %s",
2075 netdev_name
, ovs_strerror(error
));
2079 error
= tc_add_policer(netdev_
, kbits_rate
, kbits_burst
);
2081 VLOG_WARN_RL(&rl
, "%s: adding policing action failed: %s",
2082 netdev_name
, ovs_strerror(error
));
2087 netdev
->kbits_rate
= kbits_rate
;
2088 netdev
->kbits_burst
= kbits_burst
;
2091 if (!error
|| error
== ENODEV
) {
2092 netdev
->netdev_policing_error
= error
;
2093 netdev
->cache_valid
|= VALID_POLICING
;
2095 ovs_mutex_unlock(&netdev
->mutex
);
2100 netdev_linux_get_qos_types(const struct netdev
*netdev OVS_UNUSED
,
2103 const struct tc_ops
*const *opsp
;
2105 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2106 const struct tc_ops
*ops
= *opsp
;
2107 if (ops
->tc_install
&& ops
->ovs_name
[0] != '\0') {
2108 sset_add(types
, ops
->ovs_name
);
2114 static const struct tc_ops
*
2115 tc_lookup_ovs_name(const char *name
)
2117 const struct tc_ops
*const *opsp
;
2119 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2120 const struct tc_ops
*ops
= *opsp
;
2121 if (!strcmp(name
, ops
->ovs_name
)) {
2128 static const struct tc_ops
*
2129 tc_lookup_linux_name(const char *name
)
2131 const struct tc_ops
*const *opsp
;
2133 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2134 const struct tc_ops
*ops
= *opsp
;
2135 if (ops
->linux_name
&& !strcmp(name
, ops
->linux_name
)) {
2142 static struct tc_queue
*
2143 tc_find_queue__(const struct netdev
*netdev_
, unsigned int queue_id
,
2146 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2147 struct tc_queue
*queue
;
2149 HMAP_FOR_EACH_IN_BUCKET (queue
, hmap_node
, hash
, &netdev
->tc
->queues
) {
2150 if (queue
->queue_id
== queue_id
) {
/* Convenience wrapper around tc_find_queue__() that hashes 'queue_id'. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
}
2164 netdev_linux_get_qos_capabilities(const struct netdev
*netdev OVS_UNUSED
,
2166 struct netdev_qos_capabilities
*caps
)
2168 const struct tc_ops
*ops
= tc_lookup_ovs_name(type
);
2172 caps
->n_queues
= ops
->n_queues
;
2177 netdev_linux_get_qos(const struct netdev
*netdev_
,
2178 const char **typep
, struct smap
*details
)
2180 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2183 ovs_mutex_lock(&netdev
->mutex
);
2184 error
= tc_query_qdisc(netdev_
);
2186 *typep
= netdev
->tc
->ops
->ovs_name
;
2187 error
= (netdev
->tc
->ops
->qdisc_get
2188 ? netdev
->tc
->ops
->qdisc_get(netdev_
, details
)
2191 ovs_mutex_unlock(&netdev
->mutex
);
2197 netdev_linux_set_qos(struct netdev
*netdev_
,
2198 const char *type
, const struct smap
*details
)
2200 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2201 const struct tc_ops
*new_ops
;
2204 new_ops
= tc_lookup_ovs_name(type
);
2205 if (!new_ops
|| !new_ops
->tc_install
) {
2209 ovs_mutex_lock(&netdev
->mutex
);
2210 error
= tc_query_qdisc(netdev_
);
2215 if (new_ops
== netdev
->tc
->ops
) {
2216 error
= new_ops
->qdisc_set
? new_ops
->qdisc_set(netdev_
, details
) : 0;
2218 /* Delete existing qdisc. */
2219 error
= tc_del_qdisc(netdev_
);
2223 ovs_assert(netdev
->tc
== NULL
);
2225 /* Install new qdisc. */
2226 error
= new_ops
->tc_install(netdev_
, details
);
2227 ovs_assert((error
== 0) == (netdev
->tc
!= NULL
));
2231 ovs_mutex_unlock(&netdev
->mutex
);
2236 netdev_linux_get_queue(const struct netdev
*netdev_
,
2237 unsigned int queue_id
, struct smap
*details
)
2239 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2242 ovs_mutex_lock(&netdev
->mutex
);
2243 error
= tc_query_qdisc(netdev_
);
2245 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2247 ? netdev
->tc
->ops
->class_get(netdev_
, queue
, details
)
2250 ovs_mutex_unlock(&netdev
->mutex
);
2256 netdev_linux_set_queue(struct netdev
*netdev_
,
2257 unsigned int queue_id
, const struct smap
*details
)
2259 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2262 ovs_mutex_lock(&netdev
->mutex
);
2263 error
= tc_query_qdisc(netdev_
);
2265 error
= (queue_id
< netdev
->tc
->ops
->n_queues
2266 && netdev
->tc
->ops
->class_set
2267 ? netdev
->tc
->ops
->class_set(netdev_
, queue_id
, details
)
2270 ovs_mutex_unlock(&netdev
->mutex
);
2276 netdev_linux_delete_queue(struct netdev
*netdev_
, unsigned int queue_id
)
2278 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2281 ovs_mutex_lock(&netdev
->mutex
);
2282 error
= tc_query_qdisc(netdev_
);
2284 if (netdev
->tc
->ops
->class_delete
) {
2285 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2287 ? netdev
->tc
->ops
->class_delete(netdev_
, queue
)
2293 ovs_mutex_unlock(&netdev
->mutex
);
2299 netdev_linux_get_queue_stats(const struct netdev
*netdev_
,
2300 unsigned int queue_id
,
2301 struct netdev_queue_stats
*stats
)
2303 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2306 ovs_mutex_lock(&netdev
->mutex
);
2307 error
= tc_query_qdisc(netdev_
);
2309 if (netdev
->tc
->ops
->class_get_stats
) {
2310 const struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2312 stats
->created
= queue
->created
;
2313 error
= netdev
->tc
->ops
->class_get_stats(netdev_
, queue
,
2322 ovs_mutex_unlock(&netdev
->mutex
);
2327 struct queue_dump_state
{
2328 struct nl_dump dump
;
2333 start_queue_dump(const struct netdev
*netdev
, struct queue_dump_state
*state
)
2335 struct ofpbuf request
;
2336 struct tcmsg
*tcmsg
;
2338 tcmsg
= tc_make_request(netdev
, RTM_GETTCLASS
, 0, &request
);
2342 tcmsg
->tcm_parent
= 0;
2343 nl_dump_start(&state
->dump
, NETLINK_ROUTE
, &request
);
2344 ofpbuf_uninit(&request
);
2346 ofpbuf_init(&state
->buf
, NL_DUMP_BUFSIZE
);
2351 finish_queue_dump(struct queue_dump_state
*state
)
2353 ofpbuf_uninit(&state
->buf
);
2354 return nl_dump_done(&state
->dump
);
2357 struct netdev_linux_queue_state
{
2358 unsigned int *queues
;
2364 netdev_linux_queue_dump_start(const struct netdev
*netdev_
, void **statep
)
2366 const struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2369 ovs_mutex_lock(&netdev
->mutex
);
2370 error
= tc_query_qdisc(netdev_
);
2372 if (netdev
->tc
->ops
->class_get
) {
2373 struct netdev_linux_queue_state
*state
;
2374 struct tc_queue
*queue
;
2377 *statep
= state
= xmalloc(sizeof *state
);
2378 state
->n_queues
= hmap_count(&netdev
->tc
->queues
);
2379 state
->cur_queue
= 0;
2380 state
->queues
= xmalloc(state
->n_queues
* sizeof *state
->queues
);
2383 HMAP_FOR_EACH (queue
, hmap_node
, &netdev
->tc
->queues
) {
2384 state
->queues
[i
++] = queue
->queue_id
;
2390 ovs_mutex_unlock(&netdev
->mutex
);
2396 netdev_linux_queue_dump_next(const struct netdev
*netdev_
, void *state_
,
2397 unsigned int *queue_idp
, struct smap
*details
)
2399 const struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2400 struct netdev_linux_queue_state
*state
= state_
;
2403 ovs_mutex_lock(&netdev
->mutex
);
2404 while (state
->cur_queue
< state
->n_queues
) {
2405 unsigned int queue_id
= state
->queues
[state
->cur_queue
++];
2406 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2409 *queue_idp
= queue_id
;
2410 error
= netdev
->tc
->ops
->class_get(netdev_
, queue
, details
);
2414 ovs_mutex_unlock(&netdev
->mutex
);
2420 netdev_linux_queue_dump_done(const struct netdev
*netdev OVS_UNUSED
,
2423 struct netdev_linux_queue_state
*state
= state_
;
2425 free(state
->queues
);
2431 netdev_linux_dump_queue_stats(const struct netdev
*netdev_
,
2432 netdev_dump_queue_stats_cb
*cb
, void *aux
)
2434 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2437 ovs_mutex_lock(&netdev
->mutex
);
2438 error
= tc_query_qdisc(netdev_
);
2440 struct queue_dump_state state
;
2442 if (!netdev
->tc
->ops
->class_dump_stats
) {
2444 } else if (!start_queue_dump(netdev_
, &state
)) {
2450 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
2451 retval
= netdev
->tc
->ops
->class_dump_stats(netdev_
, &msg
,
2458 retval
= finish_queue_dump(&state
);
2464 ovs_mutex_unlock(&netdev
->mutex
);
2470 netdev_linux_set_in4(struct netdev
*netdev_
, struct in_addr address
,
2471 struct in_addr netmask
)
2473 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2476 ovs_mutex_lock(&netdev
->mutex
);
2477 error
= do_set_addr(netdev_
, SIOCSIFADDR
, "SIOCSIFADDR", address
);
2479 if (address
.s_addr
!= INADDR_ANY
) {
2480 error
= do_set_addr(netdev_
, SIOCSIFNETMASK
,
2481 "SIOCSIFNETMASK", netmask
);
2485 ovs_mutex_unlock(&netdev
->mutex
);
2490 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address.
2491 * Otherwise, sets '*in6' to 'in6addr_any' and returns the corresponding
2494 netdev_linux_get_addr_list(const struct netdev
*netdev_
,
2495 struct in6_addr
**addr
, struct in6_addr
**mask
, int *n_cnt
)
2497 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2500 ovs_mutex_lock(&netdev
->mutex
);
2501 error
= netdev_get_addrs(netdev_get_name(netdev_
), addr
, mask
, n_cnt
);
2502 ovs_mutex_unlock(&netdev
->mutex
);
/* Writes an AF_INET socket address holding IPv4 address 'addr' (port 0)
 * into '*sa', zeroing every byte of '*sa' first so that any tail bytes
 * beyond the sockaddr_in portion are well defined. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in in4;

    /* Zero the destination up front; the copy below only covers the
     * sockaddr_in prefix of '*sa'. */
    memset(sa, 0, sizeof *sa);

    memset(&in4, 0, sizeof in4);
    in4.sin_family = AF_INET;
    in4.sin_addr = addr;        /* sin_port stays 0 from the memset. */

    memcpy(sa, &in4, sizeof in4);
}
2521 do_set_addr(struct netdev
*netdev
,
2522 int ioctl_nr
, const char *ioctl_name
, struct in_addr addr
)
2526 make_in4_sockaddr(&ifr
.ifr_addr
, addr
);
2527 return af_inet_ifreq_ioctl(netdev_get_name(netdev
), &ifr
, ioctl_nr
,
2531 /* Adds 'router' as a default IP gateway. */
2533 netdev_linux_add_router(struct netdev
*netdev OVS_UNUSED
, struct in_addr router
)
2535 struct in_addr any
= { INADDR_ANY
};
2539 memset(&rt
, 0, sizeof rt
);
2540 make_in4_sockaddr(&rt
.rt_dst
, any
);
2541 make_in4_sockaddr(&rt
.rt_gateway
, router
);
2542 make_in4_sockaddr(&rt
.rt_genmask
, any
);
2543 rt
.rt_flags
= RTF_UP
| RTF_GATEWAY
;
2544 error
= af_inet_ioctl(SIOCADDRT
, &rt
);
2546 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error
));
2552 netdev_linux_get_next_hop(const struct in_addr
*host
, struct in_addr
*next_hop
,
2555 static const char fn
[] = "/proc/net/route";
2560 *netdev_name
= NULL
;
2561 stream
= fopen(fn
, "r");
2562 if (stream
== NULL
) {
2563 VLOG_WARN_RL(&rl
, "%s: open failed: %s", fn
, ovs_strerror(errno
));
2568 while (fgets(line
, sizeof line
, stream
)) {
2571 ovs_be32 dest
, gateway
, mask
;
2572 int refcnt
, metric
, mtu
;
2573 unsigned int flags
, use
, window
, irtt
;
2576 "%16s %"SCNx32
" %"SCNx32
" %04X %d %u %d %"SCNx32
2578 iface
, &dest
, &gateway
, &flags
, &refcnt
,
2579 &use
, &metric
, &mask
, &mtu
, &window
, &irtt
)) {
2580 VLOG_WARN_RL(&rl
, "%s: could not parse line %d: %s",
2584 if (!(flags
& RTF_UP
)) {
2585 /* Skip routes that aren't up. */
2589 /* The output of 'dest', 'mask', and 'gateway' were given in
2590 * network byte order, so we don't need need any endian
2591 * conversions here. */
2592 if ((dest
& mask
) == (host
->s_addr
& mask
)) {
2594 /* The host is directly reachable. */
2595 next_hop
->s_addr
= 0;
2597 /* To reach the host, we must go through a gateway. */
2598 next_hop
->s_addr
= gateway
;
2600 *netdev_name
= xstrdup(iface
);
2612 netdev_linux_get_status(const struct netdev
*netdev_
, struct smap
*smap
)
2614 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2617 ovs_mutex_lock(&netdev
->mutex
);
2618 if (!(netdev
->cache_valid
& VALID_DRVINFO
)) {
2619 struct ethtool_cmd
*cmd
= (struct ethtool_cmd
*) &netdev
->drvinfo
;
2621 COVERAGE_INC(netdev_get_ethtool
);
2622 memset(&netdev
->drvinfo
, 0, sizeof netdev
->drvinfo
);
2623 error
= netdev_linux_do_ethtool(netdev
->up
.name
,
2626 "ETHTOOL_GDRVINFO");
2628 netdev
->cache_valid
|= VALID_DRVINFO
;
2633 smap_add(smap
, "driver_name", netdev
->drvinfo
.driver
);
2634 smap_add(smap
, "driver_version", netdev
->drvinfo
.version
);
2635 smap_add(smap
, "firmware_version", netdev
->drvinfo
.fw_version
);
2637 ovs_mutex_unlock(&netdev
->mutex
);
2643 netdev_internal_get_status(const struct netdev
*netdev OVS_UNUSED
,
2646 smap_add(smap
, "driver_name", "openvswitch");
2650 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2651 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2652 * returns 0. Otherwise, it returns a positive errno value; in particular,
2653 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2655 netdev_linux_arp_lookup(const struct netdev
*netdev
,
2656 ovs_be32 ip
, struct eth_addr
*mac
)
2659 struct sockaddr_in sin
;
2662 memset(&r
, 0, sizeof r
);
2663 memset(&sin
, 0, sizeof sin
);
2664 sin
.sin_family
= AF_INET
;
2665 sin
.sin_addr
.s_addr
= ip
;
2667 memcpy(&r
.arp_pa
, &sin
, sizeof sin
);
2668 r
.arp_ha
.sa_family
= ARPHRD_ETHER
;
2670 ovs_strzcpy(r
.arp_dev
, netdev_get_name(netdev
), sizeof r
.arp_dev
);
2671 COVERAGE_INC(netdev_arp_lookup
);
2672 retval
= af_inet_ioctl(SIOCGARP
, &r
);
2674 memcpy(mac
, r
.arp_ha
.sa_data
, ETH_ADDR_LEN
);
2675 } else if (retval
!= ENXIO
) {
2676 VLOG_WARN_RL(&rl
, "%s: could not look up ARP entry for "IP_FMT
": %s",
2677 netdev_get_name(netdev
), IP_ARGS(ip
),
2678 ovs_strerror(retval
));
/* nd_to_iff_flags(): maps NETDEV_* flag bits onto the kernel's IFF_*
 * interface-flag bits.  Visible here: NETDEV_UP, NETDEV_PROMISC and
 * NETDEV_LOOPBACK are each tested, and NETDEV_LOOPBACK sets IFF_LOOPBACK.
 * NOTE(review): this chunk is extraction-damaged -- the lines that
 * initialize the accumulator, set IFF_UP / IFF_PROMISC, and return the
 * result were dropped; presumably each test ORs in the matching IFF_*
 * bit (mirroring iff_to_nd_flags() below) -- confirm against upstream. */
2684 nd_to_iff_flags(enum netdev_flags nd
)
2687 if (nd
& NETDEV_UP
) {
2690 if (nd
& NETDEV_PROMISC
) {
2693 if (nd
& NETDEV_LOOPBACK
) {
2694 iff
|= IFF_LOOPBACK
;
/* iff_to_nd_flags(): inverse of nd_to_iff_flags() -- translates kernel
 * IFF_* interface-flag bits into the NETDEV_* enum.  Visible here:
 * 'nd' starts at 0, IFF_PROMISC maps to NETDEV_PROMISC and IFF_LOOPBACK
 * to NETDEV_LOOPBACK.
 * NOTE(review): extraction dropped the IFF_UP branch and the final
 * 'return nd;' -- presumably IFF_UP maps to NETDEV_UP; confirm against
 * the upstream file. */
2700 iff_to_nd_flags(int iff
)
2702 enum netdev_flags nd
= 0;
2706 if (iff
& IFF_PROMISC
) {
2707 nd
|= NETDEV_PROMISC
;
2709 if (iff
& IFF_LOOPBACK
) {
2710 nd
|= NETDEV_LOOPBACK
;
2716 update_flags(struct netdev_linux
*netdev
, enum netdev_flags off
,
2717 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
2718 OVS_REQUIRES(netdev
->mutex
)
2720 int old_flags
, new_flags
;
2723 old_flags
= netdev
->ifi_flags
;
2724 *old_flagsp
= iff_to_nd_flags(old_flags
);
2725 new_flags
= (old_flags
& ~nd_to_iff_flags(off
)) | nd_to_iff_flags(on
);
2726 if (new_flags
!= old_flags
) {
2727 error
= set_flags(netdev_get_name(&netdev
->up
), new_flags
);
2728 get_flags(&netdev
->up
, &netdev
->ifi_flags
);
2735 netdev_linux_update_flags(struct netdev
*netdev_
, enum netdev_flags off
,
2736 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
2738 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2741 ovs_mutex_lock(&netdev
->mutex
);
2742 error
= update_flags(netdev
, off
, on
, old_flagsp
);
2743 ovs_mutex_unlock(&netdev
->mutex
);
2748 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, \
2749 GET_FEATURES, GET_STATUS) \
2752 false, /* is_pmd */ \
2756 netdev_linux_wait, \
2758 netdev_linux_alloc, \
2760 netdev_linux_destruct, \
2761 netdev_linux_dealloc, \
2762 NULL, /* get_config */ \
2763 NULL, /* set_config */ \
2764 NULL, /* get_tunnel_config */ \
2765 NULL, /* build header */ \
2766 NULL, /* push header */ \
2767 NULL, /* pop header */ \
2768 NULL, /* get_numa_id */ \
2769 NULL, /* set_multiq */ \
2771 netdev_linux_send, \
2772 netdev_linux_send_wait, \
2774 netdev_linux_set_etheraddr, \
2775 netdev_linux_get_etheraddr, \
2776 netdev_linux_get_mtu, \
2777 netdev_linux_set_mtu, \
2778 netdev_linux_get_ifindex, \
2779 netdev_linux_get_carrier, \
2780 netdev_linux_get_carrier_resets, \
2781 netdev_linux_set_miimon_interval, \
2785 netdev_linux_set_advertisements, \
2787 netdev_linux_set_policing, \
2788 netdev_linux_get_qos_types, \
2789 netdev_linux_get_qos_capabilities, \
2790 netdev_linux_get_qos, \
2791 netdev_linux_set_qos, \
2792 netdev_linux_get_queue, \
2793 netdev_linux_set_queue, \
2794 netdev_linux_delete_queue, \
2795 netdev_linux_get_queue_stats, \
2796 netdev_linux_queue_dump_start, \
2797 netdev_linux_queue_dump_next, \
2798 netdev_linux_queue_dump_done, \
2799 netdev_linux_dump_queue_stats, \
2801 netdev_linux_set_in4, \
2802 netdev_linux_get_addr_list, \
2803 netdev_linux_add_router, \
2804 netdev_linux_get_next_hop, \
2806 netdev_linux_arp_lookup, \
2808 netdev_linux_update_flags, \
2810 netdev_linux_rxq_alloc, \
2811 netdev_linux_rxq_construct, \
2812 netdev_linux_rxq_destruct, \
2813 netdev_linux_rxq_dealloc, \
2814 netdev_linux_rxq_recv, \
2815 netdev_linux_rxq_wait, \
2816 netdev_linux_rxq_drain, \
2819 const struct netdev_class netdev_linux_class
=
2822 netdev_linux_construct
,
2823 netdev_linux_get_stats
,
2824 netdev_linux_get_features
,
2825 netdev_linux_get_status
);
2827 const struct netdev_class netdev_tap_class
=
2830 netdev_linux_construct_tap
,
2831 netdev_tap_get_stats
,
2832 netdev_linux_get_features
,
2833 netdev_linux_get_status
);
2835 const struct netdev_class netdev_internal_class
=
2838 netdev_linux_construct
,
2839 netdev_internal_get_stats
,
2840 NULL
, /* get_features */
2841 netdev_internal_get_status
);
2844 #define CODEL_N_QUEUES 0x0000
2846 /* In sufficiently new kernel headers these are defined as enums in
2847 * <linux/pkt_sched.h>. Define them here as macros to help out with older
2848 * kernels. (This overrides any enum definition in the header file but that's
2850 #define TCA_CODEL_TARGET 1
2851 #define TCA_CODEL_LIMIT 2
2852 #define TCA_CODEL_INTERVAL 3
2861 static struct codel
*
2862 codel_get__(const struct netdev
*netdev_
)
2864 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2865 return CONTAINER_OF(netdev
->tc
, struct codel
, tc
);
2869 codel_install__(struct netdev
*netdev_
, uint32_t target
, uint32_t limit
,
2872 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2873 struct codel
*codel
;
2875 codel
= xmalloc(sizeof *codel
);
2876 tc_init(&codel
->tc
, &tc_ops_codel
);
2877 codel
->target
= target
;
2878 codel
->limit
= limit
;
2879 codel
->interval
= interval
;
2881 netdev
->tc
= &codel
->tc
;
2885 codel_setup_qdisc__(struct netdev
*netdev
, uint32_t target
, uint32_t limit
,
2889 struct ofpbuf request
;
2890 struct tcmsg
*tcmsg
;
2891 uint32_t otarget
, olimit
, ointerval
;
2894 tc_del_qdisc(netdev
);
2896 tcmsg
= tc_make_request(netdev
, RTM_NEWQDISC
,
2897 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
2901 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
2902 tcmsg
->tcm_parent
= TC_H_ROOT
;
2904 otarget
= target
? target
: 5000;
2905 olimit
= limit
? limit
: 10240;
2906 ointerval
= interval
? interval
: 100000;
2908 nl_msg_put_string(&request
, TCA_KIND
, "codel");
2909 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
2910 nl_msg_put_u32(&request
, TCA_CODEL_TARGET
, otarget
);
2911 nl_msg_put_u32(&request
, TCA_CODEL_LIMIT
, olimit
);
2912 nl_msg_put_u32(&request
, TCA_CODEL_INTERVAL
, ointerval
);
2913 nl_msg_end_nested(&request
, opt_offset
);
2915 error
= tc_transact(&request
, NULL
);
2917 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
2918 "target %u, limit %u, interval %u error %d(%s)",
2919 netdev_get_name(netdev
),
2920 otarget
, olimit
, ointerval
,
2921 error
, ovs_strerror(error
));
2927 codel_parse_qdisc_details__(struct netdev
*netdev OVS_UNUSED
,
2928 const struct smap
*details
, struct codel
*codel
)
2930 const char *target_s
;
2931 const char *limit_s
;
2932 const char *interval_s
;
2934 target_s
= smap_get(details
, "target");
2935 limit_s
= smap_get(details
, "limit");
2936 interval_s
= smap_get(details
, "interval");
2938 codel
->target
= target_s
? strtoull(target_s
, NULL
, 10) : 0;
2939 codel
->limit
= limit_s
? strtoull(limit_s
, NULL
, 10) : 0;
2940 codel
->interval
= interval_s
? strtoull(interval_s
, NULL
, 10) : 0;
2942 if (!codel
->target
) {
2943 codel
->target
= 5000;
2945 if (!codel
->limit
) {
2946 codel
->limit
= 10240;
2948 if (!codel
->interval
) {
2949 codel
->interval
= 100000;
2954 codel_tc_install(struct netdev
*netdev
, const struct smap
*details
)
2959 codel_parse_qdisc_details__(netdev
, details
, &codel
);
2960 error
= codel_setup_qdisc__(netdev
, codel
.target
, codel
.limit
,
2963 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
2969 codel_parse_tca_options__(struct nlattr
*nl_options
, struct codel
*codel
)
2971 static const struct nl_policy tca_codel_policy
[] = {
2972 [TCA_CODEL_TARGET
] = { .type
= NL_A_U32
},
2973 [TCA_CODEL_LIMIT
] = { .type
= NL_A_U32
},
2974 [TCA_CODEL_INTERVAL
] = { .type
= NL_A_U32
}
2977 struct nlattr
*attrs
[ARRAY_SIZE(tca_codel_policy
)];
2979 if (!nl_parse_nested(nl_options
, tca_codel_policy
,
2980 attrs
, ARRAY_SIZE(tca_codel_policy
))) {
2981 VLOG_WARN_RL(&rl
, "failed to parse CoDel class options");
2985 codel
->target
= nl_attr_get_u32(attrs
[TCA_CODEL_TARGET
]);
2986 codel
->limit
= nl_attr_get_u32(attrs
[TCA_CODEL_LIMIT
]);
2987 codel
->interval
= nl_attr_get_u32(attrs
[TCA_CODEL_INTERVAL
]);
2992 codel_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
2994 struct nlattr
*nlattr
;
2999 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
3004 error
= codel_parse_tca_options__(nlattr
, &codel
);
3009 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
3015 codel_tc_destroy(struct tc
*tc
)
3017 struct codel
*codel
= CONTAINER_OF(tc
, struct codel
, tc
);
3023 codel_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
3025 const struct codel
*codel
= codel_get__(netdev
);
3026 smap_add_format(details
, "target", "%u", codel
->target
);
3027 smap_add_format(details
, "limit", "%u", codel
->limit
);
3028 smap_add_format(details
, "interval", "%u", codel
->interval
);
3033 codel_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
3037 codel_parse_qdisc_details__(netdev
, details
, &codel
);
3038 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
3039 codel_get__(netdev
)->target
= codel
.target
;
3040 codel_get__(netdev
)->limit
= codel
.limit
;
3041 codel_get__(netdev
)->interval
= codel
.interval
;
3045 static const struct tc_ops tc_ops_codel
= {
3046 "codel", /* linux_name */
3047 "linux-codel", /* ovs_name */
3048 CODEL_N_QUEUES
, /* n_queues */
3061 /* FQ-CoDel traffic control class. */
3063 #define FQCODEL_N_QUEUES 0x0000
3065 /* In sufficiently new kernel headers these are defined as enums in
3066 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3067 * kernels. (This overrides any enum definition in the header file but that's
3069 #define TCA_FQ_CODEL_TARGET 1
3070 #define TCA_FQ_CODEL_LIMIT 2
3071 #define TCA_FQ_CODEL_INTERVAL 3
3072 #define TCA_FQ_CODEL_ECN 4
3073 #define TCA_FQ_CODEL_FLOWS 5
3074 #define TCA_FQ_CODEL_QUANTUM 6
3085 static struct fqcodel
*
3086 fqcodel_get__(const struct netdev
*netdev_
)
3088 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3089 return CONTAINER_OF(netdev
->tc
, struct fqcodel
, tc
);
3093 fqcodel_install__(struct netdev
*netdev_
, uint32_t target
, uint32_t limit
,
3094 uint32_t interval
, uint32_t flows
, uint32_t quantum
)
3096 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3097 struct fqcodel
*fqcodel
;
3099 fqcodel
= xmalloc(sizeof *fqcodel
);
3100 tc_init(&fqcodel
->tc
, &tc_ops_fqcodel
);
3101 fqcodel
->target
= target
;
3102 fqcodel
->limit
= limit
;
3103 fqcodel
->interval
= interval
;
3104 fqcodel
->flows
= flows
;
3105 fqcodel
->quantum
= quantum
;
3107 netdev
->tc
= &fqcodel
->tc
;
3111 fqcodel_setup_qdisc__(struct netdev
*netdev
, uint32_t target
, uint32_t limit
,
3112 uint32_t interval
, uint32_t flows
, uint32_t quantum
)
3115 struct ofpbuf request
;
3116 struct tcmsg
*tcmsg
;
3117 uint32_t otarget
, olimit
, ointerval
, oflows
, oquantum
;
3120 tc_del_qdisc(netdev
);
3122 tcmsg
= tc_make_request(netdev
, RTM_NEWQDISC
,
3123 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3127 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3128 tcmsg
->tcm_parent
= TC_H_ROOT
;
3130 otarget
= target
? target
: 5000;
3131 olimit
= limit
? limit
: 10240;
3132 ointerval
= interval
? interval
: 100000;
3133 oflows
= flows
? flows
: 1024;
3134 oquantum
= quantum
? quantum
: 1514; /* fq_codel default quantum is 1514
3137 nl_msg_put_string(&request
, TCA_KIND
, "fq_codel");
3138 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
3139 nl_msg_put_u32(&request
, TCA_FQ_CODEL_TARGET
, otarget
);
3140 nl_msg_put_u32(&request
, TCA_FQ_CODEL_LIMIT
, olimit
);
3141 nl_msg_put_u32(&request
, TCA_FQ_CODEL_INTERVAL
, ointerval
);
3142 nl_msg_put_u32(&request
, TCA_FQ_CODEL_FLOWS
, oflows
);
3143 nl_msg_put_u32(&request
, TCA_FQ_CODEL_QUANTUM
, oquantum
);
3144 nl_msg_end_nested(&request
, opt_offset
);
3146 error
= tc_transact(&request
, NULL
);
3148 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
3149 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3150 netdev_get_name(netdev
),
3151 otarget
, olimit
, ointerval
, oflows
, oquantum
,
3152 error
, ovs_strerror(error
));
3158 fqcodel_parse_qdisc_details__(struct netdev
*netdev OVS_UNUSED
,
3159 const struct smap
*details
, struct fqcodel
*fqcodel
)
3161 const char *target_s
;
3162 const char *limit_s
;
3163 const char *interval_s
;
3164 const char *flows_s
;
3165 const char *quantum_s
;
3167 target_s
= smap_get(details
, "target");
3168 limit_s
= smap_get(details
, "limit");
3169 interval_s
= smap_get(details
, "interval");
3170 flows_s
= smap_get(details
, "flows");
3171 quantum_s
= smap_get(details
, "quantum");
3172 fqcodel
->target
= target_s
? strtoull(target_s
, NULL
, 10) : 0;
3173 fqcodel
->limit
= limit_s
? strtoull(limit_s
, NULL
, 10) : 0;
3174 fqcodel
->interval
= interval_s
? strtoull(interval_s
, NULL
, 10) : 0;
3175 fqcodel
->flows
= flows_s
? strtoull(flows_s
, NULL
, 10) : 0;
3176 fqcodel
->quantum
= quantum_s
? strtoull(quantum_s
, NULL
, 10) : 0;
3177 if (!fqcodel
->target
) {
3178 fqcodel
->target
= 5000;
3180 if (!fqcodel
->limit
) {
3181 fqcodel
->limit
= 10240;
3183 if (!fqcodel
->interval
) {
3184 fqcodel
->interval
= 1000000;
3186 if (!fqcodel
->flows
) {
3187 fqcodel
->flows
= 1024;
3189 if (!fqcodel
->quantum
) {
3190 fqcodel
->quantum
= 1514;
3195 fqcodel_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3198 struct fqcodel fqcodel
;
3200 fqcodel_parse_qdisc_details__(netdev
, details
, &fqcodel
);
3201 error
= fqcodel_setup_qdisc__(netdev
, fqcodel
.target
, fqcodel
.limit
,
3202 fqcodel
.interval
, fqcodel
.flows
,
3205 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
,
3206 fqcodel
.interval
, fqcodel
.flows
, fqcodel
.quantum
);
3212 fqcodel_parse_tca_options__(struct nlattr
*nl_options
, struct fqcodel
*fqcodel
)
3214 static const struct nl_policy tca_fqcodel_policy
[] = {
3215 [TCA_FQ_CODEL_TARGET
] = { .type
= NL_A_U32
},
3216 [TCA_FQ_CODEL_LIMIT
] = { .type
= NL_A_U32
},
3217 [TCA_FQ_CODEL_INTERVAL
] = { .type
= NL_A_U32
},
3218 [TCA_FQ_CODEL_FLOWS
] = { .type
= NL_A_U32
},
3219 [TCA_FQ_CODEL_QUANTUM
] = { .type
= NL_A_U32
}
3222 struct nlattr
*attrs
[ARRAY_SIZE(tca_fqcodel_policy
)];
3224 if (!nl_parse_nested(nl_options
, tca_fqcodel_policy
,
3225 attrs
, ARRAY_SIZE(tca_fqcodel_policy
))) {
3226 VLOG_WARN_RL(&rl
, "failed to parse FQ_CoDel class options");
3230 fqcodel
->target
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_TARGET
]);
3231 fqcodel
->limit
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_LIMIT
]);
3232 fqcodel
->interval
=nl_attr_get_u32(attrs
[TCA_FQ_CODEL_INTERVAL
]);
3233 fqcodel
->flows
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_FLOWS
]);
3234 fqcodel
->quantum
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_QUANTUM
]);
3239 fqcodel_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
3241 struct nlattr
*nlattr
;
3244 struct fqcodel fqcodel
;
3246 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
3251 error
= fqcodel_parse_tca_options__(nlattr
, &fqcodel
);
3256 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
, fqcodel
.interval
,
3257 fqcodel
.flows
, fqcodel
.quantum
);
3262 fqcodel_tc_destroy(struct tc
*tc
)
3264 struct fqcodel
*fqcodel
= CONTAINER_OF(tc
, struct fqcodel
, tc
);
3270 fqcodel_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
3272 const struct fqcodel
*fqcodel
= fqcodel_get__(netdev
);
3273 smap_add_format(details
, "target", "%u", fqcodel
->target
);
3274 smap_add_format(details
, "limit", "%u", fqcodel
->limit
);
3275 smap_add_format(details
, "interval", "%u", fqcodel
->interval
);
3276 smap_add_format(details
, "flows", "%u", fqcodel
->flows
);
3277 smap_add_format(details
, "quantum", "%u", fqcodel
->quantum
);
3282 fqcodel_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
3284 struct fqcodel fqcodel
;
3286 fqcodel_parse_qdisc_details__(netdev
, details
, &fqcodel
);
3287 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
, fqcodel
.interval
,
3288 fqcodel
.flows
, fqcodel
.quantum
);
3289 fqcodel_get__(netdev
)->target
= fqcodel
.target
;
3290 fqcodel_get__(netdev
)->limit
= fqcodel
.limit
;
3291 fqcodel_get__(netdev
)->interval
= fqcodel
.interval
;
3292 fqcodel_get__(netdev
)->flows
= fqcodel
.flows
;
3293 fqcodel_get__(netdev
)->quantum
= fqcodel
.quantum
;
3297 static const struct tc_ops tc_ops_fqcodel
= {
3298 "fq_codel", /* linux_name */
3299 "linux-fq_codel", /* ovs_name */
3300 FQCODEL_N_QUEUES
, /* n_queues */
3313 /* SFQ traffic control class. */
3315 #define SFQ_N_QUEUES 0x0000
3324 sfq_get__(const struct netdev
*netdev_
)
3326 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3327 return CONTAINER_OF(netdev
->tc
, struct sfq
, tc
);
3331 sfq_install__(struct netdev
*netdev_
, uint32_t quantum
, uint32_t perturb
)
3333 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3336 sfq
= xmalloc(sizeof *sfq
);
3337 tc_init(&sfq
->tc
, &tc_ops_sfq
);
3338 sfq
->perturb
= perturb
;
3339 sfq
->quantum
= quantum
;
3341 netdev
->tc
= &sfq
->tc
;
3345 sfq_setup_qdisc__(struct netdev
*netdev
, uint32_t quantum
, uint32_t perturb
)
3347 struct tc_sfq_qopt opt
;
3348 struct ofpbuf request
;
3349 struct tcmsg
*tcmsg
;
3351 int mtu_error
, error
;
3352 mtu_error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
3354 tc_del_qdisc(netdev
);
3356 tcmsg
= tc_make_request(netdev
, RTM_NEWQDISC
,
3357 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3361 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3362 tcmsg
->tcm_parent
= TC_H_ROOT
;
3364 memset(&opt
, 0, sizeof opt
);
3367 opt
.quantum
= mtu
; /* if we cannot find mtu, use default */
3370 opt
.quantum
= quantum
;
3374 opt
.perturb_period
= 10;
3376 opt
.perturb_period
= perturb
;
3379 nl_msg_put_string(&request
, TCA_KIND
, "sfq");
3380 nl_msg_put_unspec(&request
, TCA_OPTIONS
, &opt
, sizeof opt
);
3382 error
= tc_transact(&request
, NULL
);
3384 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
3385 "quantum %u, perturb %u error %d(%s)",
3386 netdev_get_name(netdev
),
3387 opt
.quantum
, opt
.perturb_period
,
3388 error
, ovs_strerror(error
));
3394 sfq_parse_qdisc_details__(struct netdev
*netdev
,
3395 const struct smap
*details
, struct sfq
*sfq
)
3397 const char *perturb_s
;
3398 const char *quantum_s
;
3402 perturb_s
= smap_get(details
, "perturb");
3403 quantum_s
= smap_get(details
, "quantum");
3404 sfq
->perturb
= perturb_s
? strtoull(perturb_s
, NULL
, 10) : 0;
3405 sfq
->quantum
= quantum_s
? strtoull(quantum_s
, NULL
, 10) : 0;
3406 if (!sfq
->perturb
) {
3410 if (!sfq
->quantum
) {
3411 mtu_error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
3415 VLOG_WARN_RL(&rl
, "when using SFQ, you must specify quantum on a "
3416 "device without mtu");
3423 sfq_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3428 sfq_parse_qdisc_details__(netdev
, details
, &sfq
);
3429 error
= sfq_setup_qdisc__(netdev
, sfq
.quantum
, sfq
.perturb
);
3431 sfq_install__(netdev
, sfq
.quantum
, sfq
.perturb
);
3437 sfq_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
3439 const struct tc_sfq_qopt
*sfq
;
3440 struct nlattr
*nlattr
;
3444 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
3446 sfq
= nl_attr_get(nlattr
);
3447 sfq_install__(netdev
, sfq
->perturb_period
, sfq
->quantum
);
3455 sfq_tc_destroy(struct tc
*tc
)
3457 struct sfq
*sfq
= CONTAINER_OF(tc
, struct sfq
, tc
);
3463 sfq_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
3465 const struct sfq
*sfq
= sfq_get__(netdev
);
3466 smap_add_format(details
, "quantum", "%u", sfq
->quantum
);
3467 smap_add_format(details
, "perturb", "%u", sfq
->perturb
);
3472 sfq_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
3476 sfq_parse_qdisc_details__(netdev
, details
, &sfq
);
3477 sfq_install__(netdev
, sfq
.quantum
, sfq
.perturb
);
3478 sfq_get__(netdev
)->quantum
= sfq
.quantum
;
3479 sfq_get__(netdev
)->perturb
= sfq
.perturb
;
3483 static const struct tc_ops tc_ops_sfq
= {
3484 "sfq", /* linux_name */
3485 "linux-sfq", /* ovs_name */
3486 SFQ_N_QUEUES
, /* n_queues */
3499 /* HTB traffic control class. */
3501 #define HTB_N_QUEUES 0xf000
3502 #define HTB_RATE2QUANTUM 10
3506 unsigned int max_rate
; /* In bytes/s. */
3510 struct tc_queue tc_queue
;
3511 unsigned int min_rate
; /* In bytes/s. */
3512 unsigned int max_rate
; /* In bytes/s. */
3513 unsigned int burst
; /* In bytes. */
3514 unsigned int priority
; /* Lower values are higher priorities. */
3518 htb_get__(const struct netdev
*netdev_
)
3520 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3521 return CONTAINER_OF(netdev
->tc
, struct htb
, tc
);
3525 htb_install__(struct netdev
*netdev_
, uint64_t max_rate
)
3527 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3530 htb
= xmalloc(sizeof *htb
);
3531 tc_init(&htb
->tc
, &tc_ops_htb
);
3532 htb
->max_rate
= max_rate
;
3534 netdev
->tc
= &htb
->tc
;
3537 /* Create an HTB qdisc.
3539 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
3541 htb_setup_qdisc__(struct netdev
*netdev
)
3544 struct tc_htb_glob opt
;
3545 struct ofpbuf request
;
3546 struct tcmsg
*tcmsg
;
3548 tc_del_qdisc(netdev
);
3550 tcmsg
= tc_make_request(netdev
, RTM_NEWQDISC
,
3551 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3555 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3556 tcmsg
->tcm_parent
= TC_H_ROOT
;
3558 nl_msg_put_string(&request
, TCA_KIND
, "htb");
3560 memset(&opt
, 0, sizeof opt
);
3561 opt
.rate2quantum
= HTB_RATE2QUANTUM
;
3565 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
3566 nl_msg_put_unspec(&request
, TCA_HTB_INIT
, &opt
, sizeof opt
);
3567 nl_msg_end_nested(&request
, opt_offset
);
3569 return tc_transact(&request
, NULL
);
3572 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
3573 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
3575 htb_setup_class__(struct netdev
*netdev
, unsigned int handle
,
3576 unsigned int parent
, struct htb_class
*class)
3579 struct tc_htb_opt opt
;
3580 struct ofpbuf request
;
3581 struct tcmsg
*tcmsg
;
3585 error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
3587 VLOG_WARN_RL(&rl
, "cannot set up HTB on device %s that lacks MTU",
3588 netdev_get_name(netdev
));
3592 memset(&opt
, 0, sizeof opt
);
3593 tc_fill_rate(&opt
.rate
, class->min_rate
, mtu
);
3594 tc_fill_rate(&opt
.ceil
, class->max_rate
, mtu
);
3595 /* Makes sure the quantum is at least MTU. Setting quantum will
3596 * make htb ignore the r2q for this class. */
3597 if ((class->min_rate
/ HTB_RATE2QUANTUM
) < mtu
) {
3600 opt
.buffer
= tc_calc_buffer(opt
.rate
.rate
, mtu
, class->burst
);
3601 opt
.cbuffer
= tc_calc_buffer(opt
.ceil
.rate
, mtu
, class->burst
);
3602 opt
.prio
= class->priority
;
3604 tcmsg
= tc_make_request(netdev
, RTM_NEWTCLASS
, NLM_F_CREATE
, &request
);
3608 tcmsg
->tcm_handle
= handle
;
3609 tcmsg
->tcm_parent
= parent
;
3611 nl_msg_put_string(&request
, TCA_KIND
, "htb");
3612 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
3613 nl_msg_put_unspec(&request
, TCA_HTB_PARMS
, &opt
, sizeof opt
);
3614 tc_put_rtab(&request
, TCA_HTB_RTAB
, &opt
.rate
);
3615 tc_put_rtab(&request
, TCA_HTB_CTAB
, &opt
.ceil
);
3616 nl_msg_end_nested(&request
, opt_offset
);
3618 error
= tc_transact(&request
, NULL
);
3620 VLOG_WARN_RL(&rl
, "failed to replace %s class %u:%u, parent %u:%u, "
3621 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
3622 netdev_get_name(netdev
),
3623 tc_get_major(handle
), tc_get_minor(handle
),
3624 tc_get_major(parent
), tc_get_minor(parent
),
3625 class->min_rate
, class->max_rate
,
3626 class->burst
, class->priority
, ovs_strerror(error
));
3631 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
3632 * description of them into 'details'. The description complies with the
3633 * specification given in the vswitch database documentation for linux-htb
3636 htb_parse_tca_options__(struct nlattr
*nl_options
, struct htb_class
*class)
3638 static const struct nl_policy tca_htb_policy
[] = {
3639 [TCA_HTB_PARMS
] = { .type
= NL_A_UNSPEC
, .optional
= false,
3640 .min_len
= sizeof(struct tc_htb_opt
) },
3643 struct nlattr
*attrs
[ARRAY_SIZE(tca_htb_policy
)];
3644 const struct tc_htb_opt
*htb
;
3646 if (!nl_parse_nested(nl_options
, tca_htb_policy
,
3647 attrs
, ARRAY_SIZE(tca_htb_policy
))) {
3648 VLOG_WARN_RL(&rl
, "failed to parse HTB class options");
3652 htb
= nl_attr_get(attrs
[TCA_HTB_PARMS
]);
3653 class->min_rate
= htb
->rate
.rate
;
3654 class->max_rate
= htb
->ceil
.rate
;
3655 class->burst
= tc_ticks_to_bytes(htb
->rate
.rate
, htb
->buffer
);
3656 class->priority
= htb
->prio
;
3661 htb_parse_tcmsg__(struct ofpbuf
*tcmsg
, unsigned int *queue_id
,
3662 struct htb_class
*options
,
3663 struct netdev_queue_stats
*stats
)
3665 struct nlattr
*nl_options
;
3666 unsigned int handle
;
3669 error
= tc_parse_class(tcmsg
, &handle
, &nl_options
, stats
);
3670 if (!error
&& queue_id
) {
3671 unsigned int major
= tc_get_major(handle
);
3672 unsigned int minor
= tc_get_minor(handle
);
3673 if (major
== 1 && minor
> 0 && minor
<= HTB_N_QUEUES
) {
3674 *queue_id
= minor
- 1;
3679 if (!error
&& options
) {
3680 error
= htb_parse_tca_options__(nl_options
, options
);
3686 htb_parse_qdisc_details__(struct netdev
*netdev_
,
3687 const struct smap
*details
, struct htb_class
*hc
)
3689 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3690 const char *max_rate_s
;
3692 max_rate_s
= smap_get(details
, "max-rate");
3693 hc
->max_rate
= max_rate_s
? strtoull(max_rate_s
, NULL
, 10) / 8 : 0;
3694 if (!hc
->max_rate
) {
3695 enum netdev_features current
;
3697 netdev_linux_read_features(netdev
);
3698 current
= !netdev
->get_features_error
? netdev
->current
: 0;
3699 hc
->max_rate
= netdev_features_to_bps(current
, 100 * 1000 * 1000) / 8;
3701 hc
->min_rate
= hc
->max_rate
;
3707 htb_parse_class_details__(struct netdev
*netdev
,
3708 const struct smap
*details
, struct htb_class
*hc
)
3710 const struct htb
*htb
= htb_get__(netdev
);
3711 const char *min_rate_s
= smap_get(details
, "min-rate");
3712 const char *max_rate_s
= smap_get(details
, "max-rate");
3713 const char *burst_s
= smap_get(details
, "burst");
3714 const char *priority_s
= smap_get(details
, "priority");
3717 error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
3719 VLOG_WARN_RL(&rl
, "cannot parse HTB class on device %s that lacks MTU",
3720 netdev_get_name(netdev
));
3724 /* HTB requires at least an mtu sized min-rate to send any traffic even
3725 * on uncongested links. */
3726 hc
->min_rate
= min_rate_s
? strtoull(min_rate_s
, NULL
, 10) / 8 : 0;
3727 hc
->min_rate
= MAX(hc
->min_rate
, mtu
);
3728 hc
->min_rate
= MIN(hc
->min_rate
, htb
->max_rate
);
3731 hc
->max_rate
= (max_rate_s
3732 ? strtoull(max_rate_s
, NULL
, 10) / 8
3734 hc
->max_rate
= MAX(hc
->max_rate
, hc
->min_rate
);
3735 hc
->max_rate
= MIN(hc
->max_rate
, htb
->max_rate
);
3739 * According to hints in the documentation that I've read, it is important
3740 * that 'burst' be at least as big as the largest frame that might be
3741 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3742 * but having it a bit too small is a problem. Since netdev_get_mtu()
3743 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3744 * the MTU. We actually add 64, instead of 14, as a guard against
3745 * additional headers get tacked on somewhere that we're not aware of. */
3746 hc
->burst
= burst_s
? strtoull(burst_s
, NULL
, 10) / 8 : 0;
3747 hc
->burst
= MAX(hc
->burst
, mtu
+ 64);
3750 hc
->priority
= priority_s
? strtoul(priority_s
, NULL
, 10) : 0;
3756 htb_query_class__(const struct netdev
*netdev
, unsigned int handle
,
3757 unsigned int parent
, struct htb_class
*options
,
3758 struct netdev_queue_stats
*stats
)
3760 struct ofpbuf
*reply
;
3763 error
= tc_query_class(netdev
, handle
, parent
, &reply
);
3765 error
= htb_parse_tcmsg__(reply
, NULL
, options
, stats
);
3766 ofpbuf_delete(reply
);
3772 htb_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3776 error
= htb_setup_qdisc__(netdev
);
3778 struct htb_class hc
;
3780 htb_parse_qdisc_details__(netdev
, details
, &hc
);
3781 error
= htb_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
3782 tc_make_handle(1, 0), &hc
);
3784 htb_install__(netdev
, hc
.max_rate
);
3790 static struct htb_class
*
3791 htb_class_cast__(const struct tc_queue
*queue
)
3793 return CONTAINER_OF(queue
, struct htb_class
, tc_queue
);
3797 htb_update_queue__(struct netdev
*netdev
, unsigned int queue_id
,
3798 const struct htb_class
*hc
)
3800 struct htb
*htb
= htb_get__(netdev
);
3801 size_t hash
= hash_int(queue_id
, 0);
3802 struct tc_queue
*queue
;
3803 struct htb_class
*hcp
;
3805 queue
= tc_find_queue__(netdev
, queue_id
, hash
);
3807 hcp
= htb_class_cast__(queue
);
3809 hcp
= xmalloc(sizeof *hcp
);
3810 queue
= &hcp
->tc_queue
;
3811 queue
->queue_id
= queue_id
;
3812 queue
->created
= time_msec();
3813 hmap_insert(&htb
->tc
.queues
, &queue
->hmap_node
, hash
);
3816 hcp
->min_rate
= hc
->min_rate
;
3817 hcp
->max_rate
= hc
->max_rate
;
3818 hcp
->burst
= hc
->burst
;
3819 hcp
->priority
= hc
->priority
;
3823 htb_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
3826 struct queue_dump_state state
;
3827 struct htb_class hc
;
3829 /* Get qdisc options. */
3831 htb_query_class__(netdev
, tc_make_handle(1, 0xfffe), 0, &hc
, NULL
);
3832 htb_install__(netdev
, hc
.max_rate
);
3835 if (!start_queue_dump(netdev
, &state
)) {
3838 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
3839 unsigned int queue_id
;
3841 if (!htb_parse_tcmsg__(&msg
, &queue_id
, &hc
, NULL
)) {
3842 htb_update_queue__(netdev
, queue_id
, &hc
);
3845 finish_queue_dump(&state
);
3851 htb_tc_destroy(struct tc
*tc
)
3853 struct htb
*htb
= CONTAINER_OF(tc
, struct htb
, tc
);
3854 struct htb_class
*hc
, *next
;
3856 HMAP_FOR_EACH_SAFE (hc
, next
, tc_queue
.hmap_node
, &htb
->tc
.queues
) {
3857 hmap_remove(&htb
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
3865 htb_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
3867 const struct htb
*htb
= htb_get__(netdev
);
3868 smap_add_format(details
, "max-rate", "%llu", 8ULL * htb
->max_rate
);
3873 htb_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
3875 struct htb_class hc
;
3878 htb_parse_qdisc_details__(netdev
, details
, &hc
);
3879 error
= htb_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
3880 tc_make_handle(1, 0), &hc
);
3882 htb_get__(netdev
)->max_rate
= hc
.max_rate
;
3888 htb_class_get(const struct netdev
*netdev OVS_UNUSED
,
3889 const struct tc_queue
*queue
, struct smap
*details
)
3891 const struct htb_class
*hc
= htb_class_cast__(queue
);
3893 smap_add_format(details
, "min-rate", "%llu", 8ULL * hc
->min_rate
);
3894 if (hc
->min_rate
!= hc
->max_rate
) {
3895 smap_add_format(details
, "max-rate", "%llu", 8ULL * hc
->max_rate
);
3897 smap_add_format(details
, "burst", "%llu", 8ULL * hc
->burst
);
3899 smap_add_format(details
, "priority", "%u", hc
->priority
);
3905 htb_class_set(struct netdev
*netdev
, unsigned int queue_id
,
3906 const struct smap
*details
)
3908 struct htb_class hc
;
3911 error
= htb_parse_class_details__(netdev
, details
, &hc
);
3916 error
= htb_setup_class__(netdev
, tc_make_handle(1, queue_id
+ 1),
3917 tc_make_handle(1, 0xfffe), &hc
);
3922 htb_update_queue__(netdev
, queue_id
, &hc
);
3927 htb_class_delete(struct netdev
*netdev
, struct tc_queue
*queue
)
3929 struct htb_class
*hc
= htb_class_cast__(queue
);
3930 struct htb
*htb
= htb_get__(netdev
);
3933 error
= tc_delete_class(netdev
, tc_make_handle(1, queue
->queue_id
+ 1));
3935 hmap_remove(&htb
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
3942 htb_class_get_stats(const struct netdev
*netdev
, const struct tc_queue
*queue
,
3943 struct netdev_queue_stats
*stats
)
3945 return htb_query_class__(netdev
, tc_make_handle(1, queue
->queue_id
+ 1),
3946 tc_make_handle(1, 0xfffe), NULL
, stats
);
3950 htb_class_dump_stats(const struct netdev
*netdev OVS_UNUSED
,
3951 const struct ofpbuf
*nlmsg
,
3952 netdev_dump_queue_stats_cb
*cb
, void *aux
)
3954 struct netdev_queue_stats stats
;
3955 unsigned int handle
, major
, minor
;
3958 error
= tc_parse_class(nlmsg
, &handle
, NULL
, &stats
);
3963 major
= tc_get_major(handle
);
3964 minor
= tc_get_minor(handle
);
3965 if (major
== 1 && minor
> 0 && minor
<= HTB_N_QUEUES
) {
3966 (*cb
)(minor
- 1, &stats
, aux
);
3971 static const struct tc_ops tc_ops_htb
= {
3972 "htb", /* linux_name */
3973 "linux-htb", /* ovs_name */
3974 HTB_N_QUEUES
, /* n_queues */
3983 htb_class_get_stats
,
3984 htb_class_dump_stats
3987 /* "linux-hfsc" traffic control class. */
3989 #define HFSC_N_QUEUES 0xf000
3997 struct tc_queue tc_queue
;
4002 static struct hfsc
*
4003 hfsc_get__(const struct netdev
*netdev_
)
4005 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4006 return CONTAINER_OF(netdev
->tc
, struct hfsc
, tc
);
4009 static struct hfsc_class
*
4010 hfsc_class_cast__(const struct tc_queue
*queue
)
4012 return CONTAINER_OF(queue
, struct hfsc_class
, tc_queue
);
4016 hfsc_install__(struct netdev
*netdev_
, uint32_t max_rate
)
4018 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4021 hfsc
= xmalloc(sizeof *hfsc
);
4022 tc_init(&hfsc
->tc
, &tc_ops_hfsc
);
4023 hfsc
->max_rate
= max_rate
;
4024 netdev
->tc
= &hfsc
->tc
;
4028 hfsc_update_queue__(struct netdev
*netdev
, unsigned int queue_id
,
4029 const struct hfsc_class
*hc
)
4033 struct hfsc_class
*hcp
;
4034 struct tc_queue
*queue
;
4036 hfsc
= hfsc_get__(netdev
);
4037 hash
= hash_int(queue_id
, 0);
4039 queue
= tc_find_queue__(netdev
, queue_id
, hash
);
4041 hcp
= hfsc_class_cast__(queue
);
4043 hcp
= xmalloc(sizeof *hcp
);
4044 queue
= &hcp
->tc_queue
;
4045 queue
->queue_id
= queue_id
;
4046 queue
->created
= time_msec();
4047 hmap_insert(&hfsc
->tc
.queues
, &queue
->hmap_node
, hash
);
4050 hcp
->min_rate
= hc
->min_rate
;
4051 hcp
->max_rate
= hc
->max_rate
;
4055 hfsc_parse_tca_options__(struct nlattr
*nl_options
, struct hfsc_class
*class)
4057 const struct tc_service_curve
*rsc
, *fsc
, *usc
;
4058 static const struct nl_policy tca_hfsc_policy
[] = {
4060 .type
= NL_A_UNSPEC
,
4062 .min_len
= sizeof(struct tc_service_curve
),
4065 .type
= NL_A_UNSPEC
,
4067 .min_len
= sizeof(struct tc_service_curve
),
4070 .type
= NL_A_UNSPEC
,
4072 .min_len
= sizeof(struct tc_service_curve
),
4075 struct nlattr
*attrs
[ARRAY_SIZE(tca_hfsc_policy
)];
4077 if (!nl_parse_nested(nl_options
, tca_hfsc_policy
,
4078 attrs
, ARRAY_SIZE(tca_hfsc_policy
))) {
4079 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options");
4083 rsc
= nl_attr_get(attrs
[TCA_HFSC_RSC
]);
4084 fsc
= nl_attr_get(attrs
[TCA_HFSC_FSC
]);
4085 usc
= nl_attr_get(attrs
[TCA_HFSC_USC
]);
4087 if (rsc
->m1
!= 0 || rsc
->d
!= 0 ||
4088 fsc
->m1
!= 0 || fsc
->d
!= 0 ||
4089 usc
->m1
!= 0 || usc
->d
!= 0) {
4090 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
4091 "Non-linear service curves are not supported.");
4095 if (rsc
->m2
!= fsc
->m2
) {
4096 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
4097 "Real-time service curves are not supported ");
4101 if (rsc
->m2
> usc
->m2
) {
4102 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
4103 "Min-rate service curve is greater than "
4104 "the max-rate service curve.");
4108 class->min_rate
= fsc
->m2
;
4109 class->max_rate
= usc
->m2
;
4114 hfsc_parse_tcmsg__(struct ofpbuf
*tcmsg
, unsigned int *queue_id
,
4115 struct hfsc_class
*options
,
4116 struct netdev_queue_stats
*stats
)
4119 unsigned int handle
;
4120 struct nlattr
*nl_options
;
4122 error
= tc_parse_class(tcmsg
, &handle
, &nl_options
, stats
);
4128 unsigned int major
, minor
;
4130 major
= tc_get_major(handle
);
4131 minor
= tc_get_minor(handle
);
4132 if (major
== 1 && minor
> 0 && minor
<= HFSC_N_QUEUES
) {
4133 *queue_id
= minor
- 1;
4140 error
= hfsc_parse_tca_options__(nl_options
, options
);
4147 hfsc_query_class__(const struct netdev
*netdev
, unsigned int handle
,
4148 unsigned int parent
, struct hfsc_class
*options
,
4149 struct netdev_queue_stats
*stats
)
4152 struct ofpbuf
*reply
;
4154 error
= tc_query_class(netdev
, handle
, parent
, &reply
);
4159 error
= hfsc_parse_tcmsg__(reply
, NULL
, options
, stats
);
4160 ofpbuf_delete(reply
);
4165 hfsc_parse_qdisc_details__(struct netdev
*netdev_
, const struct smap
*details
,
4166 struct hfsc_class
*class)
4168 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4170 const char *max_rate_s
;
4172 max_rate_s
= smap_get(details
, "max-rate");
4173 max_rate
= max_rate_s
? strtoull(max_rate_s
, NULL
, 10) / 8 : 0;
4176 enum netdev_features current
;
4178 netdev_linux_read_features(netdev
);
4179 current
= !netdev
->get_features_error
? netdev
->current
: 0;
4180 max_rate
= netdev_features_to_bps(current
, 100 * 1000 * 1000) / 8;
4183 class->min_rate
= max_rate
;
4184 class->max_rate
= max_rate
;
4188 hfsc_parse_class_details__(struct netdev
*netdev
,
4189 const struct smap
*details
,
4190 struct hfsc_class
* class)
4192 const struct hfsc
*hfsc
;
4193 uint32_t min_rate
, max_rate
;
4194 const char *min_rate_s
, *max_rate_s
;
4196 hfsc
= hfsc_get__(netdev
);
4197 min_rate_s
= smap_get(details
, "min-rate");
4198 max_rate_s
= smap_get(details
, "max-rate");
4200 min_rate
= min_rate_s
? strtoull(min_rate_s
, NULL
, 10) / 8 : 0;
4201 min_rate
= MAX(min_rate
, 1);
4202 min_rate
= MIN(min_rate
, hfsc
->max_rate
);
4204 max_rate
= (max_rate_s
4205 ? strtoull(max_rate_s
, NULL
, 10) / 8
4207 max_rate
= MAX(max_rate
, min_rate
);
4208 max_rate
= MIN(max_rate
, hfsc
->max_rate
);
4210 class->min_rate
= min_rate
;
4211 class->max_rate
= max_rate
;
4216 /* Create an HFSC qdisc.
4218 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
4220 hfsc_setup_qdisc__(struct netdev
* netdev
)
4222 struct tcmsg
*tcmsg
;
4223 struct ofpbuf request
;
4224 struct tc_hfsc_qopt opt
;
4226 tc_del_qdisc(netdev
);
4228 tcmsg
= tc_make_request(netdev
, RTM_NEWQDISC
,
4229 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
4235 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
4236 tcmsg
->tcm_parent
= TC_H_ROOT
;
4238 memset(&opt
, 0, sizeof opt
);
4241 nl_msg_put_string(&request
, TCA_KIND
, "hfsc");
4242 nl_msg_put_unspec(&request
, TCA_OPTIONS
, &opt
, sizeof opt
);
4244 return tc_transact(&request
, NULL
);
4247 /* Create an HFSC class.
4249 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
4250 * sc rate <min_rate> ul rate <max_rate>" */
4252 hfsc_setup_class__(struct netdev
*netdev
, unsigned int handle
,
4253 unsigned int parent
, struct hfsc_class
*class)
4257 struct tcmsg
*tcmsg
;
4258 struct ofpbuf request
;
4259 struct tc_service_curve min
, max
;
4261 tcmsg
= tc_make_request(netdev
, RTM_NEWTCLASS
, NLM_F_CREATE
, &request
);
4267 tcmsg
->tcm_handle
= handle
;
4268 tcmsg
->tcm_parent
= parent
;
4272 min
.m2
= class->min_rate
;
4276 max
.m2
= class->max_rate
;
4278 nl_msg_put_string(&request
, TCA_KIND
, "hfsc");
4279 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
4280 nl_msg_put_unspec(&request
, TCA_HFSC_RSC
, &min
, sizeof min
);
4281 nl_msg_put_unspec(&request
, TCA_HFSC_FSC
, &min
, sizeof min
);
4282 nl_msg_put_unspec(&request
, TCA_HFSC_USC
, &max
, sizeof max
);
4283 nl_msg_end_nested(&request
, opt_offset
);
4285 error
= tc_transact(&request
, NULL
);
4287 VLOG_WARN_RL(&rl
, "failed to replace %s class %u:%u, parent %u:%u, "
4288 "min-rate %ubps, max-rate %ubps (%s)",
4289 netdev_get_name(netdev
),
4290 tc_get_major(handle
), tc_get_minor(handle
),
4291 tc_get_major(parent
), tc_get_minor(parent
),
4292 class->min_rate
, class->max_rate
, ovs_strerror(error
));
4299 hfsc_tc_install(struct netdev
*netdev
, const struct smap
*details
)
4302 struct hfsc_class
class;
4304 error
= hfsc_setup_qdisc__(netdev
);
4310 hfsc_parse_qdisc_details__(netdev
, details
, &class);
4311 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
4312 tc_make_handle(1, 0), &class);
4318 hfsc_install__(netdev
, class.max_rate
);
4323 hfsc_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4326 struct queue_dump_state state
;
4327 struct hfsc_class hc
;
4330 hfsc_query_class__(netdev
, tc_make_handle(1, 0xfffe), 0, &hc
, NULL
);
4331 hfsc_install__(netdev
, hc
.max_rate
);
4333 if (!start_queue_dump(netdev
, &state
)) {
4337 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
4338 unsigned int queue_id
;
4340 if (!hfsc_parse_tcmsg__(&msg
, &queue_id
, &hc
, NULL
)) {
4341 hfsc_update_queue__(netdev
, queue_id
, &hc
);
4345 finish_queue_dump(&state
);
4350 hfsc_tc_destroy(struct tc
*tc
)
4353 struct hfsc_class
*hc
, *next
;
4355 hfsc
= CONTAINER_OF(tc
, struct hfsc
, tc
);
4357 HMAP_FOR_EACH_SAFE (hc
, next
, tc_queue
.hmap_node
, &hfsc
->tc
.queues
) {
4358 hmap_remove(&hfsc
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
4367 hfsc_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
4369 const struct hfsc
*hfsc
;
4370 hfsc
= hfsc_get__(netdev
);
4371 smap_add_format(details
, "max-rate", "%llu", 8ULL * hfsc
->max_rate
);
4376 hfsc_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
4379 struct hfsc_class
class;
4381 hfsc_parse_qdisc_details__(netdev
, details
, &class);
4382 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
4383 tc_make_handle(1, 0), &class);
4386 hfsc_get__(netdev
)->max_rate
= class.max_rate
;
4393 hfsc_class_get(const struct netdev
*netdev OVS_UNUSED
,
4394 const struct tc_queue
*queue
, struct smap
*details
)
4396 const struct hfsc_class
*hc
;
4398 hc
= hfsc_class_cast__(queue
);
4399 smap_add_format(details
, "min-rate", "%llu", 8ULL * hc
->min_rate
);
4400 if (hc
->min_rate
!= hc
->max_rate
) {
4401 smap_add_format(details
, "max-rate", "%llu", 8ULL * hc
->max_rate
);
4407 hfsc_class_set(struct netdev
*netdev
, unsigned int queue_id
,
4408 const struct smap
*details
)
4411 struct hfsc_class
class;
4413 error
= hfsc_parse_class_details__(netdev
, details
, &class);
4418 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, queue_id
+ 1),
4419 tc_make_handle(1, 0xfffe), &class);
4424 hfsc_update_queue__(netdev
, queue_id
, &class);
4429 hfsc_class_delete(struct netdev
*netdev
, struct tc_queue
*queue
)
4433 struct hfsc_class
*hc
;
4435 hc
= hfsc_class_cast__(queue
);
4436 hfsc
= hfsc_get__(netdev
);
4438 error
= tc_delete_class(netdev
, tc_make_handle(1, queue
->queue_id
+ 1));
4440 hmap_remove(&hfsc
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
4447 hfsc_class_get_stats(const struct netdev
*netdev
, const struct tc_queue
*queue
,
4448 struct netdev_queue_stats
*stats
)
4450 return hfsc_query_class__(netdev
, tc_make_handle(1, queue
->queue_id
+ 1),
4451 tc_make_handle(1, 0xfffe), NULL
, stats
);
4455 hfsc_class_dump_stats(const struct netdev
*netdev OVS_UNUSED
,
4456 const struct ofpbuf
*nlmsg
,
4457 netdev_dump_queue_stats_cb
*cb
, void *aux
)
4459 struct netdev_queue_stats stats
;
4460 unsigned int handle
, major
, minor
;
4463 error
= tc_parse_class(nlmsg
, &handle
, NULL
, &stats
);
4468 major
= tc_get_major(handle
);
4469 minor
= tc_get_minor(handle
);
4470 if (major
== 1 && minor
> 0 && minor
<= HFSC_N_QUEUES
) {
4471 (*cb
)(minor
- 1, &stats
, aux
);
4476 static const struct tc_ops tc_ops_hfsc
= {
4477 "hfsc", /* linux_name */
4478 "linux-hfsc", /* ovs_name */
4479 HFSC_N_QUEUES
, /* n_queues */
4480 hfsc_tc_install
, /* tc_install */
4481 hfsc_tc_load
, /* tc_load */
4482 hfsc_tc_destroy
, /* tc_destroy */
4483 hfsc_qdisc_get
, /* qdisc_get */
4484 hfsc_qdisc_set
, /* qdisc_set */
4485 hfsc_class_get
, /* class_get */
4486 hfsc_class_set
, /* class_set */
4487 hfsc_class_delete
, /* class_delete */
4488 hfsc_class_get_stats
, /* class_get_stats */
4489 hfsc_class_dump_stats
/* class_dump_stats */
4492 /* "linux-default" traffic control class.
4494 * This class represents the default, unnamed Linux qdisc. It corresponds to
4495 * the "" (empty string) QoS type in the OVS database. */
4498 default_install__(struct netdev
*netdev_
)
4500 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4501 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_default
);
4503 /* Nothing but a tc class implementation is allowed to write to a tc. This
4504 * class never does that, so we can legitimately use a const tc object. */
4505 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
4509 default_tc_install(struct netdev
*netdev
,
4510 const struct smap
*details OVS_UNUSED
)
4512 default_install__(netdev
);
4517 default_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4519 default_install__(netdev
);
4523 static const struct tc_ops tc_ops_default
= {
4524 NULL
, /* linux_name */
4529 NULL
, /* tc_destroy */
4530 NULL
, /* qdisc_get */
4531 NULL
, /* qdisc_set */
4532 NULL
, /* class_get */
4533 NULL
, /* class_set */
4534 NULL
, /* class_delete */
4535 NULL
, /* class_get_stats */
4536 NULL
/* class_dump_stats */
4539 /* "linux-other" traffic control class.
4544 other_tc_load(struct netdev
*netdev_
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4546 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4547 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_other
);
4549 /* Nothing but a tc class implementation is allowed to write to a tc. This
4550 * class never does that, so we can legitimately use a const tc object. */
4551 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
4555 static const struct tc_ops tc_ops_other
= {
4556 NULL
, /* linux_name */
4557 "linux-other", /* ovs_name */
4559 NULL
, /* tc_install */
4561 NULL
, /* tc_destroy */
4562 NULL
, /* qdisc_get */
4563 NULL
, /* qdisc_set */
4564 NULL
, /* class_get */
4565 NULL
, /* class_set */
4566 NULL
, /* class_delete */
4567 NULL
, /* class_get_stats */
4568 NULL
/* class_dump_stats */
4571 /* Traffic control. */
4573 /* Number of kernel "tc" ticks per second. */
4574 static double ticks_per_s
;
4576 /* Number of kernel "jiffies" per second. This is used for the purpose of
4577 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
4578 * one jiffy's worth of data.
4580 * There are two possibilities here:
4582 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
4583 * approximate range of 100 to 1024. That means that we really need to
4584 * make sure that the qdisc can buffer that much data.
4586 * - 'buffer_hz' is an absurdly large number. That means that the kernel
4587 * has finely granular timers and there's no need to fudge additional room
4588 * for buffers. (There's no extra effort needed to implement that: the
4589 * large 'buffer_hz' is used as a divisor, so practically any number will
4590 * come out as 0 in the division. Small integer results in the case of
4591 * really high dividends won't have any real effect anyhow.)
4593 static unsigned int buffer_hz
;
/* Returns tc handle 'major':'minor'. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    return TC_H_MAKE(major << 16, minor);
}
/* Returns the major number from 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    return TC_H_MAJ(handle) >> 16;
}
/* Returns the minor number from 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    return TC_H_MIN(handle);
}
4616 static struct tcmsg
*
4617 tc_make_request(const struct netdev
*netdev
, int type
, unsigned int flags
,
4618 struct ofpbuf
*request
)
4620 struct tcmsg
*tcmsg
;
4624 error
= get_ifindex(netdev
, &ifindex
);
4629 ofpbuf_init(request
, 512);
4630 nl_msg_put_nlmsghdr(request
, sizeof *tcmsg
, type
, NLM_F_REQUEST
| flags
);
4631 tcmsg
= ofpbuf_put_zeros(request
, sizeof *tcmsg
);
4632 tcmsg
->tcm_family
= AF_UNSPEC
;
4633 tcmsg
->tcm_ifindex
= ifindex
;
4634 /* Caller should fill in tcmsg->tcm_handle. */
4635 /* Caller should fill in tcmsg->tcm_parent. */
4641 tc_transact(struct ofpbuf
*request
, struct ofpbuf
**replyp
)
4643 int error
= nl_transact(NETLINK_ROUTE
, request
, replyp
);
4644 ofpbuf_uninit(request
);
4648 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
4649 * policing configuration.
4651 * This function is equivalent to running the following when 'add' is true:
4652 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
4654 * This function is equivalent to running the following when 'add' is false:
4655 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
4657 * The configuration and stats may be seen with the following command:
4658 * /sbin/tc -s qdisc show dev <devname>
4660 * Returns 0 if successful, otherwise a positive errno value.
4663 tc_add_del_ingress_qdisc(struct netdev
*netdev
, bool add
)
4665 struct ofpbuf request
;
4666 struct tcmsg
*tcmsg
;
4668 int type
= add
? RTM_NEWQDISC
: RTM_DELQDISC
;
4669 int flags
= add
? NLM_F_EXCL
| NLM_F_CREATE
: 0;
4671 tcmsg
= tc_make_request(netdev
, type
, flags
, &request
);
4675 tcmsg
->tcm_handle
= tc_make_handle(0xffff, 0);
4676 tcmsg
->tcm_parent
= TC_H_INGRESS
;
4677 nl_msg_put_string(&request
, TCA_KIND
, "ingress");
4678 nl_msg_put_unspec(&request
, TCA_OPTIONS
, NULL
, 0);
4680 error
= tc_transact(&request
, NULL
);
4682 /* If we're deleting the qdisc, don't worry about some of the
4683 * error conditions. */
4684 if (!add
&& (error
== ENOENT
|| error
== EINVAL
)) {
4693 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4696 * This function is equivalent to running:
4697 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4698 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4701 * The configuration and stats may be seen with the following command:
4702 * /sbin/tc -s filter show dev <devname> parent ffff:
4704 * Returns 0 if successful, otherwise a positive errno value.
4707 tc_add_policer(struct netdev
*netdev
,
4708 uint32_t kbits_rate
, uint32_t kbits_burst
)
4710 struct tc_police tc_police
;
4711 struct ofpbuf request
;
4712 struct tcmsg
*tcmsg
;
4713 size_t basic_offset
;
4714 size_t police_offset
;
4718 memset(&tc_police
, 0, sizeof tc_police
);
4719 tc_police
.action
= TC_POLICE_SHOT
;
4720 tc_police
.mtu
= mtu
;
4721 tc_fill_rate(&tc_police
.rate
, ((uint64_t) kbits_rate
* 1000)/8, mtu
);
4723 /* The following appears wrong in two ways:
4725 * - tc_bytes_to_ticks() should take "bytes" as quantity for both of its
4726 * arguments (or at least consistently "bytes" as both or "bits" as
4727 * both), but this supplies bytes for the first argument and bits for the
4730 * - In networking a kilobit is usually 1000 bits but this uses 1024 bits.
4732 * However if you "fix" those problems then "tc filter show ..." shows
4733 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
4734 * 1,000,000 bits, whereas this actually ends up doing the right thing from
4735 * tc's point of view. Whatever. */
4736 tc_police
.burst
= tc_bytes_to_ticks(
4737 tc_police
.rate
.rate
, MIN(UINT32_MAX
/ 1024, kbits_burst
) * 1024);
4739 tcmsg
= tc_make_request(netdev
, RTM_NEWTFILTER
,
4740 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
4744 tcmsg
->tcm_parent
= tc_make_handle(0xffff, 0);
4745 tcmsg
->tcm_info
= tc_make_handle(49,
4746 (OVS_FORCE
uint16_t) htons(ETH_P_ALL
));
4748 nl_msg_put_string(&request
, TCA_KIND
, "basic");
4749 basic_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
4750 police_offset
= nl_msg_start_nested(&request
, TCA_BASIC_POLICE
);
4751 nl_msg_put_unspec(&request
, TCA_POLICE_TBF
, &tc_police
, sizeof tc_police
);
4752 tc_put_rtab(&request
, TCA_POLICE_RATE
, &tc_police
.rate
);
4753 nl_msg_end_nested(&request
, police_offset
);
4754 nl_msg_end_nested(&request
, basic_offset
);
4756 error
= tc_transact(&request
, NULL
);
4767 /* The values in psched are not individually very meaningful, but they are
4768 * important. The tables below show some values seen in the wild.
4772 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4773 * (Before that, there are hints that it was 1000000000.)
4775 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4779 * -----------------------------------
4780 * [1] 000c8000 000f4240 000f4240 00000064
4781 * [2] 000003e8 00000400 000f4240 3b9aca00
4782 * [3] 000003e8 00000400 000f4240 3b9aca00
4783 * [4] 000003e8 00000400 000f4240 00000064
4784 * [5] 000003e8 00000040 000f4240 3b9aca00
4785 * [6] 000003e8 00000040 000f4240 000000f9
4787 * a b c d ticks_per_s buffer_hz
4788 * ------- --------- ---------- ------------- ----------- -------------
4789 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4790 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4791 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4792 * [4] 1,000 1,024 1,000,000 100 976,562 100
4793 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4794 * [6] 1,000 64 1,000,000 249 15,625,000 249
4796 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4797 * [2] 2.6.26-1-686-bigmem from Debian lenny
4798 * [3] 2.6.26-2-sparc64 from Debian lenny
4799 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4800 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4801 * [6] 2.6.34 from kernel.org on KVM
4803 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
4804 static const char fn
[] = "/proc/net/psched";
4805 unsigned int a
, b
, c
, d
;
4808 if (!ovsthread_once_start(&once
)) {
4815 stream
= fopen(fn
, "r");
4817 VLOG_WARN("%s: open failed: %s", fn
, ovs_strerror(errno
));
4821 if (fscanf(stream
, "%x %x %x %x", &a
, &b
, &c
, &d
) != 4) {
4822 VLOG_WARN("%s: read failed", fn
);
4826 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn
, a
, b
, c
, d
);
4830 VLOG_WARN("%s: invalid scheduler parameters", fn
);
4834 ticks_per_s
= (double) a
* c
/ b
;
4838 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4841 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn
, ticks_per_s
, buffer_hz
);
4844 ovsthread_once_done(&once
);
4847 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4848 * rate of 'rate' bytes per second. */
4850 tc_ticks_to_bytes(unsigned int rate
, unsigned int ticks
)
4853 return (rate
* ticks
) / ticks_per_s
;
4856 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4857 * rate of 'rate' bytes per second. */
4859 tc_bytes_to_ticks(unsigned int rate
, unsigned int size
)
4862 return rate
? ((unsigned long long int) ticks_per_s
* size
) / rate
: 0;
4865 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4866 * a transmission rate of 'rate' bytes per second. */
4868 tc_buffer_per_jiffy(unsigned int rate
)
4871 return rate
/ buffer_hz
;
4874 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4875 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4876 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4877 * stores NULL into it if it is absent.
4879 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4882 * Returns 0 if successful, otherwise a positive errno value. */
4884 tc_parse_qdisc(const struct ofpbuf
*msg
, const char **kind
,
4885 struct nlattr
**options
)
4887 static const struct nl_policy tca_policy
[] = {
4888 [TCA_KIND
] = { .type
= NL_A_STRING
, .optional
= false },
4889 [TCA_OPTIONS
] = { .type
= NL_A_NESTED
, .optional
= true },
4891 struct nlattr
*ta
[ARRAY_SIZE(tca_policy
)];
4893 if (!nl_policy_parse(msg
, NLMSG_HDRLEN
+ sizeof(struct tcmsg
),
4894 tca_policy
, ta
, ARRAY_SIZE(ta
))) {
4895 VLOG_WARN_RL(&rl
, "failed to parse qdisc message");
4900 *kind
= nl_attr_get_string(ta
[TCA_KIND
]);
4904 *options
= ta
[TCA_OPTIONS
];
4919 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4920 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4921 * into '*options', and its queue statistics into '*stats'. Any of the output
4922 * arguments may be null.
4924 * Returns 0 if successful, otherwise a positive errno value. */
4926 tc_parse_class(const struct ofpbuf
*msg
, unsigned int *handlep
,
4927 struct nlattr
**options
, struct netdev_queue_stats
*stats
)
4929 static const struct nl_policy tca_policy
[] = {
4930 [TCA_OPTIONS
] = { .type
= NL_A_NESTED
, .optional
= false },
4931 [TCA_STATS2
] = { .type
= NL_A_NESTED
, .optional
= false },
4933 struct nlattr
*ta
[ARRAY_SIZE(tca_policy
)];
4935 if (!nl_policy_parse(msg
, NLMSG_HDRLEN
+ sizeof(struct tcmsg
),
4936 tca_policy
, ta
, ARRAY_SIZE(ta
))) {
4937 VLOG_WARN_RL(&rl
, "failed to parse class message");
4942 struct tcmsg
*tc
= ofpbuf_at_assert(msg
, NLMSG_HDRLEN
, sizeof *tc
);
4943 *handlep
= tc
->tcm_handle
;
4947 *options
= ta
[TCA_OPTIONS
];
4951 const struct gnet_stats_queue
*gsq
;
4952 struct gnet_stats_basic gsb
;
4954 static const struct nl_policy stats_policy
[] = {
4955 [TCA_STATS_BASIC
] = { .type
= NL_A_UNSPEC
, .optional
= false,
4956 .min_len
= sizeof gsb
},
4957 [TCA_STATS_QUEUE
] = { .type
= NL_A_UNSPEC
, .optional
= false,
4958 .min_len
= sizeof *gsq
},
4960 struct nlattr
*sa
[ARRAY_SIZE(stats_policy
)];
4962 if (!nl_parse_nested(ta
[TCA_STATS2
], stats_policy
,
4963 sa
, ARRAY_SIZE(sa
))) {
4964 VLOG_WARN_RL(&rl
, "failed to parse class stats");
4968 /* Alignment issues screw up the length of struct gnet_stats_basic on
4969 * some arch/bitsize combinations. Newer versions of Linux have a
4970 * struct gnet_stats_basic_packed, but we can't depend on that. The
4971 * easiest thing to do is just to make a copy. */
4972 memset(&gsb
, 0, sizeof gsb
);
4973 memcpy(&gsb
, nl_attr_get(sa
[TCA_STATS_BASIC
]),
4974 MIN(nl_attr_get_size(sa
[TCA_STATS_BASIC
]), sizeof gsb
));
4975 stats
->tx_bytes
= gsb
.bytes
;
4976 stats
->tx_packets
= gsb
.packets
;
4978 gsq
= nl_attr_get(sa
[TCA_STATS_QUEUE
]);
4979 stats
->tx_errors
= gsq
->drops
;
4989 memset(stats
, 0, sizeof *stats
);
4994 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
4997 tc_query_class(const struct netdev
*netdev
,
4998 unsigned int handle
, unsigned int parent
,
4999 struct ofpbuf
**replyp
)
5001 struct ofpbuf request
;
5002 struct tcmsg
*tcmsg
;
5005 tcmsg
= tc_make_request(netdev
, RTM_GETTCLASS
, NLM_F_ECHO
, &request
);
5009 tcmsg
->tcm_handle
= handle
;
5010 tcmsg
->tcm_parent
= parent
;
5012 error
= tc_transact(&request
, replyp
);
5014 VLOG_WARN_RL(&rl
, "query %s class %u:%u (parent %u:%u) failed (%s)",
5015 netdev_get_name(netdev
),
5016 tc_get_major(handle
), tc_get_minor(handle
),
5017 tc_get_major(parent
), tc_get_minor(parent
),
5018 ovs_strerror(error
));
5023 /* Equivalent to "tc class del dev <name> handle <handle>". */
5025 tc_delete_class(const struct netdev
*netdev
, unsigned int handle
)
5027 struct ofpbuf request
;
5028 struct tcmsg
*tcmsg
;
5031 tcmsg
= tc_make_request(netdev
, RTM_DELTCLASS
, 0, &request
);
5035 tcmsg
->tcm_handle
= handle
;
5036 tcmsg
->tcm_parent
= 0;
5038 error
= tc_transact(&request
, NULL
);
5040 VLOG_WARN_RL(&rl
, "delete %s class %u:%u failed (%s)",
5041 netdev_get_name(netdev
),
5042 tc_get_major(handle
), tc_get_minor(handle
),
5043 ovs_strerror(error
));
5048 /* Equivalent to "tc qdisc del dev <name> root". */
5050 tc_del_qdisc(struct netdev
*netdev_
)
5052 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5053 struct ofpbuf request
;
5054 struct tcmsg
*tcmsg
;
5057 tcmsg
= tc_make_request(netdev_
, RTM_DELQDISC
, 0, &request
);
5061 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
5062 tcmsg
->tcm_parent
= TC_H_ROOT
;
5064 error
= tc_transact(&request
, NULL
);
5065 if (error
== EINVAL
) {
5066 /* EINVAL probably means that the default qdisc was in use, in which
5067 * case we've accomplished our purpose. */
5070 if (!error
&& netdev
->tc
) {
5071 if (netdev
->tc
->ops
->tc_destroy
) {
5072 netdev
->tc
->ops
->tc_destroy(netdev
->tc
);
5080 getqdisc_is_safe(void)
5082 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
5083 static bool safe
= false;
5085 if (ovsthread_once_start(&once
)) {
5086 struct utsname utsname
;
5089 if (uname(&utsname
) == -1) {
5090 VLOG_WARN("uname failed (%s)", ovs_strerror(errno
));
5091 } else if (!ovs_scan(utsname
.release
, "%d.%d", &major
, &minor
)) {
5092 VLOG_WARN("uname reported bad OS release (%s)", utsname
.release
);
5093 } else if (major
< 2 || (major
== 2 && minor
< 35)) {
5094 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5099 ovsthread_once_done(&once
);
5104 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
5105 * kernel to determine what they are. Returns 0 if successful, otherwise a
5106 * positive errno value. */
5108 tc_query_qdisc(const struct netdev
*netdev_
)
5110 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5111 struct ofpbuf request
, *qdisc
;
5112 const struct tc_ops
*ops
;
5113 struct tcmsg
*tcmsg
;
5121 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5122 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5123 * 2.6.35 without that fix backported to it.
5125 * To avoid the OOPS, we must not make a request that would attempt to dump
5126 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5127 * few others. There are a few ways that I can see to do this, but most of
5128 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5129 * technique chosen here is to assume that any non-default qdisc that we
5130 * create will have a class with handle 1:0. The built-in qdiscs only have
5131 * a class with handle 0:0.
5133 * On Linux 2.6.35+ we use the straightforward method because it allows us
5134 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5135 * in such a case we get no response at all from the kernel (!) if a
5136 * builtin qdisc is in use (which is later caught by "!error &&
5137 * !qdisc->size"). */
5138 tcmsg
= tc_make_request(netdev_
, RTM_GETQDISC
, NLM_F_ECHO
, &request
);
5142 tcmsg
->tcm_handle
= tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5143 tcmsg
->tcm_parent
= getqdisc_is_safe() ? TC_H_ROOT
: 0;
5145 /* Figure out what tc class to instantiate. */
5146 error
= tc_transact(&request
, &qdisc
);
5147 if (!error
&& qdisc
->size
) {
5150 error
= tc_parse_qdisc(qdisc
, &kind
, NULL
);
5152 ops
= &tc_ops_other
;
5154 ops
= tc_lookup_linux_name(kind
);
5156 static struct vlog_rate_limit rl2
= VLOG_RATE_LIMIT_INIT(1, 1);
5157 VLOG_DBG_RL(&rl2
, "unknown qdisc \"%s\"", kind
);
5159 ops
= &tc_ops_other
;
5162 } else if ((!error
&& !qdisc
->size
) || error
== ENOENT
) {
5163 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5164 * set up by some other entity that doesn't have a handle 1:0. We will
5165 * assume that it's the system default qdisc. */
5166 ops
= &tc_ops_default
;
5169 /* Who knows? Maybe the device got deleted. */
5170 VLOG_WARN_RL(&rl
, "query %s qdisc failed (%s)",
5171 netdev_get_name(netdev_
), ovs_strerror(error
));
5172 ops
= &tc_ops_other
;
5175 /* Instantiate it. */
5176 load_error
= ops
->tc_load(CONST_CAST(struct netdev
*, netdev_
), qdisc
);
5177 ovs_assert((load_error
== 0) == (netdev
->tc
!= NULL
));
5178 ofpbuf_delete(qdisc
);
5180 return error
? error
: load_error
;
5183 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5184 approximate the time to transmit packets of various lengths. For an MTU of
5185 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5186 represents two possible packet lengths; for a MTU of 513 through 1024, four
5187 possible lengths; and so on.
5189 Returns, for the specified 'mtu', the number of bits that packet lengths
5190 need to be shifted right to fit within such a 256-entry table. */
5192 tc_calc_cell_log(unsigned int mtu
)
5197 mtu
= ETH_PAYLOAD_MAX
;
5199 mtu
+= ETH_HEADER_LEN
+ VLAN_HEADER_LEN
;
5201 for (cell_log
= 0; mtu
>= 256; cell_log
++) {
5208 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5211 tc_fill_rate(struct tc_ratespec
*rate
, uint64_t Bps
, int mtu
)
5213 memset(rate
, 0, sizeof *rate
);
5214 rate
->cell_log
= tc_calc_cell_log(mtu
);
5215 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5216 /* rate->cell_align = 0; */ /* distro headers. */
5217 rate
->mpu
= ETH_TOTAL_MIN
;
5221 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
5222 * attribute of the specified "type".
5224 * See tc_calc_cell_log() above for a description of "rtab"s. */
5226 tc_put_rtab(struct ofpbuf
*msg
, uint16_t type
, const struct tc_ratespec
*rate
)
5231 rtab
= nl_msg_put_unspec_uninit(msg
, type
, TC_RTAB_SIZE
);
5232 for (i
= 0; i
< TC_RTAB_SIZE
/ sizeof *rtab
; i
++) {
5233 unsigned packet_size
= (i
+ 1) << rate
->cell_log
;
5234 if (packet_size
< rate
->mpu
) {
5235 packet_size
= rate
->mpu
;
5237 rtab
[i
] = tc_bytes_to_ticks(rate
->rate
, packet_size
);
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine; the jiffy-based minimum below dominates.) */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    /* Never allow a burst smaller than what can accumulate in one scheduler
     * tick plus a single full-size packet. */
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
}
5252 /* Linux-only functions declared in netdev-linux.h */
5254 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5255 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5257 netdev_linux_ethtool_set_flag(struct netdev
*netdev
, uint32_t flag
,
5258 const char *flag_name
, bool enable
)
5260 const char *netdev_name
= netdev_get_name(netdev
);
5261 struct ethtool_value evalue
;
5265 COVERAGE_INC(netdev_get_ethtool
);
5266 memset(&evalue
, 0, sizeof evalue
);
5267 error
= netdev_linux_do_ethtool(netdev_name
,
5268 (struct ethtool_cmd
*)&evalue
,
5269 ETHTOOL_GFLAGS
, "ETHTOOL_GFLAGS");
5274 COVERAGE_INC(netdev_set_ethtool
);
5275 new_flags
= (evalue
.data
& ~flag
) | (enable
? flag
: 0);
5276 if (new_flags
== evalue
.data
) {
5279 evalue
.data
= new_flags
;
5280 error
= netdev_linux_do_ethtool(netdev_name
,
5281 (struct ethtool_cmd
*)&evalue
,
5282 ETHTOOL_SFLAGS
, "ETHTOOL_SFLAGS");
5287 COVERAGE_INC(netdev_get_ethtool
);
5288 memset(&evalue
, 0, sizeof evalue
);
5289 error
= netdev_linux_do_ethtool(netdev_name
,
5290 (struct ethtool_cmd
*)&evalue
,
5291 ETHTOOL_GFLAGS
, "ETHTOOL_GFLAGS");
5296 if (new_flags
!= evalue
.data
) {
5297 VLOG_WARN_RL(&rl
, "attempt to %s ethtool %s flag on network "
5298 "device %s failed", enable
? "enable" : "disable",
5299 flag_name
, netdev_name
);
5306 /* Utility functions. */
5308 /* Copies 'src' into 'dst', performing format conversion in the process. */
5310 netdev_stats_from_rtnl_link_stats(struct netdev_stats
*dst
,
5311 const struct rtnl_link_stats
*src
)
5313 dst
->rx_packets
= src
->rx_packets
;
5314 dst
->tx_packets
= src
->tx_packets
;
5315 dst
->rx_bytes
= src
->rx_bytes
;
5316 dst
->tx_bytes
= src
->tx_bytes
;
5317 dst
->rx_errors
= src
->rx_errors
;
5318 dst
->tx_errors
= src
->tx_errors
;
5319 dst
->rx_dropped
= src
->rx_dropped
;
5320 dst
->tx_dropped
= src
->tx_dropped
;
5321 dst
->multicast
= src
->multicast
;
5322 dst
->collisions
= src
->collisions
;
5323 dst
->rx_length_errors
= src
->rx_length_errors
;
5324 dst
->rx_over_errors
= src
->rx_over_errors
;
5325 dst
->rx_crc_errors
= src
->rx_crc_errors
;
5326 dst
->rx_frame_errors
= src
->rx_frame_errors
;
5327 dst
->rx_fifo_errors
= src
->rx_fifo_errors
;
5328 dst
->rx_missed_errors
= src
->rx_missed_errors
;
5329 dst
->tx_aborted_errors
= src
->tx_aborted_errors
;
5330 dst
->tx_carrier_errors
= src
->tx_carrier_errors
;
5331 dst
->tx_fifo_errors
= src
->tx_fifo_errors
;
5332 dst
->tx_heartbeat_errors
= src
->tx_heartbeat_errors
;
5333 dst
->tx_window_errors
= src
->tx_window_errors
;
5336 /* Copies 'src' into 'dst', performing format conversion in the process. */
5338 netdev_stats_from_rtnl_link_stats64(struct netdev_stats
*dst
,
5339 const struct rtnl_link_stats64
*src
)
5341 dst
->rx_packets
= src
->rx_packets
;
5342 dst
->tx_packets
= src
->tx_packets
;
5343 dst
->rx_bytes
= src
->rx_bytes
;
5344 dst
->tx_bytes
= src
->tx_bytes
;
5345 dst
->rx_errors
= src
->rx_errors
;
5346 dst
->tx_errors
= src
->tx_errors
;
5347 dst
->rx_dropped
= src
->rx_dropped
;
5348 dst
->tx_dropped
= src
->tx_dropped
;
5349 dst
->multicast
= src
->multicast
;
5350 dst
->collisions
= src
->collisions
;
5351 dst
->rx_length_errors
= src
->rx_length_errors
;
5352 dst
->rx_over_errors
= src
->rx_over_errors
;
5353 dst
->rx_crc_errors
= src
->rx_crc_errors
;
5354 dst
->rx_frame_errors
= src
->rx_frame_errors
;
5355 dst
->rx_fifo_errors
= src
->rx_fifo_errors
;
5356 dst
->rx_missed_errors
= src
->rx_missed_errors
;
5357 dst
->tx_aborted_errors
= src
->tx_aborted_errors
;
5358 dst
->tx_carrier_errors
= src
->tx_carrier_errors
;
5359 dst
->tx_fifo_errors
= src
->tx_fifo_errors
;
5360 dst
->tx_heartbeat_errors
= src
->tx_heartbeat_errors
;
5361 dst
->tx_window_errors
= src
->tx_window_errors
;
5365 get_stats_via_netlink(const struct netdev
*netdev_
, struct netdev_stats
*stats
)
5367 struct ofpbuf request
;
5368 struct ofpbuf
*reply
;
5371 ofpbuf_init(&request
, 0);
5372 nl_msg_put_nlmsghdr(&request
,
5373 sizeof(struct ifinfomsg
) + NL_ATTR_SIZE(IFNAMSIZ
),
5374 RTM_GETLINK
, NLM_F_REQUEST
);
5375 ofpbuf_put_zeros(&request
, sizeof(struct ifinfomsg
));
5376 nl_msg_put_string(&request
, IFLA_IFNAME
, netdev_get_name(netdev_
));
5377 error
= nl_transact(NETLINK_ROUTE
, &request
, &reply
);
5378 ofpbuf_uninit(&request
);
5383 if (ofpbuf_try_pull(reply
, NLMSG_HDRLEN
+ sizeof(struct ifinfomsg
))) {
5384 const struct nlattr
*a
= nl_attr_find(reply
, 0, IFLA_STATS64
);
5385 if (a
&& nl_attr_get_size(a
) >= sizeof(struct rtnl_link_stats64
)) {
5386 netdev_stats_from_rtnl_link_stats64(stats
, nl_attr_get(a
));
5389 const struct nlattr
*a
= nl_attr_find(reply
, 0, IFLA_STATS
);
5390 if (a
&& nl_attr_get_size(a
) >= sizeof(struct rtnl_link_stats
)) {
5391 netdev_stats_from_rtnl_link_stats(stats
, nl_attr_get(a
));
5394 VLOG_WARN_RL(&rl
, "RTM_GETLINK reply lacks stats");
5399 VLOG_WARN_RL(&rl
, "short RTM_GETLINK reply");
5404 ofpbuf_delete(reply
);
5409 get_flags(const struct netdev
*dev
, unsigned int *flags
)
5415 error
= af_inet_ifreq_ioctl(dev
->name
, &ifr
, SIOCGIFFLAGS
, "SIOCGIFFLAGS");
5417 *flags
= ifr
.ifr_flags
;
5423 set_flags(const char *name
, unsigned int flags
)
5427 ifr
.ifr_flags
= flags
;
5428 return af_inet_ifreq_ioctl(name
, &ifr
, SIOCSIFFLAGS
, "SIOCSIFFLAGS");
5432 do_get_ifindex(const char *netdev_name
)
5437 ovs_strzcpy(ifr
.ifr_name
, netdev_name
, sizeof ifr
.ifr_name
);
5438 COVERAGE_INC(netdev_get_ifindex
);
5440 error
= af_inet_ioctl(SIOCGIFINDEX
, &ifr
);
5442 VLOG_WARN_RL(&rl
, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5443 netdev_name
, ovs_strerror(error
));
5446 return ifr
.ifr_ifindex
;
5450 get_ifindex(const struct netdev
*netdev_
, int *ifindexp
)
5452 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5454 if (!(netdev
->cache_valid
& VALID_IFINDEX
)) {
5455 int ifindex
= do_get_ifindex(netdev_get_name(netdev_
));
5458 netdev
->get_ifindex_error
= -ifindex
;
5459 netdev
->ifindex
= 0;
5461 netdev
->get_ifindex_error
= 0;
5462 netdev
->ifindex
= ifindex
;
5464 netdev
->cache_valid
|= VALID_IFINDEX
;
5467 *ifindexp
= netdev
->ifindex
;
5468 return netdev
->get_ifindex_error
;
5472 get_etheraddr(const char *netdev_name
, struct eth_addr
*ea
)
5478 memset(&ifr
, 0, sizeof ifr
);
5479 ovs_strzcpy(ifr
.ifr_name
, netdev_name
, sizeof ifr
.ifr_name
);
5480 COVERAGE_INC(netdev_get_hwaddr
);
5481 error
= af_inet_ioctl(SIOCGIFHWADDR
, &ifr
);
5483 /* ENODEV probably means that a vif disappeared asynchronously and
5484 * hasn't been removed from the database yet, so reduce the log level
5485 * to INFO for that case. */
5486 VLOG(error
== ENODEV
? VLL_INFO
: VLL_ERR
,
5487 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
5488 netdev_name
, ovs_strerror(error
));
5491 hwaddr_family
= ifr
.ifr_hwaddr
.sa_family
;
5492 if (hwaddr_family
!= AF_UNSPEC
&& hwaddr_family
!= ARPHRD_ETHER
) {
5493 VLOG_INFO("%s device has unknown hardware address family %d",
5494 netdev_name
, hwaddr_family
);
5497 memcpy(ea
, ifr
.ifr_hwaddr
.sa_data
, ETH_ADDR_LEN
);
5502 set_etheraddr(const char *netdev_name
, const struct eth_addr mac
)
5507 memset(&ifr
, 0, sizeof ifr
);
5508 ovs_strzcpy(ifr
.ifr_name
, netdev_name
, sizeof ifr
.ifr_name
);
5509 ifr
.ifr_hwaddr
.sa_family
= ARPHRD_ETHER
;
5510 memcpy(ifr
.ifr_hwaddr
.sa_data
, &mac
, ETH_ADDR_LEN
);
5511 COVERAGE_INC(netdev_set_hwaddr
);
5512 error
= af_inet_ioctl(SIOCSIFHWADDR
, &ifr
);
5514 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
5515 netdev_name
, ovs_strerror(error
));
5521 netdev_linux_do_ethtool(const char *name
, struct ethtool_cmd
*ecmd
,
5522 int cmd
, const char *cmd_name
)
5527 memset(&ifr
, 0, sizeof ifr
);
5528 ovs_strzcpy(ifr
.ifr_name
, name
, sizeof ifr
.ifr_name
);
5529 ifr
.ifr_data
= (caddr_t
) ecmd
;
5532 error
= af_inet_ioctl(SIOCETHTOOL
, &ifr
);
5534 if (error
!= EOPNOTSUPP
) {
5535 VLOG_WARN_RL(&rl
, "ethtool command %s on network device %s "
5536 "failed: %s", cmd_name
, name
, ovs_strerror(error
));
5538 /* The device doesn't support this operation. That's pretty
5539 * common, so there's no point in logging anything. */
5545 /* Returns an AF_PACKET raw socket or a negative errno value. */
5547 af_packet_sock(void)
5549 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
5552 if (ovsthread_once_start(&once
)) {
5553 sock
= socket(AF_PACKET
, SOCK_RAW
, 0);
5555 int error
= set_nonblocking(sock
);
5562 VLOG_ERR("failed to create packet socket: %s",
5563 ovs_strerror(errno
));
5565 ovsthread_once_done(&once
);