/*
 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <sys/utsname.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/if_packet.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
52 #include "dp-packet.h"
53 #include "dpif-netlink.h"
54 #include "dpif-netdev.h"
55 #include "openvswitch/dynamic-string.h"
56 #include "fatal-signal.h"
59 #include "netdev-provider.h"
60 #include "netdev-vport.h"
61 #include "netlink-notifier.h"
62 #include "netlink-socket.h"
64 #include "openvswitch/ofpbuf.h"
65 #include "openflow/openflow.h"
66 #include "ovs-atomic.h"
68 #include "poll-loop.h"
69 #include "rtnetlink.h"
71 #include "socket-util.h"
74 #include "unaligned.h"
75 #include "openvswitch/vlog.h"
77 VLOG_DEFINE_THIS_MODULE(netdev_linux
);
79 COVERAGE_DEFINE(netdev_set_policing
);
80 COVERAGE_DEFINE(netdev_arp_lookup
);
81 COVERAGE_DEFINE(netdev_get_ifindex
);
82 COVERAGE_DEFINE(netdev_get_hwaddr
);
83 COVERAGE_DEFINE(netdev_set_hwaddr
);
84 COVERAGE_DEFINE(netdev_get_ethtool
);
85 COVERAGE_DEFINE(netdev_set_ethtool
);
88 /* These were introduced in Linux 2.6.14, so they might be missing if we have
90 #ifndef ADVERTISED_Pause
91 #define ADVERTISED_Pause (1 << 13)
93 #ifndef ADVERTISED_Asym_Pause
94 #define ADVERTISED_Asym_Pause (1 << 14)
97 /* These were introduced in Linux 2.6.24, so they might be missing if we
98 * have old headers. */
99 #ifndef ETHTOOL_GFLAGS
100 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
102 #ifndef ETHTOOL_SFLAGS
103 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
106 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
109 #define TC_RTAB_SIZE 1024
112 /* Linux 2.6.21 introduced struct tpacket_auxdata.
113 * Linux 2.6.27 added the tp_vlan_tci member.
114 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
115 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
116 * TP_STATUS_VLAN_TPID_VALID.
118 * With all this churn it's easiest to unconditionally define a replacement
119 * structure that has everything we want.
121 #ifndef PACKET_AUXDATA
122 #define PACKET_AUXDATA 8
124 #ifndef TP_STATUS_VLAN_VALID
125 #define TP_STATUS_VLAN_VALID (1 << 4)
127 #ifndef TP_STATUS_VLAN_TPID_VALID
128 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
130 #undef tpacket_auxdata
131 #define tpacket_auxdata rpl_tpacket_auxdata
132 struct tpacket_auxdata
{
138 uint16_t tp_vlan_tci
;
139 uint16_t tp_vlan_tpid
;
142 /* Linux 2.6.27 introduced ethtool_cmd_speed
144 * To avoid revisiting problems reported with using configure to detect
145 * compatibility (see report at
146 * http://openvswitch.org/pipermail/dev/2014-October/047978.html)
147 * unconditionally replace ethtool_cmd_speed. */
148 #define ethtool_cmd_speed rpl_ethtool_cmd_speed
/* Reassembles the link speed from the split representation used by older
 * 'struct ethtool_cmd' layouts: 'speed' carries the low 16 bits and
 * 'speed_hi' the high 16 bits. */
static inline uint32_t
rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
{
    uint32_t hi = ep->speed_hi;
    uint32_t lo = ep->speed;

    return (hi << 16) | lo;
}
154 /* Linux 2.6.30 introduced supported and advertised flags for
155 * 1G base KX, and 10G base KX4, KR and R. */
156 #ifndef SUPPORTED_1000baseKX_Full
157 #define SUPPORTED_1000baseKX_Full (1 << 17)
158 #define SUPPORTED_10000baseKX4_Full (1 << 18)
159 #define SUPPORTED_10000baseKR_Full (1 << 19)
160 #define SUPPORTED_10000baseR_FEC (1 << 20)
161 #define ADVERTISED_1000baseKX_Full (1 << 17)
162 #define ADVERTISED_10000baseKX4_Full (1 << 18)
163 #define ADVERTISED_10000baseKR_Full (1 << 19)
164 #define ADVERTISED_10000baseR_FEC (1 << 20)
167 /* Linux 3.5 introduced supported and advertised flags for
168 * 40G base KR4, CR4, SR4 and LR4. */
169 #ifndef SUPPORTED_40000baseKR4_Full
170 #define SUPPORTED_40000baseKR4_Full (1 << 23)
171 #define SUPPORTED_40000baseCR4_Full (1 << 24)
172 #define SUPPORTED_40000baseSR4_Full (1 << 25)
173 #define SUPPORTED_40000baseLR4_Full (1 << 26)
174 #define ADVERTISED_40000baseKR4_Full (1 << 23)
175 #define ADVERTISED_40000baseCR4_Full (1 << 24)
176 #define ADVERTISED_40000baseSR4_Full (1 << 25)
177 #define ADVERTISED_40000baseLR4_Full (1 << 26)
180 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
182 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
183 * 2.6.32-431.29.2.el6.x86_64 (see report at
184 * http://openvswitch.org/pipermail/dev/2014-October/047978.html). Maybe
185 * if_link.h is not self-contained on those kernels. It is easiest to
186 * unconditionally define a replacement. */
188 #define IFLA_STATS64 23
190 #define rtnl_link_stats64 rpl_rtnl_link_stats64
191 struct rtnl_link_stats64
{
203 uint64_t rx_length_errors
;
204 uint64_t rx_over_errors
;
205 uint64_t rx_crc_errors
;
206 uint64_t rx_frame_errors
;
207 uint64_t rx_fifo_errors
;
208 uint64_t rx_missed_errors
;
210 uint64_t tx_aborted_errors
;
211 uint64_t tx_carrier_errors
;
212 uint64_t tx_fifo_errors
;
213 uint64_t tx_heartbeat_errors
;
214 uint64_t tx_window_errors
;
216 uint64_t rx_compressed
;
217 uint64_t tx_compressed
;
221 VALID_IFINDEX
= 1 << 0,
222 VALID_ETHERADDR
= 1 << 1,
225 VALID_POLICING
= 1 << 4,
226 VALID_VPORT_STAT_ERROR
= 1 << 5,
227 VALID_DRVINFO
= 1 << 6,
228 VALID_FEATURES
= 1 << 7,
231 /* Traffic control. */
233 /* An instance of a traffic control class. Always associated with a particular
236 * Each TC implementation subclasses this with whatever additional data it
239 const struct tc_ops
*ops
;
240 struct hmap queues
; /* Contains "struct tc_queue"s.
241 * Read by generic TC layer.
242 * Written only by TC implementation. */
245 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
247 /* One traffic control queue.
249 * Each TC implementation subclasses this with whatever additional data it
252 struct hmap_node hmap_node
; /* In struct tc's "queues" hmap. */
253 unsigned int queue_id
; /* OpenFlow queue ID. */
254 long long int created
; /* Time queue was created, in msecs. */
257 /* A particular kind of traffic control. Each implementation generally maps to
258 * one particular Linux qdisc class.
260 * The functions below return 0 if successful or a positive errno value on
261 * failure, except where otherwise noted. All of them must be provided, except
262 * where otherwise noted. */
264 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
265 * This is null for tc_ops_default and tc_ops_other, for which there are no
266 * appropriate values. */
267 const char *linux_name
;
269 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
270 const char *ovs_name
;
272 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
273 * queues. The queues are numbered 0 through n_queues - 1. */
274 unsigned int n_queues
;
276 /* Called to install this TC class on 'netdev'. The implementation should
277 * make the Netlink calls required to set up 'netdev' with the right qdisc
278 * and configure it according to 'details'. The implementation may assume
279 * that the current qdisc is the default; that is, there is no need for it
280 * to delete the current qdisc before installing itself.
282 * The contents of 'details' should be documented as valid for 'ovs_name'
283 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
284 * (which is built as ovs-vswitchd.conf.db(8)).
286 * This function must return 0 if and only if it sets 'netdev->tc' to an
287 * initialized 'struct tc'.
289 * (This function is null for tc_ops_other, which cannot be installed. For
290 * other TC classes it should always be nonnull.) */
291 int (*tc_install
)(struct netdev
*netdev
, const struct smap
*details
);
293 /* Called when the netdev code determines (through a Netlink query) that
294 * this TC class's qdisc is installed on 'netdev', but we didn't install
295 * it ourselves and so don't know any of the details.
297 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
298 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
299 * implementation should parse the other attributes of 'nlmsg' as
300 * necessary to determine its configuration. If necessary it should also
301 * use Netlink queries to determine the configuration of queues on
304 * This function must return 0 if and only if it sets 'netdev->tc' to an
305 * initialized 'struct tc'. */
306 int (*tc_load
)(struct netdev
*netdev
, struct ofpbuf
*nlmsg
);
308 /* Destroys the data structures allocated by the implementation as part of
309 * 'tc'. (This includes destroying 'tc->queues' by calling
312 * The implementation should not need to perform any Netlink calls. If
313 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
314 * (But it may not be desirable.)
316 * This function may be null if 'tc' is trivial. */
317 void (*tc_destroy
)(struct tc
*tc
);
319 /* Retrieves details of 'netdev->tc' configuration into 'details'.
321 * The implementation should not need to perform any Netlink calls, because
322 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
323 * cached the configuration.
325 * The contents of 'details' should be documented as valid for 'ovs_name'
326 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
327 * (which is built as ovs-vswitchd.conf.db(8)).
329 * This function may be null if 'tc' is not configurable.
331 int (*qdisc_get
)(const struct netdev
*netdev
, struct smap
*details
);
333 /* Reconfigures 'netdev->tc' according to 'details', performing any
334 * required Netlink calls to complete the reconfiguration.
336 * The contents of 'details' should be documented as valid for 'ovs_name'
337 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
338 * (which is built as ovs-vswitchd.conf.db(8)).
340 * This function may be null if 'tc' is not configurable.
342 int (*qdisc_set
)(struct netdev
*, const struct smap
*details
);
344 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
345 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
347 * The contents of 'details' should be documented as valid for 'ovs_name'
348 * in the "other_config" column in the "Queue" table in
349 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
351 * The implementation should not need to perform any Netlink calls, because
352 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
353 * cached the queue configuration.
355 * This function may be null if 'tc' does not have queues ('n_queues' is
357 int (*class_get
)(const struct netdev
*netdev
, const struct tc_queue
*queue
,
358 struct smap
*details
);
360 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
361 * 'details', perfoming any required Netlink calls to complete the
362 * reconfiguration. The caller ensures that 'queue_id' is less than
365 * The contents of 'details' should be documented as valid for 'ovs_name'
366 * in the "other_config" column in the "Queue" table in
367 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
369 * This function may be null if 'tc' does not have queues or its queues are
370 * not configurable. */
371 int (*class_set
)(struct netdev
*, unsigned int queue_id
,
372 const struct smap
*details
);
374 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
375 * tc_queue's within 'netdev->tc->queues'.
377 * This function may be null if 'tc' does not have queues or its queues
378 * cannot be deleted. */
379 int (*class_delete
)(struct netdev
*, struct tc_queue
*queue
);
381 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
382 * 'struct tc_queue's within 'netdev->tc->queues'.
384 * On success, initializes '*stats'.
386 * This function may be null if 'tc' does not have queues or if it cannot
387 * report queue statistics. */
388 int (*class_get_stats
)(const struct netdev
*netdev
,
389 const struct tc_queue
*queue
,
390 struct netdev_queue_stats
*stats
);
392 /* Extracts queue stats from 'nlmsg', which is a response to a
393 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
395 * This function may be null if 'tc' does not have queues or if it cannot
396 * report queue statistics. */
397 int (*class_dump_stats
)(const struct netdev
*netdev
,
398 const struct ofpbuf
*nlmsg
,
399 netdev_dump_queue_stats_cb
*cb
, void *aux
);
403 tc_init(struct tc
*tc
, const struct tc_ops
*ops
)
406 hmap_init(&tc
->queues
);
410 tc_destroy(struct tc
*tc
)
412 hmap_destroy(&tc
->queues
);
415 static const struct tc_ops tc_ops_htb
;
416 static const struct tc_ops tc_ops_hfsc
;
417 static const struct tc_ops tc_ops_codel
;
418 static const struct tc_ops tc_ops_fqcodel
;
419 static const struct tc_ops tc_ops_sfq
;
420 static const struct tc_ops tc_ops_default
;
421 static const struct tc_ops tc_ops_noop
;
422 static const struct tc_ops tc_ops_other
;
424 static const struct tc_ops
*const tcs
[] = {
425 &tc_ops_htb
, /* Hierarchy token bucket (see tc-htb(8)). */
426 &tc_ops_hfsc
, /* Hierarchical fair service curve. */
427 &tc_ops_codel
, /* Controlled delay */
428 &tc_ops_fqcodel
, /* Fair queue controlled delay */
429 &tc_ops_sfq
, /* Stochastic fair queueing */
430 &tc_ops_noop
, /* Non operating qos type. */
431 &tc_ops_default
, /* Default qdisc (see tc-pfifo_fast(8)). */
432 &tc_ops_other
, /* Some other qdisc. */
436 static unsigned int tc_make_handle(unsigned int major
, unsigned int minor
);
437 static unsigned int tc_get_major(unsigned int handle
);
438 static unsigned int tc_get_minor(unsigned int handle
);
440 static unsigned int tc_ticks_to_bytes(unsigned int rate
, unsigned int ticks
);
441 static unsigned int tc_bytes_to_ticks(unsigned int rate
, unsigned int size
);
442 static unsigned int tc_buffer_per_jiffy(unsigned int rate
);
444 static struct tcmsg
*tc_make_request(const struct netdev
*, int type
,
445 unsigned int flags
, struct ofpbuf
*);
446 static int tc_transact(struct ofpbuf
*request
, struct ofpbuf
**replyp
);
447 static int tc_add_del_ingress_qdisc(struct netdev
*netdev
, bool add
);
448 static int tc_add_policer(struct netdev
*,
449 uint32_t kbits_rate
, uint32_t kbits_burst
);
451 static int tc_parse_qdisc(const struct ofpbuf
*, const char **kind
,
452 struct nlattr
**options
);
453 static int tc_parse_class(const struct ofpbuf
*, unsigned int *queue_id
,
454 struct nlattr
**options
,
455 struct netdev_queue_stats
*);
456 static int tc_query_class(const struct netdev
*,
457 unsigned int handle
, unsigned int parent
,
458 struct ofpbuf
**replyp
);
459 static int tc_delete_class(const struct netdev
*, unsigned int handle
);
461 static int tc_del_qdisc(struct netdev
*netdev
);
462 static int tc_query_qdisc(const struct netdev
*netdev
);
464 static int tc_calc_cell_log(unsigned int mtu
);
465 static void tc_fill_rate(struct tc_ratespec
*rate
, uint64_t bps
, int mtu
);
466 static void tc_put_rtab(struct ofpbuf
*, uint16_t type
,
467 const struct tc_ratespec
*rate
);
468 static int tc_calc_buffer(unsigned int Bps
, int mtu
, uint64_t burst_bytes
);
470 struct netdev_linux
{
473 /* Protects all members below. */
474 struct ovs_mutex mutex
;
476 unsigned int cache_valid
;
478 bool miimon
; /* Link status of last poll. */
479 long long int miimon_interval
; /* Miimon Poll rate. Disabled if <= 0. */
480 struct timer miimon_timer
;
482 /* The following are figured out "on demand" only. They are only valid
483 * when the corresponding VALID_* bit in 'cache_valid' is set. */
485 struct eth_addr etheraddr
;
487 unsigned int ifi_flags
;
488 long long int carrier_resets
;
489 uint32_t kbits_rate
; /* Policing data. */
490 uint32_t kbits_burst
;
491 int vport_stats_error
; /* Cached error code from vport_get_stats().
492 0 or an errno value. */
493 int netdev_mtu_error
; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
494 int ether_addr_error
; /* Cached error code from set/get etheraddr. */
495 int netdev_policing_error
; /* Cached error code from set policing. */
496 int get_features_error
; /* Cached error code from ETHTOOL_GSET. */
497 int get_ifindex_error
; /* Cached error code from SIOCGIFINDEX. */
499 enum netdev_features current
; /* Cached from ETHTOOL_GSET. */
500 enum netdev_features advertised
; /* Cached from ETHTOOL_GSET. */
501 enum netdev_features supported
; /* Cached from ETHTOOL_GSET. */
503 struct ethtool_drvinfo drvinfo
; /* Cached from ETHTOOL_GDRVINFO. */
506 /* For devices of class netdev_tap_class only. */
510 struct netdev_rxq_linux
{
511 struct netdev_rxq up
;
516 /* This is set pretty low because we probably won't learn anything from the
517 * additional log messages. */
518 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(5, 20);
520 /* Polling miimon status for all ports causes performance degradation when
521 * handling a large number of ports. If there are no devices using miimon, then
522 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
524 * Readers do not depend on this variable synchronizing with the related
525 * changes in the device miimon status, so we can use atomic_count. */
526 static atomic_count miimon_cnt
= ATOMIC_COUNT_INIT(0);
528 static void netdev_linux_run(void);
530 static int netdev_linux_do_ethtool(const char *name
, struct ethtool_cmd
*,
531 int cmd
, const char *cmd_name
);
532 static int get_flags(const struct netdev
*, unsigned int *flags
);
533 static int set_flags(const char *, unsigned int flags
);
534 static int update_flags(struct netdev_linux
*netdev
, enum netdev_flags off
,
535 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
536 OVS_REQUIRES(netdev
->mutex
);
537 static int do_get_ifindex(const char *netdev_name
);
538 static int get_ifindex(const struct netdev
*, int *ifindexp
);
539 static int do_set_addr(struct netdev
*netdev
,
540 int ioctl_nr
, const char *ioctl_name
,
541 struct in_addr addr
);
542 static int get_etheraddr(const char *netdev_name
, struct eth_addr
*ea
);
543 static int set_etheraddr(const char *netdev_name
, const struct eth_addr
);
544 static int get_stats_via_netlink(const struct netdev
*, struct netdev_stats
*);
545 static int af_packet_sock(void);
546 static bool netdev_linux_miimon_enabled(void);
547 static void netdev_linux_miimon_run(void);
548 static void netdev_linux_miimon_wait(void);
549 static int netdev_linux_get_mtu__(struct netdev_linux
*netdev
, int *mtup
);
552 is_netdev_linux_class(const struct netdev_class
*netdev_class
)
554 return netdev_class
->run
== netdev_linux_run
;
558 is_tap_netdev(const struct netdev
*netdev
)
560 return netdev_get_class(netdev
) == &netdev_tap_class
;
563 static struct netdev_linux
*
564 netdev_linux_cast(const struct netdev
*netdev
)
566 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev
)));
568 return CONTAINER_OF(netdev
, struct netdev_linux
, up
);
571 static struct netdev_rxq_linux
*
572 netdev_rxq_linux_cast(const struct netdev_rxq
*rx
)
574 ovs_assert(is_netdev_linux_class(netdev_get_class(rx
->netdev
)));
575 return CONTAINER_OF(rx
, struct netdev_rxq_linux
, up
);
578 static void netdev_linux_update(struct netdev_linux
*netdev
,
579 const struct rtnetlink_change
*)
580 OVS_REQUIRES(netdev
->mutex
);
581 static void netdev_linux_changed(struct netdev_linux
*netdev
,
582 unsigned int ifi_flags
, unsigned int mask
)
583 OVS_REQUIRES(netdev
->mutex
);
585 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
586 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
587 * if no such socket could be created. */
588 static struct nl_sock
*
589 netdev_linux_notify_sock(void)
591 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
592 static struct nl_sock
*sock
;
593 unsigned int mcgroups
[] = {RTNLGRP_LINK
, RTNLGRP_IPV4_IFADDR
,
594 RTNLGRP_IPV6_IFADDR
, RTNLGRP_IPV6_IFINFO
};
596 if (ovsthread_once_start(&once
)) {
599 error
= nl_sock_create(NETLINK_ROUTE
, &sock
);
603 for (i
= 0; i
< ARRAY_SIZE(mcgroups
); i
++) {
604 error
= nl_sock_join_mcgroup(sock
, mcgroups
[i
]);
606 nl_sock_destroy(sock
);
612 ovsthread_once_done(&once
);
619 netdev_linux_miimon_enabled(void)
621 return atomic_count_get(&miimon_cnt
) > 0;
625 netdev_linux_run(void)
627 struct nl_sock
*sock
;
630 if (netdev_linux_miimon_enabled()) {
631 netdev_linux_miimon_run();
634 sock
= netdev_linux_notify_sock();
640 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(1, 5);
641 uint64_t buf_stub
[4096 / 8];
644 ofpbuf_use_stub(&buf
, buf_stub
, sizeof buf_stub
);
645 error
= nl_sock_recv(sock
, &buf
, false);
647 struct rtnetlink_change change
;
649 if (rtnetlink_parse(&buf
, &change
)) {
650 struct netdev
*netdev_
= NULL
;
651 char dev_name
[IFNAMSIZ
];
653 if (!change
.ifname
) {
654 change
.ifname
= if_indextoname(change
.if_index
, dev_name
);
658 netdev_
= netdev_from_name(change
.ifname
);
660 if (netdev_
&& is_netdev_linux_class(netdev_
->netdev_class
)) {
661 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
663 ovs_mutex_lock(&netdev
->mutex
);
664 netdev_linux_update(netdev
, &change
);
665 ovs_mutex_unlock(&netdev
->mutex
);
667 netdev_close(netdev_
);
669 } else if (error
== ENOBUFS
) {
670 struct shash device_shash
;
671 struct shash_node
*node
;
675 shash_init(&device_shash
);
676 netdev_get_devices(&netdev_linux_class
, &device_shash
);
677 SHASH_FOR_EACH (node
, &device_shash
) {
678 struct netdev
*netdev_
= node
->data
;
679 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
682 ovs_mutex_lock(&netdev
->mutex
);
683 get_flags(netdev_
, &flags
);
684 netdev_linux_changed(netdev
, flags
, 0);
685 ovs_mutex_unlock(&netdev
->mutex
);
687 netdev_close(netdev_
);
689 shash_destroy(&device_shash
);
690 } else if (error
!= EAGAIN
) {
691 VLOG_WARN_RL(&rl
, "error reading or parsing netlink (%s)",
692 ovs_strerror(error
));
699 netdev_linux_wait(void)
701 struct nl_sock
*sock
;
703 if (netdev_linux_miimon_enabled()) {
704 netdev_linux_miimon_wait();
706 sock
= netdev_linux_notify_sock();
708 nl_sock_wait(sock
, POLLIN
);
713 netdev_linux_changed(struct netdev_linux
*dev
,
714 unsigned int ifi_flags
, unsigned int mask
)
715 OVS_REQUIRES(dev
->mutex
)
717 netdev_change_seq_changed(&dev
->up
);
719 if ((dev
->ifi_flags
^ ifi_flags
) & IFF_RUNNING
) {
720 dev
->carrier_resets
++;
722 dev
->ifi_flags
= ifi_flags
;
724 dev
->cache_valid
&= mask
;
725 if (!(mask
& VALID_IN
)) {
726 netdev_get_addrs_list_flush();
731 netdev_linux_update(struct netdev_linux
*dev
,
732 const struct rtnetlink_change
*change
)
733 OVS_REQUIRES(dev
->mutex
)
735 if (rtnetlink_type_is_rtnlgrp_link(change
->nlmsg_type
)){
736 if (change
->nlmsg_type
== RTM_NEWLINK
) {
737 /* Keep drv-info, and ip addresses. */
738 netdev_linux_changed(dev
, change
->ifi_flags
,
739 VALID_DRVINFO
| VALID_IN
);
741 /* Update netdev from rtnl-change msg. */
743 dev
->mtu
= change
->mtu
;
744 dev
->cache_valid
|= VALID_MTU
;
745 dev
->netdev_mtu_error
= 0;
748 if (!eth_addr_is_zero(change
->mac
)) {
749 dev
->etheraddr
= change
->mac
;
750 dev
->cache_valid
|= VALID_ETHERADDR
;
751 dev
->ether_addr_error
= 0;
754 dev
->ifindex
= change
->if_index
;
755 dev
->cache_valid
|= VALID_IFINDEX
;
756 dev
->get_ifindex_error
= 0;
758 netdev_linux_changed(dev
, change
->ifi_flags
, 0);
760 } else if (rtnetlink_type_is_rtnlgrp_addr(change
->nlmsg_type
)) {
761 /* Invalidates in4, in6. */
762 netdev_linux_changed(dev
, dev
->ifi_flags
, ~VALID_IN
);
768 static struct netdev
*
769 netdev_linux_alloc(void)
771 struct netdev_linux
*netdev
= xzalloc(sizeof *netdev
);
776 netdev_linux_common_construct(struct netdev_linux
*netdev
)
778 ovs_mutex_init(&netdev
->mutex
);
781 /* Creates system and internal devices. */
783 netdev_linux_construct(struct netdev
*netdev_
)
785 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
788 netdev_linux_common_construct(netdev
);
790 error
= get_flags(&netdev
->up
, &netdev
->ifi_flags
);
791 if (error
== ENODEV
) {
792 if (netdev
->up
.netdev_class
!= &netdev_internal_class
) {
793 /* The device does not exist, so don't allow it to be opened. */
796 /* "Internal" netdevs have to be created as netdev objects before
797 * they exist in the kernel, because creating them in the kernel
798 * happens by passing a netdev object to dpif_port_add().
799 * Therefore, ignore the error. */
806 /* For most types of netdevs we open the device for each call of
807 * netdev_open(). However, this is not the case with tap devices,
808 * since it is only possible to open the device once. In this
809 * situation we share a single file descriptor, and consequently
810 * buffers, across all readers. Therefore once data is read it will
811 * be unavailable to other reads for tap devices. */
813 netdev_linux_construct_tap(struct netdev
*netdev_
)
815 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
816 static const char tap_dev
[] = "/dev/net/tun";
817 const char *name
= netdev_
->name
;
821 netdev_linux_common_construct(netdev
);
823 /* Open tap device. */
824 netdev
->tap_fd
= open(tap_dev
, O_RDWR
);
825 if (netdev
->tap_fd
< 0) {
827 VLOG_WARN("opening \"%s\" failed: %s", tap_dev
, ovs_strerror(error
));
831 /* Create tap device. */
832 ifr
.ifr_flags
= IFF_TAP
| IFF_NO_PI
;
833 ovs_strzcpy(ifr
.ifr_name
, name
, sizeof ifr
.ifr_name
);
834 if (ioctl(netdev
->tap_fd
, TUNSETIFF
, &ifr
) == -1) {
835 VLOG_WARN("%s: creating tap device failed: %s", name
,
836 ovs_strerror(errno
));
841 /* Make non-blocking. */
842 error
= set_nonblocking(netdev
->tap_fd
);
850 close(netdev
->tap_fd
);
855 netdev_linux_destruct(struct netdev
*netdev_
)
857 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
859 if (netdev
->tc
&& netdev
->tc
->ops
->tc_destroy
) {
860 netdev
->tc
->ops
->tc_destroy(netdev
->tc
);
863 if (netdev_get_class(netdev_
) == &netdev_tap_class
864 && netdev
->tap_fd
>= 0)
866 close(netdev
->tap_fd
);
869 if (netdev
->miimon_interval
> 0) {
870 atomic_count_dec(&miimon_cnt
);
873 ovs_mutex_destroy(&netdev
->mutex
);
877 netdev_linux_dealloc(struct netdev
*netdev_
)
879 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
883 static struct netdev_rxq
*
884 netdev_linux_rxq_alloc(void)
886 struct netdev_rxq_linux
*rx
= xzalloc(sizeof *rx
);
891 netdev_linux_rxq_construct(struct netdev_rxq
*rxq_
)
893 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
894 struct netdev
*netdev_
= rx
->up
.netdev
;
895 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
898 ovs_mutex_lock(&netdev
->mutex
);
899 rx
->is_tap
= is_tap_netdev(netdev_
);
901 rx
->fd
= netdev
->tap_fd
;
903 struct sockaddr_ll sll
;
905 /* Result of tcpdump -dd inbound */
906 static const struct sock_filter filt
[] = {
907 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
908 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
909 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
910 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
912 static const struct sock_fprog fprog
= {
913 ARRAY_SIZE(filt
), (struct sock_filter
*) filt
916 /* Create file descriptor. */
917 rx
->fd
= socket(PF_PACKET
, SOCK_RAW
, 0);
920 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error
));
925 if (setsockopt(rx
->fd
, SOL_PACKET
, PACKET_AUXDATA
, &val
, sizeof val
)) {
927 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
928 netdev_get_name(netdev_
), ovs_strerror(error
));
932 /* Set non-blocking mode. */
933 error
= set_nonblocking(rx
->fd
);
938 /* Get ethernet device index. */
939 error
= get_ifindex(&netdev
->up
, &ifindex
);
944 /* Bind to specific ethernet device. */
945 memset(&sll
, 0, sizeof sll
);
946 sll
.sll_family
= AF_PACKET
;
947 sll
.sll_ifindex
= ifindex
;
948 sll
.sll_protocol
= htons(ETH_P_ALL
);
949 if (bind(rx
->fd
, (struct sockaddr
*) &sll
, sizeof sll
) < 0) {
951 VLOG_ERR("%s: failed to bind raw socket (%s)",
952 netdev_get_name(netdev_
), ovs_strerror(error
));
956 /* Filter for only inbound packets. */
957 error
= setsockopt(rx
->fd
, SOL_SOCKET
, SO_ATTACH_FILTER
, &fprog
,
961 VLOG_ERR("%s: failed to attach filter (%s)",
962 netdev_get_name(netdev_
), ovs_strerror(error
));
966 ovs_mutex_unlock(&netdev
->mutex
);
974 ovs_mutex_unlock(&netdev
->mutex
);
979 netdev_linux_rxq_destruct(struct netdev_rxq
*rxq_
)
981 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
989 netdev_linux_rxq_dealloc(struct netdev_rxq
*rxq_
)
991 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
997 auxdata_to_vlan_tpid(const struct tpacket_auxdata
*aux
)
999 if (aux
->tp_status
& TP_STATUS_VLAN_TPID_VALID
) {
1000 return htons(aux
->tp_vlan_tpid
);
1002 return htons(ETH_TYPE_VLAN
);
/* Returns true if 'aux' indicates a VLAN TCI is present: either the TCI
 * itself is nonzero, or the kernel set the explicit TP_STATUS_VLAN_VALID
 * status bit. */
static bool
auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
{
    return aux->tp_vlan_tci != 0
           || (aux->tp_status & TP_STATUS_VLAN_VALID) != 0;
}
1013 netdev_linux_rxq_recv_sock(int fd
, struct dp_packet
*buffer
)
1018 struct cmsghdr
*cmsg
;
1020 struct cmsghdr cmsg
;
1021 char buffer
[CMSG_SPACE(sizeof(struct tpacket_auxdata
))];
1025 /* Reserve headroom for a single VLAN tag */
1026 dp_packet_reserve(buffer
, VLAN_HEADER_LEN
);
1027 size
= dp_packet_tailroom(buffer
);
1029 iov
.iov_base
= dp_packet_data(buffer
);
1031 msgh
.msg_name
= NULL
;
1032 msgh
.msg_namelen
= 0;
1033 msgh
.msg_iov
= &iov
;
1034 msgh
.msg_iovlen
= 1;
1035 msgh
.msg_control
= &cmsg_buffer
;
1036 msgh
.msg_controllen
= sizeof cmsg_buffer
;
1040 retval
= recvmsg(fd
, &msgh
, MSG_TRUNC
);
1041 } while (retval
< 0 && errno
== EINTR
);
1045 } else if (retval
> size
) {
1049 dp_packet_set_size(buffer
, dp_packet_size(buffer
) + retval
);
1051 for (cmsg
= CMSG_FIRSTHDR(&msgh
); cmsg
; cmsg
= CMSG_NXTHDR(&msgh
, cmsg
)) {
1052 const struct tpacket_auxdata
*aux
;
1054 if (cmsg
->cmsg_level
!= SOL_PACKET
1055 || cmsg
->cmsg_type
!= PACKET_AUXDATA
1056 || cmsg
->cmsg_len
< CMSG_LEN(sizeof(struct tpacket_auxdata
))) {
1060 aux
= ALIGNED_CAST(struct tpacket_auxdata
*, CMSG_DATA(cmsg
));
1061 if (auxdata_has_vlan_tci(aux
)) {
1062 if (retval
< ETH_HEADER_LEN
) {
1066 eth_push_vlan(buffer
, auxdata_to_vlan_tpid(aux
),
1067 htons(aux
->tp_vlan_tci
));
1076 netdev_linux_rxq_recv_tap(int fd
, struct dp_packet
*buffer
)
1079 size_t size
= dp_packet_tailroom(buffer
);
1082 retval
= read(fd
, dp_packet_data(buffer
), size
);
1083 } while (retval
< 0 && errno
== EINTR
);
1089 dp_packet_set_size(buffer
, dp_packet_size(buffer
) + retval
);
1094 netdev_linux_rxq_recv(struct netdev_rxq
*rxq_
, struct dp_packet
**packets
,
1097 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1098 struct netdev
*netdev
= rx
->up
.netdev
;
1099 struct dp_packet
*buffer
;
1103 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
)) {
1104 mtu
= ETH_PAYLOAD_MAX
;
1107 buffer
= dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN
+ mtu
,
1108 DP_NETDEV_HEADROOM
);
1109 retval
= (rx
->is_tap
1110 ? netdev_linux_rxq_recv_tap(rx
->fd
, buffer
)
1111 : netdev_linux_rxq_recv_sock(rx
->fd
, buffer
));
1114 if (retval
!= EAGAIN
&& retval
!= EMSGSIZE
) {
1115 VLOG_WARN_RL(&rl
, "error receiving Ethernet packet on %s: %s",
1116 netdev_rxq_get_name(rxq_
), ovs_strerror(errno
));
1118 dp_packet_delete(buffer
);
1120 dp_packet_pad(buffer
);
1121 packets
[0] = buffer
;
1129 netdev_linux_rxq_wait(struct netdev_rxq
*rxq_
)
1131 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1132 poll_fd_wait(rx
->fd
, POLLIN
);
1136 netdev_linux_rxq_drain(struct netdev_rxq
*rxq_
)
1138 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1141 int error
= af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_
), &ifr
,
1142 SIOCGIFTXQLEN
, "SIOCGIFTXQLEN");
1146 drain_fd(rx
->fd
, ifr
.ifr_qlen
);
1149 return drain_rcvbuf(rx
->fd
);
1153 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1154 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1155 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1156 * the packet is too big or too small to transmit on the device.
1158 * The caller retains ownership of 'buffer' in all cases.
1160 * The kernel maintains a packet transmission queue, so the caller is not
1161 * expected to do additional queuing of packets. */
1163 netdev_linux_send(struct netdev
*netdev_
, int qid OVS_UNUSED
,
1164 struct dp_packet
**pkts
, int cnt
, bool may_steal
)
1169 /* 'i' is incremented only if there's no error */
1170 for (i
= 0; i
< cnt
;) {
1171 const void *data
= dp_packet_data(pkts
[i
]);
1172 size_t size
= dp_packet_size(pkts
[i
]);
1175 if (!is_tap_netdev(netdev_
)) {
1176 /* Use our AF_PACKET socket to send to this device. */
1177 struct sockaddr_ll sll
;
1183 sock
= af_packet_sock();
1188 ifindex
= netdev_get_ifindex(netdev_
);
1193 /* We don't bother setting most fields in sockaddr_ll because the
1194 * kernel ignores them for SOCK_RAW. */
1195 memset(&sll
, 0, sizeof sll
);
1196 sll
.sll_family
= AF_PACKET
;
1197 sll
.sll_ifindex
= ifindex
;
1199 iov
.iov_base
= CONST_CAST(void *, data
);
1202 msg
.msg_name
= &sll
;
1203 msg
.msg_namelen
= sizeof sll
;
1206 msg
.msg_control
= NULL
;
1207 msg
.msg_controllen
= 0;
1210 retval
= sendmsg(sock
, &msg
, 0);
1212 /* Use the tap fd to send to this device. This is essential for
1213 * tap devices, because packets sent to a tap device with an
1214 * AF_PACKET socket will loop back to be *received* again on the
1215 * tap device. This doesn't occur on other interface types
1216 * because we attach a socket filter to the rx socket. */
1217 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1219 retval
= write(netdev
->tap_fd
, data
, size
);
1223 /* The Linux AF_PACKET implementation never blocks waiting for room
1224 * for packets, instead returning ENOBUFS. Translate this into
1225 * EAGAIN for the caller. */
1226 error
= errno
== ENOBUFS
? EAGAIN
: errno
;
1227 if (error
== EINTR
) {
1228 /* continue without incrementing 'i', i.e. retry this packet */
1232 } else if (retval
!= size
) {
1233 VLOG_WARN_RL(&rl
, "sent partial Ethernet packet (%"PRIuSIZE
" bytes"
1234 " of %"PRIuSIZE
") on %s", retval
, size
,
1235 netdev_get_name(netdev_
));
1240 /* Process the next packet in the batch */
1245 for (i
= 0; i
< cnt
; i
++) {
1246 dp_packet_delete(pkts
[i
]);
1250 if (error
&& error
!= EAGAIN
) {
1251 VLOG_WARN_RL(&rl
, "error sending Ethernet packet on %s: %s",
1252 netdev_get_name(netdev_
), ovs_strerror(error
));
1259 /* Registers with the poll loop to wake up from the next call to poll_block()
1260 * when the packet transmission queue has sufficient room to transmit a packet
1261 * with netdev_send().
1263 * The kernel maintains a packet transmission queue, so the client is not
1264 * expected to do additional queuing of packets. Thus, this function is
1265 * unlikely to ever be used. It is included for completeness. */
1267 netdev_linux_send_wait(struct netdev
*netdev
, int qid OVS_UNUSED
)
1269 if (is_tap_netdev(netdev
)) {
1270 /* TAP device always accepts packets.*/
1271 poll_immediate_wake();
1275 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1276 * otherwise a positive errno value. */
1278 netdev_linux_set_etheraddr(struct netdev
*netdev_
, const struct eth_addr mac
)
1280 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1281 enum netdev_flags old_flags
= 0;
1284 ovs_mutex_lock(&netdev
->mutex
);
1286 if (netdev
->cache_valid
& VALID_ETHERADDR
) {
1287 error
= netdev
->ether_addr_error
;
1288 if (error
|| eth_addr_equals(netdev
->etheraddr
, mac
)) {
1291 netdev
->cache_valid
&= ~VALID_ETHERADDR
;
1294 /* Tap devices must be brought down before setting the address. */
1295 if (is_tap_netdev(netdev_
)) {
1296 update_flags(netdev
, NETDEV_UP
, 0, &old_flags
);
1298 error
= set_etheraddr(netdev_get_name(netdev_
), mac
);
1299 if (!error
|| error
== ENODEV
) {
1300 netdev
->ether_addr_error
= error
;
1301 netdev
->cache_valid
|= VALID_ETHERADDR
;
1303 netdev
->etheraddr
= mac
;
1307 if (is_tap_netdev(netdev_
) && old_flags
& NETDEV_UP
) {
1308 update_flags(netdev
, 0, NETDEV_UP
, &old_flags
);
1312 ovs_mutex_unlock(&netdev
->mutex
);
1316 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1318 netdev_linux_get_etheraddr(const struct netdev
*netdev_
, struct eth_addr
*mac
)
1320 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1323 ovs_mutex_lock(&netdev
->mutex
);
1324 if (!(netdev
->cache_valid
& VALID_ETHERADDR
)) {
1325 netdev
->ether_addr_error
= get_etheraddr(netdev_get_name(netdev_
),
1326 &netdev
->etheraddr
);
1327 netdev
->cache_valid
|= VALID_ETHERADDR
;
1330 error
= netdev
->ether_addr_error
;
1332 *mac
= netdev
->etheraddr
;
1334 ovs_mutex_unlock(&netdev
->mutex
);
1340 netdev_linux_get_mtu__(struct netdev_linux
*netdev
, int *mtup
)
1344 if (!(netdev
->cache_valid
& VALID_MTU
)) {
1347 netdev
->netdev_mtu_error
= af_inet_ifreq_ioctl(
1348 netdev_get_name(&netdev
->up
), &ifr
, SIOCGIFMTU
, "SIOCGIFMTU");
1349 netdev
->mtu
= ifr
.ifr_mtu
;
1350 netdev
->cache_valid
|= VALID_MTU
;
1353 error
= netdev
->netdev_mtu_error
;
1355 *mtup
= netdev
->mtu
;
1361 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1362 * in bytes, not including the hardware header; thus, this is typically 1500
1363 * bytes for Ethernet devices. */
1365 netdev_linux_get_mtu(const struct netdev
*netdev_
, int *mtup
)
1367 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1370 ovs_mutex_lock(&netdev
->mutex
);
1371 error
= netdev_linux_get_mtu__(netdev
, mtup
);
1372 ovs_mutex_unlock(&netdev
->mutex
);
1377 /* Sets the maximum size of transmitted (MTU) for given device using linux
1378 * networking ioctl interface.
1381 netdev_linux_set_mtu(const struct netdev
*netdev_
, int mtu
)
1383 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1387 ovs_mutex_lock(&netdev
->mutex
);
1388 if (netdev
->cache_valid
& VALID_MTU
) {
1389 error
= netdev
->netdev_mtu_error
;
1390 if (error
|| netdev
->mtu
== mtu
) {
1393 netdev
->cache_valid
&= ~VALID_MTU
;
1396 error
= af_inet_ifreq_ioctl(netdev_get_name(netdev_
), &ifr
,
1397 SIOCSIFMTU
, "SIOCSIFMTU");
1398 if (!error
|| error
== ENODEV
) {
1399 netdev
->netdev_mtu_error
= error
;
1400 netdev
->mtu
= ifr
.ifr_mtu
;
1401 netdev
->cache_valid
|= VALID_MTU
;
1404 ovs_mutex_unlock(&netdev
->mutex
);
1408 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1409 * On failure, returns a negative errno value. */
1411 netdev_linux_get_ifindex(const struct netdev
*netdev_
)
1413 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1416 ovs_mutex_lock(&netdev
->mutex
);
1417 error
= get_ifindex(netdev_
, &ifindex
);
1418 ovs_mutex_unlock(&netdev
->mutex
);
1420 return error
? -error
: ifindex
;
1424 netdev_linux_get_carrier(const struct netdev
*netdev_
, bool *carrier
)
1426 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1428 ovs_mutex_lock(&netdev
->mutex
);
1429 if (netdev
->miimon_interval
> 0) {
1430 *carrier
= netdev
->miimon
;
1432 *carrier
= (netdev
->ifi_flags
& IFF_RUNNING
) != 0;
1434 ovs_mutex_unlock(&netdev
->mutex
);
1439 static long long int
1440 netdev_linux_get_carrier_resets(const struct netdev
*netdev_
)
1442 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1443 long long int carrier_resets
;
1445 ovs_mutex_lock(&netdev
->mutex
);
1446 carrier_resets
= netdev
->carrier_resets
;
1447 ovs_mutex_unlock(&netdev
->mutex
);
1449 return carrier_resets
;
1453 netdev_linux_do_miimon(const char *name
, int cmd
, const char *cmd_name
,
1454 struct mii_ioctl_data
*data
)
1459 memset(&ifr
, 0, sizeof ifr
);
1460 memcpy(&ifr
.ifr_data
, data
, sizeof *data
);
1461 error
= af_inet_ifreq_ioctl(name
, &ifr
, cmd
, cmd_name
);
1462 memcpy(data
, &ifr
.ifr_data
, sizeof *data
);
1468 netdev_linux_get_miimon(const char *name
, bool *miimon
)
1470 struct mii_ioctl_data data
;
1475 memset(&data
, 0, sizeof data
);
1476 error
= netdev_linux_do_miimon(name
, SIOCGMIIPHY
, "SIOCGMIIPHY", &data
);
1478 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1479 data
.reg_num
= MII_BMSR
;
1480 error
= netdev_linux_do_miimon(name
, SIOCGMIIREG
, "SIOCGMIIREG",
1484 *miimon
= !!(data
.val_out
& BMSR_LSTATUS
);
1486 VLOG_WARN_RL(&rl
, "%s: failed to query MII", name
);
1489 struct ethtool_cmd ecmd
;
1491 VLOG_DBG_RL(&rl
, "%s: failed to query MII, falling back to ethtool",
1494 COVERAGE_INC(netdev_get_ethtool
);
1495 memset(&ecmd
, 0, sizeof ecmd
);
1496 error
= netdev_linux_do_ethtool(name
, &ecmd
, ETHTOOL_GLINK
,
1499 struct ethtool_value eval
;
1501 memcpy(&eval
, &ecmd
, sizeof eval
);
1502 *miimon
= !!eval
.data
;
1504 VLOG_WARN_RL(&rl
, "%s: ethtool link status failed", name
);
1512 netdev_linux_set_miimon_interval(struct netdev
*netdev_
,
1513 long long int interval
)
1515 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1517 ovs_mutex_lock(&netdev
->mutex
);
1518 interval
= interval
> 0 ? MAX(interval
, 100) : 0;
1519 if (netdev
->miimon_interval
!= interval
) {
1520 if (interval
&& !netdev
->miimon_interval
) {
1521 atomic_count_inc(&miimon_cnt
);
1522 } else if (!interval
&& netdev
->miimon_interval
) {
1523 atomic_count_dec(&miimon_cnt
);
1526 netdev
->miimon_interval
= interval
;
1527 timer_set_expired(&netdev
->miimon_timer
);
1529 ovs_mutex_unlock(&netdev
->mutex
);
1535 netdev_linux_miimon_run(void)
1537 struct shash device_shash
;
1538 struct shash_node
*node
;
1540 shash_init(&device_shash
);
1541 netdev_get_devices(&netdev_linux_class
, &device_shash
);
1542 SHASH_FOR_EACH (node
, &device_shash
) {
1543 struct netdev
*netdev
= node
->data
;
1544 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
1547 ovs_mutex_lock(&dev
->mutex
);
1548 if (dev
->miimon_interval
> 0 && timer_expired(&dev
->miimon_timer
)) {
1549 netdev_linux_get_miimon(dev
->up
.name
, &miimon
);
1550 if (miimon
!= dev
->miimon
) {
1551 dev
->miimon
= miimon
;
1552 netdev_linux_changed(dev
, dev
->ifi_flags
, 0);
1555 timer_set_duration(&dev
->miimon_timer
, dev
->miimon_interval
);
1557 ovs_mutex_unlock(&dev
->mutex
);
1558 netdev_close(netdev
);
1561 shash_destroy(&device_shash
);
1565 netdev_linux_miimon_wait(void)
1567 struct shash device_shash
;
1568 struct shash_node
*node
;
1570 shash_init(&device_shash
);
1571 netdev_get_devices(&netdev_linux_class
, &device_shash
);
1572 SHASH_FOR_EACH (node
, &device_shash
) {
1573 struct netdev
*netdev
= node
->data
;
1574 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
1576 ovs_mutex_lock(&dev
->mutex
);
1577 if (dev
->miimon_interval
> 0) {
1578 timer_wait(&dev
->miimon_timer
);
1580 ovs_mutex_unlock(&dev
->mutex
);
1581 netdev_close(netdev
);
1583 shash_destroy(&device_shash
);
1587 swap_uint64(uint64_t *a
, uint64_t *b
)
1594 /* Copies 'src' into 'dst', performing format conversion in the process.
1596 * 'src' is allowed to be misaligned. */
1598 netdev_stats_from_ovs_vport_stats(struct netdev_stats
*dst
,
1599 const struct ovs_vport_stats
*src
)
1601 dst
->rx_packets
= get_32aligned_u64(&src
->rx_packets
);
1602 dst
->tx_packets
= get_32aligned_u64(&src
->tx_packets
);
1603 dst
->rx_bytes
= get_32aligned_u64(&src
->rx_bytes
);
1604 dst
->tx_bytes
= get_32aligned_u64(&src
->tx_bytes
);
1605 dst
->rx_errors
= get_32aligned_u64(&src
->rx_errors
);
1606 dst
->tx_errors
= get_32aligned_u64(&src
->tx_errors
);
1607 dst
->rx_dropped
= get_32aligned_u64(&src
->rx_dropped
);
1608 dst
->tx_dropped
= get_32aligned_u64(&src
->tx_dropped
);
1610 dst
->collisions
= 0;
1611 dst
->rx_length_errors
= 0;
1612 dst
->rx_over_errors
= 0;
1613 dst
->rx_crc_errors
= 0;
1614 dst
->rx_frame_errors
= 0;
1615 dst
->rx_fifo_errors
= 0;
1616 dst
->rx_missed_errors
= 0;
1617 dst
->tx_aborted_errors
= 0;
1618 dst
->tx_carrier_errors
= 0;
1619 dst
->tx_fifo_errors
= 0;
1620 dst
->tx_heartbeat_errors
= 0;
1621 dst
->tx_window_errors
= 0;
1625 get_stats_via_vport__(const struct netdev
*netdev
, struct netdev_stats
*stats
)
1627 struct dpif_netlink_vport reply
;
1631 error
= dpif_netlink_vport_get(netdev_get_name(netdev
), &reply
, &buf
);
1634 } else if (!reply
.stats
) {
1639 netdev_stats_from_ovs_vport_stats(stats
, reply
.stats
);
1647 get_stats_via_vport(const struct netdev
*netdev_
,
1648 struct netdev_stats
*stats
)
1650 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1652 if (!netdev
->vport_stats_error
||
1653 !(netdev
->cache_valid
& VALID_VPORT_STAT_ERROR
)) {
1656 error
= get_stats_via_vport__(netdev_
, stats
);
1657 if (error
&& error
!= ENOENT
&& error
!= ENODEV
) {
1658 VLOG_WARN_RL(&rl
, "%s: obtaining netdev stats via vport failed "
1660 netdev_get_name(netdev_
), ovs_strerror(error
));
1662 netdev
->vport_stats_error
= error
;
1663 netdev
->cache_valid
|= VALID_VPORT_STAT_ERROR
;
1667 /* Retrieves current device stats for 'netdev-linux'. */
1669 netdev_linux_get_stats(const struct netdev
*netdev_
,
1670 struct netdev_stats
*stats
)
1672 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1673 struct netdev_stats dev_stats
;
1676 ovs_mutex_lock(&netdev
->mutex
);
1677 get_stats_via_vport(netdev_
, stats
);
1678 error
= get_stats_via_netlink(netdev_
, &dev_stats
);
1680 if (!netdev
->vport_stats_error
) {
1683 } else if (netdev
->vport_stats_error
) {
1684 /* stats not available from OVS then use netdev stats. */
1687 /* Use kernel netdev's packet and byte counts since vport's counters
1688 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1690 stats
->rx_packets
= dev_stats
.rx_packets
;
1691 stats
->rx_bytes
= dev_stats
.rx_bytes
;
1692 stats
->tx_packets
= dev_stats
.tx_packets
;
1693 stats
->tx_bytes
= dev_stats
.tx_bytes
;
1695 stats
->rx_errors
+= dev_stats
.rx_errors
;
1696 stats
->tx_errors
+= dev_stats
.tx_errors
;
1697 stats
->rx_dropped
+= dev_stats
.rx_dropped
;
1698 stats
->tx_dropped
+= dev_stats
.tx_dropped
;
1699 stats
->multicast
+= dev_stats
.multicast
;
1700 stats
->collisions
+= dev_stats
.collisions
;
1701 stats
->rx_length_errors
+= dev_stats
.rx_length_errors
;
1702 stats
->rx_over_errors
+= dev_stats
.rx_over_errors
;
1703 stats
->rx_crc_errors
+= dev_stats
.rx_crc_errors
;
1704 stats
->rx_frame_errors
+= dev_stats
.rx_frame_errors
;
1705 stats
->rx_fifo_errors
+= dev_stats
.rx_fifo_errors
;
1706 stats
->rx_missed_errors
+= dev_stats
.rx_missed_errors
;
1707 stats
->tx_aborted_errors
+= dev_stats
.tx_aborted_errors
;
1708 stats
->tx_carrier_errors
+= dev_stats
.tx_carrier_errors
;
1709 stats
->tx_fifo_errors
+= dev_stats
.tx_fifo_errors
;
1710 stats
->tx_heartbeat_errors
+= dev_stats
.tx_heartbeat_errors
;
1711 stats
->tx_window_errors
+= dev_stats
.tx_window_errors
;
1713 ovs_mutex_unlock(&netdev
->mutex
);
1718 /* Retrieves current device stats for 'netdev-tap' netdev or
1719 * netdev-internal. */
1721 netdev_tap_get_stats(const struct netdev
*netdev_
, struct netdev_stats
*stats
)
1723 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1724 struct netdev_stats dev_stats
;
1727 ovs_mutex_lock(&netdev
->mutex
);
1728 get_stats_via_vport(netdev_
, stats
);
1729 error
= get_stats_via_netlink(netdev_
, &dev_stats
);
1731 if (!netdev
->vport_stats_error
) {
1734 } else if (netdev
->vport_stats_error
) {
1735 /* Transmit and receive stats will appear to be swapped relative to the
1736 * other ports since we are the one sending the data, not a remote
1737 * computer. For consistency, we swap them back here. This does not
1738 * apply if we are getting stats from the vport layer because it always
1739 * tracks stats from the perspective of the switch. */
1742 swap_uint64(&stats
->rx_packets
, &stats
->tx_packets
);
1743 swap_uint64(&stats
->rx_bytes
, &stats
->tx_bytes
);
1744 swap_uint64(&stats
->rx_errors
, &stats
->tx_errors
);
1745 swap_uint64(&stats
->rx_dropped
, &stats
->tx_dropped
);
1746 stats
->rx_length_errors
= 0;
1747 stats
->rx_over_errors
= 0;
1748 stats
->rx_crc_errors
= 0;
1749 stats
->rx_frame_errors
= 0;
1750 stats
->rx_fifo_errors
= 0;
1751 stats
->rx_missed_errors
= 0;
1752 stats
->tx_aborted_errors
= 0;
1753 stats
->tx_carrier_errors
= 0;
1754 stats
->tx_fifo_errors
= 0;
1755 stats
->tx_heartbeat_errors
= 0;
1756 stats
->tx_window_errors
= 0;
1758 /* Use kernel netdev's packet and byte counts since vport counters
1759 * do not reflect packet counts on the wire when GSO, TSO or GRO
1761 stats
->rx_packets
= dev_stats
.tx_packets
;
1762 stats
->rx_bytes
= dev_stats
.tx_bytes
;
1763 stats
->tx_packets
= dev_stats
.rx_packets
;
1764 stats
->tx_bytes
= dev_stats
.rx_bytes
;
1766 stats
->rx_dropped
+= dev_stats
.tx_dropped
;
1767 stats
->tx_dropped
+= dev_stats
.rx_dropped
;
1769 stats
->rx_errors
+= dev_stats
.tx_errors
;
1770 stats
->tx_errors
+= dev_stats
.rx_errors
;
1772 stats
->multicast
+= dev_stats
.multicast
;
1773 stats
->collisions
+= dev_stats
.collisions
;
1775 ovs_mutex_unlock(&netdev
->mutex
);
1781 netdev_internal_get_stats(const struct netdev
*netdev_
,
1782 struct netdev_stats
*stats
)
1784 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1787 ovs_mutex_lock(&netdev
->mutex
);
1788 get_stats_via_vport(netdev_
, stats
);
1789 error
= netdev
->vport_stats_error
;
1790 ovs_mutex_unlock(&netdev
->mutex
);
1796 netdev_linux_read_features(struct netdev_linux
*netdev
)
1798 struct ethtool_cmd ecmd
;
1802 if (netdev
->cache_valid
& VALID_FEATURES
) {
1806 COVERAGE_INC(netdev_get_ethtool
);
1807 memset(&ecmd
, 0, sizeof ecmd
);
1808 error
= netdev_linux_do_ethtool(netdev
->up
.name
, &ecmd
,
1809 ETHTOOL_GSET
, "ETHTOOL_GSET");
1814 /* Supported features. */
1815 netdev
->supported
= 0;
1816 if (ecmd
.supported
& SUPPORTED_10baseT_Half
) {
1817 netdev
->supported
|= NETDEV_F_10MB_HD
;
1819 if (ecmd
.supported
& SUPPORTED_10baseT_Full
) {
1820 netdev
->supported
|= NETDEV_F_10MB_FD
;
1822 if (ecmd
.supported
& SUPPORTED_100baseT_Half
) {
1823 netdev
->supported
|= NETDEV_F_100MB_HD
;
1825 if (ecmd
.supported
& SUPPORTED_100baseT_Full
) {
1826 netdev
->supported
|= NETDEV_F_100MB_FD
;
1828 if (ecmd
.supported
& SUPPORTED_1000baseT_Half
) {
1829 netdev
->supported
|= NETDEV_F_1GB_HD
;
1831 if ((ecmd
.supported
& SUPPORTED_1000baseT_Full
) ||
1832 (ecmd
.supported
& SUPPORTED_1000baseKX_Full
)) {
1833 netdev
->supported
|= NETDEV_F_1GB_FD
;
1835 if ((ecmd
.supported
& SUPPORTED_10000baseT_Full
) ||
1836 (ecmd
.supported
& SUPPORTED_10000baseKX4_Full
) ||
1837 (ecmd
.supported
& SUPPORTED_10000baseKR_Full
) ||
1838 (ecmd
.supported
& SUPPORTED_10000baseR_FEC
)) {
1839 netdev
->supported
|= NETDEV_F_10GB_FD
;
1841 if ((ecmd
.supported
& SUPPORTED_40000baseKR4_Full
) ||
1842 (ecmd
.supported
& SUPPORTED_40000baseCR4_Full
) ||
1843 (ecmd
.supported
& SUPPORTED_40000baseSR4_Full
) ||
1844 (ecmd
.supported
& SUPPORTED_40000baseLR4_Full
)) {
1845 netdev
->supported
|= NETDEV_F_40GB_FD
;
1847 if (ecmd
.supported
& SUPPORTED_TP
) {
1848 netdev
->supported
|= NETDEV_F_COPPER
;
1850 if (ecmd
.supported
& SUPPORTED_FIBRE
) {
1851 netdev
->supported
|= NETDEV_F_FIBER
;
1853 if (ecmd
.supported
& SUPPORTED_Autoneg
) {
1854 netdev
->supported
|= NETDEV_F_AUTONEG
;
1856 if (ecmd
.supported
& SUPPORTED_Pause
) {
1857 netdev
->supported
|= NETDEV_F_PAUSE
;
1859 if (ecmd
.supported
& SUPPORTED_Asym_Pause
) {
1860 netdev
->supported
|= NETDEV_F_PAUSE_ASYM
;
1863 /* Advertised features. */
1864 netdev
->advertised
= 0;
1865 if (ecmd
.advertising
& ADVERTISED_10baseT_Half
) {
1866 netdev
->advertised
|= NETDEV_F_10MB_HD
;
1868 if (ecmd
.advertising
& ADVERTISED_10baseT_Full
) {
1869 netdev
->advertised
|= NETDEV_F_10MB_FD
;
1871 if (ecmd
.advertising
& ADVERTISED_100baseT_Half
) {
1872 netdev
->advertised
|= NETDEV_F_100MB_HD
;
1874 if (ecmd
.advertising
& ADVERTISED_100baseT_Full
) {
1875 netdev
->advertised
|= NETDEV_F_100MB_FD
;
1877 if (ecmd
.advertising
& ADVERTISED_1000baseT_Half
) {
1878 netdev
->advertised
|= NETDEV_F_1GB_HD
;
1880 if ((ecmd
.advertising
& ADVERTISED_1000baseT_Full
) ||
1881 (ecmd
.advertising
& ADVERTISED_1000baseKX_Full
)) {
1882 netdev
->advertised
|= NETDEV_F_1GB_FD
;
1884 if ((ecmd
.advertising
& ADVERTISED_10000baseT_Full
) ||
1885 (ecmd
.advertising
& ADVERTISED_10000baseKX4_Full
) ||
1886 (ecmd
.advertising
& ADVERTISED_10000baseKR_Full
) ||
1887 (ecmd
.advertising
& ADVERTISED_10000baseR_FEC
)) {
1888 netdev
->advertised
|= NETDEV_F_10GB_FD
;
1890 if ((ecmd
.advertising
& ADVERTISED_40000baseKR4_Full
) ||
1891 (ecmd
.advertising
& ADVERTISED_40000baseCR4_Full
) ||
1892 (ecmd
.advertising
& ADVERTISED_40000baseSR4_Full
) ||
1893 (ecmd
.advertising
& ADVERTISED_40000baseLR4_Full
)) {
1894 netdev
->advertised
|= NETDEV_F_40GB_FD
;
1896 if (ecmd
.advertising
& ADVERTISED_TP
) {
1897 netdev
->advertised
|= NETDEV_F_COPPER
;
1899 if (ecmd
.advertising
& ADVERTISED_FIBRE
) {
1900 netdev
->advertised
|= NETDEV_F_FIBER
;
1902 if (ecmd
.advertising
& ADVERTISED_Autoneg
) {
1903 netdev
->advertised
|= NETDEV_F_AUTONEG
;
1905 if (ecmd
.advertising
& ADVERTISED_Pause
) {
1906 netdev
->advertised
|= NETDEV_F_PAUSE
;
1908 if (ecmd
.advertising
& ADVERTISED_Asym_Pause
) {
1909 netdev
->advertised
|= NETDEV_F_PAUSE_ASYM
;
1912 /* Current settings. */
1913 speed
= ethtool_cmd_speed(&ecmd
);
1914 if (speed
== SPEED_10
) {
1915 netdev
->current
= ecmd
.duplex
? NETDEV_F_10MB_FD
: NETDEV_F_10MB_HD
;
1916 } else if (speed
== SPEED_100
) {
1917 netdev
->current
= ecmd
.duplex
? NETDEV_F_100MB_FD
: NETDEV_F_100MB_HD
;
1918 } else if (speed
== SPEED_1000
) {
1919 netdev
->current
= ecmd
.duplex
? NETDEV_F_1GB_FD
: NETDEV_F_1GB_HD
;
1920 } else if (speed
== SPEED_10000
) {
1921 netdev
->current
= NETDEV_F_10GB_FD
;
1922 } else if (speed
== 40000) {
1923 netdev
->current
= NETDEV_F_40GB_FD
;
1924 } else if (speed
== 100000) {
1925 netdev
->current
= NETDEV_F_100GB_FD
;
1926 } else if (speed
== 1000000) {
1927 netdev
->current
= NETDEV_F_1TB_FD
;
1929 netdev
->current
= 0;
1932 if (ecmd
.port
== PORT_TP
) {
1933 netdev
->current
|= NETDEV_F_COPPER
;
1934 } else if (ecmd
.port
== PORT_FIBRE
) {
1935 netdev
->current
|= NETDEV_F_FIBER
;
1939 netdev
->current
|= NETDEV_F_AUTONEG
;
1943 netdev
->cache_valid
|= VALID_FEATURES
;
1944 netdev
->get_features_error
= error
;
1947 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1948 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1949 * Returns 0 if successful, otherwise a positive errno value. */
1951 netdev_linux_get_features(const struct netdev
*netdev_
,
1952 enum netdev_features
*current
,
1953 enum netdev_features
*advertised
,
1954 enum netdev_features
*supported
,
1955 enum netdev_features
*peer
)
1957 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1960 ovs_mutex_lock(&netdev
->mutex
);
1961 netdev_linux_read_features(netdev
);
1962 if (!netdev
->get_features_error
) {
1963 *current
= netdev
->current
;
1964 *advertised
= netdev
->advertised
;
1965 *supported
= netdev
->supported
;
1966 *peer
= 0; /* XXX */
1968 error
= netdev
->get_features_error
;
1969 ovs_mutex_unlock(&netdev
->mutex
);
1974 /* Set the features advertised by 'netdev' to 'advertise'. */
1976 netdev_linux_set_advertisements(struct netdev
*netdev_
,
1977 enum netdev_features advertise
)
1979 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1980 struct ethtool_cmd ecmd
;
1983 ovs_mutex_lock(&netdev
->mutex
);
1985 COVERAGE_INC(netdev_get_ethtool
);
1986 memset(&ecmd
, 0, sizeof ecmd
);
1987 error
= netdev_linux_do_ethtool(netdev_get_name(netdev_
), &ecmd
,
1988 ETHTOOL_GSET
, "ETHTOOL_GSET");
1993 ecmd
.advertising
= 0;
1994 if (advertise
& NETDEV_F_10MB_HD
) {
1995 ecmd
.advertising
|= ADVERTISED_10baseT_Half
;
1997 if (advertise
& NETDEV_F_10MB_FD
) {
1998 ecmd
.advertising
|= ADVERTISED_10baseT_Full
;
2000 if (advertise
& NETDEV_F_100MB_HD
) {
2001 ecmd
.advertising
|= ADVERTISED_100baseT_Half
;
2003 if (advertise
& NETDEV_F_100MB_FD
) {
2004 ecmd
.advertising
|= ADVERTISED_100baseT_Full
;
2006 if (advertise
& NETDEV_F_1GB_HD
) {
2007 ecmd
.advertising
|= ADVERTISED_1000baseT_Half
;
2009 if (advertise
& NETDEV_F_1GB_FD
) {
2010 ecmd
.advertising
|= ADVERTISED_1000baseT_Full
;
2012 if (advertise
& NETDEV_F_10GB_FD
) {
2013 ecmd
.advertising
|= ADVERTISED_10000baseT_Full
;
2015 if (advertise
& NETDEV_F_COPPER
) {
2016 ecmd
.advertising
|= ADVERTISED_TP
;
2018 if (advertise
& NETDEV_F_FIBER
) {
2019 ecmd
.advertising
|= ADVERTISED_FIBRE
;
2021 if (advertise
& NETDEV_F_AUTONEG
) {
2022 ecmd
.advertising
|= ADVERTISED_Autoneg
;
2024 if (advertise
& NETDEV_F_PAUSE
) {
2025 ecmd
.advertising
|= ADVERTISED_Pause
;
2027 if (advertise
& NETDEV_F_PAUSE_ASYM
) {
2028 ecmd
.advertising
|= ADVERTISED_Asym_Pause
;
2030 COVERAGE_INC(netdev_set_ethtool
);
2031 error
= netdev_linux_do_ethtool(netdev_get_name(netdev_
), &ecmd
,
2032 ETHTOOL_SSET
, "ETHTOOL_SSET");
2035 ovs_mutex_unlock(&netdev
->mutex
);
2039 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
2040 * successful, otherwise a positive errno value. */
2042 netdev_linux_set_policing(struct netdev
*netdev_
,
2043 uint32_t kbits_rate
, uint32_t kbits_burst
)
2045 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2046 const char *netdev_name
= netdev_get_name(netdev_
);
2049 kbits_burst
= (!kbits_rate
? 0 /* Force to 0 if no rate specified. */
2050 : !kbits_burst
? 8000 /* Default to 8000 kbits if 0. */
2051 : kbits_burst
); /* Stick with user-specified value. */
2053 ovs_mutex_lock(&netdev
->mutex
);
2054 if (netdev
->cache_valid
& VALID_POLICING
) {
2055 error
= netdev
->netdev_policing_error
;
2056 if (error
|| (netdev
->kbits_rate
== kbits_rate
&&
2057 netdev
->kbits_burst
== kbits_burst
)) {
2058 /* Assume that settings haven't changed since we last set them. */
2061 netdev
->cache_valid
&= ~VALID_POLICING
;
2064 COVERAGE_INC(netdev_set_policing
);
2065 /* Remove any existing ingress qdisc. */
2066 error
= tc_add_del_ingress_qdisc(netdev_
, false);
2068 VLOG_WARN_RL(&rl
, "%s: removing policing failed: %s",
2069 netdev_name
, ovs_strerror(error
));
2074 error
= tc_add_del_ingress_qdisc(netdev_
, true);
2076 VLOG_WARN_RL(&rl
, "%s: adding policing qdisc failed: %s",
2077 netdev_name
, ovs_strerror(error
));
2081 error
= tc_add_policer(netdev_
, kbits_rate
, kbits_burst
);
2083 VLOG_WARN_RL(&rl
, "%s: adding policing action failed: %s",
2084 netdev_name
, ovs_strerror(error
));
2089 netdev
->kbits_rate
= kbits_rate
;
2090 netdev
->kbits_burst
= kbits_burst
;
2093 if (!error
|| error
== ENODEV
) {
2094 netdev
->netdev_policing_error
= error
;
2095 netdev
->cache_valid
|= VALID_POLICING
;
2097 ovs_mutex_unlock(&netdev
->mutex
);
2102 netdev_linux_get_qos_types(const struct netdev
*netdev OVS_UNUSED
,
2105 const struct tc_ops
*const *opsp
;
2106 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2107 const struct tc_ops
*ops
= *opsp
;
2108 if (ops
->tc_install
&& ops
->ovs_name
[0] != '\0') {
2109 sset_add(types
, ops
->ovs_name
);
2115 static const struct tc_ops
*
2116 tc_lookup_ovs_name(const char *name
)
2118 const struct tc_ops
*const *opsp
;
2120 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2121 const struct tc_ops
*ops
= *opsp
;
2122 if (!strcmp(name
, ops
->ovs_name
)) {
2129 static const struct tc_ops
*
2130 tc_lookup_linux_name(const char *name
)
2132 const struct tc_ops
*const *opsp
;
2134 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2135 const struct tc_ops
*ops
= *opsp
;
2136 if (ops
->linux_name
&& !strcmp(name
, ops
->linux_name
)) {
2143 static struct tc_queue
*
2144 tc_find_queue__(const struct netdev
*netdev_
, unsigned int queue_id
,
2147 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2148 struct tc_queue
*queue
;
2150 HMAP_FOR_EACH_IN_BUCKET (queue
, hmap_node
, hash
, &netdev
->tc
->queues
) {
2151 if (queue
->queue_id
== queue_id
) {
2158 static struct tc_queue
*
2159 tc_find_queue(const struct netdev
*netdev
, unsigned int queue_id
)
2161 return tc_find_queue__(netdev
, queue_id
, hash_int(queue_id
, 0));
2165 netdev_linux_get_qos_capabilities(const struct netdev
*netdev OVS_UNUSED
,
2167 struct netdev_qos_capabilities
*caps
)
2169 const struct tc_ops
*ops
= tc_lookup_ovs_name(type
);
2173 caps
->n_queues
= ops
->n_queues
;
2178 netdev_linux_get_qos(const struct netdev
*netdev_
,
2179 const char **typep
, struct smap
*details
)
2181 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2184 ovs_mutex_lock(&netdev
->mutex
);
2185 error
= tc_query_qdisc(netdev_
);
2187 *typep
= netdev
->tc
->ops
->ovs_name
;
2188 error
= (netdev
->tc
->ops
->qdisc_get
2189 ? netdev
->tc
->ops
->qdisc_get(netdev_
, details
)
2192 ovs_mutex_unlock(&netdev
->mutex
);
2198 netdev_linux_set_qos(struct netdev
*netdev_
,
2199 const char *type
, const struct smap
*details
)
2201 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2202 const struct tc_ops
*new_ops
;
2205 new_ops
= tc_lookup_ovs_name(type
);
2206 if (!new_ops
|| !new_ops
->tc_install
) {
2210 if (new_ops
== &tc_ops_noop
) {
2211 return new_ops
->tc_install(netdev_
, details
);
2214 ovs_mutex_lock(&netdev
->mutex
);
2215 error
= tc_query_qdisc(netdev_
);
2220 if (new_ops
== netdev
->tc
->ops
) {
2221 error
= new_ops
->qdisc_set
? new_ops
->qdisc_set(netdev_
, details
) : 0;
2223 /* Delete existing qdisc. */
2224 error
= tc_del_qdisc(netdev_
);
2228 ovs_assert(netdev
->tc
== NULL
);
2230 /* Install new qdisc. */
2231 error
= new_ops
->tc_install(netdev_
, details
);
2232 ovs_assert((error
== 0) == (netdev
->tc
!= NULL
));
2236 ovs_mutex_unlock(&netdev
->mutex
);
2241 netdev_linux_get_queue(const struct netdev
*netdev_
,
2242 unsigned int queue_id
, struct smap
*details
)
2244 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2247 ovs_mutex_lock(&netdev
->mutex
);
2248 error
= tc_query_qdisc(netdev_
);
2250 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2252 ? netdev
->tc
->ops
->class_get(netdev_
, queue
, details
)
2255 ovs_mutex_unlock(&netdev
->mutex
);
2261 netdev_linux_set_queue(struct netdev
*netdev_
,
2262 unsigned int queue_id
, const struct smap
*details
)
2264 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2267 ovs_mutex_lock(&netdev
->mutex
);
2268 error
= tc_query_qdisc(netdev_
);
2270 error
= (queue_id
< netdev
->tc
->ops
->n_queues
2271 && netdev
->tc
->ops
->class_set
2272 ? netdev
->tc
->ops
->class_set(netdev_
, queue_id
, details
)
2275 ovs_mutex_unlock(&netdev
->mutex
);
2281 netdev_linux_delete_queue(struct netdev
*netdev_
, unsigned int queue_id
)
2283 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2286 ovs_mutex_lock(&netdev
->mutex
);
2287 error
= tc_query_qdisc(netdev_
);
2289 if (netdev
->tc
->ops
->class_delete
) {
2290 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2292 ? netdev
->tc
->ops
->class_delete(netdev_
, queue
)
2298 ovs_mutex_unlock(&netdev
->mutex
);
2304 netdev_linux_get_queue_stats(const struct netdev
*netdev_
,
2305 unsigned int queue_id
,
2306 struct netdev_queue_stats
*stats
)
2308 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2311 ovs_mutex_lock(&netdev
->mutex
);
2312 error
= tc_query_qdisc(netdev_
);
2314 if (netdev
->tc
->ops
->class_get_stats
) {
2315 const struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2317 stats
->created
= queue
->created
;
2318 error
= netdev
->tc
->ops
->class_get_stats(netdev_
, queue
,
2327 ovs_mutex_unlock(&netdev
->mutex
);
2332 struct queue_dump_state
{
2333 struct nl_dump dump
;
2338 start_queue_dump(const struct netdev
*netdev
, struct queue_dump_state
*state
)
2340 struct ofpbuf request
;
2341 struct tcmsg
*tcmsg
;
2343 tcmsg
= tc_make_request(netdev
, RTM_GETTCLASS
, 0, &request
);
2347 tcmsg
->tcm_parent
= 0;
2348 nl_dump_start(&state
->dump
, NETLINK_ROUTE
, &request
);
2349 ofpbuf_uninit(&request
);
2351 ofpbuf_init(&state
->buf
, NL_DUMP_BUFSIZE
);
2356 finish_queue_dump(struct queue_dump_state
*state
)
2358 ofpbuf_uninit(&state
->buf
);
2359 return nl_dump_done(&state
->dump
);
/* Iterator state for netdev_linux_queue_dump_{start,next,done}(): a snapshot
 * of the queue ids present when the dump started. */
struct netdev_linux_queue_state {
    unsigned int *queues;  /* Array of queue ids, heap-allocated. */
    size_t cur_queue;      /* Next index to visit in 'queues'. */
    size_t n_queues;       /* Number of elements in 'queues'. */
};
2369 netdev_linux_queue_dump_start(const struct netdev
*netdev_
, void **statep
)
2371 const struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2374 ovs_mutex_lock(&netdev
->mutex
);
2375 error
= tc_query_qdisc(netdev_
);
2377 if (netdev
->tc
->ops
->class_get
) {
2378 struct netdev_linux_queue_state
*state
;
2379 struct tc_queue
*queue
;
2382 *statep
= state
= xmalloc(sizeof *state
);
2383 state
->n_queues
= hmap_count(&netdev
->tc
->queues
);
2384 state
->cur_queue
= 0;
2385 state
->queues
= xmalloc(state
->n_queues
* sizeof *state
->queues
);
2388 HMAP_FOR_EACH (queue
, hmap_node
, &netdev
->tc
->queues
) {
2389 state
->queues
[i
++] = queue
->queue_id
;
2395 ovs_mutex_unlock(&netdev
->mutex
);
2401 netdev_linux_queue_dump_next(const struct netdev
*netdev_
, void *state_
,
2402 unsigned int *queue_idp
, struct smap
*details
)
2404 const struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2405 struct netdev_linux_queue_state
*state
= state_
;
2408 ovs_mutex_lock(&netdev
->mutex
);
2409 while (state
->cur_queue
< state
->n_queues
) {
2410 unsigned int queue_id
= state
->queues
[state
->cur_queue
++];
2411 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2414 *queue_idp
= queue_id
;
2415 error
= netdev
->tc
->ops
->class_get(netdev_
, queue
, details
);
2419 ovs_mutex_unlock(&netdev
->mutex
);
2425 netdev_linux_queue_dump_done(const struct netdev
*netdev OVS_UNUSED
,
2428 struct netdev_linux_queue_state
*state
= state_
;
2430 free(state
->queues
);
2436 netdev_linux_dump_queue_stats(const struct netdev
*netdev_
,
2437 netdev_dump_queue_stats_cb
*cb
, void *aux
)
2439 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2442 ovs_mutex_lock(&netdev
->mutex
);
2443 error
= tc_query_qdisc(netdev_
);
2445 struct queue_dump_state state
;
2447 if (!netdev
->tc
->ops
->class_dump_stats
) {
2449 } else if (!start_queue_dump(netdev_
, &state
)) {
2455 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
2456 retval
= netdev
->tc
->ops
->class_dump_stats(netdev_
, &msg
,
2463 retval
= finish_queue_dump(&state
);
2469 ovs_mutex_unlock(&netdev
->mutex
);
2475 netdev_linux_set_in4(struct netdev
*netdev_
, struct in_addr address
,
2476 struct in_addr netmask
)
2478 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2481 ovs_mutex_lock(&netdev
->mutex
);
2482 error
= do_set_addr(netdev_
, SIOCSIFADDR
, "SIOCSIFADDR", address
);
2484 if (address
.s_addr
!= INADDR_ANY
) {
2485 error
= do_set_addr(netdev_
, SIOCSIFNETMASK
,
2486 "SIOCSIFNETMASK", netmask
);
2490 ovs_mutex_unlock(&netdev
->mutex
);
/* Fetches the IP addresses assigned to 'netdev_', storing the addresses in
 * '*addr', the corresponding netmasks in '*mask', and the count in '*n_cnt'.
 * Returns 0 if successful, otherwise a positive errno value. */
2499 netdev_linux_get_addr_list(const struct netdev
*netdev_
,
2500 struct in6_addr
**addr
, struct in6_addr
**mask
, int *n_cnt
)
2502 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2505 ovs_mutex_lock(&netdev
->mutex
);
2506 error
= netdev_get_addrs(netdev_get_name(netdev_
), addr
, mask
, n_cnt
);
2507 ovs_mutex_unlock(&netdev
->mutex
);
/* Fills '*sa' with an AF_INET sockaddr holding 'addr' (port 0). */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin;
    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr = addr;
    sin.sin_port = 0;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
2526 do_set_addr(struct netdev
*netdev
,
2527 int ioctl_nr
, const char *ioctl_name
, struct in_addr addr
)
2531 make_in4_sockaddr(&ifr
.ifr_addr
, addr
);
2532 return af_inet_ifreq_ioctl(netdev_get_name(netdev
), &ifr
, ioctl_nr
,
2536 /* Adds 'router' as a default IP gateway. */
2538 netdev_linux_add_router(struct netdev
*netdev OVS_UNUSED
, struct in_addr router
)
2540 struct in_addr any
= { INADDR_ANY
};
2544 memset(&rt
, 0, sizeof rt
);
2545 make_in4_sockaddr(&rt
.rt_dst
, any
);
2546 make_in4_sockaddr(&rt
.rt_gateway
, router
);
2547 make_in4_sockaddr(&rt
.rt_genmask
, any
);
2548 rt
.rt_flags
= RTF_UP
| RTF_GATEWAY
;
2549 error
= af_inet_ioctl(SIOCADDRT
, &rt
);
2551 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error
));
2557 netdev_linux_get_next_hop(const struct in_addr
*host
, struct in_addr
*next_hop
,
2560 static const char fn
[] = "/proc/net/route";
2565 *netdev_name
= NULL
;
2566 stream
= fopen(fn
, "r");
2567 if (stream
== NULL
) {
2568 VLOG_WARN_RL(&rl
, "%s: open failed: %s", fn
, ovs_strerror(errno
));
2573 while (fgets(line
, sizeof line
, stream
)) {
2576 ovs_be32 dest
, gateway
, mask
;
2577 int refcnt
, metric
, mtu
;
2578 unsigned int flags
, use
, window
, irtt
;
2581 "%16s %"SCNx32
" %"SCNx32
" %04X %d %u %d %"SCNx32
2583 iface
, &dest
, &gateway
, &flags
, &refcnt
,
2584 &use
, &metric
, &mask
, &mtu
, &window
, &irtt
)) {
2585 VLOG_WARN_RL(&rl
, "%s: could not parse line %d: %s",
2589 if (!(flags
& RTF_UP
)) {
2590 /* Skip routes that aren't up. */
2594 /* The output of 'dest', 'mask', and 'gateway' were given in
2595 * network byte order, so we don't need need any endian
2596 * conversions here. */
2597 if ((dest
& mask
) == (host
->s_addr
& mask
)) {
2599 /* The host is directly reachable. */
2600 next_hop
->s_addr
= 0;
2602 /* To reach the host, we must go through a gateway. */
2603 next_hop
->s_addr
= gateway
;
2605 *netdev_name
= xstrdup(iface
);
2617 netdev_linux_get_status(const struct netdev
*netdev_
, struct smap
*smap
)
2619 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2622 ovs_mutex_lock(&netdev
->mutex
);
2623 if (!(netdev
->cache_valid
& VALID_DRVINFO
)) {
2624 struct ethtool_cmd
*cmd
= (struct ethtool_cmd
*) &netdev
->drvinfo
;
2626 COVERAGE_INC(netdev_get_ethtool
);
2627 memset(&netdev
->drvinfo
, 0, sizeof netdev
->drvinfo
);
2628 error
= netdev_linux_do_ethtool(netdev
->up
.name
,
2631 "ETHTOOL_GDRVINFO");
2633 netdev
->cache_valid
|= VALID_DRVINFO
;
2638 smap_add(smap
, "driver_name", netdev
->drvinfo
.driver
);
2639 smap_add(smap
, "driver_version", netdev
->drvinfo
.version
);
2640 smap_add(smap
, "firmware_version", netdev
->drvinfo
.fw_version
);
2642 ovs_mutex_unlock(&netdev
->mutex
);
2648 netdev_internal_get_status(const struct netdev
*netdev OVS_UNUSED
,
2651 smap_add(smap
, "driver_name", "openvswitch");
2655 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2656 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2657 * returns 0. Otherwise, it returns a positive errno value; in particular,
 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2660 netdev_linux_arp_lookup(const struct netdev
*netdev
,
2661 ovs_be32 ip
, struct eth_addr
*mac
)
2664 struct sockaddr_in sin
;
2667 memset(&r
, 0, sizeof r
);
2668 memset(&sin
, 0, sizeof sin
);
2669 sin
.sin_family
= AF_INET
;
2670 sin
.sin_addr
.s_addr
= ip
;
2672 memcpy(&r
.arp_pa
, &sin
, sizeof sin
);
2673 r
.arp_ha
.sa_family
= ARPHRD_ETHER
;
2675 ovs_strzcpy(r
.arp_dev
, netdev_get_name(netdev
), sizeof r
.arp_dev
);
2676 COVERAGE_INC(netdev_arp_lookup
);
2677 retval
= af_inet_ioctl(SIOCGARP
, &r
);
2679 memcpy(mac
, r
.arp_ha
.sa_data
, ETH_ADDR_LEN
);
2680 } else if (retval
!= ENXIO
) {
2681 VLOG_WARN_RL(&rl
, "%s: could not look up ARP entry for "IP_FMT
": %s",
2682 netdev_get_name(netdev
), IP_ARGS(ip
),
2683 ovs_strerror(retval
));
2689 nd_to_iff_flags(enum netdev_flags nd
)
2692 if (nd
& NETDEV_UP
) {
2695 if (nd
& NETDEV_PROMISC
) {
2698 if (nd
& NETDEV_LOOPBACK
) {
2699 iff
|= IFF_LOOPBACK
;
2705 iff_to_nd_flags(int iff
)
2707 enum netdev_flags nd
= 0;
2711 if (iff
& IFF_PROMISC
) {
2712 nd
|= NETDEV_PROMISC
;
2714 if (iff
& IFF_LOOPBACK
) {
2715 nd
|= NETDEV_LOOPBACK
;
2721 update_flags(struct netdev_linux
*netdev
, enum netdev_flags off
,
2722 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
2723 OVS_REQUIRES(netdev
->mutex
)
2725 int old_flags
, new_flags
;
2728 old_flags
= netdev
->ifi_flags
;
2729 *old_flagsp
= iff_to_nd_flags(old_flags
);
2730 new_flags
= (old_flags
& ~nd_to_iff_flags(off
)) | nd_to_iff_flags(on
);
2731 if (new_flags
!= old_flags
) {
2732 error
= set_flags(netdev_get_name(&netdev
->up
), new_flags
);
2733 get_flags(&netdev
->up
, &netdev
->ifi_flags
);
2740 netdev_linux_update_flags(struct netdev
*netdev_
, enum netdev_flags off
,
2741 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
2743 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2746 ovs_mutex_lock(&netdev
->mutex
);
2747 error
= update_flags(netdev
, off
, on
, old_flagsp
);
2748 ovs_mutex_unlock(&netdev
->mutex
);
/* Template for the system/tap/internal netdev_class initializers.  The
 * entries must stay in the exact order of the struct netdev_class fields
 * declared in netdev-provider.h. */
#define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS,          \
                           GET_FEATURES, GET_STATUS)            \
{                                                               \
    NAME,                                                       \
    false,                      /* is_pmd */                    \
                                                                \
    NULL,                                                       \
    netdev_linux_run,                                           \
    netdev_linux_wait,                                          \
                                                                \
    netdev_linux_alloc,                                         \
    CONSTRUCT,                                                  \
    netdev_linux_destruct,                                      \
    netdev_linux_dealloc,                                       \
    NULL,                       /* get_config */                \
    NULL,                       /* set_config */                \
    NULL,                       /* get_tunnel_config */         \
    NULL,                       /* build header */              \
    NULL,                       /* push header */               \
    NULL,                       /* pop header */                \
    NULL,                       /* get_numa_id */               \
    NULL,                       /* set_tx_multiq */             \
                                                                \
    netdev_linux_send,                                          \
    netdev_linux_send_wait,                                     \
                                                                \
    netdev_linux_set_etheraddr,                                 \
    netdev_linux_get_etheraddr,                                 \
    netdev_linux_get_mtu,                                       \
    netdev_linux_set_mtu,                                       \
    netdev_linux_get_ifindex,                                   \
    netdev_linux_get_carrier,                                   \
    netdev_linux_get_carrier_resets,                            \
    netdev_linux_set_miimon_interval,                           \
    GET_STATS,                                                  \
                                                                \
    GET_FEATURES,                                               \
    netdev_linux_set_advertisements,                            \
                                                                \
    netdev_linux_set_policing,                                  \
    netdev_linux_get_qos_types,                                 \
    netdev_linux_get_qos_capabilities,                          \
    netdev_linux_get_qos,                                       \
    netdev_linux_set_qos,                                       \
    netdev_linux_get_queue,                                     \
    netdev_linux_set_queue,                                     \
    netdev_linux_delete_queue,                                  \
    netdev_linux_get_queue_stats,                               \
    netdev_linux_queue_dump_start,                              \
    netdev_linux_queue_dump_next,                               \
    netdev_linux_queue_dump_done,                               \
    netdev_linux_dump_queue_stats,                              \
                                                                \
    netdev_linux_set_in4,                                       \
    netdev_linux_get_addr_list,                                 \
    netdev_linux_add_router,                                    \
    netdev_linux_get_next_hop,                                  \
    GET_STATUS,                                                 \
    netdev_linux_arp_lookup,                                    \
                                                                \
    netdev_linux_update_flags,                                  \
    NULL,                       /* reconfigure */               \
                                                                \
    netdev_linux_rxq_alloc,                                     \
    netdev_linux_rxq_construct,                                 \
    netdev_linux_rxq_destruct,                                  \
    netdev_linux_rxq_dealloc,                                   \
    netdev_linux_rxq_recv,                                      \
    netdev_linux_rxq_wait,                                      \
    netdev_linux_rxq_drain,                                     \
}
2825 const struct netdev_class netdev_linux_class
=
2828 netdev_linux_construct
,
2829 netdev_linux_get_stats
,
2830 netdev_linux_get_features
,
2831 netdev_linux_get_status
);
2833 const struct netdev_class netdev_tap_class
=
2836 netdev_linux_construct_tap
,
2837 netdev_tap_get_stats
,
2838 netdev_linux_get_features
,
2839 netdev_linux_get_status
);
2841 const struct netdev_class netdev_internal_class
=
2844 netdev_linux_construct
,
2845 netdev_internal_get_stats
,
2846 NULL
, /* get_features */
2847 netdev_internal_get_status
);
2850 #define CODEL_N_QUEUES 0x0000
2852 /* In sufficiently new kernel headers these are defined as enums in
2853 * <linux/pkt_sched.h>. Define them here as macros to help out with older
2854 * kernels. (This overrides any enum definition in the header file but that's
2856 #define TCA_CODEL_TARGET 1
2857 #define TCA_CODEL_LIMIT 2
2858 #define TCA_CODEL_INTERVAL 3
2867 static struct codel
*
2868 codel_get__(const struct netdev
*netdev_
)
2870 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2871 return CONTAINER_OF(netdev
->tc
, struct codel
, tc
);
2875 codel_install__(struct netdev
*netdev_
, uint32_t target
, uint32_t limit
,
2878 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2879 struct codel
*codel
;
2881 codel
= xmalloc(sizeof *codel
);
2882 tc_init(&codel
->tc
, &tc_ops_codel
);
2883 codel
->target
= target
;
2884 codel
->limit
= limit
;
2885 codel
->interval
= interval
;
2887 netdev
->tc
= &codel
->tc
;
2891 codel_setup_qdisc__(struct netdev
*netdev
, uint32_t target
, uint32_t limit
,
2895 struct ofpbuf request
;
2896 struct tcmsg
*tcmsg
;
2897 uint32_t otarget
, olimit
, ointerval
;
2900 tc_del_qdisc(netdev
);
2902 tcmsg
= tc_make_request(netdev
, RTM_NEWQDISC
,
2903 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
2907 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
2908 tcmsg
->tcm_parent
= TC_H_ROOT
;
2910 otarget
= target
? target
: 5000;
2911 olimit
= limit
? limit
: 10240;
2912 ointerval
= interval
? interval
: 100000;
2914 nl_msg_put_string(&request
, TCA_KIND
, "codel");
2915 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
2916 nl_msg_put_u32(&request
, TCA_CODEL_TARGET
, otarget
);
2917 nl_msg_put_u32(&request
, TCA_CODEL_LIMIT
, olimit
);
2918 nl_msg_put_u32(&request
, TCA_CODEL_INTERVAL
, ointerval
);
2919 nl_msg_end_nested(&request
, opt_offset
);
2921 error
= tc_transact(&request
, NULL
);
2923 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
2924 "target %u, limit %u, interval %u error %d(%s)",
2925 netdev_get_name(netdev
),
2926 otarget
, olimit
, ointerval
,
2927 error
, ovs_strerror(error
));
2933 codel_parse_qdisc_details__(struct netdev
*netdev OVS_UNUSED
,
2934 const struct smap
*details
, struct codel
*codel
)
2936 const char *target_s
;
2937 const char *limit_s
;
2938 const char *interval_s
;
2940 target_s
= smap_get(details
, "target");
2941 limit_s
= smap_get(details
, "limit");
2942 interval_s
= smap_get(details
, "interval");
2944 codel
->target
= target_s
? strtoull(target_s
, NULL
, 10) : 0;
2945 codel
->limit
= limit_s
? strtoull(limit_s
, NULL
, 10) : 0;
2946 codel
->interval
= interval_s
? strtoull(interval_s
, NULL
, 10) : 0;
2948 if (!codel
->target
) {
2949 codel
->target
= 5000;
2951 if (!codel
->limit
) {
2952 codel
->limit
= 10240;
2954 if (!codel
->interval
) {
2955 codel
->interval
= 100000;
2960 codel_tc_install(struct netdev
*netdev
, const struct smap
*details
)
2965 codel_parse_qdisc_details__(netdev
, details
, &codel
);
2966 error
= codel_setup_qdisc__(netdev
, codel
.target
, codel
.limit
,
2969 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
2975 codel_parse_tca_options__(struct nlattr
*nl_options
, struct codel
*codel
)
2977 static const struct nl_policy tca_codel_policy
[] = {
2978 [TCA_CODEL_TARGET
] = { .type
= NL_A_U32
},
2979 [TCA_CODEL_LIMIT
] = { .type
= NL_A_U32
},
2980 [TCA_CODEL_INTERVAL
] = { .type
= NL_A_U32
}
2983 struct nlattr
*attrs
[ARRAY_SIZE(tca_codel_policy
)];
2985 if (!nl_parse_nested(nl_options
, tca_codel_policy
,
2986 attrs
, ARRAY_SIZE(tca_codel_policy
))) {
2987 VLOG_WARN_RL(&rl
, "failed to parse CoDel class options");
2991 codel
->target
= nl_attr_get_u32(attrs
[TCA_CODEL_TARGET
]);
2992 codel
->limit
= nl_attr_get_u32(attrs
[TCA_CODEL_LIMIT
]);
2993 codel
->interval
= nl_attr_get_u32(attrs
[TCA_CODEL_INTERVAL
]);
2998 codel_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
3000 struct nlattr
*nlattr
;
3005 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
3010 error
= codel_parse_tca_options__(nlattr
, &codel
);
3015 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
3021 codel_tc_destroy(struct tc
*tc
)
3023 struct codel
*codel
= CONTAINER_OF(tc
, struct codel
, tc
);
3029 codel_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
3031 const struct codel
*codel
= codel_get__(netdev
);
3032 smap_add_format(details
, "target", "%u", codel
->target
);
3033 smap_add_format(details
, "limit", "%u", codel
->limit
);
3034 smap_add_format(details
, "interval", "%u", codel
->interval
);
3039 codel_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
3043 codel_parse_qdisc_details__(netdev
, details
, &codel
);
3044 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
3045 codel_get__(netdev
)->target
= codel
.target
;
3046 codel_get__(netdev
)->limit
= codel
.limit
;
3047 codel_get__(netdev
)->interval
= codel
.interval
;
3051 static const struct tc_ops tc_ops_codel
= {
3052 "codel", /* linux_name */
3053 "linux-codel", /* ovs_name */
3054 CODEL_N_QUEUES
, /* n_queues */
3067 /* FQ-CoDel traffic control class. */
3069 #define FQCODEL_N_QUEUES 0x0000
3071 /* In sufficiently new kernel headers these are defined as enums in
3072 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3073 * kernels. (This overrides any enum definition in the header file but that's
3075 #define TCA_FQ_CODEL_TARGET 1
3076 #define TCA_FQ_CODEL_LIMIT 2
3077 #define TCA_FQ_CODEL_INTERVAL 3
3078 #define TCA_FQ_CODEL_ECN 4
3079 #define TCA_FQ_CODEL_FLOWS 5
3080 #define TCA_FQ_CODEL_QUANTUM 6
3091 static struct fqcodel
*
3092 fqcodel_get__(const struct netdev
*netdev_
)
3094 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3095 return CONTAINER_OF(netdev
->tc
, struct fqcodel
, tc
);
3099 fqcodel_install__(struct netdev
*netdev_
, uint32_t target
, uint32_t limit
,
3100 uint32_t interval
, uint32_t flows
, uint32_t quantum
)
3102 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3103 struct fqcodel
*fqcodel
;
3105 fqcodel
= xmalloc(sizeof *fqcodel
);
3106 tc_init(&fqcodel
->tc
, &tc_ops_fqcodel
);
3107 fqcodel
->target
= target
;
3108 fqcodel
->limit
= limit
;
3109 fqcodel
->interval
= interval
;
3110 fqcodel
->flows
= flows
;
3111 fqcodel
->quantum
= quantum
;
3113 netdev
->tc
= &fqcodel
->tc
;
3117 fqcodel_setup_qdisc__(struct netdev
*netdev
, uint32_t target
, uint32_t limit
,
3118 uint32_t interval
, uint32_t flows
, uint32_t quantum
)
3121 struct ofpbuf request
;
3122 struct tcmsg
*tcmsg
;
3123 uint32_t otarget
, olimit
, ointerval
, oflows
, oquantum
;
3126 tc_del_qdisc(netdev
);
3128 tcmsg
= tc_make_request(netdev
, RTM_NEWQDISC
,
3129 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3133 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3134 tcmsg
->tcm_parent
= TC_H_ROOT
;
3136 otarget
= target
? target
: 5000;
3137 olimit
= limit
? limit
: 10240;
3138 ointerval
= interval
? interval
: 100000;
3139 oflows
= flows
? flows
: 1024;
3140 oquantum
= quantum
? quantum
: 1514; /* fq_codel default quantum is 1514
3143 nl_msg_put_string(&request
, TCA_KIND
, "fq_codel");
3144 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
3145 nl_msg_put_u32(&request
, TCA_FQ_CODEL_TARGET
, otarget
);
3146 nl_msg_put_u32(&request
, TCA_FQ_CODEL_LIMIT
, olimit
);
3147 nl_msg_put_u32(&request
, TCA_FQ_CODEL_INTERVAL
, ointerval
);
3148 nl_msg_put_u32(&request
, TCA_FQ_CODEL_FLOWS
, oflows
);
3149 nl_msg_put_u32(&request
, TCA_FQ_CODEL_QUANTUM
, oquantum
);
3150 nl_msg_end_nested(&request
, opt_offset
);
3152 error
= tc_transact(&request
, NULL
);
3154 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
3155 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3156 netdev_get_name(netdev
),
3157 otarget
, olimit
, ointerval
, oflows
, oquantum
,
3158 error
, ovs_strerror(error
));
3164 fqcodel_parse_qdisc_details__(struct netdev
*netdev OVS_UNUSED
,
3165 const struct smap
*details
, struct fqcodel
*fqcodel
)
3167 const char *target_s
;
3168 const char *limit_s
;
3169 const char *interval_s
;
3170 const char *flows_s
;
3171 const char *quantum_s
;
3173 target_s
= smap_get(details
, "target");
3174 limit_s
= smap_get(details
, "limit");
3175 interval_s
= smap_get(details
, "interval");
3176 flows_s
= smap_get(details
, "flows");
3177 quantum_s
= smap_get(details
, "quantum");
3178 fqcodel
->target
= target_s
? strtoull(target_s
, NULL
, 10) : 0;
3179 fqcodel
->limit
= limit_s
? strtoull(limit_s
, NULL
, 10) : 0;
3180 fqcodel
->interval
= interval_s
? strtoull(interval_s
, NULL
, 10) : 0;
3181 fqcodel
->flows
= flows_s
? strtoull(flows_s
, NULL
, 10) : 0;
3182 fqcodel
->quantum
= quantum_s
? strtoull(quantum_s
, NULL
, 10) : 0;
3183 if (!fqcodel
->target
) {
3184 fqcodel
->target
= 5000;
3186 if (!fqcodel
->limit
) {
3187 fqcodel
->limit
= 10240;
3189 if (!fqcodel
->interval
) {
3190 fqcodel
->interval
= 1000000;
3192 if (!fqcodel
->flows
) {
3193 fqcodel
->flows
= 1024;
3195 if (!fqcodel
->quantum
) {
3196 fqcodel
->quantum
= 1514;
3201 fqcodel_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3204 struct fqcodel fqcodel
;
3206 fqcodel_parse_qdisc_details__(netdev
, details
, &fqcodel
);
3207 error
= fqcodel_setup_qdisc__(netdev
, fqcodel
.target
, fqcodel
.limit
,
3208 fqcodel
.interval
, fqcodel
.flows
,
3211 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
,
3212 fqcodel
.interval
, fqcodel
.flows
, fqcodel
.quantum
);
3218 fqcodel_parse_tca_options__(struct nlattr
*nl_options
, struct fqcodel
*fqcodel
)
3220 static const struct nl_policy tca_fqcodel_policy
[] = {
3221 [TCA_FQ_CODEL_TARGET
] = { .type
= NL_A_U32
},
3222 [TCA_FQ_CODEL_LIMIT
] = { .type
= NL_A_U32
},
3223 [TCA_FQ_CODEL_INTERVAL
] = { .type
= NL_A_U32
},
3224 [TCA_FQ_CODEL_FLOWS
] = { .type
= NL_A_U32
},
3225 [TCA_FQ_CODEL_QUANTUM
] = { .type
= NL_A_U32
}
3228 struct nlattr
*attrs
[ARRAY_SIZE(tca_fqcodel_policy
)];
3230 if (!nl_parse_nested(nl_options
, tca_fqcodel_policy
,
3231 attrs
, ARRAY_SIZE(tca_fqcodel_policy
))) {
3232 VLOG_WARN_RL(&rl
, "failed to parse FQ_CoDel class options");
3236 fqcodel
->target
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_TARGET
]);
3237 fqcodel
->limit
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_LIMIT
]);
3238 fqcodel
->interval
=nl_attr_get_u32(attrs
[TCA_FQ_CODEL_INTERVAL
]);
3239 fqcodel
->flows
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_FLOWS
]);
3240 fqcodel
->quantum
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_QUANTUM
]);
3245 fqcodel_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
3247 struct nlattr
*nlattr
;
3250 struct fqcodel fqcodel
;
3252 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
3257 error
= fqcodel_parse_tca_options__(nlattr
, &fqcodel
);
3262 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
, fqcodel
.interval
,
3263 fqcodel
.flows
, fqcodel
.quantum
);
3268 fqcodel_tc_destroy(struct tc
*tc
)
3270 struct fqcodel
*fqcodel
= CONTAINER_OF(tc
, struct fqcodel
, tc
);
3276 fqcodel_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
3278 const struct fqcodel
*fqcodel
= fqcodel_get__(netdev
);
3279 smap_add_format(details
, "target", "%u", fqcodel
->target
);
3280 smap_add_format(details
, "limit", "%u", fqcodel
->limit
);
3281 smap_add_format(details
, "interval", "%u", fqcodel
->interval
);
3282 smap_add_format(details
, "flows", "%u", fqcodel
->flows
);
3283 smap_add_format(details
, "quantum", "%u", fqcodel
->quantum
);
3288 fqcodel_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
3290 struct fqcodel fqcodel
;
3292 fqcodel_parse_qdisc_details__(netdev
, details
, &fqcodel
);
3293 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
, fqcodel
.interval
,
3294 fqcodel
.flows
, fqcodel
.quantum
);
3295 fqcodel_get__(netdev
)->target
= fqcodel
.target
;
3296 fqcodel_get__(netdev
)->limit
= fqcodel
.limit
;
3297 fqcodel_get__(netdev
)->interval
= fqcodel
.interval
;
3298 fqcodel_get__(netdev
)->flows
= fqcodel
.flows
;
3299 fqcodel_get__(netdev
)->quantum
= fqcodel
.quantum
;
3303 static const struct tc_ops tc_ops_fqcodel
= {
3304 "fq_codel", /* linux_name */
3305 "linux-fq_codel", /* ovs_name */
3306 FQCODEL_N_QUEUES
, /* n_queues */
3319 /* SFQ traffic control class. */
3321 #define SFQ_N_QUEUES 0x0000
3330 sfq_get__(const struct netdev
*netdev_
)
3332 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3333 return CONTAINER_OF(netdev
->tc
, struct sfq
, tc
);
3337 sfq_install__(struct netdev
*netdev_
, uint32_t quantum
, uint32_t perturb
)
3339 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3342 sfq
= xmalloc(sizeof *sfq
);
3343 tc_init(&sfq
->tc
, &tc_ops_sfq
);
3344 sfq
->perturb
= perturb
;
3345 sfq
->quantum
= quantum
;
3347 netdev
->tc
= &sfq
->tc
;
3351 sfq_setup_qdisc__(struct netdev
*netdev
, uint32_t quantum
, uint32_t perturb
)
3353 struct tc_sfq_qopt opt
;
3354 struct ofpbuf request
;
3355 struct tcmsg
*tcmsg
;
3357 int mtu_error
, error
;
3358 mtu_error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
3360 tc_del_qdisc(netdev
);
3362 tcmsg
= tc_make_request(netdev
, RTM_NEWQDISC
,
3363 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3367 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3368 tcmsg
->tcm_parent
= TC_H_ROOT
;
3370 memset(&opt
, 0, sizeof opt
);
3373 opt
.quantum
= mtu
; /* if we cannot find mtu, use default */
3376 opt
.quantum
= quantum
;
3380 opt
.perturb_period
= 10;
3382 opt
.perturb_period
= perturb
;
3385 nl_msg_put_string(&request
, TCA_KIND
, "sfq");
3386 nl_msg_put_unspec(&request
, TCA_OPTIONS
, &opt
, sizeof opt
);
3388 error
= tc_transact(&request
, NULL
);
3390 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
3391 "quantum %u, perturb %u error %d(%s)",
3392 netdev_get_name(netdev
),
3393 opt
.quantum
, opt
.perturb_period
,
3394 error
, ovs_strerror(error
));
3400 sfq_parse_qdisc_details__(struct netdev
*netdev
,
3401 const struct smap
*details
, struct sfq
*sfq
)
3403 const char *perturb_s
;
3404 const char *quantum_s
;
3408 perturb_s
= smap_get(details
, "perturb");
3409 quantum_s
= smap_get(details
, "quantum");
3410 sfq
->perturb
= perturb_s
? strtoull(perturb_s
, NULL
, 10) : 0;
3411 sfq
->quantum
= quantum_s
? strtoull(quantum_s
, NULL
, 10) : 0;
3412 if (!sfq
->perturb
) {
3416 if (!sfq
->quantum
) {
3417 mtu_error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
3421 VLOG_WARN_RL(&rl
, "when using SFQ, you must specify quantum on a "
3422 "device without mtu");
3429 sfq_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3434 sfq_parse_qdisc_details__(netdev
, details
, &sfq
);
3435 error
= sfq_setup_qdisc__(netdev
, sfq
.quantum
, sfq
.perturb
);
3437 sfq_install__(netdev
, sfq
.quantum
, sfq
.perturb
);
/* tc_ops callback: adopts an existing kernel SFQ qdisc from 'nlmsg'.
 *
 * Fix: the previous code called sfq_install__(netdev, sfq->perturb_period,
 * sfq->quantum), passing the arguments in the reverse of sfq_install__'s
 * (netdev, quantum, perturb) signature, so the cached quantum and perturb
 * values were swapped. */
static int
sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
{
    const struct tc_sfq_qopt *sfq;
    struct nlattr *nlattr;
    const char *kind;
    int error;

    error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
    if (error == 0) {
        sfq = nl_attr_get(nlattr);
        sfq_install__(netdev, sfq->quantum, sfq->perturb_period);
        return 0;
    }

    return error;
}
3461 sfq_tc_destroy(struct tc
*tc
)
3463 struct sfq
*sfq
= CONTAINER_OF(tc
, struct sfq
, tc
);
3469 sfq_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
3471 const struct sfq
*sfq
= sfq_get__(netdev
);
3472 smap_add_format(details
, "quantum", "%u", sfq
->quantum
);
3473 smap_add_format(details
, "perturb", "%u", sfq
->perturb
);
3478 sfq_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
3482 sfq_parse_qdisc_details__(netdev
, details
, &sfq
);
3483 sfq_install__(netdev
, sfq
.quantum
, sfq
.perturb
);
3484 sfq_get__(netdev
)->quantum
= sfq
.quantum
;
3485 sfq_get__(netdev
)->perturb
= sfq
.perturb
;
3489 static const struct tc_ops tc_ops_sfq
= {
3490 "sfq", /* linux_name */
3491 "linux-sfq", /* ovs_name */
3492 SFQ_N_QUEUES
, /* n_queues */
3505 /* HTB traffic control class. */
3507 #define HTB_N_QUEUES 0xf000
3508 #define HTB_RATE2QUANTUM 10
3512 unsigned int max_rate
; /* In bytes/s. */
3516 struct tc_queue tc_queue
;
3517 unsigned int min_rate
; /* In bytes/s. */
3518 unsigned int max_rate
; /* In bytes/s. */
3519 unsigned int burst
; /* In bytes. */
3520 unsigned int priority
; /* Lower values are higher priorities. */
3524 htb_get__(const struct netdev
*netdev_
)
3526 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3527 return CONTAINER_OF(netdev
->tc
, struct htb
, tc
);
3531 htb_install__(struct netdev
*netdev_
, uint64_t max_rate
)
3533 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3536 htb
= xmalloc(sizeof *htb
);
3537 tc_init(&htb
->tc
, &tc_ops_htb
);
3538 htb
->max_rate
= max_rate
;
3540 netdev
->tc
= &htb
->tc
;
3543 /* Create an HTB qdisc.
3545 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
3547 htb_setup_qdisc__(struct netdev
*netdev
)
3550 struct tc_htb_glob opt
;
3551 struct ofpbuf request
;
3552 struct tcmsg
*tcmsg
;
3554 tc_del_qdisc(netdev
);
3556 tcmsg
= tc_make_request(netdev
, RTM_NEWQDISC
,
3557 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3561 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3562 tcmsg
->tcm_parent
= TC_H_ROOT
;
3564 nl_msg_put_string(&request
, TCA_KIND
, "htb");
3566 memset(&opt
, 0, sizeof opt
);
3567 opt
.rate2quantum
= HTB_RATE2QUANTUM
;
3571 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
3572 nl_msg_put_unspec(&request
, TCA_HTB_INIT
, &opt
, sizeof opt
);
3573 nl_msg_end_nested(&request
, opt_offset
);
3575 return tc_transact(&request
, NULL
);
3578 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
3579 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
3581 htb_setup_class__(struct netdev
*netdev
, unsigned int handle
,
3582 unsigned int parent
, struct htb_class
*class)
3585 struct tc_htb_opt opt
;
3586 struct ofpbuf request
;
3587 struct tcmsg
*tcmsg
;
3591 error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
3593 VLOG_WARN_RL(&rl
, "cannot set up HTB on device %s that lacks MTU",
3594 netdev_get_name(netdev
));
3598 memset(&opt
, 0, sizeof opt
);
3599 tc_fill_rate(&opt
.rate
, class->min_rate
, mtu
);
3600 tc_fill_rate(&opt
.ceil
, class->max_rate
, mtu
);
3601 /* Makes sure the quantum is at least MTU. Setting quantum will
3602 * make htb ignore the r2q for this class. */
3603 if ((class->min_rate
/ HTB_RATE2QUANTUM
) < mtu
) {
3606 opt
.buffer
= tc_calc_buffer(opt
.rate
.rate
, mtu
, class->burst
);
3607 opt
.cbuffer
= tc_calc_buffer(opt
.ceil
.rate
, mtu
, class->burst
);
3608 opt
.prio
= class->priority
;
3610 tcmsg
= tc_make_request(netdev
, RTM_NEWTCLASS
, NLM_F_CREATE
, &request
);
3614 tcmsg
->tcm_handle
= handle
;
3615 tcmsg
->tcm_parent
= parent
;
3617 nl_msg_put_string(&request
, TCA_KIND
, "htb");
3618 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
3619 nl_msg_put_unspec(&request
, TCA_HTB_PARMS
, &opt
, sizeof opt
);
3620 tc_put_rtab(&request
, TCA_HTB_RTAB
, &opt
.rate
);
3621 tc_put_rtab(&request
, TCA_HTB_CTAB
, &opt
.ceil
);
3622 nl_msg_end_nested(&request
, opt_offset
);
3624 error
= tc_transact(&request
, NULL
);
3626 VLOG_WARN_RL(&rl
, "failed to replace %s class %u:%u, parent %u:%u, "
3627 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
3628 netdev_get_name(netdev
),
3629 tc_get_major(handle
), tc_get_minor(handle
),
3630 tc_get_major(parent
), tc_get_minor(parent
),
3631 class->min_rate
, class->max_rate
,
3632 class->burst
, class->priority
, ovs_strerror(error
));
3637 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
3638 * description of them into 'details'. The description complies with the
3639 * specification given in the vswitch database documentation for linux-htb
3642 htb_parse_tca_options__(struct nlattr
*nl_options
, struct htb_class
*class)
3644 static const struct nl_policy tca_htb_policy
[] = {
3645 [TCA_HTB_PARMS
] = { .type
= NL_A_UNSPEC
, .optional
= false,
3646 .min_len
= sizeof(struct tc_htb_opt
) },
3649 struct nlattr
*attrs
[ARRAY_SIZE(tca_htb_policy
)];
3650 const struct tc_htb_opt
*htb
;
3652 if (!nl_parse_nested(nl_options
, tca_htb_policy
,
3653 attrs
, ARRAY_SIZE(tca_htb_policy
))) {
3654 VLOG_WARN_RL(&rl
, "failed to parse HTB class options");
3658 htb
= nl_attr_get(attrs
[TCA_HTB_PARMS
]);
3659 class->min_rate
= htb
->rate
.rate
;
3660 class->max_rate
= htb
->ceil
.rate
;
3661 class->burst
= tc_ticks_to_bytes(htb
->rate
.rate
, htb
->buffer
);
3662 class->priority
= htb
->prio
;
3667 htb_parse_tcmsg__(struct ofpbuf
*tcmsg
, unsigned int *queue_id
,
3668 struct htb_class
*options
,
3669 struct netdev_queue_stats
*stats
)
3671 struct nlattr
*nl_options
;
3672 unsigned int handle
;
3675 error
= tc_parse_class(tcmsg
, &handle
, &nl_options
, stats
);
3676 if (!error
&& queue_id
) {
3677 unsigned int major
= tc_get_major(handle
);
3678 unsigned int minor
= tc_get_minor(handle
);
3679 if (major
== 1 && minor
> 0 && minor
<= HTB_N_QUEUES
) {
3680 *queue_id
= minor
- 1;
3685 if (!error
&& options
) {
3686 error
= htb_parse_tca_options__(nl_options
, options
);
3692 htb_parse_qdisc_details__(struct netdev
*netdev_
,
3693 const struct smap
*details
, struct htb_class
*hc
)
3695 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3696 const char *max_rate_s
;
3698 max_rate_s
= smap_get(details
, "max-rate");
3699 hc
->max_rate
= max_rate_s
? strtoull(max_rate_s
, NULL
, 10) / 8 : 0;
3700 if (!hc
->max_rate
) {
3701 enum netdev_features current
;
3703 netdev_linux_read_features(netdev
);
3704 current
= !netdev
->get_features_error
? netdev
->current
: 0;
3705 hc
->max_rate
= netdev_features_to_bps(current
, 100 * 1000 * 1000) / 8;
3707 hc
->min_rate
= hc
->max_rate
;
3713 htb_parse_class_details__(struct netdev
*netdev
,
3714 const struct smap
*details
, struct htb_class
*hc
)
3716 const struct htb
*htb
= htb_get__(netdev
);
3717 const char *min_rate_s
= smap_get(details
, "min-rate");
3718 const char *max_rate_s
= smap_get(details
, "max-rate");
3719 const char *burst_s
= smap_get(details
, "burst");
3720 const char *priority_s
= smap_get(details
, "priority");
3723 error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
3725 VLOG_WARN_RL(&rl
, "cannot parse HTB class on device %s that lacks MTU",
3726 netdev_get_name(netdev
));
3730 /* HTB requires at least an mtu sized min-rate to send any traffic even
3731 * on uncongested links. */
3732 hc
->min_rate
= min_rate_s
? strtoull(min_rate_s
, NULL
, 10) / 8 : 0;
3733 hc
->min_rate
= MAX(hc
->min_rate
, mtu
);
3734 hc
->min_rate
= MIN(hc
->min_rate
, htb
->max_rate
);
3737 hc
->max_rate
= (max_rate_s
3738 ? strtoull(max_rate_s
, NULL
, 10) / 8
3740 hc
->max_rate
= MAX(hc
->max_rate
, hc
->min_rate
);
3741 hc
->max_rate
= MIN(hc
->max_rate
, htb
->max_rate
);
3745 * According to hints in the documentation that I've read, it is important
3746 * that 'burst' be at least as big as the largest frame that might be
3747 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3748 * but having it a bit too small is a problem. Since netdev_get_mtu()
3749 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3750 * the MTU. We actually add 64, instead of 14, as a guard against
3751 * additional headers get tacked on somewhere that we're not aware of. */
3752 hc
->burst
= burst_s
? strtoull(burst_s
, NULL
, 10) / 8 : 0;
3753 hc
->burst
= MAX(hc
->burst
, mtu
+ 64);
3756 hc
->priority
= priority_s
? strtoul(priority_s
, NULL
, 10) : 0;
3762 htb_query_class__(const struct netdev
*netdev
, unsigned int handle
,
3763 unsigned int parent
, struct htb_class
*options
,
3764 struct netdev_queue_stats
*stats
)
3766 struct ofpbuf
*reply
;
3769 error
= tc_query_class(netdev
, handle
, parent
, &reply
);
3771 error
= htb_parse_tcmsg__(reply
, NULL
, options
, stats
);
3772 ofpbuf_delete(reply
);
3778 htb_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3782 error
= htb_setup_qdisc__(netdev
);
3784 struct htb_class hc
;
3786 htb_parse_qdisc_details__(netdev
, details
, &hc
);
3787 error
= htb_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
3788 tc_make_handle(1, 0), &hc
);
3790 htb_install__(netdev
, hc
.max_rate
);
3796 static struct htb_class
*
3797 htb_class_cast__(const struct tc_queue
*queue
)
3799 return CONTAINER_OF(queue
, struct htb_class
, tc_queue
);
3803 htb_update_queue__(struct netdev
*netdev
, unsigned int queue_id
,
3804 const struct htb_class
*hc
)
3806 struct htb
*htb
= htb_get__(netdev
);
3807 size_t hash
= hash_int(queue_id
, 0);
3808 struct tc_queue
*queue
;
3809 struct htb_class
*hcp
;
3811 queue
= tc_find_queue__(netdev
, queue_id
, hash
);
3813 hcp
= htb_class_cast__(queue
);
3815 hcp
= xmalloc(sizeof *hcp
);
3816 queue
= &hcp
->tc_queue
;
3817 queue
->queue_id
= queue_id
;
3818 queue
->created
= time_msec();
3819 hmap_insert(&htb
->tc
.queues
, &queue
->hmap_node
, hash
);
3822 hcp
->min_rate
= hc
->min_rate
;
3823 hcp
->max_rate
= hc
->max_rate
;
3824 hcp
->burst
= hc
->burst
;
3825 hcp
->priority
= hc
->priority
;
3829 htb_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
3832 struct queue_dump_state state
;
3833 struct htb_class hc
;
3835 /* Get qdisc options. */
3837 htb_query_class__(netdev
, tc_make_handle(1, 0xfffe), 0, &hc
, NULL
);
3838 htb_install__(netdev
, hc
.max_rate
);
3841 if (!start_queue_dump(netdev
, &state
)) {
3844 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
3845 unsigned int queue_id
;
3847 if (!htb_parse_tcmsg__(&msg
, &queue_id
, &hc
, NULL
)) {
3848 htb_update_queue__(netdev
, queue_id
, &hc
);
3851 finish_queue_dump(&state
);
3857 htb_tc_destroy(struct tc
*tc
)
3859 struct htb
*htb
= CONTAINER_OF(tc
, struct htb
, tc
);
3860 struct htb_class
*hc
;
3862 HMAP_FOR_EACH_POP (hc
, tc_queue
.hmap_node
, &htb
->tc
.queues
) {
3870 htb_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
3872 const struct htb
*htb
= htb_get__(netdev
);
3873 smap_add_format(details
, "max-rate", "%llu", 8ULL * htb
->max_rate
);
3878 htb_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
3880 struct htb_class hc
;
3883 htb_parse_qdisc_details__(netdev
, details
, &hc
);
3884 error
= htb_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
3885 tc_make_handle(1, 0), &hc
);
3887 htb_get__(netdev
)->max_rate
= hc
.max_rate
;
3893 htb_class_get(const struct netdev
*netdev OVS_UNUSED
,
3894 const struct tc_queue
*queue
, struct smap
*details
)
3896 const struct htb_class
*hc
= htb_class_cast__(queue
);
3898 smap_add_format(details
, "min-rate", "%llu", 8ULL * hc
->min_rate
);
3899 if (hc
->min_rate
!= hc
->max_rate
) {
3900 smap_add_format(details
, "max-rate", "%llu", 8ULL * hc
->max_rate
);
3902 smap_add_format(details
, "burst", "%llu", 8ULL * hc
->burst
);
3904 smap_add_format(details
, "priority", "%u", hc
->priority
);
3910 htb_class_set(struct netdev
*netdev
, unsigned int queue_id
,
3911 const struct smap
*details
)
3913 struct htb_class hc
;
3916 error
= htb_parse_class_details__(netdev
, details
, &hc
);
3921 error
= htb_setup_class__(netdev
, tc_make_handle(1, queue_id
+ 1),
3922 tc_make_handle(1, 0xfffe), &hc
);
3927 htb_update_queue__(netdev
, queue_id
, &hc
);
3932 htb_class_delete(struct netdev
*netdev
, struct tc_queue
*queue
)
3934 struct htb_class
*hc
= htb_class_cast__(queue
);
3935 struct htb
*htb
= htb_get__(netdev
);
3938 error
= tc_delete_class(netdev
, tc_make_handle(1, queue
->queue_id
+ 1));
3940 hmap_remove(&htb
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
3947 htb_class_get_stats(const struct netdev
*netdev
, const struct tc_queue
*queue
,
3948 struct netdev_queue_stats
*stats
)
3950 return htb_query_class__(netdev
, tc_make_handle(1, queue
->queue_id
+ 1),
3951 tc_make_handle(1, 0xfffe), NULL
, stats
);
3955 htb_class_dump_stats(const struct netdev
*netdev OVS_UNUSED
,
3956 const struct ofpbuf
*nlmsg
,
3957 netdev_dump_queue_stats_cb
*cb
, void *aux
)
3959 struct netdev_queue_stats stats
;
3960 unsigned int handle
, major
, minor
;
3963 error
= tc_parse_class(nlmsg
, &handle
, NULL
, &stats
);
3968 major
= tc_get_major(handle
);
3969 minor
= tc_get_minor(handle
);
3970 if (major
== 1 && minor
> 0 && minor
<= HTB_N_QUEUES
) {
3971 (*cb
)(minor
- 1, &stats
, aux
);
3976 static const struct tc_ops tc_ops_htb
= {
3977 "htb", /* linux_name */
3978 "linux-htb", /* ovs_name */
3979 HTB_N_QUEUES
, /* n_queues */
3988 htb_class_get_stats
,
3989 htb_class_dump_stats
3992 /* "linux-hfsc" traffic control class. */
3994 #define HFSC_N_QUEUES 0xf000
4002 struct tc_queue tc_queue
;
4007 static struct hfsc
*
4008 hfsc_get__(const struct netdev
*netdev_
)
4010 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4011 return CONTAINER_OF(netdev
->tc
, struct hfsc
, tc
);
4014 static struct hfsc_class
*
4015 hfsc_class_cast__(const struct tc_queue
*queue
)
4017 return CONTAINER_OF(queue
, struct hfsc_class
, tc_queue
);
4021 hfsc_install__(struct netdev
*netdev_
, uint32_t max_rate
)
4023 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4026 hfsc
= xmalloc(sizeof *hfsc
);
4027 tc_init(&hfsc
->tc
, &tc_ops_hfsc
);
4028 hfsc
->max_rate
= max_rate
;
4029 netdev
->tc
= &hfsc
->tc
;
4033 hfsc_update_queue__(struct netdev
*netdev
, unsigned int queue_id
,
4034 const struct hfsc_class
*hc
)
4038 struct hfsc_class
*hcp
;
4039 struct tc_queue
*queue
;
4041 hfsc
= hfsc_get__(netdev
);
4042 hash
= hash_int(queue_id
, 0);
4044 queue
= tc_find_queue__(netdev
, queue_id
, hash
);
4046 hcp
= hfsc_class_cast__(queue
);
4048 hcp
= xmalloc(sizeof *hcp
);
4049 queue
= &hcp
->tc_queue
;
4050 queue
->queue_id
= queue_id
;
4051 queue
->created
= time_msec();
4052 hmap_insert(&hfsc
->tc
.queues
, &queue
->hmap_node
, hash
);
4055 hcp
->min_rate
= hc
->min_rate
;
4056 hcp
->max_rate
= hc
->max_rate
;
4060 hfsc_parse_tca_options__(struct nlattr
*nl_options
, struct hfsc_class
*class)
4062 const struct tc_service_curve
*rsc
, *fsc
, *usc
;
4063 static const struct nl_policy tca_hfsc_policy
[] = {
4065 .type
= NL_A_UNSPEC
,
4067 .min_len
= sizeof(struct tc_service_curve
),
4070 .type
= NL_A_UNSPEC
,
4072 .min_len
= sizeof(struct tc_service_curve
),
4075 .type
= NL_A_UNSPEC
,
4077 .min_len
= sizeof(struct tc_service_curve
),
4080 struct nlattr
*attrs
[ARRAY_SIZE(tca_hfsc_policy
)];
4082 if (!nl_parse_nested(nl_options
, tca_hfsc_policy
,
4083 attrs
, ARRAY_SIZE(tca_hfsc_policy
))) {
4084 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options");
4088 rsc
= nl_attr_get(attrs
[TCA_HFSC_RSC
]);
4089 fsc
= nl_attr_get(attrs
[TCA_HFSC_FSC
]);
4090 usc
= nl_attr_get(attrs
[TCA_HFSC_USC
]);
4092 if (rsc
->m1
!= 0 || rsc
->d
!= 0 ||
4093 fsc
->m1
!= 0 || fsc
->d
!= 0 ||
4094 usc
->m1
!= 0 || usc
->d
!= 0) {
4095 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
4096 "Non-linear service curves are not supported.");
4100 if (rsc
->m2
!= fsc
->m2
) {
4101 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
4102 "Real-time service curves are not supported ");
4106 if (rsc
->m2
> usc
->m2
) {
4107 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
4108 "Min-rate service curve is greater than "
4109 "the max-rate service curve.");
4113 class->min_rate
= fsc
->m2
;
4114 class->max_rate
= usc
->m2
;
4119 hfsc_parse_tcmsg__(struct ofpbuf
*tcmsg
, unsigned int *queue_id
,
4120 struct hfsc_class
*options
,
4121 struct netdev_queue_stats
*stats
)
4124 unsigned int handle
;
4125 struct nlattr
*nl_options
;
4127 error
= tc_parse_class(tcmsg
, &handle
, &nl_options
, stats
);
4133 unsigned int major
, minor
;
4135 major
= tc_get_major(handle
);
4136 minor
= tc_get_minor(handle
);
4137 if (major
== 1 && minor
> 0 && minor
<= HFSC_N_QUEUES
) {
4138 *queue_id
= minor
- 1;
4145 error
= hfsc_parse_tca_options__(nl_options
, options
);
4152 hfsc_query_class__(const struct netdev
*netdev
, unsigned int handle
,
4153 unsigned int parent
, struct hfsc_class
*options
,
4154 struct netdev_queue_stats
*stats
)
4157 struct ofpbuf
*reply
;
4159 error
= tc_query_class(netdev
, handle
, parent
, &reply
);
4164 error
= hfsc_parse_tcmsg__(reply
, NULL
, options
, stats
);
4165 ofpbuf_delete(reply
);
4170 hfsc_parse_qdisc_details__(struct netdev
*netdev_
, const struct smap
*details
,
4171 struct hfsc_class
*class)
4173 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4175 const char *max_rate_s
;
4177 max_rate_s
= smap_get(details
, "max-rate");
4178 max_rate
= max_rate_s
? strtoull(max_rate_s
, NULL
, 10) / 8 : 0;
4181 enum netdev_features current
;
4183 netdev_linux_read_features(netdev
);
4184 current
= !netdev
->get_features_error
? netdev
->current
: 0;
4185 max_rate
= netdev_features_to_bps(current
, 100 * 1000 * 1000) / 8;
4188 class->min_rate
= max_rate
;
4189 class->max_rate
= max_rate
;
4193 hfsc_parse_class_details__(struct netdev
*netdev
,
4194 const struct smap
*details
,
4195 struct hfsc_class
* class)
4197 const struct hfsc
*hfsc
;
4198 uint32_t min_rate
, max_rate
;
4199 const char *min_rate_s
, *max_rate_s
;
4201 hfsc
= hfsc_get__(netdev
);
4202 min_rate_s
= smap_get(details
, "min-rate");
4203 max_rate_s
= smap_get(details
, "max-rate");
4205 min_rate
= min_rate_s
? strtoull(min_rate_s
, NULL
, 10) / 8 : 0;
4206 min_rate
= MAX(min_rate
, 1);
4207 min_rate
= MIN(min_rate
, hfsc
->max_rate
);
4209 max_rate
= (max_rate_s
4210 ? strtoull(max_rate_s
, NULL
, 10) / 8
4212 max_rate
= MAX(max_rate
, min_rate
);
4213 max_rate
= MIN(max_rate
, hfsc
->max_rate
);
4215 class->min_rate
= min_rate
;
4216 class->max_rate
= max_rate
;
4221 /* Create an HFSC qdisc.
4223 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
4225 hfsc_setup_qdisc__(struct netdev
* netdev
)
4227 struct tcmsg
*tcmsg
;
4228 struct ofpbuf request
;
4229 struct tc_hfsc_qopt opt
;
4231 tc_del_qdisc(netdev
);
4233 tcmsg
= tc_make_request(netdev
, RTM_NEWQDISC
,
4234 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
4240 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
4241 tcmsg
->tcm_parent
= TC_H_ROOT
;
4243 memset(&opt
, 0, sizeof opt
);
4246 nl_msg_put_string(&request
, TCA_KIND
, "hfsc");
4247 nl_msg_put_unspec(&request
, TCA_OPTIONS
, &opt
, sizeof opt
);
4249 return tc_transact(&request
, NULL
);
4252 /* Create an HFSC class.
4254 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
4255 * sc rate <min_rate> ul rate <max_rate>" */
4257 hfsc_setup_class__(struct netdev
*netdev
, unsigned int handle
,
4258 unsigned int parent
, struct hfsc_class
*class)
4262 struct tcmsg
*tcmsg
;
4263 struct ofpbuf request
;
4264 struct tc_service_curve min
, max
;
4266 tcmsg
= tc_make_request(netdev
, RTM_NEWTCLASS
, NLM_F_CREATE
, &request
);
4272 tcmsg
->tcm_handle
= handle
;
4273 tcmsg
->tcm_parent
= parent
;
4277 min
.m2
= class->min_rate
;
4281 max
.m2
= class->max_rate
;
4283 nl_msg_put_string(&request
, TCA_KIND
, "hfsc");
4284 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
4285 nl_msg_put_unspec(&request
, TCA_HFSC_RSC
, &min
, sizeof min
);
4286 nl_msg_put_unspec(&request
, TCA_HFSC_FSC
, &min
, sizeof min
);
4287 nl_msg_put_unspec(&request
, TCA_HFSC_USC
, &max
, sizeof max
);
4288 nl_msg_end_nested(&request
, opt_offset
);
4290 error
= tc_transact(&request
, NULL
);
4292 VLOG_WARN_RL(&rl
, "failed to replace %s class %u:%u, parent %u:%u, "
4293 "min-rate %ubps, max-rate %ubps (%s)",
4294 netdev_get_name(netdev
),
4295 tc_get_major(handle
), tc_get_minor(handle
),
4296 tc_get_major(parent
), tc_get_minor(parent
),
4297 class->min_rate
, class->max_rate
, ovs_strerror(error
));
4304 hfsc_tc_install(struct netdev
*netdev
, const struct smap
*details
)
4307 struct hfsc_class
class;
4309 error
= hfsc_setup_qdisc__(netdev
);
4315 hfsc_parse_qdisc_details__(netdev
, details
, &class);
4316 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
4317 tc_make_handle(1, 0), &class);
4323 hfsc_install__(netdev
, class.max_rate
);
4328 hfsc_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4331 struct queue_dump_state state
;
4332 struct hfsc_class hc
;
4335 hfsc_query_class__(netdev
, tc_make_handle(1, 0xfffe), 0, &hc
, NULL
);
4336 hfsc_install__(netdev
, hc
.max_rate
);
4338 if (!start_queue_dump(netdev
, &state
)) {
4342 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
4343 unsigned int queue_id
;
4345 if (!hfsc_parse_tcmsg__(&msg
, &queue_id
, &hc
, NULL
)) {
4346 hfsc_update_queue__(netdev
, queue_id
, &hc
);
4350 finish_queue_dump(&state
);
4355 hfsc_tc_destroy(struct tc
*tc
)
4358 struct hfsc_class
*hc
, *next
;
4360 hfsc
= CONTAINER_OF(tc
, struct hfsc
, tc
);
4362 HMAP_FOR_EACH_SAFE (hc
, next
, tc_queue
.hmap_node
, &hfsc
->tc
.queues
) {
4363 hmap_remove(&hfsc
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
4372 hfsc_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
4374 const struct hfsc
*hfsc
;
4375 hfsc
= hfsc_get__(netdev
);
4376 smap_add_format(details
, "max-rate", "%llu", 8ULL * hfsc
->max_rate
);
4381 hfsc_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
4384 struct hfsc_class
class;
4386 hfsc_parse_qdisc_details__(netdev
, details
, &class);
4387 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
4388 tc_make_handle(1, 0), &class);
4391 hfsc_get__(netdev
)->max_rate
= class.max_rate
;
4398 hfsc_class_get(const struct netdev
*netdev OVS_UNUSED
,
4399 const struct tc_queue
*queue
, struct smap
*details
)
4401 const struct hfsc_class
*hc
;
4403 hc
= hfsc_class_cast__(queue
);
4404 smap_add_format(details
, "min-rate", "%llu", 8ULL * hc
->min_rate
);
4405 if (hc
->min_rate
!= hc
->max_rate
) {
4406 smap_add_format(details
, "max-rate", "%llu", 8ULL * hc
->max_rate
);
4412 hfsc_class_set(struct netdev
*netdev
, unsigned int queue_id
,
4413 const struct smap
*details
)
4416 struct hfsc_class
class;
4418 error
= hfsc_parse_class_details__(netdev
, details
, &class);
4423 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, queue_id
+ 1),
4424 tc_make_handle(1, 0xfffe), &class);
4429 hfsc_update_queue__(netdev
, queue_id
, &class);
4434 hfsc_class_delete(struct netdev
*netdev
, struct tc_queue
*queue
)
4438 struct hfsc_class
*hc
;
4440 hc
= hfsc_class_cast__(queue
);
4441 hfsc
= hfsc_get__(netdev
);
4443 error
= tc_delete_class(netdev
, tc_make_handle(1, queue
->queue_id
+ 1));
4445 hmap_remove(&hfsc
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
4452 hfsc_class_get_stats(const struct netdev
*netdev
, const struct tc_queue
*queue
,
4453 struct netdev_queue_stats
*stats
)
4455 return hfsc_query_class__(netdev
, tc_make_handle(1, queue
->queue_id
+ 1),
4456 tc_make_handle(1, 0xfffe), NULL
, stats
);
4460 hfsc_class_dump_stats(const struct netdev
*netdev OVS_UNUSED
,
4461 const struct ofpbuf
*nlmsg
,
4462 netdev_dump_queue_stats_cb
*cb
, void *aux
)
4464 struct netdev_queue_stats stats
;
4465 unsigned int handle
, major
, minor
;
4468 error
= tc_parse_class(nlmsg
, &handle
, NULL
, &stats
);
4473 major
= tc_get_major(handle
);
4474 minor
= tc_get_minor(handle
);
4475 if (major
== 1 && minor
> 0 && minor
<= HFSC_N_QUEUES
) {
4476 (*cb
)(minor
- 1, &stats
, aux
);
4481 static const struct tc_ops tc_ops_hfsc
= {
4482 "hfsc", /* linux_name */
4483 "linux-hfsc", /* ovs_name */
4484 HFSC_N_QUEUES
, /* n_queues */
4485 hfsc_tc_install
, /* tc_install */
4486 hfsc_tc_load
, /* tc_load */
4487 hfsc_tc_destroy
, /* tc_destroy */
4488 hfsc_qdisc_get
, /* qdisc_get */
4489 hfsc_qdisc_set
, /* qdisc_set */
4490 hfsc_class_get
, /* class_get */
4491 hfsc_class_set
, /* class_set */
4492 hfsc_class_delete
, /* class_delete */
4493 hfsc_class_get_stats
, /* class_get_stats */
4494 hfsc_class_dump_stats
/* class_dump_stats */
4497 /* "linux-noop" traffic control class. */
4500 noop_install__(struct netdev
*netdev_
)
4502 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4503 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_default
);
4505 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
4509 noop_tc_install(struct netdev
*netdev
,
4510 const struct smap
*details OVS_UNUSED
)
4512 noop_install__(netdev
);
4517 noop_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4519 noop_install__(netdev
);
4523 static const struct tc_ops tc_ops_noop
= {
4524 NULL
, /* linux_name */
4525 "linux-noop", /* ovs_name */
4529 NULL
, /* tc_destroy */
4530 NULL
, /* qdisc_get */
4531 NULL
, /* qdisc_set */
4532 NULL
, /* class_get */
4533 NULL
, /* class_set */
4534 NULL
, /* class_delete */
4535 NULL
, /* class_get_stats */
4536 NULL
/* class_dump_stats */
4539 /* "linux-default" traffic control class.
4541 * This class represents the default, unnamed Linux qdisc. It corresponds to
4542 * the "" (empty string) QoS type in the OVS database. */
4545 default_install__(struct netdev
*netdev_
)
4547 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4548 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_default
);
4550 /* Nothing but a tc class implementation is allowed to write to a tc. This
4551 * class never does that, so we can legitimately use a const tc object. */
4552 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
4556 default_tc_install(struct netdev
*netdev
,
4557 const struct smap
*details OVS_UNUSED
)
4559 default_install__(netdev
);
4564 default_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4566 default_install__(netdev
);
4570 static const struct tc_ops tc_ops_default
= {
4571 NULL
, /* linux_name */
4576 NULL
, /* tc_destroy */
4577 NULL
, /* qdisc_get */
4578 NULL
, /* qdisc_set */
4579 NULL
, /* class_get */
4580 NULL
, /* class_set */
4581 NULL
, /* class_delete */
4582 NULL
, /* class_get_stats */
4583 NULL
/* class_dump_stats */
4586 /* "linux-other" traffic control class.
4591 other_tc_load(struct netdev
*netdev_
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4593 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4594 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_other
);
4596 /* Nothing but a tc class implementation is allowed to write to a tc. This
4597 * class never does that, so we can legitimately use a const tc object. */
4598 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
4602 static const struct tc_ops tc_ops_other
= {
4603 NULL
, /* linux_name */
4604 "linux-other", /* ovs_name */
4606 NULL
, /* tc_install */
4608 NULL
, /* tc_destroy */
4609 NULL
, /* qdisc_get */
4610 NULL
, /* qdisc_set */
4611 NULL
, /* class_get */
4612 NULL
, /* class_set */
4613 NULL
, /* class_delete */
4614 NULL
, /* class_get_stats */
4615 NULL
/* class_dump_stats */
4618 /* Traffic control. */
4620 /* Number of kernel "tc" ticks per second. */
4621 static double ticks_per_s
;
4623 /* Number of kernel "jiffies" per second. This is used for the purpose of
4624 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
4625 * one jiffy's worth of data.
4627 * There are two possibilities here:
4629 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
4630 * approximate range of 100 to 1024. That means that we really need to
4631 * make sure that the qdisc can buffer that much data.
4633 * - 'buffer_hz' is an absurdly large number. That means that the kernel
4634 * has finely granular timers and there's no need to fudge additional room
4635 * for buffers. (There's no extra effort needed to implement that: the
4636 * large 'buffer_hz' is used as a divisor, so practically any number will
4637 * come out as 0 in the division. Small integer results in the case of
4638 * really high dividends won't have any real effect anyhow.)
4640 static unsigned int buffer_hz
;
/* Returns tc handle 'major':'minor'.  The kernel encodes the major number in
 * the upper 16 bits of a handle, so 'major' is shifted up before being
 * combined with 'minor' via TC_H_MAKE. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    return TC_H_MAKE(major << 16, minor);
}
/* Returns the major number from 'handle' (the upper 16 bits, masked by
 * TC_H_MAJ and shifted back down). */
static unsigned int
tc_get_major(unsigned int handle)
{
    return TC_H_MAJ(handle) >> 16;
}
/* Returns the minor number from 'handle' (the lower 16 bits, per
 * TC_H_MIN). */
static unsigned int
tc_get_minor(unsigned int handle)
{
    return TC_H_MIN(handle);
}
4663 static struct tcmsg
*
4664 tc_make_request(const struct netdev
*netdev
, int type
, unsigned int flags
,
4665 struct ofpbuf
*request
)
4667 struct tcmsg
*tcmsg
;
4671 error
= get_ifindex(netdev
, &ifindex
);
4676 ofpbuf_init(request
, 512);
4677 nl_msg_put_nlmsghdr(request
, sizeof *tcmsg
, type
, NLM_F_REQUEST
| flags
);
4678 tcmsg
= ofpbuf_put_zeros(request
, sizeof *tcmsg
);
4679 tcmsg
->tcm_family
= AF_UNSPEC
;
4680 tcmsg
->tcm_ifindex
= ifindex
;
4681 /* Caller should fill in tcmsg->tcm_handle. */
4682 /* Caller should fill in tcmsg->tcm_parent. */
/* Sends 'request' over NETLINK_ROUTE and, if 'replyp' is nonnull, stores the
 * kernel's reply into '*replyp' (owned by the caller).  Always uninitializes
 * 'request', so the caller need not clean it up regardless of outcome.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
{
    int error = nl_transact(NETLINK_ROUTE, request, replyp);
    ofpbuf_uninit(request);
    return error;
}
4695 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
4696 * policing configuration.
4698 * This function is equivalent to running the following when 'add' is true:
4699 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
4701 * This function is equivalent to running the following when 'add' is false:
4702 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
4704 * The configuration and stats may be seen with the following command:
4705 * /sbin/tc -s qdisc show dev <devname>
4707 * Returns 0 if successful, otherwise a positive errno value.
4710 tc_add_del_ingress_qdisc(struct netdev
*netdev
, bool add
)
4712 struct ofpbuf request
;
4713 struct tcmsg
*tcmsg
;
4715 int type
= add
? RTM_NEWQDISC
: RTM_DELQDISC
;
4716 int flags
= add
? NLM_F_EXCL
| NLM_F_CREATE
: 0;
4718 tcmsg
= tc_make_request(netdev
, type
, flags
, &request
);
4722 tcmsg
->tcm_handle
= tc_make_handle(0xffff, 0);
4723 tcmsg
->tcm_parent
= TC_H_INGRESS
;
4724 nl_msg_put_string(&request
, TCA_KIND
, "ingress");
4725 nl_msg_put_unspec(&request
, TCA_OPTIONS
, NULL
, 0);
4727 error
= tc_transact(&request
, NULL
);
4729 /* If we're deleting the qdisc, don't worry about some of the
4730 * error conditions. */
4731 if (!add
&& (error
== ENOENT
|| error
== EINVAL
)) {
4740 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4743 * This function is equivalent to running:
4744 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4745 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4748 * The configuration and stats may be seen with the following command:
4749 * /sbin/tc -s filter show dev <devname> parent ffff:
4751 * Returns 0 if successful, otherwise a positive errno value.
4754 tc_add_policer(struct netdev
*netdev
,
4755 uint32_t kbits_rate
, uint32_t kbits_burst
)
4757 struct tc_police tc_police
;
4758 struct ofpbuf request
;
4759 struct tcmsg
*tcmsg
;
4760 size_t basic_offset
;
4761 size_t police_offset
;
4765 memset(&tc_police
, 0, sizeof tc_police
);
4766 tc_police
.action
= TC_POLICE_SHOT
;
4767 tc_police
.mtu
= mtu
;
4768 tc_fill_rate(&tc_police
.rate
, ((uint64_t) kbits_rate
* 1000)/8, mtu
);
4770 /* The following appears wrong in one way: In networking a kilobit is
4771 * usually 1000 bits but this uses 1024 bits.
4773 * However if you "fix" those problems then "tc filter show ..." shows
4774 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
4775 * 1,000,000 bits, whereas this actually ends up doing the right thing from
4776 * tc's point of view. Whatever. */
4777 tc_police
.burst
= tc_bytes_to_ticks(
4778 tc_police
.rate
.rate
, MIN(UINT32_MAX
/ 1024, kbits_burst
) * 1024 / 8);
4780 tcmsg
= tc_make_request(netdev
, RTM_NEWTFILTER
,
4781 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
4785 tcmsg
->tcm_parent
= tc_make_handle(0xffff, 0);
4786 tcmsg
->tcm_info
= tc_make_handle(49,
4787 (OVS_FORCE
uint16_t) htons(ETH_P_ALL
));
4789 nl_msg_put_string(&request
, TCA_KIND
, "basic");
4790 basic_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
4791 police_offset
= nl_msg_start_nested(&request
, TCA_BASIC_POLICE
);
4792 nl_msg_put_unspec(&request
, TCA_POLICE_TBF
, &tc_police
, sizeof tc_police
);
4793 tc_put_rtab(&request
, TCA_POLICE_RATE
, &tc_police
.rate
);
4794 nl_msg_end_nested(&request
, police_offset
);
4795 nl_msg_end_nested(&request
, basic_offset
);
4797 error
= tc_transact(&request
, NULL
);
4808 /* The values in psched are not individually very meaningful, but they are
4809 * important. The tables below show some values seen in the wild.
4813 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4814 * (Before that, there are hints that it was 1000000000.)
4816 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4820 * -----------------------------------
4821 * [1] 000c8000 000f4240 000f4240 00000064
4822 * [2] 000003e8 00000400 000f4240 3b9aca00
4823 * [3] 000003e8 00000400 000f4240 3b9aca00
4824 * [4] 000003e8 00000400 000f4240 00000064
4825 * [5] 000003e8 00000040 000f4240 3b9aca00
4826 * [6] 000003e8 00000040 000f4240 000000f9
4828 * a b c d ticks_per_s buffer_hz
4829 * ------- --------- ---------- ------------- ----------- -------------
4830 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4831 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4832 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4833 * [4] 1,000 1,024 1,000,000 100 976,562 100
4834 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4835 * [6] 1,000 64 1,000,000 249 15,625,000 249
4837 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4838 * [2] 2.6.26-1-686-bigmem from Debian lenny
4839 * [3] 2.6.26-2-sparc64 from Debian lenny
4840 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4841 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4842 * [6] 2.6.34 from kernel.org on KVM
4844 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
4845 static const char fn
[] = "/proc/net/psched";
4846 unsigned int a
, b
, c
, d
;
4849 if (!ovsthread_once_start(&once
)) {
4856 stream
= fopen(fn
, "r");
4858 VLOG_WARN("%s: open failed: %s", fn
, ovs_strerror(errno
));
4862 if (fscanf(stream
, "%x %x %x %x", &a
, &b
, &c
, &d
) != 4) {
4863 VLOG_WARN("%s: read failed", fn
);
4867 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn
, a
, b
, c
, d
);
4871 VLOG_WARN("%s: invalid scheduler parameters", fn
);
4875 ticks_per_s
= (double) a
* c
/ b
;
4879 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4882 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn
, ticks_per_s
, buffer_hz
);
4885 ovsthread_once_done(&once
);
4888 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4889 * rate of 'rate' bytes per second. */
4891 tc_ticks_to_bytes(unsigned int rate
, unsigned int ticks
)
4894 return (rate
* ticks
) / ticks_per_s
;
4897 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4898 * rate of 'rate' bytes per second. */
4900 tc_bytes_to_ticks(unsigned int rate
, unsigned int size
)
4903 return rate
? ((unsigned long long int) ticks_per_s
* size
) / rate
: 0;
4906 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4907 * a transmission rate of 'rate' bytes per second. */
4909 tc_buffer_per_jiffy(unsigned int rate
)
4912 return rate
/ buffer_hz
;
4915 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4916 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4917 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4918 * stores NULL into it if it is absent.
4920 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4923 * Returns 0 if successful, otherwise a positive errno value. */
4925 tc_parse_qdisc(const struct ofpbuf
*msg
, const char **kind
,
4926 struct nlattr
**options
)
4928 static const struct nl_policy tca_policy
[] = {
4929 [TCA_KIND
] = { .type
= NL_A_STRING
, .optional
= false },
4930 [TCA_OPTIONS
] = { .type
= NL_A_NESTED
, .optional
= true },
4932 struct nlattr
*ta
[ARRAY_SIZE(tca_policy
)];
4934 if (!nl_policy_parse(msg
, NLMSG_HDRLEN
+ sizeof(struct tcmsg
),
4935 tca_policy
, ta
, ARRAY_SIZE(ta
))) {
4936 VLOG_WARN_RL(&rl
, "failed to parse qdisc message");
4941 *kind
= nl_attr_get_string(ta
[TCA_KIND
]);
4945 *options
= ta
[TCA_OPTIONS
];
4960 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4961 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4962 * into '*options', and its queue statistics into '*stats'. Any of the output
4963 * arguments may be null.
4965 * Returns 0 if successful, otherwise a positive errno value. */
4967 tc_parse_class(const struct ofpbuf
*msg
, unsigned int *handlep
,
4968 struct nlattr
**options
, struct netdev_queue_stats
*stats
)
4970 static const struct nl_policy tca_policy
[] = {
4971 [TCA_OPTIONS
] = { .type
= NL_A_NESTED
, .optional
= false },
4972 [TCA_STATS2
] = { .type
= NL_A_NESTED
, .optional
= false },
4974 struct nlattr
*ta
[ARRAY_SIZE(tca_policy
)];
4976 if (!nl_policy_parse(msg
, NLMSG_HDRLEN
+ sizeof(struct tcmsg
),
4977 tca_policy
, ta
, ARRAY_SIZE(ta
))) {
4978 VLOG_WARN_RL(&rl
, "failed to parse class message");
4983 struct tcmsg
*tc
= ofpbuf_at_assert(msg
, NLMSG_HDRLEN
, sizeof *tc
);
4984 *handlep
= tc
->tcm_handle
;
4988 *options
= ta
[TCA_OPTIONS
];
4992 const struct gnet_stats_queue
*gsq
;
4993 struct gnet_stats_basic gsb
;
4995 static const struct nl_policy stats_policy
[] = {
4996 [TCA_STATS_BASIC
] = { .type
= NL_A_UNSPEC
, .optional
= false,
4997 .min_len
= sizeof gsb
},
4998 [TCA_STATS_QUEUE
] = { .type
= NL_A_UNSPEC
, .optional
= false,
4999 .min_len
= sizeof *gsq
},
5001 struct nlattr
*sa
[ARRAY_SIZE(stats_policy
)];
5003 if (!nl_parse_nested(ta
[TCA_STATS2
], stats_policy
,
5004 sa
, ARRAY_SIZE(sa
))) {
5005 VLOG_WARN_RL(&rl
, "failed to parse class stats");
5009 /* Alignment issues screw up the length of struct gnet_stats_basic on
5010 * some arch/bitsize combinations. Newer versions of Linux have a
5011 * struct gnet_stats_basic_packed, but we can't depend on that. The
5012 * easiest thing to do is just to make a copy. */
5013 memset(&gsb
, 0, sizeof gsb
);
5014 memcpy(&gsb
, nl_attr_get(sa
[TCA_STATS_BASIC
]),
5015 MIN(nl_attr_get_size(sa
[TCA_STATS_BASIC
]), sizeof gsb
));
5016 stats
->tx_bytes
= gsb
.bytes
;
5017 stats
->tx_packets
= gsb
.packets
;
5019 gsq
= nl_attr_get(sa
[TCA_STATS_QUEUE
]);
5020 stats
->tx_errors
= gsq
->drops
;
5030 memset(stats
, 0, sizeof *stats
);
5035 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
5038 tc_query_class(const struct netdev
*netdev
,
5039 unsigned int handle
, unsigned int parent
,
5040 struct ofpbuf
**replyp
)
5042 struct ofpbuf request
;
5043 struct tcmsg
*tcmsg
;
5046 tcmsg
= tc_make_request(netdev
, RTM_GETTCLASS
, NLM_F_ECHO
, &request
);
5050 tcmsg
->tcm_handle
= handle
;
5051 tcmsg
->tcm_parent
= parent
;
5053 error
= tc_transact(&request
, replyp
);
5055 VLOG_WARN_RL(&rl
, "query %s class %u:%u (parent %u:%u) failed (%s)",
5056 netdev_get_name(netdev
),
5057 tc_get_major(handle
), tc_get_minor(handle
),
5058 tc_get_major(parent
), tc_get_minor(parent
),
5059 ovs_strerror(error
));
5064 /* Equivalent to "tc class del dev <name> handle <handle>". */
5066 tc_delete_class(const struct netdev
*netdev
, unsigned int handle
)
5068 struct ofpbuf request
;
5069 struct tcmsg
*tcmsg
;
5072 tcmsg
= tc_make_request(netdev
, RTM_DELTCLASS
, 0, &request
);
5076 tcmsg
->tcm_handle
= handle
;
5077 tcmsg
->tcm_parent
= 0;
5079 error
= tc_transact(&request
, NULL
);
5081 VLOG_WARN_RL(&rl
, "delete %s class %u:%u failed (%s)",
5082 netdev_get_name(netdev
),
5083 tc_get_major(handle
), tc_get_minor(handle
),
5084 ovs_strerror(error
));
5089 /* Equivalent to "tc qdisc del dev <name> root". */
5091 tc_del_qdisc(struct netdev
*netdev_
)
5093 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5094 struct ofpbuf request
;
5095 struct tcmsg
*tcmsg
;
5098 tcmsg
= tc_make_request(netdev_
, RTM_DELQDISC
, 0, &request
);
5102 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
5103 tcmsg
->tcm_parent
= TC_H_ROOT
;
5105 error
= tc_transact(&request
, NULL
);
5106 if (error
== EINVAL
) {
5107 /* EINVAL probably means that the default qdisc was in use, in which
5108 * case we've accomplished our purpose. */
5111 if (!error
&& netdev
->tc
) {
5112 if (netdev
->tc
->ops
->tc_destroy
) {
5113 netdev
->tc
->ops
->tc_destroy(netdev
->tc
);
5121 getqdisc_is_safe(void)
5123 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
5124 static bool safe
= false;
5126 if (ovsthread_once_start(&once
)) {
5127 struct utsname utsname
;
5130 if (uname(&utsname
) == -1) {
5131 VLOG_WARN("uname failed (%s)", ovs_strerror(errno
));
5132 } else if (!ovs_scan(utsname
.release
, "%d.%d", &major
, &minor
)) {
5133 VLOG_WARN("uname reported bad OS release (%s)", utsname
.release
);
5134 } else if (major
< 2 || (major
== 2 && minor
< 35)) {
5135 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5140 ovsthread_once_done(&once
);
5145 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
5146 * kernel to determine what they are. Returns 0 if successful, otherwise a
5147 * positive errno value. */
5149 tc_query_qdisc(const struct netdev
*netdev_
)
5151 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5152 struct ofpbuf request
, *qdisc
;
5153 const struct tc_ops
*ops
;
5154 struct tcmsg
*tcmsg
;
5162 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5163 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5164 * 2.6.35 without that fix backported to it.
5166 * To avoid the OOPS, we must not make a request that would attempt to dump
5167 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5168 * few others. There are a few ways that I can see to do this, but most of
5169 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5170 * technique chosen here is to assume that any non-default qdisc that we
5171 * create will have a class with handle 1:0. The built-in qdiscs only have
5172 * a class with handle 0:0.
5174 * On Linux 2.6.35+ we use the straightforward method because it allows us
5175 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5176 * in such a case we get no response at all from the kernel (!) if a
5177 * builtin qdisc is in use (which is later caught by "!error &&
5178 * !qdisc->size"). */
5179 tcmsg
= tc_make_request(netdev_
, RTM_GETQDISC
, NLM_F_ECHO
, &request
);
5183 tcmsg
->tcm_handle
= tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5184 tcmsg
->tcm_parent
= getqdisc_is_safe() ? TC_H_ROOT
: 0;
5186 /* Figure out what tc class to instantiate. */
5187 error
= tc_transact(&request
, &qdisc
);
5188 if (!error
&& qdisc
->size
) {
5191 error
= tc_parse_qdisc(qdisc
, &kind
, NULL
);
5193 ops
= &tc_ops_other
;
5195 ops
= tc_lookup_linux_name(kind
);
5197 static struct vlog_rate_limit rl2
= VLOG_RATE_LIMIT_INIT(1, 1);
5198 VLOG_DBG_RL(&rl2
, "unknown qdisc \"%s\"", kind
);
5200 ops
= &tc_ops_other
;
5203 } else if ((!error
&& !qdisc
->size
) || error
== ENOENT
) {
5204 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5205 * set up by some other entity that doesn't have a handle 1:0. We will
5206 * assume that it's the system default qdisc. */
5207 ops
= &tc_ops_default
;
5210 /* Who knows? Maybe the device got deleted. */
5211 VLOG_WARN_RL(&rl
, "query %s qdisc failed (%s)",
5212 netdev_get_name(netdev_
), ovs_strerror(error
));
5213 ops
= &tc_ops_other
;
5216 /* Instantiate it. */
5217 load_error
= ops
->tc_load(CONST_CAST(struct netdev
*, netdev_
), qdisc
);
5218 ovs_assert((load_error
== 0) == (netdev
->tc
!= NULL
));
5219 ofpbuf_delete(qdisc
);
5221 return error
? error
: load_error
;
5224 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5225 approximate the time to transmit packets of various lengths. For an MTU of
5226 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5227 represents two possible packet lengths; for a MTU of 513 through 1024, four
5228 possible lengths; and so on.
5230 Returns, for the specified 'mtu', the number of bits that packet lengths
5231 need to be shifted right to fit within such a 256-entry table. */
5233 tc_calc_cell_log(unsigned int mtu
)
5238 mtu
= ETH_PAYLOAD_MAX
;
5240 mtu
+= ETH_HEADER_LEN
+ VLAN_HEADER_LEN
;
5242 for (cell_log
= 0; mtu
>= 256; cell_log
++) {
5249 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5252 tc_fill_rate(struct tc_ratespec
*rate
, uint64_t Bps
, int mtu
)
5254 memset(rate
, 0, sizeof *rate
);
5255 rate
->cell_log
= tc_calc_cell_log(mtu
);
5256 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5257 /* rate->cell_align = 0; */ /* distro headers. */
5258 rate
->mpu
= ETH_TOTAL_MIN
;
/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
 * attribute of the specified "type".
 *
 * See tc_calc_cell_log() above for a description of "rtab"s. */
static void
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
{
    uint32_t *rtab;
    unsigned int i;

    rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
    for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
        /* Entry i covers packets of (i + 1) << cell_log bytes, but never less
         * than the minimum packet unit. */
        unsigned packet_size = (i + 1) << rate->cell_log;
        if (packet_size < rate->mpu) {
            packet_size = rate->mpu;
        }
        rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
    }
}
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.) */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    /* The burst must cover at least one jiffy of transmission plus one MTU. */
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
}
5293 /* Linux-only functions declared in netdev-linux.h */
5295 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5296 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5298 netdev_linux_ethtool_set_flag(struct netdev
*netdev
, uint32_t flag
,
5299 const char *flag_name
, bool enable
)
5301 const char *netdev_name
= netdev_get_name(netdev
);
5302 struct ethtool_value evalue
;
5306 COVERAGE_INC(netdev_get_ethtool
);
5307 memset(&evalue
, 0, sizeof evalue
);
5308 error
= netdev_linux_do_ethtool(netdev_name
,
5309 (struct ethtool_cmd
*)&evalue
,
5310 ETHTOOL_GFLAGS
, "ETHTOOL_GFLAGS");
5315 COVERAGE_INC(netdev_set_ethtool
);
5316 new_flags
= (evalue
.data
& ~flag
) | (enable
? flag
: 0);
5317 if (new_flags
== evalue
.data
) {
5320 evalue
.data
= new_flags
;
5321 error
= netdev_linux_do_ethtool(netdev_name
,
5322 (struct ethtool_cmd
*)&evalue
,
5323 ETHTOOL_SFLAGS
, "ETHTOOL_SFLAGS");
5328 COVERAGE_INC(netdev_get_ethtool
);
5329 memset(&evalue
, 0, sizeof evalue
);
5330 error
= netdev_linux_do_ethtool(netdev_name
,
5331 (struct ethtool_cmd
*)&evalue
,
5332 ETHTOOL_GFLAGS
, "ETHTOOL_GFLAGS");
5337 if (new_flags
!= evalue
.data
) {
5338 VLOG_WARN_RL(&rl
, "attempt to %s ethtool %s flag on network "
5339 "device %s failed", enable
? "enable" : "disable",
5340 flag_name
, netdev_name
);
5347 /* Utility functions. */
5349 /* Copies 'src' into 'dst', performing format conversion in the process. */
5351 netdev_stats_from_rtnl_link_stats(struct netdev_stats
*dst
,
5352 const struct rtnl_link_stats
*src
)
5354 dst
->rx_packets
= src
->rx_packets
;
5355 dst
->tx_packets
= src
->tx_packets
;
5356 dst
->rx_bytes
= src
->rx_bytes
;
5357 dst
->tx_bytes
= src
->tx_bytes
;
5358 dst
->rx_errors
= src
->rx_errors
;
5359 dst
->tx_errors
= src
->tx_errors
;
5360 dst
->rx_dropped
= src
->rx_dropped
;
5361 dst
->tx_dropped
= src
->tx_dropped
;
5362 dst
->multicast
= src
->multicast
;
5363 dst
->collisions
= src
->collisions
;
5364 dst
->rx_length_errors
= src
->rx_length_errors
;
5365 dst
->rx_over_errors
= src
->rx_over_errors
;
5366 dst
->rx_crc_errors
= src
->rx_crc_errors
;
5367 dst
->rx_frame_errors
= src
->rx_frame_errors
;
5368 dst
->rx_fifo_errors
= src
->rx_fifo_errors
;
5369 dst
->rx_missed_errors
= src
->rx_missed_errors
;
5370 dst
->tx_aborted_errors
= src
->tx_aborted_errors
;
5371 dst
->tx_carrier_errors
= src
->tx_carrier_errors
;
5372 dst
->tx_fifo_errors
= src
->tx_fifo_errors
;
5373 dst
->tx_heartbeat_errors
= src
->tx_heartbeat_errors
;
5374 dst
->tx_window_errors
= src
->tx_window_errors
;
5377 /* Copies 'src' into 'dst', performing format conversion in the process. */
5379 netdev_stats_from_rtnl_link_stats64(struct netdev_stats
*dst
,
5380 const struct rtnl_link_stats64
*src
)
5382 dst
->rx_packets
= src
->rx_packets
;
5383 dst
->tx_packets
= src
->tx_packets
;
5384 dst
->rx_bytes
= src
->rx_bytes
;
5385 dst
->tx_bytes
= src
->tx_bytes
;
5386 dst
->rx_errors
= src
->rx_errors
;
5387 dst
->tx_errors
= src
->tx_errors
;
5388 dst
->rx_dropped
= src
->rx_dropped
;
5389 dst
->tx_dropped
= src
->tx_dropped
;
5390 dst
->multicast
= src
->multicast
;
5391 dst
->collisions
= src
->collisions
;
5392 dst
->rx_length_errors
= src
->rx_length_errors
;
5393 dst
->rx_over_errors
= src
->rx_over_errors
;
5394 dst
->rx_crc_errors
= src
->rx_crc_errors
;
5395 dst
->rx_frame_errors
= src
->rx_frame_errors
;
5396 dst
->rx_fifo_errors
= src
->rx_fifo_errors
;
5397 dst
->rx_missed_errors
= src
->rx_missed_errors
;
5398 dst
->tx_aborted_errors
= src
->tx_aborted_errors
;
5399 dst
->tx_carrier_errors
= src
->tx_carrier_errors
;
5400 dst
->tx_fifo_errors
= src
->tx_fifo_errors
;
5401 dst
->tx_heartbeat_errors
= src
->tx_heartbeat_errors
;
5402 dst
->tx_window_errors
= src
->tx_window_errors
;
5406 get_stats_via_netlink(const struct netdev
*netdev_
, struct netdev_stats
*stats
)
5408 struct ofpbuf request
;
5409 struct ofpbuf
*reply
;
5412 /* Filtering all counters by default */
5413 memset(stats
, 0xFF, sizeof(struct netdev_stats
));
5415 ofpbuf_init(&request
, 0);
5416 nl_msg_put_nlmsghdr(&request
,
5417 sizeof(struct ifinfomsg
) + NL_ATTR_SIZE(IFNAMSIZ
),
5418 RTM_GETLINK
, NLM_F_REQUEST
);
5419 ofpbuf_put_zeros(&request
, sizeof(struct ifinfomsg
));
5420 nl_msg_put_string(&request
, IFLA_IFNAME
, netdev_get_name(netdev_
));
5421 error
= nl_transact(NETLINK_ROUTE
, &request
, &reply
);
5422 ofpbuf_uninit(&request
);
5427 if (ofpbuf_try_pull(reply
, NLMSG_HDRLEN
+ sizeof(struct ifinfomsg
))) {
5428 const struct nlattr
*a
= nl_attr_find(reply
, 0, IFLA_STATS64
);
5429 if (a
&& nl_attr_get_size(a
) >= sizeof(struct rtnl_link_stats64
)) {
5430 netdev_stats_from_rtnl_link_stats64(stats
, nl_attr_get(a
));
5433 const struct nlattr
*a
= nl_attr_find(reply
, 0, IFLA_STATS
);
5434 if (a
&& nl_attr_get_size(a
) >= sizeof(struct rtnl_link_stats
)) {
5435 netdev_stats_from_rtnl_link_stats(stats
, nl_attr_get(a
));
5438 VLOG_WARN_RL(&rl
, "RTM_GETLINK reply lacks stats");
5443 VLOG_WARN_RL(&rl
, "short RTM_GETLINK reply");
5448 ofpbuf_delete(reply
);
5453 get_flags(const struct netdev
*dev
, unsigned int *flags
)
5459 error
= af_inet_ifreq_ioctl(dev
->name
, &ifr
, SIOCGIFFLAGS
, "SIOCGIFFLAGS");
5461 *flags
= ifr
.ifr_flags
;
5467 set_flags(const char *name
, unsigned int flags
)
5471 ifr
.ifr_flags
= flags
;
5472 return af_inet_ifreq_ioctl(name
, &ifr
, SIOCSIFFLAGS
, "SIOCSIFFLAGS");
5476 do_get_ifindex(const char *netdev_name
)
5481 ovs_strzcpy(ifr
.ifr_name
, netdev_name
, sizeof ifr
.ifr_name
);
5482 COVERAGE_INC(netdev_get_ifindex
);
5484 error
= af_inet_ioctl(SIOCGIFINDEX
, &ifr
);
5486 VLOG_WARN_RL(&rl
, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
5487 netdev_name
, ovs_strerror(error
));
5490 return ifr
.ifr_ifindex
;
5494 get_ifindex(const struct netdev
*netdev_
, int *ifindexp
)
5496 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5498 if (!(netdev
->cache_valid
& VALID_IFINDEX
)) {
5499 int ifindex
= do_get_ifindex(netdev_get_name(netdev_
));
5502 netdev
->get_ifindex_error
= -ifindex
;
5503 netdev
->ifindex
= 0;
5505 netdev
->get_ifindex_error
= 0;
5506 netdev
->ifindex
= ifindex
;
5508 netdev
->cache_valid
|= VALID_IFINDEX
;
5511 *ifindexp
= netdev
->ifindex
;
5512 return netdev
->get_ifindex_error
;
5516 get_etheraddr(const char *netdev_name
, struct eth_addr
*ea
)
5522 memset(&ifr
, 0, sizeof ifr
);
5523 ovs_strzcpy(ifr
.ifr_name
, netdev_name
, sizeof ifr
.ifr_name
);
5524 COVERAGE_INC(netdev_get_hwaddr
);
5525 error
= af_inet_ioctl(SIOCGIFHWADDR
, &ifr
);
5527 /* ENODEV probably means that a vif disappeared asynchronously and
5528 * hasn't been removed from the database yet, so reduce the log level
5529 * to INFO for that case. */
5530 VLOG(error
== ENODEV
? VLL_INFO
: VLL_ERR
,
5531 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
5532 netdev_name
, ovs_strerror(error
));
5535 hwaddr_family
= ifr
.ifr_hwaddr
.sa_family
;
5536 if (hwaddr_family
!= AF_UNSPEC
&& hwaddr_family
!= ARPHRD_ETHER
) {
5537 VLOG_INFO("%s device has unknown hardware address family %d",
5538 netdev_name
, hwaddr_family
);
5541 memcpy(ea
, ifr
.ifr_hwaddr
.sa_data
, ETH_ADDR_LEN
);
5546 set_etheraddr(const char *netdev_name
, const struct eth_addr mac
)
5551 memset(&ifr
, 0, sizeof ifr
);
5552 ovs_strzcpy(ifr
.ifr_name
, netdev_name
, sizeof ifr
.ifr_name
);
5553 ifr
.ifr_hwaddr
.sa_family
= ARPHRD_ETHER
;
5554 memcpy(ifr
.ifr_hwaddr
.sa_data
, &mac
, ETH_ADDR_LEN
);
5555 COVERAGE_INC(netdev_set_hwaddr
);
5556 error
= af_inet_ioctl(SIOCSIFHWADDR
, &ifr
);
5558 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
5559 netdev_name
, ovs_strerror(error
));
5565 netdev_linux_do_ethtool(const char *name
, struct ethtool_cmd
*ecmd
,
5566 int cmd
, const char *cmd_name
)
5571 memset(&ifr
, 0, sizeof ifr
);
5572 ovs_strzcpy(ifr
.ifr_name
, name
, sizeof ifr
.ifr_name
);
5573 ifr
.ifr_data
= (caddr_t
) ecmd
;
5576 error
= af_inet_ioctl(SIOCETHTOOL
, &ifr
);
5578 if (error
!= EOPNOTSUPP
) {
5579 VLOG_WARN_RL(&rl
, "ethtool command %s on network device %s "
5580 "failed: %s", cmd_name
, name
, ovs_strerror(error
));
5582 /* The device doesn't support this operation. That's pretty
5583 * common, so there's no point in logging anything. */
5589 /* Returns an AF_PACKET raw socket or a negative errno value. */
5591 af_packet_sock(void)
5593 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
5596 if (ovsthread_once_start(&once
)) {
5597 sock
= socket(AF_PACKET
, SOCK_RAW
, 0);
5599 int error
= set_nonblocking(sock
);
5606 VLOG_ERR("failed to create packet socket: %s",
5607 ovs_strerror(errno
));
5609 ovsthread_once_done(&once
);