/*
 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <config.h>

#include "netdev-linux.h"

#include <errno.h>
#include <fcntl.h>
#include <arpa/inet.h>
#include <inttypes.h>
#include <linux/filter.h>
#include <linux/gen_stats.h>
#include <linux/if_ether.h>
#include <linux/if_tun.h>
#include <linux/types.h>
#include <linux/ethtool.h>
#include <linux/mii.h>
#include <linux/pkt_cls.h>
#include <linux/pkt_sched.h>
#include <linux/rtnetlink.h>
#include <linux/sockios.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/utsname.h>
#include <netpacket/packet.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/if_packet.h>
#include <net/route.h>
#include <netinet/in.h>
#include <poll.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "coverage.h"
#include "dp-packet.h"
#include "dpif-netlink.h"
#include "dpif-netdev.h"
#include "dynamic-string.h"
#include "fatal-signal.h"
#include "hash.h"
#include "hmap.h"
#include "netdev-provider.h"
#include "netdev-vport.h"
#include "netlink-notifier.h"
#include "netlink-socket.h"
#include "netlink.h"
#include "ofpbuf.h"
#include "openflow/openflow.h"
#include "ovs-atomic.h"
#include "packets.h"
#include "poll-loop.h"
#include "rtnetlink.h"
#include "shash.h"
#include "socket-util.h"
#include "timer.h"
#include "unaligned.h"
#include "util.h"
#include "openvswitch/vlog.h"
VLOG_DEFINE_THIS_MODULE(netdev_linux);

/* Coverage counters for the ioctl/netlink operations this module performs. */
COVERAGE_DEFINE(netdev_set_policing);
COVERAGE_DEFINE(netdev_arp_lookup);
COVERAGE_DEFINE(netdev_get_ifindex);
COVERAGE_DEFINE(netdev_get_hwaddr);
COVERAGE_DEFINE(netdev_set_hwaddr);
COVERAGE_DEFINE(netdev_get_ethtool);
COVERAGE_DEFINE(netdev_set_ethtool);
/* These were introduced in Linux 2.6.14, so they might be missing if we have
 * old headers. */
#ifndef ADVERTISED_Pause
#define ADVERTISED_Pause (1 << 13)
#endif
#ifndef ADVERTISED_Asym_Pause
#define ADVERTISED_Asym_Pause (1 << 14)
#endif

/* These were introduced in Linux 2.6.24, so they might be missing if we
 * have old headers. */
#ifndef ETHTOOL_GFLAGS
#define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
#endif
#ifndef ETHTOOL_SFLAGS
#define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
#endif

/* This was introduced in Linux 2.6.25, so it might be missing if we have old
 * headers. */
#ifndef TC_RTAB_SIZE
#define TC_RTAB_SIZE 1024
#endif
/* Linux 2.6.21 introduced struct tpacket_auxdata.
 * Linux 2.6.27 added the tp_vlan_tci member.
 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
 * TP_STATUS_VLAN_TPID_VALID.
 *
 * With all this churn it's easiest to unconditionally define a replacement
 * structure that has everything we want. */
#ifndef PACKET_AUXDATA
#define PACKET_AUXDATA 8
#endif
#ifndef TP_STATUS_VLAN_VALID
#define TP_STATUS_VLAN_VALID (1 << 4)
#endif
#ifndef TP_STATUS_VLAN_TPID_VALID
#define TP_STATUS_VLAN_TPID_VALID (1 << 6)
#endif
#undef tpacket_auxdata
#define tpacket_auxdata rpl_tpacket_auxdata
struct tpacket_auxdata {
    uint32_t tp_status;
    uint32_t tp_len;
    uint32_t tp_snaplen;
    uint16_t tp_mac;
    uint16_t tp_net;
    uint16_t tp_vlan_tci;
    uint16_t tp_vlan_tpid;
};
/* Linux 2.6.27 introduced ethtool_cmd_speed.
 *
 * To avoid revisiting problems reported with using configure to detect
 * compatibility (see report at
 * http://openvswitch.org/pipermail/dev/2014-October/047978.html)
 * unconditionally replace ethtool_cmd_speed. */
#define ethtool_cmd_speed rpl_ethtool_cmd_speed
static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
{
    /* Combine the low 16 bits from 'speed' with the high 16 bits from
     * 'speed_hi' into the full 32-bit link speed. */
    return ep->speed | (ep->speed_hi << 16);
}
/* Linux 2.6.30 introduced supported and advertised flags for
 * 1G base KX, and 10G base KX4, KR and R. */
#ifndef SUPPORTED_1000baseKX_Full
#define SUPPORTED_1000baseKX_Full      (1 << 17)
#define SUPPORTED_10000baseKX4_Full    (1 << 18)
#define SUPPORTED_10000baseKR_Full     (1 << 19)
#define SUPPORTED_10000baseR_FEC       (1 << 20)
#define ADVERTISED_1000baseKX_Full     (1 << 17)
#define ADVERTISED_10000baseKX4_Full   (1 << 18)
#define ADVERTISED_10000baseKR_Full    (1 << 19)
#define ADVERTISED_10000baseR_FEC      (1 << 20)
#endif

/* Linux 3.5 introduced supported and advertised flags for
 * 40G base KR4, CR4, SR4 and LR4. */
#ifndef SUPPORTED_40000baseKR4_Full
#define SUPPORTED_40000baseKR4_Full    (1 << 23)
#define SUPPORTED_40000baseCR4_Full    (1 << 24)
#define SUPPORTED_40000baseSR4_Full    (1 << 25)
#define SUPPORTED_40000baseLR4_Full    (1 << 26)
#define ADVERTISED_40000baseKR4_Full   (1 << 23)
#define ADVERTISED_40000baseCR4_Full   (1 << 24)
#define ADVERTISED_40000baseSR4_Full   (1 << 25)
#define ADVERTISED_40000baseLR4_Full   (1 << 26)
#endif
180 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
182 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
183 * 2.6.32-431.29.2.el6.x86_64 (see report at
184 * http://openvswitch.org/pipermail/dev/2014-October/047978.html). Maybe
185 * if_link.h is not self-contained on those kernels. It is easiest to
186 * unconditionally define a replacement. */
188 #define IFLA_STATS64 23
190 #define rtnl_link_stats64 rpl_rtnl_link_stats64
191 struct rtnl_link_stats64
{
203 uint64_t rx_length_errors
;
204 uint64_t rx_over_errors
;
205 uint64_t rx_crc_errors
;
206 uint64_t rx_frame_errors
;
207 uint64_t rx_fifo_errors
;
208 uint64_t rx_missed_errors
;
210 uint64_t tx_aborted_errors
;
211 uint64_t tx_carrier_errors
;
212 uint64_t tx_fifo_errors
;
213 uint64_t tx_heartbeat_errors
;
214 uint64_t tx_window_errors
;
216 uint64_t rx_compressed
;
217 uint64_t tx_compressed
;
/* Bits in netdev_linux's 'cache_valid' saying which on-demand fields below
 * currently hold valid data. */
enum {
    VALID_IFINDEX           = 1 << 0,
    VALID_ETHERADDR         = 1 << 1,
    VALID_IN4               = 1 << 2,
    VALID_IN6               = 1 << 3,
    VALID_MTU               = 1 << 4,
    VALID_POLICING          = 1 << 5,
    VALID_VPORT_STAT_ERROR  = 1 << 6,
    VALID_DRVINFO           = 1 << 7,
    VALID_FEATURES          = 1 << 8,
};
232 /* Traffic control. */
234 /* An instance of a traffic control class. Always associated with a particular
237 * Each TC implementation subclasses this with whatever additional data it
240 const struct tc_ops
*ops
;
241 struct hmap queues
; /* Contains "struct tc_queue"s.
242 * Read by generic TC layer.
243 * Written only by TC implementation. */
246 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
248 /* One traffic control queue.
250 * Each TC implementation subclasses this with whatever additional data it
253 struct hmap_node hmap_node
; /* In struct tc's "queues" hmap. */
254 unsigned int queue_id
; /* OpenFlow queue ID. */
255 long long int created
; /* Time queue was created, in msecs. */
258 /* A particular kind of traffic control. Each implementation generally maps to
259 * one particular Linux qdisc class.
261 * The functions below return 0 if successful or a positive errno value on
262 * failure, except where otherwise noted. All of them must be provided, except
263 * where otherwise noted. */
265 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
266 * This is null for tc_ops_default and tc_ops_other, for which there are no
267 * appropriate values. */
268 const char *linux_name
;
270 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
271 const char *ovs_name
;
273 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
274 * queues. The queues are numbered 0 through n_queues - 1. */
275 unsigned int n_queues
;
277 /* Called to install this TC class on 'netdev'. The implementation should
278 * make the Netlink calls required to set up 'netdev' with the right qdisc
279 * and configure it according to 'details'. The implementation may assume
280 * that the current qdisc is the default; that is, there is no need for it
281 * to delete the current qdisc before installing itself.
283 * The contents of 'details' should be documented as valid for 'ovs_name'
284 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
285 * (which is built as ovs-vswitchd.conf.db(8)).
287 * This function must return 0 if and only if it sets 'netdev->tc' to an
288 * initialized 'struct tc'.
290 * (This function is null for tc_ops_other, which cannot be installed. For
291 * other TC classes it should always be nonnull.) */
292 int (*tc_install
)(struct netdev
*netdev
, const struct smap
*details
);
294 /* Called when the netdev code determines (through a Netlink query) that
295 * this TC class's qdisc is installed on 'netdev', but we didn't install
296 * it ourselves and so don't know any of the details.
298 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
299 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
300 * implementation should parse the other attributes of 'nlmsg' as
301 * necessary to determine its configuration. If necessary it should also
302 * use Netlink queries to determine the configuration of queues on
305 * This function must return 0 if and only if it sets 'netdev->tc' to an
306 * initialized 'struct tc'. */
307 int (*tc_load
)(struct netdev
*netdev
, struct ofpbuf
*nlmsg
);
309 /* Destroys the data structures allocated by the implementation as part of
310 * 'tc'. (This includes destroying 'tc->queues' by calling
313 * The implementation should not need to perform any Netlink calls. If
314 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
315 * (But it may not be desirable.)
317 * This function may be null if 'tc' is trivial. */
318 void (*tc_destroy
)(struct tc
*tc
);
320 /* Retrieves details of 'netdev->tc' configuration into 'details'.
322 * The implementation should not need to perform any Netlink calls, because
323 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
324 * cached the configuration.
326 * The contents of 'details' should be documented as valid for 'ovs_name'
327 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
328 * (which is built as ovs-vswitchd.conf.db(8)).
330 * This function may be null if 'tc' is not configurable.
332 int (*qdisc_get
)(const struct netdev
*netdev
, struct smap
*details
);
334 /* Reconfigures 'netdev->tc' according to 'details', performing any
335 * required Netlink calls to complete the reconfiguration.
337 * The contents of 'details' should be documented as valid for 'ovs_name'
338 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
339 * (which is built as ovs-vswitchd.conf.db(8)).
341 * This function may be null if 'tc' is not configurable.
343 int (*qdisc_set
)(struct netdev
*, const struct smap
*details
);
345 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
346 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
348 * The contents of 'details' should be documented as valid for 'ovs_name'
349 * in the "other_config" column in the "Queue" table in
350 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
352 * The implementation should not need to perform any Netlink calls, because
353 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
354 * cached the queue configuration.
356 * This function may be null if 'tc' does not have queues ('n_queues' is
358 int (*class_get
)(const struct netdev
*netdev
, const struct tc_queue
*queue
,
359 struct smap
*details
);
361 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
362 * 'details', perfoming any required Netlink calls to complete the
363 * reconfiguration. The caller ensures that 'queue_id' is less than
366 * The contents of 'details' should be documented as valid for 'ovs_name'
367 * in the "other_config" column in the "Queue" table in
368 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
370 * This function may be null if 'tc' does not have queues or its queues are
371 * not configurable. */
372 int (*class_set
)(struct netdev
*, unsigned int queue_id
,
373 const struct smap
*details
);
375 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
376 * tc_queue's within 'netdev->tc->queues'.
378 * This function may be null if 'tc' does not have queues or its queues
379 * cannot be deleted. */
380 int (*class_delete
)(struct netdev
*, struct tc_queue
*queue
);
382 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
383 * 'struct tc_queue's within 'netdev->tc->queues'.
385 * On success, initializes '*stats'.
387 * This function may be null if 'tc' does not have queues or if it cannot
388 * report queue statistics. */
389 int (*class_get_stats
)(const struct netdev
*netdev
,
390 const struct tc_queue
*queue
,
391 struct netdev_queue_stats
*stats
);
393 /* Extracts queue stats from 'nlmsg', which is a response to a
394 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
396 * This function may be null if 'tc' does not have queues or if it cannot
397 * report queue statistics. */
398 int (*class_dump_stats
)(const struct netdev
*netdev
,
399 const struct ofpbuf
*nlmsg
,
400 netdev_dump_queue_stats_cb
*cb
, void *aux
);
404 tc_init(struct tc
*tc
, const struct tc_ops
*ops
)
407 hmap_init(&tc
->queues
);
411 tc_destroy(struct tc
*tc
)
413 hmap_destroy(&tc
->queues
);
416 static const struct tc_ops tc_ops_htb
;
417 static const struct tc_ops tc_ops_hfsc
;
418 static const struct tc_ops tc_ops_codel
;
419 static const struct tc_ops tc_ops_fqcodel
;
420 static const struct tc_ops tc_ops_sfq
;
421 static const struct tc_ops tc_ops_default
;
422 static const struct tc_ops tc_ops_other
;
424 static const struct tc_ops
*const tcs
[] = {
425 &tc_ops_htb
, /* Hierarchy token bucket (see tc-htb(8)). */
426 &tc_ops_hfsc
, /* Hierarchical fair service curve. */
427 &tc_ops_codel
, /* Controlled delay */
428 &tc_ops_fqcodel
, /* Fair queue controlled delay */
429 &tc_ops_sfq
, /* Stochastic fair queueing */
430 &tc_ops_default
, /* Default qdisc (see tc-pfifo_fast(8)). */
431 &tc_ops_other
, /* Some other qdisc. */
435 static unsigned int tc_make_handle(unsigned int major
, unsigned int minor
);
436 static unsigned int tc_get_major(unsigned int handle
);
437 static unsigned int tc_get_minor(unsigned int handle
);
439 static unsigned int tc_ticks_to_bytes(unsigned int rate
, unsigned int ticks
);
440 static unsigned int tc_bytes_to_ticks(unsigned int rate
, unsigned int size
);
441 static unsigned int tc_buffer_per_jiffy(unsigned int rate
);
443 static struct tcmsg
*tc_make_request(const struct netdev
*, int type
,
444 unsigned int flags
, struct ofpbuf
*);
445 static int tc_transact(struct ofpbuf
*request
, struct ofpbuf
**replyp
);
446 static int tc_add_del_ingress_qdisc(struct netdev
*netdev
, bool add
);
447 static int tc_add_policer(struct netdev
*,
448 uint32_t kbits_rate
, uint32_t kbits_burst
);
450 static int tc_parse_qdisc(const struct ofpbuf
*, const char **kind
,
451 struct nlattr
**options
);
452 static int tc_parse_class(const struct ofpbuf
*, unsigned int *queue_id
,
453 struct nlattr
**options
,
454 struct netdev_queue_stats
*);
455 static int tc_query_class(const struct netdev
*,
456 unsigned int handle
, unsigned int parent
,
457 struct ofpbuf
**replyp
);
458 static int tc_delete_class(const struct netdev
*, unsigned int handle
);
460 static int tc_del_qdisc(struct netdev
*netdev
);
461 static int tc_query_qdisc(const struct netdev
*netdev
);
463 static int tc_calc_cell_log(unsigned int mtu
);
464 static void tc_fill_rate(struct tc_ratespec
*rate
, uint64_t bps
, int mtu
);
465 static void tc_put_rtab(struct ofpbuf
*, uint16_t type
,
466 const struct tc_ratespec
*rate
);
467 static int tc_calc_buffer(unsigned int Bps
, int mtu
, uint64_t burst_bytes
);
469 struct netdev_linux
{
472 /* Protects all members below. */
473 struct ovs_mutex mutex
;
475 unsigned int cache_valid
;
477 bool miimon
; /* Link status of last poll. */
478 long long int miimon_interval
; /* Miimon Poll rate. Disabled if <= 0. */
479 struct timer miimon_timer
;
481 /* The following are figured out "on demand" only. They are only valid
482 * when the corresponding VALID_* bit in 'cache_valid' is set. */
484 struct eth_addr etheraddr
;
485 struct in_addr address
, netmask
;
488 unsigned int ifi_flags
;
489 long long int carrier_resets
;
490 uint32_t kbits_rate
; /* Policing data. */
491 uint32_t kbits_burst
;
492 int vport_stats_error
; /* Cached error code from vport_get_stats().
493 0 or an errno value. */
494 int netdev_mtu_error
; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
495 int ether_addr_error
; /* Cached error code from set/get etheraddr. */
496 int netdev_policing_error
; /* Cached error code from set policing. */
497 int get_features_error
; /* Cached error code from ETHTOOL_GSET. */
498 int get_ifindex_error
; /* Cached error code from SIOCGIFINDEX. */
499 int in4_error
; /* Cached error code from reading in4 addr. */
500 int in6_error
; /* Cached error code from reading in6 addr. */
502 enum netdev_features current
; /* Cached from ETHTOOL_GSET. */
503 enum netdev_features advertised
; /* Cached from ETHTOOL_GSET. */
504 enum netdev_features supported
; /* Cached from ETHTOOL_GSET. */
506 struct ethtool_drvinfo drvinfo
; /* Cached from ETHTOOL_GDRVINFO. */
509 /* For devices of class netdev_tap_class only. */
513 struct netdev_rxq_linux
{
514 struct netdev_rxq up
;
519 /* This is set pretty low because we probably won't learn anything from the
520 * additional log messages. */
521 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(5, 20);
523 /* Polling miimon status for all ports causes performance degradation when
524 * handling a large number of ports. If there are no devices using miimon, then
525 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
527 * Readers do not depend on this variable synchronizing with the related
528 * changes in the device miimon status, so we can use atomic_count. */
529 static atomic_count miimon_cnt
= ATOMIC_COUNT_INIT(0);
531 static void netdev_linux_run(void);
533 static int netdev_linux_do_ethtool(const char *name
, struct ethtool_cmd
*,
534 int cmd
, const char *cmd_name
);
535 static int netdev_linux_get_ipv4(const struct netdev
*, struct in_addr
*,
536 int cmd
, const char *cmd_name
);
537 static int get_flags(const struct netdev
*, unsigned int *flags
);
538 static int set_flags(const char *, unsigned int flags
);
539 static int update_flags(struct netdev_linux
*netdev
, enum netdev_flags off
,
540 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
541 OVS_REQUIRES(netdev
->mutex
);
542 static int do_get_ifindex(const char *netdev_name
);
543 static int get_ifindex(const struct netdev
*, int *ifindexp
);
544 static int do_set_addr(struct netdev
*netdev
,
545 int ioctl_nr
, const char *ioctl_name
,
546 struct in_addr addr
);
547 static int get_etheraddr(const char *netdev_name
, struct eth_addr
*ea
);
548 static int set_etheraddr(const char *netdev_name
, const struct eth_addr
);
549 static int get_stats_via_netlink(const struct netdev
*, struct netdev_stats
*);
550 static int af_packet_sock(void);
551 static bool netdev_linux_miimon_enabled(void);
552 static void netdev_linux_miimon_run(void);
553 static void netdev_linux_miimon_wait(void);
554 static int netdev_linux_get_mtu__(struct netdev_linux
*netdev
, int *mtup
);
557 is_netdev_linux_class(const struct netdev_class
*netdev_class
)
559 return netdev_class
->run
== netdev_linux_run
;
563 is_tap_netdev(const struct netdev
*netdev
)
565 return netdev_get_class(netdev
) == &netdev_tap_class
;
568 static struct netdev_linux
*
569 netdev_linux_cast(const struct netdev
*netdev
)
571 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev
)));
573 return CONTAINER_OF(netdev
, struct netdev_linux
, up
);
576 static struct netdev_rxq_linux
*
577 netdev_rxq_linux_cast(const struct netdev_rxq
*rx
)
579 ovs_assert(is_netdev_linux_class(netdev_get_class(rx
->netdev
)));
580 return CONTAINER_OF(rx
, struct netdev_rxq_linux
, up
);
583 static void netdev_linux_update(struct netdev_linux
*netdev
,
584 const struct rtnetlink_change
*)
585 OVS_REQUIRES(netdev
->mutex
);
586 static void netdev_linux_changed(struct netdev_linux
*netdev
,
587 unsigned int ifi_flags
, unsigned int mask
)
588 OVS_REQUIRES(netdev
->mutex
);
590 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
591 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
592 * if no such socket could be created. */
593 static struct nl_sock
*
594 netdev_linux_notify_sock(void)
596 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
597 static struct nl_sock
*sock
;
598 unsigned int mcgroups
[3] = {RTNLGRP_LINK
, RTNLGRP_IPV4_IFADDR
,
599 RTNLGRP_IPV6_IFADDR
};
601 if (ovsthread_once_start(&once
)) {
604 error
= nl_sock_create(NETLINK_ROUTE
, &sock
);
608 for (i
= 0; i
< ARRAY_SIZE(mcgroups
); i
++) {
609 error
= nl_sock_join_mcgroup(sock
, mcgroups
[i
]);
611 nl_sock_destroy(sock
);
617 ovsthread_once_done(&once
);
624 netdev_linux_miimon_enabled(void)
626 return atomic_count_get(&miimon_cnt
) > 0;
630 netdev_linux_run(void)
632 struct nl_sock
*sock
;
635 if (netdev_linux_miimon_enabled()) {
636 netdev_linux_miimon_run();
639 sock
= netdev_linux_notify_sock();
645 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(1, 5);
646 uint64_t buf_stub
[4096 / 8];
649 ofpbuf_use_stub(&buf
, buf_stub
, sizeof buf_stub
);
650 error
= nl_sock_recv(sock
, &buf
, false);
652 struct rtnetlink_change change
;
654 if (rtnetlink_parse(&buf
, &change
)) {
655 struct netdev
*netdev_
= netdev_from_name(change
.ifname
);
656 if (netdev_
&& is_netdev_linux_class(netdev_
->netdev_class
)) {
657 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
659 ovs_mutex_lock(&netdev
->mutex
);
660 netdev_linux_update(netdev
, &change
);
661 ovs_mutex_unlock(&netdev
->mutex
);
663 netdev_close(netdev_
);
665 } else if (error
== ENOBUFS
) {
666 struct shash device_shash
;
667 struct shash_node
*node
;
671 shash_init(&device_shash
);
672 netdev_get_devices(&netdev_linux_class
, &device_shash
);
673 SHASH_FOR_EACH (node
, &device_shash
) {
674 struct netdev
*netdev_
= node
->data
;
675 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
678 ovs_mutex_lock(&netdev
->mutex
);
679 get_flags(netdev_
, &flags
);
680 netdev_linux_changed(netdev
, flags
, 0);
681 ovs_mutex_unlock(&netdev
->mutex
);
683 netdev_close(netdev_
);
685 shash_destroy(&device_shash
);
686 } else if (error
!= EAGAIN
) {
687 VLOG_WARN_RL(&rl
, "error reading or parsing netlink (%s)",
688 ovs_strerror(error
));
/* 'wait' callback: arranges to wake up when miimon polling is due or when
 * the rtnetlink notification socket becomes readable. */
static void
netdev_linux_wait(void)
{
    struct nl_sock *sock;

    if (netdev_linux_miimon_enabled()) {
        netdev_linux_miimon_wait();
    }
    sock = netdev_linux_notify_sock();
    if (sock) {
        nl_sock_wait(sock, POLLIN);
    }
}
709 netdev_linux_changed(struct netdev_linux
*dev
,
710 unsigned int ifi_flags
, unsigned int mask
)
711 OVS_REQUIRES(dev
->mutex
)
713 netdev_change_seq_changed(&dev
->up
);
715 if ((dev
->ifi_flags
^ ifi_flags
) & IFF_RUNNING
) {
716 dev
->carrier_resets
++;
718 dev
->ifi_flags
= ifi_flags
;
720 dev
->cache_valid
&= mask
;
724 netdev_linux_update(struct netdev_linux
*dev
,
725 const struct rtnetlink_change
*change
)
726 OVS_REQUIRES(dev
->mutex
)
728 if (rtnetlink_type_is_rtnlgrp_link(change
->nlmsg_type
)){
729 if (change
->nlmsg_type
== RTM_NEWLINK
) {
730 /* Keep drv-info, in4, in6. */
731 netdev_linux_changed(dev
, change
->ifi_flags
,
732 VALID_DRVINFO
| VALID_IN4
| VALID_IN6
);
734 /* Update netdev from rtnl-change msg. */
736 dev
->mtu
= change
->mtu
;
737 dev
->cache_valid
|= VALID_MTU
;
738 dev
->netdev_mtu_error
= 0;
741 if (!eth_addr_is_zero(change
->mac
)) {
742 dev
->etheraddr
= change
->mac
;
743 dev
->cache_valid
|= VALID_ETHERADDR
;
744 dev
->ether_addr_error
= 0;
747 dev
->ifindex
= change
->if_index
;
748 dev
->cache_valid
|= VALID_IFINDEX
;
749 dev
->get_ifindex_error
= 0;
751 netdev_linux_changed(dev
, change
->ifi_flags
, 0);
753 } else if (rtnetlink_type_is_rtnlgrp_addr(change
->nlmsg_type
)) {
754 /* Invalidates in4, in6. */
755 netdev_linux_changed(dev
, dev
->ifi_flags
,
756 ~(VALID_IN4
| VALID_IN6
));
762 static struct netdev
*
763 netdev_linux_alloc(void)
765 struct netdev_linux
*netdev
= xzalloc(sizeof *netdev
);
770 netdev_linux_common_construct(struct netdev_linux
*netdev
)
772 ovs_mutex_init(&netdev
->mutex
);
775 /* Creates system and internal devices. */
777 netdev_linux_construct(struct netdev
*netdev_
)
779 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
782 netdev_linux_common_construct(netdev
);
784 error
= get_flags(&netdev
->up
, &netdev
->ifi_flags
);
785 if (error
== ENODEV
) {
786 if (netdev
->up
.netdev_class
!= &netdev_internal_class
) {
787 /* The device does not exist, so don't allow it to be opened. */
790 /* "Internal" netdevs have to be created as netdev objects before
791 * they exist in the kernel, because creating them in the kernel
792 * happens by passing a netdev object to dpif_port_add().
793 * Therefore, ignore the error. */
800 /* For most types of netdevs we open the device for each call of
801 * netdev_open(). However, this is not the case with tap devices,
802 * since it is only possible to open the device once. In this
803 * situation we share a single file descriptor, and consequently
804 * buffers, across all readers. Therefore once data is read it will
805 * be unavailable to other reads for tap devices. */
807 netdev_linux_construct_tap(struct netdev
*netdev_
)
809 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
810 static const char tap_dev
[] = "/dev/net/tun";
811 const char *name
= netdev_
->name
;
815 netdev_linux_common_construct(netdev
);
817 /* Open tap device. */
818 netdev
->tap_fd
= open(tap_dev
, O_RDWR
);
819 if (netdev
->tap_fd
< 0) {
821 VLOG_WARN("opening \"%s\" failed: %s", tap_dev
, ovs_strerror(error
));
825 /* Create tap device. */
826 ifr
.ifr_flags
= IFF_TAP
| IFF_NO_PI
;
827 ovs_strzcpy(ifr
.ifr_name
, name
, sizeof ifr
.ifr_name
);
828 if (ioctl(netdev
->tap_fd
, TUNSETIFF
, &ifr
) == -1) {
829 VLOG_WARN("%s: creating tap device failed: %s", name
,
830 ovs_strerror(errno
));
835 /* Make non-blocking. */
836 error
= set_nonblocking(netdev
->tap_fd
);
844 close(netdev
->tap_fd
);
849 netdev_linux_destruct(struct netdev
*netdev_
)
851 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
853 if (netdev
->tc
&& netdev
->tc
->ops
->tc_destroy
) {
854 netdev
->tc
->ops
->tc_destroy(netdev
->tc
);
857 if (netdev_get_class(netdev_
) == &netdev_tap_class
858 && netdev
->tap_fd
>= 0)
860 close(netdev
->tap_fd
);
863 if (netdev
->miimon_interval
> 0) {
864 atomic_count_dec(&miimon_cnt
);
867 ovs_mutex_destroy(&netdev
->mutex
);
/* Frees the netdev_linux allocated by netdev_linux_alloc(). */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    free(netdev);
}
877 static struct netdev_rxq
*
878 netdev_linux_rxq_alloc(void)
880 struct netdev_rxq_linux
*rx
= xzalloc(sizeof *rx
);
885 netdev_linux_rxq_construct(struct netdev_rxq
*rxq_
)
887 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
888 struct netdev
*netdev_
= rx
->up
.netdev
;
889 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
892 ovs_mutex_lock(&netdev
->mutex
);
893 rx
->is_tap
= is_tap_netdev(netdev_
);
895 rx
->fd
= netdev
->tap_fd
;
897 struct sockaddr_ll sll
;
899 /* Result of tcpdump -dd inbound */
900 static const struct sock_filter filt
[] = {
901 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
902 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
903 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
904 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
906 static const struct sock_fprog fprog
= {
907 ARRAY_SIZE(filt
), (struct sock_filter
*) filt
910 /* Create file descriptor. */
911 rx
->fd
= socket(PF_PACKET
, SOCK_RAW
, 0);
914 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error
));
919 if (setsockopt(rx
->fd
, SOL_PACKET
, PACKET_AUXDATA
, &val
, sizeof val
)) {
921 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
922 netdev_get_name(netdev_
), ovs_strerror(error
));
926 /* Set non-blocking mode. */
927 error
= set_nonblocking(rx
->fd
);
932 /* Get ethernet device index. */
933 error
= get_ifindex(&netdev
->up
, &ifindex
);
938 /* Bind to specific ethernet device. */
939 memset(&sll
, 0, sizeof sll
);
940 sll
.sll_family
= AF_PACKET
;
941 sll
.sll_ifindex
= ifindex
;
942 sll
.sll_protocol
= htons(ETH_P_ALL
);
943 if (bind(rx
->fd
, (struct sockaddr
*) &sll
, sizeof sll
) < 0) {
945 VLOG_ERR("%s: failed to bind raw socket (%s)",
946 netdev_get_name(netdev_
), ovs_strerror(error
));
950 /* Filter for only inbound packets. */
951 error
= setsockopt(rx
->fd
, SOL_SOCKET
, SO_ATTACH_FILTER
, &fprog
,
955 VLOG_ERR("%s: failed to attach filter (%s)",
956 netdev_get_name(netdev_
), ovs_strerror(error
));
960 ovs_mutex_unlock(&netdev
->mutex
);
968 ovs_mutex_unlock(&netdev
->mutex
);
973 netdev_linux_rxq_destruct(struct netdev_rxq
*rxq_
)
975 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
/* Frees the rx queue allocated by netdev_linux_rxq_alloc(). */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);

    free(rx);
}
991 auxdata_to_vlan_tpid(const struct tpacket_auxdata
*aux
)
993 if (aux
->tp_status
& TP_STATUS_VLAN_TPID_VALID
) {
994 return htons(aux
->tp_vlan_tpid
);
996 return htons(ETH_TYPE_VLAN
);
1001 auxdata_has_vlan_tci(const struct tpacket_auxdata
*aux
)
1003 return aux
->tp_vlan_tci
|| aux
->tp_status
& TP_STATUS_VLAN_VALID
;
1007 netdev_linux_rxq_recv_sock(int fd
, struct dp_packet
*buffer
)
1012 struct cmsghdr
*cmsg
;
1014 struct cmsghdr cmsg
;
1015 char buffer
[CMSG_SPACE(sizeof(struct tpacket_auxdata
))];
1019 /* Reserve headroom for a single VLAN tag */
1020 dp_packet_reserve(buffer
, VLAN_HEADER_LEN
);
1021 size
= dp_packet_tailroom(buffer
);
1023 iov
.iov_base
= dp_packet_data(buffer
);
1025 msgh
.msg_name
= NULL
;
1026 msgh
.msg_namelen
= 0;
1027 msgh
.msg_iov
= &iov
;
1028 msgh
.msg_iovlen
= 1;
1029 msgh
.msg_control
= &cmsg_buffer
;
1030 msgh
.msg_controllen
= sizeof cmsg_buffer
;
1034 retval
= recvmsg(fd
, &msgh
, MSG_TRUNC
);
1035 } while (retval
< 0 && errno
== EINTR
);
1039 } else if (retval
> size
) {
1043 dp_packet_set_size(buffer
, dp_packet_size(buffer
) + retval
);
1045 for (cmsg
= CMSG_FIRSTHDR(&msgh
); cmsg
; cmsg
= CMSG_NXTHDR(&msgh
, cmsg
)) {
1046 const struct tpacket_auxdata
*aux
;
1048 if (cmsg
->cmsg_level
!= SOL_PACKET
1049 || cmsg
->cmsg_type
!= PACKET_AUXDATA
1050 || cmsg
->cmsg_len
< CMSG_LEN(sizeof(struct tpacket_auxdata
))) {
1054 aux
= ALIGNED_CAST(struct tpacket_auxdata
*, CMSG_DATA(cmsg
));
1055 if (auxdata_has_vlan_tci(aux
)) {
1056 if (retval
< ETH_HEADER_LEN
) {
1060 eth_push_vlan(buffer
, auxdata_to_vlan_tpid(aux
),
1061 htons(aux
->tp_vlan_tci
));
/* Receives one packet from tap fd 'fd' into 'buffer'.  Returns 0 on success,
 * otherwise a positive errno value. */
static int
netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
{
    ssize_t retval;
    size_t size = dp_packet_tailroom(buffer);

    do {
        retval = read(fd, dp_packet_data(buffer), size);
    } while (retval < 0 && errno == EINTR);

    if (retval < 0) {
        return errno;
    }

    dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
    return 0;
}
1088 netdev_linux_rxq_recv(struct netdev_rxq
*rxq_
, struct dp_packet
**packets
,
1091 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1092 struct netdev
*netdev
= rx
->up
.netdev
;
1093 struct dp_packet
*buffer
;
1097 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
)) {
1098 mtu
= ETH_PAYLOAD_MAX
;
1101 buffer
= dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN
+ mtu
,
1102 DP_NETDEV_HEADROOM
);
1103 retval
= (rx
->is_tap
1104 ? netdev_linux_rxq_recv_tap(rx
->fd
, buffer
)
1105 : netdev_linux_rxq_recv_sock(rx
->fd
, buffer
));
1108 if (retval
!= EAGAIN
&& retval
!= EMSGSIZE
) {
1109 VLOG_WARN_RL(&rl
, "error receiving Ethernet packet on %s: %s",
1110 netdev_rxq_get_name(rxq_
), ovs_strerror(errno
));
1112 dp_packet_delete(buffer
);
1114 dp_packet_pad(buffer
);
1115 dp_packet_rss_invalidate(buffer
);
1116 packets
[0] = buffer
;
1124 netdev_linux_rxq_wait(struct netdev_rxq
*rxq_
)
1126 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1127 poll_fd_wait(rx
->fd
, POLLIN
);
1131 netdev_linux_rxq_drain(struct netdev_rxq
*rxq_
)
1133 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1136 int error
= af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_
), &ifr
,
1137 SIOCGIFTXQLEN
, "SIOCGIFTXQLEN");
1141 drain_fd(rx
->fd
, ifr
.ifr_qlen
);
1144 return drain_rcvbuf(rx
->fd
);
1148 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1149 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1150 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1151 * the packet is too big or too small to transmit on the device.
1153 * The caller retains ownership of 'buffer' in all cases.
1155 * The kernel maintains a packet transmission queue, so the caller is not
1156 * expected to do additional queuing of packets. */
1158 netdev_linux_send(struct netdev
*netdev_
, int qid OVS_UNUSED
,
1159 struct dp_packet
**pkts
, int cnt
, bool may_steal
)
1164 /* 'i' is incremented only if there's no error */
1165 for (i
= 0; i
< cnt
;) {
1166 const void *data
= dp_packet_data(pkts
[i
]);
1167 size_t size
= dp_packet_size(pkts
[i
]);
1170 if (!is_tap_netdev(netdev_
)) {
1171 /* Use our AF_PACKET socket to send to this device. */
1172 struct sockaddr_ll sll
;
1178 sock
= af_packet_sock();
1183 ifindex
= netdev_get_ifindex(netdev_
);
1188 /* We don't bother setting most fields in sockaddr_ll because the
1189 * kernel ignores them for SOCK_RAW. */
1190 memset(&sll
, 0, sizeof sll
);
1191 sll
.sll_family
= AF_PACKET
;
1192 sll
.sll_ifindex
= ifindex
;
1194 iov
.iov_base
= CONST_CAST(void *, data
);
1197 msg
.msg_name
= &sll
;
1198 msg
.msg_namelen
= sizeof sll
;
1201 msg
.msg_control
= NULL
;
1202 msg
.msg_controllen
= 0;
1205 retval
= sendmsg(sock
, &msg
, 0);
1207 /* Use the tap fd to send to this device. This is essential for
1208 * tap devices, because packets sent to a tap device with an
1209 * AF_PACKET socket will loop back to be *received* again on the
1210 * tap device. This doesn't occur on other interface types
1211 * because we attach a socket filter to the rx socket. */
1212 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1214 retval
= write(netdev
->tap_fd
, data
, size
);
1218 /* The Linux AF_PACKET implementation never blocks waiting for room
1219 * for packets, instead returning ENOBUFS. Translate this into
1220 * EAGAIN for the caller. */
1221 error
= errno
== ENOBUFS
? EAGAIN
: errno
;
1222 if (error
== EINTR
) {
1223 /* continue without incrementing 'i', i.e. retry this packet */
1227 } else if (retval
!= size
) {
1228 VLOG_WARN_RL(&rl
, "sent partial Ethernet packet (%"PRIuSIZE
" bytes"
1229 " of %"PRIuSIZE
") on %s", retval
, size
,
1230 netdev_get_name(netdev_
));
1235 /* Process the next packet in the batch */
1240 for (i
= 0; i
< cnt
; i
++) {
1241 dp_packet_delete(pkts
[i
]);
1245 if (error
&& error
!= EAGAIN
) {
1246 VLOG_WARN_RL(&rl
, "error sending Ethernet packet on %s: %s",
1247 netdev_get_name(netdev_
), ovs_strerror(error
));
1254 /* Registers with the poll loop to wake up from the next call to poll_block()
1255 * when the packet transmission queue has sufficient room to transmit a packet
1256 * with netdev_send().
1258 * The kernel maintains a packet transmission queue, so the client is not
1259 * expected to do additional queuing of packets. Thus, this function is
1260 * unlikely to ever be used. It is included for completeness. */
1262 netdev_linux_send_wait(struct netdev
*netdev
, int qid OVS_UNUSED
)
1264 if (is_tap_netdev(netdev
)) {
1265 /* TAP device always accepts packets.*/
1266 poll_immediate_wake();
1270 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1271 * otherwise a positive errno value. */
1273 netdev_linux_set_etheraddr(struct netdev
*netdev_
, const struct eth_addr mac
)
1275 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1276 enum netdev_flags old_flags
= 0;
1279 ovs_mutex_lock(&netdev
->mutex
);
1281 if (netdev
->cache_valid
& VALID_ETHERADDR
) {
1282 error
= netdev
->ether_addr_error
;
1283 if (error
|| eth_addr_equals(netdev
->etheraddr
, mac
)) {
1286 netdev
->cache_valid
&= ~VALID_ETHERADDR
;
1289 /* Tap devices must be brought down before setting the address. */
1290 if (is_tap_netdev(netdev_
)) {
1291 update_flags(netdev
, NETDEV_UP
, 0, &old_flags
);
1293 error
= set_etheraddr(netdev_get_name(netdev_
), mac
);
1294 if (!error
|| error
== ENODEV
) {
1295 netdev
->ether_addr_error
= error
;
1296 netdev
->cache_valid
|= VALID_ETHERADDR
;
1298 netdev
->etheraddr
= mac
;
1302 if (is_tap_netdev(netdev_
) && old_flags
& NETDEV_UP
) {
1303 update_flags(netdev
, 0, NETDEV_UP
, &old_flags
);
1307 ovs_mutex_unlock(&netdev
->mutex
);
1311 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1313 netdev_linux_get_etheraddr(const struct netdev
*netdev_
, struct eth_addr
*mac
)
1315 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1318 ovs_mutex_lock(&netdev
->mutex
);
1319 if (!(netdev
->cache_valid
& VALID_ETHERADDR
)) {
1320 netdev
->ether_addr_error
= get_etheraddr(netdev_get_name(netdev_
),
1321 &netdev
->etheraddr
);
1322 netdev
->cache_valid
|= VALID_ETHERADDR
;
1325 error
= netdev
->ether_addr_error
;
1327 *mac
= netdev
->etheraddr
;
1329 ovs_mutex_unlock(&netdev
->mutex
);
1335 netdev_linux_get_mtu__(struct netdev_linux
*netdev
, int *mtup
)
1339 if (!(netdev
->cache_valid
& VALID_MTU
)) {
1342 netdev
->netdev_mtu_error
= af_inet_ifreq_ioctl(
1343 netdev_get_name(&netdev
->up
), &ifr
, SIOCGIFMTU
, "SIOCGIFMTU");
1344 netdev
->mtu
= ifr
.ifr_mtu
;
1345 netdev
->cache_valid
|= VALID_MTU
;
1348 error
= netdev
->netdev_mtu_error
;
1350 *mtup
= netdev
->mtu
;
1356 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1357 * in bytes, not including the hardware header; thus, this is typically 1500
1358 * bytes for Ethernet devices. */
1360 netdev_linux_get_mtu(const struct netdev
*netdev_
, int *mtup
)
1362 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1365 ovs_mutex_lock(&netdev
->mutex
);
1366 error
= netdev_linux_get_mtu__(netdev
, mtup
);
1367 ovs_mutex_unlock(&netdev
->mutex
);
1372 /* Sets the maximum size of transmitted (MTU) for given device using linux
1373 * networking ioctl interface.
1376 netdev_linux_set_mtu(const struct netdev
*netdev_
, int mtu
)
1378 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1382 ovs_mutex_lock(&netdev
->mutex
);
1383 if (netdev
->cache_valid
& VALID_MTU
) {
1384 error
= netdev
->netdev_mtu_error
;
1385 if (error
|| netdev
->mtu
== mtu
) {
1388 netdev
->cache_valid
&= ~VALID_MTU
;
1391 error
= af_inet_ifreq_ioctl(netdev_get_name(netdev_
), &ifr
,
1392 SIOCSIFMTU
, "SIOCSIFMTU");
1393 if (!error
|| error
== ENODEV
) {
1394 netdev
->netdev_mtu_error
= error
;
1395 netdev
->mtu
= ifr
.ifr_mtu
;
1396 netdev
->cache_valid
|= VALID_MTU
;
1399 ovs_mutex_unlock(&netdev
->mutex
);
1403 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1404 * On failure, returns a negative errno value. */
1406 netdev_linux_get_ifindex(const struct netdev
*netdev_
)
1408 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1411 ovs_mutex_lock(&netdev
->mutex
);
1412 error
= get_ifindex(netdev_
, &ifindex
);
1413 ovs_mutex_unlock(&netdev
->mutex
);
1415 return error
? -error
: ifindex
;
1419 netdev_linux_get_carrier(const struct netdev
*netdev_
, bool *carrier
)
1421 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1423 ovs_mutex_lock(&netdev
->mutex
);
1424 if (netdev
->miimon_interval
> 0) {
1425 *carrier
= netdev
->miimon
;
1427 *carrier
= (netdev
->ifi_flags
& IFF_RUNNING
) != 0;
1429 ovs_mutex_unlock(&netdev
->mutex
);
1434 static long long int
1435 netdev_linux_get_carrier_resets(const struct netdev
*netdev_
)
1437 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1438 long long int carrier_resets
;
1440 ovs_mutex_lock(&netdev
->mutex
);
1441 carrier_resets
= netdev
->carrier_resets
;
1442 ovs_mutex_unlock(&netdev
->mutex
);
1444 return carrier_resets
;
1448 netdev_linux_do_miimon(const char *name
, int cmd
, const char *cmd_name
,
1449 struct mii_ioctl_data
*data
)
1454 memset(&ifr
, 0, sizeof ifr
);
1455 memcpy(&ifr
.ifr_data
, data
, sizeof *data
);
1456 error
= af_inet_ifreq_ioctl(name
, &ifr
, cmd
, cmd_name
);
1457 memcpy(data
, &ifr
.ifr_data
, sizeof *data
);
1463 netdev_linux_get_miimon(const char *name
, bool *miimon
)
1465 struct mii_ioctl_data data
;
1470 memset(&data
, 0, sizeof data
);
1471 error
= netdev_linux_do_miimon(name
, SIOCGMIIPHY
, "SIOCGMIIPHY", &data
);
1473 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1474 data
.reg_num
= MII_BMSR
;
1475 error
= netdev_linux_do_miimon(name
, SIOCGMIIREG
, "SIOCGMIIREG",
1479 *miimon
= !!(data
.val_out
& BMSR_LSTATUS
);
1481 VLOG_WARN_RL(&rl
, "%s: failed to query MII", name
);
1484 struct ethtool_cmd ecmd
;
1486 VLOG_DBG_RL(&rl
, "%s: failed to query MII, falling back to ethtool",
1489 COVERAGE_INC(netdev_get_ethtool
);
1490 memset(&ecmd
, 0, sizeof ecmd
);
1491 error
= netdev_linux_do_ethtool(name
, &ecmd
, ETHTOOL_GLINK
,
1494 struct ethtool_value eval
;
1496 memcpy(&eval
, &ecmd
, sizeof eval
);
1497 *miimon
= !!eval
.data
;
1499 VLOG_WARN_RL(&rl
, "%s: ethtool link status failed", name
);
1507 netdev_linux_set_miimon_interval(struct netdev
*netdev_
,
1508 long long int interval
)
1510 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1512 ovs_mutex_lock(&netdev
->mutex
);
1513 interval
= interval
> 0 ? MAX(interval
, 100) : 0;
1514 if (netdev
->miimon_interval
!= interval
) {
1515 if (interval
&& !netdev
->miimon_interval
) {
1516 atomic_count_inc(&miimon_cnt
);
1517 } else if (!interval
&& netdev
->miimon_interval
) {
1518 atomic_count_dec(&miimon_cnt
);
1521 netdev
->miimon_interval
= interval
;
1522 timer_set_expired(&netdev
->miimon_timer
);
1524 ovs_mutex_unlock(&netdev
->mutex
);
1530 netdev_linux_miimon_run(void)
1532 struct shash device_shash
;
1533 struct shash_node
*node
;
1535 shash_init(&device_shash
);
1536 netdev_get_devices(&netdev_linux_class
, &device_shash
);
1537 SHASH_FOR_EACH (node
, &device_shash
) {
1538 struct netdev
*netdev
= node
->data
;
1539 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
1542 ovs_mutex_lock(&dev
->mutex
);
1543 if (dev
->miimon_interval
> 0 && timer_expired(&dev
->miimon_timer
)) {
1544 netdev_linux_get_miimon(dev
->up
.name
, &miimon
);
1545 if (miimon
!= dev
->miimon
) {
1546 dev
->miimon
= miimon
;
1547 netdev_linux_changed(dev
, dev
->ifi_flags
, 0);
1550 timer_set_duration(&dev
->miimon_timer
, dev
->miimon_interval
);
1552 ovs_mutex_unlock(&dev
->mutex
);
1553 netdev_close(netdev
);
1556 shash_destroy(&device_shash
);
1560 netdev_linux_miimon_wait(void)
1562 struct shash device_shash
;
1563 struct shash_node
*node
;
1565 shash_init(&device_shash
);
1566 netdev_get_devices(&netdev_linux_class
, &device_shash
);
1567 SHASH_FOR_EACH (node
, &device_shash
) {
1568 struct netdev
*netdev
= node
->data
;
1569 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
1571 ovs_mutex_lock(&dev
->mutex
);
1572 if (dev
->miimon_interval
> 0) {
1573 timer_wait(&dev
->miimon_timer
);
1575 ovs_mutex_unlock(&dev
->mutex
);
1576 netdev_close(netdev
);
1578 shash_destroy(&device_shash
);
/* Exchanges the values of '*a' and '*b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t tmp = *a;
    *a = *b;
    *b = tmp;
}
1589 /* Copies 'src' into 'dst', performing format conversion in the process.
1591 * 'src' is allowed to be misaligned. */
1593 netdev_stats_from_ovs_vport_stats(struct netdev_stats
*dst
,
1594 const struct ovs_vport_stats
*src
)
1596 dst
->rx_packets
= get_32aligned_u64(&src
->rx_packets
);
1597 dst
->tx_packets
= get_32aligned_u64(&src
->tx_packets
);
1598 dst
->rx_bytes
= get_32aligned_u64(&src
->rx_bytes
);
1599 dst
->tx_bytes
= get_32aligned_u64(&src
->tx_bytes
);
1600 dst
->rx_errors
= get_32aligned_u64(&src
->rx_errors
);
1601 dst
->tx_errors
= get_32aligned_u64(&src
->tx_errors
);
1602 dst
->rx_dropped
= get_32aligned_u64(&src
->rx_dropped
);
1603 dst
->tx_dropped
= get_32aligned_u64(&src
->tx_dropped
);
1605 dst
->collisions
= 0;
1606 dst
->rx_length_errors
= 0;
1607 dst
->rx_over_errors
= 0;
1608 dst
->rx_crc_errors
= 0;
1609 dst
->rx_frame_errors
= 0;
1610 dst
->rx_fifo_errors
= 0;
1611 dst
->rx_missed_errors
= 0;
1612 dst
->tx_aborted_errors
= 0;
1613 dst
->tx_carrier_errors
= 0;
1614 dst
->tx_fifo_errors
= 0;
1615 dst
->tx_heartbeat_errors
= 0;
1616 dst
->tx_window_errors
= 0;
1620 get_stats_via_vport__(const struct netdev
*netdev
, struct netdev_stats
*stats
)
1622 struct dpif_netlink_vport reply
;
1626 error
= dpif_netlink_vport_get(netdev_get_name(netdev
), &reply
, &buf
);
1629 } else if (!reply
.stats
) {
1634 netdev_stats_from_ovs_vport_stats(stats
, reply
.stats
);
1642 get_stats_via_vport(const struct netdev
*netdev_
,
1643 struct netdev_stats
*stats
)
1645 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1647 if (!netdev
->vport_stats_error
||
1648 !(netdev
->cache_valid
& VALID_VPORT_STAT_ERROR
)) {
1651 error
= get_stats_via_vport__(netdev_
, stats
);
1652 if (error
&& error
!= ENOENT
&& error
!= ENODEV
) {
1653 VLOG_WARN_RL(&rl
, "%s: obtaining netdev stats via vport failed "
1655 netdev_get_name(netdev_
), ovs_strerror(error
));
1657 netdev
->vport_stats_error
= error
;
1658 netdev
->cache_valid
|= VALID_VPORT_STAT_ERROR
;
1662 /* Retrieves current device stats for 'netdev-linux'. */
1664 netdev_linux_get_stats(const struct netdev
*netdev_
,
1665 struct netdev_stats
*stats
)
1667 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1668 struct netdev_stats dev_stats
;
1671 ovs_mutex_lock(&netdev
->mutex
);
1672 get_stats_via_vport(netdev_
, stats
);
1673 error
= get_stats_via_netlink(netdev_
, &dev_stats
);
1675 if (!netdev
->vport_stats_error
) {
1678 } else if (netdev
->vport_stats_error
) {
1679 /* stats not available from OVS then use netdev stats. */
1682 /* Use kernel netdev's packet and byte counts since vport's counters
1683 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1685 stats
->rx_packets
= dev_stats
.rx_packets
;
1686 stats
->rx_bytes
= dev_stats
.rx_bytes
;
1687 stats
->tx_packets
= dev_stats
.tx_packets
;
1688 stats
->tx_bytes
= dev_stats
.tx_bytes
;
1690 stats
->rx_errors
+= dev_stats
.rx_errors
;
1691 stats
->tx_errors
+= dev_stats
.tx_errors
;
1692 stats
->rx_dropped
+= dev_stats
.rx_dropped
;
1693 stats
->tx_dropped
+= dev_stats
.tx_dropped
;
1694 stats
->multicast
+= dev_stats
.multicast
;
1695 stats
->collisions
+= dev_stats
.collisions
;
1696 stats
->rx_length_errors
+= dev_stats
.rx_length_errors
;
1697 stats
->rx_over_errors
+= dev_stats
.rx_over_errors
;
1698 stats
->rx_crc_errors
+= dev_stats
.rx_crc_errors
;
1699 stats
->rx_frame_errors
+= dev_stats
.rx_frame_errors
;
1700 stats
->rx_fifo_errors
+= dev_stats
.rx_fifo_errors
;
1701 stats
->rx_missed_errors
+= dev_stats
.rx_missed_errors
;
1702 stats
->tx_aborted_errors
+= dev_stats
.tx_aborted_errors
;
1703 stats
->tx_carrier_errors
+= dev_stats
.tx_carrier_errors
;
1704 stats
->tx_fifo_errors
+= dev_stats
.tx_fifo_errors
;
1705 stats
->tx_heartbeat_errors
+= dev_stats
.tx_heartbeat_errors
;
1706 stats
->tx_window_errors
+= dev_stats
.tx_window_errors
;
1708 ovs_mutex_unlock(&netdev
->mutex
);
1713 /* Retrieves current device stats for 'netdev-tap' netdev or
1714 * netdev-internal. */
1716 netdev_tap_get_stats(const struct netdev
*netdev_
, struct netdev_stats
*stats
)
1718 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1719 struct netdev_stats dev_stats
;
1722 ovs_mutex_lock(&netdev
->mutex
);
1723 get_stats_via_vport(netdev_
, stats
);
1724 error
= get_stats_via_netlink(netdev_
, &dev_stats
);
1726 if (!netdev
->vport_stats_error
) {
1729 } else if (netdev
->vport_stats_error
) {
1730 /* Transmit and receive stats will appear to be swapped relative to the
1731 * other ports since we are the one sending the data, not a remote
1732 * computer. For consistency, we swap them back here. This does not
1733 * apply if we are getting stats from the vport layer because it always
1734 * tracks stats from the perspective of the switch. */
1737 swap_uint64(&stats
->rx_packets
, &stats
->tx_packets
);
1738 swap_uint64(&stats
->rx_bytes
, &stats
->tx_bytes
);
1739 swap_uint64(&stats
->rx_errors
, &stats
->tx_errors
);
1740 swap_uint64(&stats
->rx_dropped
, &stats
->tx_dropped
);
1741 stats
->rx_length_errors
= 0;
1742 stats
->rx_over_errors
= 0;
1743 stats
->rx_crc_errors
= 0;
1744 stats
->rx_frame_errors
= 0;
1745 stats
->rx_fifo_errors
= 0;
1746 stats
->rx_missed_errors
= 0;
1747 stats
->tx_aborted_errors
= 0;
1748 stats
->tx_carrier_errors
= 0;
1749 stats
->tx_fifo_errors
= 0;
1750 stats
->tx_heartbeat_errors
= 0;
1751 stats
->tx_window_errors
= 0;
1753 /* Use kernel netdev's packet and byte counts since vport counters
1754 * do not reflect packet counts on the wire when GSO, TSO or GRO
1756 stats
->rx_packets
= dev_stats
.tx_packets
;
1757 stats
->rx_bytes
= dev_stats
.tx_bytes
;
1758 stats
->tx_packets
= dev_stats
.rx_packets
;
1759 stats
->tx_bytes
= dev_stats
.rx_bytes
;
1761 stats
->rx_dropped
+= dev_stats
.tx_dropped
;
1762 stats
->tx_dropped
+= dev_stats
.rx_dropped
;
1764 stats
->rx_errors
+= dev_stats
.tx_errors
;
1765 stats
->tx_errors
+= dev_stats
.rx_errors
;
1767 stats
->multicast
+= dev_stats
.multicast
;
1768 stats
->collisions
+= dev_stats
.collisions
;
1770 ovs_mutex_unlock(&netdev
->mutex
);
1776 netdev_internal_get_stats(const struct netdev
*netdev_
,
1777 struct netdev_stats
*stats
)
1779 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1782 ovs_mutex_lock(&netdev
->mutex
);
1783 get_stats_via_vport(netdev_
, stats
);
1784 error
= netdev
->vport_stats_error
;
1785 ovs_mutex_unlock(&netdev
->mutex
);
1791 netdev_linux_read_features(struct netdev_linux
*netdev
)
1793 struct ethtool_cmd ecmd
;
1797 if (netdev
->cache_valid
& VALID_FEATURES
) {
1801 COVERAGE_INC(netdev_get_ethtool
);
1802 memset(&ecmd
, 0, sizeof ecmd
);
1803 error
= netdev_linux_do_ethtool(netdev
->up
.name
, &ecmd
,
1804 ETHTOOL_GSET
, "ETHTOOL_GSET");
1809 /* Supported features. */
1810 netdev
->supported
= 0;
1811 if (ecmd
.supported
& SUPPORTED_10baseT_Half
) {
1812 netdev
->supported
|= NETDEV_F_10MB_HD
;
1814 if (ecmd
.supported
& SUPPORTED_10baseT_Full
) {
1815 netdev
->supported
|= NETDEV_F_10MB_FD
;
1817 if (ecmd
.supported
& SUPPORTED_100baseT_Half
) {
1818 netdev
->supported
|= NETDEV_F_100MB_HD
;
1820 if (ecmd
.supported
& SUPPORTED_100baseT_Full
) {
1821 netdev
->supported
|= NETDEV_F_100MB_FD
;
1823 if (ecmd
.supported
& SUPPORTED_1000baseT_Half
) {
1824 netdev
->supported
|= NETDEV_F_1GB_HD
;
1826 if ((ecmd
.supported
& SUPPORTED_1000baseT_Full
) ||
1827 (ecmd
.supported
& SUPPORTED_1000baseKX_Full
)) {
1828 netdev
->supported
|= NETDEV_F_1GB_FD
;
1830 if ((ecmd
.supported
& SUPPORTED_10000baseT_Full
) ||
1831 (ecmd
.supported
& SUPPORTED_10000baseKX4_Full
) ||
1832 (ecmd
.supported
& SUPPORTED_10000baseKR_Full
) ||
1833 (ecmd
.supported
& SUPPORTED_10000baseR_FEC
)) {
1834 netdev
->supported
|= NETDEV_F_10GB_FD
;
1836 if ((ecmd
.supported
& SUPPORTED_40000baseKR4_Full
) ||
1837 (ecmd
.supported
& SUPPORTED_40000baseCR4_Full
) ||
1838 (ecmd
.supported
& SUPPORTED_40000baseSR4_Full
) ||
1839 (ecmd
.supported
& SUPPORTED_40000baseLR4_Full
)) {
1840 netdev
->supported
|= NETDEV_F_40GB_FD
;
1842 if (ecmd
.supported
& SUPPORTED_TP
) {
1843 netdev
->supported
|= NETDEV_F_COPPER
;
1845 if (ecmd
.supported
& SUPPORTED_FIBRE
) {
1846 netdev
->supported
|= NETDEV_F_FIBER
;
1848 if (ecmd
.supported
& SUPPORTED_Autoneg
) {
1849 netdev
->supported
|= NETDEV_F_AUTONEG
;
1851 if (ecmd
.supported
& SUPPORTED_Pause
) {
1852 netdev
->supported
|= NETDEV_F_PAUSE
;
1854 if (ecmd
.supported
& SUPPORTED_Asym_Pause
) {
1855 netdev
->supported
|= NETDEV_F_PAUSE_ASYM
;
1858 /* Advertised features. */
1859 netdev
->advertised
= 0;
1860 if (ecmd
.advertising
& ADVERTISED_10baseT_Half
) {
1861 netdev
->advertised
|= NETDEV_F_10MB_HD
;
1863 if (ecmd
.advertising
& ADVERTISED_10baseT_Full
) {
1864 netdev
->advertised
|= NETDEV_F_10MB_FD
;
1866 if (ecmd
.advertising
& ADVERTISED_100baseT_Half
) {
1867 netdev
->advertised
|= NETDEV_F_100MB_HD
;
1869 if (ecmd
.advertising
& ADVERTISED_100baseT_Full
) {
1870 netdev
->advertised
|= NETDEV_F_100MB_FD
;
1872 if (ecmd
.advertising
& ADVERTISED_1000baseT_Half
) {
1873 netdev
->advertised
|= NETDEV_F_1GB_HD
;
1875 if ((ecmd
.advertising
& ADVERTISED_1000baseT_Full
) ||
1876 (ecmd
.advertising
& ADVERTISED_1000baseKX_Full
)) {
1877 netdev
->advertised
|= NETDEV_F_1GB_FD
;
1879 if ((ecmd
.advertising
& ADVERTISED_10000baseT_Full
) ||
1880 (ecmd
.advertising
& ADVERTISED_10000baseKX4_Full
) ||
1881 (ecmd
.advertising
& ADVERTISED_10000baseKR_Full
) ||
1882 (ecmd
.advertising
& ADVERTISED_10000baseR_FEC
)) {
1883 netdev
->advertised
|= NETDEV_F_10GB_FD
;
1885 if ((ecmd
.advertising
& ADVERTISED_40000baseKR4_Full
) ||
1886 (ecmd
.advertising
& ADVERTISED_40000baseCR4_Full
) ||
1887 (ecmd
.advertising
& ADVERTISED_40000baseSR4_Full
) ||
1888 (ecmd
.advertising
& ADVERTISED_40000baseLR4_Full
)) {
1889 netdev
->advertised
|= NETDEV_F_40GB_FD
;
1891 if (ecmd
.advertising
& ADVERTISED_TP
) {
1892 netdev
->advertised
|= NETDEV_F_COPPER
;
1894 if (ecmd
.advertising
& ADVERTISED_FIBRE
) {
1895 netdev
->advertised
|= NETDEV_F_FIBER
;
1897 if (ecmd
.advertising
& ADVERTISED_Autoneg
) {
1898 netdev
->advertised
|= NETDEV_F_AUTONEG
;
1900 if (ecmd
.advertising
& ADVERTISED_Pause
) {
1901 netdev
->advertised
|= NETDEV_F_PAUSE
;
1903 if (ecmd
.advertising
& ADVERTISED_Asym_Pause
) {
1904 netdev
->advertised
|= NETDEV_F_PAUSE_ASYM
;
1907 /* Current settings. */
1908 speed
= ethtool_cmd_speed(&ecmd
);
1909 if (speed
== SPEED_10
) {
1910 netdev
->current
= ecmd
.duplex
? NETDEV_F_10MB_FD
: NETDEV_F_10MB_HD
;
1911 } else if (speed
== SPEED_100
) {
1912 netdev
->current
= ecmd
.duplex
? NETDEV_F_100MB_FD
: NETDEV_F_100MB_HD
;
1913 } else if (speed
== SPEED_1000
) {
1914 netdev
->current
= ecmd
.duplex
? NETDEV_F_1GB_FD
: NETDEV_F_1GB_HD
;
1915 } else if (speed
== SPEED_10000
) {
1916 netdev
->current
= NETDEV_F_10GB_FD
;
1917 } else if (speed
== 40000) {
1918 netdev
->current
= NETDEV_F_40GB_FD
;
1919 } else if (speed
== 100000) {
1920 netdev
->current
= NETDEV_F_100GB_FD
;
1921 } else if (speed
== 1000000) {
1922 netdev
->current
= NETDEV_F_1TB_FD
;
1924 netdev
->current
= 0;
1927 if (ecmd
.port
== PORT_TP
) {
1928 netdev
->current
|= NETDEV_F_COPPER
;
1929 } else if (ecmd
.port
== PORT_FIBRE
) {
1930 netdev
->current
|= NETDEV_F_FIBER
;
1934 netdev
->current
|= NETDEV_F_AUTONEG
;
1938 netdev
->cache_valid
|= VALID_FEATURES
;
1939 netdev
->get_features_error
= error
;
1942 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1943 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1944 * Returns 0 if successful, otherwise a positive errno value. */
1946 netdev_linux_get_features(const struct netdev
*netdev_
,
1947 enum netdev_features
*current
,
1948 enum netdev_features
*advertised
,
1949 enum netdev_features
*supported
,
1950 enum netdev_features
*peer
)
1952 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1955 ovs_mutex_lock(&netdev
->mutex
);
1956 netdev_linux_read_features(netdev
);
1957 if (!netdev
->get_features_error
) {
1958 *current
= netdev
->current
;
1959 *advertised
= netdev
->advertised
;
1960 *supported
= netdev
->supported
;
1961 *peer
= 0; /* XXX */
1963 error
= netdev
->get_features_error
;
1964 ovs_mutex_unlock(&netdev
->mutex
);
1969 /* Set the features advertised by 'netdev' to 'advertise'. */
1971 netdev_linux_set_advertisements(struct netdev
*netdev_
,
1972 enum netdev_features advertise
)
1974 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1975 struct ethtool_cmd ecmd
;
1978 ovs_mutex_lock(&netdev
->mutex
);
1980 COVERAGE_INC(netdev_get_ethtool
);
1981 memset(&ecmd
, 0, sizeof ecmd
);
1982 error
= netdev_linux_do_ethtool(netdev_get_name(netdev_
), &ecmd
,
1983 ETHTOOL_GSET
, "ETHTOOL_GSET");
1988 ecmd
.advertising
= 0;
1989 if (advertise
& NETDEV_F_10MB_HD
) {
1990 ecmd
.advertising
|= ADVERTISED_10baseT_Half
;
1992 if (advertise
& NETDEV_F_10MB_FD
) {
1993 ecmd
.advertising
|= ADVERTISED_10baseT_Full
;
1995 if (advertise
& NETDEV_F_100MB_HD
) {
1996 ecmd
.advertising
|= ADVERTISED_100baseT_Half
;
1998 if (advertise
& NETDEV_F_100MB_FD
) {
1999 ecmd
.advertising
|= ADVERTISED_100baseT_Full
;
2001 if (advertise
& NETDEV_F_1GB_HD
) {
2002 ecmd
.advertising
|= ADVERTISED_1000baseT_Half
;
2004 if (advertise
& NETDEV_F_1GB_FD
) {
2005 ecmd
.advertising
|= ADVERTISED_1000baseT_Full
;
2007 if (advertise
& NETDEV_F_10GB_FD
) {
2008 ecmd
.advertising
|= ADVERTISED_10000baseT_Full
;
2010 if (advertise
& NETDEV_F_COPPER
) {
2011 ecmd
.advertising
|= ADVERTISED_TP
;
2013 if (advertise
& NETDEV_F_FIBER
) {
2014 ecmd
.advertising
|= ADVERTISED_FIBRE
;
2016 if (advertise
& NETDEV_F_AUTONEG
) {
2017 ecmd
.advertising
|= ADVERTISED_Autoneg
;
2019 if (advertise
& NETDEV_F_PAUSE
) {
2020 ecmd
.advertising
|= ADVERTISED_Pause
;
2022 if (advertise
& NETDEV_F_PAUSE_ASYM
) {
2023 ecmd
.advertising
|= ADVERTISED_Asym_Pause
;
2025 COVERAGE_INC(netdev_set_ethtool
);
2026 error
= netdev_linux_do_ethtool(netdev_get_name(netdev_
), &ecmd
,
2027 ETHTOOL_SSET
, "ETHTOOL_SSET");
2030 ovs_mutex_unlock(&netdev
->mutex
);
2034 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
2035 * successful, otherwise a positive errno value. */
2037 netdev_linux_set_policing(struct netdev
*netdev_
,
2038 uint32_t kbits_rate
, uint32_t kbits_burst
)
2040 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2041 const char *netdev_name
= netdev_get_name(netdev_
);
2044 kbits_burst
= (!kbits_rate
? 0 /* Force to 0 if no rate specified. */
2045 : !kbits_burst
? 1000 /* Default to 1000 kbits if 0. */
2046 : kbits_burst
); /* Stick with user-specified value. */
2048 ovs_mutex_lock(&netdev
->mutex
);
2049 if (netdev
->cache_valid
& VALID_POLICING
) {
2050 error
= netdev
->netdev_policing_error
;
2051 if (error
|| (netdev
->kbits_rate
== kbits_rate
&&
2052 netdev
->kbits_burst
== kbits_burst
)) {
2053 /* Assume that settings haven't changed since we last set them. */
2056 netdev
->cache_valid
&= ~VALID_POLICING
;
2059 COVERAGE_INC(netdev_set_policing
);
2060 /* Remove any existing ingress qdisc. */
2061 error
= tc_add_del_ingress_qdisc(netdev_
, false);
2063 VLOG_WARN_RL(&rl
, "%s: removing policing failed: %s",
2064 netdev_name
, ovs_strerror(error
));
2069 error
= tc_add_del_ingress_qdisc(netdev_
, true);
2071 VLOG_WARN_RL(&rl
, "%s: adding policing qdisc failed: %s",
2072 netdev_name
, ovs_strerror(error
));
2076 error
= tc_add_policer(netdev_
, kbits_rate
, kbits_burst
);
2078 VLOG_WARN_RL(&rl
, "%s: adding policing action failed: %s",
2079 netdev_name
, ovs_strerror(error
));
2084 netdev
->kbits_rate
= kbits_rate
;
2085 netdev
->kbits_burst
= kbits_burst
;
2088 if (!error
|| error
== ENODEV
) {
2089 netdev
->netdev_policing_error
= error
;
2090 netdev
->cache_valid
|= VALID_POLICING
;
2092 ovs_mutex_unlock(&netdev
->mutex
);
2097 netdev_linux_get_qos_types(const struct netdev
*netdev OVS_UNUSED
,
2100 const struct tc_ops
*const *opsp
;
2102 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2103 const struct tc_ops
*ops
= *opsp
;
2104 if (ops
->tc_install
&& ops
->ovs_name
[0] != '\0') {
2105 sset_add(types
, ops
->ovs_name
);
2111 static const struct tc_ops
*
2112 tc_lookup_ovs_name(const char *name
)
2114 const struct tc_ops
*const *opsp
;
2116 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2117 const struct tc_ops
*ops
= *opsp
;
2118 if (!strcmp(name
, ops
->ovs_name
)) {
2125 static const struct tc_ops
*
2126 tc_lookup_linux_name(const char *name
)
2128 const struct tc_ops
*const *opsp
;
2130 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2131 const struct tc_ops
*ops
= *opsp
;
2132 if (ops
->linux_name
&& !strcmp(name
, ops
->linux_name
)) {
2139 static struct tc_queue
*
2140 tc_find_queue__(const struct netdev
*netdev_
, unsigned int queue_id
,
2143 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2144 struct tc_queue
*queue
;
2146 HMAP_FOR_EACH_IN_BUCKET (queue
, hmap_node
, hash
, &netdev
->tc
->queues
) {
2147 if (queue
->queue_id
== queue_id
) {
/* Convenience wrapper around tc_find_queue__() that hashes 'queue_id'. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
}
2161 netdev_linux_get_qos_capabilities(const struct netdev
*netdev OVS_UNUSED
,
2163 struct netdev_qos_capabilities
*caps
)
2165 const struct tc_ops
*ops
= tc_lookup_ovs_name(type
);
2169 caps
->n_queues
= ops
->n_queues
;
2174 netdev_linux_get_qos(const struct netdev
*netdev_
,
2175 const char **typep
, struct smap
*details
)
2177 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2180 ovs_mutex_lock(&netdev
->mutex
);
2181 error
= tc_query_qdisc(netdev_
);
2183 *typep
= netdev
->tc
->ops
->ovs_name
;
2184 error
= (netdev
->tc
->ops
->qdisc_get
2185 ? netdev
->tc
->ops
->qdisc_get(netdev_
, details
)
2188 ovs_mutex_unlock(&netdev
->mutex
);
2194 netdev_linux_set_qos(struct netdev
*netdev_
,
2195 const char *type
, const struct smap
*details
)
2197 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2198 const struct tc_ops
*new_ops
;
2201 new_ops
= tc_lookup_ovs_name(type
);
2202 if (!new_ops
|| !new_ops
->tc_install
) {
2206 ovs_mutex_lock(&netdev
->mutex
);
2207 error
= tc_query_qdisc(netdev_
);
2212 if (new_ops
== netdev
->tc
->ops
) {
2213 error
= new_ops
->qdisc_set
? new_ops
->qdisc_set(netdev_
, details
) : 0;
2215 /* Delete existing qdisc. */
2216 error
= tc_del_qdisc(netdev_
);
2220 ovs_assert(netdev
->tc
== NULL
);
2222 /* Install new qdisc. */
2223 error
= new_ops
->tc_install(netdev_
, details
);
2224 ovs_assert((error
== 0) == (netdev
->tc
!= NULL
));
2228 ovs_mutex_unlock(&netdev
->mutex
);
2233 netdev_linux_get_queue(const struct netdev
*netdev_
,
2234 unsigned int queue_id
, struct smap
*details
)
2236 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2239 ovs_mutex_lock(&netdev
->mutex
);
2240 error
= tc_query_qdisc(netdev_
);
2242 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2244 ? netdev
->tc
->ops
->class_get(netdev_
, queue
, details
)
2247 ovs_mutex_unlock(&netdev
->mutex
);
2253 netdev_linux_set_queue(struct netdev
*netdev_
,
2254 unsigned int queue_id
, const struct smap
*details
)
2256 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2259 ovs_mutex_lock(&netdev
->mutex
);
2260 error
= tc_query_qdisc(netdev_
);
2262 error
= (queue_id
< netdev
->tc
->ops
->n_queues
2263 && netdev
->tc
->ops
->class_set
2264 ? netdev
->tc
->ops
->class_set(netdev_
, queue_id
, details
)
2267 ovs_mutex_unlock(&netdev
->mutex
);
2273 netdev_linux_delete_queue(struct netdev
*netdev_
, unsigned int queue_id
)
2275 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2278 ovs_mutex_lock(&netdev
->mutex
);
2279 error
= tc_query_qdisc(netdev_
);
2281 if (netdev
->tc
->ops
->class_delete
) {
2282 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2284 ? netdev
->tc
->ops
->class_delete(netdev_
, queue
)
2290 ovs_mutex_unlock(&netdev
->mutex
);
2296 netdev_linux_get_queue_stats(const struct netdev
*netdev_
,
2297 unsigned int queue_id
,
2298 struct netdev_queue_stats
*stats
)
2300 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2303 ovs_mutex_lock(&netdev
->mutex
);
2304 error
= tc_query_qdisc(netdev_
);
2306 if (netdev
->tc
->ops
->class_get_stats
) {
2307 const struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2309 stats
->created
= queue
->created
;
2310 error
= netdev
->tc
->ops
->class_get_stats(netdev_
, queue
,
2319 ovs_mutex_unlock(&netdev
->mutex
);
2324 struct queue_dump_state
{
2325 struct nl_dump dump
;
2330 start_queue_dump(const struct netdev
*netdev
, struct queue_dump_state
*state
)
2332 struct ofpbuf request
;
2333 struct tcmsg
*tcmsg
;
2335 tcmsg
= tc_make_request(netdev
, RTM_GETTCLASS
, 0, &request
);
2339 tcmsg
->tcm_parent
= 0;
2340 nl_dump_start(&state
->dump
, NETLINK_ROUTE
, &request
);
2341 ofpbuf_uninit(&request
);
2343 ofpbuf_init(&state
->buf
, NL_DUMP_BUFSIZE
);
2348 finish_queue_dump(struct queue_dump_state
*state
)
2350 ofpbuf_uninit(&state
->buf
);
2351 return nl_dump_done(&state
->dump
);
/* Iterator state for netdev_linux_queue_dump_{start,next,done}(): a snapshot
 * of the queue ids taken under the device mutex, walked one at a time. */
struct netdev_linux_queue_state {
    unsigned int *queues;  /* Snapshot of queue ids. */
    size_t cur_queue;      /* Next index in 'queues' to visit. */
    size_t n_queues;       /* Number of elements in 'queues'. */
};
2361 netdev_linux_queue_dump_start(const struct netdev
*netdev_
, void **statep
)
2363 const struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2366 ovs_mutex_lock(&netdev
->mutex
);
2367 error
= tc_query_qdisc(netdev_
);
2369 if (netdev
->tc
->ops
->class_get
) {
2370 struct netdev_linux_queue_state
*state
;
2371 struct tc_queue
*queue
;
2374 *statep
= state
= xmalloc(sizeof *state
);
2375 state
->n_queues
= hmap_count(&netdev
->tc
->queues
);
2376 state
->cur_queue
= 0;
2377 state
->queues
= xmalloc(state
->n_queues
* sizeof *state
->queues
);
2380 HMAP_FOR_EACH (queue
, hmap_node
, &netdev
->tc
->queues
) {
2381 state
->queues
[i
++] = queue
->queue_id
;
2387 ovs_mutex_unlock(&netdev
->mutex
);
2393 netdev_linux_queue_dump_next(const struct netdev
*netdev_
, void *state_
,
2394 unsigned int *queue_idp
, struct smap
*details
)
2396 const struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2397 struct netdev_linux_queue_state
*state
= state_
;
2400 ovs_mutex_lock(&netdev
->mutex
);
2401 while (state
->cur_queue
< state
->n_queues
) {
2402 unsigned int queue_id
= state
->queues
[state
->cur_queue
++];
2403 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2406 *queue_idp
= queue_id
;
2407 error
= netdev
->tc
->ops
->class_get(netdev_
, queue
, details
);
2411 ovs_mutex_unlock(&netdev
->mutex
);
2417 netdev_linux_queue_dump_done(const struct netdev
*netdev OVS_UNUSED
,
2420 struct netdev_linux_queue_state
*state
= state_
;
2422 free(state
->queues
);
2428 netdev_linux_dump_queue_stats(const struct netdev
*netdev_
,
2429 netdev_dump_queue_stats_cb
*cb
, void *aux
)
2431 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2434 ovs_mutex_lock(&netdev
->mutex
);
2435 error
= tc_query_qdisc(netdev_
);
2437 struct queue_dump_state state
;
2439 if (!netdev
->tc
->ops
->class_dump_stats
) {
2441 } else if (!start_queue_dump(netdev_
, &state
)) {
2447 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
2448 retval
= netdev
->tc
->ops
->class_dump_stats(netdev_
, &msg
,
2455 retval
= finish_queue_dump(&state
);
2461 ovs_mutex_unlock(&netdev
->mutex
);
2467 netdev_linux_get_in4(const struct netdev
*netdev_
,
2468 struct in_addr
*address
, struct in_addr
*netmask
)
2470 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2473 ovs_mutex_lock(&netdev
->mutex
);
2474 if (!(netdev
->cache_valid
& VALID_IN4
)) {
2475 error
= netdev_linux_get_ipv4(netdev_
, &netdev
->address
,
2476 SIOCGIFADDR
, "SIOCGIFADDR");
2478 error
= netdev_linux_get_ipv4(netdev_
, &netdev
->netmask
,
2479 SIOCGIFNETMASK
, "SIOCGIFNETMASK");
2481 netdev
->in4_error
= error
;
2482 netdev
->cache_valid
|= VALID_IN4
;
2484 error
= netdev
->in4_error
;
2488 if (netdev
->address
.s_addr
!= INADDR_ANY
) {
2489 *address
= netdev
->address
;
2490 *netmask
= netdev
->netmask
;
2492 error
= EADDRNOTAVAIL
;
2495 ovs_mutex_unlock(&netdev
->mutex
);
2501 netdev_linux_set_in4(struct netdev
*netdev_
, struct in_addr address
,
2502 struct in_addr netmask
)
2504 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2507 ovs_mutex_lock(&netdev
->mutex
);
2508 error
= do_set_addr(netdev_
, SIOCSIFADDR
, "SIOCSIFADDR", address
);
2510 netdev
->address
= address
;
2511 netdev
->netmask
= netmask
;
2512 if (address
.s_addr
!= INADDR_ANY
) {
2513 error
= do_set_addr(netdev_
, SIOCSIFNETMASK
,
2514 "SIOCSIFNETMASK", netmask
);
2519 netdev
->cache_valid
|= VALID_IN4
;
2520 netdev
->in4_error
= 0;
2522 netdev
->cache_valid
&= ~VALID_IN4
;
2524 ovs_mutex_unlock(&netdev
->mutex
);
/* Parses one line of /proc/net/if_inet6: 32 hex digits of IPv6 address,
 * followed by index/prefix/scope/flags (skipped) and the interface name.
 * Stores the address into '*in6' and the name into 'ifname'.  Returns true
 * on a successful parse.
 *
 * Fix: the X8 helper macro was previously left defined, leaking into the
 * rest of the translation unit; it is now #undef'd after use. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *s6 = in6->s6_addr;
#define X8 "%2"SCNx8
    return ovs_scan(line,
                    " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                    "%*x %*x %*x %*x %16s\n",
                    &s6[0], &s6[1], &s6[2], &s6[3],
                    &s6[4], &s6[5], &s6[6], &s6[7],
                    &s6[8], &s6[9], &s6[10], &s6[11],
                    &s6[12], &s6[13], &s6[14], &s6[15],
                    ifname);
#undef X8
}
2545 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address.
2546 * Otherwise, sets '*in6' to 'in6addr_any' and returns the corresponding
2549 netdev_linux_get_in6(const struct netdev
*netdev_
, struct in6_addr
*in6
)
2551 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2554 ovs_mutex_lock(&netdev
->mutex
);
2555 if (!(netdev
->cache_valid
& VALID_IN6
)) {
2559 netdev
->in6
= in6addr_any
;
2560 netdev
->in6_error
= EADDRNOTAVAIL
;
2562 file
= fopen("/proc/net/if_inet6", "r");
2564 const char *name
= netdev_get_name(netdev_
);
2565 while (fgets(line
, sizeof line
, file
)) {
2566 struct in6_addr in6_tmp
;
2567 char ifname
[16 + 1];
2568 if (parse_if_inet6_line(line
, &in6_tmp
, ifname
)
2569 && !strcmp(name
, ifname
))
2571 netdev
->in6
= in6_tmp
;
2572 netdev
->in6_error
= 0;
2578 netdev
->in6_error
= EOPNOTSUPP
;
2580 netdev
->cache_valid
|= VALID_IN6
;
2583 error
= netdev
->in6_error
;
2584 ovs_mutex_unlock(&netdev
->mutex
);
/* Fills '*sa' with an AF_INET sockaddr carrying 'addr' (port zero); the
 * remainder of '*sa' is cleared. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin;

    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr = addr;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
2603 do_set_addr(struct netdev
*netdev
,
2604 int ioctl_nr
, const char *ioctl_name
, struct in_addr addr
)
2608 make_in4_sockaddr(&ifr
.ifr_addr
, addr
);
2609 return af_inet_ifreq_ioctl(netdev_get_name(netdev
), &ifr
, ioctl_nr
,
2613 /* Adds 'router' as a default IP gateway. */
2615 netdev_linux_add_router(struct netdev
*netdev OVS_UNUSED
, struct in_addr router
)
2617 struct in_addr any
= { INADDR_ANY
};
2621 memset(&rt
, 0, sizeof rt
);
2622 make_in4_sockaddr(&rt
.rt_dst
, any
);
2623 make_in4_sockaddr(&rt
.rt_gateway
, router
);
2624 make_in4_sockaddr(&rt
.rt_genmask
, any
);
2625 rt
.rt_flags
= RTF_UP
| RTF_GATEWAY
;
2626 error
= af_inet_ioctl(SIOCADDRT
, &rt
);
2628 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error
));
2634 netdev_linux_get_next_hop(const struct in_addr
*host
, struct in_addr
*next_hop
,
2637 static const char fn
[] = "/proc/net/route";
2642 *netdev_name
= NULL
;
2643 stream
= fopen(fn
, "r");
2644 if (stream
== NULL
) {
2645 VLOG_WARN_RL(&rl
, "%s: open failed: %s", fn
, ovs_strerror(errno
));
2650 while (fgets(line
, sizeof line
, stream
)) {
2653 ovs_be32 dest
, gateway
, mask
;
2654 int refcnt
, metric
, mtu
;
2655 unsigned int flags
, use
, window
, irtt
;
2658 "%16s %"SCNx32
" %"SCNx32
" %04X %d %u %d %"SCNx32
2660 iface
, &dest
, &gateway
, &flags
, &refcnt
,
2661 &use
, &metric
, &mask
, &mtu
, &window
, &irtt
)) {
2662 VLOG_WARN_RL(&rl
, "%s: could not parse line %d: %s",
2666 if (!(flags
& RTF_UP
)) {
2667 /* Skip routes that aren't up. */
2671 /* The output of 'dest', 'mask', and 'gateway' were given in
2672 * network byte order, so we don't need need any endian
2673 * conversions here. */
2674 if ((dest
& mask
) == (host
->s_addr
& mask
)) {
2676 /* The host is directly reachable. */
2677 next_hop
->s_addr
= 0;
2679 /* To reach the host, we must go through a gateway. */
2680 next_hop
->s_addr
= gateway
;
2682 *netdev_name
= xstrdup(iface
);
2694 netdev_linux_get_status(const struct netdev
*netdev_
, struct smap
*smap
)
2696 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2699 ovs_mutex_lock(&netdev
->mutex
);
2700 if (!(netdev
->cache_valid
& VALID_DRVINFO
)) {
2701 struct ethtool_cmd
*cmd
= (struct ethtool_cmd
*) &netdev
->drvinfo
;
2703 COVERAGE_INC(netdev_get_ethtool
);
2704 memset(&netdev
->drvinfo
, 0, sizeof netdev
->drvinfo
);
2705 error
= netdev_linux_do_ethtool(netdev
->up
.name
,
2708 "ETHTOOL_GDRVINFO");
2710 netdev
->cache_valid
|= VALID_DRVINFO
;
2715 smap_add(smap
, "driver_name", netdev
->drvinfo
.driver
);
2716 smap_add(smap
, "driver_version", netdev
->drvinfo
.version
);
2717 smap_add(smap
, "firmware_version", netdev
->drvinfo
.fw_version
);
2719 ovs_mutex_unlock(&netdev
->mutex
);
2725 netdev_internal_get_status(const struct netdev
*netdev OVS_UNUSED
,
2728 smap_add(smap
, "driver_name", "openvswitch");
2732 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2733 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2734 * returns 0. Otherwise, it returns a positive errno value; in particular,
2735 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2737 netdev_linux_arp_lookup(const struct netdev
*netdev
,
2738 ovs_be32 ip
, struct eth_addr
*mac
)
2741 struct sockaddr_in sin
;
2744 memset(&r
, 0, sizeof r
);
2745 memset(&sin
, 0, sizeof sin
);
2746 sin
.sin_family
= AF_INET
;
2747 sin
.sin_addr
.s_addr
= ip
;
2749 memcpy(&r
.arp_pa
, &sin
, sizeof sin
);
2750 r
.arp_ha
.sa_family
= ARPHRD_ETHER
;
2752 ovs_strzcpy(r
.arp_dev
, netdev_get_name(netdev
), sizeof r
.arp_dev
);
2753 COVERAGE_INC(netdev_arp_lookup
);
2754 retval
= af_inet_ioctl(SIOCGARP
, &r
);
2756 memcpy(mac
, r
.arp_ha
.sa_data
, ETH_ADDR_LEN
);
2757 } else if (retval
!= ENXIO
) {
2758 VLOG_WARN_RL(&rl
, "%s: could not look up ARP entry for "IP_FMT
": %s",
2759 netdev_get_name(netdev
), IP_ARGS(ip
),
2760 ovs_strerror(retval
));
2766 nd_to_iff_flags(enum netdev_flags nd
)
2769 if (nd
& NETDEV_UP
) {
2772 if (nd
& NETDEV_PROMISC
) {
2775 if (nd
& NETDEV_LOOPBACK
) {
2776 iff
|= IFF_LOOPBACK
;
2782 iff_to_nd_flags(int iff
)
2784 enum netdev_flags nd
= 0;
2788 if (iff
& IFF_PROMISC
) {
2789 nd
|= NETDEV_PROMISC
;
2791 if (iff
& IFF_LOOPBACK
) {
2792 nd
|= NETDEV_LOOPBACK
;
2798 update_flags(struct netdev_linux
*netdev
, enum netdev_flags off
,
2799 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
2800 OVS_REQUIRES(netdev
->mutex
)
2802 int old_flags
, new_flags
;
2805 old_flags
= netdev
->ifi_flags
;
2806 *old_flagsp
= iff_to_nd_flags(old_flags
);
2807 new_flags
= (old_flags
& ~nd_to_iff_flags(off
)) | nd_to_iff_flags(on
);
2808 if (new_flags
!= old_flags
) {
2809 error
= set_flags(netdev_get_name(&netdev
->up
), new_flags
);
2810 get_flags(&netdev
->up
, &netdev
->ifi_flags
);
2817 netdev_linux_update_flags(struct netdev
*netdev_
, enum netdev_flags off
,
2818 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
2820 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2823 ovs_mutex_lock(&netdev
->mutex
);
2824 error
= update_flags(netdev
, off
, on
, old_flagsp
);
2825 ovs_mutex_unlock(&netdev
->mutex
);
2830 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, \
2831 GET_FEATURES, GET_STATUS) \
2837 netdev_linux_wait, \
2839 netdev_linux_alloc, \
2841 netdev_linux_destruct, \
2842 netdev_linux_dealloc, \
2843 NULL, /* get_config */ \
2844 NULL, /* set_config */ \
2845 NULL, /* get_tunnel_config */ \
2846 NULL, /* build header */ \
2847 NULL, /* push header */ \
2848 NULL, /* pop header */ \
2849 NULL, /* get_numa_id */ \
2850 NULL, /* set_multiq */ \
2852 netdev_linux_send, \
2853 netdev_linux_send_wait, \
2855 netdev_linux_set_etheraddr, \
2856 netdev_linux_get_etheraddr, \
2857 netdev_linux_get_mtu, \
2858 netdev_linux_set_mtu, \
2859 netdev_linux_get_ifindex, \
2860 netdev_linux_get_carrier, \
2861 netdev_linux_get_carrier_resets, \
2862 netdev_linux_set_miimon_interval, \
2866 netdev_linux_set_advertisements, \
2868 netdev_linux_set_policing, \
2869 netdev_linux_get_qos_types, \
2870 netdev_linux_get_qos_capabilities, \
2871 netdev_linux_get_qos, \
2872 netdev_linux_set_qos, \
2873 netdev_linux_get_queue, \
2874 netdev_linux_set_queue, \
2875 netdev_linux_delete_queue, \
2876 netdev_linux_get_queue_stats, \
2877 netdev_linux_queue_dump_start, \
2878 netdev_linux_queue_dump_next, \
2879 netdev_linux_queue_dump_done, \
2880 netdev_linux_dump_queue_stats, \
2882 netdev_linux_get_in4, \
2883 netdev_linux_set_in4, \
2884 netdev_linux_get_in6, \
2885 netdev_linux_add_router, \
2886 netdev_linux_get_next_hop, \
2888 netdev_linux_arp_lookup, \
2890 netdev_linux_update_flags, \
2892 netdev_linux_rxq_alloc, \
2893 netdev_linux_rxq_construct, \
2894 netdev_linux_rxq_destruct, \
2895 netdev_linux_rxq_dealloc, \
2896 netdev_linux_rxq_recv, \
2897 netdev_linux_rxq_wait, \
2898 netdev_linux_rxq_drain, \
2901 const struct netdev_class netdev_linux_class
=
2904 netdev_linux_construct
,
2905 netdev_linux_get_stats
,
2906 netdev_linux_get_features
,
2907 netdev_linux_get_status
);
2909 const struct netdev_class netdev_tap_class
=
2912 netdev_linux_construct_tap
,
2913 netdev_tap_get_stats
,
2914 netdev_linux_get_features
,
2915 netdev_linux_get_status
);
2917 const struct netdev_class netdev_internal_class
=
2920 netdev_linux_construct
,
2921 netdev_internal_get_stats
,
2922 NULL
, /* get_features */
2923 netdev_internal_get_status
);
2926 #define CODEL_N_QUEUES 0x0000
2928 /* In sufficiently new kernel headers these are defined as enums in
2929 * <linux/pkt_sched.h>. Define them here as macros to help out with older
2930 * kernels. (This overrides any enum definition in the header file but that's
2932 #define TCA_CODEL_TARGET 1
2933 #define TCA_CODEL_LIMIT 2
2934 #define TCA_CODEL_INTERVAL 3
2943 static struct codel
*
2944 codel_get__(const struct netdev
*netdev_
)
2946 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2947 return CONTAINER_OF(netdev
->tc
, struct codel
, tc
);
2951 codel_install__(struct netdev
*netdev_
, uint32_t target
, uint32_t limit
,
2954 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2955 struct codel
*codel
;
2957 codel
= xmalloc(sizeof *codel
);
2958 tc_init(&codel
->tc
, &tc_ops_codel
);
2959 codel
->target
= target
;
2960 codel
->limit
= limit
;
2961 codel
->interval
= interval
;
2963 netdev
->tc
= &codel
->tc
;
2967 codel_setup_qdisc__(struct netdev
*netdev
, uint32_t target
, uint32_t limit
,
2971 struct ofpbuf request
;
2972 struct tcmsg
*tcmsg
;
2973 uint32_t otarget
, olimit
, ointerval
;
2976 tc_del_qdisc(netdev
);
2978 tcmsg
= tc_make_request(netdev
, RTM_NEWQDISC
,
2979 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
2983 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
2984 tcmsg
->tcm_parent
= TC_H_ROOT
;
2986 otarget
= target
? target
: 5000;
2987 olimit
= limit
? limit
: 10240;
2988 ointerval
= interval
? interval
: 100000;
2990 nl_msg_put_string(&request
, TCA_KIND
, "codel");
2991 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
2992 nl_msg_put_u32(&request
, TCA_CODEL_TARGET
, otarget
);
2993 nl_msg_put_u32(&request
, TCA_CODEL_LIMIT
, olimit
);
2994 nl_msg_put_u32(&request
, TCA_CODEL_INTERVAL
, ointerval
);
2995 nl_msg_end_nested(&request
, opt_offset
);
2997 error
= tc_transact(&request
, NULL
);
2999 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
3000 "target %u, limit %u, interval %u error %d(%s)",
3001 netdev_get_name(netdev
),
3002 otarget
, olimit
, ointerval
,
3003 error
, ovs_strerror(error
));
3009 codel_parse_qdisc_details__(struct netdev
*netdev OVS_UNUSED
,
3010 const struct smap
*details
, struct codel
*codel
)
3012 const char *target_s
;
3013 const char *limit_s
;
3014 const char *interval_s
;
3016 target_s
= smap_get(details
, "target");
3017 limit_s
= smap_get(details
, "limit");
3018 interval_s
= smap_get(details
, "interval");
3020 codel
->target
= target_s
? strtoull(target_s
, NULL
, 10) : 0;
3021 codel
->limit
= limit_s
? strtoull(limit_s
, NULL
, 10) : 0;
3022 codel
->interval
= interval_s
? strtoull(interval_s
, NULL
, 10) : 0;
3024 if (!codel
->target
) {
3025 codel
->target
= 5000;
3027 if (!codel
->limit
) {
3028 codel
->limit
= 10240;
3030 if (!codel
->interval
) {
3031 codel
->interval
= 100000;
3036 codel_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3041 codel_parse_qdisc_details__(netdev
, details
, &codel
);
3042 error
= codel_setup_qdisc__(netdev
, codel
.target
, codel
.limit
,
3045 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
3051 codel_parse_tca_options__(struct nlattr
*nl_options
, struct codel
*codel
)
3053 static const struct nl_policy tca_codel_policy
[] = {
3054 [TCA_CODEL_TARGET
] = { .type
= NL_A_U32
},
3055 [TCA_CODEL_LIMIT
] = { .type
= NL_A_U32
},
3056 [TCA_CODEL_INTERVAL
] = { .type
= NL_A_U32
}
3059 struct nlattr
*attrs
[ARRAY_SIZE(tca_codel_policy
)];
3061 if (!nl_parse_nested(nl_options
, tca_codel_policy
,
3062 attrs
, ARRAY_SIZE(tca_codel_policy
))) {
3063 VLOG_WARN_RL(&rl
, "failed to parse CoDel class options");
3067 codel
->target
= nl_attr_get_u32(attrs
[TCA_CODEL_TARGET
]);
3068 codel
->limit
= nl_attr_get_u32(attrs
[TCA_CODEL_LIMIT
]);
3069 codel
->interval
= nl_attr_get_u32(attrs
[TCA_CODEL_INTERVAL
]);
3074 codel_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
3076 struct nlattr
*nlattr
;
3081 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
3086 error
= codel_parse_tca_options__(nlattr
, &codel
);
3091 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
3097 codel_tc_destroy(struct tc
*tc
)
3099 struct codel
*codel
= CONTAINER_OF(tc
, struct codel
, tc
);
3105 codel_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
3107 const struct codel
*codel
= codel_get__(netdev
);
3108 smap_add_format(details
, "target", "%u", codel
->target
);
3109 smap_add_format(details
, "limit", "%u", codel
->limit
);
3110 smap_add_format(details
, "interval", "%u", codel
->interval
);
3115 codel_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
3119 codel_parse_qdisc_details__(netdev
, details
, &codel
);
3120 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
3121 codel_get__(netdev
)->target
= codel
.target
;
3122 codel_get__(netdev
)->limit
= codel
.limit
;
3123 codel_get__(netdev
)->interval
= codel
.interval
;
3127 static const struct tc_ops tc_ops_codel
= {
3128 "codel", /* linux_name */
3129 "linux-codel", /* ovs_name */
3130 CODEL_N_QUEUES
, /* n_queues */
3143 /* FQ-CoDel traffic control class. */
3145 #define FQCODEL_N_QUEUES 0x0000
3147 /* In sufficiently new kernel headers these are defined as enums in
3148 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3149 * kernels. (This overrides any enum definition in the header file but that's
3151 #define TCA_FQ_CODEL_TARGET 1
3152 #define TCA_FQ_CODEL_LIMIT 2
3153 #define TCA_FQ_CODEL_INTERVAL 3
3154 #define TCA_FQ_CODEL_ECN 4
3155 #define TCA_FQ_CODEL_FLOWS 5
3156 #define TCA_FQ_CODEL_QUANTUM 6
3167 static struct fqcodel
*
3168 fqcodel_get__(const struct netdev
*netdev_
)
3170 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3171 return CONTAINER_OF(netdev
->tc
, struct fqcodel
, tc
);
3175 fqcodel_install__(struct netdev
*netdev_
, uint32_t target
, uint32_t limit
,
3176 uint32_t interval
, uint32_t flows
, uint32_t quantum
)
3178 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3179 struct fqcodel
*fqcodel
;
3181 fqcodel
= xmalloc(sizeof *fqcodel
);
3182 tc_init(&fqcodel
->tc
, &tc_ops_fqcodel
);
3183 fqcodel
->target
= target
;
3184 fqcodel
->limit
= limit
;
3185 fqcodel
->interval
= interval
;
3186 fqcodel
->flows
= flows
;
3187 fqcodel
->quantum
= quantum
;
3189 netdev
->tc
= &fqcodel
->tc
;
3193 fqcodel_setup_qdisc__(struct netdev
*netdev
, uint32_t target
, uint32_t limit
,
3194 uint32_t interval
, uint32_t flows
, uint32_t quantum
)
3197 struct ofpbuf request
;
3198 struct tcmsg
*tcmsg
;
3199 uint32_t otarget
, olimit
, ointerval
, oflows
, oquantum
;
3202 tc_del_qdisc(netdev
);
3204 tcmsg
= tc_make_request(netdev
, RTM_NEWQDISC
,
3205 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3209 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3210 tcmsg
->tcm_parent
= TC_H_ROOT
;
3212 otarget
= target
? target
: 5000;
3213 olimit
= limit
? limit
: 10240;
3214 ointerval
= interval
? interval
: 100000;
3215 oflows
= flows
? flows
: 1024;
3216 oquantum
= quantum
? quantum
: 1514; /* fq_codel default quantum is 1514
3219 nl_msg_put_string(&request
, TCA_KIND
, "fq_codel");
3220 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
3221 nl_msg_put_u32(&request
, TCA_FQ_CODEL_TARGET
, otarget
);
3222 nl_msg_put_u32(&request
, TCA_FQ_CODEL_LIMIT
, olimit
);
3223 nl_msg_put_u32(&request
, TCA_FQ_CODEL_INTERVAL
, ointerval
);
3224 nl_msg_put_u32(&request
, TCA_FQ_CODEL_FLOWS
, oflows
);
3225 nl_msg_put_u32(&request
, TCA_FQ_CODEL_QUANTUM
, oquantum
);
3226 nl_msg_end_nested(&request
, opt_offset
);
3228 error
= tc_transact(&request
, NULL
);
3230 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
3231 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3232 netdev_get_name(netdev
),
3233 otarget
, olimit
, ointerval
, oflows
, oquantum
,
3234 error
, ovs_strerror(error
));
3240 fqcodel_parse_qdisc_details__(struct netdev
*netdev OVS_UNUSED
,
3241 const struct smap
*details
, struct fqcodel
*fqcodel
)
3243 const char *target_s
;
3244 const char *limit_s
;
3245 const char *interval_s
;
3246 const char *flows_s
;
3247 const char *quantum_s
;
3249 target_s
= smap_get(details
, "target");
3250 limit_s
= smap_get(details
, "limit");
3251 interval_s
= smap_get(details
, "interval");
3252 flows_s
= smap_get(details
, "flows");
3253 quantum_s
= smap_get(details
, "quantum");
3254 fqcodel
->target
= target_s
? strtoull(target_s
, NULL
, 10) : 0;
3255 fqcodel
->limit
= limit_s
? strtoull(limit_s
, NULL
, 10) : 0;
3256 fqcodel
->interval
= interval_s
? strtoull(interval_s
, NULL
, 10) : 0;
3257 fqcodel
->flows
= flows_s
? strtoull(flows_s
, NULL
, 10) : 0;
3258 fqcodel
->quantum
= quantum_s
? strtoull(quantum_s
, NULL
, 10) : 0;
3259 if (!fqcodel
->target
) {
3260 fqcodel
->target
= 5000;
3262 if (!fqcodel
->limit
) {
3263 fqcodel
->limit
= 10240;
3265 if (!fqcodel
->interval
) {
3266 fqcodel
->interval
= 1000000;
3268 if (!fqcodel
->flows
) {
3269 fqcodel
->flows
= 1024;
3271 if (!fqcodel
->quantum
) {
3272 fqcodel
->quantum
= 1514;
3277 fqcodel_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3280 struct fqcodel fqcodel
;
3282 fqcodel_parse_qdisc_details__(netdev
, details
, &fqcodel
);
3283 error
= fqcodel_setup_qdisc__(netdev
, fqcodel
.target
, fqcodel
.limit
,
3284 fqcodel
.interval
, fqcodel
.flows
,
3287 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
,
3288 fqcodel
.interval
, fqcodel
.flows
, fqcodel
.quantum
);
3294 fqcodel_parse_tca_options__(struct nlattr
*nl_options
, struct fqcodel
*fqcodel
)
3296 static const struct nl_policy tca_fqcodel_policy
[] = {
3297 [TCA_FQ_CODEL_TARGET
] = { .type
= NL_A_U32
},
3298 [TCA_FQ_CODEL_LIMIT
] = { .type
= NL_A_U32
},
3299 [TCA_FQ_CODEL_INTERVAL
] = { .type
= NL_A_U32
},
3300 [TCA_FQ_CODEL_FLOWS
] = { .type
= NL_A_U32
},
3301 [TCA_FQ_CODEL_QUANTUM
] = { .type
= NL_A_U32
}
3304 struct nlattr
*attrs
[ARRAY_SIZE(tca_fqcodel_policy
)];
3306 if (!nl_parse_nested(nl_options
, tca_fqcodel_policy
,
3307 attrs
, ARRAY_SIZE(tca_fqcodel_policy
))) {
3308 VLOG_WARN_RL(&rl
, "failed to parse FQ_CoDel class options");
3312 fqcodel
->target
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_TARGET
]);
3313 fqcodel
->limit
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_LIMIT
]);
3314 fqcodel
->interval
=nl_attr_get_u32(attrs
[TCA_FQ_CODEL_INTERVAL
]);
3315 fqcodel
->flows
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_FLOWS
]);
3316 fqcodel
->quantum
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_QUANTUM
]);
3321 fqcodel_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
3323 struct nlattr
*nlattr
;
3326 struct fqcodel fqcodel
;
3328 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
3333 error
= fqcodel_parse_tca_options__(nlattr
, &fqcodel
);
3338 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
, fqcodel
.interval
,
3339 fqcodel
.flows
, fqcodel
.quantum
);
3344 fqcodel_tc_destroy(struct tc
*tc
)
3346 struct fqcodel
*fqcodel
= CONTAINER_OF(tc
, struct fqcodel
, tc
);
3352 fqcodel_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
3354 const struct fqcodel
*fqcodel
= fqcodel_get__(netdev
);
3355 smap_add_format(details
, "target", "%u", fqcodel
->target
);
3356 smap_add_format(details
, "limit", "%u", fqcodel
->limit
);
3357 smap_add_format(details
, "interval", "%u", fqcodel
->interval
);
3358 smap_add_format(details
, "flows", "%u", fqcodel
->flows
);
3359 smap_add_format(details
, "quantum", "%u", fqcodel
->quantum
);
3364 fqcodel_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
3366 struct fqcodel fqcodel
;
3368 fqcodel_parse_qdisc_details__(netdev
, details
, &fqcodel
);
3369 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
, fqcodel
.interval
,
3370 fqcodel
.flows
, fqcodel
.quantum
);
3371 fqcodel_get__(netdev
)->target
= fqcodel
.target
;
3372 fqcodel_get__(netdev
)->limit
= fqcodel
.limit
;
3373 fqcodel_get__(netdev
)->interval
= fqcodel
.interval
;
3374 fqcodel_get__(netdev
)->flows
= fqcodel
.flows
;
3375 fqcodel_get__(netdev
)->quantum
= fqcodel
.quantum
;
3379 static const struct tc_ops tc_ops_fqcodel
= {
3380 "fq_codel", /* linux_name */
3381 "linux-fq_codel", /* ovs_name */
3382 FQCODEL_N_QUEUES
, /* n_queues */
3395 /* SFQ traffic control class. */
3397 #define SFQ_N_QUEUES 0x0000
3406 sfq_get__(const struct netdev
*netdev_
)
3408 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3409 return CONTAINER_OF(netdev
->tc
, struct sfq
, tc
);
3413 sfq_install__(struct netdev
*netdev_
, uint32_t quantum
, uint32_t perturb
)
3415 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3418 sfq
= xmalloc(sizeof *sfq
);
3419 tc_init(&sfq
->tc
, &tc_ops_sfq
);
3420 sfq
->perturb
= perturb
;
3421 sfq
->quantum
= quantum
;
3423 netdev
->tc
= &sfq
->tc
;
3427 sfq_setup_qdisc__(struct netdev
*netdev
, uint32_t quantum
, uint32_t perturb
)
3429 struct tc_sfq_qopt opt
;
3430 struct ofpbuf request
;
3431 struct tcmsg
*tcmsg
;
3433 int mtu_error
, error
;
3434 mtu_error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
3436 tc_del_qdisc(netdev
);
3438 tcmsg
= tc_make_request(netdev
, RTM_NEWQDISC
,
3439 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3443 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3444 tcmsg
->tcm_parent
= TC_H_ROOT
;
3446 memset(&opt
, 0, sizeof opt
);
3449 opt
.quantum
= mtu
; /* if we cannot find mtu, use default */
3452 opt
.quantum
= quantum
;
3456 opt
.perturb_period
= 10;
3458 opt
.perturb_period
= perturb
;
3461 nl_msg_put_string(&request
, TCA_KIND
, "sfq");
3462 nl_msg_put_unspec(&request
, TCA_OPTIONS
, &opt
, sizeof opt
);
3464 error
= tc_transact(&request
, NULL
);
3466 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
3467 "quantum %u, perturb %u error %d(%s)",
3468 netdev_get_name(netdev
),
3469 opt
.quantum
, opt
.perturb_period
,
3470 error
, ovs_strerror(error
));
3476 sfq_parse_qdisc_details__(struct netdev
*netdev
,
3477 const struct smap
*details
, struct sfq
*sfq
)
3479 const char *perturb_s
;
3480 const char *quantum_s
;
3484 perturb_s
= smap_get(details
, "perturb");
3485 quantum_s
= smap_get(details
, "quantum");
3486 sfq
->perturb
= perturb_s
? strtoull(perturb_s
, NULL
, 10) : 0;
3487 sfq
->quantum
= quantum_s
? strtoull(quantum_s
, NULL
, 10) : 0;
3488 if (!sfq
->perturb
) {
3492 if (!sfq
->quantum
) {
3493 mtu_error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
3497 VLOG_WARN_RL(&rl
, "when using SFQ, you must specify quantum on a "
3498 "device without mtu");
3505 sfq_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3510 sfq_parse_qdisc_details__(netdev
, details
, &sfq
);
3511 error
= sfq_setup_qdisc__(netdev
, sfq
.quantum
, sfq
.perturb
);
3513 sfq_install__(netdev
, sfq
.quantum
, sfq
.perturb
);
/* tc_ops hook: adopts an existing kernel sfq qdisc from 'nlmsg'.
 *
 * Fix: the original called sfq_install__(netdev, sfq->perturb_period,
 * sfq->quantum), but sfq_install__'s parameters are (netdev_, quantum,
 * perturb) — the two values were swapped, so a loaded qdisc recorded its
 * perturb period as the quantum and vice versa.  Pass them in the correct
 * order. */
static int
sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
{
    const struct tc_sfq_qopt *sfq;
    struct nlattr *nlattr;
    const char *kind;
    int error;

    error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
    if (!error) {
        sfq = nl_attr_get(nlattr);
        sfq_install__(netdev, sfq->quantum, sfq->perturb_period);
        return 0;
    }

    return error;
}
3537 sfq_tc_destroy(struct tc
*tc
)
3539 struct sfq
*sfq
= CONTAINER_OF(tc
, struct sfq
, tc
);
3545 sfq_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
3547 const struct sfq
*sfq
= sfq_get__(netdev
);
3548 smap_add_format(details
, "quantum", "%u", sfq
->quantum
);
3549 smap_add_format(details
, "perturb", "%u", sfq
->perturb
);
3554 sfq_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
3558 sfq_parse_qdisc_details__(netdev
, details
, &sfq
);
3559 sfq_install__(netdev
, sfq
.quantum
, sfq
.perturb
);
3560 sfq_get__(netdev
)->quantum
= sfq
.quantum
;
3561 sfq_get__(netdev
)->perturb
= sfq
.perturb
;
3565 static const struct tc_ops tc_ops_sfq
= {
3566 "sfq", /* linux_name */
3567 "linux-sfq", /* ovs_name */
3568 SFQ_N_QUEUES
, /* n_queues */
3581 /* HTB traffic control class. */
3583 #define HTB_N_QUEUES 0xf000
3584 #define HTB_RATE2QUANTUM 10
3588 unsigned int max_rate
; /* In bytes/s. */
3592 struct tc_queue tc_queue
;
3593 unsigned int min_rate
; /* In bytes/s. */
3594 unsigned int max_rate
; /* In bytes/s. */
3595 unsigned int burst
; /* In bytes. */
3596 unsigned int priority
; /* Lower values are higher priorities. */
3600 htb_get__(const struct netdev
*netdev_
)
3602 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3603 return CONTAINER_OF(netdev
->tc
, struct htb
, tc
);
3607 htb_install__(struct netdev
*netdev_
, uint64_t max_rate
)
3609 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3612 htb
= xmalloc(sizeof *htb
);
3613 tc_init(&htb
->tc
, &tc_ops_htb
);
3614 htb
->max_rate
= max_rate
;
3616 netdev
->tc
= &htb
->tc
;
3619 /* Create an HTB qdisc.
3621 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
3623 htb_setup_qdisc__(struct netdev
*netdev
)
3626 struct tc_htb_glob opt
;
3627 struct ofpbuf request
;
3628 struct tcmsg
*tcmsg
;
3630 tc_del_qdisc(netdev
);
3632 tcmsg
= tc_make_request(netdev
, RTM_NEWQDISC
,
3633 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3637 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3638 tcmsg
->tcm_parent
= TC_H_ROOT
;
3640 nl_msg_put_string(&request
, TCA_KIND
, "htb");
3642 memset(&opt
, 0, sizeof opt
);
3643 opt
.rate2quantum
= HTB_RATE2QUANTUM
;
3647 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
3648 nl_msg_put_unspec(&request
, TCA_HTB_INIT
, &opt
, sizeof opt
);
3649 nl_msg_end_nested(&request
, opt_offset
);
3651 return tc_transact(&request
, NULL
);
3654 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
3655 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
3657 htb_setup_class__(struct netdev
*netdev
, unsigned int handle
,
3658 unsigned int parent
, struct htb_class
*class)
3661 struct tc_htb_opt opt
;
3662 struct ofpbuf request
;
3663 struct tcmsg
*tcmsg
;
3667 error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
3669 VLOG_WARN_RL(&rl
, "cannot set up HTB on device %s that lacks MTU",
3670 netdev_get_name(netdev
));
3674 memset(&opt
, 0, sizeof opt
);
3675 tc_fill_rate(&opt
.rate
, class->min_rate
, mtu
);
3676 tc_fill_rate(&opt
.ceil
, class->max_rate
, mtu
);
3677 /* Makes sure the quantum is at least MTU. Setting quantum will
3678 * make htb ignore the r2q for this class. */
3679 if ((class->min_rate
/ HTB_RATE2QUANTUM
) < mtu
) {
3682 opt
.buffer
= tc_calc_buffer(opt
.rate
.rate
, mtu
, class->burst
);
3683 opt
.cbuffer
= tc_calc_buffer(opt
.ceil
.rate
, mtu
, class->burst
);
3684 opt
.prio
= class->priority
;
3686 tcmsg
= tc_make_request(netdev
, RTM_NEWTCLASS
, NLM_F_CREATE
, &request
);
3690 tcmsg
->tcm_handle
= handle
;
3691 tcmsg
->tcm_parent
= parent
;
3693 nl_msg_put_string(&request
, TCA_KIND
, "htb");
3694 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
3695 nl_msg_put_unspec(&request
, TCA_HTB_PARMS
, &opt
, sizeof opt
);
3696 tc_put_rtab(&request
, TCA_HTB_RTAB
, &opt
.rate
);
3697 tc_put_rtab(&request
, TCA_HTB_CTAB
, &opt
.ceil
);
3698 nl_msg_end_nested(&request
, opt_offset
);
3700 error
= tc_transact(&request
, NULL
);
3702 VLOG_WARN_RL(&rl
, "failed to replace %s class %u:%u, parent %u:%u, "
3703 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
3704 netdev_get_name(netdev
),
3705 tc_get_major(handle
), tc_get_minor(handle
),
3706 tc_get_major(parent
), tc_get_minor(parent
),
3707 class->min_rate
, class->max_rate
,
3708 class->burst
, class->priority
, ovs_strerror(error
));
3713 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
3714 * description of them into 'details'. The description complies with the
3715 * specification given in the vswitch database documentation for linux-htb
3718 htb_parse_tca_options__(struct nlattr
*nl_options
, struct htb_class
*class)
3720 static const struct nl_policy tca_htb_policy
[] = {
3721 [TCA_HTB_PARMS
] = { .type
= NL_A_UNSPEC
, .optional
= false,
3722 .min_len
= sizeof(struct tc_htb_opt
) },
3725 struct nlattr
*attrs
[ARRAY_SIZE(tca_htb_policy
)];
3726 const struct tc_htb_opt
*htb
;
3728 if (!nl_parse_nested(nl_options
, tca_htb_policy
,
3729 attrs
, ARRAY_SIZE(tca_htb_policy
))) {
3730 VLOG_WARN_RL(&rl
, "failed to parse HTB class options");
3734 htb
= nl_attr_get(attrs
[TCA_HTB_PARMS
]);
3735 class->min_rate
= htb
->rate
.rate
;
3736 class->max_rate
= htb
->ceil
.rate
;
3737 class->burst
= tc_ticks_to_bytes(htb
->rate
.rate
, htb
->buffer
);
3738 class->priority
= htb
->prio
;
3743 htb_parse_tcmsg__(struct ofpbuf
*tcmsg
, unsigned int *queue_id
,
3744 struct htb_class
*options
,
3745 struct netdev_queue_stats
*stats
)
3747 struct nlattr
*nl_options
;
3748 unsigned int handle
;
3751 error
= tc_parse_class(tcmsg
, &handle
, &nl_options
, stats
);
3752 if (!error
&& queue_id
) {
3753 unsigned int major
= tc_get_major(handle
);
3754 unsigned int minor
= tc_get_minor(handle
);
3755 if (major
== 1 && minor
> 0 && minor
<= HTB_N_QUEUES
) {
3756 *queue_id
= minor
- 1;
3761 if (!error
&& options
) {
3762 error
= htb_parse_tca_options__(nl_options
, options
);
3768 htb_parse_qdisc_details__(struct netdev
*netdev_
,
3769 const struct smap
*details
, struct htb_class
*hc
)
3771 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3772 const char *max_rate_s
;
3774 max_rate_s
= smap_get(details
, "max-rate");
3775 hc
->max_rate
= max_rate_s
? strtoull(max_rate_s
, NULL
, 10) / 8 : 0;
3776 if (!hc
->max_rate
) {
3777 enum netdev_features current
;
3779 netdev_linux_read_features(netdev
);
3780 current
= !netdev
->get_features_error
? netdev
->current
: 0;
3781 hc
->max_rate
= netdev_features_to_bps(current
, 100 * 1000 * 1000) / 8;
3783 hc
->min_rate
= hc
->max_rate
;
3789 htb_parse_class_details__(struct netdev
*netdev
,
3790 const struct smap
*details
, struct htb_class
*hc
)
3792 const struct htb
*htb
= htb_get__(netdev
);
3793 const char *min_rate_s
= smap_get(details
, "min-rate");
3794 const char *max_rate_s
= smap_get(details
, "max-rate");
3795 const char *burst_s
= smap_get(details
, "burst");
3796 const char *priority_s
= smap_get(details
, "priority");
3799 error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
3801 VLOG_WARN_RL(&rl
, "cannot parse HTB class on device %s that lacks MTU",
3802 netdev_get_name(netdev
));
3806 /* HTB requires at least an mtu sized min-rate to send any traffic even
3807 * on uncongested links. */
3808 hc
->min_rate
= min_rate_s
? strtoull(min_rate_s
, NULL
, 10) / 8 : 0;
3809 hc
->min_rate
= MAX(hc
->min_rate
, mtu
);
3810 hc
->min_rate
= MIN(hc
->min_rate
, htb
->max_rate
);
3813 hc
->max_rate
= (max_rate_s
3814 ? strtoull(max_rate_s
, NULL
, 10) / 8
3816 hc
->max_rate
= MAX(hc
->max_rate
, hc
->min_rate
);
3817 hc
->max_rate
= MIN(hc
->max_rate
, htb
->max_rate
);
3821 * According to hints in the documentation that I've read, it is important
3822 * that 'burst' be at least as big as the largest frame that might be
3823 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3824 * but having it a bit too small is a problem. Since netdev_get_mtu()
3825 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3826 * the MTU. We actually add 64, instead of 14, as a guard against
3827 * additional headers get tacked on somewhere that we're not aware of. */
3828 hc
->burst
= burst_s
? strtoull(burst_s
, NULL
, 10) / 8 : 0;
3829 hc
->burst
= MAX(hc
->burst
, mtu
+ 64);
3832 hc
->priority
= priority_s
? strtoul(priority_s
, NULL
, 10) : 0;
3838 htb_query_class__(const struct netdev
*netdev
, unsigned int handle
,
3839 unsigned int parent
, struct htb_class
*options
,
3840 struct netdev_queue_stats
*stats
)
3842 struct ofpbuf
*reply
;
3845 error
= tc_query_class(netdev
, handle
, parent
, &reply
);
3847 error
= htb_parse_tcmsg__(reply
, NULL
, options
, stats
);
3848 ofpbuf_delete(reply
);
3854 htb_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3858 error
= htb_setup_qdisc__(netdev
);
3860 struct htb_class hc
;
3862 htb_parse_qdisc_details__(netdev
, details
, &hc
);
3863 error
= htb_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
3864 tc_make_handle(1, 0), &hc
);
3866 htb_install__(netdev
, hc
.max_rate
);
3872 static struct htb_class
*
3873 htb_class_cast__(const struct tc_queue
*queue
)
3875 return CONTAINER_OF(queue
, struct htb_class
, tc_queue
);
3879 htb_update_queue__(struct netdev
*netdev
, unsigned int queue_id
,
3880 const struct htb_class
*hc
)
3882 struct htb
*htb
= htb_get__(netdev
);
3883 size_t hash
= hash_int(queue_id
, 0);
3884 struct tc_queue
*queue
;
3885 struct htb_class
*hcp
;
3887 queue
= tc_find_queue__(netdev
, queue_id
, hash
);
3889 hcp
= htb_class_cast__(queue
);
3891 hcp
= xmalloc(sizeof *hcp
);
3892 queue
= &hcp
->tc_queue
;
3893 queue
->queue_id
= queue_id
;
3894 queue
->created
= time_msec();
3895 hmap_insert(&htb
->tc
.queues
, &queue
->hmap_node
, hash
);
3898 hcp
->min_rate
= hc
->min_rate
;
3899 hcp
->max_rate
= hc
->max_rate
;
3900 hcp
->burst
= hc
->burst
;
3901 hcp
->priority
= hc
->priority
;
3905 htb_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
3908 struct queue_dump_state state
;
3909 struct htb_class hc
;
3911 /* Get qdisc options. */
3913 htb_query_class__(netdev
, tc_make_handle(1, 0xfffe), 0, &hc
, NULL
);
3914 htb_install__(netdev
, hc
.max_rate
);
3917 if (!start_queue_dump(netdev
, &state
)) {
3920 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
3921 unsigned int queue_id
;
3923 if (!htb_parse_tcmsg__(&msg
, &queue_id
, &hc
, NULL
)) {
3924 htb_update_queue__(netdev
, queue_id
, &hc
);
3927 finish_queue_dump(&state
);
3933 htb_tc_destroy(struct tc
*tc
)
3935 struct htb
*htb
= CONTAINER_OF(tc
, struct htb
, tc
);
3936 struct htb_class
*hc
, *next
;
3938 HMAP_FOR_EACH_SAFE (hc
, next
, tc_queue
.hmap_node
, &htb
->tc
.queues
) {
3939 hmap_remove(&htb
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
3947 htb_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
3949 const struct htb
*htb
= htb_get__(netdev
);
3950 smap_add_format(details
, "max-rate", "%llu", 8ULL * htb
->max_rate
);
3955 htb_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
3957 struct htb_class hc
;
3960 htb_parse_qdisc_details__(netdev
, details
, &hc
);
3961 error
= htb_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
3962 tc_make_handle(1, 0), &hc
);
3964 htb_get__(netdev
)->max_rate
= hc
.max_rate
;
3970 htb_class_get(const struct netdev
*netdev OVS_UNUSED
,
3971 const struct tc_queue
*queue
, struct smap
*details
)
3973 const struct htb_class
*hc
= htb_class_cast__(queue
);
3975 smap_add_format(details
, "min-rate", "%llu", 8ULL * hc
->min_rate
);
3976 if (hc
->min_rate
!= hc
->max_rate
) {
3977 smap_add_format(details
, "max-rate", "%llu", 8ULL * hc
->max_rate
);
3979 smap_add_format(details
, "burst", "%llu", 8ULL * hc
->burst
);
3981 smap_add_format(details
, "priority", "%u", hc
->priority
);
3987 htb_class_set(struct netdev
*netdev
, unsigned int queue_id
,
3988 const struct smap
*details
)
3990 struct htb_class hc
;
3993 error
= htb_parse_class_details__(netdev
, details
, &hc
);
3998 error
= htb_setup_class__(netdev
, tc_make_handle(1, queue_id
+ 1),
3999 tc_make_handle(1, 0xfffe), &hc
);
4004 htb_update_queue__(netdev
, queue_id
, &hc
);
4009 htb_class_delete(struct netdev
*netdev
, struct tc_queue
*queue
)
4011 struct htb_class
*hc
= htb_class_cast__(queue
);
4012 struct htb
*htb
= htb_get__(netdev
);
4015 error
= tc_delete_class(netdev
, tc_make_handle(1, queue
->queue_id
+ 1));
4017 hmap_remove(&htb
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
4024 htb_class_get_stats(const struct netdev
*netdev
, const struct tc_queue
*queue
,
4025 struct netdev_queue_stats
*stats
)
4027 return htb_query_class__(netdev
, tc_make_handle(1, queue
->queue_id
+ 1),
4028 tc_make_handle(1, 0xfffe), NULL
, stats
);
4032 htb_class_dump_stats(const struct netdev
*netdev OVS_UNUSED
,
4033 const struct ofpbuf
*nlmsg
,
4034 netdev_dump_queue_stats_cb
*cb
, void *aux
)
4036 struct netdev_queue_stats stats
;
4037 unsigned int handle
, major
, minor
;
4040 error
= tc_parse_class(nlmsg
, &handle
, NULL
, &stats
);
4045 major
= tc_get_major(handle
);
4046 minor
= tc_get_minor(handle
);
4047 if (major
== 1 && minor
> 0 && minor
<= HTB_N_QUEUES
) {
4048 (*cb
)(minor
- 1, &stats
, aux
);
4053 static const struct tc_ops tc_ops_htb
= {
4054 "htb", /* linux_name */
4055 "linux-htb", /* ovs_name */
4056 HTB_N_QUEUES
, /* n_queues */
4065 htb_class_get_stats
,
4066 htb_class_dump_stats
4069 /* "linux-hfsc" traffic control class. */
4071 #define HFSC_N_QUEUES 0xf000
4079 struct tc_queue tc_queue
;
4084 static struct hfsc
*
4085 hfsc_get__(const struct netdev
*netdev_
)
4087 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4088 return CONTAINER_OF(netdev
->tc
, struct hfsc
, tc
);
4091 static struct hfsc_class
*
4092 hfsc_class_cast__(const struct tc_queue
*queue
)
4094 return CONTAINER_OF(queue
, struct hfsc_class
, tc_queue
);
4098 hfsc_install__(struct netdev
*netdev_
, uint32_t max_rate
)
4100 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4103 hfsc
= xmalloc(sizeof *hfsc
);
4104 tc_init(&hfsc
->tc
, &tc_ops_hfsc
);
4105 hfsc
->max_rate
= max_rate
;
4106 netdev
->tc
= &hfsc
->tc
;
4110 hfsc_update_queue__(struct netdev
*netdev
, unsigned int queue_id
,
4111 const struct hfsc_class
*hc
)
4115 struct hfsc_class
*hcp
;
4116 struct tc_queue
*queue
;
4118 hfsc
= hfsc_get__(netdev
);
4119 hash
= hash_int(queue_id
, 0);
4121 queue
= tc_find_queue__(netdev
, queue_id
, hash
);
4123 hcp
= hfsc_class_cast__(queue
);
4125 hcp
= xmalloc(sizeof *hcp
);
4126 queue
= &hcp
->tc_queue
;
4127 queue
->queue_id
= queue_id
;
4128 queue
->created
= time_msec();
4129 hmap_insert(&hfsc
->tc
.queues
, &queue
->hmap_node
, hash
);
4132 hcp
->min_rate
= hc
->min_rate
;
4133 hcp
->max_rate
= hc
->max_rate
;
4137 hfsc_parse_tca_options__(struct nlattr
*nl_options
, struct hfsc_class
*class)
4139 const struct tc_service_curve
*rsc
, *fsc
, *usc
;
4140 static const struct nl_policy tca_hfsc_policy
[] = {
4142 .type
= NL_A_UNSPEC
,
4144 .min_len
= sizeof(struct tc_service_curve
),
4147 .type
= NL_A_UNSPEC
,
4149 .min_len
= sizeof(struct tc_service_curve
),
4152 .type
= NL_A_UNSPEC
,
4154 .min_len
= sizeof(struct tc_service_curve
),
4157 struct nlattr
*attrs
[ARRAY_SIZE(tca_hfsc_policy
)];
4159 if (!nl_parse_nested(nl_options
, tca_hfsc_policy
,
4160 attrs
, ARRAY_SIZE(tca_hfsc_policy
))) {
4161 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options");
4165 rsc
= nl_attr_get(attrs
[TCA_HFSC_RSC
]);
4166 fsc
= nl_attr_get(attrs
[TCA_HFSC_FSC
]);
4167 usc
= nl_attr_get(attrs
[TCA_HFSC_USC
]);
4169 if (rsc
->m1
!= 0 || rsc
->d
!= 0 ||
4170 fsc
->m1
!= 0 || fsc
->d
!= 0 ||
4171 usc
->m1
!= 0 || usc
->d
!= 0) {
4172 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
4173 "Non-linear service curves are not supported.");
4177 if (rsc
->m2
!= fsc
->m2
) {
4178 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
4179 "Real-time service curves are not supported ");
4183 if (rsc
->m2
> usc
->m2
) {
4184 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
4185 "Min-rate service curve is greater than "
4186 "the max-rate service curve.");
4190 class->min_rate
= fsc
->m2
;
4191 class->max_rate
= usc
->m2
;
4196 hfsc_parse_tcmsg__(struct ofpbuf
*tcmsg
, unsigned int *queue_id
,
4197 struct hfsc_class
*options
,
4198 struct netdev_queue_stats
*stats
)
4201 unsigned int handle
;
4202 struct nlattr
*nl_options
;
4204 error
= tc_parse_class(tcmsg
, &handle
, &nl_options
, stats
);
4210 unsigned int major
, minor
;
4212 major
= tc_get_major(handle
);
4213 minor
= tc_get_minor(handle
);
4214 if (major
== 1 && minor
> 0 && minor
<= HFSC_N_QUEUES
) {
4215 *queue_id
= minor
- 1;
4222 error
= hfsc_parse_tca_options__(nl_options
, options
);
4229 hfsc_query_class__(const struct netdev
*netdev
, unsigned int handle
,
4230 unsigned int parent
, struct hfsc_class
*options
,
4231 struct netdev_queue_stats
*stats
)
4234 struct ofpbuf
*reply
;
4236 error
= tc_query_class(netdev
, handle
, parent
, &reply
);
4241 error
= hfsc_parse_tcmsg__(reply
, NULL
, options
, stats
);
4242 ofpbuf_delete(reply
);
4247 hfsc_parse_qdisc_details__(struct netdev
*netdev_
, const struct smap
*details
,
4248 struct hfsc_class
*class)
4250 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4252 const char *max_rate_s
;
4254 max_rate_s
= smap_get(details
, "max-rate");
4255 max_rate
= max_rate_s
? strtoull(max_rate_s
, NULL
, 10) / 8 : 0;
4258 enum netdev_features current
;
4260 netdev_linux_read_features(netdev
);
4261 current
= !netdev
->get_features_error
? netdev
->current
: 0;
4262 max_rate
= netdev_features_to_bps(current
, 100 * 1000 * 1000) / 8;
4265 class->min_rate
= max_rate
;
4266 class->max_rate
= max_rate
;
4270 hfsc_parse_class_details__(struct netdev
*netdev
,
4271 const struct smap
*details
,
4272 struct hfsc_class
* class)
4274 const struct hfsc
*hfsc
;
4275 uint32_t min_rate
, max_rate
;
4276 const char *min_rate_s
, *max_rate_s
;
4278 hfsc
= hfsc_get__(netdev
);
4279 min_rate_s
= smap_get(details
, "min-rate");
4280 max_rate_s
= smap_get(details
, "max-rate");
4282 min_rate
= min_rate_s
? strtoull(min_rate_s
, NULL
, 10) / 8 : 0;
4283 min_rate
= MAX(min_rate
, 1);
4284 min_rate
= MIN(min_rate
, hfsc
->max_rate
);
4286 max_rate
= (max_rate_s
4287 ? strtoull(max_rate_s
, NULL
, 10) / 8
4289 max_rate
= MAX(max_rate
, min_rate
);
4290 max_rate
= MIN(max_rate
, hfsc
->max_rate
);
4292 class->min_rate
= min_rate
;
4293 class->max_rate
= max_rate
;
4298 /* Create an HFSC qdisc.
4300 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
4302 hfsc_setup_qdisc__(struct netdev
* netdev
)
4304 struct tcmsg
*tcmsg
;
4305 struct ofpbuf request
;
4306 struct tc_hfsc_qopt opt
;
4308 tc_del_qdisc(netdev
);
4310 tcmsg
= tc_make_request(netdev
, RTM_NEWQDISC
,
4311 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
4317 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
4318 tcmsg
->tcm_parent
= TC_H_ROOT
;
4320 memset(&opt
, 0, sizeof opt
);
4323 nl_msg_put_string(&request
, TCA_KIND
, "hfsc");
4324 nl_msg_put_unspec(&request
, TCA_OPTIONS
, &opt
, sizeof opt
);
4326 return tc_transact(&request
, NULL
);
4329 /* Create an HFSC class.
4331 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
4332 * sc rate <min_rate> ul rate <max_rate>" */
4334 hfsc_setup_class__(struct netdev
*netdev
, unsigned int handle
,
4335 unsigned int parent
, struct hfsc_class
*class)
4339 struct tcmsg
*tcmsg
;
4340 struct ofpbuf request
;
4341 struct tc_service_curve min
, max
;
4343 tcmsg
= tc_make_request(netdev
, RTM_NEWTCLASS
, NLM_F_CREATE
, &request
);
4349 tcmsg
->tcm_handle
= handle
;
4350 tcmsg
->tcm_parent
= parent
;
4354 min
.m2
= class->min_rate
;
4358 max
.m2
= class->max_rate
;
4360 nl_msg_put_string(&request
, TCA_KIND
, "hfsc");
4361 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
4362 nl_msg_put_unspec(&request
, TCA_HFSC_RSC
, &min
, sizeof min
);
4363 nl_msg_put_unspec(&request
, TCA_HFSC_FSC
, &min
, sizeof min
);
4364 nl_msg_put_unspec(&request
, TCA_HFSC_USC
, &max
, sizeof max
);
4365 nl_msg_end_nested(&request
, opt_offset
);
4367 error
= tc_transact(&request
, NULL
);
4369 VLOG_WARN_RL(&rl
, "failed to replace %s class %u:%u, parent %u:%u, "
4370 "min-rate %ubps, max-rate %ubps (%s)",
4371 netdev_get_name(netdev
),
4372 tc_get_major(handle
), tc_get_minor(handle
),
4373 tc_get_major(parent
), tc_get_minor(parent
),
4374 class->min_rate
, class->max_rate
, ovs_strerror(error
));
4381 hfsc_tc_install(struct netdev
*netdev
, const struct smap
*details
)
4384 struct hfsc_class
class;
4386 error
= hfsc_setup_qdisc__(netdev
);
4392 hfsc_parse_qdisc_details__(netdev
, details
, &class);
4393 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
4394 tc_make_handle(1, 0), &class);
4400 hfsc_install__(netdev
, class.max_rate
);
4405 hfsc_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4408 struct queue_dump_state state
;
4409 struct hfsc_class hc
;
4412 hfsc_query_class__(netdev
, tc_make_handle(1, 0xfffe), 0, &hc
, NULL
);
4413 hfsc_install__(netdev
, hc
.max_rate
);
4415 if (!start_queue_dump(netdev
, &state
)) {
4419 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
4420 unsigned int queue_id
;
4422 if (!hfsc_parse_tcmsg__(&msg
, &queue_id
, &hc
, NULL
)) {
4423 hfsc_update_queue__(netdev
, queue_id
, &hc
);
4427 finish_queue_dump(&state
);
4432 hfsc_tc_destroy(struct tc
*tc
)
4435 struct hfsc_class
*hc
, *next
;
4437 hfsc
= CONTAINER_OF(tc
, struct hfsc
, tc
);
4439 HMAP_FOR_EACH_SAFE (hc
, next
, tc_queue
.hmap_node
, &hfsc
->tc
.queues
) {
4440 hmap_remove(&hfsc
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
4449 hfsc_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
4451 const struct hfsc
*hfsc
;
4452 hfsc
= hfsc_get__(netdev
);
4453 smap_add_format(details
, "max-rate", "%llu", 8ULL * hfsc
->max_rate
);
4458 hfsc_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
4461 struct hfsc_class
class;
4463 hfsc_parse_qdisc_details__(netdev
, details
, &class);
4464 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
4465 tc_make_handle(1, 0), &class);
4468 hfsc_get__(netdev
)->max_rate
= class.max_rate
;
4475 hfsc_class_get(const struct netdev
*netdev OVS_UNUSED
,
4476 const struct tc_queue
*queue
, struct smap
*details
)
4478 const struct hfsc_class
*hc
;
4480 hc
= hfsc_class_cast__(queue
);
4481 smap_add_format(details
, "min-rate", "%llu", 8ULL * hc
->min_rate
);
4482 if (hc
->min_rate
!= hc
->max_rate
) {
4483 smap_add_format(details
, "max-rate", "%llu", 8ULL * hc
->max_rate
);
4489 hfsc_class_set(struct netdev
*netdev
, unsigned int queue_id
,
4490 const struct smap
*details
)
4493 struct hfsc_class
class;
4495 error
= hfsc_parse_class_details__(netdev
, details
, &class);
4500 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, queue_id
+ 1),
4501 tc_make_handle(1, 0xfffe), &class);
4506 hfsc_update_queue__(netdev
, queue_id
, &class);
4511 hfsc_class_delete(struct netdev
*netdev
, struct tc_queue
*queue
)
4515 struct hfsc_class
*hc
;
4517 hc
= hfsc_class_cast__(queue
);
4518 hfsc
= hfsc_get__(netdev
);
4520 error
= tc_delete_class(netdev
, tc_make_handle(1, queue
->queue_id
+ 1));
4522 hmap_remove(&hfsc
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
4529 hfsc_class_get_stats(const struct netdev
*netdev
, const struct tc_queue
*queue
,
4530 struct netdev_queue_stats
*stats
)
4532 return hfsc_query_class__(netdev
, tc_make_handle(1, queue
->queue_id
+ 1),
4533 tc_make_handle(1, 0xfffe), NULL
, stats
);
4537 hfsc_class_dump_stats(const struct netdev
*netdev OVS_UNUSED
,
4538 const struct ofpbuf
*nlmsg
,
4539 netdev_dump_queue_stats_cb
*cb
, void *aux
)
4541 struct netdev_queue_stats stats
;
4542 unsigned int handle
, major
, minor
;
4545 error
= tc_parse_class(nlmsg
, &handle
, NULL
, &stats
);
4550 major
= tc_get_major(handle
);
4551 minor
= tc_get_minor(handle
);
4552 if (major
== 1 && minor
> 0 && minor
<= HFSC_N_QUEUES
) {
4553 (*cb
)(minor
- 1, &stats
, aux
);
4558 static const struct tc_ops tc_ops_hfsc
= {
4559 "hfsc", /* linux_name */
4560 "linux-hfsc", /* ovs_name */
4561 HFSC_N_QUEUES
, /* n_queues */
4562 hfsc_tc_install
, /* tc_install */
4563 hfsc_tc_load
, /* tc_load */
4564 hfsc_tc_destroy
, /* tc_destroy */
4565 hfsc_qdisc_get
, /* qdisc_get */
4566 hfsc_qdisc_set
, /* qdisc_set */
4567 hfsc_class_get
, /* class_get */
4568 hfsc_class_set
, /* class_set */
4569 hfsc_class_delete
, /* class_delete */
4570 hfsc_class_get_stats
, /* class_get_stats */
4571 hfsc_class_dump_stats
/* class_dump_stats */
4574 /* "linux-default" traffic control class.
4576 * This class represents the default, unnamed Linux qdisc. It corresponds to
4577 * the "" (empty string) QoS type in the OVS database. */
4580 default_install__(struct netdev
*netdev_
)
4582 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4583 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_default
);
4585 /* Nothing but a tc class implementation is allowed to write to a tc. This
4586 * class never does that, so we can legitimately use a const tc object. */
4587 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
4591 default_tc_install(struct netdev
*netdev
,
4592 const struct smap
*details OVS_UNUSED
)
4594 default_install__(netdev
);
4599 default_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4601 default_install__(netdev
);
4605 static const struct tc_ops tc_ops_default
= {
4606 NULL
, /* linux_name */
4611 NULL
, /* tc_destroy */
4612 NULL
, /* qdisc_get */
4613 NULL
, /* qdisc_set */
4614 NULL
, /* class_get */
4615 NULL
, /* class_set */
4616 NULL
, /* class_delete */
4617 NULL
, /* class_get_stats */
4618 NULL
/* class_dump_stats */
4621 /* "linux-other" traffic control class.
4626 other_tc_load(struct netdev
*netdev_
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4628 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4629 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_other
);
4631 /* Nothing but a tc class implementation is allowed to write to a tc. This
4632 * class never does that, so we can legitimately use a const tc object. */
4633 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
4637 static const struct tc_ops tc_ops_other
= {
4638 NULL
, /* linux_name */
4639 "linux-other", /* ovs_name */
4641 NULL
, /* tc_install */
4643 NULL
, /* tc_destroy */
4644 NULL
, /* qdisc_get */
4645 NULL
, /* qdisc_set */
4646 NULL
, /* class_get */
4647 NULL
, /* class_set */
4648 NULL
, /* class_delete */
4649 NULL
, /* class_get_stats */
4650 NULL
/* class_dump_stats */
4653 /* Traffic control. */
4655 /* Number of kernel "tc" ticks per second. */
4656 static double ticks_per_s
;
4658 /* Number of kernel "jiffies" per second. This is used for the purpose of
4659 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
4660 * one jiffy's worth of data.
4662 * There are two possibilities here:
4664 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
4665 * approximate range of 100 to 1024. That means that we really need to
4666 * make sure that the qdisc can buffer that much data.
4668 * - 'buffer_hz' is an absurdly large number. That means that the kernel
4669 * has finely granular timers and there's no need to fudge additional room
4670 * for buffers. (There's no extra effort needed to implement that: the
4671 * large 'buffer_hz' is used as a divisor, so practically any number will
4672 * come out as 0 in the division. Small integer results in the case of
4673 * really high dividends won't have any real effect anyhow.)
4675 static unsigned int buffer_hz
;
/* Composes and returns the tc handle 'major':'minor'.  The major number
 * occupies the upper 16 bits of the handle and the minor number the lower
 * 16 bits, per the kernel's TC_H_MAKE() convention. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    unsigned int handle = TC_H_MAKE(major << 16, minor);

    return handle;
}
/* Extracts and returns the major number (upper 16 bits) from tc handle
 * 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    unsigned int major = TC_H_MAJ(handle) >> 16;

    return major;
}
/* Extracts and returns the minor number (lower 16 bits) from tc handle
 * 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    unsigned int minor = TC_H_MIN(handle);

    return minor;
}
4698 static struct tcmsg
*
4699 tc_make_request(const struct netdev
*netdev
, int type
, unsigned int flags
,
4700 struct ofpbuf
*request
)
4702 struct tcmsg
*tcmsg
;
4706 error
= get_ifindex(netdev
, &ifindex
);
4711 ofpbuf_init(request
, 512);
4712 nl_msg_put_nlmsghdr(request
, sizeof *tcmsg
, type
, NLM_F_REQUEST
| flags
);
4713 tcmsg
= ofpbuf_put_zeros(request
, sizeof *tcmsg
);
4714 tcmsg
->tcm_family
= AF_UNSPEC
;
4715 tcmsg
->tcm_ifindex
= ifindex
;
4716 /* Caller should fill in tcmsg->tcm_handle. */
4717 /* Caller should fill in tcmsg->tcm_parent. */
4723 tc_transact(struct ofpbuf
*request
, struct ofpbuf
**replyp
)
4725 int error
= nl_transact(NETLINK_ROUTE
, request
, replyp
);
4726 ofpbuf_uninit(request
);
4730 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
4731 * policing configuration.
4733 * This function is equivalent to running the following when 'add' is true:
4734 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
4736 * This function is equivalent to running the following when 'add' is false:
4737 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
4739 * The configuration and stats may be seen with the following command:
4740 * /sbin/tc -s qdisc show dev <devname>
4742 * Returns 0 if successful, otherwise a positive errno value.
4745 tc_add_del_ingress_qdisc(struct netdev
*netdev
, bool add
)
4747 struct ofpbuf request
;
4748 struct tcmsg
*tcmsg
;
4750 int type
= add
? RTM_NEWQDISC
: RTM_DELQDISC
;
4751 int flags
= add
? NLM_F_EXCL
| NLM_F_CREATE
: 0;
4753 tcmsg
= tc_make_request(netdev
, type
, flags
, &request
);
4757 tcmsg
->tcm_handle
= tc_make_handle(0xffff, 0);
4758 tcmsg
->tcm_parent
= TC_H_INGRESS
;
4759 nl_msg_put_string(&request
, TCA_KIND
, "ingress");
4760 nl_msg_put_unspec(&request
, TCA_OPTIONS
, NULL
, 0);
4762 error
= tc_transact(&request
, NULL
);
4764 /* If we're deleting the qdisc, don't worry about some of the
4765 * error conditions. */
4766 if (!add
&& (error
== ENOENT
|| error
== EINVAL
)) {
4775 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4778 * This function is equivalent to running:
4779 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4780 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4783 * The configuration and stats may be seen with the following command:
4784 * /sbin/tc -s filter show dev <devname> parent ffff:
4786 * Returns 0 if successful, otherwise a positive errno value.
4789 tc_add_policer(struct netdev
*netdev
,
4790 uint32_t kbits_rate
, uint32_t kbits_burst
)
4792 struct tc_police tc_police
;
4793 struct ofpbuf request
;
4794 struct tcmsg
*tcmsg
;
4795 size_t basic_offset
;
4796 size_t police_offset
;
4800 memset(&tc_police
, 0, sizeof tc_police
);
4801 tc_police
.action
= TC_POLICE_SHOT
;
4802 tc_police
.mtu
= mtu
;
4803 tc_fill_rate(&tc_police
.rate
, ((uint64_t) kbits_rate
* 1000)/8, mtu
);
4805 /* The following appears wrong in two ways:
4807 * - tc_bytes_to_ticks() should take "bytes" as quantity for both of its
4808 * arguments (or at least consistently "bytes" as both or "bits" as
4809 * both), but this supplies bytes for the first argument and bits for the
4812 * - In networking a kilobit is usually 1000 bits but this uses 1024 bits.
4814 * However if you "fix" those problems then "tc filter show ..." shows
4815 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
4816 * 1,000,000 bits, whereas this actually ends up doing the right thing from
4817 * tc's point of view. Whatever. */
4818 tc_police
.burst
= tc_bytes_to_ticks(
4819 tc_police
.rate
.rate
, MIN(UINT32_MAX
/ 1024, kbits_burst
) * 1024);
4821 tcmsg
= tc_make_request(netdev
, RTM_NEWTFILTER
,
4822 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
4826 tcmsg
->tcm_parent
= tc_make_handle(0xffff, 0);
4827 tcmsg
->tcm_info
= tc_make_handle(49,
4828 (OVS_FORCE
uint16_t) htons(ETH_P_ALL
));
4830 nl_msg_put_string(&request
, TCA_KIND
, "basic");
4831 basic_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
4832 police_offset
= nl_msg_start_nested(&request
, TCA_BASIC_POLICE
);
4833 nl_msg_put_unspec(&request
, TCA_POLICE_TBF
, &tc_police
, sizeof tc_police
);
4834 tc_put_rtab(&request
, TCA_POLICE_RATE
, &tc_police
.rate
);
4835 nl_msg_end_nested(&request
, police_offset
);
4836 nl_msg_end_nested(&request
, basic_offset
);
4838 error
= tc_transact(&request
, NULL
);
4849 /* The values in psched are not individually very meaningful, but they are
4850 * important. The tables below show some values seen in the wild.
4854 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4855 * (Before that, there are hints that it was 1000000000.)
4857 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4861 * -----------------------------------
4862 * [1] 000c8000 000f4240 000f4240 00000064
4863 * [2] 000003e8 00000400 000f4240 3b9aca00
4864 * [3] 000003e8 00000400 000f4240 3b9aca00
4865 * [4] 000003e8 00000400 000f4240 00000064
4866 * [5] 000003e8 00000040 000f4240 3b9aca00
4867 * [6] 000003e8 00000040 000f4240 000000f9
4869 * a b c d ticks_per_s buffer_hz
4870 * ------- --------- ---------- ------------- ----------- -------------
4871 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4872 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4873 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4874 * [4] 1,000 1,024 1,000,000 100 976,562 100
4875 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4876 * [6] 1,000 64 1,000,000 249 15,625,000 249
4878 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4879 * [2] 2.6.26-1-686-bigmem from Debian lenny
4880 * [3] 2.6.26-2-sparc64 from Debian lenny
4881 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4882 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4883 * [6] 2.6.34 from kernel.org on KVM
4885 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
4886 static const char fn
[] = "/proc/net/psched";
4887 unsigned int a
, b
, c
, d
;
4890 if (!ovsthread_once_start(&once
)) {
4897 stream
= fopen(fn
, "r");
4899 VLOG_WARN("%s: open failed: %s", fn
, ovs_strerror(errno
));
4903 if (fscanf(stream
, "%x %x %x %x", &a
, &b
, &c
, &d
) != 4) {
4904 VLOG_WARN("%s: read failed", fn
);
4908 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn
, a
, b
, c
, d
);
4912 VLOG_WARN("%s: invalid scheduler parameters", fn
);
4916 ticks_per_s
= (double) a
* c
/ b
;
4920 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4923 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn
, ticks_per_s
, buffer_hz
);
4926 ovsthread_once_done(&once
);
4929 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4930 * rate of 'rate' bytes per second. */
4932 tc_ticks_to_bytes(unsigned int rate
, unsigned int ticks
)
4935 return (rate
* ticks
) / ticks_per_s
;
4938 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4939 * rate of 'rate' bytes per second. */
4941 tc_bytes_to_ticks(unsigned int rate
, unsigned int size
)
4944 return rate
? ((unsigned long long int) ticks_per_s
* size
) / rate
: 0;
4947 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4948 * a transmission rate of 'rate' bytes per second. */
4950 tc_buffer_per_jiffy(unsigned int rate
)
4953 return rate
/ buffer_hz
;
/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
 * e.g. "htb", into '*kind' (if it is nonnull).  If 'options' is nonnull,
 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
 * stores NULL into it if it is absent.
 *
 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
 * 'msg' and should not be freed separately from it.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
               struct nlattr **options)
{
    /* TCA_KIND is mandatory in a qdisc message; TCA_OPTIONS may legitimately
     * be absent (e.g. for qdiscs with no parameters). */
    static const struct nl_policy tca_policy[] = {
        [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
    };
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    /* Attributes begin after the netlink header plus the fixed tcmsg. */
    if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
                         tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse qdisc message");
        goto error;
    }

    if (kind) {
        *kind = nl_attr_get_string(ta[TCA_KIND]);
    }

    if (options) {
        *options = ta[TCA_OPTIONS];
    }

    return 0;

error:
    /* Null out the outputs so a caller that ignores the return value never
     * sees stale pointers. */
    if (kind) {
        *kind = NULL;
    }
    if (options) {
        *options = NULL;
    }
    return EPROTO;
}
/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
 * minor number of its class ID) into '*handlep', its TCA_OPTIONS attribute
 * into '*options', and its queue statistics into '*stats'.  Any of the output
 * arguments may be null.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
               struct nlattr **options, struct netdev_queue_stats *stats)
{
    /* Unlike a qdisc message, both the options and the statistics nest are
     * required here. */
    static const struct nl_policy tca_policy[] = {
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
        [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
    };
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
                         tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse class message");
        goto error;
    }

    if (handlep) {
        /* The class handle lives in the fixed tcmsg header, not in an
         * attribute. */
        struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
        *handlep = tc->tcm_handle;
    }

    if (options) {
        *options = ta[TCA_OPTIONS];
    }

    if (stats) {
        const struct gnet_stats_queue *gsq;
        struct gnet_stats_basic gsb;

        static const struct nl_policy stats_policy[] = {
            [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof gsb },
            [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof *gsq },
        };
        struct nlattr *sa[ARRAY_SIZE(stats_policy)];

        if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
                             sa, ARRAY_SIZE(sa))) {
            VLOG_WARN_RL(&rl, "failed to parse class stats");
            goto error;
        }

        /* Alignment issues screw up the length of struct gnet_stats_basic on
         * some arch/bitsize combinations.  Newer versions of Linux have a
         * struct gnet_stats_basic_packed, but we can't depend on that.  The
         * easiest thing to do is just to make a copy. */
        memset(&gsb, 0, sizeof gsb);
        memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
               MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
        stats->tx_bytes = gsb.bytes;
        stats->tx_packets = gsb.packets;

        gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
        stats->tx_errors = gsq->drops;
    }

    return 0;

error:
    /* Clear the outputs so the caller never consumes half-parsed data. */
    if (options) {
        *options = NULL;
    }
    if (stats) {
        memset(stats, 0, sizeof *stats);
    }
    return EPROTO;
}
/* Queries the kernel for class with identifier 'handle' and parent 'parent'
 * on 'netdev'.  On success stores the kernel's RTM_GETTCLASS reply in
 * '*replyp' (owned by the caller, freed with ofpbuf_delete()).
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_query_class(const struct netdev *netdev,
               unsigned int handle, unsigned int parent,
               struct ofpbuf **replyp)
{
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;

    /* NLM_F_ECHO asks the kernel to reflect the class back to us. */
    tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    error = tc_transact(&request, replyp);
    if (error) {
        VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     ovs_strerror(error));
    }
    return error;
}
/* Equivalent to "tc class del dev <name> handle <handle>".
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_delete_class(const struct netdev *netdev, unsigned int handle)
{
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;

    tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = 0;

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     ovs_strerror(error));
    }
    return error;
}
/* Equivalent to "tc qdisc del dev <name> root".
 *
 * On success also destroys and clears any cached 'tc' state for 'netdev_'.
 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_del_qdisc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;

    tcmsg = tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    /* 1:0 names the root qdisc that OVS itself installs. */
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    error = tc_transact(&request, NULL);
    if (error == EINVAL) {
        /* EINVAL probably means that the default qdisc was in use, in which
         * case we've accomplished our purpose. */
        error = 0;
    }
    if (!error && netdev->tc) {
        /* Tear down our cached view of the (now deleted) qdisc. */
        if (netdev->tc->ops->tc_destroy) {
            netdev->tc->ops->tc_destroy(netdev->tc);
        }
        netdev->tc = NULL;
    }
    return error;
}
/* Returns true if it is safe to issue a plain RTM_GETQDISC on this kernel,
 * i.e. the running kernel is 2.6.35 or newer and therefore contains commit
 * 53b0f08 "net_sched: Fix qdisc_notify()".  Computed once and cached. */
static bool
getqdisc_is_safe(void)
{
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static bool safe = false;

    if (ovsthread_once_start(&once)) {
        struct utsname utsname;
        int major, minor;

        if (uname(&utsname) == -1) {
            VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
        } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
            VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
        } else if (major < 2 || (major == 2 && minor < 35)) {
            /* Pre-2.6.35 kernels can OOPS on RTM_GETQDISC for built-in
             * qdiscs; see the comment in tc_query_qdisc(). */
            VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
                      utsname.release);
        } else {
            safe = true;
        }
        ovsthread_once_done(&once);
    }
    return safe;
}
/* If 'netdev''s qdisc type and parameters are not yet known, queries the
 * kernel to determine what they are.  Returns 0 if successful, otherwise a
 * positive errno value.  On return (even on failure) 'netdev->tc' reflects
 * the outcome of the matching tc_load() call. */
static int
tc_query_qdisc(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ofpbuf request, *qdisc;
    const struct tc_ops *ops;
    struct tcmsg *tcmsg;
    int load_error;
    int error;

    if (netdev->tc) {
        /* Already queried and cached. */
        return 0;
    }

    /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
     * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
     * 2.6.35 without that fix backported to it.
     *
     * To avoid the OOPS, we must not make a request that would attempt to dump
     * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
     * few others.  There are a few ways that I can see to do this, but most of
     * them seem to be racy (and if you lose the race the kernel OOPSes).  The
     * technique chosen here is to assume that any non-default qdisc that we
     * create will have a class with handle 1:0.  The built-in qdiscs only have
     * a class with handle 0:0.
     *
     * On Linux 2.6.35+ we use the straightforward method because it allows us
     * to handle non-builtin qdiscs without handle 1:0 (e.g. codel).  However,
     * in such a case we get no response at all from the kernel (!) if a
     * builtin qdisc is in use (which is later caught by "!error &&
     * !qdisc->size"). */
    tcmsg = tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
    tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;

    /* Figure out what tc class to instantiate. */
    error = tc_transact(&request, &qdisc);
    if (!error && qdisc->size) {
        const char *kind;

        error = tc_parse_qdisc(qdisc, &kind, NULL);
        if (error) {
            /* Unparseable reply: fall back to the opaque "other" qdisc. */
            ops = &tc_ops_other;
        } else {
            ops = tc_lookup_linux_name(kind);
            if (!ops) {
                static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
                VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);

                ops = &tc_ops_other;
            }
        }
    } else if ((!error && !qdisc->size) || error == ENOENT) {
        /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
         * set up by some other entity that doesn't have a handle 1:0.  We will
         * assume that it's the system default qdisc. */
        ops = &tc_ops_default;
        error = 0;
    } else {
        /* Who knows?  Maybe the device got deleted. */
        VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
        ops = &tc_ops_other;
    }

    /* Instantiate it. */
    load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
    ovs_assert((load_error == 0) == (netdev->tc != NULL));
    ofpbuf_delete(qdisc);

    return error ? error : load_error;
}
5265 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5266 approximate the time to transmit packets of various lengths. For an MTU of
5267 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5268 represents two possible packet lengths; for a MTU of 513 through 1024, four
5269 possible lengths; and so on.
5271 Returns, for the specified 'mtu', the number of bits that packet lengths
5272 need to be shifted right to fit within such a 256-entry table. */
5274 tc_calc_cell_log(unsigned int mtu
)
5279 mtu
= ETH_PAYLOAD_MAX
;
5281 mtu
+= ETH_HEADER_LEN
+ VLAN_HEADER_LEN
;
5283 for (cell_log
= 0; mtu
>= 256; cell_log
++) {
/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
 * of 'mtu'. */
static void
tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
{
    memset(rate, 0, sizeof *rate);
    rate->cell_log = tc_calc_cell_log(mtu);
    /* rate->overhead = 0; */           /* New in 2.6.24, not yet in some */
    /* rate->cell_align = 0; */         /* distro headers. */
    rate->mpu = ETH_TOTAL_MIN;
    /* NOTE(review): tc_ratespec's rate field is 32 bits, so a 'Bps' above
     * UINT32_MAX silently truncates here — confirm callers clamp the rate. */
    rate->rate = Bps;
}
5303 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
5304 * attribute of the specified "type".
5306 * See tc_calc_cell_log() above for a description of "rtab"s. */
5308 tc_put_rtab(struct ofpbuf
*msg
, uint16_t type
, const struct tc_ratespec
*rate
)
5313 rtab
= nl_msg_put_unspec_uninit(msg
, type
, TC_RTAB_SIZE
);
5314 for (i
= 0; i
< TC_RTAB_SIZE
/ sizeof *rtab
; i
++) {
5315 unsigned packet_size
= (i
+ 1) << rate
->cell_log
;
5316 if (packet_size
< rate
->mpu
) {
5317 packet_size
= rate
->mpu
;
5319 rtab
[i
] = tc_bytes_to_ticks(rate
->rate
, packet_size
);
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.) */
static unsigned int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    /* The buffer must hold at least one jiffy's worth of data plus an MTU,
     * or the shaper cannot sustain the configured rate. */
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = MAX(burst_bytes, min_burst);

    return tc_bytes_to_ticks(Bps, burst);
}
5334 /* Linux-only functions declared in netdev-linux.h */
/* Modifies the 'flag' bit in ethtool's flags field for 'netdev'.  If
 * 'enable' is true, the bit is set.  Otherwise, it is cleared.
 *
 * Follows a read-modify-write-verify sequence: some drivers accept an
 * ETHTOOL_SFLAGS request but silently ignore unsupported flags, so the flags
 * are read back and compared to detect that case.
 *
 * Returns 0 if successful, otherwise a positive errno value (EOPNOTSUPP when
 * the device accepted but did not apply the change). */
int
netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
                              const char *flag_name, bool enable)
{
    const char *netdev_name = netdev_get_name(netdev);
    struct ethtool_value evalue;
    uint32_t new_flags;
    int error;

    /* Read the current flags. */
    COVERAGE_INC(netdev_get_ethtool);
    memset(&evalue, 0, sizeof evalue);
    error = netdev_linux_do_ethtool(netdev_name,
                                    (struct ethtool_cmd *)&evalue,
                                    ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
    if (error) {
        return error;
    }

    /* Compute the desired flags and write them back if they differ. */
    COVERAGE_INC(netdev_set_ethtool);
    new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
    if (new_flags == evalue.data) {
        return 0;               /* Already in the requested state. */
    }
    evalue.data = new_flags;
    error = netdev_linux_do_ethtool(netdev_name,
                                    (struct ethtool_cmd *)&evalue,
                                    ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
    if (error) {
        return error;
    }

    /* Re-read the flags to verify the driver really applied the change. */
    COVERAGE_INC(netdev_get_ethtool);
    memset(&evalue, 0, sizeof evalue);
    error = netdev_linux_do_ethtool(netdev_name,
                                    (struct ethtool_cmd *)&evalue,
                                    ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
    if (error) {
        return error;
    }

    if (new_flags != evalue.data) {
        VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
                     "device %s failed", enable ? "enable" : "disable",
                     flag_name, netdev_name);
        return EOPNOTSUPP;
    }

    return 0;
}
5388 /* Utility functions. */
5390 /* Copies 'src' into 'dst', performing format conversion in the process. */
5392 netdev_stats_from_rtnl_link_stats(struct netdev_stats
*dst
,
5393 const struct rtnl_link_stats
*src
)
5395 dst
->rx_packets
= src
->rx_packets
;
5396 dst
->tx_packets
= src
->tx_packets
;
5397 dst
->rx_bytes
= src
->rx_bytes
;
5398 dst
->tx_bytes
= src
->tx_bytes
;
5399 dst
->rx_errors
= src
->rx_errors
;
5400 dst
->tx_errors
= src
->tx_errors
;
5401 dst
->rx_dropped
= src
->rx_dropped
;
5402 dst
->tx_dropped
= src
->tx_dropped
;
5403 dst
->multicast
= src
->multicast
;
5404 dst
->collisions
= src
->collisions
;
5405 dst
->rx_length_errors
= src
->rx_length_errors
;
5406 dst
->rx_over_errors
= src
->rx_over_errors
;
5407 dst
->rx_crc_errors
= src
->rx_crc_errors
;
5408 dst
->rx_frame_errors
= src
->rx_frame_errors
;
5409 dst
->rx_fifo_errors
= src
->rx_fifo_errors
;
5410 dst
->rx_missed_errors
= src
->rx_missed_errors
;
5411 dst
->tx_aborted_errors
= src
->tx_aborted_errors
;
5412 dst
->tx_carrier_errors
= src
->tx_carrier_errors
;
5413 dst
->tx_fifo_errors
= src
->tx_fifo_errors
;
5414 dst
->tx_heartbeat_errors
= src
->tx_heartbeat_errors
;
5415 dst
->tx_window_errors
= src
->tx_window_errors
;
5418 /* Copies 'src' into 'dst', performing format conversion in the process. */
5420 netdev_stats_from_rtnl_link_stats64(struct netdev_stats
*dst
,
5421 const struct rtnl_link_stats64
*src
)
5423 dst
->rx_packets
= src
->rx_packets
;
5424 dst
->tx_packets
= src
->tx_packets
;
5425 dst
->rx_bytes
= src
->rx_bytes
;
5426 dst
->tx_bytes
= src
->tx_bytes
;
5427 dst
->rx_errors
= src
->rx_errors
;
5428 dst
->tx_errors
= src
->tx_errors
;
5429 dst
->rx_dropped
= src
->rx_dropped
;
5430 dst
->tx_dropped
= src
->tx_dropped
;
5431 dst
->multicast
= src
->multicast
;
5432 dst
->collisions
= src
->collisions
;
5433 dst
->rx_length_errors
= src
->rx_length_errors
;
5434 dst
->rx_over_errors
= src
->rx_over_errors
;
5435 dst
->rx_crc_errors
= src
->rx_crc_errors
;
5436 dst
->rx_frame_errors
= src
->rx_frame_errors
;
5437 dst
->rx_fifo_errors
= src
->rx_fifo_errors
;
5438 dst
->rx_missed_errors
= src
->rx_missed_errors
;
5439 dst
->tx_aborted_errors
= src
->tx_aborted_errors
;
5440 dst
->tx_carrier_errors
= src
->tx_carrier_errors
;
5441 dst
->tx_fifo_errors
= src
->tx_fifo_errors
;
5442 dst
->tx_heartbeat_errors
= src
->tx_heartbeat_errors
;
5443 dst
->tx_window_errors
= src
->tx_window_errors
;
/* Retrieves interface statistics for 'netdev_' into 'stats' using an
 * RTM_GETLINK netlink transaction.  Prefers the 64-bit IFLA_STATS64 attribute
 * and falls back to the 32-bit IFLA_STATS.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
{
    int error;
    struct ofpbuf request;
    struct ofpbuf *reply;

    /* Build an RTM_GETLINK request that names the device by IFLA_IFNAME. */
    ofpbuf_init(&request, 0);
    nl_msg_put_nlmsghdr(&request,
                        sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
                        RTM_GETLINK, NLM_F_REQUEST);
    ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
    nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
    error = nl_transact(NETLINK_ROUTE, &request, &reply);
    ofpbuf_uninit(&request);
    if (error) {
        return error;
    }

    /* Skip the netlink and ifinfomsg headers to reach the attributes. */
    if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
        const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
        if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
            netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
            error = 0;
        } else {
            /* Older kernels only supply the 32-bit counters. */
            const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS);
            if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
                netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
                error = 0;
            } else {
                VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
                error = EPROTO;
            }
        }
    } else {
        VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
        error = EPROTO;
    }

    ofpbuf_delete(reply);
    return error;
}
5491 get_flags(const struct netdev
*dev
, unsigned int *flags
)
5497 error
= af_inet_ifreq_ioctl(dev
->name
, &ifr
, SIOCGIFFLAGS
, "SIOCGIFFLAGS");
5499 *flags
= ifr
.ifr_flags
;
5505 set_flags(const char *name
, unsigned int flags
)
5509 ifr
.ifr_flags
= flags
;
5510 return af_inet_ifreq_ioctl(name
, &ifr
, SIOCSIFFLAGS
, "SIOCSIFFLAGS");
/* Returns the ifindex of the device named 'netdev_name', or a negative errno
 * value if the SIOCGIFINDEX ioctl fails.  (The sign encodes success/failure;
 * callers negate to recover the errno.) */
static int
do_get_ifindex(const char *netdev_name)
{
    struct ifreq ifr;
    int error;

    ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
    COVERAGE_INC(netdev_get_ifindex);

    error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
    if (error) {
        VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
                     netdev_name, ovs_strerror(error));
        return -error;          /* Negative to distinguish from an ifindex. */
    }
    return ifr.ifr_ifindex;
}
/* Stores 'netdev_''s ifindex in '*ifindexp', consulting (and populating) the
 * per-netdev cache so the SIOCGIFINDEX ioctl runs at most once per device.
 * Returns 0 on success, else the positive errno recorded at lookup time. */
static int
get_ifindex(const struct netdev *netdev_, int *ifindexp)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    if (!(netdev->cache_valid & VALID_IFINDEX)) {
        /* do_get_ifindex() encodes failure as a negative errno. */
        int ifindex = do_get_ifindex(netdev_get_name(netdev_));

        if (ifindex < 0) {
            netdev->get_ifindex_error = -ifindex;
            netdev->ifindex = 0;
        } else {
            netdev->get_ifindex_error = 0;
            netdev->ifindex = ifindex;
        }
        /* Cache the result (including a failure) until invalidated. */
        netdev->cache_valid |= VALID_IFINDEX;
    }

    *ifindexp = netdev->ifindex;
    return netdev->get_ifindex_error;
}
/* Reads the Ethernet hardware address of the device named 'netdev_name' into
 * '*ea' via SIOCGIFHWADDR.  Returns 0 on success; EINVAL if the device's
 * address family is not Ethernet; otherwise a positive errno value. */
static int
get_etheraddr(const char *netdev_name, struct eth_addr *ea)
{
    struct ifreq ifr;
    int hwaddr_family;
    int error;

    memset(&ifr, 0, sizeof ifr);
    ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
    COVERAGE_INC(netdev_get_hwaddr);
    error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
    if (error) {
        /* ENODEV probably means that a vif disappeared asynchronously and
         * hasn't been removed from the database yet, so reduce the log level
         * to INFO for that case. */
        VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
             "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
             netdev_name, ovs_strerror(error));
        return error;
    }
    hwaddr_family = ifr.ifr_hwaddr.sa_family;
    if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
        /* Only Ethernet (or unspecified) addresses make sense here. */
        VLOG_INFO("%s device has unknown hardware address family %d",
                  netdev_name, hwaddr_family);
        return EINVAL;
    }
    memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
    return 0;
}
/* Sets the Ethernet hardware address of the device named 'netdev_name' to
 * 'mac' via SIOCSIFHWADDR.  Returns 0 on success, else a positive errno
 * value. */
static int
set_etheraddr(const char *netdev_name, const struct eth_addr mac)
{
    struct ifreq ifr;
    int error;

    memset(&ifr, 0, sizeof ifr);
    ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
    ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
    memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
    COVERAGE_INC(netdev_set_hwaddr);
    error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
    if (error) {
        VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
                 netdev_name, ovs_strerror(error));
    }
    return error;
}
/* Issues the ethtool command 'cmd' (named 'cmd_name' for logging) on the
 * device named 'name', using 'ecmd' as the command buffer for both input and
 * output.  Returns 0 on success, else a positive errno value; EOPNOTSUPP
 * (device does not implement the command) is deliberately not logged. */
static int
netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
                        int cmd, const char *cmd_name)
{
    struct ifreq ifr;
    int error;

    memset(&ifr, 0, sizeof ifr);
    ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
    ifr.ifr_data = (caddr_t) ecmd;

    /* The ethtool sub-command is carried in the first field of 'ecmd'. */
    ecmd->cmd = cmd;
    error = af_inet_ioctl(SIOCETHTOOL, &ifr);
    if (error) {
        if (error != EOPNOTSUPP) {
            VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
                         "failed: %s", cmd_name, name, ovs_strerror(error));
        } else {
            /* The device doesn't support this operation.  That's pretty
             * common, so there's no point in logging anything. */
        }
    }
    return error;
}
/* Reads an IPv4 address of 'netdev' into '*ip' using the given SIOCGIF* ioctl
 * 'cmd' (named 'cmd_name' for logging).  Returns 0 on success, else a
 * positive errno value; on failure '*ip' is left unmodified. */
static int
netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
                      int cmd, const char *cmd_name)
{
    struct ifreq ifr;
    int error;

    ifr.ifr_addr.sa_family = AF_INET;
    error = af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
    if (!error) {
        /* ifr_addr is a generic sockaddr; ALIGNED_CAST avoids the
         * strict-alignment warning when reinterpreting it as sockaddr_in. */
        const struct sockaddr_in *sin = ALIGNED_CAST(struct sockaddr_in *,
                                                     &ifr.ifr_addr);
        *ip = sin->sin_addr;
    }
    return error;
}
5644 /* Returns an AF_PACKET raw socket or a negative errno value. */
5646 af_packet_sock(void)
5648 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
5651 if (ovsthread_once_start(&once
)) {
5652 sock
= socket(AF_PACKET
, SOCK_RAW
, 0);
5654 int error
= set_nonblocking(sock
);
5661 VLOG_ERR("failed to create packet socket: %s",
5662 ovs_strerror(errno
));
5664 ovsthread_once_done(&once
);