2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <arpa/inet.h>
25 #include <linux/filter.h>
26 #include <linux/gen_stats.h>
27 #include <linux/if_ether.h>
28 #include <linux/if_tun.h>
29 #include <linux/types.h>
30 #include <linux/ethtool.h>
31 #include <linux/mii.h>
32 #include <linux/pkt_cls.h>
33 #include <linux/pkt_sched.h>
34 #include <linux/rtnetlink.h>
35 #include <linux/sockios.h>
36 #include <sys/types.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <netpacket/packet.h>
41 #include <net/if_arp.h>
42 #include <net/if_packet.h>
43 #include <net/route.h>
44 #include <netinet/in.h>
51 #include "dpif-netlink.h"
52 #include "dpif-netdev.h"
53 #include "dynamic-string.h"
54 #include "fatal-signal.h"
57 #include "netdev-provider.h"
58 #include "netdev-vport.h"
59 #include "netlink-notifier.h"
60 #include "netlink-socket.h"
63 #include "openflow/openflow.h"
64 #include "ovs-atomic.h"
65 #include "packet-dpif.h"
67 #include "poll-loop.h"
68 #include "rtnetlink-link.h"
70 #include "socket-util.h"
73 #include "unaligned.h"
76 VLOG_DEFINE_THIS_MODULE(netdev_linux
);
78 COVERAGE_DEFINE(netdev_set_policing
);
79 COVERAGE_DEFINE(netdev_arp_lookup
);
80 COVERAGE_DEFINE(netdev_get_ifindex
);
81 COVERAGE_DEFINE(netdev_get_hwaddr
);
82 COVERAGE_DEFINE(netdev_set_hwaddr
);
83 COVERAGE_DEFINE(netdev_get_ethtool
);
84 COVERAGE_DEFINE(netdev_set_ethtool
);
87 /* These were introduced in Linux 2.6.14, so they might be missing if we have
89 #ifndef ADVERTISED_Pause
90 #define ADVERTISED_Pause (1 << 13)
92 #ifndef ADVERTISED_Asym_Pause
93 #define ADVERTISED_Asym_Pause (1 << 14)
96 /* These were introduced in Linux 2.6.24, so they might be missing if we
97 * have old headers. */
98 #ifndef ETHTOOL_GFLAGS
99 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
101 #ifndef ETHTOOL_SFLAGS
102 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
105 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
108 #define TC_RTAB_SIZE 1024
111 /* Linux 2.6.21 introduced struct tpacket_auxdata.
112 * Linux 2.6.27 added the tp_vlan_tci member.
113 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
114 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
115 * TP_STATUS_VLAN_TPID_VALID.
117 * With all this churn it's easiest to unconditionally define a replacement
118 * structure that has everything we want.
120 #ifndef PACKET_AUXDATA
121 #define PACKET_AUXDATA 8
123 #ifndef TP_STATUS_VLAN_VALID
124 #define TP_STATUS_VLAN_VALID (1 << 4)
126 #ifndef TP_STATUS_VLAN_TPID_VALID
127 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
129 #undef tpacket_auxdata
130 #define tpacket_auxdata rpl_tpacket_auxdata
131 struct tpacket_auxdata
{
137 uint16_t tp_vlan_tci
;
138 uint16_t tp_vlan_tpid
;
141 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
143 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
144 * 2.6.32-431.29.2.el6.x86_64 (see report at
145 * http://openvswitch.org/pipermail/dev/2014-October/047978.html). Maybe
146 * if_link.h is not self-contained on those kernels. It is easiest to
147 * unconditionally define a replacement. */
149 #define IFLA_STATS64 23
151 #define rtnl_link_stats64 rpl_rtnl_link_stats64
152 struct rtnl_link_stats64
{
164 uint64_t rx_length_errors
;
165 uint64_t rx_over_errors
;
166 uint64_t rx_crc_errors
;
167 uint64_t rx_frame_errors
;
168 uint64_t rx_fifo_errors
;
169 uint64_t rx_missed_errors
;
171 uint64_t tx_aborted_errors
;
172 uint64_t tx_carrier_errors
;
173 uint64_t tx_fifo_errors
;
174 uint64_t tx_heartbeat_errors
;
175 uint64_t tx_window_errors
;
177 uint64_t rx_compressed
;
178 uint64_t tx_compressed
;
182 VALID_IFINDEX
= 1 << 0,
183 VALID_ETHERADDR
= 1 << 1,
187 VALID_POLICING
= 1 << 5,
188 VALID_VPORT_STAT_ERROR
= 1 << 6,
189 VALID_DRVINFO
= 1 << 7,
190 VALID_FEATURES
= 1 << 8,
193 /* Traffic control. */
195 /* An instance of a traffic control class. Always associated with a particular
198 * Each TC implementation subclasses this with whatever additional data it
201 const struct tc_ops
*ops
;
202 struct hmap queues
; /* Contains "struct tc_queue"s.
203 * Read by generic TC layer.
204 * Written only by TC implementation. */
207 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
209 /* One traffic control queue.
211 * Each TC implementation subclasses this with whatever additional data it
214 struct hmap_node hmap_node
; /* In struct tc's "queues" hmap. */
215 unsigned int queue_id
; /* OpenFlow queue ID. */
216 long long int created
; /* Time queue was created, in msecs. */
219 /* A particular kind of traffic control. Each implementation generally maps to
220 * one particular Linux qdisc class.
222 * The functions below return 0 if successful or a positive errno value on
223 * failure, except where otherwise noted. All of them must be provided, except
224 * where otherwise noted. */
226 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
227 * This is null for tc_ops_default and tc_ops_other, for which there are no
228 * appropriate values. */
229 const char *linux_name
;
231 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
232 const char *ovs_name
;
234 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
235 * queues. The queues are numbered 0 through n_queues - 1. */
236 unsigned int n_queues
;
238 /* Called to install this TC class on 'netdev'. The implementation should
239 * make the Netlink calls required to set up 'netdev' with the right qdisc
240 * and configure it according to 'details'. The implementation may assume
241 * that the current qdisc is the default; that is, there is no need for it
242 * to delete the current qdisc before installing itself.
244 * The contents of 'details' should be documented as valid for 'ovs_name'
245 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
246 * (which is built as ovs-vswitchd.conf.db(8)).
248 * This function must return 0 if and only if it sets 'netdev->tc' to an
249 * initialized 'struct tc'.
251 * (This function is null for tc_ops_other, which cannot be installed. For
252 * other TC classes it should always be nonnull.) */
253 int (*tc_install
)(struct netdev
*netdev
, const struct smap
*details
);
255 /* Called when the netdev code determines (through a Netlink query) that
256 * this TC class's qdisc is installed on 'netdev', but we didn't install
257 * it ourselves and so don't know any of the details.
259 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
260 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
261 * implementation should parse the other attributes of 'nlmsg' as
262 * necessary to determine its configuration. If necessary it should also
263 * use Netlink queries to determine the configuration of queues on
266 * This function must return 0 if and only if it sets 'netdev->tc' to an
267 * initialized 'struct tc'. */
268 int (*tc_load
)(struct netdev
*netdev
, struct ofpbuf
*nlmsg
);
270 /* Destroys the data structures allocated by the implementation as part of
271 * 'tc'. (This includes destroying 'tc->queues' by calling
274 * The implementation should not need to perform any Netlink calls. If
275 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
276 * (But it may not be desirable.)
278 * This function may be null if 'tc' is trivial. */
279 void (*tc_destroy
)(struct tc
*tc
);
281 /* Retrieves details of 'netdev->tc' configuration into 'details'.
283 * The implementation should not need to perform any Netlink calls, because
284 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
285 * cached the configuration.
287 * The contents of 'details' should be documented as valid for 'ovs_name'
288 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
289 * (which is built as ovs-vswitchd.conf.db(8)).
291 * This function may be null if 'tc' is not configurable.
293 int (*qdisc_get
)(const struct netdev
*netdev
, struct smap
*details
);
295 /* Reconfigures 'netdev->tc' according to 'details', performing any
296 * required Netlink calls to complete the reconfiguration.
298 * The contents of 'details' should be documented as valid for 'ovs_name'
299 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
300 * (which is built as ovs-vswitchd.conf.db(8)).
302 * This function may be null if 'tc' is not configurable.
304 int (*qdisc_set
)(struct netdev
*, const struct smap
*details
);
306 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
307 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
309 * The contents of 'details' should be documented as valid for 'ovs_name'
310 * in the "other_config" column in the "Queue" table in
311 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
313 * The implementation should not need to perform any Netlink calls, because
314 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
315 * cached the queue configuration.
317 * This function may be null if 'tc' does not have queues ('n_queues' is
319 int (*class_get
)(const struct netdev
*netdev
, const struct tc_queue
*queue
,
320 struct smap
*details
);
322 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
323 * 'details', perfoming any required Netlink calls to complete the
324 * reconfiguration. The caller ensures that 'queue_id' is less than
327 * The contents of 'details' should be documented as valid for 'ovs_name'
328 * in the "other_config" column in the "Queue" table in
329 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
331 * This function may be null if 'tc' does not have queues or its queues are
332 * not configurable. */
333 int (*class_set
)(struct netdev
*, unsigned int queue_id
,
334 const struct smap
*details
);
336 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
337 * tc_queue's within 'netdev->tc->queues'.
339 * This function may be null if 'tc' does not have queues or its queues
340 * cannot be deleted. */
341 int (*class_delete
)(struct netdev
*, struct tc_queue
*queue
);
343 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
344 * 'struct tc_queue's within 'netdev->tc->queues'.
346 * On success, initializes '*stats'.
348 * This function may be null if 'tc' does not have queues or if it cannot
349 * report queue statistics. */
350 int (*class_get_stats
)(const struct netdev
*netdev
,
351 const struct tc_queue
*queue
,
352 struct netdev_queue_stats
*stats
);
354 /* Extracts queue stats from 'nlmsg', which is a response to a
355 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
357 * This function may be null if 'tc' does not have queues or if it cannot
358 * report queue statistics. */
359 int (*class_dump_stats
)(const struct netdev
*netdev
,
360 const struct ofpbuf
*nlmsg
,
361 netdev_dump_queue_stats_cb
*cb
, void *aux
);
365 tc_init(struct tc
*tc
, const struct tc_ops
*ops
)
368 hmap_init(&tc
->queues
);
372 tc_destroy(struct tc
*tc
)
374 hmap_destroy(&tc
->queues
);
377 static const struct tc_ops tc_ops_htb
;
378 static const struct tc_ops tc_ops_hfsc
;
379 static const struct tc_ops tc_ops_default
;
380 static const struct tc_ops tc_ops_other
;
382 static const struct tc_ops
*const tcs
[] = {
383 &tc_ops_htb
, /* Hierarchy token bucket (see tc-htb(8)). */
384 &tc_ops_hfsc
, /* Hierarchical fair service curve. */
385 &tc_ops_default
, /* Default qdisc (see tc-pfifo_fast(8)). */
386 &tc_ops_other
, /* Some other qdisc. */
390 static unsigned int tc_make_handle(unsigned int major
, unsigned int minor
);
391 static unsigned int tc_get_major(unsigned int handle
);
392 static unsigned int tc_get_minor(unsigned int handle
);
394 static unsigned int tc_ticks_to_bytes(unsigned int rate
, unsigned int ticks
);
395 static unsigned int tc_bytes_to_ticks(unsigned int rate
, unsigned int size
);
396 static unsigned int tc_buffer_per_jiffy(unsigned int rate
);
398 static struct tcmsg
*tc_make_request(const struct netdev
*, int type
,
399 unsigned int flags
, struct ofpbuf
*);
400 static int tc_transact(struct ofpbuf
*request
, struct ofpbuf
**replyp
);
401 static int tc_add_del_ingress_qdisc(struct netdev
*netdev
, bool add
);
402 static int tc_add_policer(struct netdev
*netdev
, int kbits_rate
,
405 static int tc_parse_qdisc(const struct ofpbuf
*, const char **kind
,
406 struct nlattr
**options
);
407 static int tc_parse_class(const struct ofpbuf
*, unsigned int *queue_id
,
408 struct nlattr
**options
,
409 struct netdev_queue_stats
*);
410 static int tc_query_class(const struct netdev
*,
411 unsigned int handle
, unsigned int parent
,
412 struct ofpbuf
**replyp
);
413 static int tc_delete_class(const struct netdev
*, unsigned int handle
);
415 static int tc_del_qdisc(struct netdev
*netdev
);
416 static int tc_query_qdisc(const struct netdev
*netdev
);
418 static int tc_calc_cell_log(unsigned int mtu
);
419 static void tc_fill_rate(struct tc_ratespec
*rate
, uint64_t bps
, int mtu
);
420 static void tc_put_rtab(struct ofpbuf
*, uint16_t type
,
421 const struct tc_ratespec
*rate
);
422 static int tc_calc_buffer(unsigned int Bps
, int mtu
, uint64_t burst_bytes
);
424 struct netdev_linux
{
427 /* Protects all members below. */
428 struct ovs_mutex mutex
;
430 unsigned int cache_valid
;
432 bool miimon
; /* Link status of last poll. */
433 long long int miimon_interval
; /* Miimon Poll rate. Disabled if <= 0. */
434 struct timer miimon_timer
;
436 /* The following are figured out "on demand" only. They are only valid
437 * when the corresponding VALID_* bit in 'cache_valid' is set. */
439 uint8_t etheraddr
[ETH_ADDR_LEN
];
440 struct in_addr address
, netmask
;
443 unsigned int ifi_flags
;
444 long long int carrier_resets
;
445 uint32_t kbits_rate
; /* Policing data. */
446 uint32_t kbits_burst
;
447 int vport_stats_error
; /* Cached error code from vport_get_stats().
448 0 or an errno value. */
449 int netdev_mtu_error
; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
450 int ether_addr_error
; /* Cached error code from set/get etheraddr. */
451 int netdev_policing_error
; /* Cached error code from set policing. */
452 int get_features_error
; /* Cached error code from ETHTOOL_GSET. */
453 int get_ifindex_error
; /* Cached error code from SIOCGIFINDEX. */
455 enum netdev_features current
; /* Cached from ETHTOOL_GSET. */
456 enum netdev_features advertised
; /* Cached from ETHTOOL_GSET. */
457 enum netdev_features supported
; /* Cached from ETHTOOL_GSET. */
459 struct ethtool_drvinfo drvinfo
; /* Cached from ETHTOOL_GDRVINFO. */
462 /* For devices of class netdev_tap_class only. */
466 struct netdev_rxq_linux
{
467 struct netdev_rxq up
;
472 /* This is set pretty low because we probably won't learn anything from the
473 * additional log messages. */
474 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(5, 20);
476 /* Polling miimon status for all ports causes performance degradation when
477 * handling a large number of ports. If there are no devices using miimon, then
478 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
480 * Readers do not depend on this variable synchronizing with the related
481 * changes in the device miimon status, so we can use atomic_count. */
482 static atomic_count miimon_cnt
= ATOMIC_COUNT_INIT(0);
484 static void netdev_linux_run(void);
486 static int netdev_linux_do_ethtool(const char *name
, struct ethtool_cmd
*,
487 int cmd
, const char *cmd_name
);
488 static int netdev_linux_get_ipv4(const struct netdev
*, struct in_addr
*,
489 int cmd
, const char *cmd_name
);
490 static int get_flags(const struct netdev
*, unsigned int *flags
);
491 static int set_flags(const char *, unsigned int flags
);
492 static int update_flags(struct netdev_linux
*netdev
, enum netdev_flags off
,
493 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
494 OVS_REQUIRES(netdev
->mutex
);
495 static int do_get_ifindex(const char *netdev_name
);
496 static int get_ifindex(const struct netdev
*, int *ifindexp
);
497 static int do_set_addr(struct netdev
*netdev
,
498 int ioctl_nr
, const char *ioctl_name
,
499 struct in_addr addr
);
500 static int get_etheraddr(const char *netdev_name
, uint8_t ea
[ETH_ADDR_LEN
]);
501 static int set_etheraddr(const char *netdev_name
, const uint8_t[ETH_ADDR_LEN
]);
502 static int get_stats_via_netlink(const struct netdev
*, struct netdev_stats
*);
503 static int af_packet_sock(void);
504 static bool netdev_linux_miimon_enabled(void);
505 static void netdev_linux_miimon_run(void);
506 static void netdev_linux_miimon_wait(void);
507 static int netdev_linux_get_mtu__(struct netdev_linux
*netdev
, int *mtup
);
510 is_netdev_linux_class(const struct netdev_class
*netdev_class
)
512 return netdev_class
->run
== netdev_linux_run
;
516 is_tap_netdev(const struct netdev
*netdev
)
518 return netdev_get_class(netdev
) == &netdev_tap_class
;
521 static struct netdev_linux
*
522 netdev_linux_cast(const struct netdev
*netdev
)
524 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev
)));
526 return CONTAINER_OF(netdev
, struct netdev_linux
, up
);
529 static struct netdev_rxq_linux
*
530 netdev_rxq_linux_cast(const struct netdev_rxq
*rx
)
532 ovs_assert(is_netdev_linux_class(netdev_get_class(rx
->netdev
)));
533 return CONTAINER_OF(rx
, struct netdev_rxq_linux
, up
);
536 static void netdev_linux_update(struct netdev_linux
*netdev
,
537 const struct rtnetlink_link_change
*)
538 OVS_REQUIRES(netdev
->mutex
);
539 static void netdev_linux_changed(struct netdev_linux
*netdev
,
540 unsigned int ifi_flags
, unsigned int mask
)
541 OVS_REQUIRES(netdev
->mutex
);
543 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK changes, or NULL
544 * if no such socket could be created. */
545 static struct nl_sock
*
546 netdev_linux_notify_sock(void)
548 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
549 static struct nl_sock
*sock
;
551 if (ovsthread_once_start(&once
)) {
554 error
= nl_sock_create(NETLINK_ROUTE
, &sock
);
556 error
= nl_sock_join_mcgroup(sock
, RTNLGRP_LINK
);
558 nl_sock_destroy(sock
);
562 ovsthread_once_done(&once
);
569 netdev_linux_miimon_enabled(void)
571 return atomic_count_get(&miimon_cnt
) > 0;
575 netdev_linux_run(void)
577 struct nl_sock
*sock
;
580 if (netdev_linux_miimon_enabled()) {
581 netdev_linux_miimon_run();
584 sock
= netdev_linux_notify_sock();
590 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(1, 5);
591 uint64_t buf_stub
[4096 / 8];
594 ofpbuf_use_stub(&buf
, buf_stub
, sizeof buf_stub
);
595 error
= nl_sock_recv(sock
, &buf
, false);
597 struct rtnetlink_link_change change
;
599 if (rtnetlink_link_parse(&buf
, &change
)) {
600 struct netdev
*netdev_
= netdev_from_name(change
.ifname
);
601 if (netdev_
&& is_netdev_linux_class(netdev_
->netdev_class
)) {
602 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
604 ovs_mutex_lock(&netdev
->mutex
);
605 netdev_linux_update(netdev
, &change
);
606 ovs_mutex_unlock(&netdev
->mutex
);
608 netdev_close(netdev_
);
610 } else if (error
== ENOBUFS
) {
611 struct shash device_shash
;
612 struct shash_node
*node
;
616 shash_init(&device_shash
);
617 netdev_get_devices(&netdev_linux_class
, &device_shash
);
618 SHASH_FOR_EACH (node
, &device_shash
) {
619 struct netdev
*netdev_
= node
->data
;
620 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
623 ovs_mutex_lock(&netdev
->mutex
);
624 get_flags(netdev_
, &flags
);
625 netdev_linux_changed(netdev
, flags
, 0);
626 ovs_mutex_unlock(&netdev
->mutex
);
628 netdev_close(netdev_
);
630 shash_destroy(&device_shash
);
631 } else if (error
!= EAGAIN
) {
632 VLOG_WARN_RL(&rl
, "error reading or parsing netlink (%s)",
633 ovs_strerror(error
));
/* Arranges to wake up the next poll_block() when a link change notification
 * (or miimon poll deadline, if enabled) is pending. */
static void
netdev_linux_wait(void)
{
    struct nl_sock *sock;

    if (netdev_linux_miimon_enabled()) {
        netdev_linux_miimon_wait();
    }

    sock = netdev_linux_notify_sock();
    if (sock) {
        nl_sock_wait(sock, POLLIN);
    }
}
654 netdev_linux_changed(struct netdev_linux
*dev
,
655 unsigned int ifi_flags
, unsigned int mask
)
656 OVS_REQUIRES(dev
->mutex
)
658 netdev_change_seq_changed(&dev
->up
);
660 if ((dev
->ifi_flags
^ ifi_flags
) & IFF_RUNNING
) {
661 dev
->carrier_resets
++;
663 dev
->ifi_flags
= ifi_flags
;
665 dev
->cache_valid
&= mask
;
669 netdev_linux_update(struct netdev_linux
*dev
,
670 const struct rtnetlink_link_change
*change
)
671 OVS_REQUIRES(dev
->mutex
)
673 if (change
->nlmsg_type
== RTM_NEWLINK
) {
675 netdev_linux_changed(dev
, change
->ifi_flags
, VALID_DRVINFO
);
677 /* Update netdev from rtnl-change msg. */
679 dev
->mtu
= change
->mtu
;
680 dev
->cache_valid
|= VALID_MTU
;
681 dev
->netdev_mtu_error
= 0;
684 if (!eth_addr_is_zero(change
->addr
)) {
685 memcpy(dev
->etheraddr
, change
->addr
, ETH_ADDR_LEN
);
686 dev
->cache_valid
|= VALID_ETHERADDR
;
687 dev
->ether_addr_error
= 0;
690 dev
->ifindex
= change
->ifi_index
;
691 dev
->cache_valid
|= VALID_IFINDEX
;
692 dev
->get_ifindex_error
= 0;
695 netdev_linux_changed(dev
, change
->ifi_flags
, 0);
699 static struct netdev
*
700 netdev_linux_alloc(void)
702 struct netdev_linux
*netdev
= xzalloc(sizeof *netdev
);
707 netdev_linux_common_construct(struct netdev_linux
*netdev
)
709 ovs_mutex_init(&netdev
->mutex
);
712 /* Creates system and internal devices. */
714 netdev_linux_construct(struct netdev
*netdev_
)
716 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
719 netdev_linux_common_construct(netdev
);
721 error
= get_flags(&netdev
->up
, &netdev
->ifi_flags
);
722 if (error
== ENODEV
) {
723 if (netdev
->up
.netdev_class
!= &netdev_internal_class
) {
724 /* The device does not exist, so don't allow it to be opened. */
727 /* "Internal" netdevs have to be created as netdev objects before
728 * they exist in the kernel, because creating them in the kernel
729 * happens by passing a netdev object to dpif_port_add().
730 * Therefore, ignore the error. */
737 /* For most types of netdevs we open the device for each call of
738 * netdev_open(). However, this is not the case with tap devices,
739 * since it is only possible to open the device once. In this
740 * situation we share a single file descriptor, and consequently
741 * buffers, across all readers. Therefore once data is read it will
742 * be unavailable to other reads for tap devices. */
744 netdev_linux_construct_tap(struct netdev
*netdev_
)
746 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
747 static const char tap_dev
[] = "/dev/net/tun";
748 const char *name
= netdev_
->name
;
752 netdev_linux_common_construct(netdev
);
754 /* Open tap device. */
755 netdev
->tap_fd
= open(tap_dev
, O_RDWR
);
756 if (netdev
->tap_fd
< 0) {
758 VLOG_WARN("opening \"%s\" failed: %s", tap_dev
, ovs_strerror(error
));
762 /* Create tap device. */
763 ifr
.ifr_flags
= IFF_TAP
| IFF_NO_PI
;
764 ovs_strzcpy(ifr
.ifr_name
, name
, sizeof ifr
.ifr_name
);
765 if (ioctl(netdev
->tap_fd
, TUNSETIFF
, &ifr
) == -1) {
766 VLOG_WARN("%s: creating tap device failed: %s", name
,
767 ovs_strerror(errno
));
772 /* Make non-blocking. */
773 error
= set_nonblocking(netdev
->tap_fd
);
781 close(netdev
->tap_fd
);
786 netdev_linux_destruct(struct netdev
*netdev_
)
788 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
790 if (netdev
->tc
&& netdev
->tc
->ops
->tc_destroy
) {
791 netdev
->tc
->ops
->tc_destroy(netdev
->tc
);
794 if (netdev_get_class(netdev_
) == &netdev_tap_class
795 && netdev
->tap_fd
>= 0)
797 close(netdev
->tap_fd
);
800 if (netdev
->miimon_interval
> 0) {
801 atomic_count_dec(&miimon_cnt
);
804 ovs_mutex_destroy(&netdev
->mutex
);
/* Frees the storage allocated by netdev_linux_alloc(). */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    free(netdev);
}
814 static struct netdev_rxq
*
815 netdev_linux_rxq_alloc(void)
817 struct netdev_rxq_linux
*rx
= xzalloc(sizeof *rx
);
822 netdev_linux_rxq_construct(struct netdev_rxq
*rxq_
)
824 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
825 struct netdev
*netdev_
= rx
->up
.netdev
;
826 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
829 ovs_mutex_lock(&netdev
->mutex
);
830 rx
->is_tap
= is_tap_netdev(netdev_
);
832 rx
->fd
= netdev
->tap_fd
;
834 struct sockaddr_ll sll
;
836 /* Result of tcpdump -dd inbound */
837 static const struct sock_filter filt
[] = {
838 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
839 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
840 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
841 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
843 static const struct sock_fprog fprog
= {
844 ARRAY_SIZE(filt
), (struct sock_filter
*) filt
847 /* Create file descriptor. */
848 rx
->fd
= socket(PF_PACKET
, SOCK_RAW
, 0);
851 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error
));
856 if (setsockopt(rx
->fd
, SOL_PACKET
, PACKET_AUXDATA
, &val
, sizeof val
)) {
858 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
859 netdev_get_name(netdev_
), ovs_strerror(error
));
863 /* Set non-blocking mode. */
864 error
= set_nonblocking(rx
->fd
);
869 /* Get ethernet device index. */
870 error
= get_ifindex(&netdev
->up
, &ifindex
);
875 /* Bind to specific ethernet device. */
876 memset(&sll
, 0, sizeof sll
);
877 sll
.sll_family
= AF_PACKET
;
878 sll
.sll_ifindex
= ifindex
;
879 sll
.sll_protocol
= htons(ETH_P_ALL
);
880 if (bind(rx
->fd
, (struct sockaddr
*) &sll
, sizeof sll
) < 0) {
882 VLOG_ERR("%s: failed to bind raw socket (%s)",
883 netdev_get_name(netdev_
), ovs_strerror(error
));
887 /* Filter for only inbound packets. */
888 error
= setsockopt(rx
->fd
, SOL_SOCKET
, SO_ATTACH_FILTER
, &fprog
,
892 VLOG_ERR("%s: failed to attach filter (%s)",
893 netdev_get_name(netdev_
), ovs_strerror(error
));
897 ovs_mutex_unlock(&netdev
->mutex
);
905 ovs_mutex_unlock(&netdev
->mutex
);
910 netdev_linux_rxq_destruct(struct netdev_rxq
*rxq_
)
912 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
/* Frees the storage allocated by netdev_linux_rxq_alloc(). */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
    free(rx);
}
928 auxdata_to_vlan_tpid(const struct tpacket_auxdata
*aux
)
930 if (aux
->tp_status
& TP_STATUS_VLAN_TPID_VALID
) {
931 return htons(aux
->tp_vlan_tpid
);
933 return htons(ETH_TYPE_VLAN
);
938 auxdata_has_vlan_tci(const struct tpacket_auxdata
*aux
)
940 return aux
->tp_vlan_tci
|| aux
->tp_status
& TP_STATUS_VLAN_VALID
;
944 netdev_linux_rxq_recv_sock(int fd
, struct ofpbuf
*buffer
)
949 struct cmsghdr
*cmsg
;
952 char buffer
[CMSG_SPACE(sizeof(struct tpacket_auxdata
))];
956 /* Reserve headroom for a single VLAN tag */
957 ofpbuf_reserve(buffer
, VLAN_HEADER_LEN
);
958 size
= ofpbuf_tailroom(buffer
);
960 iov
.iov_base
= ofpbuf_data(buffer
);
962 msgh
.msg_name
= NULL
;
963 msgh
.msg_namelen
= 0;
966 msgh
.msg_control
= &cmsg_buffer
;
967 msgh
.msg_controllen
= sizeof cmsg_buffer
;
971 retval
= recvmsg(fd
, &msgh
, MSG_TRUNC
);
972 } while (retval
< 0 && errno
== EINTR
);
976 } else if (retval
> size
) {
980 ofpbuf_set_size(buffer
, ofpbuf_size(buffer
) + retval
);
982 for (cmsg
= CMSG_FIRSTHDR(&msgh
); cmsg
; cmsg
= CMSG_NXTHDR(&msgh
, cmsg
)) {
983 const struct tpacket_auxdata
*aux
;
985 if (cmsg
->cmsg_level
!= SOL_PACKET
986 || cmsg
->cmsg_type
!= PACKET_AUXDATA
987 || cmsg
->cmsg_len
< CMSG_LEN(sizeof(struct tpacket_auxdata
))) {
991 aux
= ALIGNED_CAST(struct tpacket_auxdata
*, CMSG_DATA(cmsg
));
992 if (auxdata_has_vlan_tci(aux
)) {
993 if (retval
< ETH_HEADER_LEN
) {
997 eth_push_vlan(buffer
, auxdata_to_vlan_tpid(aux
),
998 htons(aux
->tp_vlan_tci
));
/* Reads one packet from tap file descriptor 'fd' into the tailroom of
 * 'buffer'.  Returns 0 on success, EMSGSIZE if the frame did not fit, or
 * another positive errno value on failure. */
static int
netdev_linux_rxq_recv_tap(int fd, struct ofpbuf *buffer)
{
    ssize_t retval;
    size_t size = ofpbuf_tailroom(buffer);

    /* Restart the read if a signal interrupts it. */
    do {
        retval = read(fd, ofpbuf_data(buffer), size);
    } while (retval < 0 && errno == EINTR);

    if (retval < 0) {
        return errno;
    } else if (retval > size) {
        /* The kernel had more data than 'buffer' could hold. */
        return EMSGSIZE;
    }

    ofpbuf_set_size(buffer, ofpbuf_size(buffer) + retval);
    return 0;
}
1027 netdev_linux_rxq_recv(struct netdev_rxq
*rxq_
, struct dpif_packet
**packets
,
1030 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1031 struct netdev
*netdev
= rx
->up
.netdev
;
1032 struct dpif_packet
*packet
;
1033 struct ofpbuf
*buffer
;
1037 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
)) {
1038 mtu
= ETH_PAYLOAD_MAX
;
1041 packet
= dpif_packet_new_with_headroom(VLAN_ETH_HEADER_LEN
+ mtu
,
1042 DP_NETDEV_HEADROOM
);
1043 buffer
= &packet
->ofpbuf
;
1045 retval
= (rx
->is_tap
1046 ? netdev_linux_rxq_recv_tap(rx
->fd
, buffer
)
1047 : netdev_linux_rxq_recv_sock(rx
->fd
, buffer
));
1050 if (retval
!= EAGAIN
&& retval
!= EMSGSIZE
) {
1051 VLOG_WARN_RL(&rl
, "error receiving Ethernet packet on %s: %s",
1052 ovs_strerror(errno
), netdev_rxq_get_name(rxq_
));
1054 dpif_packet_delete(packet
);
1056 dp_packet_pad(buffer
);
1057 dpif_packet_set_dp_hash(packet
, 0);
1058 packets
[0] = packet
;
1066 netdev_linux_rxq_wait(struct netdev_rxq
*rxq_
)
1068 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1069 poll_fd_wait(rx
->fd
, POLLIN
);
1073 netdev_linux_rxq_drain(struct netdev_rxq
*rxq_
)
1075 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1078 int error
= af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_
), &ifr
,
1079 SIOCGIFTXQLEN
, "SIOCGIFTXQLEN");
1083 drain_fd(rx
->fd
, ifr
.ifr_qlen
);
1086 return drain_rcvbuf(rx
->fd
);
1090 /* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
1091 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1092 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1093 * the packet is too big or too small to transmit on the device.
1095 * The caller retains ownership of 'buffer' in all cases.
1097 * The kernel maintains a packet transmission queue, so the caller is not
1098 * expected to do additional queuing of packets. */
1100 netdev_linux_send(struct netdev
*netdev_
, int qid OVS_UNUSED
,
1101 struct dpif_packet
**pkts
, int cnt
, bool may_steal
)
1106 /* 'i' is incremented only if there's no error */
1107 for (i
= 0; i
< cnt
;) {
1108 const void *data
= ofpbuf_data(&pkts
[i
]->ofpbuf
);
1109 size_t size
= ofpbuf_size(&pkts
[i
]->ofpbuf
);
1112 if (!is_tap_netdev(netdev_
)) {
1113 /* Use our AF_PACKET socket to send to this device. */
1114 struct sockaddr_ll sll
;
1120 sock
= af_packet_sock();
1125 ifindex
= netdev_get_ifindex(netdev_
);
1130 /* We don't bother setting most fields in sockaddr_ll because the
1131 * kernel ignores them for SOCK_RAW. */
1132 memset(&sll
, 0, sizeof sll
);
1133 sll
.sll_family
= AF_PACKET
;
1134 sll
.sll_ifindex
= ifindex
;
1136 iov
.iov_base
= CONST_CAST(void *, data
);
1139 msg
.msg_name
= &sll
;
1140 msg
.msg_namelen
= sizeof sll
;
1143 msg
.msg_control
= NULL
;
1144 msg
.msg_controllen
= 0;
1147 retval
= sendmsg(sock
, &msg
, 0);
1149 /* Use the tap fd to send to this device. This is essential for
1150 * tap devices, because packets sent to a tap device with an
1151 * AF_PACKET socket will loop back to be *received* again on the
1152 * tap device. This doesn't occur on other interface types
1153 * because we attach a socket filter to the rx socket. */
1154 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1156 retval
= write(netdev
->tap_fd
, data
, size
);
1160 /* The Linux AF_PACKET implementation never blocks waiting for room
1161 * for packets, instead returning ENOBUFS. Translate this into
1162 * EAGAIN for the caller. */
1163 error
= errno
== ENOBUFS
? EAGAIN
: errno
;
1164 if (error
== EINTR
) {
1165 /* continue without incrementing 'i', i.e. retry this packet */
1169 } else if (retval
!= size
) {
1170 VLOG_WARN_RL(&rl
, "sent partial Ethernet packet (%"PRIuSIZE
" bytes"
1171 " of %"PRIuSIZE
") on %s", retval
, size
,
1172 netdev_get_name(netdev_
));
1177 /* Process the next packet in the batch */
1182 for (i
= 0; i
< cnt
; i
++) {
1183 dpif_packet_delete(pkts
[i
]);
1187 if (error
&& error
!= EAGAIN
) {
1188 VLOG_WARN_RL(&rl
, "error sending Ethernet packet on %s: %s",
1189 netdev_get_name(netdev_
), ovs_strerror(error
));
1196 /* Registers with the poll loop to wake up from the next call to poll_block()
1197 * when the packet transmission queue has sufficient room to transmit a packet
1198 * with netdev_send().
1200 * The kernel maintains a packet transmission queue, so the client is not
1201 * expected to do additional queuing of packets. Thus, this function is
1202 * unlikely to ever be used. It is included for completeness. */
1204 netdev_linux_send_wait(struct netdev
*netdev
, int qid OVS_UNUSED
)
1206 if (is_tap_netdev(netdev
)) {
1207 /* TAP device always accepts packets.*/
1208 poll_immediate_wake();
1212 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1213 * otherwise a positive errno value. */
1215 netdev_linux_set_etheraddr(struct netdev
*netdev_
,
1216 const uint8_t mac
[ETH_ADDR_LEN
])
1218 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1219 enum netdev_flags old_flags
= 0;
1222 ovs_mutex_lock(&netdev
->mutex
);
1224 if (netdev
->cache_valid
& VALID_ETHERADDR
) {
1225 error
= netdev
->ether_addr_error
;
1226 if (error
|| eth_addr_equals(netdev
->etheraddr
, mac
)) {
1229 netdev
->cache_valid
&= ~VALID_ETHERADDR
;
1232 /* Tap devices must be brought down before setting the address. */
1233 if (is_tap_netdev(netdev_
)) {
1234 update_flags(netdev
, NETDEV_UP
, 0, &old_flags
);
1236 error
= set_etheraddr(netdev_get_name(netdev_
), mac
);
1237 if (!error
|| error
== ENODEV
) {
1238 netdev
->ether_addr_error
= error
;
1239 netdev
->cache_valid
|= VALID_ETHERADDR
;
1241 memcpy(netdev
->etheraddr
, mac
, ETH_ADDR_LEN
);
1245 if (is_tap_netdev(netdev_
) && old_flags
& NETDEV_UP
) {
1246 update_flags(netdev
, 0, NETDEV_UP
, &old_flags
);
1250 ovs_mutex_unlock(&netdev
->mutex
);
1254 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1256 netdev_linux_get_etheraddr(const struct netdev
*netdev_
,
1257 uint8_t mac
[ETH_ADDR_LEN
])
1259 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1262 ovs_mutex_lock(&netdev
->mutex
);
1263 if (!(netdev
->cache_valid
& VALID_ETHERADDR
)) {
1264 netdev
->ether_addr_error
= get_etheraddr(netdev_get_name(netdev_
),
1266 netdev
->cache_valid
|= VALID_ETHERADDR
;
1269 error
= netdev
->ether_addr_error
;
1271 memcpy(mac
, netdev
->etheraddr
, ETH_ADDR_LEN
);
1273 ovs_mutex_unlock(&netdev
->mutex
);
1279 netdev_linux_get_mtu__(struct netdev_linux
*netdev
, int *mtup
)
1283 if (!(netdev
->cache_valid
& VALID_MTU
)) {
1286 netdev
->netdev_mtu_error
= af_inet_ifreq_ioctl(
1287 netdev_get_name(&netdev
->up
), &ifr
, SIOCGIFMTU
, "SIOCGIFMTU");
1288 netdev
->mtu
= ifr
.ifr_mtu
;
1289 netdev
->cache_valid
|= VALID_MTU
;
1292 error
= netdev
->netdev_mtu_error
;
1294 *mtup
= netdev
->mtu
;
1300 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1301 * in bytes, not including the hardware header; thus, this is typically 1500
1302 * bytes for Ethernet devices. */
1304 netdev_linux_get_mtu(const struct netdev
*netdev_
, int *mtup
)
1306 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1309 ovs_mutex_lock(&netdev
->mutex
);
1310 error
= netdev_linux_get_mtu__(netdev
, mtup
);
1311 ovs_mutex_unlock(&netdev
->mutex
);
1316 /* Sets the maximum size of transmitted (MTU) for given device using linux
1317 * networking ioctl interface.
1320 netdev_linux_set_mtu(const struct netdev
*netdev_
, int mtu
)
1322 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1326 ovs_mutex_lock(&netdev
->mutex
);
1327 if (netdev
->cache_valid
& VALID_MTU
) {
1328 error
= netdev
->netdev_mtu_error
;
1329 if (error
|| netdev
->mtu
== mtu
) {
1332 netdev
->cache_valid
&= ~VALID_MTU
;
1335 error
= af_inet_ifreq_ioctl(netdev_get_name(netdev_
), &ifr
,
1336 SIOCSIFMTU
, "SIOCSIFMTU");
1337 if (!error
|| error
== ENODEV
) {
1338 netdev
->netdev_mtu_error
= error
;
1339 netdev
->mtu
= ifr
.ifr_mtu
;
1340 netdev
->cache_valid
|= VALID_MTU
;
1343 ovs_mutex_unlock(&netdev
->mutex
);
1347 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1348 * On failure, returns a negative errno value. */
1350 netdev_linux_get_ifindex(const struct netdev
*netdev_
)
1352 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1355 ovs_mutex_lock(&netdev
->mutex
);
1356 error
= get_ifindex(netdev_
, &ifindex
);
1357 ovs_mutex_unlock(&netdev
->mutex
);
1359 return error
? -error
: ifindex
;
1363 netdev_linux_get_carrier(const struct netdev
*netdev_
, bool *carrier
)
1365 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1367 ovs_mutex_lock(&netdev
->mutex
);
1368 if (netdev
->miimon_interval
> 0) {
1369 *carrier
= netdev
->miimon
;
1371 *carrier
= (netdev
->ifi_flags
& IFF_RUNNING
) != 0;
1373 ovs_mutex_unlock(&netdev
->mutex
);
1378 static long long int
1379 netdev_linux_get_carrier_resets(const struct netdev
*netdev_
)
1381 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1382 long long int carrier_resets
;
1384 ovs_mutex_lock(&netdev
->mutex
);
1385 carrier_resets
= netdev
->carrier_resets
;
1386 ovs_mutex_unlock(&netdev
->mutex
);
1388 return carrier_resets
;
1392 netdev_linux_do_miimon(const char *name
, int cmd
, const char *cmd_name
,
1393 struct mii_ioctl_data
*data
)
1398 memset(&ifr
, 0, sizeof ifr
);
1399 memcpy(&ifr
.ifr_data
, data
, sizeof *data
);
1400 error
= af_inet_ifreq_ioctl(name
, &ifr
, cmd
, cmd_name
);
1401 memcpy(data
, &ifr
.ifr_data
, sizeof *data
);
1407 netdev_linux_get_miimon(const char *name
, bool *miimon
)
1409 struct mii_ioctl_data data
;
1414 memset(&data
, 0, sizeof data
);
1415 error
= netdev_linux_do_miimon(name
, SIOCGMIIPHY
, "SIOCGMIIPHY", &data
);
1417 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1418 data
.reg_num
= MII_BMSR
;
1419 error
= netdev_linux_do_miimon(name
, SIOCGMIIREG
, "SIOCGMIIREG",
1423 *miimon
= !!(data
.val_out
& BMSR_LSTATUS
);
1425 VLOG_WARN_RL(&rl
, "%s: failed to query MII", name
);
1428 struct ethtool_cmd ecmd
;
1430 VLOG_DBG_RL(&rl
, "%s: failed to query MII, falling back to ethtool",
1433 COVERAGE_INC(netdev_get_ethtool
);
1434 memset(&ecmd
, 0, sizeof ecmd
);
1435 error
= netdev_linux_do_ethtool(name
, &ecmd
, ETHTOOL_GLINK
,
1438 struct ethtool_value eval
;
1440 memcpy(&eval
, &ecmd
, sizeof eval
);
1441 *miimon
= !!eval
.data
;
1443 VLOG_WARN_RL(&rl
, "%s: ethtool link status failed", name
);
1451 netdev_linux_set_miimon_interval(struct netdev
*netdev_
,
1452 long long int interval
)
1454 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1456 ovs_mutex_lock(&netdev
->mutex
);
1457 interval
= interval
> 0 ? MAX(interval
, 100) : 0;
1458 if (netdev
->miimon_interval
!= interval
) {
1459 if (interval
&& !netdev
->miimon_interval
) {
1460 atomic_count_inc(&miimon_cnt
);
1461 } else if (!interval
&& netdev
->miimon_interval
) {
1462 atomic_count_dec(&miimon_cnt
);
1465 netdev
->miimon_interval
= interval
;
1466 timer_set_expired(&netdev
->miimon_timer
);
1468 ovs_mutex_unlock(&netdev
->mutex
);
1474 netdev_linux_miimon_run(void)
1476 struct shash device_shash
;
1477 struct shash_node
*node
;
1479 shash_init(&device_shash
);
1480 netdev_get_devices(&netdev_linux_class
, &device_shash
);
1481 SHASH_FOR_EACH (node
, &device_shash
) {
1482 struct netdev
*netdev
= node
->data
;
1483 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
1486 ovs_mutex_lock(&dev
->mutex
);
1487 if (dev
->miimon_interval
> 0 && timer_expired(&dev
->miimon_timer
)) {
1488 netdev_linux_get_miimon(dev
->up
.name
, &miimon
);
1489 if (miimon
!= dev
->miimon
) {
1490 dev
->miimon
= miimon
;
1491 netdev_linux_changed(dev
, dev
->ifi_flags
, 0);
1494 timer_set_duration(&dev
->miimon_timer
, dev
->miimon_interval
);
1496 ovs_mutex_unlock(&dev
->mutex
);
1497 netdev_close(netdev
);
1500 shash_destroy(&device_shash
);
1504 netdev_linux_miimon_wait(void)
1506 struct shash device_shash
;
1507 struct shash_node
*node
;
1509 shash_init(&device_shash
);
1510 netdev_get_devices(&netdev_linux_class
, &device_shash
);
1511 SHASH_FOR_EACH (node
, &device_shash
) {
1512 struct netdev
*netdev
= node
->data
;
1513 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
1515 ovs_mutex_lock(&dev
->mutex
);
1516 if (dev
->miimon_interval
> 0) {
1517 timer_wait(&dev
->miimon_timer
);
1519 ovs_mutex_unlock(&dev
->mutex
);
1520 netdev_close(netdev
);
1522 shash_destroy(&device_shash
);
/* Exchanges the values that 'a' and 'b' point to. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t tmp = *a;
    *a = *b;
    *b = tmp;
}
1533 /* Copies 'src' into 'dst', performing format conversion in the process.
1535 * 'src' is allowed to be misaligned. */
1537 netdev_stats_from_ovs_vport_stats(struct netdev_stats
*dst
,
1538 const struct ovs_vport_stats
*src
)
1540 dst
->rx_packets
= get_32aligned_u64(&src
->rx_packets
);
1541 dst
->tx_packets
= get_32aligned_u64(&src
->tx_packets
);
1542 dst
->rx_bytes
= get_32aligned_u64(&src
->rx_bytes
);
1543 dst
->tx_bytes
= get_32aligned_u64(&src
->tx_bytes
);
1544 dst
->rx_errors
= get_32aligned_u64(&src
->rx_errors
);
1545 dst
->tx_errors
= get_32aligned_u64(&src
->tx_errors
);
1546 dst
->rx_dropped
= get_32aligned_u64(&src
->rx_dropped
);
1547 dst
->tx_dropped
= get_32aligned_u64(&src
->tx_dropped
);
1549 dst
->collisions
= 0;
1550 dst
->rx_length_errors
= 0;
1551 dst
->rx_over_errors
= 0;
1552 dst
->rx_crc_errors
= 0;
1553 dst
->rx_frame_errors
= 0;
1554 dst
->rx_fifo_errors
= 0;
1555 dst
->rx_missed_errors
= 0;
1556 dst
->tx_aborted_errors
= 0;
1557 dst
->tx_carrier_errors
= 0;
1558 dst
->tx_fifo_errors
= 0;
1559 dst
->tx_heartbeat_errors
= 0;
1560 dst
->tx_window_errors
= 0;
1564 get_stats_via_vport__(const struct netdev
*netdev
, struct netdev_stats
*stats
)
1566 struct dpif_netlink_vport reply
;
1570 error
= dpif_netlink_vport_get(netdev_get_name(netdev
), &reply
, &buf
);
1573 } else if (!reply
.stats
) {
1578 netdev_stats_from_ovs_vport_stats(stats
, reply
.stats
);
1586 get_stats_via_vport(const struct netdev
*netdev_
,
1587 struct netdev_stats
*stats
)
1589 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1591 if (!netdev
->vport_stats_error
||
1592 !(netdev
->cache_valid
& VALID_VPORT_STAT_ERROR
)) {
1595 error
= get_stats_via_vport__(netdev_
, stats
);
1596 if (error
&& error
!= ENOENT
) {
1597 VLOG_WARN_RL(&rl
, "%s: obtaining netdev stats via vport failed "
1599 netdev_get_name(netdev_
), ovs_strerror(error
));
1601 netdev
->vport_stats_error
= error
;
1602 netdev
->cache_valid
|= VALID_VPORT_STAT_ERROR
;
1606 /* Retrieves current device stats for 'netdev-linux'. */
1608 netdev_linux_get_stats(const struct netdev
*netdev_
,
1609 struct netdev_stats
*stats
)
1611 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1612 struct netdev_stats dev_stats
;
1615 ovs_mutex_lock(&netdev
->mutex
);
1616 get_stats_via_vport(netdev_
, stats
);
1617 error
= get_stats_via_netlink(netdev_
, &dev_stats
);
1619 if (!netdev
->vport_stats_error
) {
1622 } else if (netdev
->vport_stats_error
) {
1623 /* stats not available from OVS then use netdev stats. */
1626 /* Use kernel netdev's packet and byte counts since vport's counters
1627 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1629 stats
->rx_packets
= dev_stats
.rx_packets
;
1630 stats
->rx_bytes
= dev_stats
.rx_bytes
;
1631 stats
->tx_packets
= dev_stats
.tx_packets
;
1632 stats
->tx_bytes
= dev_stats
.tx_bytes
;
1634 stats
->rx_errors
+= dev_stats
.rx_errors
;
1635 stats
->tx_errors
+= dev_stats
.tx_errors
;
1636 stats
->rx_dropped
+= dev_stats
.rx_dropped
;
1637 stats
->tx_dropped
+= dev_stats
.tx_dropped
;
1638 stats
->multicast
+= dev_stats
.multicast
;
1639 stats
->collisions
+= dev_stats
.collisions
;
1640 stats
->rx_length_errors
+= dev_stats
.rx_length_errors
;
1641 stats
->rx_over_errors
+= dev_stats
.rx_over_errors
;
1642 stats
->rx_crc_errors
+= dev_stats
.rx_crc_errors
;
1643 stats
->rx_frame_errors
+= dev_stats
.rx_frame_errors
;
1644 stats
->rx_fifo_errors
+= dev_stats
.rx_fifo_errors
;
1645 stats
->rx_missed_errors
+= dev_stats
.rx_missed_errors
;
1646 stats
->tx_aborted_errors
+= dev_stats
.tx_aborted_errors
;
1647 stats
->tx_carrier_errors
+= dev_stats
.tx_carrier_errors
;
1648 stats
->tx_fifo_errors
+= dev_stats
.tx_fifo_errors
;
1649 stats
->tx_heartbeat_errors
+= dev_stats
.tx_heartbeat_errors
;
1650 stats
->tx_window_errors
+= dev_stats
.tx_window_errors
;
1652 ovs_mutex_unlock(&netdev
->mutex
);
1657 /* Retrieves current device stats for 'netdev-tap' netdev or
1658 * netdev-internal. */
1660 netdev_tap_get_stats(const struct netdev
*netdev_
, struct netdev_stats
*stats
)
1662 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1663 struct netdev_stats dev_stats
;
1666 ovs_mutex_lock(&netdev
->mutex
);
1667 get_stats_via_vport(netdev_
, stats
);
1668 error
= get_stats_via_netlink(netdev_
, &dev_stats
);
1670 if (!netdev
->vport_stats_error
) {
1673 } else if (netdev
->vport_stats_error
) {
1674 /* Transmit and receive stats will appear to be swapped relative to the
1675 * other ports since we are the one sending the data, not a remote
1676 * computer. For consistency, we swap them back here. This does not
1677 * apply if we are getting stats from the vport layer because it always
1678 * tracks stats from the perspective of the switch. */
1681 swap_uint64(&stats
->rx_packets
, &stats
->tx_packets
);
1682 swap_uint64(&stats
->rx_bytes
, &stats
->tx_bytes
);
1683 swap_uint64(&stats
->rx_errors
, &stats
->tx_errors
);
1684 swap_uint64(&stats
->rx_dropped
, &stats
->tx_dropped
);
1685 stats
->rx_length_errors
= 0;
1686 stats
->rx_over_errors
= 0;
1687 stats
->rx_crc_errors
= 0;
1688 stats
->rx_frame_errors
= 0;
1689 stats
->rx_fifo_errors
= 0;
1690 stats
->rx_missed_errors
= 0;
1691 stats
->tx_aborted_errors
= 0;
1692 stats
->tx_carrier_errors
= 0;
1693 stats
->tx_fifo_errors
= 0;
1694 stats
->tx_heartbeat_errors
= 0;
1695 stats
->tx_window_errors
= 0;
1697 /* Use kernel netdev's packet and byte counts since vport counters
1698 * do not reflect packet counts on the wire when GSO, TSO or GRO
1700 stats
->rx_packets
= dev_stats
.tx_packets
;
1701 stats
->rx_bytes
= dev_stats
.tx_bytes
;
1702 stats
->tx_packets
= dev_stats
.rx_packets
;
1703 stats
->tx_bytes
= dev_stats
.rx_bytes
;
1705 stats
->rx_dropped
+= dev_stats
.tx_dropped
;
1706 stats
->tx_dropped
+= dev_stats
.rx_dropped
;
1708 stats
->rx_errors
+= dev_stats
.tx_errors
;
1709 stats
->tx_errors
+= dev_stats
.rx_errors
;
1711 stats
->multicast
+= dev_stats
.multicast
;
1712 stats
->collisions
+= dev_stats
.collisions
;
1714 ovs_mutex_unlock(&netdev
->mutex
);
1720 netdev_internal_get_stats(const struct netdev
*netdev_
,
1721 struct netdev_stats
*stats
)
1723 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1726 ovs_mutex_lock(&netdev
->mutex
);
1727 get_stats_via_vport(netdev_
, stats
);
1728 error
= netdev
->vport_stats_error
;
1729 ovs_mutex_unlock(&netdev
->mutex
);
1735 netdev_linux_read_features(struct netdev_linux
*netdev
)
1737 struct ethtool_cmd ecmd
;
1741 if (netdev
->cache_valid
& VALID_FEATURES
) {
1745 COVERAGE_INC(netdev_get_ethtool
);
1746 memset(&ecmd
, 0, sizeof ecmd
);
1747 error
= netdev_linux_do_ethtool(netdev
->up
.name
, &ecmd
,
1748 ETHTOOL_GSET
, "ETHTOOL_GSET");
1753 /* Supported features. */
1754 netdev
->supported
= 0;
1755 if (ecmd
.supported
& SUPPORTED_10baseT_Half
) {
1756 netdev
->supported
|= NETDEV_F_10MB_HD
;
1758 if (ecmd
.supported
& SUPPORTED_10baseT_Full
) {
1759 netdev
->supported
|= NETDEV_F_10MB_FD
;
1761 if (ecmd
.supported
& SUPPORTED_100baseT_Half
) {
1762 netdev
->supported
|= NETDEV_F_100MB_HD
;
1764 if (ecmd
.supported
& SUPPORTED_100baseT_Full
) {
1765 netdev
->supported
|= NETDEV_F_100MB_FD
;
1767 if (ecmd
.supported
& SUPPORTED_1000baseT_Half
) {
1768 netdev
->supported
|= NETDEV_F_1GB_HD
;
1770 if (ecmd
.supported
& SUPPORTED_1000baseT_Full
) {
1771 netdev
->supported
|= NETDEV_F_1GB_FD
;
1773 if (ecmd
.supported
& SUPPORTED_10000baseT_Full
) {
1774 netdev
->supported
|= NETDEV_F_10GB_FD
;
1776 if (ecmd
.supported
& SUPPORTED_TP
) {
1777 netdev
->supported
|= NETDEV_F_COPPER
;
1779 if (ecmd
.supported
& SUPPORTED_FIBRE
) {
1780 netdev
->supported
|= NETDEV_F_FIBER
;
1782 if (ecmd
.supported
& SUPPORTED_Autoneg
) {
1783 netdev
->supported
|= NETDEV_F_AUTONEG
;
1785 if (ecmd
.supported
& SUPPORTED_Pause
) {
1786 netdev
->supported
|= NETDEV_F_PAUSE
;
1788 if (ecmd
.supported
& SUPPORTED_Asym_Pause
) {
1789 netdev
->supported
|= NETDEV_F_PAUSE_ASYM
;
1792 /* Advertised features. */
1793 netdev
->advertised
= 0;
1794 if (ecmd
.advertising
& ADVERTISED_10baseT_Half
) {
1795 netdev
->advertised
|= NETDEV_F_10MB_HD
;
1797 if (ecmd
.advertising
& ADVERTISED_10baseT_Full
) {
1798 netdev
->advertised
|= NETDEV_F_10MB_FD
;
1800 if (ecmd
.advertising
& ADVERTISED_100baseT_Half
) {
1801 netdev
->advertised
|= NETDEV_F_100MB_HD
;
1803 if (ecmd
.advertising
& ADVERTISED_100baseT_Full
) {
1804 netdev
->advertised
|= NETDEV_F_100MB_FD
;
1806 if (ecmd
.advertising
& ADVERTISED_1000baseT_Half
) {
1807 netdev
->advertised
|= NETDEV_F_1GB_HD
;
1809 if (ecmd
.advertising
& ADVERTISED_1000baseT_Full
) {
1810 netdev
->advertised
|= NETDEV_F_1GB_FD
;
1812 if (ecmd
.advertising
& ADVERTISED_10000baseT_Full
) {
1813 netdev
->advertised
|= NETDEV_F_10GB_FD
;
1815 if (ecmd
.advertising
& ADVERTISED_TP
) {
1816 netdev
->advertised
|= NETDEV_F_COPPER
;
1818 if (ecmd
.advertising
& ADVERTISED_FIBRE
) {
1819 netdev
->advertised
|= NETDEV_F_FIBER
;
1821 if (ecmd
.advertising
& ADVERTISED_Autoneg
) {
1822 netdev
->advertised
|= NETDEV_F_AUTONEG
;
1824 if (ecmd
.advertising
& ADVERTISED_Pause
) {
1825 netdev
->advertised
|= NETDEV_F_PAUSE
;
1827 if (ecmd
.advertising
& ADVERTISED_Asym_Pause
) {
1828 netdev
->advertised
|= NETDEV_F_PAUSE_ASYM
;
1831 /* Current settings. */
1833 if (speed
== SPEED_10
) {
1834 netdev
->current
= ecmd
.duplex
? NETDEV_F_10MB_FD
: NETDEV_F_10MB_HD
;
1835 } else if (speed
== SPEED_100
) {
1836 netdev
->current
= ecmd
.duplex
? NETDEV_F_100MB_FD
: NETDEV_F_100MB_HD
;
1837 } else if (speed
== SPEED_1000
) {
1838 netdev
->current
= ecmd
.duplex
? NETDEV_F_1GB_FD
: NETDEV_F_1GB_HD
;
1839 } else if (speed
== SPEED_10000
) {
1840 netdev
->current
= NETDEV_F_10GB_FD
;
1841 } else if (speed
== 40000) {
1842 netdev
->current
= NETDEV_F_40GB_FD
;
1843 } else if (speed
== 100000) {
1844 netdev
->current
= NETDEV_F_100GB_FD
;
1845 } else if (speed
== 1000000) {
1846 netdev
->current
= NETDEV_F_1TB_FD
;
1848 netdev
->current
= 0;
1851 if (ecmd
.port
== PORT_TP
) {
1852 netdev
->current
|= NETDEV_F_COPPER
;
1853 } else if (ecmd
.port
== PORT_FIBRE
) {
1854 netdev
->current
|= NETDEV_F_FIBER
;
1858 netdev
->current
|= NETDEV_F_AUTONEG
;
1862 netdev
->cache_valid
|= VALID_FEATURES
;
1863 netdev
->get_features_error
= error
;
1866 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
1867 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
1868 * Returns 0 if successful, otherwise a positive errno value. */
1870 netdev_linux_get_features(const struct netdev
*netdev_
,
1871 enum netdev_features
*current
,
1872 enum netdev_features
*advertised
,
1873 enum netdev_features
*supported
,
1874 enum netdev_features
*peer
)
1876 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1879 ovs_mutex_lock(&netdev
->mutex
);
1880 netdev_linux_read_features(netdev
);
1881 if (!netdev
->get_features_error
) {
1882 *current
= netdev
->current
;
1883 *advertised
= netdev
->advertised
;
1884 *supported
= netdev
->supported
;
1885 *peer
= 0; /* XXX */
1887 error
= netdev
->get_features_error
;
1888 ovs_mutex_unlock(&netdev
->mutex
);
1893 /* Set the features advertised by 'netdev' to 'advertise'. */
1895 netdev_linux_set_advertisements(struct netdev
*netdev_
,
1896 enum netdev_features advertise
)
1898 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1899 struct ethtool_cmd ecmd
;
1902 ovs_mutex_lock(&netdev
->mutex
);
1904 COVERAGE_INC(netdev_get_ethtool
);
1905 memset(&ecmd
, 0, sizeof ecmd
);
1906 error
= netdev_linux_do_ethtool(netdev_get_name(netdev_
), &ecmd
,
1907 ETHTOOL_GSET
, "ETHTOOL_GSET");
1912 ecmd
.advertising
= 0;
1913 if (advertise
& NETDEV_F_10MB_HD
) {
1914 ecmd
.advertising
|= ADVERTISED_10baseT_Half
;
1916 if (advertise
& NETDEV_F_10MB_FD
) {
1917 ecmd
.advertising
|= ADVERTISED_10baseT_Full
;
1919 if (advertise
& NETDEV_F_100MB_HD
) {
1920 ecmd
.advertising
|= ADVERTISED_100baseT_Half
;
1922 if (advertise
& NETDEV_F_100MB_FD
) {
1923 ecmd
.advertising
|= ADVERTISED_100baseT_Full
;
1925 if (advertise
& NETDEV_F_1GB_HD
) {
1926 ecmd
.advertising
|= ADVERTISED_1000baseT_Half
;
1928 if (advertise
& NETDEV_F_1GB_FD
) {
1929 ecmd
.advertising
|= ADVERTISED_1000baseT_Full
;
1931 if (advertise
& NETDEV_F_10GB_FD
) {
1932 ecmd
.advertising
|= ADVERTISED_10000baseT_Full
;
1934 if (advertise
& NETDEV_F_COPPER
) {
1935 ecmd
.advertising
|= ADVERTISED_TP
;
1937 if (advertise
& NETDEV_F_FIBER
) {
1938 ecmd
.advertising
|= ADVERTISED_FIBRE
;
1940 if (advertise
& NETDEV_F_AUTONEG
) {
1941 ecmd
.advertising
|= ADVERTISED_Autoneg
;
1943 if (advertise
& NETDEV_F_PAUSE
) {
1944 ecmd
.advertising
|= ADVERTISED_Pause
;
1946 if (advertise
& NETDEV_F_PAUSE_ASYM
) {
1947 ecmd
.advertising
|= ADVERTISED_Asym_Pause
;
1949 COVERAGE_INC(netdev_set_ethtool
);
1950 error
= netdev_linux_do_ethtool(netdev_get_name(netdev_
), &ecmd
,
1951 ETHTOOL_SSET
, "ETHTOOL_SSET");
1954 ovs_mutex_unlock(&netdev
->mutex
);
1958 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
1959 * successful, otherwise a positive errno value. */
1961 netdev_linux_set_policing(struct netdev
*netdev_
,
1962 uint32_t kbits_rate
, uint32_t kbits_burst
)
1964 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1965 const char *netdev_name
= netdev_get_name(netdev_
);
1968 kbits_burst
= (!kbits_rate
? 0 /* Force to 0 if no rate specified. */
1969 : !kbits_burst
? 1000 /* Default to 1000 kbits if 0. */
1970 : kbits_burst
); /* Stick with user-specified value. */
1972 ovs_mutex_lock(&netdev
->mutex
);
1973 if (netdev
->cache_valid
& VALID_POLICING
) {
1974 error
= netdev
->netdev_policing_error
;
1975 if (error
|| (netdev
->kbits_rate
== kbits_rate
&&
1976 netdev
->kbits_burst
== kbits_burst
)) {
1977 /* Assume that settings haven't changed since we last set them. */
1980 netdev
->cache_valid
&= ~VALID_POLICING
;
1983 COVERAGE_INC(netdev_set_policing
);
1984 /* Remove any existing ingress qdisc. */
1985 error
= tc_add_del_ingress_qdisc(netdev_
, false);
1987 VLOG_WARN_RL(&rl
, "%s: removing policing failed: %s",
1988 netdev_name
, ovs_strerror(error
));
1993 error
= tc_add_del_ingress_qdisc(netdev_
, true);
1995 VLOG_WARN_RL(&rl
, "%s: adding policing qdisc failed: %s",
1996 netdev_name
, ovs_strerror(error
));
2000 error
= tc_add_policer(netdev_
, kbits_rate
, kbits_burst
);
2002 VLOG_WARN_RL(&rl
, "%s: adding policing action failed: %s",
2003 netdev_name
, ovs_strerror(error
));
2008 netdev
->kbits_rate
= kbits_rate
;
2009 netdev
->kbits_burst
= kbits_burst
;
2012 if (!error
|| error
== ENODEV
) {
2013 netdev
->netdev_policing_error
= error
;
2014 netdev
->cache_valid
|= VALID_POLICING
;
2016 ovs_mutex_unlock(&netdev
->mutex
);
2021 netdev_linux_get_qos_types(const struct netdev
*netdev OVS_UNUSED
,
2024 const struct tc_ops
*const *opsp
;
2026 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2027 const struct tc_ops
*ops
= *opsp
;
2028 if (ops
->tc_install
&& ops
->ovs_name
[0] != '\0') {
2029 sset_add(types
, ops
->ovs_name
);
2035 static const struct tc_ops
*
2036 tc_lookup_ovs_name(const char *name
)
2038 const struct tc_ops
*const *opsp
;
2040 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2041 const struct tc_ops
*ops
= *opsp
;
2042 if (!strcmp(name
, ops
->ovs_name
)) {
2049 static const struct tc_ops
*
2050 tc_lookup_linux_name(const char *name
)
2052 const struct tc_ops
*const *opsp
;
2054 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2055 const struct tc_ops
*ops
= *opsp
;
2056 if (ops
->linux_name
&& !strcmp(name
, ops
->linux_name
)) {
2063 static struct tc_queue
*
2064 tc_find_queue__(const struct netdev
*netdev_
, unsigned int queue_id
,
2067 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2068 struct tc_queue
*queue
;
2070 HMAP_FOR_EACH_IN_BUCKET (queue
, hmap_node
, hash
, &netdev
->tc
->queues
) {
2071 if (queue
->queue_id
== queue_id
) {
/* Convenience wrapper around tc_find_queue__() that hashes 'queue_id'. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
}
2085 netdev_linux_get_qos_capabilities(const struct netdev
*netdev OVS_UNUSED
,
2087 struct netdev_qos_capabilities
*caps
)
2089 const struct tc_ops
*ops
= tc_lookup_ovs_name(type
);
2093 caps
->n_queues
= ops
->n_queues
;
2098 netdev_linux_get_qos(const struct netdev
*netdev_
,
2099 const char **typep
, struct smap
*details
)
2101 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2104 ovs_mutex_lock(&netdev
->mutex
);
2105 error
= tc_query_qdisc(netdev_
);
2107 *typep
= netdev
->tc
->ops
->ovs_name
;
2108 error
= (netdev
->tc
->ops
->qdisc_get
2109 ? netdev
->tc
->ops
->qdisc_get(netdev_
, details
)
2112 ovs_mutex_unlock(&netdev
->mutex
);
2118 netdev_linux_set_qos(struct netdev
*netdev_
,
2119 const char *type
, const struct smap
*details
)
2121 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2122 const struct tc_ops
*new_ops
;
2125 new_ops
= tc_lookup_ovs_name(type
);
2126 if (!new_ops
|| !new_ops
->tc_install
) {
2130 ovs_mutex_lock(&netdev
->mutex
);
2131 error
= tc_query_qdisc(netdev_
);
2136 if (new_ops
== netdev
->tc
->ops
) {
2137 error
= new_ops
->qdisc_set
? new_ops
->qdisc_set(netdev_
, details
) : 0;
2139 /* Delete existing qdisc. */
2140 error
= tc_del_qdisc(netdev_
);
2144 ovs_assert(netdev
->tc
== NULL
);
2146 /* Install new qdisc. */
2147 error
= new_ops
->tc_install(netdev_
, details
);
2148 ovs_assert((error
== 0) == (netdev
->tc
!= NULL
));
2152 ovs_mutex_unlock(&netdev
->mutex
);
2157 netdev_linux_get_queue(const struct netdev
*netdev_
,
2158 unsigned int queue_id
, struct smap
*details
)
2160 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2163 ovs_mutex_lock(&netdev
->mutex
);
2164 error
= tc_query_qdisc(netdev_
);
2166 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2168 ? netdev
->tc
->ops
->class_get(netdev_
, queue
, details
)
2171 ovs_mutex_unlock(&netdev
->mutex
);
2177 netdev_linux_set_queue(struct netdev
*netdev_
,
2178 unsigned int queue_id
, const struct smap
*details
)
2180 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2183 ovs_mutex_lock(&netdev
->mutex
);
2184 error
= tc_query_qdisc(netdev_
);
2186 error
= (queue_id
< netdev
->tc
->ops
->n_queues
2187 && netdev
->tc
->ops
->class_set
2188 ? netdev
->tc
->ops
->class_set(netdev_
, queue_id
, details
)
2191 ovs_mutex_unlock(&netdev
->mutex
);
2197 netdev_linux_delete_queue(struct netdev
*netdev_
, unsigned int queue_id
)
2199 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2202 ovs_mutex_lock(&netdev
->mutex
);
2203 error
= tc_query_qdisc(netdev_
);
2205 if (netdev
->tc
->ops
->class_delete
) {
2206 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2208 ? netdev
->tc
->ops
->class_delete(netdev_
, queue
)
2214 ovs_mutex_unlock(&netdev
->mutex
);
2220 netdev_linux_get_queue_stats(const struct netdev
*netdev_
,
2221 unsigned int queue_id
,
2222 struct netdev_queue_stats
*stats
)
2224 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2227 ovs_mutex_lock(&netdev
->mutex
);
2228 error
= tc_query_qdisc(netdev_
);
2230 if (netdev
->tc
->ops
->class_get_stats
) {
2231 const struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2233 stats
->created
= queue
->created
;
2234 error
= netdev
->tc
->ops
->class_get_stats(netdev_
, queue
,
2243 ovs_mutex_unlock(&netdev
->mutex
);
2248 struct queue_dump_state
{
2249 struct nl_dump dump
;
2254 start_queue_dump(const struct netdev
*netdev
, struct queue_dump_state
*state
)
2256 struct ofpbuf request
;
2257 struct tcmsg
*tcmsg
;
2259 tcmsg
= tc_make_request(netdev
, RTM_GETTCLASS
, 0, &request
);
2263 tcmsg
->tcm_parent
= 0;
2264 nl_dump_start(&state
->dump
, NETLINK_ROUTE
, &request
);
2265 ofpbuf_uninit(&request
);
2267 ofpbuf_init(&state
->buf
, NL_DUMP_BUFSIZE
);
2272 finish_queue_dump(struct queue_dump_state
*state
)
2274 ofpbuf_uninit(&state
->buf
);
2275 return nl_dump_done(&state
->dump
);
/* Iteration state for netdev_linux_queue_dump_{start,next,done}(). */
struct netdev_linux_queue_state {
    unsigned int *queues;  /* Snapshot of queue ids, taken at dump start. */
    size_t cur_queue;      /* Index of next queue id to visit. */
    size_t n_queues;       /* Number of elements in 'queues'. */
};
2285 netdev_linux_queue_dump_start(const struct netdev
*netdev_
, void **statep
)
2287 const struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2290 ovs_mutex_lock(&netdev
->mutex
);
2291 error
= tc_query_qdisc(netdev_
);
2293 if (netdev
->tc
->ops
->class_get
) {
2294 struct netdev_linux_queue_state
*state
;
2295 struct tc_queue
*queue
;
2298 *statep
= state
= xmalloc(sizeof *state
);
2299 state
->n_queues
= hmap_count(&netdev
->tc
->queues
);
2300 state
->cur_queue
= 0;
2301 state
->queues
= xmalloc(state
->n_queues
* sizeof *state
->queues
);
2304 HMAP_FOR_EACH (queue
, hmap_node
, &netdev
->tc
->queues
) {
2305 state
->queues
[i
++] = queue
->queue_id
;
2311 ovs_mutex_unlock(&netdev
->mutex
);
2317 netdev_linux_queue_dump_next(const struct netdev
*netdev_
, void *state_
,
2318 unsigned int *queue_idp
, struct smap
*details
)
2320 const struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2321 struct netdev_linux_queue_state
*state
= state_
;
2324 ovs_mutex_lock(&netdev
->mutex
);
2325 while (state
->cur_queue
< state
->n_queues
) {
2326 unsigned int queue_id
= state
->queues
[state
->cur_queue
++];
2327 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2330 *queue_idp
= queue_id
;
2331 error
= netdev
->tc
->ops
->class_get(netdev_
, queue
, details
);
2335 ovs_mutex_unlock(&netdev
->mutex
);
2341 netdev_linux_queue_dump_done(const struct netdev
*netdev OVS_UNUSED
,
2344 struct netdev_linux_queue_state
*state
= state_
;
2346 free(state
->queues
);
2352 netdev_linux_dump_queue_stats(const struct netdev
*netdev_
,
2353 netdev_dump_queue_stats_cb
*cb
, void *aux
)
2355 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2358 ovs_mutex_lock(&netdev
->mutex
);
2359 error
= tc_query_qdisc(netdev_
);
2361 struct queue_dump_state state
;
2363 if (!netdev
->tc
->ops
->class_dump_stats
) {
2365 } else if (!start_queue_dump(netdev_
, &state
)) {
2371 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
2372 retval
= netdev
->tc
->ops
->class_dump_stats(netdev_
, &msg
,
2379 retval
= finish_queue_dump(&state
);
2385 ovs_mutex_unlock(&netdev
->mutex
);
2391 netdev_linux_get_in4(const struct netdev
*netdev_
,
2392 struct in_addr
*address
, struct in_addr
*netmask
)
2394 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2397 ovs_mutex_lock(&netdev
->mutex
);
2398 if (!(netdev
->cache_valid
& VALID_IN4
)) {
2399 error
= netdev_linux_get_ipv4(netdev_
, &netdev
->address
,
2400 SIOCGIFADDR
, "SIOCGIFADDR");
2402 error
= netdev_linux_get_ipv4(netdev_
, &netdev
->netmask
,
2403 SIOCGIFNETMASK
, "SIOCGIFNETMASK");
2405 netdev
->cache_valid
|= VALID_IN4
;
2413 if (netdev
->address
.s_addr
!= INADDR_ANY
) {
2414 *address
= netdev
->address
;
2415 *netmask
= netdev
->netmask
;
2417 error
= EADDRNOTAVAIL
;
2420 ovs_mutex_unlock(&netdev
->mutex
);
2426 netdev_linux_set_in4(struct netdev
*netdev_
, struct in_addr address
,
2427 struct in_addr netmask
)
2429 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2432 ovs_mutex_lock(&netdev
->mutex
);
2433 error
= do_set_addr(netdev_
, SIOCSIFADDR
, "SIOCSIFADDR", address
);
2435 netdev
->cache_valid
|= VALID_IN4
;
2436 netdev
->address
= address
;
2437 netdev
->netmask
= netmask
;
2438 if (address
.s_addr
!= INADDR_ANY
) {
2439 error
= do_set_addr(netdev_
, SIOCSIFNETMASK
,
2440 "SIOCSIFNETMASK", netmask
);
2443 ovs_mutex_unlock(&netdev
->mutex
);
/* Parses one line of /proc/net/if_inet6: 32 hex digits of IPv6 address
 * followed by index, prefix length, scope, flags, and the interface name.
 * Stores the address in '*in6' and the name in 'ifname' (at most 16 bytes
 * plus a null terminator).  Returns true on a successful parse. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *s6 = in6->s6_addr;
#define X8 "%2"SCNx8
    return ovs_scan(line,
                    " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                    "%*x %*x %*x %*x %16s\n",
                    &s6[0], &s6[1], &s6[2], &s6[3],
                    &s6[4], &s6[5], &s6[6], &s6[7],
                    &s6[8], &s6[9], &s6[10], &s6[11],
                    &s6[12], &s6[13], &s6[14], &s6[15],
                    ifname);
#undef X8
}
2464 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2465 * 'in6' is non-null) and returns true. Otherwise, returns false. */
2467 netdev_linux_get_in6(const struct netdev
*netdev_
, struct in6_addr
*in6
)
2469 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2471 ovs_mutex_lock(&netdev
->mutex
);
2472 if (!(netdev
->cache_valid
& VALID_IN6
)) {
2476 netdev
->in6
= in6addr_any
;
2478 file
= fopen("/proc/net/if_inet6", "r");
2480 const char *name
= netdev_get_name(netdev_
);
2481 while (fgets(line
, sizeof line
, file
)) {
2482 struct in6_addr in6_tmp
;
2483 char ifname
[16 + 1];
2484 if (parse_if_inet6_line(line
, &in6_tmp
, ifname
)
2485 && !strcmp(name
, ifname
))
2487 netdev
->in6
= in6_tmp
;
2493 netdev
->cache_valid
|= VALID_IN6
;
2496 ovs_mutex_unlock(&netdev
->mutex
);
/* Fills '*sa' with an AF_INET sockaddr holding 'addr' (port zero).  Used to
 * populate the sockaddr members of ifreq/rtentry before ioctl calls. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin;
    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr = addr;
    sin.sin_port = 0;

    /* Zero the whole sockaddr first, since sizeof sin may be smaller than
     * sizeof *sa. */
    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
2515 do_set_addr(struct netdev
*netdev
,
2516 int ioctl_nr
, const char *ioctl_name
, struct in_addr addr
)
2520 make_in4_sockaddr(&ifr
.ifr_addr
, addr
);
2521 return af_inet_ifreq_ioctl(netdev_get_name(netdev
), &ifr
, ioctl_nr
,
2525 /* Adds 'router' as a default IP gateway. */
2527 netdev_linux_add_router(struct netdev
*netdev OVS_UNUSED
, struct in_addr router
)
2529 struct in_addr any
= { INADDR_ANY
};
2533 memset(&rt
, 0, sizeof rt
);
2534 make_in4_sockaddr(&rt
.rt_dst
, any
);
2535 make_in4_sockaddr(&rt
.rt_gateway
, router
);
2536 make_in4_sockaddr(&rt
.rt_genmask
, any
);
2537 rt
.rt_flags
= RTF_UP
| RTF_GATEWAY
;
2538 error
= af_inet_ioctl(SIOCADDRT
, &rt
);
2540 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error
));
2546 netdev_linux_get_next_hop(const struct in_addr
*host
, struct in_addr
*next_hop
,
2549 static const char fn
[] = "/proc/net/route";
2554 *netdev_name
= NULL
;
2555 stream
= fopen(fn
, "r");
2556 if (stream
== NULL
) {
2557 VLOG_WARN_RL(&rl
, "%s: open failed: %s", fn
, ovs_strerror(errno
));
2562 while (fgets(line
, sizeof line
, stream
)) {
2565 ovs_be32 dest
, gateway
, mask
;
2566 int refcnt
, metric
, mtu
;
2567 unsigned int flags
, use
, window
, irtt
;
2570 "%16s %"SCNx32
" %"SCNx32
" %04X %d %u %d %"SCNx32
2572 iface
, &dest
, &gateway
, &flags
, &refcnt
,
2573 &use
, &metric
, &mask
, &mtu
, &window
, &irtt
)) {
2574 VLOG_WARN_RL(&rl
, "%s: could not parse line %d: %s",
2578 if (!(flags
& RTF_UP
)) {
2579 /* Skip routes that aren't up. */
2583 /* The output of 'dest', 'mask', and 'gateway' were given in
2584 * network byte order, so we don't need need any endian
2585 * conversions here. */
2586 if ((dest
& mask
) == (host
->s_addr
& mask
)) {
2588 /* The host is directly reachable. */
2589 next_hop
->s_addr
= 0;
2591 /* To reach the host, we must go through a gateway. */
2592 next_hop
->s_addr
= gateway
;
2594 *netdev_name
= xstrdup(iface
);
2606 netdev_linux_get_status(const struct netdev
*netdev_
, struct smap
*smap
)
2608 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2611 ovs_mutex_lock(&netdev
->mutex
);
2612 if (!(netdev
->cache_valid
& VALID_DRVINFO
)) {
2613 struct ethtool_cmd
*cmd
= (struct ethtool_cmd
*) &netdev
->drvinfo
;
2615 COVERAGE_INC(netdev_get_ethtool
);
2616 memset(&netdev
->drvinfo
, 0, sizeof netdev
->drvinfo
);
2617 error
= netdev_linux_do_ethtool(netdev
->up
.name
,
2620 "ETHTOOL_GDRVINFO");
2622 netdev
->cache_valid
|= VALID_DRVINFO
;
2627 smap_add(smap
, "driver_name", netdev
->drvinfo
.driver
);
2628 smap_add(smap
, "driver_version", netdev
->drvinfo
.version
);
2629 smap_add(smap
, "firmware_version", netdev
->drvinfo
.fw_version
);
2631 ovs_mutex_unlock(&netdev
->mutex
);
2637 netdev_internal_get_status(const struct netdev
*netdev OVS_UNUSED
,
2640 smap_add(smap
, "driver_name", "openvswitch");
2644 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2645 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
2646 * returns 0. Otherwise, it returns a positive errno value; in particular,
2647 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
2649 netdev_linux_arp_lookup(const struct netdev
*netdev
,
2650 ovs_be32 ip
, uint8_t mac
[ETH_ADDR_LEN
])
2653 struct sockaddr_in sin
;
2656 memset(&r
, 0, sizeof r
);
2657 memset(&sin
, 0, sizeof sin
);
2658 sin
.sin_family
= AF_INET
;
2659 sin
.sin_addr
.s_addr
= ip
;
2661 memcpy(&r
.arp_pa
, &sin
, sizeof sin
);
2662 r
.arp_ha
.sa_family
= ARPHRD_ETHER
;
2664 ovs_strzcpy(r
.arp_dev
, netdev_get_name(netdev
), sizeof r
.arp_dev
);
2665 COVERAGE_INC(netdev_arp_lookup
);
2666 retval
= af_inet_ioctl(SIOCGARP
, &r
);
2668 memcpy(mac
, r
.arp_ha
.sa_data
, ETH_ADDR_LEN
);
2669 } else if (retval
!= ENXIO
) {
2670 VLOG_WARN_RL(&rl
, "%s: could not look up ARP entry for "IP_FMT
": %s",
2671 netdev_get_name(netdev
), IP_ARGS(ip
),
2672 ovs_strerror(retval
));
2678 nd_to_iff_flags(enum netdev_flags nd
)
2681 if (nd
& NETDEV_UP
) {
2684 if (nd
& NETDEV_PROMISC
) {
2687 if (nd
& NETDEV_LOOPBACK
) {
2688 iff
|= IFF_LOOPBACK
;
2694 iff_to_nd_flags(int iff
)
2696 enum netdev_flags nd
= 0;
2700 if (iff
& IFF_PROMISC
) {
2701 nd
|= NETDEV_PROMISC
;
2703 if (iff
& IFF_LOOPBACK
) {
2704 nd
|= NETDEV_LOOPBACK
;
2710 update_flags(struct netdev_linux
*netdev
, enum netdev_flags off
,
2711 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
2712 OVS_REQUIRES(netdev
->mutex
)
2714 int old_flags
, new_flags
;
2717 old_flags
= netdev
->ifi_flags
;
2718 *old_flagsp
= iff_to_nd_flags(old_flags
);
2719 new_flags
= (old_flags
& ~nd_to_iff_flags(off
)) | nd_to_iff_flags(on
);
2720 if (new_flags
!= old_flags
) {
2721 error
= set_flags(netdev_get_name(&netdev
->up
), new_flags
);
2722 get_flags(&netdev
->up
, &netdev
->ifi_flags
);
2729 netdev_linux_update_flags(struct netdev
*netdev_
, enum netdev_flags off
,
2730 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
2732 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2735 ovs_mutex_lock(&netdev
->mutex
);
2736 error
= update_flags(netdev
, off
, on
, old_flagsp
);
2737 ovs_mutex_unlock(&netdev
->mutex
);
2742 #define NETDEV_LINUX_CLASS(NAME, CONSTRUCT, GET_STATS, \
2743 GET_FEATURES, GET_STATUS) \
2749 netdev_linux_wait, \
2751 netdev_linux_alloc, \
2753 netdev_linux_destruct, \
2754 netdev_linux_dealloc, \
2755 NULL, /* get_config */ \
2756 NULL, /* set_config */ \
2757 NULL, /* get_tunnel_config */ \
2758 NULL, /* get_numa_id */ \
2759 NULL, /* set_multiq */ \
2761 netdev_linux_send, \
2762 netdev_linux_send_wait, \
2764 netdev_linux_set_etheraddr, \
2765 netdev_linux_get_etheraddr, \
2766 netdev_linux_get_mtu, \
2767 netdev_linux_set_mtu, \
2768 netdev_linux_get_ifindex, \
2769 netdev_linux_get_carrier, \
2770 netdev_linux_get_carrier_resets, \
2771 netdev_linux_set_miimon_interval, \
2775 netdev_linux_set_advertisements, \
2777 netdev_linux_set_policing, \
2778 netdev_linux_get_qos_types, \
2779 netdev_linux_get_qos_capabilities, \
2780 netdev_linux_get_qos, \
2781 netdev_linux_set_qos, \
2782 netdev_linux_get_queue, \
2783 netdev_linux_set_queue, \
2784 netdev_linux_delete_queue, \
2785 netdev_linux_get_queue_stats, \
2786 netdev_linux_queue_dump_start, \
2787 netdev_linux_queue_dump_next, \
2788 netdev_linux_queue_dump_done, \
2789 netdev_linux_dump_queue_stats, \
2791 netdev_linux_get_in4, \
2792 netdev_linux_set_in4, \
2793 netdev_linux_get_in6, \
2794 netdev_linux_add_router, \
2795 netdev_linux_get_next_hop, \
2797 netdev_linux_arp_lookup, \
2799 netdev_linux_update_flags, \
2801 netdev_linux_rxq_alloc, \
2802 netdev_linux_rxq_construct, \
2803 netdev_linux_rxq_destruct, \
2804 netdev_linux_rxq_dealloc, \
2805 netdev_linux_rxq_recv, \
2806 netdev_linux_rxq_wait, \
2807 netdev_linux_rxq_drain, \
2810 const struct netdev_class netdev_linux_class
=
2813 netdev_linux_construct
,
2814 netdev_linux_get_stats
,
2815 netdev_linux_get_features
,
2816 netdev_linux_get_status
);
2818 const struct netdev_class netdev_tap_class
=
2821 netdev_linux_construct_tap
,
2822 netdev_tap_get_stats
,
2823 netdev_linux_get_features
,
2824 netdev_linux_get_status
);
2826 const struct netdev_class netdev_internal_class
=
2829 netdev_linux_construct
,
2830 netdev_internal_get_stats
,
2831 NULL
, /* get_features */
2832 netdev_internal_get_status
);
2834 /* HTB traffic control class. */
2836 #define HTB_N_QUEUES 0xf000
2840 unsigned int max_rate
; /* In bytes/s. */
2844 struct tc_queue tc_queue
;
2845 unsigned int min_rate
; /* In bytes/s. */
2846 unsigned int max_rate
; /* In bytes/s. */
2847 unsigned int burst
; /* In bytes. */
2848 unsigned int priority
; /* Lower values are higher priorities. */
2852 htb_get__(const struct netdev
*netdev_
)
2854 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2855 return CONTAINER_OF(netdev
->tc
, struct htb
, tc
);
2859 htb_install__(struct netdev
*netdev_
, uint64_t max_rate
)
2861 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2864 htb
= xmalloc(sizeof *htb
);
2865 tc_init(&htb
->tc
, &tc_ops_htb
);
2866 htb
->max_rate
= max_rate
;
2868 netdev
->tc
= &htb
->tc
;
2871 /* Create an HTB qdisc.
2873 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2875 htb_setup_qdisc__(struct netdev
*netdev
)
2878 struct tc_htb_glob opt
;
2879 struct ofpbuf request
;
2880 struct tcmsg
*tcmsg
;
2882 tc_del_qdisc(netdev
);
2884 tcmsg
= tc_make_request(netdev
, RTM_NEWQDISC
,
2885 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
2889 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
2890 tcmsg
->tcm_parent
= TC_H_ROOT
;
2892 nl_msg_put_string(&request
, TCA_KIND
, "htb");
2894 memset(&opt
, 0, sizeof opt
);
2895 opt
.rate2quantum
= 10;
2899 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
2900 nl_msg_put_unspec(&request
, TCA_HTB_INIT
, &opt
, sizeof opt
);
2901 nl_msg_end_nested(&request
, opt_offset
);
2903 return tc_transact(&request
, NULL
);
2906 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2907 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2909 htb_setup_class__(struct netdev
*netdev
, unsigned int handle
,
2910 unsigned int parent
, struct htb_class
*class)
2913 struct tc_htb_opt opt
;
2914 struct ofpbuf request
;
2915 struct tcmsg
*tcmsg
;
2919 error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
2921 VLOG_WARN_RL(&rl
, "cannot set up HTB on device %s that lacks MTU",
2922 netdev_get_name(netdev
));
2926 memset(&opt
, 0, sizeof opt
);
2927 tc_fill_rate(&opt
.rate
, class->min_rate
, mtu
);
2928 tc_fill_rate(&opt
.ceil
, class->max_rate
, mtu
);
2929 opt
.buffer
= tc_calc_buffer(opt
.rate
.rate
, mtu
, class->burst
);
2930 opt
.cbuffer
= tc_calc_buffer(opt
.ceil
.rate
, mtu
, class->burst
);
2931 opt
.prio
= class->priority
;
2933 tcmsg
= tc_make_request(netdev
, RTM_NEWTCLASS
, NLM_F_CREATE
, &request
);
2937 tcmsg
->tcm_handle
= handle
;
2938 tcmsg
->tcm_parent
= parent
;
2940 nl_msg_put_string(&request
, TCA_KIND
, "htb");
2941 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
2942 nl_msg_put_unspec(&request
, TCA_HTB_PARMS
, &opt
, sizeof opt
);
2943 tc_put_rtab(&request
, TCA_HTB_RTAB
, &opt
.rate
);
2944 tc_put_rtab(&request
, TCA_HTB_CTAB
, &opt
.ceil
);
2945 nl_msg_end_nested(&request
, opt_offset
);
2947 error
= tc_transact(&request
, NULL
);
2949 VLOG_WARN_RL(&rl
, "failed to replace %s class %u:%u, parent %u:%u, "
2950 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2951 netdev_get_name(netdev
),
2952 tc_get_major(handle
), tc_get_minor(handle
),
2953 tc_get_major(parent
), tc_get_minor(parent
),
2954 class->min_rate
, class->max_rate
,
2955 class->burst
, class->priority
, ovs_strerror(error
));
2960 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
2961 * description of them into 'details'. The description complies with the
2962 * specification given in the vswitch database documentation for linux-htb
2965 htb_parse_tca_options__(struct nlattr
*nl_options
, struct htb_class
*class)
2967 static const struct nl_policy tca_htb_policy
[] = {
2968 [TCA_HTB_PARMS
] = { .type
= NL_A_UNSPEC
, .optional
= false,
2969 .min_len
= sizeof(struct tc_htb_opt
) },
2972 struct nlattr
*attrs
[ARRAY_SIZE(tca_htb_policy
)];
2973 const struct tc_htb_opt
*htb
;
2975 if (!nl_parse_nested(nl_options
, tca_htb_policy
,
2976 attrs
, ARRAY_SIZE(tca_htb_policy
))) {
2977 VLOG_WARN_RL(&rl
, "failed to parse HTB class options");
2981 htb
= nl_attr_get(attrs
[TCA_HTB_PARMS
]);
2982 class->min_rate
= htb
->rate
.rate
;
2983 class->max_rate
= htb
->ceil
.rate
;
2984 class->burst
= tc_ticks_to_bytes(htb
->rate
.rate
, htb
->buffer
);
2985 class->priority
= htb
->prio
;
2990 htb_parse_tcmsg__(struct ofpbuf
*tcmsg
, unsigned int *queue_id
,
2991 struct htb_class
*options
,
2992 struct netdev_queue_stats
*stats
)
2994 struct nlattr
*nl_options
;
2995 unsigned int handle
;
2998 error
= tc_parse_class(tcmsg
, &handle
, &nl_options
, stats
);
2999 if (!error
&& queue_id
) {
3000 unsigned int major
= tc_get_major(handle
);
3001 unsigned int minor
= tc_get_minor(handle
);
3002 if (major
== 1 && minor
> 0 && minor
<= HTB_N_QUEUES
) {
3003 *queue_id
= minor
- 1;
3008 if (!error
&& options
) {
3009 error
= htb_parse_tca_options__(nl_options
, options
);
3015 htb_parse_qdisc_details__(struct netdev
*netdev_
,
3016 const struct smap
*details
, struct htb_class
*hc
)
3018 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3019 const char *max_rate_s
;
3021 max_rate_s
= smap_get(details
, "max-rate");
3022 hc
->max_rate
= max_rate_s
? strtoull(max_rate_s
, NULL
, 10) / 8 : 0;
3023 if (!hc
->max_rate
) {
3024 enum netdev_features current
;
3026 netdev_linux_read_features(netdev
);
3027 current
= !netdev
->get_features_error
? netdev
->current
: 0;
3028 hc
->max_rate
= netdev_features_to_bps(current
, 100 * 1000 * 1000) / 8;
3030 hc
->min_rate
= hc
->max_rate
;
3036 htb_parse_class_details__(struct netdev
*netdev
,
3037 const struct smap
*details
, struct htb_class
*hc
)
3039 const struct htb
*htb
= htb_get__(netdev
);
3040 const char *min_rate_s
= smap_get(details
, "min-rate");
3041 const char *max_rate_s
= smap_get(details
, "max-rate");
3042 const char *burst_s
= smap_get(details
, "burst");
3043 const char *priority_s
= smap_get(details
, "priority");
3046 error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
3048 VLOG_WARN_RL(&rl
, "cannot parse HTB class on device %s that lacks MTU",
3049 netdev_get_name(netdev
));
3053 /* HTB requires at least an mtu sized min-rate to send any traffic even
3054 * on uncongested links. */
3055 hc
->min_rate
= min_rate_s
? strtoull(min_rate_s
, NULL
, 10) / 8 : 0;
3056 hc
->min_rate
= MAX(hc
->min_rate
, mtu
);
3057 hc
->min_rate
= MIN(hc
->min_rate
, htb
->max_rate
);
3060 hc
->max_rate
= (max_rate_s
3061 ? strtoull(max_rate_s
, NULL
, 10) / 8
3063 hc
->max_rate
= MAX(hc
->max_rate
, hc
->min_rate
);
3064 hc
->max_rate
= MIN(hc
->max_rate
, htb
->max_rate
);
3068 * According to hints in the documentation that I've read, it is important
3069 * that 'burst' be at least as big as the largest frame that might be
3070 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
3071 * but having it a bit too small is a problem. Since netdev_get_mtu()
3072 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
3073 * the MTU. We actually add 64, instead of 14, as a guard against
3074 * additional headers get tacked on somewhere that we're not aware of. */
3075 hc
->burst
= burst_s
? strtoull(burst_s
, NULL
, 10) / 8 : 0;
3076 hc
->burst
= MAX(hc
->burst
, mtu
+ 64);
3079 hc
->priority
= priority_s
? strtoul(priority_s
, NULL
, 10) : 0;
3085 htb_query_class__(const struct netdev
*netdev
, unsigned int handle
,
3086 unsigned int parent
, struct htb_class
*options
,
3087 struct netdev_queue_stats
*stats
)
3089 struct ofpbuf
*reply
;
3092 error
= tc_query_class(netdev
, handle
, parent
, &reply
);
3094 error
= htb_parse_tcmsg__(reply
, NULL
, options
, stats
);
3095 ofpbuf_delete(reply
);
3101 htb_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3105 error
= htb_setup_qdisc__(netdev
);
3107 struct htb_class hc
;
3109 htb_parse_qdisc_details__(netdev
, details
, &hc
);
3110 error
= htb_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
3111 tc_make_handle(1, 0), &hc
);
3113 htb_install__(netdev
, hc
.max_rate
);
3119 static struct htb_class
*
3120 htb_class_cast__(const struct tc_queue
*queue
)
3122 return CONTAINER_OF(queue
, struct htb_class
, tc_queue
);
3126 htb_update_queue__(struct netdev
*netdev
, unsigned int queue_id
,
3127 const struct htb_class
*hc
)
3129 struct htb
*htb
= htb_get__(netdev
);
3130 size_t hash
= hash_int(queue_id
, 0);
3131 struct tc_queue
*queue
;
3132 struct htb_class
*hcp
;
3134 queue
= tc_find_queue__(netdev
, queue_id
, hash
);
3136 hcp
= htb_class_cast__(queue
);
3138 hcp
= xmalloc(sizeof *hcp
);
3139 queue
= &hcp
->tc_queue
;
3140 queue
->queue_id
= queue_id
;
3141 queue
->created
= time_msec();
3142 hmap_insert(&htb
->tc
.queues
, &queue
->hmap_node
, hash
);
3145 hcp
->min_rate
= hc
->min_rate
;
3146 hcp
->max_rate
= hc
->max_rate
;
3147 hcp
->burst
= hc
->burst
;
3148 hcp
->priority
= hc
->priority
;
3152 htb_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
3155 struct queue_dump_state state
;
3156 struct htb_class hc
;
3158 /* Get qdisc options. */
3160 htb_query_class__(netdev
, tc_make_handle(1, 0xfffe), 0, &hc
, NULL
);
3161 htb_install__(netdev
, hc
.max_rate
);
3164 if (!start_queue_dump(netdev
, &state
)) {
3167 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
3168 unsigned int queue_id
;
3170 if (!htb_parse_tcmsg__(&msg
, &queue_id
, &hc
, NULL
)) {
3171 htb_update_queue__(netdev
, queue_id
, &hc
);
3174 finish_queue_dump(&state
);
3180 htb_tc_destroy(struct tc
*tc
)
3182 struct htb
*htb
= CONTAINER_OF(tc
, struct htb
, tc
);
3183 struct htb_class
*hc
, *next
;
3185 HMAP_FOR_EACH_SAFE (hc
, next
, tc_queue
.hmap_node
, &htb
->tc
.queues
) {
3186 hmap_remove(&htb
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
3194 htb_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
3196 const struct htb
*htb
= htb_get__(netdev
);
3197 smap_add_format(details
, "max-rate", "%llu", 8ULL * htb
->max_rate
);
3202 htb_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
3204 struct htb_class hc
;
3207 htb_parse_qdisc_details__(netdev
, details
, &hc
);
3208 error
= htb_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
3209 tc_make_handle(1, 0), &hc
);
3211 htb_get__(netdev
)->max_rate
= hc
.max_rate
;
3217 htb_class_get(const struct netdev
*netdev OVS_UNUSED
,
3218 const struct tc_queue
*queue
, struct smap
*details
)
3220 const struct htb_class
*hc
= htb_class_cast__(queue
);
3222 smap_add_format(details
, "min-rate", "%llu", 8ULL * hc
->min_rate
);
3223 if (hc
->min_rate
!= hc
->max_rate
) {
3224 smap_add_format(details
, "max-rate", "%llu", 8ULL * hc
->max_rate
);
3226 smap_add_format(details
, "burst", "%llu", 8ULL * hc
->burst
);
3228 smap_add_format(details
, "priority", "%u", hc
->priority
);
3234 htb_class_set(struct netdev
*netdev
, unsigned int queue_id
,
3235 const struct smap
*details
)
3237 struct htb_class hc
;
3240 error
= htb_parse_class_details__(netdev
, details
, &hc
);
3245 error
= htb_setup_class__(netdev
, tc_make_handle(1, queue_id
+ 1),
3246 tc_make_handle(1, 0xfffe), &hc
);
3251 htb_update_queue__(netdev
, queue_id
, &hc
);
3256 htb_class_delete(struct netdev
*netdev
, struct tc_queue
*queue
)
3258 struct htb_class
*hc
= htb_class_cast__(queue
);
3259 struct htb
*htb
= htb_get__(netdev
);
3262 error
= tc_delete_class(netdev
, tc_make_handle(1, queue
->queue_id
+ 1));
3264 hmap_remove(&htb
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
3271 htb_class_get_stats(const struct netdev
*netdev
, const struct tc_queue
*queue
,
3272 struct netdev_queue_stats
*stats
)
3274 return htb_query_class__(netdev
, tc_make_handle(1, queue
->queue_id
+ 1),
3275 tc_make_handle(1, 0xfffe), NULL
, stats
);
3279 htb_class_dump_stats(const struct netdev
*netdev OVS_UNUSED
,
3280 const struct ofpbuf
*nlmsg
,
3281 netdev_dump_queue_stats_cb
*cb
, void *aux
)
3283 struct netdev_queue_stats stats
;
3284 unsigned int handle
, major
, minor
;
3287 error
= tc_parse_class(nlmsg
, &handle
, NULL
, &stats
);
3292 major
= tc_get_major(handle
);
3293 minor
= tc_get_minor(handle
);
3294 if (major
== 1 && minor
> 0 && minor
<= HTB_N_QUEUES
) {
3295 (*cb
)(minor
- 1, &stats
, aux
);
3300 static const struct tc_ops tc_ops_htb
= {
3301 "htb", /* linux_name */
3302 "linux-htb", /* ovs_name */
3303 HTB_N_QUEUES
, /* n_queues */
3312 htb_class_get_stats
,
3313 htb_class_dump_stats
3316 /* "linux-hfsc" traffic control class. */
3318 #define HFSC_N_QUEUES 0xf000
3326 struct tc_queue tc_queue
;
3331 static struct hfsc
*
3332 hfsc_get__(const struct netdev
*netdev_
)
3334 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3335 return CONTAINER_OF(netdev
->tc
, struct hfsc
, tc
);
3338 static struct hfsc_class
*
3339 hfsc_class_cast__(const struct tc_queue
*queue
)
3341 return CONTAINER_OF(queue
, struct hfsc_class
, tc_queue
);
3345 hfsc_install__(struct netdev
*netdev_
, uint32_t max_rate
)
3347 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3350 hfsc
= xmalloc(sizeof *hfsc
);
3351 tc_init(&hfsc
->tc
, &tc_ops_hfsc
);
3352 hfsc
->max_rate
= max_rate
;
3353 netdev
->tc
= &hfsc
->tc
;
3357 hfsc_update_queue__(struct netdev
*netdev
, unsigned int queue_id
,
3358 const struct hfsc_class
*hc
)
3362 struct hfsc_class
*hcp
;
3363 struct tc_queue
*queue
;
3365 hfsc
= hfsc_get__(netdev
);
3366 hash
= hash_int(queue_id
, 0);
3368 queue
= tc_find_queue__(netdev
, queue_id
, hash
);
3370 hcp
= hfsc_class_cast__(queue
);
3372 hcp
= xmalloc(sizeof *hcp
);
3373 queue
= &hcp
->tc_queue
;
3374 queue
->queue_id
= queue_id
;
3375 queue
->created
= time_msec();
3376 hmap_insert(&hfsc
->tc
.queues
, &queue
->hmap_node
, hash
);
3379 hcp
->min_rate
= hc
->min_rate
;
3380 hcp
->max_rate
= hc
->max_rate
;
3384 hfsc_parse_tca_options__(struct nlattr
*nl_options
, struct hfsc_class
*class)
3386 const struct tc_service_curve
*rsc
, *fsc
, *usc
;
3387 static const struct nl_policy tca_hfsc_policy
[] = {
3389 .type
= NL_A_UNSPEC
,
3391 .min_len
= sizeof(struct tc_service_curve
),
3394 .type
= NL_A_UNSPEC
,
3396 .min_len
= sizeof(struct tc_service_curve
),
3399 .type
= NL_A_UNSPEC
,
3401 .min_len
= sizeof(struct tc_service_curve
),
3404 struct nlattr
*attrs
[ARRAY_SIZE(tca_hfsc_policy
)];
3406 if (!nl_parse_nested(nl_options
, tca_hfsc_policy
,
3407 attrs
, ARRAY_SIZE(tca_hfsc_policy
))) {
3408 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options");
3412 rsc
= nl_attr_get(attrs
[TCA_HFSC_RSC
]);
3413 fsc
= nl_attr_get(attrs
[TCA_HFSC_FSC
]);
3414 usc
= nl_attr_get(attrs
[TCA_HFSC_USC
]);
3416 if (rsc
->m1
!= 0 || rsc
->d
!= 0 ||
3417 fsc
->m1
!= 0 || fsc
->d
!= 0 ||
3418 usc
->m1
!= 0 || usc
->d
!= 0) {
3419 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
3420 "Non-linear service curves are not supported.");
3424 if (rsc
->m2
!= fsc
->m2
) {
3425 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
3426 "Real-time service curves are not supported ");
3430 if (rsc
->m2
> usc
->m2
) {
3431 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
3432 "Min-rate service curve is greater than "
3433 "the max-rate service curve.");
3437 class->min_rate
= fsc
->m2
;
3438 class->max_rate
= usc
->m2
;
3443 hfsc_parse_tcmsg__(struct ofpbuf
*tcmsg
, unsigned int *queue_id
,
3444 struct hfsc_class
*options
,
3445 struct netdev_queue_stats
*stats
)
3448 unsigned int handle
;
3449 struct nlattr
*nl_options
;
3451 error
= tc_parse_class(tcmsg
, &handle
, &nl_options
, stats
);
3457 unsigned int major
, minor
;
3459 major
= tc_get_major(handle
);
3460 minor
= tc_get_minor(handle
);
3461 if (major
== 1 && minor
> 0 && minor
<= HFSC_N_QUEUES
) {
3462 *queue_id
= minor
- 1;
3469 error
= hfsc_parse_tca_options__(nl_options
, options
);
3476 hfsc_query_class__(const struct netdev
*netdev
, unsigned int handle
,
3477 unsigned int parent
, struct hfsc_class
*options
,
3478 struct netdev_queue_stats
*stats
)
3481 struct ofpbuf
*reply
;
3483 error
= tc_query_class(netdev
, handle
, parent
, &reply
);
3488 error
= hfsc_parse_tcmsg__(reply
, NULL
, options
, stats
);
3489 ofpbuf_delete(reply
);
3494 hfsc_parse_qdisc_details__(struct netdev
*netdev_
, const struct smap
*details
,
3495 struct hfsc_class
*class)
3497 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3499 const char *max_rate_s
;
3501 max_rate_s
= smap_get(details
, "max-rate");
3502 max_rate
= max_rate_s
? strtoull(max_rate_s
, NULL
, 10) / 8 : 0;
3505 enum netdev_features current
;
3507 netdev_linux_read_features(netdev
);
3508 current
= !netdev
->get_features_error
? netdev
->current
: 0;
3509 max_rate
= netdev_features_to_bps(current
, 100 * 1000 * 1000) / 8;
3512 class->min_rate
= max_rate
;
3513 class->max_rate
= max_rate
;
3517 hfsc_parse_class_details__(struct netdev
*netdev
,
3518 const struct smap
*details
,
3519 struct hfsc_class
* class)
3521 const struct hfsc
*hfsc
;
3522 uint32_t min_rate
, max_rate
;
3523 const char *min_rate_s
, *max_rate_s
;
3525 hfsc
= hfsc_get__(netdev
);
3526 min_rate_s
= smap_get(details
, "min-rate");
3527 max_rate_s
= smap_get(details
, "max-rate");
3529 min_rate
= min_rate_s
? strtoull(min_rate_s
, NULL
, 10) / 8 : 0;
3530 min_rate
= MAX(min_rate
, 1);
3531 min_rate
= MIN(min_rate
, hfsc
->max_rate
);
3533 max_rate
= (max_rate_s
3534 ? strtoull(max_rate_s
, NULL
, 10) / 8
3536 max_rate
= MAX(max_rate
, min_rate
);
3537 max_rate
= MIN(max_rate
, hfsc
->max_rate
);
3539 class->min_rate
= min_rate
;
3540 class->max_rate
= max_rate
;
3545 /* Create an HFSC qdisc.
3547 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3549 hfsc_setup_qdisc__(struct netdev
* netdev
)
3551 struct tcmsg
*tcmsg
;
3552 struct ofpbuf request
;
3553 struct tc_hfsc_qopt opt
;
3555 tc_del_qdisc(netdev
);
3557 tcmsg
= tc_make_request(netdev
, RTM_NEWQDISC
,
3558 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3564 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3565 tcmsg
->tcm_parent
= TC_H_ROOT
;
3567 memset(&opt
, 0, sizeof opt
);
3570 nl_msg_put_string(&request
, TCA_KIND
, "hfsc");
3571 nl_msg_put_unspec(&request
, TCA_OPTIONS
, &opt
, sizeof opt
);
3573 return tc_transact(&request
, NULL
);
3576 /* Create an HFSC class.
3578 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3579 * sc rate <min_rate> ul rate <max_rate>" */
3581 hfsc_setup_class__(struct netdev
*netdev
, unsigned int handle
,
3582 unsigned int parent
, struct hfsc_class
*class)
3586 struct tcmsg
*tcmsg
;
3587 struct ofpbuf request
;
3588 struct tc_service_curve min
, max
;
3590 tcmsg
= tc_make_request(netdev
, RTM_NEWTCLASS
, NLM_F_CREATE
, &request
);
3596 tcmsg
->tcm_handle
= handle
;
3597 tcmsg
->tcm_parent
= parent
;
3601 min
.m2
= class->min_rate
;
3605 max
.m2
= class->max_rate
;
3607 nl_msg_put_string(&request
, TCA_KIND
, "hfsc");
3608 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
3609 nl_msg_put_unspec(&request
, TCA_HFSC_RSC
, &min
, sizeof min
);
3610 nl_msg_put_unspec(&request
, TCA_HFSC_FSC
, &min
, sizeof min
);
3611 nl_msg_put_unspec(&request
, TCA_HFSC_USC
, &max
, sizeof max
);
3612 nl_msg_end_nested(&request
, opt_offset
);
3614 error
= tc_transact(&request
, NULL
);
3616 VLOG_WARN_RL(&rl
, "failed to replace %s class %u:%u, parent %u:%u, "
3617 "min-rate %ubps, max-rate %ubps (%s)",
3618 netdev_get_name(netdev
),
3619 tc_get_major(handle
), tc_get_minor(handle
),
3620 tc_get_major(parent
), tc_get_minor(parent
),
3621 class->min_rate
, class->max_rate
, ovs_strerror(error
));
3628 hfsc_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3631 struct hfsc_class
class;
3633 error
= hfsc_setup_qdisc__(netdev
);
3639 hfsc_parse_qdisc_details__(netdev
, details
, &class);
3640 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
3641 tc_make_handle(1, 0), &class);
3647 hfsc_install__(netdev
, class.max_rate
);
3652 hfsc_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
3655 struct queue_dump_state state
;
3656 struct hfsc_class hc
;
3659 hfsc_query_class__(netdev
, tc_make_handle(1, 0xfffe), 0, &hc
, NULL
);
3660 hfsc_install__(netdev
, hc
.max_rate
);
3662 if (!start_queue_dump(netdev
, &state
)) {
3666 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
3667 unsigned int queue_id
;
3669 if (!hfsc_parse_tcmsg__(&msg
, &queue_id
, &hc
, NULL
)) {
3670 hfsc_update_queue__(netdev
, queue_id
, &hc
);
3674 finish_queue_dump(&state
);
3679 hfsc_tc_destroy(struct tc
*tc
)
3682 struct hfsc_class
*hc
, *next
;
3684 hfsc
= CONTAINER_OF(tc
, struct hfsc
, tc
);
3686 HMAP_FOR_EACH_SAFE (hc
, next
, tc_queue
.hmap_node
, &hfsc
->tc
.queues
) {
3687 hmap_remove(&hfsc
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
3696 hfsc_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
3698 const struct hfsc
*hfsc
;
3699 hfsc
= hfsc_get__(netdev
);
3700 smap_add_format(details
, "max-rate", "%llu", 8ULL * hfsc
->max_rate
);
3705 hfsc_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
3708 struct hfsc_class
class;
3710 hfsc_parse_qdisc_details__(netdev
, details
, &class);
3711 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
3712 tc_make_handle(1, 0), &class);
3715 hfsc_get__(netdev
)->max_rate
= class.max_rate
;
3722 hfsc_class_get(const struct netdev
*netdev OVS_UNUSED
,
3723 const struct tc_queue
*queue
, struct smap
*details
)
3725 const struct hfsc_class
*hc
;
3727 hc
= hfsc_class_cast__(queue
);
3728 smap_add_format(details
, "min-rate", "%llu", 8ULL * hc
->min_rate
);
3729 if (hc
->min_rate
!= hc
->max_rate
) {
3730 smap_add_format(details
, "max-rate", "%llu", 8ULL * hc
->max_rate
);
3736 hfsc_class_set(struct netdev
*netdev
, unsigned int queue_id
,
3737 const struct smap
*details
)
3740 struct hfsc_class
class;
3742 error
= hfsc_parse_class_details__(netdev
, details
, &class);
3747 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, queue_id
+ 1),
3748 tc_make_handle(1, 0xfffe), &class);
3753 hfsc_update_queue__(netdev
, queue_id
, &class);
3758 hfsc_class_delete(struct netdev
*netdev
, struct tc_queue
*queue
)
3762 struct hfsc_class
*hc
;
3764 hc
= hfsc_class_cast__(queue
);
3765 hfsc
= hfsc_get__(netdev
);
3767 error
= tc_delete_class(netdev
, tc_make_handle(1, queue
->queue_id
+ 1));
3769 hmap_remove(&hfsc
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
3776 hfsc_class_get_stats(const struct netdev
*netdev
, const struct tc_queue
*queue
,
3777 struct netdev_queue_stats
*stats
)
3779 return hfsc_query_class__(netdev
, tc_make_handle(1, queue
->queue_id
+ 1),
3780 tc_make_handle(1, 0xfffe), NULL
, stats
);
3784 hfsc_class_dump_stats(const struct netdev
*netdev OVS_UNUSED
,
3785 const struct ofpbuf
*nlmsg
,
3786 netdev_dump_queue_stats_cb
*cb
, void *aux
)
3788 struct netdev_queue_stats stats
;
3789 unsigned int handle
, major
, minor
;
3792 error
= tc_parse_class(nlmsg
, &handle
, NULL
, &stats
);
3797 major
= tc_get_major(handle
);
3798 minor
= tc_get_minor(handle
);
3799 if (major
== 1 && minor
> 0 && minor
<= HFSC_N_QUEUES
) {
3800 (*cb
)(minor
- 1, &stats
, aux
);
3805 static const struct tc_ops tc_ops_hfsc
= {
3806 "hfsc", /* linux_name */
3807 "linux-hfsc", /* ovs_name */
3808 HFSC_N_QUEUES
, /* n_queues */
3809 hfsc_tc_install
, /* tc_install */
3810 hfsc_tc_load
, /* tc_load */
3811 hfsc_tc_destroy
, /* tc_destroy */
3812 hfsc_qdisc_get
, /* qdisc_get */
3813 hfsc_qdisc_set
, /* qdisc_set */
3814 hfsc_class_get
, /* class_get */
3815 hfsc_class_set
, /* class_set */
3816 hfsc_class_delete
, /* class_delete */
3817 hfsc_class_get_stats
, /* class_get_stats */
3818 hfsc_class_dump_stats
/* class_dump_stats */
3821 /* "linux-default" traffic control class.
3823 * This class represents the default, unnamed Linux qdisc. It corresponds to
3824 * the "" (empty string) QoS type in the OVS database. */
3827 default_install__(struct netdev
*netdev_
)
3829 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3830 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_default
);
3832 /* Nothing but a tc class implementation is allowed to write to a tc. This
3833 * class never does that, so we can legitimately use a const tc object. */
3834 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
3838 default_tc_install(struct netdev
*netdev
,
3839 const struct smap
*details OVS_UNUSED
)
3841 default_install__(netdev
);
3846 default_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
3848 default_install__(netdev
);
3852 static const struct tc_ops tc_ops_default
= {
3853 NULL
, /* linux_name */
3858 NULL
, /* tc_destroy */
3859 NULL
, /* qdisc_get */
3860 NULL
, /* qdisc_set */
3861 NULL
, /* class_get */
3862 NULL
, /* class_set */
3863 NULL
, /* class_delete */
3864 NULL
, /* class_get_stats */
3865 NULL
/* class_dump_stats */
3868 /* "linux-other" traffic control class.
3873 other_tc_load(struct netdev
*netdev_
, struct ofpbuf
*nlmsg OVS_UNUSED
)
3875 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3876 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_other
);
3878 /* Nothing but a tc class implementation is allowed to write to a tc. This
3879 * class never does that, so we can legitimately use a const tc object. */
3880 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
3884 static const struct tc_ops tc_ops_other
= {
3885 NULL
, /* linux_name */
3886 "linux-other", /* ovs_name */
3888 NULL
, /* tc_install */
3890 NULL
, /* tc_destroy */
3891 NULL
, /* qdisc_get */
3892 NULL
, /* qdisc_set */
3893 NULL
, /* class_get */
3894 NULL
, /* class_set */
3895 NULL
, /* class_delete */
3896 NULL
, /* class_get_stats */
3897 NULL
/* class_dump_stats */
3900 /* Traffic control. */
3902 /* Number of kernel "tc" ticks per second. */
3903 static double ticks_per_s
;
3905 /* Number of kernel "jiffies" per second. This is used for the purpose of
3906 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
3907 * one jiffy's worth of data.
3909 * There are two possibilities here:
3911 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
3912 * approximate range of 100 to 1024. That means that we really need to
3913 * make sure that the qdisc can buffer that much data.
3915 * - 'buffer_hz' is an absurdly large number. That means that the kernel
3916 * has finely granular timers and there's no need to fudge additional room
3917 * for buffers. (There's no extra effort needed to implement that: the
3918 * large 'buffer_hz' is used as a divisor, so practically any number will
3919 * come out as 0 in the division. Small integer results in the case of
3920 * really high dividends won't have any real effect anyhow.)
3922 static unsigned int buffer_hz
;
3924 /* Returns tc handle 'major':'minor'. */
3926 tc_make_handle(unsigned int major
, unsigned int minor
)
3928 return TC_H_MAKE(major
<< 16, minor
);
3931 /* Returns the major number from 'handle'. */
3933 tc_get_major(unsigned int handle
)
3935 return TC_H_MAJ(handle
) >> 16;
3938 /* Returns the minor number from 'handle'. */
3940 tc_get_minor(unsigned int handle
)
3942 return TC_H_MIN(handle
);
3945 static struct tcmsg
*
3946 tc_make_request(const struct netdev
*netdev
, int type
, unsigned int flags
,
3947 struct ofpbuf
*request
)
3949 struct tcmsg
*tcmsg
;
3953 error
= get_ifindex(netdev
, &ifindex
);
3958 ofpbuf_init(request
, 512);
3959 nl_msg_put_nlmsghdr(request
, sizeof *tcmsg
, type
, NLM_F_REQUEST
| flags
);
3960 tcmsg
= ofpbuf_put_zeros(request
, sizeof *tcmsg
);
3961 tcmsg
->tcm_family
= AF_UNSPEC
;
3962 tcmsg
->tcm_ifindex
= ifindex
;
3963 /* Caller should fill in tcmsg->tcm_handle. */
3964 /* Caller should fill in tcmsg->tcm_parent. */
3970 tc_transact(struct ofpbuf
*request
, struct ofpbuf
**replyp
)
3972 int error
= nl_transact(NETLINK_ROUTE
, request
, replyp
);
3973 ofpbuf_uninit(request
);
3977 /* Adds or deletes a root ingress qdisc on 'netdev'. We use this for
3978 * policing configuration.
3980 * This function is equivalent to running the following when 'add' is true:
3981 * /sbin/tc qdisc add dev <devname> handle ffff: ingress
3983 * This function is equivalent to running the following when 'add' is false:
3984 * /sbin/tc qdisc del dev <devname> handle ffff: ingress
3986 * The configuration and stats may be seen with the following command:
3987 * /sbin/tc -s qdisc show dev <devname>
3989 * Returns 0 if successful, otherwise a positive errno value.
3992 tc_add_del_ingress_qdisc(struct netdev
*netdev
, bool add
)
3994 struct ofpbuf request
;
3995 struct tcmsg
*tcmsg
;
3997 int type
= add
? RTM_NEWQDISC
: RTM_DELQDISC
;
3998 int flags
= add
? NLM_F_EXCL
| NLM_F_CREATE
: 0;
4000 tcmsg
= tc_make_request(netdev
, type
, flags
, &request
);
4004 tcmsg
->tcm_handle
= tc_make_handle(0xffff, 0);
4005 tcmsg
->tcm_parent
= TC_H_INGRESS
;
4006 nl_msg_put_string(&request
, TCA_KIND
, "ingress");
4007 nl_msg_put_unspec(&request
, TCA_OPTIONS
, NULL
, 0);
4009 error
= tc_transact(&request
, NULL
);
4011 /* If we're deleting the qdisc, don't worry about some of the
4012 * error conditions. */
4013 if (!add
&& (error
== ENOENT
|| error
== EINVAL
)) {
4022 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
4025 * This function is equivalent to running:
4026 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
4027 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
4030 * The configuration and stats may be seen with the following command:
4031 * /sbin/tc -s filter show <devname> eth0 parent ffff:
4033 * Returns 0 if successful, otherwise a positive errno value.
4036 tc_add_policer(struct netdev
*netdev
, int kbits_rate
, int kbits_burst
)
4038 struct tc_police tc_police
;
4039 struct ofpbuf request
;
4040 struct tcmsg
*tcmsg
;
4041 size_t basic_offset
;
4042 size_t police_offset
;
4046 memset(&tc_police
, 0, sizeof tc_police
);
4047 tc_police
.action
= TC_POLICE_SHOT
;
4048 tc_police
.mtu
= mtu
;
4049 tc_fill_rate(&tc_police
.rate
, ((uint64_t) kbits_rate
* 1000)/8, mtu
);
4050 tc_police
.burst
= tc_bytes_to_ticks(tc_police
.rate
.rate
,
4051 kbits_burst
* 1024);
4053 tcmsg
= tc_make_request(netdev
, RTM_NEWTFILTER
,
4054 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
4058 tcmsg
->tcm_parent
= tc_make_handle(0xffff, 0);
4059 tcmsg
->tcm_info
= tc_make_handle(49,
4060 (OVS_FORCE
uint16_t) htons(ETH_P_ALL
));
4062 nl_msg_put_string(&request
, TCA_KIND
, "basic");
4063 basic_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
4064 police_offset
= nl_msg_start_nested(&request
, TCA_BASIC_POLICE
);
4065 nl_msg_put_unspec(&request
, TCA_POLICE_TBF
, &tc_police
, sizeof tc_police
);
4066 tc_put_rtab(&request
, TCA_POLICE_RATE
, &tc_police
.rate
);
4067 nl_msg_end_nested(&request
, police_offset
);
4068 nl_msg_end_nested(&request
, basic_offset
);
4070 error
= tc_transact(&request
, NULL
);
4081 /* The values in psched are not individually very meaningful, but they are
4082 * important. The tables below show some values seen in the wild.
4086 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
4087 * (Before that, there are hints that it was 1000000000.)
4089 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
4093 * -----------------------------------
4094 * [1] 000c8000 000f4240 000f4240 00000064
4095 * [2] 000003e8 00000400 000f4240 3b9aca00
4096 * [3] 000003e8 00000400 000f4240 3b9aca00
4097 * [4] 000003e8 00000400 000f4240 00000064
4098 * [5] 000003e8 00000040 000f4240 3b9aca00
4099 * [6] 000003e8 00000040 000f4240 000000f9
4101 * a b c d ticks_per_s buffer_hz
4102 * ------- --------- ---------- ------------- ----------- -------------
4103 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
4104 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4105 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
4106 * [4] 1,000 1,024 1,000,000 100 976,562 100
4107 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
4108 * [6] 1,000 64 1,000,000 249 15,625,000 249
4110 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
4111 * [2] 2.6.26-1-686-bigmem from Debian lenny
4112 * [3] 2.6.26-2-sparc64 from Debian lenny
4113 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
4114 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
4115 * [6] 2.6.34 from kernel.org on KVM
4117 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
4118 static const char fn
[] = "/proc/net/psched";
4119 unsigned int a
, b
, c
, d
;
4122 if (!ovsthread_once_start(&once
)) {
4129 stream
= fopen(fn
, "r");
4131 VLOG_WARN("%s: open failed: %s", fn
, ovs_strerror(errno
));
4135 if (fscanf(stream
, "%x %x %x %x", &a
, &b
, &c
, &d
) != 4) {
4136 VLOG_WARN("%s: read failed", fn
);
4140 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn
, a
, b
, c
, d
);
4144 VLOG_WARN("%s: invalid scheduler parameters", fn
);
4148 ticks_per_s
= (double) a
* c
/ b
;
4152 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
4155 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn
, ticks_per_s
, buffer_hz
);
4158 ovsthread_once_done(&once
);
4161 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
4162 * rate of 'rate' bytes per second. */
4164 tc_ticks_to_bytes(unsigned int rate
, unsigned int ticks
)
4167 return (rate
* ticks
) / ticks_per_s
;
4170 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
4171 * rate of 'rate' bytes per second. */
4173 tc_bytes_to_ticks(unsigned int rate
, unsigned int size
)
4176 return rate
? ((unsigned long long int) ticks_per_s
* size
) / rate
: 0;
4179 /* Returns the number of bytes that need to be reserved for qdisc buffering at
4180 * a transmission rate of 'rate' bytes per second. */
4182 tc_buffer_per_jiffy(unsigned int rate
)
4185 return rate
/ buffer_hz
;
4188 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
4189 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
4190 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
4191 * stores NULL into it if it is absent.
4193 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
4196 * Returns 0 if successful, otherwise a positive errno value. */
4198 tc_parse_qdisc(const struct ofpbuf
*msg
, const char **kind
,
4199 struct nlattr
**options
)
4201 static const struct nl_policy tca_policy
[] = {
4202 [TCA_KIND
] = { .type
= NL_A_STRING
, .optional
= false },
4203 [TCA_OPTIONS
] = { .type
= NL_A_NESTED
, .optional
= true },
4205 struct nlattr
*ta
[ARRAY_SIZE(tca_policy
)];
4207 if (!nl_policy_parse(msg
, NLMSG_HDRLEN
+ sizeof(struct tcmsg
),
4208 tca_policy
, ta
, ARRAY_SIZE(ta
))) {
4209 VLOG_WARN_RL(&rl
, "failed to parse qdisc message");
4214 *kind
= nl_attr_get_string(ta
[TCA_KIND
]);
4218 *options
= ta
[TCA_OPTIONS
];
4233 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
4234 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
4235 * into '*options', and its queue statistics into '*stats'. Any of the output
4236 * arguments may be null.
4238 * Returns 0 if successful, otherwise a positive errno value. */
4240 tc_parse_class(const struct ofpbuf
*msg
, unsigned int *handlep
,
4241 struct nlattr
**options
, struct netdev_queue_stats
*stats
)
4243 static const struct nl_policy tca_policy
[] = {
4244 [TCA_OPTIONS
] = { .type
= NL_A_NESTED
, .optional
= false },
4245 [TCA_STATS2
] = { .type
= NL_A_NESTED
, .optional
= false },
4247 struct nlattr
*ta
[ARRAY_SIZE(tca_policy
)];
4249 if (!nl_policy_parse(msg
, NLMSG_HDRLEN
+ sizeof(struct tcmsg
),
4250 tca_policy
, ta
, ARRAY_SIZE(ta
))) {
4251 VLOG_WARN_RL(&rl
, "failed to parse class message");
4256 struct tcmsg
*tc
= ofpbuf_at_assert(msg
, NLMSG_HDRLEN
, sizeof *tc
);
4257 *handlep
= tc
->tcm_handle
;
4261 *options
= ta
[TCA_OPTIONS
];
4265 const struct gnet_stats_queue
*gsq
;
4266 struct gnet_stats_basic gsb
;
4268 static const struct nl_policy stats_policy
[] = {
4269 [TCA_STATS_BASIC
] = { .type
= NL_A_UNSPEC
, .optional
= false,
4270 .min_len
= sizeof gsb
},
4271 [TCA_STATS_QUEUE
] = { .type
= NL_A_UNSPEC
, .optional
= false,
4272 .min_len
= sizeof *gsq
},
4274 struct nlattr
*sa
[ARRAY_SIZE(stats_policy
)];
4276 if (!nl_parse_nested(ta
[TCA_STATS2
], stats_policy
,
4277 sa
, ARRAY_SIZE(sa
))) {
4278 VLOG_WARN_RL(&rl
, "failed to parse class stats");
4282 /* Alignment issues screw up the length of struct gnet_stats_basic on
4283 * some arch/bitsize combinations. Newer versions of Linux have a
4284 * struct gnet_stats_basic_packed, but we can't depend on that. The
4285 * easiest thing to do is just to make a copy. */
4286 memset(&gsb
, 0, sizeof gsb
);
4287 memcpy(&gsb
, nl_attr_get(sa
[TCA_STATS_BASIC
]),
4288 MIN(nl_attr_get_size(sa
[TCA_STATS_BASIC
]), sizeof gsb
));
4289 stats
->tx_bytes
= gsb
.bytes
;
4290 stats
->tx_packets
= gsb
.packets
;
4292 gsq
= nl_attr_get(sa
[TCA_STATS_QUEUE
]);
4293 stats
->tx_errors
= gsq
->drops
;
4303 memset(stats
, 0, sizeof *stats
);
4308 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
4311 tc_query_class(const struct netdev
*netdev
,
4312 unsigned int handle
, unsigned int parent
,
4313 struct ofpbuf
**replyp
)
4315 struct ofpbuf request
;
4316 struct tcmsg
*tcmsg
;
4319 tcmsg
= tc_make_request(netdev
, RTM_GETTCLASS
, NLM_F_ECHO
, &request
);
4323 tcmsg
->tcm_handle
= handle
;
4324 tcmsg
->tcm_parent
= parent
;
4326 error
= tc_transact(&request
, replyp
);
4328 VLOG_WARN_RL(&rl
, "query %s class %u:%u (parent %u:%u) failed (%s)",
4329 netdev_get_name(netdev
),
4330 tc_get_major(handle
), tc_get_minor(handle
),
4331 tc_get_major(parent
), tc_get_minor(parent
),
4332 ovs_strerror(error
));
4337 /* Equivalent to "tc class del dev <name> handle <handle>". */
4339 tc_delete_class(const struct netdev
*netdev
, unsigned int handle
)
4341 struct ofpbuf request
;
4342 struct tcmsg
*tcmsg
;
4345 tcmsg
= tc_make_request(netdev
, RTM_DELTCLASS
, 0, &request
);
4349 tcmsg
->tcm_handle
= handle
;
4350 tcmsg
->tcm_parent
= 0;
4352 error
= tc_transact(&request
, NULL
);
4354 VLOG_WARN_RL(&rl
, "delete %s class %u:%u failed (%s)",
4355 netdev_get_name(netdev
),
4356 tc_get_major(handle
), tc_get_minor(handle
),
4357 ovs_strerror(error
));
4362 /* Equivalent to "tc qdisc del dev <name> root". */
4364 tc_del_qdisc(struct netdev
*netdev_
)
4366 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4367 struct ofpbuf request
;
4368 struct tcmsg
*tcmsg
;
4371 tcmsg
= tc_make_request(netdev_
, RTM_DELQDISC
, 0, &request
);
4375 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
4376 tcmsg
->tcm_parent
= TC_H_ROOT
;
4378 error
= tc_transact(&request
, NULL
);
4379 if (error
== EINVAL
) {
4380 /* EINVAL probably means that the default qdisc was in use, in which
4381 * case we've accomplished our purpose. */
4384 if (!error
&& netdev
->tc
) {
4385 if (netdev
->tc
->ops
->tc_destroy
) {
4386 netdev
->tc
->ops
->tc_destroy(netdev
->tc
);
4393 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
4394 * kernel to determine what they are. Returns 0 if successful, otherwise a
4395 * positive errno value. */
4397 tc_query_qdisc(const struct netdev
*netdev_
)
4399 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4400 struct ofpbuf request
, *qdisc
;
4401 const struct tc_ops
*ops
;
4402 struct tcmsg
*tcmsg
;
4410 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
4411 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
4412 * 2.6.35 without that fix backported to it.
4414 * To avoid the OOPS, we must not make a request that would attempt to dump
4415 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
4416 * few others. There are a few ways that I can see to do this, but most of
4417 * them seem to be racy (and if you lose the race the kernel OOPSes). The
4418 * technique chosen here is to assume that any non-default qdisc that we
4419 * create will have a class with handle 1:0. The built-in qdiscs only have
4420 * a class with handle 0:0.
4422 * We could check for Linux 2.6.35+ and use a more straightforward method
4424 tcmsg
= tc_make_request(netdev_
, RTM_GETQDISC
, NLM_F_ECHO
, &request
);
4428 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
4429 tcmsg
->tcm_parent
= 0;
4431 /* Figure out what tc class to instantiate. */
4432 error
= tc_transact(&request
, &qdisc
);
4436 error
= tc_parse_qdisc(qdisc
, &kind
, NULL
);
4438 ops
= &tc_ops_other
;
4440 ops
= tc_lookup_linux_name(kind
);
4442 static struct vlog_rate_limit rl2
= VLOG_RATE_LIMIT_INIT(1, 1);
4443 VLOG_INFO_RL(&rl2
, "unknown qdisc \"%s\"", kind
);
4445 ops
= &tc_ops_other
;
4448 } else if (error
== ENOENT
) {
4449 /* Either it's a built-in qdisc, or it's a qdisc set up by some
4450 * other entity that doesn't have a handle 1:0. We will assume
4451 * that it's the system default qdisc. */
4452 ops
= &tc_ops_default
;
4455 /* Who knows? Maybe the device got deleted. */
4456 VLOG_WARN_RL(&rl
, "query %s qdisc failed (%s)",
4457 netdev_get_name(netdev_
), ovs_strerror(error
));
4458 ops
= &tc_ops_other
;
4461 /* Instantiate it. */
4462 load_error
= ops
->tc_load(CONST_CAST(struct netdev
*, netdev_
), qdisc
);
4463 ovs_assert((load_error
== 0) == (netdev
->tc
!= NULL
));
4464 ofpbuf_delete(qdisc
);
4466 return error
? error
: load_error
;
4469 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
4470 approximate the time to transmit packets of various lengths. For an MTU of
4471 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
4472 represents two possible packet lengths; for a MTU of 513 through 1024, four
4473 possible lengths; and so on.
4475 Returns, for the specified 'mtu', the number of bits that packet lengths
4476 need to be shifted right to fit within such a 256-entry table. */
4478 tc_calc_cell_log(unsigned int mtu
)
4483 mtu
= ETH_PAYLOAD_MAX
;
4485 mtu
+= ETH_HEADER_LEN
+ VLAN_HEADER_LEN
;
4487 for (cell_log
= 0; mtu
>= 256; cell_log
++) {
4494 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
4497 tc_fill_rate(struct tc_ratespec
*rate
, uint64_t Bps
, int mtu
)
4499 memset(rate
, 0, sizeof *rate
);
4500 rate
->cell_log
= tc_calc_cell_log(mtu
);
4501 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
4502 /* rate->cell_align = 0; */ /* distro headers. */
4503 rate
->mpu
= ETH_TOTAL_MIN
;
4507 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
4508 * attribute of the specified "type".
4510 * See tc_calc_cell_log() above for a description of "rtab"s. */
4512 tc_put_rtab(struct ofpbuf
*msg
, uint16_t type
, const struct tc_ratespec
*rate
)
4517 rtab
= nl_msg_put_unspec_uninit(msg
, type
, TC_RTAB_SIZE
);
4518 for (i
= 0; i
< TC_RTAB_SIZE
/ sizeof *rtab
; i
++) {
4519 unsigned packet_size
= (i
+ 1) << rate
->cell_log
;
4520 if (packet_size
< rate
->mpu
) {
4521 packet_size
= rate
->mpu
;
4523 rtab
[i
] = tc_bytes_to_ticks(rate
->rate
, packet_size
);
4527 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
4528 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
4529 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
4532 tc_calc_buffer(unsigned int Bps
, int mtu
, uint64_t burst_bytes
)
4534 unsigned int min_burst
= tc_buffer_per_jiffy(Bps
) + mtu
;
4535 return tc_bytes_to_ticks(Bps
, MAX(burst_bytes
, min_burst
));
4538 /* Linux-only functions declared in netdev-linux.h */
4540 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
4541 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
4543 netdev_linux_ethtool_set_flag(struct netdev
*netdev
, uint32_t flag
,
4544 const char *flag_name
, bool enable
)
4546 const char *netdev_name
= netdev_get_name(netdev
);
4547 struct ethtool_value evalue
;
4551 COVERAGE_INC(netdev_get_ethtool
);
4552 memset(&evalue
, 0, sizeof evalue
);
4553 error
= netdev_linux_do_ethtool(netdev_name
,
4554 (struct ethtool_cmd
*)&evalue
,
4555 ETHTOOL_GFLAGS
, "ETHTOOL_GFLAGS");
4560 COVERAGE_INC(netdev_set_ethtool
);
4561 evalue
.data
= new_flags
= (evalue
.data
& ~flag
) | (enable
? flag
: 0);
4562 error
= netdev_linux_do_ethtool(netdev_name
,
4563 (struct ethtool_cmd
*)&evalue
,
4564 ETHTOOL_SFLAGS
, "ETHTOOL_SFLAGS");
4569 COVERAGE_INC(netdev_get_ethtool
);
4570 memset(&evalue
, 0, sizeof evalue
);
4571 error
= netdev_linux_do_ethtool(netdev_name
,
4572 (struct ethtool_cmd
*)&evalue
,
4573 ETHTOOL_GFLAGS
, "ETHTOOL_GFLAGS");
4578 if (new_flags
!= evalue
.data
) {
4579 VLOG_WARN_RL(&rl
, "attempt to %s ethtool %s flag on network "
4580 "device %s failed", enable
? "enable" : "disable",
4581 flag_name
, netdev_name
);
4588 /* Utility functions. */
4590 /* Copies 'src' into 'dst', performing format conversion in the process. */
4592 netdev_stats_from_rtnl_link_stats(struct netdev_stats
*dst
,
4593 const struct rtnl_link_stats
*src
)
4595 dst
->rx_packets
= src
->rx_packets
;
4596 dst
->tx_packets
= src
->tx_packets
;
4597 dst
->rx_bytes
= src
->rx_bytes
;
4598 dst
->tx_bytes
= src
->tx_bytes
;
4599 dst
->rx_errors
= src
->rx_errors
;
4600 dst
->tx_errors
= src
->tx_errors
;
4601 dst
->rx_dropped
= src
->rx_dropped
;
4602 dst
->tx_dropped
= src
->tx_dropped
;
4603 dst
->multicast
= src
->multicast
;
4604 dst
->collisions
= src
->collisions
;
4605 dst
->rx_length_errors
= src
->rx_length_errors
;
4606 dst
->rx_over_errors
= src
->rx_over_errors
;
4607 dst
->rx_crc_errors
= src
->rx_crc_errors
;
4608 dst
->rx_frame_errors
= src
->rx_frame_errors
;
4609 dst
->rx_fifo_errors
= src
->rx_fifo_errors
;
4610 dst
->rx_missed_errors
= src
->rx_missed_errors
;
4611 dst
->tx_aborted_errors
= src
->tx_aborted_errors
;
4612 dst
->tx_carrier_errors
= src
->tx_carrier_errors
;
4613 dst
->tx_fifo_errors
= src
->tx_fifo_errors
;
4614 dst
->tx_heartbeat_errors
= src
->tx_heartbeat_errors
;
4615 dst
->tx_window_errors
= src
->tx_window_errors
;
4618 /* Copies 'src' into 'dst', performing format conversion in the process. */
4620 netdev_stats_from_rtnl_link_stats64(struct netdev_stats
*dst
,
4621 const struct rtnl_link_stats64
*src
)
4623 dst
->rx_packets
= src
->rx_packets
;
4624 dst
->tx_packets
= src
->tx_packets
;
4625 dst
->rx_bytes
= src
->rx_bytes
;
4626 dst
->tx_bytes
= src
->tx_bytes
;
4627 dst
->rx_errors
= src
->rx_errors
;
4628 dst
->tx_errors
= src
->tx_errors
;
4629 dst
->rx_dropped
= src
->rx_dropped
;
4630 dst
->tx_dropped
= src
->tx_dropped
;
4631 dst
->multicast
= src
->multicast
;
4632 dst
->collisions
= src
->collisions
;
4633 dst
->rx_length_errors
= src
->rx_length_errors
;
4634 dst
->rx_over_errors
= src
->rx_over_errors
;
4635 dst
->rx_crc_errors
= src
->rx_crc_errors
;
4636 dst
->rx_frame_errors
= src
->rx_frame_errors
;
4637 dst
->rx_fifo_errors
= src
->rx_fifo_errors
;
4638 dst
->rx_missed_errors
= src
->rx_missed_errors
;
4639 dst
->tx_aborted_errors
= src
->tx_aborted_errors
;
4640 dst
->tx_carrier_errors
= src
->tx_carrier_errors
;
4641 dst
->tx_fifo_errors
= src
->tx_fifo_errors
;
4642 dst
->tx_heartbeat_errors
= src
->tx_heartbeat_errors
;
4643 dst
->tx_window_errors
= src
->tx_window_errors
;
4647 get_stats_via_netlink(const struct netdev
*netdev_
, struct netdev_stats
*stats
)
4649 struct ofpbuf request
;
4650 struct ofpbuf
*reply
;
4653 ofpbuf_init(&request
, 0);
4654 nl_msg_put_nlmsghdr(&request
,
4655 sizeof(struct ifinfomsg
) + NL_ATTR_SIZE(IFNAMSIZ
),
4656 RTM_GETLINK
, NLM_F_REQUEST
);
4657 ofpbuf_put_zeros(&request
, sizeof(struct ifinfomsg
));
4658 nl_msg_put_string(&request
, IFLA_IFNAME
, netdev_get_name(netdev_
));
4659 error
= nl_transact(NETLINK_ROUTE
, &request
, &reply
);
4660 ofpbuf_uninit(&request
);
4665 if (ofpbuf_try_pull(reply
, NLMSG_HDRLEN
+ sizeof(struct ifinfomsg
))) {
4666 const struct nlattr
*a
= nl_attr_find(reply
, 0, IFLA_STATS64
);
4667 if (a
&& nl_attr_get_size(a
) >= sizeof(struct rtnl_link_stats64
)) {
4668 netdev_stats_from_rtnl_link_stats64(stats
, nl_attr_get(a
));
4671 const struct nlattr
*a
= nl_attr_find(reply
, 0, IFLA_STATS
);
4672 if (a
&& nl_attr_get_size(a
) >= sizeof(struct rtnl_link_stats
)) {
4673 netdev_stats_from_rtnl_link_stats(stats
, nl_attr_get(a
));
4676 VLOG_WARN_RL(&rl
, "RTM_GETLINK reply lacks stats");
4681 VLOG_WARN_RL(&rl
, "short RTM_GETLINK reply");
4686 ofpbuf_delete(reply
);
4691 get_flags(const struct netdev
*dev
, unsigned int *flags
)
4697 error
= af_inet_ifreq_ioctl(dev
->name
, &ifr
, SIOCGIFFLAGS
, "SIOCGIFFLAGS");
4699 *flags
= ifr
.ifr_flags
;
4705 set_flags(const char *name
, unsigned int flags
)
4709 ifr
.ifr_flags
= flags
;
4710 return af_inet_ifreq_ioctl(name
, &ifr
, SIOCSIFFLAGS
, "SIOCSIFFLAGS");
4714 do_get_ifindex(const char *netdev_name
)
4719 ovs_strzcpy(ifr
.ifr_name
, netdev_name
, sizeof ifr
.ifr_name
);
4720 COVERAGE_INC(netdev_get_ifindex
);
4722 error
= af_inet_ioctl(SIOCGIFINDEX
, &ifr
);
4724 VLOG_WARN_RL(&rl
, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4725 netdev_name
, ovs_strerror(error
));
4728 return ifr
.ifr_ifindex
;
4732 get_ifindex(const struct netdev
*netdev_
, int *ifindexp
)
4734 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4736 if (!(netdev
->cache_valid
& VALID_IFINDEX
)) {
4737 int ifindex
= do_get_ifindex(netdev_get_name(netdev_
));
4740 netdev
->get_ifindex_error
= -ifindex
;
4741 netdev
->ifindex
= 0;
4743 netdev
->get_ifindex_error
= 0;
4744 netdev
->ifindex
= ifindex
;
4746 netdev
->cache_valid
|= VALID_IFINDEX
;
4749 *ifindexp
= netdev
->ifindex
;
4750 return netdev
->get_ifindex_error
;
4754 get_etheraddr(const char *netdev_name
, uint8_t ea
[ETH_ADDR_LEN
])
4760 memset(&ifr
, 0, sizeof ifr
);
4761 ovs_strzcpy(ifr
.ifr_name
, netdev_name
, sizeof ifr
.ifr_name
);
4762 COVERAGE_INC(netdev_get_hwaddr
);
4763 error
= af_inet_ioctl(SIOCGIFHWADDR
, &ifr
);
4765 /* ENODEV probably means that a vif disappeared asynchronously and
4766 * hasn't been removed from the database yet, so reduce the log level
4767 * to INFO for that case. */
4768 VLOG(error
== ENODEV
? VLL_INFO
: VLL_ERR
,
4769 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4770 netdev_name
, ovs_strerror(error
));
4773 hwaddr_family
= ifr
.ifr_hwaddr
.sa_family
;
4774 if (hwaddr_family
!= AF_UNSPEC
&& hwaddr_family
!= ARPHRD_ETHER
) {
4775 VLOG_WARN("%s device has unknown hardware address family %d",
4776 netdev_name
, hwaddr_family
);
4778 memcpy(ea
, ifr
.ifr_hwaddr
.sa_data
, ETH_ADDR_LEN
);
4783 set_etheraddr(const char *netdev_name
,
4784 const uint8_t mac
[ETH_ADDR_LEN
])
4789 memset(&ifr
, 0, sizeof ifr
);
4790 ovs_strzcpy(ifr
.ifr_name
, netdev_name
, sizeof ifr
.ifr_name
);
4791 ifr
.ifr_hwaddr
.sa_family
= ARPHRD_ETHER
;
4792 memcpy(ifr
.ifr_hwaddr
.sa_data
, mac
, ETH_ADDR_LEN
);
4793 COVERAGE_INC(netdev_set_hwaddr
);
4794 error
= af_inet_ioctl(SIOCSIFHWADDR
, &ifr
);
4796 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4797 netdev_name
, ovs_strerror(error
));
4803 netdev_linux_do_ethtool(const char *name
, struct ethtool_cmd
*ecmd
,
4804 int cmd
, const char *cmd_name
)
4809 memset(&ifr
, 0, sizeof ifr
);
4810 ovs_strzcpy(ifr
.ifr_name
, name
, sizeof ifr
.ifr_name
);
4811 ifr
.ifr_data
= (caddr_t
) ecmd
;
4814 error
= af_inet_ioctl(SIOCETHTOOL
, &ifr
);
4816 if (error
!= EOPNOTSUPP
) {
4817 VLOG_WARN_RL(&rl
, "ethtool command %s on network device %s "
4818 "failed: %s", cmd_name
, name
, ovs_strerror(error
));
4820 /* The device doesn't support this operation. That's pretty
4821 * common, so there's no point in logging anything. */
4828 netdev_linux_get_ipv4(const struct netdev
*netdev
, struct in_addr
*ip
,
4829 int cmd
, const char *cmd_name
)
4834 ifr
.ifr_addr
.sa_family
= AF_INET
;
4835 error
= af_inet_ifreq_ioctl(netdev_get_name(netdev
), &ifr
, cmd
, cmd_name
);
4837 const struct sockaddr_in
*sin
= ALIGNED_CAST(struct sockaddr_in
*,
4839 *ip
= sin
->sin_addr
;
4844 /* Returns an AF_PACKET raw socket or a negative errno value. */
4846 af_packet_sock(void)
4848 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
4851 if (ovsthread_once_start(&once
)) {
4852 sock
= socket(AF_PACKET
, SOCK_RAW
, 0);
4854 int error
= set_nonblocking(sock
);
4861 VLOG_ERR("failed to create packet socket: %s",
4862 ovs_strerror(errno
));
4864 ovsthread_once_done(&once
);