2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
20 #include "netdev-linux-private.h"
24 #include <sys/types.h>
25 #include <netinet/in.h>
26 #include <arpa/inet.h>
29 #include <linux/filter.h>
30 #include <linux/gen_stats.h>
31 #include <linux/if_ether.h>
32 #include <linux/if_packet.h>
33 #include <linux/if_tun.h>
34 #include <linux/types.h>
35 #include <linux/ethtool.h>
36 #include <linux/mii.h>
37 #include <linux/rtnetlink.h>
38 #include <linux/sockios.h>
39 #include <linux/virtio_net.h>
40 #include <sys/ioctl.h>
41 #include <sys/socket.h>
43 #include <sys/utsname.h>
45 #include <net/if_arp.h>
46 #include <net/route.h>
53 #include "dp-packet.h"
54 #include "dpif-netlink.h"
55 #include "dpif-netdev.h"
56 #include "openvswitch/dynamic-string.h"
57 #include "fatal-signal.h"
59 #include "openvswitch/hmap.h"
60 #include "netdev-afxdp.h"
61 #include "netdev-provider.h"
62 #include "netdev-vport.h"
63 #include "netlink-notifier.h"
64 #include "netlink-socket.h"
67 #include "openvswitch/ofpbuf.h"
68 #include "openflow/openflow.h"
69 #include "ovs-atomic.h"
72 #include "openvswitch/poll-loop.h"
73 #include "rtnetlink.h"
74 #include "openvswitch/shash.h"
75 #include "socket-util.h"
79 #include "unaligned.h"
80 #include "openvswitch/vlog.h"
81 #include "userspace-tso.h"
84 VLOG_DEFINE_THIS_MODULE(netdev_linux
);
86 COVERAGE_DEFINE(netdev_set_policing
);
87 COVERAGE_DEFINE(netdev_arp_lookup
);
88 COVERAGE_DEFINE(netdev_get_ifindex
);
89 COVERAGE_DEFINE(netdev_get_hwaddr
);
90 COVERAGE_DEFINE(netdev_set_hwaddr
);
91 COVERAGE_DEFINE(netdev_get_ethtool
);
92 COVERAGE_DEFINE(netdev_set_ethtool
);
95 #ifndef IFLA_IF_NETNSID
96 #define IFLA_IF_NETNSID 0x45
98 /* These were introduced in Linux 2.6.14, so they might be missing if we have
100 #ifndef ADVERTISED_Pause
101 #define ADVERTISED_Pause (1 << 13)
103 #ifndef ADVERTISED_Asym_Pause
104 #define ADVERTISED_Asym_Pause (1 << 14)
107 /* These were introduced in Linux 2.6.24, so they might be missing if we
108 * have old headers. */
109 #ifndef ETHTOOL_GFLAGS
110 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
112 #ifndef ETHTOOL_SFLAGS
113 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
116 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
119 #define TC_RTAB_SIZE 1024
122 /* Linux 2.6.21 introduced struct tpacket_auxdata.
123 * Linux 2.6.27 added the tp_vlan_tci member.
124 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
125 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
126 * TP_STATUS_VLAN_TPID_VALID.
128 * With all this churn it's easiest to unconditionally define a replacement
129 * structure that has everything we want.
131 #ifndef PACKET_AUXDATA
132 #define PACKET_AUXDATA 8
134 #ifndef TP_STATUS_VLAN_VALID
135 #define TP_STATUS_VLAN_VALID (1 << 4)
137 #ifndef TP_STATUS_VLAN_TPID_VALID
138 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
140 #undef tpacket_auxdata
141 #define tpacket_auxdata rpl_tpacket_auxdata
142 struct tpacket_auxdata
{
148 uint16_t tp_vlan_tci
;
149 uint16_t tp_vlan_tpid
;
152 /* Linux 2.6.27 introduced ethtool_cmd_speed
154 * To avoid revisiting problems reported with using configure to detect
155 * compatibility (see report at
156 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html)
157 * unconditionally replace ethtool_cmd_speed. */
158 #define ethtool_cmd_speed rpl_ethtool_cmd_speed
159 static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd
*ep
)
161 return ep
->speed
| (ep
->speed_hi
<< 16);
164 /* Linux 2.6.30 introduced supported and advertised flags for
165 * 1G base KX, and 10G base KX4, KR and R. */
166 #ifndef SUPPORTED_1000baseKX_Full
167 #define SUPPORTED_1000baseKX_Full (1 << 17)
168 #define SUPPORTED_10000baseKX4_Full (1 << 18)
169 #define SUPPORTED_10000baseKR_Full (1 << 19)
170 #define SUPPORTED_10000baseR_FEC (1 << 20)
171 #define ADVERTISED_1000baseKX_Full (1 << 17)
172 #define ADVERTISED_10000baseKX4_Full (1 << 18)
173 #define ADVERTISED_10000baseKR_Full (1 << 19)
174 #define ADVERTISED_10000baseR_FEC (1 << 20)
177 /* Linux 3.5 introduced supported and advertised flags for
178 * 40G base KR4, CR4, SR4 and LR4. */
179 #ifndef SUPPORTED_40000baseKR4_Full
180 #define SUPPORTED_40000baseKR4_Full (1 << 23)
181 #define SUPPORTED_40000baseCR4_Full (1 << 24)
182 #define SUPPORTED_40000baseSR4_Full (1 << 25)
183 #define SUPPORTED_40000baseLR4_Full (1 << 26)
184 #define ADVERTISED_40000baseKR4_Full (1 << 23)
185 #define ADVERTISED_40000baseCR4_Full (1 << 24)
186 #define ADVERTISED_40000baseSR4_Full (1 << 25)
187 #define ADVERTISED_40000baseLR4_Full (1 << 26)
190 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
192 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
193 * 2.6.32-431.29.2.el6.x86_64 (see report at
194 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html).
195 * Maybe if_link.h is not self-contained on those kernels. It is easiest to
196 * unconditionally define a replacement. */
198 #define IFLA_STATS64 23
200 #define rtnl_link_stats64 rpl_rtnl_link_stats64
201 struct rtnl_link_stats64
{
213 uint64_t rx_length_errors
;
214 uint64_t rx_over_errors
;
215 uint64_t rx_crc_errors
;
216 uint64_t rx_frame_errors
;
217 uint64_t rx_fifo_errors
;
218 uint64_t rx_missed_errors
;
220 uint64_t tx_aborted_errors
;
221 uint64_t tx_carrier_errors
;
222 uint64_t tx_fifo_errors
;
223 uint64_t tx_heartbeat_errors
;
224 uint64_t tx_window_errors
;
226 uint64_t rx_compressed
;
227 uint64_t tx_compressed
;
230 /* Linux 3.19 introduced virtio_types.h. It might be missing
231 * if we are using old kernel. */
232 #ifndef HAVE_VIRTIO_TYPES
233 typedef __u16 __bitwise__ __virtio16
;
234 typedef __u32 __bitwise__ __virtio32
;
235 typedef __u64 __bitwise__ __virtio64
;
239 VALID_IFINDEX
= 1 << 0,
240 VALID_ETHERADDR
= 1 << 1,
243 VALID_POLICING
= 1 << 4,
244 VALID_VPORT_STAT_ERROR
= 1 << 5,
245 VALID_DRVINFO
= 1 << 6,
246 VALID_FEATURES
= 1 << 7,
247 VALID_NUMA_ID
= 1 << 8,
250 /* Use one for the packet buffer and another for the aux buffer to receive
252 #define IOV_STD_SIZE 1
253 #define IOV_TSO_SIZE 2
260 struct linux_lag_slave
{
262 struct shash_node
*node
;
265 /* Protects 'lag_shash' and the mutable members of struct linux_lag_slave. */
266 static struct ovs_mutex lag_mutex
= OVS_MUTEX_INITIALIZER
;
268 /* All slaves whose LAG masters are network devices in OvS. */
269 static struct shash lag_shash
OVS_GUARDED_BY(lag_mutex
)
270 = SHASH_INITIALIZER(&lag_shash
);
272 /* Traffic control. */
274 /* An instance of a traffic control class. Always associated with a particular
277 * Each TC implementation subclasses this with whatever additional data it
280 const struct tc_ops
*ops
;
281 struct hmap queues
; /* Contains "struct tc_queue"s.
282 * Read by generic TC layer.
283 * Written only by TC implementation. */
286 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
288 /* One traffic control queue.
290 * Each TC implementation subclasses this with whatever additional data it
293 struct hmap_node hmap_node
; /* In struct tc's "queues" hmap. */
294 unsigned int queue_id
; /* OpenFlow queue ID. */
295 long long int created
; /* Time queue was created, in msecs. */
298 /* A particular kind of traffic control. Each implementation generally maps to
299 * one particular Linux qdisc class.
301 * The functions below return 0 if successful or a positive errno value on
302 * failure, except where otherwise noted. All of them must be provided, except
303 * where otherwise noted. */
305 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
306 * This is null for tc_ops_default and tc_ops_other, for which there are no
307 * appropriate values. */
308 const char *linux_name
;
310 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
311 const char *ovs_name
;
313 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
314 * queues. The queues are numbered 0 through n_queues - 1. */
315 unsigned int n_queues
;
317 /* Called to install this TC class on 'netdev'. The implementation should
318 * make the Netlink calls required to set up 'netdev' with the right qdisc
319 * and configure it according to 'details'. The implementation may assume
320 * that the current qdisc is the default; that is, there is no need for it
321 * to delete the current qdisc before installing itself.
323 * The contents of 'details' should be documented as valid for 'ovs_name'
324 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
325 * (which is built as ovs-vswitchd.conf.db(8)).
327 * This function must return 0 if and only if it sets 'netdev->tc' to an
328 * initialized 'struct tc'.
330 * (This function is null for tc_ops_other, which cannot be installed. For
331 * other TC classes it should always be nonnull.) */
332 int (*tc_install
)(struct netdev
*netdev
, const struct smap
*details
);
334 /* Called when the netdev code determines (through a Netlink query) that
335 * this TC class's qdisc is installed on 'netdev', but we didn't install
336 * it ourselves and so don't know any of the details.
338 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
339 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
340 * implementation should parse the other attributes of 'nlmsg' as
341 * necessary to determine its configuration. If necessary it should also
342 * use Netlink queries to determine the configuration of queues on
345 * This function must return 0 if and only if it sets 'netdev->tc' to an
346 * initialized 'struct tc'. */
347 int (*tc_load
)(struct netdev
*netdev
, struct ofpbuf
*nlmsg
);
349 /* Destroys the data structures allocated by the implementation as part of
350 * 'tc'. (This includes destroying 'tc->queues' by calling
353 * The implementation should not need to perform any Netlink calls. If
354 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
355 * (But it may not be desirable.)
357 * This function may be null if 'tc' is trivial. */
358 void (*tc_destroy
)(struct tc
*tc
);
360 /* Retrieves details of 'netdev->tc' configuration into 'details'.
362 * The implementation should not need to perform any Netlink calls, because
363 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
364 * cached the configuration.
366 * The contents of 'details' should be documented as valid for 'ovs_name'
367 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
368 * (which is built as ovs-vswitchd.conf.db(8)).
370 * This function may be null if 'tc' is not configurable.
372 int (*qdisc_get
)(const struct netdev
*netdev
, struct smap
*details
);
374 /* Reconfigures 'netdev->tc' according to 'details', performing any
375 * required Netlink calls to complete the reconfiguration.
377 * The contents of 'details' should be documented as valid for 'ovs_name'
378 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
379 * (which is built as ovs-vswitchd.conf.db(8)).
381 * This function may be null if 'tc' is not configurable.
383 int (*qdisc_set
)(struct netdev
*, const struct smap
*details
);
385 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
386 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
388 * The contents of 'details' should be documented as valid for 'ovs_name'
389 * in the "other_config" column in the "Queue" table in
390 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
392 * The implementation should not need to perform any Netlink calls, because
393 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
394 * cached the queue configuration.
396 * This function may be null if 'tc' does not have queues ('n_queues' is
398 int (*class_get
)(const struct netdev
*netdev
, const struct tc_queue
*queue
,
399 struct smap
*details
);
401 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
402 * 'details', perfoming any required Netlink calls to complete the
403 * reconfiguration. The caller ensures that 'queue_id' is less than
406 * The contents of 'details' should be documented as valid for 'ovs_name'
407 * in the "other_config" column in the "Queue" table in
408 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
410 * This function may be null if 'tc' does not have queues or its queues are
411 * not configurable. */
412 int (*class_set
)(struct netdev
*, unsigned int queue_id
,
413 const struct smap
*details
);
415 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
416 * tc_queue's within 'netdev->tc->queues'.
418 * This function may be null if 'tc' does not have queues or its queues
419 * cannot be deleted. */
420 int (*class_delete
)(struct netdev
*, struct tc_queue
*queue
);
422 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
423 * 'struct tc_queue's within 'netdev->tc->queues'.
425 * On success, initializes '*stats'.
427 * This function may be null if 'tc' does not have queues or if it cannot
428 * report queue statistics. */
429 int (*class_get_stats
)(const struct netdev
*netdev
,
430 const struct tc_queue
*queue
,
431 struct netdev_queue_stats
*stats
);
433 /* Extracts queue stats from 'nlmsg', which is a response to a
434 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
436 * This function may be null if 'tc' does not have queues or if it cannot
437 * report queue statistics. */
438 int (*class_dump_stats
)(const struct netdev
*netdev
,
439 const struct ofpbuf
*nlmsg
,
440 netdev_dump_queue_stats_cb
*cb
, void *aux
);
444 tc_init(struct tc
*tc
, const struct tc_ops
*ops
)
447 hmap_init(&tc
->queues
);
451 tc_destroy(struct tc
*tc
)
453 hmap_destroy(&tc
->queues
);
456 static const struct tc_ops tc_ops_htb
;
457 static const struct tc_ops tc_ops_hfsc
;
458 static const struct tc_ops tc_ops_codel
;
459 static const struct tc_ops tc_ops_fqcodel
;
460 static const struct tc_ops tc_ops_sfq
;
461 static const struct tc_ops tc_ops_netem
;
462 static const struct tc_ops tc_ops_default
;
463 static const struct tc_ops tc_ops_noop
;
464 static const struct tc_ops tc_ops_other
;
466 static const struct tc_ops
*const tcs
[] = {
467 &tc_ops_htb
, /* Hierarchy token bucket (see tc-htb(8)). */
468 &tc_ops_hfsc
, /* Hierarchical fair service curve. */
469 &tc_ops_codel
, /* Controlled delay */
470 &tc_ops_fqcodel
, /* Fair queue controlled delay */
471 &tc_ops_sfq
, /* Stochastic fair queueing */
472 &tc_ops_netem
, /* Network Emulator */
473 &tc_ops_noop
, /* Non operating qos type. */
474 &tc_ops_default
, /* Default qdisc (see tc-pfifo_fast(8)). */
475 &tc_ops_other
, /* Some other qdisc. */
479 static unsigned int tc_ticks_to_bytes(unsigned int rate
, unsigned int ticks
);
480 static unsigned int tc_bytes_to_ticks(unsigned int rate
, unsigned int size
);
481 static unsigned int tc_buffer_per_jiffy(unsigned int rate
);
482 static uint32_t tc_time_to_ticks(uint32_t time
);
484 static struct tcmsg
*netdev_linux_tc_make_request(const struct netdev
*,
488 static int tc_add_policer(struct netdev
*,
489 uint32_t kbits_rate
, uint32_t kbits_burst
);
491 static int tc_parse_qdisc(const struct ofpbuf
*, const char **kind
,
492 struct nlattr
**options
);
493 static int tc_parse_class(const struct ofpbuf
*, unsigned int *queue_id
,
494 struct nlattr
**options
,
495 struct netdev_queue_stats
*);
496 static int tc_query_class(const struct netdev
*,
497 unsigned int handle
, unsigned int parent
,
498 struct ofpbuf
**replyp
);
499 static int tc_delete_class(const struct netdev
*, unsigned int handle
);
501 static int tc_del_qdisc(struct netdev
*netdev
);
502 static int tc_query_qdisc(const struct netdev
*netdev
);
505 tc_put_rtab(struct ofpbuf
*msg
, uint16_t type
, const struct tc_ratespec
*rate
);
506 static int tc_calc_cell_log(unsigned int mtu
);
507 static void tc_fill_rate(struct tc_ratespec
*rate
, uint64_t bps
, int mtu
);
508 static int tc_calc_buffer(unsigned int Bps
, int mtu
, uint64_t burst_bytes
);
511 /* This is set pretty low because we probably won't learn anything from the
512 * additional log messages. */
513 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(5, 20);
515 /* Polling miimon status for all ports causes performance degradation when
516 * handling a large number of ports. If there are no devices using miimon, then
517 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
519 * Readers do not depend on this variable synchronizing with the related
520 * changes in the device miimon status, so we can use atomic_count. */
521 static atomic_count miimon_cnt
= ATOMIC_COUNT_INIT(0);
523 static int netdev_linux_parse_vnet_hdr(struct dp_packet
*b
);
524 static void netdev_linux_prepend_vnet_hdr(struct dp_packet
*b
, int mtu
);
525 static int netdev_linux_do_ethtool(const char *name
, struct ethtool_cmd
*,
526 int cmd
, const char *cmd_name
);
527 static int get_flags(const struct netdev
*, unsigned int *flags
);
528 static int set_flags(const char *, unsigned int flags
);
529 static int update_flags(struct netdev_linux
*netdev
, enum netdev_flags off
,
530 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
531 OVS_REQUIRES(netdev
->mutex
);
532 static int get_ifindex(const struct netdev
*, int *ifindexp
);
533 static int do_set_addr(struct netdev
*netdev
,
534 int ioctl_nr
, const char *ioctl_name
,
535 struct in_addr addr
);
536 static int get_etheraddr(const char *netdev_name
, struct eth_addr
*ea
);
537 static int set_etheraddr(const char *netdev_name
, const struct eth_addr
);
538 static int af_packet_sock(void);
539 static bool netdev_linux_miimon_enabled(void);
540 static void netdev_linux_miimon_run(void);
541 static void netdev_linux_miimon_wait(void);
542 static int netdev_linux_get_mtu__(struct netdev_linux
*netdev
, int *mtup
);
545 is_tap_netdev(const struct netdev
*netdev
)
547 return netdev_get_class(netdev
) == &netdev_tap_class
;
551 netdev_linux_netnsid_update__(struct netdev_linux
*netdev
)
553 struct dpif_netlink_vport reply
;
557 error
= dpif_netlink_vport_get(netdev_get_name(&netdev
->up
), &reply
, &buf
);
559 if (error
== ENOENT
) {
560 /* Assume it is local if there is no API (e.g. if the openvswitch
561 * kernel module is not loaded). */
562 netnsid_set_local(&netdev
->netnsid
);
564 netnsid_unset(&netdev
->netnsid
);
569 netnsid_set(&netdev
->netnsid
, reply
.netnsid
);
575 netdev_linux_netnsid_update(struct netdev_linux
*netdev
)
577 if (netnsid_is_unset(netdev
->netnsid
)) {
578 if (netdev_get_class(&netdev
->up
) == &netdev_tap_class
) {
579 netnsid_set_local(&netdev
->netnsid
);
581 return netdev_linux_netnsid_update__(netdev
);
589 netdev_linux_netnsid_is_eq(struct netdev_linux
*netdev
, int nsid
)
591 netdev_linux_netnsid_update(netdev
);
592 return netnsid_eq(netdev
->netnsid
, nsid
);
596 netdev_linux_netnsid_is_remote(struct netdev_linux
*netdev
)
598 netdev_linux_netnsid_update(netdev
);
599 return netnsid_is_remote(netdev
->netnsid
);
602 static int netdev_linux_update_via_netlink(struct netdev_linux
*);
603 static void netdev_linux_update(struct netdev_linux
*netdev
, int,
604 const struct rtnetlink_change
*)
605 OVS_REQUIRES(netdev
->mutex
);
606 static void netdev_linux_changed(struct netdev_linux
*netdev
,
607 unsigned int ifi_flags
, unsigned int mask
)
608 OVS_REQUIRES(netdev
->mutex
);
610 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
611 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
612 * if no such socket could be created. */
613 static struct nl_sock
*
614 netdev_linux_notify_sock(void)
616 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
617 static struct nl_sock
*sock
;
618 unsigned int mcgroups
[] = {RTNLGRP_LINK
, RTNLGRP_IPV4_IFADDR
,
619 RTNLGRP_IPV6_IFADDR
, RTNLGRP_IPV6_IFINFO
};
621 if (ovsthread_once_start(&once
)) {
624 error
= nl_sock_create(NETLINK_ROUTE
, &sock
);
628 for (i
= 0; i
< ARRAY_SIZE(mcgroups
); i
++) {
629 error
= nl_sock_join_mcgroup(sock
, mcgroups
[i
]);
631 nl_sock_destroy(sock
);
637 nl_sock_listen_all_nsid(sock
, true);
638 ovsthread_once_done(&once
);
645 netdev_linux_miimon_enabled(void)
647 return atomic_count_get(&miimon_cnt
) > 0;
651 netdev_linux_kind_is_lag(const char *kind
)
653 if (!strcmp(kind
, "bond") || !strcmp(kind
, "team")) {
661 netdev_linux_update_lag(struct rtnetlink_change
*change
)
662 OVS_REQUIRES(lag_mutex
)
664 struct linux_lag_slave
*lag
;
666 if (change
->slave
&& netdev_linux_kind_is_lag(change
->slave
)) {
667 lag
= shash_find_data(&lag_shash
, change
->ifname
);
670 struct netdev
*master_netdev
;
671 char master_name
[IFNAMSIZ
];
675 if_indextoname(change
->master_ifindex
, master_name
);
676 master_netdev
= netdev_from_name(master_name
);
677 if (!master_netdev
) {
681 if (is_netdev_linux_class(master_netdev
->netdev_class
)) {
682 block_id
= netdev_get_block_id(master_netdev
);
684 netdev_close(master_netdev
);
688 lag
= xmalloc(sizeof *lag
);
689 lag
->block_id
= block_id
;
690 lag
->node
= shash_add(&lag_shash
, change
->ifname
, lag
);
692 /* delete ingress block in case it exists */
693 tc_add_del_qdisc(change
->if_index
, false, 0, TC_INGRESS
);
694 /* LAG master is linux netdev so add slave to same block. */
695 error
= tc_add_del_qdisc(change
->if_index
, true, block_id
,
698 VLOG_WARN("failed to bind LAG slave %s to master's block",
700 shash_delete(&lag_shash
, lag
->node
);
705 netdev_close(master_netdev
);
707 } else if (change
->master_ifindex
== 0) {
708 /* Check if this was a lag slave that has been freed. */
709 lag
= shash_find_data(&lag_shash
, change
->ifname
);
712 tc_add_del_qdisc(change
->if_index
, false, lag
->block_id
,
714 shash_delete(&lag_shash
, lag
->node
);
721 netdev_linux_run(const struct netdev_class
*netdev_class OVS_UNUSED
)
723 struct nl_sock
*sock
;
726 if (netdev_linux_miimon_enabled()) {
727 netdev_linux_miimon_run();
730 sock
= netdev_linux_notify_sock();
736 uint64_t buf_stub
[4096 / 8];
740 ofpbuf_use_stub(&buf
, buf_stub
, sizeof buf_stub
);
741 error
= nl_sock_recv(sock
, &buf
, &nsid
, false);
743 struct rtnetlink_change change
;
745 if (rtnetlink_parse(&buf
, &change
)) {
746 struct netdev
*netdev_
= NULL
;
747 char dev_name
[IFNAMSIZ
];
749 if (!change
.ifname
) {
750 change
.ifname
= if_indextoname(change
.if_index
, dev_name
);
754 netdev_
= netdev_from_name(change
.ifname
);
756 if (netdev_
&& is_netdev_linux_class(netdev_
->netdev_class
)) {
757 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
759 ovs_mutex_lock(&netdev
->mutex
);
760 netdev_linux_update(netdev
, nsid
, &change
);
761 ovs_mutex_unlock(&netdev
->mutex
);
765 rtnetlink_type_is_rtnlgrp_link(change
.nlmsg_type
)) {
767 /* Need to try updating the LAG information. */
768 ovs_mutex_lock(&lag_mutex
);
769 netdev_linux_update_lag(&change
);
770 ovs_mutex_unlock(&lag_mutex
);
772 netdev_close(netdev_
);
774 } else if (error
== ENOBUFS
) {
775 struct shash device_shash
;
776 struct shash_node
*node
;
780 shash_init(&device_shash
);
781 netdev_get_devices(&netdev_linux_class
, &device_shash
);
782 SHASH_FOR_EACH (node
, &device_shash
) {
783 struct netdev
*netdev_
= node
->data
;
784 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
787 ovs_mutex_lock(&netdev
->mutex
);
788 get_flags(netdev_
, &flags
);
789 netdev_linux_changed(netdev
, flags
, 0);
790 ovs_mutex_unlock(&netdev
->mutex
);
792 netdev_close(netdev_
);
794 shash_destroy(&device_shash
);
795 } else if (error
!= EAGAIN
) {
796 static struct vlog_rate_limit rll
= VLOG_RATE_LIMIT_INIT(1, 5);
797 VLOG_WARN_RL(&rll
, "error reading or parsing netlink (%s)",
798 ovs_strerror(error
));
805 netdev_linux_wait(const struct netdev_class
*netdev_class OVS_UNUSED
)
807 struct nl_sock
*sock
;
809 if (netdev_linux_miimon_enabled()) {
810 netdev_linux_miimon_wait();
812 sock
= netdev_linux_notify_sock();
814 nl_sock_wait(sock
, POLLIN
);
819 netdev_linux_changed(struct netdev_linux
*dev
,
820 unsigned int ifi_flags
, unsigned int mask
)
821 OVS_REQUIRES(dev
->mutex
)
823 netdev_change_seq_changed(&dev
->up
);
825 if ((dev
->ifi_flags
^ ifi_flags
) & IFF_RUNNING
) {
826 dev
->carrier_resets
++;
828 dev
->ifi_flags
= ifi_flags
;
830 dev
->cache_valid
&= mask
;
831 if (!(mask
& VALID_IN
)) {
832 netdev_get_addrs_list_flush();
837 netdev_linux_update__(struct netdev_linux
*dev
,
838 const struct rtnetlink_change
*change
)
839 OVS_REQUIRES(dev
->mutex
)
841 if (rtnetlink_type_is_rtnlgrp_link(change
->nlmsg_type
)) {
842 if (change
->nlmsg_type
== RTM_NEWLINK
) {
843 /* Keep drv-info, ip addresses, and NUMA id. */
844 netdev_linux_changed(dev
, change
->ifi_flags
,
845 VALID_DRVINFO
| VALID_IN
| VALID_NUMA_ID
);
847 /* Update netdev from rtnl-change msg. */
849 dev
->mtu
= change
->mtu
;
850 dev
->cache_valid
|= VALID_MTU
;
851 dev
->netdev_mtu_error
= 0;
854 if (!eth_addr_is_zero(change
->mac
)) {
855 dev
->etheraddr
= change
->mac
;
856 dev
->cache_valid
|= VALID_ETHERADDR
;
857 dev
->ether_addr_error
= 0;
859 /* The mac addr has been changed, report it now. */
860 rtnetlink_report_link();
863 if (change
->master
&& netdev_linux_kind_is_lag(change
->master
)) {
864 dev
->is_lag_master
= true;
867 dev
->ifindex
= change
->if_index
;
868 dev
->cache_valid
|= VALID_IFINDEX
;
869 dev
->get_ifindex_error
= 0;
873 netdev_linux_changed(dev
, change
->ifi_flags
, 0);
874 dev
->present
= false;
875 netnsid_unset(&dev
->netnsid
);
877 } else if (rtnetlink_type_is_rtnlgrp_addr(change
->nlmsg_type
)) {
878 /* Invalidates in4, in6. */
879 netdev_linux_changed(dev
, dev
->ifi_flags
, ~VALID_IN
);
886 netdev_linux_update(struct netdev_linux
*dev
, int nsid
,
887 const struct rtnetlink_change
*change
)
888 OVS_REQUIRES(dev
->mutex
)
890 if (netdev_linux_netnsid_is_eq(dev
, nsid
)) {
891 netdev_linux_update__(dev
, change
);
895 static struct netdev
*
896 netdev_linux_alloc(void)
898 struct netdev_linux
*netdev
= xzalloc(sizeof *netdev
);
903 netdev_linux_common_construct(struct netdev
*netdev_
)
905 /* Prevent any attempt to create (or open) a network device named "default"
906 * or "all". These device names are effectively reserved on Linux because
907 * /proc/sys/net/ipv4/conf/ always contains directories by these names. By
908 * itself this wouldn't call for any special treatment, but in practice if
909 * a program tries to create devices with these names, it causes the kernel
910 * to fire a "new device" notification event even though creation failed,
911 * and in turn that causes OVS to wake up and try to create them again,
912 * which ends up as a 100% CPU loop. */
913 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
914 const char *name
= netdev_
->name
;
915 if (!strcmp(name
, "default") || !strcmp(name
, "all")) {
916 static struct vlog_rate_limit rll
= VLOG_RATE_LIMIT_INIT(1, 1);
917 VLOG_WARN_RL(&rll
, "%s: Linux forbids network device with this name",
922 /* The device could be in the same network namespace or in another one. */
923 netnsid_unset(&netdev
->netnsid
);
924 ovs_mutex_init(&netdev
->mutex
);
926 if (userspace_tso_enabled()) {
927 netdev_
->ol_flags
|= NETDEV_TX_OFFLOAD_TCP_TSO
;
928 netdev_
->ol_flags
|= NETDEV_TX_OFFLOAD_TCP_CKSUM
;
929 netdev_
->ol_flags
|= NETDEV_TX_OFFLOAD_UDP_CKSUM
;
930 netdev_
->ol_flags
|= NETDEV_TX_OFFLOAD_SCTP_CKSUM
;
931 netdev_
->ol_flags
|= NETDEV_TX_OFFLOAD_IPV4_CKSUM
;
937 /* Creates system and internal devices. */
939 netdev_linux_construct(struct netdev
*netdev_
)
941 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
942 int error
= netdev_linux_common_construct(netdev_
);
947 error
= get_flags(&netdev
->up
, &netdev
->ifi_flags
);
948 if (error
== ENODEV
) {
949 if (netdev
->up
.netdev_class
!= &netdev_internal_class
) {
950 /* The device does not exist, so don't allow it to be opened. */
953 /* "Internal" netdevs have to be created as netdev objects before
954 * they exist in the kernel, because creating them in the kernel
955 * happens by passing a netdev object to dpif_port_add().
956 * Therefore, ignore the error. */
963 /* For most types of netdevs we open the device for each call of
964 * netdev_open(). However, this is not the case with tap devices,
965 * since it is only possible to open the device once. In this
966 * situation we share a single file descriptor, and consequently
967 * buffers, across all readers. Therefore once data is read it will
968 * be unavailable to other reads for tap devices. */
970 netdev_linux_construct_tap(struct netdev
*netdev_
)
972 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
973 static const char tap_dev
[] = "/dev/net/tun";
974 const char *name
= netdev_
->name
;
977 int error
= netdev_linux_common_construct(netdev_
);
982 /* Open tap device. */
983 netdev
->tap_fd
= open(tap_dev
, O_RDWR
);
984 if (netdev
->tap_fd
< 0) {
986 VLOG_WARN("opening \"%s\" failed: %s", tap_dev
, ovs_strerror(error
));
990 /* Create tap device. */
991 get_flags(&netdev
->up
, &netdev
->ifi_flags
);
992 ifr
.ifr_flags
= IFF_TAP
| IFF_NO_PI
;
993 if (userspace_tso_enabled()) {
994 ifr
.ifr_flags
|= IFF_VNET_HDR
;
997 ovs_strzcpy(ifr
.ifr_name
, name
, sizeof ifr
.ifr_name
);
998 if (ioctl(netdev
->tap_fd
, TUNSETIFF
, &ifr
) == -1) {
999 VLOG_WARN("%s: creating tap device failed: %s", name
,
1000 ovs_strerror(errno
));
1005 /* Make non-blocking. */
1006 error
= set_nonblocking(netdev
->tap_fd
);
1011 if (ioctl(netdev
->tap_fd
, TUNSETPERSIST
, 1)) {
1012 VLOG_WARN("%s: creating tap device failed (persist): %s", name
,
1013 ovs_strerror(errno
));
1018 if (userspace_tso_enabled()) {
1019 /* Old kernels don't support TUNSETOFFLOAD. If TUNSETOFFLOAD is
1020 * available, it will return EINVAL when a flag is unknown.
1021 * Therefore, try enabling offload with no flags to check
1022 * if TUNSETOFFLOAD support is available or not. */
1023 if (ioctl(netdev
->tap_fd
, TUNSETOFFLOAD
, 0) == 0 || errno
!= EINVAL
) {
1024 unsigned long oflags
= TUN_F_CSUM
| TUN_F_TSO4
| TUN_F_TSO6
;
1026 if (ioctl(netdev
->tap_fd
, TUNSETOFFLOAD
, oflags
) == -1) {
1027 VLOG_WARN("%s: enabling tap offloading failed: %s", name
,
1028 ovs_strerror(errno
));
1035 netdev
->present
= true;
1039 close(netdev
->tap_fd
);
1044 netdev_linux_destruct(struct netdev
*netdev_
)
1046 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1048 if (netdev
->tc
&& netdev
->tc
->ops
->tc_destroy
) {
1049 netdev
->tc
->ops
->tc_destroy(netdev
->tc
);
1052 if (netdev_get_class(netdev_
) == &netdev_tap_class
1053 && netdev
->tap_fd
>= 0)
1055 ioctl(netdev
->tap_fd
, TUNSETPERSIST
, 0);
1056 close(netdev
->tap_fd
);
1059 if (netdev
->miimon_interval
> 0) {
1060 atomic_count_dec(&miimon_cnt
);
1063 ovs_mutex_destroy(&netdev
->mutex
);
1067 netdev_linux_dealloc(struct netdev
*netdev_
)
1069 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1073 static struct netdev_rxq
*
1074 netdev_linux_rxq_alloc(void)
1076 struct netdev_rxq_linux
*rx
= xzalloc(sizeof *rx
);
/* netdev_linux_rxq_construct(): sets up the receive fd for an rxq.
 * For tap devices the existing tap fd is reused; for other devices a raw
 * AF_PACKET socket is created, configured for auxdata (VLAN info) and,
 * when userspace TSO is on, for virtio-net headers, made non-blocking,
 * bound to the device's ifindex, and given a BPF filter that accepts
 * only inbound packets. */
1081 netdev_linux_rxq_construct(struct netdev_rxq
*rxq_
)
1083 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1084 struct netdev
*netdev_
= rx
->up
.netdev
;
1085 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1088 ovs_mutex_lock(&netdev
->mutex
);
1089 rx
->is_tap
= is_tap_netdev(netdev_
);
/* Tap path: receive directly from the tap fd. */
1091 rx
->fd
= netdev
->tap_fd
;
1093 struct sockaddr_ll sll
;
/* Classic BPF program that rejects outgoing packets (packet type
 * PACKET_OUTGOING == 4) so we only see inbound traffic. */
1095 /* Result of tcpdump -dd inbound */
1096 static const struct sock_filter filt
[] = {
1097 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
1098 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
1099 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
1100 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
1102 static const struct sock_fprog fprog
= {
1103 ARRAY_SIZE(filt
), (struct sock_filter
*) filt
1106 /* Create file descriptor. */
1107 rx
->fd
= socket(PF_PACKET
, SOCK_RAW
, 0);
1110 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error
));
/* Ask the kernel for tpacket_auxdata control messages so VLAN tag
 * information survives the raw-socket path. */
1115 if (setsockopt(rx
->fd
, SOL_PACKET
, PACKET_AUXDATA
, &val
, sizeof val
)) {
1117 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
1118 netdev_get_name(netdev_
), ovs_strerror(error
));
/* With userspace TSO, packets carry a virtio_net_hdr prefix. */
1122 if (userspace_tso_enabled()
1123 && setsockopt(rx
->fd
, SOL_PACKET
, PACKET_VNET_HDR
, &val
,
1126 VLOG_ERR("%s: failed to enable vnet hdr in txq raw socket: %s",
1127 netdev_get_name(netdev_
), ovs_strerror(errno
));
1131 /* Set non-blocking mode. */
1132 error
= set_nonblocking(rx
->fd
);
1137 /* Get ethernet device index. */
1138 error
= get_ifindex(&netdev
->up
, &ifindex
);
1143 /* Bind to specific ethernet device. */
1144 memset(&sll
, 0, sizeof sll
);
1145 sll
.sll_family
= AF_PACKET
;
1146 sll
.sll_ifindex
= ifindex
;
1147 sll
.sll_protocol
= htons(ETH_P_ALL
);
1148 if (bind(rx
->fd
, (struct sockaddr
*) &sll
, sizeof sll
) < 0) {
1150 VLOG_ERR("%s: failed to bind raw socket (%s)",
1151 netdev_get_name(netdev_
), ovs_strerror(error
));
1155 /* Filter for only inbound packets. */
1156 error
= setsockopt(rx
->fd
, SOL_SOCKET
, SO_ATTACH_FILTER
, &fprog
,
1160 VLOG_ERR("%s: failed to attach filter (%s)",
1161 netdev_get_name(netdev_
), ovs_strerror(error
));
/* Success path unlock. */
1165 ovs_mutex_unlock(&netdev
->mutex
);
/* Error path unlock (reached via cleanup labels in the original). */
1173 ovs_mutex_unlock(&netdev
->mutex
);
/* netdev_linux_rxq_destruct(): tears down an rxq, releasing any TSO
 * auxiliary buffers still held for each burst slot.
 * NOTE(review): entries may be NULL after being handed off to a batch;
 * presumably dp_packet_delete() is a no-op for NULL -- verify. */
1178 netdev_linux_rxq_destruct(struct netdev_rxq
*rxq_
)
1180 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1187 for (i
= 0; i
< NETDEV_MAX_BURST
; i
++) {
1188 dp_packet_delete(rx
->aux_bufs
[i
]);
/* netdev_linux_rxq_dealloc(): frees the rxq object allocated by
 * netdev_linux_rxq_alloc(). */
1193 netdev_linux_rxq_dealloc(struct netdev_rxq
*rxq_
)
1195 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1201 auxdata_to_vlan_tpid(const struct tpacket_auxdata
*aux
, bool double_tagged
)
1203 if (aux
->tp_status
& TP_STATUS_VLAN_TPID_VALID
) {
1204 return htons(aux
->tp_vlan_tpid
);
1205 } else if (double_tagged
) {
1206 return htons(ETH_TYPE_VLAN_8021AD
);
1208 return htons(ETH_TYPE_VLAN_8021Q
);
/* Returns true if packet(7) auxdata 'aux' carries VLAN tag information:
 * either the kernel explicitly flagged a valid tag (TP_STATUS_VLAN_VALID)
 * or, on kernels predating that flag, the TCI field is simply nonzero. */
static bool
auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
{
    bool flagged = (aux->tp_status & TP_STATUS_VLAN_VALID) != 0;

    return flagged || aux->tp_vlan_tci != 0;
}
1219 * Receives packets from a raw socket in a batch for better performance;
1220 * it can receive at most NETDEV_MAX_BURST packets at once, and the received
1221 * packets are added into *batch. The return value is 0 or errno.
1223 * It also uses recvmmsg to reduce the overhead of multiple syscalls.
1226 netdev_linux_batch_rxq_recv_sock(struct netdev_rxq_linux
*rx
, int mtu
,
1227 struct dp_packet_batch
*batch
)
1232 int virtio_net_hdr_size
;
/* One iovec pair per burst slot: [IOV_PACKET] for the MTU-sized packet
 * buffer, [IOV_AUXBUF] (TSO only) for extra data beyond the MTU. */
1233 struct iovec iovs
[NETDEV_MAX_BURST
][IOV_TSO_SIZE
];
1234 struct cmsghdr
*cmsg
;
/* Per-slot control-message space, sized for a tpacket_auxdata. */
1236 struct cmsghdr cmsg
;
1237 char buffer
[CMSG_SPACE(sizeof(struct tpacket_auxdata
))];
1238 } cmsg_buffers
[NETDEV_MAX_BURST
];
1239 struct mmsghdr mmsgs
[NETDEV_MAX_BURST
];
1240 struct dp_packet
*buffers
[NETDEV_MAX_BURST
];
1243 if (userspace_tso_enabled()) {
1244 /* Use the buffer from the allocated packet below to receive MTU
1245 * sized packets and an aux_buf for extra TSO data. */
1246 iovlen
= IOV_TSO_SIZE
;
1247 virtio_net_hdr_size
= sizeof(struct virtio_net_hdr
);
1249 /* Use only the buffer from the allocated packet. */
1250 iovlen
= IOV_STD_SIZE
;
1251 virtio_net_hdr_size
= 0;
1254 /* The length here needs to be accounted in the same way when the
1255 * aux_buf is allocated so that it can be prepended to TSO buffer. */
1256 std_len
= virtio_net_hdr_size
+ VLAN_ETH_HEADER_LEN
+ mtu
;
/* Prepare one packet buffer and mmsghdr per burst slot. */
1257 for (i
= 0; i
< NETDEV_MAX_BURST
; i
++) {
1258 buffers
[i
] = dp_packet_new_with_headroom(std_len
, DP_NETDEV_HEADROOM
);
1259 iovs
[i
][IOV_PACKET
].iov_base
= dp_packet_data(buffers
[i
]);
1260 iovs
[i
][IOV_PACKET
].iov_len
= std_len
;
1261 if (iovlen
== IOV_TSO_SIZE
) {
1262 iovs
[i
][IOV_AUXBUF
].iov_base
= dp_packet_data(rx
->aux_bufs
[i
]);
1263 iovs
[i
][IOV_AUXBUF
].iov_len
= dp_packet_tailroom(rx
->aux_bufs
[i
]);
1266 mmsgs
[i
].msg_hdr
.msg_name
= NULL
;
1267 mmsgs
[i
].msg_hdr
.msg_namelen
= 0;
1268 mmsgs
[i
].msg_hdr
.msg_iov
= iovs
[i
];
1269 mmsgs
[i
].msg_hdr
.msg_iovlen
= iovlen
;
1270 mmsgs
[i
].msg_hdr
.msg_control
= &cmsg_buffers
[i
];
1271 mmsgs
[i
].msg_hdr
.msg_controllen
= sizeof cmsg_buffers
[i
];
1272 mmsgs
[i
].msg_hdr
.msg_flags
= 0;
/* Receive a burst, retrying if interrupted by a signal. */
1276 retval
= recvmmsg(rx
->fd
, mmsgs
, NETDEV_MAX_BURST
, MSG_TRUNC
, NULL
);
1277 } while (retval
< 0 && errno
== EINTR
);
/* On receive failure, free every pre-allocated buffer. */
1281 for (i
= 0; i
< NETDEV_MAX_BURST
; i
++) {
1282 dp_packet_delete(buffers
[i
]);
/* Process each received message. */
1288 for (i
= 0; i
< retval
; i
++) {
1289 struct dp_packet
*pkt
;
/* Drop runt frames shorter than an Ethernet header. */
1291 if (mmsgs
[i
].msg_len
< ETH_HEADER_LEN
) {
1292 struct netdev
*netdev_
= netdev_rxq_get_netdev(&rx
->up
);
1293 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1295 dp_packet_delete(buffers
[i
]);
1296 netdev
->rx_dropped
+= 1;
1297 VLOG_WARN_RL(&rl
, "%s: Dropped packet: less than ether hdr size",
1298 netdev_get_name(netdev_
));
1302 if (mmsgs
[i
].msg_len
> std_len
) {
1303 /* Build a single linear TSO packet by prepending the data from
1304 * std_len buffer to the aux_buf. */
1305 pkt
= rx
->aux_bufs
[i
];
1306 dp_packet_set_size(pkt
, mmsgs
[i
].msg_len
- std_len
);
1307 dp_packet_push(pkt
, dp_packet_data(buffers
[i
]), std_len
);
1308 /* The headroom should be the same in buffers[i], pkt and
1309 * DP_NETDEV_HEADROOM. */
1310 dp_packet_resize(pkt
, DP_NETDEV_HEADROOM
, 0);
1311 dp_packet_delete(buffers
[i
]);
/* Ownership of the aux_buf moved into the batch; slot will be
 * refilled by the caller on the next receive. */
1312 rx
->aux_bufs
[i
] = NULL
;
1314 dp_packet_set_size(buffers
[i
], mmsgs
[i
].msg_len
);
/* Validate and strip the virtio-net header when TSO is enabled. */
1318 if (virtio_net_hdr_size
&& netdev_linux_parse_vnet_hdr(pkt
)) {
1319 struct netdev
*netdev_
= netdev_rxq_get_netdev(&rx
->up
);
1320 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1322 /* Unexpected error situation: the virtio header is not present
1323 * or corrupted. Drop the packet but continue in case next ones
1325 dp_packet_delete(pkt
);
1326 netdev
->rx_dropped
+= 1;
1327 VLOG_WARN_RL(&rl
, "%s: Dropped packet: Invalid virtio net header",
1328 netdev_get_name(netdev_
));
/* Walk the control messages for kernel-stripped VLAN tag info and
 * reinsert the tag into the packet. */
1332 for (cmsg
= CMSG_FIRSTHDR(&mmsgs
[i
].msg_hdr
); cmsg
;
1333 cmsg
= CMSG_NXTHDR(&mmsgs
[i
].msg_hdr
, cmsg
)) {
1334 const struct tpacket_auxdata
*aux
;
1336 if (cmsg
->cmsg_level
!= SOL_PACKET
1337 || cmsg
->cmsg_type
!= PACKET_AUXDATA
1339 CMSG_LEN(sizeof(struct tpacket_auxdata
))) {
1343 aux
= ALIGNED_CAST(struct tpacket_auxdata
*, CMSG_DATA(cmsg
));
1344 if (auxdata_has_vlan_tci(aux
)) {
1345 struct eth_header
*eth
;
1348 eth
= dp_packet_data(pkt
);
/* If the frame already carries an 802.1Q tag, the stripped one
 * was an outer (802.1ad) tag. */
1349 double_tagged
= eth
->eth_type
== htons(ETH_TYPE_VLAN_8021Q
);
1352 auxdata_to_vlan_tpid(aux
, double_tagged
),
1353 htons(aux
->tp_vlan_tci
));
1357 dp_packet_batch_add(batch
, pkt
);
1360 /* Delete unused buffers. */
1361 for (; i
< NETDEV_MAX_BURST
; i
++) {
1362 dp_packet_delete(buffers
[i
]);
1369 * Receives packets from a tap device in a batch for better performance;
1370 * it can receive at most NETDEV_MAX_BURST packets at once, and the received
1371 * packets are added into *batch. The return value is 0 or errno.
1374 netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux
*rx
, int mtu
,
1375 struct dp_packet_batch
*batch
)
1377 int virtio_net_hdr_size
;
1383 if (userspace_tso_enabled()) {
1384 /* Use the buffer from the allocated packet below to receive MTU
1385 * sized packets and an aux_buf for extra TSO data. */
1386 iovlen
= IOV_TSO_SIZE
;
1387 virtio_net_hdr_size
= sizeof(struct virtio_net_hdr
);
1389 /* Use only the buffer from the allocated packet. */
1390 iovlen
= IOV_STD_SIZE
;
1391 virtio_net_hdr_size
= 0;
1394 /* The length here needs to be accounted in the same way when the
1395 * aux_buf is allocated so that it can be prepended to TSO buffer. */
1396 std_len
= virtio_net_hdr_size
+ VLAN_ETH_HEADER_LEN
+ mtu
;
/* Unlike the raw-socket path, the tap fd is read one packet at a time
 * with readv(); loop up to a full burst. */
1397 for (i
= 0; i
< NETDEV_MAX_BURST
; i
++) {
1398 struct dp_packet
*buffer
;
1399 struct dp_packet
*pkt
;
1400 struct iovec iov
[IOV_TSO_SIZE
];
1402 /* Assume Ethernet port. No need to set packet_type. */
1403 buffer
= dp_packet_new_with_headroom(std_len
, DP_NETDEV_HEADROOM
);
1404 iov
[IOV_PACKET
].iov_base
= dp_packet_data(buffer
);
1405 iov
[IOV_PACKET
].iov_len
= std_len
;
1406 if (iovlen
== IOV_TSO_SIZE
) {
1407 iov
[IOV_AUXBUF
].iov_base
= dp_packet_data(rx
->aux_bufs
[i
]);
1408 iov
[IOV_AUXBUF
].iov_len
= dp_packet_tailroom(rx
->aux_bufs
[i
]);
/* Read one packet, retrying if interrupted by a signal. */
1412 retval
= readv(rx
->fd
, iov
, iovlen
);
1413 } while (retval
< 0 && errno
== EINTR
);
/* Read failed: release the buffer prepared for this slot. */
1416 dp_packet_delete(buffer
);
1420 if (retval
> std_len
) {
1421 /* Build a single linear TSO packet by prepending the data from
1422 * std_len buffer to the aux_buf. */
1423 pkt
= rx
->aux_bufs
[i
];
1424 dp_packet_set_size(pkt
, retval
- std_len
);
1425 dp_packet_push(pkt
, dp_packet_data(buffer
), std_len
);
1426 /* The headroom should be the same in buffers[i], pkt and
1427 * DP_NETDEV_HEADROOM. */
1428 dp_packet_resize(pkt
, DP_NETDEV_HEADROOM
, 0);
1429 dp_packet_delete(buffer
);
/* Ownership of the aux_buf moved into the batch. */
1430 rx
->aux_bufs
[i
] = NULL
;
1432 dp_packet_set_size(buffer
, dp_packet_size(buffer
) + retval
);
/* Validate and strip the virtio-net header when TSO is enabled. */
1436 if (virtio_net_hdr_size
&& netdev_linux_parse_vnet_hdr(pkt
)) {
1437 struct netdev
*netdev_
= netdev_rxq_get_netdev(&rx
->up
);
1438 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1440 /* Unexpected error situation: the virtio header is not present
1441 * or corrupted. Drop the packet but continue in case next ones
1443 dp_packet_delete(pkt
);
1444 netdev
->rx_dropped
+= 1;
1445 VLOG_WARN_RL(&rl
, "%s: Dropped packet: Invalid virtio net header",
1446 netdev_get_name(netdev_
));
1450 dp_packet_batch_add(batch
, pkt
);
/* Report an error only if the very first read failed; a partial burst
 * is success. */
1453 if ((i
== 0) && (retval
< 0)) {
/* netdev_linux_rxq_recv(): common receive entry point.  Determines the
 * device MTU (falling back to ETH_PAYLOAD_MAX), replenishes the per-slot
 * TSO aux buffers when userspace TSO is enabled, then dispatches to the
 * tap or raw-socket batch receiver.  Returns 0 or a positive errno. */
1461 netdev_linux_rxq_recv(struct netdev_rxq
*rxq_
, struct dp_packet_batch
*batch
,
1464 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1465 struct netdev
*netdev
= rx
->up
.netdev
;
1469 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
)) {
/* MTU lookup failed; assume the standard Ethernet payload size. */
1470 mtu
= ETH_PAYLOAD_MAX
;
1473 if (userspace_tso_enabled()) {
1474 /* Allocate TSO packets. The packet has enough headroom to store
1475 * a full non-TSO packet. When a TSO packet is received, the data
1476 * from non-TSO buffer (std_len) is prepended to the TSO packet
1478 size_t std_len
= sizeof(struct virtio_net_hdr
) + VLAN_ETH_HEADER_LEN
1479 + DP_NETDEV_HEADROOM
+ mtu
;
1480 size_t data_len
= LINUX_RXQ_TSO_MAX_LEN
- std_len
;
/* Refill only the slots consumed (set to NULL) by a prior receive. */
1481 for (int i
= 0; i
< NETDEV_MAX_BURST
; i
++) {
1482 if (rx
->aux_bufs
[i
]) {
1486 rx
->aux_bufs
[i
] = dp_packet_new_with_headroom(data_len
, std_len
);
1490 dp_packet_batch_init(batch
);
1491 retval
= (rx
->is_tap
1492 ? netdev_linux_batch_rxq_recv_tap(rx
, mtu
, batch
)
1493 : netdev_linux_batch_rxq_recv_sock(rx
, mtu
, batch
));
/* EAGAIN (no data) and EMSGSIZE are expected; only log other errors. */
1496 if (retval
!= EAGAIN
&& retval
!= EMSGSIZE
) {
1497 VLOG_WARN_RL(&rl
, "error receiving Ethernet packet on %s: %s",
1498 netdev_rxq_get_name(rxq_
), ovs_strerror(errno
));
1510 netdev_linux_rxq_wait(struct netdev_rxq
*rxq_
)
1512 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1513 poll_fd_wait(rx
->fd
, POLLIN
);
/* netdev_linux_rxq_drain(): discards all packets waiting on 'rxq_'.
 * For tap devices, reads and drops up to the interface's tx queue length
 * worth of packets from the tap fd; for raw sockets, flushes the socket
 * receive buffer.  Returns 0 or a positive errno. */
1517 netdev_linux_rxq_drain(struct netdev_rxq
*rxq_
)
1519 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
/* Tap path: the queue length bounds how many packets could be pending. */
1522 int error
= af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_
), &ifr
,
1523 SIOCGIFTXQLEN
, "SIOCGIFTXQLEN");
1527 drain_fd(rx
->fd
, ifr
.ifr_qlen
);
/* Raw-socket path. */
1530 return drain_rcvbuf(rx
->fd
);
/* netdev_linux_sock_batch_send(): transmits every packet in 'batch' on the
 * shared AF_PACKET socket 'sock', addressed to interface 'ifindex', using
 * sendmmsg().  When 'tso' is set, each packet gets a virtio-net header
 * prepended first.  Returns 0 or a positive errno. */
1535 netdev_linux_sock_batch_send(int sock
, int ifindex
, bool tso
, int mtu
,
1536 struct dp_packet_batch
*batch
)
1538 const size_t size
= dp_packet_batch_size(batch
);
1539 /* We don't bother setting most fields in sockaddr_ll because the
1540 * kernel ignores them for SOCK_RAW. */
1541 struct sockaddr_ll sll
= { .sll_family
= AF_PACKET
,
1542 .sll_ifindex
= ifindex
};
/* One mmsghdr/iovec per packet, heap-allocated since the batch size is
 * only known at run time. */
1544 struct mmsghdr
*mmsg
= xmalloc(sizeof(*mmsg
) * size
);
1545 struct iovec
*iov
= xmalloc(sizeof(*iov
) * size
);
1547 struct dp_packet
*packet
;
1548 DP_PACKET_BATCH_FOR_EACH (i
, packet
, batch
) {
1550 netdev_linux_prepend_vnet_hdr(packet
, mtu
);
1553 iov
[i
].iov_base
= dp_packet_data(packet
);
1554 iov
[i
].iov_len
= dp_packet_size(packet
);
1555 mmsg
[i
].msg_hdr
= (struct msghdr
) { .msg_name
= &sll
,
1556 .msg_namelen
= sizeof sll
,
/* Keep sending until every message has gone out, restarting after
 * signal interruptions. */
1562 for (uint32_t ofs
= 0; ofs
< size
; ) {
1565 retval
= sendmmsg(sock
, mmsg
+ ofs
, size
- ofs
, 0);
1566 error
= retval
< 0 ? errno
: 0;
1567 } while (error
== EINTR
);
1579 /* Use the tap fd to send 'batch' to tap device 'netdev'. Using the tap fd is
1580 * essential, because packets sent to a tap device with an AF_PACKET socket
1581 * will loop back to be *received* again on the tap device. This doesn't occur
1582 * on other interface types because we attach a socket filter to the rx
1585 netdev_linux_tap_batch_send(struct netdev
*netdev_
, bool tso
, int mtu
,
1586 struct dp_packet_batch
*batch
)
1588 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1589 struct dp_packet
*packet
;
1591 /* The Linux tap driver returns EIO if the device is not up,
1592 * so if the device is not up, don't waste time sending it.
1593 * However, if the device is in another network namespace
1594 * then OVS can't retrieve the state. In that case, send the
1595 * packets anyway. */
1596 if (netdev
->present
&& !(netdev
->ifi_flags
& IFF_UP
)) {
/* Device is known to be down: count the whole batch as dropped. */
1597 netdev
->tx_dropped
+= dp_packet_batch_size(batch
);
1601 DP_PACKET_BATCH_FOR_EACH (i
, packet
, batch
) {
/* With userspace TSO, the tap expects a virtio-net header prefix. */
1607 netdev_linux_prepend_vnet_hdr(packet
, mtu
);
1610 size
= dp_packet_size(packet
);
/* Write one packet, retrying if interrupted by a signal. */
1612 retval
= write(netdev
->tap_fd
, dp_packet_data(packet
), size
);
1613 error
= retval
< 0 ? errno
: 0;
1614 } while (error
== EINTR
);
1617 /* The Linux tap driver returns EIO if the device is not up. From
1618 * the OVS side this is not an error, so we ignore it; otherwise,
1619 * return the error. */
1623 } else if (retval
!= size
) {
1624 VLOG_WARN_RL(&rl
, "sent partial Ethernet packet (%"PRIuSIZE
" "
1625 "bytes of %"PRIuSIZE
") on %s",
1626 retval
, size
, netdev_get_name(netdev_
));
/* netdev_linux_get_numa_id__(): returns the NUMA node of the device by
 * reading /sys/class/net/<name>/device/numa_node, caching the result in
 * 'netdev'.  Falls back to node 0 for single-NUMA systems, virtual
 * devices, unreadable sysfs entries, and names containing path
 * separators (which would escape the sysfs directory). */
1634 netdev_linux_get_numa_id__(struct netdev_linux
*netdev
)
1635 OVS_REQUIRES(netdev
->mutex
)
1637 char *numa_node_path
;
/* Serve from cache when already determined. */
1642 if (netdev
->cache_valid
& VALID_NUMA_ID
) {
1643 return netdev
->numa_id
;
/* Default and mark cached up front; overwritten below on success. */
1646 netdev
->numa_id
= 0;
1647 netdev
->cache_valid
|= VALID_NUMA_ID
;
1649 if (ovs_numa_get_n_numas() < 2) {
1650 /* No need to check on system with a single NUMA node. */
/* Reject names with path separators: they would alter the sysfs path. */
1654 name
= netdev_get_name(&netdev
->up
);
1655 if (strpbrk(name
, "/\\")) {
1656 VLOG_ERR_RL(&rl
, "\"%s\" is not a valid name for a port. "
1657 "A valid name must not include '/' or '\\'."
1658 "Using numa_id 0", name
);
1662 numa_node_path
= xasprintf("/sys/class/net/%s/device/numa_node", name
);
1664 stream
= fopen(numa_node_path
, "r");
1666 /* Virtual device does not have this info. */
1667 VLOG_INFO_RL(&rl
, "%s: Can't open '%s': %s, using numa_id 0",
1668 name
, numa_node_path
, ovs_strerror(errno
));
1669 free(numa_node_path
);
1673 if (fscanf(stream
, "%d", &node_id
) != 1
1674 || !ovs_numa_numa_id_is_valid(node_id
)) {
1675 VLOG_WARN_RL(&rl
, "%s: Can't detect NUMA node, using numa_id 0", name
);
1679 netdev
->numa_id
= node_id
;
1681 free(numa_node_path
);
/* netdev_linux_get_numa_id(): thread-safe wrapper that takes the device
 * mutex around netdev_linux_get_numa_id__(). */
1685 static int OVS_UNUSED
1686 netdev_linux_get_numa_id(const struct netdev
*netdev_
)
1688 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1691 ovs_mutex_lock(&netdev
->mutex
);
1692 numa_id
= netdev_linux_get_numa_id__(netdev
);
1693 ovs_mutex_unlock(&netdev
->mutex
);
1698 /* Sends 'batch' on 'netdev'. Returns 0 if successful, otherwise a positive
1699 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1700 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1701 * the packet is too big or too small to transmit on the device.
1703 * The kernel maintains a packet transmission queue, so the caller is not
1704 * expected to do additional queuing of packets. */
1706 netdev_linux_send(struct netdev
*netdev_
, int qid OVS_UNUSED
,
1707 struct dp_packet_batch
*batch
,
1708 bool concurrent_txq OVS_UNUSED
)
1710 bool tso
= userspace_tso_enabled();
1711 int mtu
= ETH_PAYLOAD_MAX
;
/* Best-effort MTU lookup; the ETH_PAYLOAD_MAX default stands on failure. */
1716 netdev_linux_get_mtu__(netdev_linux_cast(netdev_
), &mtu
);
/* Non-tap devices go out via the shared AF_PACKET socket; remote-netns
 * devices cannot be reached that way. */
1719 if (!is_tap_netdev(netdev_
)) {
1720 if (netdev_linux_netnsid_is_remote(netdev_linux_cast(netdev_
))) {
1725 sock
= af_packet_sock();
1731 int ifindex
= netdev_get_ifindex(netdev_
);
1737 error
= netdev_linux_sock_batch_send(sock
, ifindex
, tso
, mtu
, batch
);
/* Tap devices must be written through the tap fd (see
 * netdev_linux_tap_batch_send comment). */
1739 error
= netdev_linux_tap_batch_send(netdev_
, tso
, mtu
, batch
);
1742 if (error
== ENOBUFS
) {
1743 /* The Linux AF_PACKET implementation never blocks waiting
1744 * for room for packets, instead returning ENOBUFS.
1745 * Translate this into EAGAIN for the caller. */
1748 VLOG_WARN_RL(&rl
, "error sending Ethernet packet on %s: %s",
1749 netdev_get_name(netdev_
), ovs_strerror(error
));
/* Packets are consumed by this function regardless of outcome. */
1754 dp_packet_delete_batch(batch
, true);
1758 /* Registers with the poll loop to wake up from the next call to poll_block()
1759 * when the packet transmission queue has sufficient room to transmit a packet
1760 * with netdev_send().
1762 * The kernel maintains a packet transmission queue, so the client is not
1763 * expected to do additional queuing of packets. Thus, this function is
1764 * unlikely to ever be used. It is included for completeness. */
1766 netdev_linux_send_wait(struct netdev
*netdev
, int qid OVS_UNUSED
)
1768 if (is_tap_netdev(netdev
)) {
1769 /* TAP device always accepts packets.*/
1770 poll_immediate_wake();
1774 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1775 * otherwise a positive errno value. */
1777 netdev_linux_set_etheraddr(struct netdev
*netdev_
, const struct eth_addr mac
)
1779 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1780 enum netdev_flags old_flags
= 0;
1783 ovs_mutex_lock(&netdev
->mutex
);
/* Devices in a foreign network namespace cannot be configured here. */
1784 if (netdev_linux_netnsid_is_remote(netdev
)) {
/* Short-circuit when the cached address already matches (or a cached
 * error makes trying pointless); otherwise invalidate the cache. */
1789 if (netdev
->cache_valid
& VALID_ETHERADDR
) {
1790 error
= netdev
->ether_addr_error
;
1791 if (error
|| eth_addr_equals(netdev
->etheraddr
, mac
)) {
1794 netdev
->cache_valid
&= ~VALID_ETHERADDR
;
1797 /* Tap devices must be brought down before setting the address. */
1798 if (is_tap_netdev(netdev_
)) {
1799 update_flags(netdev
, NETDEV_UP
, 0, &old_flags
);
1801 error
= set_etheraddr(netdev_get_name(netdev_
), mac
);
/* Cache the outcome, including ENODEV (device gone). */
1802 if (!error
|| error
== ENODEV
) {
1803 netdev
->ether_addr_error
= error
;
1804 netdev
->cache_valid
|= VALID_ETHERADDR
;
1806 netdev
->etheraddr
= mac
;
/* Restore a tap device that was up before we downed it. */
1810 if (is_tap_netdev(netdev_
) && old_flags
& NETDEV_UP
) {
1811 update_flags(netdev
, 0, NETDEV_UP
, &old_flags
);
1815 ovs_mutex_unlock(&netdev
->mutex
);
1819 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1821 netdev_linux_get_etheraddr(const struct netdev
*netdev_
, struct eth_addr
*mac
)
1823 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1826 ovs_mutex_lock(&netdev
->mutex
);
/* Populate the cache, preferring netlink over the legacy ioctl. */
1827 if (!(netdev
->cache_valid
& VALID_ETHERADDR
)) {
1828 netdev_linux_update_via_netlink(netdev
);
1831 if (!(netdev
->cache_valid
& VALID_ETHERADDR
)) {
1832 /* Fall back to ioctl if netlink fails */
1833 netdev
->ether_addr_error
= get_etheraddr(netdev_get_name(netdev_
),
1834 &netdev
->etheraddr
);
1835 netdev
->cache_valid
|= VALID_ETHERADDR
;
1838 error
= netdev
->ether_addr_error
;
1840 *mac
= netdev
->etheraddr
;
1842 ovs_mutex_unlock(&netdev
->mutex
);
/* netdev_linux_get_mtu__(): fills '*mtup' with the device MTU, caching
 * the value on 'netdev'.  Queries netlink first and falls back to the
 * SIOCGIFMTU ioctl.  Returns 0 or a positive errno.  Caller holds the
 * device mutex. */
1848 netdev_linux_get_mtu__(struct netdev_linux
*netdev
, int *mtup
)
1852 if (!(netdev
->cache_valid
& VALID_MTU
)) {
1853 netdev_linux_update_via_netlink(netdev
);
1856 if (!(netdev
->cache_valid
& VALID_MTU
)) {
1857 /* Fall back to ioctl if netlink fails */
1860 netdev
->netdev_mtu_error
= af_inet_ifreq_ioctl(
1861 netdev_get_name(&netdev
->up
), &ifr
, SIOCGIFMTU
, "SIOCGIFMTU");
1862 netdev
->mtu
= ifr
.ifr_mtu
;
1863 netdev
->cache_valid
|= VALID_MTU
;
1866 error
= netdev
->netdev_mtu_error
;
1868 *mtup
= netdev
->mtu
;
1874 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1875 * in bytes, not including the hardware header; thus, this is typically 1500
1876 * bytes for Ethernet devices. */
1878 netdev_linux_get_mtu(const struct netdev
*netdev_
, int *mtup
)
1880 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
/* Thread-safe wrapper around netdev_linux_get_mtu__(). */
1883 ovs_mutex_lock(&netdev
->mutex
);
1884 error
= netdev_linux_get_mtu__(netdev
, mtup
);
1885 ovs_mutex_unlock(&netdev
->mutex
);
1890 /* Sets the maximum size of transmitted (MTU) for given device using linux
1891 * networking ioctl interface.
1894 netdev_linux_set_mtu(struct netdev
*netdev_
, int mtu
)
1896 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1900 ovs_mutex_lock(&netdev
->mutex
);
/* Devices in a foreign network namespace cannot be configured here. */
1901 if (netdev_linux_netnsid_is_remote(netdev
)) {
/* AF_XDP devices have additional MTU constraints; validate first. */
1907 if (netdev_get_class(netdev_
) == &netdev_afxdp_class
) {
1908 error
= netdev_afxdp_verify_mtu_size(netdev_
, mtu
);
/* Short-circuit when the cached MTU already matches (or a cached error
 * makes trying pointless); otherwise invalidate the cache. */
1915 if (netdev
->cache_valid
& VALID_MTU
) {
1916 error
= netdev
->netdev_mtu_error
;
1917 if (error
|| netdev
->mtu
== mtu
) {
1920 netdev
->cache_valid
&= ~VALID_MTU
;
1923 error
= af_inet_ifreq_ioctl(netdev_get_name(netdev_
), &ifr
,
1924 SIOCSIFMTU
, "SIOCSIFMTU");
/* Cache the outcome, including ENODEV (device gone). */
1925 if (!error
|| error
== ENODEV
) {
1926 netdev
->netdev_mtu_error
= error
;
1927 netdev
->mtu
= ifr
.ifr_mtu
;
1928 netdev
->cache_valid
|= VALID_MTU
;
1931 ovs_mutex_unlock(&netdev
->mutex
);
1935 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1936 * On failure, returns a negative errno value. */
1938 netdev_linux_get_ifindex(const struct netdev
*netdev_
)
1940 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1943 ovs_mutex_lock(&netdev
->mutex
);
/* Devices in a foreign network namespace cannot be queried here. */
1944 if (netdev_linux_netnsid_is_remote(netdev
)) {
1948 error
= get_ifindex(netdev_
, &ifindex
);
1951 ovs_mutex_unlock(&netdev
->mutex
);
/* Errno is negated to distinguish errors from valid (positive) indexes. */
1952 return error
? -error
: ifindex
;
/* netdev_linux_get_carrier(): reports link state in '*carrier'.  Uses the
 * cached MII monitor result when miimon polling is active; otherwise
 * derives the state from the interface's IFF_RUNNING flag. */
1956 netdev_linux_get_carrier(const struct netdev
*netdev_
, bool *carrier
)
1958 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1960 ovs_mutex_lock(&netdev
->mutex
);
1961 if (netdev
->miimon_interval
> 0) {
1962 *carrier
= netdev
->miimon
;
1964 *carrier
= (netdev
->ifi_flags
& IFF_RUNNING
) != 0;
1966 ovs_mutex_unlock(&netdev
->mutex
);
1971 static long long int
1972 netdev_linux_get_carrier_resets(const struct netdev
*netdev_
)
1974 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1975 long long int carrier_resets
;
1977 ovs_mutex_lock(&netdev
->mutex
);
1978 carrier_resets
= netdev
->carrier_resets
;
1979 ovs_mutex_unlock(&netdev
->mutex
);
1981 return carrier_resets
;
/* netdev_linux_do_miimon(): issues MII ioctl 'cmd' for device 'name',
 * copying '*data' into and back out of the ifreq's ifr_data area (the
 * kernel's MII ioctls pass their payload inline there).  Returns 0 or a
 * positive errno. */
1985 netdev_linux_do_miimon(const char *name
, int cmd
, const char *cmd_name
,
1986 struct mii_ioctl_data
*data
)
1991 memset(&ifr
, 0, sizeof ifr
);
/* mii_ioctl_data is stored inline in the ifreq, overlaying ifr_data. */
1992 memcpy(&ifr
.ifr_data
, data
, sizeof *data
);
1993 error
= af_inet_ifreq_ioctl(name
, &ifr
, cmd
, cmd_name
);
1994 memcpy(data
, &ifr
.ifr_data
, sizeof *data
);
/* netdev_linux_get_miimon(): probes link status for device 'name' into
 * '*miimon'.  Tries the MII BMSR register via SIOCGMIIPHY/SIOCGMIIREG
 * first; if MII is unsupported, falls back to ETHTOOL_GLINK. */
2000 netdev_linux_get_miimon(const char *name
, bool *miimon
)
2002 struct mii_ioctl_data data
;
2007 memset(&data
, 0, sizeof data
);
2008 error
= netdev_linux_do_miimon(name
, SIOCGMIIPHY
, "SIOCGMIIPHY", &data
);
2010 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
2011 data
.reg_num
= MII_BMSR
;
2012 error
= netdev_linux_do_miimon(name
, SIOCGMIIREG
, "SIOCGMIIREG",
/* BMSR_LSTATUS bit indicates link up. */
2016 *miimon
= !!(data
.val_out
& BMSR_LSTATUS
);
2020 struct ethtool_cmd ecmd
;
2022 VLOG_DBG_RL(&rl
, "%s: failed to query MII, falling back to ethtool",
2025 COVERAGE_INC(netdev_get_ethtool
);
2026 memset(&ecmd
, 0, sizeof ecmd
);
2027 error
= netdev_linux_do_ethtool(name
, &ecmd
, ETHTOOL_GLINK
,
/* ETHTOOL_GLINK replies with an ethtool_value overlaid on ecmd. */
2030 struct ethtool_value eval
;
2032 memcpy(&eval
, &ecmd
, sizeof eval
);
2033 *miimon
= !!eval
.data
;
2035 VLOG_WARN_RL(&rl
, "%s: ethtool link status failed", name
);
/* netdev_linux_set_miimon_interval(): sets the MII polling interval for
 * 'netdev_' (milliseconds; clamped to a 100 ms minimum, <= 0 disables),
 * keeping the global count of miimon-enabled devices in sync and forcing
 * an immediate poll by expiring the device's miimon timer. */
2043 netdev_linux_set_miimon_interval(struct netdev
*netdev_
,
2044 long long int interval
)
2046 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2048 ovs_mutex_lock(&netdev
->mutex
);
/* Normalize: positive intervals are floored at 100 ms. */
2049 interval
= interval
> 0 ? MAX(interval
, 100) : 0;
2050 if (netdev
->miimon_interval
!= interval
) {
/* Maintain the global counter on enable/disable transitions only. */
2051 if (interval
&& !netdev
->miimon_interval
) {
2052 atomic_count_inc(&miimon_cnt
);
2053 } else if (!interval
&& netdev
->miimon_interval
) {
2054 atomic_count_dec(&miimon_cnt
);
2057 netdev
->miimon_interval
= interval
;
/* Expire the timer so the next miimon_run() polls immediately. */
2058 timer_set_expired(&netdev
->miimon_timer
);
2060 ovs_mutex_unlock(&netdev
->mutex
);
/* netdev_linux_miimon_run(): periodic worker.  For every open Linux
 * netdev with miimon enabled and an expired timer, re-polls link status,
 * records changes (triggering change notification), and re-arms the
 * timer. */
2066 netdev_linux_miimon_run(void)
2068 struct shash device_shash
;
2069 struct shash_node
*node
;
2071 shash_init(&device_shash
);
2072 netdev_get_devices(&netdev_linux_class
, &device_shash
);
2073 SHASH_FOR_EACH (node
, &device_shash
) {
2074 struct netdev
*netdev
= node
->data
;
2075 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
2078 ovs_mutex_lock(&dev
->mutex
);
2079 if (dev
->miimon_interval
> 0 && timer_expired(&dev
->miimon_timer
)) {
2080 netdev_linux_get_miimon(dev
->up
.name
, &miimon
);
2081 if (miimon
!= dev
->miimon
) {
2082 dev
->miimon
= miimon
;
/* Link state changed: bump the change sequence so waiters notice. */
2083 netdev_linux_changed(dev
, dev
->ifi_flags
, 0);
2086 timer_set_duration(&dev
->miimon_timer
, dev
->miimon_interval
);
2088 ovs_mutex_unlock(&dev
->mutex
);
/* netdev_get_devices() took a reference on each device; release it. */
2089 netdev_close(netdev
);
2092 shash_destroy(&device_shash
);
/* netdev_linux_miimon_wait(): registers a poll-loop wakeup for each
 * miimon-enabled device's timer so miimon_run() is called on schedule. */
2096 netdev_linux_miimon_wait(void)
2098 struct shash device_shash
;
2099 struct shash_node
*node
;
2101 shash_init(&device_shash
);
2102 netdev_get_devices(&netdev_linux_class
, &device_shash
);
2103 SHASH_FOR_EACH (node
, &device_shash
) {
2104 struct netdev
*netdev
= node
->data
;
2105 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
2107 ovs_mutex_lock(&dev
->mutex
);
2108 if (dev
->miimon_interval
> 0) {
2109 timer_wait(&dev
->miimon_timer
);
2111 ovs_mutex_unlock(&dev
->mutex
);
/* Release the reference taken by netdev_get_devices(). */
2112 netdev_close(netdev
);
2114 shash_destroy(&device_shash
);
/* Exchanges the values pointed to by 'a' and 'b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t tmp = *a;

    *a = *b;
    *b = tmp;
}
2125 /* Copies 'src' into 'dst', performing format conversion in the process.
2127 * 'src' is allowed to be misaligned. */
2129 netdev_stats_from_ovs_vport_stats(struct netdev_stats
*dst
,
2130 const struct ovs_vport_stats
*src
)
/* Fields the vport layer tracks: copied with 32-bit-aligned-safe reads. */
2132 dst
->rx_packets
= get_32aligned_u64(&src
->rx_packets
);
2133 dst
->tx_packets
= get_32aligned_u64(&src
->tx_packets
);
2134 dst
->rx_bytes
= get_32aligned_u64(&src
->rx_bytes
);
2135 dst
->tx_bytes
= get_32aligned_u64(&src
->tx_bytes
);
2136 dst
->rx_errors
= get_32aligned_u64(&src
->rx_errors
);
2137 dst
->tx_errors
= get_32aligned_u64(&src
->tx_errors
);
2138 dst
->rx_dropped
= get_32aligned_u64(&src
->rx_dropped
);
2139 dst
->tx_dropped
= get_32aligned_u64(&src
->tx_dropped
);
/* Fields ovs_vport_stats does not track: zeroed explicitly. */
2141 dst
->collisions
= 0;
2142 dst
->rx_length_errors
= 0;
2143 dst
->rx_over_errors
= 0;
2144 dst
->rx_crc_errors
= 0;
2145 dst
->rx_frame_errors
= 0;
2146 dst
->rx_fifo_errors
= 0;
2147 dst
->rx_missed_errors
= 0;
2148 dst
->tx_aborted_errors
= 0;
2149 dst
->tx_carrier_errors
= 0;
2150 dst
->tx_fifo_errors
= 0;
2151 dst
->tx_heartbeat_errors
= 0;
2152 dst
->tx_window_errors
= 0;
/* get_stats_via_vport__(): fetches stats for 'netdev' from the datapath
 * vport layer over netlink, converting them into '*stats'.  Returns 0 or
 * a positive errno (including when the reply carries no stats). */
2156 get_stats_via_vport__(const struct netdev
*netdev
, struct netdev_stats
*stats
)
2158 struct dpif_netlink_vport reply
;
2162 error
= dpif_netlink_vport_get(netdev_get_name(netdev
), &reply
, &buf
);
/* A reply without a stats attribute is treated as an error. */
2165 } else if (!reply
.stats
) {
2170 netdev_stats_from_ovs_vport_stats(stats
, reply
.stats
);
/* get_stats_via_vport(): wrapper around get_stats_via_vport__() that
 * caches the error outcome on 'netdev_' so repeated failures (other than
 * the first) are not retried or re-logged needlessly. */
2178 get_stats_via_vport(const struct netdev
*netdev_
,
2179 struct netdev_stats
*stats
)
2181 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
/* Retry when the last attempt succeeded or no attempt has been cached. */
2183 if (!netdev
->vport_stats_error
||
2184 !(netdev
->cache_valid
& VALID_VPORT_STAT_ERROR
)) {
2187 error
= get_stats_via_vport__(netdev_
, stats
);
/* ENOENT/ENODEV just mean the device isn't a vport; don't warn. */
2188 if (error
&& error
!= ENOENT
&& error
!= ENODEV
) {
2189 VLOG_WARN_RL(&rl
, "%s: obtaining netdev stats via vport failed "
2191 netdev_get_name(netdev_
), ovs_strerror(error
));
2193 netdev
->vport_stats_error
= error
;
2194 netdev
->cache_valid
|= VALID_VPORT_STAT_ERROR
;
2198 /* Retrieves current device stats for 'netdev-linux'. */
2200 netdev_linux_get_stats(const struct netdev
*netdev_
,
2201 struct netdev_stats
*stats
)
2203 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2204 struct netdev_stats dev_stats
;
2207 ovs_mutex_lock(&netdev
->mutex
);
/* Primary source: datapath vport stats; kernel netdev stats via netlink
 * serve as fallback and supply fields the vport layer lacks. */
2208 get_stats_via_vport(netdev_
, stats
);
2209 error
= get_stats_via_netlink(netdev_
, &dev_stats
);
2211 if (!netdev
->vport_stats_error
) {
2214 } else if (netdev
->vport_stats_error
) {
2215 /* stats not available from OVS then use netdev stats. */
/* Merge in fields only the kernel netdev stats track. */
2218 stats
->multicast
+= dev_stats
.multicast
;
2219 stats
->collisions
+= dev_stats
.collisions
;
2220 stats
->rx_length_errors
+= dev_stats
.rx_length_errors
;
2221 stats
->rx_over_errors
+= dev_stats
.rx_over_errors
;
2222 stats
->rx_crc_errors
+= dev_stats
.rx_crc_errors
;
2223 stats
->rx_frame_errors
+= dev_stats
.rx_frame_errors
;
2224 stats
->rx_fifo_errors
+= dev_stats
.rx_fifo_errors
;
2225 stats
->rx_missed_errors
+= dev_stats
.rx_missed_errors
;
2226 stats
->tx_aborted_errors
+= dev_stats
.tx_aborted_errors
;
2227 stats
->tx_carrier_errors
+= dev_stats
.tx_carrier_errors
;
2228 stats
->tx_fifo_errors
+= dev_stats
.tx_fifo_errors
;
2229 stats
->tx_heartbeat_errors
+= dev_stats
.tx_heartbeat_errors
;
2230 stats
->tx_window_errors
+= dev_stats
.tx_window_errors
;
2232 ovs_mutex_unlock(&netdev
->mutex
);
2237 /* Retrieves current device stats for 'netdev-tap' netdev or
2238 * netdev-internal. */
2240 netdev_tap_get_stats(const struct netdev
*netdev_
, struct netdev_stats
*stats
)
2242 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2243 struct netdev_stats dev_stats
;
2246 ovs_mutex_lock(&netdev
->mutex
);
2247 get_stats_via_vport(netdev_
, stats
);
2248 error
= get_stats_via_netlink(netdev_
, &dev_stats
);
2250 if (!netdev
->vport_stats_error
) {
2253 } else if (netdev
->vport_stats_error
) {
2254 /* Transmit and receive stats will appear to be swapped relative to the
2255 * other ports since we are the one sending the data, not a remote
2256 * computer. For consistency, we swap them back here. This does not
2257 * apply if we are getting stats from the vport layer because it always
2258 * tracks stats from the perspective of the switch. */
2261 swap_uint64(&stats
->rx_packets
, &stats
->tx_packets
);
2262 swap_uint64(&stats
->rx_bytes
, &stats
->tx_bytes
);
2263 swap_uint64(&stats
->rx_errors
, &stats
->tx_errors
);
2264 swap_uint64(&stats
->rx_dropped
, &stats
->tx_dropped
);
/* The kernel netdev stats don't track these for tap; zero them. */
2265 stats
->rx_length_errors
= 0;
2266 stats
->rx_over_errors
= 0;
2267 stats
->rx_crc_errors
= 0;
2268 stats
->rx_frame_errors
= 0;
2269 stats
->rx_fifo_errors
= 0;
2270 stats
->rx_missed_errors
= 0;
2271 stats
->tx_aborted_errors
= 0;
2272 stats
->tx_carrier_errors
= 0;
2273 stats
->tx_fifo_errors
= 0;
2274 stats
->tx_heartbeat_errors
= 0;
2275 stats
->tx_window_errors
= 0;
2277 /* Use kernel netdev's packet and byte counts since vport counters
2278 * do not reflect packet counts on the wire when GSO, TSO or GRO
2280 stats
->rx_packets
= dev_stats
.tx_packets
;
2281 stats
->rx_bytes
= dev_stats
.tx_bytes
;
2282 stats
->tx_packets
= dev_stats
.rx_packets
;
2283 stats
->tx_bytes
= dev_stats
.rx_bytes
;
/* rx/tx deliberately crossed: see the perspective-swap comment above. */
2285 stats
->rx_dropped
+= dev_stats
.tx_dropped
;
2286 stats
->tx_dropped
+= dev_stats
.rx_dropped
;
2288 stats
->rx_errors
+= dev_stats
.tx_errors
;
2289 stats
->tx_errors
+= dev_stats
.rx_errors
;
2291 stats
->multicast
+= dev_stats
.multicast
;
2292 stats
->collisions
+= dev_stats
.collisions
;
/* Add drops recorded by this layer (tap write failures / rx drops). */
2294 stats
->tx_dropped
+= netdev
->tx_dropped
;
2295 stats
->rx_dropped
+= netdev
->rx_dropped
;
2296 ovs_mutex_unlock(&netdev
->mutex
);
/* netdev_internal_get_stats(): stats for internal devices come solely
 * from the datapath vport layer; returns the cached vport error. */
2302 netdev_internal_get_stats(const struct netdev
*netdev_
,
2303 struct netdev_stats
*stats
)
2305 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2308 ovs_mutex_lock(&netdev
->mutex
);
2309 get_stats_via_vport(netdev_
, stats
);
2310 error
= netdev
->vport_stats_error
;
2311 ovs_mutex_unlock(&netdev
->mutex
);
2317 netdev_linux_read_features(struct netdev_linux
*netdev
)
2319 struct ethtool_cmd ecmd
;
2323 if (netdev
->cache_valid
& VALID_FEATURES
) {
2327 COVERAGE_INC(netdev_get_ethtool
);
2328 memset(&ecmd
, 0, sizeof ecmd
);
2329 error
= netdev_linux_do_ethtool(netdev
->up
.name
, &ecmd
,
2330 ETHTOOL_GSET
, "ETHTOOL_GSET");
2335 /* Supported features. */
2336 netdev
->supported
= 0;
2337 if (ecmd
.supported
& SUPPORTED_10baseT_Half
) {
2338 netdev
->supported
|= NETDEV_F_10MB_HD
;
2340 if (ecmd
.supported
& SUPPORTED_10baseT_Full
) {
2341 netdev
->supported
|= NETDEV_F_10MB_FD
;
2343 if (ecmd
.supported
& SUPPORTED_100baseT_Half
) {
2344 netdev
->supported
|= NETDEV_F_100MB_HD
;
2346 if (ecmd
.supported
& SUPPORTED_100baseT_Full
) {
2347 netdev
->supported
|= NETDEV_F_100MB_FD
;
2349 if (ecmd
.supported
& SUPPORTED_1000baseT_Half
) {
2350 netdev
->supported
|= NETDEV_F_1GB_HD
;
2352 if ((ecmd
.supported
& SUPPORTED_1000baseT_Full
) ||
2353 (ecmd
.supported
& SUPPORTED_1000baseKX_Full
)) {
2354 netdev
->supported
|= NETDEV_F_1GB_FD
;
2356 if ((ecmd
.supported
& SUPPORTED_10000baseT_Full
) ||
2357 (ecmd
.supported
& SUPPORTED_10000baseKX4_Full
) ||
2358 (ecmd
.supported
& SUPPORTED_10000baseKR_Full
) ||
2359 (ecmd
.supported
& SUPPORTED_10000baseR_FEC
)) {
2360 netdev
->supported
|= NETDEV_F_10GB_FD
;
2362 if ((ecmd
.supported
& SUPPORTED_40000baseKR4_Full
) ||
2363 (ecmd
.supported
& SUPPORTED_40000baseCR4_Full
) ||
2364 (ecmd
.supported
& SUPPORTED_40000baseSR4_Full
) ||
2365 (ecmd
.supported
& SUPPORTED_40000baseLR4_Full
)) {
2366 netdev
->supported
|= NETDEV_F_40GB_FD
;
2368 if (ecmd
.supported
& SUPPORTED_TP
) {
2369 netdev
->supported
|= NETDEV_F_COPPER
;
2371 if (ecmd
.supported
& SUPPORTED_FIBRE
) {
2372 netdev
->supported
|= NETDEV_F_FIBER
;
2374 if (ecmd
.supported
& SUPPORTED_Autoneg
) {
2375 netdev
->supported
|= NETDEV_F_AUTONEG
;
2377 if (ecmd
.supported
& SUPPORTED_Pause
) {
2378 netdev
->supported
|= NETDEV_F_PAUSE
;
2380 if (ecmd
.supported
& SUPPORTED_Asym_Pause
) {
2381 netdev
->supported
|= NETDEV_F_PAUSE_ASYM
;
2384 /* Advertised features. */
2385 netdev
->advertised
= 0;
2386 if (ecmd
.advertising
& ADVERTISED_10baseT_Half
) {
2387 netdev
->advertised
|= NETDEV_F_10MB_HD
;
2389 if (ecmd
.advertising
& ADVERTISED_10baseT_Full
) {
2390 netdev
->advertised
|= NETDEV_F_10MB_FD
;
2392 if (ecmd
.advertising
& ADVERTISED_100baseT_Half
) {
2393 netdev
->advertised
|= NETDEV_F_100MB_HD
;
2395 if (ecmd
.advertising
& ADVERTISED_100baseT_Full
) {
2396 netdev
->advertised
|= NETDEV_F_100MB_FD
;
2398 if (ecmd
.advertising
& ADVERTISED_1000baseT_Half
) {
2399 netdev
->advertised
|= NETDEV_F_1GB_HD
;
2401 if ((ecmd
.advertising
& ADVERTISED_1000baseT_Full
) ||
2402 (ecmd
.advertising
& ADVERTISED_1000baseKX_Full
)) {
2403 netdev
->advertised
|= NETDEV_F_1GB_FD
;
2405 if ((ecmd
.advertising
& ADVERTISED_10000baseT_Full
) ||
2406 (ecmd
.advertising
& ADVERTISED_10000baseKX4_Full
) ||
2407 (ecmd
.advertising
& ADVERTISED_10000baseKR_Full
) ||
2408 (ecmd
.advertising
& ADVERTISED_10000baseR_FEC
)) {
2409 netdev
->advertised
|= NETDEV_F_10GB_FD
;
2411 if ((ecmd
.advertising
& ADVERTISED_40000baseKR4_Full
) ||
2412 (ecmd
.advertising
& ADVERTISED_40000baseCR4_Full
) ||
2413 (ecmd
.advertising
& ADVERTISED_40000baseSR4_Full
) ||
2414 (ecmd
.advertising
& ADVERTISED_40000baseLR4_Full
)) {
2415 netdev
->advertised
|= NETDEV_F_40GB_FD
;
2417 if (ecmd
.advertising
& ADVERTISED_TP
) {
2418 netdev
->advertised
|= NETDEV_F_COPPER
;
2420 if (ecmd
.advertising
& ADVERTISED_FIBRE
) {
2421 netdev
->advertised
|= NETDEV_F_FIBER
;
2423 if (ecmd
.advertising
& ADVERTISED_Autoneg
) {
2424 netdev
->advertised
|= NETDEV_F_AUTONEG
;
2426 if (ecmd
.advertising
& ADVERTISED_Pause
) {
2427 netdev
->advertised
|= NETDEV_F_PAUSE
;
2429 if (ecmd
.advertising
& ADVERTISED_Asym_Pause
) {
2430 netdev
->advertised
|= NETDEV_F_PAUSE_ASYM
;
2433 /* Current settings. */
2434 speed
= ethtool_cmd_speed(&ecmd
);
2435 if (speed
== SPEED_10
) {
2436 netdev
->current
= ecmd
.duplex
? NETDEV_F_10MB_FD
: NETDEV_F_10MB_HD
;
2437 } else if (speed
== SPEED_100
) {
2438 netdev
->current
= ecmd
.duplex
? NETDEV_F_100MB_FD
: NETDEV_F_100MB_HD
;
2439 } else if (speed
== SPEED_1000
) {
2440 netdev
->current
= ecmd
.duplex
? NETDEV_F_1GB_FD
: NETDEV_F_1GB_HD
;
2441 } else if (speed
== SPEED_10000
) {
2442 netdev
->current
= NETDEV_F_10GB_FD
;
2443 } else if (speed
== 40000) {
2444 netdev
->current
= NETDEV_F_40GB_FD
;
2445 } else if (speed
== 100000) {
2446 netdev
->current
= NETDEV_F_100GB_FD
;
2447 } else if (speed
== 1000000) {
2448 netdev
->current
= NETDEV_F_1TB_FD
;
2450 netdev
->current
= 0;
2453 if (ecmd
.port
== PORT_TP
) {
2454 netdev
->current
|= NETDEV_F_COPPER
;
2455 } else if (ecmd
.port
== PORT_FIBRE
) {
2456 netdev
->current
|= NETDEV_F_FIBER
;
2460 netdev
->current
|= NETDEV_F_AUTONEG
;
2464 netdev
->cache_valid
|= VALID_FEATURES
;
2465 netdev
->get_features_error
= error
;
2468 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
2469 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
2470 * Returns 0 if successful, otherwise a positive errno value. */
2472 netdev_linux_get_features(const struct netdev
*netdev_
,
2473 enum netdev_features
*current
,
2474 enum netdev_features
*advertised
,
2475 enum netdev_features
*supported
,
2476 enum netdev_features
*peer
)
2478 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2481 ovs_mutex_lock(&netdev
->mutex
);
2482 if (netdev_linux_netnsid_is_remote(netdev
)) {
2487 netdev_linux_read_features(netdev
);
2488 if (!netdev
->get_features_error
) {
2489 *current
= netdev
->current
;
2490 *advertised
= netdev
->advertised
;
2491 *supported
= netdev
->supported
;
2492 *peer
= 0; /* XXX */
2494 error
= netdev
->get_features_error
;
2497 ovs_mutex_unlock(&netdev
->mutex
);
2501 /* Set the features advertised by 'netdev' to 'advertise'. */
2503 netdev_linux_set_advertisements(struct netdev
*netdev_
,
2504 enum netdev_features advertise
)
2506 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2507 struct ethtool_cmd ecmd
;
2510 ovs_mutex_lock(&netdev
->mutex
);
2512 COVERAGE_INC(netdev_get_ethtool
);
2514 if (netdev_linux_netnsid_is_remote(netdev
)) {
2519 memset(&ecmd
, 0, sizeof ecmd
);
2520 error
= netdev_linux_do_ethtool(netdev_get_name(netdev_
), &ecmd
,
2521 ETHTOOL_GSET
, "ETHTOOL_GSET");
2526 ecmd
.advertising
= 0;
2527 if (advertise
& NETDEV_F_10MB_HD
) {
2528 ecmd
.advertising
|= ADVERTISED_10baseT_Half
;
2530 if (advertise
& NETDEV_F_10MB_FD
) {
2531 ecmd
.advertising
|= ADVERTISED_10baseT_Full
;
2533 if (advertise
& NETDEV_F_100MB_HD
) {
2534 ecmd
.advertising
|= ADVERTISED_100baseT_Half
;
2536 if (advertise
& NETDEV_F_100MB_FD
) {
2537 ecmd
.advertising
|= ADVERTISED_100baseT_Full
;
2539 if (advertise
& NETDEV_F_1GB_HD
) {
2540 ecmd
.advertising
|= ADVERTISED_1000baseT_Half
;
2542 if (advertise
& NETDEV_F_1GB_FD
) {
2543 ecmd
.advertising
|= ADVERTISED_1000baseT_Full
;
2545 if (advertise
& NETDEV_F_10GB_FD
) {
2546 ecmd
.advertising
|= ADVERTISED_10000baseT_Full
;
2548 if (advertise
& NETDEV_F_COPPER
) {
2549 ecmd
.advertising
|= ADVERTISED_TP
;
2551 if (advertise
& NETDEV_F_FIBER
) {
2552 ecmd
.advertising
|= ADVERTISED_FIBRE
;
2554 if (advertise
& NETDEV_F_AUTONEG
) {
2555 ecmd
.advertising
|= ADVERTISED_Autoneg
;
2557 if (advertise
& NETDEV_F_PAUSE
) {
2558 ecmd
.advertising
|= ADVERTISED_Pause
;
2560 if (advertise
& NETDEV_F_PAUSE_ASYM
) {
2561 ecmd
.advertising
|= ADVERTISED_Asym_Pause
;
2563 COVERAGE_INC(netdev_set_ethtool
);
2564 error
= netdev_linux_do_ethtool(netdev_get_name(netdev_
), &ecmd
,
2565 ETHTOOL_SSET
, "ETHTOOL_SSET");
2568 ovs_mutex_unlock(&netdev
->mutex
);
2572 static struct tc_police
2573 tc_matchall_fill_police(uint32_t kbits_rate
, uint32_t kbits_burst
)
2575 unsigned int bsize
= MIN(UINT32_MAX
/ 1024, kbits_burst
) * 1024 / 64;
2576 unsigned int bps
= ((uint64_t) kbits_rate
* 1000) / 8;
2577 struct tc_police police
;
2578 struct tc_ratespec rate
;
2581 memset(&rate
, 0, sizeof rate
);
2583 rate
.cell_log
= tc_calc_cell_log(mtu
);
2584 rate
.mpu
= ETH_TOTAL_MIN
;
2586 memset(&police
, 0, sizeof police
);
2587 police
.burst
= tc_bytes_to_ticks(bps
, bsize
);
2588 police
.action
= TC_POLICE_SHOT
;
2596 nl_msg_put_act_police(struct ofpbuf
*request
, struct tc_police police
)
2600 nl_msg_put_string(request
, TCA_ACT_KIND
, "police");
2601 offset
= nl_msg_start_nested(request
, TCA_ACT_OPTIONS
);
2602 nl_msg_put_unspec(request
, TCA_POLICE_TBF
, &police
, sizeof police
);
2603 tc_put_rtab(request
, TCA_POLICE_RATE
, &police
.rate
);
2604 nl_msg_put_u32(request
, TCA_POLICE_RESULT
, TC_ACT_UNSPEC
);
2605 nl_msg_end_nested(request
, offset
);
2609 tc_add_matchall_policer(struct netdev
*netdev
, uint32_t kbits_rate
,
2610 uint32_t kbits_burst
)
2612 uint16_t eth_type
= (OVS_FORCE
uint16_t) htons(ETH_P_ALL
);
2613 size_t basic_offset
, action_offset
, inner_offset
;
2614 uint16_t prio
= TC_RESERVED_PRIORITY_POLICE
;
2615 int ifindex
, err
= 0;
2616 struct tc_police pol_act
;
2617 struct ofpbuf request
;
2618 struct ofpbuf
*reply
;
2619 struct tcmsg
*tcmsg
;
2620 uint32_t handle
= 1;
2622 err
= get_ifindex(netdev
, &ifindex
);
2627 tcmsg
= tc_make_request(ifindex
, RTM_NEWTFILTER
, NLM_F_CREATE
| NLM_F_ECHO
,
2629 tcmsg
->tcm_parent
= TC_INGRESS_PARENT
;
2630 tcmsg
->tcm_info
= tc_make_handle(prio
, eth_type
);
2631 tcmsg
->tcm_handle
= handle
;
2633 pol_act
= tc_matchall_fill_police(kbits_rate
, kbits_burst
);
2634 nl_msg_put_string(&request
, TCA_KIND
, "matchall");
2635 basic_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
2636 action_offset
= nl_msg_start_nested(&request
, TCA_MATCHALL_ACT
);
2637 inner_offset
= nl_msg_start_nested(&request
, 1);
2638 nl_msg_put_act_police(&request
, pol_act
);
2639 nl_msg_end_nested(&request
, inner_offset
);
2640 nl_msg_end_nested(&request
, action_offset
);
2641 nl_msg_end_nested(&request
, basic_offset
);
2643 err
= tc_transact(&request
, &reply
);
2646 ofpbuf_at_assert(reply
, NLMSG_HDRLEN
, sizeof *tc
);
2647 ofpbuf_delete(reply
);
2654 tc_del_matchall_policer(struct netdev
*netdev
)
2656 int prio
= TC_RESERVED_PRIORITY_POLICE
;
2657 uint32_t block_id
= 0;
2662 err
= get_ifindex(netdev
, &ifindex
);
2667 id
= tc_make_tcf_id(ifindex
, block_id
, prio
, TC_INGRESS
);
2668 err
= tc_del_filter(&id
);
2676 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
2677 * successful, otherwise a positive errno value. */
2679 netdev_linux_set_policing(struct netdev
*netdev_
,
2680 uint32_t kbits_rate
, uint32_t kbits_burst
)
2682 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2683 const char *netdev_name
= netdev_get_name(netdev_
);
2687 kbits_burst
= (!kbits_rate
? 0 /* Force to 0 if no rate specified. */
2688 : !kbits_burst
? 8000 /* Default to 8000 kbits if 0. */
2689 : kbits_burst
); /* Stick with user-specified value. */
2691 ovs_mutex_lock(&netdev
->mutex
);
2692 if (netdev_linux_netnsid_is_remote(netdev
)) {
2697 if (netdev
->cache_valid
& VALID_POLICING
) {
2698 error
= netdev
->netdev_policing_error
;
2699 if (error
|| (netdev
->kbits_rate
== kbits_rate
&&
2700 netdev
->kbits_burst
== kbits_burst
)) {
2701 /* Assume that settings haven't changed since we last set them. */
2704 netdev
->cache_valid
&= ~VALID_POLICING
;
2707 COVERAGE_INC(netdev_set_policing
);
2709 /* Use matchall for policing when offloadling ovs with tc-flower. */
2710 if (netdev_is_flow_api_enabled()) {
2711 error
= tc_del_matchall_policer(netdev_
);
2713 error
= tc_add_matchall_policer(netdev_
, kbits_rate
, kbits_burst
);
2715 ovs_mutex_unlock(&netdev
->mutex
);
2719 error
= get_ifindex(netdev_
, &ifindex
);
2724 /* Remove any existing ingress qdisc. */
2725 error
= tc_add_del_qdisc(ifindex
, false, 0, TC_INGRESS
);
2727 VLOG_WARN_RL(&rl
, "%s: removing policing failed: %s",
2728 netdev_name
, ovs_strerror(error
));
2733 error
= tc_add_del_qdisc(ifindex
, true, 0, TC_INGRESS
);
2735 VLOG_WARN_RL(&rl
, "%s: adding policing qdisc failed: %s",
2736 netdev_name
, ovs_strerror(error
));
2740 error
= tc_add_policer(netdev_
, kbits_rate
, kbits_burst
);
2742 VLOG_WARN_RL(&rl
, "%s: adding policing action failed: %s",
2743 netdev_name
, ovs_strerror(error
));
2748 netdev
->kbits_rate
= kbits_rate
;
2749 netdev
->kbits_burst
= kbits_burst
;
2752 if (!error
|| error
== ENODEV
) {
2753 netdev
->netdev_policing_error
= error
;
2754 netdev
->cache_valid
|= VALID_POLICING
;
2756 ovs_mutex_unlock(&netdev
->mutex
);
2761 netdev_linux_get_qos_types(const struct netdev
*netdev OVS_UNUSED
,
2764 const struct tc_ops
*const *opsp
;
2765 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2766 const struct tc_ops
*ops
= *opsp
;
2767 if (ops
->tc_install
&& ops
->ovs_name
[0] != '\0') {
2768 sset_add(types
, ops
->ovs_name
);
2774 static const struct tc_ops
*
2775 tc_lookup_ovs_name(const char *name
)
2777 const struct tc_ops
*const *opsp
;
2779 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2780 const struct tc_ops
*ops
= *opsp
;
2781 if (!strcmp(name
, ops
->ovs_name
)) {
2788 static const struct tc_ops
*
2789 tc_lookup_linux_name(const char *name
)
2791 const struct tc_ops
*const *opsp
;
2793 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2794 const struct tc_ops
*ops
= *opsp
;
2795 if (ops
->linux_name
&& !strcmp(name
, ops
->linux_name
)) {
2802 static struct tc_queue
*
2803 tc_find_queue__(const struct netdev
*netdev_
, unsigned int queue_id
,
2806 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2807 struct tc_queue
*queue
;
2809 HMAP_FOR_EACH_IN_BUCKET (queue
, hmap_node
, hash
, &netdev
->tc
->queues
) {
2810 if (queue
->queue_id
== queue_id
) {
2817 static struct tc_queue
*
2818 tc_find_queue(const struct netdev
*netdev
, unsigned int queue_id
)
2820 return tc_find_queue__(netdev
, queue_id
, hash_int(queue_id
, 0));
2824 netdev_linux_get_qos_capabilities(const struct netdev
*netdev OVS_UNUSED
,
2826 struct netdev_qos_capabilities
*caps
)
2828 const struct tc_ops
*ops
= tc_lookup_ovs_name(type
);
2832 caps
->n_queues
= ops
->n_queues
;
2837 netdev_linux_get_qos(const struct netdev
*netdev_
,
2838 const char **typep
, struct smap
*details
)
2840 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2843 ovs_mutex_lock(&netdev
->mutex
);
2844 if (netdev_linux_netnsid_is_remote(netdev
)) {
2849 error
= tc_query_qdisc(netdev_
);
2851 *typep
= netdev
->tc
->ops
->ovs_name
;
2852 error
= (netdev
->tc
->ops
->qdisc_get
2853 ? netdev
->tc
->ops
->qdisc_get(netdev_
, details
)
2858 ovs_mutex_unlock(&netdev
->mutex
);
2863 netdev_linux_set_qos(struct netdev
*netdev_
,
2864 const char *type
, const struct smap
*details
)
2866 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2867 const struct tc_ops
*new_ops
;
2870 new_ops
= tc_lookup_ovs_name(type
);
2871 if (!new_ops
|| !new_ops
->tc_install
) {
2875 if (new_ops
== &tc_ops_noop
) {
2876 return new_ops
->tc_install(netdev_
, details
);
2879 ovs_mutex_lock(&netdev
->mutex
);
2880 if (netdev_linux_netnsid_is_remote(netdev
)) {
2885 error
= tc_query_qdisc(netdev_
);
2890 if (new_ops
== netdev
->tc
->ops
) {
2891 error
= new_ops
->qdisc_set
? new_ops
->qdisc_set(netdev_
, details
) : 0;
2893 /* Delete existing qdisc. */
2894 error
= tc_del_qdisc(netdev_
);
2898 ovs_assert(netdev
->tc
== NULL
);
2900 /* Install new qdisc. */
2901 error
= new_ops
->tc_install(netdev_
, details
);
2902 ovs_assert((error
== 0) == (netdev
->tc
!= NULL
));
2906 ovs_mutex_unlock(&netdev
->mutex
);
2911 netdev_linux_get_queue(const struct netdev
*netdev_
,
2912 unsigned int queue_id
, struct smap
*details
)
2914 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2917 ovs_mutex_lock(&netdev
->mutex
);
2918 if (netdev_linux_netnsid_is_remote(netdev
)) {
2923 error
= tc_query_qdisc(netdev_
);
2925 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2927 ? netdev
->tc
->ops
->class_get(netdev_
, queue
, details
)
2932 ovs_mutex_unlock(&netdev
->mutex
);
2937 netdev_linux_set_queue(struct netdev
*netdev_
,
2938 unsigned int queue_id
, const struct smap
*details
)
2940 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2943 ovs_mutex_lock(&netdev
->mutex
);
2944 if (netdev_linux_netnsid_is_remote(netdev
)) {
2949 error
= tc_query_qdisc(netdev_
);
2951 error
= (queue_id
< netdev
->tc
->ops
->n_queues
2952 && netdev
->tc
->ops
->class_set
2953 ? netdev
->tc
->ops
->class_set(netdev_
, queue_id
, details
)
2958 ovs_mutex_unlock(&netdev
->mutex
);
2963 netdev_linux_delete_queue(struct netdev
*netdev_
, unsigned int queue_id
)
2965 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2968 ovs_mutex_lock(&netdev
->mutex
);
2969 if (netdev_linux_netnsid_is_remote(netdev
)) {
2974 error
= tc_query_qdisc(netdev_
);
2976 if (netdev
->tc
->ops
->class_delete
) {
2977 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2979 ? netdev
->tc
->ops
->class_delete(netdev_
, queue
)
2987 ovs_mutex_unlock(&netdev
->mutex
);
2992 netdev_linux_get_queue_stats(const struct netdev
*netdev_
,
2993 unsigned int queue_id
,
2994 struct netdev_queue_stats
*stats
)
2996 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2999 ovs_mutex_lock(&netdev
->mutex
);
3000 if (netdev_linux_netnsid_is_remote(netdev
)) {
3005 error
= tc_query_qdisc(netdev_
);
3007 if (netdev
->tc
->ops
->class_get_stats
) {
3008 const struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
3010 stats
->created
= queue
->created
;
3011 error
= netdev
->tc
->ops
->class_get_stats(netdev_
, queue
,
3022 ovs_mutex_unlock(&netdev
->mutex
);
3026 struct queue_dump_state
{
3027 struct nl_dump dump
;
3032 start_queue_dump(const struct netdev
*netdev
, struct queue_dump_state
*state
)
3034 struct ofpbuf request
;
3035 struct tcmsg
*tcmsg
;
3037 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_GETTCLASS
, 0, &request
);
3041 tcmsg
->tcm_parent
= 0;
3042 nl_dump_start(&state
->dump
, NETLINK_ROUTE
, &request
);
3043 ofpbuf_uninit(&request
);
3045 ofpbuf_init(&state
->buf
, NL_DUMP_BUFSIZE
);
3050 finish_queue_dump(struct queue_dump_state
*state
)
3052 ofpbuf_uninit(&state
->buf
);
3053 return nl_dump_done(&state
->dump
);
3056 struct netdev_linux_queue_state
{
3057 unsigned int *queues
;
3063 netdev_linux_queue_dump_start(const struct netdev
*netdev_
, void **statep
)
3065 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3068 ovs_mutex_lock(&netdev
->mutex
);
3069 if (netdev_linux_netnsid_is_remote(netdev
)) {
3074 error
= tc_query_qdisc(netdev_
);
3076 if (netdev
->tc
->ops
->class_get
) {
3077 struct netdev_linux_queue_state
*state
;
3078 struct tc_queue
*queue
;
3081 *statep
= state
= xmalloc(sizeof *state
);
3082 state
->n_queues
= hmap_count(&netdev
->tc
->queues
);
3083 state
->cur_queue
= 0;
3084 state
->queues
= xmalloc(state
->n_queues
* sizeof *state
->queues
);
3087 HMAP_FOR_EACH (queue
, hmap_node
, &netdev
->tc
->queues
) {
3088 state
->queues
[i
++] = queue
->queue_id
;
3096 ovs_mutex_unlock(&netdev
->mutex
);
3101 netdev_linux_queue_dump_next(const struct netdev
*netdev_
, void *state_
,
3102 unsigned int *queue_idp
, struct smap
*details
)
3104 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3105 struct netdev_linux_queue_state
*state
= state_
;
3108 ovs_mutex_lock(&netdev
->mutex
);
3109 if (netdev_linux_netnsid_is_remote(netdev
)) {
3114 while (state
->cur_queue
< state
->n_queues
) {
3115 unsigned int queue_id
= state
->queues
[state
->cur_queue
++];
3116 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
3119 *queue_idp
= queue_id
;
3120 error
= netdev
->tc
->ops
->class_get(netdev_
, queue
, details
);
3126 ovs_mutex_unlock(&netdev
->mutex
);
3131 netdev_linux_queue_dump_done(const struct netdev
*netdev OVS_UNUSED
,
3134 struct netdev_linux_queue_state
*state
= state_
;
3136 free(state
->queues
);
3142 netdev_linux_dump_queue_stats(const struct netdev
*netdev_
,
3143 netdev_dump_queue_stats_cb
*cb
, void *aux
)
3145 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3148 ovs_mutex_lock(&netdev
->mutex
);
3149 if (netdev_linux_netnsid_is_remote(netdev
)) {
3154 error
= tc_query_qdisc(netdev_
);
3156 struct queue_dump_state state
;
3158 if (!netdev
->tc
->ops
->class_dump_stats
) {
3160 } else if (!start_queue_dump(netdev_
, &state
)) {
3166 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
3167 retval
= netdev
->tc
->ops
->class_dump_stats(netdev_
, &msg
,
3174 retval
= finish_queue_dump(&state
);
3182 ovs_mutex_unlock(&netdev
->mutex
);
3187 netdev_linux_set_in4(struct netdev
*netdev_
, struct in_addr address
,
3188 struct in_addr netmask
)
3190 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3193 ovs_mutex_lock(&netdev
->mutex
);
3194 if (netdev_linux_netnsid_is_remote(netdev
)) {
3199 error
= do_set_addr(netdev_
, SIOCSIFADDR
, "SIOCSIFADDR", address
);
3201 if (address
.s_addr
!= INADDR_ANY
) {
3202 error
= do_set_addr(netdev_
, SIOCSIFNETMASK
,
3203 "SIOCSIFNETMASK", netmask
);
3208 ovs_mutex_unlock(&netdev
->mutex
);
3212 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address.
3213 * Otherwise, sets '*in6' to 'in6addr_any' and returns the corresponding
3216 netdev_linux_get_addr_list(const struct netdev
*netdev_
,
3217 struct in6_addr
**addr
, struct in6_addr
**mask
, int *n_cnt
)
3219 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3222 ovs_mutex_lock(&netdev
->mutex
);
3223 if (netdev_linux_netnsid_is_remote(netdev
)) {
3228 error
= netdev_get_addrs(netdev_get_name(netdev_
), addr
, mask
, n_cnt
);
3231 ovs_mutex_unlock(&netdev
->mutex
);
/* Fills '*sa' with an AF_INET sockaddr holding 'addr' and port 0.  Bytes of
 * '*sa' beyond the sockaddr_in portion are zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin = {
        .sin_family = AF_INET,
        .sin_addr = addr,
        .sin_port = 0,
    };

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
3249 do_set_addr(struct netdev
*netdev
,
3250 int ioctl_nr
, const char *ioctl_name
, struct in_addr addr
)
3254 make_in4_sockaddr(&ifr
.ifr_addr
, addr
);
3255 return af_inet_ifreq_ioctl(netdev_get_name(netdev
), &ifr
, ioctl_nr
,
3259 /* Adds 'router' as a default IP gateway. */
3261 netdev_linux_add_router(struct netdev
*netdev OVS_UNUSED
, struct in_addr router
)
3263 struct in_addr any
= { INADDR_ANY
};
3267 memset(&rt
, 0, sizeof rt
);
3268 make_in4_sockaddr(&rt
.rt_dst
, any
);
3269 make_in4_sockaddr(&rt
.rt_gateway
, router
);
3270 make_in4_sockaddr(&rt
.rt_genmask
, any
);
3271 rt
.rt_flags
= RTF_UP
| RTF_GATEWAY
;
3272 error
= af_inet_ioctl(SIOCADDRT
, &rt
);
3274 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error
));
3280 netdev_linux_get_next_hop(const struct in_addr
*host
, struct in_addr
*next_hop
,
3283 static const char fn
[] = "/proc/net/route";
3288 *netdev_name
= NULL
;
3289 stream
= fopen(fn
, "r");
3290 if (stream
== NULL
) {
3291 VLOG_WARN_RL(&rl
, "%s: open failed: %s", fn
, ovs_strerror(errno
));
3296 while (fgets(line
, sizeof line
, stream
)) {
3299 ovs_be32 dest
, gateway
, mask
;
3300 int refcnt
, metric
, mtu
;
3301 unsigned int flags
, use
, window
, irtt
;
3304 "%16s %"SCNx32
" %"SCNx32
" %04X %d %u %d %"SCNx32
3306 iface
, &dest
, &gateway
, &flags
, &refcnt
,
3307 &use
, &metric
, &mask
, &mtu
, &window
, &irtt
)) {
3308 VLOG_WARN_RL(&rl
, "%s: could not parse line %d: %s",
3312 if (!(flags
& RTF_UP
)) {
3313 /* Skip routes that aren't up. */
3317 /* The output of 'dest', 'mask', and 'gateway' were given in
3318 * network byte order, so we don't need need any endian
3319 * conversions here. */
3320 if ((dest
& mask
) == (host
->s_addr
& mask
)) {
3322 /* The host is directly reachable. */
3323 next_hop
->s_addr
= 0;
3325 /* To reach the host, we must go through a gateway. */
3326 next_hop
->s_addr
= gateway
;
3328 *netdev_name
= xstrdup(iface
);
3340 netdev_linux_get_status(const struct netdev
*netdev_
, struct smap
*smap
)
3342 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3345 ovs_mutex_lock(&netdev
->mutex
);
3346 if (!(netdev
->cache_valid
& VALID_DRVINFO
)) {
3347 struct ethtool_cmd
*cmd
= (struct ethtool_cmd
*) &netdev
->drvinfo
;
3349 COVERAGE_INC(netdev_get_ethtool
);
3350 memset(&netdev
->drvinfo
, 0, sizeof netdev
->drvinfo
);
3351 error
= netdev_linux_do_ethtool(netdev
->up
.name
,
3354 "ETHTOOL_GDRVINFO");
3356 netdev
->cache_valid
|= VALID_DRVINFO
;
3361 smap_add(smap
, "driver_name", netdev
->drvinfo
.driver
);
3362 smap_add(smap
, "driver_version", netdev
->drvinfo
.version
);
3363 smap_add(smap
, "firmware_version", netdev
->drvinfo
.fw_version
);
3365 ovs_mutex_unlock(&netdev
->mutex
);
3371 netdev_internal_get_status(const struct netdev
*netdev OVS_UNUSED
,
3374 smap_add(smap
, "driver_name", "openvswitch");
3379 netdev_linux_get_block_id(struct netdev
*netdev_
)
3381 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3382 uint32_t block_id
= 0;
3384 ovs_mutex_lock(&netdev
->mutex
);
3385 /* Ensure the linux netdev has had its fields populated. */
3386 if (!(netdev
->cache_valid
& VALID_IFINDEX
)) {
3387 netdev_linux_update_via_netlink(netdev
);
3390 /* Only assigning block ids to linux netdevs that are LAG masters. */
3391 if (netdev
->is_lag_master
) {
3392 block_id
= netdev
->ifindex
;
3394 ovs_mutex_unlock(&netdev
->mutex
);
3399 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
3400 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
3401 * returns 0. Otherwise, it returns a positive errno value; in particular,
3402 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
3404 netdev_linux_arp_lookup(const struct netdev
*netdev
,
3405 ovs_be32 ip
, struct eth_addr
*mac
)
3408 struct sockaddr_in sin
;
3411 memset(&r
, 0, sizeof r
);
3412 memset(&sin
, 0, sizeof sin
);
3413 sin
.sin_family
= AF_INET
;
3414 sin
.sin_addr
.s_addr
= ip
;
3416 memcpy(&r
.arp_pa
, &sin
, sizeof sin
);
3417 r
.arp_ha
.sa_family
= ARPHRD_ETHER
;
3419 ovs_strzcpy(r
.arp_dev
, netdev_get_name(netdev
), sizeof r
.arp_dev
);
3420 COVERAGE_INC(netdev_arp_lookup
);
3421 retval
= af_inet_ioctl(SIOCGARP
, &r
);
3423 memcpy(mac
, r
.arp_ha
.sa_data
, ETH_ADDR_LEN
);
3424 } else if (retval
!= ENXIO
) {
3425 VLOG_WARN_RL(&rl
, "%s: could not look up ARP entry for "IP_FMT
": %s",
3426 netdev_get_name(netdev
), IP_ARGS(ip
),
3427 ovs_strerror(retval
));
3433 nd_to_iff_flags(enum netdev_flags nd
)
3435 unsigned int iff
= 0;
3436 if (nd
& NETDEV_UP
) {
3439 if (nd
& NETDEV_PROMISC
) {
3442 if (nd
& NETDEV_LOOPBACK
) {
3443 iff
|= IFF_LOOPBACK
;
3449 iff_to_nd_flags(unsigned int iff
)
3451 enum netdev_flags nd
= 0;
3455 if (iff
& IFF_PROMISC
) {
3456 nd
|= NETDEV_PROMISC
;
3458 if (iff
& IFF_LOOPBACK
) {
3459 nd
|= NETDEV_LOOPBACK
;
3465 update_flags(struct netdev_linux
*netdev
, enum netdev_flags off
,
3466 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
3467 OVS_REQUIRES(netdev
->mutex
)
3469 unsigned int old_flags
, new_flags
;
3472 old_flags
= netdev
->ifi_flags
;
3473 *old_flagsp
= iff_to_nd_flags(old_flags
);
3474 new_flags
= (old_flags
& ~nd_to_iff_flags(off
)) | nd_to_iff_flags(on
);
3475 if (new_flags
!= old_flags
) {
3476 error
= set_flags(netdev_get_name(&netdev
->up
), new_flags
);
3477 get_flags(&netdev
->up
, &netdev
->ifi_flags
);
3484 netdev_linux_update_flags(struct netdev
*netdev_
, enum netdev_flags off
,
3485 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
3487 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3490 ovs_mutex_lock(&netdev
->mutex
);
3492 /* Changing flags over netlink isn't support yet. */
3493 if (netdev_linux_netnsid_is_remote(netdev
)) {
3497 error
= update_flags(netdev
, off
, on
, old_flagsp
);
3499 /* Try reading flags over netlink, or fall back to ioctl. */
3500 if (!netdev_linux_update_via_netlink(netdev
)) {
3501 *old_flagsp
= iff_to_nd_flags(netdev
->ifi_flags
);
3503 error
= update_flags(netdev
, off
, on
, old_flagsp
);
3508 ovs_mutex_unlock(&netdev
->mutex
);
3512 #define NETDEV_LINUX_CLASS_COMMON \
3513 .run = netdev_linux_run, \
3514 .wait = netdev_linux_wait, \
3515 .alloc = netdev_linux_alloc, \
3516 .dealloc = netdev_linux_dealloc, \
3517 .send_wait = netdev_linux_send_wait, \
3518 .set_etheraddr = netdev_linux_set_etheraddr, \
3519 .get_etheraddr = netdev_linux_get_etheraddr, \
3520 .get_mtu = netdev_linux_get_mtu, \
3521 .set_mtu = netdev_linux_set_mtu, \
3522 .get_ifindex = netdev_linux_get_ifindex, \
3523 .get_carrier = netdev_linux_get_carrier, \
3524 .get_carrier_resets = netdev_linux_get_carrier_resets, \
3525 .set_miimon_interval = netdev_linux_set_miimon_interval, \
3526 .set_advertisements = netdev_linux_set_advertisements, \
3527 .set_policing = netdev_linux_set_policing, \
3528 .get_qos_types = netdev_linux_get_qos_types, \
3529 .get_qos_capabilities = netdev_linux_get_qos_capabilities, \
3530 .get_qos = netdev_linux_get_qos, \
3531 .set_qos = netdev_linux_set_qos, \
3532 .get_queue = netdev_linux_get_queue, \
3533 .set_queue = netdev_linux_set_queue, \
3534 .delete_queue = netdev_linux_delete_queue, \
3535 .get_queue_stats = netdev_linux_get_queue_stats, \
3536 .queue_dump_start = netdev_linux_queue_dump_start, \
3537 .queue_dump_next = netdev_linux_queue_dump_next, \
3538 .queue_dump_done = netdev_linux_queue_dump_done, \
3539 .dump_queue_stats = netdev_linux_dump_queue_stats, \
3540 .set_in4 = netdev_linux_set_in4, \
3541 .get_addr_list = netdev_linux_get_addr_list, \
3542 .add_router = netdev_linux_add_router, \
3543 .get_next_hop = netdev_linux_get_next_hop, \
3544 .arp_lookup = netdev_linux_arp_lookup, \
3545 .update_flags = netdev_linux_update_flags, \
3546 .rxq_alloc = netdev_linux_rxq_alloc, \
3547 .rxq_dealloc = netdev_linux_rxq_dealloc, \
3548 .rxq_wait = netdev_linux_rxq_wait, \
3549 .rxq_drain = netdev_linux_rxq_drain
3551 const struct netdev_class netdev_linux_class
= {
3552 NETDEV_LINUX_CLASS_COMMON
,
3555 .construct
= netdev_linux_construct
,
3556 .destruct
= netdev_linux_destruct
,
3557 .get_stats
= netdev_linux_get_stats
,
3558 .get_features
= netdev_linux_get_features
,
3559 .get_status
= netdev_linux_get_status
,
3560 .get_block_id
= netdev_linux_get_block_id
,
3561 .send
= netdev_linux_send
,
3562 .rxq_construct
= netdev_linux_rxq_construct
,
3563 .rxq_destruct
= netdev_linux_rxq_destruct
,
3564 .rxq_recv
= netdev_linux_rxq_recv
,
3567 const struct netdev_class netdev_tap_class
= {
3568 NETDEV_LINUX_CLASS_COMMON
,
3571 .construct
= netdev_linux_construct_tap
,
3572 .destruct
= netdev_linux_destruct
,
3573 .get_stats
= netdev_tap_get_stats
,
3574 .get_features
= netdev_linux_get_features
,
3575 .get_status
= netdev_linux_get_status
,
3576 .send
= netdev_linux_send
,
3577 .rxq_construct
= netdev_linux_rxq_construct
,
3578 .rxq_destruct
= netdev_linux_rxq_destruct
,
3579 .rxq_recv
= netdev_linux_rxq_recv
,
3582 const struct netdev_class netdev_internal_class
= {
3583 NETDEV_LINUX_CLASS_COMMON
,
3586 .construct
= netdev_linux_construct
,
3587 .destruct
= netdev_linux_destruct
,
3588 .get_stats
= netdev_internal_get_stats
,
3589 .get_status
= netdev_internal_get_status
,
3590 .send
= netdev_linux_send
,
3591 .rxq_construct
= netdev_linux_rxq_construct
,
3592 .rxq_destruct
= netdev_linux_rxq_destruct
,
3593 .rxq_recv
= netdev_linux_rxq_recv
,
3597 #define NETDEV_AFXDP_CLASS_COMMON \
3598 .init = netdev_afxdp_init, \
3599 .construct = netdev_afxdp_construct, \
3600 .destruct = netdev_afxdp_destruct, \
3601 .get_stats = netdev_afxdp_get_stats, \
3602 .get_custom_stats = netdev_afxdp_get_custom_stats, \
3603 .get_status = netdev_linux_get_status, \
3604 .set_config = netdev_afxdp_set_config, \
3605 .get_config = netdev_afxdp_get_config, \
3606 .reconfigure = netdev_afxdp_reconfigure, \
3607 .get_numa_id = netdev_linux_get_numa_id, \
3608 .send = netdev_afxdp_batch_send, \
3609 .rxq_construct = netdev_afxdp_rxq_construct, \
3610 .rxq_destruct = netdev_afxdp_rxq_destruct, \
3611 .rxq_recv = netdev_afxdp_rxq_recv
3613 const struct netdev_class netdev_afxdp_class
= {
3614 NETDEV_LINUX_CLASS_COMMON
,
3615 NETDEV_AFXDP_CLASS_COMMON
,
3620 const struct netdev_class netdev_afxdp_nonpmd_class
= {
3621 NETDEV_LINUX_CLASS_COMMON
,
3622 NETDEV_AFXDP_CLASS_COMMON
,
3623 .type
= "afxdp-nonpmd",
3629 #define CODEL_N_QUEUES 0x0000
3631 /* In sufficiently new kernel headers these are defined as enums in
3632 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3633 * kernels. (This overrides any enum definition in the header file but that's
3635 #define TCA_CODEL_TARGET 1
3636 #define TCA_CODEL_LIMIT 2
3637 #define TCA_CODEL_INTERVAL 3
3646 static struct codel
*
3647 codel_get__(const struct netdev
*netdev_
)
3649 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3650 return CONTAINER_OF(netdev
->tc
, struct codel
, tc
);
3654 codel_install__(struct netdev
*netdev_
, uint32_t target
, uint32_t limit
,
3657 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3658 struct codel
*codel
;
3660 codel
= xmalloc(sizeof *codel
);
3661 tc_init(&codel
->tc
, &tc_ops_codel
);
3662 codel
->target
= target
;
3663 codel
->limit
= limit
;
3664 codel
->interval
= interval
;
3666 netdev
->tc
= &codel
->tc
;
3670 codel_setup_qdisc__(struct netdev
*netdev
, uint32_t target
, uint32_t limit
,
3674 struct ofpbuf request
;
3675 struct tcmsg
*tcmsg
;
3676 uint32_t otarget
, olimit
, ointerval
;
3679 tc_del_qdisc(netdev
);
3681 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
3682 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3686 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3687 tcmsg
->tcm_parent
= TC_H_ROOT
;
3689 otarget
= target
? target
: 5000;
3690 olimit
= limit
? limit
: 10240;
3691 ointerval
= interval
? interval
: 100000;
3693 nl_msg_put_string(&request
, TCA_KIND
, "codel");
3694 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
3695 nl_msg_put_u32(&request
, TCA_CODEL_TARGET
, otarget
);
3696 nl_msg_put_u32(&request
, TCA_CODEL_LIMIT
, olimit
);
3697 nl_msg_put_u32(&request
, TCA_CODEL_INTERVAL
, ointerval
);
3698 nl_msg_end_nested(&request
, opt_offset
);
3700 error
= tc_transact(&request
, NULL
);
3702 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
3703 "target %u, limit %u, interval %u error %d(%s)",
3704 netdev_get_name(netdev
),
3705 otarget
, olimit
, ointerval
,
3706 error
, ovs_strerror(error
));
3712 codel_parse_qdisc_details__(struct netdev
*netdev OVS_UNUSED
,
3713 const struct smap
*details
, struct codel
*codel
)
3715 codel
->target
= smap_get_ullong(details
, "target", 0);
3716 codel
->limit
= smap_get_ullong(details
, "limit", 0);
3717 codel
->interval
= smap_get_ullong(details
, "interval", 0);
3719 if (!codel
->target
) {
3720 codel
->target
= 5000;
3722 if (!codel
->limit
) {
3723 codel
->limit
= 10240;
3725 if (!codel
->interval
) {
3726 codel
->interval
= 100000;
3731 codel_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3736 codel_parse_qdisc_details__(netdev
, details
, &codel
);
3737 error
= codel_setup_qdisc__(netdev
, codel
.target
, codel
.limit
,
3740 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
3746 codel_parse_tca_options__(struct nlattr
*nl_options
, struct codel
*codel
)
3748 static const struct nl_policy tca_codel_policy
[] = {
3749 [TCA_CODEL_TARGET
] = { .type
= NL_A_U32
},
3750 [TCA_CODEL_LIMIT
] = { .type
= NL_A_U32
},
3751 [TCA_CODEL_INTERVAL
] = { .type
= NL_A_U32
}
3754 struct nlattr
*attrs
[ARRAY_SIZE(tca_codel_policy
)];
3756 if (!nl_parse_nested(nl_options
, tca_codel_policy
,
3757 attrs
, ARRAY_SIZE(tca_codel_policy
))) {
3758 VLOG_WARN_RL(&rl
, "failed to parse CoDel class options");
3762 codel
->target
= nl_attr_get_u32(attrs
[TCA_CODEL_TARGET
]);
3763 codel
->limit
= nl_attr_get_u32(attrs
[TCA_CODEL_LIMIT
]);
3764 codel
->interval
= nl_attr_get_u32(attrs
[TCA_CODEL_INTERVAL
]);
3769 codel_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
3771 struct nlattr
*nlattr
;
3776 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
3781 error
= codel_parse_tca_options__(nlattr
, &codel
);
3786 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
3792 codel_tc_destroy(struct tc
*tc
)
3794 struct codel
*codel
= CONTAINER_OF(tc
, struct codel
, tc
);
3800 codel_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
3802 const struct codel
*codel
= codel_get__(netdev
);
3803 smap_add_format(details
, "target", "%u", codel
->target
);
3804 smap_add_format(details
, "limit", "%u", codel
->limit
);
3805 smap_add_format(details
, "interval", "%u", codel
->interval
);
3810 codel_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
3814 codel_parse_qdisc_details__(netdev
, details
, &codel
);
3815 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
3816 codel_get__(netdev
)->target
= codel
.target
;
3817 codel_get__(netdev
)->limit
= codel
.limit
;
3818 codel_get__(netdev
)->interval
= codel
.interval
;
3822 static const struct tc_ops tc_ops_codel
= {
3823 .linux_name
= "codel",
3824 .ovs_name
= "linux-codel",
3825 .n_queues
= CODEL_N_QUEUES
,
3826 .tc_install
= codel_tc_install
,
3827 .tc_load
= codel_tc_load
,
3828 .tc_destroy
= codel_tc_destroy
,
3829 .qdisc_get
= codel_qdisc_get
,
3830 .qdisc_set
= codel_qdisc_set
,
3833 /* FQ-CoDel traffic control class. */
3835 #define FQCODEL_N_QUEUES 0x0000
3837 /* In sufficiently new kernel headers these are defined as enums in
3838 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3839 * kernels. (This overrides any enum definition in the header file but that's
3841 #define TCA_FQ_CODEL_TARGET 1
3842 #define TCA_FQ_CODEL_LIMIT 2
3843 #define TCA_FQ_CODEL_INTERVAL 3
3844 #define TCA_FQ_CODEL_ECN 4
3845 #define TCA_FQ_CODEL_FLOWS 5
3846 #define TCA_FQ_CODEL_QUANTUM 6
3857 static struct fqcodel
*
3858 fqcodel_get__(const struct netdev
*netdev_
)
3860 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3861 return CONTAINER_OF(netdev
->tc
, struct fqcodel
, tc
);
3865 fqcodel_install__(struct netdev
*netdev_
, uint32_t target
, uint32_t limit
,
3866 uint32_t interval
, uint32_t flows
, uint32_t quantum
)
3868 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3869 struct fqcodel
*fqcodel
;
3871 fqcodel
= xmalloc(sizeof *fqcodel
);
3872 tc_init(&fqcodel
->tc
, &tc_ops_fqcodel
);
3873 fqcodel
->target
= target
;
3874 fqcodel
->limit
= limit
;
3875 fqcodel
->interval
= interval
;
3876 fqcodel
->flows
= flows
;
3877 fqcodel
->quantum
= quantum
;
3879 netdev
->tc
= &fqcodel
->tc
;
3883 fqcodel_setup_qdisc__(struct netdev
*netdev
, uint32_t target
, uint32_t limit
,
3884 uint32_t interval
, uint32_t flows
, uint32_t quantum
)
3887 struct ofpbuf request
;
3888 struct tcmsg
*tcmsg
;
3889 uint32_t otarget
, olimit
, ointerval
, oflows
, oquantum
;
3892 tc_del_qdisc(netdev
);
3894 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
3895 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3899 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3900 tcmsg
->tcm_parent
= TC_H_ROOT
;
3902 otarget
= target
? target
: 5000;
3903 olimit
= limit
? limit
: 10240;
3904 ointerval
= interval
? interval
: 100000;
3905 oflows
= flows
? flows
: 1024;
3906 oquantum
= quantum
? quantum
: 1514; /* fq_codel default quantum is 1514
3909 nl_msg_put_string(&request
, TCA_KIND
, "fq_codel");
3910 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
3911 nl_msg_put_u32(&request
, TCA_FQ_CODEL_TARGET
, otarget
);
3912 nl_msg_put_u32(&request
, TCA_FQ_CODEL_LIMIT
, olimit
);
3913 nl_msg_put_u32(&request
, TCA_FQ_CODEL_INTERVAL
, ointerval
);
3914 nl_msg_put_u32(&request
, TCA_FQ_CODEL_FLOWS
, oflows
);
3915 nl_msg_put_u32(&request
, TCA_FQ_CODEL_QUANTUM
, oquantum
);
3916 nl_msg_end_nested(&request
, opt_offset
);
3918 error
= tc_transact(&request
, NULL
);
3920 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
3921 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3922 netdev_get_name(netdev
),
3923 otarget
, olimit
, ointerval
, oflows
, oquantum
,
3924 error
, ovs_strerror(error
));
3930 fqcodel_parse_qdisc_details__(struct netdev
*netdev OVS_UNUSED
,
3931 const struct smap
*details
, struct fqcodel
*fqcodel
)
3933 fqcodel
->target
= smap_get_ullong(details
, "target", 0);
3934 fqcodel
->limit
= smap_get_ullong(details
, "limit", 0);
3935 fqcodel
->interval
= smap_get_ullong(details
, "interval", 0);
3936 fqcodel
->flows
= smap_get_ullong(details
, "flows", 0);
3937 fqcodel
->quantum
= smap_get_ullong(details
, "quantum", 0);
3939 if (!fqcodel
->target
) {
3940 fqcodel
->target
= 5000;
3942 if (!fqcodel
->limit
) {
3943 fqcodel
->limit
= 10240;
3945 if (!fqcodel
->interval
) {
3946 fqcodel
->interval
= 1000000;
3948 if (!fqcodel
->flows
) {
3949 fqcodel
->flows
= 1024;
3951 if (!fqcodel
->quantum
) {
3952 fqcodel
->quantum
= 1514;
3957 fqcodel_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3960 struct fqcodel fqcodel
;
3962 fqcodel_parse_qdisc_details__(netdev
, details
, &fqcodel
);
3963 error
= fqcodel_setup_qdisc__(netdev
, fqcodel
.target
, fqcodel
.limit
,
3964 fqcodel
.interval
, fqcodel
.flows
,
3967 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
,
3968 fqcodel
.interval
, fqcodel
.flows
, fqcodel
.quantum
);
3974 fqcodel_parse_tca_options__(struct nlattr
*nl_options
, struct fqcodel
*fqcodel
)
3976 static const struct nl_policy tca_fqcodel_policy
[] = {
3977 [TCA_FQ_CODEL_TARGET
] = { .type
= NL_A_U32
},
3978 [TCA_FQ_CODEL_LIMIT
] = { .type
= NL_A_U32
},
3979 [TCA_FQ_CODEL_INTERVAL
] = { .type
= NL_A_U32
},
3980 [TCA_FQ_CODEL_FLOWS
] = { .type
= NL_A_U32
},
3981 [TCA_FQ_CODEL_QUANTUM
] = { .type
= NL_A_U32
}
3984 struct nlattr
*attrs
[ARRAY_SIZE(tca_fqcodel_policy
)];
3986 if (!nl_parse_nested(nl_options
, tca_fqcodel_policy
,
3987 attrs
, ARRAY_SIZE(tca_fqcodel_policy
))) {
3988 VLOG_WARN_RL(&rl
, "failed to parse FQ_CoDel class options");
3992 fqcodel
->target
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_TARGET
]);
3993 fqcodel
->limit
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_LIMIT
]);
3994 fqcodel
->interval
=nl_attr_get_u32(attrs
[TCA_FQ_CODEL_INTERVAL
]);
3995 fqcodel
->flows
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_FLOWS
]);
3996 fqcodel
->quantum
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_QUANTUM
]);
4001 fqcodel_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
4003 struct nlattr
*nlattr
;
4006 struct fqcodel fqcodel
;
4008 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
4013 error
= fqcodel_parse_tca_options__(nlattr
, &fqcodel
);
4018 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
, fqcodel
.interval
,
4019 fqcodel
.flows
, fqcodel
.quantum
);
4024 fqcodel_tc_destroy(struct tc
*tc
)
4026 struct fqcodel
*fqcodel
= CONTAINER_OF(tc
, struct fqcodel
, tc
);
4032 fqcodel_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
4034 const struct fqcodel
*fqcodel
= fqcodel_get__(netdev
);
4035 smap_add_format(details
, "target", "%u", fqcodel
->target
);
4036 smap_add_format(details
, "limit", "%u", fqcodel
->limit
);
4037 smap_add_format(details
, "interval", "%u", fqcodel
->interval
);
4038 smap_add_format(details
, "flows", "%u", fqcodel
->flows
);
4039 smap_add_format(details
, "quantum", "%u", fqcodel
->quantum
);
4044 fqcodel_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
4046 struct fqcodel fqcodel
;
4048 fqcodel_parse_qdisc_details__(netdev
, details
, &fqcodel
);
4049 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
, fqcodel
.interval
,
4050 fqcodel
.flows
, fqcodel
.quantum
);
4051 fqcodel_get__(netdev
)->target
= fqcodel
.target
;
4052 fqcodel_get__(netdev
)->limit
= fqcodel
.limit
;
4053 fqcodel_get__(netdev
)->interval
= fqcodel
.interval
;
4054 fqcodel_get__(netdev
)->flows
= fqcodel
.flows
;
4055 fqcodel_get__(netdev
)->quantum
= fqcodel
.quantum
;
4059 static const struct tc_ops tc_ops_fqcodel
= {
4060 .linux_name
= "fq_codel",
4061 .ovs_name
= "linux-fq_codel",
4062 .n_queues
= FQCODEL_N_QUEUES
,
4063 .tc_install
= fqcodel_tc_install
,
4064 .tc_load
= fqcodel_tc_load
,
4065 .tc_destroy
= fqcodel_tc_destroy
,
4066 .qdisc_get
= fqcodel_qdisc_get
,
4067 .qdisc_set
= fqcodel_qdisc_set
,
4070 /* SFQ traffic control class. */
4072 #define SFQ_N_QUEUES 0x0000
4081 sfq_get__(const struct netdev
*netdev_
)
4083 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4084 return CONTAINER_OF(netdev
->tc
, struct sfq
, tc
);
4088 sfq_install__(struct netdev
*netdev_
, uint32_t quantum
, uint32_t perturb
)
4090 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4093 sfq
= xmalloc(sizeof *sfq
);
4094 tc_init(&sfq
->tc
, &tc_ops_sfq
);
4095 sfq
->perturb
= perturb
;
4096 sfq
->quantum
= quantum
;
4098 netdev
->tc
= &sfq
->tc
;
4102 sfq_setup_qdisc__(struct netdev
*netdev
, uint32_t quantum
, uint32_t perturb
)
4104 struct tc_sfq_qopt opt
;
4105 struct ofpbuf request
;
4106 struct tcmsg
*tcmsg
;
4108 int mtu_error
, error
;
4109 mtu_error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
4111 tc_del_qdisc(netdev
);
4113 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
4114 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
4118 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
4119 tcmsg
->tcm_parent
= TC_H_ROOT
;
4121 memset(&opt
, 0, sizeof opt
);
4124 opt
.quantum
= mtu
; /* if we cannot find mtu, use default */
4127 opt
.quantum
= quantum
;
4131 opt
.perturb_period
= 10;
4133 opt
.perturb_period
= perturb
;
4136 nl_msg_put_string(&request
, TCA_KIND
, "sfq");
4137 nl_msg_put_unspec(&request
, TCA_OPTIONS
, &opt
, sizeof opt
);
4139 error
= tc_transact(&request
, NULL
);
4141 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
4142 "quantum %u, perturb %u error %d(%s)",
4143 netdev_get_name(netdev
),
4144 opt
.quantum
, opt
.perturb_period
,
4145 error
, ovs_strerror(error
));
4151 sfq_parse_qdisc_details__(struct netdev
*netdev
,
4152 const struct smap
*details
, struct sfq
*sfq
)
4154 sfq
->perturb
= smap_get_ullong(details
, "perturb", 0);
4155 sfq
->quantum
= smap_get_ullong(details
, "quantum", 0);
4157 if (!sfq
->perturb
) {
4161 if (!sfq
->quantum
) {
4163 if (!netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
)) {
4166 VLOG_WARN_RL(&rl
, "when using SFQ, you must specify quantum on a "
4167 "device without mtu");
4173 sfq_tc_install(struct netdev
*netdev
, const struct smap
*details
)
4178 sfq_parse_qdisc_details__(netdev
, details
, &sfq
);
4179 error
= sfq_setup_qdisc__(netdev
, sfq
.quantum
, sfq
.perturb
);
4181 sfq_install__(netdev
, sfq
.quantum
, sfq
.perturb
);
4187 sfq_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
4189 const struct tc_sfq_qopt
*sfq
;
4190 struct nlattr
*nlattr
;
4194 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
4196 sfq
= nl_attr_get(nlattr
);
4197 sfq_install__(netdev
, sfq
->quantum
, sfq
->perturb_period
);
4205 sfq_tc_destroy(struct tc
*tc
)
4207 struct sfq
*sfq
= CONTAINER_OF(tc
, struct sfq
, tc
);
4213 sfq_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
4215 const struct sfq
*sfq
= sfq_get__(netdev
);
4216 smap_add_format(details
, "quantum", "%u", sfq
->quantum
);
4217 smap_add_format(details
, "perturb", "%u", sfq
->perturb
);
4222 sfq_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
4226 sfq_parse_qdisc_details__(netdev
, details
, &sfq
);
4227 sfq_install__(netdev
, sfq
.quantum
, sfq
.perturb
);
4228 sfq_get__(netdev
)->quantum
= sfq
.quantum
;
4229 sfq_get__(netdev
)->perturb
= sfq
.perturb
;
4233 static const struct tc_ops tc_ops_sfq
= {
4234 .linux_name
= "sfq",
4235 .ovs_name
= "linux-sfq",
4236 .n_queues
= SFQ_N_QUEUES
,
4237 .tc_install
= sfq_tc_install
,
4238 .tc_load
= sfq_tc_load
,
4239 .tc_destroy
= sfq_tc_destroy
,
4240 .qdisc_get
= sfq_qdisc_get
,
4241 .qdisc_set
= sfq_qdisc_set
,
4244 /* netem traffic control class. */
4253 static struct netem
*
4254 netem_get__(const struct netdev
*netdev_
)
4256 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4257 return CONTAINER_OF(netdev
->tc
, struct netem
, tc
);
4261 netem_install__(struct netdev
*netdev_
, uint32_t latency
,
4262 uint32_t limit
, uint32_t loss
)
4264 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4265 struct netem
*netem
;
4267 netem
= xmalloc(sizeof *netem
);
4268 tc_init(&netem
->tc
, &tc_ops_netem
);
4269 netem
->latency
= latency
;
4270 netem
->limit
= limit
;
4273 netdev
->tc
= &netem
->tc
;
4277 netem_setup_qdisc__(struct netdev
*netdev
, uint32_t latency
,
4278 uint32_t limit
, uint32_t loss
)
4280 struct tc_netem_qopt opt
;
4281 struct ofpbuf request
;
4282 struct tcmsg
*tcmsg
;
4285 tc_del_qdisc(netdev
);
4287 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
4288 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
4292 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
4293 tcmsg
->tcm_parent
= TC_H_ROOT
;
4295 memset(&opt
, 0, sizeof opt
);
4306 "loss should be a percentage value between 0 to 100, "
4307 "loss was %u", loss
);
4310 opt
.loss
= floor(UINT32_MAX
* (loss
/ 100.0));
4313 opt
.latency
= tc_time_to_ticks(latency
);
4315 nl_msg_put_string(&request
, TCA_KIND
, "netem");
4316 nl_msg_put_unspec(&request
, TCA_OPTIONS
, &opt
, sizeof opt
);
4318 error
= tc_transact(&request
, NULL
);
4320 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
4321 "latency %u, limit %u, loss %u error %d(%s)",
4322 netdev_get_name(netdev
),
4323 opt
.latency
, opt
.limit
, opt
.loss
,
4324 error
, ovs_strerror(error
));
4330 netem_parse_qdisc_details__(struct netdev
*netdev OVS_UNUSED
,
4331 const struct smap
*details
, struct netem
*netem
)
4333 netem
->latency
= smap_get_ullong(details
, "latency", 0);
4334 netem
->limit
= smap_get_ullong(details
, "limit", 0);
4335 netem
->loss
= smap_get_ullong(details
, "loss", 0);
4337 if (!netem
->limit
) {
4338 netem
->limit
= 1000;
4343 netem_tc_install(struct netdev
*netdev
, const struct smap
*details
)
4348 netem_parse_qdisc_details__(netdev
, details
, &netem
);
4349 error
= netem_setup_qdisc__(netdev
, netem
.latency
,
4350 netem
.limit
, netem
.loss
);
4352 netem_install__(netdev
, netem
.latency
, netem
.limit
, netem
.loss
);
4358 netem_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
4360 const struct tc_netem_qopt
*netem
;
4361 struct nlattr
*nlattr
;
4365 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
4367 netem
= nl_attr_get(nlattr
);
4368 netem_install__(netdev
, netem
->latency
, netem
->limit
, netem
->loss
);
4376 netem_tc_destroy(struct tc
*tc
)
4378 struct netem
*netem
= CONTAINER_OF(tc
, struct netem
, tc
);
4384 netem_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
4386 const struct netem
*netem
= netem_get__(netdev
);
4387 smap_add_format(details
, "latency", "%u", netem
->latency
);
4388 smap_add_format(details
, "limit", "%u", netem
->limit
);
4389 smap_add_format(details
, "loss", "%u", netem
->loss
);
4394 netem_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
4398 netem_parse_qdisc_details__(netdev
, details
, &netem
);
4399 netem_install__(netdev
, netem
.latency
, netem
.limit
, netem
.loss
);
4400 netem_get__(netdev
)->latency
= netem
.latency
;
4401 netem_get__(netdev
)->limit
= netem
.limit
;
4402 netem_get__(netdev
)->loss
= netem
.loss
;
4406 static const struct tc_ops tc_ops_netem
= {
4407 .linux_name
= "netem",
4408 .ovs_name
= "linux-netem",
4410 .tc_install
= netem_tc_install
,
4411 .tc_load
= netem_tc_load
,
4412 .tc_destroy
= netem_tc_destroy
,
4413 .qdisc_get
= netem_qdisc_get
,
4414 .qdisc_set
= netem_qdisc_set
,
4417 /* HTB traffic control class. */
4419 #define HTB_N_QUEUES 0xf000
4420 #define HTB_RATE2QUANTUM 10
4424 unsigned int max_rate
; /* In bytes/s. */
4428 struct tc_queue tc_queue
;
4429 unsigned int min_rate
; /* In bytes/s. */
4430 unsigned int max_rate
; /* In bytes/s. */
4431 unsigned int burst
; /* In bytes. */
4432 unsigned int priority
; /* Lower values are higher priorities. */
4436 htb_get__(const struct netdev
*netdev_
)
4438 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4439 return CONTAINER_OF(netdev
->tc
, struct htb
, tc
);
4443 htb_install__(struct netdev
*netdev_
, uint64_t max_rate
)
4445 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4448 htb
= xmalloc(sizeof *htb
);
4449 tc_init(&htb
->tc
, &tc_ops_htb
);
4450 htb
->max_rate
= max_rate
;
4452 netdev
->tc
= &htb
->tc
;
4455 /* Create an HTB qdisc.
4457 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
4459 htb_setup_qdisc__(struct netdev
*netdev
)
4462 struct tc_htb_glob opt
;
4463 struct ofpbuf request
;
4464 struct tcmsg
*tcmsg
;
4466 tc_del_qdisc(netdev
);
4468 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
4469 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
4473 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
4474 tcmsg
->tcm_parent
= TC_H_ROOT
;
4476 nl_msg_put_string(&request
, TCA_KIND
, "htb");
4478 memset(&opt
, 0, sizeof opt
);
4479 opt
.rate2quantum
= HTB_RATE2QUANTUM
;
4483 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
4484 nl_msg_put_unspec(&request
, TCA_HTB_INIT
, &opt
, sizeof opt
);
4485 nl_msg_end_nested(&request
, opt_offset
);
4487 return tc_transact(&request
, NULL
);
4490 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
4491 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
4493 htb_setup_class__(struct netdev
*netdev
, unsigned int handle
,
4494 unsigned int parent
, struct htb_class
*class)
4497 struct tc_htb_opt opt
;
4498 struct ofpbuf request
;
4499 struct tcmsg
*tcmsg
;
4503 error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
4505 VLOG_WARN_RL(&rl
, "cannot set up HTB on device %s that lacks MTU",
4506 netdev_get_name(netdev
));
4510 memset(&opt
, 0, sizeof opt
);
4511 tc_fill_rate(&opt
.rate
, class->min_rate
, mtu
);
4512 tc_fill_rate(&opt
.ceil
, class->max_rate
, mtu
);
4513 /* Makes sure the quantum is at least MTU. Setting quantum will
4514 * make htb ignore the r2q for this class. */
4515 if ((class->min_rate
/ HTB_RATE2QUANTUM
) < mtu
) {
4518 opt
.buffer
= tc_calc_buffer(opt
.rate
.rate
, mtu
, class->burst
);
4519 opt
.cbuffer
= tc_calc_buffer(opt
.ceil
.rate
, mtu
, class->burst
);
4520 opt
.prio
= class->priority
;
4522 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWTCLASS
, NLM_F_CREATE
,
4527 tcmsg
->tcm_handle
= handle
;
4528 tcmsg
->tcm_parent
= parent
;
4530 nl_msg_put_string(&request
, TCA_KIND
, "htb");
4531 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
4532 nl_msg_put_unspec(&request
, TCA_HTB_PARMS
, &opt
, sizeof opt
);
4533 tc_put_rtab(&request
, TCA_HTB_RTAB
, &opt
.rate
);
4534 tc_put_rtab(&request
, TCA_HTB_CTAB
, &opt
.ceil
);
4535 nl_msg_end_nested(&request
, opt_offset
);
4537 error
= tc_transact(&request
, NULL
);
4539 VLOG_WARN_RL(&rl
, "failed to replace %s class %u:%u, parent %u:%u, "
4540 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
4541 netdev_get_name(netdev
),
4542 tc_get_major(handle
), tc_get_minor(handle
),
4543 tc_get_major(parent
), tc_get_minor(parent
),
4544 class->min_rate
, class->max_rate
,
4545 class->burst
, class->priority
, ovs_strerror(error
));
4550 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
4551 * description of them into 'details'. The description complies with the
4552 * specification given in the vswitch database documentation for linux-htb
4555 htb_parse_tca_options__(struct nlattr
*nl_options
, struct htb_class
*class)
4557 static const struct nl_policy tca_htb_policy
[] = {
4558 [TCA_HTB_PARMS
] = { .type
= NL_A_UNSPEC
, .optional
= false,
4559 .min_len
= sizeof(struct tc_htb_opt
) },
4562 struct nlattr
*attrs
[ARRAY_SIZE(tca_htb_policy
)];
4563 const struct tc_htb_opt
*htb
;
4565 if (!nl_parse_nested(nl_options
, tca_htb_policy
,
4566 attrs
, ARRAY_SIZE(tca_htb_policy
))) {
4567 VLOG_WARN_RL(&rl
, "failed to parse HTB class options");
4571 htb
= nl_attr_get(attrs
[TCA_HTB_PARMS
]);
4572 class->min_rate
= htb
->rate
.rate
;
4573 class->max_rate
= htb
->ceil
.rate
;
4574 class->burst
= tc_ticks_to_bytes(htb
->rate
.rate
, htb
->buffer
);
4575 class->priority
= htb
->prio
;
4580 htb_parse_tcmsg__(struct ofpbuf
*tcmsg
, unsigned int *queue_id
,
4581 struct htb_class
*options
,
4582 struct netdev_queue_stats
*stats
)
4584 struct nlattr
*nl_options
;
4585 unsigned int handle
;
4588 error
= tc_parse_class(tcmsg
, &handle
, &nl_options
, stats
);
4589 if (!error
&& queue_id
) {
4590 unsigned int major
= tc_get_major(handle
);
4591 unsigned int minor
= tc_get_minor(handle
);
4592 if (major
== 1 && minor
> 0 && minor
<= HTB_N_QUEUES
) {
4593 *queue_id
= minor
- 1;
4598 if (!error
&& options
) {
4599 error
= htb_parse_tca_options__(nl_options
, options
);
4605 htb_parse_qdisc_details__(struct netdev
*netdev_
,
4606 const struct smap
*details
, struct htb_class
*hc
)
4608 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4610 hc
->max_rate
= smap_get_ullong(details
, "max-rate", 0) / 8;
4611 if (!hc
->max_rate
) {
4612 enum netdev_features current
;
4614 netdev_linux_read_features(netdev
);
4615 current
= !netdev
->get_features_error
? netdev
->current
: 0;
4616 hc
->max_rate
= netdev_features_to_bps(current
, 100 * 1000 * 1000) / 8;
4618 hc
->min_rate
= hc
->max_rate
;
4624 htb_parse_class_details__(struct netdev
*netdev
,
4625 const struct smap
*details
, struct htb_class
*hc
)
4627 const struct htb
*htb
= htb_get__(netdev
);
4629 unsigned long long int max_rate_bit
;
4631 error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
4633 VLOG_WARN_RL(&rl
, "cannot parse HTB class on device %s that lacks MTU",
4634 netdev_get_name(netdev
));
4638 /* HTB requires at least an mtu sized min-rate to send any traffic even
4639 * on uncongested links. */
4640 hc
->min_rate
= smap_get_ullong(details
, "min-rate", 0) / 8;
4641 hc
->min_rate
= MAX(hc
->min_rate
, mtu
);
4642 hc
->min_rate
= MIN(hc
->min_rate
, htb
->max_rate
);
4645 max_rate_bit
= smap_get_ullong(details
, "max-rate", 0);
4646 hc
->max_rate
= max_rate_bit
? max_rate_bit
/ 8 : htb
->max_rate
;
4647 hc
->max_rate
= MAX(hc
->max_rate
, hc
->min_rate
);
4648 hc
->max_rate
= MIN(hc
->max_rate
, htb
->max_rate
);
4652 * According to hints in the documentation that I've read, it is important
4653 * that 'burst' be at least as big as the largest frame that might be
4654 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
4655 * but having it a bit too small is a problem. Since netdev_get_mtu()
4656 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
4657 * the MTU. We actually add 64, instead of 14, as a guard against
4658 * additional headers get tacked on somewhere that we're not aware of. */
4659 hc
->burst
= smap_get_ullong(details
, "burst", 0) / 8;
4660 hc
->burst
= MAX(hc
->burst
, mtu
+ 64);
4663 hc
->priority
= smap_get_ullong(details
, "priority", 0);
4669 htb_query_class__(const struct netdev
*netdev
, unsigned int handle
,
4670 unsigned int parent
, struct htb_class
*options
,
4671 struct netdev_queue_stats
*stats
)
4673 struct ofpbuf
*reply
;
4676 error
= tc_query_class(netdev
, handle
, parent
, &reply
);
4678 error
= htb_parse_tcmsg__(reply
, NULL
, options
, stats
);
4679 ofpbuf_delete(reply
);
4685 htb_tc_install(struct netdev
*netdev
, const struct smap
*details
)
4689 error
= htb_setup_qdisc__(netdev
);
4691 struct htb_class hc
;
4693 htb_parse_qdisc_details__(netdev
, details
, &hc
);
4694 error
= htb_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
4695 tc_make_handle(1, 0), &hc
);
4697 htb_install__(netdev
, hc
.max_rate
);
4703 static struct htb_class
*
4704 htb_class_cast__(const struct tc_queue
*queue
)
4706 return CONTAINER_OF(queue
, struct htb_class
, tc_queue
);
4710 htb_update_queue__(struct netdev
*netdev
, unsigned int queue_id
,
4711 const struct htb_class
*hc
)
4713 struct htb
*htb
= htb_get__(netdev
);
4714 size_t hash
= hash_int(queue_id
, 0);
4715 struct tc_queue
*queue
;
4716 struct htb_class
*hcp
;
4718 queue
= tc_find_queue__(netdev
, queue_id
, hash
);
4720 hcp
= htb_class_cast__(queue
);
4722 hcp
= xmalloc(sizeof *hcp
);
4723 queue
= &hcp
->tc_queue
;
4724 queue
->queue_id
= queue_id
;
4725 queue
->created
= time_msec();
4726 hmap_insert(&htb
->tc
.queues
, &queue
->hmap_node
, hash
);
4729 hcp
->min_rate
= hc
->min_rate
;
4730 hcp
->max_rate
= hc
->max_rate
;
4731 hcp
->burst
= hc
->burst
;
4732 hcp
->priority
= hc
->priority
;
4736 htb_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4739 struct queue_dump_state state
;
4740 struct htb_class hc
;
4742 /* Get qdisc options. */
4744 htb_query_class__(netdev
, tc_make_handle(1, 0xfffe), 0, &hc
, NULL
);
4745 htb_install__(netdev
, hc
.max_rate
);
4748 if (!start_queue_dump(netdev
, &state
)) {
4751 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
4752 unsigned int queue_id
;
4754 if (!htb_parse_tcmsg__(&msg
, &queue_id
, &hc
, NULL
)) {
4755 htb_update_queue__(netdev
, queue_id
, &hc
);
4758 finish_queue_dump(&state
);
4764 htb_tc_destroy(struct tc
*tc
)
4766 struct htb
*htb
= CONTAINER_OF(tc
, struct htb
, tc
);
4767 struct htb_class
*hc
;
4769 HMAP_FOR_EACH_POP (hc
, tc_queue
.hmap_node
, &htb
->tc
.queues
) {
4777 htb_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
4779 const struct htb
*htb
= htb_get__(netdev
);
4780 smap_add_format(details
, "max-rate", "%llu", 8ULL * htb
->max_rate
);
4785 htb_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
4787 struct htb_class hc
;
4790 htb_parse_qdisc_details__(netdev
, details
, &hc
);
4791 error
= htb_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
4792 tc_make_handle(1, 0), &hc
);
4794 htb_get__(netdev
)->max_rate
= hc
.max_rate
;
4800 htb_class_get(const struct netdev
*netdev OVS_UNUSED
,
4801 const struct tc_queue
*queue
, struct smap
*details
)
4803 const struct htb_class
*hc
= htb_class_cast__(queue
);
4805 smap_add_format(details
, "min-rate", "%llu", 8ULL * hc
->min_rate
);
4806 if (hc
->min_rate
!= hc
->max_rate
) {
4807 smap_add_format(details
, "max-rate", "%llu", 8ULL * hc
->max_rate
);
4809 smap_add_format(details
, "burst", "%llu", 8ULL * hc
->burst
);
4811 smap_add_format(details
, "priority", "%u", hc
->priority
);
4817 htb_class_set(struct netdev
*netdev
, unsigned int queue_id
,
4818 const struct smap
*details
)
4820 struct htb_class hc
;
4823 error
= htb_parse_class_details__(netdev
, details
, &hc
);
4828 error
= htb_setup_class__(netdev
, tc_make_handle(1, queue_id
+ 1),
4829 tc_make_handle(1, 0xfffe), &hc
);
4834 htb_update_queue__(netdev
, queue_id
, &hc
);
4839 htb_class_delete(struct netdev
*netdev
, struct tc_queue
*queue
)
4841 struct htb_class
*hc
= htb_class_cast__(queue
);
4842 struct htb
*htb
= htb_get__(netdev
);
4845 error
= tc_delete_class(netdev
, tc_make_handle(1, queue
->queue_id
+ 1));
4847 hmap_remove(&htb
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
4854 htb_class_get_stats(const struct netdev
*netdev
, const struct tc_queue
*queue
,
4855 struct netdev_queue_stats
*stats
)
4857 return htb_query_class__(netdev
, tc_make_handle(1, queue
->queue_id
+ 1),
4858 tc_make_handle(1, 0xfffe), NULL
, stats
);
4862 htb_class_dump_stats(const struct netdev
*netdev OVS_UNUSED
,
4863 const struct ofpbuf
*nlmsg
,
4864 netdev_dump_queue_stats_cb
*cb
, void *aux
)
4866 struct netdev_queue_stats stats
;
4867 unsigned int handle
, major
, minor
;
4870 error
= tc_parse_class(nlmsg
, &handle
, NULL
, &stats
);
4875 major
= tc_get_major(handle
);
4876 minor
= tc_get_minor(handle
);
4877 if (major
== 1 && minor
> 0 && minor
<= HTB_N_QUEUES
) {
4878 (*cb
)(minor
- 1, &stats
, aux
);
4883 static const struct tc_ops tc_ops_htb
= {
4884 .linux_name
= "htb",
4885 .ovs_name
= "linux-htb",
4886 .n_queues
= HTB_N_QUEUES
,
4887 .tc_install
= htb_tc_install
,
4888 .tc_load
= htb_tc_load
,
4889 .tc_destroy
= htb_tc_destroy
,
4890 .qdisc_get
= htb_qdisc_get
,
4891 .qdisc_set
= htb_qdisc_set
,
4892 .class_get
= htb_class_get
,
4893 .class_set
= htb_class_set
,
4894 .class_delete
= htb_class_delete
,
4895 .class_get_stats
= htb_class_get_stats
,
4896 .class_dump_stats
= htb_class_dump_stats
4899 /* "linux-hfsc" traffic control class. */
4901 #define HFSC_N_QUEUES 0xf000
4909 struct tc_queue tc_queue
;
4914 static struct hfsc
*
4915 hfsc_get__(const struct netdev
*netdev_
)
4917 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4918 return CONTAINER_OF(netdev
->tc
, struct hfsc
, tc
);
4921 static struct hfsc_class
*
4922 hfsc_class_cast__(const struct tc_queue
*queue
)
4924 return CONTAINER_OF(queue
, struct hfsc_class
, tc_queue
);
4928 hfsc_install__(struct netdev
*netdev_
, uint32_t max_rate
)
4930 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4933 hfsc
= xmalloc(sizeof *hfsc
);
4934 tc_init(&hfsc
->tc
, &tc_ops_hfsc
);
4935 hfsc
->max_rate
= max_rate
;
4936 netdev
->tc
= &hfsc
->tc
;
4940 hfsc_update_queue__(struct netdev
*netdev
, unsigned int queue_id
,
4941 const struct hfsc_class
*hc
)
4945 struct hfsc_class
*hcp
;
4946 struct tc_queue
*queue
;
4948 hfsc
= hfsc_get__(netdev
);
4949 hash
= hash_int(queue_id
, 0);
4951 queue
= tc_find_queue__(netdev
, queue_id
, hash
);
4953 hcp
= hfsc_class_cast__(queue
);
4955 hcp
= xmalloc(sizeof *hcp
);
4956 queue
= &hcp
->tc_queue
;
4957 queue
->queue_id
= queue_id
;
4958 queue
->created
= time_msec();
4959 hmap_insert(&hfsc
->tc
.queues
, &queue
->hmap_node
, hash
);
4962 hcp
->min_rate
= hc
->min_rate
;
4963 hcp
->max_rate
= hc
->max_rate
;
4967 hfsc_parse_tca_options__(struct nlattr
*nl_options
, struct hfsc_class
*class)
4969 const struct tc_service_curve
*rsc
, *fsc
, *usc
;
4970 static const struct nl_policy tca_hfsc_policy
[] = {
4972 .type
= NL_A_UNSPEC
,
4974 .min_len
= sizeof(struct tc_service_curve
),
4977 .type
= NL_A_UNSPEC
,
4979 .min_len
= sizeof(struct tc_service_curve
),
4982 .type
= NL_A_UNSPEC
,
4984 .min_len
= sizeof(struct tc_service_curve
),
4987 struct nlattr
*attrs
[ARRAY_SIZE(tca_hfsc_policy
)];
4989 if (!nl_parse_nested(nl_options
, tca_hfsc_policy
,
4990 attrs
, ARRAY_SIZE(tca_hfsc_policy
))) {
4991 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options");
4995 rsc
= nl_attr_get(attrs
[TCA_HFSC_RSC
]);
4996 fsc
= nl_attr_get(attrs
[TCA_HFSC_FSC
]);
4997 usc
= nl_attr_get(attrs
[TCA_HFSC_USC
]);
4999 if (rsc
->m1
!= 0 || rsc
->d
!= 0 ||
5000 fsc
->m1
!= 0 || fsc
->d
!= 0 ||
5001 usc
->m1
!= 0 || usc
->d
!= 0) {
5002 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
5003 "Non-linear service curves are not supported.");
5007 if (rsc
->m2
!= fsc
->m2
) {
5008 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
5009 "Real-time service curves are not supported ");
5013 if (rsc
->m2
> usc
->m2
) {
5014 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
5015 "Min-rate service curve is greater than "
5016 "the max-rate service curve.");
5020 class->min_rate
= fsc
->m2
;
5021 class->max_rate
= usc
->m2
;
5026 hfsc_parse_tcmsg__(struct ofpbuf
*tcmsg
, unsigned int *queue_id
,
5027 struct hfsc_class
*options
,
5028 struct netdev_queue_stats
*stats
)
5031 unsigned int handle
;
5032 struct nlattr
*nl_options
;
5034 error
= tc_parse_class(tcmsg
, &handle
, &nl_options
, stats
);
5040 unsigned int major
, minor
;
5042 major
= tc_get_major(handle
);
5043 minor
= tc_get_minor(handle
);
5044 if (major
== 1 && minor
> 0 && minor
<= HFSC_N_QUEUES
) {
5045 *queue_id
= minor
- 1;
5052 error
= hfsc_parse_tca_options__(nl_options
, options
);
5059 hfsc_query_class__(const struct netdev
*netdev
, unsigned int handle
,
5060 unsigned int parent
, struct hfsc_class
*options
,
5061 struct netdev_queue_stats
*stats
)
5064 struct ofpbuf
*reply
;
5066 error
= tc_query_class(netdev
, handle
, parent
, &reply
);
5071 error
= hfsc_parse_tcmsg__(reply
, NULL
, options
, stats
);
5072 ofpbuf_delete(reply
);
5077 hfsc_parse_qdisc_details__(struct netdev
*netdev_
, const struct smap
*details
,
5078 struct hfsc_class
*class)
5080 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5082 uint32_t max_rate
= smap_get_ullong(details
, "max-rate", 0) / 8;
5084 enum netdev_features current
;
5086 netdev_linux_read_features(netdev
);
5087 current
= !netdev
->get_features_error
? netdev
->current
: 0;
5088 max_rate
= netdev_features_to_bps(current
, 100 * 1000 * 1000) / 8;
5091 class->min_rate
= max_rate
;
5092 class->max_rate
= max_rate
;
5096 hfsc_parse_class_details__(struct netdev
*netdev
,
5097 const struct smap
*details
,
5098 struct hfsc_class
* class)
5100 const struct hfsc
*hfsc
;
5101 uint32_t min_rate
, max_rate
;
5103 hfsc
= hfsc_get__(netdev
);
5105 min_rate
= smap_get_ullong(details
, "min-rate", 0) / 8;
5106 min_rate
= MAX(min_rate
, 1);
5107 min_rate
= MIN(min_rate
, hfsc
->max_rate
);
5109 max_rate
= smap_get_ullong(details
, "max-rate", hfsc
->max_rate
* 8) / 8;
5110 max_rate
= MAX(max_rate
, min_rate
);
5111 max_rate
= MIN(max_rate
, hfsc
->max_rate
);
5113 class->min_rate
= min_rate
;
5114 class->max_rate
= max_rate
;
5119 /* Create an HFSC qdisc.
5121 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
5123 hfsc_setup_qdisc__(struct netdev
* netdev
)
5125 struct tcmsg
*tcmsg
;
5126 struct ofpbuf request
;
5127 struct tc_hfsc_qopt opt
;
5129 tc_del_qdisc(netdev
);
5131 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
5132 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
5138 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
5139 tcmsg
->tcm_parent
= TC_H_ROOT
;
5141 memset(&opt
, 0, sizeof opt
);
5144 nl_msg_put_string(&request
, TCA_KIND
, "hfsc");
5145 nl_msg_put_unspec(&request
, TCA_OPTIONS
, &opt
, sizeof opt
);
5147 return tc_transact(&request
, NULL
);
5150 /* Create an HFSC class.
5152 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
5153 * sc rate <min_rate> ul rate <max_rate>" */
5155 hfsc_setup_class__(struct netdev
*netdev
, unsigned int handle
,
5156 unsigned int parent
, struct hfsc_class
*class)
5160 struct tcmsg
*tcmsg
;
5161 struct ofpbuf request
;
5162 struct tc_service_curve min
, max
;
5164 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWTCLASS
, NLM_F_CREATE
,
5171 tcmsg
->tcm_handle
= handle
;
5172 tcmsg
->tcm_parent
= parent
;
5176 min
.m2
= class->min_rate
;
5180 max
.m2
= class->max_rate
;
5182 nl_msg_put_string(&request
, TCA_KIND
, "hfsc");
5183 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
5184 nl_msg_put_unspec(&request
, TCA_HFSC_RSC
, &min
, sizeof min
);
5185 nl_msg_put_unspec(&request
, TCA_HFSC_FSC
, &min
, sizeof min
);
5186 nl_msg_put_unspec(&request
, TCA_HFSC_USC
, &max
, sizeof max
);
5187 nl_msg_end_nested(&request
, opt_offset
);
5189 error
= tc_transact(&request
, NULL
);
5191 VLOG_WARN_RL(&rl
, "failed to replace %s class %u:%u, parent %u:%u, "
5192 "min-rate %ubps, max-rate %ubps (%s)",
5193 netdev_get_name(netdev
),
5194 tc_get_major(handle
), tc_get_minor(handle
),
5195 tc_get_major(parent
), tc_get_minor(parent
),
5196 class->min_rate
, class->max_rate
, ovs_strerror(error
));
5203 hfsc_tc_install(struct netdev
*netdev
, const struct smap
*details
)
5206 struct hfsc_class
class;
5208 error
= hfsc_setup_qdisc__(netdev
);
5214 hfsc_parse_qdisc_details__(netdev
, details
, &class);
5215 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
5216 tc_make_handle(1, 0), &class);
5222 hfsc_install__(netdev
, class.max_rate
);
5227 hfsc_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
5230 struct queue_dump_state state
;
5231 struct hfsc_class hc
;
5234 hfsc_query_class__(netdev
, tc_make_handle(1, 0xfffe), 0, &hc
, NULL
);
5235 hfsc_install__(netdev
, hc
.max_rate
);
5237 if (!start_queue_dump(netdev
, &state
)) {
5241 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
5242 unsigned int queue_id
;
5244 if (!hfsc_parse_tcmsg__(&msg
, &queue_id
, &hc
, NULL
)) {
5245 hfsc_update_queue__(netdev
, queue_id
, &hc
);
5249 finish_queue_dump(&state
);
5254 hfsc_tc_destroy(struct tc
*tc
)
5257 struct hfsc_class
*hc
, *next
;
5259 hfsc
= CONTAINER_OF(tc
, struct hfsc
, tc
);
5261 HMAP_FOR_EACH_SAFE (hc
, next
, tc_queue
.hmap_node
, &hfsc
->tc
.queues
) {
5262 hmap_remove(&hfsc
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
5271 hfsc_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
5273 const struct hfsc
*hfsc
;
5274 hfsc
= hfsc_get__(netdev
);
5275 smap_add_format(details
, "max-rate", "%llu", 8ULL * hfsc
->max_rate
);
5280 hfsc_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
5283 struct hfsc_class
class;
5285 hfsc_parse_qdisc_details__(netdev
, details
, &class);
5286 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
5287 tc_make_handle(1, 0), &class);
5290 hfsc_get__(netdev
)->max_rate
= class.max_rate
;
5297 hfsc_class_get(const struct netdev
*netdev OVS_UNUSED
,
5298 const struct tc_queue
*queue
, struct smap
*details
)
5300 const struct hfsc_class
*hc
;
5302 hc
= hfsc_class_cast__(queue
);
5303 smap_add_format(details
, "min-rate", "%llu", 8ULL * hc
->min_rate
);
5304 if (hc
->min_rate
!= hc
->max_rate
) {
5305 smap_add_format(details
, "max-rate", "%llu", 8ULL * hc
->max_rate
);
5311 hfsc_class_set(struct netdev
*netdev
, unsigned int queue_id
,
5312 const struct smap
*details
)
5315 struct hfsc_class
class;
5317 error
= hfsc_parse_class_details__(netdev
, details
, &class);
5322 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, queue_id
+ 1),
5323 tc_make_handle(1, 0xfffe), &class);
5328 hfsc_update_queue__(netdev
, queue_id
, &class);
5333 hfsc_class_delete(struct netdev
*netdev
, struct tc_queue
*queue
)
5337 struct hfsc_class
*hc
;
5339 hc
= hfsc_class_cast__(queue
);
5340 hfsc
= hfsc_get__(netdev
);
5342 error
= tc_delete_class(netdev
, tc_make_handle(1, queue
->queue_id
+ 1));
5344 hmap_remove(&hfsc
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
5351 hfsc_class_get_stats(const struct netdev
*netdev
, const struct tc_queue
*queue
,
5352 struct netdev_queue_stats
*stats
)
5354 return hfsc_query_class__(netdev
, tc_make_handle(1, queue
->queue_id
+ 1),
5355 tc_make_handle(1, 0xfffe), NULL
, stats
);
5359 hfsc_class_dump_stats(const struct netdev
*netdev OVS_UNUSED
,
5360 const struct ofpbuf
*nlmsg
,
5361 netdev_dump_queue_stats_cb
*cb
, void *aux
)
5363 struct netdev_queue_stats stats
;
5364 unsigned int handle
, major
, minor
;
5367 error
= tc_parse_class(nlmsg
, &handle
, NULL
, &stats
);
5372 major
= tc_get_major(handle
);
5373 minor
= tc_get_minor(handle
);
5374 if (major
== 1 && minor
> 0 && minor
<= HFSC_N_QUEUES
) {
5375 (*cb
)(minor
- 1, &stats
, aux
);
5380 static const struct tc_ops tc_ops_hfsc
= {
5381 .linux_name
= "hfsc",
5382 .ovs_name
= "linux-hfsc",
5383 .n_queues
= HFSC_N_QUEUES
, /* n_queues */
5384 .tc_install
= hfsc_tc_install
,
5385 .tc_load
= hfsc_tc_load
,
5386 .tc_destroy
= hfsc_tc_destroy
,
5387 .qdisc_get
= hfsc_qdisc_get
,
5388 .qdisc_set
= hfsc_qdisc_set
,
5389 .class_get
= hfsc_class_get
,
5390 .class_set
= hfsc_class_set
,
5391 .class_delete
= hfsc_class_delete
,
5392 .class_get_stats
= hfsc_class_get_stats
,
5393 .class_dump_stats
= hfsc_class_dump_stats
,
5396 /* "linux-noop" traffic control class. */
5399 noop_install__(struct netdev
*netdev_
)
5401 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5402 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_default
);
5404 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
5408 noop_tc_install(struct netdev
*netdev
,
5409 const struct smap
*details OVS_UNUSED
)
5411 noop_install__(netdev
);
5416 noop_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
5418 noop_install__(netdev
);
5422 static const struct tc_ops tc_ops_noop
= {
5423 .ovs_name
= "linux-noop", /* ovs_name */
5424 .tc_install
= noop_tc_install
,
5425 .tc_load
= noop_tc_load
,
5428 /* "linux-default" traffic control class.
5430 * This class represents the default, unnamed Linux qdisc. It corresponds to
5431 * the "" (empty string) QoS type in the OVS database. */
5434 default_install__(struct netdev
*netdev_
)
5436 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5437 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_default
);
5439 /* Nothing but a tc class implementation is allowed to write to a tc. This
5440 * class never does that, so we can legitimately use a const tc object. */
5441 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
5445 default_tc_install(struct netdev
*netdev
,
5446 const struct smap
*details OVS_UNUSED
)
5448 default_install__(netdev
);
5453 default_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
5455 default_install__(netdev
);
5459 static const struct tc_ops tc_ops_default
= {
5460 .ovs_name
= "", /* ovs_name */
5461 .tc_install
= default_tc_install
,
5462 .tc_load
= default_tc_load
,
5465 /* "linux-other" traffic control class.
5470 other_tc_load(struct netdev
*netdev_
, struct ofpbuf
*nlmsg OVS_UNUSED
)
5472 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5473 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_other
);
5475 /* Nothing but a tc class implementation is allowed to write to a tc. This
5476 * class never does that, so we can legitimately use a const tc object. */
5477 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
5481 static const struct tc_ops tc_ops_other
= {
5482 .ovs_name
= "linux-other",
5483 .tc_load
= other_tc_load
,
/* Traffic control. */

/* Number of kernel "tc" ticks per second. */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 */
static unsigned int buffer_hz;
5510 static struct tcmsg
*
5511 netdev_linux_tc_make_request(const struct netdev
*netdev
, int type
,
5512 unsigned int flags
, struct ofpbuf
*request
)
5517 error
= get_ifindex(netdev
, &ifindex
);
5522 return tc_make_request(ifindex
, type
, flags
, request
);
5525 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
5528 * This function is equivalent to running:
5529 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
5530 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
5533 * The configuration and stats may be seen with the following command:
5534 * /sbin/tc -s filter show dev <devname> parent ffff:
5536 * Returns 0 if successful, otherwise a positive errno value.
5539 tc_add_policer(struct netdev
*netdev
,
5540 uint32_t kbits_rate
, uint32_t kbits_burst
)
5542 struct tc_police tc_police
;
5543 struct ofpbuf request
;
5544 struct tcmsg
*tcmsg
;
5545 size_t basic_offset
;
5546 size_t police_offset
;
5550 memset(&tc_police
, 0, sizeof tc_police
);
5551 tc_police
.action
= TC_POLICE_SHOT
;
5552 tc_police
.mtu
= mtu
;
5553 tc_fill_rate(&tc_police
.rate
, ((uint64_t) kbits_rate
* 1000)/8, mtu
);
5555 /* The following appears wrong in one way: In networking a kilobit is
5556 * usually 1000 bits but this uses 1024 bits.
5558 * However if you "fix" those problems then "tc filter show ..." shows
5559 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
5560 * 1,000,000 bits, whereas this actually ends up doing the right thing from
5561 * tc's point of view. Whatever. */
5562 tc_police
.burst
= tc_bytes_to_ticks(
5563 tc_police
.rate
.rate
, MIN(UINT32_MAX
/ 1024, kbits_burst
) * 1024 / 8);
5565 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWTFILTER
,
5566 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
5570 tcmsg
->tcm_parent
= tc_make_handle(0xffff, 0);
5571 tcmsg
->tcm_info
= tc_make_handle(49,
5572 (OVS_FORCE
uint16_t) htons(ETH_P_ALL
));
5574 nl_msg_put_string(&request
, TCA_KIND
, "basic");
5575 basic_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
5576 police_offset
= nl_msg_start_nested(&request
, TCA_BASIC_POLICE
);
5577 nl_msg_put_unspec(&request
, TCA_POLICE_TBF
, &tc_police
, sizeof tc_police
);
5578 tc_put_rtab(&request
, TCA_POLICE_RATE
, &tc_police
.rate
);
5579 nl_msg_end_nested(&request
, police_offset
);
5580 nl_msg_end_nested(&request
, basic_offset
);
5582 error
= tc_transact(&request
, NULL
);
5593 /* The values in psched are not individually very meaningful, but they are
5594 * important. The tables below show some values seen in the wild.
5598 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
5599 * (Before that, there are hints that it was 1000000000.)
5601 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
5605 * -----------------------------------
5606 * [1] 000c8000 000f4240 000f4240 00000064
5607 * [2] 000003e8 00000400 000f4240 3b9aca00
5608 * [3] 000003e8 00000400 000f4240 3b9aca00
5609 * [4] 000003e8 00000400 000f4240 00000064
5610 * [5] 000003e8 00000040 000f4240 3b9aca00
5611 * [6] 000003e8 00000040 000f4240 000000f9
5613 * a b c d ticks_per_s buffer_hz
5614 * ------- --------- ---------- ------------- ----------- -------------
5615 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
5616 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
5617 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
5618 * [4] 1,000 1,024 1,000,000 100 976,562 100
5619 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
5620 * [6] 1,000 64 1,000,000 249 15,625,000 249
5622 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
5623 * [2] 2.6.26-1-686-bigmem from Debian lenny
5624 * [3] 2.6.26-2-sparc64 from Debian lenny
5625 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
5626 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
5627 * [6] 2.6.34 from kernel.org on KVM
5629 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
5630 static const char fn
[] = "/proc/net/psched";
5631 unsigned int a
, b
, c
, d
;
5634 if (!ovsthread_once_start(&once
)) {
5641 stream
= fopen(fn
, "r");
5643 VLOG_WARN("%s: open failed: %s", fn
, ovs_strerror(errno
));
5647 if (fscanf(stream
, "%x %x %x %x", &a
, &b
, &c
, &d
) != 4) {
5648 VLOG_WARN("%s: read failed", fn
);
5652 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn
, a
, b
, c
, d
);
5655 if (!a
|| !b
|| !c
) {
5656 VLOG_WARN("%s: invalid scheduler parameters", fn
);
5660 ticks_per_s
= (double) a
* c
/ b
;
5664 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
5667 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn
, ticks_per_s
, buffer_hz
);
5670 ovsthread_once_done(&once
);
5673 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
5674 * rate of 'rate' bytes per second. */
5676 tc_ticks_to_bytes(unsigned int rate
, unsigned int ticks
)
5679 return (rate
* ticks
) / ticks_per_s
;
5682 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
5683 * rate of 'rate' bytes per second. */
5685 tc_bytes_to_ticks(unsigned int rate
, unsigned int size
)
5688 return rate
? ((unsigned long long int) ticks_per_s
* size
) / rate
: 0;
5691 /* Returns the number of bytes that need to be reserved for qdisc buffering at
5692 * a transmission rate of 'rate' bytes per second. */
5694 tc_buffer_per_jiffy(unsigned int rate
)
5697 return rate
/ buffer_hz
;
5701 tc_time_to_ticks(uint32_t time
) {
5703 return time
* (ticks_per_s
/ 1000000);
5706 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
5707 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
5708 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
5709 * stores NULL into it if it is absent.
5711 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
5714 * Returns 0 if successful, otherwise a positive errno value. */
5716 tc_parse_qdisc(const struct ofpbuf
*msg
, const char **kind
,
5717 struct nlattr
**options
)
5719 static const struct nl_policy tca_policy
[] = {
5720 [TCA_KIND
] = { .type
= NL_A_STRING
, .optional
= false },
5721 [TCA_OPTIONS
] = { .type
= NL_A_NESTED
, .optional
= true },
5723 struct nlattr
*ta
[ARRAY_SIZE(tca_policy
)];
5725 if (!nl_policy_parse(msg
, NLMSG_HDRLEN
+ sizeof(struct tcmsg
),
5726 tca_policy
, ta
, ARRAY_SIZE(ta
))) {
5727 VLOG_WARN_RL(&rl
, "failed to parse qdisc message");
5732 *kind
= nl_attr_get_string(ta
[TCA_KIND
]);
5736 *options
= ta
[TCA_OPTIONS
];
5751 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
5752 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
5753 * into '*options', and its queue statistics into '*stats'. Any of the output
5754 * arguments may be null.
5756 * Returns 0 if successful, otherwise a positive errno value. */
5758 tc_parse_class(const struct ofpbuf
*msg
, unsigned int *handlep
,
5759 struct nlattr
**options
, struct netdev_queue_stats
*stats
)
5761 static const struct nl_policy tca_policy
[] = {
5762 [TCA_OPTIONS
] = { .type
= NL_A_NESTED
, .optional
= false },
5763 [TCA_STATS2
] = { .type
= NL_A_NESTED
, .optional
= false },
5765 struct nlattr
*ta
[ARRAY_SIZE(tca_policy
)];
5767 if (!nl_policy_parse(msg
, NLMSG_HDRLEN
+ sizeof(struct tcmsg
),
5768 tca_policy
, ta
, ARRAY_SIZE(ta
))) {
5769 VLOG_WARN_RL(&rl
, "failed to parse class message");
5774 struct tcmsg
*tc
= ofpbuf_at_assert(msg
, NLMSG_HDRLEN
, sizeof *tc
);
5775 *handlep
= tc
->tcm_handle
;
5779 *options
= ta
[TCA_OPTIONS
];
5783 const struct gnet_stats_queue
*gsq
;
5784 struct gnet_stats_basic gsb
;
5786 static const struct nl_policy stats_policy
[] = {
5787 [TCA_STATS_BASIC
] = { .type
= NL_A_UNSPEC
, .optional
= false,
5788 .min_len
= sizeof gsb
},
5789 [TCA_STATS_QUEUE
] = { .type
= NL_A_UNSPEC
, .optional
= false,
5790 .min_len
= sizeof *gsq
},
5792 struct nlattr
*sa
[ARRAY_SIZE(stats_policy
)];
5794 if (!nl_parse_nested(ta
[TCA_STATS2
], stats_policy
,
5795 sa
, ARRAY_SIZE(sa
))) {
5796 VLOG_WARN_RL(&rl
, "failed to parse class stats");
5800 /* Alignment issues screw up the length of struct gnet_stats_basic on
5801 * some arch/bitsize combinations. Newer versions of Linux have a
5802 * struct gnet_stats_basic_packed, but we can't depend on that. The
5803 * easiest thing to do is just to make a copy. */
5804 memset(&gsb
, 0, sizeof gsb
);
5805 memcpy(&gsb
, nl_attr_get(sa
[TCA_STATS_BASIC
]),
5806 MIN(nl_attr_get_size(sa
[TCA_STATS_BASIC
]), sizeof gsb
));
5807 stats
->tx_bytes
= gsb
.bytes
;
5808 stats
->tx_packets
= gsb
.packets
;
5810 gsq
= nl_attr_get(sa
[TCA_STATS_QUEUE
]);
5811 stats
->tx_errors
= gsq
->drops
;
5821 memset(stats
, 0, sizeof *stats
);
5826 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
5829 tc_query_class(const struct netdev
*netdev
,
5830 unsigned int handle
, unsigned int parent
,
5831 struct ofpbuf
**replyp
)
5833 struct ofpbuf request
;
5834 struct tcmsg
*tcmsg
;
5837 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_GETTCLASS
, NLM_F_ECHO
,
5842 tcmsg
->tcm_handle
= handle
;
5843 tcmsg
->tcm_parent
= parent
;
5845 error
= tc_transact(&request
, replyp
);
5847 VLOG_WARN_RL(&rl
, "query %s class %u:%u (parent %u:%u) failed (%s)",
5848 netdev_get_name(netdev
),
5849 tc_get_major(handle
), tc_get_minor(handle
),
5850 tc_get_major(parent
), tc_get_minor(parent
),
5851 ovs_strerror(error
));
5856 /* Equivalent to "tc class del dev <name> handle <handle>". */
5858 tc_delete_class(const struct netdev
*netdev
, unsigned int handle
)
5860 struct ofpbuf request
;
5861 struct tcmsg
*tcmsg
;
5864 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_DELTCLASS
, 0, &request
);
5868 tcmsg
->tcm_handle
= handle
;
5869 tcmsg
->tcm_parent
= 0;
5871 error
= tc_transact(&request
, NULL
);
5873 VLOG_WARN_RL(&rl
, "delete %s class %u:%u failed (%s)",
5874 netdev_get_name(netdev
),
5875 tc_get_major(handle
), tc_get_minor(handle
),
5876 ovs_strerror(error
));
5881 /* Equivalent to "tc qdisc del dev <name> root". */
5883 tc_del_qdisc(struct netdev
*netdev_
)
5885 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5886 struct ofpbuf request
;
5887 struct tcmsg
*tcmsg
;
5890 tcmsg
= netdev_linux_tc_make_request(netdev_
, RTM_DELQDISC
, 0, &request
);
5894 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
5895 tcmsg
->tcm_parent
= TC_H_ROOT
;
5897 error
= tc_transact(&request
, NULL
);
5898 if (error
== EINVAL
) {
5899 /* EINVAL probably means that the default qdisc was in use, in which
5900 * case we've accomplished our purpose. */
5903 if (!error
&& netdev
->tc
) {
5904 if (netdev
->tc
->ops
->tc_destroy
) {
5905 netdev
->tc
->ops
->tc_destroy(netdev
->tc
);
5913 getqdisc_is_safe(void)
5915 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
5916 static bool safe
= false;
5918 if (ovsthread_once_start(&once
)) {
5919 struct utsname utsname
;
5922 if (uname(&utsname
) == -1) {
5923 VLOG_WARN("uname failed (%s)", ovs_strerror(errno
));
5924 } else if (!ovs_scan(utsname
.release
, "%d.%d", &major
, &minor
)) {
5925 VLOG_WARN("uname reported bad OS release (%s)", utsname
.release
);
5926 } else if (major
< 2 || (major
== 2 && minor
< 35)) {
5927 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5932 ovsthread_once_done(&once
);
5937 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
5938 * kernel to determine what they are. Returns 0 if successful, otherwise a
5939 * positive errno value. */
5941 tc_query_qdisc(const struct netdev
*netdev_
)
5943 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5944 struct ofpbuf request
, *qdisc
;
5945 const struct tc_ops
*ops
;
5946 struct tcmsg
*tcmsg
;
5954 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5955 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5956 * 2.6.35 without that fix backported to it.
5958 * To avoid the OOPS, we must not make a request that would attempt to dump
5959 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5960 * few others. There are a few ways that I can see to do this, but most of
5961 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5962 * technique chosen here is to assume that any non-default qdisc that we
5963 * create will have a class with handle 1:0. The built-in qdiscs only have
5964 * a class with handle 0:0.
5966 * On Linux 2.6.35+ we use the straightforward method because it allows us
5967 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5968 * in such a case we get no response at all from the kernel (!) if a
5969 * builtin qdisc is in use (which is later caught by "!error &&
5970 * !qdisc->size"). */
5971 tcmsg
= netdev_linux_tc_make_request(netdev_
, RTM_GETQDISC
, NLM_F_ECHO
,
5976 tcmsg
->tcm_handle
= tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5977 tcmsg
->tcm_parent
= getqdisc_is_safe() ? TC_H_ROOT
: 0;
5979 /* Figure out what tc class to instantiate. */
5980 error
= tc_transact(&request
, &qdisc
);
5981 if (!error
&& qdisc
->size
) {
5984 error
= tc_parse_qdisc(qdisc
, &kind
, NULL
);
5986 ops
= &tc_ops_other
;
5988 ops
= tc_lookup_linux_name(kind
);
5990 static struct vlog_rate_limit rl2
= VLOG_RATE_LIMIT_INIT(1, 1);
5991 VLOG_DBG_RL(&rl2
, "unknown qdisc \"%s\"", kind
);
5993 ops
= &tc_ops_other
;
5996 } else if ((!error
&& !qdisc
->size
) || error
== ENOENT
) {
5997 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5998 * set up by some other entity that doesn't have a handle 1:0. We will
5999 * assume that it's the system default qdisc. */
6000 ops
= &tc_ops_default
;
6003 /* Who knows? Maybe the device got deleted. */
6004 VLOG_WARN_RL(&rl
, "query %s qdisc failed (%s)",
6005 netdev_get_name(netdev_
), ovs_strerror(error
));
6006 ops
= &tc_ops_other
;
6009 /* Instantiate it. */
6010 load_error
= ops
->tc_load(CONST_CAST(struct netdev
*, netdev_
), qdisc
);
6011 ovs_assert((load_error
== 0) == (netdev
->tc
!= NULL
));
6012 ofpbuf_delete(qdisc
);
6014 return error
? error
: load_error
;
6017 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
6018 approximate the time to transmit packets of various lengths. For an MTU of
6019 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
6020 represents two possible packet lengths; for a MTU of 513 through 1024, four
6021 possible lengths; and so on.
6023 Returns, for the specified 'mtu', the number of bits that packet lengths
6024 need to be shifted right to fit within such a 256-entry table. */
6026 tc_calc_cell_log(unsigned int mtu
)
6031 mtu
= ETH_PAYLOAD_MAX
;
6033 mtu
+= ETH_HEADER_LEN
+ VLAN_HEADER_LEN
;
6035 for (cell_log
= 0; mtu
>= 256; cell_log
++) {
6042 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
6045 tc_fill_rate(struct tc_ratespec
*rate
, uint64_t Bps
, int mtu
)
6047 memset(rate
, 0, sizeof *rate
);
6048 rate
->cell_log
= tc_calc_cell_log(mtu
);
6049 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
6050 /* rate->cell_align = 0; */ /* distro headers. */
6051 rate
->mpu
= ETH_TOTAL_MIN
;
6055 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
6056 * attribute of the specified "type".
6058 * See tc_calc_cell_log() above for a description of "rtab"s. */
6060 tc_put_rtab(struct ofpbuf
*msg
, uint16_t type
, const struct tc_ratespec
*rate
)
6065 rtab
= nl_msg_put_unspec_uninit(msg
, type
, TC_RTAB_SIZE
);
6066 for (i
= 0; i
< TC_RTAB_SIZE
/ sizeof *rtab
; i
++) {
6067 unsigned packet_size
= (i
+ 1) << rate
->cell_log
;
6068 if (packet_size
< rate
->mpu
) {
6069 packet_size
= rate
->mpu
;
6071 rtab
[i
] = tc_bytes_to_ticks(rate
->rate
, packet_size
);
6075 /* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
6076 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
6077 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
6080 tc_calc_buffer(unsigned int Bps
, int mtu
, uint64_t burst_bytes
)
6082 unsigned int min_burst
= tc_buffer_per_jiffy(Bps
) + mtu
;
6083 return tc_bytes_to_ticks(Bps
, MAX(burst_bytes
, min_burst
));
6086 /* Linux-only functions declared in netdev-linux.h */
6088 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
6089 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
6091 netdev_linux_ethtool_set_flag(struct netdev
*netdev
, uint32_t flag
,
6092 const char *flag_name
, bool enable
)
6094 const char *netdev_name
= netdev_get_name(netdev
);
6095 struct ethtool_value evalue
;
6099 COVERAGE_INC(netdev_get_ethtool
);
6100 memset(&evalue
, 0, sizeof evalue
);
6101 error
= netdev_linux_do_ethtool(netdev_name
,
6102 (struct ethtool_cmd
*)&evalue
,
6103 ETHTOOL_GFLAGS
, "ETHTOOL_GFLAGS");
6108 COVERAGE_INC(netdev_set_ethtool
);
6109 new_flags
= (evalue
.data
& ~flag
) | (enable
? flag
: 0);
6110 if (new_flags
== evalue
.data
) {
6113 evalue
.data
= new_flags
;
6114 error
= netdev_linux_do_ethtool(netdev_name
,
6115 (struct ethtool_cmd
*)&evalue
,
6116 ETHTOOL_SFLAGS
, "ETHTOOL_SFLAGS");
6121 COVERAGE_INC(netdev_get_ethtool
);
6122 memset(&evalue
, 0, sizeof evalue
);
6123 error
= netdev_linux_do_ethtool(netdev_name
,
6124 (struct ethtool_cmd
*)&evalue
,
6125 ETHTOOL_GFLAGS
, "ETHTOOL_GFLAGS");
6130 if (new_flags
!= evalue
.data
) {
6131 VLOG_WARN_RL(&rl
, "attempt to %s ethtool %s flag on network "
6132 "device %s failed", enable
? "enable" : "disable",
6133 flag_name
, netdev_name
);
6140 /* Utility functions. */
6142 /* Copies 'src' into 'dst', performing format conversion in the process. */
6144 netdev_stats_from_rtnl_link_stats(struct netdev_stats
*dst
,
6145 const struct rtnl_link_stats
*src
)
6147 dst
->rx_packets
= src
->rx_packets
;
6148 dst
->tx_packets
= src
->tx_packets
;
6149 dst
->rx_bytes
= src
->rx_bytes
;
6150 dst
->tx_bytes
= src
->tx_bytes
;
6151 dst
->rx_errors
= src
->rx_errors
;
6152 dst
->tx_errors
= src
->tx_errors
;
6153 dst
->rx_dropped
= src
->rx_dropped
;
6154 dst
->tx_dropped
= src
->tx_dropped
;
6155 dst
->multicast
= src
->multicast
;
6156 dst
->collisions
= src
->collisions
;
6157 dst
->rx_length_errors
= src
->rx_length_errors
;
6158 dst
->rx_over_errors
= src
->rx_over_errors
;
6159 dst
->rx_crc_errors
= src
->rx_crc_errors
;
6160 dst
->rx_frame_errors
= src
->rx_frame_errors
;
6161 dst
->rx_fifo_errors
= src
->rx_fifo_errors
;
6162 dst
->rx_missed_errors
= src
->rx_missed_errors
;
6163 dst
->tx_aborted_errors
= src
->tx_aborted_errors
;
6164 dst
->tx_carrier_errors
= src
->tx_carrier_errors
;
6165 dst
->tx_fifo_errors
= src
->tx_fifo_errors
;
6166 dst
->tx_heartbeat_errors
= src
->tx_heartbeat_errors
;
6167 dst
->tx_window_errors
= src
->tx_window_errors
;
6170 /* Copies 'src' into 'dst', performing format conversion in the process. */
6172 netdev_stats_from_rtnl_link_stats64(struct netdev_stats
*dst
,
6173 const struct rtnl_link_stats64
*src
)
6175 dst
->rx_packets
= src
->rx_packets
;
6176 dst
->tx_packets
= src
->tx_packets
;
6177 dst
->rx_bytes
= src
->rx_bytes
;
6178 dst
->tx_bytes
= src
->tx_bytes
;
6179 dst
->rx_errors
= src
->rx_errors
;
6180 dst
->tx_errors
= src
->tx_errors
;
6181 dst
->rx_dropped
= src
->rx_dropped
;
6182 dst
->tx_dropped
= src
->tx_dropped
;
6183 dst
->multicast
= src
->multicast
;
6184 dst
->collisions
= src
->collisions
;
6185 dst
->rx_length_errors
= src
->rx_length_errors
;
6186 dst
->rx_over_errors
= src
->rx_over_errors
;
6187 dst
->rx_crc_errors
= src
->rx_crc_errors
;
6188 dst
->rx_frame_errors
= src
->rx_frame_errors
;
6189 dst
->rx_fifo_errors
= src
->rx_fifo_errors
;
6190 dst
->rx_missed_errors
= src
->rx_missed_errors
;
6191 dst
->tx_aborted_errors
= src
->tx_aborted_errors
;
6192 dst
->tx_carrier_errors
= src
->tx_carrier_errors
;
6193 dst
->tx_fifo_errors
= src
->tx_fifo_errors
;
6194 dst
->tx_heartbeat_errors
= src
->tx_heartbeat_errors
;
6195 dst
->tx_window_errors
= src
->tx_window_errors
;
/* Retrieves interface statistics for 'netdev_' via an RTM_GETLINK netlink
 * request, preferring the 64-bit IFLA_STATS64 attribute and falling back to
 * the legacy 32-bit IFLA_STATS attribute.  On success fills in 'stats' and
 * returns 0; on failure returns a positive errno value.
 *
 * Counters the kernel does not report remain at all-ones (see the memset
 * below), which callers interpret as "not supported". */
static int
get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
{
    struct ofpbuf request;
    struct ofpbuf *reply;
    int error;

    /* Filtering all counters by default */
    memset(stats, 0xFF, sizeof(struct netdev_stats));

    /* Build an RTM_GETLINK request identifying the device by name. */
    ofpbuf_init(&request, 0);
    nl_msg_put_nlmsghdr(&request,
                        sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
                        RTM_GETLINK, NLM_F_REQUEST);
    ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
    nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
    error = nl_transact(NETLINK_ROUTE, &request, &reply);
    ofpbuf_uninit(&request);
    if (error) {
        return error;
    }

    /* Skip the netlink and ifinfo headers to reach the attributes. */
    if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
        /* Prefer the 64-bit counters when the kernel provides them. */
        const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
        if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
            netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
            error = 0;
        } else {
            /* Fall back to the legacy 32-bit counters. */
            a = nl_attr_find(reply, 0, IFLA_STATS);
            if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
                netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
                error = 0;
            } else {
                VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
                error = EPROTO;
            }
        }
    } else {
        VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
        error = EPROTO;
    }

    ofpbuf_delete(reply);
    return error;
}
/* Reads the interface flags (IFF_*) of 'dev' with the SIOCGIFFLAGS ioctl.
 * On success stores them in '*flags' and returns 0; on failure returns a
 * positive errno value and leaves '*flags' zeroed. */
static int
get_flags(const struct netdev *dev, unsigned int *flags)
{
    struct ifreq ifr;
    int error;

    *flags = 0;
    error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
    if (!error) {
        /* 'ifr_flags' is a short; widen it into the caller's unsigned int. */
        *flags = ifr.ifr_flags;
    }
    return error;
}
/* Sets the interface flags (IFF_*) of the device named 'name' using the
 * SIOCSIFFLAGS ioctl.  Returns 0 on success or a positive errno value. */
static int
set_flags(const char *name, unsigned int flags)
{
    struct ifreq ifr;

    memset(&ifr, 0, sizeof ifr);
    ifr.ifr_flags = flags;
    return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
}
/* Returns the kernel ifindex of the device named 'netdev_name' as reported
 * by the SIOCGIFINDEX ioctl, or a negative errno value on failure. */
static int
linux_get_ifindex(const char *netdev_name)
{
    struct ifreq ifr;
    int error;

    ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
    COVERAGE_INC(netdev_get_ifindex);

    error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
    if (error) {
        /* ENODEV probably means that a vif disappeared asynchronously and
         * hasn't been removed from the database yet, so reduce the log level
         * to INFO for that case. */
        VLOG_RL(&rl, error == ENODEV ? VLL_INFO : VLL_ERR,
                "ioctl(SIOCGIFINDEX) on %s device failed: %s",
                netdev_name, ovs_strerror(error));
        return -error;
    }
    return ifr.ifr_ifindex;
}
/* Looks up the kernel ifindex of 'netdev_', caching the result (and any
 * lookup error) on the netdev.  Tries a netlink refresh first and falls
 * back to the SIOCGIFINDEX ioctl if netlink did not populate the cache.
 * Stores the ifindex in '*ifindexp' (0 on error) and returns 0 on success
 * or a positive errno value on failure. */
static int
get_ifindex(const struct netdev *netdev_, int *ifindexp)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    if (!(netdev->cache_valid & VALID_IFINDEX)) {
        netdev_linux_update_via_netlink(netdev);
    }

    if (!(netdev->cache_valid & VALID_IFINDEX)) {
        /* Fall back to ioctl if netlink fails */
        int ifindex = linux_get_ifindex(netdev_get_name(netdev_));

        if (ifindex < 0) {
            /* linux_get_ifindex() returns -errno on failure. */
            netdev->get_ifindex_error = -ifindex;
            netdev->ifindex = 0;
        } else {
            netdev->get_ifindex_error = 0;
            netdev->ifindex = ifindex;
        }
        /* Cache the outcome either way so we do not retry on every call. */
        netdev->cache_valid |= VALID_IFINDEX;
    }

    *ifindexp = netdev->ifindex;
    return netdev->get_ifindex_error;
}
/* Refreshes 'netdev''s cached attributes (flags, MTU, MAC address, ifindex,
 * LAG membership) from a fresh RTM_GETLINK query, and bumps the netdev's
 * change sequence if anything changed.  Returns 0 on success or a positive
 * errno value on failure. */
static int
netdev_linux_update_via_netlink(struct netdev_linux *netdev)
{
    struct ofpbuf request;
    struct ofpbuf *reply;
    struct rtnetlink_change chg;
    struct rtnetlink_change *change = &chg;
    int error;

    ofpbuf_init(&request, 0);
    nl_msg_put_nlmsghdr(&request,
                        sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ) +
                        NL_A_U32_SIZE, RTM_GETLINK, NLM_F_REQUEST);
    ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));

    /* The correct identifiers for a Linux device are netnsid and ifindex,
     * but ifindex changes as the port is moved to another network namespace
     * and the interface name statically stored in ovsdb. */
    nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(&netdev->up));
    if (netdev_linux_netnsid_is_remote(netdev)) {
        nl_msg_put_u32(&request, IFLA_IF_NETNSID, netdev->netnsid);
    }
    error = nl_transact(NETLINK_ROUTE, &request, &reply);
    ofpbuf_uninit(&request);
    if (error) {
        ofpbuf_delete(reply);
        return error;
    }

    if (rtnetlink_parse(reply, change)
        && change->nlmsg_type == RTM_NEWLINK) {
        bool changed = false;
        error = 0;

        /* Update netdev from rtnl msg and increment its seq if needed. */
        if ((change->ifi_flags ^ netdev->ifi_flags) & IFF_RUNNING) {
            /* IFF_RUNNING toggled: count it as a carrier reset. */
            netdev->carrier_resets++;
            changed = true;
        }
        if (change->ifi_flags != netdev->ifi_flags) {
            netdev->ifi_flags = change->ifi_flags;
            changed = true;
        }
        if (change->mtu && change->mtu != netdev->mtu) {
            netdev->mtu = change->mtu;
            netdev->cache_valid |= VALID_MTU;
            netdev->netdev_mtu_error = 0;
            changed = true;
        }
        if (!eth_addr_is_zero(change->mac)
            && !eth_addr_equals(change->mac, netdev->etheraddr)) {
            netdev->etheraddr = change->mac;
            netdev->cache_valid |= VALID_ETHERADDR;
            netdev->ether_addr_error = 0;
            changed = true;
        }
        if (change->if_index != netdev->ifindex) {
            netdev->ifindex = change->if_index;
            netdev->cache_valid |= VALID_IFINDEX;
            netdev->get_ifindex_error = 0;
            changed = true;
        }
        if (change->master && netdev_linux_kind_is_lag(change->master)) {
            netdev->is_lag_master = true;
        }
        if (changed) {
            netdev_change_seq_changed(&netdev->up);
        }
    } else {
        /* Reply did not parse as an RTM_NEWLINK message. */
        error = EINVAL;
    }

    ofpbuf_delete(reply);
    return error;
}
/* Reads the hardware (Ethernet) address of the device named 'netdev_name'
 * with the SIOCGIFHWADDR ioctl into '*ea'.  Returns 0 on success or a
 * positive errno value on failure (EINVAL if the device's address family
 * is not an Ethernet-compatible one). */
static int
get_etheraddr(const char *netdev_name, struct eth_addr *ea)
{
    struct ifreq ifr;
    int hwaddr_family;
    int error;

    memset(&ifr, 0, sizeof ifr);
    ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
    COVERAGE_INC(netdev_get_hwaddr);
    error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
    if (error) {
        /* ENODEV probably means that a vif disappeared asynchronously and
         * hasn't been removed from the database yet, so reduce the log level
         * to INFO for that case. */
        VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
             "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
             netdev_name, ovs_strerror(error));
        return error;
    }
    hwaddr_family = ifr.ifr_hwaddr.sa_family;
    if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER &&
        hwaddr_family != ARPHRD_NONE) {
        VLOG_INFO("%s device has unknown hardware address family %d",
                  netdev_name, hwaddr_family);
        return EINVAL;
    }
    memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
    return 0;
}
/* Sets the hardware (Ethernet) address of the device named 'netdev_name'
 * to 'mac' with the SIOCSIFHWADDR ioctl.  Returns 0 on success or a
 * positive errno value on failure. */
static int
set_etheraddr(const char *netdev_name, const struct eth_addr mac)
{
    struct ifreq ifr;
    int error;

    memset(&ifr, 0, sizeof ifr);
    ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
    ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
    memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
    COVERAGE_INC(netdev_set_hwaddr);
    error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
    if (error) {
        VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
                 netdev_name, ovs_strerror(error));
    }
    return error;
}
/* Issues ethtool command 'cmd' (e.g. ETHTOOL_GFLAGS) on the device named
 * 'name' via the SIOCETHTOOL ioctl, using 'ecmd' as the in/out command
 * buffer.  'cmd_name' is the human-readable command name used in log
 * messages.  Returns 0 on success or a positive errno value on failure;
 * EOPNOTSUPP (device does not support the command) is not logged. */
static int
netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
                        int cmd, const char *cmd_name)
{
    struct ifreq ifr;
    int error;

    memset(&ifr, 0, sizeof ifr);
    ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
    ifr.ifr_data = (caddr_t) ecmd;

    ecmd->cmd = cmd;
    error = af_inet_ioctl(SIOCETHTOOL, &ifr);
    if (error) {
        if (error != EOPNOTSUPP) {
            VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
                         "failed: %s", cmd_name, name, ovs_strerror(error));
        } else {
            /* The device doesn't support this operation.  That's pretty
             * common, so there's no point in logging anything. */
        }
    }
    return error;
}
6468 /* Returns an AF_PACKET raw socket or a negative errno value. */
/* Returns an AF_PACKET raw socket or a negative errno value. */
static int
af_packet_sock(void)
{
    /* The socket is created once, on first use, and shared afterwards;
     * 'sock' holds either the fd or a negative errno from the attempt. */
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static int sock;

    if (ovsthread_once_start(&once)) {
        sock = socket(AF_PACKET, SOCK_RAW, 0);
        if (sock >= 0) {
            int error = set_nonblocking(sock);
            if (error) {
                close(sock);
                sock = -error;
            } else if (userspace_tso_enabled()) {
                /* With userspace TSO, sends carry a virtio-net header, so
                 * tell the kernel to expect one on this socket. */
                int val = 1;

                error = setsockopt(sock, SOL_PACKET, PACKET_VNET_HDR, &val,
                                   sizeof(val));
                if (error) {
                    VLOG_ERR("failed to enable vnet hdr in raw socket: %s",
                             ovs_strerror(errno));
                    close(sock);
                    sock = -errno;
                }
            }
        } else {
            sock = -errno;
            VLOG_ERR("failed to create packet socket: %s",
                     ovs_strerror(errno));
        }
        ovsthread_once_done(&once);
    }

    return sock;
}
/* Parses the Ethernet (and optional single VLAN) header of 'b', stores the
 * L4 protocol number of an IPv4/IPv6 payload in '*l4proto', and marks the
 * packet's Tx offload flags for the detected network protocol.  Returns 0
 * on success or -EINVAL if the packet is too short to parse.
 *
 * Note: '*l4proto' is left untouched for non-IP ethertypes. */
static int
netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto)
{
    struct eth_header *eth_hdr;
    ovs_be16 eth_type;
    int l2_len;

    eth_hdr = dp_packet_at(b, 0, ETH_HEADER_LEN);
    if (!eth_hdr) {
        return -EINVAL;
    }

    l2_len = ETH_HEADER_LEN;
    eth_type = eth_hdr->eth_type;
    if (eth_type_vlan(eth_type)) {
        /* Step over one VLAN tag to the inner ethertype. */
        struct vlan_header *vlan = dp_packet_at(b, l2_len, VLAN_HEADER_LEN);

        if (!vlan) {
            return -EINVAL;
        }

        eth_type = vlan->vlan_next_type;
        l2_len += VLAN_HEADER_LEN;
    }

    if (eth_type == htons(ETH_TYPE_IP)) {
        struct ip_header *ip_hdr = dp_packet_at(b, l2_len, IP_HEADER_LEN);

        if (!ip_hdr) {
            return -EINVAL;
        }

        *l4proto = ip_hdr->ip_proto;
        dp_packet_hwol_set_tx_ipv4(b);
    } else if (eth_type == htons(ETH_TYPE_IPV6)) {
        struct ovs_16aligned_ip6_hdr *nh6;

        nh6 = dp_packet_at(b, l2_len, IPV6_HEADER_LEN);
        if (!nh6) {
            return -EINVAL;
        }

        /* Only the fixed IPv6 header's next-header field is examined;
         * extension headers are not walked here. */
        *l4proto = nh6->ip6_ctlun.ip6_un1.ip6_un1_nxt;
        dp_packet_hwol_set_tx_ipv6(b);
    }

    return 0;
}
/* Strips the leading virtio-net header from 'b' and translates its checksum
 * and GSO requests into the packet's hardware-offload flags.  Returns 0 on
 * success or -EINVAL if the header is missing or the L2/L3 headers cannot
 * be parsed. */
static int
netdev_linux_parse_vnet_hdr(struct dp_packet *b)
{
    struct virtio_net_hdr *vnet = dp_packet_pull(b, sizeof *vnet);
    uint16_t l4proto = 0;

    if (OVS_UNLIKELY(!vnet)) {
        return -EINVAL;
    }

    /* Fast path: no offload requested, nothing to translate. */
    if (vnet->flags == 0 && vnet->gso_type == VIRTIO_NET_HDR_GSO_NONE) {
        return 0;
    }

    if (netdev_linux_parse_l2(b, &l4proto)) {
        return -EINVAL;
    }

    if (vnet->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
        /* Sender deferred the L4 checksum; flag it for later computation. */
        if (l4proto == IPPROTO_TCP) {
            dp_packet_hwol_set_csum_tcp(b);
        } else if (l4proto == IPPROTO_UDP) {
            dp_packet_hwol_set_csum_udp(b);
        } else if (l4proto == IPPROTO_SCTP) {
            dp_packet_hwol_set_csum_sctp(b);
        }
    }

    if (l4proto && vnet->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
        uint8_t allowed_mask = VIRTIO_NET_HDR_GSO_TCPV4
                                | VIRTIO_NET_HDR_GSO_TCPV6
                                | VIRTIO_NET_HDR_GSO_UDP;
        uint8_t type = vnet->gso_type & allowed_mask;

        /* Only TCP segmentation is translated into an offload flag here;
         * VIRTIO_NET_HDR_GSO_UDP matches the mask but sets nothing. */
        if (type == VIRTIO_NET_HDR_GSO_TCPV4
            || type == VIRTIO_NET_HDR_GSO_TCPV6) {
            dp_packet_hwol_set_tcp_seg(b);
        }
    }

    return 0;
}
6598 netdev_linux_prepend_vnet_hdr(struct dp_packet
*b
, int mtu
)
6600 struct virtio_net_hdr
*vnet
= dp_packet_push_zeros(b
, sizeof *vnet
);
6602 if (dp_packet_hwol_is_tso(b
)) {
6603 uint16_t hdr_len
= ((char *)dp_packet_l4(b
) - (char *)dp_packet_eth(b
))
6606 vnet
->hdr_len
= (OVS_FORCE __virtio16
)hdr_len
;
6607 vnet
->gso_size
= (OVS_FORCE __virtio16
)(mtu
- hdr_len
);
6608 if (dp_packet_hwol_is_ipv4(b
)) {
6609 vnet
->gso_type
= VIRTIO_NET_HDR_GSO_TCPV4
;
6611 vnet
->gso_type
= VIRTIO_NET_HDR_GSO_TCPV6
;
6615 vnet
->flags
= VIRTIO_NET_HDR_GSO_NONE
;
6618 if (dp_packet_hwol_l4_mask(b
)) {
6619 vnet
->flags
= VIRTIO_NET_HDR_F_NEEDS_CSUM
;
6620 vnet
->csum_start
= (OVS_FORCE __virtio16
)((char *)dp_packet_l4(b
)
6621 - (char *)dp_packet_eth(b
));
6623 if (dp_packet_hwol_l4_is_tcp(b
)) {
6624 vnet
->csum_offset
= (OVS_FORCE __virtio16
) __builtin_offsetof(
6625 struct tcp_header
, tcp_csum
);
6626 } else if (dp_packet_hwol_l4_is_udp(b
)) {
6627 vnet
->csum_offset
= (OVS_FORCE __virtio16
) __builtin_offsetof(
6628 struct udp_header
, udp_csum
);
6629 } else if (dp_packet_hwol_l4_is_sctp(b
)) {
6630 vnet
->csum_offset
= (OVS_FORCE __virtio16
) __builtin_offsetof(
6631 struct sctp_header
, sctp_csum
);
6633 VLOG_WARN_RL(&rl
, "Unsupported L4 protocol");