2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
20 #include "netdev-linux-private.h"
24 #include <sys/types.h>
25 #include <netinet/in.h>
26 #include <arpa/inet.h>
29 #include <linux/filter.h>
30 #include <linux/gen_stats.h>
31 #include <linux/if_ether.h>
32 #include <linux/if_packet.h>
33 #include <linux/if_tun.h>
34 #include <linux/types.h>
35 #include <linux/ethtool.h>
36 #include <linux/mii.h>
37 #include <linux/rtnetlink.h>
38 #include <linux/sockios.h>
39 #include <linux/virtio_net.h>
40 #include <sys/ioctl.h>
41 #include <sys/socket.h>
43 #include <sys/utsname.h>
45 #include <net/if_arp.h>
46 #include <net/route.h>
53 #include "dp-packet.h"
54 #include "dpif-netlink.h"
55 #include "dpif-netdev.h"
56 #include "openvswitch/dynamic-string.h"
57 #include "fatal-signal.h"
59 #include "openvswitch/hmap.h"
60 #include "netdev-afxdp.h"
61 #include "netdev-provider.h"
62 #include "netdev-vport.h"
63 #include "netlink-notifier.h"
64 #include "netlink-socket.h"
67 #include "openvswitch/ofpbuf.h"
68 #include "openflow/openflow.h"
69 #include "ovs-atomic.h"
72 #include "openvswitch/poll-loop.h"
73 #include "rtnetlink.h"
74 #include "openvswitch/shash.h"
75 #include "socket-util.h"
79 #include "unaligned.h"
80 #include "openvswitch/vlog.h"
81 #include "userspace-tso.h"
84 VLOG_DEFINE_THIS_MODULE(netdev_linux
);
86 COVERAGE_DEFINE(netdev_set_policing
);
87 COVERAGE_DEFINE(netdev_arp_lookup
);
88 COVERAGE_DEFINE(netdev_get_ifindex
);
89 COVERAGE_DEFINE(netdev_get_hwaddr
);
90 COVERAGE_DEFINE(netdev_set_hwaddr
);
91 COVERAGE_DEFINE(netdev_get_ethtool
);
92 COVERAGE_DEFINE(netdev_set_ethtool
);
95 #ifndef IFLA_IF_NETNSID
96 #define IFLA_IF_NETNSID 0x45
98 /* These were introduced in Linux 2.6.14, so they might be missing if we have
100 #ifndef ADVERTISED_Pause
101 #define ADVERTISED_Pause (1 << 13)
103 #ifndef ADVERTISED_Asym_Pause
104 #define ADVERTISED_Asym_Pause (1 << 14)
107 /* These were introduced in Linux 2.6.24, so they might be missing if we
108 * have old headers. */
109 #ifndef ETHTOOL_GFLAGS
110 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
112 #ifndef ETHTOOL_SFLAGS
113 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
116 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
119 #define TC_RTAB_SIZE 1024
122 #ifndef TCM_IFINDEX_MAGIC_BLOCK
123 #define TCM_IFINDEX_MAGIC_BLOCK (0xFFFFFFFFU)
126 /* Linux 2.6.21 introduced struct tpacket_auxdata.
127 * Linux 2.6.27 added the tp_vlan_tci member.
128 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
129 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
130 * TP_STATUS_VLAN_TPID_VALID.
132 * With all this churn it's easiest to unconditionally define a replacement
133 * structure that has everything we want.
135 #ifndef PACKET_AUXDATA
136 #define PACKET_AUXDATA 8
138 #ifndef TP_STATUS_VLAN_VALID
139 #define TP_STATUS_VLAN_VALID (1 << 4)
141 #ifndef TP_STATUS_VLAN_TPID_VALID
142 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
144 #undef tpacket_auxdata
145 #define tpacket_auxdata rpl_tpacket_auxdata
146 struct tpacket_auxdata
{
152 uint16_t tp_vlan_tci
;
153 uint16_t tp_vlan_tpid
;
156 /* Linux 2.6.27 introduced ethtool_cmd_speed
158 * To avoid revisiting problems reported with using configure to detect
159 * compatibility (see report at
160 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html)
161 * unconditionally replace ethtool_cmd_speed. */
162 #define ethtool_cmd_speed rpl_ethtool_cmd_speed
/* Replacement for ethtool_cmd_speed(): reassembles the link speed (in Mbps)
 * from the legacy split fields of struct ethtool_cmd, where 'speed' holds
 * the low 16 bits and 'speed_hi' the high 16 bits. */
static inline uint32_t
rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
{
    uint32_t hi = ep->speed_hi;    /* Widen before shifting. */

    return (hi << 16) | ep->speed;
}
168 /* Linux 2.6.30 introduced supported and advertised flags for
169 * 1G base KX, and 10G base KX4, KR and R. */
170 #ifndef SUPPORTED_1000baseKX_Full
171 #define SUPPORTED_1000baseKX_Full (1 << 17)
172 #define SUPPORTED_10000baseKX4_Full (1 << 18)
173 #define SUPPORTED_10000baseKR_Full (1 << 19)
174 #define SUPPORTED_10000baseR_FEC (1 << 20)
175 #define ADVERTISED_1000baseKX_Full (1 << 17)
176 #define ADVERTISED_10000baseKX4_Full (1 << 18)
177 #define ADVERTISED_10000baseKR_Full (1 << 19)
178 #define ADVERTISED_10000baseR_FEC (1 << 20)
181 /* Linux 3.5 introduced supported and advertised flags for
182 * 40G base KR4, CR4, SR4 and LR4. */
183 #ifndef SUPPORTED_40000baseKR4_Full
184 #define SUPPORTED_40000baseKR4_Full (1 << 23)
185 #define SUPPORTED_40000baseCR4_Full (1 << 24)
186 #define SUPPORTED_40000baseSR4_Full (1 << 25)
187 #define SUPPORTED_40000baseLR4_Full (1 << 26)
188 #define ADVERTISED_40000baseKR4_Full (1 << 23)
189 #define ADVERTISED_40000baseCR4_Full (1 << 24)
190 #define ADVERTISED_40000baseSR4_Full (1 << 25)
191 #define ADVERTISED_40000baseLR4_Full (1 << 26)
194 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
196 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
197 * 2.6.32-431.29.2.el6.x86_64 (see report at
198 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html).
199 * Maybe if_link.h is not self-contained on those kernels. It is easiest to
200 * unconditionally define a replacement. */
202 #define IFLA_STATS64 23
204 #define rtnl_link_stats64 rpl_rtnl_link_stats64
205 struct rtnl_link_stats64
{
217 uint64_t rx_length_errors
;
218 uint64_t rx_over_errors
;
219 uint64_t rx_crc_errors
;
220 uint64_t rx_frame_errors
;
221 uint64_t rx_fifo_errors
;
222 uint64_t rx_missed_errors
;
224 uint64_t tx_aborted_errors
;
225 uint64_t tx_carrier_errors
;
226 uint64_t tx_fifo_errors
;
227 uint64_t tx_heartbeat_errors
;
228 uint64_t tx_window_errors
;
230 uint64_t rx_compressed
;
231 uint64_t tx_compressed
;
235 VALID_IFINDEX
= 1 << 0,
236 VALID_ETHERADDR
= 1 << 1,
239 VALID_POLICING
= 1 << 4,
240 VALID_VPORT_STAT_ERROR
= 1 << 5,
241 VALID_DRVINFO
= 1 << 6,
242 VALID_FEATURES
= 1 << 7,
243 VALID_NUMA_ID
= 1 << 8,
246 /* Use one for the packet buffer and another for the aux buffer to receive
248 #define IOV_STD_SIZE 1
249 #define IOV_TSO_SIZE 2
256 struct linux_lag_slave
{
258 struct shash_node
*node
;
261 /* Protects 'lag_shash' and the mutable members of struct linux_lag_slave. */
262 static struct ovs_mutex lag_mutex
= OVS_MUTEX_INITIALIZER
;
264 /* All slaves whose LAG masters are network devices in OvS. */
265 static struct shash lag_shash
OVS_GUARDED_BY(lag_mutex
)
266 = SHASH_INITIALIZER(&lag_shash
);
268 /* Traffic control. */
270 /* An instance of a traffic control class. Always associated with a particular
273 * Each TC implementation subclasses this with whatever additional data it
276 const struct tc_ops
*ops
;
277 struct hmap queues
; /* Contains "struct tc_queue"s.
278 * Read by generic TC layer.
279 * Written only by TC implementation. */
282 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
284 /* One traffic control queue.
286 * Each TC implementation subclasses this with whatever additional data it
289 struct hmap_node hmap_node
; /* In struct tc's "queues" hmap. */
290 unsigned int queue_id
; /* OpenFlow queue ID. */
291 long long int created
; /* Time queue was created, in msecs. */
294 /* A particular kind of traffic control. Each implementation generally maps to
295 * one particular Linux qdisc class.
297 * The functions below return 0 if successful or a positive errno value on
298 * failure, except where otherwise noted. All of them must be provided, except
299 * where otherwise noted. */
301 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
302 * This is null for tc_ops_default and tc_ops_other, for which there are no
303 * appropriate values. */
304 const char *linux_name
;
306 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
307 const char *ovs_name
;
309 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
310 * queues. The queues are numbered 0 through n_queues - 1. */
311 unsigned int n_queues
;
313 /* Called to install this TC class on 'netdev'. The implementation should
314 * make the Netlink calls required to set up 'netdev' with the right qdisc
315 * and configure it according to 'details'. The implementation may assume
316 * that the current qdisc is the default; that is, there is no need for it
317 * to delete the current qdisc before installing itself.
319 * The contents of 'details' should be documented as valid for 'ovs_name'
320 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
321 * (which is built as ovs-vswitchd.conf.db(8)).
323 * This function must return 0 if and only if it sets 'netdev->tc' to an
324 * initialized 'struct tc'.
326 * (This function is null for tc_ops_other, which cannot be installed. For
327 * other TC classes it should always be nonnull.) */
328 int (*tc_install
)(struct netdev
*netdev
, const struct smap
*details
);
330 /* Called when the netdev code determines (through a Netlink query) that
331 * this TC class's qdisc is installed on 'netdev', but we didn't install
332 * it ourselves and so don't know any of the details.
334 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
335 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
336 * implementation should parse the other attributes of 'nlmsg' as
337 * necessary to determine its configuration. If necessary it should also
338 * use Netlink queries to determine the configuration of queues on
341 * This function must return 0 if and only if it sets 'netdev->tc' to an
342 * initialized 'struct tc'. */
343 int (*tc_load
)(struct netdev
*netdev
, struct ofpbuf
*nlmsg
);
345 /* Destroys the data structures allocated by the implementation as part of
346 * 'tc'. (This includes destroying 'tc->queues' by calling
349 * The implementation should not need to perform any Netlink calls. If
350 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
351 * (But it may not be desirable.)
353 * This function may be null if 'tc' is trivial. */
354 void (*tc_destroy
)(struct tc
*tc
);
356 /* Retrieves details of 'netdev->tc' configuration into 'details'.
358 * The implementation should not need to perform any Netlink calls, because
359 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
360 * cached the configuration.
362 * The contents of 'details' should be documented as valid for 'ovs_name'
363 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
364 * (which is built as ovs-vswitchd.conf.db(8)).
366 * This function may be null if 'tc' is not configurable.
368 int (*qdisc_get
)(const struct netdev
*netdev
, struct smap
*details
);
370 /* Reconfigures 'netdev->tc' according to 'details', performing any
371 * required Netlink calls to complete the reconfiguration.
373 * The contents of 'details' should be documented as valid for 'ovs_name'
374 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
375 * (which is built as ovs-vswitchd.conf.db(8)).
377 * This function may be null if 'tc' is not configurable.
379 int (*qdisc_set
)(struct netdev
*, const struct smap
*details
);
381 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
382 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
384 * The contents of 'details' should be documented as valid for 'ovs_name'
385 * in the "other_config" column in the "Queue" table in
386 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
388 * The implementation should not need to perform any Netlink calls, because
389 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
390 * cached the queue configuration.
392 * This function may be null if 'tc' does not have queues ('n_queues' is
394 int (*class_get
)(const struct netdev
*netdev
, const struct tc_queue
*queue
,
395 struct smap
*details
);
397 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
398 * 'details', perfoming any required Netlink calls to complete the
399 * reconfiguration. The caller ensures that 'queue_id' is less than
402 * The contents of 'details' should be documented as valid for 'ovs_name'
403 * in the "other_config" column in the "Queue" table in
404 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
406 * This function may be null if 'tc' does not have queues or its queues are
407 * not configurable. */
408 int (*class_set
)(struct netdev
*, unsigned int queue_id
,
409 const struct smap
*details
);
411 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
412 * tc_queue's within 'netdev->tc->queues'.
414 * This function may be null if 'tc' does not have queues or its queues
415 * cannot be deleted. */
416 int (*class_delete
)(struct netdev
*, struct tc_queue
*queue
);
418 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
419 * 'struct tc_queue's within 'netdev->tc->queues'.
421 * On success, initializes '*stats'.
423 * This function may be null if 'tc' does not have queues or if it cannot
424 * report queue statistics. */
425 int (*class_get_stats
)(const struct netdev
*netdev
,
426 const struct tc_queue
*queue
,
427 struct netdev_queue_stats
*stats
);
429 /* Extracts queue stats from 'nlmsg', which is a response to a
430 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
432 * This function may be null if 'tc' does not have queues or if it cannot
433 * report queue statistics. */
434 int (*class_dump_stats
)(const struct netdev
*netdev
,
435 const struct ofpbuf
*nlmsg
,
436 netdev_dump_queue_stats_cb
*cb
, void *aux
);
440 tc_init(struct tc
*tc
, const struct tc_ops
*ops
)
443 hmap_init(&tc
->queues
);
447 tc_destroy(struct tc
*tc
)
449 hmap_destroy(&tc
->queues
);
452 static const struct tc_ops tc_ops_htb
;
453 static const struct tc_ops tc_ops_hfsc
;
454 static const struct tc_ops tc_ops_codel
;
455 static const struct tc_ops tc_ops_fqcodel
;
456 static const struct tc_ops tc_ops_sfq
;
457 static const struct tc_ops tc_ops_netem
;
458 static const struct tc_ops tc_ops_default
;
459 static const struct tc_ops tc_ops_noop
;
460 static const struct tc_ops tc_ops_other
;
462 static const struct tc_ops
*const tcs
[] = {
463 &tc_ops_htb
, /* Hierarchy token bucket (see tc-htb(8)). */
464 &tc_ops_hfsc
, /* Hierarchical fair service curve. */
465 &tc_ops_codel
, /* Controlled delay */
466 &tc_ops_fqcodel
, /* Fair queue controlled delay */
467 &tc_ops_sfq
, /* Stochastic fair queueing */
468 &tc_ops_netem
, /* Network Emulator */
469 &tc_ops_noop
, /* Non operating qos type. */
470 &tc_ops_default
, /* Default qdisc (see tc-pfifo_fast(8)). */
471 &tc_ops_other
, /* Some other qdisc. */
475 static unsigned int tc_ticks_to_bytes(unsigned int rate
, unsigned int ticks
);
476 static unsigned int tc_bytes_to_ticks(unsigned int rate
, unsigned int size
);
477 static unsigned int tc_buffer_per_jiffy(unsigned int rate
);
478 static uint32_t tc_time_to_ticks(uint32_t time
);
480 static struct tcmsg
*netdev_linux_tc_make_request(const struct netdev
*,
484 static int tc_add_policer(struct netdev
*,
485 uint32_t kbits_rate
, uint32_t kbits_burst
);
487 static int tc_parse_qdisc(const struct ofpbuf
*, const char **kind
,
488 struct nlattr
**options
);
489 static int tc_parse_class(const struct ofpbuf
*, unsigned int *queue_id
,
490 struct nlattr
**options
,
491 struct netdev_queue_stats
*);
492 static int tc_query_class(const struct netdev
*,
493 unsigned int handle
, unsigned int parent
,
494 struct ofpbuf
**replyp
);
495 static int tc_delete_class(const struct netdev
*, unsigned int handle
);
497 static int tc_del_qdisc(struct netdev
*netdev
);
498 static int tc_query_qdisc(const struct netdev
*netdev
);
501 tc_put_rtab(struct ofpbuf
*msg
, uint16_t type
, const struct tc_ratespec
*rate
);
502 static int tc_calc_cell_log(unsigned int mtu
);
503 static void tc_fill_rate(struct tc_ratespec
*rate
, uint64_t bps
, int mtu
);
504 static int tc_calc_buffer(unsigned int Bps
, int mtu
, uint64_t burst_bytes
);
507 /* This is set pretty low because we probably won't learn anything from the
508 * additional log messages. */
509 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(5, 20);
511 /* Polling miimon status for all ports causes performance degradation when
512 * handling a large number of ports. If there are no devices using miimon, then
513 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
515 * Readers do not depend on this variable synchronizing with the related
516 * changes in the device miimon status, so we can use atomic_count. */
517 static atomic_count miimon_cnt
= ATOMIC_COUNT_INIT(0);
519 static int netdev_linux_parse_vnet_hdr(struct dp_packet
*b
);
520 static void netdev_linux_prepend_vnet_hdr(struct dp_packet
*b
, int mtu
);
521 static int netdev_linux_do_ethtool(const char *name
, struct ethtool_cmd
*,
522 int cmd
, const char *cmd_name
);
523 static int get_flags(const struct netdev
*, unsigned int *flags
);
524 static int set_flags(const char *, unsigned int flags
);
525 static int update_flags(struct netdev_linux
*netdev
, enum netdev_flags off
,
526 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
527 OVS_REQUIRES(netdev
->mutex
);
528 static int get_ifindex(const struct netdev
*, int *ifindexp
);
529 static int do_set_addr(struct netdev
*netdev
,
530 int ioctl_nr
, const char *ioctl_name
,
531 struct in_addr addr
);
532 static int get_etheraddr(const char *netdev_name
, struct eth_addr
*ea
);
533 static int set_etheraddr(const char *netdev_name
, const struct eth_addr
);
534 static int af_packet_sock(void);
535 static bool netdev_linux_miimon_enabled(void);
536 static void netdev_linux_miimon_run(void);
537 static void netdev_linux_miimon_wait(void);
538 static int netdev_linux_get_mtu__(struct netdev_linux
*netdev
, int *mtup
);
541 is_tap_netdev(const struct netdev
*netdev
)
543 return netdev_get_class(netdev
) == &netdev_tap_class
;
547 netdev_linux_netnsid_update__(struct netdev_linux
*netdev
)
549 struct dpif_netlink_vport reply
;
553 error
= dpif_netlink_vport_get(netdev_get_name(&netdev
->up
), &reply
, &buf
);
555 if (error
== ENOENT
) {
556 /* Assume it is local if there is no API (e.g. if the openvswitch
557 * kernel module is not loaded). */
558 netnsid_set_local(&netdev
->netnsid
);
560 netnsid_unset(&netdev
->netnsid
);
565 netnsid_set(&netdev
->netnsid
, reply
.netnsid
);
571 netdev_linux_netnsid_update(struct netdev_linux
*netdev
)
573 if (netnsid_is_unset(netdev
->netnsid
)) {
574 if (netdev_get_class(&netdev
->up
) == &netdev_tap_class
) {
575 netnsid_set_local(&netdev
->netnsid
);
577 return netdev_linux_netnsid_update__(netdev
);
585 netdev_linux_netnsid_is_eq(struct netdev_linux
*netdev
, int nsid
)
587 netdev_linux_netnsid_update(netdev
);
588 return netnsid_eq(netdev
->netnsid
, nsid
);
592 netdev_linux_netnsid_is_remote(struct netdev_linux
*netdev
)
594 netdev_linux_netnsid_update(netdev
);
595 return netnsid_is_remote(netdev
->netnsid
);
598 static int netdev_linux_update_via_netlink(struct netdev_linux
*);
599 static void netdev_linux_update(struct netdev_linux
*netdev
, int,
600 const struct rtnetlink_change
*)
601 OVS_REQUIRES(netdev
->mutex
);
602 static void netdev_linux_changed(struct netdev_linux
*netdev
,
603 unsigned int ifi_flags
, unsigned int mask
)
604 OVS_REQUIRES(netdev
->mutex
);
606 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
607 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
608 * if no such socket could be created. */
609 static struct nl_sock
*
610 netdev_linux_notify_sock(void)
612 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
613 static struct nl_sock
*sock
;
614 unsigned int mcgroups
[] = {RTNLGRP_LINK
, RTNLGRP_IPV4_IFADDR
,
615 RTNLGRP_IPV6_IFADDR
, RTNLGRP_IPV6_IFINFO
};
617 if (ovsthread_once_start(&once
)) {
620 error
= nl_sock_create(NETLINK_ROUTE
, &sock
);
624 for (i
= 0; i
< ARRAY_SIZE(mcgroups
); i
++) {
625 error
= nl_sock_join_mcgroup(sock
, mcgroups
[i
]);
627 nl_sock_destroy(sock
);
633 nl_sock_listen_all_nsid(sock
, true);
634 ovsthread_once_done(&once
);
641 netdev_linux_miimon_enabled(void)
643 return atomic_count_get(&miimon_cnt
) > 0;
/* Returns true if the rtnetlink 'kind' string names a Linux link aggregation
 * master type.  Only bonding ("bond") and team ("team") devices are treated
 * as LAG masters here. */
static bool
netdev_linux_kind_is_lag(const char *kind)
{
    return !strcmp(kind, "bond") || !strcmp(kind, "team");
}
657 netdev_linux_update_lag(struct rtnetlink_change
*change
)
658 OVS_REQUIRES(lag_mutex
)
660 struct linux_lag_slave
*lag
;
662 if (!rtnetlink_type_is_rtnlgrp_link(change
->nlmsg_type
)) {
666 if (change
->slave
&& netdev_linux_kind_is_lag(change
->slave
)) {
667 lag
= shash_find_data(&lag_shash
, change
->ifname
);
670 struct netdev
*master_netdev
;
671 char master_name
[IFNAMSIZ
];
675 if_indextoname(change
->master_ifindex
, master_name
);
676 master_netdev
= netdev_from_name(master_name
);
677 if (!master_netdev
) {
681 if (is_netdev_linux_class(master_netdev
->netdev_class
)) {
682 block_id
= netdev_get_block_id(master_netdev
);
684 netdev_close(master_netdev
);
688 lag
= xmalloc(sizeof *lag
);
689 lag
->block_id
= block_id
;
690 lag
->node
= shash_add(&lag_shash
, change
->ifname
, lag
);
692 /* delete ingress block in case it exists */
693 tc_add_del_qdisc(change
->if_index
, false, 0, TC_INGRESS
);
694 /* LAG master is linux netdev so add slave to same block. */
695 error
= tc_add_del_qdisc(change
->if_index
, true, block_id
,
698 VLOG_WARN("failed to bind LAG slave %s to master's block",
700 shash_delete(&lag_shash
, lag
->node
);
705 netdev_close(master_netdev
);
707 } else if (change
->master_ifindex
== 0) {
708 /* Check if this was a lag slave that has been freed. */
709 lag
= shash_find_data(&lag_shash
, change
->ifname
);
712 tc_add_del_qdisc(change
->if_index
, false, lag
->block_id
,
714 shash_delete(&lag_shash
, lag
->node
);
721 netdev_linux_run(const struct netdev_class
*netdev_class OVS_UNUSED
)
723 struct nl_sock
*sock
;
726 if (netdev_linux_miimon_enabled()) {
727 netdev_linux_miimon_run();
730 sock
= netdev_linux_notify_sock();
736 uint64_t buf_stub
[4096 / 8];
740 ofpbuf_use_stub(&buf
, buf_stub
, sizeof buf_stub
);
741 error
= nl_sock_recv(sock
, &buf
, &nsid
, false);
743 struct rtnetlink_change change
;
745 if (rtnetlink_parse(&buf
, &change
)) {
746 struct netdev
*netdev_
= NULL
;
747 char dev_name
[IFNAMSIZ
];
749 if (!change
.ifname
) {
750 change
.ifname
= if_indextoname(change
.if_index
, dev_name
);
754 netdev_
= netdev_from_name(change
.ifname
);
756 if (netdev_
&& is_netdev_linux_class(netdev_
->netdev_class
)) {
757 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
759 ovs_mutex_lock(&netdev
->mutex
);
760 netdev_linux_update(netdev
, nsid
, &change
);
761 ovs_mutex_unlock(&netdev
->mutex
);
763 else if (!netdev_
&& change
.ifname
) {
764 /* Netdev is not present in OvS but its master could be. */
765 ovs_mutex_lock(&lag_mutex
);
766 netdev_linux_update_lag(&change
);
767 ovs_mutex_unlock(&lag_mutex
);
769 netdev_close(netdev_
);
771 } else if (error
== ENOBUFS
) {
772 struct shash device_shash
;
773 struct shash_node
*node
;
777 shash_init(&device_shash
);
778 netdev_get_devices(&netdev_linux_class
, &device_shash
);
779 SHASH_FOR_EACH (node
, &device_shash
) {
780 struct netdev
*netdev_
= node
->data
;
781 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
784 ovs_mutex_lock(&netdev
->mutex
);
785 get_flags(netdev_
, &flags
);
786 netdev_linux_changed(netdev
, flags
, 0);
787 ovs_mutex_unlock(&netdev
->mutex
);
789 netdev_close(netdev_
);
791 shash_destroy(&device_shash
);
792 } else if (error
!= EAGAIN
) {
793 static struct vlog_rate_limit rll
= VLOG_RATE_LIMIT_INIT(1, 5);
794 VLOG_WARN_RL(&rll
, "error reading or parsing netlink (%s)",
795 ovs_strerror(error
));
802 netdev_linux_wait(const struct netdev_class
*netdev_class OVS_UNUSED
)
804 struct nl_sock
*sock
;
806 if (netdev_linux_miimon_enabled()) {
807 netdev_linux_miimon_wait();
809 sock
= netdev_linux_notify_sock();
811 nl_sock_wait(sock
, POLLIN
);
816 netdev_linux_changed(struct netdev_linux
*dev
,
817 unsigned int ifi_flags
, unsigned int mask
)
818 OVS_REQUIRES(dev
->mutex
)
820 netdev_change_seq_changed(&dev
->up
);
822 if ((dev
->ifi_flags
^ ifi_flags
) & IFF_RUNNING
) {
823 dev
->carrier_resets
++;
825 dev
->ifi_flags
= ifi_flags
;
827 dev
->cache_valid
&= mask
;
828 if (!(mask
& VALID_IN
)) {
829 netdev_get_addrs_list_flush();
834 netdev_linux_update__(struct netdev_linux
*dev
,
835 const struct rtnetlink_change
*change
)
836 OVS_REQUIRES(dev
->mutex
)
838 if (rtnetlink_type_is_rtnlgrp_link(change
->nlmsg_type
)) {
839 if (change
->nlmsg_type
== RTM_NEWLINK
) {
840 /* Keep drv-info, ip addresses, and NUMA id. */
841 netdev_linux_changed(dev
, change
->ifi_flags
,
842 VALID_DRVINFO
| VALID_IN
| VALID_NUMA_ID
);
844 /* Update netdev from rtnl-change msg. */
846 dev
->mtu
= change
->mtu
;
847 dev
->cache_valid
|= VALID_MTU
;
848 dev
->netdev_mtu_error
= 0;
851 if (!eth_addr_is_zero(change
->mac
)) {
852 dev
->etheraddr
= change
->mac
;
853 dev
->cache_valid
|= VALID_ETHERADDR
;
854 dev
->ether_addr_error
= 0;
856 /* The mac addr has been changed, report it now. */
857 rtnetlink_report_link();
860 if (change
->master
&& netdev_linux_kind_is_lag(change
->master
)) {
861 dev
->is_lag_master
= true;
864 dev
->ifindex
= change
->if_index
;
865 dev
->cache_valid
|= VALID_IFINDEX
;
866 dev
->get_ifindex_error
= 0;
870 netdev_linux_changed(dev
, change
->ifi_flags
, 0);
871 dev
->present
= false;
872 netnsid_unset(&dev
->netnsid
);
874 } else if (rtnetlink_type_is_rtnlgrp_addr(change
->nlmsg_type
)) {
875 /* Invalidates in4, in6. */
876 netdev_linux_changed(dev
, dev
->ifi_flags
, ~VALID_IN
);
883 netdev_linux_update(struct netdev_linux
*dev
, int nsid
,
884 const struct rtnetlink_change
*change
)
885 OVS_REQUIRES(dev
->mutex
)
887 if (netdev_linux_netnsid_is_eq(dev
, nsid
)) {
888 netdev_linux_update__(dev
, change
);
892 static struct netdev
*
893 netdev_linux_alloc(void)
895 struct netdev_linux
*netdev
= xzalloc(sizeof *netdev
);
900 netdev_linux_common_construct(struct netdev
*netdev_
)
902 /* Prevent any attempt to create (or open) a network device named "default"
903 * or "all". These device names are effectively reserved on Linux because
904 * /proc/sys/net/ipv4/conf/ always contains directories by these names. By
905 * itself this wouldn't call for any special treatment, but in practice if
906 * a program tries to create devices with these names, it causes the kernel
907 * to fire a "new device" notification event even though creation failed,
908 * and in turn that causes OVS to wake up and try to create them again,
909 * which ends up as a 100% CPU loop. */
910 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
911 const char *name
= netdev_
->name
;
912 if (!strcmp(name
, "default") || !strcmp(name
, "all")) {
913 static struct vlog_rate_limit rll
= VLOG_RATE_LIMIT_INIT(1, 1);
914 VLOG_WARN_RL(&rll
, "%s: Linux forbids network device with this name",
919 /* The device could be in the same network namespace or in another one. */
920 netnsid_unset(&netdev
->netnsid
);
921 ovs_mutex_init(&netdev
->mutex
);
923 if (userspace_tso_enabled()) {
924 netdev_
->ol_flags
|= NETDEV_TX_OFFLOAD_TCP_TSO
;
925 netdev_
->ol_flags
|= NETDEV_TX_OFFLOAD_TCP_CKSUM
;
926 netdev_
->ol_flags
|= NETDEV_TX_OFFLOAD_UDP_CKSUM
;
927 netdev_
->ol_flags
|= NETDEV_TX_OFFLOAD_SCTP_CKSUM
;
928 netdev_
->ol_flags
|= NETDEV_TX_OFFLOAD_IPV4_CKSUM
;
934 /* Creates system and internal devices. */
936 netdev_linux_construct(struct netdev
*netdev_
)
938 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
939 int error
= netdev_linux_common_construct(netdev_
);
944 error
= get_flags(&netdev
->up
, &netdev
->ifi_flags
);
945 if (error
== ENODEV
) {
946 if (netdev
->up
.netdev_class
!= &netdev_internal_class
) {
947 /* The device does not exist, so don't allow it to be opened. */
950 /* "Internal" netdevs have to be created as netdev objects before
951 * they exist in the kernel, because creating them in the kernel
952 * happens by passing a netdev object to dpif_port_add().
953 * Therefore, ignore the error. */
960 /* For most types of netdevs we open the device for each call of
961 * netdev_open(). However, this is not the case with tap devices,
962 * since it is only possible to open the device once. In this
963 * situation we share a single file descriptor, and consequently
964 * buffers, across all readers. Therefore once data is read it will
965 * be unavailable to other reads for tap devices. */
967 netdev_linux_construct_tap(struct netdev
*netdev_
)
969 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
970 static const char tap_dev
[] = "/dev/net/tun";
971 const char *name
= netdev_
->name
;
974 int error
= netdev_linux_common_construct(netdev_
);
979 /* Open tap device. */
980 netdev
->tap_fd
= open(tap_dev
, O_RDWR
);
981 if (netdev
->tap_fd
< 0) {
983 VLOG_WARN("opening \"%s\" failed: %s", tap_dev
, ovs_strerror(error
));
987 /* Create tap device. */
988 get_flags(&netdev
->up
, &netdev
->ifi_flags
);
989 ifr
.ifr_flags
= IFF_TAP
| IFF_NO_PI
;
990 if (userspace_tso_enabled()) {
991 ifr
.ifr_flags
|= IFF_VNET_HDR
;
994 ovs_strzcpy(ifr
.ifr_name
, name
, sizeof ifr
.ifr_name
);
995 if (ioctl(netdev
->tap_fd
, TUNSETIFF
, &ifr
) == -1) {
996 VLOG_WARN("%s: creating tap device failed: %s", name
,
997 ovs_strerror(errno
));
1002 /* Make non-blocking. */
1003 error
= set_nonblocking(netdev
->tap_fd
);
1008 if (ioctl(netdev
->tap_fd
, TUNSETPERSIST
, 1)) {
1009 VLOG_WARN("%s: creating tap device failed (persist): %s", name
,
1010 ovs_strerror(errno
));
1015 if (userspace_tso_enabled()) {
1016 /* Old kernels don't support TUNSETOFFLOAD. If TUNSETOFFLOAD is
1017 * available, it will return EINVAL when a flag is unknown.
1018 * Therefore, try enabling offload with no flags to check
1019 * if TUNSETOFFLOAD support is available or not. */
1020 if (ioctl(netdev
->tap_fd
, TUNSETOFFLOAD
, 0) == 0 || errno
!= EINVAL
) {
1021 unsigned long oflags
= TUN_F_CSUM
| TUN_F_TSO4
| TUN_F_TSO6
;
1023 if (ioctl(netdev
->tap_fd
, TUNSETOFFLOAD
, oflags
) == -1) {
1024 VLOG_WARN("%s: enabling tap offloading failed: %s", name
,
1025 ovs_strerror(errno
));
1032 netdev
->present
= true;
1036 close(netdev
->tap_fd
);
1041 netdev_linux_destruct(struct netdev
*netdev_
)
1043 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1045 if (netdev
->tc
&& netdev
->tc
->ops
->tc_destroy
) {
1046 netdev
->tc
->ops
->tc_destroy(netdev
->tc
);
1049 if (netdev_get_class(netdev_
) == &netdev_tap_class
1050 && netdev
->tap_fd
>= 0)
1052 ioctl(netdev
->tap_fd
, TUNSETPERSIST
, 0);
1053 close(netdev
->tap_fd
);
1056 if (netdev
->miimon_interval
> 0) {
1057 atomic_count_dec(&miimon_cnt
);
1060 ovs_mutex_destroy(&netdev
->mutex
);
1064 netdev_linux_dealloc(struct netdev
*netdev_
)
1066 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1070 static struct netdev_rxq
*
1071 netdev_linux_rxq_alloc(void)
1073 struct netdev_rxq_linux
*rx
= xzalloc(sizeof *rx
);
1078 netdev_linux_rxq_construct(struct netdev_rxq
*rxq_
)
1080 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1081 struct netdev
*netdev_
= rx
->up
.netdev
;
1082 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1085 ovs_mutex_lock(&netdev
->mutex
);
1086 rx
->is_tap
= is_tap_netdev(netdev_
);
1088 rx
->fd
= netdev
->tap_fd
;
1090 struct sockaddr_ll sll
;
1092 /* Result of tcpdump -dd inbound */
1093 static const struct sock_filter filt
[] = {
1094 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
1095 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
1096 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
1097 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
1099 static const struct sock_fprog fprog
= {
1100 ARRAY_SIZE(filt
), (struct sock_filter
*) filt
1103 /* Create file descriptor. */
1104 rx
->fd
= socket(PF_PACKET
, SOCK_RAW
, 0);
1107 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error
));
1112 if (setsockopt(rx
->fd
, SOL_PACKET
, PACKET_AUXDATA
, &val
, sizeof val
)) {
1114 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
1115 netdev_get_name(netdev_
), ovs_strerror(error
));
1119 if (userspace_tso_enabled()
1120 && setsockopt(rx
->fd
, SOL_PACKET
, PACKET_VNET_HDR
, &val
,
1123 VLOG_ERR("%s: failed to enable vnet hdr in txq raw socket: %s",
1124 netdev_get_name(netdev_
), ovs_strerror(errno
));
1128 /* Set non-blocking mode. */
1129 error
= set_nonblocking(rx
->fd
);
1134 /* Get ethernet device index. */
1135 error
= get_ifindex(&netdev
->up
, &ifindex
);
1140 /* Bind to specific ethernet device. */
1141 memset(&sll
, 0, sizeof sll
);
1142 sll
.sll_family
= AF_PACKET
;
1143 sll
.sll_ifindex
= ifindex
;
1144 sll
.sll_protocol
= htons(ETH_P_ALL
);
1145 if (bind(rx
->fd
, (struct sockaddr
*) &sll
, sizeof sll
) < 0) {
1147 VLOG_ERR("%s: failed to bind raw socket (%s)",
1148 netdev_get_name(netdev_
), ovs_strerror(error
));
1152 /* Filter for only inbound packets. */
1153 error
= setsockopt(rx
->fd
, SOL_SOCKET
, SO_ATTACH_FILTER
, &fprog
,
1157 VLOG_ERR("%s: failed to attach filter (%s)",
1158 netdev_get_name(netdev_
), ovs_strerror(error
));
1162 ovs_mutex_unlock(&netdev
->mutex
);
1170 ovs_mutex_unlock(&netdev
->mutex
);
1175 netdev_linux_rxq_destruct(struct netdev_rxq
*rxq_
)
1177 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1184 for (i
= 0; i
< NETDEV_MAX_BURST
; i
++) {
1185 dp_packet_delete(rx
->aux_bufs
[i
]);
/* Frees the receive-queue object itself; counterpart of rxq_alloc. */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);

    free(rx);
}
1198 auxdata_to_vlan_tpid(const struct tpacket_auxdata
*aux
, bool double_tagged
)
1200 if (aux
->tp_status
& TP_STATUS_VLAN_TPID_VALID
) {
1201 return htons(aux
->tp_vlan_tpid
);
1202 } else if (double_tagged
) {
1203 return htons(ETH_TYPE_VLAN_8021AD
);
1205 return htons(ETH_TYPE_VLAN_8021Q
);
/* Returns true if the kernel auxdata indicates a VLAN tag is present.
 * A zero TCI still counts when TP_STATUS_VLAN_VALID is set. */
static bool
auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
{
    return aux->tp_vlan_tci || (aux->tp_status & TP_STATUS_VLAN_VALID);
}
1216 * Receive packets from raw socket in batch process for better performance,
1217 * it can receive NETDEV_MAX_BURST packets at most once, the received
1218 * packets are added into *batch. The return value is 0 or errno.
1220 * It also used recvmmsg to reduce multiple syscalls overhead;
1223 netdev_linux_batch_rxq_recv_sock(struct netdev_rxq_linux
*rx
, int mtu
,
1224 struct dp_packet_batch
*batch
)
1229 int virtio_net_hdr_size
;
1230 struct iovec iovs
[NETDEV_MAX_BURST
][IOV_TSO_SIZE
];
1231 struct cmsghdr
*cmsg
;
1233 struct cmsghdr cmsg
;
1234 char buffer
[CMSG_SPACE(sizeof(struct tpacket_auxdata
))];
1235 } cmsg_buffers
[NETDEV_MAX_BURST
];
1236 struct mmsghdr mmsgs
[NETDEV_MAX_BURST
];
1237 struct dp_packet
*buffers
[NETDEV_MAX_BURST
];
1240 if (userspace_tso_enabled()) {
1241 /* Use the buffer from the allocated packet below to receive MTU
1242 * sized packets and an aux_buf for extra TSO data. */
1243 iovlen
= IOV_TSO_SIZE
;
1244 virtio_net_hdr_size
= sizeof(struct virtio_net_hdr
);
1246 /* Use only the buffer from the allocated packet. */
1247 iovlen
= IOV_STD_SIZE
;
1248 virtio_net_hdr_size
= 0;
1251 /* The length here needs to be accounted in the same way when the
1252 * aux_buf is allocated so that it can be prepended to TSO buffer. */
1253 std_len
= virtio_net_hdr_size
+ VLAN_ETH_HEADER_LEN
+ mtu
;
1254 for (i
= 0; i
< NETDEV_MAX_BURST
; i
++) {
1255 buffers
[i
] = dp_packet_new_with_headroom(std_len
, DP_NETDEV_HEADROOM
);
1256 iovs
[i
][IOV_PACKET
].iov_base
= dp_packet_data(buffers
[i
]);
1257 iovs
[i
][IOV_PACKET
].iov_len
= std_len
;
1258 if (iovlen
== IOV_TSO_SIZE
) {
1259 iovs
[i
][IOV_AUXBUF
].iov_base
= dp_packet_data(rx
->aux_bufs
[i
]);
1260 iovs
[i
][IOV_AUXBUF
].iov_len
= dp_packet_tailroom(rx
->aux_bufs
[i
]);
1263 mmsgs
[i
].msg_hdr
.msg_name
= NULL
;
1264 mmsgs
[i
].msg_hdr
.msg_namelen
= 0;
1265 mmsgs
[i
].msg_hdr
.msg_iov
= iovs
[i
];
1266 mmsgs
[i
].msg_hdr
.msg_iovlen
= iovlen
;
1267 mmsgs
[i
].msg_hdr
.msg_control
= &cmsg_buffers
[i
];
1268 mmsgs
[i
].msg_hdr
.msg_controllen
= sizeof cmsg_buffers
[i
];
1269 mmsgs
[i
].msg_hdr
.msg_flags
= 0;
1273 retval
= recvmmsg(rx
->fd
, mmsgs
, NETDEV_MAX_BURST
, MSG_TRUNC
, NULL
);
1274 } while (retval
< 0 && errno
== EINTR
);
1278 for (i
= 0; i
< NETDEV_MAX_BURST
; i
++) {
1279 dp_packet_delete(buffers
[i
]);
1285 for (i
= 0; i
< retval
; i
++) {
1286 struct dp_packet
*pkt
;
1288 if (mmsgs
[i
].msg_len
< ETH_HEADER_LEN
) {
1289 struct netdev
*netdev_
= netdev_rxq_get_netdev(&rx
->up
);
1290 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1292 dp_packet_delete(buffers
[i
]);
1293 netdev
->rx_dropped
+= 1;
1294 VLOG_WARN_RL(&rl
, "%s: Dropped packet: less than ether hdr size",
1295 netdev_get_name(netdev_
));
1299 if (mmsgs
[i
].msg_len
> std_len
) {
1300 /* Build a single linear TSO packet by prepending the data from
1301 * std_len buffer to the aux_buf. */
1302 pkt
= rx
->aux_bufs
[i
];
1303 dp_packet_set_size(pkt
, mmsgs
[i
].msg_len
- std_len
);
1304 dp_packet_push(pkt
, dp_packet_data(buffers
[i
]), std_len
);
1305 /* The headroom should be the same in buffers[i], pkt and
1306 * DP_NETDEV_HEADROOM. */
1307 dp_packet_resize(pkt
, DP_NETDEV_HEADROOM
, 0);
1308 dp_packet_delete(buffers
[i
]);
1309 rx
->aux_bufs
[i
] = NULL
;
1311 dp_packet_set_size(buffers
[i
], mmsgs
[i
].msg_len
);
1315 if (virtio_net_hdr_size
&& netdev_linux_parse_vnet_hdr(pkt
)) {
1316 struct netdev
*netdev_
= netdev_rxq_get_netdev(&rx
->up
);
1317 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1319 /* Unexpected error situation: the virtio header is not present
1320 * or corrupted. Drop the packet but continue in case next ones
1322 dp_packet_delete(pkt
);
1323 netdev
->rx_dropped
+= 1;
1324 VLOG_WARN_RL(&rl
, "%s: Dropped packet: Invalid virtio net header",
1325 netdev_get_name(netdev_
));
1329 for (cmsg
= CMSG_FIRSTHDR(&mmsgs
[i
].msg_hdr
); cmsg
;
1330 cmsg
= CMSG_NXTHDR(&mmsgs
[i
].msg_hdr
, cmsg
)) {
1331 const struct tpacket_auxdata
*aux
;
1333 if (cmsg
->cmsg_level
!= SOL_PACKET
1334 || cmsg
->cmsg_type
!= PACKET_AUXDATA
1336 CMSG_LEN(sizeof(struct tpacket_auxdata
))) {
1340 aux
= ALIGNED_CAST(struct tpacket_auxdata
*, CMSG_DATA(cmsg
));
1341 if (auxdata_has_vlan_tci(aux
)) {
1342 struct eth_header
*eth
;
1345 eth
= dp_packet_data(pkt
);
1346 double_tagged
= eth
->eth_type
== htons(ETH_TYPE_VLAN_8021Q
);
1349 auxdata_to_vlan_tpid(aux
, double_tagged
),
1350 htons(aux
->tp_vlan_tci
));
1354 dp_packet_batch_add(batch
, pkt
);
1357 /* Delete unused buffers. */
1358 for (; i
< NETDEV_MAX_BURST
; i
++) {
1359 dp_packet_delete(buffers
[i
]);
1366 * Receive packets from tap by batch process for better performance,
1367 * it can receive NETDEV_MAX_BURST packets at most once, the received
1368 * packets are added into *batch. The return value is 0 or errno.
1371 netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux
*rx
, int mtu
,
1372 struct dp_packet_batch
*batch
)
1374 int virtio_net_hdr_size
;
1380 if (userspace_tso_enabled()) {
1381 /* Use the buffer from the allocated packet below to receive MTU
1382 * sized packets and an aux_buf for extra TSO data. */
1383 iovlen
= IOV_TSO_SIZE
;
1384 virtio_net_hdr_size
= sizeof(struct virtio_net_hdr
);
1386 /* Use only the buffer from the allocated packet. */
1387 iovlen
= IOV_STD_SIZE
;
1388 virtio_net_hdr_size
= 0;
1391 /* The length here needs to be accounted in the same way when the
1392 * aux_buf is allocated so that it can be prepended to TSO buffer. */
1393 std_len
= virtio_net_hdr_size
+ VLAN_ETH_HEADER_LEN
+ mtu
;
1394 for (i
= 0; i
< NETDEV_MAX_BURST
; i
++) {
1395 struct dp_packet
*buffer
;
1396 struct dp_packet
*pkt
;
1397 struct iovec iov
[IOV_TSO_SIZE
];
1399 /* Assume Ethernet port. No need to set packet_type. */
1400 buffer
= dp_packet_new_with_headroom(std_len
, DP_NETDEV_HEADROOM
);
1401 iov
[IOV_PACKET
].iov_base
= dp_packet_data(buffer
);
1402 iov
[IOV_PACKET
].iov_len
= std_len
;
1403 if (iovlen
== IOV_TSO_SIZE
) {
1404 iov
[IOV_AUXBUF
].iov_base
= dp_packet_data(rx
->aux_bufs
[i
]);
1405 iov
[IOV_AUXBUF
].iov_len
= dp_packet_tailroom(rx
->aux_bufs
[i
]);
1409 retval
= readv(rx
->fd
, iov
, iovlen
);
1410 } while (retval
< 0 && errno
== EINTR
);
1413 dp_packet_delete(buffer
);
1417 if (retval
> std_len
) {
1418 /* Build a single linear TSO packet by prepending the data from
1419 * std_len buffer to the aux_buf. */
1420 pkt
= rx
->aux_bufs
[i
];
1421 dp_packet_set_size(pkt
, retval
- std_len
);
1422 dp_packet_push(pkt
, dp_packet_data(buffer
), std_len
);
1423 /* The headroom should be the same in buffers[i], pkt and
1424 * DP_NETDEV_HEADROOM. */
1425 dp_packet_resize(pkt
, DP_NETDEV_HEADROOM
, 0);
1426 dp_packet_delete(buffer
);
1427 rx
->aux_bufs
[i
] = NULL
;
1429 dp_packet_set_size(buffer
, dp_packet_size(buffer
) + retval
);
1433 if (virtio_net_hdr_size
&& netdev_linux_parse_vnet_hdr(pkt
)) {
1434 struct netdev
*netdev_
= netdev_rxq_get_netdev(&rx
->up
);
1435 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1437 /* Unexpected error situation: the virtio header is not present
1438 * or corrupted. Drop the packet but continue in case next ones
1440 dp_packet_delete(pkt
);
1441 netdev
->rx_dropped
+= 1;
1442 VLOG_WARN_RL(&rl
, "%s: Dropped packet: Invalid virtio net header",
1443 netdev_get_name(netdev_
));
1447 dp_packet_batch_add(batch
, pkt
);
1450 if ((i
== 0) && (retval
< 0)) {
1458 netdev_linux_rxq_recv(struct netdev_rxq
*rxq_
, struct dp_packet_batch
*batch
,
1461 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1462 struct netdev
*netdev
= rx
->up
.netdev
;
1466 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
)) {
1467 mtu
= ETH_PAYLOAD_MAX
;
1470 if (userspace_tso_enabled()) {
1471 /* Allocate TSO packets. The packet has enough headroom to store
1472 * a full non-TSO packet. When a TSO packet is received, the data
1473 * from non-TSO buffer (std_len) is prepended to the TSO packet
1475 size_t std_len
= sizeof(struct virtio_net_hdr
) + VLAN_ETH_HEADER_LEN
1476 + DP_NETDEV_HEADROOM
+ mtu
;
1477 size_t data_len
= LINUX_RXQ_TSO_MAX_LEN
- std_len
;
1478 for (int i
= 0; i
< NETDEV_MAX_BURST
; i
++) {
1479 if (rx
->aux_bufs
[i
]) {
1483 rx
->aux_bufs
[i
] = dp_packet_new_with_headroom(data_len
, std_len
);
1487 dp_packet_batch_init(batch
);
1488 retval
= (rx
->is_tap
1489 ? netdev_linux_batch_rxq_recv_tap(rx
, mtu
, batch
)
1490 : netdev_linux_batch_rxq_recv_sock(rx
, mtu
, batch
));
1493 if (retval
!= EAGAIN
&& retval
!= EMSGSIZE
) {
1494 VLOG_WARN_RL(&rl
, "error receiving Ethernet packet on %s: %s",
1495 netdev_rxq_get_name(rxq_
), ovs_strerror(errno
));
1507 netdev_linux_rxq_wait(struct netdev_rxq
*rxq_
)
1509 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1510 poll_fd_wait(rx
->fd
, POLLIN
);
1514 netdev_linux_rxq_drain(struct netdev_rxq
*rxq_
)
1516 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1519 int error
= af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_
), &ifr
,
1520 SIOCGIFTXQLEN
, "SIOCGIFTXQLEN");
1524 drain_fd(rx
->fd
, ifr
.ifr_qlen
);
1527 return drain_rcvbuf(rx
->fd
);
1532 netdev_linux_sock_batch_send(int sock
, int ifindex
, bool tso
, int mtu
,
1533 struct dp_packet_batch
*batch
)
1535 const size_t size
= dp_packet_batch_size(batch
);
1536 /* We don't bother setting most fields in sockaddr_ll because the
1537 * kernel ignores them for SOCK_RAW. */
1538 struct sockaddr_ll sll
= { .sll_family
= AF_PACKET
,
1539 .sll_ifindex
= ifindex
};
1541 struct mmsghdr
*mmsg
= xmalloc(sizeof(*mmsg
) * size
);
1542 struct iovec
*iov
= xmalloc(sizeof(*iov
) * size
);
1544 struct dp_packet
*packet
;
1545 DP_PACKET_BATCH_FOR_EACH (i
, packet
, batch
) {
1547 netdev_linux_prepend_vnet_hdr(packet
, mtu
);
1550 iov
[i
].iov_base
= dp_packet_data(packet
);
1551 iov
[i
].iov_len
= dp_packet_size(packet
);
1552 mmsg
[i
].msg_hdr
= (struct msghdr
) { .msg_name
= &sll
,
1553 .msg_namelen
= sizeof sll
,
1559 for (uint32_t ofs
= 0; ofs
< size
; ) {
1562 retval
= sendmmsg(sock
, mmsg
+ ofs
, size
- ofs
, 0);
1563 error
= retval
< 0 ? errno
: 0;
1564 } while (error
== EINTR
);
1576 /* Use the tap fd to send 'batch' to tap device 'netdev'. Using the tap fd is
1577 * essential, because packets sent to a tap device with an AF_PACKET socket
1578 * will loop back to be *received* again on the tap device. This doesn't occur
1579 * on other interface types because we attach a socket filter to the rx
1582 netdev_linux_tap_batch_send(struct netdev
*netdev_
, bool tso
, int mtu
,
1583 struct dp_packet_batch
*batch
)
1585 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1586 struct dp_packet
*packet
;
1588 /* The Linux tap driver returns EIO if the device is not up,
1589 * so if the device is not up, don't waste time sending it.
1590 * However, if the device is in another network namespace
1591 * then OVS can't retrieve the state. In that case, send the
1592 * packets anyway. */
1593 if (netdev
->present
&& !(netdev
->ifi_flags
& IFF_UP
)) {
1594 netdev
->tx_dropped
+= dp_packet_batch_size(batch
);
1598 DP_PACKET_BATCH_FOR_EACH (i
, packet
, batch
) {
1604 netdev_linux_prepend_vnet_hdr(packet
, mtu
);
1607 size
= dp_packet_size(packet
);
1609 retval
= write(netdev
->tap_fd
, dp_packet_data(packet
), size
);
1610 error
= retval
< 0 ? errno
: 0;
1611 } while (error
== EINTR
);
1614 /* The Linux tap driver returns EIO if the device is not up. From
1615 * the OVS side this is not an error, so we ignore it; otherwise,
1616 * return the erro. */
1620 } else if (retval
!= size
) {
1621 VLOG_WARN_RL(&rl
, "sent partial Ethernet packet (%"PRIuSIZE
" "
1622 "bytes of %"PRIuSIZE
") on %s",
1623 retval
, size
, netdev_get_name(netdev_
));
1631 netdev_linux_get_numa_id__(struct netdev_linux
*netdev
)
1632 OVS_REQUIRES(netdev
->mutex
)
1634 char *numa_node_path
;
1639 if (netdev
->cache_valid
& VALID_NUMA_ID
) {
1640 return netdev
->numa_id
;
1643 netdev
->numa_id
= 0;
1644 netdev
->cache_valid
|= VALID_NUMA_ID
;
1646 if (ovs_numa_get_n_numas() < 2) {
1647 /* No need to check on system with a single NUMA node. */
1651 name
= netdev_get_name(&netdev
->up
);
1652 if (strpbrk(name
, "/\\")) {
1653 VLOG_ERR_RL(&rl
, "\"%s\" is not a valid name for a port. "
1654 "A valid name must not include '/' or '\\'."
1655 "Using numa_id 0", name
);
1659 numa_node_path
= xasprintf("/sys/class/net/%s/device/numa_node", name
);
1661 stream
= fopen(numa_node_path
, "r");
1663 /* Virtual device does not have this info. */
1664 VLOG_INFO_RL(&rl
, "%s: Can't open '%s': %s, using numa_id 0",
1665 name
, numa_node_path
, ovs_strerror(errno
));
1666 free(numa_node_path
);
1670 if (fscanf(stream
, "%d", &node_id
) != 1
1671 || !ovs_numa_numa_id_is_valid(node_id
)) {
1672 VLOG_WARN_RL(&rl
, "%s: Can't detect NUMA node, using numa_id 0", name
);
1676 netdev
->numa_id
= node_id
;
1678 free(numa_node_path
);
1682 static int OVS_UNUSED
1683 netdev_linux_get_numa_id(const struct netdev
*netdev_
)
1685 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1688 ovs_mutex_lock(&netdev
->mutex
);
1689 numa_id
= netdev_linux_get_numa_id__(netdev
);
1690 ovs_mutex_unlock(&netdev
->mutex
);
1695 /* Sends 'batch' on 'netdev'. Returns 0 if successful, otherwise a positive
1696 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1697 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1698 * the packet is too big or too small to transmit on the device.
1700 * The kernel maintains a packet transmission queue, so the caller is not
1701 * expected to do additional queuing of packets. */
1703 netdev_linux_send(struct netdev
*netdev_
, int qid OVS_UNUSED
,
1704 struct dp_packet_batch
*batch
,
1705 bool concurrent_txq OVS_UNUSED
)
1707 bool tso
= userspace_tso_enabled();
1708 int mtu
= ETH_PAYLOAD_MAX
;
1713 netdev_linux_get_mtu__(netdev_linux_cast(netdev_
), &mtu
);
1716 if (!is_tap_netdev(netdev_
)) {
1717 if (netdev_linux_netnsid_is_remote(netdev_linux_cast(netdev_
))) {
1722 sock
= af_packet_sock();
1728 int ifindex
= netdev_get_ifindex(netdev_
);
1734 error
= netdev_linux_sock_batch_send(sock
, ifindex
, tso
, mtu
, batch
);
1736 error
= netdev_linux_tap_batch_send(netdev_
, tso
, mtu
, batch
);
1739 if (error
== ENOBUFS
) {
1740 /* The Linux AF_PACKET implementation never blocks waiting
1741 * for room for packets, instead returning ENOBUFS.
1742 * Translate this into EAGAIN for the caller. */
1745 VLOG_WARN_RL(&rl
, "error sending Ethernet packet on %s: %s",
1746 netdev_get_name(netdev_
), ovs_strerror(error
));
1751 dp_packet_delete_batch(batch
, true);
1755 /* Registers with the poll loop to wake up from the next call to poll_block()
1756 * when the packet transmission queue has sufficient room to transmit a packet
1757 * with netdev_send().
1759 * The kernel maintains a packet transmission queue, so the client is not
1760 * expected to do additional queuing of packets. Thus, this function is
1761 * unlikely to ever be used. It is included for completeness. */
1763 netdev_linux_send_wait(struct netdev
*netdev
, int qid OVS_UNUSED
)
1765 if (is_tap_netdev(netdev
)) {
1766 /* TAP device always accepts packets.*/
1767 poll_immediate_wake();
1771 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1772 * otherwise a positive errno value. */
1774 netdev_linux_set_etheraddr(struct netdev
*netdev_
, const struct eth_addr mac
)
1776 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1777 enum netdev_flags old_flags
= 0;
1780 ovs_mutex_lock(&netdev
->mutex
);
1781 if (netdev_linux_netnsid_is_remote(netdev
)) {
1786 if (netdev
->cache_valid
& VALID_ETHERADDR
) {
1787 error
= netdev
->ether_addr_error
;
1788 if (error
|| eth_addr_equals(netdev
->etheraddr
, mac
)) {
1791 netdev
->cache_valid
&= ~VALID_ETHERADDR
;
1794 /* Tap devices must be brought down before setting the address. */
1795 if (is_tap_netdev(netdev_
)) {
1796 update_flags(netdev
, NETDEV_UP
, 0, &old_flags
);
1798 error
= set_etheraddr(netdev_get_name(netdev_
), mac
);
1799 if (!error
|| error
== ENODEV
) {
1800 netdev
->ether_addr_error
= error
;
1801 netdev
->cache_valid
|= VALID_ETHERADDR
;
1803 netdev
->etheraddr
= mac
;
1807 if (is_tap_netdev(netdev_
) && old_flags
& NETDEV_UP
) {
1808 update_flags(netdev
, 0, NETDEV_UP
, &old_flags
);
1812 ovs_mutex_unlock(&netdev
->mutex
);
1816 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1818 netdev_linux_get_etheraddr(const struct netdev
*netdev_
, struct eth_addr
*mac
)
1820 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1823 ovs_mutex_lock(&netdev
->mutex
);
1824 if (!(netdev
->cache_valid
& VALID_ETHERADDR
)) {
1825 netdev_linux_update_via_netlink(netdev
);
1828 if (!(netdev
->cache_valid
& VALID_ETHERADDR
)) {
1829 /* Fall back to ioctl if netlink fails */
1830 netdev
->ether_addr_error
= get_etheraddr(netdev_get_name(netdev_
),
1831 &netdev
->etheraddr
);
1832 netdev
->cache_valid
|= VALID_ETHERADDR
;
1835 error
= netdev
->ether_addr_error
;
1837 *mac
= netdev
->etheraddr
;
1839 ovs_mutex_unlock(&netdev
->mutex
);
1845 netdev_linux_get_mtu__(struct netdev_linux
*netdev
, int *mtup
)
1849 if (!(netdev
->cache_valid
& VALID_MTU
)) {
1850 netdev_linux_update_via_netlink(netdev
);
1853 if (!(netdev
->cache_valid
& VALID_MTU
)) {
1854 /* Fall back to ioctl if netlink fails */
1857 netdev
->netdev_mtu_error
= af_inet_ifreq_ioctl(
1858 netdev_get_name(&netdev
->up
), &ifr
, SIOCGIFMTU
, "SIOCGIFMTU");
1859 netdev
->mtu
= ifr
.ifr_mtu
;
1860 netdev
->cache_valid
|= VALID_MTU
;
1863 error
= netdev
->netdev_mtu_error
;
1865 *mtup
= netdev
->mtu
;
1871 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1872 * in bytes, not including the hardware header; thus, this is typically 1500
1873 * bytes for Ethernet devices. */
1875 netdev_linux_get_mtu(const struct netdev
*netdev_
, int *mtup
)
1877 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1880 ovs_mutex_lock(&netdev
->mutex
);
1881 error
= netdev_linux_get_mtu__(netdev
, mtup
);
1882 ovs_mutex_unlock(&netdev
->mutex
);
1887 /* Sets the maximum size of transmitted (MTU) for given device using linux
1888 * networking ioctl interface.
1891 netdev_linux_set_mtu(struct netdev
*netdev_
, int mtu
)
1893 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1897 ovs_mutex_lock(&netdev
->mutex
);
1898 if (netdev_linux_netnsid_is_remote(netdev
)) {
1904 if (netdev_get_class(netdev_
) == &netdev_afxdp_class
) {
1905 error
= netdev_afxdp_verify_mtu_size(netdev_
, mtu
);
1912 if (netdev
->cache_valid
& VALID_MTU
) {
1913 error
= netdev
->netdev_mtu_error
;
1914 if (error
|| netdev
->mtu
== mtu
) {
1917 netdev
->cache_valid
&= ~VALID_MTU
;
1920 error
= af_inet_ifreq_ioctl(netdev_get_name(netdev_
), &ifr
,
1921 SIOCSIFMTU
, "SIOCSIFMTU");
1922 if (!error
|| error
== ENODEV
) {
1923 netdev
->netdev_mtu_error
= error
;
1924 netdev
->mtu
= ifr
.ifr_mtu
;
1925 netdev
->cache_valid
|= VALID_MTU
;
1928 ovs_mutex_unlock(&netdev
->mutex
);
1932 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1933 * On failure, returns a negative errno value. */
1935 netdev_linux_get_ifindex(const struct netdev
*netdev_
)
1937 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1940 ovs_mutex_lock(&netdev
->mutex
);
1941 if (netdev_linux_netnsid_is_remote(netdev
)) {
1945 error
= get_ifindex(netdev_
, &ifindex
);
1948 ovs_mutex_unlock(&netdev
->mutex
);
1949 return error
? -error
: ifindex
;
1953 netdev_linux_get_carrier(const struct netdev
*netdev_
, bool *carrier
)
1955 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1957 ovs_mutex_lock(&netdev
->mutex
);
1958 if (netdev
->miimon_interval
> 0) {
1959 *carrier
= netdev
->miimon
;
1961 *carrier
= (netdev
->ifi_flags
& IFF_RUNNING
) != 0;
1963 ovs_mutex_unlock(&netdev
->mutex
);
1968 static long long int
1969 netdev_linux_get_carrier_resets(const struct netdev
*netdev_
)
1971 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1972 long long int carrier_resets
;
1974 ovs_mutex_lock(&netdev
->mutex
);
1975 carrier_resets
= netdev
->carrier_resets
;
1976 ovs_mutex_unlock(&netdev
->mutex
);
1978 return carrier_resets
;
1982 netdev_linux_do_miimon(const char *name
, int cmd
, const char *cmd_name
,
1983 struct mii_ioctl_data
*data
)
1988 memset(&ifr
, 0, sizeof ifr
);
1989 memcpy(&ifr
.ifr_data
, data
, sizeof *data
);
1990 error
= af_inet_ifreq_ioctl(name
, &ifr
, cmd
, cmd_name
);
1991 memcpy(data
, &ifr
.ifr_data
, sizeof *data
);
1997 netdev_linux_get_miimon(const char *name
, bool *miimon
)
1999 struct mii_ioctl_data data
;
2004 memset(&data
, 0, sizeof data
);
2005 error
= netdev_linux_do_miimon(name
, SIOCGMIIPHY
, "SIOCGMIIPHY", &data
);
2007 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
2008 data
.reg_num
= MII_BMSR
;
2009 error
= netdev_linux_do_miimon(name
, SIOCGMIIREG
, "SIOCGMIIREG",
2013 *miimon
= !!(data
.val_out
& BMSR_LSTATUS
);
2017 struct ethtool_cmd ecmd
;
2019 VLOG_DBG_RL(&rl
, "%s: failed to query MII, falling back to ethtool",
2022 COVERAGE_INC(netdev_get_ethtool
);
2023 memset(&ecmd
, 0, sizeof ecmd
);
2024 error
= netdev_linux_do_ethtool(name
, &ecmd
, ETHTOOL_GLINK
,
2027 struct ethtool_value eval
;
2029 memcpy(&eval
, &ecmd
, sizeof eval
);
2030 *miimon
= !!eval
.data
;
2032 VLOG_WARN_RL(&rl
, "%s: ethtool link status failed", name
);
2040 netdev_linux_set_miimon_interval(struct netdev
*netdev_
,
2041 long long int interval
)
2043 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2045 ovs_mutex_lock(&netdev
->mutex
);
2046 interval
= interval
> 0 ? MAX(interval
, 100) : 0;
2047 if (netdev
->miimon_interval
!= interval
) {
2048 if (interval
&& !netdev
->miimon_interval
) {
2049 atomic_count_inc(&miimon_cnt
);
2050 } else if (!interval
&& netdev
->miimon_interval
) {
2051 atomic_count_dec(&miimon_cnt
);
2054 netdev
->miimon_interval
= interval
;
2055 timer_set_expired(&netdev
->miimon_timer
);
2057 ovs_mutex_unlock(&netdev
->mutex
);
2063 netdev_linux_miimon_run(void)
2065 struct shash device_shash
;
2066 struct shash_node
*node
;
2068 shash_init(&device_shash
);
2069 netdev_get_devices(&netdev_linux_class
, &device_shash
);
2070 SHASH_FOR_EACH (node
, &device_shash
) {
2071 struct netdev
*netdev
= node
->data
;
2072 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
2075 ovs_mutex_lock(&dev
->mutex
);
2076 if (dev
->miimon_interval
> 0 && timer_expired(&dev
->miimon_timer
)) {
2077 netdev_linux_get_miimon(dev
->up
.name
, &miimon
);
2078 if (miimon
!= dev
->miimon
) {
2079 dev
->miimon
= miimon
;
2080 netdev_linux_changed(dev
, dev
->ifi_flags
, 0);
2083 timer_set_duration(&dev
->miimon_timer
, dev
->miimon_interval
);
2085 ovs_mutex_unlock(&dev
->mutex
);
2086 netdev_close(netdev
);
2089 shash_destroy(&device_shash
);
2093 netdev_linux_miimon_wait(void)
2095 struct shash device_shash
;
2096 struct shash_node
*node
;
2098 shash_init(&device_shash
);
2099 netdev_get_devices(&netdev_linux_class
, &device_shash
);
2100 SHASH_FOR_EACH (node
, &device_shash
) {
2101 struct netdev
*netdev
= node
->data
;
2102 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
2104 ovs_mutex_lock(&dev
->mutex
);
2105 if (dev
->miimon_interval
> 0) {
2106 timer_wait(&dev
->miimon_timer
);
2108 ovs_mutex_unlock(&dev
->mutex
);
2109 netdev_close(netdev
);
2111 shash_destroy(&device_shash
);
/* Exchanges the values pointed to by 'a' and 'b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t tmp = *a;
    *a = *b;
    *b = tmp;
}
2122 /* Copies 'src' into 'dst', performing format conversion in the process.
2124 * 'src' is allowed to be misaligned. */
2126 netdev_stats_from_ovs_vport_stats(struct netdev_stats
*dst
,
2127 const struct ovs_vport_stats
*src
)
2129 dst
->rx_packets
= get_32aligned_u64(&src
->rx_packets
);
2130 dst
->tx_packets
= get_32aligned_u64(&src
->tx_packets
);
2131 dst
->rx_bytes
= get_32aligned_u64(&src
->rx_bytes
);
2132 dst
->tx_bytes
= get_32aligned_u64(&src
->tx_bytes
);
2133 dst
->rx_errors
= get_32aligned_u64(&src
->rx_errors
);
2134 dst
->tx_errors
= get_32aligned_u64(&src
->tx_errors
);
2135 dst
->rx_dropped
= get_32aligned_u64(&src
->rx_dropped
);
2136 dst
->tx_dropped
= get_32aligned_u64(&src
->tx_dropped
);
2138 dst
->collisions
= 0;
2139 dst
->rx_length_errors
= 0;
2140 dst
->rx_over_errors
= 0;
2141 dst
->rx_crc_errors
= 0;
2142 dst
->rx_frame_errors
= 0;
2143 dst
->rx_fifo_errors
= 0;
2144 dst
->rx_missed_errors
= 0;
2145 dst
->tx_aborted_errors
= 0;
2146 dst
->tx_carrier_errors
= 0;
2147 dst
->tx_fifo_errors
= 0;
2148 dst
->tx_heartbeat_errors
= 0;
2149 dst
->tx_window_errors
= 0;
2153 get_stats_via_vport__(const struct netdev
*netdev
, struct netdev_stats
*stats
)
2155 struct dpif_netlink_vport reply
;
2159 error
= dpif_netlink_vport_get(netdev_get_name(netdev
), &reply
, &buf
);
2162 } else if (!reply
.stats
) {
2167 netdev_stats_from_ovs_vport_stats(stats
, reply
.stats
);
2175 get_stats_via_vport(const struct netdev
*netdev_
,
2176 struct netdev_stats
*stats
)
2178 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2180 if (!netdev
->vport_stats_error
||
2181 !(netdev
->cache_valid
& VALID_VPORT_STAT_ERROR
)) {
2184 error
= get_stats_via_vport__(netdev_
, stats
);
2185 if (error
&& error
!= ENOENT
&& error
!= ENODEV
) {
2186 VLOG_WARN_RL(&rl
, "%s: obtaining netdev stats via vport failed "
2188 netdev_get_name(netdev_
), ovs_strerror(error
));
2190 netdev
->vport_stats_error
= error
;
2191 netdev
->cache_valid
|= VALID_VPORT_STAT_ERROR
;
2195 /* Retrieves current device stats for 'netdev-linux'. */
2197 netdev_linux_get_stats(const struct netdev
*netdev_
,
2198 struct netdev_stats
*stats
)
2200 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2201 struct netdev_stats dev_stats
;
2204 ovs_mutex_lock(&netdev
->mutex
);
2205 get_stats_via_vport(netdev_
, stats
);
2206 error
= get_stats_via_netlink(netdev_
, &dev_stats
);
2208 if (!netdev
->vport_stats_error
) {
2211 } else if (netdev
->vport_stats_error
) {
2212 /* stats not available from OVS then use netdev stats. */
2215 /* Use kernel netdev's packet and byte counts since vport's counters
2216 * do not reflect packet counts on the wire when GSO, TSO or GRO are
2218 stats
->rx_packets
= dev_stats
.rx_packets
;
2219 stats
->rx_bytes
= dev_stats
.rx_bytes
;
2220 stats
->tx_packets
= dev_stats
.tx_packets
;
2221 stats
->tx_bytes
= dev_stats
.tx_bytes
;
2223 stats
->rx_errors
+= dev_stats
.rx_errors
;
2224 stats
->tx_errors
+= dev_stats
.tx_errors
;
2225 stats
->rx_dropped
+= dev_stats
.rx_dropped
;
2226 stats
->tx_dropped
+= dev_stats
.tx_dropped
;
2227 stats
->multicast
+= dev_stats
.multicast
;
2228 stats
->collisions
+= dev_stats
.collisions
;
2229 stats
->rx_length_errors
+= dev_stats
.rx_length_errors
;
2230 stats
->rx_over_errors
+= dev_stats
.rx_over_errors
;
2231 stats
->rx_crc_errors
+= dev_stats
.rx_crc_errors
;
2232 stats
->rx_frame_errors
+= dev_stats
.rx_frame_errors
;
2233 stats
->rx_fifo_errors
+= dev_stats
.rx_fifo_errors
;
2234 stats
->rx_missed_errors
+= dev_stats
.rx_missed_errors
;
2235 stats
->tx_aborted_errors
+= dev_stats
.tx_aborted_errors
;
2236 stats
->tx_carrier_errors
+= dev_stats
.tx_carrier_errors
;
2237 stats
->tx_fifo_errors
+= dev_stats
.tx_fifo_errors
;
2238 stats
->tx_heartbeat_errors
+= dev_stats
.tx_heartbeat_errors
;
2239 stats
->tx_window_errors
+= dev_stats
.tx_window_errors
;
2241 ovs_mutex_unlock(&netdev
->mutex
);
2246 /* Retrieves current device stats for 'netdev-tap' netdev or
2247 * netdev-internal. */
2249 netdev_tap_get_stats(const struct netdev
*netdev_
, struct netdev_stats
*stats
)
2251 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2252 struct netdev_stats dev_stats
;
2255 ovs_mutex_lock(&netdev
->mutex
);
2256 get_stats_via_vport(netdev_
, stats
);
2257 error
= get_stats_via_netlink(netdev_
, &dev_stats
);
2259 if (!netdev
->vport_stats_error
) {
2262 } else if (netdev
->vport_stats_error
) {
2263 /* Transmit and receive stats will appear to be swapped relative to the
2264 * other ports since we are the one sending the data, not a remote
2265 * computer. For consistency, we swap them back here. This does not
2266 * apply if we are getting stats from the vport layer because it always
2267 * tracks stats from the perspective of the switch. */
2270 swap_uint64(&stats
->rx_packets
, &stats
->tx_packets
);
2271 swap_uint64(&stats
->rx_bytes
, &stats
->tx_bytes
);
2272 swap_uint64(&stats
->rx_errors
, &stats
->tx_errors
);
2273 swap_uint64(&stats
->rx_dropped
, &stats
->tx_dropped
);
2274 stats
->rx_length_errors
= 0;
2275 stats
->rx_over_errors
= 0;
2276 stats
->rx_crc_errors
= 0;
2277 stats
->rx_frame_errors
= 0;
2278 stats
->rx_fifo_errors
= 0;
2279 stats
->rx_missed_errors
= 0;
2280 stats
->tx_aborted_errors
= 0;
2281 stats
->tx_carrier_errors
= 0;
2282 stats
->tx_fifo_errors
= 0;
2283 stats
->tx_heartbeat_errors
= 0;
2284 stats
->tx_window_errors
= 0;
2286 /* Use kernel netdev's packet and byte counts since vport counters
2287 * do not reflect packet counts on the wire when GSO, TSO or GRO
2289 stats
->rx_packets
= dev_stats
.tx_packets
;
2290 stats
->rx_bytes
= dev_stats
.tx_bytes
;
2291 stats
->tx_packets
= dev_stats
.rx_packets
;
2292 stats
->tx_bytes
= dev_stats
.rx_bytes
;
2294 stats
->rx_dropped
+= dev_stats
.tx_dropped
;
2295 stats
->tx_dropped
+= dev_stats
.rx_dropped
;
2297 stats
->rx_errors
+= dev_stats
.tx_errors
;
2298 stats
->tx_errors
+= dev_stats
.rx_errors
;
2300 stats
->multicast
+= dev_stats
.multicast
;
2301 stats
->collisions
+= dev_stats
.collisions
;
2303 stats
->tx_dropped
+= netdev
->tx_dropped
;
2304 stats
->rx_dropped
+= netdev
->rx_dropped
;
2305 ovs_mutex_unlock(&netdev
->mutex
);
2311 netdev_internal_get_stats(const struct netdev
*netdev_
,
2312 struct netdev_stats
*stats
)
2314 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2317 ovs_mutex_lock(&netdev
->mutex
);
2318 get_stats_via_vport(netdev_
, stats
);
2319 error
= netdev
->vport_stats_error
;
2320 ovs_mutex_unlock(&netdev
->mutex
);
2326 netdev_linux_read_features(struct netdev_linux
*netdev
)
2328 struct ethtool_cmd ecmd
;
2332 if (netdev
->cache_valid
& VALID_FEATURES
) {
2336 COVERAGE_INC(netdev_get_ethtool
);
2337 memset(&ecmd
, 0, sizeof ecmd
);
2338 error
= netdev_linux_do_ethtool(netdev
->up
.name
, &ecmd
,
2339 ETHTOOL_GSET
, "ETHTOOL_GSET");
2344 /* Supported features. */
2345 netdev
->supported
= 0;
2346 if (ecmd
.supported
& SUPPORTED_10baseT_Half
) {
2347 netdev
->supported
|= NETDEV_F_10MB_HD
;
2349 if (ecmd
.supported
& SUPPORTED_10baseT_Full
) {
2350 netdev
->supported
|= NETDEV_F_10MB_FD
;
2352 if (ecmd
.supported
& SUPPORTED_100baseT_Half
) {
2353 netdev
->supported
|= NETDEV_F_100MB_HD
;
2355 if (ecmd
.supported
& SUPPORTED_100baseT_Full
) {
2356 netdev
->supported
|= NETDEV_F_100MB_FD
;
2358 if (ecmd
.supported
& SUPPORTED_1000baseT_Half
) {
2359 netdev
->supported
|= NETDEV_F_1GB_HD
;
2361 if ((ecmd
.supported
& SUPPORTED_1000baseT_Full
) ||
2362 (ecmd
.supported
& SUPPORTED_1000baseKX_Full
)) {
2363 netdev
->supported
|= NETDEV_F_1GB_FD
;
2365 if ((ecmd
.supported
& SUPPORTED_10000baseT_Full
) ||
2366 (ecmd
.supported
& SUPPORTED_10000baseKX4_Full
) ||
2367 (ecmd
.supported
& SUPPORTED_10000baseKR_Full
) ||
2368 (ecmd
.supported
& SUPPORTED_10000baseR_FEC
)) {
2369 netdev
->supported
|= NETDEV_F_10GB_FD
;
2371 if ((ecmd
.supported
& SUPPORTED_40000baseKR4_Full
) ||
2372 (ecmd
.supported
& SUPPORTED_40000baseCR4_Full
) ||
2373 (ecmd
.supported
& SUPPORTED_40000baseSR4_Full
) ||
2374 (ecmd
.supported
& SUPPORTED_40000baseLR4_Full
)) {
2375 netdev
->supported
|= NETDEV_F_40GB_FD
;
2377 if (ecmd
.supported
& SUPPORTED_TP
) {
2378 netdev
->supported
|= NETDEV_F_COPPER
;
2380 if (ecmd
.supported
& SUPPORTED_FIBRE
) {
2381 netdev
->supported
|= NETDEV_F_FIBER
;
2383 if (ecmd
.supported
& SUPPORTED_Autoneg
) {
2384 netdev
->supported
|= NETDEV_F_AUTONEG
;
2386 if (ecmd
.supported
& SUPPORTED_Pause
) {
2387 netdev
->supported
|= NETDEV_F_PAUSE
;
2389 if (ecmd
.supported
& SUPPORTED_Asym_Pause
) {
2390 netdev
->supported
|= NETDEV_F_PAUSE_ASYM
;
2393 /* Advertised features. */
2394 netdev
->advertised
= 0;
2395 if (ecmd
.advertising
& ADVERTISED_10baseT_Half
) {
2396 netdev
->advertised
|= NETDEV_F_10MB_HD
;
2398 if (ecmd
.advertising
& ADVERTISED_10baseT_Full
) {
2399 netdev
->advertised
|= NETDEV_F_10MB_FD
;
2401 if (ecmd
.advertising
& ADVERTISED_100baseT_Half
) {
2402 netdev
->advertised
|= NETDEV_F_100MB_HD
;
2404 if (ecmd
.advertising
& ADVERTISED_100baseT_Full
) {
2405 netdev
->advertised
|= NETDEV_F_100MB_FD
;
2407 if (ecmd
.advertising
& ADVERTISED_1000baseT_Half
) {
2408 netdev
->advertised
|= NETDEV_F_1GB_HD
;
2410 if ((ecmd
.advertising
& ADVERTISED_1000baseT_Full
) ||
2411 (ecmd
.advertising
& ADVERTISED_1000baseKX_Full
)) {
2412 netdev
->advertised
|= NETDEV_F_1GB_FD
;
2414 if ((ecmd
.advertising
& ADVERTISED_10000baseT_Full
) ||
2415 (ecmd
.advertising
& ADVERTISED_10000baseKX4_Full
) ||
2416 (ecmd
.advertising
& ADVERTISED_10000baseKR_Full
) ||
2417 (ecmd
.advertising
& ADVERTISED_10000baseR_FEC
)) {
2418 netdev
->advertised
|= NETDEV_F_10GB_FD
;
2420 if ((ecmd
.advertising
& ADVERTISED_40000baseKR4_Full
) ||
2421 (ecmd
.advertising
& ADVERTISED_40000baseCR4_Full
) ||
2422 (ecmd
.advertising
& ADVERTISED_40000baseSR4_Full
) ||
2423 (ecmd
.advertising
& ADVERTISED_40000baseLR4_Full
)) {
2424 netdev
->advertised
|= NETDEV_F_40GB_FD
;
2426 if (ecmd
.advertising
& ADVERTISED_TP
) {
2427 netdev
->advertised
|= NETDEV_F_COPPER
;
2429 if (ecmd
.advertising
& ADVERTISED_FIBRE
) {
2430 netdev
->advertised
|= NETDEV_F_FIBER
;
2432 if (ecmd
.advertising
& ADVERTISED_Autoneg
) {
2433 netdev
->advertised
|= NETDEV_F_AUTONEG
;
2435 if (ecmd
.advertising
& ADVERTISED_Pause
) {
2436 netdev
->advertised
|= NETDEV_F_PAUSE
;
2438 if (ecmd
.advertising
& ADVERTISED_Asym_Pause
) {
2439 netdev
->advertised
|= NETDEV_F_PAUSE_ASYM
;
2442 /* Current settings. */
2443 speed
= ethtool_cmd_speed(&ecmd
);
2444 if (speed
== SPEED_10
) {
2445 netdev
->current
= ecmd
.duplex
? NETDEV_F_10MB_FD
: NETDEV_F_10MB_HD
;
2446 } else if (speed
== SPEED_100
) {
2447 netdev
->current
= ecmd
.duplex
? NETDEV_F_100MB_FD
: NETDEV_F_100MB_HD
;
2448 } else if (speed
== SPEED_1000
) {
2449 netdev
->current
= ecmd
.duplex
? NETDEV_F_1GB_FD
: NETDEV_F_1GB_HD
;
2450 } else if (speed
== SPEED_10000
) {
2451 netdev
->current
= NETDEV_F_10GB_FD
;
2452 } else if (speed
== 40000) {
2453 netdev
->current
= NETDEV_F_40GB_FD
;
2454 } else if (speed
== 100000) {
2455 netdev
->current
= NETDEV_F_100GB_FD
;
2456 } else if (speed
== 1000000) {
2457 netdev
->current
= NETDEV_F_1TB_FD
;
2459 netdev
->current
= 0;
2462 if (ecmd
.port
== PORT_TP
) {
2463 netdev
->current
|= NETDEV_F_COPPER
;
2464 } else if (ecmd
.port
== PORT_FIBRE
) {
2465 netdev
->current
|= NETDEV_F_FIBER
;
2469 netdev
->current
|= NETDEV_F_AUTONEG
;
2473 netdev
->cache_valid
|= VALID_FEATURES
;
2474 netdev
->get_features_error
= error
;
2477 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
2478 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
2479 * Returns 0 if successful, otherwise a positive errno value. */
2481 netdev_linux_get_features(const struct netdev
*netdev_
,
2482 enum netdev_features
*current
,
2483 enum netdev_features
*advertised
,
2484 enum netdev_features
*supported
,
2485 enum netdev_features
*peer
)
2487 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2490 ovs_mutex_lock(&netdev
->mutex
);
2491 if (netdev_linux_netnsid_is_remote(netdev
)) {
2496 netdev_linux_read_features(netdev
);
2497 if (!netdev
->get_features_error
) {
2498 *current
= netdev
->current
;
2499 *advertised
= netdev
->advertised
;
2500 *supported
= netdev
->supported
;
2501 *peer
= 0; /* XXX */
2503 error
= netdev
->get_features_error
;
2506 ovs_mutex_unlock(&netdev
->mutex
);
2510 /* Set the features advertised by 'netdev' to 'advertise'. */
2512 netdev_linux_set_advertisements(struct netdev
*netdev_
,
2513 enum netdev_features advertise
)
2515 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2516 struct ethtool_cmd ecmd
;
2519 ovs_mutex_lock(&netdev
->mutex
);
2521 COVERAGE_INC(netdev_get_ethtool
);
2523 if (netdev_linux_netnsid_is_remote(netdev
)) {
2528 memset(&ecmd
, 0, sizeof ecmd
);
2529 error
= netdev_linux_do_ethtool(netdev_get_name(netdev_
), &ecmd
,
2530 ETHTOOL_GSET
, "ETHTOOL_GSET");
2535 ecmd
.advertising
= 0;
2536 if (advertise
& NETDEV_F_10MB_HD
) {
2537 ecmd
.advertising
|= ADVERTISED_10baseT_Half
;
2539 if (advertise
& NETDEV_F_10MB_FD
) {
2540 ecmd
.advertising
|= ADVERTISED_10baseT_Full
;
2542 if (advertise
& NETDEV_F_100MB_HD
) {
2543 ecmd
.advertising
|= ADVERTISED_100baseT_Half
;
2545 if (advertise
& NETDEV_F_100MB_FD
) {
2546 ecmd
.advertising
|= ADVERTISED_100baseT_Full
;
2548 if (advertise
& NETDEV_F_1GB_HD
) {
2549 ecmd
.advertising
|= ADVERTISED_1000baseT_Half
;
2551 if (advertise
& NETDEV_F_1GB_FD
) {
2552 ecmd
.advertising
|= ADVERTISED_1000baseT_Full
;
2554 if (advertise
& NETDEV_F_10GB_FD
) {
2555 ecmd
.advertising
|= ADVERTISED_10000baseT_Full
;
2557 if (advertise
& NETDEV_F_COPPER
) {
2558 ecmd
.advertising
|= ADVERTISED_TP
;
2560 if (advertise
& NETDEV_F_FIBER
) {
2561 ecmd
.advertising
|= ADVERTISED_FIBRE
;
2563 if (advertise
& NETDEV_F_AUTONEG
) {
2564 ecmd
.advertising
|= ADVERTISED_Autoneg
;
2566 if (advertise
& NETDEV_F_PAUSE
) {
2567 ecmd
.advertising
|= ADVERTISED_Pause
;
2569 if (advertise
& NETDEV_F_PAUSE_ASYM
) {
2570 ecmd
.advertising
|= ADVERTISED_Asym_Pause
;
2572 COVERAGE_INC(netdev_set_ethtool
);
2573 error
= netdev_linux_do_ethtool(netdev_get_name(netdev_
), &ecmd
,
2574 ETHTOOL_SSET
, "ETHTOOL_SSET");
2577 ovs_mutex_unlock(&netdev
->mutex
);
2581 static struct tc_police
2582 tc_matchall_fill_police(uint32_t kbits_rate
, uint32_t kbits_burst
)
2584 unsigned int bsize
= MIN(UINT32_MAX
/ 1024, kbits_burst
) * 1024 / 64;
2585 unsigned int bps
= ((uint64_t) kbits_rate
* 1000) / 8;
2586 struct tc_police police
;
2587 struct tc_ratespec rate
;
2590 memset(&rate
, 0, sizeof rate
);
2592 rate
.cell_log
= tc_calc_cell_log(mtu
);
2593 rate
.mpu
= ETH_TOTAL_MIN
;
2595 memset(&police
, 0, sizeof police
);
2596 police
.burst
= tc_bytes_to_ticks(bps
, bsize
);
2597 police
.action
= TC_POLICE_SHOT
;
2605 nl_msg_put_act_police(struct ofpbuf
*request
, struct tc_police police
)
2609 nl_msg_put_string(request
, TCA_ACT_KIND
, "police");
2610 offset
= nl_msg_start_nested(request
, TCA_ACT_OPTIONS
);
2611 nl_msg_put_unspec(request
, TCA_POLICE_TBF
, &police
, sizeof police
);
2612 tc_put_rtab(request
, TCA_POLICE_RATE
, &police
.rate
);
2613 nl_msg_put_u32(request
, TCA_POLICE_RESULT
, TC_ACT_UNSPEC
);
2614 nl_msg_end_nested(request
, offset
);
2618 tc_add_matchall_policer(struct netdev
*netdev
, uint32_t kbits_rate
,
2619 uint32_t kbits_burst
)
2621 uint16_t eth_type
= (OVS_FORCE
uint16_t) htons(ETH_P_ALL
);
2622 size_t basic_offset
, action_offset
, inner_offset
;
2623 uint16_t prio
= TC_RESERVED_PRIORITY_POLICE
;
2624 int ifindex
, index
, err
= 0;
2625 struct tc_police pol_act
;
2626 uint32_t block_id
= 0;
2627 struct ofpbuf request
;
2628 struct ofpbuf
*reply
;
2629 struct tcmsg
*tcmsg
;
2630 uint32_t handle
= 1;
2632 err
= get_ifindex(netdev
, &ifindex
);
2637 index
= block_id
? TCM_IFINDEX_MAGIC_BLOCK
: ifindex
;
2638 tcmsg
= tc_make_request(index
, RTM_NEWTFILTER
, NLM_F_CREATE
| NLM_F_ECHO
,
2640 tcmsg
->tcm_parent
= block_id
? : TC_INGRESS_PARENT
;
2641 tcmsg
->tcm_info
= tc_make_handle(prio
, eth_type
);
2642 tcmsg
->tcm_handle
= handle
;
2644 pol_act
= tc_matchall_fill_police(kbits_rate
, kbits_burst
);
2645 nl_msg_put_string(&request
, TCA_KIND
, "matchall");
2646 basic_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
2647 action_offset
= nl_msg_start_nested(&request
, TCA_MATCHALL_ACT
);
2648 inner_offset
= nl_msg_start_nested(&request
, 1);
2649 nl_msg_put_act_police(&request
, pol_act
);
2650 nl_msg_end_nested(&request
, inner_offset
);
2651 nl_msg_end_nested(&request
, action_offset
);
2652 nl_msg_end_nested(&request
, basic_offset
);
2654 err
= tc_transact(&request
, &reply
);
2657 ofpbuf_at_assert(reply
, NLMSG_HDRLEN
, sizeof *tc
);
2658 ofpbuf_delete(reply
);
2665 tc_del_matchall_policer(struct netdev
*netdev
)
2667 int prio
= TC_RESERVED_PRIORITY_POLICE
;
2668 uint32_t block_id
= 0;
2673 err
= get_ifindex(netdev
, &ifindex
);
2678 id
= tc_make_tcf_id(ifindex
, block_id
, prio
, TC_INGRESS
);
2679 err
= tc_del_filter(&id
);
2687 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
2688 * successful, otherwise a positive errno value. */
2690 netdev_linux_set_policing(struct netdev
*netdev_
,
2691 uint32_t kbits_rate
, uint32_t kbits_burst
)
2693 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2694 const char *netdev_name
= netdev_get_name(netdev_
);
2698 kbits_burst
= (!kbits_rate
? 0 /* Force to 0 if no rate specified. */
2699 : !kbits_burst
? 8000 /* Default to 8000 kbits if 0. */
2700 : kbits_burst
); /* Stick with user-specified value. */
2702 ovs_mutex_lock(&netdev
->mutex
);
2703 if (netdev_linux_netnsid_is_remote(netdev
)) {
2708 if (netdev
->cache_valid
& VALID_POLICING
) {
2709 error
= netdev
->netdev_policing_error
;
2710 if (error
|| (netdev
->kbits_rate
== kbits_rate
&&
2711 netdev
->kbits_burst
== kbits_burst
)) {
2712 /* Assume that settings haven't changed since we last set them. */
2715 netdev
->cache_valid
&= ~VALID_POLICING
;
2718 COVERAGE_INC(netdev_set_policing
);
2720 /* Use matchall for policing when offloadling ovs with tc-flower. */
2721 if (netdev_is_flow_api_enabled()) {
2722 error
= tc_del_matchall_policer(netdev_
);
2724 error
= tc_add_matchall_policer(netdev_
, kbits_rate
, kbits_burst
);
2726 ovs_mutex_unlock(&netdev
->mutex
);
2730 error
= get_ifindex(netdev_
, &ifindex
);
2735 /* Remove any existing ingress qdisc. */
2736 error
= tc_add_del_qdisc(ifindex
, false, 0, TC_INGRESS
);
2738 VLOG_WARN_RL(&rl
, "%s: removing policing failed: %s",
2739 netdev_name
, ovs_strerror(error
));
2744 error
= tc_add_del_qdisc(ifindex
, true, 0, TC_INGRESS
);
2746 VLOG_WARN_RL(&rl
, "%s: adding policing qdisc failed: %s",
2747 netdev_name
, ovs_strerror(error
));
2751 error
= tc_add_policer(netdev_
, kbits_rate
, kbits_burst
);
2753 VLOG_WARN_RL(&rl
, "%s: adding policing action failed: %s",
2754 netdev_name
, ovs_strerror(error
));
2759 netdev
->kbits_rate
= kbits_rate
;
2760 netdev
->kbits_burst
= kbits_burst
;
2763 if (!error
|| error
== ENODEV
) {
2764 netdev
->netdev_policing_error
= error
;
2765 netdev
->cache_valid
|= VALID_POLICING
;
2767 ovs_mutex_unlock(&netdev
->mutex
);
2772 netdev_linux_get_qos_types(const struct netdev
*netdev OVS_UNUSED
,
2775 const struct tc_ops
*const *opsp
;
2776 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2777 const struct tc_ops
*ops
= *opsp
;
2778 if (ops
->tc_install
&& ops
->ovs_name
[0] != '\0') {
2779 sset_add(types
, ops
->ovs_name
);
2785 static const struct tc_ops
*
2786 tc_lookup_ovs_name(const char *name
)
2788 const struct tc_ops
*const *opsp
;
2790 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2791 const struct tc_ops
*ops
= *opsp
;
2792 if (!strcmp(name
, ops
->ovs_name
)) {
2799 static const struct tc_ops
*
2800 tc_lookup_linux_name(const char *name
)
2802 const struct tc_ops
*const *opsp
;
2804 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2805 const struct tc_ops
*ops
= *opsp
;
2806 if (ops
->linux_name
&& !strcmp(name
, ops
->linux_name
)) {
2813 static struct tc_queue
*
2814 tc_find_queue__(const struct netdev
*netdev_
, unsigned int queue_id
,
2817 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2818 struct tc_queue
*queue
;
2820 HMAP_FOR_EACH_IN_BUCKET (queue
, hmap_node
, hash
, &netdev
->tc
->queues
) {
2821 if (queue
->queue_id
== queue_id
) {
/* Convenience wrapper around tc_find_queue__() that hashes 'queue_id'. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
}
2835 netdev_linux_get_qos_capabilities(const struct netdev
*netdev OVS_UNUSED
,
2837 struct netdev_qos_capabilities
*caps
)
2839 const struct tc_ops
*ops
= tc_lookup_ovs_name(type
);
2843 caps
->n_queues
= ops
->n_queues
;
2848 netdev_linux_get_qos(const struct netdev
*netdev_
,
2849 const char **typep
, struct smap
*details
)
2851 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2854 ovs_mutex_lock(&netdev
->mutex
);
2855 if (netdev_linux_netnsid_is_remote(netdev
)) {
2860 error
= tc_query_qdisc(netdev_
);
2862 *typep
= netdev
->tc
->ops
->ovs_name
;
2863 error
= (netdev
->tc
->ops
->qdisc_get
2864 ? netdev
->tc
->ops
->qdisc_get(netdev_
, details
)
2869 ovs_mutex_unlock(&netdev
->mutex
);
2874 netdev_linux_set_qos(struct netdev
*netdev_
,
2875 const char *type
, const struct smap
*details
)
2877 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2878 const struct tc_ops
*new_ops
;
2881 new_ops
= tc_lookup_ovs_name(type
);
2882 if (!new_ops
|| !new_ops
->tc_install
) {
2886 if (new_ops
== &tc_ops_noop
) {
2887 return new_ops
->tc_install(netdev_
, details
);
2890 ovs_mutex_lock(&netdev
->mutex
);
2891 if (netdev_linux_netnsid_is_remote(netdev
)) {
2896 error
= tc_query_qdisc(netdev_
);
2901 if (new_ops
== netdev
->tc
->ops
) {
2902 error
= new_ops
->qdisc_set
? new_ops
->qdisc_set(netdev_
, details
) : 0;
2904 /* Delete existing qdisc. */
2905 error
= tc_del_qdisc(netdev_
);
2909 ovs_assert(netdev
->tc
== NULL
);
2911 /* Install new qdisc. */
2912 error
= new_ops
->tc_install(netdev_
, details
);
2913 ovs_assert((error
== 0) == (netdev
->tc
!= NULL
));
2917 ovs_mutex_unlock(&netdev
->mutex
);
2922 netdev_linux_get_queue(const struct netdev
*netdev_
,
2923 unsigned int queue_id
, struct smap
*details
)
2925 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2928 ovs_mutex_lock(&netdev
->mutex
);
2929 if (netdev_linux_netnsid_is_remote(netdev
)) {
2934 error
= tc_query_qdisc(netdev_
);
2936 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2938 ? netdev
->tc
->ops
->class_get(netdev_
, queue
, details
)
2943 ovs_mutex_unlock(&netdev
->mutex
);
2948 netdev_linux_set_queue(struct netdev
*netdev_
,
2949 unsigned int queue_id
, const struct smap
*details
)
2951 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2954 ovs_mutex_lock(&netdev
->mutex
);
2955 if (netdev_linux_netnsid_is_remote(netdev
)) {
2960 error
= tc_query_qdisc(netdev_
);
2962 error
= (queue_id
< netdev
->tc
->ops
->n_queues
2963 && netdev
->tc
->ops
->class_set
2964 ? netdev
->tc
->ops
->class_set(netdev_
, queue_id
, details
)
2969 ovs_mutex_unlock(&netdev
->mutex
);
2974 netdev_linux_delete_queue(struct netdev
*netdev_
, unsigned int queue_id
)
2976 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2979 ovs_mutex_lock(&netdev
->mutex
);
2980 if (netdev_linux_netnsid_is_remote(netdev
)) {
2985 error
= tc_query_qdisc(netdev_
);
2987 if (netdev
->tc
->ops
->class_delete
) {
2988 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2990 ? netdev
->tc
->ops
->class_delete(netdev_
, queue
)
2998 ovs_mutex_unlock(&netdev
->mutex
);
3003 netdev_linux_get_queue_stats(const struct netdev
*netdev_
,
3004 unsigned int queue_id
,
3005 struct netdev_queue_stats
*stats
)
3007 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3010 ovs_mutex_lock(&netdev
->mutex
);
3011 if (netdev_linux_netnsid_is_remote(netdev
)) {
3016 error
= tc_query_qdisc(netdev_
);
3018 if (netdev
->tc
->ops
->class_get_stats
) {
3019 const struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
3021 stats
->created
= queue
->created
;
3022 error
= netdev
->tc
->ops
->class_get_stats(netdev_
, queue
,
3033 ovs_mutex_unlock(&netdev
->mutex
);
3037 struct queue_dump_state
{
3038 struct nl_dump dump
;
3043 start_queue_dump(const struct netdev
*netdev
, struct queue_dump_state
*state
)
3045 struct ofpbuf request
;
3046 struct tcmsg
*tcmsg
;
3048 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_GETTCLASS
, 0, &request
);
3052 tcmsg
->tcm_parent
= 0;
3053 nl_dump_start(&state
->dump
, NETLINK_ROUTE
, &request
);
3054 ofpbuf_uninit(&request
);
3056 ofpbuf_init(&state
->buf
, NL_DUMP_BUFSIZE
);
3061 finish_queue_dump(struct queue_dump_state
*state
)
3063 ofpbuf_uninit(&state
->buf
);
3064 return nl_dump_done(&state
->dump
);
/* Iterator state for netdev_linux_queue_dump_{start,next,done}().  A snapshot
 * of the queue IDs is taken at start time so the hash map can change while
 * the caller iterates. */
struct netdev_linux_queue_state {
    unsigned int *queues;       /* Snapshot of queue IDs. */
    size_t cur_queue;           /* Next index in 'queues' to visit. */
    size_t n_queues;            /* Number of elements in 'queues'. */
};
3074 netdev_linux_queue_dump_start(const struct netdev
*netdev_
, void **statep
)
3076 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3079 ovs_mutex_lock(&netdev
->mutex
);
3080 if (netdev_linux_netnsid_is_remote(netdev
)) {
3085 error
= tc_query_qdisc(netdev_
);
3087 if (netdev
->tc
->ops
->class_get
) {
3088 struct netdev_linux_queue_state
*state
;
3089 struct tc_queue
*queue
;
3092 *statep
= state
= xmalloc(sizeof *state
);
3093 state
->n_queues
= hmap_count(&netdev
->tc
->queues
);
3094 state
->cur_queue
= 0;
3095 state
->queues
= xmalloc(state
->n_queues
* sizeof *state
->queues
);
3098 HMAP_FOR_EACH (queue
, hmap_node
, &netdev
->tc
->queues
) {
3099 state
->queues
[i
++] = queue
->queue_id
;
3107 ovs_mutex_unlock(&netdev
->mutex
);
3112 netdev_linux_queue_dump_next(const struct netdev
*netdev_
, void *state_
,
3113 unsigned int *queue_idp
, struct smap
*details
)
3115 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3116 struct netdev_linux_queue_state
*state
= state_
;
3119 ovs_mutex_lock(&netdev
->mutex
);
3120 if (netdev_linux_netnsid_is_remote(netdev
)) {
3125 while (state
->cur_queue
< state
->n_queues
) {
3126 unsigned int queue_id
= state
->queues
[state
->cur_queue
++];
3127 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
3130 *queue_idp
= queue_id
;
3131 error
= netdev
->tc
->ops
->class_get(netdev_
, queue
, details
);
3137 ovs_mutex_unlock(&netdev
->mutex
);
3142 netdev_linux_queue_dump_done(const struct netdev
*netdev OVS_UNUSED
,
3145 struct netdev_linux_queue_state
*state
= state_
;
3147 free(state
->queues
);
3153 netdev_linux_dump_queue_stats(const struct netdev
*netdev_
,
3154 netdev_dump_queue_stats_cb
*cb
, void *aux
)
3156 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3159 ovs_mutex_lock(&netdev
->mutex
);
3160 if (netdev_linux_netnsid_is_remote(netdev
)) {
3165 error
= tc_query_qdisc(netdev_
);
3167 struct queue_dump_state state
;
3169 if (!netdev
->tc
->ops
->class_dump_stats
) {
3171 } else if (!start_queue_dump(netdev_
, &state
)) {
3177 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
3178 retval
= netdev
->tc
->ops
->class_dump_stats(netdev_
, &msg
,
3185 retval
= finish_queue_dump(&state
);
3193 ovs_mutex_unlock(&netdev
->mutex
);
3198 netdev_linux_set_in4(struct netdev
*netdev_
, struct in_addr address
,
3199 struct in_addr netmask
)
3201 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3204 ovs_mutex_lock(&netdev
->mutex
);
3205 if (netdev_linux_netnsid_is_remote(netdev
)) {
3210 error
= do_set_addr(netdev_
, SIOCSIFADDR
, "SIOCSIFADDR", address
);
3212 if (address
.s_addr
!= INADDR_ANY
) {
3213 error
= do_set_addr(netdev_
, SIOCSIFNETMASK
,
3214 "SIOCSIFNETMASK", netmask
);
3219 ovs_mutex_unlock(&netdev
->mutex
);
3223 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address.
3224 * Otherwise, sets '*in6' to 'in6addr_any' and returns the corresponding
3227 netdev_linux_get_addr_list(const struct netdev
*netdev_
,
3228 struct in6_addr
**addr
, struct in6_addr
**mask
, int *n_cnt
)
3230 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3233 ovs_mutex_lock(&netdev
->mutex
);
3234 if (netdev_linux_netnsid_is_remote(netdev
)) {
3239 error
= netdev_get_addrs(netdev_get_name(netdev_
), addr
, mask
, n_cnt
);
3242 ovs_mutex_unlock(&netdev
->mutex
);
/* Fills '*sa' with an AF_INET sockaddr holding IPv4 address 'addr' and a
 * zero port, zeroing any trailing bytes of the generic sockaddr. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin;

    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr = addr;
    sin.sin_port = 0;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
3260 do_set_addr(struct netdev
*netdev
,
3261 int ioctl_nr
, const char *ioctl_name
, struct in_addr addr
)
3265 make_in4_sockaddr(&ifr
.ifr_addr
, addr
);
3266 return af_inet_ifreq_ioctl(netdev_get_name(netdev
), &ifr
, ioctl_nr
,
3270 /* Adds 'router' as a default IP gateway. */
3272 netdev_linux_add_router(struct netdev
*netdev OVS_UNUSED
, struct in_addr router
)
3274 struct in_addr any
= { INADDR_ANY
};
3278 memset(&rt
, 0, sizeof rt
);
3279 make_in4_sockaddr(&rt
.rt_dst
, any
);
3280 make_in4_sockaddr(&rt
.rt_gateway
, router
);
3281 make_in4_sockaddr(&rt
.rt_genmask
, any
);
3282 rt
.rt_flags
= RTF_UP
| RTF_GATEWAY
;
3283 error
= af_inet_ioctl(SIOCADDRT
, &rt
);
3285 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error
));
3291 netdev_linux_get_next_hop(const struct in_addr
*host
, struct in_addr
*next_hop
,
3294 static const char fn
[] = "/proc/net/route";
3299 *netdev_name
= NULL
;
3300 stream
= fopen(fn
, "r");
3301 if (stream
== NULL
) {
3302 VLOG_WARN_RL(&rl
, "%s: open failed: %s", fn
, ovs_strerror(errno
));
3307 while (fgets(line
, sizeof line
, stream
)) {
3310 ovs_be32 dest
, gateway
, mask
;
3311 int refcnt
, metric
, mtu
;
3312 unsigned int flags
, use
, window
, irtt
;
3315 "%16s %"SCNx32
" %"SCNx32
" %04X %d %u %d %"SCNx32
3317 iface
, &dest
, &gateway
, &flags
, &refcnt
,
3318 &use
, &metric
, &mask
, &mtu
, &window
, &irtt
)) {
3319 VLOG_WARN_RL(&rl
, "%s: could not parse line %d: %s",
3323 if (!(flags
& RTF_UP
)) {
3324 /* Skip routes that aren't up. */
3328 /* The output of 'dest', 'mask', and 'gateway' were given in
3329 * network byte order, so we don't need need any endian
3330 * conversions here. */
3331 if ((dest
& mask
) == (host
->s_addr
& mask
)) {
3333 /* The host is directly reachable. */
3334 next_hop
->s_addr
= 0;
3336 /* To reach the host, we must go through a gateway. */
3337 next_hop
->s_addr
= gateway
;
3339 *netdev_name
= xstrdup(iface
);
3351 netdev_linux_get_status(const struct netdev
*netdev_
, struct smap
*smap
)
3353 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3356 ovs_mutex_lock(&netdev
->mutex
);
3357 if (!(netdev
->cache_valid
& VALID_DRVINFO
)) {
3358 struct ethtool_cmd
*cmd
= (struct ethtool_cmd
*) &netdev
->drvinfo
;
3360 COVERAGE_INC(netdev_get_ethtool
);
3361 memset(&netdev
->drvinfo
, 0, sizeof netdev
->drvinfo
);
3362 error
= netdev_linux_do_ethtool(netdev
->up
.name
,
3365 "ETHTOOL_GDRVINFO");
3367 netdev
->cache_valid
|= VALID_DRVINFO
;
3372 smap_add(smap
, "driver_name", netdev
->drvinfo
.driver
);
3373 smap_add(smap
, "driver_version", netdev
->drvinfo
.version
);
3374 smap_add(smap
, "firmware_version", netdev
->drvinfo
.fw_version
);
3376 ovs_mutex_unlock(&netdev
->mutex
);
3382 netdev_internal_get_status(const struct netdev
*netdev OVS_UNUSED
,
3385 smap_add(smap
, "driver_name", "openvswitch");
3390 netdev_linux_get_block_id(struct netdev
*netdev_
)
3392 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3393 uint32_t block_id
= 0;
3395 ovs_mutex_lock(&netdev
->mutex
);
3396 /* Ensure the linux netdev has had its fields populated. */
3397 if (!(netdev
->cache_valid
& VALID_IFINDEX
)) {
3398 netdev_linux_update_via_netlink(netdev
);
3401 /* Only assigning block ids to linux netdevs that are LAG masters. */
3402 if (netdev
->is_lag_master
) {
3403 block_id
= netdev
->ifindex
;
3405 ovs_mutex_unlock(&netdev
->mutex
);
3410 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
3411 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
3412 * returns 0. Otherwise, it returns a positive errno value; in particular,
3413 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
3415 netdev_linux_arp_lookup(const struct netdev
*netdev
,
3416 ovs_be32 ip
, struct eth_addr
*mac
)
3419 struct sockaddr_in sin
;
3422 memset(&r
, 0, sizeof r
);
3423 memset(&sin
, 0, sizeof sin
);
3424 sin
.sin_family
= AF_INET
;
3425 sin
.sin_addr
.s_addr
= ip
;
3427 memcpy(&r
.arp_pa
, &sin
, sizeof sin
);
3428 r
.arp_ha
.sa_family
= ARPHRD_ETHER
;
3430 ovs_strzcpy(r
.arp_dev
, netdev_get_name(netdev
), sizeof r
.arp_dev
);
3431 COVERAGE_INC(netdev_arp_lookup
);
3432 retval
= af_inet_ioctl(SIOCGARP
, &r
);
3434 memcpy(mac
, r
.arp_ha
.sa_data
, ETH_ADDR_LEN
);
3435 } else if (retval
!= ENXIO
) {
3436 VLOG_WARN_RL(&rl
, "%s: could not look up ARP entry for "IP_FMT
": %s",
3437 netdev_get_name(netdev
), IP_ARGS(ip
),
3438 ovs_strerror(retval
));
3444 nd_to_iff_flags(enum netdev_flags nd
)
3446 unsigned int iff
= 0;
3447 if (nd
& NETDEV_UP
) {
3450 if (nd
& NETDEV_PROMISC
) {
3453 if (nd
& NETDEV_LOOPBACK
) {
3454 iff
|= IFF_LOOPBACK
;
3460 iff_to_nd_flags(unsigned int iff
)
3462 enum netdev_flags nd
= 0;
3466 if (iff
& IFF_PROMISC
) {
3467 nd
|= NETDEV_PROMISC
;
3469 if (iff
& IFF_LOOPBACK
) {
3470 nd
|= NETDEV_LOOPBACK
;
3476 update_flags(struct netdev_linux
*netdev
, enum netdev_flags off
,
3477 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
3478 OVS_REQUIRES(netdev
->mutex
)
3480 unsigned int old_flags
, new_flags
;
3483 old_flags
= netdev
->ifi_flags
;
3484 *old_flagsp
= iff_to_nd_flags(old_flags
);
3485 new_flags
= (old_flags
& ~nd_to_iff_flags(off
)) | nd_to_iff_flags(on
);
3486 if (new_flags
!= old_flags
) {
3487 error
= set_flags(netdev_get_name(&netdev
->up
), new_flags
);
3488 get_flags(&netdev
->up
, &netdev
->ifi_flags
);
3495 netdev_linux_update_flags(struct netdev
*netdev_
, enum netdev_flags off
,
3496 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
3498 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3501 ovs_mutex_lock(&netdev
->mutex
);
3503 /* Changing flags over netlink isn't support yet. */
3504 if (netdev_linux_netnsid_is_remote(netdev
)) {
3508 error
= update_flags(netdev
, off
, on
, old_flagsp
);
3510 /* Try reading flags over netlink, or fall back to ioctl. */
3511 if (!netdev_linux_update_via_netlink(netdev
)) {
3512 *old_flagsp
= iff_to_nd_flags(netdev
->ifi_flags
);
3514 error
= update_flags(netdev
, off
, on
, old_flagsp
);
3519 ovs_mutex_unlock(&netdev
->mutex
);
/* netdev_class member initializers shared by every Linux-backed class
 * defined below (system, tap, internal, afxdp). */
#define NETDEV_LINUX_CLASS_COMMON                               \
    .run = netdev_linux_run,                                    \
    .wait = netdev_linux_wait,                                  \
    .alloc = netdev_linux_alloc,                                \
    .dealloc = netdev_linux_dealloc,                            \
    .send_wait = netdev_linux_send_wait,                        \
    .set_etheraddr = netdev_linux_set_etheraddr,                \
    .get_etheraddr = netdev_linux_get_etheraddr,                \
    .get_mtu = netdev_linux_get_mtu,                            \
    .set_mtu = netdev_linux_set_mtu,                            \
    .get_ifindex = netdev_linux_get_ifindex,                    \
    .get_carrier = netdev_linux_get_carrier,                    \
    .get_carrier_resets = netdev_linux_get_carrier_resets,      \
    .set_miimon_interval = netdev_linux_set_miimon_interval,    \
    .set_advertisements = netdev_linux_set_advertisements,      \
    .set_policing = netdev_linux_set_policing,                  \
    .get_qos_types = netdev_linux_get_qos_types,                \
    .get_qos_capabilities = netdev_linux_get_qos_capabilities,  \
    .get_qos = netdev_linux_get_qos,                            \
    .set_qos = netdev_linux_set_qos,                            \
    .get_queue = netdev_linux_get_queue,                        \
    .set_queue = netdev_linux_set_queue,                        \
    .delete_queue = netdev_linux_delete_queue,                  \
    .get_queue_stats = netdev_linux_get_queue_stats,            \
    .queue_dump_start = netdev_linux_queue_dump_start,          \
    .queue_dump_next = netdev_linux_queue_dump_next,            \
    .queue_dump_done = netdev_linux_queue_dump_done,            \
    .dump_queue_stats = netdev_linux_dump_queue_stats,          \
    .set_in4 = netdev_linux_set_in4,                            \
    .get_addr_list = netdev_linux_get_addr_list,                \
    .add_router = netdev_linux_add_router,                      \
    .get_next_hop = netdev_linux_get_next_hop,                  \
    .arp_lookup = netdev_linux_arp_lookup,                      \
    .update_flags = netdev_linux_update_flags,                  \
    .rxq_alloc = netdev_linux_rxq_alloc,                        \
    .rxq_dealloc = netdev_linux_rxq_dealloc,                    \
    .rxq_wait = netdev_linux_rxq_wait,                          \
    .rxq_drain = netdev_linux_rxq_drain
3562 const struct netdev_class netdev_linux_class
= {
3563 NETDEV_LINUX_CLASS_COMMON
,
3566 .construct
= netdev_linux_construct
,
3567 .destruct
= netdev_linux_destruct
,
3568 .get_stats
= netdev_linux_get_stats
,
3569 .get_features
= netdev_linux_get_features
,
3570 .get_status
= netdev_linux_get_status
,
3571 .get_block_id
= netdev_linux_get_block_id
,
3572 .send
= netdev_linux_send
,
3573 .rxq_construct
= netdev_linux_rxq_construct
,
3574 .rxq_destruct
= netdev_linux_rxq_destruct
,
3575 .rxq_recv
= netdev_linux_rxq_recv
,
3578 const struct netdev_class netdev_tap_class
= {
3579 NETDEV_LINUX_CLASS_COMMON
,
3582 .construct
= netdev_linux_construct_tap
,
3583 .destruct
= netdev_linux_destruct
,
3584 .get_stats
= netdev_tap_get_stats
,
3585 .get_features
= netdev_linux_get_features
,
3586 .get_status
= netdev_linux_get_status
,
3587 .send
= netdev_linux_send
,
3588 .rxq_construct
= netdev_linux_rxq_construct
,
3589 .rxq_destruct
= netdev_linux_rxq_destruct
,
3590 .rxq_recv
= netdev_linux_rxq_recv
,
3593 const struct netdev_class netdev_internal_class
= {
3594 NETDEV_LINUX_CLASS_COMMON
,
3597 .construct
= netdev_linux_construct
,
3598 .destruct
= netdev_linux_destruct
,
3599 .get_stats
= netdev_internal_get_stats
,
3600 .get_status
= netdev_internal_get_status
,
3601 .send
= netdev_linux_send
,
3602 .rxq_construct
= netdev_linux_rxq_construct
,
3603 .rxq_destruct
= netdev_linux_rxq_destruct
,
3604 .rxq_recv
= netdev_linux_rxq_recv
,
3608 const struct netdev_class netdev_afxdp_class
= {
3609 NETDEV_LINUX_CLASS_COMMON
,
3612 .init
= netdev_afxdp_init
,
3613 .construct
= netdev_afxdp_construct
,
3614 .destruct
= netdev_afxdp_destruct
,
3615 .get_stats
= netdev_afxdp_get_stats
,
3616 .get_custom_stats
= netdev_afxdp_get_custom_stats
,
3617 .get_status
= netdev_linux_get_status
,
3618 .set_config
= netdev_afxdp_set_config
,
3619 .get_config
= netdev_afxdp_get_config
,
3620 .reconfigure
= netdev_afxdp_reconfigure
,
3621 .get_numa_id
= netdev_linux_get_numa_id
,
3622 .send
= netdev_afxdp_batch_send
,
3623 .rxq_construct
= netdev_afxdp_rxq_construct
,
3624 .rxq_destruct
= netdev_afxdp_rxq_destruct
,
3625 .rxq_recv
= netdev_afxdp_rxq_recv
,
3630 #define CODEL_N_QUEUES 0x0000
3632 /* In sufficiently new kernel headers these are defined as enums in
3633 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3634 * kernels. (This overrides any enum definition in the header file but that's
3636 #define TCA_CODEL_TARGET 1
3637 #define TCA_CODEL_LIMIT 2
3638 #define TCA_CODEL_INTERVAL 3
3647 static struct codel
*
3648 codel_get__(const struct netdev
*netdev_
)
3650 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3651 return CONTAINER_OF(netdev
->tc
, struct codel
, tc
);
3655 codel_install__(struct netdev
*netdev_
, uint32_t target
, uint32_t limit
,
3658 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3659 struct codel
*codel
;
3661 codel
= xmalloc(sizeof *codel
);
3662 tc_init(&codel
->tc
, &tc_ops_codel
);
3663 codel
->target
= target
;
3664 codel
->limit
= limit
;
3665 codel
->interval
= interval
;
3667 netdev
->tc
= &codel
->tc
;
3671 codel_setup_qdisc__(struct netdev
*netdev
, uint32_t target
, uint32_t limit
,
3675 struct ofpbuf request
;
3676 struct tcmsg
*tcmsg
;
3677 uint32_t otarget
, olimit
, ointerval
;
3680 tc_del_qdisc(netdev
);
3682 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
3683 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3687 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3688 tcmsg
->tcm_parent
= TC_H_ROOT
;
3690 otarget
= target
? target
: 5000;
3691 olimit
= limit
? limit
: 10240;
3692 ointerval
= interval
? interval
: 100000;
3694 nl_msg_put_string(&request
, TCA_KIND
, "codel");
3695 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
3696 nl_msg_put_u32(&request
, TCA_CODEL_TARGET
, otarget
);
3697 nl_msg_put_u32(&request
, TCA_CODEL_LIMIT
, olimit
);
3698 nl_msg_put_u32(&request
, TCA_CODEL_INTERVAL
, ointerval
);
3699 nl_msg_end_nested(&request
, opt_offset
);
3701 error
= tc_transact(&request
, NULL
);
3703 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
3704 "target %u, limit %u, interval %u error %d(%s)",
3705 netdev_get_name(netdev
),
3706 otarget
, olimit
, ointerval
,
3707 error
, ovs_strerror(error
));
3713 codel_parse_qdisc_details__(struct netdev
*netdev OVS_UNUSED
,
3714 const struct smap
*details
, struct codel
*codel
)
3716 codel
->target
= smap_get_ullong(details
, "target", 0);
3717 codel
->limit
= smap_get_ullong(details
, "limit", 0);
3718 codel
->interval
= smap_get_ullong(details
, "interval", 0);
3720 if (!codel
->target
) {
3721 codel
->target
= 5000;
3723 if (!codel
->limit
) {
3724 codel
->limit
= 10240;
3726 if (!codel
->interval
) {
3727 codel
->interval
= 100000;
3732 codel_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3737 codel_parse_qdisc_details__(netdev
, details
, &codel
);
3738 error
= codel_setup_qdisc__(netdev
, codel
.target
, codel
.limit
,
3741 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
3747 codel_parse_tca_options__(struct nlattr
*nl_options
, struct codel
*codel
)
3749 static const struct nl_policy tca_codel_policy
[] = {
3750 [TCA_CODEL_TARGET
] = { .type
= NL_A_U32
},
3751 [TCA_CODEL_LIMIT
] = { .type
= NL_A_U32
},
3752 [TCA_CODEL_INTERVAL
] = { .type
= NL_A_U32
}
3755 struct nlattr
*attrs
[ARRAY_SIZE(tca_codel_policy
)];
3757 if (!nl_parse_nested(nl_options
, tca_codel_policy
,
3758 attrs
, ARRAY_SIZE(tca_codel_policy
))) {
3759 VLOG_WARN_RL(&rl
, "failed to parse CoDel class options");
3763 codel
->target
= nl_attr_get_u32(attrs
[TCA_CODEL_TARGET
]);
3764 codel
->limit
= nl_attr_get_u32(attrs
[TCA_CODEL_LIMIT
]);
3765 codel
->interval
= nl_attr_get_u32(attrs
[TCA_CODEL_INTERVAL
]);
3770 codel_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
3772 struct nlattr
*nlattr
;
3777 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
3782 error
= codel_parse_tca_options__(nlattr
, &codel
);
3787 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
3793 codel_tc_destroy(struct tc
*tc
)
3795 struct codel
*codel
= CONTAINER_OF(tc
, struct codel
, tc
);
3801 codel_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
3803 const struct codel
*codel
= codel_get__(netdev
);
3804 smap_add_format(details
, "target", "%u", codel
->target
);
3805 smap_add_format(details
, "limit", "%u", codel
->limit
);
3806 smap_add_format(details
, "interval", "%u", codel
->interval
);
3811 codel_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
3815 codel_parse_qdisc_details__(netdev
, details
, &codel
);
3816 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
3817 codel_get__(netdev
)->target
= codel
.target
;
3818 codel_get__(netdev
)->limit
= codel
.limit
;
3819 codel_get__(netdev
)->interval
= codel
.interval
;
3823 static const struct tc_ops tc_ops_codel
= {
3824 .linux_name
= "codel",
3825 .ovs_name
= "linux-codel",
3826 .n_queues
= CODEL_N_QUEUES
,
3827 .tc_install
= codel_tc_install
,
3828 .tc_load
= codel_tc_load
,
3829 .tc_destroy
= codel_tc_destroy
,
3830 .qdisc_get
= codel_qdisc_get
,
3831 .qdisc_set
= codel_qdisc_set
,
3834 /* FQ-CoDel traffic control class. */
3836 #define FQCODEL_N_QUEUES 0x0000
3838 /* In sufficiently new kernel headers these are defined as enums in
3839 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3840 * kernels. (This overrides any enum definition in the header file but that's
3842 #define TCA_FQ_CODEL_TARGET 1
3843 #define TCA_FQ_CODEL_LIMIT 2
3844 #define TCA_FQ_CODEL_INTERVAL 3
3845 #define TCA_FQ_CODEL_ECN 4
3846 #define TCA_FQ_CODEL_FLOWS 5
3847 #define TCA_FQ_CODEL_QUANTUM 6
3858 static struct fqcodel
*
3859 fqcodel_get__(const struct netdev
*netdev_
)
3861 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3862 return CONTAINER_OF(netdev
->tc
, struct fqcodel
, tc
);
3866 fqcodel_install__(struct netdev
*netdev_
, uint32_t target
, uint32_t limit
,
3867 uint32_t interval
, uint32_t flows
, uint32_t quantum
)
3869 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3870 struct fqcodel
*fqcodel
;
3872 fqcodel
= xmalloc(sizeof *fqcodel
);
3873 tc_init(&fqcodel
->tc
, &tc_ops_fqcodel
);
3874 fqcodel
->target
= target
;
3875 fqcodel
->limit
= limit
;
3876 fqcodel
->interval
= interval
;
3877 fqcodel
->flows
= flows
;
3878 fqcodel
->quantum
= quantum
;
3880 netdev
->tc
= &fqcodel
->tc
;
3884 fqcodel_setup_qdisc__(struct netdev
*netdev
, uint32_t target
, uint32_t limit
,
3885 uint32_t interval
, uint32_t flows
, uint32_t quantum
)
3888 struct ofpbuf request
;
3889 struct tcmsg
*tcmsg
;
3890 uint32_t otarget
, olimit
, ointerval
, oflows
, oquantum
;
3893 tc_del_qdisc(netdev
);
3895 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
3896 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3900 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3901 tcmsg
->tcm_parent
= TC_H_ROOT
;
3903 otarget
= target
? target
: 5000;
3904 olimit
= limit
? limit
: 10240;
3905 ointerval
= interval
? interval
: 100000;
3906 oflows
= flows
? flows
: 1024;
3907 oquantum
= quantum
? quantum
: 1514; /* fq_codel default quantum is 1514
3910 nl_msg_put_string(&request
, TCA_KIND
, "fq_codel");
3911 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
3912 nl_msg_put_u32(&request
, TCA_FQ_CODEL_TARGET
, otarget
);
3913 nl_msg_put_u32(&request
, TCA_FQ_CODEL_LIMIT
, olimit
);
3914 nl_msg_put_u32(&request
, TCA_FQ_CODEL_INTERVAL
, ointerval
);
3915 nl_msg_put_u32(&request
, TCA_FQ_CODEL_FLOWS
, oflows
);
3916 nl_msg_put_u32(&request
, TCA_FQ_CODEL_QUANTUM
, oquantum
);
3917 nl_msg_end_nested(&request
, opt_offset
);
3919 error
= tc_transact(&request
, NULL
);
3921 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
3922 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3923 netdev_get_name(netdev
),
3924 otarget
, olimit
, ointerval
, oflows
, oquantum
,
3925 error
, ovs_strerror(error
));
3931 fqcodel_parse_qdisc_details__(struct netdev
*netdev OVS_UNUSED
,
3932 const struct smap
*details
, struct fqcodel
*fqcodel
)
3934 fqcodel
->target
= smap_get_ullong(details
, "target", 0);
3935 fqcodel
->limit
= smap_get_ullong(details
, "limit", 0);
3936 fqcodel
->interval
= smap_get_ullong(details
, "interval", 0);
3937 fqcodel
->flows
= smap_get_ullong(details
, "flows", 0);
3938 fqcodel
->quantum
= smap_get_ullong(details
, "quantum", 0);
3940 if (!fqcodel
->target
) {
3941 fqcodel
->target
= 5000;
3943 if (!fqcodel
->limit
) {
3944 fqcodel
->limit
= 10240;
3946 if (!fqcodel
->interval
) {
3947 fqcodel
->interval
= 1000000;
3949 if (!fqcodel
->flows
) {
3950 fqcodel
->flows
= 1024;
3952 if (!fqcodel
->quantum
) {
3953 fqcodel
->quantum
= 1514;
3958 fqcodel_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3961 struct fqcodel fqcodel
;
3963 fqcodel_parse_qdisc_details__(netdev
, details
, &fqcodel
);
3964 error
= fqcodel_setup_qdisc__(netdev
, fqcodel
.target
, fqcodel
.limit
,
3965 fqcodel
.interval
, fqcodel
.flows
,
3968 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
,
3969 fqcodel
.interval
, fqcodel
.flows
, fqcodel
.quantum
);
3975 fqcodel_parse_tca_options__(struct nlattr
*nl_options
, struct fqcodel
*fqcodel
)
3977 static const struct nl_policy tca_fqcodel_policy
[] = {
3978 [TCA_FQ_CODEL_TARGET
] = { .type
= NL_A_U32
},
3979 [TCA_FQ_CODEL_LIMIT
] = { .type
= NL_A_U32
},
3980 [TCA_FQ_CODEL_INTERVAL
] = { .type
= NL_A_U32
},
3981 [TCA_FQ_CODEL_FLOWS
] = { .type
= NL_A_U32
},
3982 [TCA_FQ_CODEL_QUANTUM
] = { .type
= NL_A_U32
}
3985 struct nlattr
*attrs
[ARRAY_SIZE(tca_fqcodel_policy
)];
3987 if (!nl_parse_nested(nl_options
, tca_fqcodel_policy
,
3988 attrs
, ARRAY_SIZE(tca_fqcodel_policy
))) {
3989 VLOG_WARN_RL(&rl
, "failed to parse FQ_CoDel class options");
3993 fqcodel
->target
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_TARGET
]);
3994 fqcodel
->limit
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_LIMIT
]);
3995 fqcodel
->interval
=nl_attr_get_u32(attrs
[TCA_FQ_CODEL_INTERVAL
]);
3996 fqcodel
->flows
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_FLOWS
]);
3997 fqcodel
->quantum
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_QUANTUM
]);
4002 fqcodel_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
4004 struct nlattr
*nlattr
;
4007 struct fqcodel fqcodel
;
4009 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
4014 error
= fqcodel_parse_tca_options__(nlattr
, &fqcodel
);
4019 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
, fqcodel
.interval
,
4020 fqcodel
.flows
, fqcodel
.quantum
);
4025 fqcodel_tc_destroy(struct tc
*tc
)
4027 struct fqcodel
*fqcodel
= CONTAINER_OF(tc
, struct fqcodel
, tc
);
4033 fqcodel_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
4035 const struct fqcodel
*fqcodel
= fqcodel_get__(netdev
);
4036 smap_add_format(details
, "target", "%u", fqcodel
->target
);
4037 smap_add_format(details
, "limit", "%u", fqcodel
->limit
);
4038 smap_add_format(details
, "interval", "%u", fqcodel
->interval
);
4039 smap_add_format(details
, "flows", "%u", fqcodel
->flows
);
4040 smap_add_format(details
, "quantum", "%u", fqcodel
->quantum
);
4045 fqcodel_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
4047 struct fqcodel fqcodel
;
4049 fqcodel_parse_qdisc_details__(netdev
, details
, &fqcodel
);
4050 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
, fqcodel
.interval
,
4051 fqcodel
.flows
, fqcodel
.quantum
);
4052 fqcodel_get__(netdev
)->target
= fqcodel
.target
;
4053 fqcodel_get__(netdev
)->limit
= fqcodel
.limit
;
4054 fqcodel_get__(netdev
)->interval
= fqcodel
.interval
;
4055 fqcodel_get__(netdev
)->flows
= fqcodel
.flows
;
4056 fqcodel_get__(netdev
)->quantum
= fqcodel
.quantum
;
4060 static const struct tc_ops tc_ops_fqcodel
= {
4061 .linux_name
= "fq_codel",
4062 .ovs_name
= "linux-fq_codel",
4063 .n_queues
= FQCODEL_N_QUEUES
,
4064 .tc_install
= fqcodel_tc_install
,
4065 .tc_load
= fqcodel_tc_load
,
4066 .tc_destroy
= fqcodel_tc_destroy
,
4067 .qdisc_get
= fqcodel_qdisc_get
,
4068 .qdisc_set
= fqcodel_qdisc_set
,
4071 /* SFQ traffic control class. */
4073 #define SFQ_N_QUEUES 0x0000
4082 sfq_get__(const struct netdev
*netdev_
)
4084 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4085 return CONTAINER_OF(netdev
->tc
, struct sfq
, tc
);
4089 sfq_install__(struct netdev
*netdev_
, uint32_t quantum
, uint32_t perturb
)
4091 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4094 sfq
= xmalloc(sizeof *sfq
);
4095 tc_init(&sfq
->tc
, &tc_ops_sfq
);
4096 sfq
->perturb
= perturb
;
4097 sfq
->quantum
= quantum
;
4099 netdev
->tc
= &sfq
->tc
;
4103 sfq_setup_qdisc__(struct netdev
*netdev
, uint32_t quantum
, uint32_t perturb
)
4105 struct tc_sfq_qopt opt
;
4106 struct ofpbuf request
;
4107 struct tcmsg
*tcmsg
;
4109 int mtu_error
, error
;
4110 mtu_error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
4112 tc_del_qdisc(netdev
);
4114 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
4115 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
4119 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
4120 tcmsg
->tcm_parent
= TC_H_ROOT
;
4122 memset(&opt
, 0, sizeof opt
);
4125 opt
.quantum
= mtu
; /* if we cannot find mtu, use default */
4128 opt
.quantum
= quantum
;
4132 opt
.perturb_period
= 10;
4134 opt
.perturb_period
= perturb
;
4137 nl_msg_put_string(&request
, TCA_KIND
, "sfq");
4138 nl_msg_put_unspec(&request
, TCA_OPTIONS
, &opt
, sizeof opt
);
4140 error
= tc_transact(&request
, NULL
);
4142 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
4143 "quantum %u, perturb %u error %d(%s)",
4144 netdev_get_name(netdev
),
4145 opt
.quantum
, opt
.perturb_period
,
4146 error
, ovs_strerror(error
));
4152 sfq_parse_qdisc_details__(struct netdev
*netdev
,
4153 const struct smap
*details
, struct sfq
*sfq
)
4155 sfq
->perturb
= smap_get_ullong(details
, "perturb", 0);
4156 sfq
->quantum
= smap_get_ullong(details
, "quantum", 0);
4158 if (!sfq
->perturb
) {
4162 if (!sfq
->quantum
) {
4164 if (!netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
)) {
4167 VLOG_WARN_RL(&rl
, "when using SFQ, you must specify quantum on a "
4168 "device without mtu");
4174 sfq_tc_install(struct netdev
*netdev
, const struct smap
*details
)
4179 sfq_parse_qdisc_details__(netdev
, details
, &sfq
);
4180 error
= sfq_setup_qdisc__(netdev
, sfq
.quantum
, sfq
.perturb
);
4182 sfq_install__(netdev
, sfq
.quantum
, sfq
.perturb
);
4188 sfq_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
4190 const struct tc_sfq_qopt
*sfq
;
4191 struct nlattr
*nlattr
;
4195 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
4197 sfq
= nl_attr_get(nlattr
);
4198 sfq_install__(netdev
, sfq
->quantum
, sfq
->perturb_period
);
4206 sfq_tc_destroy(struct tc
*tc
)
4208 struct sfq
*sfq
= CONTAINER_OF(tc
, struct sfq
, tc
);
4214 sfq_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
4216 const struct sfq
*sfq
= sfq_get__(netdev
);
4217 smap_add_format(details
, "quantum", "%u", sfq
->quantum
);
4218 smap_add_format(details
, "perturb", "%u", sfq
->perturb
);
4223 sfq_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
4227 sfq_parse_qdisc_details__(netdev
, details
, &sfq
);
4228 sfq_install__(netdev
, sfq
.quantum
, sfq
.perturb
);
4229 sfq_get__(netdev
)->quantum
= sfq
.quantum
;
4230 sfq_get__(netdev
)->perturb
= sfq
.perturb
;
4234 static const struct tc_ops tc_ops_sfq
= {
4235 .linux_name
= "sfq",
4236 .ovs_name
= "linux-sfq",
4237 .n_queues
= SFQ_N_QUEUES
,
4238 .tc_install
= sfq_tc_install
,
4239 .tc_load
= sfq_tc_load
,
4240 .tc_destroy
= sfq_tc_destroy
,
4241 .qdisc_get
= sfq_qdisc_get
,
4242 .qdisc_set
= sfq_qdisc_set
,
4245 /* netem traffic control class. */
4254 static struct netem
*
4255 netem_get__(const struct netdev
*netdev_
)
4257 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4258 return CONTAINER_OF(netdev
->tc
, struct netem
, tc
);
4262 netem_install__(struct netdev
*netdev_
, uint32_t latency
,
4263 uint32_t limit
, uint32_t loss
)
4265 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4266 struct netem
*netem
;
4268 netem
= xmalloc(sizeof *netem
);
4269 tc_init(&netem
->tc
, &tc_ops_netem
);
4270 netem
->latency
= latency
;
4271 netem
->limit
= limit
;
4274 netdev
->tc
= &netem
->tc
;
4278 netem_setup_qdisc__(struct netdev
*netdev
, uint32_t latency
,
4279 uint32_t limit
, uint32_t loss
)
4281 struct tc_netem_qopt opt
;
4282 struct ofpbuf request
;
4283 struct tcmsg
*tcmsg
;
4286 tc_del_qdisc(netdev
);
4288 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
4289 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
4293 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
4294 tcmsg
->tcm_parent
= TC_H_ROOT
;
4296 memset(&opt
, 0, sizeof opt
);
4307 "loss should be a percentage value between 0 to 100, "
4308 "loss was %u", loss
);
4311 opt
.loss
= floor(UINT32_MAX
* (loss
/ 100.0));
4314 opt
.latency
= tc_time_to_ticks(latency
);
4316 nl_msg_put_string(&request
, TCA_KIND
, "netem");
4317 nl_msg_put_unspec(&request
, TCA_OPTIONS
, &opt
, sizeof opt
);
4319 error
= tc_transact(&request
, NULL
);
4321 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
4322 "latency %u, limit %u, loss %u error %d(%s)",
4323 netdev_get_name(netdev
),
4324 opt
.latency
, opt
.limit
, opt
.loss
,
4325 error
, ovs_strerror(error
));
4331 netem_parse_qdisc_details__(struct netdev
*netdev OVS_UNUSED
,
4332 const struct smap
*details
, struct netem
*netem
)
4334 netem
->latency
= smap_get_ullong(details
, "latency", 0);
4335 netem
->limit
= smap_get_ullong(details
, "limit", 0);
4336 netem
->loss
= smap_get_ullong(details
, "loss", 0);
4338 if (!netem
->limit
) {
4339 netem
->limit
= 1000;
4344 netem_tc_install(struct netdev
*netdev
, const struct smap
*details
)
4349 netem_parse_qdisc_details__(netdev
, details
, &netem
);
4350 error
= netem_setup_qdisc__(netdev
, netem
.latency
,
4351 netem
.limit
, netem
.loss
);
4353 netem_install__(netdev
, netem
.latency
, netem
.limit
, netem
.loss
);
4359 netem_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
4361 const struct tc_netem_qopt
*netem
;
4362 struct nlattr
*nlattr
;
4366 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
4368 netem
= nl_attr_get(nlattr
);
4369 netem_install__(netdev
, netem
->latency
, netem
->limit
, netem
->loss
);
4377 netem_tc_destroy(struct tc
*tc
)
4379 struct netem
*netem
= CONTAINER_OF(tc
, struct netem
, tc
);
4385 netem_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
4387 const struct netem
*netem
= netem_get__(netdev
);
4388 smap_add_format(details
, "latency", "%u", netem
->latency
);
4389 smap_add_format(details
, "limit", "%u", netem
->limit
);
4390 smap_add_format(details
, "loss", "%u", netem
->loss
);
4395 netem_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
4399 netem_parse_qdisc_details__(netdev
, details
, &netem
);
4400 netem_install__(netdev
, netem
.latency
, netem
.limit
, netem
.loss
);
4401 netem_get__(netdev
)->latency
= netem
.latency
;
4402 netem_get__(netdev
)->limit
= netem
.limit
;
4403 netem_get__(netdev
)->loss
= netem
.loss
;
4407 static const struct tc_ops tc_ops_netem
= {
4408 .linux_name
= "netem",
4409 .ovs_name
= "linux-netem",
4411 .tc_install
= netem_tc_install
,
4412 .tc_load
= netem_tc_load
,
4413 .tc_destroy
= netem_tc_destroy
,
4414 .qdisc_get
= netem_qdisc_get
,
4415 .qdisc_set
= netem_qdisc_set
,
4418 /* HTB traffic control class. */
4420 #define HTB_N_QUEUES 0xf000
4421 #define HTB_RATE2QUANTUM 10
4425 unsigned int max_rate
; /* In bytes/s. */
4429 struct tc_queue tc_queue
;
4430 unsigned int min_rate
; /* In bytes/s. */
4431 unsigned int max_rate
; /* In bytes/s. */
4432 unsigned int burst
; /* In bytes. */
4433 unsigned int priority
; /* Lower values are higher priorities. */
4437 htb_get__(const struct netdev
*netdev_
)
4439 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4440 return CONTAINER_OF(netdev
->tc
, struct htb
, tc
);
4444 htb_install__(struct netdev
*netdev_
, uint64_t max_rate
)
4446 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4449 htb
= xmalloc(sizeof *htb
);
4450 tc_init(&htb
->tc
, &tc_ops_htb
);
4451 htb
->max_rate
= max_rate
;
4453 netdev
->tc
= &htb
->tc
;
4456 /* Create an HTB qdisc.
4458 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
4460 htb_setup_qdisc__(struct netdev
*netdev
)
4463 struct tc_htb_glob opt
;
4464 struct ofpbuf request
;
4465 struct tcmsg
*tcmsg
;
4467 tc_del_qdisc(netdev
);
4469 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
4470 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
4474 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
4475 tcmsg
->tcm_parent
= TC_H_ROOT
;
4477 nl_msg_put_string(&request
, TCA_KIND
, "htb");
4479 memset(&opt
, 0, sizeof opt
);
4480 opt
.rate2quantum
= HTB_RATE2QUANTUM
;
4484 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
4485 nl_msg_put_unspec(&request
, TCA_HTB_INIT
, &opt
, sizeof opt
);
4486 nl_msg_end_nested(&request
, opt_offset
);
4488 return tc_transact(&request
, NULL
);
4491 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
4492 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
4494 htb_setup_class__(struct netdev
*netdev
, unsigned int handle
,
4495 unsigned int parent
, struct htb_class
*class)
4498 struct tc_htb_opt opt
;
4499 struct ofpbuf request
;
4500 struct tcmsg
*tcmsg
;
4504 error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
4506 VLOG_WARN_RL(&rl
, "cannot set up HTB on device %s that lacks MTU",
4507 netdev_get_name(netdev
));
4511 memset(&opt
, 0, sizeof opt
);
4512 tc_fill_rate(&opt
.rate
, class->min_rate
, mtu
);
4513 tc_fill_rate(&opt
.ceil
, class->max_rate
, mtu
);
4514 /* Makes sure the quantum is at least MTU. Setting quantum will
4515 * make htb ignore the r2q for this class. */
4516 if ((class->min_rate
/ HTB_RATE2QUANTUM
) < mtu
) {
4519 opt
.buffer
= tc_calc_buffer(opt
.rate
.rate
, mtu
, class->burst
);
4520 opt
.cbuffer
= tc_calc_buffer(opt
.ceil
.rate
, mtu
, class->burst
);
4521 opt
.prio
= class->priority
;
4523 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWTCLASS
, NLM_F_CREATE
,
4528 tcmsg
->tcm_handle
= handle
;
4529 tcmsg
->tcm_parent
= parent
;
4531 nl_msg_put_string(&request
, TCA_KIND
, "htb");
4532 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
4533 nl_msg_put_unspec(&request
, TCA_HTB_PARMS
, &opt
, sizeof opt
);
4534 tc_put_rtab(&request
, TCA_HTB_RTAB
, &opt
.rate
);
4535 tc_put_rtab(&request
, TCA_HTB_CTAB
, &opt
.ceil
);
4536 nl_msg_end_nested(&request
, opt_offset
);
4538 error
= tc_transact(&request
, NULL
);
4540 VLOG_WARN_RL(&rl
, "failed to replace %s class %u:%u, parent %u:%u, "
4541 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
4542 netdev_get_name(netdev
),
4543 tc_get_major(handle
), tc_get_minor(handle
),
4544 tc_get_major(parent
), tc_get_minor(parent
),
4545 class->min_rate
, class->max_rate
,
4546 class->burst
, class->priority
, ovs_strerror(error
));
4551 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
4552 * description of them into 'details'. The description complies with the
4553 * specification given in the vswitch database documentation for linux-htb
4556 htb_parse_tca_options__(struct nlattr
*nl_options
, struct htb_class
*class)
4558 static const struct nl_policy tca_htb_policy
[] = {
4559 [TCA_HTB_PARMS
] = { .type
= NL_A_UNSPEC
, .optional
= false,
4560 .min_len
= sizeof(struct tc_htb_opt
) },
4563 struct nlattr
*attrs
[ARRAY_SIZE(tca_htb_policy
)];
4564 const struct tc_htb_opt
*htb
;
4566 if (!nl_parse_nested(nl_options
, tca_htb_policy
,
4567 attrs
, ARRAY_SIZE(tca_htb_policy
))) {
4568 VLOG_WARN_RL(&rl
, "failed to parse HTB class options");
4572 htb
= nl_attr_get(attrs
[TCA_HTB_PARMS
]);
4573 class->min_rate
= htb
->rate
.rate
;
4574 class->max_rate
= htb
->ceil
.rate
;
4575 class->burst
= tc_ticks_to_bytes(htb
->rate
.rate
, htb
->buffer
);
4576 class->priority
= htb
->prio
;
4581 htb_parse_tcmsg__(struct ofpbuf
*tcmsg
, unsigned int *queue_id
,
4582 struct htb_class
*options
,
4583 struct netdev_queue_stats
*stats
)
4585 struct nlattr
*nl_options
;
4586 unsigned int handle
;
4589 error
= tc_parse_class(tcmsg
, &handle
, &nl_options
, stats
);
4590 if (!error
&& queue_id
) {
4591 unsigned int major
= tc_get_major(handle
);
4592 unsigned int minor
= tc_get_minor(handle
);
4593 if (major
== 1 && minor
> 0 && minor
<= HTB_N_QUEUES
) {
4594 *queue_id
= minor
- 1;
4599 if (!error
&& options
) {
4600 error
= htb_parse_tca_options__(nl_options
, options
);
4606 htb_parse_qdisc_details__(struct netdev
*netdev_
,
4607 const struct smap
*details
, struct htb_class
*hc
)
4609 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4611 hc
->max_rate
= smap_get_ullong(details
, "max-rate", 0) / 8;
4612 if (!hc
->max_rate
) {
4613 enum netdev_features current
;
4615 netdev_linux_read_features(netdev
);
4616 current
= !netdev
->get_features_error
? netdev
->current
: 0;
4617 hc
->max_rate
= netdev_features_to_bps(current
, 100 * 1000 * 1000) / 8;
4619 hc
->min_rate
= hc
->max_rate
;
4625 htb_parse_class_details__(struct netdev
*netdev
,
4626 const struct smap
*details
, struct htb_class
*hc
)
4628 const struct htb
*htb
= htb_get__(netdev
);
4630 unsigned long long int max_rate_bit
;
4632 error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
4634 VLOG_WARN_RL(&rl
, "cannot parse HTB class on device %s that lacks MTU",
4635 netdev_get_name(netdev
));
4639 /* HTB requires at least an mtu sized min-rate to send any traffic even
4640 * on uncongested links. */
4641 hc
->min_rate
= smap_get_ullong(details
, "min-rate", 0) / 8;
4642 hc
->min_rate
= MAX(hc
->min_rate
, mtu
);
4643 hc
->min_rate
= MIN(hc
->min_rate
, htb
->max_rate
);
4646 max_rate_bit
= smap_get_ullong(details
, "max-rate", 0);
4647 hc
->max_rate
= max_rate_bit
? max_rate_bit
/ 8 : htb
->max_rate
;
4648 hc
->max_rate
= MAX(hc
->max_rate
, hc
->min_rate
);
4649 hc
->max_rate
= MIN(hc
->max_rate
, htb
->max_rate
);
4653 * According to hints in the documentation that I've read, it is important
4654 * that 'burst' be at least as big as the largest frame that might be
4655 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
4656 * but having it a bit too small is a problem. Since netdev_get_mtu()
4657 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
4658 * the MTU. We actually add 64, instead of 14, as a guard against
4659 * additional headers get tacked on somewhere that we're not aware of. */
4660 hc
->burst
= smap_get_ullong(details
, "burst", 0) / 8;
4661 hc
->burst
= MAX(hc
->burst
, mtu
+ 64);
4664 hc
->priority
= smap_get_ullong(details
, "priority", 0);
4670 htb_query_class__(const struct netdev
*netdev
, unsigned int handle
,
4671 unsigned int parent
, struct htb_class
*options
,
4672 struct netdev_queue_stats
*stats
)
4674 struct ofpbuf
*reply
;
4677 error
= tc_query_class(netdev
, handle
, parent
, &reply
);
4679 error
= htb_parse_tcmsg__(reply
, NULL
, options
, stats
);
4680 ofpbuf_delete(reply
);
4686 htb_tc_install(struct netdev
*netdev
, const struct smap
*details
)
4690 error
= htb_setup_qdisc__(netdev
);
4692 struct htb_class hc
;
4694 htb_parse_qdisc_details__(netdev
, details
, &hc
);
4695 error
= htb_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
4696 tc_make_handle(1, 0), &hc
);
4698 htb_install__(netdev
, hc
.max_rate
);
4704 static struct htb_class
*
4705 htb_class_cast__(const struct tc_queue
*queue
)
4707 return CONTAINER_OF(queue
, struct htb_class
, tc_queue
);
4711 htb_update_queue__(struct netdev
*netdev
, unsigned int queue_id
,
4712 const struct htb_class
*hc
)
4714 struct htb
*htb
= htb_get__(netdev
);
4715 size_t hash
= hash_int(queue_id
, 0);
4716 struct tc_queue
*queue
;
4717 struct htb_class
*hcp
;
4719 queue
= tc_find_queue__(netdev
, queue_id
, hash
);
4721 hcp
= htb_class_cast__(queue
);
4723 hcp
= xmalloc(sizeof *hcp
);
4724 queue
= &hcp
->tc_queue
;
4725 queue
->queue_id
= queue_id
;
4726 queue
->created
= time_msec();
4727 hmap_insert(&htb
->tc
.queues
, &queue
->hmap_node
, hash
);
4730 hcp
->min_rate
= hc
->min_rate
;
4731 hcp
->max_rate
= hc
->max_rate
;
4732 hcp
->burst
= hc
->burst
;
4733 hcp
->priority
= hc
->priority
;
4737 htb_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4740 struct queue_dump_state state
;
4741 struct htb_class hc
;
4743 /* Get qdisc options. */
4745 htb_query_class__(netdev
, tc_make_handle(1, 0xfffe), 0, &hc
, NULL
);
4746 htb_install__(netdev
, hc
.max_rate
);
4749 if (!start_queue_dump(netdev
, &state
)) {
4752 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
4753 unsigned int queue_id
;
4755 if (!htb_parse_tcmsg__(&msg
, &queue_id
, &hc
, NULL
)) {
4756 htb_update_queue__(netdev
, queue_id
, &hc
);
4759 finish_queue_dump(&state
);
4765 htb_tc_destroy(struct tc
*tc
)
4767 struct htb
*htb
= CONTAINER_OF(tc
, struct htb
, tc
);
4768 struct htb_class
*hc
;
4770 HMAP_FOR_EACH_POP (hc
, tc_queue
.hmap_node
, &htb
->tc
.queues
) {
4778 htb_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
4780 const struct htb
*htb
= htb_get__(netdev
);
4781 smap_add_format(details
, "max-rate", "%llu", 8ULL * htb
->max_rate
);
4786 htb_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
4788 struct htb_class hc
;
4791 htb_parse_qdisc_details__(netdev
, details
, &hc
);
4792 error
= htb_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
4793 tc_make_handle(1, 0), &hc
);
4795 htb_get__(netdev
)->max_rate
= hc
.max_rate
;
4801 htb_class_get(const struct netdev
*netdev OVS_UNUSED
,
4802 const struct tc_queue
*queue
, struct smap
*details
)
4804 const struct htb_class
*hc
= htb_class_cast__(queue
);
4806 smap_add_format(details
, "min-rate", "%llu", 8ULL * hc
->min_rate
);
4807 if (hc
->min_rate
!= hc
->max_rate
) {
4808 smap_add_format(details
, "max-rate", "%llu", 8ULL * hc
->max_rate
);
4810 smap_add_format(details
, "burst", "%llu", 8ULL * hc
->burst
);
4812 smap_add_format(details
, "priority", "%u", hc
->priority
);
4818 htb_class_set(struct netdev
*netdev
, unsigned int queue_id
,
4819 const struct smap
*details
)
4821 struct htb_class hc
;
4824 error
= htb_parse_class_details__(netdev
, details
, &hc
);
4829 error
= htb_setup_class__(netdev
, tc_make_handle(1, queue_id
+ 1),
4830 tc_make_handle(1, 0xfffe), &hc
);
4835 htb_update_queue__(netdev
, queue_id
, &hc
);
4840 htb_class_delete(struct netdev
*netdev
, struct tc_queue
*queue
)
4842 struct htb_class
*hc
= htb_class_cast__(queue
);
4843 struct htb
*htb
= htb_get__(netdev
);
4846 error
= tc_delete_class(netdev
, tc_make_handle(1, queue
->queue_id
+ 1));
4848 hmap_remove(&htb
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
4855 htb_class_get_stats(const struct netdev
*netdev
, const struct tc_queue
*queue
,
4856 struct netdev_queue_stats
*stats
)
4858 return htb_query_class__(netdev
, tc_make_handle(1, queue
->queue_id
+ 1),
4859 tc_make_handle(1, 0xfffe), NULL
, stats
);
4863 htb_class_dump_stats(const struct netdev
*netdev OVS_UNUSED
,
4864 const struct ofpbuf
*nlmsg
,
4865 netdev_dump_queue_stats_cb
*cb
, void *aux
)
4867 struct netdev_queue_stats stats
;
4868 unsigned int handle
, major
, minor
;
4871 error
= tc_parse_class(nlmsg
, &handle
, NULL
, &stats
);
4876 major
= tc_get_major(handle
);
4877 minor
= tc_get_minor(handle
);
4878 if (major
== 1 && minor
> 0 && minor
<= HTB_N_QUEUES
) {
4879 (*cb
)(minor
- 1, &stats
, aux
);
4884 static const struct tc_ops tc_ops_htb
= {
4885 .linux_name
= "htb",
4886 .ovs_name
= "linux-htb",
4887 .n_queues
= HTB_N_QUEUES
,
4888 .tc_install
= htb_tc_install
,
4889 .tc_load
= htb_tc_load
,
4890 .tc_destroy
= htb_tc_destroy
,
4891 .qdisc_get
= htb_qdisc_get
,
4892 .qdisc_set
= htb_qdisc_set
,
4893 .class_get
= htb_class_get
,
4894 .class_set
= htb_class_set
,
4895 .class_delete
= htb_class_delete
,
4896 .class_get_stats
= htb_class_get_stats
,
4897 .class_dump_stats
= htb_class_dump_stats
4900 /* "linux-hfsc" traffic control class. */
4902 #define HFSC_N_QUEUES 0xf000
4910 struct tc_queue tc_queue
;
4915 static struct hfsc
*
4916 hfsc_get__(const struct netdev
*netdev_
)
4918 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4919 return CONTAINER_OF(netdev
->tc
, struct hfsc
, tc
);
4922 static struct hfsc_class
*
4923 hfsc_class_cast__(const struct tc_queue
*queue
)
4925 return CONTAINER_OF(queue
, struct hfsc_class
, tc_queue
);
4929 hfsc_install__(struct netdev
*netdev_
, uint32_t max_rate
)
4931 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4934 hfsc
= xmalloc(sizeof *hfsc
);
4935 tc_init(&hfsc
->tc
, &tc_ops_hfsc
);
4936 hfsc
->max_rate
= max_rate
;
4937 netdev
->tc
= &hfsc
->tc
;
4941 hfsc_update_queue__(struct netdev
*netdev
, unsigned int queue_id
,
4942 const struct hfsc_class
*hc
)
4946 struct hfsc_class
*hcp
;
4947 struct tc_queue
*queue
;
4949 hfsc
= hfsc_get__(netdev
);
4950 hash
= hash_int(queue_id
, 0);
4952 queue
= tc_find_queue__(netdev
, queue_id
, hash
);
4954 hcp
= hfsc_class_cast__(queue
);
4956 hcp
= xmalloc(sizeof *hcp
);
4957 queue
= &hcp
->tc_queue
;
4958 queue
->queue_id
= queue_id
;
4959 queue
->created
= time_msec();
4960 hmap_insert(&hfsc
->tc
.queues
, &queue
->hmap_node
, hash
);
4963 hcp
->min_rate
= hc
->min_rate
;
4964 hcp
->max_rate
= hc
->max_rate
;
4968 hfsc_parse_tca_options__(struct nlattr
*nl_options
, struct hfsc_class
*class)
4970 const struct tc_service_curve
*rsc
, *fsc
, *usc
;
4971 static const struct nl_policy tca_hfsc_policy
[] = {
4973 .type
= NL_A_UNSPEC
,
4975 .min_len
= sizeof(struct tc_service_curve
),
4978 .type
= NL_A_UNSPEC
,
4980 .min_len
= sizeof(struct tc_service_curve
),
4983 .type
= NL_A_UNSPEC
,
4985 .min_len
= sizeof(struct tc_service_curve
),
4988 struct nlattr
*attrs
[ARRAY_SIZE(tca_hfsc_policy
)];
4990 if (!nl_parse_nested(nl_options
, tca_hfsc_policy
,
4991 attrs
, ARRAY_SIZE(tca_hfsc_policy
))) {
4992 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options");
4996 rsc
= nl_attr_get(attrs
[TCA_HFSC_RSC
]);
4997 fsc
= nl_attr_get(attrs
[TCA_HFSC_FSC
]);
4998 usc
= nl_attr_get(attrs
[TCA_HFSC_USC
]);
5000 if (rsc
->m1
!= 0 || rsc
->d
!= 0 ||
5001 fsc
->m1
!= 0 || fsc
->d
!= 0 ||
5002 usc
->m1
!= 0 || usc
->d
!= 0) {
5003 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
5004 "Non-linear service curves are not supported.");
5008 if (rsc
->m2
!= fsc
->m2
) {
5009 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
5010 "Real-time service curves are not supported ");
5014 if (rsc
->m2
> usc
->m2
) {
5015 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
5016 "Min-rate service curve is greater than "
5017 "the max-rate service curve.");
5021 class->min_rate
= fsc
->m2
;
5022 class->max_rate
= usc
->m2
;
5027 hfsc_parse_tcmsg__(struct ofpbuf
*tcmsg
, unsigned int *queue_id
,
5028 struct hfsc_class
*options
,
5029 struct netdev_queue_stats
*stats
)
5032 unsigned int handle
;
5033 struct nlattr
*nl_options
;
5035 error
= tc_parse_class(tcmsg
, &handle
, &nl_options
, stats
);
5041 unsigned int major
, minor
;
5043 major
= tc_get_major(handle
);
5044 minor
= tc_get_minor(handle
);
5045 if (major
== 1 && minor
> 0 && minor
<= HFSC_N_QUEUES
) {
5046 *queue_id
= minor
- 1;
5053 error
= hfsc_parse_tca_options__(nl_options
, options
);
5060 hfsc_query_class__(const struct netdev
*netdev
, unsigned int handle
,
5061 unsigned int parent
, struct hfsc_class
*options
,
5062 struct netdev_queue_stats
*stats
)
5065 struct ofpbuf
*reply
;
5067 error
= tc_query_class(netdev
, handle
, parent
, &reply
);
5072 error
= hfsc_parse_tcmsg__(reply
, NULL
, options
, stats
);
5073 ofpbuf_delete(reply
);
5078 hfsc_parse_qdisc_details__(struct netdev
*netdev_
, const struct smap
*details
,
5079 struct hfsc_class
*class)
5081 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5083 uint32_t max_rate
= smap_get_ullong(details
, "max-rate", 0) / 8;
5085 enum netdev_features current
;
5087 netdev_linux_read_features(netdev
);
5088 current
= !netdev
->get_features_error
? netdev
->current
: 0;
5089 max_rate
= netdev_features_to_bps(current
, 100 * 1000 * 1000) / 8;
5092 class->min_rate
= max_rate
;
5093 class->max_rate
= max_rate
;
5097 hfsc_parse_class_details__(struct netdev
*netdev
,
5098 const struct smap
*details
,
5099 struct hfsc_class
* class)
5101 const struct hfsc
*hfsc
;
5102 uint32_t min_rate
, max_rate
;
5104 hfsc
= hfsc_get__(netdev
);
5106 min_rate
= smap_get_ullong(details
, "min-rate", 0) / 8;
5107 min_rate
= MAX(min_rate
, 1);
5108 min_rate
= MIN(min_rate
, hfsc
->max_rate
);
5110 max_rate
= smap_get_ullong(details
, "max-rate", hfsc
->max_rate
* 8) / 8;
5111 max_rate
= MAX(max_rate
, min_rate
);
5112 max_rate
= MIN(max_rate
, hfsc
->max_rate
);
5114 class->min_rate
= min_rate
;
5115 class->max_rate
= max_rate
;
5120 /* Create an HFSC qdisc.
5122 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
5124 hfsc_setup_qdisc__(struct netdev
* netdev
)
5126 struct tcmsg
*tcmsg
;
5127 struct ofpbuf request
;
5128 struct tc_hfsc_qopt opt
;
5130 tc_del_qdisc(netdev
);
5132 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
5133 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
5139 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
5140 tcmsg
->tcm_parent
= TC_H_ROOT
;
5142 memset(&opt
, 0, sizeof opt
);
5145 nl_msg_put_string(&request
, TCA_KIND
, "hfsc");
5146 nl_msg_put_unspec(&request
, TCA_OPTIONS
, &opt
, sizeof opt
);
5148 return tc_transact(&request
, NULL
);
5151 /* Create an HFSC class.
5153 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
5154 * sc rate <min_rate> ul rate <max_rate>" */
5156 hfsc_setup_class__(struct netdev
*netdev
, unsigned int handle
,
5157 unsigned int parent
, struct hfsc_class
*class)
5161 struct tcmsg
*tcmsg
;
5162 struct ofpbuf request
;
5163 struct tc_service_curve min
, max
;
5165 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWTCLASS
, NLM_F_CREATE
,
5172 tcmsg
->tcm_handle
= handle
;
5173 tcmsg
->tcm_parent
= parent
;
5177 min
.m2
= class->min_rate
;
5181 max
.m2
= class->max_rate
;
5183 nl_msg_put_string(&request
, TCA_KIND
, "hfsc");
5184 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
5185 nl_msg_put_unspec(&request
, TCA_HFSC_RSC
, &min
, sizeof min
);
5186 nl_msg_put_unspec(&request
, TCA_HFSC_FSC
, &min
, sizeof min
);
5187 nl_msg_put_unspec(&request
, TCA_HFSC_USC
, &max
, sizeof max
);
5188 nl_msg_end_nested(&request
, opt_offset
);
5190 error
= tc_transact(&request
, NULL
);
5192 VLOG_WARN_RL(&rl
, "failed to replace %s class %u:%u, parent %u:%u, "
5193 "min-rate %ubps, max-rate %ubps (%s)",
5194 netdev_get_name(netdev
),
5195 tc_get_major(handle
), tc_get_minor(handle
),
5196 tc_get_major(parent
), tc_get_minor(parent
),
5197 class->min_rate
, class->max_rate
, ovs_strerror(error
));
5204 hfsc_tc_install(struct netdev
*netdev
, const struct smap
*details
)
5207 struct hfsc_class
class;
5209 error
= hfsc_setup_qdisc__(netdev
);
5215 hfsc_parse_qdisc_details__(netdev
, details
, &class);
5216 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
5217 tc_make_handle(1, 0), &class);
5223 hfsc_install__(netdev
, class.max_rate
);
5228 hfsc_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
5231 struct queue_dump_state state
;
5232 struct hfsc_class hc
;
5235 hfsc_query_class__(netdev
, tc_make_handle(1, 0xfffe), 0, &hc
, NULL
);
5236 hfsc_install__(netdev
, hc
.max_rate
);
5238 if (!start_queue_dump(netdev
, &state
)) {
5242 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
5243 unsigned int queue_id
;
5245 if (!hfsc_parse_tcmsg__(&msg
, &queue_id
, &hc
, NULL
)) {
5246 hfsc_update_queue__(netdev
, queue_id
, &hc
);
5250 finish_queue_dump(&state
);
5255 hfsc_tc_destroy(struct tc
*tc
)
5258 struct hfsc_class
*hc
, *next
;
5260 hfsc
= CONTAINER_OF(tc
, struct hfsc
, tc
);
5262 HMAP_FOR_EACH_SAFE (hc
, next
, tc_queue
.hmap_node
, &hfsc
->tc
.queues
) {
5263 hmap_remove(&hfsc
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
5272 hfsc_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
5274 const struct hfsc
*hfsc
;
5275 hfsc
= hfsc_get__(netdev
);
5276 smap_add_format(details
, "max-rate", "%llu", 8ULL * hfsc
->max_rate
);
5281 hfsc_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
5284 struct hfsc_class
class;
5286 hfsc_parse_qdisc_details__(netdev
, details
, &class);
5287 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
5288 tc_make_handle(1, 0), &class);
5291 hfsc_get__(netdev
)->max_rate
= class.max_rate
;
5298 hfsc_class_get(const struct netdev
*netdev OVS_UNUSED
,
5299 const struct tc_queue
*queue
, struct smap
*details
)
5301 const struct hfsc_class
*hc
;
5303 hc
= hfsc_class_cast__(queue
);
5304 smap_add_format(details
, "min-rate", "%llu", 8ULL * hc
->min_rate
);
5305 if (hc
->min_rate
!= hc
->max_rate
) {
5306 smap_add_format(details
, "max-rate", "%llu", 8ULL * hc
->max_rate
);
5312 hfsc_class_set(struct netdev
*netdev
, unsigned int queue_id
,
5313 const struct smap
*details
)
5316 struct hfsc_class
class;
5318 error
= hfsc_parse_class_details__(netdev
, details
, &class);
5323 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, queue_id
+ 1),
5324 tc_make_handle(1, 0xfffe), &class);
5329 hfsc_update_queue__(netdev
, queue_id
, &class);
5334 hfsc_class_delete(struct netdev
*netdev
, struct tc_queue
*queue
)
5338 struct hfsc_class
*hc
;
5340 hc
= hfsc_class_cast__(queue
);
5341 hfsc
= hfsc_get__(netdev
);
5343 error
= tc_delete_class(netdev
, tc_make_handle(1, queue
->queue_id
+ 1));
5345 hmap_remove(&hfsc
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
5352 hfsc_class_get_stats(const struct netdev
*netdev
, const struct tc_queue
*queue
,
5353 struct netdev_queue_stats
*stats
)
5355 return hfsc_query_class__(netdev
, tc_make_handle(1, queue
->queue_id
+ 1),
5356 tc_make_handle(1, 0xfffe), NULL
, stats
);
5360 hfsc_class_dump_stats(const struct netdev
*netdev OVS_UNUSED
,
5361 const struct ofpbuf
*nlmsg
,
5362 netdev_dump_queue_stats_cb
*cb
, void *aux
)
5364 struct netdev_queue_stats stats
;
5365 unsigned int handle
, major
, minor
;
5368 error
= tc_parse_class(nlmsg
, &handle
, NULL
, &stats
);
5373 major
= tc_get_major(handle
);
5374 minor
= tc_get_minor(handle
);
5375 if (major
== 1 && minor
> 0 && minor
<= HFSC_N_QUEUES
) {
5376 (*cb
)(minor
- 1, &stats
, aux
);
5381 static const struct tc_ops tc_ops_hfsc
= {
5382 .linux_name
= "hfsc",
5383 .ovs_name
= "linux-hfsc",
5384 .n_queues
= HFSC_N_QUEUES
, /* n_queues */
5385 .tc_install
= hfsc_tc_install
,
5386 .tc_load
= hfsc_tc_load
,
5387 .tc_destroy
= hfsc_tc_destroy
,
5388 .qdisc_get
= hfsc_qdisc_get
,
5389 .qdisc_set
= hfsc_qdisc_set
,
5390 .class_get
= hfsc_class_get
,
5391 .class_set
= hfsc_class_set
,
5392 .class_delete
= hfsc_class_delete
,
5393 .class_get_stats
= hfsc_class_get_stats
,
5394 .class_dump_stats
= hfsc_class_dump_stats
,
5397 /* "linux-noop" traffic control class. */
5400 noop_install__(struct netdev
*netdev_
)
5402 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5403 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_default
);
5405 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
5409 noop_tc_install(struct netdev
*netdev
,
5410 const struct smap
*details OVS_UNUSED
)
5412 noop_install__(netdev
);
5417 noop_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
5419 noop_install__(netdev
);
5423 static const struct tc_ops tc_ops_noop
= {
5424 .ovs_name
= "linux-noop", /* ovs_name */
5425 .tc_install
= noop_tc_install
,
5426 .tc_load
= noop_tc_load
,
5429 /* "linux-default" traffic control class.
5431 * This class represents the default, unnamed Linux qdisc. It corresponds to
5432 * the "" (empty string) QoS type in the OVS database. */
5435 default_install__(struct netdev
*netdev_
)
5437 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5438 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_default
);
5440 /* Nothing but a tc class implementation is allowed to write to a tc. This
5441 * class never does that, so we can legitimately use a const tc object. */
5442 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
5446 default_tc_install(struct netdev
*netdev
,
5447 const struct smap
*details OVS_UNUSED
)
5449 default_install__(netdev
);
5454 default_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
5456 default_install__(netdev
);
5460 static const struct tc_ops tc_ops_default
= {
5461 .ovs_name
= "", /* ovs_name */
5462 .tc_install
= default_tc_install
,
5463 .tc_load
= default_tc_load
,
5466 /* "linux-other" traffic control class.
5471 other_tc_load(struct netdev
*netdev_
, struct ofpbuf
*nlmsg OVS_UNUSED
)
5473 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5474 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_other
);
5476 /* Nothing but a tc class implementation is allowed to write to a tc. This
5477 * class never does that, so we can legitimately use a const tc object. */
5478 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
5482 static const struct tc_ops tc_ops_other
= {
5483 .ovs_name
= "linux-other",
5484 .tc_load
= other_tc_load
,
5487 /* Traffic control. */
5489 /* Number of kernel "tc" ticks per second. */
5490 static double ticks_per_s
;
5492 /* Number of kernel "jiffies" per second. This is used for the purpose of
5493 * computing buffer sizes. Generally kernel qdiscs need to be able to buffer
5494 * one jiffy's worth of data.
5496 * There are two possibilities here:
5498 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
5499 * approximate range of 100 to 1024. That means that we really need to
5500 * make sure that the qdisc can buffer that much data.
5502 * - 'buffer_hz' is an absurdly large number. That means that the kernel
5503 * has finely granular timers and there's no need to fudge additional room
5504 * for buffers. (There's no extra effort needed to implement that: the
5505 * large 'buffer_hz' is used as a divisor, so practically any number will
5506 * come out as 0 in the division. Small integer results in the case of
5507 * really high dividends won't have any real effect anyhow.)
5509 static unsigned int buffer_hz
;
5511 static struct tcmsg
*
5512 netdev_linux_tc_make_request(const struct netdev
*netdev
, int type
,
5513 unsigned int flags
, struct ofpbuf
*request
)
5518 error
= get_ifindex(netdev
, &ifindex
);
5523 return tc_make_request(ifindex
, type
, flags
, request
);
5526 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
5529 * This function is equivalent to running:
5530 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
5531 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
5534 * The configuration and stats may be seen with the following command:
5535 * /sbin/tc -s filter show dev <devname> parent ffff:
5537 * Returns 0 if successful, otherwise a positive errno value.
5540 tc_add_policer(struct netdev
*netdev
,
5541 uint32_t kbits_rate
, uint32_t kbits_burst
)
5543 struct tc_police tc_police
;
5544 struct ofpbuf request
;
5545 struct tcmsg
*tcmsg
;
5546 size_t basic_offset
;
5547 size_t police_offset
;
5551 memset(&tc_police
, 0, sizeof tc_police
);
5552 tc_police
.action
= TC_POLICE_SHOT
;
5553 tc_police
.mtu
= mtu
;
5554 tc_fill_rate(&tc_police
.rate
, ((uint64_t) kbits_rate
* 1000)/8, mtu
);
5556 /* The following appears wrong in one way: In networking a kilobit is
5557 * usually 1000 bits but this uses 1024 bits.
5559 * However if you "fix" those problems then "tc filter show ..." shows
5560 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
5561 * 1,000,000 bits, whereas this actually ends up doing the right thing from
5562 * tc's point of view. Whatever. */
5563 tc_police
.burst
= tc_bytes_to_ticks(
5564 tc_police
.rate
.rate
, MIN(UINT32_MAX
/ 1024, kbits_burst
) * 1024 / 8);
5566 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWTFILTER
,
5567 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
5571 tcmsg
->tcm_parent
= tc_make_handle(0xffff, 0);
5572 tcmsg
->tcm_info
= tc_make_handle(49,
5573 (OVS_FORCE
uint16_t) htons(ETH_P_ALL
));
5575 nl_msg_put_string(&request
, TCA_KIND
, "basic");
5576 basic_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
5577 police_offset
= nl_msg_start_nested(&request
, TCA_BASIC_POLICE
);
5578 nl_msg_put_unspec(&request
, TCA_POLICE_TBF
, &tc_police
, sizeof tc_police
);
5579 tc_put_rtab(&request
, TCA_POLICE_RATE
, &tc_police
.rate
);
5580 nl_msg_end_nested(&request
, police_offset
);
5581 nl_msg_end_nested(&request
, basic_offset
);
5583 error
= tc_transact(&request
, NULL
);
5594 /* The values in psched are not individually very meaningful, but they are
5595 * important. The tables below show some values seen in the wild.
5599 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
5600 * (Before that, there are hints that it was 1000000000.)
5602 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
5606 * -----------------------------------
5607 * [1] 000c8000 000f4240 000f4240 00000064
5608 * [2] 000003e8 00000400 000f4240 3b9aca00
5609 * [3] 000003e8 00000400 000f4240 3b9aca00
5610 * [4] 000003e8 00000400 000f4240 00000064
5611 * [5] 000003e8 00000040 000f4240 3b9aca00
5612 * [6] 000003e8 00000040 000f4240 000000f9
5614 * a b c d ticks_per_s buffer_hz
5615 * ------- --------- ---------- ------------- ----------- -------------
5616 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
5617 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
5618 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
5619 * [4] 1,000 1,024 1,000,000 100 976,562 100
5620 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
5621 * [6] 1,000 64 1,000,000 249 15,625,000 249
5623 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
5624 * [2] 2.6.26-1-686-bigmem from Debian lenny
5625 * [3] 2.6.26-2-sparc64 from Debian lenny
5626 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
5627 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
5628 * [6] 2.6.34 from kernel.org on KVM
5630 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
5631 static const char fn
[] = "/proc/net/psched";
5632 unsigned int a
, b
, c
, d
;
5635 if (!ovsthread_once_start(&once
)) {
5642 stream
= fopen(fn
, "r");
5644 VLOG_WARN("%s: open failed: %s", fn
, ovs_strerror(errno
));
5648 if (fscanf(stream
, "%x %x %x %x", &a
, &b
, &c
, &d
) != 4) {
5649 VLOG_WARN("%s: read failed", fn
);
5653 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn
, a
, b
, c
, d
);
5656 if (!a
|| !b
|| !c
) {
5657 VLOG_WARN("%s: invalid scheduler parameters", fn
);
5661 ticks_per_s
= (double) a
* c
/ b
;
5665 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
5668 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn
, ticks_per_s
, buffer_hz
);
5671 ovsthread_once_done(&once
);
5674 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
5675 * rate of 'rate' bytes per second. */
5677 tc_ticks_to_bytes(unsigned int rate
, unsigned int ticks
)
5680 return (rate
* ticks
) / ticks_per_s
;
5683 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
5684 * rate of 'rate' bytes per second. */
5686 tc_bytes_to_ticks(unsigned int rate
, unsigned int size
)
5689 return rate
? ((unsigned long long int) ticks_per_s
* size
) / rate
: 0;
5692 /* Returns the number of bytes that need to be reserved for qdisc buffering at
5693 * a transmission rate of 'rate' bytes per second. */
5695 tc_buffer_per_jiffy(unsigned int rate
)
5698 return rate
/ buffer_hz
;
5702 tc_time_to_ticks(uint32_t time
) {
5704 return time
* (ticks_per_s
/ 1000000);
5707 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
5708 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
5709 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
5710 * stores NULL into it if it is absent.
5712 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
5715 * Returns 0 if successful, otherwise a positive errno value. */
5717 tc_parse_qdisc(const struct ofpbuf
*msg
, const char **kind
,
5718 struct nlattr
**options
)
5720 static const struct nl_policy tca_policy
[] = {
5721 [TCA_KIND
] = { .type
= NL_A_STRING
, .optional
= false },
5722 [TCA_OPTIONS
] = { .type
= NL_A_NESTED
, .optional
= true },
5724 struct nlattr
*ta
[ARRAY_SIZE(tca_policy
)];
5726 if (!nl_policy_parse(msg
, NLMSG_HDRLEN
+ sizeof(struct tcmsg
),
5727 tca_policy
, ta
, ARRAY_SIZE(ta
))) {
5728 VLOG_WARN_RL(&rl
, "failed to parse qdisc message");
5733 *kind
= nl_attr_get_string(ta
[TCA_KIND
]);
5737 *options
= ta
[TCA_OPTIONS
];
5752 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
5753 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
5754 * into '*options', and its queue statistics into '*stats'. Any of the output
5755 * arguments may be null.
5757 * Returns 0 if successful, otherwise a positive errno value. */
5759 tc_parse_class(const struct ofpbuf
*msg
, unsigned int *handlep
,
5760 struct nlattr
**options
, struct netdev_queue_stats
*stats
)
5762 static const struct nl_policy tca_policy
[] = {
5763 [TCA_OPTIONS
] = { .type
= NL_A_NESTED
, .optional
= false },
5764 [TCA_STATS2
] = { .type
= NL_A_NESTED
, .optional
= false },
5766 struct nlattr
*ta
[ARRAY_SIZE(tca_policy
)];
5768 if (!nl_policy_parse(msg
, NLMSG_HDRLEN
+ sizeof(struct tcmsg
),
5769 tca_policy
, ta
, ARRAY_SIZE(ta
))) {
5770 VLOG_WARN_RL(&rl
, "failed to parse class message");
5775 struct tcmsg
*tc
= ofpbuf_at_assert(msg
, NLMSG_HDRLEN
, sizeof *tc
);
5776 *handlep
= tc
->tcm_handle
;
5780 *options
= ta
[TCA_OPTIONS
];
5784 const struct gnet_stats_queue
*gsq
;
5785 struct gnet_stats_basic gsb
;
5787 static const struct nl_policy stats_policy
[] = {
5788 [TCA_STATS_BASIC
] = { .type
= NL_A_UNSPEC
, .optional
= false,
5789 .min_len
= sizeof gsb
},
5790 [TCA_STATS_QUEUE
] = { .type
= NL_A_UNSPEC
, .optional
= false,
5791 .min_len
= sizeof *gsq
},
5793 struct nlattr
*sa
[ARRAY_SIZE(stats_policy
)];
5795 if (!nl_parse_nested(ta
[TCA_STATS2
], stats_policy
,
5796 sa
, ARRAY_SIZE(sa
))) {
5797 VLOG_WARN_RL(&rl
, "failed to parse class stats");
5801 /* Alignment issues screw up the length of struct gnet_stats_basic on
5802 * some arch/bitsize combinations. Newer versions of Linux have a
5803 * struct gnet_stats_basic_packed, but we can't depend on that. The
5804 * easiest thing to do is just to make a copy. */
5805 memset(&gsb
, 0, sizeof gsb
);
5806 memcpy(&gsb
, nl_attr_get(sa
[TCA_STATS_BASIC
]),
5807 MIN(nl_attr_get_size(sa
[TCA_STATS_BASIC
]), sizeof gsb
));
5808 stats
->tx_bytes
= gsb
.bytes
;
5809 stats
->tx_packets
= gsb
.packets
;
5811 gsq
= nl_attr_get(sa
[TCA_STATS_QUEUE
]);
5812 stats
->tx_errors
= gsq
->drops
;
5822 memset(stats
, 0, sizeof *stats
);
5827 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
5830 tc_query_class(const struct netdev
*netdev
,
5831 unsigned int handle
, unsigned int parent
,
5832 struct ofpbuf
**replyp
)
5834 struct ofpbuf request
;
5835 struct tcmsg
*tcmsg
;
5838 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_GETTCLASS
, NLM_F_ECHO
,
5843 tcmsg
->tcm_handle
= handle
;
5844 tcmsg
->tcm_parent
= parent
;
5846 error
= tc_transact(&request
, replyp
);
5848 VLOG_WARN_RL(&rl
, "query %s class %u:%u (parent %u:%u) failed (%s)",
5849 netdev_get_name(netdev
),
5850 tc_get_major(handle
), tc_get_minor(handle
),
5851 tc_get_major(parent
), tc_get_minor(parent
),
5852 ovs_strerror(error
));
5857 /* Equivalent to "tc class del dev <name> handle <handle>". */
5859 tc_delete_class(const struct netdev
*netdev
, unsigned int handle
)
5861 struct ofpbuf request
;
5862 struct tcmsg
*tcmsg
;
5865 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_DELTCLASS
, 0, &request
);
5869 tcmsg
->tcm_handle
= handle
;
5870 tcmsg
->tcm_parent
= 0;
5872 error
= tc_transact(&request
, NULL
);
5874 VLOG_WARN_RL(&rl
, "delete %s class %u:%u failed (%s)",
5875 netdev_get_name(netdev
),
5876 tc_get_major(handle
), tc_get_minor(handle
),
5877 ovs_strerror(error
));
5882 /* Equivalent to "tc qdisc del dev <name> root". */
5884 tc_del_qdisc(struct netdev
*netdev_
)
5886 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5887 struct ofpbuf request
;
5888 struct tcmsg
*tcmsg
;
5891 tcmsg
= netdev_linux_tc_make_request(netdev_
, RTM_DELQDISC
, 0, &request
);
5895 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
5896 tcmsg
->tcm_parent
= TC_H_ROOT
;
5898 error
= tc_transact(&request
, NULL
);
5899 if (error
== EINVAL
) {
5900 /* EINVAL probably means that the default qdisc was in use, in which
5901 * case we've accomplished our purpose. */
5904 if (!error
&& netdev
->tc
) {
5905 if (netdev
->tc
->ops
->tc_destroy
) {
5906 netdev
->tc
->ops
->tc_destroy(netdev
->tc
);
5914 getqdisc_is_safe(void)
5916 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
5917 static bool safe
= false;
5919 if (ovsthread_once_start(&once
)) {
5920 struct utsname utsname
;
5923 if (uname(&utsname
) == -1) {
5924 VLOG_WARN("uname failed (%s)", ovs_strerror(errno
));
5925 } else if (!ovs_scan(utsname
.release
, "%d.%d", &major
, &minor
)) {
5926 VLOG_WARN("uname reported bad OS release (%s)", utsname
.release
);
5927 } else if (major
< 2 || (major
== 2 && minor
< 35)) {
5928 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5933 ovsthread_once_done(&once
);
5938 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
5939 * kernel to determine what they are. Returns 0 if successful, otherwise a
5940 * positive errno value. */
5942 tc_query_qdisc(const struct netdev
*netdev_
)
5944 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5945 struct ofpbuf request
, *qdisc
;
5946 const struct tc_ops
*ops
;
5947 struct tcmsg
*tcmsg
;
5955 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5956 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5957 * 2.6.35 without that fix backported to it.
5959 * To avoid the OOPS, we must not make a request that would attempt to dump
5960 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5961 * few others. There are a few ways that I can see to do this, but most of
5962 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5963 * technique chosen here is to assume that any non-default qdisc that we
5964 * create will have a class with handle 1:0. The built-in qdiscs only have
5965 * a class with handle 0:0.
5967 * On Linux 2.6.35+ we use the straightforward method because it allows us
5968 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5969 * in such a case we get no response at all from the kernel (!) if a
5970 * builtin qdisc is in use (which is later caught by "!error &&
5971 * !qdisc->size"). */
5972 tcmsg
= netdev_linux_tc_make_request(netdev_
, RTM_GETQDISC
, NLM_F_ECHO
,
5977 tcmsg
->tcm_handle
= tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5978 tcmsg
->tcm_parent
= getqdisc_is_safe() ? TC_H_ROOT
: 0;
5980 /* Figure out what tc class to instantiate. */
5981 error
= tc_transact(&request
, &qdisc
);
5982 if (!error
&& qdisc
->size
) {
5985 error
= tc_parse_qdisc(qdisc
, &kind
, NULL
);
5987 ops
= &tc_ops_other
;
5989 ops
= tc_lookup_linux_name(kind
);
5991 static struct vlog_rate_limit rl2
= VLOG_RATE_LIMIT_INIT(1, 1);
5992 VLOG_DBG_RL(&rl2
, "unknown qdisc \"%s\"", kind
);
5994 ops
= &tc_ops_other
;
5997 } else if ((!error
&& !qdisc
->size
) || error
== ENOENT
) {
5998 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5999 * set up by some other entity that doesn't have a handle 1:0. We will
6000 * assume that it's the system default qdisc. */
6001 ops
= &tc_ops_default
;
6004 /* Who knows? Maybe the device got deleted. */
6005 VLOG_WARN_RL(&rl
, "query %s qdisc failed (%s)",
6006 netdev_get_name(netdev_
), ovs_strerror(error
));
6007 ops
= &tc_ops_other
;
6010 /* Instantiate it. */
6011 load_error
= ops
->tc_load(CONST_CAST(struct netdev
*, netdev_
), qdisc
);
6012 ovs_assert((load_error
== 0) == (netdev
->tc
!= NULL
));
6013 ofpbuf_delete(qdisc
);
6015 return error
? error
: load_error
;
6018 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
6019 approximate the time to transmit packets of various lengths. For an MTU of
6020 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
6021 represents two possible packet lengths; for a MTU of 513 through 1024, four
6022 possible lengths; and so on.
6024 Returns, for the specified 'mtu', the number of bits that packet lengths
6025 need to be shifted right to fit within such a 256-entry table. */
6027 tc_calc_cell_log(unsigned int mtu
)
6032 mtu
= ETH_PAYLOAD_MAX
;
6034 mtu
+= ETH_HEADER_LEN
+ VLAN_HEADER_LEN
;
6036 for (cell_log
= 0; mtu
>= 256; cell_log
++) {
6043 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
6046 tc_fill_rate(struct tc_ratespec
*rate
, uint64_t Bps
, int mtu
)
6048 memset(rate
, 0, sizeof *rate
);
6049 rate
->cell_log
= tc_calc_cell_log(mtu
);
6050 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
6051 /* rate->cell_align = 0; */ /* distro headers. */
6052 rate
->mpu
= ETH_TOTAL_MIN
;
6056 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
6057 * attribute of the specified "type".
6059 * See tc_calc_cell_log() above for a description of "rtab"s. */
6061 tc_put_rtab(struct ofpbuf
*msg
, uint16_t type
, const struct tc_ratespec
*rate
)
6066 rtab
= nl_msg_put_unspec_uninit(msg
, type
, TC_RTAB_SIZE
);
6067 for (i
= 0; i
< TC_RTAB_SIZE
/ sizeof *rtab
; i
++) {
6068 unsigned packet_size
= (i
+ 1) << rate
->cell_log
;
6069 if (packet_size
< rate
->mpu
) {
6070 packet_size
= rate
->mpu
;
6072 rtab
[i
] = tc_bytes_to_ticks(rate
->rate
, packet_size
);
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no burst was requested, pass 0.) */
static unsigned int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    /* The qdisc must always be able to buffer at least one jiffy's worth of
     * traffic plus one full packet, no matter how small a burst the user
     * asked for. */
    unsigned int floor_bytes = tc_buffer_per_jiffy(Bps) + mtu;
    uint64_t burst = burst_bytes > floor_bytes ? burst_bytes : floor_bytes;

    return tc_bytes_to_ticks(Bps, burst);
}
6087 /* Linux-only functions declared in netdev-linux.h */
6089 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
6090 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
6092 netdev_linux_ethtool_set_flag(struct netdev
*netdev
, uint32_t flag
,
6093 const char *flag_name
, bool enable
)
6095 const char *netdev_name
= netdev_get_name(netdev
);
6096 struct ethtool_value evalue
;
6100 COVERAGE_INC(netdev_get_ethtool
);
6101 memset(&evalue
, 0, sizeof evalue
);
6102 error
= netdev_linux_do_ethtool(netdev_name
,
6103 (struct ethtool_cmd
*)&evalue
,
6104 ETHTOOL_GFLAGS
, "ETHTOOL_GFLAGS");
6109 COVERAGE_INC(netdev_set_ethtool
);
6110 new_flags
= (evalue
.data
& ~flag
) | (enable
? flag
: 0);
6111 if (new_flags
== evalue
.data
) {
6114 evalue
.data
= new_flags
;
6115 error
= netdev_linux_do_ethtool(netdev_name
,
6116 (struct ethtool_cmd
*)&evalue
,
6117 ETHTOOL_SFLAGS
, "ETHTOOL_SFLAGS");
6122 COVERAGE_INC(netdev_get_ethtool
);
6123 memset(&evalue
, 0, sizeof evalue
);
6124 error
= netdev_linux_do_ethtool(netdev_name
,
6125 (struct ethtool_cmd
*)&evalue
,
6126 ETHTOOL_GFLAGS
, "ETHTOOL_GFLAGS");
6131 if (new_flags
!= evalue
.data
) {
6132 VLOG_WARN_RL(&rl
, "attempt to %s ethtool %s flag on network "
6133 "device %s failed", enable
? "enable" : "disable",
6134 flag_name
, netdev_name
);
6141 /* Utility functions. */
6143 /* Copies 'src' into 'dst', performing format conversion in the process. */
6145 netdev_stats_from_rtnl_link_stats(struct netdev_stats
*dst
,
6146 const struct rtnl_link_stats
*src
)
6148 dst
->rx_packets
= src
->rx_packets
;
6149 dst
->tx_packets
= src
->tx_packets
;
6150 dst
->rx_bytes
= src
->rx_bytes
;
6151 dst
->tx_bytes
= src
->tx_bytes
;
6152 dst
->rx_errors
= src
->rx_errors
;
6153 dst
->tx_errors
= src
->tx_errors
;
6154 dst
->rx_dropped
= src
->rx_dropped
;
6155 dst
->tx_dropped
= src
->tx_dropped
;
6156 dst
->multicast
= src
->multicast
;
6157 dst
->collisions
= src
->collisions
;
6158 dst
->rx_length_errors
= src
->rx_length_errors
;
6159 dst
->rx_over_errors
= src
->rx_over_errors
;
6160 dst
->rx_crc_errors
= src
->rx_crc_errors
;
6161 dst
->rx_frame_errors
= src
->rx_frame_errors
;
6162 dst
->rx_fifo_errors
= src
->rx_fifo_errors
;
6163 dst
->rx_missed_errors
= src
->rx_missed_errors
;
6164 dst
->tx_aborted_errors
= src
->tx_aborted_errors
;
6165 dst
->tx_carrier_errors
= src
->tx_carrier_errors
;
6166 dst
->tx_fifo_errors
= src
->tx_fifo_errors
;
6167 dst
->tx_heartbeat_errors
= src
->tx_heartbeat_errors
;
6168 dst
->tx_window_errors
= src
->tx_window_errors
;
6171 /* Copies 'src' into 'dst', performing format conversion in the process. */
6173 netdev_stats_from_rtnl_link_stats64(struct netdev_stats
*dst
,
6174 const struct rtnl_link_stats64
*src
)
6176 dst
->rx_packets
= src
->rx_packets
;
6177 dst
->tx_packets
= src
->tx_packets
;
6178 dst
->rx_bytes
= src
->rx_bytes
;
6179 dst
->tx_bytes
= src
->tx_bytes
;
6180 dst
->rx_errors
= src
->rx_errors
;
6181 dst
->tx_errors
= src
->tx_errors
;
6182 dst
->rx_dropped
= src
->rx_dropped
;
6183 dst
->tx_dropped
= src
->tx_dropped
;
6184 dst
->multicast
= src
->multicast
;
6185 dst
->collisions
= src
->collisions
;
6186 dst
->rx_length_errors
= src
->rx_length_errors
;
6187 dst
->rx_over_errors
= src
->rx_over_errors
;
6188 dst
->rx_crc_errors
= src
->rx_crc_errors
;
6189 dst
->rx_frame_errors
= src
->rx_frame_errors
;
6190 dst
->rx_fifo_errors
= src
->rx_fifo_errors
;
6191 dst
->rx_missed_errors
= src
->rx_missed_errors
;
6192 dst
->tx_aborted_errors
= src
->tx_aborted_errors
;
6193 dst
->tx_carrier_errors
= src
->tx_carrier_errors
;
6194 dst
->tx_fifo_errors
= src
->tx_fifo_errors
;
6195 dst
->tx_heartbeat_errors
= src
->tx_heartbeat_errors
;
6196 dst
->tx_window_errors
= src
->tx_window_errors
;
6200 get_stats_via_netlink(const struct netdev
*netdev_
, struct netdev_stats
*stats
)
6202 struct ofpbuf request
;
6203 struct ofpbuf
*reply
;
6206 /* Filtering all counters by default */
6207 memset(stats
, 0xFF, sizeof(struct netdev_stats
));
6209 ofpbuf_init(&request
, 0);
6210 nl_msg_put_nlmsghdr(&request
,
6211 sizeof(struct ifinfomsg
) + NL_ATTR_SIZE(IFNAMSIZ
),
6212 RTM_GETLINK
, NLM_F_REQUEST
);
6213 ofpbuf_put_zeros(&request
, sizeof(struct ifinfomsg
));
6214 nl_msg_put_string(&request
, IFLA_IFNAME
, netdev_get_name(netdev_
));
6215 error
= nl_transact(NETLINK_ROUTE
, &request
, &reply
);
6216 ofpbuf_uninit(&request
);
6221 if (ofpbuf_try_pull(reply
, NLMSG_HDRLEN
+ sizeof(struct ifinfomsg
))) {
6222 const struct nlattr
*a
= nl_attr_find(reply
, 0, IFLA_STATS64
);
6223 if (a
&& nl_attr_get_size(a
) >= sizeof(struct rtnl_link_stats64
)) {
6224 netdev_stats_from_rtnl_link_stats64(stats
, nl_attr_get(a
));
6227 a
= nl_attr_find(reply
, 0, IFLA_STATS
);
6228 if (a
&& nl_attr_get_size(a
) >= sizeof(struct rtnl_link_stats
)) {
6229 netdev_stats_from_rtnl_link_stats(stats
, nl_attr_get(a
));
6232 VLOG_WARN_RL(&rl
, "RTM_GETLINK reply lacks stats");
6237 VLOG_WARN_RL(&rl
, "short RTM_GETLINK reply");
6242 ofpbuf_delete(reply
);
6247 get_flags(const struct netdev
*dev
, unsigned int *flags
)
6253 error
= af_inet_ifreq_ioctl(dev
->name
, &ifr
, SIOCGIFFLAGS
, "SIOCGIFFLAGS");
6255 *flags
= ifr
.ifr_flags
;
6261 set_flags(const char *name
, unsigned int flags
)
6265 ifr
.ifr_flags
= flags
;
6266 return af_inet_ifreq_ioctl(name
, &ifr
, SIOCSIFFLAGS
, "SIOCSIFFLAGS");
6270 linux_get_ifindex(const char *netdev_name
)
6275 ovs_strzcpy(ifr
.ifr_name
, netdev_name
, sizeof ifr
.ifr_name
);
6276 COVERAGE_INC(netdev_get_ifindex
);
6278 error
= af_inet_ioctl(SIOCGIFINDEX
, &ifr
);
6280 /* ENODEV probably means that a vif disappeared asynchronously and
6281 * hasn't been removed from the database yet, so reduce the log level
6282 * to INFO for that case. */
6283 VLOG_RL(&rl
, error
== ENODEV
? VLL_INFO
: VLL_ERR
,
6284 "ioctl(SIOCGIFINDEX) on %s device failed: %s",
6285 netdev_name
, ovs_strerror(error
));
6288 return ifr
.ifr_ifindex
;
6292 get_ifindex(const struct netdev
*netdev_
, int *ifindexp
)
6294 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
6296 if (!(netdev
->cache_valid
& VALID_IFINDEX
)) {
6297 netdev_linux_update_via_netlink(netdev
);
6300 if (!(netdev
->cache_valid
& VALID_IFINDEX
)) {
6301 /* Fall back to ioctl if netlink fails */
6302 int ifindex
= linux_get_ifindex(netdev_get_name(netdev_
));
6305 netdev
->get_ifindex_error
= -ifindex
;
6306 netdev
->ifindex
= 0;
6308 netdev
->get_ifindex_error
= 0;
6309 netdev
->ifindex
= ifindex
;
6311 netdev
->cache_valid
|= VALID_IFINDEX
;
6314 *ifindexp
= netdev
->ifindex
;
6315 return netdev
->get_ifindex_error
;
6319 netdev_linux_update_via_netlink(struct netdev_linux
*netdev
)
6321 struct ofpbuf request
;
6322 struct ofpbuf
*reply
;
6323 struct rtnetlink_change chg
;
6324 struct rtnetlink_change
*change
= &chg
;
6327 ofpbuf_init(&request
, 0);
6328 nl_msg_put_nlmsghdr(&request
,
6329 sizeof(struct ifinfomsg
) + NL_ATTR_SIZE(IFNAMSIZ
) +
6330 NL_A_U32_SIZE
, RTM_GETLINK
, NLM_F_REQUEST
);
6331 ofpbuf_put_zeros(&request
, sizeof(struct ifinfomsg
));
6333 /* The correct identifiers for a Linux device are netnsid and ifindex,
6334 * but ifindex changes as the port is moved to another network namespace
6335 * and the interface name statically stored in ovsdb. */
6336 nl_msg_put_string(&request
, IFLA_IFNAME
, netdev_get_name(&netdev
->up
));
6337 if (netdev_linux_netnsid_is_remote(netdev
)) {
6338 nl_msg_put_u32(&request
, IFLA_IF_NETNSID
, netdev
->netnsid
);
6340 error
= nl_transact(NETLINK_ROUTE
, &request
, &reply
);
6341 ofpbuf_uninit(&request
);
6343 ofpbuf_delete(reply
);
6347 if (rtnetlink_parse(reply
, change
)
6348 && change
->nlmsg_type
== RTM_NEWLINK
) {
6349 bool changed
= false;
6352 /* Update netdev from rtnl msg and increment its seq if needed. */
6353 if ((change
->ifi_flags
^ netdev
->ifi_flags
) & IFF_RUNNING
) {
6354 netdev
->carrier_resets
++;
6357 if (change
->ifi_flags
!= netdev
->ifi_flags
) {
6358 netdev
->ifi_flags
= change
->ifi_flags
;
6361 if (change
->mtu
&& change
->mtu
!= netdev
->mtu
) {
6362 netdev
->mtu
= change
->mtu
;
6363 netdev
->cache_valid
|= VALID_MTU
;
6364 netdev
->netdev_mtu_error
= 0;
6367 if (!eth_addr_is_zero(change
->mac
)
6368 && !eth_addr_equals(change
->mac
, netdev
->etheraddr
)) {
6369 netdev
->etheraddr
= change
->mac
;
6370 netdev
->cache_valid
|= VALID_ETHERADDR
;
6371 netdev
->ether_addr_error
= 0;
6374 if (change
->if_index
!= netdev
->ifindex
) {
6375 netdev
->ifindex
= change
->if_index
;
6376 netdev
->cache_valid
|= VALID_IFINDEX
;
6377 netdev
->get_ifindex_error
= 0;
6380 if (change
->master
&& netdev_linux_kind_is_lag(change
->master
)) {
6381 netdev
->is_lag_master
= true;
6384 netdev_change_seq_changed(&netdev
->up
);
6390 ofpbuf_delete(reply
);
6395 get_etheraddr(const char *netdev_name
, struct eth_addr
*ea
)
6401 memset(&ifr
, 0, sizeof ifr
);
6402 ovs_strzcpy(ifr
.ifr_name
, netdev_name
, sizeof ifr
.ifr_name
);
6403 COVERAGE_INC(netdev_get_hwaddr
);
6404 error
= af_inet_ioctl(SIOCGIFHWADDR
, &ifr
);
6406 /* ENODEV probably means that a vif disappeared asynchronously and
6407 * hasn't been removed from the database yet, so reduce the log level
6408 * to INFO for that case. */
6409 VLOG(error
== ENODEV
? VLL_INFO
: VLL_ERR
,
6410 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
6411 netdev_name
, ovs_strerror(error
));
6414 hwaddr_family
= ifr
.ifr_hwaddr
.sa_family
;
6415 if (hwaddr_family
!= AF_UNSPEC
&& hwaddr_family
!= ARPHRD_ETHER
&&
6416 hwaddr_family
!= ARPHRD_NONE
) {
6417 VLOG_INFO("%s device has unknown hardware address family %d",
6418 netdev_name
, hwaddr_family
);
6421 memcpy(ea
, ifr
.ifr_hwaddr
.sa_data
, ETH_ADDR_LEN
);
6426 set_etheraddr(const char *netdev_name
, const struct eth_addr mac
)
6431 memset(&ifr
, 0, sizeof ifr
);
6432 ovs_strzcpy(ifr
.ifr_name
, netdev_name
, sizeof ifr
.ifr_name
);
6433 ifr
.ifr_hwaddr
.sa_family
= ARPHRD_ETHER
;
6434 memcpy(ifr
.ifr_hwaddr
.sa_data
, &mac
, ETH_ADDR_LEN
);
6435 COVERAGE_INC(netdev_set_hwaddr
);
6436 error
= af_inet_ioctl(SIOCSIFHWADDR
, &ifr
);
6438 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
6439 netdev_name
, ovs_strerror(error
));
6445 netdev_linux_do_ethtool(const char *name
, struct ethtool_cmd
*ecmd
,
6446 int cmd
, const char *cmd_name
)
6451 memset(&ifr
, 0, sizeof ifr
);
6452 ovs_strzcpy(ifr
.ifr_name
, name
, sizeof ifr
.ifr_name
);
6453 ifr
.ifr_data
= (caddr_t
) ecmd
;
6456 error
= af_inet_ioctl(SIOCETHTOOL
, &ifr
);
6458 if (error
!= EOPNOTSUPP
) {
6459 VLOG_WARN_RL(&rl
, "ethtool command %s on network device %s "
6460 "failed: %s", cmd_name
, name
, ovs_strerror(error
));
6462 /* The device doesn't support this operation. That's pretty
6463 * common, so there's no point in logging anything. */
6469 /* Returns an AF_PACKET raw socket or a negative errno value. */
6471 af_packet_sock(void)
6473 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
6476 if (ovsthread_once_start(&once
)) {
6477 sock
= socket(AF_PACKET
, SOCK_RAW
, 0);
6479 int error
= set_nonblocking(sock
);
6483 } else if (userspace_tso_enabled()) {
6485 error
= setsockopt(sock
, SOL_PACKET
, PACKET_VNET_HDR
, &val
,
6489 VLOG_ERR("failed to enable vnet hdr in raw socket: %s",
6490 ovs_strerror(errno
));
6497 VLOG_ERR("failed to create packet socket: %s",
6498 ovs_strerror(errno
));
6500 ovsthread_once_done(&once
);
6507 netdev_linux_parse_l2(struct dp_packet
*b
, uint16_t *l4proto
)
6509 struct eth_header
*eth_hdr
;
6513 eth_hdr
= dp_packet_at(b
, 0, ETH_HEADER_LEN
);
6518 l2_len
= ETH_HEADER_LEN
;
6519 eth_type
= eth_hdr
->eth_type
;
6520 if (eth_type_vlan(eth_type
)) {
6521 struct vlan_header
*vlan
= dp_packet_at(b
, l2_len
, VLAN_HEADER_LEN
);
6527 eth_type
= vlan
->vlan_next_type
;
6528 l2_len
+= VLAN_HEADER_LEN
;
6531 if (eth_type
== htons(ETH_TYPE_IP
)) {
6532 struct ip_header
*ip_hdr
= dp_packet_at(b
, l2_len
, IP_HEADER_LEN
);
6538 *l4proto
= ip_hdr
->ip_proto
;
6539 dp_packet_hwol_set_tx_ipv4(b
);
6540 } else if (eth_type
== htons(ETH_TYPE_IPV6
)) {
6541 struct ovs_16aligned_ip6_hdr
*nh6
;
6543 nh6
= dp_packet_at(b
, l2_len
, IPV6_HEADER_LEN
);
6548 *l4proto
= nh6
->ip6_ctlun
.ip6_un1
.ip6_un1_nxt
;
6549 dp_packet_hwol_set_tx_ipv6(b
);
6556 netdev_linux_parse_vnet_hdr(struct dp_packet
*b
)
6558 struct virtio_net_hdr
*vnet
= dp_packet_pull(b
, sizeof *vnet
);
6559 uint16_t l4proto
= 0;
6561 if (OVS_UNLIKELY(!vnet
)) {
6565 if (vnet
->flags
== 0 && vnet
->gso_type
== VIRTIO_NET_HDR_GSO_NONE
) {
6569 if (netdev_linux_parse_l2(b
, &l4proto
)) {
6573 if (vnet
->flags
== VIRTIO_NET_HDR_F_NEEDS_CSUM
) {
6574 if (l4proto
== IPPROTO_TCP
) {
6575 dp_packet_hwol_set_csum_tcp(b
);
6576 } else if (l4proto
== IPPROTO_UDP
) {
6577 dp_packet_hwol_set_csum_udp(b
);
6578 } else if (l4proto
== IPPROTO_SCTP
) {
6579 dp_packet_hwol_set_csum_sctp(b
);
6583 if (l4proto
&& vnet
->gso_type
!= VIRTIO_NET_HDR_GSO_NONE
) {
6584 uint8_t allowed_mask
= VIRTIO_NET_HDR_GSO_TCPV4
6585 | VIRTIO_NET_HDR_GSO_TCPV6
6586 | VIRTIO_NET_HDR_GSO_UDP
;
6587 uint8_t type
= vnet
->gso_type
& allowed_mask
;
6589 if (type
== VIRTIO_NET_HDR_GSO_TCPV4
6590 || type
== VIRTIO_NET_HDR_GSO_TCPV6
) {
6591 dp_packet_hwol_set_tcp_seg(b
);
6599 netdev_linux_prepend_vnet_hdr(struct dp_packet
*b
, int mtu
)
6601 struct virtio_net_hdr
*vnet
= dp_packet_push_zeros(b
, sizeof *vnet
);
6603 if (dp_packet_hwol_is_tso(b
)) {
6604 uint16_t hdr_len
= ((char *)dp_packet_l4(b
) - (char *)dp_packet_eth(b
))
6607 vnet
->hdr_len
= (OVS_FORCE __virtio16
)hdr_len
;
6608 vnet
->gso_size
= (OVS_FORCE __virtio16
)(mtu
- hdr_len
);
6609 if (dp_packet_hwol_is_ipv4(b
)) {
6610 vnet
->gso_type
= VIRTIO_NET_HDR_GSO_TCPV4
;
6612 vnet
->gso_type
= VIRTIO_NET_HDR_GSO_TCPV6
;
6616 vnet
->flags
= VIRTIO_NET_HDR_GSO_NONE
;
6619 if (dp_packet_hwol_l4_mask(b
)) {
6620 vnet
->flags
= VIRTIO_NET_HDR_F_NEEDS_CSUM
;
6621 vnet
->csum_start
= (OVS_FORCE __virtio16
)((char *)dp_packet_l4(b
)
6622 - (char *)dp_packet_eth(b
));
6624 if (dp_packet_hwol_l4_is_tcp(b
)) {
6625 vnet
->csum_offset
= (OVS_FORCE __virtio16
) __builtin_offsetof(
6626 struct tcp_header
, tcp_csum
);
6627 } else if (dp_packet_hwol_l4_is_udp(b
)) {
6628 vnet
->csum_offset
= (OVS_FORCE __virtio16
) __builtin_offsetof(
6629 struct udp_header
, udp_csum
);
6630 } else if (dp_packet_hwol_l4_is_sctp(b
)) {
6631 vnet
->csum_offset
= (OVS_FORCE __virtio16
) __builtin_offsetof(
6632 struct sctp_header
, sctp_csum
);
6634 VLOG_WARN_RL(&rl
, "Unsupported L4 protocol");