2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
20 #include "netdev-linux-private.h"
24 #include <sys/types.h>
25 #include <netinet/in.h>
26 #include <arpa/inet.h>
29 #include <linux/filter.h>
30 #include <linux/gen_stats.h>
31 #include <linux/if_ether.h>
32 #include <linux/if_packet.h>
33 #include <linux/if_tun.h>
34 #include <linux/types.h>
35 #include <linux/ethtool.h>
36 #include <linux/mii.h>
37 #include <linux/rtnetlink.h>
38 #include <linux/sockios.h>
39 #include <linux/virtio_net.h>
40 #include <sys/ioctl.h>
41 #include <sys/socket.h>
43 #include <sys/utsname.h>
45 #include <net/if_arp.h>
46 #include <net/route.h>
53 #include "dp-packet.h"
54 #include "dpif-netlink.h"
55 #include "dpif-netdev.h"
56 #include "openvswitch/dynamic-string.h"
57 #include "fatal-signal.h"
59 #include "openvswitch/hmap.h"
60 #include "netdev-afxdp.h"
61 #include "netdev-provider.h"
62 #include "netdev-vport.h"
63 #include "netlink-notifier.h"
64 #include "netlink-socket.h"
67 #include "openvswitch/ofpbuf.h"
68 #include "openflow/openflow.h"
69 #include "ovs-atomic.h"
72 #include "openvswitch/poll-loop.h"
73 #include "rtnetlink.h"
74 #include "openvswitch/shash.h"
75 #include "socket-util.h"
79 #include "unaligned.h"
80 #include "openvswitch/vlog.h"
81 #include "userspace-tso.h"
84 VLOG_DEFINE_THIS_MODULE(netdev_linux
);
86 COVERAGE_DEFINE(netdev_set_policing
);
87 COVERAGE_DEFINE(netdev_arp_lookup
);
88 COVERAGE_DEFINE(netdev_get_ifindex
);
89 COVERAGE_DEFINE(netdev_get_hwaddr
);
90 COVERAGE_DEFINE(netdev_set_hwaddr
);
91 COVERAGE_DEFINE(netdev_get_ethtool
);
92 COVERAGE_DEFINE(netdev_set_ethtool
);
95 #ifndef IFLA_IF_NETNSID
96 #define IFLA_IF_NETNSID 0x45
98 /* These were introduced in Linux 2.6.14, so they might be missing if we have
100 #ifndef ADVERTISED_Pause
101 #define ADVERTISED_Pause (1 << 13)
103 #ifndef ADVERTISED_Asym_Pause
104 #define ADVERTISED_Asym_Pause (1 << 14)
107 /* These were introduced in Linux 2.6.24, so they might be missing if we
108 * have old headers. */
109 #ifndef ETHTOOL_GFLAGS
110 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
112 #ifndef ETHTOOL_SFLAGS
113 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
116 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
119 #define TC_RTAB_SIZE 1024
122 /* Linux 2.6.21 introduced struct tpacket_auxdata.
123 * Linux 2.6.27 added the tp_vlan_tci member.
124 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
125 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
126 * TP_STATUS_VLAN_TPID_VALID.
128 * With all this churn it's easiest to unconditionally define a replacement
129 * structure that has everything we want.
131 #ifndef PACKET_AUXDATA
132 #define PACKET_AUXDATA 8
134 #ifndef TP_STATUS_VLAN_VALID
135 #define TP_STATUS_VLAN_VALID (1 << 4)
137 #ifndef TP_STATUS_VLAN_TPID_VALID
138 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
140 #undef tpacket_auxdata
141 #define tpacket_auxdata rpl_tpacket_auxdata
142 struct tpacket_auxdata
{
148 uint16_t tp_vlan_tci
;
149 uint16_t tp_vlan_tpid
;
152 /* Linux 2.6.27 introduced ethtool_cmd_speed
154 * To avoid revisiting problems reported with using configure to detect
155 * compatibility (see report at
156 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html)
157 * unconditionally replace ethtool_cmd_speed. */
158 #define ethtool_cmd_speed rpl_ethtool_cmd_speed
/* Reassembles the full 32-bit link speed (in Mbps) from the legacy split
 * ethtool fields: 'speed' holds the low 16 bits and 'speed_hi' the high 16
 * bits.  Unconditionally replaces the kernel's ethtool_cmd_speed() per the
 * comment above. */
159 static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd
*ep
)
161 return ep
->speed
| (ep
->speed_hi
<< 16);
164 /* Linux 2.6.30 introduced supported and advertised flags for
165 * 1G base KX, and 10G base KX4, KR and R. */
166 #ifndef SUPPORTED_1000baseKX_Full
167 #define SUPPORTED_1000baseKX_Full (1 << 17)
168 #define SUPPORTED_10000baseKX4_Full (1 << 18)
169 #define SUPPORTED_10000baseKR_Full (1 << 19)
170 #define SUPPORTED_10000baseR_FEC (1 << 20)
171 #define ADVERTISED_1000baseKX_Full (1 << 17)
172 #define ADVERTISED_10000baseKX4_Full (1 << 18)
173 #define ADVERTISED_10000baseKR_Full (1 << 19)
174 #define ADVERTISED_10000baseR_FEC (1 << 20)
177 /* Linux 3.5 introduced supported and advertised flags for
178 * 40G base KR4, CR4, SR4 and LR4. */
179 #ifndef SUPPORTED_40000baseKR4_Full
180 #define SUPPORTED_40000baseKR4_Full (1 << 23)
181 #define SUPPORTED_40000baseCR4_Full (1 << 24)
182 #define SUPPORTED_40000baseSR4_Full (1 << 25)
183 #define SUPPORTED_40000baseLR4_Full (1 << 26)
184 #define ADVERTISED_40000baseKR4_Full (1 << 23)
185 #define ADVERTISED_40000baseCR4_Full (1 << 24)
186 #define ADVERTISED_40000baseSR4_Full (1 << 25)
187 #define ADVERTISED_40000baseLR4_Full (1 << 26)
190 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
192 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
193 * 2.6.32-431.29.2.el6.x86_64 (see report at
194 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html).
195 * Maybe if_link.h is not self-contained on those kernels. It is easiest to
196 * unconditionally define a replacement. */
198 #define IFLA_STATS64 23
200 #define rtnl_link_stats64 rpl_rtnl_link_stats64
201 struct rtnl_link_stats64
{
213 uint64_t rx_length_errors
;
214 uint64_t rx_over_errors
;
215 uint64_t rx_crc_errors
;
216 uint64_t rx_frame_errors
;
217 uint64_t rx_fifo_errors
;
218 uint64_t rx_missed_errors
;
220 uint64_t tx_aborted_errors
;
221 uint64_t tx_carrier_errors
;
222 uint64_t tx_fifo_errors
;
223 uint64_t tx_heartbeat_errors
;
224 uint64_t tx_window_errors
;
226 uint64_t rx_compressed
;
227 uint64_t tx_compressed
;
231 VALID_IFINDEX
= 1 << 0,
232 VALID_ETHERADDR
= 1 << 1,
235 VALID_POLICING
= 1 << 4,
236 VALID_VPORT_STAT_ERROR
= 1 << 5,
237 VALID_DRVINFO
= 1 << 6,
238 VALID_FEATURES
= 1 << 7,
239 VALID_NUMA_ID
= 1 << 8,
242 /* Use one for the packet buffer and another for the aux buffer to receive
244 #define IOV_STD_SIZE 1
245 #define IOV_TSO_SIZE 2
252 struct linux_lag_slave
{
254 struct shash_node
*node
;
257 /* Protects 'lag_shash' and the mutable members of struct linux_lag_slave. */
258 static struct ovs_mutex lag_mutex
= OVS_MUTEX_INITIALIZER
;
260 /* All slaves whose LAG masters are network devices in OvS. */
261 static struct shash lag_shash
OVS_GUARDED_BY(lag_mutex
)
262 = SHASH_INITIALIZER(&lag_shash
);
264 /* Traffic control. */
266 /* An instance of a traffic control class. Always associated with a particular
269 * Each TC implementation subclasses this with whatever additional data it
272 const struct tc_ops
*ops
;
273 struct hmap queues
; /* Contains "struct tc_queue"s.
274 * Read by generic TC layer.
275 * Written only by TC implementation. */
278 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
280 /* One traffic control queue.
282 * Each TC implementation subclasses this with whatever additional data it
285 struct hmap_node hmap_node
; /* In struct tc's "queues" hmap. */
286 unsigned int queue_id
; /* OpenFlow queue ID. */
287 long long int created
; /* Time queue was created, in msecs. */
290 /* A particular kind of traffic control. Each implementation generally maps to
291 * one particular Linux qdisc class.
293 * The functions below return 0 if successful or a positive errno value on
294 * failure, except where otherwise noted. All of them must be provided, except
295 * where otherwise noted. */
297 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
298 * This is null for tc_ops_default and tc_ops_other, for which there are no
299 * appropriate values. */
300 const char *linux_name
;
302 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
303 const char *ovs_name
;
305 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
306 * queues. The queues are numbered 0 through n_queues - 1. */
307 unsigned int n_queues
;
309 /* Called to install this TC class on 'netdev'. The implementation should
310 * make the Netlink calls required to set up 'netdev' with the right qdisc
311 * and configure it according to 'details'. The implementation may assume
312 * that the current qdisc is the default; that is, there is no need for it
313 * to delete the current qdisc before installing itself.
315 * The contents of 'details' should be documented as valid for 'ovs_name'
316 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
317 * (which is built as ovs-vswitchd.conf.db(8)).
319 * This function must return 0 if and only if it sets 'netdev->tc' to an
320 * initialized 'struct tc'.
322 * (This function is null for tc_ops_other, which cannot be installed. For
323 * other TC classes it should always be nonnull.) */
324 int (*tc_install
)(struct netdev
*netdev
, const struct smap
*details
);
326 /* Called when the netdev code determines (through a Netlink query) that
327 * this TC class's qdisc is installed on 'netdev', but we didn't install
328 * it ourselves and so don't know any of the details.
330 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
331 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
332 * implementation should parse the other attributes of 'nlmsg' as
333 * necessary to determine its configuration. If necessary it should also
334 * use Netlink queries to determine the configuration of queues on
337 * This function must return 0 if and only if it sets 'netdev->tc' to an
338 * initialized 'struct tc'. */
339 int (*tc_load
)(struct netdev
*netdev
, struct ofpbuf
*nlmsg
);
341 /* Destroys the data structures allocated by the implementation as part of
342 * 'tc'. (This includes destroying 'tc->queues' by calling
345 * The implementation should not need to perform any Netlink calls. If
346 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
347 * (But it may not be desirable.)
349 * This function may be null if 'tc' is trivial. */
350 void (*tc_destroy
)(struct tc
*tc
);
352 /* Retrieves details of 'netdev->tc' configuration into 'details'.
354 * The implementation should not need to perform any Netlink calls, because
355 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
356 * cached the configuration.
358 * The contents of 'details' should be documented as valid for 'ovs_name'
359 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
360 * (which is built as ovs-vswitchd.conf.db(8)).
362 * This function may be null if 'tc' is not configurable.
364 int (*qdisc_get
)(const struct netdev
*netdev
, struct smap
*details
);
366 /* Reconfigures 'netdev->tc' according to 'details', performing any
367 * required Netlink calls to complete the reconfiguration.
369 * The contents of 'details' should be documented as valid for 'ovs_name'
370 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
371 * (which is built as ovs-vswitchd.conf.db(8)).
373 * This function may be null if 'tc' is not configurable.
375 int (*qdisc_set
)(struct netdev
*, const struct smap
*details
);
377 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
378 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
380 * The contents of 'details' should be documented as valid for 'ovs_name'
381 * in the "other_config" column in the "Queue" table in
382 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
384 * The implementation should not need to perform any Netlink calls, because
385 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
386 * cached the queue configuration.
388 * This function may be null if 'tc' does not have queues ('n_queues' is
390 int (*class_get
)(const struct netdev
*netdev
, const struct tc_queue
*queue
,
391 struct smap
*details
);
393 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
394 * 'details', perfoming any required Netlink calls to complete the
395 * reconfiguration. The caller ensures that 'queue_id' is less than
398 * The contents of 'details' should be documented as valid for 'ovs_name'
399 * in the "other_config" column in the "Queue" table in
400 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
402 * This function may be null if 'tc' does not have queues or its queues are
403 * not configurable. */
404 int (*class_set
)(struct netdev
*, unsigned int queue_id
,
405 const struct smap
*details
);
407 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
408 * tc_queue's within 'netdev->tc->queues'.
410 * This function may be null if 'tc' does not have queues or its queues
411 * cannot be deleted. */
412 int (*class_delete
)(struct netdev
*, struct tc_queue
*queue
);
414 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
415 * 'struct tc_queue's within 'netdev->tc->queues'.
417 * On success, initializes '*stats'.
419 * This function may be null if 'tc' does not have queues or if it cannot
420 * report queue statistics. */
421 int (*class_get_stats
)(const struct netdev
*netdev
,
422 const struct tc_queue
*queue
,
423 struct netdev_queue_stats
*stats
);
425 /* Extracts queue stats from 'nlmsg', which is a response to a
426 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
428 * This function may be null if 'tc' does not have queues or if it cannot
429 * report queue statistics. */
430 int (*class_dump_stats
)(const struct netdev
*netdev
,
431 const struct ofpbuf
*nlmsg
,
432 netdev_dump_queue_stats_cb
*cb
, void *aux
);
/* Initializes 'tc' as an instance of TC class 'ops': sets up the empty
 * 'queues' hmap.  NOTE(review): extraction dropped interior lines here
 * (presumably the assignment of 'ops' into 'tc'); code left byte-identical. */
436 tc_init(struct tc
*tc
, const struct tc_ops
*ops
)
439 hmap_init(&tc
->queues
);
/* Releases the generic part of 'tc' by destroying its 'queues' hmap.  The
 * TC implementation is responsible for freeing the queue contents first
 * (see the 'tc_destroy' member documentation above). */
443 tc_destroy(struct tc
*tc
)
445 hmap_destroy(&tc
->queues
);
448 static const struct tc_ops tc_ops_htb
;
449 static const struct tc_ops tc_ops_hfsc
;
450 static const struct tc_ops tc_ops_codel
;
451 static const struct tc_ops tc_ops_fqcodel
;
452 static const struct tc_ops tc_ops_sfq
;
453 static const struct tc_ops tc_ops_netem
;
454 static const struct tc_ops tc_ops_default
;
455 static const struct tc_ops tc_ops_noop
;
456 static const struct tc_ops tc_ops_other
;
458 static const struct tc_ops
*const tcs
[] = {
459 &tc_ops_htb
, /* Hierarchy token bucket (see tc-htb(8)). */
460 &tc_ops_hfsc
, /* Hierarchical fair service curve. */
461 &tc_ops_codel
, /* Controlled delay */
462 &tc_ops_fqcodel
, /* Fair queue controlled delay */
463 &tc_ops_sfq
, /* Stochastic fair queueing */
464 &tc_ops_netem
, /* Network Emulator */
465 &tc_ops_noop
, /* Non operating qos type. */
466 &tc_ops_default
, /* Default qdisc (see tc-pfifo_fast(8)). */
467 &tc_ops_other
, /* Some other qdisc. */
471 static unsigned int tc_ticks_to_bytes(unsigned int rate
, unsigned int ticks
);
472 static unsigned int tc_bytes_to_ticks(unsigned int rate
, unsigned int size
);
473 static unsigned int tc_buffer_per_jiffy(unsigned int rate
);
474 static uint32_t tc_time_to_ticks(uint32_t time
);
476 static struct tcmsg
*netdev_linux_tc_make_request(const struct netdev
*,
480 static int tc_add_policer(struct netdev
*,
481 uint32_t kbits_rate
, uint32_t kbits_burst
);
483 static int tc_parse_qdisc(const struct ofpbuf
*, const char **kind
,
484 struct nlattr
**options
);
485 static int tc_parse_class(const struct ofpbuf
*, unsigned int *queue_id
,
486 struct nlattr
**options
,
487 struct netdev_queue_stats
*);
488 static int tc_query_class(const struct netdev
*,
489 unsigned int handle
, unsigned int parent
,
490 struct ofpbuf
**replyp
);
491 static int tc_delete_class(const struct netdev
*, unsigned int handle
);
493 static int tc_del_qdisc(struct netdev
*netdev
);
494 static int tc_query_qdisc(const struct netdev
*netdev
);
497 tc_put_rtab(struct ofpbuf
*msg
, uint16_t type
, const struct tc_ratespec
*rate
);
498 static int tc_calc_cell_log(unsigned int mtu
);
499 static void tc_fill_rate(struct tc_ratespec
*rate
, uint64_t bps
, int mtu
);
500 static int tc_calc_buffer(unsigned int Bps
, int mtu
, uint64_t burst_bytes
);
503 /* This is set pretty low because we probably won't learn anything from the
504 * additional log messages. */
505 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(5, 20);
507 /* Polling miimon status for all ports causes performance degradation when
508 * handling a large number of ports. If there are no devices using miimon, then
509 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
511 * Readers do not depend on this variable synchronizing with the related
512 * changes in the device miimon status, so we can use atomic_count. */
513 static atomic_count miimon_cnt
= ATOMIC_COUNT_INIT(0);
515 static int netdev_linux_parse_vnet_hdr(struct dp_packet
*b
);
516 static void netdev_linux_prepend_vnet_hdr(struct dp_packet
*b
, int mtu
);
517 static int netdev_linux_do_ethtool(const char *name
, struct ethtool_cmd
*,
518 int cmd
, const char *cmd_name
);
519 static int get_flags(const struct netdev
*, unsigned int *flags
);
520 static int set_flags(const char *, unsigned int flags
);
521 static int update_flags(struct netdev_linux
*netdev
, enum netdev_flags off
,
522 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
523 OVS_REQUIRES(netdev
->mutex
);
524 static int get_ifindex(const struct netdev
*, int *ifindexp
);
525 static int do_set_addr(struct netdev
*netdev
,
526 int ioctl_nr
, const char *ioctl_name
,
527 struct in_addr addr
);
528 static int get_etheraddr(const char *netdev_name
, struct eth_addr
*ea
);
529 static int set_etheraddr(const char *netdev_name
, const struct eth_addr
);
530 static int af_packet_sock(void);
531 static bool netdev_linux_miimon_enabled(void);
532 static void netdev_linux_miimon_run(void);
533 static void netdev_linux_miimon_wait(void);
534 static int netdev_linux_get_mtu__(struct netdev_linux
*netdev
, int *mtup
);
/* Returns true if 'netdev' is a tap device (its class is
 * 'netdev_tap_class'), false otherwise. */
537 is_tap_netdev(const struct netdev
*netdev
)
539 return netdev_get_class(netdev
) == &netdev_tap_class
;
/* Refreshes 'netdev->netnsid' by querying the datapath for the vport named
 * after this netdev.  On ENOENT (no vport / no openvswitch kernel API) the
 * device is assumed local; on other errors it is unset; on success the
 * netnsid reported in 'reply' is recorded.  NOTE(review): extraction dropped
 * interior lines (declarations of 'error'/'buf', the else branches, buffer
 * cleanup, and the return); code left byte-identical. */
543 netdev_linux_netnsid_update__(struct netdev_linux
*netdev
)
545 struct dpif_netlink_vport reply
;
549 error
= dpif_netlink_vport_get(netdev_get_name(&netdev
->up
), &reply
, &buf
);
551 if (error
== ENOENT
) {
552 /* Assume it is local if there is no API (e.g. if the openvswitch
553 * kernel module is not loaded). */
554 netnsid_set_local(&netdev
->netnsid
);
556 netnsid_unset(&netdev
->netnsid
);
561 netnsid_set(&netdev
->netnsid
, reply
.netnsid
);
/* Ensures 'netdev->netnsid' is known, computing it lazily: if it is still
 * unset, tap devices are marked local directly (taps are always created in
 * the local namespace), otherwise the datapath is queried via
 * netdev_linux_netnsid_update__().  NOTE(review): extraction dropped
 * interior lines (the returns for the already-set and tap cases); code left
 * byte-identical. */
567 netdev_linux_netnsid_update(struct netdev_linux
*netdev
)
569 if (netnsid_is_unset(netdev
->netnsid
)) {
570 if (netdev_get_class(&netdev
->up
) == &netdev_tap_class
) {
571 netnsid_set_local(&netdev
->netnsid
);
573 return netdev_linux_netnsid_update__(netdev
);
/* Returns true if 'netdev' lives in the network namespace identified by
 * 'nsid'.  Refreshes the cached netnsid first so the comparison is made
 * against current state. */
581 netdev_linux_netnsid_is_eq(struct netdev_linux
*netdev
, int nsid
)
583 netdev_linux_netnsid_update(netdev
);
584 return netnsid_eq(netdev
->netnsid
, nsid
);
/* Returns true if 'netdev' lives in a network namespace other than the one
 * OVS is running in.  Refreshes the cached netnsid before checking. */
588 netdev_linux_netnsid_is_remote(struct netdev_linux
*netdev
)
590 netdev_linux_netnsid_update(netdev
);
591 return netnsid_is_remote(netdev
->netnsid
);
594 static int netdev_linux_update_via_netlink(struct netdev_linux
*);
595 static void netdev_linux_update(struct netdev_linux
*netdev
, int,
596 const struct rtnetlink_change
*)
597 OVS_REQUIRES(netdev
->mutex
);
598 static void netdev_linux_changed(struct netdev_linux
*netdev
,
599 unsigned int ifi_flags
, unsigned int mask
)
600 OVS_REQUIRES(netdev
->mutex
);
602 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
603 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
604 * if no such socket could be created. */
605 static struct nl_sock
*
/* Lazily creates, on first call, a NETLINK_ROUTE socket joined to the
 * RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR, RTNLGRP_IPV6_IFADDR and
 * RTNLGRP_IPV6_IFINFO multicast groups, with per-message nsid reporting
 * enabled; later calls return the same socket.  If any join fails the
 * socket is destroyed (presumably leaving NULL to be returned — the error
 * handling and return lines were dropped by extraction; code left
 * byte-identical). */
606 netdev_linux_notify_sock(void)
608 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
609 static struct nl_sock
*sock
;
610 unsigned int mcgroups
[] = {RTNLGRP_LINK
, RTNLGRP_IPV4_IFADDR
,
611 RTNLGRP_IPV6_IFADDR
, RTNLGRP_IPV6_IFINFO
};
/* One-time initialization; concurrent callers block until done. */
613 if (ovsthread_once_start(&once
)) {
616 error
= nl_sock_create(NETLINK_ROUTE
, &sock
);
620 for (i
= 0; i
< ARRAY_SIZE(mcgroups
); i
++) {
621 error
= nl_sock_join_mcgroup(sock
, mcgroups
[i
]);
623 nl_sock_destroy(sock
);
/* Ask the kernel to tag each notification with its source netns id. */
629 nl_sock_listen_all_nsid(sock
, true);
630 ovsthread_once_done(&once
);
/* Returns true if at least one device currently uses miimon polling, so
 * the run/wait loops can skip miimon work entirely otherwise (see the
 * comment on 'miimon_cnt' above). */
637 netdev_linux_miimon_enabled(void)
639 return atomic_count_get(&miimon_cnt
) > 0;
/* Returns whether rtnetlink kind string 'kind' names a Linux link
 * aggregation master type ("bond" or "team").  NOTE(review): the return
 * statements were dropped by extraction; code left byte-identical. */
643 netdev_linux_kind_is_lag(const char *kind
)
645 if (!strcmp(kind
, "bond") || !strcmp(kind
, "team")) {
653 netdev_linux_update_lag(struct rtnetlink_change
*change
)
654 OVS_REQUIRES(lag_mutex
)
656 struct linux_lag_slave
*lag
;
658 if (change
->slave
&& netdev_linux_kind_is_lag(change
->slave
)) {
659 lag
= shash_find_data(&lag_shash
, change
->ifname
);
662 struct netdev
*master_netdev
;
663 char master_name
[IFNAMSIZ
];
667 if_indextoname(change
->master_ifindex
, master_name
);
668 master_netdev
= netdev_from_name(master_name
);
669 if (!master_netdev
) {
673 if (is_netdev_linux_class(master_netdev
->netdev_class
)) {
674 block_id
= netdev_get_block_id(master_netdev
);
676 netdev_close(master_netdev
);
680 lag
= xmalloc(sizeof *lag
);
681 lag
->block_id
= block_id
;
682 lag
->node
= shash_add(&lag_shash
, change
->ifname
, lag
);
684 /* delete ingress block in case it exists */
685 tc_add_del_qdisc(change
->if_index
, false, 0, TC_INGRESS
);
686 /* LAG master is linux netdev so add slave to same block. */
687 error
= tc_add_del_qdisc(change
->if_index
, true, block_id
,
690 VLOG_WARN("failed to bind LAG slave %s to master's block",
692 shash_delete(&lag_shash
, lag
->node
);
697 netdev_close(master_netdev
);
699 } else if (change
->master_ifindex
== 0) {
700 /* Check if this was a lag slave that has been freed. */
701 lag
= shash_find_data(&lag_shash
, change
->ifname
);
704 tc_add_del_qdisc(change
->if_index
, false, lag
->block_id
,
706 shash_delete(&lag_shash
, lag
->node
);
713 netdev_linux_run(const struct netdev_class
*netdev_class OVS_UNUSED
)
715 struct nl_sock
*sock
;
718 if (netdev_linux_miimon_enabled()) {
719 netdev_linux_miimon_run();
722 sock
= netdev_linux_notify_sock();
728 uint64_t buf_stub
[4096 / 8];
732 ofpbuf_use_stub(&buf
, buf_stub
, sizeof buf_stub
);
733 error
= nl_sock_recv(sock
, &buf
, &nsid
, false);
735 struct rtnetlink_change change
;
737 if (rtnetlink_parse(&buf
, &change
)) {
738 struct netdev
*netdev_
= NULL
;
739 char dev_name
[IFNAMSIZ
];
741 if (!change
.ifname
) {
742 change
.ifname
= if_indextoname(change
.if_index
, dev_name
);
746 netdev_
= netdev_from_name(change
.ifname
);
748 if (netdev_
&& is_netdev_linux_class(netdev_
->netdev_class
)) {
749 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
751 ovs_mutex_lock(&netdev
->mutex
);
752 netdev_linux_update(netdev
, nsid
, &change
);
753 ovs_mutex_unlock(&netdev
->mutex
);
757 rtnetlink_type_is_rtnlgrp_link(change
.nlmsg_type
)) {
759 /* Need to try updating the LAG information. */
760 ovs_mutex_lock(&lag_mutex
);
761 netdev_linux_update_lag(&change
);
762 ovs_mutex_unlock(&lag_mutex
);
764 netdev_close(netdev_
);
766 } else if (error
== ENOBUFS
) {
767 struct shash device_shash
;
768 struct shash_node
*node
;
772 shash_init(&device_shash
);
773 netdev_get_devices(&netdev_linux_class
, &device_shash
);
774 SHASH_FOR_EACH (node
, &device_shash
) {
775 struct netdev
*netdev_
= node
->data
;
776 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
779 ovs_mutex_lock(&netdev
->mutex
);
780 get_flags(netdev_
, &flags
);
781 netdev_linux_changed(netdev
, flags
, 0);
782 ovs_mutex_unlock(&netdev
->mutex
);
784 netdev_close(netdev_
);
786 shash_destroy(&device_shash
);
787 } else if (error
!= EAGAIN
) {
788 static struct vlog_rate_limit rll
= VLOG_RATE_LIMIT_INIT(1, 5);
789 VLOG_WARN_RL(&rll
, "error reading or parsing netlink (%s)",
790 ovs_strerror(error
));
/* netdev_class 'wait' callback: arranges for the caller's poll loop to wake
 * when there is miimon work to do (if any device uses miimon) or when the
 * shared rtnetlink notification socket becomes readable.  NOTE(review): the
 * NULL-socket guard line appears to have been dropped by extraction; code
 * left byte-identical. */
797 netdev_linux_wait(const struct netdev_class
*netdev_class OVS_UNUSED
)
799 struct nl_sock
*sock
;
801 if (netdev_linux_miimon_enabled()) {
802 netdev_linux_miimon_wait();
804 sock
= netdev_linux_notify_sock();
806 nl_sock_wait(sock
, POLLIN
);
/* Records that 'dev' changed: bumps the netdev change sequence, counts a
 * carrier reset if IFF_RUNNING toggled in 'ifi_flags', stores the new
 * flags, and keeps only the cache-validity bits listed in 'mask'
 * (everything else must be re-queried).  If the IP-address cache is being
 * invalidated, the global address list cache is flushed too.  Caller must
 * hold 'dev->mutex'. */
811 netdev_linux_changed(struct netdev_linux
*dev
,
812 unsigned int ifi_flags
, unsigned int mask
)
813 OVS_REQUIRES(dev
->mutex
)
815 netdev_change_seq_changed(&dev
->up
);
/* A transition of IFF_RUNNING in either direction is a carrier reset. */
817 if ((dev
->ifi_flags
^ ifi_flags
) & IFF_RUNNING
) {
818 dev
->carrier_resets
++;
820 dev
->ifi_flags
= ifi_flags
;
/* Keep only the cached fields the caller says are still valid. */
822 dev
->cache_valid
&= mask
;
823 if (!(mask
& VALID_IN
)) {
824 netdev_get_addrs_list_flush();
829 netdev_linux_update__(struct netdev_linux
*dev
,
830 const struct rtnetlink_change
*change
)
831 OVS_REQUIRES(dev
->mutex
)
833 if (rtnetlink_type_is_rtnlgrp_link(change
->nlmsg_type
)) {
834 if (change
->nlmsg_type
== RTM_NEWLINK
) {
835 /* Keep drv-info, ip addresses, and NUMA id. */
836 netdev_linux_changed(dev
, change
->ifi_flags
,
837 VALID_DRVINFO
| VALID_IN
| VALID_NUMA_ID
);
839 /* Update netdev from rtnl-change msg. */
841 dev
->mtu
= change
->mtu
;
842 dev
->cache_valid
|= VALID_MTU
;
843 dev
->netdev_mtu_error
= 0;
846 if (!eth_addr_is_zero(change
->mac
)) {
847 dev
->etheraddr
= change
->mac
;
848 dev
->cache_valid
|= VALID_ETHERADDR
;
849 dev
->ether_addr_error
= 0;
851 /* The mac addr has been changed, report it now. */
852 rtnetlink_report_link();
855 if (change
->master
&& netdev_linux_kind_is_lag(change
->master
)) {
856 dev
->is_lag_master
= true;
859 dev
->ifindex
= change
->if_index
;
860 dev
->cache_valid
|= VALID_IFINDEX
;
861 dev
->get_ifindex_error
= 0;
865 netdev_linux_changed(dev
, change
->ifi_flags
, 0);
866 dev
->present
= false;
867 netnsid_unset(&dev
->netnsid
);
869 } else if (rtnetlink_type_is_rtnlgrp_addr(change
->nlmsg_type
)) {
870 /* Invalidates in4, in6. */
871 netdev_linux_changed(dev
, dev
->ifi_flags
, ~VALID_IN
);
/* Applies rtnetlink 'change' to 'dev', but only if the notification came
 * from the same network namespace 'dev' lives in ('nsid'); changes to
 * same-named devices in other namespaces are ignored.  Caller must hold
 * 'dev->mutex'. */
878 netdev_linux_update(struct netdev_linux
*dev
, int nsid
,
879 const struct rtnetlink_change
*change
)
880 OVS_REQUIRES(dev
->mutex
)
882 if (netdev_linux_netnsid_is_eq(dev
, nsid
)) {
883 netdev_linux_update__(dev
, change
);
887 static struct netdev
*
/* netdev_class 'alloc' callback: zero-allocates a struct netdev_linux and
 * (presumably — the return line was dropped by extraction) returns its
 * embedded 'up' member; code left byte-identical. */
888 netdev_linux_alloc(void)
890 struct netdev_linux
*netdev
= xzalloc(sizeof *netdev
);
895 netdev_linux_common_construct(struct netdev
*netdev_
)
897 /* Prevent any attempt to create (or open) a network device named "default"
898 * or "all". These device names are effectively reserved on Linux because
899 * /proc/sys/net/ipv4/conf/ always contains directories by these names. By
900 * itself this wouldn't call for any special treatment, but in practice if
901 * a program tries to create devices with these names, it causes the kernel
902 * to fire a "new device" notification event even though creation failed,
903 * and in turn that causes OVS to wake up and try to create them again,
904 * which ends up as a 100% CPU loop. */
905 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
906 const char *name
= netdev_
->name
;
907 if (!strcmp(name
, "default") || !strcmp(name
, "all")) {
908 static struct vlog_rate_limit rll
= VLOG_RATE_LIMIT_INIT(1, 1);
909 VLOG_WARN_RL(&rll
, "%s: Linux forbids network device with this name",
914 /* The device could be in the same network namespace or in another one. */
915 netnsid_unset(&netdev
->netnsid
);
916 ovs_mutex_init(&netdev
->mutex
);
918 if (userspace_tso_enabled()) {
919 netdev_
->ol_flags
|= NETDEV_TX_OFFLOAD_TCP_TSO
;
920 netdev_
->ol_flags
|= NETDEV_TX_OFFLOAD_TCP_CKSUM
;
921 netdev_
->ol_flags
|= NETDEV_TX_OFFLOAD_UDP_CKSUM
;
922 netdev_
->ol_flags
|= NETDEV_TX_OFFLOAD_SCTP_CKSUM
;
923 netdev_
->ol_flags
|= NETDEV_TX_OFFLOAD_IPV4_CKSUM
;
929 /* Creates system and internal devices. */
931 netdev_linux_construct(struct netdev
*netdev_
)
933 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
934 int error
= netdev_linux_common_construct(netdev_
);
939 error
= get_flags(&netdev
->up
, &netdev
->ifi_flags
);
940 if (error
== ENODEV
) {
941 if (netdev
->up
.netdev_class
!= &netdev_internal_class
) {
942 /* The device does not exist, so don't allow it to be opened. */
945 /* "Internal" netdevs have to be created as netdev objects before
946 * they exist in the kernel, because creating them in the kernel
947 * happens by passing a netdev object to dpif_port_add().
948 * Therefore, ignore the error. */
955 /* For most types of netdevs we open the device for each call of
956 * netdev_open(). However, this is not the case with tap devices,
957 * since it is only possible to open the device once. In this
958 * situation we share a single file descriptor, and consequently
959 * buffers, across all readers. Therefore once data is read it will
960 * be unavailable to other reads for tap devices. */
962 netdev_linux_construct_tap(struct netdev
*netdev_
)
964 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
965 static const char tap_dev
[] = "/dev/net/tun";
966 const char *name
= netdev_
->name
;
969 int error
= netdev_linux_common_construct(netdev_
);
974 /* Open tap device. */
975 netdev
->tap_fd
= open(tap_dev
, O_RDWR
);
976 if (netdev
->tap_fd
< 0) {
978 VLOG_WARN("opening \"%s\" failed: %s", tap_dev
, ovs_strerror(error
));
982 /* Create tap device. */
983 get_flags(&netdev
->up
, &netdev
->ifi_flags
);
984 ifr
.ifr_flags
= IFF_TAP
| IFF_NO_PI
;
985 if (userspace_tso_enabled()) {
986 ifr
.ifr_flags
|= IFF_VNET_HDR
;
989 ovs_strzcpy(ifr
.ifr_name
, name
, sizeof ifr
.ifr_name
);
990 if (ioctl(netdev
->tap_fd
, TUNSETIFF
, &ifr
) == -1) {
991 VLOG_WARN("%s: creating tap device failed: %s", name
,
992 ovs_strerror(errno
));
997 /* Make non-blocking. */
998 error
= set_nonblocking(netdev
->tap_fd
);
1003 if (ioctl(netdev
->tap_fd
, TUNSETPERSIST
, 1)) {
1004 VLOG_WARN("%s: creating tap device failed (persist): %s", name
,
1005 ovs_strerror(errno
));
1010 if (userspace_tso_enabled()) {
1011 /* Old kernels don't support TUNSETOFFLOAD. If TUNSETOFFLOAD is
1012 * available, it will return EINVAL when a flag is unknown.
1013 * Therefore, try enabling offload with no flags to check
1014 * if TUNSETOFFLOAD support is available or not. */
1015 if (ioctl(netdev
->tap_fd
, TUNSETOFFLOAD
, 0) == 0 || errno
!= EINVAL
) {
1016 unsigned long oflags
= TUN_F_CSUM
| TUN_F_TSO4
| TUN_F_TSO6
;
1018 if (ioctl(netdev
->tap_fd
, TUNSETOFFLOAD
, oflags
) == -1) {
1019 VLOG_WARN("%s: enabling tap offloading failed: %s", name
,
1020 ovs_strerror(errno
));
1027 netdev
->present
= true;
1031 close(netdev
->tap_fd
);
/* netdev_class 'destruct' callback: tears down per-device state.  Destroys
 * any installed TC implementation, for tap devices clears TUNSETPERSIST and
 * closes the tap fd, drops this device's contribution to the global miimon
 * counter, and destroys the device mutex. */
1036 netdev_linux_destruct(struct netdev
*netdev_
)
1038 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
/* Let the TC implementation free the data hanging off 'netdev->tc'
 * (tc_destroy may be null for trivial TC classes). */
1040 if (netdev
->tc
&& netdev
->tc
->ops
->tc_destroy
) {
1041 netdev
->tc
->ops
->tc_destroy(netdev
->tc
);
/* Tap devices own a persistent fd: un-persist the kernel tap, then
 * close our handle. */
1044 if (netdev_get_class(netdev_
) == &netdev_tap_class
1045 && netdev
->tap_fd
>= 0)
1047 ioctl(netdev
->tap_fd
, TUNSETPERSIST
, 0);
1048 close(netdev
->tap_fd
);
/* This device was counted in 'miimon_cnt'; balance it. */
1051 if (netdev
->miimon_interval
> 0) {
1052 atomic_count_dec(&miimon_cnt
);
1055 ovs_mutex_destroy(&netdev
->mutex
);
/* netdev_class 'dealloc' callback: frees the struct netdev_linux allocated
 * by netdev_linux_alloc().  NOTE(review): the free() line was dropped by
 * extraction; code left byte-identical. */
1059 netdev_linux_dealloc(struct netdev
*netdev_
)
1061 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1065 static struct netdev_rxq
*
/* rxq 'alloc' callback: zero-allocates a struct netdev_rxq_linux and
 * (presumably — the return line was dropped by extraction) returns its
 * embedded 'up' member; code left byte-identical. */
1066 netdev_linux_rxq_alloc(void)
1068 struct netdev_rxq_linux
*rx
= xzalloc(sizeof *rx
);
1073 netdev_linux_rxq_construct(struct netdev_rxq
*rxq_
)
1075 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1076 struct netdev
*netdev_
= rx
->up
.netdev
;
1077 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1080 ovs_mutex_lock(&netdev
->mutex
);
1081 rx
->is_tap
= is_tap_netdev(netdev_
);
1083 rx
->fd
= netdev
->tap_fd
;
1085 struct sockaddr_ll sll
;
1087 /* Result of tcpdump -dd inbound */
1088 static const struct sock_filter filt
[] = {
1089 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
1090 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
1091 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
1092 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
1094 static const struct sock_fprog fprog
= {
1095 ARRAY_SIZE(filt
), (struct sock_filter
*) filt
1098 /* Create file descriptor. */
1099 rx
->fd
= socket(PF_PACKET
, SOCK_RAW
, 0);
1102 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error
));
1107 if (setsockopt(rx
->fd
, SOL_PACKET
, PACKET_AUXDATA
, &val
, sizeof val
)) {
1109 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
1110 netdev_get_name(netdev_
), ovs_strerror(error
));
1114 if (userspace_tso_enabled()
1115 && setsockopt(rx
->fd
, SOL_PACKET
, PACKET_VNET_HDR
, &val
,
1118 VLOG_ERR("%s: failed to enable vnet hdr in txq raw socket: %s",
1119 netdev_get_name(netdev_
), ovs_strerror(errno
));
1123 /* Set non-blocking mode. */
1124 error
= set_nonblocking(rx
->fd
);
1129 /* Get ethernet device index. */
1130 error
= get_ifindex(&netdev
->up
, &ifindex
);
1135 /* Bind to specific ethernet device. */
1136 memset(&sll
, 0, sizeof sll
);
1137 sll
.sll_family
= AF_PACKET
;
1138 sll
.sll_ifindex
= ifindex
;
1139 sll
.sll_protocol
= htons(ETH_P_ALL
);
1140 if (bind(rx
->fd
, (struct sockaddr
*) &sll
, sizeof sll
) < 0) {
1142 VLOG_ERR("%s: failed to bind raw socket (%s)",
1143 netdev_get_name(netdev_
), ovs_strerror(error
));
1147 /* Filter for only inbound packets. */
1148 error
= setsockopt(rx
->fd
, SOL_SOCKET
, SO_ATTACH_FILTER
, &fprog
,
1152 VLOG_ERR("%s: failed to attach filter (%s)",
1153 netdev_get_name(netdev_
), ovs_strerror(error
));
1157 ovs_mutex_unlock(&netdev
->mutex
);
1165 ovs_mutex_unlock(&netdev
->mutex
);
1170 netdev_linux_rxq_destruct(struct netdev_rxq
*rxq_
)
1172 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1179 for (i
= 0; i
< NETDEV_MAX_BURST
; i
++) {
1180 dp_packet_delete(rx
->aux_bufs
[i
]);
1185 netdev_linux_rxq_dealloc(struct netdev_rxq
*rxq_
)
1187 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1193 auxdata_to_vlan_tpid(const struct tpacket_auxdata
*aux
, bool double_tagged
)
1195 if (aux
->tp_status
& TP_STATUS_VLAN_TPID_VALID
) {
1196 return htons(aux
->tp_vlan_tpid
);
1197 } else if (double_tagged
) {
1198 return htons(ETH_TYPE_VLAN_8021AD
);
1200 return htons(ETH_TYPE_VLAN_8021Q
);
/* Returns true if auxiliary data 'aux' indicates the received packet carried
 * a VLAN tag.  The nonzero-TCI check catches tagged packets on kernels that
 * do not set TP_STATUS_VLAN_VALID -- NOTE(review): presumably for older
 * kernels; confirm against the minimum supported kernel version. */
static bool
auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
{
    return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
}
1211 * Receive packets from raw socket in batch process for better performance,
1212 * it can receive NETDEV_MAX_BURST packets at most once, the received
1213 * packets are added into *batch. The return value is 0 or errno.
 1215 * It also uses recvmmsg to reduce the overhead of multiple syscalls.
1218 netdev_linux_batch_rxq_recv_sock(struct netdev_rxq_linux
*rx
, int mtu
,
1219 struct dp_packet_batch
*batch
)
1224 int virtio_net_hdr_size
;
1225 struct iovec iovs
[NETDEV_MAX_BURST
][IOV_TSO_SIZE
];
1226 struct cmsghdr
*cmsg
;
1228 struct cmsghdr cmsg
;
1229 char buffer
[CMSG_SPACE(sizeof(struct tpacket_auxdata
))];
1230 } cmsg_buffers
[NETDEV_MAX_BURST
];
1231 struct mmsghdr mmsgs
[NETDEV_MAX_BURST
];
1232 struct dp_packet
*buffers
[NETDEV_MAX_BURST
];
1235 if (userspace_tso_enabled()) {
1236 /* Use the buffer from the allocated packet below to receive MTU
1237 * sized packets and an aux_buf for extra TSO data. */
1238 iovlen
= IOV_TSO_SIZE
;
1239 virtio_net_hdr_size
= sizeof(struct virtio_net_hdr
);
1241 /* Use only the buffer from the allocated packet. */
1242 iovlen
= IOV_STD_SIZE
;
1243 virtio_net_hdr_size
= 0;
1246 /* The length here needs to be accounted in the same way when the
1247 * aux_buf is allocated so that it can be prepended to TSO buffer. */
1248 std_len
= virtio_net_hdr_size
+ VLAN_ETH_HEADER_LEN
+ mtu
;
1249 for (i
= 0; i
< NETDEV_MAX_BURST
; i
++) {
1250 buffers
[i
] = dp_packet_new_with_headroom(std_len
, DP_NETDEV_HEADROOM
);
1251 iovs
[i
][IOV_PACKET
].iov_base
= dp_packet_data(buffers
[i
]);
1252 iovs
[i
][IOV_PACKET
].iov_len
= std_len
;
1253 if (iovlen
== IOV_TSO_SIZE
) {
1254 iovs
[i
][IOV_AUXBUF
].iov_base
= dp_packet_data(rx
->aux_bufs
[i
]);
1255 iovs
[i
][IOV_AUXBUF
].iov_len
= dp_packet_tailroom(rx
->aux_bufs
[i
]);
1258 mmsgs
[i
].msg_hdr
.msg_name
= NULL
;
1259 mmsgs
[i
].msg_hdr
.msg_namelen
= 0;
1260 mmsgs
[i
].msg_hdr
.msg_iov
= iovs
[i
];
1261 mmsgs
[i
].msg_hdr
.msg_iovlen
= iovlen
;
1262 mmsgs
[i
].msg_hdr
.msg_control
= &cmsg_buffers
[i
];
1263 mmsgs
[i
].msg_hdr
.msg_controllen
= sizeof cmsg_buffers
[i
];
1264 mmsgs
[i
].msg_hdr
.msg_flags
= 0;
1268 retval
= recvmmsg(rx
->fd
, mmsgs
, NETDEV_MAX_BURST
, MSG_TRUNC
, NULL
);
1269 } while (retval
< 0 && errno
== EINTR
);
1273 for (i
= 0; i
< NETDEV_MAX_BURST
; i
++) {
1274 dp_packet_delete(buffers
[i
]);
1280 for (i
= 0; i
< retval
; i
++) {
1281 struct dp_packet
*pkt
;
1283 if (mmsgs
[i
].msg_len
< ETH_HEADER_LEN
) {
1284 struct netdev
*netdev_
= netdev_rxq_get_netdev(&rx
->up
);
1285 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1287 dp_packet_delete(buffers
[i
]);
1288 netdev
->rx_dropped
+= 1;
1289 VLOG_WARN_RL(&rl
, "%s: Dropped packet: less than ether hdr size",
1290 netdev_get_name(netdev_
));
1294 if (mmsgs
[i
].msg_len
> std_len
) {
1295 /* Build a single linear TSO packet by prepending the data from
1296 * std_len buffer to the aux_buf. */
1297 pkt
= rx
->aux_bufs
[i
];
1298 dp_packet_set_size(pkt
, mmsgs
[i
].msg_len
- std_len
);
1299 dp_packet_push(pkt
, dp_packet_data(buffers
[i
]), std_len
);
1300 /* The headroom should be the same in buffers[i], pkt and
1301 * DP_NETDEV_HEADROOM. */
1302 dp_packet_resize(pkt
, DP_NETDEV_HEADROOM
, 0);
1303 dp_packet_delete(buffers
[i
]);
1304 rx
->aux_bufs
[i
] = NULL
;
1306 dp_packet_set_size(buffers
[i
], mmsgs
[i
].msg_len
);
1310 if (virtio_net_hdr_size
&& netdev_linux_parse_vnet_hdr(pkt
)) {
1311 struct netdev
*netdev_
= netdev_rxq_get_netdev(&rx
->up
);
1312 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1314 /* Unexpected error situation: the virtio header is not present
1315 * or corrupted. Drop the packet but continue in case next ones
1317 dp_packet_delete(pkt
);
1318 netdev
->rx_dropped
+= 1;
1319 VLOG_WARN_RL(&rl
, "%s: Dropped packet: Invalid virtio net header",
1320 netdev_get_name(netdev_
));
1324 for (cmsg
= CMSG_FIRSTHDR(&mmsgs
[i
].msg_hdr
); cmsg
;
1325 cmsg
= CMSG_NXTHDR(&mmsgs
[i
].msg_hdr
, cmsg
)) {
1326 const struct tpacket_auxdata
*aux
;
1328 if (cmsg
->cmsg_level
!= SOL_PACKET
1329 || cmsg
->cmsg_type
!= PACKET_AUXDATA
1331 CMSG_LEN(sizeof(struct tpacket_auxdata
))) {
1335 aux
= ALIGNED_CAST(struct tpacket_auxdata
*, CMSG_DATA(cmsg
));
1336 if (auxdata_has_vlan_tci(aux
)) {
1337 struct eth_header
*eth
;
1340 eth
= dp_packet_data(pkt
);
1341 double_tagged
= eth
->eth_type
== htons(ETH_TYPE_VLAN_8021Q
);
1344 auxdata_to_vlan_tpid(aux
, double_tagged
),
1345 htons(aux
->tp_vlan_tci
));
1349 dp_packet_batch_add(batch
, pkt
);
1352 /* Delete unused buffers. */
1353 for (; i
< NETDEV_MAX_BURST
; i
++) {
1354 dp_packet_delete(buffers
[i
]);
1361 * Receive packets from tap by batch process for better performance,
1362 * it can receive NETDEV_MAX_BURST packets at most once, the received
1363 * packets are added into *batch. The return value is 0 or errno.
1366 netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux
*rx
, int mtu
,
1367 struct dp_packet_batch
*batch
)
1369 int virtio_net_hdr_size
;
1375 if (userspace_tso_enabled()) {
1376 /* Use the buffer from the allocated packet below to receive MTU
1377 * sized packets and an aux_buf for extra TSO data. */
1378 iovlen
= IOV_TSO_SIZE
;
1379 virtio_net_hdr_size
= sizeof(struct virtio_net_hdr
);
1381 /* Use only the buffer from the allocated packet. */
1382 iovlen
= IOV_STD_SIZE
;
1383 virtio_net_hdr_size
= 0;
1386 /* The length here needs to be accounted in the same way when the
1387 * aux_buf is allocated so that it can be prepended to TSO buffer. */
1388 std_len
= virtio_net_hdr_size
+ VLAN_ETH_HEADER_LEN
+ mtu
;
1389 for (i
= 0; i
< NETDEV_MAX_BURST
; i
++) {
1390 struct dp_packet
*buffer
;
1391 struct dp_packet
*pkt
;
1392 struct iovec iov
[IOV_TSO_SIZE
];
1394 /* Assume Ethernet port. No need to set packet_type. */
1395 buffer
= dp_packet_new_with_headroom(std_len
, DP_NETDEV_HEADROOM
);
1396 iov
[IOV_PACKET
].iov_base
= dp_packet_data(buffer
);
1397 iov
[IOV_PACKET
].iov_len
= std_len
;
1398 if (iovlen
== IOV_TSO_SIZE
) {
1399 iov
[IOV_AUXBUF
].iov_base
= dp_packet_data(rx
->aux_bufs
[i
]);
1400 iov
[IOV_AUXBUF
].iov_len
= dp_packet_tailroom(rx
->aux_bufs
[i
]);
1404 retval
= readv(rx
->fd
, iov
, iovlen
);
1405 } while (retval
< 0 && errno
== EINTR
);
1408 dp_packet_delete(buffer
);
1412 if (retval
> std_len
) {
1413 /* Build a single linear TSO packet by prepending the data from
1414 * std_len buffer to the aux_buf. */
1415 pkt
= rx
->aux_bufs
[i
];
1416 dp_packet_set_size(pkt
, retval
- std_len
);
1417 dp_packet_push(pkt
, dp_packet_data(buffer
), std_len
);
1418 /* The headroom should be the same in buffers[i], pkt and
1419 * DP_NETDEV_HEADROOM. */
1420 dp_packet_resize(pkt
, DP_NETDEV_HEADROOM
, 0);
1421 dp_packet_delete(buffer
);
1422 rx
->aux_bufs
[i
] = NULL
;
1424 dp_packet_set_size(buffer
, dp_packet_size(buffer
) + retval
);
1428 if (virtio_net_hdr_size
&& netdev_linux_parse_vnet_hdr(pkt
)) {
1429 struct netdev
*netdev_
= netdev_rxq_get_netdev(&rx
->up
);
1430 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1432 /* Unexpected error situation: the virtio header is not present
1433 * or corrupted. Drop the packet but continue in case next ones
1435 dp_packet_delete(pkt
);
1436 netdev
->rx_dropped
+= 1;
1437 VLOG_WARN_RL(&rl
, "%s: Dropped packet: Invalid virtio net header",
1438 netdev_get_name(netdev_
));
1442 dp_packet_batch_add(batch
, pkt
);
1445 if ((i
== 0) && (retval
< 0)) {
1453 netdev_linux_rxq_recv(struct netdev_rxq
*rxq_
, struct dp_packet_batch
*batch
,
1456 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1457 struct netdev
*netdev
= rx
->up
.netdev
;
1461 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
)) {
1462 mtu
= ETH_PAYLOAD_MAX
;
1465 if (userspace_tso_enabled()) {
1466 /* Allocate TSO packets. The packet has enough headroom to store
1467 * a full non-TSO packet. When a TSO packet is received, the data
1468 * from non-TSO buffer (std_len) is prepended to the TSO packet
1470 size_t std_len
= sizeof(struct virtio_net_hdr
) + VLAN_ETH_HEADER_LEN
1471 + DP_NETDEV_HEADROOM
+ mtu
;
1472 size_t data_len
= LINUX_RXQ_TSO_MAX_LEN
- std_len
;
1473 for (int i
= 0; i
< NETDEV_MAX_BURST
; i
++) {
1474 if (rx
->aux_bufs
[i
]) {
1478 rx
->aux_bufs
[i
] = dp_packet_new_with_headroom(data_len
, std_len
);
1482 dp_packet_batch_init(batch
);
1483 retval
= (rx
->is_tap
1484 ? netdev_linux_batch_rxq_recv_tap(rx
, mtu
, batch
)
1485 : netdev_linux_batch_rxq_recv_sock(rx
, mtu
, batch
));
1488 if (retval
!= EAGAIN
&& retval
!= EMSGSIZE
) {
1489 VLOG_WARN_RL(&rl
, "error receiving Ethernet packet on %s: %s",
1490 netdev_rxq_get_name(rxq_
), ovs_strerror(errno
));
1502 netdev_linux_rxq_wait(struct netdev_rxq
*rxq_
)
1504 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1505 poll_fd_wait(rx
->fd
, POLLIN
);
1509 netdev_linux_rxq_drain(struct netdev_rxq
*rxq_
)
1511 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1514 int error
= af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_
), &ifr
,
1515 SIOCGIFTXQLEN
, "SIOCGIFTXQLEN");
1519 drain_fd(rx
->fd
, ifr
.ifr_qlen
);
1522 return drain_rcvbuf(rx
->fd
);
1527 netdev_linux_sock_batch_send(int sock
, int ifindex
, bool tso
, int mtu
,
1528 struct dp_packet_batch
*batch
)
1530 const size_t size
= dp_packet_batch_size(batch
);
1531 /* We don't bother setting most fields in sockaddr_ll because the
1532 * kernel ignores them for SOCK_RAW. */
1533 struct sockaddr_ll sll
= { .sll_family
= AF_PACKET
,
1534 .sll_ifindex
= ifindex
};
1536 struct mmsghdr
*mmsg
= xmalloc(sizeof(*mmsg
) * size
);
1537 struct iovec
*iov
= xmalloc(sizeof(*iov
) * size
);
1539 struct dp_packet
*packet
;
1540 DP_PACKET_BATCH_FOR_EACH (i
, packet
, batch
) {
1542 netdev_linux_prepend_vnet_hdr(packet
, mtu
);
1545 iov
[i
].iov_base
= dp_packet_data(packet
);
1546 iov
[i
].iov_len
= dp_packet_size(packet
);
1547 mmsg
[i
].msg_hdr
= (struct msghdr
) { .msg_name
= &sll
,
1548 .msg_namelen
= sizeof sll
,
1554 for (uint32_t ofs
= 0; ofs
< size
; ) {
1557 retval
= sendmmsg(sock
, mmsg
+ ofs
, size
- ofs
, 0);
1558 error
= retval
< 0 ? errno
: 0;
1559 } while (error
== EINTR
);
1571 /* Use the tap fd to send 'batch' to tap device 'netdev'. Using the tap fd is
1572 * essential, because packets sent to a tap device with an AF_PACKET socket
1573 * will loop back to be *received* again on the tap device. This doesn't occur
1574 * on other interface types because we attach a socket filter to the rx
1577 netdev_linux_tap_batch_send(struct netdev
*netdev_
, bool tso
, int mtu
,
1578 struct dp_packet_batch
*batch
)
1580 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1581 struct dp_packet
*packet
;
1583 /* The Linux tap driver returns EIO if the device is not up,
1584 * so if the device is not up, don't waste time sending it.
1585 * However, if the device is in another network namespace
1586 * then OVS can't retrieve the state. In that case, send the
1587 * packets anyway. */
1588 if (netdev
->present
&& !(netdev
->ifi_flags
& IFF_UP
)) {
1589 netdev
->tx_dropped
+= dp_packet_batch_size(batch
);
1593 DP_PACKET_BATCH_FOR_EACH (i
, packet
, batch
) {
1599 netdev_linux_prepend_vnet_hdr(packet
, mtu
);
1602 size
= dp_packet_size(packet
);
1604 retval
= write(netdev
->tap_fd
, dp_packet_data(packet
), size
);
1605 error
= retval
< 0 ? errno
: 0;
1606 } while (error
== EINTR
);
1609 /* The Linux tap driver returns EIO if the device is not up. From
1610 * the OVS side this is not an error, so we ignore it; otherwise,
 1611 * return the error. */
1615 } else if (retval
!= size
) {
1616 VLOG_WARN_RL(&rl
, "sent partial Ethernet packet (%"PRIuSIZE
" "
1617 "bytes of %"PRIuSIZE
") on %s",
1618 retval
, size
, netdev_get_name(netdev_
));
1626 netdev_linux_get_numa_id__(struct netdev_linux
*netdev
)
1627 OVS_REQUIRES(netdev
->mutex
)
1629 char *numa_node_path
;
1634 if (netdev
->cache_valid
& VALID_NUMA_ID
) {
1635 return netdev
->numa_id
;
1638 netdev
->numa_id
= 0;
1639 netdev
->cache_valid
|= VALID_NUMA_ID
;
1641 if (ovs_numa_get_n_numas() < 2) {
1642 /* No need to check on system with a single NUMA node. */
1646 name
= netdev_get_name(&netdev
->up
);
1647 if (strpbrk(name
, "/\\")) {
1648 VLOG_ERR_RL(&rl
, "\"%s\" is not a valid name for a port. "
1649 "A valid name must not include '/' or '\\'."
1650 "Using numa_id 0", name
);
1654 numa_node_path
= xasprintf("/sys/class/net/%s/device/numa_node", name
);
1656 stream
= fopen(numa_node_path
, "r");
1658 /* Virtual device does not have this info. */
1659 VLOG_INFO_RL(&rl
, "%s: Can't open '%s': %s, using numa_id 0",
1660 name
, numa_node_path
, ovs_strerror(errno
));
1661 free(numa_node_path
);
1665 if (fscanf(stream
, "%d", &node_id
) != 1
1666 || !ovs_numa_numa_id_is_valid(node_id
)) {
1667 VLOG_WARN_RL(&rl
, "%s: Can't detect NUMA node, using numa_id 0", name
);
1671 netdev
->numa_id
= node_id
;
1673 free(numa_node_path
);
/* Thread-safe wrapper around netdev_linux_get_numa_id__(): takes the device
 * mutex and returns the NUMA node id for 'netdev_' (0 when unknown). */
static int OVS_UNUSED
netdev_linux_get_numa_id(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int numa_id;

    ovs_mutex_lock(&netdev->mutex);
    numa_id = netdev_linux_get_numa_id__(netdev);
    ovs_mutex_unlock(&netdev->mutex);

    return numa_id;
}
1690 /* Sends 'batch' on 'netdev'. Returns 0 if successful, otherwise a positive
1691 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1692 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1693 * the packet is too big or too small to transmit on the device.
1695 * The kernel maintains a packet transmission queue, so the caller is not
1696 * expected to do additional queuing of packets. */
1698 netdev_linux_send(struct netdev
*netdev_
, int qid OVS_UNUSED
,
1699 struct dp_packet_batch
*batch
,
1700 bool concurrent_txq OVS_UNUSED
)
1702 bool tso
= userspace_tso_enabled();
1703 int mtu
= ETH_PAYLOAD_MAX
;
1708 netdev_linux_get_mtu__(netdev_linux_cast(netdev_
), &mtu
);
1711 if (!is_tap_netdev(netdev_
)) {
1712 if (netdev_linux_netnsid_is_remote(netdev_linux_cast(netdev_
))) {
1717 sock
= af_packet_sock();
1723 int ifindex
= netdev_get_ifindex(netdev_
);
1729 error
= netdev_linux_sock_batch_send(sock
, ifindex
, tso
, mtu
, batch
);
1731 error
= netdev_linux_tap_batch_send(netdev_
, tso
, mtu
, batch
);
1734 if (error
== ENOBUFS
) {
1735 /* The Linux AF_PACKET implementation never blocks waiting
1736 * for room for packets, instead returning ENOBUFS.
1737 * Translate this into EAGAIN for the caller. */
1740 VLOG_WARN_RL(&rl
, "error sending Ethernet packet on %s: %s",
1741 netdev_get_name(netdev_
), ovs_strerror(error
));
1746 dp_packet_delete_batch(batch
, true);
1750 /* Registers with the poll loop to wake up from the next call to poll_block()
1751 * when the packet transmission queue has sufficient room to transmit a packet
1752 * with netdev_send().
1754 * The kernel maintains a packet transmission queue, so the client is not
1755 * expected to do additional queuing of packets. Thus, this function is
1756 * unlikely to ever be used. It is included for completeness. */
1758 netdev_linux_send_wait(struct netdev
*netdev
, int qid OVS_UNUSED
)
1760 if (is_tap_netdev(netdev
)) {
1761 /* TAP device always accepts packets.*/
1762 poll_immediate_wake();
1766 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1767 * otherwise a positive errno value. */
1769 netdev_linux_set_etheraddr(struct netdev
*netdev_
, const struct eth_addr mac
)
1771 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1772 enum netdev_flags old_flags
= 0;
1775 ovs_mutex_lock(&netdev
->mutex
);
1776 if (netdev_linux_netnsid_is_remote(netdev
)) {
1781 if (netdev
->cache_valid
& VALID_ETHERADDR
) {
1782 error
= netdev
->ether_addr_error
;
1783 if (error
|| eth_addr_equals(netdev
->etheraddr
, mac
)) {
1786 netdev
->cache_valid
&= ~VALID_ETHERADDR
;
1789 /* Tap devices must be brought down before setting the address. */
1790 if (is_tap_netdev(netdev_
)) {
1791 update_flags(netdev
, NETDEV_UP
, 0, &old_flags
);
1793 error
= set_etheraddr(netdev_get_name(netdev_
), mac
);
1794 if (!error
|| error
== ENODEV
) {
1795 netdev
->ether_addr_error
= error
;
1796 netdev
->cache_valid
|= VALID_ETHERADDR
;
1798 netdev
->etheraddr
= mac
;
1802 if (is_tap_netdev(netdev_
) && old_flags
& NETDEV_UP
) {
1803 update_flags(netdev
, 0, NETDEV_UP
, &old_flags
);
1807 ovs_mutex_unlock(&netdev
->mutex
);
1811 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1813 netdev_linux_get_etheraddr(const struct netdev
*netdev_
, struct eth_addr
*mac
)
1815 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1818 ovs_mutex_lock(&netdev
->mutex
);
1819 if (!(netdev
->cache_valid
& VALID_ETHERADDR
)) {
1820 netdev_linux_update_via_netlink(netdev
);
1823 if (!(netdev
->cache_valid
& VALID_ETHERADDR
)) {
1824 /* Fall back to ioctl if netlink fails */
1825 netdev
->ether_addr_error
= get_etheraddr(netdev_get_name(netdev_
),
1826 &netdev
->etheraddr
);
1827 netdev
->cache_valid
|= VALID_ETHERADDR
;
1830 error
= netdev
->ether_addr_error
;
1832 *mac
= netdev
->etheraddr
;
1834 ovs_mutex_unlock(&netdev
->mutex
);
1840 netdev_linux_get_mtu__(struct netdev_linux
*netdev
, int *mtup
)
1844 if (!(netdev
->cache_valid
& VALID_MTU
)) {
1845 netdev_linux_update_via_netlink(netdev
);
1848 if (!(netdev
->cache_valid
& VALID_MTU
)) {
1849 /* Fall back to ioctl if netlink fails */
1852 netdev
->netdev_mtu_error
= af_inet_ifreq_ioctl(
1853 netdev_get_name(&netdev
->up
), &ifr
, SIOCGIFMTU
, "SIOCGIFMTU");
1854 netdev
->mtu
= ifr
.ifr_mtu
;
1855 netdev
->cache_valid
|= VALID_MTU
;
1858 error
= netdev
->netdev_mtu_error
;
1860 *mtup
= netdev
->mtu
;
/* Returns the maximum size of transmitted (and received) packets on 'netdev_',
 * in bytes, not including the hardware header; thus, this is typically 1500
 * bytes for Ethernet devices.  Locked wrapper around
 * netdev_linux_get_mtu__(); '*mtup' receives the MTU on success. */
static int
netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    error = netdev_linux_get_mtu__(netdev, mtup);
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
1882 /* Sets the maximum size of transmitted (MTU) for given device using linux
1883 * networking ioctl interface.
1886 netdev_linux_set_mtu(struct netdev
*netdev_
, int mtu
)
1888 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1892 ovs_mutex_lock(&netdev
->mutex
);
1893 if (netdev_linux_netnsid_is_remote(netdev
)) {
1899 if (netdev_get_class(netdev_
) == &netdev_afxdp_class
) {
1900 error
= netdev_afxdp_verify_mtu_size(netdev_
, mtu
);
1907 if (netdev
->cache_valid
& VALID_MTU
) {
1908 error
= netdev
->netdev_mtu_error
;
1909 if (error
|| netdev
->mtu
== mtu
) {
1912 netdev
->cache_valid
&= ~VALID_MTU
;
1915 error
= af_inet_ifreq_ioctl(netdev_get_name(netdev_
), &ifr
,
1916 SIOCSIFMTU
, "SIOCSIFMTU");
1917 if (!error
|| error
== ENODEV
) {
1918 netdev
->netdev_mtu_error
= error
;
1919 netdev
->mtu
= ifr
.ifr_mtu
;
1920 netdev
->cache_valid
|= VALID_MTU
;
1923 ovs_mutex_unlock(&netdev
->mutex
);
1927 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1928 * On failure, returns a negative errno value. */
1930 netdev_linux_get_ifindex(const struct netdev
*netdev_
)
1932 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1935 ovs_mutex_lock(&netdev
->mutex
);
1936 if (netdev_linux_netnsid_is_remote(netdev
)) {
1940 error
= get_ifindex(netdev_
, &ifindex
);
1943 ovs_mutex_unlock(&netdev
->mutex
);
1944 return error
? -error
: ifindex
;
/* Stores link ("carrier") state of 'netdev_' in '*carrier'.  When MII
 * monitoring is enabled the cached miimon result is used; otherwise the
 * carrier is derived from the kernel's IFF_RUNNING interface flag.
 * Always succeeds. */
static int
netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    ovs_mutex_lock(&netdev->mutex);
    if (netdev->miimon_interval > 0) {
        *carrier = netdev->miimon;
    } else {
        *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return 0;
}
1963 static long long int
1964 netdev_linux_get_carrier_resets(const struct netdev
*netdev_
)
1966 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1967 long long int carrier_resets
;
1969 ovs_mutex_lock(&netdev
->mutex
);
1970 carrier_resets
= netdev
->carrier_resets
;
1971 ovs_mutex_unlock(&netdev
->mutex
);
1973 return carrier_resets
;
/* Issues MII ioctl 'cmd' (called 'cmd_name' for logging purposes) on
 * interface 'name', passing 'data' in and copying the kernel's result back
 * out to 'data'.  Returns 0 on success, otherwise a positive errno value.
 *
 * Note that 'data' is copied over ifr.ifr_data itself rather than through
 * the pointer; the kernel's MII interface packs mii_ioctl_data into the
 * ifreq union -- NOTE(review): assumes sizeof *data fits there; confirm. */
static int
netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
                       struct mii_ioctl_data *data)
{
    struct ifreq ifr;
    int error;

    memset(&ifr, 0, sizeof ifr);
    memcpy(&ifr.ifr_data, data, sizeof *data);
    error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
    memcpy(data, &ifr.ifr_data, sizeof *data);

    return error;
}
1992 netdev_linux_get_miimon(const char *name
, bool *miimon
)
1994 struct mii_ioctl_data data
;
1999 memset(&data
, 0, sizeof data
);
2000 error
= netdev_linux_do_miimon(name
, SIOCGMIIPHY
, "SIOCGMIIPHY", &data
);
2002 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
2003 data
.reg_num
= MII_BMSR
;
2004 error
= netdev_linux_do_miimon(name
, SIOCGMIIREG
, "SIOCGMIIREG",
2008 *miimon
= !!(data
.val_out
& BMSR_LSTATUS
);
2012 struct ethtool_cmd ecmd
;
2014 VLOG_DBG_RL(&rl
, "%s: failed to query MII, falling back to ethtool",
2017 COVERAGE_INC(netdev_get_ethtool
);
2018 memset(&ecmd
, 0, sizeof ecmd
);
2019 error
= netdev_linux_do_ethtool(name
, &ecmd
, ETHTOOL_GLINK
,
2022 struct ethtool_value eval
;
2024 memcpy(&eval
, &ecmd
, sizeof eval
);
2025 *miimon
= !!eval
.data
;
2027 VLOG_WARN_RL(&rl
, "%s: ethtool link status failed", name
);
/* Sets the MII monitoring interval for 'netdev_' to 'interval' milliseconds.
 * A positive interval is clamped to at least 100 ms; zero or negative
 * disables miimon.  Maintains the global count of miimon users and expires
 * the timer so the new setting takes effect on the next miimon run.
 * Always succeeds. */
static int
netdev_linux_set_miimon_interval(struct netdev *netdev_,
                                 long long int interval)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    ovs_mutex_lock(&netdev->mutex);
    interval = interval > 0 ? MAX(interval, 100) : 0;
    if (netdev->miimon_interval != interval) {
        /* Keep the global miimon user count in sync with enable/disable
         * transitions. */
        if (interval && !netdev->miimon_interval) {
            atomic_count_inc(&miimon_cnt);
        } else if (!interval && netdev->miimon_interval) {
            atomic_count_dec(&miimon_cnt);
        }

        netdev->miimon_interval = interval;
        /* Force an immediate poll with the new interval. */
        timer_set_expired(&netdev->miimon_timer);
    }
    ovs_mutex_unlock(&netdev->mutex);

    return 0;
}
/* Periodic MII link-monitoring work: for each netdev-linux device whose
 * miimon timer has expired, re-reads link state and, if it changed, records
 * the new state and reports the change to the netdev layer. */
static void
netdev_linux_miimon_run(void)
{
    struct shash device_shash;
    struct shash_node *node;

    shash_init(&device_shash);
    netdev_get_devices(&netdev_linux_class, &device_shash);
    SHASH_FOR_EACH (node, &device_shash) {
        struct netdev *netdev = node->data;
        struct netdev_linux *dev = netdev_linux_cast(netdev);
        bool miimon;

        ovs_mutex_lock(&dev->mutex);
        if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
            netdev_linux_get_miimon(dev->up.name, &miimon);
            if (miimon != dev->miimon) {
                dev->miimon = miimon;
                /* Report the link change to interested clients. */
                netdev_linux_changed(dev, dev->ifi_flags, 0);
            }

            /* Re-arm the timer for the next poll. */
            timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
        }
        ovs_mutex_unlock(&dev->mutex);
        /* netdev_get_devices() took a reference on each device. */
        netdev_close(netdev);
    }

    shash_destroy(&device_shash);
}
/* Arranges to wake up the next poll_block() when any device's miimon timer
 * expires, so netdev_linux_miimon_run() gets called on time. */
static void
netdev_linux_miimon_wait(void)
{
    struct shash device_shash;
    struct shash_node *node;

    shash_init(&device_shash);
    netdev_get_devices(&netdev_linux_class, &device_shash);
    SHASH_FOR_EACH (node, &device_shash) {
        struct netdev *netdev = node->data;
        struct netdev_linux *dev = netdev_linux_cast(netdev);

        ovs_mutex_lock(&dev->mutex);
        if (dev->miimon_interval > 0) {
            timer_wait(&dev->miimon_timer);
        }
        ovs_mutex_unlock(&dev->mutex);
        /* netdev_get_devices() took a reference on each device. */
        netdev_close(netdev);
    }
    shash_destroy(&device_shash);
}
/* Exchanges the values pointed to by 'a' and 'b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t tmp = *a;
    *a = *b;
    *b = tmp;
}
/* Copies 'src' into 'dst', performing format conversion in the process.
 *
 * 'src' is allowed to be misaligned (hence the get_32aligned_u64()
 * accessors for every field). */
static void
netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
                                  const struct ovs_vport_stats *src)
{
    dst->rx_packets = get_32aligned_u64(&src->rx_packets);
    dst->tx_packets = get_32aligned_u64(&src->tx_packets);
    dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
    dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
    dst->rx_errors = get_32aligned_u64(&src->rx_errors);
    dst->tx_errors = get_32aligned_u64(&src->tx_errors);
    dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
    dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);

    /* The vport layer does not track these finer-grained error counters,
     * so report them as zero. */
    dst->collisions = 0;
    dst->rx_length_errors = 0;
    dst->rx_over_errors = 0;
    dst->rx_crc_errors = 0;
    dst->rx_frame_errors = 0;
    dst->rx_fifo_errors = 0;
    dst->rx_missed_errors = 0;
    dst->tx_aborted_errors = 0;
    dst->tx_carrier_errors = 0;
    dst->tx_fifo_errors = 0;
    dst->tx_heartbeat_errors = 0;
    dst->tx_window_errors = 0;
}
2148 get_stats_via_vport__(const struct netdev
*netdev
, struct netdev_stats
*stats
)
2150 struct dpif_netlink_vport reply
;
2154 error
= dpif_netlink_vport_get(netdev_get_name(netdev
), &reply
, &buf
);
2157 } else if (!reply
.stats
) {
2162 netdev_stats_from_ovs_vport_stats(stats
, reply
.stats
);
2170 get_stats_via_vport(const struct netdev
*netdev_
,
2171 struct netdev_stats
*stats
)
2173 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2175 if (!netdev
->vport_stats_error
||
2176 !(netdev
->cache_valid
& VALID_VPORT_STAT_ERROR
)) {
2179 error
= get_stats_via_vport__(netdev_
, stats
);
2180 if (error
&& error
!= ENOENT
&& error
!= ENODEV
) {
2181 VLOG_WARN_RL(&rl
, "%s: obtaining netdev stats via vport failed "
2183 netdev_get_name(netdev_
), ovs_strerror(error
));
2185 netdev
->vport_stats_error
= error
;
2186 netdev
->cache_valid
|= VALID_VPORT_STAT_ERROR
;
2190 /* Retrieves current device stats for 'netdev-linux'. */
2192 netdev_linux_get_stats(const struct netdev
*netdev_
,
2193 struct netdev_stats
*stats
)
2195 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2196 struct netdev_stats dev_stats
;
2199 ovs_mutex_lock(&netdev
->mutex
);
2200 get_stats_via_vport(netdev_
, stats
);
2201 error
= get_stats_via_netlink(netdev_
, &dev_stats
);
2203 if (!netdev
->vport_stats_error
) {
2206 } else if (netdev
->vport_stats_error
) {
2207 /* stats not available from OVS then use netdev stats. */
2210 stats
->multicast
+= dev_stats
.multicast
;
2211 stats
->collisions
+= dev_stats
.collisions
;
2212 stats
->rx_length_errors
+= dev_stats
.rx_length_errors
;
2213 stats
->rx_over_errors
+= dev_stats
.rx_over_errors
;
2214 stats
->rx_crc_errors
+= dev_stats
.rx_crc_errors
;
2215 stats
->rx_frame_errors
+= dev_stats
.rx_frame_errors
;
2216 stats
->rx_fifo_errors
+= dev_stats
.rx_fifo_errors
;
2217 stats
->rx_missed_errors
+= dev_stats
.rx_missed_errors
;
2218 stats
->tx_aborted_errors
+= dev_stats
.tx_aborted_errors
;
2219 stats
->tx_carrier_errors
+= dev_stats
.tx_carrier_errors
;
2220 stats
->tx_fifo_errors
+= dev_stats
.tx_fifo_errors
;
2221 stats
->tx_heartbeat_errors
+= dev_stats
.tx_heartbeat_errors
;
2222 stats
->tx_window_errors
+= dev_stats
.tx_window_errors
;
2224 ovs_mutex_unlock(&netdev
->mutex
);
2229 /* Retrieves current device stats for 'netdev-tap' netdev or
2230 * netdev-internal. */
2232 netdev_tap_get_stats(const struct netdev
*netdev_
, struct netdev_stats
*stats
)
2234 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2235 struct netdev_stats dev_stats
;
2238 ovs_mutex_lock(&netdev
->mutex
);
2239 get_stats_via_vport(netdev_
, stats
);
2240 error
= get_stats_via_netlink(netdev_
, &dev_stats
);
2242 if (!netdev
->vport_stats_error
) {
2245 } else if (netdev
->vport_stats_error
) {
2246 /* Transmit and receive stats will appear to be swapped relative to the
2247 * other ports since we are the one sending the data, not a remote
2248 * computer. For consistency, we swap them back here. This does not
2249 * apply if we are getting stats from the vport layer because it always
2250 * tracks stats from the perspective of the switch. */
2253 swap_uint64(&stats
->rx_packets
, &stats
->tx_packets
);
2254 swap_uint64(&stats
->rx_bytes
, &stats
->tx_bytes
);
2255 swap_uint64(&stats
->rx_errors
, &stats
->tx_errors
);
2256 swap_uint64(&stats
->rx_dropped
, &stats
->tx_dropped
);
2257 stats
->rx_length_errors
= 0;
2258 stats
->rx_over_errors
= 0;
2259 stats
->rx_crc_errors
= 0;
2260 stats
->rx_frame_errors
= 0;
2261 stats
->rx_fifo_errors
= 0;
2262 stats
->rx_missed_errors
= 0;
2263 stats
->tx_aborted_errors
= 0;
2264 stats
->tx_carrier_errors
= 0;
2265 stats
->tx_fifo_errors
= 0;
2266 stats
->tx_heartbeat_errors
= 0;
2267 stats
->tx_window_errors
= 0;
2269 /* Use kernel netdev's packet and byte counts since vport counters
2270 * do not reflect packet counts on the wire when GSO, TSO or GRO
2272 stats
->rx_packets
= dev_stats
.tx_packets
;
2273 stats
->rx_bytes
= dev_stats
.tx_bytes
;
2274 stats
->tx_packets
= dev_stats
.rx_packets
;
2275 stats
->tx_bytes
= dev_stats
.rx_bytes
;
2277 stats
->rx_dropped
+= dev_stats
.tx_dropped
;
2278 stats
->tx_dropped
+= dev_stats
.rx_dropped
;
2280 stats
->rx_errors
+= dev_stats
.tx_errors
;
2281 stats
->tx_errors
+= dev_stats
.rx_errors
;
2283 stats
->multicast
+= dev_stats
.multicast
;
2284 stats
->collisions
+= dev_stats
.collisions
;
2286 stats
->tx_dropped
+= netdev
->tx_dropped
;
2287 stats
->rx_dropped
+= netdev
->rx_dropped
;
2288 ovs_mutex_unlock(&netdev
->mutex
);
/* Retrieves stats for an internal netdev purely from the vport layer.
 * Returns the cached vport stats error (0 on success). */
static int
netdev_internal_get_stats(const struct netdev *netdev_,
                          struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    get_stats_via_vport(netdev_, stats);
    error = netdev->vport_stats_error;
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2309 netdev_linux_read_features(struct netdev_linux
*netdev
)
2311 struct ethtool_cmd ecmd
;
2315 if (netdev
->cache_valid
& VALID_FEATURES
) {
2319 COVERAGE_INC(netdev_get_ethtool
);
2320 memset(&ecmd
, 0, sizeof ecmd
);
2321 error
= netdev_linux_do_ethtool(netdev
->up
.name
, &ecmd
,
2322 ETHTOOL_GSET
, "ETHTOOL_GSET");
2327 /* Supported features. */
2328 netdev
->supported
= 0;
2329 if (ecmd
.supported
& SUPPORTED_10baseT_Half
) {
2330 netdev
->supported
|= NETDEV_F_10MB_HD
;
2332 if (ecmd
.supported
& SUPPORTED_10baseT_Full
) {
2333 netdev
->supported
|= NETDEV_F_10MB_FD
;
2335 if (ecmd
.supported
& SUPPORTED_100baseT_Half
) {
2336 netdev
->supported
|= NETDEV_F_100MB_HD
;
2338 if (ecmd
.supported
& SUPPORTED_100baseT_Full
) {
2339 netdev
->supported
|= NETDEV_F_100MB_FD
;
2341 if (ecmd
.supported
& SUPPORTED_1000baseT_Half
) {
2342 netdev
->supported
|= NETDEV_F_1GB_HD
;
2344 if ((ecmd
.supported
& SUPPORTED_1000baseT_Full
) ||
2345 (ecmd
.supported
& SUPPORTED_1000baseKX_Full
)) {
2346 netdev
->supported
|= NETDEV_F_1GB_FD
;
2348 if ((ecmd
.supported
& SUPPORTED_10000baseT_Full
) ||
2349 (ecmd
.supported
& SUPPORTED_10000baseKX4_Full
) ||
2350 (ecmd
.supported
& SUPPORTED_10000baseKR_Full
) ||
2351 (ecmd
.supported
& SUPPORTED_10000baseR_FEC
)) {
2352 netdev
->supported
|= NETDEV_F_10GB_FD
;
2354 if ((ecmd
.supported
& SUPPORTED_40000baseKR4_Full
) ||
2355 (ecmd
.supported
& SUPPORTED_40000baseCR4_Full
) ||
2356 (ecmd
.supported
& SUPPORTED_40000baseSR4_Full
) ||
2357 (ecmd
.supported
& SUPPORTED_40000baseLR4_Full
)) {
2358 netdev
->supported
|= NETDEV_F_40GB_FD
;
2360 if (ecmd
.supported
& SUPPORTED_TP
) {
2361 netdev
->supported
|= NETDEV_F_COPPER
;
2363 if (ecmd
.supported
& SUPPORTED_FIBRE
) {
2364 netdev
->supported
|= NETDEV_F_FIBER
;
2366 if (ecmd
.supported
& SUPPORTED_Autoneg
) {
2367 netdev
->supported
|= NETDEV_F_AUTONEG
;
2369 if (ecmd
.supported
& SUPPORTED_Pause
) {
2370 netdev
->supported
|= NETDEV_F_PAUSE
;
2372 if (ecmd
.supported
& SUPPORTED_Asym_Pause
) {
2373 netdev
->supported
|= NETDEV_F_PAUSE_ASYM
;
2376 /* Advertised features. */
2377 netdev
->advertised
= 0;
2378 if (ecmd
.advertising
& ADVERTISED_10baseT_Half
) {
2379 netdev
->advertised
|= NETDEV_F_10MB_HD
;
2381 if (ecmd
.advertising
& ADVERTISED_10baseT_Full
) {
2382 netdev
->advertised
|= NETDEV_F_10MB_FD
;
2384 if (ecmd
.advertising
& ADVERTISED_100baseT_Half
) {
2385 netdev
->advertised
|= NETDEV_F_100MB_HD
;
2387 if (ecmd
.advertising
& ADVERTISED_100baseT_Full
) {
2388 netdev
->advertised
|= NETDEV_F_100MB_FD
;
2390 if (ecmd
.advertising
& ADVERTISED_1000baseT_Half
) {
2391 netdev
->advertised
|= NETDEV_F_1GB_HD
;
2393 if ((ecmd
.advertising
& ADVERTISED_1000baseT_Full
) ||
2394 (ecmd
.advertising
& ADVERTISED_1000baseKX_Full
)) {
2395 netdev
->advertised
|= NETDEV_F_1GB_FD
;
2397 if ((ecmd
.advertising
& ADVERTISED_10000baseT_Full
) ||
2398 (ecmd
.advertising
& ADVERTISED_10000baseKX4_Full
) ||
2399 (ecmd
.advertising
& ADVERTISED_10000baseKR_Full
) ||
2400 (ecmd
.advertising
& ADVERTISED_10000baseR_FEC
)) {
2401 netdev
->advertised
|= NETDEV_F_10GB_FD
;
2403 if ((ecmd
.advertising
& ADVERTISED_40000baseKR4_Full
) ||
2404 (ecmd
.advertising
& ADVERTISED_40000baseCR4_Full
) ||
2405 (ecmd
.advertising
& ADVERTISED_40000baseSR4_Full
) ||
2406 (ecmd
.advertising
& ADVERTISED_40000baseLR4_Full
)) {
2407 netdev
->advertised
|= NETDEV_F_40GB_FD
;
2409 if (ecmd
.advertising
& ADVERTISED_TP
) {
2410 netdev
->advertised
|= NETDEV_F_COPPER
;
2412 if (ecmd
.advertising
& ADVERTISED_FIBRE
) {
2413 netdev
->advertised
|= NETDEV_F_FIBER
;
2415 if (ecmd
.advertising
& ADVERTISED_Autoneg
) {
2416 netdev
->advertised
|= NETDEV_F_AUTONEG
;
2418 if (ecmd
.advertising
& ADVERTISED_Pause
) {
2419 netdev
->advertised
|= NETDEV_F_PAUSE
;
2421 if (ecmd
.advertising
& ADVERTISED_Asym_Pause
) {
2422 netdev
->advertised
|= NETDEV_F_PAUSE_ASYM
;
2425 /* Current settings. */
2426 speed
= ethtool_cmd_speed(&ecmd
);
2427 if (speed
== SPEED_10
) {
2428 netdev
->current
= ecmd
.duplex
? NETDEV_F_10MB_FD
: NETDEV_F_10MB_HD
;
2429 } else if (speed
== SPEED_100
) {
2430 netdev
->current
= ecmd
.duplex
? NETDEV_F_100MB_FD
: NETDEV_F_100MB_HD
;
2431 } else if (speed
== SPEED_1000
) {
2432 netdev
->current
= ecmd
.duplex
? NETDEV_F_1GB_FD
: NETDEV_F_1GB_HD
;
2433 } else if (speed
== SPEED_10000
) {
2434 netdev
->current
= NETDEV_F_10GB_FD
;
2435 } else if (speed
== 40000) {
2436 netdev
->current
= NETDEV_F_40GB_FD
;
2437 } else if (speed
== 100000) {
2438 netdev
->current
= NETDEV_F_100GB_FD
;
2439 } else if (speed
== 1000000) {
2440 netdev
->current
= NETDEV_F_1TB_FD
;
2442 netdev
->current
= 0;
2445 if (ecmd
.port
== PORT_TP
) {
2446 netdev
->current
|= NETDEV_F_COPPER
;
2447 } else if (ecmd
.port
== PORT_FIBRE
) {
2448 netdev
->current
|= NETDEV_F_FIBER
;
2452 netdev
->current
|= NETDEV_F_AUTONEG
;
2456 netdev
->cache_valid
|= VALID_FEATURES
;
2457 netdev
->get_features_error
= error
;
2460 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
2461 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
2462 * Returns 0 if successful, otherwise a positive errno value. */
2464 netdev_linux_get_features(const struct netdev
*netdev_
,
2465 enum netdev_features
*current
,
2466 enum netdev_features
*advertised
,
2467 enum netdev_features
*supported
,
2468 enum netdev_features
*peer
)
2470 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2473 ovs_mutex_lock(&netdev
->mutex
);
2474 if (netdev_linux_netnsid_is_remote(netdev
)) {
2479 netdev_linux_read_features(netdev
);
2480 if (!netdev
->get_features_error
) {
2481 *current
= netdev
->current
;
2482 *advertised
= netdev
->advertised
;
2483 *supported
= netdev
->supported
;
2484 *peer
= 0; /* XXX */
2486 error
= netdev
->get_features_error
;
2489 ovs_mutex_unlock(&netdev
->mutex
);
2493 /* Set the features advertised by 'netdev' to 'advertise'. */
2495 netdev_linux_set_advertisements(struct netdev
*netdev_
,
2496 enum netdev_features advertise
)
2498 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2499 struct ethtool_cmd ecmd
;
2502 ovs_mutex_lock(&netdev
->mutex
);
2504 COVERAGE_INC(netdev_get_ethtool
);
2506 if (netdev_linux_netnsid_is_remote(netdev
)) {
2511 memset(&ecmd
, 0, sizeof ecmd
);
2512 error
= netdev_linux_do_ethtool(netdev_get_name(netdev_
), &ecmd
,
2513 ETHTOOL_GSET
, "ETHTOOL_GSET");
2518 ecmd
.advertising
= 0;
2519 if (advertise
& NETDEV_F_10MB_HD
) {
2520 ecmd
.advertising
|= ADVERTISED_10baseT_Half
;
2522 if (advertise
& NETDEV_F_10MB_FD
) {
2523 ecmd
.advertising
|= ADVERTISED_10baseT_Full
;
2525 if (advertise
& NETDEV_F_100MB_HD
) {
2526 ecmd
.advertising
|= ADVERTISED_100baseT_Half
;
2528 if (advertise
& NETDEV_F_100MB_FD
) {
2529 ecmd
.advertising
|= ADVERTISED_100baseT_Full
;
2531 if (advertise
& NETDEV_F_1GB_HD
) {
2532 ecmd
.advertising
|= ADVERTISED_1000baseT_Half
;
2534 if (advertise
& NETDEV_F_1GB_FD
) {
2535 ecmd
.advertising
|= ADVERTISED_1000baseT_Full
;
2537 if (advertise
& NETDEV_F_10GB_FD
) {
2538 ecmd
.advertising
|= ADVERTISED_10000baseT_Full
;
2540 if (advertise
& NETDEV_F_COPPER
) {
2541 ecmd
.advertising
|= ADVERTISED_TP
;
2543 if (advertise
& NETDEV_F_FIBER
) {
2544 ecmd
.advertising
|= ADVERTISED_FIBRE
;
2546 if (advertise
& NETDEV_F_AUTONEG
) {
2547 ecmd
.advertising
|= ADVERTISED_Autoneg
;
2549 if (advertise
& NETDEV_F_PAUSE
) {
2550 ecmd
.advertising
|= ADVERTISED_Pause
;
2552 if (advertise
& NETDEV_F_PAUSE_ASYM
) {
2553 ecmd
.advertising
|= ADVERTISED_Asym_Pause
;
2555 COVERAGE_INC(netdev_set_ethtool
);
2556 error
= netdev_linux_do_ethtool(netdev_get_name(netdev_
), &ecmd
,
2557 ETHTOOL_SSET
, "ETHTOOL_SSET");
2560 ovs_mutex_unlock(&netdev
->mutex
);
2564 static struct tc_police
2565 tc_matchall_fill_police(uint32_t kbits_rate
, uint32_t kbits_burst
)
2567 unsigned int bsize
= MIN(UINT32_MAX
/ 1024, kbits_burst
) * 1024 / 64;
2568 unsigned int bps
= ((uint64_t) kbits_rate
* 1000) / 8;
2569 struct tc_police police
;
2570 struct tc_ratespec rate
;
2573 memset(&rate
, 0, sizeof rate
);
2575 rate
.cell_log
= tc_calc_cell_log(mtu
);
2576 rate
.mpu
= ETH_TOTAL_MIN
;
2578 memset(&police
, 0, sizeof police
);
2579 police
.burst
= tc_bytes_to_ticks(bps
, bsize
);
2580 police
.action
= TC_POLICE_SHOT
;
2588 nl_msg_put_act_police(struct ofpbuf
*request
, struct tc_police police
)
2592 nl_msg_put_string(request
, TCA_ACT_KIND
, "police");
2593 offset
= nl_msg_start_nested(request
, TCA_ACT_OPTIONS
);
2594 nl_msg_put_unspec(request
, TCA_POLICE_TBF
, &police
, sizeof police
);
2595 tc_put_rtab(request
, TCA_POLICE_RATE
, &police
.rate
);
2596 nl_msg_put_u32(request
, TCA_POLICE_RESULT
, TC_ACT_UNSPEC
);
2597 nl_msg_end_nested(request
, offset
);
2601 tc_add_matchall_policer(struct netdev
*netdev
, uint32_t kbits_rate
,
2602 uint32_t kbits_burst
)
2604 uint16_t eth_type
= (OVS_FORCE
uint16_t) htons(ETH_P_ALL
);
2605 size_t basic_offset
, action_offset
, inner_offset
;
2606 uint16_t prio
= TC_RESERVED_PRIORITY_POLICE
;
2607 int ifindex
, err
= 0;
2608 struct tc_police pol_act
;
2609 struct ofpbuf request
;
2610 struct ofpbuf
*reply
;
2611 struct tcmsg
*tcmsg
;
2612 uint32_t handle
= 1;
2614 err
= get_ifindex(netdev
, &ifindex
);
2619 tcmsg
= tc_make_request(ifindex
, RTM_NEWTFILTER
, NLM_F_CREATE
| NLM_F_ECHO
,
2621 tcmsg
->tcm_parent
= TC_INGRESS_PARENT
;
2622 tcmsg
->tcm_info
= tc_make_handle(prio
, eth_type
);
2623 tcmsg
->tcm_handle
= handle
;
2625 pol_act
= tc_matchall_fill_police(kbits_rate
, kbits_burst
);
2626 nl_msg_put_string(&request
, TCA_KIND
, "matchall");
2627 basic_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
2628 action_offset
= nl_msg_start_nested(&request
, TCA_MATCHALL_ACT
);
2629 inner_offset
= nl_msg_start_nested(&request
, 1);
2630 nl_msg_put_act_police(&request
, pol_act
);
2631 nl_msg_end_nested(&request
, inner_offset
);
2632 nl_msg_end_nested(&request
, action_offset
);
2633 nl_msg_end_nested(&request
, basic_offset
);
2635 err
= tc_transact(&request
, &reply
);
2638 ofpbuf_at_assert(reply
, NLMSG_HDRLEN
, sizeof *tc
);
2639 ofpbuf_delete(reply
);
2646 tc_del_matchall_policer(struct netdev
*netdev
)
2648 int prio
= TC_RESERVED_PRIORITY_POLICE
;
2649 uint32_t block_id
= 0;
2654 err
= get_ifindex(netdev
, &ifindex
);
2659 id
= tc_make_tcf_id(ifindex
, block_id
, prio
, TC_INGRESS
);
2660 err
= tc_del_filter(&id
);
2668 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
2669 * successful, otherwise a positive errno value. */
2671 netdev_linux_set_policing(struct netdev
*netdev_
,
2672 uint32_t kbits_rate
, uint32_t kbits_burst
)
2674 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2675 const char *netdev_name
= netdev_get_name(netdev_
);
2679 kbits_burst
= (!kbits_rate
? 0 /* Force to 0 if no rate specified. */
2680 : !kbits_burst
? 8000 /* Default to 8000 kbits if 0. */
2681 : kbits_burst
); /* Stick with user-specified value. */
2683 ovs_mutex_lock(&netdev
->mutex
);
2684 if (netdev_linux_netnsid_is_remote(netdev
)) {
2689 if (netdev
->cache_valid
& VALID_POLICING
) {
2690 error
= netdev
->netdev_policing_error
;
2691 if (error
|| (netdev
->kbits_rate
== kbits_rate
&&
2692 netdev
->kbits_burst
== kbits_burst
)) {
2693 /* Assume that settings haven't changed since we last set them. */
2696 netdev
->cache_valid
&= ~VALID_POLICING
;
2699 COVERAGE_INC(netdev_set_policing
);
2701 /* Use matchall for policing when offloadling ovs with tc-flower. */
2702 if (netdev_is_flow_api_enabled()) {
2703 error
= tc_del_matchall_policer(netdev_
);
2705 error
= tc_add_matchall_policer(netdev_
, kbits_rate
, kbits_burst
);
2707 ovs_mutex_unlock(&netdev
->mutex
);
2711 error
= get_ifindex(netdev_
, &ifindex
);
2716 /* Remove any existing ingress qdisc. */
2717 error
= tc_add_del_qdisc(ifindex
, false, 0, TC_INGRESS
);
2719 VLOG_WARN_RL(&rl
, "%s: removing policing failed: %s",
2720 netdev_name
, ovs_strerror(error
));
2725 error
= tc_add_del_qdisc(ifindex
, true, 0, TC_INGRESS
);
2727 VLOG_WARN_RL(&rl
, "%s: adding policing qdisc failed: %s",
2728 netdev_name
, ovs_strerror(error
));
2732 error
= tc_add_policer(netdev_
, kbits_rate
, kbits_burst
);
2734 VLOG_WARN_RL(&rl
, "%s: adding policing action failed: %s",
2735 netdev_name
, ovs_strerror(error
));
2740 netdev
->kbits_rate
= kbits_rate
;
2741 netdev
->kbits_burst
= kbits_burst
;
2744 if (!error
|| error
== ENODEV
) {
2745 netdev
->netdev_policing_error
= error
;
2746 netdev
->cache_valid
|= VALID_POLICING
;
2748 ovs_mutex_unlock(&netdev
->mutex
);
2753 netdev_linux_get_qos_types(const struct netdev
*netdev OVS_UNUSED
,
2756 const struct tc_ops
*const *opsp
;
2757 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2758 const struct tc_ops
*ops
= *opsp
;
2759 if (ops
->tc_install
&& ops
->ovs_name
[0] != '\0') {
2760 sset_add(types
, ops
->ovs_name
);
2766 static const struct tc_ops
*
2767 tc_lookup_ovs_name(const char *name
)
2769 const struct tc_ops
*const *opsp
;
2771 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2772 const struct tc_ops
*ops
= *opsp
;
2773 if (!strcmp(name
, ops
->ovs_name
)) {
2780 static const struct tc_ops
*
2781 tc_lookup_linux_name(const char *name
)
2783 const struct tc_ops
*const *opsp
;
2785 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2786 const struct tc_ops
*ops
= *opsp
;
2787 if (ops
->linux_name
&& !strcmp(name
, ops
->linux_name
)) {
2794 static struct tc_queue
*
2795 tc_find_queue__(const struct netdev
*netdev_
, unsigned int queue_id
,
2798 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2799 struct tc_queue
*queue
;
2801 HMAP_FOR_EACH_IN_BUCKET (queue
, hmap_node
, hash
, &netdev
->tc
->queues
) {
2802 if (queue
->queue_id
== queue_id
) {
/* Convenience wrapper: looks up queue 'queue_id' by hashing the id. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
}
2816 netdev_linux_get_qos_capabilities(const struct netdev
*netdev OVS_UNUSED
,
2818 struct netdev_qos_capabilities
*caps
)
2820 const struct tc_ops
*ops
= tc_lookup_ovs_name(type
);
2824 caps
->n_queues
= ops
->n_queues
;
2829 netdev_linux_get_qos(const struct netdev
*netdev_
,
2830 const char **typep
, struct smap
*details
)
2832 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2835 ovs_mutex_lock(&netdev
->mutex
);
2836 if (netdev_linux_netnsid_is_remote(netdev
)) {
2841 error
= tc_query_qdisc(netdev_
);
2843 *typep
= netdev
->tc
->ops
->ovs_name
;
2844 error
= (netdev
->tc
->ops
->qdisc_get
2845 ? netdev
->tc
->ops
->qdisc_get(netdev_
, details
)
2850 ovs_mutex_unlock(&netdev
->mutex
);
2855 netdev_linux_set_qos(struct netdev
*netdev_
,
2856 const char *type
, const struct smap
*details
)
2858 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2859 const struct tc_ops
*new_ops
;
2862 new_ops
= tc_lookup_ovs_name(type
);
2863 if (!new_ops
|| !new_ops
->tc_install
) {
2867 if (new_ops
== &tc_ops_noop
) {
2868 return new_ops
->tc_install(netdev_
, details
);
2871 ovs_mutex_lock(&netdev
->mutex
);
2872 if (netdev_linux_netnsid_is_remote(netdev
)) {
2877 error
= tc_query_qdisc(netdev_
);
2882 if (new_ops
== netdev
->tc
->ops
) {
2883 error
= new_ops
->qdisc_set
? new_ops
->qdisc_set(netdev_
, details
) : 0;
2885 /* Delete existing qdisc. */
2886 error
= tc_del_qdisc(netdev_
);
2890 ovs_assert(netdev
->tc
== NULL
);
2892 /* Install new qdisc. */
2893 error
= new_ops
->tc_install(netdev_
, details
);
2894 ovs_assert((error
== 0) == (netdev
->tc
!= NULL
));
2898 ovs_mutex_unlock(&netdev
->mutex
);
2903 netdev_linux_get_queue(const struct netdev
*netdev_
,
2904 unsigned int queue_id
, struct smap
*details
)
2906 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2909 ovs_mutex_lock(&netdev
->mutex
);
2910 if (netdev_linux_netnsid_is_remote(netdev
)) {
2915 error
= tc_query_qdisc(netdev_
);
2917 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2919 ? netdev
->tc
->ops
->class_get(netdev_
, queue
, details
)
2924 ovs_mutex_unlock(&netdev
->mutex
);
2929 netdev_linux_set_queue(struct netdev
*netdev_
,
2930 unsigned int queue_id
, const struct smap
*details
)
2932 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2935 ovs_mutex_lock(&netdev
->mutex
);
2936 if (netdev_linux_netnsid_is_remote(netdev
)) {
2941 error
= tc_query_qdisc(netdev_
);
2943 error
= (queue_id
< netdev
->tc
->ops
->n_queues
2944 && netdev
->tc
->ops
->class_set
2945 ? netdev
->tc
->ops
->class_set(netdev_
, queue_id
, details
)
2950 ovs_mutex_unlock(&netdev
->mutex
);
2955 netdev_linux_delete_queue(struct netdev
*netdev_
, unsigned int queue_id
)
2957 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2960 ovs_mutex_lock(&netdev
->mutex
);
2961 if (netdev_linux_netnsid_is_remote(netdev
)) {
2966 error
= tc_query_qdisc(netdev_
);
2968 if (netdev
->tc
->ops
->class_delete
) {
2969 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2971 ? netdev
->tc
->ops
->class_delete(netdev_
, queue
)
2979 ovs_mutex_unlock(&netdev
->mutex
);
2984 netdev_linux_get_queue_stats(const struct netdev
*netdev_
,
2985 unsigned int queue_id
,
2986 struct netdev_queue_stats
*stats
)
2988 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2991 ovs_mutex_lock(&netdev
->mutex
);
2992 if (netdev_linux_netnsid_is_remote(netdev
)) {
2997 error
= tc_query_qdisc(netdev_
);
2999 if (netdev
->tc
->ops
->class_get_stats
) {
3000 const struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
3002 stats
->created
= queue
->created
;
3003 error
= netdev
->tc
->ops
->class_get_stats(netdev_
, queue
,
3014 ovs_mutex_unlock(&netdev
->mutex
);
/* State carried across a netlink class dump: the in-progress dump handle
 * and a reusable receive buffer. */
struct queue_dump_state {
    struct nl_dump dump;   /* Active RTM_GETTCLASS dump. */
    struct ofpbuf buf;     /* Receive buffer for nl_dump_next(). */
};
3024 start_queue_dump(const struct netdev
*netdev
, struct queue_dump_state
*state
)
3026 struct ofpbuf request
;
3027 struct tcmsg
*tcmsg
;
3029 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_GETTCLASS
, 0, &request
);
3033 tcmsg
->tcm_parent
= 0;
3034 nl_dump_start(&state
->dump
, NETLINK_ROUTE
, &request
);
3035 ofpbuf_uninit(&request
);
3037 ofpbuf_init(&state
->buf
, NL_DUMP_BUFSIZE
);
3042 finish_queue_dump(struct queue_dump_state
*state
)
3044 ofpbuf_uninit(&state
->buf
);
3045 return nl_dump_done(&state
->dump
);
/* Iterator state for queue dumps: a snapshot of queue ids taken under the
 * netdev mutex, walked one at a time by queue_dump_next. */
struct netdev_linux_queue_state {
    unsigned int *queues;  /* Snapshot of queue ids (heap-allocated). */
    size_t cur_queue;      /* Index of next id to visit. */
    size_t n_queues;       /* Number of entries in 'queues'. */
};
3055 netdev_linux_queue_dump_start(const struct netdev
*netdev_
, void **statep
)
3057 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3060 ovs_mutex_lock(&netdev
->mutex
);
3061 if (netdev_linux_netnsid_is_remote(netdev
)) {
3066 error
= tc_query_qdisc(netdev_
);
3068 if (netdev
->tc
->ops
->class_get
) {
3069 struct netdev_linux_queue_state
*state
;
3070 struct tc_queue
*queue
;
3073 *statep
= state
= xmalloc(sizeof *state
);
3074 state
->n_queues
= hmap_count(&netdev
->tc
->queues
);
3075 state
->cur_queue
= 0;
3076 state
->queues
= xmalloc(state
->n_queues
* sizeof *state
->queues
);
3079 HMAP_FOR_EACH (queue
, hmap_node
, &netdev
->tc
->queues
) {
3080 state
->queues
[i
++] = queue
->queue_id
;
3088 ovs_mutex_unlock(&netdev
->mutex
);
3093 netdev_linux_queue_dump_next(const struct netdev
*netdev_
, void *state_
,
3094 unsigned int *queue_idp
, struct smap
*details
)
3096 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3097 struct netdev_linux_queue_state
*state
= state_
;
3100 ovs_mutex_lock(&netdev
->mutex
);
3101 if (netdev_linux_netnsid_is_remote(netdev
)) {
3106 while (state
->cur_queue
< state
->n_queues
) {
3107 unsigned int queue_id
= state
->queues
[state
->cur_queue
++];
3108 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
3111 *queue_idp
= queue_id
;
3112 error
= netdev
->tc
->ops
->class_get(netdev_
, queue
, details
);
3118 ovs_mutex_unlock(&netdev
->mutex
);
3123 netdev_linux_queue_dump_done(const struct netdev
*netdev OVS_UNUSED
,
3126 struct netdev_linux_queue_state
*state
= state_
;
3128 free(state
->queues
);
3134 netdev_linux_dump_queue_stats(const struct netdev
*netdev_
,
3135 netdev_dump_queue_stats_cb
*cb
, void *aux
)
3137 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3140 ovs_mutex_lock(&netdev
->mutex
);
3141 if (netdev_linux_netnsid_is_remote(netdev
)) {
3146 error
= tc_query_qdisc(netdev_
);
3148 struct queue_dump_state state
;
3150 if (!netdev
->tc
->ops
->class_dump_stats
) {
3152 } else if (!start_queue_dump(netdev_
, &state
)) {
3158 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
3159 retval
= netdev
->tc
->ops
->class_dump_stats(netdev_
, &msg
,
3166 retval
= finish_queue_dump(&state
);
3174 ovs_mutex_unlock(&netdev
->mutex
);
3179 netdev_linux_set_in4(struct netdev
*netdev_
, struct in_addr address
,
3180 struct in_addr netmask
)
3182 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3185 ovs_mutex_lock(&netdev
->mutex
);
3186 if (netdev_linux_netnsid_is_remote(netdev
)) {
3191 error
= do_set_addr(netdev_
, SIOCSIFADDR
, "SIOCSIFADDR", address
);
3193 if (address
.s_addr
!= INADDR_ANY
) {
3194 error
= do_set_addr(netdev_
, SIOCSIFNETMASK
,
3195 "SIOCSIFNETMASK", netmask
);
3200 ovs_mutex_unlock(&netdev
->mutex
);
3204 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address.
3205 * Otherwise, sets '*in6' to 'in6addr_any' and returns the corresponding
3208 netdev_linux_get_addr_list(const struct netdev
*netdev_
,
3209 struct in6_addr
**addr
, struct in6_addr
**mask
, int *n_cnt
)
3211 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3214 ovs_mutex_lock(&netdev
->mutex
);
3215 if (netdev_linux_netnsid_is_remote(netdev
)) {
3220 error
= netdev_get_addrs(netdev_get_name(netdev_
), addr
, mask
, n_cnt
);
3223 ovs_mutex_unlock(&netdev
->mutex
);
/* Writes 'addr' into '*sa' as an AF_INET sockaddr_in with port 0. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin;

    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr = addr;
    sin.sin_port = 0;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
3241 do_set_addr(struct netdev
*netdev
,
3242 int ioctl_nr
, const char *ioctl_name
, struct in_addr addr
)
3246 make_in4_sockaddr(&ifr
.ifr_addr
, addr
);
3247 return af_inet_ifreq_ioctl(netdev_get_name(netdev
), &ifr
, ioctl_nr
,
3251 /* Adds 'router' as a default IP gateway. */
3253 netdev_linux_add_router(struct netdev
*netdev OVS_UNUSED
, struct in_addr router
)
3255 struct in_addr any
= { INADDR_ANY
};
3259 memset(&rt
, 0, sizeof rt
);
3260 make_in4_sockaddr(&rt
.rt_dst
, any
);
3261 make_in4_sockaddr(&rt
.rt_gateway
, router
);
3262 make_in4_sockaddr(&rt
.rt_genmask
, any
);
3263 rt
.rt_flags
= RTF_UP
| RTF_GATEWAY
;
3264 error
= af_inet_ioctl(SIOCADDRT
, &rt
);
3266 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error
));
3272 netdev_linux_get_next_hop(const struct in_addr
*host
, struct in_addr
*next_hop
,
3275 static const char fn
[] = "/proc/net/route";
3280 *netdev_name
= NULL
;
3281 stream
= fopen(fn
, "r");
3282 if (stream
== NULL
) {
3283 VLOG_WARN_RL(&rl
, "%s: open failed: %s", fn
, ovs_strerror(errno
));
3288 while (fgets(line
, sizeof line
, stream
)) {
3291 ovs_be32 dest
, gateway
, mask
;
3292 int refcnt
, metric
, mtu
;
3293 unsigned int flags
, use
, window
, irtt
;
3296 "%16s %"SCNx32
" %"SCNx32
" %04X %d %u %d %"SCNx32
3298 iface
, &dest
, &gateway
, &flags
, &refcnt
,
3299 &use
, &metric
, &mask
, &mtu
, &window
, &irtt
)) {
3300 VLOG_WARN_RL(&rl
, "%s: could not parse line %d: %s",
3304 if (!(flags
& RTF_UP
)) {
3305 /* Skip routes that aren't up. */
3309 /* The output of 'dest', 'mask', and 'gateway' were given in
3310 * network byte order, so we don't need need any endian
3311 * conversions here. */
3312 if ((dest
& mask
) == (host
->s_addr
& mask
)) {
3314 /* The host is directly reachable. */
3315 next_hop
->s_addr
= 0;
3317 /* To reach the host, we must go through a gateway. */
3318 next_hop
->s_addr
= gateway
;
3320 *netdev_name
= xstrdup(iface
);
3332 netdev_linux_get_status(const struct netdev
*netdev_
, struct smap
*smap
)
3334 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3337 ovs_mutex_lock(&netdev
->mutex
);
3338 if (!(netdev
->cache_valid
& VALID_DRVINFO
)) {
3339 struct ethtool_cmd
*cmd
= (struct ethtool_cmd
*) &netdev
->drvinfo
;
3341 COVERAGE_INC(netdev_get_ethtool
);
3342 memset(&netdev
->drvinfo
, 0, sizeof netdev
->drvinfo
);
3343 error
= netdev_linux_do_ethtool(netdev
->up
.name
,
3346 "ETHTOOL_GDRVINFO");
3348 netdev
->cache_valid
|= VALID_DRVINFO
;
3353 smap_add(smap
, "driver_name", netdev
->drvinfo
.driver
);
3354 smap_add(smap
, "driver_version", netdev
->drvinfo
.version
);
3355 smap_add(smap
, "firmware_version", netdev
->drvinfo
.fw_version
);
3357 ovs_mutex_unlock(&netdev
->mutex
);
3363 netdev_internal_get_status(const struct netdev
*netdev OVS_UNUSED
,
3366 smap_add(smap
, "driver_name", "openvswitch");
3371 netdev_linux_get_block_id(struct netdev
*netdev_
)
3373 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3374 uint32_t block_id
= 0;
3376 ovs_mutex_lock(&netdev
->mutex
);
3377 /* Ensure the linux netdev has had its fields populated. */
3378 if (!(netdev
->cache_valid
& VALID_IFINDEX
)) {
3379 netdev_linux_update_via_netlink(netdev
);
3382 /* Only assigning block ids to linux netdevs that are LAG masters. */
3383 if (netdev
->is_lag_master
) {
3384 block_id
= netdev
->ifindex
;
3386 ovs_mutex_unlock(&netdev
->mutex
);
3391 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
3392 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
3393 * returns 0. Otherwise, it returns a positive errno value; in particular,
3394 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
3396 netdev_linux_arp_lookup(const struct netdev
*netdev
,
3397 ovs_be32 ip
, struct eth_addr
*mac
)
3400 struct sockaddr_in sin
;
3403 memset(&r
, 0, sizeof r
);
3404 memset(&sin
, 0, sizeof sin
);
3405 sin
.sin_family
= AF_INET
;
3406 sin
.sin_addr
.s_addr
= ip
;
3408 memcpy(&r
.arp_pa
, &sin
, sizeof sin
);
3409 r
.arp_ha
.sa_family
= ARPHRD_ETHER
;
3411 ovs_strzcpy(r
.arp_dev
, netdev_get_name(netdev
), sizeof r
.arp_dev
);
3412 COVERAGE_INC(netdev_arp_lookup
);
3413 retval
= af_inet_ioctl(SIOCGARP
, &r
);
3415 memcpy(mac
, r
.arp_ha
.sa_data
, ETH_ADDR_LEN
);
3416 } else if (retval
!= ENXIO
) {
3417 VLOG_WARN_RL(&rl
, "%s: could not look up ARP entry for "IP_FMT
": %s",
3418 netdev_get_name(netdev
), IP_ARGS(ip
),
3419 ovs_strerror(retval
));
3425 nd_to_iff_flags(enum netdev_flags nd
)
3427 unsigned int iff
= 0;
3428 if (nd
& NETDEV_UP
) {
3431 if (nd
& NETDEV_PROMISC
) {
3434 if (nd
& NETDEV_LOOPBACK
) {
3435 iff
|= IFF_LOOPBACK
;
3441 iff_to_nd_flags(unsigned int iff
)
3443 enum netdev_flags nd
= 0;
3447 if (iff
& IFF_PROMISC
) {
3448 nd
|= NETDEV_PROMISC
;
3450 if (iff
& IFF_LOOPBACK
) {
3451 nd
|= NETDEV_LOOPBACK
;
3457 update_flags(struct netdev_linux
*netdev
, enum netdev_flags off
,
3458 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
3459 OVS_REQUIRES(netdev
->mutex
)
3461 unsigned int old_flags
, new_flags
;
3464 old_flags
= netdev
->ifi_flags
;
3465 *old_flagsp
= iff_to_nd_flags(old_flags
);
3466 new_flags
= (old_flags
& ~nd_to_iff_flags(off
)) | nd_to_iff_flags(on
);
3467 if (new_flags
!= old_flags
) {
3468 error
= set_flags(netdev_get_name(&netdev
->up
), new_flags
);
3469 get_flags(&netdev
->up
, &netdev
->ifi_flags
);
3476 netdev_linux_update_flags(struct netdev
*netdev_
, enum netdev_flags off
,
3477 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
3479 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3482 ovs_mutex_lock(&netdev
->mutex
);
3484 /* Changing flags over netlink isn't support yet. */
3485 if (netdev_linux_netnsid_is_remote(netdev
)) {
3489 error
= update_flags(netdev
, off
, on
, old_flagsp
);
3491 /* Try reading flags over netlink, or fall back to ioctl. */
3492 if (!netdev_linux_update_via_netlink(netdev
)) {
3493 *old_flagsp
= iff_to_nd_flags(netdev
->ifi_flags
);
3495 error
= update_flags(netdev
, off
, on
, old_flagsp
);
3500 ovs_mutex_unlock(&netdev
->mutex
);
/* Shared netdev_class member initializers common to every Linux-backed
 * netdev type (system, tap, internal, afxdp).  Each concrete class adds
 * its own construct/destruct/stats/rx hooks on top of these. */
#define NETDEV_LINUX_CLASS_COMMON                               \
    .run = netdev_linux_run,                                    \
    .wait = netdev_linux_wait,                                  \
    .alloc = netdev_linux_alloc,                                \
    .dealloc = netdev_linux_dealloc,                            \
    .send_wait = netdev_linux_send_wait,                        \
    .set_etheraddr = netdev_linux_set_etheraddr,                \
    .get_etheraddr = netdev_linux_get_etheraddr,                \
    .get_mtu = netdev_linux_get_mtu,                            \
    .set_mtu = netdev_linux_set_mtu,                            \
    .get_ifindex = netdev_linux_get_ifindex,                    \
    .get_carrier = netdev_linux_get_carrier,                    \
    .get_carrier_resets = netdev_linux_get_carrier_resets,      \
    .set_miimon_interval = netdev_linux_set_miimon_interval,    \
    .set_advertisements = netdev_linux_set_advertisements,      \
    .set_policing = netdev_linux_set_policing,                  \
    .get_qos_types = netdev_linux_get_qos_types,                \
    .get_qos_capabilities = netdev_linux_get_qos_capabilities,  \
    .get_qos = netdev_linux_get_qos,                            \
    .set_qos = netdev_linux_set_qos,                            \
    .get_queue = netdev_linux_get_queue,                        \
    .set_queue = netdev_linux_set_queue,                        \
    .delete_queue = netdev_linux_delete_queue,                  \
    .get_queue_stats = netdev_linux_get_queue_stats,            \
    .queue_dump_start = netdev_linux_queue_dump_start,          \
    .queue_dump_next = netdev_linux_queue_dump_next,            \
    .queue_dump_done = netdev_linux_queue_dump_done,            \
    .dump_queue_stats = netdev_linux_dump_queue_stats,          \
    .set_in4 = netdev_linux_set_in4,                            \
    .get_addr_list = netdev_linux_get_addr_list,                \
    .add_router = netdev_linux_add_router,                      \
    .get_next_hop = netdev_linux_get_next_hop,                  \
    .arp_lookup = netdev_linux_arp_lookup,                      \
    .update_flags = netdev_linux_update_flags,                  \
    .rxq_alloc = netdev_linux_rxq_alloc,                        \
    .rxq_dealloc = netdev_linux_rxq_dealloc,                    \
    .rxq_wait = netdev_linux_rxq_wait,                          \
    .rxq_drain = netdev_linux_rxq_drain
/* Class for ordinary kernel ("system") network devices. */
const struct netdev_class netdev_linux_class = {
    NETDEV_LINUX_CLASS_COMMON,
    /* NOTE(review): the extracted source dropped two initializer lines here;
     * presumably '.type = "system"' (and possibly '.is_pmd') — confirm
     * against the original file. */
    .type = "system",
    .construct = netdev_linux_construct,
    .destruct = netdev_linux_destruct,
    .get_stats = netdev_linux_get_stats,
    .get_features = netdev_linux_get_features,
    .get_status = netdev_linux_get_status,
    .get_block_id = netdev_linux_get_block_id,
    .send = netdev_linux_send,
    .rxq_construct = netdev_linux_rxq_construct,
    .rxq_destruct = netdev_linux_rxq_destruct,
    .rxq_recv = netdev_linux_rxq_recv,
};
/* Class for userspace tap devices. */
const struct netdev_class netdev_tap_class = {
    NETDEV_LINUX_CLASS_COMMON,
    /* NOTE(review): the extracted source dropped the '.type' initializer
     * line here; presumably '.type = "tap"' — confirm against the original
     * file. */
    .type = "tap",
    .construct = netdev_linux_construct_tap,
    .destruct = netdev_linux_destruct,
    .get_stats = netdev_tap_get_stats,
    .get_features = netdev_linux_get_features,
    .get_status = netdev_linux_get_status,
    .send = netdev_linux_send,
    .rxq_construct = netdev_linux_rxq_construct,
    .rxq_destruct = netdev_linux_rxq_destruct,
    .rxq_recv = netdev_linux_rxq_recv,
};
/* Class for OVS internal ports. */
const struct netdev_class netdev_internal_class = {
    NETDEV_LINUX_CLASS_COMMON,
    /* NOTE(review): the extracted source dropped the '.type' initializer
     * line here; presumably '.type = "internal"' — confirm against the
     * original file. */
    .type = "internal",
    .construct = netdev_linux_construct,
    .destruct = netdev_linux_destruct,
    .get_stats = netdev_internal_get_stats,
    .get_status = netdev_internal_get_status,
    .send = netdev_linux_send,
    .rxq_construct = netdev_linux_rxq_construct,
    .rxq_destruct = netdev_linux_rxq_destruct,
    .rxq_recv = netdev_linux_rxq_recv,
};
/* Member initializers shared by the AF_XDP netdev classes (pmd and
 * non-pmd variants). */
#define NETDEV_AFXDP_CLASS_COMMON                               \
    .init = netdev_afxdp_init,                                  \
    .construct = netdev_afxdp_construct,                        \
    .destruct = netdev_afxdp_destruct,                          \
    .get_stats = netdev_afxdp_get_stats,                        \
    .get_custom_stats = netdev_afxdp_get_custom_stats,          \
    .get_status = netdev_linux_get_status,                      \
    .set_config = netdev_afxdp_set_config,                      \
    .get_config = netdev_afxdp_get_config,                      \
    .reconfigure = netdev_afxdp_reconfigure,                    \
    .get_numa_id = netdev_linux_get_numa_id,                    \
    .send = netdev_afxdp_batch_send,                            \
    .rxq_construct = netdev_afxdp_rxq_construct,                \
    .rxq_destruct = netdev_afxdp_rxq_destruct,                  \
    .rxq_recv = netdev_afxdp_rxq_recv
/* AF_XDP class handled by PMD threads. */
const struct netdev_class netdev_afxdp_class = {
    NETDEV_LINUX_CLASS_COMMON,
    NETDEV_AFXDP_CLASS_COMMON,
    /* NOTE(review): the extracted source dropped the trailing initializer
     * lines; presumably '.type = "afxdp"' and '.is_pmd = true' — confirm
     * against the original file. */
    .type = "afxdp",
    .is_pmd = true,
};
/* AF_XDP class serviced by non-PMD threads. */
const struct netdev_class netdev_afxdp_nonpmd_class = {
    NETDEV_LINUX_CLASS_COMMON,
    NETDEV_AFXDP_CLASS_COMMON,
    .type = "afxdp-nonpmd",
    /* NOTE(review): the extracted source dropped the line(s) following
     * '.type'; presumably '.is_pmd = false' — confirm against the original
     * file. */
    .is_pmd = false,
};
/* CoDel has no per-class queues. */
#define CODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless: the values are identical.) */
#define TCA_CODEL_TARGET   1
#define TCA_CODEL_LIMIT    2
#define TCA_CODEL_INTERVAL 3
3638 static struct codel
*
3639 codel_get__(const struct netdev
*netdev_
)
3641 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3642 return CONTAINER_OF(netdev
->tc
, struct codel
, tc
);
3646 codel_install__(struct netdev
*netdev_
, uint32_t target
, uint32_t limit
,
3649 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3650 struct codel
*codel
;
3652 codel
= xmalloc(sizeof *codel
);
3653 tc_init(&codel
->tc
, &tc_ops_codel
);
3654 codel
->target
= target
;
3655 codel
->limit
= limit
;
3656 codel
->interval
= interval
;
3658 netdev
->tc
= &codel
->tc
;
3662 codel_setup_qdisc__(struct netdev
*netdev
, uint32_t target
, uint32_t limit
,
3666 struct ofpbuf request
;
3667 struct tcmsg
*tcmsg
;
3668 uint32_t otarget
, olimit
, ointerval
;
3671 tc_del_qdisc(netdev
);
3673 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
3674 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3678 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3679 tcmsg
->tcm_parent
= TC_H_ROOT
;
3681 otarget
= target
? target
: 5000;
3682 olimit
= limit
? limit
: 10240;
3683 ointerval
= interval
? interval
: 100000;
3685 nl_msg_put_string(&request
, TCA_KIND
, "codel");
3686 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
3687 nl_msg_put_u32(&request
, TCA_CODEL_TARGET
, otarget
);
3688 nl_msg_put_u32(&request
, TCA_CODEL_LIMIT
, olimit
);
3689 nl_msg_put_u32(&request
, TCA_CODEL_INTERVAL
, ointerval
);
3690 nl_msg_end_nested(&request
, opt_offset
);
3692 error
= tc_transact(&request
, NULL
);
3694 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
3695 "target %u, limit %u, interval %u error %d(%s)",
3696 netdev_get_name(netdev
),
3697 otarget
, olimit
, ointerval
,
3698 error
, ovs_strerror(error
));
3704 codel_parse_qdisc_details__(struct netdev
*netdev OVS_UNUSED
,
3705 const struct smap
*details
, struct codel
*codel
)
3707 codel
->target
= smap_get_ullong(details
, "target", 0);
3708 codel
->limit
= smap_get_ullong(details
, "limit", 0);
3709 codel
->interval
= smap_get_ullong(details
, "interval", 0);
3711 if (!codel
->target
) {
3712 codel
->target
= 5000;
3714 if (!codel
->limit
) {
3715 codel
->limit
= 10240;
3717 if (!codel
->interval
) {
3718 codel
->interval
= 100000;
3723 codel_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3728 codel_parse_qdisc_details__(netdev
, details
, &codel
);
3729 error
= codel_setup_qdisc__(netdev
, codel
.target
, codel
.limit
,
3732 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
3738 codel_parse_tca_options__(struct nlattr
*nl_options
, struct codel
*codel
)
3740 static const struct nl_policy tca_codel_policy
[] = {
3741 [TCA_CODEL_TARGET
] = { .type
= NL_A_U32
},
3742 [TCA_CODEL_LIMIT
] = { .type
= NL_A_U32
},
3743 [TCA_CODEL_INTERVAL
] = { .type
= NL_A_U32
}
3746 struct nlattr
*attrs
[ARRAY_SIZE(tca_codel_policy
)];
3748 if (!nl_parse_nested(nl_options
, tca_codel_policy
,
3749 attrs
, ARRAY_SIZE(tca_codel_policy
))) {
3750 VLOG_WARN_RL(&rl
, "failed to parse CoDel class options");
3754 codel
->target
= nl_attr_get_u32(attrs
[TCA_CODEL_TARGET
]);
3755 codel
->limit
= nl_attr_get_u32(attrs
[TCA_CODEL_LIMIT
]);
3756 codel
->interval
= nl_attr_get_u32(attrs
[TCA_CODEL_INTERVAL
]);
3761 codel_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
3763 struct nlattr
*nlattr
;
3768 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
3773 error
= codel_parse_tca_options__(nlattr
, &codel
);
3778 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
3784 codel_tc_destroy(struct tc
*tc
)
3786 struct codel
*codel
= CONTAINER_OF(tc
, struct codel
, tc
);
3792 codel_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
3794 const struct codel
*codel
= codel_get__(netdev
);
3795 smap_add_format(details
, "target", "%u", codel
->target
);
3796 smap_add_format(details
, "limit", "%u", codel
->limit
);
3797 smap_add_format(details
, "interval", "%u", codel
->interval
);
3802 codel_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
3806 codel_parse_qdisc_details__(netdev
, details
, &codel
);
3807 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
3808 codel_get__(netdev
)->target
= codel
.target
;
3809 codel_get__(netdev
)->limit
= codel
.limit
;
3810 codel_get__(netdev
)->interval
= codel
.interval
;
3814 static const struct tc_ops tc_ops_codel
= {
3815 .linux_name
= "codel",
3816 .ovs_name
= "linux-codel",
3817 .n_queues
= CODEL_N_QUEUES
,
3818 .tc_install
= codel_tc_install
,
3819 .tc_load
= codel_tc_load
,
3820 .tc_destroy
= codel_tc_destroy
,
3821 .qdisc_get
= codel_qdisc_get
,
3822 .qdisc_set
= codel_qdisc_set
,
3825 /* FQ-CoDel traffic control class. */
3827 #define FQCODEL_N_QUEUES 0x0000
3829 /* In sufficiently new kernel headers these are defined as enums in
3830 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3831 * kernels. (This overrides any enum definition in the header file but that's
3833 #define TCA_FQ_CODEL_TARGET 1
3834 #define TCA_FQ_CODEL_LIMIT 2
3835 #define TCA_FQ_CODEL_INTERVAL 3
3836 #define TCA_FQ_CODEL_ECN 4
3837 #define TCA_FQ_CODEL_FLOWS 5
3838 #define TCA_FQ_CODEL_QUANTUM 6
3849 static struct fqcodel
*
3850 fqcodel_get__(const struct netdev
*netdev_
)
3852 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3853 return CONTAINER_OF(netdev
->tc
, struct fqcodel
, tc
);
3857 fqcodel_install__(struct netdev
*netdev_
, uint32_t target
, uint32_t limit
,
3858 uint32_t interval
, uint32_t flows
, uint32_t quantum
)
3860 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3861 struct fqcodel
*fqcodel
;
3863 fqcodel
= xmalloc(sizeof *fqcodel
);
3864 tc_init(&fqcodel
->tc
, &tc_ops_fqcodel
);
3865 fqcodel
->target
= target
;
3866 fqcodel
->limit
= limit
;
3867 fqcodel
->interval
= interval
;
3868 fqcodel
->flows
= flows
;
3869 fqcodel
->quantum
= quantum
;
3871 netdev
->tc
= &fqcodel
->tc
;
3875 fqcodel_setup_qdisc__(struct netdev
*netdev
, uint32_t target
, uint32_t limit
,
3876 uint32_t interval
, uint32_t flows
, uint32_t quantum
)
3879 struct ofpbuf request
;
3880 struct tcmsg
*tcmsg
;
3881 uint32_t otarget
, olimit
, ointerval
, oflows
, oquantum
;
3884 tc_del_qdisc(netdev
);
3886 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
3887 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3891 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3892 tcmsg
->tcm_parent
= TC_H_ROOT
;
3894 otarget
= target
? target
: 5000;
3895 olimit
= limit
? limit
: 10240;
3896 ointerval
= interval
? interval
: 100000;
3897 oflows
= flows
? flows
: 1024;
3898 oquantum
= quantum
? quantum
: 1514; /* fq_codel default quantum is 1514
3901 nl_msg_put_string(&request
, TCA_KIND
, "fq_codel");
3902 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
3903 nl_msg_put_u32(&request
, TCA_FQ_CODEL_TARGET
, otarget
);
3904 nl_msg_put_u32(&request
, TCA_FQ_CODEL_LIMIT
, olimit
);
3905 nl_msg_put_u32(&request
, TCA_FQ_CODEL_INTERVAL
, ointerval
);
3906 nl_msg_put_u32(&request
, TCA_FQ_CODEL_FLOWS
, oflows
);
3907 nl_msg_put_u32(&request
, TCA_FQ_CODEL_QUANTUM
, oquantum
);
3908 nl_msg_end_nested(&request
, opt_offset
);
3910 error
= tc_transact(&request
, NULL
);
3912 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
3913 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3914 netdev_get_name(netdev
),
3915 otarget
, olimit
, ointerval
, oflows
, oquantum
,
3916 error
, ovs_strerror(error
));
3922 fqcodel_parse_qdisc_details__(struct netdev
*netdev OVS_UNUSED
,
3923 const struct smap
*details
, struct fqcodel
*fqcodel
)
3925 fqcodel
->target
= smap_get_ullong(details
, "target", 0);
3926 fqcodel
->limit
= smap_get_ullong(details
, "limit", 0);
3927 fqcodel
->interval
= smap_get_ullong(details
, "interval", 0);
3928 fqcodel
->flows
= smap_get_ullong(details
, "flows", 0);
3929 fqcodel
->quantum
= smap_get_ullong(details
, "quantum", 0);
3931 if (!fqcodel
->target
) {
3932 fqcodel
->target
= 5000;
3934 if (!fqcodel
->limit
) {
3935 fqcodel
->limit
= 10240;
3937 if (!fqcodel
->interval
) {
3938 fqcodel
->interval
= 1000000;
3940 if (!fqcodel
->flows
) {
3941 fqcodel
->flows
= 1024;
3943 if (!fqcodel
->quantum
) {
3944 fqcodel
->quantum
= 1514;
3949 fqcodel_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3952 struct fqcodel fqcodel
;
3954 fqcodel_parse_qdisc_details__(netdev
, details
, &fqcodel
);
3955 error
= fqcodel_setup_qdisc__(netdev
, fqcodel
.target
, fqcodel
.limit
,
3956 fqcodel
.interval
, fqcodel
.flows
,
3959 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
,
3960 fqcodel
.interval
, fqcodel
.flows
, fqcodel
.quantum
);
3966 fqcodel_parse_tca_options__(struct nlattr
*nl_options
, struct fqcodel
*fqcodel
)
3968 static const struct nl_policy tca_fqcodel_policy
[] = {
3969 [TCA_FQ_CODEL_TARGET
] = { .type
= NL_A_U32
},
3970 [TCA_FQ_CODEL_LIMIT
] = { .type
= NL_A_U32
},
3971 [TCA_FQ_CODEL_INTERVAL
] = { .type
= NL_A_U32
},
3972 [TCA_FQ_CODEL_FLOWS
] = { .type
= NL_A_U32
},
3973 [TCA_FQ_CODEL_QUANTUM
] = { .type
= NL_A_U32
}
3976 struct nlattr
*attrs
[ARRAY_SIZE(tca_fqcodel_policy
)];
3978 if (!nl_parse_nested(nl_options
, tca_fqcodel_policy
,
3979 attrs
, ARRAY_SIZE(tca_fqcodel_policy
))) {
3980 VLOG_WARN_RL(&rl
, "failed to parse FQ_CoDel class options");
3984 fqcodel
->target
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_TARGET
]);
3985 fqcodel
->limit
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_LIMIT
]);
3986 fqcodel
->interval
=nl_attr_get_u32(attrs
[TCA_FQ_CODEL_INTERVAL
]);
3987 fqcodel
->flows
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_FLOWS
]);
3988 fqcodel
->quantum
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_QUANTUM
]);
3993 fqcodel_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
3995 struct nlattr
*nlattr
;
3998 struct fqcodel fqcodel
;
4000 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
4005 error
= fqcodel_parse_tca_options__(nlattr
, &fqcodel
);
4010 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
, fqcodel
.interval
,
4011 fqcodel
.flows
, fqcodel
.quantum
);
4016 fqcodel_tc_destroy(struct tc
*tc
)
4018 struct fqcodel
*fqcodel
= CONTAINER_OF(tc
, struct fqcodel
, tc
);
4024 fqcodel_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
4026 const struct fqcodel
*fqcodel
= fqcodel_get__(netdev
);
4027 smap_add_format(details
, "target", "%u", fqcodel
->target
);
4028 smap_add_format(details
, "limit", "%u", fqcodel
->limit
);
4029 smap_add_format(details
, "interval", "%u", fqcodel
->interval
);
4030 smap_add_format(details
, "flows", "%u", fqcodel
->flows
);
4031 smap_add_format(details
, "quantum", "%u", fqcodel
->quantum
);
4036 fqcodel_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
4038 struct fqcodel fqcodel
;
4040 fqcodel_parse_qdisc_details__(netdev
, details
, &fqcodel
);
4041 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
, fqcodel
.interval
,
4042 fqcodel
.flows
, fqcodel
.quantum
);
4043 fqcodel_get__(netdev
)->target
= fqcodel
.target
;
4044 fqcodel_get__(netdev
)->limit
= fqcodel
.limit
;
4045 fqcodel_get__(netdev
)->interval
= fqcodel
.interval
;
4046 fqcodel_get__(netdev
)->flows
= fqcodel
.flows
;
4047 fqcodel_get__(netdev
)->quantum
= fqcodel
.quantum
;
4051 static const struct tc_ops tc_ops_fqcodel
= {
4052 .linux_name
= "fq_codel",
4053 .ovs_name
= "linux-fq_codel",
4054 .n_queues
= FQCODEL_N_QUEUES
,
4055 .tc_install
= fqcodel_tc_install
,
4056 .tc_load
= fqcodel_tc_load
,
4057 .tc_destroy
= fqcodel_tc_destroy
,
4058 .qdisc_get
= fqcodel_qdisc_get
,
4059 .qdisc_set
= fqcodel_qdisc_set
,
4062 /* SFQ traffic control class. */
4064 #define SFQ_N_QUEUES 0x0000
4073 sfq_get__(const struct netdev
*netdev_
)
4075 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4076 return CONTAINER_OF(netdev
->tc
, struct sfq
, tc
);
4080 sfq_install__(struct netdev
*netdev_
, uint32_t quantum
, uint32_t perturb
)
4082 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4085 sfq
= xmalloc(sizeof *sfq
);
4086 tc_init(&sfq
->tc
, &tc_ops_sfq
);
4087 sfq
->perturb
= perturb
;
4088 sfq
->quantum
= quantum
;
4090 netdev
->tc
= &sfq
->tc
;
4094 sfq_setup_qdisc__(struct netdev
*netdev
, uint32_t quantum
, uint32_t perturb
)
4096 struct tc_sfq_qopt opt
;
4097 struct ofpbuf request
;
4098 struct tcmsg
*tcmsg
;
4100 int mtu_error
, error
;
4101 mtu_error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
4103 tc_del_qdisc(netdev
);
4105 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
4106 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
4110 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
4111 tcmsg
->tcm_parent
= TC_H_ROOT
;
4113 memset(&opt
, 0, sizeof opt
);
4116 opt
.quantum
= mtu
; /* if we cannot find mtu, use default */
4119 opt
.quantum
= quantum
;
4123 opt
.perturb_period
= 10;
4125 opt
.perturb_period
= perturb
;
4128 nl_msg_put_string(&request
, TCA_KIND
, "sfq");
4129 nl_msg_put_unspec(&request
, TCA_OPTIONS
, &opt
, sizeof opt
);
4131 error
= tc_transact(&request
, NULL
);
4133 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
4134 "quantum %u, perturb %u error %d(%s)",
4135 netdev_get_name(netdev
),
4136 opt
.quantum
, opt
.perturb_period
,
4137 error
, ovs_strerror(error
));
4143 sfq_parse_qdisc_details__(struct netdev
*netdev
,
4144 const struct smap
*details
, struct sfq
*sfq
)
4146 sfq
->perturb
= smap_get_ullong(details
, "perturb", 0);
4147 sfq
->quantum
= smap_get_ullong(details
, "quantum", 0);
4149 if (!sfq
->perturb
) {
4153 if (!sfq
->quantum
) {
4155 if (!netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
)) {
4158 VLOG_WARN_RL(&rl
, "when using SFQ, you must specify quantum on a "
4159 "device without mtu");
4165 sfq_tc_install(struct netdev
*netdev
, const struct smap
*details
)
4170 sfq_parse_qdisc_details__(netdev
, details
, &sfq
);
4171 error
= sfq_setup_qdisc__(netdev
, sfq
.quantum
, sfq
.perturb
);
4173 sfq_install__(netdev
, sfq
.quantum
, sfq
.perturb
);
4179 sfq_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
4181 const struct tc_sfq_qopt
*sfq
;
4182 struct nlattr
*nlattr
;
4186 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
4188 sfq
= nl_attr_get(nlattr
);
4189 sfq_install__(netdev
, sfq
->quantum
, sfq
->perturb_period
);
4197 sfq_tc_destroy(struct tc
*tc
)
4199 struct sfq
*sfq
= CONTAINER_OF(tc
, struct sfq
, tc
);
4205 sfq_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
4207 const struct sfq
*sfq
= sfq_get__(netdev
);
4208 smap_add_format(details
, "quantum", "%u", sfq
->quantum
);
4209 smap_add_format(details
, "perturb", "%u", sfq
->perturb
);
4214 sfq_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
4218 sfq_parse_qdisc_details__(netdev
, details
, &sfq
);
4219 sfq_install__(netdev
, sfq
.quantum
, sfq
.perturb
);
4220 sfq_get__(netdev
)->quantum
= sfq
.quantum
;
4221 sfq_get__(netdev
)->perturb
= sfq
.perturb
;
4225 static const struct tc_ops tc_ops_sfq
= {
4226 .linux_name
= "sfq",
4227 .ovs_name
= "linux-sfq",
4228 .n_queues
= SFQ_N_QUEUES
,
4229 .tc_install
= sfq_tc_install
,
4230 .tc_load
= sfq_tc_load
,
4231 .tc_destroy
= sfq_tc_destroy
,
4232 .qdisc_get
= sfq_qdisc_get
,
4233 .qdisc_set
= sfq_qdisc_set
,
4236 /* netem traffic control class. */
4245 static struct netem
*
4246 netem_get__(const struct netdev
*netdev_
)
4248 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4249 return CONTAINER_OF(netdev
->tc
, struct netem
, tc
);
4253 netem_install__(struct netdev
*netdev_
, uint32_t latency
,
4254 uint32_t limit
, uint32_t loss
)
4256 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4257 struct netem
*netem
;
4259 netem
= xmalloc(sizeof *netem
);
4260 tc_init(&netem
->tc
, &tc_ops_netem
);
4261 netem
->latency
= latency
;
4262 netem
->limit
= limit
;
4265 netdev
->tc
= &netem
->tc
;
4269 netem_setup_qdisc__(struct netdev
*netdev
, uint32_t latency
,
4270 uint32_t limit
, uint32_t loss
)
4272 struct tc_netem_qopt opt
;
4273 struct ofpbuf request
;
4274 struct tcmsg
*tcmsg
;
4277 tc_del_qdisc(netdev
);
4279 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
4280 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
4284 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
4285 tcmsg
->tcm_parent
= TC_H_ROOT
;
4287 memset(&opt
, 0, sizeof opt
);
4298 "loss should be a percentage value between 0 to 100, "
4299 "loss was %u", loss
);
4302 opt
.loss
= floor(UINT32_MAX
* (loss
/ 100.0));
4305 opt
.latency
= tc_time_to_ticks(latency
);
4307 nl_msg_put_string(&request
, TCA_KIND
, "netem");
4308 nl_msg_put_unspec(&request
, TCA_OPTIONS
, &opt
, sizeof opt
);
4310 error
= tc_transact(&request
, NULL
);
4312 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
4313 "latency %u, limit %u, loss %u error %d(%s)",
4314 netdev_get_name(netdev
),
4315 opt
.latency
, opt
.limit
, opt
.loss
,
4316 error
, ovs_strerror(error
));
4322 netem_parse_qdisc_details__(struct netdev
*netdev OVS_UNUSED
,
4323 const struct smap
*details
, struct netem
*netem
)
4325 netem
->latency
= smap_get_ullong(details
, "latency", 0);
4326 netem
->limit
= smap_get_ullong(details
, "limit", 0);
4327 netem
->loss
= smap_get_ullong(details
, "loss", 0);
4329 if (!netem
->limit
) {
4330 netem
->limit
= 1000;
4335 netem_tc_install(struct netdev
*netdev
, const struct smap
*details
)
4340 netem_parse_qdisc_details__(netdev
, details
, &netem
);
4341 error
= netem_setup_qdisc__(netdev
, netem
.latency
,
4342 netem
.limit
, netem
.loss
);
4344 netem_install__(netdev
, netem
.latency
, netem
.limit
, netem
.loss
);
4350 netem_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
4352 const struct tc_netem_qopt
*netem
;
4353 struct nlattr
*nlattr
;
4357 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
4359 netem
= nl_attr_get(nlattr
);
4360 netem_install__(netdev
, netem
->latency
, netem
->limit
, netem
->loss
);
4368 netem_tc_destroy(struct tc
*tc
)
4370 struct netem
*netem
= CONTAINER_OF(tc
, struct netem
, tc
);
4376 netem_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
4378 const struct netem
*netem
= netem_get__(netdev
);
4379 smap_add_format(details
, "latency", "%u", netem
->latency
);
4380 smap_add_format(details
, "limit", "%u", netem
->limit
);
4381 smap_add_format(details
, "loss", "%u", netem
->loss
);
4386 netem_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
4390 netem_parse_qdisc_details__(netdev
, details
, &netem
);
4391 netem_install__(netdev
, netem
.latency
, netem
.limit
, netem
.loss
);
4392 netem_get__(netdev
)->latency
= netem
.latency
;
4393 netem_get__(netdev
)->limit
= netem
.limit
;
4394 netem_get__(netdev
)->loss
= netem
.loss
;
4398 static const struct tc_ops tc_ops_netem
= {
4399 .linux_name
= "netem",
4400 .ovs_name
= "linux-netem",
4402 .tc_install
= netem_tc_install
,
4403 .tc_load
= netem_tc_load
,
4404 .tc_destroy
= netem_tc_destroy
,
4405 .qdisc_get
= netem_qdisc_get
,
4406 .qdisc_set
= netem_qdisc_set
,
4409 /* HTB traffic control class. */
4411 #define HTB_N_QUEUES 0xf000
4412 #define HTB_RATE2QUANTUM 10
4416 unsigned int max_rate
; /* In bytes/s. */
4420 struct tc_queue tc_queue
;
4421 unsigned int min_rate
; /* In bytes/s. */
4422 unsigned int max_rate
; /* In bytes/s. */
4423 unsigned int burst
; /* In bytes. */
4424 unsigned int priority
; /* Lower values are higher priorities. */
4428 htb_get__(const struct netdev
*netdev_
)
4430 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4431 return CONTAINER_OF(netdev
->tc
, struct htb
, tc
);
4435 htb_install__(struct netdev
*netdev_
, uint64_t max_rate
)
4437 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4440 htb
= xmalloc(sizeof *htb
);
4441 tc_init(&htb
->tc
, &tc_ops_htb
);
4442 htb
->max_rate
= max_rate
;
4444 netdev
->tc
= &htb
->tc
;
4447 /* Create an HTB qdisc.
4449 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
4451 htb_setup_qdisc__(struct netdev
*netdev
)
4454 struct tc_htb_glob opt
;
4455 struct ofpbuf request
;
4456 struct tcmsg
*tcmsg
;
4458 tc_del_qdisc(netdev
);
4460 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
4461 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
4465 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
4466 tcmsg
->tcm_parent
= TC_H_ROOT
;
4468 nl_msg_put_string(&request
, TCA_KIND
, "htb");
4470 memset(&opt
, 0, sizeof opt
);
4471 opt
.rate2quantum
= HTB_RATE2QUANTUM
;
4475 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
4476 nl_msg_put_unspec(&request
, TCA_HTB_INIT
, &opt
, sizeof opt
);
4477 nl_msg_end_nested(&request
, opt_offset
);
4479 return tc_transact(&request
, NULL
);
4482 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
4483 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
4485 htb_setup_class__(struct netdev
*netdev
, unsigned int handle
,
4486 unsigned int parent
, struct htb_class
*class)
4489 struct tc_htb_opt opt
;
4490 struct ofpbuf request
;
4491 struct tcmsg
*tcmsg
;
4495 error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
4497 VLOG_WARN_RL(&rl
, "cannot set up HTB on device %s that lacks MTU",
4498 netdev_get_name(netdev
));
4502 memset(&opt
, 0, sizeof opt
);
4503 tc_fill_rate(&opt
.rate
, class->min_rate
, mtu
);
4504 tc_fill_rate(&opt
.ceil
, class->max_rate
, mtu
);
4505 /* Makes sure the quantum is at least MTU. Setting quantum will
4506 * make htb ignore the r2q for this class. */
4507 if ((class->min_rate
/ HTB_RATE2QUANTUM
) < mtu
) {
4510 opt
.buffer
= tc_calc_buffer(opt
.rate
.rate
, mtu
, class->burst
);
4511 opt
.cbuffer
= tc_calc_buffer(opt
.ceil
.rate
, mtu
, class->burst
);
4512 opt
.prio
= class->priority
;
4514 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWTCLASS
, NLM_F_CREATE
,
4519 tcmsg
->tcm_handle
= handle
;
4520 tcmsg
->tcm_parent
= parent
;
4522 nl_msg_put_string(&request
, TCA_KIND
, "htb");
4523 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
4524 nl_msg_put_unspec(&request
, TCA_HTB_PARMS
, &opt
, sizeof opt
);
4525 tc_put_rtab(&request
, TCA_HTB_RTAB
, &opt
.rate
);
4526 tc_put_rtab(&request
, TCA_HTB_CTAB
, &opt
.ceil
);
4527 nl_msg_end_nested(&request
, opt_offset
);
4529 error
= tc_transact(&request
, NULL
);
4531 VLOG_WARN_RL(&rl
, "failed to replace %s class %u:%u, parent %u:%u, "
4532 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
4533 netdev_get_name(netdev
),
4534 tc_get_major(handle
), tc_get_minor(handle
),
4535 tc_get_major(parent
), tc_get_minor(parent
),
4536 class->min_rate
, class->max_rate
,
4537 class->burst
, class->priority
, ovs_strerror(error
));
4542 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
4543 * description of them into 'details'. The description complies with the
4544 * specification given in the vswitch database documentation for linux-htb
4547 htb_parse_tca_options__(struct nlattr
*nl_options
, struct htb_class
*class)
4549 static const struct nl_policy tca_htb_policy
[] = {
4550 [TCA_HTB_PARMS
] = { .type
= NL_A_UNSPEC
, .optional
= false,
4551 .min_len
= sizeof(struct tc_htb_opt
) },
4554 struct nlattr
*attrs
[ARRAY_SIZE(tca_htb_policy
)];
4555 const struct tc_htb_opt
*htb
;
4557 if (!nl_parse_nested(nl_options
, tca_htb_policy
,
4558 attrs
, ARRAY_SIZE(tca_htb_policy
))) {
4559 VLOG_WARN_RL(&rl
, "failed to parse HTB class options");
4563 htb
= nl_attr_get(attrs
[TCA_HTB_PARMS
]);
4564 class->min_rate
= htb
->rate
.rate
;
4565 class->max_rate
= htb
->ceil
.rate
;
4566 class->burst
= tc_ticks_to_bytes(htb
->rate
.rate
, htb
->buffer
);
4567 class->priority
= htb
->prio
;
4572 htb_parse_tcmsg__(struct ofpbuf
*tcmsg
, unsigned int *queue_id
,
4573 struct htb_class
*options
,
4574 struct netdev_queue_stats
*stats
)
4576 struct nlattr
*nl_options
;
4577 unsigned int handle
;
4580 error
= tc_parse_class(tcmsg
, &handle
, &nl_options
, stats
);
4581 if (!error
&& queue_id
) {
4582 unsigned int major
= tc_get_major(handle
);
4583 unsigned int minor
= tc_get_minor(handle
);
4584 if (major
== 1 && minor
> 0 && minor
<= HTB_N_QUEUES
) {
4585 *queue_id
= minor
- 1;
4590 if (!error
&& options
) {
4591 error
= htb_parse_tca_options__(nl_options
, options
);
4597 htb_parse_qdisc_details__(struct netdev
*netdev_
,
4598 const struct smap
*details
, struct htb_class
*hc
)
4600 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4602 hc
->max_rate
= smap_get_ullong(details
, "max-rate", 0) / 8;
4603 if (!hc
->max_rate
) {
4604 enum netdev_features current
;
4606 netdev_linux_read_features(netdev
);
4607 current
= !netdev
->get_features_error
? netdev
->current
: 0;
4608 hc
->max_rate
= netdev_features_to_bps(current
, 100 * 1000 * 1000) / 8;
4610 hc
->min_rate
= hc
->max_rate
;
4616 htb_parse_class_details__(struct netdev
*netdev
,
4617 const struct smap
*details
, struct htb_class
*hc
)
4619 const struct htb
*htb
= htb_get__(netdev
);
4621 unsigned long long int max_rate_bit
;
4623 error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
4625 VLOG_WARN_RL(&rl
, "cannot parse HTB class on device %s that lacks MTU",
4626 netdev_get_name(netdev
));
4630 /* HTB requires at least an mtu sized min-rate to send any traffic even
4631 * on uncongested links. */
4632 hc
->min_rate
= smap_get_ullong(details
, "min-rate", 0) / 8;
4633 hc
->min_rate
= MAX(hc
->min_rate
, mtu
);
4634 hc
->min_rate
= MIN(hc
->min_rate
, htb
->max_rate
);
4637 max_rate_bit
= smap_get_ullong(details
, "max-rate", 0);
4638 hc
->max_rate
= max_rate_bit
? max_rate_bit
/ 8 : htb
->max_rate
;
4639 hc
->max_rate
= MAX(hc
->max_rate
, hc
->min_rate
);
4640 hc
->max_rate
= MIN(hc
->max_rate
, htb
->max_rate
);
4644 * According to hints in the documentation that I've read, it is important
4645 * that 'burst' be at least as big as the largest frame that might be
4646 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
4647 * but having it a bit too small is a problem. Since netdev_get_mtu()
4648 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
4649 * the MTU. We actually add 64, instead of 14, as a guard against
4650 * additional headers get tacked on somewhere that we're not aware of. */
4651 hc
->burst
= smap_get_ullong(details
, "burst", 0) / 8;
4652 hc
->burst
= MAX(hc
->burst
, mtu
+ 64);
4655 hc
->priority
= smap_get_ullong(details
, "priority", 0);
4661 htb_query_class__(const struct netdev
*netdev
, unsigned int handle
,
4662 unsigned int parent
, struct htb_class
*options
,
4663 struct netdev_queue_stats
*stats
)
4665 struct ofpbuf
*reply
;
4668 error
= tc_query_class(netdev
, handle
, parent
, &reply
);
4670 error
= htb_parse_tcmsg__(reply
, NULL
, options
, stats
);
4671 ofpbuf_delete(reply
);
4677 htb_tc_install(struct netdev
*netdev
, const struct smap
*details
)
4681 error
= htb_setup_qdisc__(netdev
);
4683 struct htb_class hc
;
4685 htb_parse_qdisc_details__(netdev
, details
, &hc
);
4686 error
= htb_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
4687 tc_make_handle(1, 0), &hc
);
4689 htb_install__(netdev
, hc
.max_rate
);
4695 static struct htb_class
*
4696 htb_class_cast__(const struct tc_queue
*queue
)
4698 return CONTAINER_OF(queue
, struct htb_class
, tc_queue
);
4702 htb_update_queue__(struct netdev
*netdev
, unsigned int queue_id
,
4703 const struct htb_class
*hc
)
4705 struct htb
*htb
= htb_get__(netdev
);
4706 size_t hash
= hash_int(queue_id
, 0);
4707 struct tc_queue
*queue
;
4708 struct htb_class
*hcp
;
4710 queue
= tc_find_queue__(netdev
, queue_id
, hash
);
4712 hcp
= htb_class_cast__(queue
);
4714 hcp
= xmalloc(sizeof *hcp
);
4715 queue
= &hcp
->tc_queue
;
4716 queue
->queue_id
= queue_id
;
4717 queue
->created
= time_msec();
4718 hmap_insert(&htb
->tc
.queues
, &queue
->hmap_node
, hash
);
4721 hcp
->min_rate
= hc
->min_rate
;
4722 hcp
->max_rate
= hc
->max_rate
;
4723 hcp
->burst
= hc
->burst
;
4724 hcp
->priority
= hc
->priority
;
4728 htb_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4731 struct queue_dump_state state
;
4732 struct htb_class hc
;
4734 /* Get qdisc options. */
4736 htb_query_class__(netdev
, tc_make_handle(1, 0xfffe), 0, &hc
, NULL
);
4737 htb_install__(netdev
, hc
.max_rate
);
4740 if (!start_queue_dump(netdev
, &state
)) {
4743 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
4744 unsigned int queue_id
;
4746 if (!htb_parse_tcmsg__(&msg
, &queue_id
, &hc
, NULL
)) {
4747 htb_update_queue__(netdev
, queue_id
, &hc
);
4750 finish_queue_dump(&state
);
4756 htb_tc_destroy(struct tc
*tc
)
4758 struct htb
*htb
= CONTAINER_OF(tc
, struct htb
, tc
);
4759 struct htb_class
*hc
;
4761 HMAP_FOR_EACH_POP (hc
, tc_queue
.hmap_node
, &htb
->tc
.queues
) {
4769 htb_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
4771 const struct htb
*htb
= htb_get__(netdev
);
4772 smap_add_format(details
, "max-rate", "%llu", 8ULL * htb
->max_rate
);
4777 htb_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
4779 struct htb_class hc
;
4782 htb_parse_qdisc_details__(netdev
, details
, &hc
);
4783 error
= htb_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
4784 tc_make_handle(1, 0), &hc
);
4786 htb_get__(netdev
)->max_rate
= hc
.max_rate
;
4792 htb_class_get(const struct netdev
*netdev OVS_UNUSED
,
4793 const struct tc_queue
*queue
, struct smap
*details
)
4795 const struct htb_class
*hc
= htb_class_cast__(queue
);
4797 smap_add_format(details
, "min-rate", "%llu", 8ULL * hc
->min_rate
);
4798 if (hc
->min_rate
!= hc
->max_rate
) {
4799 smap_add_format(details
, "max-rate", "%llu", 8ULL * hc
->max_rate
);
4801 smap_add_format(details
, "burst", "%llu", 8ULL * hc
->burst
);
4803 smap_add_format(details
, "priority", "%u", hc
->priority
);
4809 htb_class_set(struct netdev
*netdev
, unsigned int queue_id
,
4810 const struct smap
*details
)
4812 struct htb_class hc
;
4815 error
= htb_parse_class_details__(netdev
, details
, &hc
);
4820 error
= htb_setup_class__(netdev
, tc_make_handle(1, queue_id
+ 1),
4821 tc_make_handle(1, 0xfffe), &hc
);
4826 htb_update_queue__(netdev
, queue_id
, &hc
);
4831 htb_class_delete(struct netdev
*netdev
, struct tc_queue
*queue
)
4833 struct htb_class
*hc
= htb_class_cast__(queue
);
4834 struct htb
*htb
= htb_get__(netdev
);
4837 error
= tc_delete_class(netdev
, tc_make_handle(1, queue
->queue_id
+ 1));
4839 hmap_remove(&htb
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
4846 htb_class_get_stats(const struct netdev
*netdev
, const struct tc_queue
*queue
,
4847 struct netdev_queue_stats
*stats
)
4849 return htb_query_class__(netdev
, tc_make_handle(1, queue
->queue_id
+ 1),
4850 tc_make_handle(1, 0xfffe), NULL
, stats
);
4854 htb_class_dump_stats(const struct netdev
*netdev OVS_UNUSED
,
4855 const struct ofpbuf
*nlmsg
,
4856 netdev_dump_queue_stats_cb
*cb
, void *aux
)
4858 struct netdev_queue_stats stats
;
4859 unsigned int handle
, major
, minor
;
4862 error
= tc_parse_class(nlmsg
, &handle
, NULL
, &stats
);
4867 major
= tc_get_major(handle
);
4868 minor
= tc_get_minor(handle
);
4869 if (major
== 1 && minor
> 0 && minor
<= HTB_N_QUEUES
) {
4870 (*cb
)(minor
- 1, &stats
, aux
);
4875 static const struct tc_ops tc_ops_htb
= {
4876 .linux_name
= "htb",
4877 .ovs_name
= "linux-htb",
4878 .n_queues
= HTB_N_QUEUES
,
4879 .tc_install
= htb_tc_install
,
4880 .tc_load
= htb_tc_load
,
4881 .tc_destroy
= htb_tc_destroy
,
4882 .qdisc_get
= htb_qdisc_get
,
4883 .qdisc_set
= htb_qdisc_set
,
4884 .class_get
= htb_class_get
,
4885 .class_set
= htb_class_set
,
4886 .class_delete
= htb_class_delete
,
4887 .class_get_stats
= htb_class_get_stats
,
4888 .class_dump_stats
= htb_class_dump_stats
4891 /* "linux-hfsc" traffic control class. */
4893 #define HFSC_N_QUEUES 0xf000
4901 struct tc_queue tc_queue
;
4906 static struct hfsc
*
4907 hfsc_get__(const struct netdev
*netdev_
)
4909 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4910 return CONTAINER_OF(netdev
->tc
, struct hfsc
, tc
);
4913 static struct hfsc_class
*
4914 hfsc_class_cast__(const struct tc_queue
*queue
)
4916 return CONTAINER_OF(queue
, struct hfsc_class
, tc_queue
);
4920 hfsc_install__(struct netdev
*netdev_
, uint32_t max_rate
)
4922 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4925 hfsc
= xmalloc(sizeof *hfsc
);
4926 tc_init(&hfsc
->tc
, &tc_ops_hfsc
);
4927 hfsc
->max_rate
= max_rate
;
4928 netdev
->tc
= &hfsc
->tc
;
4932 hfsc_update_queue__(struct netdev
*netdev
, unsigned int queue_id
,
4933 const struct hfsc_class
*hc
)
4937 struct hfsc_class
*hcp
;
4938 struct tc_queue
*queue
;
4940 hfsc
= hfsc_get__(netdev
);
4941 hash
= hash_int(queue_id
, 0);
4943 queue
= tc_find_queue__(netdev
, queue_id
, hash
);
4945 hcp
= hfsc_class_cast__(queue
);
4947 hcp
= xmalloc(sizeof *hcp
);
4948 queue
= &hcp
->tc_queue
;
4949 queue
->queue_id
= queue_id
;
4950 queue
->created
= time_msec();
4951 hmap_insert(&hfsc
->tc
.queues
, &queue
->hmap_node
, hash
);
4954 hcp
->min_rate
= hc
->min_rate
;
4955 hcp
->max_rate
= hc
->max_rate
;
4959 hfsc_parse_tca_options__(struct nlattr
*nl_options
, struct hfsc_class
*class)
4961 const struct tc_service_curve
*rsc
, *fsc
, *usc
;
4962 static const struct nl_policy tca_hfsc_policy
[] = {
4964 .type
= NL_A_UNSPEC
,
4966 .min_len
= sizeof(struct tc_service_curve
),
4969 .type
= NL_A_UNSPEC
,
4971 .min_len
= sizeof(struct tc_service_curve
),
4974 .type
= NL_A_UNSPEC
,
4976 .min_len
= sizeof(struct tc_service_curve
),
4979 struct nlattr
*attrs
[ARRAY_SIZE(tca_hfsc_policy
)];
4981 if (!nl_parse_nested(nl_options
, tca_hfsc_policy
,
4982 attrs
, ARRAY_SIZE(tca_hfsc_policy
))) {
4983 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options");
4987 rsc
= nl_attr_get(attrs
[TCA_HFSC_RSC
]);
4988 fsc
= nl_attr_get(attrs
[TCA_HFSC_FSC
]);
4989 usc
= nl_attr_get(attrs
[TCA_HFSC_USC
]);
4991 if (rsc
->m1
!= 0 || rsc
->d
!= 0 ||
4992 fsc
->m1
!= 0 || fsc
->d
!= 0 ||
4993 usc
->m1
!= 0 || usc
->d
!= 0) {
4994 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
4995 "Non-linear service curves are not supported.");
4999 if (rsc
->m2
!= fsc
->m2
) {
5000 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
5001 "Real-time service curves are not supported ");
5005 if (rsc
->m2
> usc
->m2
) {
5006 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
5007 "Min-rate service curve is greater than "
5008 "the max-rate service curve.");
5012 class->min_rate
= fsc
->m2
;
5013 class->max_rate
= usc
->m2
;
5018 hfsc_parse_tcmsg__(struct ofpbuf
*tcmsg
, unsigned int *queue_id
,
5019 struct hfsc_class
*options
,
5020 struct netdev_queue_stats
*stats
)
5023 unsigned int handle
;
5024 struct nlattr
*nl_options
;
5026 error
= tc_parse_class(tcmsg
, &handle
, &nl_options
, stats
);
5032 unsigned int major
, minor
;
5034 major
= tc_get_major(handle
);
5035 minor
= tc_get_minor(handle
);
5036 if (major
== 1 && minor
> 0 && minor
<= HFSC_N_QUEUES
) {
5037 *queue_id
= minor
- 1;
5044 error
= hfsc_parse_tca_options__(nl_options
, options
);
5051 hfsc_query_class__(const struct netdev
*netdev
, unsigned int handle
,
5052 unsigned int parent
, struct hfsc_class
*options
,
5053 struct netdev_queue_stats
*stats
)
5056 struct ofpbuf
*reply
;
5058 error
= tc_query_class(netdev
, handle
, parent
, &reply
);
5063 error
= hfsc_parse_tcmsg__(reply
, NULL
, options
, stats
);
5064 ofpbuf_delete(reply
);
5069 hfsc_parse_qdisc_details__(struct netdev
*netdev_
, const struct smap
*details
,
5070 struct hfsc_class
*class)
5072 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5074 uint32_t max_rate
= smap_get_ullong(details
, "max-rate", 0) / 8;
5076 enum netdev_features current
;
5078 netdev_linux_read_features(netdev
);
5079 current
= !netdev
->get_features_error
? netdev
->current
: 0;
5080 max_rate
= netdev_features_to_bps(current
, 100 * 1000 * 1000) / 8;
5083 class->min_rate
= max_rate
;
5084 class->max_rate
= max_rate
;
5088 hfsc_parse_class_details__(struct netdev
*netdev
,
5089 const struct smap
*details
,
5090 struct hfsc_class
* class)
5092 const struct hfsc
*hfsc
;
5093 uint32_t min_rate
, max_rate
;
5095 hfsc
= hfsc_get__(netdev
);
5097 min_rate
= smap_get_ullong(details
, "min-rate", 0) / 8;
5098 min_rate
= MAX(min_rate
, 1);
5099 min_rate
= MIN(min_rate
, hfsc
->max_rate
);
5101 max_rate
= smap_get_ullong(details
, "max-rate", hfsc
->max_rate
* 8) / 8;
5102 max_rate
= MAX(max_rate
, min_rate
);
5103 max_rate
= MIN(max_rate
, hfsc
->max_rate
);
5105 class->min_rate
= min_rate
;
5106 class->max_rate
= max_rate
;
5111 /* Create an HFSC qdisc.
5113 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
5115 hfsc_setup_qdisc__(struct netdev
* netdev
)
5117 struct tcmsg
*tcmsg
;
5118 struct ofpbuf request
;
5119 struct tc_hfsc_qopt opt
;
5121 tc_del_qdisc(netdev
);
5123 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
5124 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
5130 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
5131 tcmsg
->tcm_parent
= TC_H_ROOT
;
5133 memset(&opt
, 0, sizeof opt
);
5136 nl_msg_put_string(&request
, TCA_KIND
, "hfsc");
5137 nl_msg_put_unspec(&request
, TCA_OPTIONS
, &opt
, sizeof opt
);
5139 return tc_transact(&request
, NULL
);
5142 /* Create an HFSC class.
5144 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
5145 * sc rate <min_rate> ul rate <max_rate>" */
5147 hfsc_setup_class__(struct netdev
*netdev
, unsigned int handle
,
5148 unsigned int parent
, struct hfsc_class
*class)
5152 struct tcmsg
*tcmsg
;
5153 struct ofpbuf request
;
5154 struct tc_service_curve min
, max
;
5156 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWTCLASS
, NLM_F_CREATE
,
5163 tcmsg
->tcm_handle
= handle
;
5164 tcmsg
->tcm_parent
= parent
;
5168 min
.m2
= class->min_rate
;
5172 max
.m2
= class->max_rate
;
5174 nl_msg_put_string(&request
, TCA_KIND
, "hfsc");
5175 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
5176 nl_msg_put_unspec(&request
, TCA_HFSC_RSC
, &min
, sizeof min
);
5177 nl_msg_put_unspec(&request
, TCA_HFSC_FSC
, &min
, sizeof min
);
5178 nl_msg_put_unspec(&request
, TCA_HFSC_USC
, &max
, sizeof max
);
5179 nl_msg_end_nested(&request
, opt_offset
);
5181 error
= tc_transact(&request
, NULL
);
5183 VLOG_WARN_RL(&rl
, "failed to replace %s class %u:%u, parent %u:%u, "
5184 "min-rate %ubps, max-rate %ubps (%s)",
5185 netdev_get_name(netdev
),
5186 tc_get_major(handle
), tc_get_minor(handle
),
5187 tc_get_major(parent
), tc_get_minor(parent
),
5188 class->min_rate
, class->max_rate
, ovs_strerror(error
));
5195 hfsc_tc_install(struct netdev
*netdev
, const struct smap
*details
)
5198 struct hfsc_class
class;
5200 error
= hfsc_setup_qdisc__(netdev
);
5206 hfsc_parse_qdisc_details__(netdev
, details
, &class);
5207 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
5208 tc_make_handle(1, 0), &class);
5214 hfsc_install__(netdev
, class.max_rate
);
5219 hfsc_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
5222 struct queue_dump_state state
;
5223 struct hfsc_class hc
;
5226 hfsc_query_class__(netdev
, tc_make_handle(1, 0xfffe), 0, &hc
, NULL
);
5227 hfsc_install__(netdev
, hc
.max_rate
);
5229 if (!start_queue_dump(netdev
, &state
)) {
5233 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
5234 unsigned int queue_id
;
5236 if (!hfsc_parse_tcmsg__(&msg
, &queue_id
, &hc
, NULL
)) {
5237 hfsc_update_queue__(netdev
, queue_id
, &hc
);
5241 finish_queue_dump(&state
);
5246 hfsc_tc_destroy(struct tc
*tc
)
5249 struct hfsc_class
*hc
, *next
;
5251 hfsc
= CONTAINER_OF(tc
, struct hfsc
, tc
);
5253 HMAP_FOR_EACH_SAFE (hc
, next
, tc_queue
.hmap_node
, &hfsc
->tc
.queues
) {
5254 hmap_remove(&hfsc
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
5263 hfsc_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
5265 const struct hfsc
*hfsc
;
5266 hfsc
= hfsc_get__(netdev
);
5267 smap_add_format(details
, "max-rate", "%llu", 8ULL * hfsc
->max_rate
);
5272 hfsc_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
5275 struct hfsc_class
class;
5277 hfsc_parse_qdisc_details__(netdev
, details
, &class);
5278 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
5279 tc_make_handle(1, 0), &class);
5282 hfsc_get__(netdev
)->max_rate
= class.max_rate
;
5289 hfsc_class_get(const struct netdev
*netdev OVS_UNUSED
,
5290 const struct tc_queue
*queue
, struct smap
*details
)
5292 const struct hfsc_class
*hc
;
5294 hc
= hfsc_class_cast__(queue
);
5295 smap_add_format(details
, "min-rate", "%llu", 8ULL * hc
->min_rate
);
5296 if (hc
->min_rate
!= hc
->max_rate
) {
5297 smap_add_format(details
, "max-rate", "%llu", 8ULL * hc
->max_rate
);
5303 hfsc_class_set(struct netdev
*netdev
, unsigned int queue_id
,
5304 const struct smap
*details
)
5307 struct hfsc_class
class;
5309 error
= hfsc_parse_class_details__(netdev
, details
, &class);
5314 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, queue_id
+ 1),
5315 tc_make_handle(1, 0xfffe), &class);
5320 hfsc_update_queue__(netdev
, queue_id
, &class);
5325 hfsc_class_delete(struct netdev
*netdev
, struct tc_queue
*queue
)
5329 struct hfsc_class
*hc
;
5331 hc
= hfsc_class_cast__(queue
);
5332 hfsc
= hfsc_get__(netdev
);
5334 error
= tc_delete_class(netdev
, tc_make_handle(1, queue
->queue_id
+ 1));
5336 hmap_remove(&hfsc
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
5343 hfsc_class_get_stats(const struct netdev
*netdev
, const struct tc_queue
*queue
,
5344 struct netdev_queue_stats
*stats
)
5346 return hfsc_query_class__(netdev
, tc_make_handle(1, queue
->queue_id
+ 1),
5347 tc_make_handle(1, 0xfffe), NULL
, stats
);
5351 hfsc_class_dump_stats(const struct netdev
*netdev OVS_UNUSED
,
5352 const struct ofpbuf
*nlmsg
,
5353 netdev_dump_queue_stats_cb
*cb
, void *aux
)
5355 struct netdev_queue_stats stats
;
5356 unsigned int handle
, major
, minor
;
5359 error
= tc_parse_class(nlmsg
, &handle
, NULL
, &stats
);
5364 major
= tc_get_major(handle
);
5365 minor
= tc_get_minor(handle
);
5366 if (major
== 1 && minor
> 0 && minor
<= HFSC_N_QUEUES
) {
5367 (*cb
)(minor
- 1, &stats
, aux
);
5372 static const struct tc_ops tc_ops_hfsc
= {
5373 .linux_name
= "hfsc",
5374 .ovs_name
= "linux-hfsc",
5375 .n_queues
= HFSC_N_QUEUES
, /* n_queues */
5376 .tc_install
= hfsc_tc_install
,
5377 .tc_load
= hfsc_tc_load
,
5378 .tc_destroy
= hfsc_tc_destroy
,
5379 .qdisc_get
= hfsc_qdisc_get
,
5380 .qdisc_set
= hfsc_qdisc_set
,
5381 .class_get
= hfsc_class_get
,
5382 .class_set
= hfsc_class_set
,
5383 .class_delete
= hfsc_class_delete
,
5384 .class_get_stats
= hfsc_class_get_stats
,
5385 .class_dump_stats
= hfsc_class_dump_stats
,
5388 /* "linux-noop" traffic control class. */
5391 noop_install__(struct netdev
*netdev_
)
5393 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5394 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_default
);
5396 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
5400 noop_tc_install(struct netdev
*netdev
,
5401 const struct smap
*details OVS_UNUSED
)
5403 noop_install__(netdev
);
5408 noop_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
5410 noop_install__(netdev
);
5414 static const struct tc_ops tc_ops_noop
= {
5415 .ovs_name
= "linux-noop", /* ovs_name */
5416 .tc_install
= noop_tc_install
,
5417 .tc_load
= noop_tc_load
,
5420 /* "linux-default" traffic control class.
5422 * This class represents the default, unnamed Linux qdisc. It corresponds to
5423 * the "" (empty string) QoS type in the OVS database. */
5426 default_install__(struct netdev
*netdev_
)
5428 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5429 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_default
);
5431 /* Nothing but a tc class implementation is allowed to write to a tc. This
5432 * class never does that, so we can legitimately use a const tc object. */
5433 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
5437 default_tc_install(struct netdev
*netdev
,
5438 const struct smap
*details OVS_UNUSED
)
5440 default_install__(netdev
);
5445 default_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
5447 default_install__(netdev
);
5451 static const struct tc_ops tc_ops_default
= {
5452 .ovs_name
= "", /* ovs_name */
5453 .tc_install
= default_tc_install
,
5454 .tc_load
= default_tc_load
,
5457 /* "linux-other" traffic control class.
5462 other_tc_load(struct netdev
*netdev_
, struct ofpbuf
*nlmsg OVS_UNUSED
)
5464 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5465 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_other
);
5467 /* Nothing but a tc class implementation is allowed to write to a tc. This
5468 * class never does that, so we can legitimately use a const tc object. */
5469 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
5473 static const struct tc_ops tc_ops_other
= {
5474 .ovs_name
= "linux-other",
5475 .tc_load
= other_tc_load
,
/* Traffic control. */

/* Number of kernel "tc" ticks per second. */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 */
static unsigned int buffer_hz;
5502 static struct tcmsg
*
5503 netdev_linux_tc_make_request(const struct netdev
*netdev
, int type
,
5504 unsigned int flags
, struct ofpbuf
*request
)
5509 error
= get_ifindex(netdev
, &ifindex
);
5514 return tc_make_request(ifindex
, type
, flags
, request
);
5517 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
5520 * This function is equivalent to running:
5521 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
5522 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
5525 * The configuration and stats may be seen with the following command:
5526 * /sbin/tc -s filter show dev <devname> parent ffff:
5528 * Returns 0 if successful, otherwise a positive errno value.
5531 tc_add_policer(struct netdev
*netdev
,
5532 uint32_t kbits_rate
, uint32_t kbits_burst
)
5534 struct tc_police tc_police
;
5535 struct ofpbuf request
;
5536 struct tcmsg
*tcmsg
;
5537 size_t basic_offset
;
5538 size_t police_offset
;
5542 memset(&tc_police
, 0, sizeof tc_police
);
5543 tc_police
.action
= TC_POLICE_SHOT
;
5544 tc_police
.mtu
= mtu
;
5545 tc_fill_rate(&tc_police
.rate
, ((uint64_t) kbits_rate
* 1000)/8, mtu
);
5547 /* The following appears wrong in one way: In networking a kilobit is
5548 * usually 1000 bits but this uses 1024 bits.
5550 * However if you "fix" those problems then "tc filter show ..." shows
5551 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
5552 * 1,000,000 bits, whereas this actually ends up doing the right thing from
5553 * tc's point of view. Whatever. */
5554 tc_police
.burst
= tc_bytes_to_ticks(
5555 tc_police
.rate
.rate
, MIN(UINT32_MAX
/ 1024, kbits_burst
) * 1024 / 8);
5557 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWTFILTER
,
5558 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
5562 tcmsg
->tcm_parent
= tc_make_handle(0xffff, 0);
5563 tcmsg
->tcm_info
= tc_make_handle(49,
5564 (OVS_FORCE
uint16_t) htons(ETH_P_ALL
));
5566 nl_msg_put_string(&request
, TCA_KIND
, "basic");
5567 basic_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
5568 police_offset
= nl_msg_start_nested(&request
, TCA_BASIC_POLICE
);
5569 nl_msg_put_unspec(&request
, TCA_POLICE_TBF
, &tc_police
, sizeof tc_police
);
5570 tc_put_rtab(&request
, TCA_POLICE_RATE
, &tc_police
.rate
);
5571 nl_msg_end_nested(&request
, police_offset
);
5572 nl_msg_end_nested(&request
, basic_offset
);
5574 error
= tc_transact(&request
, NULL
);
5585 /* The values in psched are not individually very meaningful, but they are
5586 * important. The tables below show some values seen in the wild.
5590 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
5591 * (Before that, there are hints that it was 1000000000.)
5593 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
5597 * -----------------------------------
5598 * [1] 000c8000 000f4240 000f4240 00000064
5599 * [2] 000003e8 00000400 000f4240 3b9aca00
5600 * [3] 000003e8 00000400 000f4240 3b9aca00
5601 * [4] 000003e8 00000400 000f4240 00000064
5602 * [5] 000003e8 00000040 000f4240 3b9aca00
5603 * [6] 000003e8 00000040 000f4240 000000f9
5605 * a b c d ticks_per_s buffer_hz
5606 * ------- --------- ---------- ------------- ----------- -------------
5607 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
5608 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
5609 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
5610 * [4] 1,000 1,024 1,000,000 100 976,562 100
5611 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
5612 * [6] 1,000 64 1,000,000 249 15,625,000 249
5614 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
5615 * [2] 2.6.26-1-686-bigmem from Debian lenny
5616 * [3] 2.6.26-2-sparc64 from Debian lenny
5617 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
5618 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
5619 * [6] 2.6.34 from kernel.org on KVM
5621 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
5622 static const char fn
[] = "/proc/net/psched";
5623 unsigned int a
, b
, c
, d
;
5626 if (!ovsthread_once_start(&once
)) {
5633 stream
= fopen(fn
, "r");
5635 VLOG_WARN("%s: open failed: %s", fn
, ovs_strerror(errno
));
5639 if (fscanf(stream
, "%x %x %x %x", &a
, &b
, &c
, &d
) != 4) {
5640 VLOG_WARN("%s: read failed", fn
);
5644 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn
, a
, b
, c
, d
);
5647 if (!a
|| !b
|| !c
) {
5648 VLOG_WARN("%s: invalid scheduler parameters", fn
);
5652 ticks_per_s
= (double) a
* c
/ b
;
5656 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
5659 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn
, ticks_per_s
, buffer_hz
);
5662 ovsthread_once_done(&once
);
5665 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
5666 * rate of 'rate' bytes per second. */
5668 tc_ticks_to_bytes(unsigned int rate
, unsigned int ticks
)
5671 return (rate
* ticks
) / ticks_per_s
;
5674 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
5675 * rate of 'rate' bytes per second. */
5677 tc_bytes_to_ticks(unsigned int rate
, unsigned int size
)
5680 return rate
? ((unsigned long long int) ticks_per_s
* size
) / rate
: 0;
5683 /* Returns the number of bytes that need to be reserved for qdisc buffering at
5684 * a transmission rate of 'rate' bytes per second. */
5686 tc_buffer_per_jiffy(unsigned int rate
)
5689 return rate
/ buffer_hz
;
5693 tc_time_to_ticks(uint32_t time
) {
5695 return time
* (ticks_per_s
/ 1000000);
5698 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
5699 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
5700 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
5701 * stores NULL into it if it is absent.
5703 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
5706 * Returns 0 if successful, otherwise a positive errno value. */
5708 tc_parse_qdisc(const struct ofpbuf
*msg
, const char **kind
,
5709 struct nlattr
**options
)
5711 static const struct nl_policy tca_policy
[] = {
5712 [TCA_KIND
] = { .type
= NL_A_STRING
, .optional
= false },
5713 [TCA_OPTIONS
] = { .type
= NL_A_NESTED
, .optional
= true },
5715 struct nlattr
*ta
[ARRAY_SIZE(tca_policy
)];
5717 if (!nl_policy_parse(msg
, NLMSG_HDRLEN
+ sizeof(struct tcmsg
),
5718 tca_policy
, ta
, ARRAY_SIZE(ta
))) {
5719 VLOG_WARN_RL(&rl
, "failed to parse qdisc message");
5724 *kind
= nl_attr_get_string(ta
[TCA_KIND
]);
5728 *options
= ta
[TCA_OPTIONS
];
5743 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
5744 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
5745 * into '*options', and its queue statistics into '*stats'. Any of the output
5746 * arguments may be null.
5748 * Returns 0 if successful, otherwise a positive errno value. */
5750 tc_parse_class(const struct ofpbuf
*msg
, unsigned int *handlep
,
5751 struct nlattr
**options
, struct netdev_queue_stats
*stats
)
5753 static const struct nl_policy tca_policy
[] = {
5754 [TCA_OPTIONS
] = { .type
= NL_A_NESTED
, .optional
= false },
5755 [TCA_STATS2
] = { .type
= NL_A_NESTED
, .optional
= false },
5757 struct nlattr
*ta
[ARRAY_SIZE(tca_policy
)];
5759 if (!nl_policy_parse(msg
, NLMSG_HDRLEN
+ sizeof(struct tcmsg
),
5760 tca_policy
, ta
, ARRAY_SIZE(ta
))) {
5761 VLOG_WARN_RL(&rl
, "failed to parse class message");
5766 struct tcmsg
*tc
= ofpbuf_at_assert(msg
, NLMSG_HDRLEN
, sizeof *tc
);
5767 *handlep
= tc
->tcm_handle
;
5771 *options
= ta
[TCA_OPTIONS
];
5775 const struct gnet_stats_queue
*gsq
;
5776 struct gnet_stats_basic gsb
;
5778 static const struct nl_policy stats_policy
[] = {
5779 [TCA_STATS_BASIC
] = { .type
= NL_A_UNSPEC
, .optional
= false,
5780 .min_len
= sizeof gsb
},
5781 [TCA_STATS_QUEUE
] = { .type
= NL_A_UNSPEC
, .optional
= false,
5782 .min_len
= sizeof *gsq
},
5784 struct nlattr
*sa
[ARRAY_SIZE(stats_policy
)];
5786 if (!nl_parse_nested(ta
[TCA_STATS2
], stats_policy
,
5787 sa
, ARRAY_SIZE(sa
))) {
5788 VLOG_WARN_RL(&rl
, "failed to parse class stats");
5792 /* Alignment issues screw up the length of struct gnet_stats_basic on
5793 * some arch/bitsize combinations. Newer versions of Linux have a
5794 * struct gnet_stats_basic_packed, but we can't depend on that. The
5795 * easiest thing to do is just to make a copy. */
5796 memset(&gsb
, 0, sizeof gsb
);
5797 memcpy(&gsb
, nl_attr_get(sa
[TCA_STATS_BASIC
]),
5798 MIN(nl_attr_get_size(sa
[TCA_STATS_BASIC
]), sizeof gsb
));
5799 stats
->tx_bytes
= gsb
.bytes
;
5800 stats
->tx_packets
= gsb
.packets
;
5802 gsq
= nl_attr_get(sa
[TCA_STATS_QUEUE
]);
5803 stats
->tx_errors
= gsq
->drops
;
5813 memset(stats
, 0, sizeof *stats
);
5818 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
5821 tc_query_class(const struct netdev
*netdev
,
5822 unsigned int handle
, unsigned int parent
,
5823 struct ofpbuf
**replyp
)
5825 struct ofpbuf request
;
5826 struct tcmsg
*tcmsg
;
5829 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_GETTCLASS
, NLM_F_ECHO
,
5834 tcmsg
->tcm_handle
= handle
;
5835 tcmsg
->tcm_parent
= parent
;
5837 error
= tc_transact(&request
, replyp
);
5839 VLOG_WARN_RL(&rl
, "query %s class %u:%u (parent %u:%u) failed (%s)",
5840 netdev_get_name(netdev
),
5841 tc_get_major(handle
), tc_get_minor(handle
),
5842 tc_get_major(parent
), tc_get_minor(parent
),
5843 ovs_strerror(error
));
5848 /* Equivalent to "tc class del dev <name> handle <handle>". */
5850 tc_delete_class(const struct netdev
*netdev
, unsigned int handle
)
5852 struct ofpbuf request
;
5853 struct tcmsg
*tcmsg
;
5856 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_DELTCLASS
, 0, &request
);
5860 tcmsg
->tcm_handle
= handle
;
5861 tcmsg
->tcm_parent
= 0;
5863 error
= tc_transact(&request
, NULL
);
5865 VLOG_WARN_RL(&rl
, "delete %s class %u:%u failed (%s)",
5866 netdev_get_name(netdev
),
5867 tc_get_major(handle
), tc_get_minor(handle
),
5868 ovs_strerror(error
));
5873 /* Equivalent to "tc qdisc del dev <name> root". */
5875 tc_del_qdisc(struct netdev
*netdev_
)
5877 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5878 struct ofpbuf request
;
5879 struct tcmsg
*tcmsg
;
5882 tcmsg
= netdev_linux_tc_make_request(netdev_
, RTM_DELQDISC
, 0, &request
);
5886 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
5887 tcmsg
->tcm_parent
= TC_H_ROOT
;
5889 error
= tc_transact(&request
, NULL
);
5890 if (error
== EINVAL
) {
5891 /* EINVAL probably means that the default qdisc was in use, in which
5892 * case we've accomplished our purpose. */
5895 if (!error
&& netdev
->tc
) {
5896 if (netdev
->tc
->ops
->tc_destroy
) {
5897 netdev
->tc
->ops
->tc_destroy(netdev
->tc
);
5905 getqdisc_is_safe(void)
5907 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
5908 static bool safe
= false;
5910 if (ovsthread_once_start(&once
)) {
5911 struct utsname utsname
;
5914 if (uname(&utsname
) == -1) {
5915 VLOG_WARN("uname failed (%s)", ovs_strerror(errno
));
5916 } else if (!ovs_scan(utsname
.release
, "%d.%d", &major
, &minor
)) {
5917 VLOG_WARN("uname reported bad OS release (%s)", utsname
.release
);
5918 } else if (major
< 2 || (major
== 2 && minor
< 35)) {
5919 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5924 ovsthread_once_done(&once
);
5929 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
5930 * kernel to determine what they are. Returns 0 if successful, otherwise a
5931 * positive errno value. */
5933 tc_query_qdisc(const struct netdev
*netdev_
)
5935 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5936 struct ofpbuf request
, *qdisc
;
5937 const struct tc_ops
*ops
;
5938 struct tcmsg
*tcmsg
;
5946 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5947 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5948 * 2.6.35 without that fix backported to it.
5950 * To avoid the OOPS, we must not make a request that would attempt to dump
5951 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5952 * few others. There are a few ways that I can see to do this, but most of
5953 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5954 * technique chosen here is to assume that any non-default qdisc that we
5955 * create will have a class with handle 1:0. The built-in qdiscs only have
5956 * a class with handle 0:0.
5958 * On Linux 2.6.35+ we use the straightforward method because it allows us
5959 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5960 * in such a case we get no response at all from the kernel (!) if a
5961 * builtin qdisc is in use (which is later caught by "!error &&
5962 * !qdisc->size"). */
5963 tcmsg
= netdev_linux_tc_make_request(netdev_
, RTM_GETQDISC
, NLM_F_ECHO
,
5968 tcmsg
->tcm_handle
= tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5969 tcmsg
->tcm_parent
= getqdisc_is_safe() ? TC_H_ROOT
: 0;
5971 /* Figure out what tc class to instantiate. */
5972 error
= tc_transact(&request
, &qdisc
);
5973 if (!error
&& qdisc
->size
) {
5976 error
= tc_parse_qdisc(qdisc
, &kind
, NULL
);
5978 ops
= &tc_ops_other
;
5980 ops
= tc_lookup_linux_name(kind
);
5982 static struct vlog_rate_limit rl2
= VLOG_RATE_LIMIT_INIT(1, 1);
5983 VLOG_DBG_RL(&rl2
, "unknown qdisc \"%s\"", kind
);
5985 ops
= &tc_ops_other
;
5988 } else if ((!error
&& !qdisc
->size
) || error
== ENOENT
) {
5989 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5990 * set up by some other entity that doesn't have a handle 1:0. We will
5991 * assume that it's the system default qdisc. */
5992 ops
= &tc_ops_default
;
5995 /* Who knows? Maybe the device got deleted. */
5996 VLOG_WARN_RL(&rl
, "query %s qdisc failed (%s)",
5997 netdev_get_name(netdev_
), ovs_strerror(error
));
5998 ops
= &tc_ops_other
;
6001 /* Instantiate it. */
6002 load_error
= ops
->tc_load(CONST_CAST(struct netdev
*, netdev_
), qdisc
);
6003 ovs_assert((load_error
== 0) == (netdev
->tc
!= NULL
));
6004 ofpbuf_delete(qdisc
);
6006 return error
? error
: load_error
;
6009 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
6010 approximate the time to transmit packets of various lengths. For an MTU of
6011 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
6012 represents two possible packet lengths; for a MTU of 513 through 1024, four
6013 possible lengths; and so on.
6015 Returns, for the specified 'mtu', the number of bits that packet lengths
6016 need to be shifted right to fit within such a 256-entry table. */
6018 tc_calc_cell_log(unsigned int mtu
)
6023 mtu
= ETH_PAYLOAD_MAX
;
6025 mtu
+= ETH_HEADER_LEN
+ VLAN_HEADER_LEN
;
6027 for (cell_log
= 0; mtu
>= 256; cell_log
++) {
6034 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
6037 tc_fill_rate(struct tc_ratespec
*rate
, uint64_t Bps
, int mtu
)
6039 memset(rate
, 0, sizeof *rate
);
6040 rate
->cell_log
= tc_calc_cell_log(mtu
);
6041 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
6042 /* rate->cell_align = 0; */ /* distro headers. */
6043 rate
->mpu
= ETH_TOTAL_MIN
;
6047 /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
6048 * attribute of the specified "type".
6050 * See tc_calc_cell_log() above for a description of "rtab"s. */
6052 tc_put_rtab(struct ofpbuf
*msg
, uint16_t type
, const struct tc_ratespec
*rate
)
6057 rtab
= nl_msg_put_unspec_uninit(msg
, type
, TC_RTAB_SIZE
);
6058 for (i
= 0; i
< TC_RTAB_SIZE
/ sizeof *rtab
; i
++) {
6059 unsigned packet_size
= (i
+ 1) << rate
->cell_log
;
6060 if (packet_size
< rate
->mpu
) {
6061 packet_size
= rate
->mpu
;
6063 rtab
[i
] = tc_bytes_to_ticks(rate
->rate
, packet_size
);
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.) */
static int
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    /* Guarantee at least one jiffy's worth of data plus one MTU. */
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
}
6078 /* Linux-only functions declared in netdev-linux.h */
6080 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
6081 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
6083 netdev_linux_ethtool_set_flag(struct netdev
*netdev
, uint32_t flag
,
6084 const char *flag_name
, bool enable
)
6086 const char *netdev_name
= netdev_get_name(netdev
);
6087 struct ethtool_value evalue
;
6091 COVERAGE_INC(netdev_get_ethtool
);
6092 memset(&evalue
, 0, sizeof evalue
);
6093 error
= netdev_linux_do_ethtool(netdev_name
,
6094 (struct ethtool_cmd
*)&evalue
,
6095 ETHTOOL_GFLAGS
, "ETHTOOL_GFLAGS");
6100 COVERAGE_INC(netdev_set_ethtool
);
6101 new_flags
= (evalue
.data
& ~flag
) | (enable
? flag
: 0);
6102 if (new_flags
== evalue
.data
) {
6105 evalue
.data
= new_flags
;
6106 error
= netdev_linux_do_ethtool(netdev_name
,
6107 (struct ethtool_cmd
*)&evalue
,
6108 ETHTOOL_SFLAGS
, "ETHTOOL_SFLAGS");
6113 COVERAGE_INC(netdev_get_ethtool
);
6114 memset(&evalue
, 0, sizeof evalue
);
6115 error
= netdev_linux_do_ethtool(netdev_name
,
6116 (struct ethtool_cmd
*)&evalue
,
6117 ETHTOOL_GFLAGS
, "ETHTOOL_GFLAGS");
6122 if (new_flags
!= evalue
.data
) {
6123 VLOG_WARN_RL(&rl
, "attempt to %s ethtool %s flag on network "
6124 "device %s failed", enable
? "enable" : "disable",
6125 flag_name
, netdev_name
);
6132 /* Utility functions. */
6134 /* Copies 'src' into 'dst', performing format conversion in the process. */
6136 netdev_stats_from_rtnl_link_stats(struct netdev_stats
*dst
,
6137 const struct rtnl_link_stats
*src
)
6139 dst
->rx_packets
= src
->rx_packets
;
6140 dst
->tx_packets
= src
->tx_packets
;
6141 dst
->rx_bytes
= src
->rx_bytes
;
6142 dst
->tx_bytes
= src
->tx_bytes
;
6143 dst
->rx_errors
= src
->rx_errors
;
6144 dst
->tx_errors
= src
->tx_errors
;
6145 dst
->rx_dropped
= src
->rx_dropped
;
6146 dst
->tx_dropped
= src
->tx_dropped
;
6147 dst
->multicast
= src
->multicast
;
6148 dst
->collisions
= src
->collisions
;
6149 dst
->rx_length_errors
= src
->rx_length_errors
;
6150 dst
->rx_over_errors
= src
->rx_over_errors
;
6151 dst
->rx_crc_errors
= src
->rx_crc_errors
;
6152 dst
->rx_frame_errors
= src
->rx_frame_errors
;
6153 dst
->rx_fifo_errors
= src
->rx_fifo_errors
;
6154 dst
->rx_missed_errors
= src
->rx_missed_errors
;
6155 dst
->tx_aborted_errors
= src
->tx_aborted_errors
;
6156 dst
->tx_carrier_errors
= src
->tx_carrier_errors
;
6157 dst
->tx_fifo_errors
= src
->tx_fifo_errors
;
6158 dst
->tx_heartbeat_errors
= src
->tx_heartbeat_errors
;
6159 dst
->tx_window_errors
= src
->tx_window_errors
;
6162 /* Copies 'src' into 'dst', performing format conversion in the process. */
6164 netdev_stats_from_rtnl_link_stats64(struct netdev_stats
*dst
,
6165 const struct rtnl_link_stats64
*src
)
6167 dst
->rx_packets
= src
->rx_packets
;
6168 dst
->tx_packets
= src
->tx_packets
;
6169 dst
->rx_bytes
= src
->rx_bytes
;
6170 dst
->tx_bytes
= src
->tx_bytes
;
6171 dst
->rx_errors
= src
->rx_errors
;
6172 dst
->tx_errors
= src
->tx_errors
;
6173 dst
->rx_dropped
= src
->rx_dropped
;
6174 dst
->tx_dropped
= src
->tx_dropped
;
6175 dst
->multicast
= src
->multicast
;
6176 dst
->collisions
= src
->collisions
;
6177 dst
->rx_length_errors
= src
->rx_length_errors
;
6178 dst
->rx_over_errors
= src
->rx_over_errors
;
6179 dst
->rx_crc_errors
= src
->rx_crc_errors
;
6180 dst
->rx_frame_errors
= src
->rx_frame_errors
;
6181 dst
->rx_fifo_errors
= src
->rx_fifo_errors
;
6182 dst
->rx_missed_errors
= src
->rx_missed_errors
;
6183 dst
->tx_aborted_errors
= src
->tx_aborted_errors
;
6184 dst
->tx_carrier_errors
= src
->tx_carrier_errors
;
6185 dst
->tx_fifo_errors
= src
->tx_fifo_errors
;
6186 dst
->tx_heartbeat_errors
= src
->tx_heartbeat_errors
;
6187 dst
->tx_window_errors
= src
->tx_window_errors
;
6191 get_stats_via_netlink(const struct netdev
*netdev_
, struct netdev_stats
*stats
)
6193 struct ofpbuf request
;
6194 struct ofpbuf
*reply
;
6197 /* Filtering all counters by default */
6198 memset(stats
, 0xFF, sizeof(struct netdev_stats
));
6200 ofpbuf_init(&request
, 0);
6201 nl_msg_put_nlmsghdr(&request
,
6202 sizeof(struct ifinfomsg
) + NL_ATTR_SIZE(IFNAMSIZ
),
6203 RTM_GETLINK
, NLM_F_REQUEST
);
6204 ofpbuf_put_zeros(&request
, sizeof(struct ifinfomsg
));
6205 nl_msg_put_string(&request
, IFLA_IFNAME
, netdev_get_name(netdev_
));
6206 error
= nl_transact(NETLINK_ROUTE
, &request
, &reply
);
6207 ofpbuf_uninit(&request
);
6212 if (ofpbuf_try_pull(reply
, NLMSG_HDRLEN
+ sizeof(struct ifinfomsg
))) {
6213 const struct nlattr
*a
= nl_attr_find(reply
, 0, IFLA_STATS64
);
6214 if (a
&& nl_attr_get_size(a
) >= sizeof(struct rtnl_link_stats64
)) {
6215 netdev_stats_from_rtnl_link_stats64(stats
, nl_attr_get(a
));
6218 a
= nl_attr_find(reply
, 0, IFLA_STATS
);
6219 if (a
&& nl_attr_get_size(a
) >= sizeof(struct rtnl_link_stats
)) {
6220 netdev_stats_from_rtnl_link_stats(stats
, nl_attr_get(a
));
6223 VLOG_WARN_RL(&rl
, "RTM_GETLINK reply lacks stats");
6228 VLOG_WARN_RL(&rl
, "short RTM_GETLINK reply");
6233 ofpbuf_delete(reply
);
6238 get_flags(const struct netdev
*dev
, unsigned int *flags
)
6244 error
= af_inet_ifreq_ioctl(dev
->name
, &ifr
, SIOCGIFFLAGS
, "SIOCGIFFLAGS");
6246 *flags
= ifr
.ifr_flags
;
6252 set_flags(const char *name
, unsigned int flags
)
6256 ifr
.ifr_flags
= flags
;
6257 return af_inet_ifreq_ioctl(name
, &ifr
, SIOCSIFFLAGS
, "SIOCSIFFLAGS");
6261 linux_get_ifindex(const char *netdev_name
)
6266 ovs_strzcpy(ifr
.ifr_name
, netdev_name
, sizeof ifr
.ifr_name
);
6267 COVERAGE_INC(netdev_get_ifindex
);
6269 error
= af_inet_ioctl(SIOCGIFINDEX
, &ifr
);
6271 /* ENODEV probably means that a vif disappeared asynchronously and
6272 * hasn't been removed from the database yet, so reduce the log level
6273 * to INFO for that case. */
6274 VLOG_RL(&rl
, error
== ENODEV
? VLL_INFO
: VLL_ERR
,
6275 "ioctl(SIOCGIFINDEX) on %s device failed: %s",
6276 netdev_name
, ovs_strerror(error
));
6279 return ifr
.ifr_ifindex
;
6283 get_ifindex(const struct netdev
*netdev_
, int *ifindexp
)
6285 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
6287 if (!(netdev
->cache_valid
& VALID_IFINDEX
)) {
6288 netdev_linux_update_via_netlink(netdev
);
6291 if (!(netdev
->cache_valid
& VALID_IFINDEX
)) {
6292 /* Fall back to ioctl if netlink fails */
6293 int ifindex
= linux_get_ifindex(netdev_get_name(netdev_
));
6296 netdev
->get_ifindex_error
= -ifindex
;
6297 netdev
->ifindex
= 0;
6299 netdev
->get_ifindex_error
= 0;
6300 netdev
->ifindex
= ifindex
;
6302 netdev
->cache_valid
|= VALID_IFINDEX
;
6305 *ifindexp
= netdev
->ifindex
;
6306 return netdev
->get_ifindex_error
;
6310 netdev_linux_update_via_netlink(struct netdev_linux
*netdev
)
6312 struct ofpbuf request
;
6313 struct ofpbuf
*reply
;
6314 struct rtnetlink_change chg
;
6315 struct rtnetlink_change
*change
= &chg
;
6318 ofpbuf_init(&request
, 0);
6319 nl_msg_put_nlmsghdr(&request
,
6320 sizeof(struct ifinfomsg
) + NL_ATTR_SIZE(IFNAMSIZ
) +
6321 NL_A_U32_SIZE
, RTM_GETLINK
, NLM_F_REQUEST
);
6322 ofpbuf_put_zeros(&request
, sizeof(struct ifinfomsg
));
6324 /* The correct identifiers for a Linux device are netnsid and ifindex,
6325 * but ifindex changes as the port is moved to another network namespace
6326 * and the interface name statically stored in ovsdb. */
6327 nl_msg_put_string(&request
, IFLA_IFNAME
, netdev_get_name(&netdev
->up
));
6328 if (netdev_linux_netnsid_is_remote(netdev
)) {
6329 nl_msg_put_u32(&request
, IFLA_IF_NETNSID
, netdev
->netnsid
);
6331 error
= nl_transact(NETLINK_ROUTE
, &request
, &reply
);
6332 ofpbuf_uninit(&request
);
6334 ofpbuf_delete(reply
);
6338 if (rtnetlink_parse(reply
, change
)
6339 && change
->nlmsg_type
== RTM_NEWLINK
) {
6340 bool changed
= false;
6343 /* Update netdev from rtnl msg and increment its seq if needed. */
6344 if ((change
->ifi_flags
^ netdev
->ifi_flags
) & IFF_RUNNING
) {
6345 netdev
->carrier_resets
++;
6348 if (change
->ifi_flags
!= netdev
->ifi_flags
) {
6349 netdev
->ifi_flags
= change
->ifi_flags
;
6352 if (change
->mtu
&& change
->mtu
!= netdev
->mtu
) {
6353 netdev
->mtu
= change
->mtu
;
6354 netdev
->cache_valid
|= VALID_MTU
;
6355 netdev
->netdev_mtu_error
= 0;
6358 if (!eth_addr_is_zero(change
->mac
)
6359 && !eth_addr_equals(change
->mac
, netdev
->etheraddr
)) {
6360 netdev
->etheraddr
= change
->mac
;
6361 netdev
->cache_valid
|= VALID_ETHERADDR
;
6362 netdev
->ether_addr_error
= 0;
6365 if (change
->if_index
!= netdev
->ifindex
) {
6366 netdev
->ifindex
= change
->if_index
;
6367 netdev
->cache_valid
|= VALID_IFINDEX
;
6368 netdev
->get_ifindex_error
= 0;
6371 if (change
->master
&& netdev_linux_kind_is_lag(change
->master
)) {
6372 netdev
->is_lag_master
= true;
6375 netdev_change_seq_changed(&netdev
->up
);
6381 ofpbuf_delete(reply
);
6386 get_etheraddr(const char *netdev_name
, struct eth_addr
*ea
)
6392 memset(&ifr
, 0, sizeof ifr
);
6393 ovs_strzcpy(ifr
.ifr_name
, netdev_name
, sizeof ifr
.ifr_name
);
6394 COVERAGE_INC(netdev_get_hwaddr
);
6395 error
= af_inet_ioctl(SIOCGIFHWADDR
, &ifr
);
6397 /* ENODEV probably means that a vif disappeared asynchronously and
6398 * hasn't been removed from the database yet, so reduce the log level
6399 * to INFO for that case. */
6400 VLOG(error
== ENODEV
? VLL_INFO
: VLL_ERR
,
6401 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
6402 netdev_name
, ovs_strerror(error
));
6405 hwaddr_family
= ifr
.ifr_hwaddr
.sa_family
;
6406 if (hwaddr_family
!= AF_UNSPEC
&& hwaddr_family
!= ARPHRD_ETHER
&&
6407 hwaddr_family
!= ARPHRD_NONE
) {
6408 VLOG_INFO("%s device has unknown hardware address family %d",
6409 netdev_name
, hwaddr_family
);
6412 memcpy(ea
, ifr
.ifr_hwaddr
.sa_data
, ETH_ADDR_LEN
);
6417 set_etheraddr(const char *netdev_name
, const struct eth_addr mac
)
6422 memset(&ifr
, 0, sizeof ifr
);
6423 ovs_strzcpy(ifr
.ifr_name
, netdev_name
, sizeof ifr
.ifr_name
);
6424 ifr
.ifr_hwaddr
.sa_family
= ARPHRD_ETHER
;
6425 memcpy(ifr
.ifr_hwaddr
.sa_data
, &mac
, ETH_ADDR_LEN
);
6426 COVERAGE_INC(netdev_set_hwaddr
);
6427 error
= af_inet_ioctl(SIOCSIFHWADDR
, &ifr
);
6429 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
6430 netdev_name
, ovs_strerror(error
));
6436 netdev_linux_do_ethtool(const char *name
, struct ethtool_cmd
*ecmd
,
6437 int cmd
, const char *cmd_name
)
6442 memset(&ifr
, 0, sizeof ifr
);
6443 ovs_strzcpy(ifr
.ifr_name
, name
, sizeof ifr
.ifr_name
);
6444 ifr
.ifr_data
= (caddr_t
) ecmd
;
6447 error
= af_inet_ioctl(SIOCETHTOOL
, &ifr
);
6449 if (error
!= EOPNOTSUPP
) {
6450 VLOG_WARN_RL(&rl
, "ethtool command %s on network device %s "
6451 "failed: %s", cmd_name
, name
, ovs_strerror(error
));
6453 /* The device doesn't support this operation. That's pretty
6454 * common, so there's no point in logging anything. */
6460 /* Returns an AF_PACKET raw socket or a negative errno value. */
6462 af_packet_sock(void)
6464 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
6467 if (ovsthread_once_start(&once
)) {
6468 sock
= socket(AF_PACKET
, SOCK_RAW
, 0);
6470 int error
= set_nonblocking(sock
);
6474 } else if (userspace_tso_enabled()) {
6476 error
= setsockopt(sock
, SOL_PACKET
, PACKET_VNET_HDR
, &val
,
6480 VLOG_ERR("failed to enable vnet hdr in raw socket: %s",
6481 ovs_strerror(errno
));
6488 VLOG_ERR("failed to create packet socket: %s",
6489 ovs_strerror(errno
));
6491 ovsthread_once_done(&once
);
6498 netdev_linux_parse_l2(struct dp_packet
*b
, uint16_t *l4proto
)
6500 struct eth_header
*eth_hdr
;
6504 eth_hdr
= dp_packet_at(b
, 0, ETH_HEADER_LEN
);
6509 l2_len
= ETH_HEADER_LEN
;
6510 eth_type
= eth_hdr
->eth_type
;
6511 if (eth_type_vlan(eth_type
)) {
6512 struct vlan_header
*vlan
= dp_packet_at(b
, l2_len
, VLAN_HEADER_LEN
);
6518 eth_type
= vlan
->vlan_next_type
;
6519 l2_len
+= VLAN_HEADER_LEN
;
6522 if (eth_type
== htons(ETH_TYPE_IP
)) {
6523 struct ip_header
*ip_hdr
= dp_packet_at(b
, l2_len
, IP_HEADER_LEN
);
6529 *l4proto
= ip_hdr
->ip_proto
;
6530 dp_packet_hwol_set_tx_ipv4(b
);
6531 } else if (eth_type
== htons(ETH_TYPE_IPV6
)) {
6532 struct ovs_16aligned_ip6_hdr
*nh6
;
6534 nh6
= dp_packet_at(b
, l2_len
, IPV6_HEADER_LEN
);
6539 *l4proto
= nh6
->ip6_ctlun
.ip6_un1
.ip6_un1_nxt
;
6540 dp_packet_hwol_set_tx_ipv6(b
);
/* Strips the leading virtio_net_hdr from packet 'b' and translates its
 * checksum/GSO requests into the corresponding dp_packet hardware-offload
 * flags.  Returns 0 on success (including the common no-offload case) and
 * -EINVAL if the packet is too short for the header or its L2/L3 headers
 * cannot be parsed.  The extraction had lost the early returns; restored
 * here. */
static int
netdev_linux_parse_vnet_hdr(struct dp_packet *b)
{
    struct virtio_net_hdr *vnet = dp_packet_pull(b, sizeof *vnet);
    uint16_t l4proto = 0;

    if (OVS_UNLIKELY(!vnet)) {
        return -EINVAL;
    }

    /* Fast path: no offloads requested at all. */
    if (vnet->flags == 0 && vnet->gso_type == VIRTIO_NET_HDR_GSO_NONE) {
        return 0;
    }

    if (netdev_linux_parse_l2(b, &l4proto)) {
        return -EINVAL;
    }

    if (vnet->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
        if (l4proto == IPPROTO_TCP) {
            dp_packet_hwol_set_csum_tcp(b);
        } else if (l4proto == IPPROTO_UDP) {
            dp_packet_hwol_set_csum_udp(b);
        } else if (l4proto == IPPROTO_SCTP) {
            dp_packet_hwol_set_csum_sctp(b);
        }
    }

    if (l4proto && vnet->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
        uint8_t allowed_mask = VIRTIO_NET_HDR_GSO_TCPV4
                                | VIRTIO_NET_HDR_GSO_TCPV6
                                | VIRTIO_NET_HDR_GSO_UDP;
        uint8_t type = vnet->gso_type & allowed_mask;

        /* Only TCP segmentation is translated; GSO_UDP is masked in but
         * deliberately not mapped to an offload flag here. */
        if (type == VIRTIO_NET_HDR_GSO_TCPV4
            || type == VIRTIO_NET_HDR_GSO_TCPV6) {
            dp_packet_hwol_set_tcp_seg(b);
        }
    }

    return 0;
}
6590 netdev_linux_prepend_vnet_hdr(struct dp_packet
*b
, int mtu
)
6592 struct virtio_net_hdr
*vnet
= dp_packet_push_zeros(b
, sizeof *vnet
);
6594 if (dp_packet_hwol_is_tso(b
)) {
6595 uint16_t hdr_len
= ((char *)dp_packet_l4(b
) - (char *)dp_packet_eth(b
))
6598 vnet
->hdr_len
= (OVS_FORCE __virtio16
)hdr_len
;
6599 vnet
->gso_size
= (OVS_FORCE __virtio16
)(mtu
- hdr_len
);
6600 if (dp_packet_hwol_is_ipv4(b
)) {
6601 vnet
->gso_type
= VIRTIO_NET_HDR_GSO_TCPV4
;
6603 vnet
->gso_type
= VIRTIO_NET_HDR_GSO_TCPV6
;
6607 vnet
->flags
= VIRTIO_NET_HDR_GSO_NONE
;
6610 if (dp_packet_hwol_l4_mask(b
)) {
6611 vnet
->flags
= VIRTIO_NET_HDR_F_NEEDS_CSUM
;
6612 vnet
->csum_start
= (OVS_FORCE __virtio16
)((char *)dp_packet_l4(b
)
6613 - (char *)dp_packet_eth(b
));
6615 if (dp_packet_hwol_l4_is_tcp(b
)) {
6616 vnet
->csum_offset
= (OVS_FORCE __virtio16
) __builtin_offsetof(
6617 struct tcp_header
, tcp_csum
);
6618 } else if (dp_packet_hwol_l4_is_udp(b
)) {
6619 vnet
->csum_offset
= (OVS_FORCE __virtio16
) __builtin_offsetof(
6620 struct udp_header
, udp_csum
);
6621 } else if (dp_packet_hwol_l4_is_sctp(b
)) {
6622 vnet
->csum_offset
= (OVS_FORCE __virtio16
) __builtin_offsetof(
6623 struct sctp_header
, sctp_csum
);
6625 VLOG_WARN_RL(&rl
, "Unsupported L4 protocol");