2 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "netdev-linux.h"
23 #include <sys/types.h>
24 #include <netinet/in.h>
25 #include <arpa/inet.h>
28 #include <linux/filter.h>
29 #include <linux/gen_stats.h>
30 #include <linux/if_ether.h>
31 #include <linux/if_tun.h>
32 #include <linux/types.h>
33 #include <linux/ethtool.h>
34 #include <linux/mii.h>
35 #include <linux/rtnetlink.h>
36 #include <linux/sockios.h>
37 #include <sys/ioctl.h>
38 #include <sys/socket.h>
39 #include <sys/utsname.h>
40 #include <netpacket/packet.h>
42 #include <net/if_arp.h>
43 #include <net/route.h>
50 #include "dp-packet.h"
51 #include "dpif-netlink.h"
52 #include "dpif-netdev.h"
53 #include "openvswitch/dynamic-string.h"
54 #include "fatal-signal.h"
56 #include "openvswitch/hmap.h"
57 #include "netdev-provider.h"
58 #include "netdev-tc-offloads.h"
59 #include "netdev-vport.h"
60 #include "netlink-notifier.h"
61 #include "netlink-socket.h"
64 #include "openvswitch/ofpbuf.h"
65 #include "openflow/openflow.h"
66 #include "ovs-atomic.h"
68 #include "openvswitch/poll-loop.h"
69 #include "rtnetlink.h"
70 #include "openvswitch/shash.h"
71 #include "socket-util.h"
75 #include "unaligned.h"
76 #include "openvswitch/vlog.h"
79 VLOG_DEFINE_THIS_MODULE(netdev_linux
);
81 COVERAGE_DEFINE(netdev_set_policing
);
82 COVERAGE_DEFINE(netdev_arp_lookup
);
83 COVERAGE_DEFINE(netdev_get_ifindex
);
84 COVERAGE_DEFINE(netdev_get_hwaddr
);
85 COVERAGE_DEFINE(netdev_set_hwaddr
);
86 COVERAGE_DEFINE(netdev_get_ethtool
);
87 COVERAGE_DEFINE(netdev_set_ethtool
);
90 #ifndef IFLA_IF_NETNSID
91 #define IFLA_IF_NETNSID 0x45
93 /* These were introduced in Linux 2.6.14, so they might be missing if we have
95 #ifndef ADVERTISED_Pause
96 #define ADVERTISED_Pause (1 << 13)
98 #ifndef ADVERTISED_Asym_Pause
99 #define ADVERTISED_Asym_Pause (1 << 14)
102 /* These were introduced in Linux 2.6.24, so they might be missing if we
103 * have old headers. */
104 #ifndef ETHTOOL_GFLAGS
105 #define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
107 #ifndef ETHTOOL_SFLAGS
108 #define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
111 /* This was introduced in Linux 2.6.25, so it might be missing if we have old
114 #define TC_RTAB_SIZE 1024
117 #ifndef TCM_IFINDEX_MAGIC_BLOCK
118 #define TCM_IFINDEX_MAGIC_BLOCK (0xFFFFFFFFU)
121 /* Linux 2.6.21 introduced struct tpacket_auxdata.
122 * Linux 2.6.27 added the tp_vlan_tci member.
123 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
124 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
125 * TP_STATUS_VLAN_TPID_VALID.
127 * With all this churn it's easiest to unconditionally define a replacement
128 * structure that has everything we want.
130 #ifndef PACKET_AUXDATA
131 #define PACKET_AUXDATA 8
133 #ifndef TP_STATUS_VLAN_VALID
134 #define TP_STATUS_VLAN_VALID (1 << 4)
136 #ifndef TP_STATUS_VLAN_TPID_VALID
137 #define TP_STATUS_VLAN_TPID_VALID (1 << 6)
139 #undef tpacket_auxdata
140 #define tpacket_auxdata rpl_tpacket_auxdata
141 struct tpacket_auxdata
{
147 uint16_t tp_vlan_tci
;
148 uint16_t tp_vlan_tpid
;
151 /* Linux 2.6.27 introduced ethtool_cmd_speed
153 * To avoid revisiting problems reported with using configure to detect
154 * compatibility (see report at
155 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html)
156 * unconditionally replace ethtool_cmd_speed. */
157 #define ethtool_cmd_speed rpl_ethtool_cmd_speed
/* Replacement for ethtool_cmd_speed(), which first appeared in Linux 2.6.27.
 *
 * Returns the link speed encoded in 'ep' by combining the low 16 bits
 * ('speed') with the high 16 bits ('speed_hi') into a single 32-bit value. */
static inline uint32_t
rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
{
    return ep->speed | (ep->speed_hi << 16);
}
163 /* Linux 2.6.30 introduced supported and advertised flags for
164 * 1G base KX, and 10G base KX4, KR and R. */
165 #ifndef SUPPORTED_1000baseKX_Full
166 #define SUPPORTED_1000baseKX_Full (1 << 17)
167 #define SUPPORTED_10000baseKX4_Full (1 << 18)
168 #define SUPPORTED_10000baseKR_Full (1 << 19)
169 #define SUPPORTED_10000baseR_FEC (1 << 20)
170 #define ADVERTISED_1000baseKX_Full (1 << 17)
171 #define ADVERTISED_10000baseKX4_Full (1 << 18)
172 #define ADVERTISED_10000baseKR_Full (1 << 19)
173 #define ADVERTISED_10000baseR_FEC (1 << 20)
176 /* Linux 3.5 introduced supported and advertised flags for
177 * 40G base KR4, CR4, SR4 and LR4. */
178 #ifndef SUPPORTED_40000baseKR4_Full
179 #define SUPPORTED_40000baseKR4_Full (1 << 23)
180 #define SUPPORTED_40000baseCR4_Full (1 << 24)
181 #define SUPPORTED_40000baseSR4_Full (1 << 25)
182 #define SUPPORTED_40000baseLR4_Full (1 << 26)
183 #define ADVERTISED_40000baseKR4_Full (1 << 23)
184 #define ADVERTISED_40000baseCR4_Full (1 << 24)
185 #define ADVERTISED_40000baseSR4_Full (1 << 25)
186 #define ADVERTISED_40000baseLR4_Full (1 << 26)
189 /* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
191 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
192 * 2.6.32-431.29.2.el6.x86_64 (see report at
193 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html).
194 * Maybe if_link.h is not self-contained on those kernels. It is easiest to
195 * unconditionally define a replacement. */
197 #define IFLA_STATS64 23
199 #define rtnl_link_stats64 rpl_rtnl_link_stats64
200 struct rtnl_link_stats64
{
212 uint64_t rx_length_errors
;
213 uint64_t rx_over_errors
;
214 uint64_t rx_crc_errors
;
215 uint64_t rx_frame_errors
;
216 uint64_t rx_fifo_errors
;
217 uint64_t rx_missed_errors
;
219 uint64_t tx_aborted_errors
;
220 uint64_t tx_carrier_errors
;
221 uint64_t tx_fifo_errors
;
222 uint64_t tx_heartbeat_errors
;
223 uint64_t tx_window_errors
;
225 uint64_t rx_compressed
;
226 uint64_t tx_compressed
;
230 VALID_IFINDEX
= 1 << 0,
231 VALID_ETHERADDR
= 1 << 1,
234 VALID_POLICING
= 1 << 4,
235 VALID_VPORT_STAT_ERROR
= 1 << 5,
236 VALID_DRVINFO
= 1 << 6,
237 VALID_FEATURES
= 1 << 7,
240 struct linux_lag_slave
{
242 struct shash_node
*node
;
245 /* Protects 'lag_shash' and the mutable members of struct linux_lag_slave. */
246 static struct ovs_mutex lag_mutex
= OVS_MUTEX_INITIALIZER
;
248 /* All slaves whose LAG masters are network devices in OvS. */
249 static struct shash lag_shash
OVS_GUARDED_BY(lag_mutex
)
250 = SHASH_INITIALIZER(&lag_shash
);
252 /* Traffic control. */
254 /* An instance of a traffic control class. Always associated with a particular
257 * Each TC implementation subclasses this with whatever additional data it
260 const struct tc_ops
*ops
;
261 struct hmap queues
; /* Contains "struct tc_queue"s.
262 * Read by generic TC layer.
263 * Written only by TC implementation. */
266 #define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
268 /* One traffic control queue.
270 * Each TC implementation subclasses this with whatever additional data it
273 struct hmap_node hmap_node
; /* In struct tc's "queues" hmap. */
274 unsigned int queue_id
; /* OpenFlow queue ID. */
275 long long int created
; /* Time queue was created, in msecs. */
278 /* A particular kind of traffic control. Each implementation generally maps to
279 * one particular Linux qdisc class.
281 * The functions below return 0 if successful or a positive errno value on
282 * failure, except where otherwise noted. All of them must be provided, except
283 * where otherwise noted. */
285 /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
286 * This is null for tc_ops_default and tc_ops_other, for which there are no
287 * appropriate values. */
288 const char *linux_name
;
290 /* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
291 const char *ovs_name
;
293 /* Number of supported OpenFlow queues, 0 for qdiscs that have no
294 * queues. The queues are numbered 0 through n_queues - 1. */
295 unsigned int n_queues
;
297 /* Called to install this TC class on 'netdev'. The implementation should
298 * make the Netlink calls required to set up 'netdev' with the right qdisc
299 * and configure it according to 'details'. The implementation may assume
300 * that the current qdisc is the default; that is, there is no need for it
301 * to delete the current qdisc before installing itself.
303 * The contents of 'details' should be documented as valid for 'ovs_name'
304 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
305 * (which is built as ovs-vswitchd.conf.db(8)).
307 * This function must return 0 if and only if it sets 'netdev->tc' to an
308 * initialized 'struct tc'.
310 * (This function is null for tc_ops_other, which cannot be installed. For
311 * other TC classes it should always be nonnull.) */
312 int (*tc_install
)(struct netdev
*netdev
, const struct smap
*details
);
314 /* Called when the netdev code determines (through a Netlink query) that
315 * this TC class's qdisc is installed on 'netdev', but we didn't install
316 * it ourselves and so don't know any of the details.
318 * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
319 * 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
320 * implementation should parse the other attributes of 'nlmsg' as
321 * necessary to determine its configuration. If necessary it should also
322 * use Netlink queries to determine the configuration of queues on
325 * This function must return 0 if and only if it sets 'netdev->tc' to an
326 * initialized 'struct tc'. */
327 int (*tc_load
)(struct netdev
*netdev
, struct ofpbuf
*nlmsg
);
329 /* Destroys the data structures allocated by the implementation as part of
330 * 'tc'. (This includes destroying 'tc->queues' by calling
333 * The implementation should not need to perform any Netlink calls. If
334 * desirable, the caller is responsible for deconfiguring the kernel qdisc.
335 * (But it may not be desirable.)
337 * This function may be null if 'tc' is trivial. */
338 void (*tc_destroy
)(struct tc
*tc
);
340 /* Retrieves details of 'netdev->tc' configuration into 'details'.
342 * The implementation should not need to perform any Netlink calls, because
343 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
344 * cached the configuration.
346 * The contents of 'details' should be documented as valid for 'ovs_name'
347 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
348 * (which is built as ovs-vswitchd.conf.db(8)).
350 * This function may be null if 'tc' is not configurable.
352 int (*qdisc_get
)(const struct netdev
*netdev
, struct smap
*details
);
354 /* Reconfigures 'netdev->tc' according to 'details', performing any
355 * required Netlink calls to complete the reconfiguration.
357 * The contents of 'details' should be documented as valid for 'ovs_name'
358 * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
359 * (which is built as ovs-vswitchd.conf.db(8)).
361 * This function may be null if 'tc' is not configurable.
363 int (*qdisc_set
)(struct netdev
*, const struct smap
*details
);
365 /* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
366 * one of the 'struct tc_queue's within 'netdev->tc->queues'.
368 * The contents of 'details' should be documented as valid for 'ovs_name'
369 * in the "other_config" column in the "Queue" table in
370 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
372 * The implementation should not need to perform any Netlink calls, because
373 * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
374 * cached the queue configuration.
376 * This function may be null if 'tc' does not have queues ('n_queues' is
378 int (*class_get
)(const struct netdev
*netdev
, const struct tc_queue
*queue
,
379 struct smap
*details
);
381 /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
382 * 'details', perfoming any required Netlink calls to complete the
383 * reconfiguration. The caller ensures that 'queue_id' is less than
386 * The contents of 'details' should be documented as valid for 'ovs_name'
387 * in the "other_config" column in the "Queue" table in
388 * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
390 * This function may be null if 'tc' does not have queues or its queues are
391 * not configurable. */
392 int (*class_set
)(struct netdev
*, unsigned int queue_id
,
393 const struct smap
*details
);
395 /* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
396 * tc_queue's within 'netdev->tc->queues'.
398 * This function may be null if 'tc' does not have queues or its queues
399 * cannot be deleted. */
400 int (*class_delete
)(struct netdev
*, struct tc_queue
*queue
);
402 /* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
403 * 'struct tc_queue's within 'netdev->tc->queues'.
405 * On success, initializes '*stats'.
407 * This function may be null if 'tc' does not have queues or if it cannot
408 * report queue statistics. */
409 int (*class_get_stats
)(const struct netdev
*netdev
,
410 const struct tc_queue
*queue
,
411 struct netdev_queue_stats
*stats
);
413 /* Extracts queue stats from 'nlmsg', which is a response to a
414 * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
416 * This function may be null if 'tc' does not have queues or if it cannot
417 * report queue statistics. */
418 int (*class_dump_stats
)(const struct netdev
*netdev
,
419 const struct ofpbuf
*nlmsg
,
420 netdev_dump_queue_stats_cb
*cb
, void *aux
);
424 tc_init(struct tc
*tc
, const struct tc_ops
*ops
)
427 hmap_init(&tc
->queues
);
431 tc_destroy(struct tc
*tc
)
433 hmap_destroy(&tc
->queues
);
436 static const struct tc_ops tc_ops_htb
;
437 static const struct tc_ops tc_ops_hfsc
;
438 static const struct tc_ops tc_ops_codel
;
439 static const struct tc_ops tc_ops_fqcodel
;
440 static const struct tc_ops tc_ops_sfq
;
441 static const struct tc_ops tc_ops_netem
;
442 static const struct tc_ops tc_ops_default
;
443 static const struct tc_ops tc_ops_noop
;
444 static const struct tc_ops tc_ops_other
;
446 static const struct tc_ops
*const tcs
[] = {
447 &tc_ops_htb
, /* Hierarchy token bucket (see tc-htb(8)). */
448 &tc_ops_hfsc
, /* Hierarchical fair service curve. */
449 &tc_ops_codel
, /* Controlled delay */
450 &tc_ops_fqcodel
, /* Fair queue controlled delay */
451 &tc_ops_sfq
, /* Stochastic fair queueing */
452 &tc_ops_netem
, /* Network Emulator */
453 &tc_ops_noop
, /* Non operating qos type. */
454 &tc_ops_default
, /* Default qdisc (see tc-pfifo_fast(8)). */
455 &tc_ops_other
, /* Some other qdisc. */
459 static unsigned int tc_ticks_to_bytes(unsigned int rate
, unsigned int ticks
);
460 static unsigned int tc_bytes_to_ticks(unsigned int rate
, unsigned int size
);
461 static unsigned int tc_buffer_per_jiffy(unsigned int rate
);
462 static uint32_t tc_time_to_ticks(uint32_t time
);
464 static struct tcmsg
*netdev_linux_tc_make_request(const struct netdev
*,
468 static int tc_add_policer(struct netdev
*,
469 uint32_t kbits_rate
, uint32_t kbits_burst
);
471 static int tc_parse_qdisc(const struct ofpbuf
*, const char **kind
,
472 struct nlattr
**options
);
473 static int tc_parse_class(const struct ofpbuf
*, unsigned int *queue_id
,
474 struct nlattr
**options
,
475 struct netdev_queue_stats
*);
476 static int tc_query_class(const struct netdev
*,
477 unsigned int handle
, unsigned int parent
,
478 struct ofpbuf
**replyp
);
479 static int tc_delete_class(const struct netdev
*, unsigned int handle
);
481 static int tc_del_qdisc(struct netdev
*netdev
);
482 static int tc_query_qdisc(const struct netdev
*netdev
);
485 tc_put_rtab(struct ofpbuf
*msg
, uint16_t type
, const struct tc_ratespec
*rate
);
486 static int tc_calc_cell_log(unsigned int mtu
);
487 static void tc_fill_rate(struct tc_ratespec
*rate
, uint64_t bps
, int mtu
);
488 static int tc_calc_buffer(unsigned int Bps
, int mtu
, uint64_t burst_bytes
);
490 struct netdev_linux
{
493 /* Protects all members below. */
494 struct ovs_mutex mutex
;
496 unsigned int cache_valid
;
498 bool miimon
; /* Link status of last poll. */
499 long long int miimon_interval
; /* Miimon Poll rate. Disabled if <= 0. */
500 struct timer miimon_timer
;
502 int netnsid
; /* Network namespace ID. */
503 /* The following are figured out "on demand" only. They are only valid
504 * when the corresponding VALID_* bit in 'cache_valid' is set. */
506 struct eth_addr etheraddr
;
508 unsigned int ifi_flags
;
509 long long int carrier_resets
;
510 uint32_t kbits_rate
; /* Policing data. */
511 uint32_t kbits_burst
;
512 int vport_stats_error
; /* Cached error code from vport_get_stats().
513 0 or an errno value. */
514 int netdev_mtu_error
; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
515 int ether_addr_error
; /* Cached error code from set/get etheraddr. */
516 int netdev_policing_error
; /* Cached error code from set policing. */
517 int get_features_error
; /* Cached error code from ETHTOOL_GSET. */
518 int get_ifindex_error
; /* Cached error code from SIOCGIFINDEX. */
520 enum netdev_features current
; /* Cached from ETHTOOL_GSET. */
521 enum netdev_features advertised
; /* Cached from ETHTOOL_GSET. */
522 enum netdev_features supported
; /* Cached from ETHTOOL_GSET. */
524 struct ethtool_drvinfo drvinfo
; /* Cached from ETHTOOL_GDRVINFO. */
527 /* For devices of class netdev_tap_class only. */
529 bool present
; /* If the device is present in the namespace */
530 uint64_t tx_dropped
; /* tap device can drop if the iface is down */
532 /* LAG information. */
533 bool is_lag_master
; /* True if the netdev is a LAG master. */
536 struct netdev_rxq_linux
{
537 struct netdev_rxq up
;
542 /* This is set pretty low because we probably won't learn anything from the
543 * additional log messages. */
544 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(5, 20);
546 /* Polling miimon status for all ports causes performance degradation when
547 * handling a large number of ports. If there are no devices using miimon, then
548 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
550 * Readers do not depend on this variable synchronizing with the related
551 * changes in the device miimon status, so we can use atomic_count. */
552 static atomic_count miimon_cnt
= ATOMIC_COUNT_INIT(0);
554 static void netdev_linux_run(const struct netdev_class
*);
556 static int netdev_linux_do_ethtool(const char *name
, struct ethtool_cmd
*,
557 int cmd
, const char *cmd_name
);
558 static int get_flags(const struct netdev
*, unsigned int *flags
);
559 static int set_flags(const char *, unsigned int flags
);
560 static int update_flags(struct netdev_linux
*netdev
, enum netdev_flags off
,
561 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
562 OVS_REQUIRES(netdev
->mutex
);
563 static int get_ifindex(const struct netdev
*, int *ifindexp
);
564 static int do_set_addr(struct netdev
*netdev
,
565 int ioctl_nr
, const char *ioctl_name
,
566 struct in_addr addr
);
567 static int get_etheraddr(const char *netdev_name
, struct eth_addr
*ea
);
568 static int set_etheraddr(const char *netdev_name
, const struct eth_addr
);
569 static int get_stats_via_netlink(const struct netdev
*, struct netdev_stats
*);
570 static int af_packet_sock(void);
571 static bool netdev_linux_miimon_enabled(void);
572 static void netdev_linux_miimon_run(void);
573 static void netdev_linux_miimon_wait(void);
574 static int netdev_linux_get_mtu__(struct netdev_linux
*netdev
, int *mtup
);
577 is_netdev_linux_class(const struct netdev_class
*netdev_class
)
579 return netdev_class
->run
== netdev_linux_run
;
583 is_tap_netdev(const struct netdev
*netdev
)
585 return netdev_get_class(netdev
) == &netdev_tap_class
;
588 static struct netdev_linux
*
589 netdev_linux_cast(const struct netdev
*netdev
)
591 ovs_assert(is_netdev_linux_class(netdev_get_class(netdev
)));
593 return CONTAINER_OF(netdev
, struct netdev_linux
, up
);
596 static struct netdev_rxq_linux
*
597 netdev_rxq_linux_cast(const struct netdev_rxq
*rx
)
599 ovs_assert(is_netdev_linux_class(netdev_get_class(rx
->netdev
)));
600 return CONTAINER_OF(rx
, struct netdev_rxq_linux
, up
);
604 netdev_linux_netnsid_update__(struct netdev_linux
*netdev
)
606 struct dpif_netlink_vport reply
;
610 error
= dpif_netlink_vport_get(netdev_get_name(&netdev
->up
), &reply
, &buf
);
612 if (error
== ENOENT
) {
613 /* Assume it is local if there is no API (e.g. if the openvswitch
614 * kernel module is not loaded). */
615 netnsid_set_local(&netdev
->netnsid
);
617 netnsid_unset(&netdev
->netnsid
);
622 netnsid_set(&netdev
->netnsid
, reply
.netnsid
);
628 netdev_linux_netnsid_update(struct netdev_linux
*netdev
)
630 if (netnsid_is_unset(netdev
->netnsid
)) {
631 if (netdev_get_class(&netdev
->up
) == &netdev_tap_class
) {
632 netnsid_set_local(&netdev
->netnsid
);
634 return netdev_linux_netnsid_update__(netdev
);
642 netdev_linux_netnsid_is_eq(struct netdev_linux
*netdev
, int nsid
)
644 netdev_linux_netnsid_update(netdev
);
645 return netnsid_eq(netdev
->netnsid
, nsid
);
649 netdev_linux_netnsid_is_remote(struct netdev_linux
*netdev
)
651 netdev_linux_netnsid_update(netdev
);
652 return netnsid_is_remote(netdev
->netnsid
);
655 static int netdev_linux_update_via_netlink(struct netdev_linux
*);
656 static void netdev_linux_update(struct netdev_linux
*netdev
, int,
657 const struct rtnetlink_change
*)
658 OVS_REQUIRES(netdev
->mutex
);
659 static void netdev_linux_changed(struct netdev_linux
*netdev
,
660 unsigned int ifi_flags
, unsigned int mask
)
661 OVS_REQUIRES(netdev
->mutex
);
663 /* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
664 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
665 * if no such socket could be created. */
666 static struct nl_sock
*
667 netdev_linux_notify_sock(void)
669 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
670 static struct nl_sock
*sock
;
671 unsigned int mcgroups
[] = {RTNLGRP_LINK
, RTNLGRP_IPV4_IFADDR
,
672 RTNLGRP_IPV6_IFADDR
, RTNLGRP_IPV6_IFINFO
};
674 if (ovsthread_once_start(&once
)) {
677 error
= nl_sock_create(NETLINK_ROUTE
, &sock
);
681 for (i
= 0; i
< ARRAY_SIZE(mcgroups
); i
++) {
682 error
= nl_sock_join_mcgroup(sock
, mcgroups
[i
]);
684 nl_sock_destroy(sock
);
690 nl_sock_listen_all_nsid(sock
, true);
691 ovsthread_once_done(&once
);
698 netdev_linux_miimon_enabled(void)
700 return atomic_count_get(&miimon_cnt
) > 0;
/* Returns true if a Linux device whose rtnetlink IFLA_INFO_KIND / slave kind
 * is 'kind' is a link-aggregation device ("bond" or "team"), false
 * otherwise. */
static bool
netdev_linux_kind_is_lag(const char *kind)
{
    if (!strcmp(kind, "bond") || !strcmp(kind, "team")) {
        return true;
    }

    return false;
}
714 netdev_linux_update_lag(struct rtnetlink_change
*change
)
715 OVS_REQUIRES(lag_mutex
)
717 struct linux_lag_slave
*lag
;
719 if (!rtnetlink_type_is_rtnlgrp_link(change
->nlmsg_type
)) {
723 if (change
->slave
&& netdev_linux_kind_is_lag(change
->slave
)) {
724 lag
= shash_find_data(&lag_shash
, change
->ifname
);
727 struct netdev
*master_netdev
;
728 char master_name
[IFNAMSIZ
];
732 if_indextoname(change
->master_ifindex
, master_name
);
733 master_netdev
= netdev_from_name(master_name
);
734 if (!master_netdev
) {
738 if (is_netdev_linux_class(master_netdev
->netdev_class
)) {
739 block_id
= netdev_get_block_id(master_netdev
);
741 netdev_close(master_netdev
);
745 lag
= xmalloc(sizeof *lag
);
746 lag
->block_id
= block_id
;
747 lag
->node
= shash_add(&lag_shash
, change
->ifname
, lag
);
749 /* delete ingress block in case it exists */
750 tc_add_del_qdisc(change
->if_index
, false, 0, TC_INGRESS
);
751 /* LAG master is linux netdev so add slave to same block. */
752 error
= tc_add_del_qdisc(change
->if_index
, true, block_id
,
755 VLOG_WARN("failed to bind LAG slave %s to master's block",
757 shash_delete(&lag_shash
, lag
->node
);
762 netdev_close(master_netdev
);
764 } else if (change
->master_ifindex
== 0) {
765 /* Check if this was a lag slave that has been freed. */
766 lag
= shash_find_data(&lag_shash
, change
->ifname
);
769 tc_add_del_qdisc(change
->if_index
, false, lag
->block_id
,
771 shash_delete(&lag_shash
, lag
->node
);
778 netdev_linux_run(const struct netdev_class
*netdev_class OVS_UNUSED
)
780 struct nl_sock
*sock
;
783 if (netdev_linux_miimon_enabled()) {
784 netdev_linux_miimon_run();
787 sock
= netdev_linux_notify_sock();
793 uint64_t buf_stub
[4096 / 8];
797 ofpbuf_use_stub(&buf
, buf_stub
, sizeof buf_stub
);
798 error
= nl_sock_recv(sock
, &buf
, &nsid
, false);
800 struct rtnetlink_change change
;
802 if (rtnetlink_parse(&buf
, &change
)) {
803 struct netdev
*netdev_
= NULL
;
804 char dev_name
[IFNAMSIZ
];
806 if (!change
.ifname
) {
807 change
.ifname
= if_indextoname(change
.if_index
, dev_name
);
811 netdev_
= netdev_from_name(change
.ifname
);
813 if (netdev_
&& is_netdev_linux_class(netdev_
->netdev_class
)) {
814 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
816 ovs_mutex_lock(&netdev
->mutex
);
817 netdev_linux_update(netdev
, nsid
, &change
);
818 ovs_mutex_unlock(&netdev
->mutex
);
820 else if (!netdev_
&& change
.ifname
) {
821 /* Netdev is not present in OvS but its master could be. */
822 ovs_mutex_lock(&lag_mutex
);
823 netdev_linux_update_lag(&change
);
824 ovs_mutex_unlock(&lag_mutex
);
826 netdev_close(netdev_
);
828 } else if (error
== ENOBUFS
) {
829 struct shash device_shash
;
830 struct shash_node
*node
;
834 shash_init(&device_shash
);
835 netdev_get_devices(&netdev_linux_class
, &device_shash
);
836 SHASH_FOR_EACH (node
, &device_shash
) {
837 struct netdev
*netdev_
= node
->data
;
838 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
841 ovs_mutex_lock(&netdev
->mutex
);
842 get_flags(netdev_
, &flags
);
843 netdev_linux_changed(netdev
, flags
, 0);
844 ovs_mutex_unlock(&netdev
->mutex
);
846 netdev_close(netdev_
);
848 shash_destroy(&device_shash
);
849 } else if (error
!= EAGAIN
) {
850 static struct vlog_rate_limit rll
= VLOG_RATE_LIMIT_INIT(1, 5);
851 VLOG_WARN_RL(&rll
, "error reading or parsing netlink (%s)",
852 ovs_strerror(error
));
859 netdev_linux_wait(const struct netdev_class
*netdev_class OVS_UNUSED
)
861 struct nl_sock
*sock
;
863 if (netdev_linux_miimon_enabled()) {
864 netdev_linux_miimon_wait();
866 sock
= netdev_linux_notify_sock();
868 nl_sock_wait(sock
, POLLIN
);
873 netdev_linux_changed(struct netdev_linux
*dev
,
874 unsigned int ifi_flags
, unsigned int mask
)
875 OVS_REQUIRES(dev
->mutex
)
877 netdev_change_seq_changed(&dev
->up
);
879 if ((dev
->ifi_flags
^ ifi_flags
) & IFF_RUNNING
) {
880 dev
->carrier_resets
++;
882 dev
->ifi_flags
= ifi_flags
;
884 dev
->cache_valid
&= mask
;
885 if (!(mask
& VALID_IN
)) {
886 netdev_get_addrs_list_flush();
891 netdev_linux_update__(struct netdev_linux
*dev
,
892 const struct rtnetlink_change
*change
)
893 OVS_REQUIRES(dev
->mutex
)
895 if (rtnetlink_type_is_rtnlgrp_link(change
->nlmsg_type
)) {
896 if (change
->nlmsg_type
== RTM_NEWLINK
) {
897 /* Keep drv-info, and ip addresses. */
898 netdev_linux_changed(dev
, change
->ifi_flags
,
899 VALID_DRVINFO
| VALID_IN
);
901 /* Update netdev from rtnl-change msg. */
903 dev
->mtu
= change
->mtu
;
904 dev
->cache_valid
|= VALID_MTU
;
905 dev
->netdev_mtu_error
= 0;
908 if (!eth_addr_is_zero(change
->mac
)) {
909 dev
->etheraddr
= change
->mac
;
910 dev
->cache_valid
|= VALID_ETHERADDR
;
911 dev
->ether_addr_error
= 0;
913 /* The mac addr has been changed, report it now. */
914 rtnetlink_report_link();
917 if (change
->master
&& netdev_linux_kind_is_lag(change
->master
)) {
918 dev
->is_lag_master
= true;
921 dev
->ifindex
= change
->if_index
;
922 dev
->cache_valid
|= VALID_IFINDEX
;
923 dev
->get_ifindex_error
= 0;
927 netdev_linux_changed(dev
, change
->ifi_flags
, 0);
928 dev
->present
= false;
929 netnsid_unset(&dev
->netnsid
);
931 } else if (rtnetlink_type_is_rtnlgrp_addr(change
->nlmsg_type
)) {
932 /* Invalidates in4, in6. */
933 netdev_linux_changed(dev
, dev
->ifi_flags
, ~VALID_IN
);
940 netdev_linux_update(struct netdev_linux
*dev
, int nsid
,
941 const struct rtnetlink_change
*change
)
942 OVS_REQUIRES(dev
->mutex
)
944 if (netdev_linux_netnsid_is_eq(dev
, nsid
)) {
945 netdev_linux_update__(dev
, change
);
949 static struct netdev
*
950 netdev_linux_alloc(void)
952 struct netdev_linux
*netdev
= xzalloc(sizeof *netdev
);
957 netdev_linux_common_construct(struct netdev
*netdev_
)
959 /* Prevent any attempt to create (or open) a network device named "default"
960 * or "all". These device names are effectively reserved on Linux because
961 * /proc/sys/net/ipv4/conf/ always contains directories by these names. By
962 * itself this wouldn't call for any special treatment, but in practice if
963 * a program tries to create devices with these names, it causes the kernel
964 * to fire a "new device" notification event even though creation failed,
965 * and in turn that causes OVS to wake up and try to create them again,
966 * which ends up as a 100% CPU loop. */
967 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
968 const char *name
= netdev_
->name
;
969 if (!strcmp(name
, "default") || !strcmp(name
, "all")) {
970 static struct vlog_rate_limit rll
= VLOG_RATE_LIMIT_INIT(1, 1);
971 VLOG_WARN_RL(&rll
, "%s: Linux forbids network device with this name",
976 /* The device could be in the same network namespace or in another one. */
977 netnsid_unset(&netdev
->netnsid
);
978 ovs_mutex_init(&netdev
->mutex
);
982 /* Creates system and internal devices. */
984 netdev_linux_construct(struct netdev
*netdev_
)
986 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
987 int error
= netdev_linux_common_construct(netdev_
);
992 error
= get_flags(&netdev
->up
, &netdev
->ifi_flags
);
993 if (error
== ENODEV
) {
994 if (netdev
->up
.netdev_class
!= &netdev_internal_class
) {
995 /* The device does not exist, so don't allow it to be opened. */
998 /* "Internal" netdevs have to be created as netdev objects before
999 * they exist in the kernel, because creating them in the kernel
1000 * happens by passing a netdev object to dpif_port_add().
1001 * Therefore, ignore the error. */
1008 /* For most types of netdevs we open the device for each call of
1009 * netdev_open(). However, this is not the case with tap devices,
1010 * since it is only possible to open the device once. In this
1011 * situation we share a single file descriptor, and consequently
1012 * buffers, across all readers. Therefore once data is read it will
1013 * be unavailable to other reads for tap devices. */
1015 netdev_linux_construct_tap(struct netdev
*netdev_
)
1017 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1018 static const char tap_dev
[] = "/dev/net/tun";
1019 const char *name
= netdev_
->name
;
1022 int error
= netdev_linux_common_construct(netdev_
);
1027 /* Open tap device. */
1028 netdev
->tap_fd
= open(tap_dev
, O_RDWR
);
1029 if (netdev
->tap_fd
< 0) {
1031 VLOG_WARN("opening \"%s\" failed: %s", tap_dev
, ovs_strerror(error
));
1035 /* Create tap device. */
1036 get_flags(&netdev
->up
, &netdev
->ifi_flags
);
1037 ifr
.ifr_flags
= IFF_TAP
| IFF_NO_PI
;
1038 ovs_strzcpy(ifr
.ifr_name
, name
, sizeof ifr
.ifr_name
);
1039 if (ioctl(netdev
->tap_fd
, TUNSETIFF
, &ifr
) == -1) {
1040 VLOG_WARN("%s: creating tap device failed: %s", name
,
1041 ovs_strerror(errno
));
1046 /* Make non-blocking. */
1047 error
= set_nonblocking(netdev
->tap_fd
);
1052 if (ioctl(netdev
->tap_fd
, TUNSETPERSIST
, 1)) {
1053 VLOG_WARN("%s: creating tap device failed (persist): %s", name
,
1054 ovs_strerror(errno
));
1059 netdev
->present
= true;
1063 close(netdev
->tap_fd
);
1068 netdev_linux_destruct(struct netdev
*netdev_
)
1070 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1072 if (netdev
->tc
&& netdev
->tc
->ops
->tc_destroy
) {
1073 netdev
->tc
->ops
->tc_destroy(netdev
->tc
);
1076 if (netdev_get_class(netdev_
) == &netdev_tap_class
1077 && netdev
->tap_fd
>= 0)
1079 ioctl(netdev
->tap_fd
, TUNSETPERSIST
, 0);
1080 close(netdev
->tap_fd
);
1083 if (netdev
->miimon_interval
> 0) {
1084 atomic_count_dec(&miimon_cnt
);
1087 ovs_mutex_destroy(&netdev
->mutex
);
1091 netdev_linux_dealloc(struct netdev
*netdev_
)
1093 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1097 static struct netdev_rxq
*
1098 netdev_linux_rxq_alloc(void)
1100 struct netdev_rxq_linux
*rx
= xzalloc(sizeof *rx
);
1105 netdev_linux_rxq_construct(struct netdev_rxq
*rxq_
)
1107 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1108 struct netdev
*netdev_
= rx
->up
.netdev
;
1109 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1112 ovs_mutex_lock(&netdev
->mutex
);
1113 rx
->is_tap
= is_tap_netdev(netdev_
);
1115 rx
->fd
= netdev
->tap_fd
;
1117 struct sockaddr_ll sll
;
1119 /* Result of tcpdump -dd inbound */
1120 static const struct sock_filter filt
[] = {
1121 { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
1122 { 0x15, 0, 1, 0x00000004 }, /* jeq #4 jt 2 jf 3 */
1123 { 0x6, 0, 0, 0x00000000 }, /* ret #0 */
1124 { 0x6, 0, 0, 0x0000ffff } /* ret #65535 */
1126 static const struct sock_fprog fprog
= {
1127 ARRAY_SIZE(filt
), (struct sock_filter
*) filt
1130 /* Create file descriptor. */
1131 rx
->fd
= socket(PF_PACKET
, SOCK_RAW
, 0);
1134 VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error
));
1139 if (setsockopt(rx
->fd
, SOL_PACKET
, PACKET_AUXDATA
, &val
, sizeof val
)) {
1141 VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
1142 netdev_get_name(netdev_
), ovs_strerror(error
));
1146 /* Set non-blocking mode. */
1147 error
= set_nonblocking(rx
->fd
);
1152 /* Get ethernet device index. */
1153 error
= get_ifindex(&netdev
->up
, &ifindex
);
1158 /* Bind to specific ethernet device. */
1159 memset(&sll
, 0, sizeof sll
);
1160 sll
.sll_family
= AF_PACKET
;
1161 sll
.sll_ifindex
= ifindex
;
1162 sll
.sll_protocol
= htons(ETH_P_ALL
);
1163 if (bind(rx
->fd
, (struct sockaddr
*) &sll
, sizeof sll
) < 0) {
1165 VLOG_ERR("%s: failed to bind raw socket (%s)",
1166 netdev_get_name(netdev_
), ovs_strerror(error
));
1170 /* Filter for only inbound packets. */
1171 error
= setsockopt(rx
->fd
, SOL_SOCKET
, SO_ATTACH_FILTER
, &fprog
,
1175 VLOG_ERR("%s: failed to attach filter (%s)",
1176 netdev_get_name(netdev_
), ovs_strerror(error
));
1180 ovs_mutex_unlock(&netdev
->mutex
);
1188 ovs_mutex_unlock(&netdev
->mutex
);
1193 netdev_linux_rxq_destruct(struct netdev_rxq
*rxq_
)
1195 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1203 netdev_linux_rxq_dealloc(struct netdev_rxq
*rxq_
)
1205 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1211 auxdata_to_vlan_tpid(const struct tpacket_auxdata
*aux
, bool double_tagged
)
1213 if (aux
->tp_status
& TP_STATUS_VLAN_TPID_VALID
) {
1214 return htons(aux
->tp_vlan_tpid
);
1215 } else if (double_tagged
) {
1216 return htons(ETH_TYPE_VLAN_8021AD
);
1218 return htons(ETH_TYPE_VLAN_8021Q
);
/* Returns true if PACKET_AUXDATA 'aux' carries VLAN tag information.
 *
 * A nonzero TCI implies a tag even on kernels too old to set
 * TP_STATUS_VLAN_VALID; newer kernels set the flag explicitly (which also
 * covers the legitimate all-zero TCI case). */
static bool
auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
{
    if (aux->tp_status & TP_STATUS_VLAN_VALID) {
        return true;
    }
    return aux->tp_vlan_tci != 0;
}
1229 netdev_linux_rxq_recv_sock(int fd
, struct dp_packet
*buffer
)
1234 struct cmsghdr
*cmsg
;
1236 struct cmsghdr cmsg
;
1237 char buffer
[CMSG_SPACE(sizeof(struct tpacket_auxdata
))];
1241 /* Reserve headroom for a single VLAN tag */
1242 dp_packet_reserve(buffer
, VLAN_HEADER_LEN
);
1243 size
= dp_packet_tailroom(buffer
);
1245 iov
.iov_base
= dp_packet_data(buffer
);
1247 msgh
.msg_name
= NULL
;
1248 msgh
.msg_namelen
= 0;
1249 msgh
.msg_iov
= &iov
;
1250 msgh
.msg_iovlen
= 1;
1251 msgh
.msg_control
= &cmsg_buffer
;
1252 msgh
.msg_controllen
= sizeof cmsg_buffer
;
1256 retval
= recvmsg(fd
, &msgh
, MSG_TRUNC
);
1257 } while (retval
< 0 && errno
== EINTR
);
1261 } else if (retval
> size
) {
1265 dp_packet_set_size(buffer
, dp_packet_size(buffer
) + retval
);
1267 for (cmsg
= CMSG_FIRSTHDR(&msgh
); cmsg
; cmsg
= CMSG_NXTHDR(&msgh
, cmsg
)) {
1268 const struct tpacket_auxdata
*aux
;
1270 if (cmsg
->cmsg_level
!= SOL_PACKET
1271 || cmsg
->cmsg_type
!= PACKET_AUXDATA
1272 || cmsg
->cmsg_len
< CMSG_LEN(sizeof(struct tpacket_auxdata
))) {
1276 aux
= ALIGNED_CAST(struct tpacket_auxdata
*, CMSG_DATA(cmsg
));
1277 if (auxdata_has_vlan_tci(aux
)) {
1278 struct eth_header
*eth
;
1281 if (retval
< ETH_HEADER_LEN
) {
1285 eth
= dp_packet_data(buffer
);
1286 double_tagged
= eth
->eth_type
== htons(ETH_TYPE_VLAN_8021Q
);
1288 eth_push_vlan(buffer
, auxdata_to_vlan_tpid(aux
, double_tagged
),
1289 htons(aux
->tp_vlan_tci
));
1298 netdev_linux_rxq_recv_tap(int fd
, struct dp_packet
*buffer
)
1301 size_t size
= dp_packet_tailroom(buffer
);
1304 retval
= read(fd
, dp_packet_data(buffer
), size
);
1305 } while (retval
< 0 && errno
== EINTR
);
1311 dp_packet_set_size(buffer
, dp_packet_size(buffer
) + retval
);
1316 netdev_linux_rxq_recv(struct netdev_rxq
*rxq_
, struct dp_packet_batch
*batch
,
1319 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1320 struct netdev
*netdev
= rx
->up
.netdev
;
1321 struct dp_packet
*buffer
;
1325 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
)) {
1326 mtu
= ETH_PAYLOAD_MAX
;
1329 /* Assume Ethernet port. No need to set packet_type. */
1330 buffer
= dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN
+ mtu
,
1331 DP_NETDEV_HEADROOM
);
1332 retval
= (rx
->is_tap
1333 ? netdev_linux_rxq_recv_tap(rx
->fd
, buffer
)
1334 : netdev_linux_rxq_recv_sock(rx
->fd
, buffer
));
1337 if (retval
!= EAGAIN
&& retval
!= EMSGSIZE
) {
1338 VLOG_WARN_RL(&rl
, "error receiving Ethernet packet on %s: %s",
1339 netdev_rxq_get_name(rxq_
), ovs_strerror(errno
));
1341 dp_packet_delete(buffer
);
1343 dp_packet_batch_init_packet(batch
, buffer
);
1354 netdev_linux_rxq_wait(struct netdev_rxq
*rxq_
)
1356 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1357 poll_fd_wait(rx
->fd
, POLLIN
);
1361 netdev_linux_rxq_drain(struct netdev_rxq
*rxq_
)
1363 struct netdev_rxq_linux
*rx
= netdev_rxq_linux_cast(rxq_
);
1366 int error
= af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_
), &ifr
,
1367 SIOCGIFTXQLEN
, "SIOCGIFTXQLEN");
1371 drain_fd(rx
->fd
, ifr
.ifr_qlen
);
1374 return drain_rcvbuf(rx
->fd
);
1379 netdev_linux_sock_batch_send(int sock
, int ifindex
,
1380 struct dp_packet_batch
*batch
)
1382 const size_t size
= dp_packet_batch_size(batch
);
1383 /* We don't bother setting most fields in sockaddr_ll because the
1384 * kernel ignores them for SOCK_RAW. */
1385 struct sockaddr_ll sll
= { .sll_family
= AF_PACKET
,
1386 .sll_ifindex
= ifindex
};
1388 struct mmsghdr
*mmsg
= xmalloc(sizeof(*mmsg
) * size
);
1389 struct iovec
*iov
= xmalloc(sizeof(*iov
) * size
);
1391 struct dp_packet
*packet
;
1392 DP_PACKET_BATCH_FOR_EACH (i
, packet
, batch
) {
1393 iov
[i
].iov_base
= dp_packet_data(packet
);
1394 iov
[i
].iov_len
= dp_packet_size(packet
);
1395 mmsg
[i
].msg_hdr
= (struct msghdr
) { .msg_name
= &sll
,
1396 .msg_namelen
= sizeof sll
,
1402 for (uint32_t ofs
= 0; ofs
< size
; ) {
1405 retval
= sendmmsg(sock
, mmsg
+ ofs
, size
- ofs
, 0);
1406 error
= retval
< 0 ? errno
: 0;
1407 } while (error
== EINTR
);
1419 /* Use the tap fd to send 'batch' to tap device 'netdev'. Using the tap fd is
1420 * essential, because packets sent to a tap device with an AF_PACKET socket
1421 * will loop back to be *received* again on the tap device. This doesn't occur
1422 * on other interface types because we attach a socket filter to the rx
1425 netdev_linux_tap_batch_send(struct netdev
*netdev_
,
1426 struct dp_packet_batch
*batch
)
1428 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1429 struct dp_packet
*packet
;
1431 /* The Linux tap driver returns EIO if the device is not up,
1432 * so if the device is not up, don't waste time sending it.
1433 * However, if the device is in another network namespace
1434 * then OVS can't retrieve the state. In that case, send the
1435 * packets anyway. */
1436 if (netdev
->present
&& !(netdev
->ifi_flags
& IFF_UP
)) {
1437 netdev
->tx_dropped
+= dp_packet_batch_size(batch
);
1441 DP_PACKET_BATCH_FOR_EACH (i
, packet
, batch
) {
1442 size_t size
= dp_packet_size(packet
);
1447 retval
= write(netdev
->tap_fd
, dp_packet_data(packet
), size
);
1448 error
= retval
< 0 ? errno
: 0;
1449 } while (error
== EINTR
);
1452 /* The Linux tap driver returns EIO if the device is not up. From
1453 * the OVS side this is not an error, so we ignore it; otherwise,
1454 * return the error. */
1458 } else if (retval
!= size
) {
1459 VLOG_WARN_RL(&rl
, "sent partial Ethernet packet (%"PRIuSIZE
" "
1460 "bytes of %"PRIuSIZE
") on %s",
1461 retval
, size
, netdev_get_name(netdev_
));
1468 /* Sends 'batch' on 'netdev'. Returns 0 if successful, otherwise a positive
1469 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1470 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1471 * the packet is too big or too small to transmit on the device.
1473 * The kernel maintains a packet transmission queue, so the caller is not
1474 * expected to do additional queuing of packets. */
1476 netdev_linux_send(struct netdev
*netdev_
, int qid OVS_UNUSED
,
1477 struct dp_packet_batch
*batch
,
1478 bool concurrent_txq OVS_UNUSED
)
1483 if (!is_tap_netdev(netdev_
)) {
1484 if (netdev_linux_netnsid_is_remote(netdev_linux_cast(netdev_
))) {
1489 sock
= af_packet_sock();
1495 int ifindex
= netdev_get_ifindex(netdev_
);
1501 error
= netdev_linux_sock_batch_send(sock
, ifindex
, batch
);
1503 error
= netdev_linux_tap_batch_send(netdev_
, batch
);
1506 if (error
== ENOBUFS
) {
1507 /* The Linux AF_PACKET implementation never blocks waiting
1508 * for room for packets, instead returning ENOBUFS.
1509 * Translate this into EAGAIN for the caller. */
1512 VLOG_WARN_RL(&rl
, "error sending Ethernet packet on %s: %s",
1513 netdev_get_name(netdev_
), ovs_strerror(error
));
1518 dp_packet_delete_batch(batch
, true);
1522 /* Registers with the poll loop to wake up from the next call to poll_block()
1523 * when the packet transmission queue has sufficient room to transmit a packet
1524 * with netdev_send().
1526 * The kernel maintains a packet transmission queue, so the client is not
1527 * expected to do additional queuing of packets. Thus, this function is
1528 * unlikely to ever be used. It is included for completeness. */
1530 netdev_linux_send_wait(struct netdev
*netdev
, int qid OVS_UNUSED
)
1532 if (is_tap_netdev(netdev
)) {
1533 /* TAP device always accepts packets.*/
1534 poll_immediate_wake();
1538 /* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1539 * otherwise a positive errno value. */
1541 netdev_linux_set_etheraddr(struct netdev
*netdev_
, const struct eth_addr mac
)
1543 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1544 enum netdev_flags old_flags
= 0;
1547 ovs_mutex_lock(&netdev
->mutex
);
1548 if (netdev_linux_netnsid_is_remote(netdev
)) {
1553 if (netdev
->cache_valid
& VALID_ETHERADDR
) {
1554 error
= netdev
->ether_addr_error
;
1555 if (error
|| eth_addr_equals(netdev
->etheraddr
, mac
)) {
1558 netdev
->cache_valid
&= ~VALID_ETHERADDR
;
1561 /* Tap devices must be brought down before setting the address. */
1562 if (is_tap_netdev(netdev_
)) {
1563 update_flags(netdev
, NETDEV_UP
, 0, &old_flags
);
1565 error
= set_etheraddr(netdev_get_name(netdev_
), mac
);
1566 if (!error
|| error
== ENODEV
) {
1567 netdev
->ether_addr_error
= error
;
1568 netdev
->cache_valid
|= VALID_ETHERADDR
;
1570 netdev
->etheraddr
= mac
;
1574 if (is_tap_netdev(netdev_
) && old_flags
& NETDEV_UP
) {
1575 update_flags(netdev
, 0, NETDEV_UP
, &old_flags
);
1579 ovs_mutex_unlock(&netdev
->mutex
);
1583 /* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1585 netdev_linux_get_etheraddr(const struct netdev
*netdev_
, struct eth_addr
*mac
)
1587 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1590 ovs_mutex_lock(&netdev
->mutex
);
1591 if (!(netdev
->cache_valid
& VALID_ETHERADDR
)) {
1592 netdev_linux_update_via_netlink(netdev
);
1595 if (!(netdev
->cache_valid
& VALID_ETHERADDR
)) {
1596 /* Fall back to ioctl if netlink fails */
1597 netdev
->ether_addr_error
= get_etheraddr(netdev_get_name(netdev_
),
1598 &netdev
->etheraddr
);
1599 netdev
->cache_valid
|= VALID_ETHERADDR
;
1602 error
= netdev
->ether_addr_error
;
1604 *mac
= netdev
->etheraddr
;
1606 ovs_mutex_unlock(&netdev
->mutex
);
1612 netdev_linux_get_mtu__(struct netdev_linux
*netdev
, int *mtup
)
1616 if (!(netdev
->cache_valid
& VALID_MTU
)) {
1617 netdev_linux_update_via_netlink(netdev
);
1620 if (!(netdev
->cache_valid
& VALID_MTU
)) {
1621 /* Fall back to ioctl if netlink fails */
1624 netdev
->netdev_mtu_error
= af_inet_ifreq_ioctl(
1625 netdev_get_name(&netdev
->up
), &ifr
, SIOCGIFMTU
, "SIOCGIFMTU");
1626 netdev
->mtu
= ifr
.ifr_mtu
;
1627 netdev
->cache_valid
|= VALID_MTU
;
1630 error
= netdev
->netdev_mtu_error
;
1632 *mtup
= netdev
->mtu
;
1638 /* Returns the maximum size of transmitted (and received) packets on 'netdev',
1639 * in bytes, not including the hardware header; thus, this is typically 1500
1640 * bytes for Ethernet devices. */
1642 netdev_linux_get_mtu(const struct netdev
*netdev_
, int *mtup
)
1644 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1647 ovs_mutex_lock(&netdev
->mutex
);
1648 error
= netdev_linux_get_mtu__(netdev
, mtup
);
1649 ovs_mutex_unlock(&netdev
->mutex
);
1654 /* Sets the maximum size of transmitted (MTU) for given device using linux
1655 * networking ioctl interface.
1658 netdev_linux_set_mtu(struct netdev
*netdev_
, int mtu
)
1660 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1664 ovs_mutex_lock(&netdev
->mutex
);
1665 if (netdev_linux_netnsid_is_remote(netdev
)) {
1670 if (netdev
->cache_valid
& VALID_MTU
) {
1671 error
= netdev
->netdev_mtu_error
;
1672 if (error
|| netdev
->mtu
== mtu
) {
1675 netdev
->cache_valid
&= ~VALID_MTU
;
1678 error
= af_inet_ifreq_ioctl(netdev_get_name(netdev_
), &ifr
,
1679 SIOCSIFMTU
, "SIOCSIFMTU");
1680 if (!error
|| error
== ENODEV
) {
1681 netdev
->netdev_mtu_error
= error
;
1682 netdev
->mtu
= ifr
.ifr_mtu
;
1683 netdev
->cache_valid
|= VALID_MTU
;
1686 ovs_mutex_unlock(&netdev
->mutex
);
1690 /* Returns the ifindex of 'netdev', if successful, as a positive number.
1691 * On failure, returns a negative errno value. */
1693 netdev_linux_get_ifindex(const struct netdev
*netdev_
)
1695 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1698 ovs_mutex_lock(&netdev
->mutex
);
1699 if (netdev_linux_netnsid_is_remote(netdev
)) {
1703 error
= get_ifindex(netdev_
, &ifindex
);
1706 ovs_mutex_unlock(&netdev
->mutex
);
1707 return error
? -error
: ifindex
;
1711 netdev_linux_get_carrier(const struct netdev
*netdev_
, bool *carrier
)
1713 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1715 ovs_mutex_lock(&netdev
->mutex
);
1716 if (netdev
->miimon_interval
> 0) {
1717 *carrier
= netdev
->miimon
;
1719 *carrier
= (netdev
->ifi_flags
& IFF_RUNNING
) != 0;
1721 ovs_mutex_unlock(&netdev
->mutex
);
1726 static long long int
1727 netdev_linux_get_carrier_resets(const struct netdev
*netdev_
)
1729 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1730 long long int carrier_resets
;
1732 ovs_mutex_lock(&netdev
->mutex
);
1733 carrier_resets
= netdev
->carrier_resets
;
1734 ovs_mutex_unlock(&netdev
->mutex
);
1736 return carrier_resets
;
1740 netdev_linux_do_miimon(const char *name
, int cmd
, const char *cmd_name
,
1741 struct mii_ioctl_data
*data
)
1746 memset(&ifr
, 0, sizeof ifr
);
1747 memcpy(&ifr
.ifr_data
, data
, sizeof *data
);
1748 error
= af_inet_ifreq_ioctl(name
, &ifr
, cmd
, cmd_name
);
1749 memcpy(data
, &ifr
.ifr_data
, sizeof *data
);
1755 netdev_linux_get_miimon(const char *name
, bool *miimon
)
1757 struct mii_ioctl_data data
;
1762 memset(&data
, 0, sizeof data
);
1763 error
= netdev_linux_do_miimon(name
, SIOCGMIIPHY
, "SIOCGMIIPHY", &data
);
1765 /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1766 data
.reg_num
= MII_BMSR
;
1767 error
= netdev_linux_do_miimon(name
, SIOCGMIIREG
, "SIOCGMIIREG",
1771 *miimon
= !!(data
.val_out
& BMSR_LSTATUS
);
1775 struct ethtool_cmd ecmd
;
1777 VLOG_DBG_RL(&rl
, "%s: failed to query MII, falling back to ethtool",
1780 COVERAGE_INC(netdev_get_ethtool
);
1781 memset(&ecmd
, 0, sizeof ecmd
);
1782 error
= netdev_linux_do_ethtool(name
, &ecmd
, ETHTOOL_GLINK
,
1785 struct ethtool_value eval
;
1787 memcpy(&eval
, &ecmd
, sizeof eval
);
1788 *miimon
= !!eval
.data
;
1790 VLOG_WARN_RL(&rl
, "%s: ethtool link status failed", name
);
1798 netdev_linux_set_miimon_interval(struct netdev
*netdev_
,
1799 long long int interval
)
1801 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1803 ovs_mutex_lock(&netdev
->mutex
);
1804 interval
= interval
> 0 ? MAX(interval
, 100) : 0;
1805 if (netdev
->miimon_interval
!= interval
) {
1806 if (interval
&& !netdev
->miimon_interval
) {
1807 atomic_count_inc(&miimon_cnt
);
1808 } else if (!interval
&& netdev
->miimon_interval
) {
1809 atomic_count_dec(&miimon_cnt
);
1812 netdev
->miimon_interval
= interval
;
1813 timer_set_expired(&netdev
->miimon_timer
);
1815 ovs_mutex_unlock(&netdev
->mutex
);
1821 netdev_linux_miimon_run(void)
1823 struct shash device_shash
;
1824 struct shash_node
*node
;
1826 shash_init(&device_shash
);
1827 netdev_get_devices(&netdev_linux_class
, &device_shash
);
1828 SHASH_FOR_EACH (node
, &device_shash
) {
1829 struct netdev
*netdev
= node
->data
;
1830 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
1833 ovs_mutex_lock(&dev
->mutex
);
1834 if (dev
->miimon_interval
> 0 && timer_expired(&dev
->miimon_timer
)) {
1835 netdev_linux_get_miimon(dev
->up
.name
, &miimon
);
1836 if (miimon
!= dev
->miimon
) {
1837 dev
->miimon
= miimon
;
1838 netdev_linux_changed(dev
, dev
->ifi_flags
, 0);
1841 timer_set_duration(&dev
->miimon_timer
, dev
->miimon_interval
);
1843 ovs_mutex_unlock(&dev
->mutex
);
1844 netdev_close(netdev
);
1847 shash_destroy(&device_shash
);
1851 netdev_linux_miimon_wait(void)
1853 struct shash device_shash
;
1854 struct shash_node
*node
;
1856 shash_init(&device_shash
);
1857 netdev_get_devices(&netdev_linux_class
, &device_shash
);
1858 SHASH_FOR_EACH (node
, &device_shash
) {
1859 struct netdev
*netdev
= node
->data
;
1860 struct netdev_linux
*dev
= netdev_linux_cast(netdev
);
1862 ovs_mutex_lock(&dev
->mutex
);
1863 if (dev
->miimon_interval
> 0) {
1864 timer_wait(&dev
->miimon_timer
);
1866 ovs_mutex_unlock(&dev
->mutex
);
1867 netdev_close(netdev
);
1869 shash_destroy(&device_shash
);
/* Exchanges the values of '*a' and '*b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t tmp = *a;
    *a = *b;
    *b = tmp;
}
1880 /* Copies 'src' into 'dst', performing format conversion in the process.
1882 * 'src' is allowed to be misaligned. */
1884 netdev_stats_from_ovs_vport_stats(struct netdev_stats
*dst
,
1885 const struct ovs_vport_stats
*src
)
1887 dst
->rx_packets
= get_32aligned_u64(&src
->rx_packets
);
1888 dst
->tx_packets
= get_32aligned_u64(&src
->tx_packets
);
1889 dst
->rx_bytes
= get_32aligned_u64(&src
->rx_bytes
);
1890 dst
->tx_bytes
= get_32aligned_u64(&src
->tx_bytes
);
1891 dst
->rx_errors
= get_32aligned_u64(&src
->rx_errors
);
1892 dst
->tx_errors
= get_32aligned_u64(&src
->tx_errors
);
1893 dst
->rx_dropped
= get_32aligned_u64(&src
->rx_dropped
);
1894 dst
->tx_dropped
= get_32aligned_u64(&src
->tx_dropped
);
1896 dst
->collisions
= 0;
1897 dst
->rx_length_errors
= 0;
1898 dst
->rx_over_errors
= 0;
1899 dst
->rx_crc_errors
= 0;
1900 dst
->rx_frame_errors
= 0;
1901 dst
->rx_fifo_errors
= 0;
1902 dst
->rx_missed_errors
= 0;
1903 dst
->tx_aborted_errors
= 0;
1904 dst
->tx_carrier_errors
= 0;
1905 dst
->tx_fifo_errors
= 0;
1906 dst
->tx_heartbeat_errors
= 0;
1907 dst
->tx_window_errors
= 0;
1911 get_stats_via_vport__(const struct netdev
*netdev
, struct netdev_stats
*stats
)
1913 struct dpif_netlink_vport reply
;
1917 error
= dpif_netlink_vport_get(netdev_get_name(netdev
), &reply
, &buf
);
1920 } else if (!reply
.stats
) {
1925 netdev_stats_from_ovs_vport_stats(stats
, reply
.stats
);
1933 get_stats_via_vport(const struct netdev
*netdev_
,
1934 struct netdev_stats
*stats
)
1936 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1938 if (!netdev
->vport_stats_error
||
1939 !(netdev
->cache_valid
& VALID_VPORT_STAT_ERROR
)) {
1942 error
= get_stats_via_vport__(netdev_
, stats
);
1943 if (error
&& error
!= ENOENT
&& error
!= ENODEV
) {
1944 VLOG_WARN_RL(&rl
, "%s: obtaining netdev stats via vport failed "
1946 netdev_get_name(netdev_
), ovs_strerror(error
));
1948 netdev
->vport_stats_error
= error
;
1949 netdev
->cache_valid
|= VALID_VPORT_STAT_ERROR
;
1953 /* Retrieves current device stats for 'netdev-linux'. */
1955 netdev_linux_get_stats(const struct netdev
*netdev_
,
1956 struct netdev_stats
*stats
)
1958 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
1959 struct netdev_stats dev_stats
;
1962 ovs_mutex_lock(&netdev
->mutex
);
1963 get_stats_via_vport(netdev_
, stats
);
1964 error
= get_stats_via_netlink(netdev_
, &dev_stats
);
1966 if (!netdev
->vport_stats_error
) {
1969 } else if (netdev
->vport_stats_error
) {
1970 /* stats not available from OVS then use netdev stats. */
1973 /* Use kernel netdev's packet and byte counts since vport's counters
1974 * do not reflect packet counts on the wire when GSO, TSO or GRO are
1976 stats
->rx_packets
= dev_stats
.rx_packets
;
1977 stats
->rx_bytes
= dev_stats
.rx_bytes
;
1978 stats
->tx_packets
= dev_stats
.tx_packets
;
1979 stats
->tx_bytes
= dev_stats
.tx_bytes
;
1981 stats
->rx_errors
+= dev_stats
.rx_errors
;
1982 stats
->tx_errors
+= dev_stats
.tx_errors
;
1983 stats
->rx_dropped
+= dev_stats
.rx_dropped
;
1984 stats
->tx_dropped
+= dev_stats
.tx_dropped
;
1985 stats
->multicast
+= dev_stats
.multicast
;
1986 stats
->collisions
+= dev_stats
.collisions
;
1987 stats
->rx_length_errors
+= dev_stats
.rx_length_errors
;
1988 stats
->rx_over_errors
+= dev_stats
.rx_over_errors
;
1989 stats
->rx_crc_errors
+= dev_stats
.rx_crc_errors
;
1990 stats
->rx_frame_errors
+= dev_stats
.rx_frame_errors
;
1991 stats
->rx_fifo_errors
+= dev_stats
.rx_fifo_errors
;
1992 stats
->rx_missed_errors
+= dev_stats
.rx_missed_errors
;
1993 stats
->tx_aborted_errors
+= dev_stats
.tx_aborted_errors
;
1994 stats
->tx_carrier_errors
+= dev_stats
.tx_carrier_errors
;
1995 stats
->tx_fifo_errors
+= dev_stats
.tx_fifo_errors
;
1996 stats
->tx_heartbeat_errors
+= dev_stats
.tx_heartbeat_errors
;
1997 stats
->tx_window_errors
+= dev_stats
.tx_window_errors
;
1999 ovs_mutex_unlock(&netdev
->mutex
);
2004 /* Retrieves current device stats for 'netdev-tap' netdev or
2005 * netdev-internal. */
2007 netdev_tap_get_stats(const struct netdev
*netdev_
, struct netdev_stats
*stats
)
2009 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2010 struct netdev_stats dev_stats
;
2013 ovs_mutex_lock(&netdev
->mutex
);
2014 get_stats_via_vport(netdev_
, stats
);
2015 error
= get_stats_via_netlink(netdev_
, &dev_stats
);
2017 if (!netdev
->vport_stats_error
) {
2020 } else if (netdev
->vport_stats_error
) {
2021 /* Transmit and receive stats will appear to be swapped relative to the
2022 * other ports since we are the one sending the data, not a remote
2023 * computer. For consistency, we swap them back here. This does not
2024 * apply if we are getting stats from the vport layer because it always
2025 * tracks stats from the perspective of the switch. */
2028 swap_uint64(&stats
->rx_packets
, &stats
->tx_packets
);
2029 swap_uint64(&stats
->rx_bytes
, &stats
->tx_bytes
);
2030 swap_uint64(&stats
->rx_errors
, &stats
->tx_errors
);
2031 swap_uint64(&stats
->rx_dropped
, &stats
->tx_dropped
);
2032 stats
->rx_length_errors
= 0;
2033 stats
->rx_over_errors
= 0;
2034 stats
->rx_crc_errors
= 0;
2035 stats
->rx_frame_errors
= 0;
2036 stats
->rx_fifo_errors
= 0;
2037 stats
->rx_missed_errors
= 0;
2038 stats
->tx_aborted_errors
= 0;
2039 stats
->tx_carrier_errors
= 0;
2040 stats
->tx_fifo_errors
= 0;
2041 stats
->tx_heartbeat_errors
= 0;
2042 stats
->tx_window_errors
= 0;
2044 /* Use kernel netdev's packet and byte counts since vport counters
2045 * do not reflect packet counts on the wire when GSO, TSO or GRO
2047 stats
->rx_packets
= dev_stats
.tx_packets
;
2048 stats
->rx_bytes
= dev_stats
.tx_bytes
;
2049 stats
->tx_packets
= dev_stats
.rx_packets
;
2050 stats
->tx_bytes
= dev_stats
.rx_bytes
;
2052 stats
->rx_dropped
+= dev_stats
.tx_dropped
;
2053 stats
->tx_dropped
+= dev_stats
.rx_dropped
;
2055 stats
->rx_errors
+= dev_stats
.tx_errors
;
2056 stats
->tx_errors
+= dev_stats
.rx_errors
;
2058 stats
->multicast
+= dev_stats
.multicast
;
2059 stats
->collisions
+= dev_stats
.collisions
;
2061 stats
->tx_dropped
+= netdev
->tx_dropped
;
2062 ovs_mutex_unlock(&netdev
->mutex
);
2068 netdev_internal_get_stats(const struct netdev
*netdev_
,
2069 struct netdev_stats
*stats
)
2071 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2074 ovs_mutex_lock(&netdev
->mutex
);
2075 get_stats_via_vport(netdev_
, stats
);
2076 error
= netdev
->vport_stats_error
;
2077 ovs_mutex_unlock(&netdev
->mutex
);
2083 netdev_linux_read_features(struct netdev_linux
*netdev
)
2085 struct ethtool_cmd ecmd
;
2089 if (netdev
->cache_valid
& VALID_FEATURES
) {
2093 COVERAGE_INC(netdev_get_ethtool
);
2094 memset(&ecmd
, 0, sizeof ecmd
);
2095 error
= netdev_linux_do_ethtool(netdev
->up
.name
, &ecmd
,
2096 ETHTOOL_GSET
, "ETHTOOL_GSET");
2101 /* Supported features. */
2102 netdev
->supported
= 0;
2103 if (ecmd
.supported
& SUPPORTED_10baseT_Half
) {
2104 netdev
->supported
|= NETDEV_F_10MB_HD
;
2106 if (ecmd
.supported
& SUPPORTED_10baseT_Full
) {
2107 netdev
->supported
|= NETDEV_F_10MB_FD
;
2109 if (ecmd
.supported
& SUPPORTED_100baseT_Half
) {
2110 netdev
->supported
|= NETDEV_F_100MB_HD
;
2112 if (ecmd
.supported
& SUPPORTED_100baseT_Full
) {
2113 netdev
->supported
|= NETDEV_F_100MB_FD
;
2115 if (ecmd
.supported
& SUPPORTED_1000baseT_Half
) {
2116 netdev
->supported
|= NETDEV_F_1GB_HD
;
2118 if ((ecmd
.supported
& SUPPORTED_1000baseT_Full
) ||
2119 (ecmd
.supported
& SUPPORTED_1000baseKX_Full
)) {
2120 netdev
->supported
|= NETDEV_F_1GB_FD
;
2122 if ((ecmd
.supported
& SUPPORTED_10000baseT_Full
) ||
2123 (ecmd
.supported
& SUPPORTED_10000baseKX4_Full
) ||
2124 (ecmd
.supported
& SUPPORTED_10000baseKR_Full
) ||
2125 (ecmd
.supported
& SUPPORTED_10000baseR_FEC
)) {
2126 netdev
->supported
|= NETDEV_F_10GB_FD
;
2128 if ((ecmd
.supported
& SUPPORTED_40000baseKR4_Full
) ||
2129 (ecmd
.supported
& SUPPORTED_40000baseCR4_Full
) ||
2130 (ecmd
.supported
& SUPPORTED_40000baseSR4_Full
) ||
2131 (ecmd
.supported
& SUPPORTED_40000baseLR4_Full
)) {
2132 netdev
->supported
|= NETDEV_F_40GB_FD
;
2134 if (ecmd
.supported
& SUPPORTED_TP
) {
2135 netdev
->supported
|= NETDEV_F_COPPER
;
2137 if (ecmd
.supported
& SUPPORTED_FIBRE
) {
2138 netdev
->supported
|= NETDEV_F_FIBER
;
2140 if (ecmd
.supported
& SUPPORTED_Autoneg
) {
2141 netdev
->supported
|= NETDEV_F_AUTONEG
;
2143 if (ecmd
.supported
& SUPPORTED_Pause
) {
2144 netdev
->supported
|= NETDEV_F_PAUSE
;
2146 if (ecmd
.supported
& SUPPORTED_Asym_Pause
) {
2147 netdev
->supported
|= NETDEV_F_PAUSE_ASYM
;
2150 /* Advertised features. */
2151 netdev
->advertised
= 0;
2152 if (ecmd
.advertising
& ADVERTISED_10baseT_Half
) {
2153 netdev
->advertised
|= NETDEV_F_10MB_HD
;
2155 if (ecmd
.advertising
& ADVERTISED_10baseT_Full
) {
2156 netdev
->advertised
|= NETDEV_F_10MB_FD
;
2158 if (ecmd
.advertising
& ADVERTISED_100baseT_Half
) {
2159 netdev
->advertised
|= NETDEV_F_100MB_HD
;
2161 if (ecmd
.advertising
& ADVERTISED_100baseT_Full
) {
2162 netdev
->advertised
|= NETDEV_F_100MB_FD
;
2164 if (ecmd
.advertising
& ADVERTISED_1000baseT_Half
) {
2165 netdev
->advertised
|= NETDEV_F_1GB_HD
;
2167 if ((ecmd
.advertising
& ADVERTISED_1000baseT_Full
) ||
2168 (ecmd
.advertising
& ADVERTISED_1000baseKX_Full
)) {
2169 netdev
->advertised
|= NETDEV_F_1GB_FD
;
2171 if ((ecmd
.advertising
& ADVERTISED_10000baseT_Full
) ||
2172 (ecmd
.advertising
& ADVERTISED_10000baseKX4_Full
) ||
2173 (ecmd
.advertising
& ADVERTISED_10000baseKR_Full
) ||
2174 (ecmd
.advertising
& ADVERTISED_10000baseR_FEC
)) {
2175 netdev
->advertised
|= NETDEV_F_10GB_FD
;
2177 if ((ecmd
.advertising
& ADVERTISED_40000baseKR4_Full
) ||
2178 (ecmd
.advertising
& ADVERTISED_40000baseCR4_Full
) ||
2179 (ecmd
.advertising
& ADVERTISED_40000baseSR4_Full
) ||
2180 (ecmd
.advertising
& ADVERTISED_40000baseLR4_Full
)) {
2181 netdev
->advertised
|= NETDEV_F_40GB_FD
;
2183 if (ecmd
.advertising
& ADVERTISED_TP
) {
2184 netdev
->advertised
|= NETDEV_F_COPPER
;
2186 if (ecmd
.advertising
& ADVERTISED_FIBRE
) {
2187 netdev
->advertised
|= NETDEV_F_FIBER
;
2189 if (ecmd
.advertising
& ADVERTISED_Autoneg
) {
2190 netdev
->advertised
|= NETDEV_F_AUTONEG
;
2192 if (ecmd
.advertising
& ADVERTISED_Pause
) {
2193 netdev
->advertised
|= NETDEV_F_PAUSE
;
2195 if (ecmd
.advertising
& ADVERTISED_Asym_Pause
) {
2196 netdev
->advertised
|= NETDEV_F_PAUSE_ASYM
;
2199 /* Current settings. */
2200 speed
= ethtool_cmd_speed(&ecmd
);
2201 if (speed
== SPEED_10
) {
2202 netdev
->current
= ecmd
.duplex
? NETDEV_F_10MB_FD
: NETDEV_F_10MB_HD
;
2203 } else if (speed
== SPEED_100
) {
2204 netdev
->current
= ecmd
.duplex
? NETDEV_F_100MB_FD
: NETDEV_F_100MB_HD
;
2205 } else if (speed
== SPEED_1000
) {
2206 netdev
->current
= ecmd
.duplex
? NETDEV_F_1GB_FD
: NETDEV_F_1GB_HD
;
2207 } else if (speed
== SPEED_10000
) {
2208 netdev
->current
= NETDEV_F_10GB_FD
;
2209 } else if (speed
== 40000) {
2210 netdev
->current
= NETDEV_F_40GB_FD
;
2211 } else if (speed
== 100000) {
2212 netdev
->current
= NETDEV_F_100GB_FD
;
2213 } else if (speed
== 1000000) {
2214 netdev
->current
= NETDEV_F_1TB_FD
;
2216 netdev
->current
= 0;
2219 if (ecmd
.port
== PORT_TP
) {
2220 netdev
->current
|= NETDEV_F_COPPER
;
2221 } else if (ecmd
.port
== PORT_FIBRE
) {
2222 netdev
->current
|= NETDEV_F_FIBER
;
2226 netdev
->current
|= NETDEV_F_AUTONEG
;
2230 netdev
->cache_valid
|= VALID_FEATURES
;
2231 netdev
->get_features_error
= error
;
2234 /* Stores the features supported by 'netdev' into of '*current', '*advertised',
2235 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
2236 * Returns 0 if successful, otherwise a positive errno value. */
2238 netdev_linux_get_features(const struct netdev
*netdev_
,
2239 enum netdev_features
*current
,
2240 enum netdev_features
*advertised
,
2241 enum netdev_features
*supported
,
2242 enum netdev_features
*peer
)
2244 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2247 ovs_mutex_lock(&netdev
->mutex
);
2248 if (netdev_linux_netnsid_is_remote(netdev
)) {
2253 netdev_linux_read_features(netdev
);
2254 if (!netdev
->get_features_error
) {
2255 *current
= netdev
->current
;
2256 *advertised
= netdev
->advertised
;
2257 *supported
= netdev
->supported
;
2258 *peer
= 0; /* XXX */
2260 error
= netdev
->get_features_error
;
2263 ovs_mutex_unlock(&netdev
->mutex
);
2267 /* Set the features advertised by 'netdev' to 'advertise'. */
2269 netdev_linux_set_advertisements(struct netdev
*netdev_
,
2270 enum netdev_features advertise
)
2272 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2273 struct ethtool_cmd ecmd
;
2276 ovs_mutex_lock(&netdev
->mutex
);
2278 COVERAGE_INC(netdev_get_ethtool
);
2280 if (netdev_linux_netnsid_is_remote(netdev
)) {
2285 memset(&ecmd
, 0, sizeof ecmd
);
2286 error
= netdev_linux_do_ethtool(netdev_get_name(netdev_
), &ecmd
,
2287 ETHTOOL_GSET
, "ETHTOOL_GSET");
2292 ecmd
.advertising
= 0;
2293 if (advertise
& NETDEV_F_10MB_HD
) {
2294 ecmd
.advertising
|= ADVERTISED_10baseT_Half
;
2296 if (advertise
& NETDEV_F_10MB_FD
) {
2297 ecmd
.advertising
|= ADVERTISED_10baseT_Full
;
2299 if (advertise
& NETDEV_F_100MB_HD
) {
2300 ecmd
.advertising
|= ADVERTISED_100baseT_Half
;
2302 if (advertise
& NETDEV_F_100MB_FD
) {
2303 ecmd
.advertising
|= ADVERTISED_100baseT_Full
;
2305 if (advertise
& NETDEV_F_1GB_HD
) {
2306 ecmd
.advertising
|= ADVERTISED_1000baseT_Half
;
2308 if (advertise
& NETDEV_F_1GB_FD
) {
2309 ecmd
.advertising
|= ADVERTISED_1000baseT_Full
;
2311 if (advertise
& NETDEV_F_10GB_FD
) {
2312 ecmd
.advertising
|= ADVERTISED_10000baseT_Full
;
2314 if (advertise
& NETDEV_F_COPPER
) {
2315 ecmd
.advertising
|= ADVERTISED_TP
;
2317 if (advertise
& NETDEV_F_FIBER
) {
2318 ecmd
.advertising
|= ADVERTISED_FIBRE
;
2320 if (advertise
& NETDEV_F_AUTONEG
) {
2321 ecmd
.advertising
|= ADVERTISED_Autoneg
;
2323 if (advertise
& NETDEV_F_PAUSE
) {
2324 ecmd
.advertising
|= ADVERTISED_Pause
;
2326 if (advertise
& NETDEV_F_PAUSE_ASYM
) {
2327 ecmd
.advertising
|= ADVERTISED_Asym_Pause
;
2329 COVERAGE_INC(netdev_set_ethtool
);
2330 error
= netdev_linux_do_ethtool(netdev_get_name(netdev_
), &ecmd
,
2331 ETHTOOL_SSET
, "ETHTOOL_SSET");
2334 ovs_mutex_unlock(&netdev
->mutex
);
2338 static struct tc_police
2339 tc_matchall_fill_police(uint32_t kbits_rate
, uint32_t kbits_burst
)
2341 unsigned int bsize
= MIN(UINT32_MAX
/ 1024, kbits_burst
) * 1024 / 64;
2342 unsigned int bps
= ((uint64_t) kbits_rate
* 1000) / 8;
2343 struct tc_police police
;
2344 struct tc_ratespec rate
;
2347 memset(&rate
, 0, sizeof rate
);
2349 rate
.cell_log
= tc_calc_cell_log(mtu
);
2350 rate
.mpu
= ETH_TOTAL_MIN
;
2352 memset(&police
, 0, sizeof police
);
2353 police
.burst
= tc_bytes_to_ticks(bps
, bsize
);
2354 police
.action
= TC_POLICE_SHOT
;
2362 nl_msg_put_act_police(struct ofpbuf
*request
, struct tc_police police
)
2366 nl_msg_put_string(request
, TCA_ACT_KIND
, "police");
2367 offset
= nl_msg_start_nested(request
, TCA_ACT_OPTIONS
);
2368 nl_msg_put_unspec(request
, TCA_POLICE_TBF
, &police
, sizeof police
);
2369 tc_put_rtab(request
, TCA_POLICE_RATE
, &police
.rate
);
2370 nl_msg_put_u32(request
, TCA_POLICE_RESULT
, TC_ACT_UNSPEC
);
2371 nl_msg_end_nested(request
, offset
);
2375 tc_add_matchall_policer(struct netdev
*netdev
, uint32_t kbits_rate
,
2376 uint32_t kbits_burst
)
2378 uint16_t eth_type
= (OVS_FORCE
uint16_t) htons(ETH_P_ALL
);
2379 size_t basic_offset
, action_offset
, inner_offset
;
2380 uint16_t prio
= TC_RESERVED_PRIORITY_POLICE
;
2381 int ifindex
, index
, err
= 0;
2382 struct tc_police pol_act
;
2383 uint32_t block_id
= 0;
2384 struct ofpbuf request
;
2385 struct ofpbuf
*reply
;
2386 struct tcmsg
*tcmsg
;
2387 uint32_t handle
= 1;
2389 err
= get_ifindex(netdev
, &ifindex
);
2394 index
= block_id
? TCM_IFINDEX_MAGIC_BLOCK
: ifindex
;
2395 tcmsg
= tc_make_request(index
, RTM_NEWTFILTER
, NLM_F_CREATE
| NLM_F_ECHO
,
2397 tcmsg
->tcm_parent
= block_id
? : TC_INGRESS_PARENT
;
2398 tcmsg
->tcm_info
= tc_make_handle(prio
, eth_type
);
2399 tcmsg
->tcm_handle
= handle
;
2401 pol_act
= tc_matchall_fill_police(kbits_rate
, kbits_burst
);
2402 nl_msg_put_string(&request
, TCA_KIND
, "matchall");
2403 basic_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
2404 action_offset
= nl_msg_start_nested(&request
, TCA_MATCHALL_ACT
);
2405 inner_offset
= nl_msg_start_nested(&request
, 1);
2406 nl_msg_put_act_police(&request
, pol_act
);
2407 nl_msg_end_nested(&request
, inner_offset
);
2408 nl_msg_end_nested(&request
, action_offset
);
2409 nl_msg_end_nested(&request
, basic_offset
);
2411 err
= tc_transact(&request
, &reply
);
2414 ofpbuf_at_assert(reply
, NLMSG_HDRLEN
, sizeof *tc
);
2415 ofpbuf_delete(reply
);
2422 tc_del_matchall_policer(struct netdev
*netdev
)
2424 uint32_t block_id
= 0;
2428 err
= get_ifindex(netdev
, &ifindex
);
2433 err
= tc_del_filter(ifindex
, TC_RESERVED_PRIORITY_POLICE
, 1, block_id
,
2442 /* Attempts to set input rate limiting (policing) policy. Returns 0 if
2443 * successful, otherwise a positive errno value. */
2445 netdev_linux_set_policing(struct netdev
*netdev_
,
2446 uint32_t kbits_rate
, uint32_t kbits_burst
)
2448 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2449 const char *netdev_name
= netdev_get_name(netdev_
);
2453 kbits_burst
= (!kbits_rate
? 0 /* Force to 0 if no rate specified. */
2454 : !kbits_burst
? 8000 /* Default to 8000 kbits if 0. */
2455 : kbits_burst
); /* Stick with user-specified value. */
2457 ovs_mutex_lock(&netdev
->mutex
);
2458 if (netdev_linux_netnsid_is_remote(netdev
)) {
2463 if (netdev
->cache_valid
& VALID_POLICING
) {
2464 error
= netdev
->netdev_policing_error
;
2465 if (error
|| (netdev
->kbits_rate
== kbits_rate
&&
2466 netdev
->kbits_burst
== kbits_burst
)) {
2467 /* Assume that settings haven't changed since we last set them. */
2470 netdev
->cache_valid
&= ~VALID_POLICING
;
2473 error
= get_ifindex(netdev_
, &ifindex
);
2478 /* Use matchall for policing when offloadling ovs with tc-flower. */
2479 if (netdev_is_flow_api_enabled()) {
2480 error
= tc_del_matchall_policer(netdev_
);
2482 error
= tc_add_matchall_policer(netdev_
, kbits_rate
, kbits_burst
);
2484 ovs_mutex_unlock(&netdev
->mutex
);
2488 COVERAGE_INC(netdev_set_policing
);
2489 /* Remove any existing ingress qdisc. */
2490 error
= tc_add_del_qdisc(ifindex
, false, 0, TC_INGRESS
);
2492 VLOG_WARN_RL(&rl
, "%s: removing policing failed: %s",
2493 netdev_name
, ovs_strerror(error
));
2498 error
= tc_add_del_qdisc(ifindex
, true, 0, TC_INGRESS
);
2500 VLOG_WARN_RL(&rl
, "%s: adding policing qdisc failed: %s",
2501 netdev_name
, ovs_strerror(error
));
2505 error
= tc_add_policer(netdev_
, kbits_rate
, kbits_burst
);
2507 VLOG_WARN_RL(&rl
, "%s: adding policing action failed: %s",
2508 netdev_name
, ovs_strerror(error
));
2513 netdev
->kbits_rate
= kbits_rate
;
2514 netdev
->kbits_burst
= kbits_burst
;
2517 if (!error
|| error
== ENODEV
) {
2518 netdev
->netdev_policing_error
= error
;
2519 netdev
->cache_valid
|= VALID_POLICING
;
2521 ovs_mutex_unlock(&netdev
->mutex
);
2526 netdev_linux_get_qos_types(const struct netdev
*netdev OVS_UNUSED
,
2529 const struct tc_ops
*const *opsp
;
2530 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2531 const struct tc_ops
*ops
= *opsp
;
2532 if (ops
->tc_install
&& ops
->ovs_name
[0] != '\0') {
2533 sset_add(types
, ops
->ovs_name
);
2539 static const struct tc_ops
*
2540 tc_lookup_ovs_name(const char *name
)
2542 const struct tc_ops
*const *opsp
;
2544 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2545 const struct tc_ops
*ops
= *opsp
;
2546 if (!strcmp(name
, ops
->ovs_name
)) {
2553 static const struct tc_ops
*
2554 tc_lookup_linux_name(const char *name
)
2556 const struct tc_ops
*const *opsp
;
2558 for (opsp
= tcs
; *opsp
!= NULL
; opsp
++) {
2559 const struct tc_ops
*ops
= *opsp
;
2560 if (ops
->linux_name
&& !strcmp(name
, ops
->linux_name
)) {
2567 static struct tc_queue
*
2568 tc_find_queue__(const struct netdev
*netdev_
, unsigned int queue_id
,
2571 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2572 struct tc_queue
*queue
;
2574 HMAP_FOR_EACH_IN_BUCKET (queue
, hmap_node
, hash
, &netdev
->tc
->queues
) {
2575 if (queue
->queue_id
== queue_id
) {
/* Convenience wrapper around tc_find_queue__() that derives the hash from
 * 'queue_id' itself. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
}
2589 netdev_linux_get_qos_capabilities(const struct netdev
*netdev OVS_UNUSED
,
2591 struct netdev_qos_capabilities
*caps
)
2593 const struct tc_ops
*ops
= tc_lookup_ovs_name(type
);
2597 caps
->n_queues
= ops
->n_queues
;
2602 netdev_linux_get_qos(const struct netdev
*netdev_
,
2603 const char **typep
, struct smap
*details
)
2605 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2608 ovs_mutex_lock(&netdev
->mutex
);
2609 if (netdev_linux_netnsid_is_remote(netdev
)) {
2614 error
= tc_query_qdisc(netdev_
);
2616 *typep
= netdev
->tc
->ops
->ovs_name
;
2617 error
= (netdev
->tc
->ops
->qdisc_get
2618 ? netdev
->tc
->ops
->qdisc_get(netdev_
, details
)
2623 ovs_mutex_unlock(&netdev
->mutex
);
2628 netdev_linux_set_qos(struct netdev
*netdev_
,
2629 const char *type
, const struct smap
*details
)
2631 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2632 const struct tc_ops
*new_ops
;
2635 new_ops
= tc_lookup_ovs_name(type
);
2636 if (!new_ops
|| !new_ops
->tc_install
) {
2640 if (new_ops
== &tc_ops_noop
) {
2641 return new_ops
->tc_install(netdev_
, details
);
2644 ovs_mutex_lock(&netdev
->mutex
);
2645 if (netdev_linux_netnsid_is_remote(netdev
)) {
2650 error
= tc_query_qdisc(netdev_
);
2655 if (new_ops
== netdev
->tc
->ops
) {
2656 error
= new_ops
->qdisc_set
? new_ops
->qdisc_set(netdev_
, details
) : 0;
2658 /* Delete existing qdisc. */
2659 error
= tc_del_qdisc(netdev_
);
2663 ovs_assert(netdev
->tc
== NULL
);
2665 /* Install new qdisc. */
2666 error
= new_ops
->tc_install(netdev_
, details
);
2667 ovs_assert((error
== 0) == (netdev
->tc
!= NULL
));
2671 ovs_mutex_unlock(&netdev
->mutex
);
2676 netdev_linux_get_queue(const struct netdev
*netdev_
,
2677 unsigned int queue_id
, struct smap
*details
)
2679 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2682 ovs_mutex_lock(&netdev
->mutex
);
2683 if (netdev_linux_netnsid_is_remote(netdev
)) {
2688 error
= tc_query_qdisc(netdev_
);
2690 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2692 ? netdev
->tc
->ops
->class_get(netdev_
, queue
, details
)
2697 ovs_mutex_unlock(&netdev
->mutex
);
2702 netdev_linux_set_queue(struct netdev
*netdev_
,
2703 unsigned int queue_id
, const struct smap
*details
)
2705 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2708 ovs_mutex_lock(&netdev
->mutex
);
2709 if (netdev_linux_netnsid_is_remote(netdev
)) {
2714 error
= tc_query_qdisc(netdev_
);
2716 error
= (queue_id
< netdev
->tc
->ops
->n_queues
2717 && netdev
->tc
->ops
->class_set
2718 ? netdev
->tc
->ops
->class_set(netdev_
, queue_id
, details
)
2723 ovs_mutex_unlock(&netdev
->mutex
);
2728 netdev_linux_delete_queue(struct netdev
*netdev_
, unsigned int queue_id
)
2730 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2733 ovs_mutex_lock(&netdev
->mutex
);
2734 if (netdev_linux_netnsid_is_remote(netdev
)) {
2739 error
= tc_query_qdisc(netdev_
);
2741 if (netdev
->tc
->ops
->class_delete
) {
2742 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2744 ? netdev
->tc
->ops
->class_delete(netdev_
, queue
)
2752 ovs_mutex_unlock(&netdev
->mutex
);
2757 netdev_linux_get_queue_stats(const struct netdev
*netdev_
,
2758 unsigned int queue_id
,
2759 struct netdev_queue_stats
*stats
)
2761 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2764 ovs_mutex_lock(&netdev
->mutex
);
2765 if (netdev_linux_netnsid_is_remote(netdev
)) {
2770 error
= tc_query_qdisc(netdev_
);
2772 if (netdev
->tc
->ops
->class_get_stats
) {
2773 const struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2775 stats
->created
= queue
->created
;
2776 error
= netdev
->tc
->ops
->class_get_stats(netdev_
, queue
,
2787 ovs_mutex_unlock(&netdev
->mutex
);
2791 struct queue_dump_state
{
2792 struct nl_dump dump
;
2797 start_queue_dump(const struct netdev
*netdev
, struct queue_dump_state
*state
)
2799 struct ofpbuf request
;
2800 struct tcmsg
*tcmsg
;
2802 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_GETTCLASS
, 0, &request
);
2806 tcmsg
->tcm_parent
= 0;
2807 nl_dump_start(&state
->dump
, NETLINK_ROUTE
, &request
);
2808 ofpbuf_uninit(&request
);
2810 ofpbuf_init(&state
->buf
, NL_DUMP_BUFSIZE
);
2815 finish_queue_dump(struct queue_dump_state
*state
)
2817 ofpbuf_uninit(&state
->buf
);
2818 return nl_dump_done(&state
->dump
);
2821 struct netdev_linux_queue_state
{
2822 unsigned int *queues
;
2828 netdev_linux_queue_dump_start(const struct netdev
*netdev_
, void **statep
)
2830 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2833 ovs_mutex_lock(&netdev
->mutex
);
2834 if (netdev_linux_netnsid_is_remote(netdev
)) {
2839 error
= tc_query_qdisc(netdev_
);
2841 if (netdev
->tc
->ops
->class_get
) {
2842 struct netdev_linux_queue_state
*state
;
2843 struct tc_queue
*queue
;
2846 *statep
= state
= xmalloc(sizeof *state
);
2847 state
->n_queues
= hmap_count(&netdev
->tc
->queues
);
2848 state
->cur_queue
= 0;
2849 state
->queues
= xmalloc(state
->n_queues
* sizeof *state
->queues
);
2852 HMAP_FOR_EACH (queue
, hmap_node
, &netdev
->tc
->queues
) {
2853 state
->queues
[i
++] = queue
->queue_id
;
2861 ovs_mutex_unlock(&netdev
->mutex
);
2866 netdev_linux_queue_dump_next(const struct netdev
*netdev_
, void *state_
,
2867 unsigned int *queue_idp
, struct smap
*details
)
2869 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2870 struct netdev_linux_queue_state
*state
= state_
;
2873 ovs_mutex_lock(&netdev
->mutex
);
2874 if (netdev_linux_netnsid_is_remote(netdev
)) {
2879 while (state
->cur_queue
< state
->n_queues
) {
2880 unsigned int queue_id
= state
->queues
[state
->cur_queue
++];
2881 struct tc_queue
*queue
= tc_find_queue(netdev_
, queue_id
);
2884 *queue_idp
= queue_id
;
2885 error
= netdev
->tc
->ops
->class_get(netdev_
, queue
, details
);
2891 ovs_mutex_unlock(&netdev
->mutex
);
2896 netdev_linux_queue_dump_done(const struct netdev
*netdev OVS_UNUSED
,
2899 struct netdev_linux_queue_state
*state
= state_
;
2901 free(state
->queues
);
2907 netdev_linux_dump_queue_stats(const struct netdev
*netdev_
,
2908 netdev_dump_queue_stats_cb
*cb
, void *aux
)
2910 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2913 ovs_mutex_lock(&netdev
->mutex
);
2914 if (netdev_linux_netnsid_is_remote(netdev
)) {
2919 error
= tc_query_qdisc(netdev_
);
2921 struct queue_dump_state state
;
2923 if (!netdev
->tc
->ops
->class_dump_stats
) {
2925 } else if (!start_queue_dump(netdev_
, &state
)) {
2931 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
2932 retval
= netdev
->tc
->ops
->class_dump_stats(netdev_
, &msg
,
2939 retval
= finish_queue_dump(&state
);
2947 ovs_mutex_unlock(&netdev
->mutex
);
2952 netdev_linux_set_in4(struct netdev
*netdev_
, struct in_addr address
,
2953 struct in_addr netmask
)
2955 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2958 ovs_mutex_lock(&netdev
->mutex
);
2959 if (netdev_linux_netnsid_is_remote(netdev
)) {
2964 error
= do_set_addr(netdev_
, SIOCSIFADDR
, "SIOCSIFADDR", address
);
2966 if (address
.s_addr
!= INADDR_ANY
) {
2967 error
= do_set_addr(netdev_
, SIOCSIFNETMASK
,
2968 "SIOCSIFNETMASK", netmask
);
2973 ovs_mutex_unlock(&netdev
->mutex
);
2977 /* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address.
2978 * Otherwise, sets '*in6' to 'in6addr_any' and returns the corresponding
2981 netdev_linux_get_addr_list(const struct netdev
*netdev_
,
2982 struct in6_addr
**addr
, struct in6_addr
**mask
, int *n_cnt
)
2984 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
2987 ovs_mutex_lock(&netdev
->mutex
);
2988 if (netdev_linux_netnsid_is_remote(netdev
)) {
2993 error
= netdev_get_addrs(netdev_get_name(netdev_
), addr
, mask
, n_cnt
);
2996 ovs_mutex_unlock(&netdev
->mutex
);
/* Fills '*sa' with an AF_INET sockaddr for 'addr' with port 0, by building
 * a zeroed sockaddr_in and copying it over the (zeroed) generic sockaddr. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin = {
        .sin_family = AF_INET,
        .sin_addr = addr,
        .sin_port = 0,
    };

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
3014 do_set_addr(struct netdev
*netdev
,
3015 int ioctl_nr
, const char *ioctl_name
, struct in_addr addr
)
3019 make_in4_sockaddr(&ifr
.ifr_addr
, addr
);
3020 return af_inet_ifreq_ioctl(netdev_get_name(netdev
), &ifr
, ioctl_nr
,
3024 /* Adds 'router' as a default IP gateway. */
3026 netdev_linux_add_router(struct netdev
*netdev OVS_UNUSED
, struct in_addr router
)
3028 struct in_addr any
= { INADDR_ANY
};
3032 memset(&rt
, 0, sizeof rt
);
3033 make_in4_sockaddr(&rt
.rt_dst
, any
);
3034 make_in4_sockaddr(&rt
.rt_gateway
, router
);
3035 make_in4_sockaddr(&rt
.rt_genmask
, any
);
3036 rt
.rt_flags
= RTF_UP
| RTF_GATEWAY
;
3037 error
= af_inet_ioctl(SIOCADDRT
, &rt
);
3039 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error
));
3045 netdev_linux_get_next_hop(const struct in_addr
*host
, struct in_addr
*next_hop
,
3048 static const char fn
[] = "/proc/net/route";
3053 *netdev_name
= NULL
;
3054 stream
= fopen(fn
, "r");
3055 if (stream
== NULL
) {
3056 VLOG_WARN_RL(&rl
, "%s: open failed: %s", fn
, ovs_strerror(errno
));
3061 while (fgets(line
, sizeof line
, stream
)) {
3064 ovs_be32 dest
, gateway
, mask
;
3065 int refcnt
, metric
, mtu
;
3066 unsigned int flags
, use
, window
, irtt
;
3069 "%16s %"SCNx32
" %"SCNx32
" %04X %d %u %d %"SCNx32
3071 iface
, &dest
, &gateway
, &flags
, &refcnt
,
3072 &use
, &metric
, &mask
, &mtu
, &window
, &irtt
)) {
3073 VLOG_WARN_RL(&rl
, "%s: could not parse line %d: %s",
3077 if (!(flags
& RTF_UP
)) {
3078 /* Skip routes that aren't up. */
3082 /* The output of 'dest', 'mask', and 'gateway' were given in
3083 * network byte order, so we don't need need any endian
3084 * conversions here. */
3085 if ((dest
& mask
) == (host
->s_addr
& mask
)) {
3087 /* The host is directly reachable. */
3088 next_hop
->s_addr
= 0;
3090 /* To reach the host, we must go through a gateway. */
3091 next_hop
->s_addr
= gateway
;
3093 *netdev_name
= xstrdup(iface
);
3105 netdev_linux_get_status(const struct netdev
*netdev_
, struct smap
*smap
)
3107 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3110 ovs_mutex_lock(&netdev
->mutex
);
3111 if (!(netdev
->cache_valid
& VALID_DRVINFO
)) {
3112 struct ethtool_cmd
*cmd
= (struct ethtool_cmd
*) &netdev
->drvinfo
;
3114 COVERAGE_INC(netdev_get_ethtool
);
3115 memset(&netdev
->drvinfo
, 0, sizeof netdev
->drvinfo
);
3116 error
= netdev_linux_do_ethtool(netdev
->up
.name
,
3119 "ETHTOOL_GDRVINFO");
3121 netdev
->cache_valid
|= VALID_DRVINFO
;
3126 smap_add(smap
, "driver_name", netdev
->drvinfo
.driver
);
3127 smap_add(smap
, "driver_version", netdev
->drvinfo
.version
);
3128 smap_add(smap
, "firmware_version", netdev
->drvinfo
.fw_version
);
3130 ovs_mutex_unlock(&netdev
->mutex
);
3136 netdev_internal_get_status(const struct netdev
*netdev OVS_UNUSED
,
3139 smap_add(smap
, "driver_name", "openvswitch");
3144 netdev_linux_get_block_id(struct netdev
*netdev_
)
3146 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3147 uint32_t block_id
= 0;
3149 ovs_mutex_lock(&netdev
->mutex
);
3150 /* Ensure the linux netdev has had its fields populated. */
3151 if (!(netdev
->cache_valid
& VALID_IFINDEX
)) {
3152 netdev_linux_update_via_netlink(netdev
);
3155 /* Only assigning block ids to linux netdevs that are LAG masters. */
3156 if (netdev
->is_lag_master
) {
3157 block_id
= netdev
->ifindex
;
3159 ovs_mutex_unlock(&netdev
->mutex
);
3164 /* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
3165 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
3166 * returns 0. Otherwise, it returns a positive errno value; in particular,
3167 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
3169 netdev_linux_arp_lookup(const struct netdev
*netdev
,
3170 ovs_be32 ip
, struct eth_addr
*mac
)
3173 struct sockaddr_in sin
;
3176 memset(&r
, 0, sizeof r
);
3177 memset(&sin
, 0, sizeof sin
);
3178 sin
.sin_family
= AF_INET
;
3179 sin
.sin_addr
.s_addr
= ip
;
3181 memcpy(&r
.arp_pa
, &sin
, sizeof sin
);
3182 r
.arp_ha
.sa_family
= ARPHRD_ETHER
;
3184 ovs_strzcpy(r
.arp_dev
, netdev_get_name(netdev
), sizeof r
.arp_dev
);
3185 COVERAGE_INC(netdev_arp_lookup
);
3186 retval
= af_inet_ioctl(SIOCGARP
, &r
);
3188 memcpy(mac
, r
.arp_ha
.sa_data
, ETH_ADDR_LEN
);
3189 } else if (retval
!= ENXIO
) {
3190 VLOG_WARN_RL(&rl
, "%s: could not look up ARP entry for "IP_FMT
": %s",
3191 netdev_get_name(netdev
), IP_ARGS(ip
),
3192 ovs_strerror(retval
));
3198 nd_to_iff_flags(enum netdev_flags nd
)
3200 unsigned int iff
= 0;
3201 if (nd
& NETDEV_UP
) {
3204 if (nd
& NETDEV_PROMISC
) {
3207 if (nd
& NETDEV_LOOPBACK
) {
3208 iff
|= IFF_LOOPBACK
;
3214 iff_to_nd_flags(unsigned int iff
)
3216 enum netdev_flags nd
= 0;
3220 if (iff
& IFF_PROMISC
) {
3221 nd
|= NETDEV_PROMISC
;
3223 if (iff
& IFF_LOOPBACK
) {
3224 nd
|= NETDEV_LOOPBACK
;
3230 update_flags(struct netdev_linux
*netdev
, enum netdev_flags off
,
3231 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
3232 OVS_REQUIRES(netdev
->mutex
)
3234 unsigned int old_flags
, new_flags
;
3237 old_flags
= netdev
->ifi_flags
;
3238 *old_flagsp
= iff_to_nd_flags(old_flags
);
3239 new_flags
= (old_flags
& ~nd_to_iff_flags(off
)) | nd_to_iff_flags(on
);
3240 if (new_flags
!= old_flags
) {
3241 error
= set_flags(netdev_get_name(&netdev
->up
), new_flags
);
3242 get_flags(&netdev
->up
, &netdev
->ifi_flags
);
3249 netdev_linux_update_flags(struct netdev
*netdev_
, enum netdev_flags off
,
3250 enum netdev_flags on
, enum netdev_flags
*old_flagsp
)
3252 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3255 ovs_mutex_lock(&netdev
->mutex
);
3257 /* Changing flags over netlink isn't support yet. */
3258 if (netdev_linux_netnsid_is_remote(netdev
)) {
3262 error
= update_flags(netdev
, off
, on
, old_flagsp
);
3264 /* Try reading flags over netlink, or fall back to ioctl. */
3265 if (!netdev_linux_update_via_netlink(netdev
)) {
3266 *old_flagsp
= iff_to_nd_flags(netdev
->ifi_flags
);
3268 error
= update_flags(netdev
, off
, on
, old_flagsp
);
3273 ovs_mutex_unlock(&netdev
->mutex
);
/* netdev_class members shared by every Linux-backed netdev class (system,
 * tap, internal).  Each class then adds its own construct/stats/features/
 * status members on top of this common set. */
#define NETDEV_LINUX_CLASS_COMMON                               \
    .run = netdev_linux_run,                                    \
    .wait = netdev_linux_wait,                                  \
    .alloc = netdev_linux_alloc,                                \
    .destruct = netdev_linux_destruct,                          \
    .dealloc = netdev_linux_dealloc,                            \
    .send = netdev_linux_send,                                  \
    .send_wait = netdev_linux_send_wait,                        \
    .set_etheraddr = netdev_linux_set_etheraddr,                \
    .get_etheraddr = netdev_linux_get_etheraddr,                \
    .get_mtu = netdev_linux_get_mtu,                            \
    .set_mtu = netdev_linux_set_mtu,                            \
    .get_ifindex = netdev_linux_get_ifindex,                    \
    .get_carrier = netdev_linux_get_carrier,                    \
    .get_carrier_resets = netdev_linux_get_carrier_resets,      \
    .set_miimon_interval = netdev_linux_set_miimon_interval,    \
    .set_advertisements = netdev_linux_set_advertisements,      \
    .set_policing = netdev_linux_set_policing,                  \
    .get_qos_types = netdev_linux_get_qos_types,                \
    .get_qos_capabilities = netdev_linux_get_qos_capabilities,  \
    .get_qos = netdev_linux_get_qos,                            \
    .set_qos = netdev_linux_set_qos,                            \
    .get_queue = netdev_linux_get_queue,                        \
    .set_queue = netdev_linux_set_queue,                        \
    .delete_queue = netdev_linux_delete_queue,                  \
    .get_queue_stats = netdev_linux_get_queue_stats,            \
    .queue_dump_start = netdev_linux_queue_dump_start,          \
    .queue_dump_next = netdev_linux_queue_dump_next,            \
    .queue_dump_done = netdev_linux_queue_dump_done,            \
    .dump_queue_stats = netdev_linux_dump_queue_stats,          \
    .set_in4 = netdev_linux_set_in4,                            \
    .get_addr_list = netdev_linux_get_addr_list,                \
    .add_router = netdev_linux_add_router,                      \
    .get_next_hop = netdev_linux_get_next_hop,                  \
    .arp_lookup = netdev_linux_arp_lookup,                      \
    .update_flags = netdev_linux_update_flags,                  \
    .rxq_alloc = netdev_linux_rxq_alloc,                        \
    .rxq_construct = netdev_linux_rxq_construct,                \
    .rxq_destruct = netdev_linux_rxq_destruct,                  \
    .rxq_dealloc = netdev_linux_rxq_dealloc,                    \
    .rxq_recv = netdev_linux_rxq_recv,                          \
    .rxq_wait = netdev_linux_rxq_wait,                          \
    .rxq_drain = netdev_linux_rxq_drain
3321 const struct netdev_class netdev_linux_class
= {
3322 NETDEV_LINUX_CLASS_COMMON
,
3323 LINUX_FLOW_OFFLOAD_API
,
3325 .construct
= netdev_linux_construct
,
3326 .get_stats
= netdev_linux_get_stats
,
3327 .get_features
= netdev_linux_get_features
,
3328 .get_status
= netdev_linux_get_status
,
3329 .get_block_id
= netdev_linux_get_block_id
3332 const struct netdev_class netdev_tap_class
= {
3333 NETDEV_LINUX_CLASS_COMMON
,
3335 .construct
= netdev_linux_construct_tap
,
3336 .get_stats
= netdev_tap_get_stats
,
3337 .get_features
= netdev_linux_get_features
,
3338 .get_status
= netdev_linux_get_status
,
3341 const struct netdev_class netdev_internal_class
= {
3342 NETDEV_LINUX_CLASS_COMMON
,
3343 LINUX_FLOW_OFFLOAD_API
,
3345 .construct
= netdev_linux_construct
,
3346 .get_stats
= netdev_internal_get_stats
,
3347 .get_status
= netdev_internal_get_status
,
3351 #define CODEL_N_QUEUES 0x0000
3353 /* In sufficiently new kernel headers these are defined as enums in
3354 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3355 * kernels. (This overrides any enum definition in the header file but that's
3357 #define TCA_CODEL_TARGET 1
3358 #define TCA_CODEL_LIMIT 2
3359 #define TCA_CODEL_INTERVAL 3
3368 static struct codel
*
3369 codel_get__(const struct netdev
*netdev_
)
3371 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3372 return CONTAINER_OF(netdev
->tc
, struct codel
, tc
);
3376 codel_install__(struct netdev
*netdev_
, uint32_t target
, uint32_t limit
,
3379 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3380 struct codel
*codel
;
3382 codel
= xmalloc(sizeof *codel
);
3383 tc_init(&codel
->tc
, &tc_ops_codel
);
3384 codel
->target
= target
;
3385 codel
->limit
= limit
;
3386 codel
->interval
= interval
;
3388 netdev
->tc
= &codel
->tc
;
3392 codel_setup_qdisc__(struct netdev
*netdev
, uint32_t target
, uint32_t limit
,
3396 struct ofpbuf request
;
3397 struct tcmsg
*tcmsg
;
3398 uint32_t otarget
, olimit
, ointerval
;
3401 tc_del_qdisc(netdev
);
3403 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
3404 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3408 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3409 tcmsg
->tcm_parent
= TC_H_ROOT
;
3411 otarget
= target
? target
: 5000;
3412 olimit
= limit
? limit
: 10240;
3413 ointerval
= interval
? interval
: 100000;
3415 nl_msg_put_string(&request
, TCA_KIND
, "codel");
3416 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
3417 nl_msg_put_u32(&request
, TCA_CODEL_TARGET
, otarget
);
3418 nl_msg_put_u32(&request
, TCA_CODEL_LIMIT
, olimit
);
3419 nl_msg_put_u32(&request
, TCA_CODEL_INTERVAL
, ointerval
);
3420 nl_msg_end_nested(&request
, opt_offset
);
3422 error
= tc_transact(&request
, NULL
);
3424 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
3425 "target %u, limit %u, interval %u error %d(%s)",
3426 netdev_get_name(netdev
),
3427 otarget
, olimit
, ointerval
,
3428 error
, ovs_strerror(error
));
3434 codel_parse_qdisc_details__(struct netdev
*netdev OVS_UNUSED
,
3435 const struct smap
*details
, struct codel
*codel
)
3437 codel
->target
= smap_get_ullong(details
, "target", 0);
3438 codel
->limit
= smap_get_ullong(details
, "limit", 0);
3439 codel
->interval
= smap_get_ullong(details
, "interval", 0);
3441 if (!codel
->target
) {
3442 codel
->target
= 5000;
3444 if (!codel
->limit
) {
3445 codel
->limit
= 10240;
3447 if (!codel
->interval
) {
3448 codel
->interval
= 100000;
3453 codel_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3458 codel_parse_qdisc_details__(netdev
, details
, &codel
);
3459 error
= codel_setup_qdisc__(netdev
, codel
.target
, codel
.limit
,
3462 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
3468 codel_parse_tca_options__(struct nlattr
*nl_options
, struct codel
*codel
)
3470 static const struct nl_policy tca_codel_policy
[] = {
3471 [TCA_CODEL_TARGET
] = { .type
= NL_A_U32
},
3472 [TCA_CODEL_LIMIT
] = { .type
= NL_A_U32
},
3473 [TCA_CODEL_INTERVAL
] = { .type
= NL_A_U32
}
3476 struct nlattr
*attrs
[ARRAY_SIZE(tca_codel_policy
)];
3478 if (!nl_parse_nested(nl_options
, tca_codel_policy
,
3479 attrs
, ARRAY_SIZE(tca_codel_policy
))) {
3480 VLOG_WARN_RL(&rl
, "failed to parse CoDel class options");
3484 codel
->target
= nl_attr_get_u32(attrs
[TCA_CODEL_TARGET
]);
3485 codel
->limit
= nl_attr_get_u32(attrs
[TCA_CODEL_LIMIT
]);
3486 codel
->interval
= nl_attr_get_u32(attrs
[TCA_CODEL_INTERVAL
]);
3491 codel_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
3493 struct nlattr
*nlattr
;
3498 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
3503 error
= codel_parse_tca_options__(nlattr
, &codel
);
3508 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
3514 codel_tc_destroy(struct tc
*tc
)
3516 struct codel
*codel
= CONTAINER_OF(tc
, struct codel
, tc
);
3522 codel_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
3524 const struct codel
*codel
= codel_get__(netdev
);
3525 smap_add_format(details
, "target", "%u", codel
->target
);
3526 smap_add_format(details
, "limit", "%u", codel
->limit
);
3527 smap_add_format(details
, "interval", "%u", codel
->interval
);
3532 codel_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
3536 codel_parse_qdisc_details__(netdev
, details
, &codel
);
3537 codel_install__(netdev
, codel
.target
, codel
.limit
, codel
.interval
);
3538 codel_get__(netdev
)->target
= codel
.target
;
3539 codel_get__(netdev
)->limit
= codel
.limit
;
3540 codel_get__(netdev
)->interval
= codel
.interval
;
3544 static const struct tc_ops tc_ops_codel
= {
3545 .linux_name
= "codel",
3546 .ovs_name
= "linux-codel",
3547 .n_queues
= CODEL_N_QUEUES
,
3548 .tc_install
= codel_tc_install
,
3549 .tc_load
= codel_tc_load
,
3550 .tc_destroy
= codel_tc_destroy
,
3551 .qdisc_get
= codel_qdisc_get
,
3552 .qdisc_set
= codel_qdisc_set
,
3555 /* FQ-CoDel traffic control class. */
3557 #define FQCODEL_N_QUEUES 0x0000
3559 /* In sufficiently new kernel headers these are defined as enums in
3560 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3561 * kernels. (This overrides any enum definition in the header file but that's
3563 #define TCA_FQ_CODEL_TARGET 1
3564 #define TCA_FQ_CODEL_LIMIT 2
3565 #define TCA_FQ_CODEL_INTERVAL 3
3566 #define TCA_FQ_CODEL_ECN 4
3567 #define TCA_FQ_CODEL_FLOWS 5
3568 #define TCA_FQ_CODEL_QUANTUM 6
3579 static struct fqcodel
*
3580 fqcodel_get__(const struct netdev
*netdev_
)
3582 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3583 return CONTAINER_OF(netdev
->tc
, struct fqcodel
, tc
);
3587 fqcodel_install__(struct netdev
*netdev_
, uint32_t target
, uint32_t limit
,
3588 uint32_t interval
, uint32_t flows
, uint32_t quantum
)
3590 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3591 struct fqcodel
*fqcodel
;
3593 fqcodel
= xmalloc(sizeof *fqcodel
);
3594 tc_init(&fqcodel
->tc
, &tc_ops_fqcodel
);
3595 fqcodel
->target
= target
;
3596 fqcodel
->limit
= limit
;
3597 fqcodel
->interval
= interval
;
3598 fqcodel
->flows
= flows
;
3599 fqcodel
->quantum
= quantum
;
3601 netdev
->tc
= &fqcodel
->tc
;
3605 fqcodel_setup_qdisc__(struct netdev
*netdev
, uint32_t target
, uint32_t limit
,
3606 uint32_t interval
, uint32_t flows
, uint32_t quantum
)
3609 struct ofpbuf request
;
3610 struct tcmsg
*tcmsg
;
3611 uint32_t otarget
, olimit
, ointerval
, oflows
, oquantum
;
3614 tc_del_qdisc(netdev
);
3616 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
3617 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3621 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3622 tcmsg
->tcm_parent
= TC_H_ROOT
;
3624 otarget
= target
? target
: 5000;
3625 olimit
= limit
? limit
: 10240;
3626 ointerval
= interval
? interval
: 100000;
3627 oflows
= flows
? flows
: 1024;
3628 oquantum
= quantum
? quantum
: 1514; /* fq_codel default quantum is 1514
3631 nl_msg_put_string(&request
, TCA_KIND
, "fq_codel");
3632 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
3633 nl_msg_put_u32(&request
, TCA_FQ_CODEL_TARGET
, otarget
);
3634 nl_msg_put_u32(&request
, TCA_FQ_CODEL_LIMIT
, olimit
);
3635 nl_msg_put_u32(&request
, TCA_FQ_CODEL_INTERVAL
, ointerval
);
3636 nl_msg_put_u32(&request
, TCA_FQ_CODEL_FLOWS
, oflows
);
3637 nl_msg_put_u32(&request
, TCA_FQ_CODEL_QUANTUM
, oquantum
);
3638 nl_msg_end_nested(&request
, opt_offset
);
3640 error
= tc_transact(&request
, NULL
);
3642 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
3643 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3644 netdev_get_name(netdev
),
3645 otarget
, olimit
, ointerval
, oflows
, oquantum
,
3646 error
, ovs_strerror(error
));
3652 fqcodel_parse_qdisc_details__(struct netdev
*netdev OVS_UNUSED
,
3653 const struct smap
*details
, struct fqcodel
*fqcodel
)
3655 fqcodel
->target
= smap_get_ullong(details
, "target", 0);
3656 fqcodel
->limit
= smap_get_ullong(details
, "limit", 0);
3657 fqcodel
->interval
= smap_get_ullong(details
, "interval", 0);
3658 fqcodel
->flows
= smap_get_ullong(details
, "flows", 0);
3659 fqcodel
->quantum
= smap_get_ullong(details
, "quantum", 0);
3661 if (!fqcodel
->target
) {
3662 fqcodel
->target
= 5000;
3664 if (!fqcodel
->limit
) {
3665 fqcodel
->limit
= 10240;
3667 if (!fqcodel
->interval
) {
3668 fqcodel
->interval
= 1000000;
3670 if (!fqcodel
->flows
) {
3671 fqcodel
->flows
= 1024;
3673 if (!fqcodel
->quantum
) {
3674 fqcodel
->quantum
= 1514;
3679 fqcodel_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3682 struct fqcodel fqcodel
;
3684 fqcodel_parse_qdisc_details__(netdev
, details
, &fqcodel
);
3685 error
= fqcodel_setup_qdisc__(netdev
, fqcodel
.target
, fqcodel
.limit
,
3686 fqcodel
.interval
, fqcodel
.flows
,
3689 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
,
3690 fqcodel
.interval
, fqcodel
.flows
, fqcodel
.quantum
);
3696 fqcodel_parse_tca_options__(struct nlattr
*nl_options
, struct fqcodel
*fqcodel
)
3698 static const struct nl_policy tca_fqcodel_policy
[] = {
3699 [TCA_FQ_CODEL_TARGET
] = { .type
= NL_A_U32
},
3700 [TCA_FQ_CODEL_LIMIT
] = { .type
= NL_A_U32
},
3701 [TCA_FQ_CODEL_INTERVAL
] = { .type
= NL_A_U32
},
3702 [TCA_FQ_CODEL_FLOWS
] = { .type
= NL_A_U32
},
3703 [TCA_FQ_CODEL_QUANTUM
] = { .type
= NL_A_U32
}
3706 struct nlattr
*attrs
[ARRAY_SIZE(tca_fqcodel_policy
)];
3708 if (!nl_parse_nested(nl_options
, tca_fqcodel_policy
,
3709 attrs
, ARRAY_SIZE(tca_fqcodel_policy
))) {
3710 VLOG_WARN_RL(&rl
, "failed to parse FQ_CoDel class options");
3714 fqcodel
->target
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_TARGET
]);
3715 fqcodel
->limit
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_LIMIT
]);
3716 fqcodel
->interval
=nl_attr_get_u32(attrs
[TCA_FQ_CODEL_INTERVAL
]);
3717 fqcodel
->flows
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_FLOWS
]);
3718 fqcodel
->quantum
= nl_attr_get_u32(attrs
[TCA_FQ_CODEL_QUANTUM
]);
3723 fqcodel_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
3725 struct nlattr
*nlattr
;
3728 struct fqcodel fqcodel
;
3730 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
3735 error
= fqcodel_parse_tca_options__(nlattr
, &fqcodel
);
3740 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
, fqcodel
.interval
,
3741 fqcodel
.flows
, fqcodel
.quantum
);
3746 fqcodel_tc_destroy(struct tc
*tc
)
3748 struct fqcodel
*fqcodel
= CONTAINER_OF(tc
, struct fqcodel
, tc
);
3754 fqcodel_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
3756 const struct fqcodel
*fqcodel
= fqcodel_get__(netdev
);
3757 smap_add_format(details
, "target", "%u", fqcodel
->target
);
3758 smap_add_format(details
, "limit", "%u", fqcodel
->limit
);
3759 smap_add_format(details
, "interval", "%u", fqcodel
->interval
);
3760 smap_add_format(details
, "flows", "%u", fqcodel
->flows
);
3761 smap_add_format(details
, "quantum", "%u", fqcodel
->quantum
);
3766 fqcodel_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
3768 struct fqcodel fqcodel
;
3770 fqcodel_parse_qdisc_details__(netdev
, details
, &fqcodel
);
3771 fqcodel_install__(netdev
, fqcodel
.target
, fqcodel
.limit
, fqcodel
.interval
,
3772 fqcodel
.flows
, fqcodel
.quantum
);
3773 fqcodel_get__(netdev
)->target
= fqcodel
.target
;
3774 fqcodel_get__(netdev
)->limit
= fqcodel
.limit
;
3775 fqcodel_get__(netdev
)->interval
= fqcodel
.interval
;
3776 fqcodel_get__(netdev
)->flows
= fqcodel
.flows
;
3777 fqcodel_get__(netdev
)->quantum
= fqcodel
.quantum
;
3781 static const struct tc_ops tc_ops_fqcodel
= {
3782 .linux_name
= "fq_codel",
3783 .ovs_name
= "linux-fq_codel",
3784 .n_queues
= FQCODEL_N_QUEUES
,
3785 .tc_install
= fqcodel_tc_install
,
3786 .tc_load
= fqcodel_tc_load
,
3787 .tc_destroy
= fqcodel_tc_destroy
,
3788 .qdisc_get
= fqcodel_qdisc_get
,
3789 .qdisc_set
= fqcodel_qdisc_set
,
3792 /* SFQ traffic control class. */
3794 #define SFQ_N_QUEUES 0x0000
3803 sfq_get__(const struct netdev
*netdev_
)
3805 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3806 return CONTAINER_OF(netdev
->tc
, struct sfq
, tc
);
3810 sfq_install__(struct netdev
*netdev_
, uint32_t quantum
, uint32_t perturb
)
3812 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3815 sfq
= xmalloc(sizeof *sfq
);
3816 tc_init(&sfq
->tc
, &tc_ops_sfq
);
3817 sfq
->perturb
= perturb
;
3818 sfq
->quantum
= quantum
;
3820 netdev
->tc
= &sfq
->tc
;
3824 sfq_setup_qdisc__(struct netdev
*netdev
, uint32_t quantum
, uint32_t perturb
)
3826 struct tc_sfq_qopt opt
;
3827 struct ofpbuf request
;
3828 struct tcmsg
*tcmsg
;
3830 int mtu_error
, error
;
3831 mtu_error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
3833 tc_del_qdisc(netdev
);
3835 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
3836 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
3840 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
3841 tcmsg
->tcm_parent
= TC_H_ROOT
;
3843 memset(&opt
, 0, sizeof opt
);
3846 opt
.quantum
= mtu
; /* if we cannot find mtu, use default */
3849 opt
.quantum
= quantum
;
3853 opt
.perturb_period
= 10;
3855 opt
.perturb_period
= perturb
;
3858 nl_msg_put_string(&request
, TCA_KIND
, "sfq");
3859 nl_msg_put_unspec(&request
, TCA_OPTIONS
, &opt
, sizeof opt
);
3861 error
= tc_transact(&request
, NULL
);
3863 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
3864 "quantum %u, perturb %u error %d(%s)",
3865 netdev_get_name(netdev
),
3866 opt
.quantum
, opt
.perturb_period
,
3867 error
, ovs_strerror(error
));
3873 sfq_parse_qdisc_details__(struct netdev
*netdev
,
3874 const struct smap
*details
, struct sfq
*sfq
)
3876 sfq
->perturb
= smap_get_ullong(details
, "perturb", 0);
3877 sfq
->quantum
= smap_get_ullong(details
, "quantum", 0);
3879 if (!sfq
->perturb
) {
3883 if (!sfq
->quantum
) {
3885 if (!netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
)) {
3888 VLOG_WARN_RL(&rl
, "when using SFQ, you must specify quantum on a "
3889 "device without mtu");
3895 sfq_tc_install(struct netdev
*netdev
, const struct smap
*details
)
3900 sfq_parse_qdisc_details__(netdev
, details
, &sfq
);
3901 error
= sfq_setup_qdisc__(netdev
, sfq
.quantum
, sfq
.perturb
);
3903 sfq_install__(netdev
, sfq
.quantum
, sfq
.perturb
);
3909 sfq_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
3911 const struct tc_sfq_qopt
*sfq
;
3912 struct nlattr
*nlattr
;
3916 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
3918 sfq
= nl_attr_get(nlattr
);
3919 sfq_install__(netdev
, sfq
->quantum
, sfq
->perturb_period
);
3927 sfq_tc_destroy(struct tc
*tc
)
3929 struct sfq
*sfq
= CONTAINER_OF(tc
, struct sfq
, tc
);
3935 sfq_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
3937 const struct sfq
*sfq
= sfq_get__(netdev
);
3938 smap_add_format(details
, "quantum", "%u", sfq
->quantum
);
3939 smap_add_format(details
, "perturb", "%u", sfq
->perturb
);
3944 sfq_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
3948 sfq_parse_qdisc_details__(netdev
, details
, &sfq
);
3949 sfq_install__(netdev
, sfq
.quantum
, sfq
.perturb
);
3950 sfq_get__(netdev
)->quantum
= sfq
.quantum
;
3951 sfq_get__(netdev
)->perturb
= sfq
.perturb
;
3955 static const struct tc_ops tc_ops_sfq
= {
3956 .linux_name
= "sfq",
3957 .ovs_name
= "linux-sfq",
3958 .n_queues
= SFQ_N_QUEUES
,
3959 .tc_install
= sfq_tc_install
,
3960 .tc_load
= sfq_tc_load
,
3961 .tc_destroy
= sfq_tc_destroy
,
3962 .qdisc_get
= sfq_qdisc_get
,
3963 .qdisc_set
= sfq_qdisc_set
,
3966 /* netem traffic control class. */
3975 static struct netem
*
3976 netem_get__(const struct netdev
*netdev_
)
3978 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3979 return CONTAINER_OF(netdev
->tc
, struct netem
, tc
);
3983 netem_install__(struct netdev
*netdev_
, uint32_t latency
,
3984 uint32_t limit
, uint32_t loss
)
3986 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
3987 struct netem
*netem
;
3989 netem
= xmalloc(sizeof *netem
);
3990 tc_init(&netem
->tc
, &tc_ops_netem
);
3991 netem
->latency
= latency
;
3992 netem
->limit
= limit
;
3995 netdev
->tc
= &netem
->tc
;
3999 netem_setup_qdisc__(struct netdev
*netdev
, uint32_t latency
,
4000 uint32_t limit
, uint32_t loss
)
4002 struct tc_netem_qopt opt
;
4003 struct ofpbuf request
;
4004 struct tcmsg
*tcmsg
;
4007 tc_del_qdisc(netdev
);
4009 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
4010 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
4014 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
4015 tcmsg
->tcm_parent
= TC_H_ROOT
;
4017 memset(&opt
, 0, sizeof opt
);
4028 "loss should be a percentage value between 0 to 100, "
4029 "loss was %u", loss
);
4032 opt
.loss
= floor(UINT32_MAX
* (loss
/ 100.0));
4035 opt
.latency
= tc_time_to_ticks(latency
);
4037 nl_msg_put_string(&request
, TCA_KIND
, "netem");
4038 nl_msg_put_unspec(&request
, TCA_OPTIONS
, &opt
, sizeof opt
);
4040 error
= tc_transact(&request
, NULL
);
4042 VLOG_WARN_RL(&rl
, "failed to replace %s qdisc, "
4043 "latency %u, limit %u, loss %u error %d(%s)",
4044 netdev_get_name(netdev
),
4045 opt
.latency
, opt
.limit
, opt
.loss
,
4046 error
, ovs_strerror(error
));
4052 netem_parse_qdisc_details__(struct netdev
*netdev OVS_UNUSED
,
4053 const struct smap
*details
, struct netem
*netem
)
4055 netem
->latency
= smap_get_ullong(details
, "latency", 0);
4056 netem
->limit
= smap_get_ullong(details
, "limit", 0);
4057 netem
->loss
= smap_get_ullong(details
, "loss", 0);
4059 if (!netem
->limit
) {
4060 netem
->limit
= 1000;
4065 netem_tc_install(struct netdev
*netdev
, const struct smap
*details
)
4070 netem_parse_qdisc_details__(netdev
, details
, &netem
);
4071 error
= netem_setup_qdisc__(netdev
, netem
.latency
,
4072 netem
.limit
, netem
.loss
);
4074 netem_install__(netdev
, netem
.latency
, netem
.limit
, netem
.loss
);
4080 netem_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg
)
4082 const struct tc_netem_qopt
*netem
;
4083 struct nlattr
*nlattr
;
4087 error
= tc_parse_qdisc(nlmsg
, &kind
, &nlattr
);
4089 netem
= nl_attr_get(nlattr
);
4090 netem_install__(netdev
, netem
->latency
, netem
->limit
, netem
->loss
);
4098 netem_tc_destroy(struct tc
*tc
)
4100 struct netem
*netem
= CONTAINER_OF(tc
, struct netem
, tc
);
4106 netem_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
4108 const struct netem
*netem
= netem_get__(netdev
);
4109 smap_add_format(details
, "latency", "%u", netem
->latency
);
4110 smap_add_format(details
, "limit", "%u", netem
->limit
);
4111 smap_add_format(details
, "loss", "%u", netem
->loss
);
4116 netem_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
4120 netem_parse_qdisc_details__(netdev
, details
, &netem
);
4121 netem_install__(netdev
, netem
.latency
, netem
.limit
, netem
.loss
);
4122 netem_get__(netdev
)->latency
= netem
.latency
;
4123 netem_get__(netdev
)->limit
= netem
.limit
;
4124 netem_get__(netdev
)->loss
= netem
.loss
;
4128 static const struct tc_ops tc_ops_netem
= {
4129 .linux_name
= "netem",
4130 .ovs_name
= "linux-netem",
4132 .tc_install
= netem_tc_install
,
4133 .tc_load
= netem_tc_load
,
4134 .tc_destroy
= netem_tc_destroy
,
4135 .qdisc_get
= netem_qdisc_get
,
4136 .qdisc_set
= netem_qdisc_set
,
4139 /* HTB traffic control class. */
4141 #define HTB_N_QUEUES 0xf000
4142 #define HTB_RATE2QUANTUM 10
4146 unsigned int max_rate
; /* In bytes/s. */
4150 struct tc_queue tc_queue
;
4151 unsigned int min_rate
; /* In bytes/s. */
4152 unsigned int max_rate
; /* In bytes/s. */
4153 unsigned int burst
; /* In bytes. */
4154 unsigned int priority
; /* Lower values are higher priorities. */
4158 htb_get__(const struct netdev
*netdev_
)
4160 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4161 return CONTAINER_OF(netdev
->tc
, struct htb
, tc
);
4165 htb_install__(struct netdev
*netdev_
, uint64_t max_rate
)
4167 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4170 htb
= xmalloc(sizeof *htb
);
4171 tc_init(&htb
->tc
, &tc_ops_htb
);
4172 htb
->max_rate
= max_rate
;
4174 netdev
->tc
= &htb
->tc
;
4177 /* Create an HTB qdisc.
4179 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
4181 htb_setup_qdisc__(struct netdev
*netdev
)
4184 struct tc_htb_glob opt
;
4185 struct ofpbuf request
;
4186 struct tcmsg
*tcmsg
;
4188 tc_del_qdisc(netdev
);
4190 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
4191 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
4195 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
4196 tcmsg
->tcm_parent
= TC_H_ROOT
;
4198 nl_msg_put_string(&request
, TCA_KIND
, "htb");
4200 memset(&opt
, 0, sizeof opt
);
4201 opt
.rate2quantum
= HTB_RATE2QUANTUM
;
4205 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
4206 nl_msg_put_unspec(&request
, TCA_HTB_INIT
, &opt
, sizeof opt
);
4207 nl_msg_end_nested(&request
, opt_offset
);
4209 return tc_transact(&request
, NULL
);
4212 /* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
4213 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
4215 htb_setup_class__(struct netdev
*netdev
, unsigned int handle
,
4216 unsigned int parent
, struct htb_class
*class)
4219 struct tc_htb_opt opt
;
4220 struct ofpbuf request
;
4221 struct tcmsg
*tcmsg
;
4225 error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
4227 VLOG_WARN_RL(&rl
, "cannot set up HTB on device %s that lacks MTU",
4228 netdev_get_name(netdev
));
4232 memset(&opt
, 0, sizeof opt
);
4233 tc_fill_rate(&opt
.rate
, class->min_rate
, mtu
);
4234 tc_fill_rate(&opt
.ceil
, class->max_rate
, mtu
);
4235 /* Makes sure the quantum is at least MTU. Setting quantum will
4236 * make htb ignore the r2q for this class. */
4237 if ((class->min_rate
/ HTB_RATE2QUANTUM
) < mtu
) {
4240 opt
.buffer
= tc_calc_buffer(opt
.rate
.rate
, mtu
, class->burst
);
4241 opt
.cbuffer
= tc_calc_buffer(opt
.ceil
.rate
, mtu
, class->burst
);
4242 opt
.prio
= class->priority
;
4244 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWTCLASS
, NLM_F_CREATE
,
4249 tcmsg
->tcm_handle
= handle
;
4250 tcmsg
->tcm_parent
= parent
;
4252 nl_msg_put_string(&request
, TCA_KIND
, "htb");
4253 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
4254 nl_msg_put_unspec(&request
, TCA_HTB_PARMS
, &opt
, sizeof opt
);
4255 tc_put_rtab(&request
, TCA_HTB_RTAB
, &opt
.rate
);
4256 tc_put_rtab(&request
, TCA_HTB_CTAB
, &opt
.ceil
);
4257 nl_msg_end_nested(&request
, opt_offset
);
4259 error
= tc_transact(&request
, NULL
);
4261 VLOG_WARN_RL(&rl
, "failed to replace %s class %u:%u, parent %u:%u, "
4262 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
4263 netdev_get_name(netdev
),
4264 tc_get_major(handle
), tc_get_minor(handle
),
4265 tc_get_major(parent
), tc_get_minor(parent
),
4266 class->min_rate
, class->max_rate
,
4267 class->burst
, class->priority
, ovs_strerror(error
));
4272 /* Parses Netlink attributes in 'options' for HTB parameters and stores a
4273 * description of them into 'details'. The description complies with the
4274 * specification given in the vswitch database documentation for linux-htb
4277 htb_parse_tca_options__(struct nlattr
*nl_options
, struct htb_class
*class)
4279 static const struct nl_policy tca_htb_policy
[] = {
4280 [TCA_HTB_PARMS
] = { .type
= NL_A_UNSPEC
, .optional
= false,
4281 .min_len
= sizeof(struct tc_htb_opt
) },
4284 struct nlattr
*attrs
[ARRAY_SIZE(tca_htb_policy
)];
4285 const struct tc_htb_opt
*htb
;
4287 if (!nl_parse_nested(nl_options
, tca_htb_policy
,
4288 attrs
, ARRAY_SIZE(tca_htb_policy
))) {
4289 VLOG_WARN_RL(&rl
, "failed to parse HTB class options");
4293 htb
= nl_attr_get(attrs
[TCA_HTB_PARMS
]);
4294 class->min_rate
= htb
->rate
.rate
;
4295 class->max_rate
= htb
->ceil
.rate
;
4296 class->burst
= tc_ticks_to_bytes(htb
->rate
.rate
, htb
->buffer
);
4297 class->priority
= htb
->prio
;
4302 htb_parse_tcmsg__(struct ofpbuf
*tcmsg
, unsigned int *queue_id
,
4303 struct htb_class
*options
,
4304 struct netdev_queue_stats
*stats
)
4306 struct nlattr
*nl_options
;
4307 unsigned int handle
;
4310 error
= tc_parse_class(tcmsg
, &handle
, &nl_options
, stats
);
4311 if (!error
&& queue_id
) {
4312 unsigned int major
= tc_get_major(handle
);
4313 unsigned int minor
= tc_get_minor(handle
);
4314 if (major
== 1 && minor
> 0 && minor
<= HTB_N_QUEUES
) {
4315 *queue_id
= minor
- 1;
4320 if (!error
&& options
) {
4321 error
= htb_parse_tca_options__(nl_options
, options
);
4327 htb_parse_qdisc_details__(struct netdev
*netdev_
,
4328 const struct smap
*details
, struct htb_class
*hc
)
4330 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4332 hc
->max_rate
= smap_get_ullong(details
, "max-rate", 0) / 8;
4333 if (!hc
->max_rate
) {
4334 enum netdev_features current
;
4336 netdev_linux_read_features(netdev
);
4337 current
= !netdev
->get_features_error
? netdev
->current
: 0;
4338 hc
->max_rate
= netdev_features_to_bps(current
, 100 * 1000 * 1000) / 8;
4340 hc
->min_rate
= hc
->max_rate
;
4346 htb_parse_class_details__(struct netdev
*netdev
,
4347 const struct smap
*details
, struct htb_class
*hc
)
4349 const struct htb
*htb
= htb_get__(netdev
);
4351 unsigned long long int max_rate_bit
;
4353 error
= netdev_linux_get_mtu__(netdev_linux_cast(netdev
), &mtu
);
4355 VLOG_WARN_RL(&rl
, "cannot parse HTB class on device %s that lacks MTU",
4356 netdev_get_name(netdev
));
4360 /* HTB requires at least an mtu sized min-rate to send any traffic even
4361 * on uncongested links. */
4362 hc
->min_rate
= smap_get_ullong(details
, "min-rate", 0) / 8;
4363 hc
->min_rate
= MAX(hc
->min_rate
, mtu
);
4364 hc
->min_rate
= MIN(hc
->min_rate
, htb
->max_rate
);
4367 max_rate_bit
= smap_get_ullong(details
, "max-rate", 0);
4368 hc
->max_rate
= max_rate_bit
? max_rate_bit
/ 8 : htb
->max_rate
;
4369 hc
->max_rate
= MAX(hc
->max_rate
, hc
->min_rate
);
4370 hc
->max_rate
= MIN(hc
->max_rate
, htb
->max_rate
);
4374 * According to hints in the documentation that I've read, it is important
4375 * that 'burst' be at least as big as the largest frame that might be
4376 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
4377 * but having it a bit too small is a problem. Since netdev_get_mtu()
4378 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
4379 * the MTU. We actually add 64, instead of 14, as a guard against
4380 * additional headers get tacked on somewhere that we're not aware of. */
4381 hc
->burst
= smap_get_ullong(details
, "burst", 0) / 8;
4382 hc
->burst
= MAX(hc
->burst
, mtu
+ 64);
4385 hc
->priority
= smap_get_ullong(details
, "priority", 0);
4391 htb_query_class__(const struct netdev
*netdev
, unsigned int handle
,
4392 unsigned int parent
, struct htb_class
*options
,
4393 struct netdev_queue_stats
*stats
)
4395 struct ofpbuf
*reply
;
4398 error
= tc_query_class(netdev
, handle
, parent
, &reply
);
4400 error
= htb_parse_tcmsg__(reply
, NULL
, options
, stats
);
4401 ofpbuf_delete(reply
);
4407 htb_tc_install(struct netdev
*netdev
, const struct smap
*details
)
4411 error
= htb_setup_qdisc__(netdev
);
4413 struct htb_class hc
;
4415 htb_parse_qdisc_details__(netdev
, details
, &hc
);
4416 error
= htb_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
4417 tc_make_handle(1, 0), &hc
);
4419 htb_install__(netdev
, hc
.max_rate
);
4425 static struct htb_class
*
4426 htb_class_cast__(const struct tc_queue
*queue
)
4428 return CONTAINER_OF(queue
, struct htb_class
, tc_queue
);
4432 htb_update_queue__(struct netdev
*netdev
, unsigned int queue_id
,
4433 const struct htb_class
*hc
)
4435 struct htb
*htb
= htb_get__(netdev
);
4436 size_t hash
= hash_int(queue_id
, 0);
4437 struct tc_queue
*queue
;
4438 struct htb_class
*hcp
;
4440 queue
= tc_find_queue__(netdev
, queue_id
, hash
);
4442 hcp
= htb_class_cast__(queue
);
4444 hcp
= xmalloc(sizeof *hcp
);
4445 queue
= &hcp
->tc_queue
;
4446 queue
->queue_id
= queue_id
;
4447 queue
->created
= time_msec();
4448 hmap_insert(&htb
->tc
.queues
, &queue
->hmap_node
, hash
);
4451 hcp
->min_rate
= hc
->min_rate
;
4452 hcp
->max_rate
= hc
->max_rate
;
4453 hcp
->burst
= hc
->burst
;
4454 hcp
->priority
= hc
->priority
;
4458 htb_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4461 struct queue_dump_state state
;
4462 struct htb_class hc
;
4464 /* Get qdisc options. */
4466 htb_query_class__(netdev
, tc_make_handle(1, 0xfffe), 0, &hc
, NULL
);
4467 htb_install__(netdev
, hc
.max_rate
);
4470 if (!start_queue_dump(netdev
, &state
)) {
4473 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
4474 unsigned int queue_id
;
4476 if (!htb_parse_tcmsg__(&msg
, &queue_id
, &hc
, NULL
)) {
4477 htb_update_queue__(netdev
, queue_id
, &hc
);
4480 finish_queue_dump(&state
);
4486 htb_tc_destroy(struct tc
*tc
)
4488 struct htb
*htb
= CONTAINER_OF(tc
, struct htb
, tc
);
4489 struct htb_class
*hc
;
4491 HMAP_FOR_EACH_POP (hc
, tc_queue
.hmap_node
, &htb
->tc
.queues
) {
4499 htb_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
4501 const struct htb
*htb
= htb_get__(netdev
);
4502 smap_add_format(details
, "max-rate", "%llu", 8ULL * htb
->max_rate
);
4507 htb_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
4509 struct htb_class hc
;
4512 htb_parse_qdisc_details__(netdev
, details
, &hc
);
4513 error
= htb_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
4514 tc_make_handle(1, 0), &hc
);
4516 htb_get__(netdev
)->max_rate
= hc
.max_rate
;
4522 htb_class_get(const struct netdev
*netdev OVS_UNUSED
,
4523 const struct tc_queue
*queue
, struct smap
*details
)
4525 const struct htb_class
*hc
= htb_class_cast__(queue
);
4527 smap_add_format(details
, "min-rate", "%llu", 8ULL * hc
->min_rate
);
4528 if (hc
->min_rate
!= hc
->max_rate
) {
4529 smap_add_format(details
, "max-rate", "%llu", 8ULL * hc
->max_rate
);
4531 smap_add_format(details
, "burst", "%llu", 8ULL * hc
->burst
);
4533 smap_add_format(details
, "priority", "%u", hc
->priority
);
4539 htb_class_set(struct netdev
*netdev
, unsigned int queue_id
,
4540 const struct smap
*details
)
4542 struct htb_class hc
;
4545 error
= htb_parse_class_details__(netdev
, details
, &hc
);
4550 error
= htb_setup_class__(netdev
, tc_make_handle(1, queue_id
+ 1),
4551 tc_make_handle(1, 0xfffe), &hc
);
4556 htb_update_queue__(netdev
, queue_id
, &hc
);
4561 htb_class_delete(struct netdev
*netdev
, struct tc_queue
*queue
)
4563 struct htb_class
*hc
= htb_class_cast__(queue
);
4564 struct htb
*htb
= htb_get__(netdev
);
4567 error
= tc_delete_class(netdev
, tc_make_handle(1, queue
->queue_id
+ 1));
4569 hmap_remove(&htb
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
4576 htb_class_get_stats(const struct netdev
*netdev
, const struct tc_queue
*queue
,
4577 struct netdev_queue_stats
*stats
)
4579 return htb_query_class__(netdev
, tc_make_handle(1, queue
->queue_id
+ 1),
4580 tc_make_handle(1, 0xfffe), NULL
, stats
);
4584 htb_class_dump_stats(const struct netdev
*netdev OVS_UNUSED
,
4585 const struct ofpbuf
*nlmsg
,
4586 netdev_dump_queue_stats_cb
*cb
, void *aux
)
4588 struct netdev_queue_stats stats
;
4589 unsigned int handle
, major
, minor
;
4592 error
= tc_parse_class(nlmsg
, &handle
, NULL
, &stats
);
4597 major
= tc_get_major(handle
);
4598 minor
= tc_get_minor(handle
);
4599 if (major
== 1 && minor
> 0 && minor
<= HTB_N_QUEUES
) {
4600 (*cb
)(minor
- 1, &stats
, aux
);
4605 static const struct tc_ops tc_ops_htb
= {
4606 .linux_name
= "htb",
4607 .ovs_name
= "linux-htb",
4608 .n_queues
= HTB_N_QUEUES
,
4609 .tc_install
= htb_tc_install
,
4610 .tc_load
= htb_tc_load
,
4611 .tc_destroy
= htb_tc_destroy
,
4612 .qdisc_get
= htb_qdisc_get
,
4613 .qdisc_set
= htb_qdisc_set
,
4614 .class_get
= htb_class_get
,
4615 .class_set
= htb_class_set
,
4616 .class_delete
= htb_class_delete
,
4617 .class_get_stats
= htb_class_get_stats
,
4618 .class_dump_stats
= htb_class_dump_stats
4621 /* "linux-hfsc" traffic control class. */
4623 #define HFSC_N_QUEUES 0xf000
4631 struct tc_queue tc_queue
;
4636 static struct hfsc
*
4637 hfsc_get__(const struct netdev
*netdev_
)
4639 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4640 return CONTAINER_OF(netdev
->tc
, struct hfsc
, tc
);
4643 static struct hfsc_class
*
4644 hfsc_class_cast__(const struct tc_queue
*queue
)
4646 return CONTAINER_OF(queue
, struct hfsc_class
, tc_queue
);
4650 hfsc_install__(struct netdev
*netdev_
, uint32_t max_rate
)
4652 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4655 hfsc
= xmalloc(sizeof *hfsc
);
4656 tc_init(&hfsc
->tc
, &tc_ops_hfsc
);
4657 hfsc
->max_rate
= max_rate
;
4658 netdev
->tc
= &hfsc
->tc
;
4662 hfsc_update_queue__(struct netdev
*netdev
, unsigned int queue_id
,
4663 const struct hfsc_class
*hc
)
4667 struct hfsc_class
*hcp
;
4668 struct tc_queue
*queue
;
4670 hfsc
= hfsc_get__(netdev
);
4671 hash
= hash_int(queue_id
, 0);
4673 queue
= tc_find_queue__(netdev
, queue_id
, hash
);
4675 hcp
= hfsc_class_cast__(queue
);
4677 hcp
= xmalloc(sizeof *hcp
);
4678 queue
= &hcp
->tc_queue
;
4679 queue
->queue_id
= queue_id
;
4680 queue
->created
= time_msec();
4681 hmap_insert(&hfsc
->tc
.queues
, &queue
->hmap_node
, hash
);
4684 hcp
->min_rate
= hc
->min_rate
;
4685 hcp
->max_rate
= hc
->max_rate
;
4689 hfsc_parse_tca_options__(struct nlattr
*nl_options
, struct hfsc_class
*class)
4691 const struct tc_service_curve
*rsc
, *fsc
, *usc
;
4692 static const struct nl_policy tca_hfsc_policy
[] = {
4694 .type
= NL_A_UNSPEC
,
4696 .min_len
= sizeof(struct tc_service_curve
),
4699 .type
= NL_A_UNSPEC
,
4701 .min_len
= sizeof(struct tc_service_curve
),
4704 .type
= NL_A_UNSPEC
,
4706 .min_len
= sizeof(struct tc_service_curve
),
4709 struct nlattr
*attrs
[ARRAY_SIZE(tca_hfsc_policy
)];
4711 if (!nl_parse_nested(nl_options
, tca_hfsc_policy
,
4712 attrs
, ARRAY_SIZE(tca_hfsc_policy
))) {
4713 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options");
4717 rsc
= nl_attr_get(attrs
[TCA_HFSC_RSC
]);
4718 fsc
= nl_attr_get(attrs
[TCA_HFSC_FSC
]);
4719 usc
= nl_attr_get(attrs
[TCA_HFSC_USC
]);
4721 if (rsc
->m1
!= 0 || rsc
->d
!= 0 ||
4722 fsc
->m1
!= 0 || fsc
->d
!= 0 ||
4723 usc
->m1
!= 0 || usc
->d
!= 0) {
4724 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
4725 "Non-linear service curves are not supported.");
4729 if (rsc
->m2
!= fsc
->m2
) {
4730 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
4731 "Real-time service curves are not supported ");
4735 if (rsc
->m2
> usc
->m2
) {
4736 VLOG_WARN_RL(&rl
, "failed to parse HFSC class options. "
4737 "Min-rate service curve is greater than "
4738 "the max-rate service curve.");
4742 class->min_rate
= fsc
->m2
;
4743 class->max_rate
= usc
->m2
;
4748 hfsc_parse_tcmsg__(struct ofpbuf
*tcmsg
, unsigned int *queue_id
,
4749 struct hfsc_class
*options
,
4750 struct netdev_queue_stats
*stats
)
4753 unsigned int handle
;
4754 struct nlattr
*nl_options
;
4756 error
= tc_parse_class(tcmsg
, &handle
, &nl_options
, stats
);
4762 unsigned int major
, minor
;
4764 major
= tc_get_major(handle
);
4765 minor
= tc_get_minor(handle
);
4766 if (major
== 1 && minor
> 0 && minor
<= HFSC_N_QUEUES
) {
4767 *queue_id
= minor
- 1;
4774 error
= hfsc_parse_tca_options__(nl_options
, options
);
4781 hfsc_query_class__(const struct netdev
*netdev
, unsigned int handle
,
4782 unsigned int parent
, struct hfsc_class
*options
,
4783 struct netdev_queue_stats
*stats
)
4786 struct ofpbuf
*reply
;
4788 error
= tc_query_class(netdev
, handle
, parent
, &reply
);
4793 error
= hfsc_parse_tcmsg__(reply
, NULL
, options
, stats
);
4794 ofpbuf_delete(reply
);
4799 hfsc_parse_qdisc_details__(struct netdev
*netdev_
, const struct smap
*details
,
4800 struct hfsc_class
*class)
4802 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
4804 uint32_t max_rate
= smap_get_ullong(details
, "max-rate", 0) / 8;
4806 enum netdev_features current
;
4808 netdev_linux_read_features(netdev
);
4809 current
= !netdev
->get_features_error
? netdev
->current
: 0;
4810 max_rate
= netdev_features_to_bps(current
, 100 * 1000 * 1000) / 8;
4813 class->min_rate
= max_rate
;
4814 class->max_rate
= max_rate
;
4818 hfsc_parse_class_details__(struct netdev
*netdev
,
4819 const struct smap
*details
,
4820 struct hfsc_class
* class)
4822 const struct hfsc
*hfsc
;
4823 uint32_t min_rate
, max_rate
;
4825 hfsc
= hfsc_get__(netdev
);
4827 min_rate
= smap_get_ullong(details
, "min-rate", 0) / 8;
4828 min_rate
= MAX(min_rate
, 1);
4829 min_rate
= MIN(min_rate
, hfsc
->max_rate
);
4831 max_rate
= smap_get_ullong(details
, "max-rate", hfsc
->max_rate
* 8) / 8;
4832 max_rate
= MAX(max_rate
, min_rate
);
4833 max_rate
= MIN(max_rate
, hfsc
->max_rate
);
4835 class->min_rate
= min_rate
;
4836 class->max_rate
= max_rate
;
4841 /* Create an HFSC qdisc.
4843 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
4845 hfsc_setup_qdisc__(struct netdev
* netdev
)
4847 struct tcmsg
*tcmsg
;
4848 struct ofpbuf request
;
4849 struct tc_hfsc_qopt opt
;
4851 tc_del_qdisc(netdev
);
4853 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWQDISC
,
4854 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
4860 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
4861 tcmsg
->tcm_parent
= TC_H_ROOT
;
4863 memset(&opt
, 0, sizeof opt
);
4866 nl_msg_put_string(&request
, TCA_KIND
, "hfsc");
4867 nl_msg_put_unspec(&request
, TCA_OPTIONS
, &opt
, sizeof opt
);
4869 return tc_transact(&request
, NULL
);
4872 /* Create an HFSC class.
4874 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
4875 * sc rate <min_rate> ul rate <max_rate>" */
4877 hfsc_setup_class__(struct netdev
*netdev
, unsigned int handle
,
4878 unsigned int parent
, struct hfsc_class
*class)
4882 struct tcmsg
*tcmsg
;
4883 struct ofpbuf request
;
4884 struct tc_service_curve min
, max
;
4886 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWTCLASS
, NLM_F_CREATE
,
4893 tcmsg
->tcm_handle
= handle
;
4894 tcmsg
->tcm_parent
= parent
;
4898 min
.m2
= class->min_rate
;
4902 max
.m2
= class->max_rate
;
4904 nl_msg_put_string(&request
, TCA_KIND
, "hfsc");
4905 opt_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
4906 nl_msg_put_unspec(&request
, TCA_HFSC_RSC
, &min
, sizeof min
);
4907 nl_msg_put_unspec(&request
, TCA_HFSC_FSC
, &min
, sizeof min
);
4908 nl_msg_put_unspec(&request
, TCA_HFSC_USC
, &max
, sizeof max
);
4909 nl_msg_end_nested(&request
, opt_offset
);
4911 error
= tc_transact(&request
, NULL
);
4913 VLOG_WARN_RL(&rl
, "failed to replace %s class %u:%u, parent %u:%u, "
4914 "min-rate %ubps, max-rate %ubps (%s)",
4915 netdev_get_name(netdev
),
4916 tc_get_major(handle
), tc_get_minor(handle
),
4917 tc_get_major(parent
), tc_get_minor(parent
),
4918 class->min_rate
, class->max_rate
, ovs_strerror(error
));
4925 hfsc_tc_install(struct netdev
*netdev
, const struct smap
*details
)
4928 struct hfsc_class
class;
4930 error
= hfsc_setup_qdisc__(netdev
);
4936 hfsc_parse_qdisc_details__(netdev
, details
, &class);
4937 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
4938 tc_make_handle(1, 0), &class);
4944 hfsc_install__(netdev
, class.max_rate
);
4949 hfsc_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
4952 struct queue_dump_state state
;
4953 struct hfsc_class hc
;
4956 hfsc_query_class__(netdev
, tc_make_handle(1, 0xfffe), 0, &hc
, NULL
);
4957 hfsc_install__(netdev
, hc
.max_rate
);
4959 if (!start_queue_dump(netdev
, &state
)) {
4963 while (nl_dump_next(&state
.dump
, &msg
, &state
.buf
)) {
4964 unsigned int queue_id
;
4966 if (!hfsc_parse_tcmsg__(&msg
, &queue_id
, &hc
, NULL
)) {
4967 hfsc_update_queue__(netdev
, queue_id
, &hc
);
4971 finish_queue_dump(&state
);
4976 hfsc_tc_destroy(struct tc
*tc
)
4979 struct hfsc_class
*hc
, *next
;
4981 hfsc
= CONTAINER_OF(tc
, struct hfsc
, tc
);
4983 HMAP_FOR_EACH_SAFE (hc
, next
, tc_queue
.hmap_node
, &hfsc
->tc
.queues
) {
4984 hmap_remove(&hfsc
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
4993 hfsc_qdisc_get(const struct netdev
*netdev
, struct smap
*details
)
4995 const struct hfsc
*hfsc
;
4996 hfsc
= hfsc_get__(netdev
);
4997 smap_add_format(details
, "max-rate", "%llu", 8ULL * hfsc
->max_rate
);
5002 hfsc_qdisc_set(struct netdev
*netdev
, const struct smap
*details
)
5005 struct hfsc_class
class;
5007 hfsc_parse_qdisc_details__(netdev
, details
, &class);
5008 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, 0xfffe),
5009 tc_make_handle(1, 0), &class);
5012 hfsc_get__(netdev
)->max_rate
= class.max_rate
;
5019 hfsc_class_get(const struct netdev
*netdev OVS_UNUSED
,
5020 const struct tc_queue
*queue
, struct smap
*details
)
5022 const struct hfsc_class
*hc
;
5024 hc
= hfsc_class_cast__(queue
);
5025 smap_add_format(details
, "min-rate", "%llu", 8ULL * hc
->min_rate
);
5026 if (hc
->min_rate
!= hc
->max_rate
) {
5027 smap_add_format(details
, "max-rate", "%llu", 8ULL * hc
->max_rate
);
5033 hfsc_class_set(struct netdev
*netdev
, unsigned int queue_id
,
5034 const struct smap
*details
)
5037 struct hfsc_class
class;
5039 error
= hfsc_parse_class_details__(netdev
, details
, &class);
5044 error
= hfsc_setup_class__(netdev
, tc_make_handle(1, queue_id
+ 1),
5045 tc_make_handle(1, 0xfffe), &class);
5050 hfsc_update_queue__(netdev
, queue_id
, &class);
5055 hfsc_class_delete(struct netdev
*netdev
, struct tc_queue
*queue
)
5059 struct hfsc_class
*hc
;
5061 hc
= hfsc_class_cast__(queue
);
5062 hfsc
= hfsc_get__(netdev
);
5064 error
= tc_delete_class(netdev
, tc_make_handle(1, queue
->queue_id
+ 1));
5066 hmap_remove(&hfsc
->tc
.queues
, &hc
->tc_queue
.hmap_node
);
5073 hfsc_class_get_stats(const struct netdev
*netdev
, const struct tc_queue
*queue
,
5074 struct netdev_queue_stats
*stats
)
5076 return hfsc_query_class__(netdev
, tc_make_handle(1, queue
->queue_id
+ 1),
5077 tc_make_handle(1, 0xfffe), NULL
, stats
);
5081 hfsc_class_dump_stats(const struct netdev
*netdev OVS_UNUSED
,
5082 const struct ofpbuf
*nlmsg
,
5083 netdev_dump_queue_stats_cb
*cb
, void *aux
)
5085 struct netdev_queue_stats stats
;
5086 unsigned int handle
, major
, minor
;
5089 error
= tc_parse_class(nlmsg
, &handle
, NULL
, &stats
);
5094 major
= tc_get_major(handle
);
5095 minor
= tc_get_minor(handle
);
5096 if (major
== 1 && minor
> 0 && minor
<= HFSC_N_QUEUES
) {
5097 (*cb
)(minor
- 1, &stats
, aux
);
5102 static const struct tc_ops tc_ops_hfsc
= {
5103 .linux_name
= "hfsc",
5104 .ovs_name
= "linux-hfsc",
5105 .n_queues
= HFSC_N_QUEUES
, /* n_queues */
5106 .tc_install
= hfsc_tc_install
,
5107 .tc_load
= hfsc_tc_load
,
5108 .tc_destroy
= hfsc_tc_destroy
,
5109 .qdisc_get
= hfsc_qdisc_get
,
5110 .qdisc_set
= hfsc_qdisc_set
,
5111 .class_get
= hfsc_class_get
,
5112 .class_set
= hfsc_class_set
,
5113 .class_delete
= hfsc_class_delete
,
5114 .class_get_stats
= hfsc_class_get_stats
,
5115 .class_dump_stats
= hfsc_class_dump_stats
,
5118 /* "linux-noop" traffic control class. */
5121 noop_install__(struct netdev
*netdev_
)
5123 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5124 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_default
);
5126 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
5130 noop_tc_install(struct netdev
*netdev
,
5131 const struct smap
*details OVS_UNUSED
)
5133 noop_install__(netdev
);
5138 noop_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
5140 noop_install__(netdev
);
5144 static const struct tc_ops tc_ops_noop
= {
5145 .ovs_name
= "linux-noop", /* ovs_name */
5146 .tc_install
= noop_tc_install
,
5147 .tc_load
= noop_tc_load
,
5150 /* "linux-default" traffic control class.
5152 * This class represents the default, unnamed Linux qdisc. It corresponds to
5153 * the "" (empty string) QoS type in the OVS database. */
5156 default_install__(struct netdev
*netdev_
)
5158 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5159 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_default
);
5161 /* Nothing but a tc class implementation is allowed to write to a tc. This
5162 * class never does that, so we can legitimately use a const tc object. */
5163 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
5167 default_tc_install(struct netdev
*netdev
,
5168 const struct smap
*details OVS_UNUSED
)
5170 default_install__(netdev
);
5175 default_tc_load(struct netdev
*netdev
, struct ofpbuf
*nlmsg OVS_UNUSED
)
5177 default_install__(netdev
);
5181 static const struct tc_ops tc_ops_default
= {
5182 .ovs_name
= "", /* ovs_name */
5183 .tc_install
= default_tc_install
,
5184 .tc_load
= default_tc_load
,
5187 /* "linux-other" traffic control class.
5192 other_tc_load(struct netdev
*netdev_
, struct ofpbuf
*nlmsg OVS_UNUSED
)
5194 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5195 static const struct tc tc
= TC_INITIALIZER(&tc
, &tc_ops_other
);
5197 /* Nothing but a tc class implementation is allowed to write to a tc. This
5198 * class never does that, so we can legitimately use a const tc object. */
5199 netdev
->tc
= CONST_CAST(struct tc
*, &tc
);
5203 static const struct tc_ops tc_ops_other
= {
5204 .ovs_name
= "linux-other",
5205 .tc_load
= other_tc_load
,
/* Traffic control. */

/* Number of kernel "tc" ticks per second. */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 */
static unsigned int buffer_hz;
5232 static struct tcmsg
*
5233 netdev_linux_tc_make_request(const struct netdev
*netdev
, int type
,
5234 unsigned int flags
, struct ofpbuf
*request
)
5239 error
= get_ifindex(netdev
, &ifindex
);
5244 return tc_make_request(ifindex
, type
, flags
, request
);
5247 /* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
5250 * This function is equivalent to running:
5251 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
5252 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
5255 * The configuration and stats may be seen with the following command:
5256 * /sbin/tc -s filter show dev <devname> parent ffff:
5258 * Returns 0 if successful, otherwise a positive errno value.
5261 tc_add_policer(struct netdev
*netdev
,
5262 uint32_t kbits_rate
, uint32_t kbits_burst
)
5264 struct tc_police tc_police
;
5265 struct ofpbuf request
;
5266 struct tcmsg
*tcmsg
;
5267 size_t basic_offset
;
5268 size_t police_offset
;
5272 memset(&tc_police
, 0, sizeof tc_police
);
5273 tc_police
.action
= TC_POLICE_SHOT
;
5274 tc_police
.mtu
= mtu
;
5275 tc_fill_rate(&tc_police
.rate
, ((uint64_t) kbits_rate
* 1000)/8, mtu
);
5277 /* The following appears wrong in one way: In networking a kilobit is
5278 * usually 1000 bits but this uses 1024 bits.
5280 * However if you "fix" those problems then "tc filter show ..." shows
5281 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
5282 * 1,000,000 bits, whereas this actually ends up doing the right thing from
5283 * tc's point of view. Whatever. */
5284 tc_police
.burst
= tc_bytes_to_ticks(
5285 tc_police
.rate
.rate
, MIN(UINT32_MAX
/ 1024, kbits_burst
) * 1024 / 8);
5287 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_NEWTFILTER
,
5288 NLM_F_EXCL
| NLM_F_CREATE
, &request
);
5292 tcmsg
->tcm_parent
= tc_make_handle(0xffff, 0);
5293 tcmsg
->tcm_info
= tc_make_handle(49,
5294 (OVS_FORCE
uint16_t) htons(ETH_P_ALL
));
5296 nl_msg_put_string(&request
, TCA_KIND
, "basic");
5297 basic_offset
= nl_msg_start_nested(&request
, TCA_OPTIONS
);
5298 police_offset
= nl_msg_start_nested(&request
, TCA_BASIC_POLICE
);
5299 nl_msg_put_unspec(&request
, TCA_POLICE_TBF
, &tc_police
, sizeof tc_police
);
5300 tc_put_rtab(&request
, TCA_POLICE_RATE
, &tc_police
.rate
);
5301 nl_msg_end_nested(&request
, police_offset
);
5302 nl_msg_end_nested(&request
, basic_offset
);
5304 error
= tc_transact(&request
, NULL
);
5315 /* The values in psched are not individually very meaningful, but they are
5316 * important. The tables below show some values seen in the wild.
5320 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
5321 * (Before that, there are hints that it was 1000000000.)
5323 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
5327 * -----------------------------------
5328 * [1] 000c8000 000f4240 000f4240 00000064
5329 * [2] 000003e8 00000400 000f4240 3b9aca00
5330 * [3] 000003e8 00000400 000f4240 3b9aca00
5331 * [4] 000003e8 00000400 000f4240 00000064
5332 * [5] 000003e8 00000040 000f4240 3b9aca00
5333 * [6] 000003e8 00000040 000f4240 000000f9
5335 * a b c d ticks_per_s buffer_hz
5336 * ------- --------- ---------- ------------- ----------- -------------
5337 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
5338 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
5339 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
5340 * [4] 1,000 1,024 1,000,000 100 976,562 100
5341 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
5342 * [6] 1,000 64 1,000,000 249 15,625,000 249
5344 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
5345 * [2] 2.6.26-1-686-bigmem from Debian lenny
5346 * [3] 2.6.26-2-sparc64 from Debian lenny
5347 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
5348 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
5349 * [6] 2.6.34 from kernel.org on KVM
5351 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
5352 static const char fn
[] = "/proc/net/psched";
5353 unsigned int a
, b
, c
, d
;
5356 if (!ovsthread_once_start(&once
)) {
5363 stream
= fopen(fn
, "r");
5365 VLOG_WARN("%s: open failed: %s", fn
, ovs_strerror(errno
));
5369 if (fscanf(stream
, "%x %x %x %x", &a
, &b
, &c
, &d
) != 4) {
5370 VLOG_WARN("%s: read failed", fn
);
5374 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn
, a
, b
, c
, d
);
5377 if (!a
|| !b
|| !c
) {
5378 VLOG_WARN("%s: invalid scheduler parameters", fn
);
5382 ticks_per_s
= (double) a
* c
/ b
;
5386 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
5389 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn
, ticks_per_s
, buffer_hz
);
5392 ovsthread_once_done(&once
);
5395 /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
5396 * rate of 'rate' bytes per second. */
5398 tc_ticks_to_bytes(unsigned int rate
, unsigned int ticks
)
5401 return (rate
* ticks
) / ticks_per_s
;
5404 /* Returns the number of ticks that it would take to transmit 'size' bytes at a
5405 * rate of 'rate' bytes per second. */
5407 tc_bytes_to_ticks(unsigned int rate
, unsigned int size
)
5410 return rate
? ((unsigned long long int) ticks_per_s
* size
) / rate
: 0;
5413 /* Returns the number of bytes that need to be reserved for qdisc buffering at
5414 * a transmission rate of 'rate' bytes per second. */
5416 tc_buffer_per_jiffy(unsigned int rate
)
5419 return rate
/ buffer_hz
;
5423 tc_time_to_ticks(uint32_t time
) {
5425 return time
* (ticks_per_s
/ 1000000);
5428 /* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
5429 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
5430 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
5431 * stores NULL into it if it is absent.
5433 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
5436 * Returns 0 if successful, otherwise a positive errno value. */
5438 tc_parse_qdisc(const struct ofpbuf
*msg
, const char **kind
,
5439 struct nlattr
**options
)
5441 static const struct nl_policy tca_policy
[] = {
5442 [TCA_KIND
] = { .type
= NL_A_STRING
, .optional
= false },
5443 [TCA_OPTIONS
] = { .type
= NL_A_NESTED
, .optional
= true },
5445 struct nlattr
*ta
[ARRAY_SIZE(tca_policy
)];
5447 if (!nl_policy_parse(msg
, NLMSG_HDRLEN
+ sizeof(struct tcmsg
),
5448 tca_policy
, ta
, ARRAY_SIZE(ta
))) {
5449 VLOG_WARN_RL(&rl
, "failed to parse qdisc message");
5454 *kind
= nl_attr_get_string(ta
[TCA_KIND
]);
5458 *options
= ta
[TCA_OPTIONS
];
5473 /* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
5474 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
5475 * into '*options', and its queue statistics into '*stats'. Any of the output
5476 * arguments may be null.
5478 * Returns 0 if successful, otherwise a positive errno value. */
5480 tc_parse_class(const struct ofpbuf
*msg
, unsigned int *handlep
,
5481 struct nlattr
**options
, struct netdev_queue_stats
*stats
)
5483 static const struct nl_policy tca_policy
[] = {
5484 [TCA_OPTIONS
] = { .type
= NL_A_NESTED
, .optional
= false },
5485 [TCA_STATS2
] = { .type
= NL_A_NESTED
, .optional
= false },
5487 struct nlattr
*ta
[ARRAY_SIZE(tca_policy
)];
5489 if (!nl_policy_parse(msg
, NLMSG_HDRLEN
+ sizeof(struct tcmsg
),
5490 tca_policy
, ta
, ARRAY_SIZE(ta
))) {
5491 VLOG_WARN_RL(&rl
, "failed to parse class message");
5496 struct tcmsg
*tc
= ofpbuf_at_assert(msg
, NLMSG_HDRLEN
, sizeof *tc
);
5497 *handlep
= tc
->tcm_handle
;
5501 *options
= ta
[TCA_OPTIONS
];
5505 const struct gnet_stats_queue
*gsq
;
5506 struct gnet_stats_basic gsb
;
5508 static const struct nl_policy stats_policy
[] = {
5509 [TCA_STATS_BASIC
] = { .type
= NL_A_UNSPEC
, .optional
= false,
5510 .min_len
= sizeof gsb
},
5511 [TCA_STATS_QUEUE
] = { .type
= NL_A_UNSPEC
, .optional
= false,
5512 .min_len
= sizeof *gsq
},
5514 struct nlattr
*sa
[ARRAY_SIZE(stats_policy
)];
5516 if (!nl_parse_nested(ta
[TCA_STATS2
], stats_policy
,
5517 sa
, ARRAY_SIZE(sa
))) {
5518 VLOG_WARN_RL(&rl
, "failed to parse class stats");
5522 /* Alignment issues screw up the length of struct gnet_stats_basic on
5523 * some arch/bitsize combinations. Newer versions of Linux have a
5524 * struct gnet_stats_basic_packed, but we can't depend on that. The
5525 * easiest thing to do is just to make a copy. */
5526 memset(&gsb
, 0, sizeof gsb
);
5527 memcpy(&gsb
, nl_attr_get(sa
[TCA_STATS_BASIC
]),
5528 MIN(nl_attr_get_size(sa
[TCA_STATS_BASIC
]), sizeof gsb
));
5529 stats
->tx_bytes
= gsb
.bytes
;
5530 stats
->tx_packets
= gsb
.packets
;
5532 gsq
= nl_attr_get(sa
[TCA_STATS_QUEUE
]);
5533 stats
->tx_errors
= gsq
->drops
;
5543 memset(stats
, 0, sizeof *stats
);
5548 /* Queries the kernel for class with identifier 'handle' and parent 'parent'
5551 tc_query_class(const struct netdev
*netdev
,
5552 unsigned int handle
, unsigned int parent
,
5553 struct ofpbuf
**replyp
)
5555 struct ofpbuf request
;
5556 struct tcmsg
*tcmsg
;
5559 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_GETTCLASS
, NLM_F_ECHO
,
5564 tcmsg
->tcm_handle
= handle
;
5565 tcmsg
->tcm_parent
= parent
;
5567 error
= tc_transact(&request
, replyp
);
5569 VLOG_WARN_RL(&rl
, "query %s class %u:%u (parent %u:%u) failed (%s)",
5570 netdev_get_name(netdev
),
5571 tc_get_major(handle
), tc_get_minor(handle
),
5572 tc_get_major(parent
), tc_get_minor(parent
),
5573 ovs_strerror(error
));
5578 /* Equivalent to "tc class del dev <name> handle <handle>". */
5580 tc_delete_class(const struct netdev
*netdev
, unsigned int handle
)
5582 struct ofpbuf request
;
5583 struct tcmsg
*tcmsg
;
5586 tcmsg
= netdev_linux_tc_make_request(netdev
, RTM_DELTCLASS
, 0, &request
);
5590 tcmsg
->tcm_handle
= handle
;
5591 tcmsg
->tcm_parent
= 0;
5593 error
= tc_transact(&request
, NULL
);
5595 VLOG_WARN_RL(&rl
, "delete %s class %u:%u failed (%s)",
5596 netdev_get_name(netdev
),
5597 tc_get_major(handle
), tc_get_minor(handle
),
5598 ovs_strerror(error
));
5603 /* Equivalent to "tc qdisc del dev <name> root". */
5605 tc_del_qdisc(struct netdev
*netdev_
)
5607 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5608 struct ofpbuf request
;
5609 struct tcmsg
*tcmsg
;
5612 tcmsg
= netdev_linux_tc_make_request(netdev_
, RTM_DELQDISC
, 0, &request
);
5616 tcmsg
->tcm_handle
= tc_make_handle(1, 0);
5617 tcmsg
->tcm_parent
= TC_H_ROOT
;
5619 error
= tc_transact(&request
, NULL
);
5620 if (error
== EINVAL
) {
5621 /* EINVAL probably means that the default qdisc was in use, in which
5622 * case we've accomplished our purpose. */
5625 if (!error
&& netdev
->tc
) {
5626 if (netdev
->tc
->ops
->tc_destroy
) {
5627 netdev
->tc
->ops
->tc_destroy(netdev
->tc
);
5635 getqdisc_is_safe(void)
5637 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
5638 static bool safe
= false;
5640 if (ovsthread_once_start(&once
)) {
5641 struct utsname utsname
;
5644 if (uname(&utsname
) == -1) {
5645 VLOG_WARN("uname failed (%s)", ovs_strerror(errno
));
5646 } else if (!ovs_scan(utsname
.release
, "%d.%d", &major
, &minor
)) {
5647 VLOG_WARN("uname reported bad OS release (%s)", utsname
.release
);
5648 } else if (major
< 2 || (major
== 2 && minor
< 35)) {
5649 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5654 ovsthread_once_done(&once
);
5659 /* If 'netdev''s qdisc type and parameters are not yet known, queries the
5660 * kernel to determine what they are. Returns 0 if successful, otherwise a
5661 * positive errno value. */
5663 tc_query_qdisc(const struct netdev
*netdev_
)
5665 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
5666 struct ofpbuf request
, *qdisc
;
5667 const struct tc_ops
*ops
;
5668 struct tcmsg
*tcmsg
;
5676 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5677 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5678 * 2.6.35 without that fix backported to it.
5680 * To avoid the OOPS, we must not make a request that would attempt to dump
5681 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5682 * few others. There are a few ways that I can see to do this, but most of
5683 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5684 * technique chosen here is to assume that any non-default qdisc that we
5685 * create will have a class with handle 1:0. The built-in qdiscs only have
5686 * a class with handle 0:0.
5688 * On Linux 2.6.35+ we use the straightforward method because it allows us
5689 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5690 * in such a case we get no response at all from the kernel (!) if a
5691 * builtin qdisc is in use (which is later caught by "!error &&
5692 * !qdisc->size"). */
5693 tcmsg
= netdev_linux_tc_make_request(netdev_
, RTM_GETQDISC
, NLM_F_ECHO
,
5698 tcmsg
->tcm_handle
= tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5699 tcmsg
->tcm_parent
= getqdisc_is_safe() ? TC_H_ROOT
: 0;
5701 /* Figure out what tc class to instantiate. */
5702 error
= tc_transact(&request
, &qdisc
);
5703 if (!error
&& qdisc
->size
) {
5706 error
= tc_parse_qdisc(qdisc
, &kind
, NULL
);
5708 ops
= &tc_ops_other
;
5710 ops
= tc_lookup_linux_name(kind
);
5712 static struct vlog_rate_limit rl2
= VLOG_RATE_LIMIT_INIT(1, 1);
5713 VLOG_DBG_RL(&rl2
, "unknown qdisc \"%s\"", kind
);
5715 ops
= &tc_ops_other
;
5718 } else if ((!error
&& !qdisc
->size
) || error
== ENOENT
) {
5719 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5720 * set up by some other entity that doesn't have a handle 1:0. We will
5721 * assume that it's the system default qdisc. */
5722 ops
= &tc_ops_default
;
5725 /* Who knows? Maybe the device got deleted. */
5726 VLOG_WARN_RL(&rl
, "query %s qdisc failed (%s)",
5727 netdev_get_name(netdev_
), ovs_strerror(error
));
5728 ops
= &tc_ops_other
;
5731 /* Instantiate it. */
5732 load_error
= ops
->tc_load(CONST_CAST(struct netdev
*, netdev_
), qdisc
);
5733 ovs_assert((load_error
== 0) == (netdev
->tc
!= NULL
));
5734 ofpbuf_delete(qdisc
);
5736 return error
? error
: load_error
;
5739 /* Linux traffic control uses tables with 256 entries ("rtab" tables) to
5740 approximate the time to transmit packets of various lengths. For an MTU of
5741 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
5742 represents two possible packet lengths; for a MTU of 513 through 1024, four
5743 possible lengths; and so on.
5745 Returns, for the specified 'mtu', the number of bits that packet lengths
5746 need to be shifted right to fit within such a 256-entry table. */
5748 tc_calc_cell_log(unsigned int mtu
)
5753 mtu
= ETH_PAYLOAD_MAX
;
5755 mtu
+= ETH_HEADER_LEN
+ VLAN_HEADER_LEN
;
5757 for (cell_log
= 0; mtu
>= 256; cell_log
++) {
5764 /* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
5767 tc_fill_rate(struct tc_ratespec
*rate
, uint64_t Bps
, int mtu
)
5769 memset(rate
, 0, sizeof *rate
);
5770 rate
->cell_log
= tc_calc_cell_log(mtu
);
5771 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
5772 /* rate->cell_align = 0; */ /* distro headers. */
5773 rate
->mpu
= ETH_TOTAL_MIN
;
/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
 * attribute of the specified "type".
 *
 * See tc_calc_cell_log() above for a description of "rtab"s. */
static void
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
{
    uint32_t *rtab;
    unsigned int i;

    rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
    for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
        unsigned packet_size = (i + 1) << rate->cell_log;
        if (packet_size < rate->mpu) {
            packet_size = rate->mpu;
        }
        rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
    }
}
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.) */
static uint32_t
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
}
5808 /* Linux-only functions declared in netdev-linux.h */
5810 /* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
5811 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
5813 netdev_linux_ethtool_set_flag(struct netdev
*netdev
, uint32_t flag
,
5814 const char *flag_name
, bool enable
)
5816 const char *netdev_name
= netdev_get_name(netdev
);
5817 struct ethtool_value evalue
;
5821 COVERAGE_INC(netdev_get_ethtool
);
5822 memset(&evalue
, 0, sizeof evalue
);
5823 error
= netdev_linux_do_ethtool(netdev_name
,
5824 (struct ethtool_cmd
*)&evalue
,
5825 ETHTOOL_GFLAGS
, "ETHTOOL_GFLAGS");
5830 COVERAGE_INC(netdev_set_ethtool
);
5831 new_flags
= (evalue
.data
& ~flag
) | (enable
? flag
: 0);
5832 if (new_flags
== evalue
.data
) {
5835 evalue
.data
= new_flags
;
5836 error
= netdev_linux_do_ethtool(netdev_name
,
5837 (struct ethtool_cmd
*)&evalue
,
5838 ETHTOOL_SFLAGS
, "ETHTOOL_SFLAGS");
5843 COVERAGE_INC(netdev_get_ethtool
);
5844 memset(&evalue
, 0, sizeof evalue
);
5845 error
= netdev_linux_do_ethtool(netdev_name
,
5846 (struct ethtool_cmd
*)&evalue
,
5847 ETHTOOL_GFLAGS
, "ETHTOOL_GFLAGS");
5852 if (new_flags
!= evalue
.data
) {
5853 VLOG_WARN_RL(&rl
, "attempt to %s ethtool %s flag on network "
5854 "device %s failed", enable
? "enable" : "disable",
5855 flag_name
, netdev_name
);
5862 /* Utility functions. */
5864 /* Copies 'src' into 'dst', performing format conversion in the process. */
5866 netdev_stats_from_rtnl_link_stats(struct netdev_stats
*dst
,
5867 const struct rtnl_link_stats
*src
)
5869 dst
->rx_packets
= src
->rx_packets
;
5870 dst
->tx_packets
= src
->tx_packets
;
5871 dst
->rx_bytes
= src
->rx_bytes
;
5872 dst
->tx_bytes
= src
->tx_bytes
;
5873 dst
->rx_errors
= src
->rx_errors
;
5874 dst
->tx_errors
= src
->tx_errors
;
5875 dst
->rx_dropped
= src
->rx_dropped
;
5876 dst
->tx_dropped
= src
->tx_dropped
;
5877 dst
->multicast
= src
->multicast
;
5878 dst
->collisions
= src
->collisions
;
5879 dst
->rx_length_errors
= src
->rx_length_errors
;
5880 dst
->rx_over_errors
= src
->rx_over_errors
;
5881 dst
->rx_crc_errors
= src
->rx_crc_errors
;
5882 dst
->rx_frame_errors
= src
->rx_frame_errors
;
5883 dst
->rx_fifo_errors
= src
->rx_fifo_errors
;
5884 dst
->rx_missed_errors
= src
->rx_missed_errors
;
5885 dst
->tx_aborted_errors
= src
->tx_aborted_errors
;
5886 dst
->tx_carrier_errors
= src
->tx_carrier_errors
;
5887 dst
->tx_fifo_errors
= src
->tx_fifo_errors
;
5888 dst
->tx_heartbeat_errors
= src
->tx_heartbeat_errors
;
5889 dst
->tx_window_errors
= src
->tx_window_errors
;
5892 /* Copies 'src' into 'dst', performing format conversion in the process. */
5894 netdev_stats_from_rtnl_link_stats64(struct netdev_stats
*dst
,
5895 const struct rtnl_link_stats64
*src
)
5897 dst
->rx_packets
= src
->rx_packets
;
5898 dst
->tx_packets
= src
->tx_packets
;
5899 dst
->rx_bytes
= src
->rx_bytes
;
5900 dst
->tx_bytes
= src
->tx_bytes
;
5901 dst
->rx_errors
= src
->rx_errors
;
5902 dst
->tx_errors
= src
->tx_errors
;
5903 dst
->rx_dropped
= src
->rx_dropped
;
5904 dst
->tx_dropped
= src
->tx_dropped
;
5905 dst
->multicast
= src
->multicast
;
5906 dst
->collisions
= src
->collisions
;
5907 dst
->rx_length_errors
= src
->rx_length_errors
;
5908 dst
->rx_over_errors
= src
->rx_over_errors
;
5909 dst
->rx_crc_errors
= src
->rx_crc_errors
;
5910 dst
->rx_frame_errors
= src
->rx_frame_errors
;
5911 dst
->rx_fifo_errors
= src
->rx_fifo_errors
;
5912 dst
->rx_missed_errors
= src
->rx_missed_errors
;
5913 dst
->tx_aborted_errors
= src
->tx_aborted_errors
;
5914 dst
->tx_carrier_errors
= src
->tx_carrier_errors
;
5915 dst
->tx_fifo_errors
= src
->tx_fifo_errors
;
5916 dst
->tx_heartbeat_errors
= src
->tx_heartbeat_errors
;
5917 dst
->tx_window_errors
= src
->tx_window_errors
;
5921 get_stats_via_netlink(const struct netdev
*netdev_
, struct netdev_stats
*stats
)
5923 struct ofpbuf request
;
5924 struct ofpbuf
*reply
;
5927 /* Filtering all counters by default */
5928 memset(stats
, 0xFF, sizeof(struct netdev_stats
));
5930 ofpbuf_init(&request
, 0);
5931 nl_msg_put_nlmsghdr(&request
,
5932 sizeof(struct ifinfomsg
) + NL_ATTR_SIZE(IFNAMSIZ
),
5933 RTM_GETLINK
, NLM_F_REQUEST
);
5934 ofpbuf_put_zeros(&request
, sizeof(struct ifinfomsg
));
5935 nl_msg_put_string(&request
, IFLA_IFNAME
, netdev_get_name(netdev_
));
5936 error
= nl_transact(NETLINK_ROUTE
, &request
, &reply
);
5937 ofpbuf_uninit(&request
);
5942 if (ofpbuf_try_pull(reply
, NLMSG_HDRLEN
+ sizeof(struct ifinfomsg
))) {
5943 const struct nlattr
*a
= nl_attr_find(reply
, 0, IFLA_STATS64
);
5944 if (a
&& nl_attr_get_size(a
) >= sizeof(struct rtnl_link_stats64
)) {
5945 netdev_stats_from_rtnl_link_stats64(stats
, nl_attr_get(a
));
5948 a
= nl_attr_find(reply
, 0, IFLA_STATS
);
5949 if (a
&& nl_attr_get_size(a
) >= sizeof(struct rtnl_link_stats
)) {
5950 netdev_stats_from_rtnl_link_stats(stats
, nl_attr_get(a
));
5953 VLOG_WARN_RL(&rl
, "RTM_GETLINK reply lacks stats");
5958 VLOG_WARN_RL(&rl
, "short RTM_GETLINK reply");
5963 ofpbuf_delete(reply
);
5968 get_flags(const struct netdev
*dev
, unsigned int *flags
)
5974 error
= af_inet_ifreq_ioctl(dev
->name
, &ifr
, SIOCGIFFLAGS
, "SIOCGIFFLAGS");
5976 *flags
= ifr
.ifr_flags
;
5982 set_flags(const char *name
, unsigned int flags
)
5986 ifr
.ifr_flags
= flags
;
5987 return af_inet_ifreq_ioctl(name
, &ifr
, SIOCSIFFLAGS
, "SIOCSIFFLAGS");
5991 linux_get_ifindex(const char *netdev_name
)
5996 ovs_strzcpy(ifr
.ifr_name
, netdev_name
, sizeof ifr
.ifr_name
);
5997 COVERAGE_INC(netdev_get_ifindex
);
5999 error
= af_inet_ioctl(SIOCGIFINDEX
, &ifr
);
6001 /* ENODEV probably means that a vif disappeared asynchronously and
6002 * hasn't been removed from the database yet, so reduce the log level
6003 * to INFO for that case. */
6004 VLOG_RL(&rl
, error
== ENODEV
? VLL_INFO
: VLL_ERR
,
6005 "ioctl(SIOCGIFINDEX) on %s device failed: %s",
6006 netdev_name
, ovs_strerror(error
));
6009 return ifr
.ifr_ifindex
;
6013 get_ifindex(const struct netdev
*netdev_
, int *ifindexp
)
6015 struct netdev_linux
*netdev
= netdev_linux_cast(netdev_
);
6017 if (!(netdev
->cache_valid
& VALID_IFINDEX
)) {
6018 netdev_linux_update_via_netlink(netdev
);
6021 if (!(netdev
->cache_valid
& VALID_IFINDEX
)) {
6022 /* Fall back to ioctl if netlink fails */
6023 int ifindex
= linux_get_ifindex(netdev_get_name(netdev_
));
6026 netdev
->get_ifindex_error
= -ifindex
;
6027 netdev
->ifindex
= 0;
6029 netdev
->get_ifindex_error
= 0;
6030 netdev
->ifindex
= ifindex
;
6032 netdev
->cache_valid
|= VALID_IFINDEX
;
6035 *ifindexp
= netdev
->ifindex
;
6036 return netdev
->get_ifindex_error
;
6040 netdev_linux_update_via_netlink(struct netdev_linux
*netdev
)
6042 struct ofpbuf request
;
6043 struct ofpbuf
*reply
;
6044 struct rtnetlink_change chg
;
6045 struct rtnetlink_change
*change
= &chg
;
6048 ofpbuf_init(&request
, 0);
6049 nl_msg_put_nlmsghdr(&request
,
6050 sizeof(struct ifinfomsg
) + NL_ATTR_SIZE(IFNAMSIZ
) +
6051 NL_A_U32_SIZE
, RTM_GETLINK
, NLM_F_REQUEST
);
6052 ofpbuf_put_zeros(&request
, sizeof(struct ifinfomsg
));
6054 /* The correct identifiers for a Linux device are netnsid and ifindex,
6055 * but ifindex changes as the port is moved to another network namespace
6056 * and the interface name statically stored in ovsdb. */
6057 nl_msg_put_string(&request
, IFLA_IFNAME
, netdev_get_name(&netdev
->up
));
6058 if (netdev_linux_netnsid_is_remote(netdev
)) {
6059 nl_msg_put_u32(&request
, IFLA_IF_NETNSID
, netdev
->netnsid
);
6061 error
= nl_transact(NETLINK_ROUTE
, &request
, &reply
);
6062 ofpbuf_uninit(&request
);
6064 ofpbuf_delete(reply
);
6068 if (rtnetlink_parse(reply
, change
)
6069 && change
->nlmsg_type
== RTM_NEWLINK
) {
6070 bool changed
= false;
6073 /* Update netdev from rtnl msg and increment its seq if needed. */
6074 if ((change
->ifi_flags
^ netdev
->ifi_flags
) & IFF_RUNNING
) {
6075 netdev
->carrier_resets
++;
6078 if (change
->ifi_flags
!= netdev
->ifi_flags
) {
6079 netdev
->ifi_flags
= change
->ifi_flags
;
6082 if (change
->mtu
&& change
->mtu
!= netdev
->mtu
) {
6083 netdev
->mtu
= change
->mtu
;
6084 netdev
->cache_valid
|= VALID_MTU
;
6085 netdev
->netdev_mtu_error
= 0;
6088 if (!eth_addr_is_zero(change
->mac
)
6089 && !eth_addr_equals(change
->mac
, netdev
->etheraddr
)) {
6090 netdev
->etheraddr
= change
->mac
;
6091 netdev
->cache_valid
|= VALID_ETHERADDR
;
6092 netdev
->ether_addr_error
= 0;
6095 if (change
->if_index
!= netdev
->ifindex
) {
6096 netdev
->ifindex
= change
->if_index
;
6097 netdev
->cache_valid
|= VALID_IFINDEX
;
6098 netdev
->get_ifindex_error
= 0;
6101 if (change
->master
&& netdev_linux_kind_is_lag(change
->master
)) {
6102 netdev
->is_lag_master
= true;
6105 netdev_change_seq_changed(&netdev
->up
);
6111 ofpbuf_delete(reply
);
6116 get_etheraddr(const char *netdev_name
, struct eth_addr
*ea
)
6122 memset(&ifr
, 0, sizeof ifr
);
6123 ovs_strzcpy(ifr
.ifr_name
, netdev_name
, sizeof ifr
.ifr_name
);
6124 COVERAGE_INC(netdev_get_hwaddr
);
6125 error
= af_inet_ioctl(SIOCGIFHWADDR
, &ifr
);
6127 /* ENODEV probably means that a vif disappeared asynchronously and
6128 * hasn't been removed from the database yet, so reduce the log level
6129 * to INFO for that case. */
6130 VLOG(error
== ENODEV
? VLL_INFO
: VLL_ERR
,
6131 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
6132 netdev_name
, ovs_strerror(error
));
6135 hwaddr_family
= ifr
.ifr_hwaddr
.sa_family
;
6136 if (hwaddr_family
!= AF_UNSPEC
&& hwaddr_family
!= ARPHRD_ETHER
&&
6137 hwaddr_family
!= ARPHRD_NONE
) {
6138 VLOG_INFO("%s device has unknown hardware address family %d",
6139 netdev_name
, hwaddr_family
);
6142 memcpy(ea
, ifr
.ifr_hwaddr
.sa_data
, ETH_ADDR_LEN
);
6147 set_etheraddr(const char *netdev_name
, const struct eth_addr mac
)
6152 memset(&ifr
, 0, sizeof ifr
);
6153 ovs_strzcpy(ifr
.ifr_name
, netdev_name
, sizeof ifr
.ifr_name
);
6154 ifr
.ifr_hwaddr
.sa_family
= ARPHRD_ETHER
;
6155 memcpy(ifr
.ifr_hwaddr
.sa_data
, &mac
, ETH_ADDR_LEN
);
6156 COVERAGE_INC(netdev_set_hwaddr
);
6157 error
= af_inet_ioctl(SIOCSIFHWADDR
, &ifr
);
6159 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
6160 netdev_name
, ovs_strerror(error
));
6166 netdev_linux_do_ethtool(const char *name
, struct ethtool_cmd
*ecmd
,
6167 int cmd
, const char *cmd_name
)
6172 memset(&ifr
, 0, sizeof ifr
);
6173 ovs_strzcpy(ifr
.ifr_name
, name
, sizeof ifr
.ifr_name
);
6174 ifr
.ifr_data
= (caddr_t
) ecmd
;
6177 error
= af_inet_ioctl(SIOCETHTOOL
, &ifr
);
6179 if (error
!= EOPNOTSUPP
) {
6180 VLOG_WARN_RL(&rl
, "ethtool command %s on network device %s "
6181 "failed: %s", cmd_name
, name
, ovs_strerror(error
));
6183 /* The device doesn't support this operation. That's pretty
6184 * common, so there's no point in logging anything. */
6190 /* Returns an AF_PACKET raw socket or a negative errno value. */
6192 af_packet_sock(void)
6194 static struct ovsthread_once once
= OVSTHREAD_ONCE_INITIALIZER
;
6197 if (ovsthread_once_start(&once
)) {
6198 sock
= socket(AF_PACKET
, SOCK_RAW
, 0);
6200 int error
= set_nonblocking(sock
);
6207 VLOG_ERR("failed to create packet socket: %s",
6208 ovs_strerror(errno
));
6210 ovsthread_once_done(&once
);